classification_performance module

class ConfusionMatrix(labels: Sequence[Union[str, int]], values: list)

Bases: object


labels : Sequence[Union[str, int]]
values : list

class DatasetClassificationQuality(accuracy: float, precision: float, recall: float, f1: float, roc_auc: Optional[float] = None, log_loss: Optional[float] = None, tpr: Optional[float] = None, tnr: Optional[float] = None, fpr: Optional[float] = None, fnr: Optional[float] = None, rate_plots_data: Optional[Dict] = None, plot_data: Optional[Dict] = None)

Bases: object


accuracy : float
f1 : float
fnr : Optional[float] = None
fpr : Optional[float] = None
log_loss : Optional[float] = None
plot_data : Optional[Dict] = None
precision : float
rate_plots_data : Optional[Dict] = None
recall : float
roc_auc : Optional[float] = None
tnr : Optional[float] = None
tpr : Optional[float] = None

class PredictionData(predictions: pandas.core.series.Series, prediction_probas: Optional[pandas.core.frame.DataFrame], labels: List[Union[str, int]])

Bases: object


labels : List[Union[str, int]]
prediction_probas : Optional[DataFrame]
predictions : Series

calculate_confusion_by_classes(confusion_matrix: ndarray, class_names: Sequence[Union[str, int]])

Calculate metrics:
  • TP (true positive)
  • TN (true negative)
  • FP (false positive)
  • FN (false negative)
for each class from confusion matrix.
  • Returns
    a dict like:
    {
        "class_1_name": {
            "tp": 1,
            "tn": 5,
            "fp": 0,
            "fn": 3,
        },
        "class_2_name": {
            "tp": 1,
            "tn": 5,
            "fp": 0,
            "fn": 3,
        },
    }

calculate_matrix(target: Series, prediction: Series, labels: List[Union[str, int]])

calculate_metrics(column_mapping: ColumnMapping, confusion_matrix: ConfusionMatrix, target: Series, prediction: PredictionData)


collect_plot_data(prediction_probas: DataFrame)

get_prediction_data(data: DataFrame, data_columns: DatasetColumns, pos_label: Optional[Union[str, int]], threshold: float = 0.5)

Get predicted values and optional prediction probabilities from source data. Also take into account a threshold value - if a probability is less than the value, do not take it into account.
Return an object with predicted values and optional prediction probabilities.

k_probability_threshold(prediction_probas: DataFrame, k: Union[int, float])

threshold_probability_labels(prediction_probas: DataFrame, pos_label: Union[str, int], neg_label: Union[str, int], threshold: float)

Get prediction values from probabilities with the threshold applied.

data_drift module

Methods and types for data drift calculations.

class ColumnDataDriftMetrics(column_name: str, column_type: str, stattest_name: str, drift_score: float, drift_detected: bool, threshold: float, current_distribution: Distribution, reference_distribution: Distribution, current_small_distribution: Optional[list] = None, reference_small_distribution: Optional[list] = None, current_scatter: Optional[Dict[str, list]] = None, x_name: Optional[str] = None, plot_shape: Optional[Dict[str, float]] = None, current_correlations: Optional[Dict[str, float]] = None, reference_correlations: Optional[Dict[str, float]] = None)

Bases: object
One column drift metrics.


column_name : str
column_type : str
current_correlations : Optional[Dict[str, float]] = None
current_distribution : Distribution
current_scatter : Optional[Dict[str, list]] = None
current_small_distribution : Optional[list] = None
drift_detected : bool
drift_score : float
plot_shape : Optional[Dict[str, float]] = None
reference_correlations : Optional[Dict[str, float]] = None
reference_distribution : Distribution
reference_small_distribution : Optional[list] = None
stattest_name : str
threshold : float
x_name : Optional[str] = None

class DatasetDrift(number_of_drifted_columns: int, dataset_drift_score: float, dataset_drift: bool)

Bases: object
Dataset drift calculation results


dataset_drift : bool
dataset_drift_score : float
number_of_drifted_columns : int

class DatasetDriftMetrics(number_of_columns: int, number_of_drifted_columns: int, share_of_drifted_columns: float, dataset_drift: bool, drift_by_columns: Dict[str, ColumnDataDriftMetrics], options: DataDriftOptions, dataset_columns: DatasetColumns)

Bases: object


dataset_columns : DatasetColumns
dataset_drift : bool
drift_by_columns : Dict[str, ColumnDataDriftMetrics]
number_of_columns : int
number_of_drifted_columns : int
options : DataDriftOptions
share_of_drifted_columns : float

ensure_prediction_column_is_string(*, prediction_column: Optional[Union[str, Sequence]], current_data: DataFrame, reference_data: DataFrame, threshold: float = 0.5)

Update dataset by predictions type:
  • if prediction column is None or a string, no dataset changes
  • (binary classification) if predictions is a list and its length equals 2
    set predicted_labels column by threshold
  • (multi-label classification) if predictions is a list and its length is greater than 2
    set predicted_labels from probability values in columns by prediction column
  • Returns
    prediction column name.

get_dataset_drift(drift_metrics, drift_share=0.5)

get_drift_for_columns(*, current_data: DataFrame, reference_data: DataFrame, dataset_columns: DatasetColumns, data_drift_options: DataDriftOptions, drift_share_threshold: Optional[float] = None, columns: Optional[List[str]] = None)

get_one_column_drift(*, current_data: DataFrame, reference_data: DataFrame, column_name: str, options: DataDriftOptions, dataset_columns: DatasetColumns, column_type: Optional[str] = None)

data_integration module

get_number_of_all_pandas_missed_values(dataset: DataFrame)

Calculate the number of missing values (nulls as defined by pandas) in a dataset

get_number_of_almost_constant_columns(dataset: DataFrame, threshold: float)

Calculate the number of almost constant columns in a dataset

get_number_of_almost_duplicated_columns(dataset: DataFrame, threshold: float)

Calculate the number of almost duplicated columns in a dataset

get_number_of_constant_columns(dataset: DataFrame)

Calculate the number of constant columns in a dataset

get_number_of_duplicated_columns(dataset: DataFrame)

Calculate the number of duplicated columns in a dataset

get_number_of_empty_columns(dataset: DataFrame)

Calculate the number of empty columns in a dataset

data_quality module

Methods for overall dataset quality calculations - row count, counts of specific values, etc.

class ColumnCorrelations(column_name: str, kind: str, values: Distribution)

Bases: object


column_name : str
kind : str
values : Distribution

class DataQualityGetPlotData()

Bases: object


calculate_data_by_target(curr: DataFrame, ref: Optional[DataFrame], feature_name: str, feature_type: str, target_name: str, target_type: str, merge_small_cat: Optional[int] = 5)
calculate_data_in_time(curr: DataFrame, ref: Optional[DataFrame], feature_name: str, feature_type: str, datetime_name: str, merge_small_cat: Optional[int] = 5)
calculate_main_plot(curr: DataFrame, ref: Optional[DataFrame], feature_name: str, feature_type: str, merge_small_cat: Optional[int] = 5)

class DataQualityPlot(bins_for_hist: Dict[str, pandas.core.frame.DataFrame])

Bases: object


bins_for_hist : Dict[str, DataFrame]

class DataQualityStats(rows_count: int, num_features_stats: Optional[Dict[str, FeatureQualityStats]] = None, cat_features_stats: Optional[Dict[str, FeatureQualityStats]] = None, datetime_features_stats: Optional[Dict[str, FeatureQualityStats]] = None, target_stats: Optional[Dict[str, FeatureQualityStats]] = None, prediction_stats: Optional[Dict[str, FeatureQualityStats]] = None)

Bases: object


cat_features_stats : Optional[Dict[str, FeatureQualityStats]] = None
datetime_features_stats : Optional[Dict[str, FeatureQualityStats]] = None
num_features_stats : Optional[Dict[str, FeatureQualityStats]] = None
prediction_stats : Optional[Dict[str, FeatureQualityStats]] = None
rows_count : int
target_stats : Optional[Dict[str, FeatureQualityStats]] = None



class FeatureQualityStats(feature_type: str, number_of_rows: int = 0, count: int = 0, infinite_count: Optional[int] = None, infinite_percentage: Optional[float] = None, missing_count: Optional[int] = None, missing_percentage: Optional[float] = None, unique_count: Optional[int] = None, unique_percentage: Optional[float] = None, percentile_25: Optional[float] = None, percentile_50: Optional[float] = None, percentile_75: Optional[float] = None, max: Optional[Union[int, float, bool, str]] = None, min: Optional[Union[int, float, bool, str]] = None, mean: Optional[float] = None, most_common_value: Optional[Union[int, float, bool, str]] = None, most_common_value_percentage: Optional[float] = None, std: Optional[float] = None, most_common_not_null_value: Optional[Union[int, float, bool, str]] = None, most_common_not_null_value_percentage: Optional[float] = None, new_in_current_values_count: Optional[int] = None, unused_in_current_values_count: Optional[int] = None)

Bases: object
Class for all features data quality metrics store.
The type of the feature is stored in the feature_type field. The concrete set of stats depends on the feature type. If a metric is not applicable, leave None for it.
Metrics for all feature types:
- feature type - cat for category, num for numeric, datetime for datetime features
- count - quantity of meaningful values (do not take into account NaN values)
- missing_count - quantity of meaningless (NaN) values
- missing_percentage - the percentage of the missed values
- unique_count - quantity of unique values
- unique_percentage - the percentage of the unique values
- max - maximum value (not applicable for category features)
- min - minimum value (not applicable for category features)
- most_common_value - the most common value in the feature values
- most_common_value_percentage - the percentage of the most common value
- most_common_not_null_value - if most_common_value equals NaN - the next most common value. Otherwise - None
- most_common_not_null_value_percentage - the percentage of most_common_not_null_value if it is defined.
If most_common_not_null_value is not defined, equals None too.
Metrics for numeric features only:
- infinite_count - quantity of infinite values (for numeric features only)
- infinite_percentage - the percentage of infinite values (for numeric features only)
- percentile_25 - 25% percentile for meaningful values
- percentile_50 - 50% percentile for meaningful values
- percentile_75 - 75% percentile for meaningful values
- mean - the sum of the meaningful values divided by the number of the meaningful values
- std - standard deviation of the values
Metrics for category features only:
  • new_in_current_values_count - quantity of new values in the current dataset that are not present in the reference
    Defined for reference dataset only.
  • unused_in_current_values_count - quantity of values in the reference dataset that are not present in the current
    Defined for reference dataset only.


count : int = 0
feature_type : str
infinite_count : Optional[int] = None
infinite_percentage : Optional[float] = None
max : Optional[Union[int, float, bool, str]] = None
mean : Optional[float] = None
min : Optional[Union[int, float, bool, str]] = None
missing_count : Optional[int] = None
missing_percentage : Optional[float] = None
most_common_not_null_value : Optional[Union[int, float, bool, str]] = None
most_common_not_null_value_percentage : Optional[float] = None
most_common_value : Optional[Union[int, float, bool, str]] = None
most_common_value_percentage : Optional[float] = None
new_in_current_values_count : Optional[int] = None
number_of_rows : int = 0
percentile_25 : Optional[float] = None
percentile_50 : Optional[float] = None
percentile_75 : Optional[float] = None
std : Optional[float] = None
unique_count : Optional[int] = None
unique_percentage : Optional[float] = None
unused_in_current_values_count : Optional[int] = None


Checks that the object stores stats for a category feature
Checks that the object stores stats for a datetime feature
Checks that the object stores stats for a numeric feature

calculate_category_column_correlations(column_name: str, dataset: DataFrame, columns: List[str])

For category columns calculate cramer_v correlation

calculate_column_distribution(column: Series, column_type: str)

calculate_correlations(dataset: DataFrame, columns: DatasetColumns)

calculate_cramer_v_correlation(column_name: str, dataset: DataFrame, columns: List[str])

calculate_data_quality_stats(dataset: DataFrame, columns: DatasetColumns, task: Optional[str])

calculate_numerical_column_correlations(column_name: str, dataset: DataFrame, columns: List[str])

get_features_stats(feature: Series, feature_type: str)

get_pairwise_correlation(df, func: Callable[[Series, Series], float])

Compute pairwise correlation of columns.
  • Parameters
    df – initial data frame.
    func – function for computing pairwise correlation.
  • Returns
    Correlation matrix.

get_rows_count(data: Union[DataFrame, Series])

Count quantity of rows in a dataset

regression_performance module

class ErrorWithQuantiles(error, quantile_top, quantile_other)

Bases: object

class FeatureBias(feature_type: str, majority: float, under: float, over: float, range: float)

Bases: object


feature_type : str
majority : float
over : float
range : float
under : float



class RegressionPerformanceMetrics(mean_error: float, mean_abs_error: float, mean_abs_perc_error: float, error_std: float, abs_error_max: float, abs_error_std: float, abs_perc_error_std: float, error_normality: dict, underperformance: dict, error_bias: dict)

Bases: object


abs_error_max : float
abs_error_std : float
abs_perc_error_std : float
error_bias : dict
error_normality : dict
error_std : float
mean_abs_error : float
mean_abs_perc_error : float
mean_error : float
underperformance : dict

calculate_regression_performance(dataset: DataFrame, columns: DatasetColumns, error_bias_prefix: str)

error_bias_table(dataset, err_quantiles, num_feature_names, cat_feature_names)

error_with_quantiles(dataset, prediction_column, target_column, quantile: float)