classification_performance module

class ConfusionMatrix(labels: Sequence[Union[str, int]], values: list)

Bases: object


labels : Sequence[Union[str, int]]

values : list

class DatasetClassificationQuality(accuracy: float, precision: float, recall: float, f1: float, roc_auc: Optional[float] = None, log_loss: Optional[float] = None, tpr: Optional[float] = None, tnr: Optional[float] = None, fpr: Optional[float] = None, fnr: Optional[float] = None, rate_plots_data: Optional[Dict] = None, plot_data: Optional[Dict] = None)

Bases: object


accuracy : float

f1 : float

fnr : Optional[float] = None

fpr : Optional[float] = None

log_loss : Optional[float] = None

plot_data : Optional[Dict] = None

precision : float

rate_plots_data : Optional[Dict] = None

recall : float

roc_auc : Optional[float] = None

tnr : Optional[float] = None

tpr : Optional[float] = None

class PredictionData(predictions: pandas.core.series.Series, prediction_probas: Optional[pandas.core.frame.DataFrame], labels: List[Union[str, int]])

Bases: object


labels : List[Union[str, int]]

prediction_probas : Optional[DataFrame]

predictions : Series

calculate_confusion_by_classes(confusion_matrix: ndarray, class_names: Sequence[Union[str, int]])

Calculate metrics:

  • TP (true positive)

  • TN (true negative)

  • FP (false positive)

  • FN (false negative)

for each class from the confusion matrix.

  • Returns

    a dict like:

        {
            "class_1_name": {
                "tp": 1,
                "tn": 5,
                "fp": 0,
                "fn": 3,
            },
            "class_2_name": {
                "tp": 1,
                "tn": 5,
                "fp": 0,
                "fn": 3,
            },
        }

calculate_matrix(target: Series, prediction: Series, labels: List[Union[str, int]])

calculate_metrics(column_mapping: ColumnMapping, confusion_matrix: ConfusionMatrix, target: Series, prediction: PredictionData)


collect_plot_data(prediction_probas: DataFrame)

get_prediction_data(data: DataFrame, data_columns: DatasetColumns, pos_label: Optional[Union[str, int]], threshold: float = 0.5)

Get predicted values and optional prediction probabilities from source data. Also take into account a threshold value - if a probability is less than the value, do not take it into account.

Return an object with predicted values and optional prediction probabilities.

k_probability_threshold(prediction_probas: DataFrame, k: Union[int, float])

threshold_probability_labels(prediction_probas: DataFrame, pos_label: Union[str, int], neg_label: Union[str, int], threshold: float)

Get prediction values from probabilities with the threshold applied.

data_drift module

Methods and types for data drift calculations.

class ColumnDataDriftMetrics(column_name: str, column_type: str, stattest_name: str, drift_score: float, drift_detected: bool, threshold: float, current_distribution: Distribution, reference_distribution: Distribution, current_small_distribution: Optional[list] = None, reference_small_distribution: Optional[list] = None, current_scatter: Optional[Dict[str, list]] = None, x_name: Optional[str] = None, plot_shape: Optional[Dict[str, float]] = None, current_correlations: Optional[Dict[str, float]] = None, reference_correlations: Optional[Dict[str, float]] = None)

Bases: object

One column drift metrics.


column_name : str

column_type : str

current_correlations : Optional[Dict[str, float]] = None

current_distribution : Distribution

current_scatter : Optional[Dict[str, list]] = None

current_small_distribution : Optional[list] = None

drift_detected : bool

drift_score : float

plot_shape : Optional[Dict[str, float]] = None

reference_correlations : Optional[Dict[str, float]] = None

reference_distribution : Distribution

reference_small_distribution : Optional[list] = None

stattest_name : str

threshold : float

x_name : Optional[str] = None

class DatasetDrift(number_of_drifted_columns: int, dataset_drift_score: float, dataset_drift: bool)

Bases: object

Dataset drift calculation results


dataset_drift : bool

dataset_drift_score : float

number_of_drifted_columns : int

class DatasetDriftMetrics(number_of_columns: int, number_of_drifted_columns: int, share_of_drifted_columns: float, dataset_drift: bool, drift_by_columns: Dict[str, ColumnDataDriftMetrics], options: DataDriftOptions, dataset_columns: DatasetColumns)

Bases: object


dataset_columns : DatasetColumns

dataset_drift : bool

drift_by_columns : Dict[str, ColumnDataDriftMetrics]

number_of_columns : int

number_of_drifted_columns : int

options : DataDriftOptions

share_of_drifted_columns : float

ensure_prediction_column_is_string(*, prediction_column: Optional[Union[str, Sequence]], current_data: DataFrame, reference_data: DataFrame, threshold: float = 0.5)

Update dataset by predictions type:

  • if prediction column is None or a string, no dataset changes

  • (binary classification) if predictions is a list and its length equals 2

    set predicted_labels column by threshold

  • (multi-label classification) if predictions is a list and its length is greater than 2

    set predicted_labels from probability values in columns by prediction column

  • Returns

    prediction column name.

get_dataset_drift(drift_metrics, drift_share=0.5)

get_drift_for_columns(*, current_data: DataFrame, reference_data: DataFrame, dataset_columns: DatasetColumns, data_drift_options: DataDriftOptions, drift_share_threshold: Optional[float] = None, columns: Optional[List[str]] = None)

get_one_column_drift(*, current_data: DataFrame, reference_data: DataFrame, column_name: str, options: DataDriftOptions, dataset_columns: DatasetColumns, column_type: Optional[str] = None)

data_integration module

get_number_of_all_pandas_missed_values(dataset: DataFrame)

Calculate the number of missing values (nulls as detected by pandas) in a dataset

get_number_of_almost_constant_columns(dataset: DataFrame, threshold: float)

Calculate the number of almost constant columns in a dataset

get_number_of_almost_duplicated_columns(dataset: DataFrame, threshold: float)

Calculate the number of almost duplicated columns in a dataset

get_number_of_constant_columns(dataset: DataFrame)

Calculate the number of constant columns in a dataset

get_number_of_duplicated_columns(dataset: DataFrame)

Calculate the number of duplicated columns in a dataset

get_number_of_empty_columns(dataset: DataFrame)

Calculate the number of empty columns in a dataset

data_quality module

Methods for overall dataset quality calculations - rows count, specific values count, etc.

class ColumnCorrelations(column_name: str, kind: str, values: Distribution)

Bases: object


column_name : str

kind : str

values : Distribution

class DataQualityGetPlotData()

Bases: object


calculate_data_by_target(curr: DataFrame, ref: Optional[DataFrame], feature_name: str, feature_type: str, target_name: str, target_type: str, merge_small_cat: Optional[int] = 5)

calculate_data_in_time(curr: DataFrame, ref: Optional[DataFrame], feature_name: str, feature_type: str, datetime_name: str, merge_small_cat: Optional[int] = 5)

calculate_main_plot(curr: DataFrame, ref: Optional[DataFrame], feature_name: str, feature_type: str, merge_small_cat: Optional[int] = 5)

class DataQualityPlot(bins_for_hist: Dict[str, pandas.core.frame.DataFrame])

Bases: object


bins_for_hist : Dict[str, DataFrame]

class DataQualityStats(rows_count: int, num_features_stats: Optional[Dict[str, FeatureQualityStats]] = None, cat_features_stats: Optional[Dict[str, FeatureQualityStats]] = None, datetime_features_stats: Optional[Dict[str, FeatureQualityStats]] = None, target_stats: Optional[Dict[str, FeatureQualityStats]] = None, prediction_stats: Optional[Dict[str, FeatureQualityStats]] = None)

Bases: object


cat_features_stats : Optional[Dict[str, FeatureQualityStats]] = None

datetime_features_stats : Optional[Dict[str, FeatureQualityStats]] = None

num_features_stats : Optional[Dict[str, FeatureQualityStats]] = None

prediction_stats : Optional[Dict[str, FeatureQualityStats]] = None

rows_count : int

target_stats : Optional[Dict[str, FeatureQualityStats]] = None



class FeatureQualityStats(feature_type: str, number_of_rows: int = 0, count: int = 0, infinite_count: Optional[int] = None, infinite_percentage: Optional[float] = None, missing_count: Optional[int] = None, missing_percentage: Optional[float] = None, unique_count: Optional[int] = None, unique_percentage: Optional[float] = None, percentile_25: Optional[float] = None, percentile_50: Optional[float] = None, percentile_75: Optional[float] = None, max: Optional[Union[int, float, bool, str]] = None, min: Optional[Union[int, float, bool, str]] = None, mean: Optional[float] = None, most_common_value: Optional[Union[int, float, bool, str]] = None, most_common_value_percentage: Optional[float] = None, std: Optional[float] = None, most_common_not_null_value: Optional[Union[int, float, bool, str]] = None, most_common_not_null_value_percentage: Optional[float] = None, new_in_current_values_count: Optional[int] = None, unused_in_current_values_count: Optional[int] = None)

Bases: object

Class for all features data quality metrics store.

The type of the feature is stored in the feature_type field. The concrete set of stats depends on the feature type. If a metric is not applicable, leave None for it.

Metrics for all feature types:

- feature type - cat for category, num for numeric, datetime for datetime features

- count - quantity of meaningful values (do not take into account NaN values)

- missing_count - quantity of meaningless (NaN) values

- missing_percentage - the percentage of the missed values

- unique_count - quantity of unique values

- unique_percentage - the percentage of the unique values

- max - maximum value (not applicable for category features)

- min - minimum value (not applicable for category features)

- most_common_value - the most common value in the feature values

- most_common_value_percentage - the percentage of the most common value

- most_common_not_null_value - if most_common_value equals NaN - the next most common value. Otherwise - None

- most_common_not_null_value_percentage - the percentage of most_common_not_null_value if it is defined.

    If most_common_not_null_value is not defined, equals None too.

Metrics for numeric features only:

- infinite_count - quantity of infinite values (for numeric features only)

- infinite_percentage - the percentage of infinite values (for numeric features only)

- percentile_25 - 25% percentile for meaningful values

- percentile_50 - 50% percentile for meaningful values

- percentile_75 - 75% percentile for meaningful values

- mean - the sum of the meaningful values divided by the number of the meaningful values

- std - standard deviation of the values

Metrics for category features only:

  • new_in_current_values_count - quantity of new values in the current dataset that are not present in the reference

      Defined for reference dataset only.
  • unused_in_current_values_count - quantity of values in the reference dataset that are not present in the current

      Defined for reference dataset only.


count : int = 0

feature_type : str

infinite_count : Optional[int] = None

infinite_percentage : Optional[float] = None

max : Optional[Union[int, float, bool, str]] = None

mean : Optional[float] = None

min : Optional[Union[int, float, bool, str]] = None

missing_count : Optional[int] = None

missing_percentage : Optional[float] = None

most_common_not_null_value : Optional[Union[int, float, bool, str]] = None

most_common_not_null_value_percentage : Optional[float] = None

most_common_value : Optional[Union[int, float, bool, str]] = None

most_common_value_percentage : Optional[float] = None

new_in_current_values_count : Optional[int] = None

number_of_rows : int = 0

percentile_25 : Optional[float] = None

percentile_50 : Optional[float] = None

percentile_75 : Optional[float] = None

std : Optional[float] = None

unique_count : Optional[int] = None

unique_percentage : Optional[float] = None

unused_in_current_values_count : Optional[int] = None




Checks that the object stores stats for a category feature


Checks that the object stores stats for a datetime feature


Checks that the object stores stats for a numeric feature

calculate_category_column_correlations(column_name: str, dataset: DataFrame, columns: List[str])

For category columns calculate cramer_v correlation

calculate_column_distribution(column: Series, column_type: str)

calculate_correlations(dataset: DataFrame, columns: DatasetColumns)

calculate_cramer_v_correlation(column_name: str, dataset: DataFrame, columns: List[str])

calculate_data_quality_stats(dataset: DataFrame, columns: DatasetColumns, task: Optional[str])

calculate_numerical_column_correlations(column_name: str, dataset: DataFrame, columns: List[str])

get_features_stats(feature: Series, feature_type: str)

get_pairwise_correlation(df, func: Callable[[Series, Series], float])

Compute pairwise correlation of columns.

  • Parameters

    df – initial data frame.

    func – function for computing pairwise correlation.

  • Returns

    Correlation matrix.

get_rows_count(data: Union[DataFrame, Series])

Count quantity of rows in a dataset

regression_performance module

class ErrorWithQuantiles(error, quantile_top, quantile_other)

Bases: object

class FeatureBias(feature_type: str, majority: float, under: float, over: float, range: float)

Bases: object


feature_type : str

majority : float

over : float

range : float

under : float



class RegressionPerformanceMetrics(mean_error: float, mean_abs_error: float, mean_abs_perc_error: float, error_std: float, abs_error_max: float, abs_error_std: float, abs_perc_error_std: float, error_normality: dict, underperformance: dict, error_bias: dict)

Bases: object


abs_error_max : float

abs_error_std : float

abs_perc_error_std : float

error_bias : dict

error_normality : dict

error_std : float

mean_abs_error : float

mean_abs_perc_error : float

mean_error : float

underperformance : dict

calculate_regression_performance(dataset: DataFrame, columns: DatasetColumns, error_bias_prefix: str)

error_bias_table(dataset, err_quantiles, num_feature_names, cat_feature_names)

error_with_quantiles(dataset, prediction_column, target_column, quantile: float)

Last updated