evidently.metrics.data_integrity

Submodules

column_missing_values_metric module

class ColumnMissingValues(number_of_rows: int, different_missing_values: Dict[Any, int], number_of_different_missing_values: int, number_of_missing_values: int, share_of_missing_values: float)

Bases: object

Statistics about missing values in a column

Attributes:

different_missing_values : Dict[Any, int]

number_of_different_missing_values : int

number_of_missing_values : int

number_of_rows : int

share_of_missing_values : float

class ColumnMissingValuesMetric(column_name: str, missing_values: Optional[list] = None, replace: bool = True)

Bases: Metric[ColumnMissingValuesMetricResult]

Count missing values in a column.

A missing value is a null or NaN value.

Calculates the number of distinct kinds of missing values and the count of each kind. NA-types like numpy.NaN, pandas.NaT are counted as one type.

You can set your own list of missing values with the missing_values parameter. A None value in the list means that Pandas null values will be included in the calculation.

If the replace parameter is False, the default missing values are added to the user’s list. If the replace parameter is True, only the values from the missing_values list are used.

Attributes:

DEFAULT_MISSING_VALUES = ['', inf, -inf, None]

column_name : str

missing_values : frozenset

Methods:

calculate(data: InputData)

class ColumnMissingValuesMetricRenderer(color_options: Optional[ColorOptions] = None)

Bases: MetricRenderer

Attributes:

color_options : ColorOptions

Methods:

render_html(obj: ColumnMissingValuesMetric)

render_json(obj: ColumnMissingValuesMetric)

class ColumnMissingValuesMetricResult(column_name: str, current: ColumnMissingValues, reference: Optional[ColumnMissingValues] = None)

Bases: object

Attributes:

column_name : str

current : ColumnMissingValues

reference : Optional[ColumnMissingValues] = None

column_regexp_metric module

class ColumnRegExpMetric(column_name: str, reg_exp: str, top: int = 10)

Bases: Metric[DataIntegrityValueByRegexpMetricResult]

Count the number of values in a column that match, and that do not match, a regular expression (regexp).

Attributes:

column_name : str

reg_exp : str

top : int

Methods:

calculate(data: InputData)

class ColumnRegExpMetricRenderer(color_options: Optional[ColorOptions] = None)

Bases: MetricRenderer

Attributes:

color_options : ColorOptions

Methods:

render_html(obj: ColumnRegExpMetric)

render_json(obj: ColumnRegExpMetric)

class DataIntegrityValueByRegexpMetricResult(column_name: str, reg_exp: str, top: int, current: DataIntegrityValueByRegexpStat, reference: Optional[DataIntegrityValueByRegexpStat] = None)

Bases: object

Attributes:

column_name : str

current : DataIntegrityValueByRegexpStat

reference : Optional[DataIntegrityValueByRegexpStat] = None

reg_exp : str

top : int

class DataIntegrityValueByRegexpStat(number_of_matched: int, number_of_not_matched: int, number_of_rows: int, table_of_matched: Dict[str, int], table_of_not_matched: Dict[str, int])

Bases: object

Statistics about the values in a column that match a regular expression, for one dataset

Attributes:

number_of_matched : int

number_of_not_matched : int

number_of_rows : int

table_of_matched : Dict[str, int]

table_of_not_matched : Dict[str, int]

column_summary_metric module

class CategoricalCharacteristics(number_of_rows: int, count: int, unique: Optional[int], unique_percentage: Optional[float], most_common: Optional[object], most_common_percentage: Optional[float], missing: Optional[int], missing_percentage: Optional[float], new_in_current_values_count: Optional[int] = None, unused_in_current_values_count: Optional[int] = None)

Bases: object

Attributes:

count : int

missing : Optional[int]

missing_percentage : Optional[float]

most_common : Optional[object]

most_common_percentage : Optional[float]

new_in_current_values_count : Optional[int] = None

number_of_rows : int

unique : Optional[int]

unique_percentage : Optional[float]

unused_in_current_values_count : Optional[int] = None

class ColumnSummary(column_name: str, column_type: str, reference_characteristics: Union[NumericCharacteristics, CategoricalCharacteristics, DatetimeCharacteristics, NoneType], current_characteristics: Union[NumericCharacteristics, CategoricalCharacteristics, DatetimeCharacteristics], plot_data: DataQualityPlot)

Bases: object

Attributes:

column_name : str

column_type : str

current_characteristics : Union[NumericCharacteristics, CategoricalCharacteristics, DatetimeCharacteristics]

plot_data : DataQualityPlot

reference_characteristics : Optional[Union[NumericCharacteristics, CategoricalCharacteristics, DatetimeCharacteristics]]

class ColumnSummaryMetric(column_name: str)

Bases: Metric[ColumnSummary]

Methods:

calculate(data: InputData)

static map_data(stats: FeatureQualityStats)

class ColumnSummaryMetricRenderer(color_options: Optional[ColorOptions] = None)

Bases: MetricRenderer

Attributes:

color_options : ColorOptions

Methods:

render_html(obj: ColumnSummaryMetric)

render_json(obj: ColumnSummaryMetric)

class DataByTarget(data_for_plots: Dict[str, Dict[str, Union[list, pandas.core.frame.DataFrame]]], target_name: str, target_type: str)

Bases: object

Attributes:

data_for_plots : Dict[str, Dict[str, Union[list, DataFrame]]]

target_name : str

target_type : str

class DataInTime(data_for_plots: Dict[str, pandas.core.frame.DataFrame], freq: str, datetime_name: str)

Bases: object

Attributes:

data_for_plots : Dict[str, DataFrame]

datetime_name : str

freq : str

class DataQualityPlot(bins_for_hist: Dict[str, pandas.core.frame.DataFrame], data_in_time: Optional[DataInTime], data_by_target: Optional[DataByTarget], counts_of_values: Optional[Dict[str, pandas.core.frame.DataFrame]])

Bases: object

Attributes:

bins_for_hist : Dict[str, DataFrame]

counts_of_values : Optional[Dict[str, DataFrame]]

data_by_target : Optional[DataByTarget]

data_in_time : Optional[DataInTime]

class DatetimeCharacteristics(number_of_rows: int, count: int, unique: Optional[int], unique_percentage: Optional[float], most_common: Optional[object], most_common_percentage: Optional[float], missing: Optional[int], missing_percentage: Optional[float], first: Optional[str], last: Optional[str])

Bases: object

Attributes:

count : int

first : Optional[str]

last : Optional[str]

missing : Optional[int]

missing_percentage : Optional[float]

most_common : Optional[object]

most_common_percentage : Optional[float]

number_of_rows : int

unique : Optional[int]

unique_percentage : Optional[float]

class NumericCharacteristics(number_of_rows: int, count: int, mean: Union[float, int, NoneType], std: Union[float, int, NoneType], min: Union[float, int, NoneType], p25: Union[float, int, NoneType], p50: Union[float, int, NoneType], p75: Union[float, int, NoneType], max: Union[float, int, NoneType], unique: Optional[int], unique_percentage: Optional[float], missing: Optional[int], missing_percentage: Optional[float], infinite_count: Optional[int], infinite_percentage: Optional[float], most_common: Union[float, int, NoneType], most_common_percentage: Optional[float])

Bases: object

Attributes:

count : int

infinite_count : Optional[int]

infinite_percentage : Optional[float]

max : Optional[Union[float, int]]

mean : Optional[Union[float, int]]

min : Optional[Union[float, int]]

missing : Optional[int]

missing_percentage : Optional[float]

most_common : Optional[Union[float, int]]

most_common_percentage : Optional[float]

number_of_rows : int

p25 : Optional[Union[float, int]]

p50 : Optional[Union[float, int]]

p75 : Optional[Union[float, int]]

std : Optional[Union[float, int]]

unique : Optional[int]

unique_percentage : Optional[float]

dataset_missing_values_metric module

class DatasetMissingValues(different_missing_values: Dict[Any, int], number_of_different_missing_values: int, different_missing_values_by_column: Dict[str, Dict[Any, int]], number_of_different_missing_values_by_column: Dict[str, int], number_of_missing_values: int, share_of_missing_values: float, number_of_missing_values_by_column: Dict[str, int], share_of_missing_values_by_column: Dict[str, float], number_of_rows: int, number_of_rows_with_missing_values: int, share_of_rows_with_missing_values: float, number_of_columns: int, columns_with_missing_values: List[str], number_of_columns_with_missing_values: int, share_of_columns_with_missing_values: float)

Bases: object

Statistics about missing values in a dataset

Attributes:

columns_with_missing_values : List[str]

different_missing_values : Dict[Any, int]

different_missing_values_by_column : Dict[str, Dict[Any, int]]

number_of_columns : int

number_of_columns_with_missing_values : int

number_of_different_missing_values : int

number_of_different_missing_values_by_column : Dict[str, int]

number_of_missing_values : int

number_of_missing_values_by_column : Dict[str, int]

number_of_rows : int

number_of_rows_with_missing_values : int

share_of_columns_with_missing_values : float

share_of_missing_values : float

share_of_missing_values_by_column : Dict[str, float]

share_of_rows_with_missing_values : float

class DatasetMissingValuesMetric(missing_values: Optional[list] = None, replace: bool = True)

Bases: Metric[DatasetMissingValuesMetricResult]

Count missing values in a dataset.

A missing value is a null or NaN value.

Calculates the number of distinct kinds of missing values and the count of each kind. NA-types like numpy.NaN, pandas.NaT are counted as one type.

You can set your own list of missing values with the missing_values parameter. A None value in the list means that Pandas null values will be included in the calculation.

If the replace parameter is False, the default missing values are added to the user’s list. If the replace parameter is True, only the values from the missing_values list are used.

Attributes:

DEFAULT_MISSING_VALUES = ['', inf, -inf, None]

missing_values : frozenset

Methods:

calculate(data: InputData)

class DatasetMissingValuesMetricRenderer(color_options: Optional[ColorOptions] = None)

Bases: MetricRenderer

Attributes:

color_options : ColorOptions

Methods:

render_html(obj: DatasetMissingValuesMetric)

render_json(obj: DatasetMissingValuesMetric)

class DatasetMissingValuesMetricResult(current: DatasetMissingValues, reference: Optional[DatasetMissingValues] = None)

Bases: object

Attributes:

current : DatasetMissingValues

reference : Optional[DatasetMissingValues] = None

dataset_summary_metric module

class DatasetSummary(target: Optional[str], prediction: Optional[Union[str, Sequence[str]]], date_column: Optional[str], id_column: Optional[str], number_of_columns: int, number_of_rows: int, number_of_missing_values: int, number_of_categorical_columns: int, number_of_numeric_columns: int, number_of_datetime_columns: int, number_of_constant_columns: int, number_of_almost_constant_columns: int, number_of_duplicated_columns: int, number_of_almost_duplicated_columns: int, number_of_empty_rows: int, number_of_empty_columns: int, number_of_duplicated_rows: int, columns_type: dict, nans_by_columns: dict, number_uniques_by_columns: dict)

Bases: object

Summary information about the columns of a dataset

Attributes:

columns_type : dict

date_column : Optional[str]

id_column : Optional[str]

nans_by_columns : dict

number_of_almost_constant_columns : int

number_of_almost_duplicated_columns : int

number_of_categorical_columns : int

number_of_columns : int

number_of_constant_columns : int

number_of_datetime_columns : int

number_of_duplicated_columns : int

number_of_duplicated_rows : int

number_of_empty_columns : int

number_of_empty_rows : int

number_of_missing_values : int

number_of_numeric_columns : int

number_of_rows : int

number_uniques_by_columns : dict

prediction : Optional[Union[str, Sequence[str]]]

target : Optional[str]

class DatasetSummaryMetric(almost_duplicated_threshold: float = 0.95, almost_constant_threshold: float = 0.95)

Bases: Metric[DatasetSummaryMetricResult]

Common characteristics of the columns/features of the dataset(s)

Attributes:

almost_constant_threshold : float

almost_duplicated_threshold : float

Methods:

calculate(data: InputData)

class DatasetSummaryMetricRenderer(color_options: Optional[ColorOptions] = None)

Bases: MetricRenderer

Attributes:

color_options : ColorOptions

Methods:

render_html(obj: DatasetSummaryMetric)

render_json(obj: DatasetSummaryMetric)

class DatasetSummaryMetricResult(almost_duplicated_threshold: float, current: DatasetSummary, reference: Optional[DatasetSummary] = None)

Bases: object

Attributes:

almost_duplicated_threshold : float

current : DatasetSummary

reference : Optional[DatasetSummary] = None

Last updated