Source code for sklearn_utils.utils.data_utils
from collections import defaultdict
import pandas as pd
from statsmodels.sandbox.stats.multicomp import multipletests
from sklearn.feature_selection import VarianceThreshold, f_classif
[docs]def filter_by_label(X, y, ref_label, reverse=False):
'''
Select items with label from dataset.
:param X: dataset
:param y: labels
:param ref_label: reference label
:param bool reverse: if false selects ref_labels else eliminates
'''
check_reference_label(y, ref_label)
return list(zip(*filter(lambda t: (not reverse) == (t[1] == ref_label),
zip(X, y))))
[docs]def average_by_label(X, y, ref_label):
'''
Calculates average dictinary from list of dictionary for give label
:param List[Dict] X: dataset
:param list y: labels
:param ref_label: reference label
'''
# TODO: consider to delete defaultdict
return defaultdict(float,
pd.DataFrame.from_records(
filter_by_label(X, y, ref_label)[0]
).mean().to_dict())
[docs]def map_dict(d, key_func=None, value_func=None, if_func=None):
'''
:param dict d: dictionary
:param func key_func: func which will run on key.
:param func value_func: func which will run on values.
'''
key_func = key_func or (lambda k, v: k)
value_func = value_func or (lambda k, v: v)
if_func = if_func or (lambda k, v: True)
return {
key_func(*k_v): value_func(*k_v)
for k_v in d.items() if if_func(*k_v)
}
[docs]def map_dict_list(ds, key_func=None, value_func=None, if_func=None):
'''
:param List[Dict] ds: list of dict
:param func key_func: func which will run on key.
:param func value_func: func which will run on values.
'''
return [map_dict(d, key_func, value_func, if_func) for d in ds]
def check_reference_label(y, ref_label):
'''
:param list y: label
:param ref_label: reference label
'''
set_y = set(y)
if ref_label not in set_y:
raise ValueError('There is not reference label in dataset. '
"Reference label: '%s' "
'Labels in dataset: %s' % (ref_label, set_y))
def variance_threshold_on_df(df: pd.DataFrame, threshold=0):
vt = VarianceThreshold(threshold)
vt.fit(df.values)
return df.iloc[:, vt.variances_ > threshold]
def feature_importance_report(X,
y,
threshold=0.001,
correcting_multiple_hypotesis=True,
method='fdr_bh',
alpha=0.1,
sort_by='pval'):
'''
Provide signifance for features in dataset with anova using multiple hypostesis testing
:param X: List of dict with key as feature names and values as features
:param y: Labels
:param threshold: Low-variens threshold to eliminate low varience features
:param correcting_multiple_hypotesis: corrects p-val with multiple hypotesis testing
:param method: method of multiple hypotesis testing
:param alpha: alpha of multiple hypotesis testing
:param sort_by: sorts output dataframe by pval or F
:return: DataFrame with F and pval for each feature with their average values
'''
df = variance_threshold_on_df(
pd.DataFrame.from_records(X), threshold=threshold)
F, pvals = f_classif(df.values, y)
if correcting_multiple_hypotesis:
_, pvals, _, _ = multipletests(pvals, alpha=alpha, method=method)
df['labels'] = y
df_mean = df.groupby('labels').mean().T
df_mean['F'] = F
df_mean['pval'] = pvals
return df_mean.sort_values(sort_by, ascending=True)