# Source code for sklearn_utils.utils.data_utils

from collections import defaultdict

import pandas as pd
from statsmodels.sandbox.stats.multicomp import multipletests
from sklearn.feature_selection import VarianceThreshold, f_classif


def filter_by_label(X, y, ref_label, reverse=False):
    '''
    Select the items of a dataset whose label matches (or, when reversed,
    does not match) a reference label.

    :param X: dataset; iterated in lockstep with ``y``
    :param y: labels, one per item of ``X``
    :param ref_label: reference label
    :param bool reverse: if false keeps the items labelled ``ref_label``,
        otherwise keeps every other item
    :return: two-element list ``[items, labels]`` holding the kept items and
        their labels as tuples; both tuples are empty when nothing is kept
    :raises ValueError: if ``ref_label`` does not occur in ``y``
    '''
    check_reference_label(y, ref_label)

    keep_matching = not reverse
    kept = [(item, label) for item, label in zip(X, y)
            if keep_matching == (label == ref_label)]

    if not kept:
        # zip(*[]) collapses to an empty list; keep the two-slot shape so
        # callers can still unpack or index the result (this happens with
        # reverse=True when every label equals ref_label).
        return [(), ()]
    return list(zip(*kept))


def average_by_label(X, y, ref_label):
    '''
    Calculate the average dictionary from a list of dictionaries for the
    given label.

    :param List[Dict] X: dataset
    :param list y: labels
    :param ref_label: reference label
    :return: ``defaultdict(float)`` mapping each key to its mean over the
        items labelled ``ref_label``; keys missing from the result read
        as ``0.0``
    '''
    selected_items = filter_by_label(X, y, ref_label)[0]
    averages = pd.DataFrame.from_records(selected_items).mean().to_dict()
    # TODO: consider to delete defaultdict
    return defaultdict(float, averages)


def map_dict(d, key_func=None, value_func=None, if_func=None):
    '''
    Build a new dictionary by transforming and filtering the items of ``d``.

    Every callback is called with two positional arguments: the key and the
    value of the item being processed.

    :param dict d: dictionary
    :param func key_func: returns the new key; keeps the key when omitted.
    :param func value_func: returns the new value; keeps the value when
        omitted.
    :param func if_func: the item is kept only when this returns a truthy
        value; keeps every item when omitted.
    :return: new dictionary with the mapped keys and values
    '''
    key_func = key_func or (lambda k, v: k)
    value_func = value_func or (lambda k, v: v)
    if_func = if_func or (lambda k, v: True)
    return {
        key_func(key, value): value_func(key, value)
        for key, value in d.items()
        if if_func(key, value)
    }


def map_dict_list(ds, key_func=None, value_func=None, if_func=None):
    '''
    Apply :func:`map_dict` to every dictionary of a list.

    Every callback is called with two positional arguments: the key and the
    value of the item being processed.

    :param List[Dict] ds: list of dict
    :param func key_func: returns the new key; keeps the key when omitted.
    :param func value_func: returns the new value; keeps the value when
        omitted.
    :param func if_func: the item is kept only when this returns a truthy
        value; keeps every item when omitted.
    :return: list with one mapped dictionary per element of ``ds``, in the
        same order
    '''
    return [
        map_dict(d, key_func=key_func, value_func=value_func, if_func=if_func)
        for d in ds
    ]


def check_reference_label(y, ref_label):
    '''
    Ensure that the reference label occurs among the labels.

    :param list y: label
    :param ref_label: reference label
    :raises ValueError: if ``ref_label`` is not one of the labels in ``y``
    '''
    present_labels = set(y)
    if ref_label in present_labels:
        return

    message = ('There is not reference label in dataset. '
               "Reference label: '%s' "
               'Labels in dataset: %s') % (ref_label, present_labels)
    raise ValueError(message)


def variance_threshold_on_df(df: pd.DataFrame, threshold=0):
    '''
    Keep only the columns whose variance is above a threshold.

    :param pd.DataFrame df: frame whose columns are the features
    :param threshold: columns with a variance not greater than this value
        are dropped
    :return: ``df`` restricted to the columns that pass the threshold
    '''
    selector = VarianceThreshold(threshold)
    selector.fit(df.values)
    keep_mask = selector.variances_ > threshold
    return df.iloc[:, keep_mask]


def feature_importance_report(X,
                              y,
                              threshold=0.001,
                              correcting_multiple_hypotesis=True,
                              method='fdr_bh',
                              alpha=0.1,
                              sort_by='pval'):
    '''
    Provide significance for features in dataset with anova using multiple
    hypothesis testing

    :param X: List of dict with key as feature names and values as features
    :param y: Labels, one per item of ``X``
    :param threshold: Low-variance threshold to eliminate low variance features
    :param correcting_multiple_hypotesis: corrects p-val with multiple
        hypothesis testing
    :param method: method of multiple hypothesis testing
    :param alpha: alpha of multiple hypothesis testing
    :param sort_by: column the output dataframe is sorted by in ascending
        order, ``'pval'`` or ``'F'``
    :return: DataFrame with one row per retained feature, holding its
        average value for each label plus the ``F`` and ``pval`` columns
    '''
    df = variance_threshold_on_df(
        pd.DataFrame.from_records(X), threshold=threshold)

    F, pvals = f_classif(df.values, y)

    if correcting_multiple_hypotesis:
        _, pvals, _, _ = multipletests(pvals, alpha=alpha, method=method)

    # Group on a separate, positionally aligned label series instead of
    # writing a 'labels' column into the sliced frame: this avoids
    # SettingWithCopyWarning, does not clobber a feature that happens to be
    # called 'labels', and matches the positional use of y in f_classif even
    # when y is a pandas Series with its own index.
    labels = pd.Series(list(y), index=df.index, name='labels')
    df_mean = df.groupby(labels).mean().T

    df_mean['F'] = F
    df_mean['pval'] = pvals

    return df_mean.sort_values(sort_by, ascending=True)