import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import warnings
from sklearn.feature_selection import RFE, SelectKBest, chi2
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import statsmodels.api as sm
import scipy.stats as ss
import joblib
from mlxtend.feature_selection import SequentialFeatureSelector
from openTSNE import TSNE # formerly fastTSNE
from openTSNE.callbacks import ErrorLogger
from umap import UMAP

C:\Users\Rygu\Anaconda3\envs\i4061\lib\site-packages\sklearn\externals\joblib\__init__.py:15: DeprecationWarning: sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.
  warnings.warn(msg, category=DeprecationWarning)

RS = 404 # random_state
IS_AWS = False # conditional calls when running locally vs on Amazon Web Service

if not IS_AWS:
    with warnings.catch_warnings():
        # Ignore UserWarning sklearn 0.20.3 vs 0.21 
        warnings.simplefilter('ignore') 
        lbfgs_bwd = joblib.load('saves/lbfgs_backward.pkl')
        lbfgs_fwd = joblib.load('saves/lbfgs_forward.pkl')
        lbfgs_pfwd = joblib.load('saves/lbfgs_pforward.pkl')
        rfc_bwd = joblib.load('saves/rf_backward.pkl')
        rfc_fwd = joblib.load('saves/rf_forward.pkl')

coil2k_raw = pd.read_csv('data/caravan-insurance-challenge.csv')
coil = coil2k_raw.drop(columns=['ORIGIN']).copy()
X,y = coil.drop(columns='CARAVAN'), coil.CARAVAN

# plotting helper function
def scatter_density(data, labels, sca_title='', den_title='', **kwargs):
    """Plot a scatter plot and a density plot
    Args:
        data : 2-d array, shape (n_samples,2)
        labels: array-like, class labels to be used for coloring scatter plot
        sca_title: str, scatter plot title
        den_title: str, density plot title
        **kwargs: keyword arguments passed to seaborn.kdeplot
    
    Returns: 
        ax, matplotlib axis object
    """
    fig, ax = plt.subplots(1,2,figsize=(10,4),sharey=True,sharex=True)#,gridspec_kw={'width_ratios':[48,48,4]})
    
    dataneg = data[labels == 0]
    datapos = data[labels == 1]
    
    sns.scatterplot(data[:,0], data[:,1],hue=labels, ax=ax[0])
    #sns.scatterplot(dataneg[:,0], dataneg[:,1], palette='Blues', ax=ax[0],alpha=0.06)
    #sns.scatterplot(datapos[:,0], datapos[:,1], palette='Oranges', ax=ax[0],alpha=1)

    sns.kdeplot(datapos[:,0], datapos[:,1], ax=ax[1], cmap='Oranges',**kwargs) #,cbar=True,cbar_ax=ax[2])
    sns.kdeplot(dataneg[:,0], dataneg[:,1], ax=ax[1], cmap='Blues',n_levels=30,**kwargs,shade=True,shade_lowest=False)#,cbar=True,cbar_ax=ax[2])

    ax[0].set_title(sca_title)
    ax[1].set_title(den_title)
    
    fig.tight_layout()
    plt.show()
    return ax

Xs = pd.DataFrame(StandardScaler().fit_transform(X), columns=X.columns)

pca = PCA(random_state=RS)
Xpca=pca.fit_transform(Xs)

_Xpca_raw = PCA(n_components=2, random_state=RS).fit_transform(X)
scatter_density(_Xpca_raw, y, 'PCA Scatter Unscaled', 'PCA Density Unscaled');

scatter_density(Xpca, y, 'PCA Scaled: Scatter', 'PCA Scaled: Density');

pca.explained_variance_ratio_[:2]

array([0.11035515, 0.05773411])

plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.annotate('(64, 0.993)', xy=(64, 0.993), xytext=(64,0.8), fontsize='medium',arrowprops={'arrowstyle':'->','mutation_scale':15})
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.title('Explained variance')
plt.show()

tsne = TSNE(perplexity=75, learning_rate=500, n_iter=1000, metric='euclidean', 
            negative_gradient_method='bh', n_jobs=4, callbacks=ErrorLogger(), random_state=RS)
Xembd = tsne.fit(Xs)

Iteration   50, KL divergence  4.4702, 50 iterations in 2.7536 sec
Iteration  100, KL divergence  4.4649, 50 iterations in 2.7606 sec
Iteration  150, KL divergence  4.4645, 50 iterations in 2.9711 sec
Iteration  200, KL divergence  4.4643, 50 iterations in 3.2134 sec
Iteration  250, KL divergence  4.4642, 50 iterations in 3.0040 sec
Iteration   50, KL divergence  2.0479, 50 iterations in 2.6928 sec
Iteration  100, KL divergence  1.7084, 50 iterations in 2.7117 sec
Iteration  150, KL divergence  1.5606, 50 iterations in 2.7586 sec
Iteration  200, KL divergence  1.4822, 50 iterations in 2.7945 sec
Iteration  250, KL divergence  1.4382, 50 iterations in 2.8703 sec
Iteration  300, KL divergence  1.4125, 50 iterations in 2.9611 sec
Iteration  350, KL divergence  1.3957, 50 iterations in 2.8963 sec
Iteration  400, KL divergence  1.3847, 50 iterations in 3.0688 sec
Iteration  450, KL divergence  1.3764, 50 iterations in 2.9352 sec
Iteration  500, KL divergence  1.3705, 50 iterations in 2.9571 sec
Iteration  550, KL divergence  1.3651, 50 iterations in 3.0449 sec
Iteration  600, KL divergence  1.3614, 50 iterations in 3.2054 sec
Iteration  650, KL divergence  1.3579, 50 iterations in 3.0937 sec
Iteration  700, KL divergence  1.3553, 50 iterations in 2.9371 sec
Iteration  750, KL divergence  1.3523, 50 iterations in 3.1635 sec
Iteration  800, KL divergence  1.3501, 50 iterations in 3.0538 sec
Iteration  850, KL divergence  1.3485, 50 iterations in 2.9910 sec
Iteration  900, KL divergence  1.3471, 50 iterations in 2.8923 sec
Iteration  950, KL divergence  1.3454, 50 iterations in 2.9172 sec
Iteration  1000, KL divergence  1.3443, 50 iterations in 2.8793 sec

scatter_density(Xembd, y,'t-SNE scatter','t-SNE density');

ump = UMAP(n_neighbors=30, min_dist=0.2, random_state=RS, verbose=True)
Xumap = ump.fit_transform(Xs,y)

UMAP(a=None, angular_rp_forest=False, b=None, init='spectral',
     learning_rate=1.0, local_connectivity=1.0, metric='euclidean',
     metric_kwds=None, min_dist=0.2, n_components=2, n_epochs=None,
     n_neighbors=30, negative_sample_rate=5, random_state=404,
     repulsion_strength=1.0, set_op_mix_ratio=1.0, spread=1.0,
     target_metric='categorical', target_metric_kwds=None,
     target_n_neighbors=-1, target_weight=0.5, transform_queue_size=4.0,
     transform_seed=42, verbose=True)
Construct fuzzy simplicial set
Tue Jun 18 11:10:17 2019 Finding Nearest Neighbors
Tue Jun 18 11:10:17 2019 Building RP forest with 10 trees
Tue Jun 18 11:10:20 2019 NN descent for 13 iterations
	 0  /  13
	 1  /  13
	 2  /  13
	 3  /  13
	 4  /  13
Tue Jun 18 11:10:34 2019 Finished Nearest Neighbor Search
Tue Jun 18 11:10:39 2019 Construct embedding
	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Tue Jun 18 11:11:29 2019 Finished embedding

scatter_density(Xumap, y, 'UMAP: Scatter', 'UMAP: Density');

ump = UMAP(n_neighbors=30, min_dist=0.2, random_state=RS, verbose=False)
Xumap = ump.fit_transform(Xs)

scatter_density(Xumap, y, 'UMAP: Scatter', 'UMAP: Density');

# minor QoL tweaks to mlxtend.plotting.plot_sequential_feature_selection
def plot_sfs(selector, title='', kind='std_dev', color='blue', bcolor='steelblue', marker='o', 
             alpha=0.2, ylabel=None, confidence_interval=0.95, ax=None, **kwargs):
    
    """Plot feature selection results.

    Parameters
    ----------
    selector : a fitted mlxtend.SequentialFeatureSelector object
    title : str (default: '')
        plot title
    kind : str (default: "std_dev")
        The kind of error bar or confidence interval in
        {'std_dev', 'std_err', 'ci', None}.
    color : str (default: "blue")
        Color of the lineplot (accepts any matplotlib color name)
    bcolor : str (default: "steelblue").
        Color of the error bars / confidence intervals
        (accepts any matplotlib color name).
    marker : str (default: "o")
        Marker of the line plot
        (accepts any matplotlib marker name).
    alpha : float in [0, 1] (default: 0.2)
        Transparency of the error bars / confidence intervals.
    ylabel : str (default: None)
        Y-axis label, if not overriden, will be set to ``selector``.scoring
    confidence_interval : float (default: 0.95)
        Confidence level if `kind='ci'`.
    ax : matplotlib Axes, optional
        Axes object to draw the plot onto, otherwise uses the current Axes.  
    **kwargs : key, value mappings
        Other keyword arguments are passed through to ``matplotlib.pyplot.subplots``.

    Returns
    ----------
    fig, ax : matplotlib.pyplot subplot objects
        Figure and axis elements of the subplot.

    Examples
    -----------
    For usage examples, please see
    http://rasbt.github.io/mlxtend/user_guide/plotting/plot_sequential_feature_selection/

    """

    allowed = {'std_dev', 'std_err', 'ci', None}
    if kind not in allowed:
        raise AttributeError('kind not in %s' % allowed)

    fig, ax = (plt.gcf(), ax) if ax is not None else plt.subplots(**kwargs)
    
    metric_dict = selector.get_metric_dict()
    k_feat = sorted(metric_dict.keys())
    avg = [metric_dict[k]['avg_score'] for k in k_feat]

    if kind:
        upper, lower = [], []
        if kind == 'ci':
            kind = 'ci_bound'

        for k in k_feat:
            upper.append(metric_dict[k]['avg_score'] + metric_dict[k][kind])
            lower.append(metric_dict[k]['avg_score'] - metric_dict[k][kind])

        ax.fill_between(k_feat, upper, lower, alpha=alpha, color=bcolor, lw=1)

        if kind == 'ci_bound':
            kind = 'Confidence Interval (%d%%)' % (confidence_interval * 100)

    ax.plot(k_feat, avg, color=color, marker=marker)
    ax.set_ylabel(ylabel if ylabel else selector.scoring)
    ax.set_xlabel('Number of Features')
    ax.set_title(title)
    
    feature_min = len(metric_dict[k_feat[0]]['feature_idx'])
    feature_max = len(metric_dict[k_feat[-1]]['feature_idx'])
    ax.set_xticks(range(feature_min, feature_max + 1), range(feature_min, feature_max + 1))
    
    return ax

class StepSelect:
    """Perform a forward-backward feature selection based on p-value from statsmodels.api.OLS
    
    Args:
        X : pandas.DataFrame, containing candidate features
        y : list-like, target/response
        threshold_in : float, if p-value < threshold_in, include feature
        threshold_out : float, if p-value > threshold_out exclude feature
        verbose : boolean, if True print the sequence of inclusions and exclusions
    
    Returns:
        list : selected features     
    
    See also:
        https://en.wikipedia.org/wiki/Stepwise_regression
        https://datascience.stackexchange.com/a/24447
    """
    def __init__(self, X, y, threshold_in=0.01, threshold_out=0.05, verbose=True):
        self.X = X
        self.y = y 
        self.threshold_in = threshold_in
        self.threshold_out = threshold_out
        self.verbose = verbose
        assert self.threshold_in < threshold_out, "inclusive p-value should be smaller than exclusive"
        
        self._init_list = []
        self._dlist=[] # dropped values
        self._updated = False # sentinel value
        
    def __call__(self):
        return self._step(self.X, self.y, self._init_list)
        
    def _drop_list(self,add_drop=None):
        """Tracks dropped values"""
        if add_drop is not None:
            self._dlist.append(add_drop) 
        return self._dlist
    
    def _forward(self, included, excluded):
        # forward step
        new_pval = pd.Series(index=excluded)
        for new_col in excluded:
            model = sm.OLS(self.y, sm.add_constant(pd.DataFrame(self.X[included+[new_col]]))).fit()
            new_pval[new_col] = model.pvalues[new_col] # build Series - feat : pvalue

        best_pval = new_pval.min()
        if best_pval < self.threshold_in:
            feat_added = new_pval.idxmin() # name of feature with lowest p-value
            included.append(feat_added)
            self._updated=True
            if self.verbose:
                print('Add  {:30} with p-value {:.6}'.format(feat_added, best_pval))
        
        return included
    
    def _backward(self, included):
        # backward step
        model = sm.OLS(self.y, sm.add_constant(pd.DataFrame(self.X[included]))).fit()
        pvalues = model.pvalues.iloc[1:] 

        worst_pval = pvalues.max()
        if worst_pval > self.threshold_out:
            feat_removed = pvalues.idxmax() # name of feature with highest p-value
            self._drop_list(feat_removed) # add to drop list
            included.remove(feat_removed) # remove from inclusions
            self._updated = True
            if self.verbose:
                print('Drop {:30} with p-value {:.6}'.format(feat_removed, worst_pval))
                
        return included
    
    def _step(self, X, y, included):
        self._updated=False 
        included = list(included)
        excluded = list(set(X.columns)-set(included))

        included = self._forward(included, excluded)
        included = self._backward(included)

        # sanity check for infinite loop, if any value has been dropped 
        # more than twice the number of unique values, return included
        
        drop_vc = pd.Series(self._drop_list()).value_counts()
        is_inf_loop = (drop_vc > 2*len(drop_vc)).any()
        if self._updated and not is_inf_loop:
            return self._step(X, y, included) 
    
        return np.array(included)

with warnings.catch_warnings():
    warnings.simplefilter('ignore',FutureWarning) # Ignore FutureWarning numpy.ptp  
    selected_pval = StepSelect(Xs,y,threshold_in=0.01,threshold_out=0.05)()

Add  PPERSAUT                       with p-value 2.14684e-42
Add  MKOOPKLA                       with p-value 1.36739e-21
Add  PWAPART                        with p-value 3.66711e-15
Add  APLEZIER                       with p-value 8.20766e-15
Add  MOPLHOOG                       with p-value 4.25236e-06
Add  PBRAND                         with p-value 3.92829e-06
Add  MBERBOER                       with p-value 8.31838e-06
Add  MRELGE                         with p-value 1.41977e-05
Add  PWALAND                        with p-value 0.000361295
Add  ABRAND                         with p-value 0.000937601
Add  AZEILPL                        with p-value 0.00153041
Add  MINK123M                       with p-value 0.00152554
Add  PBYSTAND                       with p-value 0.00243579
Add  PGEZONG                        with p-value 0.00485648
Add  AGEZONG                        with p-value 0.00450709
Add  MHHUUR                         with p-value 0.00630075

lr_lbfgs = LogisticRegression(class_weight='balanced', solver='lbfgs', max_iter=1000)

%%time
lbfgs_fwd = SequentialFeatureSelector(lr_lbfgs, k_features="best", forward=True, verbose=0, scoring='roc_auc', n_jobs=-1)
lbfgs_fwd = lbfgs_fwd.fit(Xs, y, custom_feature_names=Xs.columns)

CPU times: user 21.2 s, sys: 1.85 s, total: 23.1 s
Wall time: 2min 58s

print('Forward Best, nfeat={} (AUC: {:.3f}):\n{}'.format(
    len(lbfgs_fwd.k_feature_names_), lbfgs_fwd.k_score_, lbfgs_fwd.k_feature_names_))

Forward Best, nfeat=29 (AUC: 0.755):
('MOSTYPE', 'MAANTHUI', 'MRELGE', 'MRELSA', 'MOPLHOOG', 'MOPLLAAG', 'MBERBOER', 'MBERARBG', 'MAUT1', 'MINKM30', 'MINK123M', 'MKOOPKLA', 'PWAPART', 'PWALAND', 'PPERSAUT', 'PVRAAUT', 'PWERKT', 'PGEZONG', 'PBRAND', 'PPLEZIER', 'PBYSTAND', 'AWAPART', 'AVRAAUT', 'AWERKT', 'ALEVEN', 'ABRAND', 'AZEILPL', 'APLEZIER', 'AFIETS')

plot_sfs(lbfgs_fwd, 'lbfgs : Forward best', figsize=(6,4));

pd.DataFrame(lbfgs_fwd.get_metric_dict()).T.query('avg_score > 0.75').head(10)

%%time
lbfgs_pfwd = SequentialFeatureSelector(lr_lbfgs, k_features="parsimonious", forward=True, verbose=0, scoring='roc_auc', n_jobs=-1)
lbfgs_pfwd = lbfgs_pfwd.fit(Xs, y, custom_feature_names=Xs.columns)

CPU times: user 21.9 s, sys: 1.39 s, total: 23.3 s
Wall time: 2min 45s

print('Forward Parsimonious, nfeat={} (AUC: {:.3f}):\n{}'.format(
    len(lbfgs_pfwd.k_feature_names_),lbfgs_pfwd.k_score_, lbfgs_pfwd.k_feature_names_))

Forward Parsimonious, nfeat=19 (AUC: 0.754):
('MOSTYPE', 'MAANTHUI', 'MRELSA', 'MOPLHOOG', 'MBERBOER', 'MAUT1', 'MINKM30', 'MINK123M', 'MKOOPKLA', 'PWAPART', 'PWALAND', 'PPERSAUT', 'PWERKT', 'PGEZONG', 'PBRAND', 'PBYSTAND', 'AWAPART', 'ABRAND', 'APLEZIER')

%%time
lbfgs_bwd = SequentialFeatureSelector(lr_lbfgs, k_features="best", forward=False, verbose=0, scoring='roc_auc', n_jobs=-1)
lbfgs_bwd = lbfgs_bwd.fit(Xs, y, custom_feature_names=Xs.columns)

CPU times: user 45.3 s, sys: 14.2 s, total: 59.5 s
Wall time: 8min 41s

print('Backward Best, nfeat={} (AUC: {:.3f}):\n{}'.format(
    len(lbfgs_bwd.k_feature_names_),lbfgs_bwd.k_score_, lbfgs_bwd.k_feature_names_))

Backward Best, nfeat=35 (AUC: 0.755):
('MOSTYPE', 'MAANTHUI', 'MGEMLEEF', 'MRELGE', 'MFGEKIND', 'MOPLHOOG', 'MOPLLAAG', 'MBERBOER', 'MBERMIDD', 'MSKB1', 'MSKB2', 'MAUT1', 'MAUT2', 'MAUT0', 'MZFONDS', 'MZPART', 'MINK3045', 'MINK123M', 'MINKGEM', 'MKOOPKLA', 'PWAPART', 'PPERSAUT', 'PWERKT', 'PGEZONG', 'PWAOREG', 'PBRAND', 'PZEILPL', 'PBYSTAND', 'AWAPART', 'AWALAND', 'AVRAAUT', 'ALEVEN', 'AWAOREG', 'ABRAND', 'APLEZIER')

plot_sfs(lbfgs_bwd,'lbfgs : Backward Best',ylabel='AUC',figsize=(6,4));

# Obtained from RandomizedSearchCV. Omitted: out of scope for analysis
rfparams = {
    'n_estimators': 891,
    'min_samples_split': 2,
    'min_samples_leaf': 4,
    'max_features': 'auto',
    'max_depth': 3,
    'bootstrap': False,
    'class_weight':'balanced',
    'random_state':RS
}

rfc = RandomForestClassifier(**rfparams)

%%time
rfc_fwd = SequentialFeatureSelector(rfc, scoring='roc_auc', verbose=0, k_features="best", forward=True, n_jobs=-1)
rfc_fwd = rfc_fwd.fit(X.values, y, custom_feature_names=X.columns)

print('Forward RF Best, nfeat={} (AUC: {:.3f}):\n{}'.format(
    len(rfc_fwd.k_feature_names_),rfc_fwd.k_score_, rfc_fwd.k_feature_names_))

Forward RF Best, nfeat=24 (AUC: 0.765):
('MGODRK', 'MGODPR', 'MGODOV', 'MRELGE', 'MRELSA', 'MOPLLAAG', 'MBERBOER', 'MSKB1', 'MINKM30', 'MINK123M', 'PWALAND', 'PPERSAUT', 'PMOTSCO', 'PWERKT', 'PBROM', 'PBRAND', 'PPLEZIER', 'PFIETS', 'AWALAND', 'AVRAAUT', 'ATRACTOR', 'AWERKT', 'ABROM', 'APLEZIER')
CPU times: user 22.6 s, sys: 892 ms, total: 23.5 s
Wall time: 1h 3min 46s

plot_sfs(rfc_fwd,'RF : Forward Best', figsize=(6,4));

%%time
rfc_bwd = SequentialFeatureSelector(rfc, scoring='roc_auc', verbose=0, k_features="best", forward=False, n_jobs=-1)
rfc_bwd = rfc_bwd.fit(X.values, y, custom_feature_names=X.columns)

print('Backward RF Best, nfeat={} (AUC: {:.3f}):\n{}'.format(
    len(rfc_bwd.k_feature_names_),rfc_bwd.k_score_, rfc_bwd.k_feature_names_))

Backward RF Best, nfeat=30 (AUC: 0.764):
('MGODPR', 'MGODOV', 'MRELGE', 'MRELSA', 'MOPLLAAG', 'MBERBOER', 'MBERMIDD', 'MINKM30', 'MINK3045', 'MINK123M', 'MKOOPKLA', 'PWALAND', 'PPERSAUT', 'PMOTSCO', 'PVRAAUT', 'PBROM', 'PLEVEN', 'PPERSONG', 'PBRAND', 'PPLEZIER', 'PBYSTAND', 'AWAPART', 'AWABEDR', 'AWALAND', 'ABESAUT', 'AAANHANG', 'AWERKT', 'ABROM', 'AWAOREG', 'APLEZIER')
CPU times: user 49.5 s, sys: 868 ms, total: 50.4 s
Wall time: 1h 28min 34s

if IS_AWS:
    joblib.dump(lbfgs_bwd,'saves/lbfgs_backward.pkl')
    joblib.dump(lbfgs_fwd,'saves/lbfgs_forward.pkl')
    joblib.dump(lbfgs_pfwd,'saves/lbfgs_pforward.pkl')
    joblib.dump(rfc_bwd,'saves/rf_backward.pkl')
    joblib.dump(rfc_fwd,'saves/rf_forward.pkl')

rfe_lr = RFE(LogisticRegression(solver='lbfgs',class_weight='balanced',random_state=RS,max_iter=1000)).fit(Xs,y)
print('Num features:',rfe_lr.n_features_)
feats_lr = X.columns[rfe_lr.support_];feats_lr

Num features: 42

Index(['MOSTYPE', 'MGEMLEEF', 'MOSHOOFD', 'MRELGE', 'MRELOV', 'MFALLEEN',
       'MFGEKIND', 'MFWEKIND', 'MOPLLAAG', 'MBERBOER', 'MHHUUR', 'MHKOOP',
       'MAUT1', 'MAUT2', 'MAUT0', 'MZFONDS', 'MZPART', 'MINKM30', 'MINK3045',
       'MINK4575', 'MINK123M', 'MINKGEM', 'MKOOPKLA', 'PWAPART', 'PWALAND',
       'PPERSAUT', 'PBESAUT', 'PVRAAUT', 'PTRACTOR', 'PWERKT', 'PPERSONG',
       'PWAOREG', 'PBRAND', 'AWAPART', 'ABESAUT', 'AVRAAUT', 'ATRACTOR',
       'AWERKT', 'APERSONG', 'AWAOREG', 'ABRAND', 'APLEZIER'],
      dtype='object')

rfe_rf = RFE(RandomForestClassifier(n_jobs=-1, **rfparams)).fit(X,y)
print('Num features:',rfe_rf.n_features_)
feats_rf = X.columns[rfe_rf.support_];feats_rf

Num features: 42

Index(['MOSTYPE', 'MGEMOMV', 'MOSHOOFD', 'MGODPR', 'MGODOV', 'MRELGE',
       'MRELSA', 'MRELOV', 'MFALLEEN', 'MFWEKIND', 'MOPLHOOG', 'MOPLLAAG',
       'MBERHOOG', 'MBERBOER', 'MBERMIDD', 'MBERARBG', 'MBERARBO', 'MSKA',
       'MSKC', 'MHHUUR', 'MHKOOP', 'MAUT1', 'MAUT0', 'MZFONDS', 'MZPART',
       'MINKM30', 'MINK3045', 'MINK4575', 'MINKGEM', 'MKOOPKLA', 'PWAPART',
       'PWALAND', 'PPERSAUT', 'PBROM', 'PBRAND', 'PPLEZIER', 'AWAPART',
       'AWALAND', 'APERSAUT', 'ABROM', 'ABRAND', 'APLEZIER'],
      dtype='object')

print('Common Features:',len(feats_lr & feats_rf))
feats_lr & feats_rf

Common Features: 26

Index(['MOSTYPE', 'MOSHOOFD', 'MRELGE', 'MRELOV', 'MFALLEEN', 'MFWEKIND',
       'MOPLLAAG', 'MBERBOER', 'MHHUUR', 'MHKOOP', 'MAUT1', 'MAUT0', 'MZFONDS',
       'MZPART', 'MINKM30', 'MINK3045', 'MINK4575', 'MINKGEM', 'MKOOPKLA',
       'PWAPART', 'PWALAND', 'PPERSAUT', 'PBRAND', 'AWAPART', 'ABRAND',
       'APLEZIER'],
      dtype='object')

kb_chi = SelectKBest(score_func=chi2, k='all').fit(X,y)

rfc = RandomForestClassifier(n_jobs=-1, **rfparams).fit(X,y)

fi = pd.DataFrame({'feature':X.columns,'importance':rfc.feature_importances_,'pvalue':kb_chi.pvalues_}).sort_values('importance', ascending=False)

fi.head()

fi.importance[:3].sum()

0.4216625787235057

sns.catplot(x='feature',y='importance',data=fi[:42],kind='bar',aspect=1.5).set_xticklabels(rotation=90);

sns.relplot('feature','importance',data=fi[:42],hue='pvalue', palette='coolwarm',
            aspect=1.75, hue_norm=(0,0.05)).set_xticklabels(rotation=90);

g = sns.lineplot(data=np.cumsum(fi.importance).values)
g.set_xlabel('Num Features')
g.set_ylabel('Cumulative Importance')
g.set_title('Random Forest Feature Importance');

seqsels = [lbfgs_bwd,lbfgs_fwd, lbfgs_pfwd, rfc_bwd,rfc_fwd]
selfeats = [np.array(s.k_feature_names_) for s in seqsels]
allfeats = [x.astype('object') for x in [*selfeats,selected_pval,feats_lr.values,feats_rf.values]]
feats_ser = pd.Series([i for s in allfeats for i in s if i])

feats_ser.value_counts().head(10)

APLEZIER    8
PPERSAUT    8
PBRAND      8
MBERBOER    8
PWALAND     7
MRELGE      7
MKOOPKLA    7
MINK123M    7
MINKM30     6
PWAPART     6
dtype: int64

print(len(feats_ser.value_counts()[feats_ser.value_counts() == 1])) # only once occurrance
X.columns ^ feats_ser.unique()

18

Index(['ABYSTAND', 'AINBOED', 'AMOTSCO', 'MBERZELF', 'MGODGE', 'MINK7512',
       'MOPLMIDD', 'MSKD', 'PAANHANG', 'PINBOED', 'PWABEDR'],
      dtype='object')

	avg_score	ci_bound	cv_scores	feature_idx	feature_names	std_dev	std_err
14	0.750819	0.011137	[0.750527368112114, 0.764957264957265, 0.75303...	(10, 15, 20, 31, 36, 40, 42, 43, 45, 46, 58, 6...	(MRELSA, MOPLHOOG, MBERBOER, MAUT1, MINKM30, M...	0.00866497	0.00433248
15	0.75156	0.0112257	[0.751123523369286, 0.7660030819207865, 0.7539...	(10, 15, 20, 31, 36, 40, 42, 43, 45, 46, 52, 5...	(MRELSA, MOPLHOOG, MBERBOER, MAUT1, MINKM30, M...	0.00873395	0.00436697
16	0.752097	0.0116983	[0.7516646489104116, 0.7670743501820925, 0.755...	(1, 10, 15, 20, 31, 36, 40, 42, 43, 45, 46, 52...	(MAANTHUI, MRELSA, MOPLHOOG, MBERBOER, MAUT1, ...	0.0091017	0.00455085
17	0.752578	0.0113802	[0.7512610976594027, 0.7675394148052513, 0.754...	(1, 10, 15, 20, 31, 36, 40, 42, 43, 45, 46, 52...	(MAANTHUI, MRELSA, MOPLHOOG, MBERBOER, MAUT1, ...	0.00885417	0.00442709
18	0.753024	0.011405	[0.7458246202949592, 0.7683237775278923, 0.755...	(0, 1, 10, 15, 20, 31, 36, 40, 42, 43, 45, 46,...	(MOSTYPE, MAANTHUI, MRELSA, MOPLHOOG, MBERBOER...	0.00887347	0.00443674
19	0.75353	0.0119217	[0.7440361545234426, 0.7693927320348544, 0.755...	(0, 1, 10, 15, 20, 31, 36, 40, 42, 43, 45, 46,...	(MOSTYPE, MAANTHUI, MRELSA, MOPLHOOG, MBERBOER...	0.00927553	0.00463776
20	0.753897	0.0120354	[0.744311303103676, 0.7697444226951536, 0.7565...	(0, 1, 10, 15, 20, 31, 36, 40, 42, 43, 45, 46,...	(MOSTYPE, MAANTHUI, MRELSA, MOPLHOOG, MBERBOER...	0.00936396	0.00468198
21	0.754208	0.0131567	[0.7456732885758309, 0.7710378113734908, 0.757...	(0, 1, 10, 15, 17, 20, 31, 36, 40, 42, 43, 45,...	(MOSTYPE, MAANTHUI, MRELSA, MOPLHOOG, MOPLLAAG...	0.0102364	0.00511818
22	0.754406	0.0124859	[0.7452926663731749, 0.7708064359390836, 0.757...	(0, 1, 10, 15, 17, 20, 31, 36, 40, 42, 43, 45,...	(MOSTYPE, MAANTHUI, MRELSA, MOPLHOOG, MOPLLAAG...	0.00971448	0.00485724
23	0.754573	0.0135995	[0.7465835717954362, 0.7722802974562585, 0.758...	(0, 1, 10, 15, 17, 20, 31, 36, 40, 42, 43, 45,...	(MOSTYPE, MAANTHUI, MRELSA, MOPLHOOG, MOPLLAAG...	0.0105809	0.00529043

Dimensionality Reduction and Feature Analysis

Overview¶

Imports¶

Data¶

Exploratory Data Analysis¶

Dimensionality Reduction¶

PCA (Principal component analysis)¶

t-SNE (T-distributed Stochastic Neighbor Embedding)¶

UMAP (Uniform Manifold Approximation and Projection)¶

Models¶

Stepwise feature selection¶

p-value criterion¶

Stepwise selection ROC AUC¶

LBFGS Forward Best¶

LBFGS Forward parsimonious¶

LBFGS backward best¶

Random Forest forward best¶

Random Forest backward best¶

Recursive Feature Elimination¶

Feature Importance¶

Conclusions¶

Future work:¶

References¶

Code & Docs¶

Text¶

Other¶

	feature	importance	pvalue
46	PPERSAUT	0.189921	7.450789e-118
67	APERSAUT	0.135713	1.284432e-24
58	PBRAND	0.096029	4.055311e-40
43	PWAPART	0.067278	9.737212e-27
42	MKOOPKLA	0.052901	2.037681e-21