Exploring feature selection strategies to monitor model performance

In this tutorial we will employ feature selection strategies in an existing feature engineering pipeline and try to understand its impact on classification performance of different machine learning models.

Streamline model fit and validation

In order to compare the performance of different models, let's streamline the transformation, fit and prediction steps.

In [11]:
# Our training and validation features and target labels were prepared before:
import pickle
with open("X_val1.pkl","rb") as f:
    X_train = pickle.load(f)   
with open("y_val1.pkl","rb") as f:
    y_train = pickle.load(f)
        
with open("X_val2.pkl","rb") as f:
    X_val = pickle.load(f)       
with open("y_val2.pkl","rb") as f:
    y_val = pickle.load(f)    
In [114]:
# Write a function to streamline classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from datetime import datetime
import warnings
import pandas as pd
warnings.filterwarnings("ignore")


def calculate_validation(X_val_trans,y_val,clf):
    probs = clf.predict_proba(X_val_trans)[:,1]
    return roc_auc_score(y_val,probs)


def streamline_classifiers(fpipeline,nfeatures,X_train,y_train,X_val,y_val):
    """ We will monitor performance of different default classifiers."""
    print(">>> streamline_classifiers: started exploring model performance using " + str(nfeatures) + " features.")
    # Transform the training and test set using the current pipeline
    start = datetime.now()
    X_train_trans = fpipeline.fit(X_train,y_train).transform(X_train)
    process = datetime.now() - start
    print(" >>> streamline_classifiers: fitted and transformed X_train. It took: " + str(process.seconds/60) + " minutes.")
    print("Shape of X_train is " + str(X_train_trans.shape))
    start = datetime.now()
    X_val_trans = fpipeline.transform(X_val)
    process = datetime.now() - start
    print(" >>> streamline_classifiers: transformed X_val. It took: " + str(process.seconds/60) + " minutes.")
    print("Shape of X_val is " + str(X_val_trans.shape))
    #######################################################################
    # Train different classifiers report and calculate the validation score
    #######################################################################
    # LogisticRegression
    clf_dict = {
        "LogisticRegression": LogisticRegression(),
        "MultinomialNB":MultinomialNB(),
        "AdaBoostClassifier":AdaBoostClassifier()
    }
    
    # Container to collect current scores
    scores = pd.DataFrame()
    scores["nfeatures"] = pd.Series(nfeatures)
    
    for key,value in clf_dict.items():
        start = datetime.now()
        
        clf = value
        clf.fit(X_train_trans.toarray(),y_train)
        val_score =calculate_validation(X_val_trans.toarray(),y_val,clf)
        scores[key] = pd.Series(val_score)
        
        process = datetime.now() - start
        print(" >>> Completed " +str(key) + " classifier with an roc score of "+str(val_score) + " it took " +str(process.seconds/60)+ " minutes." )
        print("*" * 80)
    # Return a data frame of validation roc scores for a given number of features
    return scores
In [13]:
# Label text features specific for this data set
Text_features = ["app","device","os","channel"]

##############################################################
# Define utility function to parse and process text features
##############################################################
# Note we avoid lambda functions since they don't pickle when we want to save the pipeline later   
def column_text_processer_nolambda(df,text_columns = Text_features):
    import pandas as pd
    import numpy as np
    """"A function that will merge/join all text in a given row to make it ready for tokenization. 
    - This function should take care of converting missing values to empty strings. 
    - It should also convert the text to lowercase.
    df= pandas dataframe
    text_columns = names of the text features in df
    """ 
    # Select only non-text columns that are in the df
    text_data = df[text_columns]
    
    # Fill the missing values in text_data using empty strings
    text_data.fillna("",inplace=True)
    
    # Concatenate feature name to each category encoding for each row
    # E.g: encoding 3 at device column will read as device3 to make each encoding unique for a given feature
    for col_index in list(text_data.columns):
        text_data[col_index] = col_index + text_data[col_index].astype(str)
    
    # Join all the strings in a given row to make a vector
    # text_vector = text_data.apply(lambda x: " ".join(x), axis = 1)
    text_vector = []
    for index,rows in text_data.iterrows():
        text_item = " ".join(rows).lower()
        text_vector.append(text_item)

    # return text_vector as pd.Series object to enter the tokenization pipeline
    return pd.Series(text_vector)

####################################################################################
# Define custom processing functions to add the log_total_clicks and 
# log_total_click_time features, and remove the unwanted base features
# These are original features in the data set that represent different time points
####################################################################################
def column_time_processer(X_train):
    import pandas as pd
    import numpy as np

    # Convert click_time to datetime64 dtype 
    X_train.click_time = pd.to_datetime(X_train.click_time)

    # Calculate the log_total_clicks for each ip and add as a new feature to temp_data
    temp_data = pd.DataFrame(np.log(X_train.groupby(["ip"]).size()),
                                    columns = ["log_total_clicks"]).reset_index()


    # Calculate the log_total_click_time for each ip and add as a new feature to temp_data
    # First define a function to process selected ip group 
    def get_log_total_click_time(group):
        diff = (max(group.click_time) - min(group.click_time)).seconds
        return np.log(diff+1)

    # Then apply this function to each ip group and extract the total click time per ip group
    log_time_frame = pd.DataFrame(X_train.groupby(["ip"]).apply(get_log_total_click_time),
                                  columns=["log_total_click_time"]).reset_index()

    # Then add this new feature to the temp_data
    temp_data = pd.merge(temp_data,log_time_frame, how = "left",on = "ip")

    # Combine temp_data with X_train to maintain X_train key order
    temp_data = pd.merge(X_train,temp_data,how = "left",on = "ip")

    # Drop features that are not needed
    temp_data = temp_data[["log_total_clicks","log_total_click_time"]]

    # Return only the numeric features as a tensor to integrate into the numeric feature branch of the pipeline
    return temp_data


###############################################################################################
# We need to wrap these custom utility functions using FunctionTransformer
from sklearn.preprocessing import FunctionTransformer
# FunctionTransformer wrapper of utility functions to parse text and numeric features
# Note how we avoid putting any arguments into column_text_processer or column_time_processer
###############################################################################################
get_numeric_data = FunctionTransformer(func = column_time_processer, validate=False) 
get_text_data = FunctionTransformer(func = column_text_processer_nolambda,validate=False) 

#############################################################################
# Create the token pattern: TOKENS_ALPHANUMERIC
# #Note this regex will match either a whitespace or a punctuation to tokenize 
# the string vector on these preferences, in our case we only have white spaces in our text  
#############################################################################
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'  
In [ ]:
######################################################
# Construct our feature extraction selection pipeline
######################################################
import warnings
warnings.filterwarnings("ignore")
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.preprocessing import MaxAbsScaler, Imputer
from sklearn.feature_selection import SelectKBest,VarianceThreshold, chi2,SelectFromModel
from SparseInteractions import * #Load SparseInteractions (from : https://github.com/drivendataorg/box-plots-sklearn/blob/master/src/features/SparseInteractions.py) as a module
from sklearn.svm import LinearSVC

nfeatures_list = [10,20,50,100,200,500,1000,2000,5000]
results = pd.DataFrame(columns=['nfeatures', 'LogisticRegression', 'MultinomialNB', 'AdaBoostClassifier'])

for nfeatures in nfeatures_list:
  
    featureselection_pipeline = Pipeline([
            ("union",FeatureUnion(
                # Note that FeatureUnion() also accepts list of tuples, the first half of each tuple 
                # is the name of the transformer within the FeatureUnion

                transformer_list = [

                    ("numeric_subpipeline",Pipeline([        # Note we have subpipeline branches inside the main pipeline
                        ("parser",get_numeric_data), # Step1: parse the numeric data (note how we avoid () when using FunctionTransformer objects)
                        ("imputer",Imputer()) # Step2: impute any missing data using default (mean), note we don't expect missing values in this case. 
                    ])), # End of: numeric_subpipeline

                    ("text_subpipeline",Pipeline([
                        ("parser",get_text_data), # Step1: parse the text data 
                        ("tokenizer",HashingVectorizer(token_pattern= TOKENS_ALPHANUMERIC, # Step2: use HashingVectorizer for automated tokenization and feature extraction
                                                     ngram_range = (1,1),
                                                     non_negative=True, 
                                                     norm=None, binary=True )), # Note here we use binary=True since our hack is to use tokenization to generate dummy variables  
                        ('dim_red', SelectKBest(k=500)) # Step3: use dimension reduction to select best features 
                    ]))
                ]

            )),# End of step: union, this is the fusion point to main pipeline, all features are numeric at this stage

            # Common steps:

            ("int", SparseInteractions(degree=2)), # Add polynomial interaction terms up to the second degree polynomial
            ("scaler",MaxAbsScaler()), # Scale the features between 0 and 1.       
            ('dim_red2', SelectKBest(k = nfeatures))
        ])# End of: featureselection_pipeline

    results = pd.concat([results, streamline_classifiers(fpipeline=featureselection_pipeline,
                       nfeatures = nfeatures,X_train = X_train,y_train = y_train,
                       X_val = X_val,y_val = y_val)], axis = 0)
    print("--" * 50)
In [71]:
results
Out[71]:
nfeatures LogisticRegression MultinomialNB AdaBoostClassifier
0 10 0.741096 0.477099 0.741077
0 20 0.870281 0.869970 0.868690
0 50 0.885522 0.880458 0.882335
0 100 0.892628 0.884411 0.889322
0 200 0.899964 0.891626 0.900375
0 500 0.904338 0.894316 0.888938
0 1000 0.942774 0.922506 0.913993
0 2000 0.951608 0.943533 0.931726
0 5000 0.951570 0.942507 0.931726
In [98]:
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [20, 10]
plt.plot('nfeatures','LogisticRegression', data=results, label = "LogisticRegression", marker = 'o')
plt.plot('nfeatures','MultinomialNB', data=results, label = "MultinomialNB", marker = 'o')
plt.plot('nfeatures','AdaBoostClassifier', data=results, label = "AdaBoostClassifier", marker = 'o')
plt.xlabel("Number of features",fontsize=20)
plt.ylabel("Validation ROC score",fontsize=20)
plt.title("Impact of number of features on model performance",fontsize=25)
plt.legend(fontsize=20)
plt.show()

It looks like adding more features beyond 2000 features have minimum impact on model performance. For this data set, it is seems to keep only the 100 best features before hyperparameter optimization.