# Differential Evolution for Hyperparameter Tuning

In [1]:
import numpy as np
import pandas as pd
from data_preprocessing import DataProcessor

### Loading the data and bringing it into the right format

In [2]:
# Read the train and test CSV files from the processed-data directory
# (paths are relative to the notebook's working directory).
train_df = pd.read_csv('../data/processed/train_processed.csv')
test_df = pd.read_csv('../data/processed/test_processed.csv')

In [3]:
# Single processor instance: fitted on the training frame and then reused,
# without refitting, on the test frame.
p = DataProcessor()

In [4]:
# Fit the processor on the training frame and transform it in one step.
# The call returns two objects: the feature matrix used for modelling and a
# second frame whose 'Priority' column is used later as the target.
# NOTE(review): presumably `train_original` is the input frame after NaN rows
# were dropped (see the log output) - confirm in DataProcessor.
train_processed, train_original = p.fit_transform(train_df)

removed nan values
count vectorizer finished fitting
count vector finished transforming
dummy variables created
created a sparse matrix of all features


In [5]:
# Show the repr of the training feature matrix (shape and stored elements).
train_processed

<83041x15940 sparse matrix of type '<type 'numpy.float64'>'
	with 3359482 stored elements in Compressed Sparse Row format>

In [6]:
# Apply the processor fitted on the training data to the test frame
# (transform only, no refit).
# NOTE(review): `test_orignal` is a misspelling of "original"; the name is
# kept because the test-prediction cell further down reads it.
test_processed, test_orignal = p.transform(test_df)

count vector finished transforming
dummy variables created
created a sparse matrix of all features


In [7]:
# Show the repr of the test feature matrix; its column count should match
# the training matrix.
test_processed

<20761x15940 sparse matrix of type '<type 'numpy.float64'>'
	with 837223 stored elements in Compressed Sparse Row format>

### ML model building

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, make_scorer, precision_score, recall_score
from sklearn.model_selection import cross_val_score
# Differential evolution optimizer
from scipy.optimize import differential_evolution

In [11]:
# Extra positional arguments forwarded to the objective:
# (feature matrix, target labels).
args = (train_processed,train_original['Priority'])
# One (min, max) pair per searched hyperparameter, in the order the objective
# indexes them. max_depth is NOT searched: the objective fixes it to None.
bounds = [ (5,50),(2,15),(1,10) ] # order: n_estimators,min_samples_split,min_samples_leaf

In [14]:
def func(parameters, *args):
    """Objective for ``differential_evolution``: negated mean macro recall.

    Parameters
    ----------
    parameters : sequence of float
        Candidate vector in the order ``(n_estimators, min_samples_split,
        min_samples_leaf)``. Every entry is truncated with ``int`` before it
        is handed to the classifier.
    *args
        ``args[0]`` is the feature matrix and ``args[1]`` the target labels;
        both are passed unchanged to ``cross_val_score``.

    Returns
    -------
    float
        ``-1 * mean(recall)`` over the 3 cross-validation folds. The score is
        negated because the optimizer minimizes its objective.
    """
    # Build a real list (not ``map``): on Python 3 ``map`` returns a lazy
    # iterator that cannot be indexed below.
    parameters = [int(value) for value in parameters]
    clf = RandomForestClassifier(
        n_jobs=-1,
        n_estimators=parameters[0],
        max_depth=None,
        min_samples_split=parameters[1],
        min_samples_leaf=parameters[2],
        class_weight="balanced"
    )
    recallscorer_macro = make_scorer(recall_score, average='macro')
    recall = cross_val_score(clf, args[0], args[1], scoring=recallscorer_macro, cv=3, n_jobs=-1)
    average_recall = np.mean(recall)
    # One pre-formatted string prints the same text whether ``print`` is the
    # Python 2 statement or the Python 3 function.
    print('%s %s' % (average_recall, parameters))
    return -1*average_recall

In [15]:
%time result = differential_evolution(func, bounds, args, strategy='rand2bin', popsize=30, mutation=(0.5,1.9), recombination=0.7, maxiter=3)

0.586174511116 [12, 9, 2]
0.592268401148 [48, 11, 8]
0.604849403166 [40, 8, 2]
0.406819745194 [11, 5, 1]
0.587057767765 [13, 4, 6]
0.587738102776 [34, 2, 8]
0.608667982552 [47, 9, 4]
0.596172660203 [24, 6, 2]
0.58790538734 [12, 10, 6]
0.600938434489 [45, 7, 3]
0.603639650213 [44, 7, 2]
0.594813895945 [36, 6, 7]
0.595260532799 [22, 7, 3]
0.604716274658 [43, 14, 3]
0.601931093989 [18, 12, 2]
0.596323174679 [46, 14, 7]
0.60200202233 [36, 2, 6]
0.423265867933 [18, 6, 1]
0.574271227327 [8, 10, 7]
0.603866072532 [35, 2, 5]
0.575866919591 [14, 7, 9]
0.598685503378 [23, 9, 5]
0.582947855518 [9, 10, 2]
0.599826130071 [26, 3, 5]
0.583482776272 [8, 14, 3]
0.595438285491 [15, 2, 4]
0.596621354805 [49, 10, 7]
0.599901747733 [25, 2, 4]
0.598764728773 [35, 11, 5]
0.600859656409 [19, 14, 2]
0.484420938896 [25, 12, 1]
0.603337838609 [39, 3, 4]
0.592268401148 [48, 9, 8]
0.594266860998 [14, 4, 4]
0.601165301264 [30, 14, 4]
0.599268603049 [29, 13, 5]
0.592877958423 [28, 8, 7]
0.595903897658 [21, 11, 4]
0.

0.588705083768 [38, 2, 9]
0.601406740829 [38, 4, 3]
0.5845792116 [21, 3, 7]
0.49421952323 [44, 14, 1]
0.59817495598 [45, 12, 7]
0.598506105197 [23, 2, 2]
0.604207165258 [44, 11, 2]
0.586375435247 [16, 10, 5]
0.601573968926 [48, 9, 5]
0.593158405207 [45, 2, 9]
0.593008097633 [47, 13, 9]
0.591287925788 [19, 4, 8]
0.589066819703 [19, 3, 7]
0.592446953851 [43, 10, 7]
0.574456357081 [9, 4, 7]
0.589876755436 [30, 8, 8]
0.59808736041 [40, 14, 6]
0.604011305506 [49, 12, 3]
0.459747765938 [21, 10, 1]
0.592673328675 [32, 12, 7]
0.601122502844 [47, 7, 2]
0.598229366952 [37, 4, 5]
0.601676499765 [25, 12, 3]
0.593915834456 [24, 3, 5]
0.593161221704 [19, 3, 6]
0.594236728714 [38, 5, 8]
0.585957271469 [21, 7, 9]
0.594077059331 [47, 11, 8]
0.596197584922 [25, 7, 2]
0.595643442748 [43, 3, 8]
0.581141258358 [8, 13, 6]
0.598772021306 [45, 5, 6]
0.603695265613 [43, 11, 2]
0.585168829079 [15, 13, 5]
0.590084893196 [48, 9, 9]
0.60407305448 [42, 2, 2]
0.56649387902 [9, 2, 8]
0.57682797891 [16, 3, 8]
0.599011

In [12]:
# Best parameter vector found by the optimizer (raw floats, before the
# int truncation applied in the next cell).
result.x

array([ 46.35332601,  13.24744516,   1.09649762])

In [13]:
# Truncate the optimizer's float vector to the integer hyperparameters.
# A list comprehension (not ``map``) keeps `parameters` indexable on Python 3,
# where ``map`` returns a lazy iterator.
parameters = [int(value) for value in result.x]
# Pre-formatted string: same output as a Python 2 statement or a Python 3 call.
print('Tuned Parameters: %s' % (parameters,))

Tuned Parameters: [46, 13, 1]


In [15]:
clf = RandomForestClassifier(
        n_jobs=-1, 
        n_estimators=parameters[0], 
        max_depth=None,
        min_samples_split=parameters[1],
        min_samples_leaf=parameters[2],
        class_weight='balanced'
    )
f1scorer_macro = make_scorer(f1_score, average='macro')
f1 = cross_val_score(clf, train_processed, train_original['Priority'], scoring=f1scorer_macro, cv=10, n_jobs=-1)
average_f1 = np.mean(f1)
print 'Cross validation metrics:',average_f1, np.std(f1)
%time clf.fit(train_processed, train_original['Priority'])

Cross validation metrics: 0.506047850816 0.00815623805465
CPU times: user 1min, sys: 259 ms, total: 1min 1s
Wall time: 8.5 s


RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=13,
            min_weight_fraction_leaf=0.0, n_estimators=46, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [17]:
# Predict on the held-out test matrix with the classifier fitted above.
test_preds = clf.predict(test_processed)
# average=None yields one F1 value per class; they are averaged in the next cell.
score = f1_score(test_orignal['Priority'], test_preds, average=None)

In [20]:
# Unweighted mean of the per-class F1 scores on the test set.
# Parenthesized so the line runs as a Python 2 statement or a Python 3 call.
print(np.mean(score))

0.500320956231


In [None]:
clf = RandomForestClassifier(n_jobs=-1)
f1scorer_macro = make_scorer(f1_score, average='macro')
f1 = cross_val_score(clf, train_processed, train_original['Priority'], scoring=f1scorer_macro, cv=10, n_jobs=-1)
average_f1 = np.mean(f1)
print 'Cross validation metrics:',average_f1, np.std(f1)
%time clf.fit(train_processed, train_original['Priority'])