In [1]:
import pandas as pd
import numpy as np
import scipy
from copy import deepcopy
from sklearn.linear_model import LinearRegression
from data_preprocessing import DataProcessor

In [2]:
train_df = pd.read_csv('../data/processed/train_processed.csv')
test_df = pd.read_csv('../data/processed/test_processed.csv')

In [3]:
p = DataProcessor()
train_processed, train_original = p.fit_transform(train_df)

removed nan values
count vectorizer finished fitting
count vector finished transforming
dummy variables created
created a sparse matrix of all features


In [4]:
test_processed, test_original = p.transform(test_df)

count vector finished transforming
dummy variables created
created a sparse matrix of all features


In [5]:
train_processed

<83041x15940 sparse matrix of type '<type 'numpy.float64'>'
	with 3359482 stored elements in Compressed Sparse Row format>

### Run sklearn countvectorizer

In [6]:
def ctoi(x):
    if x=='P1':
        return 1 
    if x=='P2':
        return 2
    if x=='P3':
        return 3
    if x=='P4':
        return 4
    return 5

In [7]:
Priority_int = train_original['Priority'].apply(lambda x: ctoi(x))

### Make spare representation

In [8]:
training_ip = train_processed[0:83041/2]
training_op = Priority_int[0:83041/2]
print training_ip.shape
validation_ip = train_processed[83041/2:]
validation_op = train_original['Priority'][83041/2:]
print validation_ip.shape

test_ip = test_processed
test_op = test_original['Priority']
print test_ip.shape

(41520, 15940)
(41521, 15940)
(20761, 15940)


### Train Linear Regression on Training set

In [9]:
from sklearn.ensemble import RandomForestRegressor

In [10]:
lr = LinearRegression(n_jobs=-1)
# lr = RandomForestRegressor(n_jobs=-1)

In [11]:
lr.fit(training_ip, training_op)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)

#### Initializing Thresholds

In [12]:
validation_set_preds = lr.predict(validation_ip)
print validation_set_preds
print validation_set_preds.shape
validation_set_preds = validation_set_preds.reshape(validation_set_preds.shape[0], 1)
print validation_set_preds.shape

[ 2.41306288  2.7908672   2.73131806 ...,  2.4883916   2.8945196
  3.63032065]
(41521,)
(41521, 1)


In [13]:
# Finding percentile of each class in training data
p1 = int((validation_op=='P1').sum()/float(validation_op.shape[0])*100)
p2 = int((validation_op=='P2').sum()/float(validation_op.shape[0])*100)
p3 = int((validation_op=='P3').sum()/float(validation_op.shape[0])*100)
p4 = int((validation_op=='P4').sum()/float(validation_op.shape[0])*100)
# p2 = int((train_original['Priority']=='P2').sum()/float(train_original['Priority'].shape[0])*100)
# p3 = int((train_original['Priority']=='P3').sum()/float(train_original['Priority'].shape[0])*100)
# p4 = int((train_original['Priority']=='P4').sum()/float(train_original['Priority'].shape[0])*100)
# print p1,p2,p3,p4
print p1, p2, p3, p4

3 7 85 2


In [14]:
# Create Thresholds based on these percentiles
T0 = validation_set_preds.min()
(T1,T2,T3,T4) = np.percentile(validation_set_preds,[p1,p2,p3,p4])

In [102]:
# T1 = np.percentile(validation_set_preds[validation_op=='P1'], p1)
# T2 = np.percentile(validation_set_preds[validation_op=='P2'], p2)
# T3 = np.percentile(validation_set_preds[validation_op=='P3'], p3)
# T4 = np.percentile(validation_set_preds[validation_op=='P4'], p4)
T0 = 0
T1 = 1
T2 = 2
T3 = 3
T4 = 4

In [103]:
T0,T1,T2,T3,T4

(0, 1, 2, 3, 4)

In [15]:
# Function to map regression output to class labels according to threshold
T = {'T0': T0, 'T1': T1, 'T2': T2, 'T3': T3, 'T4': T4}
print T

{'T4': 2.1795256076450489, 'T2': 2.4892263957931648, 'T3': 3.1412804340464566, 'T0': 0.23618537009426799, 'T1': 2.2730375631856763}


In [16]:
# function to get class label based on threhsolds for a single test sample
def itoc(x, T):
    if x <= T['T1']:
        return 'P1'
    if x <= T['T2']:
        return 'P2'
    if x <= T['T3']:
        return 'P3'
    if x <= T['T4']:
        return 'P4'
    return 'P5'

In [17]:
# Calculating F1 Score before tweaking thresholds
from sklearn.metrics import f1_score
validation_set_class_preds = [itoc(x, T) for x in validation_set_preds]
f1_score(validation_op, validation_set_class_preds, average='macro')

  'precision', 'predicted', average, warn_for)


0.2731744339784859

## Threshold Tweaking (Greedy) 

In [18]:
# get f1 score for given set of thresholds
def F1ScoreTH(T, val_preds, actual_labels):
    val_class_preds = np.apply_along_axis(lambda x: itoc(x, T), 1, val_preds)
#     val_class_preds = val_preds.map(lambda x: itoc(x, T))
    return f1_score(actual_labels, val_class_preds, average='macro')

In [19]:
TH = ['T0', 'T1', 'T2', 'T3', 'T4']
def optimize_thresholds(T, actual, preds, d):
    for i in range(1, len(TH)):
        D = T[TH[i]] - T[TH[i-1]]
#         print "For T", i, '----------'
        while (True):
            f1_v0 = F1ScoreTH(T, preds, actual)
#             print f1_v0
            delta = (d*D)

            if (i + 1 < len(TH) and T[TH[i]] + delta < T[TH[i+1]]):
                T[TH[i]] += delta
                f1_v1 = F1ScoreTH(T, preds, actual)
                T[TH[i]] -= delta
            else:
                f1_v1 = f1_v0
            
            if (T[TH[i]] - delta > T[TH[i-1]]):
                T[TH[i]] -= delta
                f1_v2 = F1ScoreTH(T, preds, actual)
                T[TH[i]] += delta
            else:
                f1_v2 = f1_v0
                
#             print "----- handling TH for ", TH[i], "------"
#             print f1_v0, f1_v1, f1_v2
            
            if (f1_v1 > f1_v0 and f1_v1 > f1_v2):
                T[TH[i]] += delta
#                 print "increasing threshold for ", TH[i]
            elif (f1_v1 > f1_v0 and f1_v2 > f1_v1):
                T[TH[i]] -= delta
#                 print "increasing threshold for ", TH[i]
            elif (f1_v1 < f1_v0 and f1_v2 > f1_v0):
                T[TH[i]] -= delta
#                 print "decreasing threshold for ", TH[i]
            else:
                print "Completed for ", i
                break;
    return T

In [20]:
T = {'T0': T0, 'T1': T1, 'T2': T2, 'T3': T3, 'T4': T4}
T_new = deepcopy(T)
T_new

{'T0': 0.23618537009426799,
 'T1': 2.2730375631856763,
 'T2': 2.4892263957931648,
 'T3': 3.1412804340464566,
 'T4': 2.1795256076450489}

In [21]:
%time optimize_thresholds(T_new, validation_op, validation_set_preds, 0.02)

Completed for  1
Completed for  2
Completed for  3
Completed for  4
CPU times: user 11.3 s, sys: 211 ms, total: 11.5 s
Wall time: 11.4 s


{'T0': 0.23618537009426799,
 'T1': 2.2323005193238483,
 'T2': 2.5149189834400976,
 'T3': 3.1412804340464566,
 'T4': 2.1795256076450489}

In [22]:
T

{'T0': 0.23618537009426799,
 'T1': 2.2730375631856763,
 'T2': 2.4892263957931648,
 'T3': 3.1412804340464566,
 'T4': 2.1795256076450489}

In [23]:
T_new

{'T0': 0.23618537009426799,
 'T1': 2.2323005193238483,
 'T2': 2.5149189834400976,
 'T3': 3.1412804340464566,
 'T4': 2.1795256076450489}

In [24]:
F1ScoreTH(T_new, validation_set_preds, validation_op)

0.27779951679747311

### Cross Validation

In [25]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, precision_score, recall_score

In [26]:
p1 = int((train_original['Priority']=='P1').sum()/float(train_original['Priority'].shape[0])*100)
p2 = int((train_original['Priority']=='P2').sum()/float(train_original['Priority'].shape[0])*100)
p3 = int((train_original['Priority']=='P3').sum()/float(train_original['Priority'].shape[0])*100)
p4 = int((train_original['Priority']=='P4').sum()/float(train_original['Priority'].shape[0])*100)

In [119]:
print p1,p2,p3,p4

3 7 85 2


In [120]:
kf = StratifiedKFold(n_splits=10)

In [138]:
cvf1scores = []
cvprecisionscores = []
cvrecallscores = []
lr = LinearRegression(n_jobs=-1)
for train,val in kf.split(train_processed, Priority_int):
    train_input = train_processed[train]
    val_input = train_processed[val]
    train_output = Priority_int[train]
    val_output = train_original['Priority'][val]
    
    vlen = len(Priority_int)
    vtraining_ip = train_input[0:vlen/2]
    vtraining_op = train_output[0:vlen/2]

    vvalidation_ip = train_input[vlen/2:]
    vvalidation_op = train_original['Priority'][train][vlen/2:]#train_output[vlen/2:]
    
    %time lr.fit(vtraining_ip,vtraining_op)
    
    val_prediction = lr.predict(vvalidation_ip)
    val_prediction = val_prediction.reshape(val_prediction.shape[0], 1)
    
    vT0 = val_prediction.min()
    (vT1,vT2,vT3,vT4) = np.percentile(val_prediction,[p1,p2,p3,p4])
    vT = {'T0': vT0, 'T1': vT1, 'T2': vT2, 'T3': vT3, 'T4': vT4}
#     vT = {'T0': 0, 'T1': 1, 'T2': 2, 'T3': 3, 'T4': 4}
    print "Optimizing thresholds"
    vT_new = optimize_thresholds(vT, vvalidation_op, val_prediction, 0.01)
    
    val_prediction = lr.predict(val_input)
    val_prediction = val_prediction.reshape(val_prediction.shape[0], 1)
    
    val_class_preds = np.apply_along_axis(lambda x: itoc(x, vT_new), 1, val_prediction)
    f1scores = f1_score(val_output, val_class_preds, average=None)
    pscores = precision_score(val_output, val_class_preds, average=None)
    rscores = recall_score(val_output, val_class_preds, average=None)
    print np.mean(f1scores), np.mean(pscores), np.mean(rscores)
    cvf1scores.append(f1scores)
    cvprecisionscores.append(pscores)
    cvrecallscores.append(rscores)
#     average_f1_scores.append(np.mean(scores))

CPU times: user 12.5 s, sys: 241 ms, total: 12.8 s
Wall time: 6.92 s
Optimizing thresholds
Completed for  1
Completed for  2
Completed for  3
Completed for  4
0.284596689176 0.291609624372 0.389361576282


  'precision', 'predicted', average, warn_for)


CPU times: user 11.8 s, sys: 178 ms, total: 12 s
Wall time: 6.21 s
Optimizing thresholds
Completed for  1
Completed for  2
Completed for  3
Completed for  4
0.265561096824 0.274714706352 0.345939038818
CPU times: user 13.4 s, sys: 251 ms, total: 13.7 s
Wall time: 7.48 s
Optimizing thresholds
Completed for  1
Completed for  2
Completed for  3
Completed for  4
0.278348404248 0.292544133416 0.378770964198
CPU times: user 12.1 s, sys: 208 ms, total: 12.3 s
Wall time: 6.44 s
Optimizing thresholds
Completed for  1
Completed for  2
Completed for  3
Completed for  4
0.290743626378 0.304515319037 0.378298513289
CPU times: user 11.9 s, sys: 179 ms, total: 12.1 s
Wall time: 6.42 s
Optimizing thresholds
Completed for  1
Completed for  2
Completed for  3
Completed for  4
0.269732653896 0.276004493743 0.359637156949
CPU times: user 11.9 s, sys: 167 ms, total: 12.1 s
Wall time: 6.31 s
Optimizing thresholds
Completed for  1
Completed for  2
Completed for  3
Completed for  4
0.274609525075 0.2886927547

In [124]:
print np.mean(cvscores), np.std(average_f1_scores)

0.327652911699 0.0104032134742


In [139]:
def print_results(s):
    print np.mean(s), np.std(np.mean(s, axis=1))
    print np.mean(s, axis=1)
    print np.mean(s, axis=0)

print_results(cvf1scores)
print_results(cvprecisionscores)
print_results(cvrecallscores)

0.276460383492 0.00777425348505
[ 0.28459669  0.2655611   0.2783484   0.29074363  0.26973265  0.27460953
  0.27981871  0.26513056  0.28107709  0.27498549]
[ 0.24613498  0.16600619  0.84429598  0.          0.12586478]
0.287560095889 0.00965186019553
[ 0.29160962  0.27471471  0.29254413  0.30451532  0.27600449  0.28869275
  0.296754    0.27571981  0.29378098  0.28126514]
[ 0.26401377  0.21589913  0.88845555  0.          0.06943203]
0.369071163915 0.0117689821919
[ 0.38936158  0.34593904  0.37877096  0.37829851  0.35963716  0.36926964
  0.36517688  0.35875992  0.37602644  0.36947151]
[ 0.23253719  0.13587499  0.8043419   0.          0.67260174]


## Differential Evolution  - Parameter tuning for threshold change delta

In [37]:
from scipy.optimize import differential_evolution

In [38]:
args = (validation_op,validation_set_preds)
bounds = [ (0.01,0.10) ] # order: percentage shift in threhsold

In [39]:
def func(parameters, *args):
    #print args[0], args[1]
    parameters = map(float,parameters)
    T = {'T0': T0, 'T1': T1, 'T2': T2, 'T3': T3, 'T4': T4}
    T_new = optimize_thresholds(T, validation_op, validation_set_preds, parameters[0])
    
    f1 = F1ScoreTH(T_new, validation_set_preds, validation_op)
    print f1, parameters
    return -1*f1

In [40]:
%time result = differential_evolution(func, bounds, args, strategy='rand2bin', popsize=10, mutation=(0.5,1.9), recombination=0.7, maxiter=2)

0.330752435557 [0.08847658547469857]
0.330752435557 [0.0457725393425509]
0.337810575475 [0.09129733573351453]
0.330752435557 [0.06925096047167976]
0.330752435557 [0.07807642673240787]
0.330752435557 [0.02643104309136409]
0.330752435557 [0.058429377457218185]
0.330752435557 [0.03348321886290703]
0.330752435557 [0.016594290404370535]
0.330752435557 [0.046029896126210765]
0.330752435557 [0.06410965966777817]
0.330752435557 [0.030937913154039133]
0.330752435557 [0.03293310468917572]
0.330752435557 [0.060931724948426635]
0.330752435557 [0.08998310034159304]
0.330752435557 [0.05205466786673592]
0.330752435557 [0.039444211741080766]
0.330752435557 [0.04333032049282347]
0.330752435557 [0.060603724773334915]
0.330752435557 [0.04847243261437668]
0.337810575475 [0.09129733573351453]
0.337810575475 [0.09129734573351453]
CPU times: user 1min 20s, sys: 48 ms, total: 1min 20s
Wall time: 1min 20s


In [41]:
result.x

array([ 0.09129734])

#### Recalculating thresholds using this parameter

In [144]:
T = {'T0': T0, 'T1': T1, 'T2': T2, 'T3': T3, 'T4': T4}
T_new = deepcopy(T)
T_new = optimize_thresholds(T_new, validation_op, validation_set_preds, 0.01)

Completed for  1
Completed for  2
Completed for  3
Completed for  4


In [146]:
T_new

{'T0': 0.23618537009426799,
 'T1': 2.2119319973929343,
 'T2': 2.5141828916491855,
 'T3': 3.1412804340464566,
 'T4': 2.1795256076450489}

### Running it finally on test data

In [29]:
test_set_preds = lr.predict(test_ip)
print test_set_preds
print test_set_preds.shape
test_set_preds = test_set_preds.reshape(test_set_preds.shape[0], 1)
print test_set_preds.shape

[ 2.8479649   2.74556011  3.10346262 ...,  3.43844443  2.98782104
  3.04460992]
(20761,)
(20761, 1)


In [31]:
from sklearn.metrics import f1_score
test_set_class_preds = [itoc(x, T_new) for x in test_set_preds]
print f1_score(test_op, test_set_class_preds, average='macro')
print precision_score(test_op, test_set_class_preds, average='macro')
print recall_score(test_op, test_set_class_preds, average='macro')

0.275828601408
0.289137478822
0.363868652678
