In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn, datetime
import random
from sklearn.metrics import roc_auc_score,roc_curve,auc
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

from ggplot import *

In [None]:
# Load the raw export. index_col=False tells pandas not to use the first
# column as the row index.
df = pd.read_csv("./cesar_data", sep=",", index_col=False)
# Single-argument print(...) is valid on both Python 2 and Python 3.
print(df.shape)
df.head()

In [None]:
def from_str_to_date(mystr, fmt='%m/%d/%y %H:%M'):
    """Parse a timestamp string into a ``datetime.datetime``.

    Parameters
    ----------
    mystr : str
        Text to parse.
    fmt : str, optional
        ``strptime`` format string. The default expects
        month/day/two-digit-year followed by hour:minute,
        e.g. ``'01/31/17 13:45'``.

    Returns
    -------
    datetime.datetime
        The parsed (naive) timestamp.

    Raises
    ------
    ValueError
        If ``mystr`` does not match ``fmt``.
    """
    return datetime.datetime.strptime(mystr, fmt)

# Parse the raw 'datetime' strings and use them as the index (named 'date').
# The guard makes the cell safe to re-run: once the 'datetime' column has been
# consumed there is nothing left to parse, so a second execution is a no-op
# instead of failing on the missing column.
if 'datetime' in df.columns:
    df = (
        df.assign(date=df['datetime'].apply(from_str_to_date))
          .set_index('date')
          .drop(['datetime'], axis=1)
    )

In [None]:
def prepare_dataset(mydata,featurelist):
    """Label-encode the target and the selected feature columns.

    Every encoded column is first cast to ``str`` and then mapped to integer
    codes with its own ``LabelEncoder``.

    Parameters
    ----------
    mydata : pandas.DataFrame
        Must contain a ``'qualification'`` column and every column named in
        ``featurelist``. It is not modified; the work is done on a copy.
    featurelist : list of str
        Columns to encode and to return as the feature matrix.

    Returns
    -------
    tuple
        ``(encoded, X, y)`` where ``encoded`` is the encoded copy of
        ``mydata``, ``X`` is ``encoded[featurelist]`` and ``y`` is the encoded
        ``'qualification'`` column.
    """
    features = featurelist
    # 'qualification' plus each distinct feature exactly once. Filtering out a
    # repeated 'qualification' prevents encoding the already-encoded codes a
    # second time.
    columns_to_encode = ['qualification'] + [
        col for col in set(features) if col != 'qualification'
    ]
    # One encoder per column: dict.fromkeys(keys, LabelEncoder()) would bind
    # the very same encoder object to every key.
    dict_encoder = {col: LabelEncoder() for col in columns_to_encode}

    # Encode a copy so the caller's frame stays intact and the call can be
    # repeated on the same input with the same result.
    encoded = mydata.copy()
    for col in columns_to_encode:
        encoded[col] = dict_encoder[col].fit_transform(encoded[col].astype('str'))
    X = encoded[features]
    y = encoded.qualification
    return (encoded,X,y)

def prepare_dataset_binary(mydata,featurelist):
    """Label-encode the target and one-hot encode the selected features.

    Parameters
    ----------
    mydata : pandas.DataFrame
        Must contain a ``'qualification'`` column and every column named in
        ``featurelist``. It is not modified; the work is done on a copy.
    featurelist : list of str
        Columns expanded into indicator columns by ``pandas.get_dummies``.

    Returns
    -------
    tuple
        ``(encoded, X, y)`` where ``encoded`` is the transformed copy, ``X`` is
        every column of ``encoded`` except ``'qualification'`` and ``y`` is the
        label-encoded ``'qualification'`` column.
    """
    # Work on a copy: encoding the target in place would leave the caller's
    # frame half-transformed (target encoded, features untouched).
    encoded = mydata.copy()
    encoded['qualification'] = LabelEncoder().fit_transform(
        encoded['qualification'].astype('str')
    )

    encoded = pd.get_dummies(encoded, columns=featurelist)

    X = encoded.drop(['qualification'],axis=1)
    y = encoded.qualification
    return (encoded,X,y)

In [None]:
# AUROC registries, keyed by model name, filled in by the cells below.
perf_train, perf_test = {}, {}

# Baseline split of the raw frame: y is kept as a one-column DataFrame.
y = df[['qualification']]
X = df.drop('qualification', axis=1)

# Modèles triviaux

In [None]:
def random_predictor(myinput):
    """Return one pseudo-random 0/1 prediction per row of ``myinput``.

    Parameters
    ----------
    myinput : array-like with a ``shape`` attribute
        Only ``myinput.shape[0]`` (the number of rows) is used.

    Returns
    -------
    list
        ``myinput.shape[0]`` values, each the rounding of a uniform draw from
        [0, 1). The result is a concrete list (not a lazy ``map`` object) so
        it can be measured, indexed and consumed more than once on Python 3.

    Notes
    -----
    The global NumPy seed is reset to 42 on every call, so the output is the
    same for a given row count. This also resets the global random stream for
    any code that runs afterwards.
    """
    np.random.seed(42)
    draws = np.random.rand(myinput.shape[0])
    return [round(value) for value in draws]

In [None]:
# random_predictor reseeds on every call, so its output for X is always the
# same: score it once and reuse the value.
random_auroc = roc_auc_score(y, random_predictor(X))
print("[random_predictor] AUROC score = %0.3f" % random_auroc)
# The baseline is scored on the full frame, so both registries get the same value.
perf_train['random'] = random_auroc
perf_test['random'] = random_auroc

# Comparaison de modèles

In [None]:
def from_dict_to_df(mydict,mytype):
    """Turn a ``{model name: AUROC}`` mapping into a tidy DataFrame.

    Parameters
    ----------
    mydict : dict
        Maps a model name to its score.
    mytype : object
        Value stored in the ``'type'`` column of every row (the callers in
        this notebook pass ``'train'`` or ``'test'``).

    Returns
    -------
    pandas.DataFrame
        Columns ``'modele'``, ``'auroc'`` and ``'type'``, sorted by ascending
        ``'auroc'``. An empty ``mydict`` yields an empty frame with those
        three columns.
    """
    # Materialise the items and name the columns in the constructor: this
    # works with Python 3 dict views and with an empty dict, where assigning
    # two column names to a zero-column frame would raise.
    ret_df = pd.DataFrame(list(mydict.items()), columns=["modele","auroc"])
    ret_df = ret_df.sort_values(by=['auroc'])
    ret_df['type'] = mytype
    return ret_df

def plot_perf(perf_train,perf_test):
    """Build a ggplot scatter of the test AUROC of each model.

    ``perf_train`` is accepted but not drawn: only ``perf_test`` is converted
    and plotted. The y axis is fixed to [0, 1].
    """
    test_scores = from_dict_to_df(perf_test,'test')
    chart = ggplot(test_scores, aes(x='modele',y='auroc'))
    chart = chart + geom_point(size=100)
    return chart + ylim(0,1)

# Random forest

In [None]:
def train_my_random_forest(X_train, y_train,X_test,y_test, random_state=None):
    """Grid-search a random forest on the training data.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``GridSearchCV.fit``.
    X_test, y_test
        Accepted for interface compatibility; not used by this function.
    random_state : int or None, optional
        Forwarded to ``RandomForestClassifier``. The default ``None`` keeps
        the forest unseeded, as before; pass an integer for repeatable fits.

    Returns
    -------
    list
        ``[dict_rf, clf]`` where ``dict_rf`` maps each scoring name to its
        fitted ``GridSearchCV`` and ``clf`` is the search fitted last.
    """
    scores = ['roc_auc']
    parameters = {
        "n_estimators": list(range(1,100,10)),
        "min_samples_split": list(range(10,100,10)),
    }
    dict_rf = {}
    clf = None

    for score in scores:
        clf = GridSearchCV(RandomForestClassifier(random_state=random_state),
                           parameters, cv=3, scoring=score,
                           verbose=1, return_train_score=False)
        clf.fit(X_train, y_train)
        dict_rf[score] = clf
    # Return only after every scoring has been searched; returning from inside
    # the loop would stop after the first entry of `scores`.
    return [dict_rf,clf]
    
def extract_feat_rf(model,feat):
    """Pair each feature name with its importance, most important first.

    ``model`` must expose ``feature_importances_``; ``feat`` supplies one name
    per importance value. Returns a DataFrame with columns
    ``'feat_importance'`` and ``'feat'`` sorted by descending importance.
    """
    importances = pd.DataFrame(model.feature_importances_, columns=['feat_importance'])
    ranked = importances.assign(feat=feat)
    return ranked.sort_values(by='feat_importance', ascending=False)

## Random forest - all features

In [None]:
# Bring 'date' back as a regular column so it can be dropped together with
# 'srcstr'. reset_index/drop both return new frames, so df itself is untouched.
df_feat_all = df.reset_index().drop(['srcstr', "date"], axis=1)

# Keep the features in column order. Building this from a set would make the
# order of the one-hot columns depend on string hashing, which can change
# from one Python process to the next.
feat_list_all = [col for col in df_feat_all.columns if col != 'qualification']

(data,X,y) = prepare_dataset_binary(df_feat_all,list(feat_list_all))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0,stratify=y,shuffle=True)

[dict_rf_all,clf_all] = train_my_random_forest(X_train, y_train,X_test,y_test)

# NOTE(review): the AUROC below is computed from hard class predictions, not
# from predicted probabilities - confirm that this is intended.
print("== Performance sur les données de test ==")
print("AUROC score = %0.3f" % roc_auc_score(y_test,clf_all.best_estimator_.predict(X_test)))

In [None]:
# Store the AUROC of the tuned all-features forest for each half of the split.
perf_train.update(RF_all=roc_auc_score(y_train, clf_all.best_estimator_.predict(X_train)))
perf_test.update(RF_all=roc_auc_score(y_test, clf_all.best_estimator_.predict(X_test)))

In [None]:
# Rank the model's input columns by importance and chart the ten strongest.
rf_all = extract_feat_rf(clf_all.best_estimator_, X.columns)

g = sns.factorplot(
    data=rf_all.head(10),
    x="feat_importance",
    y="feat",
    kind="bar",
    palette="BuPu",
    size=6,
    aspect=1.5,
)
g.set_xticklabels(step=1)
plt.title("Top 10 des features explicatives")
plt.show()

In [None]:
# Chart the AUROC recorded so far for each model (plot_perf draws only the test scores).
plot_perf(perf_train,perf_test)

## Random forest - 2 features

In [None]:
# Derive two boolean flags from the raw text columns on a fresh copy of df.
df_rf2 = df.assign(
    is_US=df['country'].str.contains("United States"),
    is_oregon=df['host'].str.contains("oregon"),
)

# Keep only the flags and the target; the 'date' index is turned back into a
# column and then discarded, leaving a plain integer index.
df_feat = (
    df_rf2[['is_US', 'is_oregon', 'qualification']]
    .reset_index()
    .drop('date', axis=1)
)
df_feat.head()

In [None]:
# Encode the two flags and the target, split 50/50 with stratification,
# then grid-search the forest on the training half.
feat_list = ['is_US', 'is_oregon']
data, X, y = prepare_dataset(df_feat, feat_list)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.5,
    random_state=0,
    shuffle=True,
    stratify=y,
)

dict_rf2, clf2 = train_my_random_forest(X_train, y_train, X_test, y_test)

In [None]:
# Single-argument print(...) is valid on both Python 2 and Python 3.
print("== Performance sur les données de test ==")
print("AUROC score = %0.3f" % roc_auc_score(y_test,clf2.best_estimator_.predict(X_test)))

In [None]:
# Store the AUROC of the tuned two-feature forest for each half of the split.
perf_train.update(RF_2=roc_auc_score(y_train, clf2.best_estimator_.predict(X_train)))
perf_test.update(RF_2=roc_auc_score(y_test, clf2.best_estimator_.predict(X_test)))

In [None]:
# Importance of the two engineered flags in the tuned forest.
rf = extract_feat_rf(clf2.best_estimator_,feat_list)
# Single-argument print(...) is valid on both Python 2 and Python 3.
print(rf.head())

g = sns.factorplot(x="feat_importance", y="feat", data=rf, kind="bar", 
                   palette="BuPu", size=6, aspect=1.5)
g.set_xticklabels(step=1)
plt.title("Top 10 des features explicatives")
plt.show()

In [None]:
# Chart the AUROC of every model recorded in the registries (plot_perf draws only the test scores).
plot_perf(perf_train,perf_test)