# Machine Learning with Python example
import pandas as pd
import pylab as pl
import numpy as np
# Download the training and test CSVs of the credit data set (needs network
# access); both frames are used by the rest of the script.
train=pd.read_csv('https://raw.githubusercontent.com/yhat/DataGotham2013/master/notebooks/data/credit-data-trainingset.csv')
test=pd.read_csv('https://raw.githubusercontent.com/yhat/DataGotham2013/master/notebooks/data/credit-data-testset.csv')
# Bar chart of how often each value of serious_dlqin2yrs occurs.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
# A fresh figure per chart keeps the two bar charts from being drawn on top
# of each other when this runs as a plain script.
pl.figure()
train.serious_dlqin2yrs.value_counts().plot(kind='bar')
# Same for the number of dependents.
pl.figure()
train.number_of_dependents.value_counts().plot(kind='bar')
# Cross-tabulate serious_dlqin2yrs against the 90-days-late count.
# (Bare expressions only display in an interactive session / notebook.)
pd.crosstab(train.serious_dlqin2yrs,train.number_of_times90_days_late)
test.head()
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
# Predictor columns in the snake_case spelling.
featuresTest = [
    'revolving_utilization_of_unsecured_lines',
    'debt_ratio',
    'monthly_income',
    'age',
    'number_of_times90_days_late',
]
# The same five predictors in a CamelCase spelling.
# NOTE(review): nothing in the visible script reads this list, and the model
# fits index `train` with the snake_case names -- confirm whether it is
# still needed.
featuresTrain = [
    'RevolvingUtilizationOfUnsecuredLines',
    'DebtRatio',
    'MonthlyIncome',
    'age',
    'NumberOfTimes90DaysLate',
]
# Fit a k-nearest-neighbours classifier (k=13) on the selected predictors,
# with serious_dlqin2yrs as the label.
clf = KNeighborsClassifier(n_neighbors=13)
clf.fit(train[featuresTest],train.serious_dlqin2yrs)
# Score the test set once and reuse the results below: the fitted model and
# the inputs do not change, so repeating predict()/predict_proba() for every
# report only repeats the same neighbour search.
predicted = clf.predict(test[featuresTest])
probs=clf.predict_proba(test[featuresTest])
# Second column of the predict_proba output.
prob_true=probs[::,1]
pl.hist(prob_true)
from sklearn.metrics import roc_curve, auc, classification_report, confusion_matrix
# Same probabilities under the name the ROC code further down reads.
preds = probs
preds
# Three views of predicted vs. actual labels on the test set.
# (Bare expressions only display in an interactive session / notebook.)
confusion_matrix(test['serious_dlqin2yrs'], predicted)
print (classification_report(test['serious_dlqin2yrs'], predicted, labels=[0, 1]))
pd.crosstab(test['serious_dlqin2yrs'], predicted, rownames=["Actual"], colnames=["Predicted"])
def plot_roc(name, probs, y_true=None):
    """Draw and show a ROC curve for a set of scores.

    Args:
        name: Text used as the plot title.
        probs: Scores passed to ``roc_curve`` as the second argument, one
            per label in ``y_true``.
        y_true: Labels to score against. When omitted, the module-level
            ``test['serious_dlqin2yrs']`` column is used, which is what
            every existing two-argument call relies on.

    Returns:
        None. The current pylab figure is cleared, redrawn and shown.
    """
    if y_true is None:
        y_true = test['serious_dlqin2yrs']
    # The thresholds returned by roc_curve are not needed for the plot.
    fpr, tpr, _ = roc_curve(y_true, probs)
    roc_auc = auc(fpr, tpr)
    # Start from an empty figure so curves from earlier calls do not pile up.
    pl.clf()
    pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    # Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
    pl.plot([0, 1], [0, 1], 'k--')
    pl.xlim([0.0, 1.05])
    pl.ylim([0.0, 1.05])
    pl.xlabel('False Positive Rate')
    pl.ylabel('True Positive Rate')
    pl.title(name)
    pl.legend(loc="lower right")
    pl.show()
# Three ROC plots against the test labels: the labels scored against
# themselves, uniform random scores, and the KNN probabilities.
actual_labels = test['serious_dlqin2yrs']
plot_roc("Perfect Classifier", actual_labels)
random_scores = np.random.uniform(0, 1, len(actual_labels))
plot_roc("Guessing", random_scores)
# [:, 1] picks the second column of the numpy array
knn_scores = preds[:, 1]
plot_roc("KNN", knn_scores)
# Random forest with the library's default settings, fitted on the same
# predictors and label; fit() hands back the fitted estimator.
clf = RandomForestClassifier().fit(train[featuresTest], train.serious_dlqin2yrs)
# Keep only the second probability column for the ROC plot.
forest_proba = clf.predict_proba(test[featuresTest])
probs = forest_proba[:, 1]
plot_roc("RandomForest", probs)
# A second, four-column predictor set for the gradient-boosting model.
featuresTest2 = [
    'revolving_utilization_of_unsecured_lines',
    'debt_ratio',
    'number_of_times90_days_late',
    'number_real_estate_loans_or_lines',
]
# The training-side list holds the same four names (as its own list object).
featuresTrain2 = list(featuresTest2)
# Gradient boosting with the library's default settings.
clf = GradientBoostingClassifier()
clf.fit(train[featuresTrain2], train.serious_dlqin2yrs)
# Keep only the second probability column for the ROC plot.
boosted_proba = clf.predict_proba(test[featuresTest2])
probs = boosted_proba[:, 1]
plot_roc("Your Classifier", probs)
probs
# Turn each probability p into a score: odds = (1 - p) / p, then
# score = 40 * log2(odds) + 340, i.e. every doubling of the odds adds 40
# points and even odds (p = 0.5) map to 340.
# A probability of exactly 0 or 1 would make the odds infinite or zero and
# the score +/-inf, which the histogram cannot bin, so keep p one machine
# epsilon away from both ends. Values strictly inside those bounds are
# unchanged.
eps = np.finfo(float).eps
safe_probs = np.clip(probs, eps, 1 - eps)
odds = (1 - safe_probs) / safe_probs
score = np.log(odds)*(40/np.log(2)) + 340
pl.hist(score)
# Comments
# Post a Comment