Machine Learning with Python example

# Credit-delinquency walkthrough, part 1: load the data, take a first look,
# fit a k-nearest-neighbours classifier and define the ROC plotting helper.
import numpy as np
import pandas as pd
import pylab as pl
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc, classification_report, confusion_matrix, roc_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

train = pd.read_csv('https://raw.githubusercontent.com/yhat/DataGotham2013/master/notebooks/data/credit-data-trainingset.csv')
test = pd.read_csv('https://raw.githubusercontent.com/yhat/DataGotham2013/master/notebooks/data/credit-data-testset.csv')

# --- Exploration ----------------------------------------------------------
# Every chart gets its own figure; without this, consecutive plots are drawn
# on top of each other on whatever figure happens to be current.
pl.figure()
train.serious_dlqin2yrs.value_counts().plot(kind='bar')
pl.figure()
train.number_of_dependents.value_counts().plot(kind='bar')

# Tables are printed explicitly: a bare expression only displays something
# inside a notebook cell and is silently discarded when run as a script.
print(pd.crosstab(train.serious_dlqin2yrs, train.number_of_times90_days_late))
print(test.head())

# --- Feature sets ---------------------------------------------------------
# Used to index both the training and the test frame below.
featuresTest = [
    'revolving_utilization_of_unsecured_lines',
    'debt_ratio',
    'monthly_income',
    'age',
    'number_of_times90_days_late',
]
# NOTE(review): never used in this script, which indexes `train` with the
# snake_case names above. Kept in case something outside this view still
# refers to it -- confirm and delete.
featuresTrain = [
    'RevolvingUtilizationOfUnsecuredLines',
    'DebtRatio',
    'MonthlyIncome',
    'age',
    'NumberOfTimes90DaysLate',
]

# --- k-nearest neighbours -------------------------------------------------
clf = KNeighborsClassifier(n_neighbors=13)
clf.fit(train[featuresTest], train.serious_dlqin2yrs)

# Predict once and reuse the results instead of re-running the classifier
# for every report below.
predicted = clf.predict(test[featuresTest])
probs = clf.predict_proba(test[featuresTest])
prob_true = probs[:, 1]  # 2nd column of the predict_proba output
pl.figure()
pl.hist(prob_true)

# Same probabilities under the name the ROC plotting code further down uses.
preds = probs

print(confusion_matrix(test['serious_dlqin2yrs'], predicted))
print(classification_report(test['serious_dlqin2yrs'], predicted, labels=[0, 1]))
print(
    pd.crosstab(
        test['serious_dlqin2yrs'],
        predicted,
        rownames=["Actual"],
        colnames=["Predicted"],
    )
)


def plot_roc(name, probs):
    """Draw and show the ROC curve of ``probs`` against the test labels.

    The true labels are read from the module-level ``test`` frame
    (column ``serious_dlqin2yrs``), so ``probs`` must hold one score per
    row of ``test``.

    Args:
        name: Text used as the figure title.
        probs: Scores handed to ``roc_curve`` as the predicted values.

    Returns:
        None. The curve is drawn on a new figure and ``pl.show()`` is called.
    """
    fpr, tpr, _ = roc_curve(test['serious_dlqin2yrs'], probs)
    roc_auc = auc(fpr, tpr)

    # Open a new figure instead of clearing the current one: clearing would
    # erase any chart that has been drawn but not shown yet.
    pl.figure()
    pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    pl.plot([0, 1], [0, 1], 'k--')  # dashed diagonal as a visual reference
    pl.xlim([0.0, 1.05])
    pl.ylim([0.0, 1.05])
    pl.xlabel('False Positive Rate')
    pl.ylabel('True Positive Rate')
    pl.title(name)
    pl.legend(loc="lower right")
    pl.show()
# Credit-delinquency walkthrough, part 2: compare classifiers by ROC curve and
# turn the final model's probabilities into a score.

# Reference curves: the true labels used as their own scores, and uniform
# random numbers used as scores.
plot_roc("Perfect Classifier", test['serious_dlqin2yrs'])
plot_roc("Guessing", np.random.uniform(0, 1, len(test['serious_dlqin2yrs'])))

# [:, 1] selects the 2nd column of the numpy array
plot_roc("KNN", preds[:, 1])

clf = RandomForestClassifier()
clf.fit(train[featuresTest], train.serious_dlqin2yrs)
probs = clf.predict_proba(test[featuresTest])[:, 1]
plot_roc("RandomForest", probs)

featuresTest2 = [
    'revolving_utilization_of_unsecured_lines',
    'debt_ratio',
    'number_of_times90_days_late',
    'number_real_estate_loans_or_lines',
]
# Same columns for training and testing; copied so the two lists stay
# independent objects.
featuresTrain2 = list(featuresTest2)

clf = GradientBoostingClassifier()
clf.fit(train[featuresTrain2], train.serious_dlqin2yrs)
probs = clf.predict_proba(test[featuresTest2])[:, 1]
plot_roc("Your Classifier", probs)

# --- Probability -> score -------------------------------------------------
# Keep probabilities strictly inside (0, 1): an exact 0 would divide by zero
# and an exact 1 would give log(0), and the histogram cannot bin non-finite
# values.
eps = np.finfo(float).eps
bounded_probs = np.clip(probs, eps, 1 - eps)

# odds = (1 - p) / p. The score is 40 * log2(odds) + 340, so even odds map to
# 340 and every doubling of the odds adds 40 points.
odds = (1 - bounded_probs) / bounded_probs
score = np.log(odds) * (40 / np.log(2)) + 340

# Separate figure plus an explicit show(), otherwise the histogram is never
# displayed when this runs as a script.
pl.figure()
pl.hist(score)
pl.show()

Comments

Popular posts from this blog

Python 12-22

Python 22-39