Spark Python credit
In [1]:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithSGD
In [2]:
from pyspark import SparkContext
# Create a SparkContext running in local (single-machine) mode,
# with the application name 'pyspark'.
sc = SparkContext( 'local', 'pyspark')
In [3]:
# Load the credit data CSV as an RDD of raw text lines (one record per line).
inputData = sc.textFile("./creditdata.csv")
In [4]:
# Count the number of input records (triggers a Spark job).
inputData.count()
Out[4]:
In [5]:
# Peek at the first five raw CSV lines to sanity-check the data format.
inputData.take(5)
Out[5]:
In [6]:
def parseData(line):
    """
    Parse one CSV line of the credit data into an MLlib LabeledPoint.

    Assumed column layout (inferred from the indexing below — TODO confirm
    against creditdata.csv): values[0] is ignored (presumably a row id),
    values[1] is the "Yes"/"No" label, values[2] is a "Yes"/"No" feature,
    and the remaining columns are numeric strings.

    Returns:
        LabeledPoint(label, features): label is 1 for "Yes", 0 otherwise;
        every feature is converted to float so the vector is numeric.
    """
    values = line.split(',')
    # Encode the "Yes"/"No" label as 1/0 (MLlib requires numeric labels).
    label = 1 if values[1] == "Yes" else 0
    # Encode the "Yes"/"No" feature column the same way.
    values[2] = 1 if values[2] == "Yes" else 0
    # Convert all feature columns to float — the raw split yields strings,
    # which MLlib vectors cannot use directly.
    features = [float(v) for v in values[2:]]
    return LabeledPoint(label, features)
In [7]:
# Parse every raw line into a LabeledPoint, ready for model training.
modelInput = inputData.map(parseData)
In [8]:
# Inspect the first five parsed LabeledPoints.
modelInput.take(5)
Out[8]:
In [9]:
# Number of SGD iterations for training (kept small for a quick demo run).
numIters = 3
In [10]:
# Train a logistic regression model via stochastic gradient descent.
LRmodel = LogisticRegressionWithSGD.train(modelInput, numIters)
In [11]:
# Report the learned model parameters.
print("Final weights: " + str(LRmodel.weights))
print("Final intercept: " + str(LRmodel.intercept))
In [12]:
# Evaluate the model on training data
# Pair each point's true label with the model's prediction on its features.
labelsAndPreds = modelInput.map(lambda p: (p.label, LRmodel.predict(p.features)))
print(labelsAndPreds.count())
#print(labelsAndPreds)
#print(labelsAndPreds.take(10))
In [13]:
# Keep only the misclassified points (true label != predicted label).
Errors = labelsAndPreds.filter(lambda vp: vp[0] != float(vp[1]))
# Count once and reuse: each .count() call re-triggers a full Spark job,
# so the original code recomputed the error count twice.
numErrors = Errors.count()
print(numErrors)
# float() guards against integer division if this runs under Python 2
# (the notebook's own commented-out line used the same cast).
trainErr = float(numErrors) / labelsAndPreds.count()
print("Training Error = " + str(trainErr))
In [14]:
# For each point, build (prediction, true label, features) for error analysis.
checkData = modelInput.map(lambda p: (LRmodel.predict(p.features),p.label, p.features))
#checkNotCorrect = checkData.filter(lambda p,v,f : v!=p)
# Keep only the misclassified points (true label x[1] != prediction x[0]).
checkNotCorrect = checkData.filter(lambda x : x[1] != x[0])
#print(checkNotCorrect)
checkNotCorrect.count()
Out[14]:
In [15]:
# Keep only the correctly classified points and count them.
checkCorrect = checkData.filter(lambda x : x[1] == x[0])
checkCorrect.count()
Out[15]:
In [ ]:
# Save a 10% sample (without replacement) of the misclassified points to disk.
checkNotCorrect.sample(False,0.1).saveAsTextFile("LR-notCorrect1")
In [ ]:
# Save a 1% sample (without replacement) of the correctly classified points.
checkCorrect.sample(False,0.01).saveAsTextFile("LR-Correct1")
In [ ]:
In [ ]:
Comments
Post a Comment