Spark Python credit


LR-creditdata
In [1]:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithSGD
In [2]:
# Create a Spark driver running in local (single-JVM) mode.
from pyspark import SparkContext

sc = SparkContext(master='local', appName='pyspark')
In [3]:
inputData = sc.textFile("./creditdata.csv")
In [4]:
inputData.count()
Out[4]:
10000
In [5]:
inputData.take(5)
Out[5]:
['1,No,No,729.5264952073,44361.6250742669',
 '2,No,Yes,817.1804065555,12106.1347003149',
 '3,No,No,1073.5491640117,31767.1389473999',
 '4,No,No,529.2506047453,35704.4939350781',
 '5,No,No,785.6558829305,38463.4958787229']
In [6]:
def parseData(line):
    """
    Parse one CSV line of text into an MLlib LabeledPoint.

    Expected line layout (see Out[5]): id, "Yes"/"No" label,
    "Yes"/"No" flag, followed by numeric fields.

    The label ("Yes" -> 1, "No" -> 0) comes from column 1; the
    features are column 2 (encoded 1/0) plus the remaining numeric
    columns, which LabeledPoint converts to floats.
    """
    values = line.split(',')
    # Encode the "Yes"/"No" categorical columns as 1/0 for MLlib.
    label = 1 if values[1] == "Yes" else 0
    flag = 1 if values[2] == "Yes" else 0
    return LabeledPoint(label, [flag] + values[3:])
In [7]:
modelInput = inputData.map(parseData)
In [8]:
modelInput.take(5)
Out[8]:
[LabeledPoint(0.0, [0.0,729.526495207,44361.6250743]),
 LabeledPoint(0.0, [1.0,817.180406555,12106.1347003]),
 LabeledPoint(0.0, [0.0,1073.54916401,31767.1389474]),
 LabeledPoint(0.0, [0.0,529.250604745,35704.4939351]),
 LabeledPoint(0.0, [0.0,785.65588293,38463.4958787])]
In [9]:
numIters = 3
In [10]:
LRmodel = LogisticRegressionWithSGD.train(modelInput, numIters)
/usr/local/spark/python/pyspark/mllib/classification.py:313: UserWarning: Deprecated in 2.0.0. Use ml.classification.LogisticRegression or LogisticRegressionWithLBFGS.
  "Deprecated in 2.0.0. Use ml.classification.LogisticRegression or "
In [11]:
print("Final weights: " + str(LRmodel.weights))
print("Final intercept: " + str(LRmodel.intercept))
Final weights: [-0.1165171392,-280.36127383,-14120.8645054]
Final intercept: 0.0
In [12]:
# Evaluate the model on training data:
# pair each point's true label with the model's prediction for its features.
labelsAndPreds = modelInput.map(lambda p: (p.label, LRmodel.predict(p.features)))
print(labelsAndPreds.count())
#print(labelsAndPreds)
#print(labelsAndPreds.take(10))
10000
In [13]:
# Misclassified points: true label disagrees with the (integer) prediction.
Errors = labelsAndPreds.filter(lambda vp: vp[0] != float(vp[1]))
# Count each RDD once and reuse the values — every .count() call
# re-triggers a full Spark job over the data.
numErrors = Errors.count()
numTotal = labelsAndPreds.count()
print(numErrors)
# float() guards against integer division if this ever runs under Python 2.
trainErr = numErrors / float(numTotal)
print("Training Error = " + str(trainErr))
333
Training Error = 0.0333
In [14]:
# Re-score every point, keeping (prediction, label, features) for inspection.
checkData = modelInput.map(lambda p: (LRmodel.predict(p.features),p.label,  p.features))
#checkNotCorrect = checkData.filter(lambda p,v,f : v!=p)
# Misclassified points: true label (x[1]) differs from prediction (x[0]).
checkNotCorrect = checkData.filter(lambda x : x[1] != x[0])
#print(checkNotCorrect)
checkNotCorrect.count()
Out[14]:
333
In [15]:
# Correctly classified points: prediction (x[0]) matches true label (x[1]).
checkCorrect = checkData.filter(lambda x : x[1] == x[0])
checkCorrect.count()
Out[15]:
9667
In [ ]:
checkNotCorrect.sample(False,0.1).saveAsTextFile("LR-notCorrect1")
In [ ]:
checkCorrect.sample(False,0.01).saveAsTextFile("LR-Correct1")
In [ ]:
 
In [ ]:
 

Comments

Popular posts from this blog

Python 12-22

Python 22-39

Machine Learning with Python example