Spark Python credit
In [1]:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithSGD
In [2]:
from pyspark import SparkContext
# Create a SparkContext running in local (single-machine) mode,
# with the application name 'pyspark'.
sc = SparkContext( 'local', 'pyspark')
In [3]:
# Load the credit data CSV as an RDD of raw text lines (one record per line).
inputData = sc.textFile("./creditdata.csv")
In [4]:
# Count the number of input records (triggers a Spark job).
inputData.count()
Out[4]:
In [5]:
# Peek at the first five raw CSV lines to sanity-check the data format.
inputData.take(5)
Out[5]:
In [6]:
def parseData(line):
    """
    Parse one CSV line of the credit data into an MLlib LabeledPoint.

    Assumed column layout (inferred from the indexing below — TODO confirm
    against creditdata.csv): values[0] is ignored (presumably a row id),
    values[1] is the "Yes"/"No" label, values[2] is a "Yes"/"No" feature,
    and the remaining columns are numeric strings.

    Returns:
        LabeledPoint(label, features): label is 1 for "Yes", 0 otherwise;
        every feature is converted to float so the vector is numeric.
    """
    values = line.split(',')
    # Encode the "Yes"/"No" label as 1/0 (MLlib requires numeric labels).
    label = 1 if values[1] == "Yes" else 0
    # Encode the "Yes"/"No" feature column the same way.
    values[2] = 1 if values[2] == "Yes" else 0
    # Convert all feature columns to float — the raw split yields strings,
    # which MLlib vectors cannot use directly.
    features = [float(v) for v in values[2:]]
    return LabeledPoint(label, features)
In [7]:
# Parse every raw line into a LabeledPoint, ready for model training.
modelInput = inputData.map(parseData)
In [8]:
# Inspect the first five parsed LabeledPoints.
modelInput.take(5)
Out[8]:
In [9]:
# Number of SGD iterations for training (kept small for a quick demo run).
numIters = 3
In [10]:
# Train a logistic regression model via stochastic gradient descent.
LRmodel = LogisticRegressionWithSGD.train(modelInput, numIters)
In [11]:
# Report the learned model parameters.
print("Final weights: " + str(LRmodel.weights))
print("Final intercept: " + str(LRmodel.intercept))
In [12]:
# Evaluate the model on training data
# Pair each point's true label with the model's prediction on its features.
labelsAndPreds = modelInput.map(lambda p: (p.label, LRmodel.predict(p.features)))
print(labelsAndPreds.count())
#print(labelsAndPreds)
#print(labelsAndPreds.take(10))
In [13]:
# Keep only the misclassified points (true label != predicted label).
Errors = labelsAndPreds.filter(lambda vp: vp[0] != float(vp[1]))
# Count once and reuse: each .count() call re-triggers a full Spark job,
# so the original code recomputed the error count twice.
numErrors = Errors.count()
print(numErrors)
# float() guards against integer division if this runs under Python 2
# (the notebook's own commented-out line used the same cast).
trainErr = float(numErrors) / labelsAndPreds.count()
print("Training Error = " + str(trainErr))
In [14]:
# For each point, build (prediction, true label, features) for error analysis.
checkData = modelInput.map(lambda p: (LRmodel.predict(p.features),p.label, p.features))
#checkNotCorrect = checkData.filter(lambda p,v,f : v!=p)
# Keep only the misclassified points (true label x[1] != prediction x[0]).
checkNotCorrect = checkData.filter(lambda x : x[1] != x[0])
#print(checkNotCorrect)
checkNotCorrect.count()
Out[14]:
In [15]:
# Keep only the correctly classified points and count them.
checkCorrect = checkData.filter(lambda x : x[1] == x[0])
checkCorrect.count()
Out[15]:
In [ ]:
# Save a 10% sample (without replacement) of the misclassified points to disk.
checkNotCorrect.sample(False,0.1).saveAsTextFile("LR-notCorrect1")
In [ ]:
# Save a 1% sample (without replacement) of the correctly classified points.
checkCorrect.sample(False,0.01).saveAsTextFile("LR-Correct1")
In [ ]:
In [ ]:
Comments
Post a Comment