Spark Python iris data


KM-irisdata
In [1]:
import numpy as np
from pyspark.mllib.clustering import KMeans
In [2]:
from pyspark import SparkConf, SparkContext

# Reuse the already-running context when this cell is executed again: building a
# second SparkContext in the same kernel raises ValueError, so getOrCreate keeps
# the cell idempotent. Master ('local') and app name ('pyspark') are unchanged.
sc = SparkContext.getOrCreate(SparkConf().setMaster('local').setAppName('pyspark'))
In [3]:
# Read the iris CSV as an RDD of raw text lines; the path is relative to the
# notebook's working directory.
inputData = sc.textFile("./irisdata.csv")
In [20]:
# Check what kind of object textFile handed back.
type(inputData)
Out[20]:
pyspark.rdd.RDD
In [4]:
# Number of lines (records) read from the file.
inputData.count()
Out[4]:
150
In [5]:
# Peek at the first ten raw lines to see the column layout before parsing.
inputData.take(10)
Out[5]:
['5.1,3.5,1.4,0.2,Iris-setosa',
 '4.9,3,1.4,0.2,Iris-setosa',
 '4.7,3.2,1.3,0.2,Iris-setosa',
 '4.6,3.1,1.5,0.2,Iris-setosa',
 '5,3.6,1.4,0.2,Iris-setosa',
 '5.4,3.9,1.7,0.4,Iris-setosa',
 '4.6,3.4,1.4,0.3,Iris-setosa',
 '5,3.4,1.5,0.2,Iris-setosa',
 '4.4,2.9,1.4,0.2,Iris-setosa',
 '4.9,3.1,1.5,0.1,Iris-setosa']
In [6]:
def parseData(line):
    """Convert one comma-separated record into a numeric feature vector.

    The line is split on commas and the first four fields are converted to
    floats, in column order. Any further fields (for example a trailing
    label) are ignored.

    Args:
        line: One text record, e.g. ``'5.1,3.5,1.4,0.2,Iris-setosa'``.

    Returns:
        A 1-D ``numpy`` array holding the four float values.

    Raises:
        IndexError: If the line has fewer than four comma-separated fields.
        ValueError: If one of the first four fields is not a valid float.
    """
    fields = line.split(",")
    features = [float(fields[column]) for column in range(4)]
    return np.array(features)
In [7]:
# Turn every raw line into a four-element feature array; parseData ignores the
# trailing label field, so this RDD holds numeric features only.
modelInput = inputData.map(parseData)
In [8]:
# Sanity check: map() should yield exactly one parsed record per input line.
modelInput.count()
Out[8]:
150
In [9]:
# Inspect the first ten parsed feature arrays.
modelInput.take(10)
Out[9]:
[array([ 5.1,  3.5,  1.4,  0.2]),
 array([ 4.9,  3. ,  1.4,  0.2]),
 array([ 4.7,  3.2,  1.3,  0.2]),
 array([ 4.6,  3.1,  1.5,  0.2]),
 array([ 5. ,  3.6,  1.4,  0.2]),
 array([ 5.4,  3.9,  1.7,  0.4]),
 array([ 4.6,  3.4,  1.4,  0.3]),
 array([ 5. ,  3.4,  1.5,  0.2]),
 array([ 4.4,  2.9,  1.4,  0.2]),
 array([ 4.9,  3.1,  1.5,  0.1])]
In [10]:
# Number of clusters (k) handed to KMeans.train.
# NOTE(review): presumably chosen to match the number of species in the data — confirm.
numClusters = 3
In [11]:
# Fit k-means on the feature vectors. The seed is fixed so that the random
# centroid initialisation (and with it the reported centers, cost and cluster
# numbering) is the same on every Restart & Run All.
KMmodel = KMeans.train(modelInput, numClusters, seed=42)
In [12]:
# Report the fitted centroids and the cost that computeCost returns for the
# training data. sep="" keeps the emitted text identical to plain string
# concatenation.
print("Final centers: ", KMmodel.clusterCenters, sep="")
print("Total Cost: ", KMmodel.computeCost(modelInput), sep="")
Final centers: [array([ 5.88360656,  2.74098361,  4.38852459,  1.43442623]), array([ 5.006,  3.418,  1.464,  0.244]), array([ 6.85384615,  3.07692308,  5.71538462,  2.05384615])]
Total Cost: 78.94506582597637
In [13]:
def parseData2(line):
    """Split one comma-separated record into ``(label, features)``.

    The fifth field is kept verbatim as the label; the first four fields are
    converted to floats and packed into a ``numpy`` array.

    Args:
        line: One text record, e.g. ``'5.1,3.5,1.4,0.2,Iris-setosa'``.

    Returns:
        A 2-tuple ``(label, features)`` where ``label`` is a string and
        ``features`` is a 1-D array of four floats.

    Raises:
        IndexError: If the line has fewer than five comma-separated fields.
        ValueError: If one of the first four fields is not a valid float.
    """
    fields = line.split(",")
    # Read the label first so a short record fails on the missing label
    # before any numeric conversion is attempted.
    label = fields[4]
    features = np.array([float(fields[column]) for column in range(4)])
    return (label, features)
In [14]:
# Parse the raw lines again, this time keeping the label next to the features
# as (label, features) pairs.
checkData = inputData.map(parseData2)
In [15]:
# Sanity check: one labelled record per input line.
checkData.count()
Out[15]:
150
In [16]:
# Inspect the first ten (label, features) pairs.
checkData.take(10)
Out[16]:
[('Iris-setosa', array([ 5.1,  3.5,  1.4,  0.2])),
 ('Iris-setosa', array([ 4.9,  3. ,  1.4,  0.2])),
 ('Iris-setosa', array([ 4.7,  3.2,  1.3,  0.2])),
 ('Iris-setosa', array([ 4.6,  3.1,  1.5,  0.2])),
 ('Iris-setosa', array([ 5. ,  3.6,  1.4,  0.2])),
 ('Iris-setosa', array([ 5.4,  3.9,  1.7,  0.4])),
 ('Iris-setosa', array([ 4.6,  3.4,  1.4,  0.3])),
 ('Iris-setosa', array([ 5. ,  3.4,  1.5,  0.2])),
 ('Iris-setosa', array([ 4.4,  2.9,  1.4,  0.2])),
 ('Iris-setosa', array([ 4.9,  3.1,  1.5,  0.1]))]
In [17]:
# Pair every label with the cluster index the fitted model assigns to its
# feature vector; mapValues leaves the key (the label) untouched.
labelsAndPreds = checkData.mapValues(lambda features: KMmodel.predict(features))
In [18]:
import shutil

# saveAsTextFile refuses to write into an existing directory, so a second run
# of the notebook would fail here; remove the previous output first.
# NOTE(review): assumes "KM-lap1" resolves to the local filesystem (the master
# is 'local') — confirm before pointing this at a shared location.
shutil.rmtree("KM-lap1", ignore_errors=True)
labelsAndPreds.saveAsTextFile("KM-lap1")

Comments

Post a Comment

Popular posts from this blog

Python 12-22

Machine Learning with Python example

Spark Python credit