PySpark notebook: K-Means clustering of the iris data set
In [1]:
import numpy as np
from pyspark.mllib.clustering import KMeans
In [2]:
from pyspark import SparkContext
# Create a local (single-JVM) Spark context; 'pyspark' is the application name.
sc = SparkContext( 'local', 'pyspark')
In [3]:
# Load the raw iris CSV as an RDD of text lines (one element per file line).
inputData = sc.textFile("./irisdata.csv")
In [20]:
# Notebook exploration: show the concrete RDD type of the loaded data.
type(inputData)
Out[20]:
In [4]:
# Number of lines (records) read from the CSV file.
inputData.count()
Out[4]:
In [5]:
# Peek at the first 10 raw CSV lines.
inputData.take(10)
Out[5]:
In [6]:
def parseData(line):
    """Parse one CSV line of the iris file into a float feature vector.

    Expected line format: "sepal_len,sepal_wid,petal_len,petal_wid,species".
    Returns a length-4 numpy float array of the four measurements; the
    trailing species label is ignored.
    """
    fields = line.split(",")
    # range(4) preserves the original behavior of raising on short rows
    # (rather than silently returning a shorter vector).
    return np.array([float(fields[i]) for i in range(4)])
In [7]:
# Transform every raw line into a 4-element numeric feature vector.
modelInput = inputData.map(parseData)
In [8]:
# Sanity check: parsed record count should match the raw line count.
modelInput.count()
Out[8]:
In [9]:
# Peek at the first 10 parsed feature vectors.
modelInput.take(10)
Out[9]:
In [10]:
# The iris data set contains three species, so look for three clusters.
numClusters = 3
In [11]:
# Train a K-Means model on the feature vectors with MLlib's default settings.
KMmodel = KMeans.train(modelInput, numClusters)
In [12]:
# Report the learned cluster centers and the total within-cluster cost
# (sum of squared distances to the nearest center; lower is tighter).
print("Final centers: " + str(KMmodel.clusterCenters))
print("Total Cost: " + str(KMmodel.computeCost(modelInput)))
In [13]:
def parseData2(line):
    """Parse one CSV iris line into a (species_label, feature_vector) pair.

    Unlike parseData, this keeps the 5th column (the species name) so the
    model's cluster assignments can later be compared with the true labels.
    Returns (label_str, length-4 numpy float array).
    """
    fields = line.split(",")
    features = np.array([float(fields[i]) for i in range(4)])
    return (fields[4], features)
In [14]:
# Re-parse the input, this time keeping the species label with each vector.
checkData = inputData.map(parseData2)
In [15]:
# Sanity check: same record count as the earlier parses.
checkData.count()
Out[15]:
In [16]:
# Peek at the first 10 (label, features) pairs.
checkData.take(10)
Out[16]:
In [17]:
# Pair each true species label with the cluster index the model assigns
# to that flower's feature vector.
labelsAndPreds = checkData.map(lambda p: (p[0], KMmodel.predict(p[1])) )
In [18]:
# Persist the (label, cluster) pairs as text output.
# NOTE(review): saveAsTextFile fails if the "KM-lap1" directory already
# exists — confirm it is removed before re-running this cell.
labelsAndPreds.saveAsTextFile("KM-lap1")
(End of notebook transcript; trailing blog-comment artifacts removed.)