# Lab Machine Learning 2 — Classification (Klasifikasi)
#install spark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-2.4.5/spark-2.4.5-
bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark
# Point Spark at the Java runtime and the unpacked Spark distribution
# so findspark/pyspark can locate them.
import os

os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-2.4.5-bin-hadoop2.7",
})
# Initialize Spark: findspark adds pyspark to sys.path, then we start
# (or reuse) a local session that uses every available core.
import findspark

findspark.init()

from pyspark.sql import SparkSession

builder = SparkSession.builder.master("local[*]")
spark = builder.getOrCreate()
#upload file
from google.colab import files
!rm contoh_clust2.csv
files.upload() #contoh_clust.csv
# Load the training data: CSV with a header row, column types inferred.
# (The original statement was broken across two lines by a copy/paste
# artifact, which is a syntax error — rejoined here.)
datalatih = spark.read.csv('contoh_clust2.csv', inferSchema=True, header=True)
# Preview the first few rows
datalatih.show(3)
# Visualize the training data: collect each column back to the driver as a
# plain Python list, then scatter-plot x vs y colored by the 'hasil' label.
# (x, y, hasil are reused later to overlay the predictions plot.)
import matplotlib.pyplot as plt
x = datalatih.select("x").rdd.flatMap(lambda x: x).collect()
y = datalatih.select("y").rdd.flatMap(lambda x: x).collect()
hasil = datalatih.select("hasil").rdd.flatMap(lambda x: x).collect()
plt.scatter(x, y,c=hasil,cmap='RdBu');
# Create the ML estimator: logistic regression predicting the 'hasil' column.
# elasticNetParam=0.7 is the L1/L2 regularization mix (0 = pure L2, 1 = pure L1).
from pyspark.ml.classification import LogisticRegression
mesin = LogisticRegression(labelCol='hasil',elasticNetParam=0.7)
# Alternative classifiers for this lab — uncomment one pair to swap it in:
#from pyspark.ml.classification import RandomForestClassifier
#mesin = RandomForestClassifier(labelCol='hasil')
#from pyspark.ml.classification import GBTClassifier
#mesin = GBTClassifier(labelCol='hasil')
# Re-check the first rows of the training data before feature assembly.
# (Removed a stray "2." — a page-number extraction artifact that evaluated
# as a no-op float literal.)
datalatih.show(3)
# Prepare the data for training: Spark ML estimators expect a single vector
# column, so combine the raw 'x' and 'y' columns into 'features'.
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=['x', 'y'], outputCol='features')
datalatih_vec = assembler.transform(datalatih)

# Inspect the assembled result
datalatih_vec.show(3)
# Train the classifier on the assembled training data.
model = mesin.fit(datalatih_vec)
# Build a 2000x2 grid of test points: uniform samples with the first
# coordinate in [-8, 4) and the second in [-4, 8).
import numpy as np

rng = np.random.RandomState(0)  # fixed seed for reproducible samples
offset = np.array([-8, -4])
scale = np.array([12, 12])
xuji = offset + scale * rng.rand(2000, 2)
arr = xuji.tolist()
# Wrap the sample points in a Spark DataFrame; with list-of-lists input the
# columns get default names _1 and _2 (used by the assembler below).
# NOTE(review): DoubleType is imported but never used in the visible code —
# confirm it isn't needed further down before removing.
from pyspark.sql.types import DoubleType
dftest = spark.createDataFrame(arr)
# Preview the first few rows
dftest.show(3)
# Assemble the test columns into the 'features' vector the model expects.
# Use a distinct name for the assembler: the original rebound 'testvec' from
# estimator to DataFrame, which obscures what the variable holds.
test_assembler = VectorAssembler(inputCols=['_1', '_2'], outputCol='features')
testvec = test_assembler.transform(dftest)
# Preview the assembled test rows
testvec.show(3)
# Run the trained model on the test set; this adds a 'prediction' column.
prediksi = model.transform(testvec)
# Inspect a few predictions
prediksi.show(3)
# Visualize the predictions: training points (solid) overlaid with the
# test-grid predictions (faint, alpha=0.1) to reveal the decision boundary.
# (The 'pred' statement was broken across two lines by a copy/paste
# artifact, which is a syntax error — rejoined here.)
soalx = prediksi.select("_1").rdd.flatMap(lambda x: x).collect()
soaly = prediksi.select("_2").rdd.flatMap(lambda x: x).collect()
pred = prediksi.select("prediction").rdd.flatMap(lambda x: x).collect()
plt.scatter(x, y, c=hasil)
plt.scatter(soalx, soaly, c=pred, alpha=0.1)