Lab Machine Learning 2 Klasifikasi
#install spark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-2.4.5/spark-2.4.5-
bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark
# Point the JVM and Spark at the freshly installed locations.
import os

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"

# Make the Spark installation importable, then start a local session
# that uses every available core.
import findspark
findspark.init()

from pyspark.sql import SparkSession

spark = (
    SparkSession
    .builder
    .master("local[*]")
    .getOrCreate()
)
#upload file
from google.colab import files
!rm contoh_clust2.csv
files.upload() #contoh_clust.csv
#load data latih
datalatih=spark.read.csv('contoh_clust2.csv',inferSchema=True,header=
True)
# Peek at the first training rows.
datalatih.show(3)

# Collect each column back to the driver as a plain Python list so
# matplotlib can plot it.
import matplotlib.pyplot as plt

def _kolom(df, nama):
    # flatMap unwraps single-field Row objects into bare values
    return df.select(nama).rdd.flatMap(lambda baris: baris).collect()

x = _kolom(datalatih, "x")
y = _kolom(datalatih, "y")
hasil = _kolom(datalatih, "hasil")

# Scatter the training points, colored by their class label.
plt.scatter(x, y, c=hasil, cmap='RdBu');
# Build the classifier. The commented-out estimators below are drop-in
# alternatives for experimentation: uncomment one pair and comment out
# LogisticRegression.
from pyspark.ml.classification import LogisticRegression

# NOTE(review): elasticNetParam only has an effect when regParam > 0;
# with the default regParam of 0.0 there is no regularization — confirm
# whether a regParam was intended here.
mesin = LogisticRegression(
    labelCol='hasil',
    elasticNetParam=0.7,
)
#from pyspark.ml.classification import RandomForestClassifier
#mesin = RandomForestClassifier(labelCol='hasil')
#from pyspark.ml.classification import GBTClassifier
#mesin = GBTClassifier(labelCol='hasil')
# Re-inspect the raw training rows before transformation.
datalatih.show(3)

# Spark ML estimators expect all inputs in one vector column; pack the
# (x, y) pair into a 'features' column.
from pyspark.ml.feature import VectorAssembler

perakit = VectorAssembler(inputCols=['x', 'y'], outputCol='features')
datalatih_vec = perakit.transform(datalatih)

# Confirm the assembled 'features' column looks right.
datalatih_vec.show(3)

# Fit the classifier on the assembled training data.
model = mesin.fit(datalatih_vec)
# Build 2000 random 2-D test points covering x in [-8, 4) and y in [-4, 8),
# so the model's decision regions can be visualized over the whole plane.
import numpy as np

rng = np.random.RandomState(0)  # fixed seed for reproducible sampling
offset = np.array([-8, -4])
scale = np.array([12, 12])
xuji = offset + scale * rng.rand(2000, 2)
arr = xuji.tolist()
# Adapt the sample points into a Spark DataFrame for the model.
# NOTE(review): DoubleType is imported but never used anywhere in this
# notebook — candidate for removal.
from pyspark.sql.types import DoubleType
# Columns of the list-of-pairs are auto-named _1 and _2 by Spark.
dftest = spark.createDataFrame(arr)
# Inspect the first few rows.
dftest.show(3)
# Pack the auto-named columns _1 and _2 into the 'features' vector that
# the trained model expects. (Separate names avoid rebinding one variable
# from assembler to DataFrame.)
perakit_uji = VectorAssembler(inputCols=['_1', '_2'], outputCol='features')
testvec = perakit_uji.transform(dftest)

# Inspect a few assembled rows.
testvec.show(3)

# Score every sample point with the trained model.
prediksi = model.transform(testvec)

# Inspect the prediction output.
prediksi.show(3)
# Visualize the predictions: pull the grid coordinates and predicted
# labels back to the driver as plain lists.
soalx = prediksi.select("_1").rdd.flatMap(lambda x: x).collect()
soaly = prediksi.select("_2").rdd.flatMap(lambda x: x).collect()
# Fix: the original hard-wrapped `.collect` and `()` onto separate lines,
# which assigned the bound method object instead of calling it.
pred = prediksi.select("prediction").rdd.flatMap(lambda x: x).collect()

# Overlay: opaque training points on top of translucent predicted
# decision regions.
plt.scatter(x, y, c=hasil)
plt.scatter(soalx, soaly, c=pred, alpha=0.1)

Lab Machine Learning 2 — Klasifikasi (Classification)

  • 1.
    Lab Machine Learning2 Klasifikasi #install spark !apt-get install openjdk-8-jdk-headless -qq > /dev/null !wget -q https://downloads.apache.org/spark/spark-2.4.5/spark-2.4.5- bin-hadoop2.7.tgz !tar xf spark-2.4.5-bin-hadoop2.7.tgz !pip install -q findspark #set environment import os os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64" os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7" #inisialisasi import findspark findspark.init() from pyspark.sql import SparkSession spark = SparkSession.builder.master("local[*]").getOrCreate() #upload file from google.colab import files !rm contoh_clust2.csv files.upload() #contoh_clust.csv #load data latih datalatih=spark.read.csv('contoh_clust2.csv',inferSchema=True,header= True) #lihat data datalatih.show(3) #visualisasikan data pelatihan import matplotlib.pyplot as plt x = datalatih.select("x").rdd.flatMap(lambda x: x).collect() y = datalatih.select("y").rdd.flatMap(lambda x: x).collect() hasil = datalatih.select("hasil").rdd.flatMap(lambda x: x).collect() plt.scatter(x, y,c=hasil,cmap='RdBu'); #buat obyek mesin learning LogisticRegression from pyspark.ml.classification import LogisticRegression mesin = LogisticRegression(labelCol='hasil',elasticNetParam=0.7) #from pyspark.ml.classification import RandomForestClassifier #mesin = RandomForestClassifier(labelCol='hasil') #from pyspark.ml.classification import GBTClassifier #mesin = GBTClassifier(labelCol='hasil') #periksa data awal
  • 2.
    datalatih.show(3) #persiapkan data agardapat digunakan untuk pelatihan from pyspark.ml.feature import VectorAssembler vec=VectorAssembler(inputCols=['x','y'],outputCol='features') datalatih_vec = vec.transform(datalatih) #periksa hasilnya datalatih_vec.show(3) #lakukan pelatihan model = mesin.fit(datalatih_vec) #lakukan pengujian dengan membuat 2000 sample data import numpy as np rng = np.random.RandomState(0) xuji= [-8, -4] + [12, 12] * rng.rand(2000, 2) arr = xuji.tolist() #sesuaikan format untuk inputan model from pyspark.sql.types import DoubleType dftest = spark.createDataFrame(arr) #periksa beberapa data awal dftest.show(3) #sesuaikan format untuk inputan model testvec = VectorAssembler(inputCols=['_1','_2'],outputCol='features') testvec = testvec.transform(dftest) #periksa beberapa data awal testvec.show(3) #lakukan prediksi prediksi = model.transform(testvec) #periksa hasil prediksi prediksi.show(3) #visualisasikan prediksi soalx = prediksi.select("_1").rdd.flatMap(lambda x: x).collect() soaly = prediksi.select("_2").rdd.flatMap(lambda x: x).collect() pred = prediksi.select("prediction").rdd.flatMap(lambda x: x).collect () plt.scatter(x,y,c=hasil) plt.scatter(soalx,soaly,c=pred,alpha=0.1)