機械学習

線形回帰モデル
◦ 回帰モデルは、数値を出力するモデルである。
◦ 与えられたデータから関係式を仮定して、データに最も当てはまる係数を求める。
◦ インプットを説明変数、アウトプットを目的変数という。
◦ 最小二乗法を用いて、回帰係数と切片を計算する。
import numpy as np
import matplotlib.pyplot as plt
def reg1dim2(x, y):
    """Fit the line ``y = a * x + b`` to the data by ordinary least squares.

    Args:
        x: 1-D sequence (list, tuple or array) of explanatory-variable values.
        y: 1-D sequence of target values with the same length as ``x``.

    Returns:
        Tuple ``(a, b)``: the slope ``a`` and the intercept ``b``.

    Raises:
        ValueError: If the inputs are empty or differ in length.

    Note:
        When every value of ``x`` is identical the slope is undefined; the
        denominator below is then zero (or nearly zero).
    """
    # Accept plain Python sequences as well as arrays; working in float also
    # keeps x**2 from overflowing a fixed-width integer dtype.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    n = len(x)
    if n == 0 or len(y) != n:
        raise ValueError("x and y must be non-empty and of equal length")
    # Closed-form least-squares solution:
    #   a = (sum(x*y) - sum(x)*sum(y)/n) / (sum(x^2) - sum(x)^2/n)
    a = ((np.dot(x, y) - y.sum() * x.sum() / n) /
         ((x**2).sum() - x.sum()**2 / n))
    # The fitted line passes through the point (mean(x), mean(y)).
    b = (y.sum() - a * x.sum()) / n
    return a, b
# Five sample observations (explanatory variable x, target y).
x = np.array([1, 2, 4, 6, 7])
y = np.array([1, 3, 3, 5, 4])

# Slope and intercept returned by reg1dim2.
a, b = reg1dim2(x, y)

# End points of the fitted line, drawn from x = 0 to the largest sample x.
xmax = x.max()
line_x = [0, xmax]
line_y = [b, a * xmax + b]

# Samples as black dots, fitted line in black.
plt.scatter(x, y, color="k")
plt.plot(line_x, line_y, color="k")
plt.show()
結果から、線形回帰モデルの出力は入力に対して直線になるため、非線形な関係を持つ複雑なデータの解析には向かないことが読み取れる。

非線形回帰モデル
◦ 線形回帰モデルでは目的変数と説明変数の関係を直線で表したが、説明変数に多項式などの非線形な関数（基底関数）を適用し、曲線的な関係を表現できるようにしたモデルを非線形回帰モデルという（説明変数が複数あるだけの線形モデルは重回帰と呼ばれ、非線形回帰とは区別される）。
◦ 回帰係数は、予測値と目的変数の二乗誤差が最小になるように推定される。
◦ 回帰係数が大きくなりすぎないように正則化するモデルに、ラッソ回帰やリッジ回帰がある。
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score


def true_fun(X):
    """Return the noise-free target ``cos(1.5 * pi * X)`` for input ``X``."""
    return np.cos(1.5 * np.pi * X)


# The data, the helper function and the figure must exist before the
# plotting loop runs, so the setup comes first.
np.random.seed(0)  # fixed seed so the random samples are reproducible
n_samples = 30
degrees = [5]  # polynomial degrees to fit; one subplot per entry

# Sorted random inputs and targets with Gaussian noise (scale 0.1) added.
X = np.sort(np.random.rand(n_samples))
y = true_fun(X) + np.random.randn(n_samples) * 0.1

plt.figure(figsize=(14, 5))
for i, degree in enumerate(degrees):
    ax = plt.subplot(1, len(degrees), i + 1)
    plt.setp(ax, xticks=(), yticks=())

    # Polynomial feature expansion followed by ordinary linear regression.
    polynomial_features = PolynomialFeatures(degree=degree,
                                             include_bias=False)
    linear_regression = LinearRegression()
    pipeline = Pipeline([("polynomial_features", polynomial_features),
                         ("linear_regression", linear_regression)])
    # X is 1-D; np.newaxis turns it into a single-column 2-D array.
    pipeline.fit(X[:, np.newaxis], y)

    # Evaluate the model using 10-fold cross-validation. The scorer returns
    # the negated MSE, so the sign is flipped again for display below.
    scores = cross_val_score(pipeline, X[:, np.newaxis], y,
                             scoring="neg_mean_squared_error", cv=10)

    X_test = np.linspace(0, 1, 100)
    plt.plot(X_test, pipeline.predict(X_test[:, np.newaxis]), label="Model")
    plt.plot(X_test, true_fun(X_test), label="True function")
    plt.scatter(X, y, edgecolor='b', s=20, label="Samples")
    plt.xlabel("x")
    plt.ylabel("y")
    plt.xlim((0, 1))
    plt.ylim((-2, 2))
    plt.legend(loc="best")
    # "\n" puts the MSE on a second title line (the backslash had been lost).
    plt.title("Degree {}\nMSE = {:.2e}(+/- {:.2e})".format(
        degree, -scores.mean(), scores.std()))
plt.show()
◦ 線形回帰が直線だったのに対して、非線形回帰は曲線になることがわかる。
◦ より複雑なモデルに対応することができる。

ロジスティック回帰モデル
◦ ロジスティック回帰モデルは、カテゴリのデータを扱うアルゴリズムである。
◦ ロジスティック回帰関数の目的は、確率を推定することによって、入力変数と出力変数の間の関係性を見つけ出すことである。
◦ データサンプルがあるカテゴリに属する確率を求め、その確率に基づいて分類するモデルである。
◦ 交差エントロピー誤差関数が最小になるように学習する。
ロジスティック回帰は、名前に「回帰」と付くが分類モデルなので、注意が必要である。
境界線を境に、出力結果（予測されるクラス）が変わっていることがわかる。
コードは次のページ

ロジスティック回帰モデル
import numpy as np
import seaborn as sns
from sklearn import datasets
# Load iris and keep only the first two feature columns.
iris = datasets.load_iris()
X = iris.data[:, :2]
# Binary target: 0 where the original target is 0, otherwise 1.
y = (iris.target != 0) * 1

# One scatter series per class: class 0 in blue, class 1 in red.
for cls, colour in ((0, 'b'), (1, 'r')):
    members = X[y == cls]
    plt.scatter(members[:, 0], members[:, 1], color=colour, label=str(cls))
plt.legend()
class LogisticRegression:
    """Binary logistic regression trained with batch gradient descent.

    Args:
        lr: Step size applied to the gradient in every update.
        num_iter: Number of gradient-descent iterations performed by ``fit``.
        fit_intercept: When true, a column of ones is prepended to the
            features so that ``theta[0]`` acts as the intercept.
        verbose: When true, ``fit`` prints the loss every 10000 iterations.

    After ``fit`` the learned weights are available as ``self.theta``.
    """

    def __init__(self, lr=0.01, num_iter=100000, fit_intercept=True,
                 verbose=False):
        self.lr = lr
        self.num_iter = num_iter
        self.fit_intercept = fit_intercept
        self.verbose = verbose

    def __add_intercept(self, X):
        """Return ``X`` with a leading column of ones."""
        intercept = np.ones((X.shape[0], 1))
        return np.concatenate((intercept, X), axis=1)

    def __sigmoid(self, z):
        """Logistic function ``1 / (1 + exp(-z))``."""
        return 1 / (1 + np.exp(-z))

    def __loss(self, h, y):
        """Mean cross-entropy between probabilities ``h`` and labels ``y``."""
        # Keep h strictly inside (0, 1) so log() never receives 0 when the
        # sigmoid saturates.
        eps = np.finfo(float).eps
        h = np.clip(h, eps, 1 - eps)
        return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()

    def fit(self, X, y):
        """Learn ``self.theta`` from features ``X`` and 0/1 labels ``y``.

        Args:
            X: 2-D array-like; one row per sample, one column per feature.
            y: 1-D array-like of labels with one entry per row of ``X``.
        """
        # Accept plain sequences as well as arrays.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        if self.fit_intercept:
            X = self.__add_intercept(X)

        # weights initialization
        self.theta = np.zeros(X.shape[1])

        for i in range(self.num_iter):
            h = self.__sigmoid(np.dot(X, self.theta))
            # Gradient of the mean cross-entropy with respect to theta.
            gradient = np.dot(X.T, (h - y)) / y.size
            self.theta -= self.lr * gradient

            # The post-update loss is only needed for reporting, so it is
            # computed only on the iterations that actually print it.
            if self.verbose and i % 10000 == 0:
                h = self.__sigmoid(np.dot(X, self.theta))
                loss = self.__loss(h, y)
                print(f'loss: {loss} \t')

    def predict_prob(self, X):
        """Return the sigmoid of ``X @ theta`` for every row of ``X``."""
        X = np.asarray(X, dtype=float)
        if self.fit_intercept:
            X = self.__add_intercept(X)
        return self.__sigmoid(np.dot(X, self.theta))

    def predict(self, X):
        """Return the probabilities from ``predict_prob`` rounded to 0 or 1."""
        return self.predict_prob(X).round()
# Fit the model on X, y.
model = LogisticRegression(lr=0.1, num_iter=300000)
model.fit(X, y)

# Fraction of samples whose prediction equals the label. This is a
# notebook-style expression: when run as a script its value is discarded.
preds = model.predict(X)
(preds == y).mean()

# One scatter series per class: class 0 in blue, class 1 in red.
for cls, colour in ((0, 'b'), (1, 'r')):
    members = X[y == cls]
    plt.scatter(members[:, 0], members[:, 1], color=colour, label=str(cls))
plt.legend()

# Evaluate the predicted probability on a grid spanning the data range.
x1_min = X[:, 0].min()
x1_max = X[:, 0].max()
x2_min = X[:, 1].min()
x2_max = X[:, 1].max()
axis1 = np.linspace(x1_min, x1_max)
axis2 = np.linspace(x2_min, x2_max)
xx1, xx2 = np.meshgrid(axis1, axis2)
grid = np.c_[xx1.ravel(), xx2.ravel()]
probs = model.predict_prob(grid).reshape(xx1.shape)

# Draw the single contour where the probability equals 0.5.
plt.contour(xx1, xx2, probs, [0.5], linewidths=1, colors='black')
plt.show()

主成分分析
◦ 主成分分析（PCA）とは、元データの持つ情報をできるだけ失わずに変数の数を圧縮する手法であり、探索的分析の前処理や予測モデル構築時の前処理に使われる。
from sklearn.preprocessing import StandardScaler
import numpy as np
import scipy as sp
# Import the submodule explicitly so that `sp.stats` below is always available.
import scipy.stats
# Visualization library
import matplotlib.pyplot as plt

# Create a RandomState object (fixed seed 1)
sample = np.random.RandomState(1)

# Generate the data: a random 2x2 matrix times a 2x200 standard-normal sample,
# transposed to 200 rows of 2 values
X = np.dot(sample.rand(2, 2), sample.randn(2, 200)).T

# Standardize
sc = StandardScaler()
X_std = sc.fit_transform(X)

# Compute the correlation coefficient and plot the data
print('相関係数{:.3f}:'.format(sp.stats.pearsonr(X_std[:, 0],
                                               X_std[:, 1])[0]))
plt.scatter(X_std[:, 0], X_std[:, 1])
# Import
from sklearn.decomposition import PCA

# Principal component analysis with two components, fitted on X_std
pca = PCA(n_components=2)
pca.fit(X_std)

# Arrow styling passed to annotate() when the components are drawn
arrowprops = {
    'arrowstyle': '->',
    'linewidth': 2,
    'shrinkA': 0,
    'shrinkB': 0,
}
# Helper for drawing an arrow
def draw_vector(v0, v1):
    """Draw an arrow from point ``v0`` to point ``v1`` on the current axes.

    The arrow style is taken from the module-level ``arrowprops`` dict.
    """
    # annotate(text, xy, xytext): the arrow starts at xytext (v0) and points
    # to xy (v1); the empty text means only the arrow is drawn.
    plt.gca().annotate('', v1, v0,
                       arrowprops=arrowprops)
# Plot the original (standardized) data
plt.scatter(X_std[:, 0], X_std[:, 1], alpha=0.2)

# Show the two principal axes as arrows
for length, vector in zip(pca.explained_variance_,
                          pca.components_):
    # Scale each component by 3 * sqrt(its explained variance).
    v = vector * 3 * np.sqrt(length)
    draw_vector(pca.mean_, pca.mean_ + v)
plt.axis('equal')
plt.show()
主成分分析のアルゴリズム
1）全データの重心（平均値）を算出
2）重心からデータの分散（ばらつき）が最大となる方
向（第1主成分）を算出
3）第1主成分と直角に交わる（直交）方向で分散が最
大となる箇所（第2主成分）を算出
4）直近の主成分と直交する方向で分散が最大となる箇
所（第3主成分）を算出
5）4）をデータの次元分だけ繰り返す
◦ 第1主成分と第2主成分を矢印で可視化できた。

アルゴリズム
◦ 教師あり学習のアルゴリズムは下記の種類がある。
◦ 単回帰
◦ 重回帰
◦ ロジスティック回帰
◦ K近傍法
◦ 決定木
◦ サポートベクターマシン
◦ ランダムフォレスト
◦ 勾配ブースティング
◦ 教師なし学習のアルゴリズムは下記の種類がある。
◦ クラスタリング
# k-means法を使うためのインポート
from sklearn.cluster import KMeans
# データ取得のためのインポート
from sklearn.datasets import make_blobs
# Generate sample data.
# Note: make_blobs returns two values; the one that is not used is
# received in "_".
X, _ = make_blobs(random_state=10)

# Draw the scatter plot.
# The color option sets the marker color.
plt.scatter(X[:, 0], X[:, 1], color='black')
# リッジ回帰用のクラス
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
# Split into training data and test data.
# NOTE(review): `auto` is not defined in the visible source; presumably a
# DataFrame with a 'price' column loaded elsewhere — confirm.
X = auto.drop('price', axis=1)
y = auto['price']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0)

# Build and evaluate the models
linear = LinearRegression()
ridge = Ridge(random_state=0)

for model in [linear, ridge]:
    model.fit(X_train, y_train)
    # Print the score on the training half and on the held-out half.
    print('{}(train):{:.6f}'.format(model.__class__.__name__,
                                    model.score(X_train, y_train)))
    print('{}(test):{:.6f}'.format(model.__class__.__name__,
                                   model.score(X_test, y_test)))
scikit-learn（sklearn）のライブラリを使うことで、さまざまな種類のアルゴリズムを使用することができることがわかる。
教師あり学習の方がアルゴリズムが多彩である。

サポートベクターマシン
◦ サポートベクターマシン（SVM）は、カテゴリを識別する境界線を、マージンが最大となるように引くアルゴリズムである。
◦ 多次元空間における分離超平面を学習し、データの分類を行う。
◦ 決定境界（線形判別関数）と、それに最も近いデータ点との距離をマージンと言う。
# データやモデルを構築するためのライブラリ等のインポート
from sklearn.datasets import load_breast_cancer
# SVMのライブラリ
from sklearn.svm import LinearSVC
# 訓練データとテストデータを分けるライブラリ
from sklearn.model_selection import train_test_split
# Load the data
cancer = load_breast_cancer()

# Split into training data and test data, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    stratify=cancer.target,
    random_state=0,
)

# Initialise the classifier and train it
model = LinearSVC()
model.fit(X_train, y_train)

# Scores on the training data and on the test data
train_score = model.score(X_train, y_train)
print(f'正解率(train):{train_score:.3f}')
test_score = model.score(X_test, y_test)
print(f'正解率(test):{test_score:.3f}')
◦ 訓練データの正解率は、93.2%
◦ テストデータの正解率は、92.3%
◦ 同じくらいの正解率となった。

機械学習

Recommended

Recommended

More Related Content

What's hot

What's hot (20)

Similar to 機械学習

Similar to 機械学習 (20)

Recently uploaded

Recently uploaded (15)

機械学習