Shingo Tsuji and Taro Yabuki, "Zero kara Hajimeru Data Science Nyumon" (Introduction to Data Science from Scratch), Kodansha, 2021
How to knit this document in RStudio
docker run -d -e PASSWORD=password -e ROOT=TRUE -p 8787:8787 -v "$(pwd)":/home/rstudio/work rocker/ml
reticulate::py_install(packages=c("lxml", "graphviz", "h2o", "keras", "matplotlib", "pmdarima", "pandas==1.5.3", "pandarallel", "pca", "prophet", "scikit-learn==1.1.3", "scipy", "seaborn", "statsmodels", "tensorflow", "xgboost"));
## Using virtual environment '/opt/venv' ...
## + /opt/venv/bin/python -m pip install --upgrade --no-user lxml graphviz h2o keras matplotlib pmdarima 'pandas==1.5.3' pandarallel pca prophet 'scikit-learn==1.1.3' scipy seaborn statsmodels tensorflow xgboost
# This is an example of Python code.
1 + 1
## 2
1 + 1
## 2
# 2 # This is not displayed.
print(1 + 2)
## 3
# 3 # This is displayed.
1 + 3
## 4
# 4 # This is displayed.
0x10
## 16
1.23e5
## 123000.0
2 * 3
## 6
10 / 3
## 3.3333333333333335
10 // 3 # integer quotient
## 3
10 % 3 # remainder
## 1
x = 2
y = 3
x * y
## 6
x, y = 20, 30 # assign multiple names at once
x * y
## 600
x = 1 + 1
# At this point the result is not displayed.
x # Evaluate the variable name.
## 2
my_s = 'abcde'
len(my_s)
## 5
'This is' ' a' ' pen.'
## 'This is a pen.'
# あるいは
'This is ' + 'a' + ' pen.'
## 'This is a pen.'
my_s[1:4]
## 'bcd'
tmp = "{} is {}."
tmp.format('This', 'a pen')
## 'This is a pen.'
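# For reference (my addition, not from the book): the same result with an
# f-string (Python 3.6+).
my_a, my_b = 'This', 'a pen'
f'{my_a} is {my_b}.'
## 'This is a pen.'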
1 <= 2
## True
1 < 0
## False
0.1 + 0.1 + 0.1 == 0.3
## False
import math
math.isclose(0.1 + 0.1 + 0.1, 0.3)
## True
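# Supplementary sketch (my addition): two other standard-library ways to
# avoid floating-point rounding problems in comparisons.
from fractions import Fraction
round(0.1 + 0.1 + 0.1, 10) == round(0.3, 10) # compare after rounding
## True
Fraction(1, 10) * 3 == Fraction(3, 10) # exact rational arithmetic
## True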
True and False # logical conjunction (AND)
## False
True or False # logical disjunction (OR)
## True
not True # negation (NOT)
## False
0 if 3 < 5 else 10
## 0
import os
os.getcwd()
## '/home/rstudio/work'
os.chdir('..')
os.getcwd()
## '/home/rstudio'
import math
math.sqrt(4)
## 2.0
math.log(100, 10)
## 2.0
math.log(100) # natural logarithm
## 4.605170185988092
# Alternatively
math.log(100, math.e) # with the base written explicitly
## 4.605170185988092
math.log10(100) # common (base-10) logarithm
## 2.0
math.log2(1024) # base-2 logarithm
## 10.0
def f(a, b):
return a - b
f(3, 5)
## -2
def f(a, b=5):
return a - b
f(3) # f(3, 5)と同じこと
## -2
(lambda a, b: a - b)(3, 5)
## -2
x = ['foo', 'bar', 'baz']
len(x)
## 3
x[1]
## 'bar'
x[1] = 'BAR'
x # Check the result.
## ['foo', 'BAR', 'baz']
x[1] = 'bar' # Restore the original value.
x[-2]
## 'bar'
x + ['qux']
## ['foo', 'bar', 'baz', 'qux']
x = x + ['qux']
# Alternatively
#x.append('qux')
x # Check the result.
## ['foo', 'bar', 'baz', 'qux']
list(range(5))
## [0, 1, 2, 3, 4]
list(range(0, 11, 2))
## [0, 2, 4, 6, 8, 10]
import numpy as np
np.arange(0, 1.1, 0.5)
## array([0. , 0.5, 1. ])
np.linspace(0, 100, 5)
## array([ 0., 25., 50., 75., 100.])
[10] * 5
## [10, 10, 10, 10, 10]
import numpy as np
x = np.array([2, 3, 5, 7])
x + 10 # 加算
## array([12, 13, 15, 17])
x * 10 # 乗算
## array([20, 30, 50, 70])
x = [2, 3]
np.sin(x)
## array([0.90929743, 0.14112001])
x = np.array([2, 3, 5, 7])
y = np.array([1, 10, 100, 1000])
x + y
## array([ 3, 13, 105, 1007])
x * y
## array([ 2, 30, 500, 7000])
np.dot(x, y)
## 7532
# あるいは
x @ y
## 7532
x = np.array([True, False])
y = np.array([True, True])
x & y
## array([ True, False])
u = np.array([1, 2, 3])
v = np.array([1, 2, 3])
w = np.array([1, 2, 4])
all(u == v) # compare as a whole
## True
all(u == w) # compare as a whole
## False
u == v # element-wise comparison
## array([ True, True, True])
u == w # element-wise comparison
## array([ True, True, False])
(u == w).sum() # number of matching elements
## 2
(u == w).mean() # proportion of matching elements
## 0.6666666666666666
x = [1, "two"]
x[1]
## 'two'
x = {'apple' : 'りんご',
'orange': 'みかん'}
x['grape'] = 'ぶどう'
x['apple']
## 'りんご'
# あるいは
tmp = 'apple'
x[tmp]
## 'りんご'
x = ['foo', 'bar', 'baz']
y = x
y[1] = 'BAR' # Update y.
y
## ['foo', 'BAR', 'baz']
x # x changes as well.
## ['foo', 'BAR', 'baz']
x = ['foo', 'bar', 'baz']
y = x.copy() # Copy instead of writing "y = x".
x == y, x is y
## (True, False)
y[1] = 'BAR' # Even if y is updated,
x
## ['foo', 'bar', 'baz']
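# Note (my addition): copy() is a shallow copy. For nested lists,
# copy.deepcopy copies the inner lists as well.
import copy
x = [[1, 2], [3, 4]]
y = copy.deepcopy(x)
y[0][0] = 99 # Updating y ...
x # ... leaves x unchanged.
## [[1, 2], [3, 4]]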
import pandas as pd
my_df = pd.DataFrame({
'name': ['A', 'B', 'C', 'D'],
'english': [ 60, 90, 70, 90],
'math': [ 70, 80, 90, 100],
'gender': ['f', 'm', 'm', 'f']})
my_df = pd.DataFrame([
['A', 60, 70, 'f'],
['B', 90, 80, 'm'],
['C', 70, 90, 'm'],
['D', 90, 100, 'f']],
columns=['name', 'english',
'math', 'gender'])
my_df.head()
## name english math gender
## 0 A 60 70 f
## 1 B 90 80 m
## 2 C 70 90 m
## 3 D 90 100 f
# Results omitted.
r, c = my_df.shape # number of rows and columns
r, c
## (4, 4)
r # number of rows (len(my_df) also works)
## 4
c # number of columns
## 4
from itertools import product
my_df2 = pd.DataFrame(
product([1, 2, 3],
[10, 100]),
columns=['X', 'Y'])
my_df2
## X Y
## 0 1 10
## 1 1 100
## 2 2 10
## 3 2 100
## 4 3 10
## 5 3 100
my_df2.columns
## Index(['X', 'Y'], dtype='object')
my_df2.columns = ['P', 'Q']
my_df2
## P Q
## 0 1 10
## 1 1 100
## 2 2 10
## 3 2 100
## 4 3 10
## 5 3 100
# 以下省略
list(my_df.index)
## [0, 1, 2, 3]
my_df2.index = [
'a', 'b', 'c', 'd', 'e', 'f']
my_df2
## P Q
## a 1 10
## b 1 100
## c 2 10
## d 2 100
## e 3 10
## f 3 100
# 以下省略
my_df3 = pd.DataFrame({
'english': [ 60, 90, 70, 90],
'math': [ 70, 80, 90, 100],
'gender': ['f', 'm', 'm', 'f']},
index= ['A', 'B', 'C', 'D'])
my_df3
## english math gender
## A 60 70 f
## B 90 80 m
## C 70 90 m
## D 90 100 f
tmp = pd.DataFrame({
'name' : ['E'],
'english': [80],
'math' : [80],
'gender' : ['m']})
my_df2 = my_df.append(tmp)
## <string>:1: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
my_df2 = my_df.assign(id=[1, 2, 3, 4])
my_df3 = my_df.copy() # copy
my_df3['id'] = [1, 2, 3, 4] # update
my_df3 # Check the result (omitted).
## name english math gender id
## 0 A 60 70 f 1
## 1 B 90 80 m 2
## 2 C 70 90 m 3
## 3 D 90 100 f 4
my_df.iloc[0, 1]
## 60
x = my_df.iloc[:, 1]
# あるいは
x = my_df['english']
# あるいは
x = my_df.english
# あるいは
tmp = 'english'
x = my_df[tmp]
x # 結果の確認(割愛)
## 0 60
## 1 90
## 2 70
## 3 90
## Name: english, dtype: int64
x = my_df[['name', 'math']]
# あるいは
x = my_df.loc[:, ['name', 'math']]
x = my_df.take([0, 2], axis=1)
# あるいは
x = my_df.iloc[:, [0, 2]]
x = my_df.drop(
columns=['english', 'gender'])
# あるいは
x = my_df.drop(
columns=my_df.columns[[1, 3]])
x = my_df.take([0, 2])
# あるいは
x = my_df.iloc[[0, 2], :]
x = my_df.drop([1, 3])
x = my_df[my_df['gender'] == 'm']
# あるいは
x = my_df.query('gender == "m"')
x = my_df[(my_df['english'] > 80) & (my_df['gender'] == "m")]
# あるいは
x = my_df.query('english > 80 and gender == "m"')
x = my_df[my_df['english'] == my_df['english'].max()]
# あるいは
tmp = my_df['english'].max()
x = my_df.query('english == @tmp')
my_df2 = my_df.copy() # コピー
my_df2.loc[my_df['gender'] == 'm', 'gender'] = 'M'
my_df2
## name english math gender
## 0 A 60 70 f
## 1 B 90 80 M
## 2 C 70 90 M
## 3 D 90 100 f
x = my_df.sort_values('english')
x = my_df.sort_values('english',
ascending=False)
import numpy as np
x = [2, 3, 5, 7, 11, 13, 17, 19, 23,
29, 31, 37]
A = np.array(x).reshape(3, 4)
A
## array([[ 2, 3, 5, 7],
## [11, 13, 17, 19],
## [23, 29, 31, 37]])
A = my_df.iloc[:, [1, 2]].values
A
## array([[ 60, 70],
## [ 90, 80],
## [ 70, 90],
## [ 90, 100]])
pd.DataFrame(A)
## 0 1
## 0 60 70
## 1 90 80
## 2 70 90
## 3 90 100
A.T
## array([[ 60, 90, 70, 90],
## [ 70, 80, 90, 100]])
A.T @ A
## array([[24700, 26700],
## [26700, 29400]])
my_df = pd.DataFrame({
'day': [25, 26, 27],
'min': [20, 21, 15],
'max': [24, 27, 21]})
my_longer = my_df.melt(id_vars='day')
my_longer
## day variable value
## 0 25 min 20
## 1 26 min 21
## 2 27 min 15
## 3 25 max 24
## 4 26 max 27
## 5 27 max 21
my_wider = my_longer.pivot(
index='day',
columns='variable',
values='value')
my_wider
## variable max min
## day
## 25 24 20
## 26 27 21
## 27 21 15
my_wider.plot(
style='o-',
xticks=my_wider.index, # x軸目盛り
ylabel='temperature') # y軸ラベル
import numpy as np
from scipy.spatial import distance
from scipy.stats import pearsonr
A = np.array([3, 4, 5])
B = np.array([3, 4, 29])
C = np.array([9, -18, 8])
distance.euclidean(A, B)
## 24.0
distance.euclidean(A, C)
## 23.0
distance.cityblock(A, B)
## 24
distance.cityblock(A, C)
## 31
1 - distance.cosine(A, B)
## 0.8169678632647616
1 - distance.cosine(A, C)
## -0.032651157422416865
1 - distance.correlation(A, B)
## 0.8824975032927699
# あるいは
pearsonr(A, B)[0]
## 0.8824975032927698
1 - distance.correlation(A, C)
## -0.032662766723200676
# あるいは
pearsonr(A, C)[0]
## -0.032662766723200676
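# Cross-check (my addition): cosine similarity computed directly from its
# definition, dot(A, B) / (|A| |B|).
np.dot(A, B) / (np.linalg.norm(A) * np.linalg.norm(B)) # ≈ 0.817, as above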
# Print floats to three decimal places.
np.set_printoptions(precision=3)
import pandas as pd
my_df = pd.DataFrame({
'x': [3, 3, 9],
'y': [4, 4, -18],
'z': [5, 29, 8]},
index=['A', 'B', 'C'])
# Euclidean distance
distance.cdist(my_df, my_df,
metric='euclidean')
## array([[ 0., 24., 23.],
## [24., 0., 31.],
## [23., 31., 0.]])
# Manhattan distance
distance.cdist(my_df, my_df,
metric='cityblock')
## array([[ 0., 24., 31.],
## [24., 0., 49.],
## [31., 49., 0.]])
# Cosine similarity
1 - distance.cdist(my_df, my_df,
metric='cosine')
## array([[ 1. , 0.817, -0.033],
## [ 0.817, 1. , 0.293],
## [-0.033, 0.293, 1. ]])
# Correlation coefficient
1 - distance.cdist(my_df, my_df,
metric='correlation')
## array([[ 1. , 0.882, -0.033],
## [ 0.882, 1. , 0.441],
## [-0.033, 0.441, 1. ]])
import math
import numpy as np
import pandas as pd
import numpy
numpy.array([1, 2, 3, 4])
## array([1, 2, 3, 4])
import numpy as np
np.array([1, 2, 3, 4])
## array([1, 2, 3, 4])
from numpy import array
array([1, 2, 3, 4])
## array([1, 2, 3, 4])
from numpy import *
array([1, 2, 3, 4])
## array([1, 2, 3, 4])
import numpy as np
import pandas as pd
def f1(x):
tmp = np.random.random(x)
return np.mean(tmp)
f1(10) # 動作確認
## 0.42605540050625645
[f1(10) for i in range(3)]
## [0.5283901047769273, 0.49923398985369377, 0.44162355655073987]
[f1(10)] * 3
## [0.6847194621617138, 0.6847194621617138, 0.6847194621617138]
v = [5, 10, 100]
[f1(x) for x in v] # 方法1
## [0.4579165789870182, 0.47807391912045843, 0.5388143902994681]
# あるいは
v = pd.Series([5, 10, 100])
v.apply(f1) # 方法2
## 0 0.573013
## 1 0.737001
## 2 0.512561
## dtype: float64
pd.Series([10] * 3).apply(f1)
## 0 0.412093
## 1 0.501986
## 2 0.441776
## dtype: float64
# 結果は割愛
def f2(n):
tmp = np.random.random(n)
return pd.Series([
n,
tmp.mean(),
tmp.std(ddof=1)],
index=['x', 'p', 'q'])
f2(10) # 動作確認
## x 10.000000
## p 0.519253
## q 0.222708
## dtype: float64
v = pd.Series([5, 10, 100])
v.apply(f2)
## x p q
## 0 5.0 0.478368 0.347545
## 1 10.0 0.471601 0.256528
## 2 100.0 0.503002 0.291739
def f3(x, y):
tmp = np.random.random(x) * y
return pd.Series([
x,
y,
tmp.mean(),
tmp.std(ddof=1)],
index=['x', 'y', 'p', 'q'])
f3(10, 6) # 動作確認
## x 10.000000
## y 6.000000
## p 4.056748
## q 1.351471
## dtype: float64
my_df = pd.DataFrame({
'x': [5, 10, 100, 5, 10, 100],
'y': [6, 6, 6, 12, 12, 12]})
my_df.apply(
lambda row: f3(row['x'], row['y']),
axis=1)
## x y p q
## 0 5.0 6.0 3.596414 1.729970
## 1 10.0 6.0 4.243813 0.651038
## 2 100.0 6.0 2.803639 1.741745
## 3 5.0 12.0 7.748588 4.416035
## 4 10.0 12.0 6.905837 4.169298
## 5 100.0 12.0 6.245772 3.500152
# あるいは
my_df.apply(lambda row:
f3(*row), axis=1)
## x y p q
## 0 5.0 6.0 2.130290 1.281828
## 1 10.0 6.0 3.472413 1.195906
## 2 100.0 6.0 2.740226 1.751456
## 3 5.0 12.0 7.112518 4.433728
## 4 10.0 12.0 5.408222 3.582678
## 5 100.0 12.0 6.182361 3.685494
from pandarallel import pandarallel
pandarallel.initialize() # 準備
## INFO: Pandarallel will run on 16 workers.
## INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
v = pd.Series([5, 10, 100])
v.parallel_apply(f1)
## 0 0.409252
## 1 0.518033
## 2 0.526379
## dtype: float64
# 結果は割愛
x = 123
type(x)
## <class 'int'>
%whos
import math
?math.log
# あるいは
help(math.log)
import numpy as np
v = [1, np.nan, 3]
v
## [1, nan, 3]
np.isnan(v[1])
## True
v[1] == np.nan # Wrong: nan never equals anything, so this is always False.
## False
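# Supplement (my addition): pandas' isna() also detects missing values,
# covering None as well as nan.
import pandas as pd
pd.isna([1, np.nan, None])
## array([False,  True,  True])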
import numpy as np
import pandas as pd
x = [165, 170, 175, 180, 185]
np.mean(x) # リストの場合
## 175.0
x = np.array( # アレイ
[165, 170, 175, 180, 185])
x.mean() # np.mean(x)も可
## 175.0
x = pd.Series( # シリーズ
[165, 170, 175, 180, 185])
x.mean() # np.mean(x)も可
## 175.0
n = len(x) # サンプルサイズ
sum(x) / n
## 175.0
y = [173, 174, 175, 176, 177]
np.mean(y)
## 175.0
np.var(x, ddof=1) # xの分散
## 62.5
np.var(y, ddof=1) # yの分散
## 2.5
sum((x - np.mean(x))**2) / (n - 1)
## 62.5
np.std(x, ddof=1) # xの標準偏差
## 7.905694150420948
np.std(y, ddof=1) # yの標準偏差
## 1.5811388300841898
np.var(x, ddof=1)**0.5 # xの標準偏差
## 7.905694150420948
s = pd.Series(x)
s.describe()
## count 5.000000
## mean 175.000000
## std 7.905694
## min 165.000000
## 25% 170.000000
## 50% 175.000000
## 75% 180.000000
## max 185.000000
## dtype: float64
# Already computed by s.describe() above.
x = [165, 170, 175, 180, 185]
np.var(x, ddof=1) # unbiased variance
## 62.5
np.var(x, ddof=0) # sample variance
## 50.0
np.std(x, ddof=1) # square root of the unbiased variance
## 7.905694150420948
np.std(x, ddof=0) # square root of the sample variance
## 7.0710678118654755
np.std(x, ddof=1) / len(x)**0.5 # standard error of the mean
## 3.5355339059327373
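# For reference (my addition): SciPy computes the standard error of the mean
# directly (ddof=1 by default).
from scipy import stats
stats.sem(x)
## 3.5355339059327373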
import numpy as np
import pandas as pd
my_df = pd.DataFrame({
'name': ['A', 'B', 'C', 'D'],
'english': [ 60, 90, 70, 90],
'math': [ 70, 80, 90, 100],
'gender': ['f', 'm', 'm', 'f']})
my_df['english'].var(ddof=1)
## 225.0
# あるいは
np.var(my_df['english'], ddof=1)
## 225.0
my_df.var()
## english 225.000000
## math 166.666667
## dtype: float64
##
## <string>:1: FutureWarning: The default value of numeric_only in DataFrame.var is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning.
# あるいは
my_df.apply('var')
## english 225.000000
## math 166.666667
## dtype: float64
##
## <string>:2: FutureWarning: The default value of numeric_only in DataFrame.var is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning.
# あるいは
my_df.iloc[:, [1, 2]].apply(
lambda x: np.var(x, ddof=1))
## english 225.000000
## math 166.666667
## dtype: float64
my_df.describe()
## english math
## count 4.0 4.000000
## mean 77.5 85.000000
## std 15.0 12.909944
## min 60.0 70.000000
## 25% 67.5 77.500000
## 50% 80.0 85.000000
## 75% 90.0 92.500000
## max 90.0 100.000000
from collections import Counter
Counter(my_df.gender)
## Counter({'f': 2, 'm': 2})
# あるいは
my_df.groupby('gender').apply(len)
## gender
## f 2
## m 2
## dtype: int64
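# Supplement (my addition): value_counts() gives the same counts as a Series
# (the order of tied categories may differ between pandas versions).
my_df['gender'].value_counts()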
my_df2 = my_df.assign(
excel=my_df.math >= 80)
pd.crosstab(my_df2.gender,
my_df2.excel)
## excel False True
## gender
## f 1 1
## m 0 2
my_df.groupby('gender').mean()
## english math
## gender
## f 75.0 85.0
## m 80.0 85.0
##
## <string>:1: FutureWarning: The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
# あるいは
my_df.groupby('gender').agg('mean')
## english math
## gender
## f 75.0 85.0
## m 80.0 85.0
##
## <string>:2: FutureWarning: The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
# あるいは
my_df.groupby('gender').agg(np.mean)
## english math
## gender
## f 75.0 85.0
## m 80.0 85.0
##
## <string>:2: FutureWarning: The operation <function mean at 0x7f858371c550> failed on a column. If any error is raised, this will raise an exception in a future version of pandas. Drop these columns to avoid this warning.
import numpy as np
import pandas as pd
import statsmodels.api as sm
iris = sm.datasets.get_rdataset('iris', 'datasets').data
iris.head()
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 0 5.1 3.5 1.4 0.2 setosa
## 1 4.9 3.0 1.4 0.2 setosa
## 2 4.7 3.2 1.3 0.2 setosa
## 3 4.6 3.1 1.5 0.2 setosa
## 4 5.0 3.6 1.4 0.2 setosa
iris.hist('Sepal.Length')
## array([[<Axes: title={'center': 'Sepal.Length'}>]], dtype=object)
my_df = pd.DataFrame(
{'x': [10, 20, 30]})
my_df.hist('x', bins=2) # 階級数は2
## array([[<Axes: title={'center': 'x'}>]], dtype=object)
x = iris['Sepal.Length']
tmp = np.linspace(min(x), max(x), 10)
iris.hist('Sepal.Length',
bins=tmp.round(2))
## array([[<Axes: title={'center': 'Sepal.Length'}>]], dtype=object)
iris.plot('Sepal.Length',
'Sepal.Width',
kind='scatter')
iris.boxplot()
pd.options.display.float_format = (
'{:.2f}'.format)
my_df = (iris.describe().transpose()
[['mean', 'std']])
my_df['se'] = (my_df['std'] /
len(iris)**0.5)
my_df
## mean std se
## Sepal.Length 5.84 0.83 0.07
## Sepal.Width 3.06 0.44 0.04
## Petal.Length 3.76 1.77 0.14
## Petal.Width 1.20 0.76 0.06
my_df.plot(y='mean', kind='bar', yerr='se', capsize=10)
my_group = iris.groupby('Species') # 品種ごとに,
my_df = my_group.agg('mean') # 各変数の,平均と
my_se = my_group.agg(lambda x: x.std() / len(x)**0.5) # 標準誤差を求める.
my_se
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Species
## setosa 0.05 0.05 0.02 0.01
## versicolor 0.07 0.04 0.07 0.03
## virginica 0.09 0.05 0.08 0.04
my_group.agg('mean').plot(kind='bar', yerr=my_se, capsize=5)
from statsmodels.graphics.mosaicplot \
import mosaic
my_df = pd.DataFrame({
'Species': iris.Species,
'w_Sepal': iris['Sepal.Width'] > 3})
my_table = pd.crosstab( # 分割表
my_df['Species'],
my_df['w_Sepal'])
my_table
## w_Sepal False True
## Species
## setosa 8 42
## versicolor 42 8
## virginica 33 17
mosaic(my_df,
index=['Species', 'w_Sepal'])
my_table.columns = [str(x) for x in my_table.columns]
my_table.index = [str(x) for x in my_table.index]
mosaic(my_df, index=['Species', 'w_Sepal'], labelizer=lambda k: my_table.loc[k])
import matplotlib.pyplot as plt
import numpy as np
x = np.linspace(-2, 2, 100)
y = x**3 - x
plt.plot(x, y)
import matplotlib.pyplot as plt
import numpy as np
rng = np.random.default_rng()
x = np.random.choice(
a=range(1, 7), # 1から6
size=10000, # 乱数の数
replace=True) # 重複あり
# あるいは
x = np.random.randint(
# あるいは
#x = rng.integers(
low=1, # 最小
high=7, # 最大+1
size=10000) # 乱数の数
plt.hist(x, bins=6) # ヒストグラム
x = np.random.random(size=1000)
# あるいは
x = rng.random(size=10000)
# あるいは
x = np.random.uniform(
low=0, # 最小
high=1, # 最大
size=1000) # 乱数の数
plt.hist(x)
tmp = np.random.uniform(
low=1, # 最小
high=7, # 最大 + 1
size=1000) # 乱数の数
x = [int(k) for k in tmp]
plt.hist(x, bins=6) # 結果は割愛
n = 100
p = 0.5
r = 10000
x = np.random.binomial(
# あるいは
#x = rng.binomial(
n=n, # 試行回数
p=p, # 確率
size=r) # 乱数の数
plt.hist(x, bins=max(x) - min(x))
r = 10000
x = np.random.normal(
# あるいは
#x = rng.normal(
loc=50, # 平均
scale=5, # 標準偏差
size=r) # 乱数の数
plt.hist(x, bins=40)
import numpy as np
import pandas as pd
def f(k):
n = 10000
tmp = [g(np.random.normal(size=k, scale=3)) for _ in range(n)]
return pd.Series([k,
np.mean(tmp), # 平均
np.std(tmp, ddof=1) / n**0.5], # 標準誤差
index=['k', 'mean', 'se'])
def g(x):
return np.var(x, ddof=1)
pd.Series([10, 20, 30]).apply(f)
## k mean se
## 0 10.00 9.07 0.04
## 1 20.00 8.95 0.03
## 2 30.00 9.06 0.02
def g(x):
return np.std(x, ddof=1)
pd.Series([10, 20, 30]).apply(f)
## k mean se
## 0 10.00 2.91 0.01
## 1 20.00 2.96 0.00
## 2 30.00 2.98 0.00
from math import gamma
def g(x):
n = len(x)
return (np.std(x, ddof=1) *
(np.sqrt((n - 1) / 2) *
gamma((n - 1) / 2) /
gamma(n / 2)))
pd.Series([10, 20, 30]).apply(f)
## k mean se
## 0 10.00 3.00 0.01
## 1 20.00 3.01 0.00
## 2 30.00 3.00 0.00
from statsmodels.stats.proportion import binom_test, proportion_confint
binom_test(count=2, # number of hits
           nobs=15, # number of draws
           prop=4 / 10, # hypothesized probability of a hit
           alternative='two-sided') # two-sided test (default)
## 0.036461661552639996
# Use 'smaller' for a left one-sided test
# and 'larger' for a right one-sided test.
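# Supplement (my addition): statsmodels' binom_test is deprecated in newer
# versions; SciPy (1.7+) offers an equivalent exact binomial test, and the
# p-value should match the one above.
from scipy import stats
stats.binomtest(k=2, n=15, p=4 / 10, alternative='two-sided').pvalue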
import numpy as np
import pandas as pd
from scipy import stats
t = 4 / 10 # probability of a hit
n = 15 # number of draws
x = np.array(range(0, n + 1)) # number of hits
my_pr = stats.binom.pmf(x, n, t) # probability of exactly x hits
my_pr2 = stats.binom.pmf(2, n, t) # probability of exactly 2 hits
my_data = pd.DataFrame({'x': x, 'y1': my_pr, 'y2': my_pr})
my_data.loc[my_pr > my_pr2, 'y1'] = np.nan # probabilities greater than that of 2 hits
my_data.loc[my_pr <= my_pr2, 'y2'] = np.nan # probabilities at most that of 2 hits
ax = my_data.plot(x='x', style='o', ylabel='probability',
                  legend=False) # Do not show a legend.
ax.hlines(y=my_pr2, xmin=0, xmax=15) # horizontal line
ax.vlines(x=x, ymin=0, ymax=my_pr) # vertical lines
a = 0.05
proportion_confint(
count=2, # 当たった回数
nobs=15, # くじを引いた回数
alpha=a, # 有意水準(省略可)
method='binom_test')
## (0.024225732468536584, 0.3967139848749017)
a = 0.05 # 有意水準
tmp = np.linspace(0, 1, 100)
my_df = pd.DataFrame({
't': tmp, # 当たる確率
'q': a, # 水平線
'p': [binom_test(count=2, nobs=15, prop=t) for t in tmp]}) # p値
my_df.plot(x='t', legend=None, xlabel=r'$\theta$', ylabel=r'p-value')
from statsmodels.stats.weightstats import CompareMeans, DescrStatsW
X = [32.1, 26.2, 27.5, 31.8, 32.1, 31.2, 30.1, 32.4, 32.3, 29.9,
29.6, 26.6, 31.2, 30.9, 29.3]
Y = [35.4, 34.6, 31.1, 32.4, 33.3, 34.7, 35.3, 34.3, 32.1, 28.3,
33.3, 30.5, 32.6, 33.3, 32.2]
a = 0.05 # 有意水準(デフォルト) = 1 - 信頼係数
alt = 'two-sided' # 両側検定(デフォルト)
# 左片側検定なら'smaller'
# 右片側検定なら'larger'
d = DescrStatsW(np.array(X) - np.array(Y)) # 対標本の場合
d.ttest_mean(alternative=alt)[1] # p値
## 0.0006415571512322235
d.tconfint_mean(alpha=a, alternative=alt) # 信頼区間
## (-3.9955246743198867, -1.3644753256801117)
c = CompareMeans(DescrStatsW(X), DescrStatsW(Y)) # 対標本でない場合
ve = 'pooled' # 等分散を仮定する(デフォルト).仮定しないなら'unequal'.
c.ttest_ind(alternative=alt, usevar=ve)[1] # p値
## 0.000978530937238609
c.tconfint_diff(alpha=a, alternative=alt, usevar=ve) # 信頼区間
## (-4.170905570517185, -1.1890944294828283)
import pandas as pd
my_url = ('https://raw.githubusercontent.com/taroyabuki'
'/fromzero/master/data/smoker.csv')
my_data = pd.read_csv(my_url)
my_data.head()
## alive smoker
## 0 Yes No
## 1 Yes No
## 2 Yes No
## 3 Yes No
## 4 Yes No
my_table = pd.crosstab(
my_data['alive'],
my_data['smoker'])
my_table
## smoker No Yes
## alive
## No 117 54
## Yes 950 348
from scipy.stats import chi2_contingency
chi2_contingency(my_table, correction=False)[1]
## 0.18860725715300422
X = [0] * 13 + [1] * 2 # 手順1
X
## [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]
tmp = np.random.choice(X, 15, replace=True) # 手順2
tmp
## array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0])
sum(tmp) # 手順3
## 2
n = 10**5
result = [sum(np.random.choice(X, len(X), replace=True)) for _ in range(n)] # 手順4
import matplotlib.pyplot as plt
plt.hist(result, bins=range(0, 16))
np.quantile(result, [0.025, 0.975])
## array([0., 5.])
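# Supplement (my addition): the bootstrap is random, so the interval changes
# between runs; a seeded generator makes the result reproducible.
my_rng = np.random.default_rng(0) # fixed seed
result2 = [my_rng.choice(X, len(X), replace=True).sum() for _ in range(n)]
np.quantile(result2, [0.025, 0.975]) # e.g. array([0., 5.])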
system("wget https://raw.githubusercontent.com/taroyabuki/fromzero/master/data/exam.csv")
import pandas as pd
my_df = pd.read_csv('exam.csv')
my_df
## name english math gender
## 0 A 60 70 f
## 1 B 90 80 m
## 2 C 70 90 m
## 3 D 90 100 f
my_url = ('https://raw.githubusercontent.com/taroyabuki'
'/fromzero/master/data/exam.csv')
my_df = pd.read_csv(my_url)
my_df2 = pd.read_csv('exam.csv',
index_col='name')
my_df2
## english math gender
## name
## A 60 70 f
## B 90 80 m
## C 70 90 m
## D 90 100 f
my_df.to_csv('exam2.csv', index=False)
my_df2.to_csv('exam3.csv')
my_df = pd.read_csv('exam.csv',
encoding='UTF-8')
my_df.to_csv('exam2.csv', index=False, encoding='UTF-8')
my_url = 'https://taroyabuki.github.io/fromzero/exam.html'
my_tables = pd.read_html(my_url)
my_tables
## [ Unnamed: 0 name english math gender
## 0 NaN A 60 70 f
## 1 NaN B 90 80 m
## 2 NaN C 70 90 m
## 3 NaN D 90 100 f]
my_tables[0]
## Unnamed: 0 name english math gender
## 0 NaN A 60 70 f
## 1 NaN B 90 80 m
## 2 NaN C 70 90 m
## 3 NaN D 90 100 f
# 1列目以降を取り出す.
my_data = my_tables[0].iloc[:, 1:]
my_data
## name english math gender
## 0 A 60 70 f
## 1 B 90 80 m
## 2 C 70 90 m
## 3 D 90 100 f
my_url = ('https://raw.githubusercontent.com/taroyabuki'
'/fromzero/master/data/exam.json')
my_data = pd.read_json(my_url)
#my_data = pd.read_json('exam.json') # (ファイルを使う場合)
my_data
## name english math gender
## 0 A 60 70 f
## 1 B 90 80 m
## 2 C 70 90 m
## 3 D 90 100 f
import xml.etree.ElementTree as ET
from urllib.request import urlopen
my_url = ('https://raw.githubusercontent.com/taroyabuki'
'/fromzero/master/data/exam.xml')
with urlopen(my_url) as f:
my_tree = ET.parse(f) # XMLデータの読み込み
#my_tree = ET.parse('exam.xml') # (ファイルを使う場合)
my_ns = '{https://www.example.net/ns/1.0}' # 名前空間
my_records = my_tree.findall(f'.//{my_ns}record')
def f(record):
my_dic1 = record.attrib # 属性を取り出す.
# 子要素の名前と内容のペアを辞書にする.
my_dic2 = {child.tag.replace(my_ns, ''): child.text for child in list(record)}
return {**my_dic1, **my_dic2} # 辞書を結合する.
my_data = pd.DataFrame([f(record) for record in my_records])
my_data['english'] = pd.to_numeric(my_data['english'])
my_data['math'] = pd.to_numeric(my_data['math'])
my_data
## english math gender name
## 0 60 70 f A
## 1 90 80 m B
## 2 70 90 m C
## 3 90 100 f D
import numpy as np
from scipy.stats import zscore
x1 = [1, 2, 3]
z1 = ((x1 - np.mean(x1)) /
np.std(x1, ddof=1))
# あるいは
z1 = zscore(x1, ddof=1)
z1
## array([-1., 0., 1.])
z1.mean(), np.std(z1, ddof=1)
## (0.0, 1.0)
z1 * np.std(x1, ddof=1) + np.mean(x1)
## array([1., 2., 3.])
x2 = [1, 3, 5]
z2 = ((x2 - np.mean(x1)) /
np.std(x1, ddof=1))
z2.mean(), np.std(z2, ddof=1)
## (1.0, 2.0)
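# Supplement (my addition): scikit-learn's StandardScaler performs the same
# kind of standardization, but with ddof=0, so the values differ slightly from
# the ddof=1 z-scores above (roughly [-1.225, 0, 1.225] for x1).
from sklearn.preprocessing import StandardScaler
StandardScaler().fit_transform(np.array(x1).reshape(-1, 1)).ravel()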
import pandas as pd
from sklearn.preprocessing import (
OneHotEncoder)
my_df = pd.DataFrame({
'id': [ 1 , 2 , 3 ],
'class': ['A', 'B', 'C']})
my_enc = OneHotEncoder()
tmp = my_enc.fit_transform(
my_df[['class']]).toarray()
my_names = my_enc.get_feature_names() \
if hasattr(my_enc, 'get_feature_names') \
else my_enc.get_feature_names_out()
## /opt/venv/lib/python3.10/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.
## warnings.warn(msg, category=FutureWarning)
pd.DataFrame(tmp, columns=my_names)
## x0_A x0_B x0_C
## 0 1.00 0.00 0.00
## 1 0.00 1.00 0.00
## 2 0.00 0.00 1.00
my_df2 = pd.DataFrame({
'id': [ 4 , 5, 6 ],
'class': ['B', 'C', 'B']})
tmp = my_enc.transform(
my_df2[['class']]).toarray()
pd.DataFrame(tmp, columns=my_names)
## x0_A x0_B x0_C
## 0 0.00 1.00 0.00
## 1 0.00 0.00 1.00
## 2 0.00 1.00 0.00
my_enc = OneHotEncoder(drop='first')
tmp = my_enc.fit_transform(
my_df[['class']]).toarray()
my_names = my_enc.get_feature_names() \
if hasattr(my_enc, 'get_feature_names') \
else my_enc.get_feature_names_out()
## /opt/venv/lib/python3.10/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.
## warnings.warn(msg, category=FutureWarning)
pd.DataFrame(tmp, columns=my_names)
## x0_B x0_C
## 0 0.00 0.00
## 1 1.00 0.00
## 2 0.00 1.00
tmp = my_enc.transform(
my_df2[['class']]).toarray()
pd.DataFrame(tmp, columns=my_names)
## x0_B x0_C
## 0 1.00 0.00
## 1 0.00 1.00
## 2 1.00 0.00
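# Supplement (my addition): pandas can also one-hot encode directly;
# drop_first=True corresponds to OneHotEncoder(drop='first').
pd.get_dummies(my_df['class'], prefix='class', drop_first=True)
##    class_B  class_C
## 0        0        0
## 1        1        0
## 2        0        1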
import statsmodels.api as sm
iris = sm.datasets.get_rdataset('iris', 'datasets').data
iris.head()
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 0 5.10 3.50 1.40 0.20 setosa
## 1 4.90 3.00 1.40 0.20 setosa
## 2 4.70 3.20 1.30 0.20 setosa
## 3 4.60 3.10 1.50 0.20 setosa
## 4 5.00 3.60 1.40 0.20 setosa
# 以下省略
import seaborn as sns
iris = sns.load_dataset('iris')
iris.head()
## sepal_length sepal_width petal_length petal_width species
## 0 5.10 3.50 1.40 0.20 setosa
## 1 4.90 3.00 1.40 0.20 setosa
## 2 4.70 3.20 1.30 0.20 setosa
## 3 4.60 3.10 1.50 0.20 setosa
## 4 5.00 3.60 1.40 0.20 setosa
# 以下省略
import pandas as pd
from sklearn.datasets import load_iris
tmp = load_iris()
iris = pd.DataFrame(tmp.data, columns=tmp.feature_names)
iris['target'] = tmp.target_names[tmp.target]
iris.head()
## sepal length (cm) sepal width (cm) ... petal width (cm) target
## 0 5.10 3.50 ... 0.20 setosa
## 1 4.90 3.00 ... 0.20 setosa
## 2 4.70 3.20 ... 0.20 setosa
## 3 4.60 3.10 ... 0.20 setosa
## 4 5.00 3.60 ... 0.20 setosa
##
## [5 rows x 5 columns]
# 以下省略
import statsmodels.api as sm
my_data = sm.datasets.get_rdataset('cars', 'datasets').data
my_data.shape
## (50, 2)
my_data.head()
## speed dist
## 0 4 2
## 1 4 10
## 2 7 4
## 3 7 22
## 4 8 16
my_data.describe()
## speed dist
## count 50.00 50.00
## mean 15.40 42.98
## std 5.29 25.77
## min 4.00 2.00
## 25% 12.00 26.00
## 50% 15.00 36.00
## 75% 19.00 56.00
## max 25.00 120.00
my_data.plot(x='speed', style='o')
import seaborn as sns
import statsmodels.api as sm
my_data = sm.datasets.get_rdataset('cars', 'datasets').data
ax = sns.regplot(x='speed', y='dist', data=my_data)
ax.vlines(x=21.5, ymin=-5, ymax=67, linestyles='dotted')
ax.hlines(y=67, xmin=4, xmax=21.5, linestyles='dotted')
ax.set_xlim(4, 25)
## (4.0, 25.0)
ax.set_ylim(-5, 125)
## (-5.0, 125.0)
import statsmodels.api as sm
my_data = sm.datasets.get_rdataset('cars', 'datasets').data
X, y = my_data[['speed']], my_data['dist']
# Specify the model.
from sklearn.linear_model import LinearRegression
my_model = LinearRegression()
# Training (fit the model to the data).
my_model.fit(X, y)
## LinearRegression()
# The two steps can also be run at once:
# my_model = LinearRegression().fit(X, y)
my_model.intercept_, my_model.coef_
## (-17.579094890510973, array([3.932]))
tmp = [[21.5]]
my_model.predict(tmp)
## array([66.968])
##
## /opt/venv/lib/python3.10/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
## warnings.warn(
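# Sanity check (my addition): the prediction can be reproduced from the
# estimated intercept and coefficient.
my_model.intercept_ + my_model.coef_[0] * 21.5 # ≈ 66.97, same as predict()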
import numpy as np
import pandas as pd
tmp = pd.DataFrame({'speed': np.linspace(min(my_data.speed),
max(my_data.speed),
100)})
tmp['model'] = my_model.predict(tmp)
pd.concat([my_data, tmp]).plot(
x='speed', style=['o', '-'])
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
my_data = sm.datasets.get_rdataset('cars', 'datasets').data
X, y = my_data[['speed']], my_data['dist']
my_model = LinearRegression()
my_model.fit(X, y)
## LinearRegression()
y_ = my_model.predict(X)
my_data['y_'] = y_
pd.options.display.float_format = (
'{:.2f}'.format)
my_data['residual'] = y - y_
my_data.head()
## speed dist y_ residual
## 0 4 2 -1.85 3.85
## 1 4 10 -1.85 11.85
## 2 7 4 9.95 -5.95
## 3 7 22 9.95 12.05
## 4 8 16 13.88 2.12
ax = my_data.plot(x='speed', y='dist', style='o', legend=False)
my_data.plot(x='speed', y='y_', style='-', legend=False, ax=ax)
ax.vlines(x=X, ymin=y, ymax=y_, linestyles='dotted')
mean_squared_error(y, y_)**0.5
## 15.068855995791381
# あるいは
(my_data['residual']**2).mean()**0.5
## 15.068855995791381
my_model.score(X, y)
## 0.6510793807582509
# あるいは
r2_score(y_true=y, y_pred=y_)
## 0.6510793807582509
import numpy as np
np.corrcoef(y, y_)[0, 1]**2
## 0.6510793807582511
my_test = my_data[:3]
X = my_test[['speed']]
y = my_test['dist']
y_ = my_model.predict(X)
my_model.score(X, y)
## -4.498191310376778
# あるいは
r2_score(y_true=y, y_pred=y_)
## -4.498191310376778
np.corrcoef(y, y_)[0, 1]**2
## 0.0769230769230769
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
my_data = sm.datasets.get_rdataset('cars', 'datasets').data
my_idx = [1, 10, 26, 33, 38, 43]
my_sample = my_data.iloc[my_idx, ]
X, y = my_sample[['speed']], my_sample['dist']
d = 5
X5 = PolynomialFeatures(d, include_bias=False).fit_transform(X) # Xの1乗から5乗の変数
my_model = LinearRegression()
my_model.fit(X5, y)
## LinearRegression()
y_ = my_model.predict(X5)
((y - y_)**2).mean()**0.5
## 4.948253176703135e-07
my_model.score(X5, y)
## 0.9999999999999996
np.corrcoef(y, y_)[0, 1]**2
## 0.9999999999999993
tmp = pd.DataFrame({'speed': np.linspace(min(my_data.speed),
max(my_data.speed),
100)})
X5 = PolynomialFeatures(d, include_bias=False).fit_transform(tmp)
tmp['model'] = my_model.predict(X5)
my_sample = my_sample.assign(sample=y)
my_df = pd.concat([my_data, my_sample, tmp])
my_df.plot(x='speed', style=['o', 'o', '-'], ylim=(0, 130))
# 準備
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.neighbors import KNeighborsRegressor
my_data = sm.datasets.get_rdataset('cars', 'datasets').data
X, y = my_data[['speed']], my_data['dist']
# 訓練
my_model = KNeighborsRegressor()
my_model.fit(X, y)
## KNeighborsRegressor()
# 可視化の準備
tmp = pd.DataFrame({'speed': np.linspace(min(my_data.speed),
max(my_data.speed),
100)})
tmp['model'] = my_model.predict(tmp)
pd.concat([my_data, tmp]).plot(
x='speed', style=['o', '-'])
y_ = my_model.predict(X)
((y - y_)**2).mean()**0.5
## 13.087184571174962
my_model.score(X, y)
## 0.7368165812204317
np.corrcoef(y, y_)[0, 1]**2
## 0.7380949412509705
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
# データの準備
my_data = sm.datasets.get_rdataset('cars', 'datasets').data
X, y = my_data[['speed']], my_data['dist']
# モデルの指定
my_model = LinearRegression()
# 検証(5分割交差検証)
my_scores = cross_val_score(my_model, X, y)
# 5個の決定係数1を得る.
my_scores
## array([-0.258, -0.214, -0.309, -0.273, 0.023])
# 平均を決定係数1(検証)とする.
my_scores.mean()
## -0.20629282165364665
my_scores = cross_val_score(my_model, X, y,
scoring='neg_root_mean_squared_error')
-my_scores.mean()
## 15.58402474583013
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, LeaveOneOut
my_data = sm.datasets.get_rdataset('cars', 'datasets').data
X, y = my_data[['speed']], my_data['dist']
my_model = LinearRegression().fit(X, y)
y_ = my_model.predict(X)
# RMSE(訓練)
mean_squared_error(y, y_)**0.5
## 15.068855995791381
# 決定係数1(訓練)
my_model.score(X, y)
## 0.6510793807582509
# あるいは
r2_score(y_true=y, y_pred=y_)
## 0.6510793807582509
# 決定係数6(訓練)
np.corrcoef(y, y_)[0, 1]**2
## 0.6510793807582511
my_scores = cross_val_score(my_model, X, y,
scoring='neg_root_mean_squared_error')
-my_scores.mean()
## 15.58402474583013
my_scores = cross_val_score(my_model, X, y, scoring='r2') # scoring='r2'は省略可
my_scores.mean()
## -0.20629282165364665
# 方法1
my_scores1 = cross_val_score(my_model, X, y, cv=LeaveOneOut(),
scoring='neg_mean_squared_error')
(-my_scores1.mean())**0.5
## 15.6973060093991
# 方法2
my_scores2 = cross_val_score(my_model, X, y, cv=LeaveOneOut(),
scoring='neg_root_mean_squared_error')
(my_scores2**2).mean()**0.5
## 15.6973060093991
-my_scores2.mean()
## 12.059178648637483
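# Note (my addition): with LeaveOneOut each fold contains a single test case,
# so each fold's RMSE is just the absolute error. Method 2's mean (12.06) is
# therefore a mean absolute error, while method 1 gives the overall RMSE.
np.allclose(np.sqrt(-my_scores1), -my_scores2)
## True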
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, LeaveOneOut
from sklearn.neighbors import KNeighborsRegressor
my_data = sm.datasets.get_rdataset('cars', 'datasets').data
X, y = my_data[['speed']], my_data['dist']
my_lm_scores = cross_val_score(
LinearRegression(),
X, y, cv=LeaveOneOut(), scoring='neg_mean_squared_error')
my_knn_scores = cross_val_score(
KNeighborsRegressor(n_neighbors=5),
X, y, cv=LeaveOneOut(), scoring='neg_mean_squared_error')
(-my_lm_scores.mean())**0.5
## 15.6973060093991
(-my_knn_scores.mean())**0.5
## 16.07308308943869
my_df = pd.DataFrame({
'lm': -my_lm_scores,
    'knn': -my_knn_scores})
my_df.head()
## lm knn
## 0 18.91 108.16
## 1 179.22 0.64
## 2 41.03 64.00
## 3 168.49 184.96
## 4 5.09 0.00
my_df.boxplot().set_ylabel("$r^2$")
from statsmodels.stats.weightstats import DescrStatsW
d = DescrStatsW(my_df.lm - my_df.knn)
d.ttest_mean()[1] # p値
## 0.6952755720536117
d.tconfint_mean(alpha=0.05, alternative='two-sided') # 信頼区間
## (-72.82752833122278, 48.95036023665705)
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from sklearn.neighbors import KNeighborsRegressor
my_data = sm.datasets.get_rdataset('cars', 'datasets').data
X, y = my_data[['speed']], my_data['dist']
my_params = {'n_neighbors': range(1, 16)} # 探索範囲(1以上16未満の整数)
my_search = GridSearchCV(estimator=KNeighborsRegressor(),
param_grid=my_params,
cv=LeaveOneOut(),
scoring='neg_mean_squared_error')
my_search.fit(X, y)
## GridSearchCV(cv=LeaveOneOut(), estimator=KNeighborsRegressor(),
##              param_grid={'n_neighbors': range(1, 16)},
##              scoring='neg_mean_squared_error')
tmp = my_search.cv_results_ # チューニングの詳細
my_scores = (-tmp['mean_test_score'])**0.5 # RMSE
my_results = pd.DataFrame(tmp['params']).assign(validation=my_scores)
my_results.head()
## n_neighbors validation
## 0 1 20.09
## 1 2 17.58
## 2 3 16.35
## 3 4 16.20
## 4 5 16.07
my_results.plot(x='n_neighbors',
style='o-',
ylabel='RMSE')
my_search.best_params_
## {'n_neighbors': 5}
(-my_search.best_score_)**0.5
## 16.07308308943869
my_model = my_search.best_estimator_
y_ = my_model.predict(X)
mean_squared_error(y_, y)**0.5
## 13.087184571174962
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, LeaveOneOut
from sklearn.neighbors import KNeighborsRegressor
my_data = sm.datasets.get_rdataset('cars', 'datasets').data
X, y = my_data[['speed']], my_data['dist']
def my_loocv(k):
my_model = KNeighborsRegressor(n_neighbors=k)
my_scores = cross_val_score(estimator=my_model, X=X, y=y,
cv=LeaveOneOut(),
scoring='neg_mean_squared_error')
y_ = my_model.fit(X, y).predict(X)
return pd.Series([k,
(-my_scores.mean())**0.5, # RMSE(検証)
mean_squared_error(y_, y)**0.5], # RMSE(訓練)
index=['n_neighbors', 'validation', 'training'])
my_results = pd.Series(range(1, 16)).apply(my_loocv)
my_results.plot(x='n_neighbors',
style='o-',
ylabel='RMSE')
import pandas as pd
my_url = 'http://www.liquidasset.com/winedata.html'
tmp = pd.read_table(my_url, skiprows=62, nrows=38, sep='\\s+', na_values='.')
tmp.describe()
## OBS VINT LPRICE2 WRAIN DEGREES HRAIN TIME_SV
## count 38.00 38.00 27.00 38.00 37.00 38.00 38.00
## mean 19.50 1970.50 -1.45 605.00 16.52 137.00 12.50
## std 11.11 11.11 0.63 135.28 0.66 66.74 11.11
## min 1.00 1952.00 -2.29 376.00 14.98 38.00 -6.00
## 25% 10.25 1961.25 -1.99 510.25 16.17 87.50 3.25
## 50% 19.50 1970.50 -1.51 586.50 16.53 120.50 12.50
## 75% 28.75 1979.75 -1.05 713.50 17.07 171.00 21.75
## max 38.00 1989.00 0.00 845.00 17.65 292.00 31.00
# 以下省略
my_data = tmp.iloc[:, 2:].dropna()
my_data.head()
## LPRICE2 WRAIN DEGREES HRAIN TIME_SV
## 0 -1.00 600 17.12 160 31
## 1 -0.45 690 16.73 80 30
## 3 -0.81 502 17.15 130 28
## 5 -1.51 420 16.13 110 26
## 6 -1.72 582 16.42 187 25
my_data.shape
## (27, 5)
my_data.to_csv('wine.csv',
index=False)
#my_data = pd.read_csv('wine.csv') # 作ったファイルを使う場合
my_url = ('https://raw.githubusercontent.com/taroyabuki'
'/fromzero/master/data/wine.csv')
my_data = pd.read_csv(my_url)
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, LeaveOneOut
my_url = ('https://raw.githubusercontent.com/taroyabuki'
'/fromzero/master/data/wine.csv')
my_data = pd.read_csv(my_url)
X, y = my_data.drop(columns=['LPRICE2']), my_data['LPRICE2']
my_model = LinearRegression().fit(X, y)
my_model.intercept_
## -12.145333576510417
pd.Series(my_model.coef_,
index=X.columns)
## WRAIN 0.00
## DEGREES 0.62
## HRAIN -0.00
## TIME_SV 0.02
## dtype: float64
my_test = [[500, 17, 120, 2]]
my_model.predict(my_test)
## array([-1.499])
##
## /opt/venv/lib/python3.10/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
## warnings.warn(
y_ = my_model.predict(X)
mean_squared_error(y_, y)**0.5
## 0.25861666201306194
my_model.score(X, y)
## 0.8275277990052157
np.corrcoef(y, y_)[0, 1]**2
## 0.8275277990052158
my_scores = cross_val_score(my_model, X, y,
cv=LeaveOneOut(),
scoring='neg_mean_squared_error')
(-my_scores.mean())**0.5
## 0.3230042651841198
import numpy as np
M = np.matrix(X.assign(b0=1))
b = np.linalg.pinv(M) @ y
pd.Series(b,
index=list(X.columns) + ['b0'])
## WRAIN 0.00
## DEGREES 0.62
## HRAIN -0.00
## TIME_SV 0.02
## b0 -12.15
## dtype: float64
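# Supplementary sketch (my addition): the same least-squares solution without
# np.matrix (which NumPy now discourages), using np.linalg.lstsq.
M2 = X.assign(b0=1).values # design matrix as a plain ndarray
b2, *_ = np.linalg.lstsq(M2, y, rcond=None) # least-squares coefficients
pd.Series(b2, index=list(X.columns) + ['b0']) # same values as above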
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
my_url = ('https://raw.githubusercontent.com/taroyabuki'
'/fromzero/master/data/wine.csv')
my_data = pd.read_csv(my_url)
X, y = my_data.drop(columns=['LPRICE2']), my_data['LPRICE2']
# StandardScalerで標準化した結果をデータフレームに戻してから描画する.
pd.DataFrame(StandardScaler().fit_transform(X), columns=X.columns
).boxplot(showmeans=True)
my_pipeline = Pipeline([
('sc', StandardScaler()),
('lr', LinearRegression())])
my_pipeline.fit(X, y)
## Pipeline(steps=[('sc', StandardScaler()), ('lr', LinearRegression())])
# 線形回帰の部分だけを取り出す.
my_lr = my_pipeline.named_steps.lr
my_lr.intercept_
## -1.4517651851851847
pd.Series(my_lr.coef_,
index=X.columns)
## WRAIN 0.15
## DEGREES 0.40
## HRAIN -0.28
## TIME_SV 0.19
## dtype: float64
my_test = [[500, 17, 120, 2]]
my_pipeline.predict(my_test)
## array([-1.499])
##
## /opt/venv/lib/python3.10/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names
## warnings.warn(
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, LeaveOneOut
my_url = ('https://raw.githubusercontent.com/taroyabuki'
'/fromzero/master/data/wine.csv')
my_data = pd.read_csv(my_url)
n = len(my_data)
my_data2 = my_data.assign(v1=[i % 2 for i in range(n)],
v2=[i % 3 for i in range(n)])
my_data2.head()
## LPRICE2 WRAIN DEGREES HRAIN TIME_SV v1 v2
## 0 -1.00 600 17.12 160 31 0 0
## 1 -0.45 690 16.73 80 30 1 1
## 2 -0.81 502 17.15 130 28 0 2
## 3 -1.51 420 16.13 110 26 1 0
## 4 -1.72 582 16.42 187 25 0 1
X, y = my_data2.drop(columns=['LPRICE2']), my_data2['LPRICE2']
my_model2 = LinearRegression().fit(X, y)
y_ = my_model2.predict(X)
mean_squared_error(y_, y)**0.5
## 0.256212004750575
my_scores = cross_val_score(my_model2, X, y,
cv=LeaveOneOut(),
scoring='neg_mean_squared_error')
(-my_scores.mean())**0.5
## 0.35699180359289384
import pandas as pd
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from sklearn.pipeline import Pipeline
my_url = ('https://raw.githubusercontent.com/taroyabuki'
'/fromzero/master/data/wine.csv')
my_data = pd.read_csv(my_url)
n = len(my_data)
my_data2 = my_data.assign(v1=[i % 2 for i in range(n)],
v2=[i % 3 for i in range(n)])
X, y = my_data2.drop(columns=['LPRICE2']), my_data2['LPRICE2']
my_sfs = SequentialFeatureSelector(
estimator=LinearRegression(),
direction='forward', # 変数増加法
cv=LeaveOneOut(),
scoring='neg_mean_squared_error')
my_pipeline = Pipeline([ # 変数選択の後で再訓練を行うようにする.
('sfs', my_sfs), # 変数選択
('lr', LinearRegression())]) # 回帰分析
my_params = {'sfs__n_features_to_select': range(1, 6)} # 選択する変数の上限
my_search = GridSearchCV(estimator=my_pipeline,
param_grid=my_params,
cv=LeaveOneOut(),
scoring='neg_mean_squared_error',
n_jobs=-1).fit(X, y)
my_model = my_search.best_estimator_ # 最良のパラメータで再訓練したモデル
my_search.best_estimator_.named_steps.sfs.get_support()
## array([ True, True, True, True, False, False])
import numpy as np
import pandas as pd
import warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import ElasticNet, enet_path
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from scipy.stats import zscore
warnings.simplefilter('ignore', ConvergenceWarning) # これ以降,警告を表示しない.
my_url = ('https://raw.githubusercontent.com/taroyabuki'
'/fromzero/master/data/wine.csv')
my_data = pd.read_csv(my_url)
X, y = my_data.drop(columns=['LPRICE2']), my_data['LPRICE2']
A = 2
B = 0.1
my_pipeline = Pipeline([
('sc', StandardScaler()),
('enet', ElasticNet(
alpha=A,
l1_ratio=B))])
my_pipeline.fit(X, y)
## Pipeline(steps=[('sc', StandardScaler()),
##                 ('enet', ElasticNet(alpha=2, l1_ratio=0.1))])
my_enet = my_pipeline.named_steps.enet
my_enet.intercept_
## -1.4517651851851852
pd.Series(my_enet.coef_,
index=X.columns)
## WRAIN 0.00
## DEGREES 0.07
## HRAIN -0.04
## TIME_SV 0.02
## dtype: float64
my_test = pd.DataFrame(
[[500, 17, 120, 2]])
my_pipeline.predict(my_test)
## array([-1.42])
##
## /opt/venv/lib/python3.10/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names
## warnings.warn(
As = np.e**np.arange(2, -5.5, -0.1)
B = 0.1
_, my_path, _ = enet_path(
zscore(X), zscore(y),
alphas=As,
l1_ratio=B)
pd.DataFrame(
my_path.T,
columns=X.columns,
index=np.log(As)
).plot(
xlabel='log A ( = log alpha)',
ylabel='Coefficients')
## <Axes: xlabel='log A ( = log alpha)', ylabel='Coefficients'>
As = np.linspace(0, 0.1, 21)
Bs = np.linspace(0, 0.1, 6)
my_pipeline = Pipeline([('sc', StandardScaler()),
('enet', ElasticNet())])
my_search = GridSearchCV(
estimator=my_pipeline,
param_grid={'enet__alpha': As, 'enet__l1_ratio': Bs},
cv=LeaveOneOut(),
scoring='neg_mean_squared_error',
n_jobs=-1).fit(X, y)
my_model = my_search.best_estimator_ # 最良モデル
my_search.best_params_ # 最良パラメータ
## {'enet__alpha': 0.075, 'enet__l1_ratio': 0.0}
tmp = my_search.cv_results_ # チューニングの詳細
my_scores = (-tmp['mean_test_score'])**0.5 # RMSE
my_results = pd.DataFrame(tmp['params']).assign(RMSE=my_scores).pivot(
index='enet__alpha',
columns='enet__l1_ratio',
values='RMSE')
my_results.plot(style='o-', xlabel='A ( = alpha)', ylabel='RMSE').legend(
title='B ( = l1_ratio)')
## <matplotlib.legend.Legend object at 0x7f868c994400>
(-my_search.best_score_)**0.5
## 0.31945619679509646
import matplotlib.pyplot as plt
import numpy as np
x = np.linspace(-6, 6, 100)
y = 1 / (1 + np.exp(-x))
plt.plot(x, y)
## [<matplotlib.lines.Line2D object at 0x7f857399eda0>]
import pandas as pd
import warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV, LeaveOneOut
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
my_url = ('https://raw.githubusercontent.com/taroyabuki'
'/fromzero/master/data/wine.csv')
my_data = pd.read_csv(my_url)
X, y = my_data.drop(columns=['LPRICE2']), my_data['LPRICE2']
warnings.simplefilter("ignore", ConvergenceWarning) # これ以降,警告を表示しない.
my_pipeline = Pipeline([('sc', StandardScaler()), # 標準化
('mlp', MLPRegressor())]) # ニューラルネットワーク
my_pipeline.fit(X, y) # 訓練
## Pipeline(steps=[('sc', StandardScaler()), ('mlp', MLPRegressor())])
my_scores = cross_val_score(my_pipeline, X, y, cv=LeaveOneOut(),
scoring='neg_mean_squared_error')
warnings.simplefilter("default", ConvergenceWarning) # これ以降,警告を表示する.
(-my_scores.mean())**0.5
## 0.4412589891379253
my_pipeline = Pipeline([
('sc', StandardScaler()),
('mlp', MLPRegressor(tol=1e-5, # 改善したと見なす基準
max_iter=5000))]) # 改善しなくなるまでの反復数
my_layers = (1, 3, 5, # 隠れ層1層の場合
(1, 1), (3, 1), (5, 1), (1, 2), (3, 2), (5, 2)) # 隠れ層2層の場合
my_params = {'mlp__hidden_layer_sizes': my_layers}
my_search = GridSearchCV(estimator=my_pipeline,
param_grid=my_params,
cv=LeaveOneOut(),
scoring='neg_mean_squared_error',
n_jobs=-1).fit(X, y)
my_model = my_search.best_estimator_ # 最良モデル
my_search.best_params_ # 最良パラメータ
## {'mlp__hidden_layer_sizes': 3}
(-my_search.best_score_)**0.5
## 0.37175608581060476
import statsmodels.api as sm
my_data = sm.datasets.get_rdataset('iris', 'datasets').data
my_data.head()
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 0 5.10 3.50 1.40 0.20 setosa
## 1 4.90 3.00 1.40 0.20 setosa
## 2 4.70 3.20 1.30 0.20 setosa
## 3 4.60 3.10 1.50 0.20 setosa
## 4 5.00 3.60 1.40 0.20 setosa
my_data.describe()
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## count 150.00 150.00 150.00 150.00
## mean 5.84 3.06 3.76 1.20
## std 0.83 0.44 1.77 0.76
## min 4.30 2.00 1.00 0.10
## 25% 5.10 2.80 1.60 0.30
## 50% 5.80 3.00 4.35 1.30
## 75% 6.40 3.30 5.10 1.80
## max 7.90 4.40 6.90 2.50
# 以下省略
import graphviz
import pandas as pd
import statsmodels.api as sm
from sklearn import tree
my_data = sm.datasets.get_rdataset('iris', 'datasets').data
X, y = my_data.iloc[:, 0:4], my_data.Species
my_model = tree.DecisionTreeClassifier(max_depth=2, random_state=0)
my_model.fit(X, y)
## DecisionTreeClassifier(max_depth=2, random_state=0)
my_dot = tree.export_graphviz(
decision_tree=my_model,
out_file=None, # ファイルに出力しない.
feature_names=X.columns, # 変数名
class_names=my_model.classes_, # カテゴリ名
filled=True) # 色を塗る.
graphviz.Source(my_dot)
## <graphviz.sources.Source object at 0x7f8573b84e80>
my_test = pd.DataFrame([[5.0, 3.5, 1.5, 0.5],
[6.5, 3.0, 5.0, 2.0]])
my_model.predict(my_test)
## array(['setosa', 'virginica'], dtype=object)
##
## /opt/venv/lib/python3.10/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but DecisionTreeClassifier was fitted with feature names
## warnings.warn(
pd.DataFrame(
my_model.predict_proba(my_test),
columns=my_model.classes_)
## setosa versicolor virginica
## 0 1.00 0.00 0.00
## 1 0.00 0.02 0.98
##
## /opt/venv/lib/python3.10/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but DecisionTreeClassifier was fitted with feature names
## warnings.warn(
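# Note (my addition): the UserWarning appears because my_test has no column
# names; giving it the same columns as X avoids it.
my_test = pd.DataFrame([[5.0, 3.5, 1.5, 0.5],
                        [6.5, 3.0, 5.0, 2.0]], columns=X.columns)
my_model.predict(my_test) # same predictions, without the warning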
import graphviz
import pandas as pd
import statsmodels.api as sm
from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, GridSearchCV, LeaveOneOut
my_data = sm.datasets.get_rdataset('iris', 'datasets').data
X, y = my_data.iloc[:, 0:4], my_data.Species
my_model = tree.DecisionTreeClassifier(max_depth=2, random_state=0).fit(X, y)
y_ = my_model.predict(X)
confusion_matrix(y_true=y, y_pred=y_)
## array([[50, 0, 0],
## [ 0, 49, 1],
## [ 0, 5, 45]])
my_model.score(X, y)
## 0.96
# あるいは
y_ = my_model.predict(X)
(y_ == y).mean()
## 0.96
cross_val_score(my_model, X, y, cv=LeaveOneOut()).mean()
## 0.9533333333333334
my_search = GridSearchCV(estimator=tree.DecisionTreeClassifier(random_state=0),
param_grid={'max_depth': range(1, 11)},
cv=LeaveOneOut(),
n_jobs=-1).fit(X, y)
my_search.best_params_, my_search.best_score_
## ({'max_depth': 2}, 0.9533333333333334)
my_params = {
'max_depth': range(2, 6),
'min_samples_split': [2, 20],
'min_samples_leaf': range(1, 8)}
my_search = GridSearchCV(
estimator=tree.DecisionTreeClassifier(min_impurity_decrease=0.01,
random_state=0),
param_grid=my_params,
cv=LeaveOneOut(),
n_jobs=-1).fit(X, y)
my_search.best_params_, my_search.best_score_
## ({'max_depth': 3, 'min_samples_leaf': 5, 'min_samples_split': 2}, 0.9733333333333334)
tmp = my_search.cv_results_
my_results = pd.DataFrame(tmp['params']).assign(
Accuracy=tmp['mean_test_score'])
# 正解率(検証)の最大値
my_results[my_results.Accuracy == my_results.Accuracy.max()]
## max_depth min_samples_leaf min_samples_split Accuracy
## 22 3 5 2 0.97
## 23 3 5 20 0.97
## 36 4 5 2 0.97
## 37 4 5 20 0.97
## 50 5 5 2 0.97
## 51 5 5 20 0.97
my_model = my_search.best_estimator_
my_dot = tree.export_graphviz(
decision_tree=my_model,
out_file=None,
feature_names=X.columns,
class_names=my_model.classes_,
filled=True)
graphviz.Source(my_dot)
## <graphviz.sources.Source object at 0x7f867bbdf520>
import pandas as pd
import statsmodels.api as sm
import warnings
import xgboost
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from sklearn.preprocessing import LabelEncoder
my_data = sm.datasets.get_rdataset('iris', 'datasets').data
X, y = my_data.iloc[:, 0:4], my_data.Species
label_encoder = LabelEncoder(); y = label_encoder.fit_transform(y)
my_search = GridSearchCV(RandomForestClassifier(),
param_grid={'max_features': [2, 3, 4]},
cv=LeaveOneOut(),
n_jobs=-1).fit(X, y)
my_search.best_params_
## {'max_features': 3}
my_search.cv_results_['mean_test_score']
## array([0.947, 0.96 , 0.953])
warnings.simplefilter('ignore') # これ以降,警告を表示しない.
my_search = GridSearchCV(
xgboost.XGBClassifier(eval_metric='mlogloss'),
param_grid={'n_estimators' : [50, 100, 150],
'max_depth' : [1, 2, 3],
'learning_rate' : [0.3, 0.4],
'gamma' : [0],
'colsample_bytree': [0.6, 0.8],
'min_child_weight': [1],
'subsample' : [0.5, 0.75, 1]},
cv=5, # 5分割交差検証
n_jobs=1).fit(X, y) # n_jobs=-1ではない.
warnings.simplefilter('default') # これ以降,警告を表示する.
my_search.best_params_
## {'colsample_bytree': 0.6, 'gamma': 0, 'learning_rate': 0.3, 'max_depth': 1, 'min_child_weight': 1, 'n_estimators': 50, 'subsample': 0.5}
my_search.best_score_
## 0.96
my_model = RandomForestClassifier().fit(X, y)
tmp = pd.Series(my_model.feature_importances_, index=X.columns)
tmp.sort_values().plot(kind='barh')
## <Axes: >
import numpy as np
import statsmodels.api as sm
import warnings
import xgboost
from sklearn import tree
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, LeaveOneOut
from sklearn.pipeline import Pipeline
my_data = sm.datasets.get_rdataset('iris', 'datasets').data
## /opt/venv/lib/python3.10/site-packages/pandas/core/algorithms.py:522: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.
## See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)
## common = np.find_common_type([values.dtype, comps_array.dtype], [])
n = len(my_data)
my_data['Petal.Length'] = [np.nan if i % 10 == 0 else
my_data['Petal.Length'][i] for i in range(n)]
my_data['Petal.Width'] = [np.nan if i % 10 == 1 else
my_data['Petal.Width'][i] for i in range(n)]
my_data.describe() # countの値が135の変数に,150-135=15個の欠損がある.
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## count 150.00 150.00 135.00 135.00
## mean 5.84 3.06 3.75 1.20
## std 0.83 0.44 1.76 0.77
## min 4.30 2.00 1.00 0.10
## 25% 5.10 2.80 1.55 0.30
## 50% 5.80 3.00 4.30 1.30
## 75% 6.40 3.30 5.10 1.80
## max 7.90 4.40 6.90 2.50
# 以下省略
X, y = my_data.iloc[:, 0:4], my_data.Species
my_pipeline = Pipeline([
('imputer', SimpleImputer(strategy='median')), # 欠損を中央値で埋める.
('tree', tree.DecisionTreeClassifier(random_state=0))])
my_scores = cross_val_score(my_pipeline, X, y, cv=LeaveOneOut(), n_jobs=-1)
my_scores.mean()
## 0.9333333333333333
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
warnings.simplefilter('ignore') # これ以降,警告を表示しない.
my_scores = cross_val_score(
xgboost.XGBClassifier(eval_metric='mlogloss'), X, y, cv=5)
warnings.simplefilter('default') # これ以降,警告を表示する.
my_scores.mean()
## 0.9666666666666668
import statsmodels.api as sm
from sklearn.model_selection import cross_val_score, LeaveOneOut
from sklearn.neighbors import KNeighborsClassifier
my_data = sm.datasets.get_rdataset('iris', 'datasets').data
## /opt/venv/lib/python3.10/site-packages/pandas/core/algorithms.py:522: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.
## See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)
## common = np.find_common_type([values.dtype, comps_array.dtype], [])
X, y = my_data.iloc[:, 0:4], my_data.Species
my_scores = cross_val_score(KNeighborsClassifier(), X, y, cv=LeaveOneOut())
my_scores.mean()
## 0.9666666666666667
import statsmodels.api as sm
from sklearn.model_selection import cross_val_score, LeaveOneOut
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
my_data = sm.datasets.get_rdataset('iris', 'datasets').data
## /opt/venv/lib/python3.10/site-packages/pandas/core/algorithms.py:522: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.
## See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)
## common = np.find_common_type([values.dtype, comps_array.dtype], [])
X, y = my_data.iloc[:, 0:4], my_data.Species
my_pipeline = Pipeline([('sc', StandardScaler()), # 標準化
('mlp', MLPClassifier(max_iter=1000))]) # ニューラルネットワーク
my_scores = cross_val_score(my_pipeline, X, y, cv=LeaveOneOut(), n_jobs=-1)
my_scores.mean()
## 0.9666666666666667
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
y = np.array([ 0, 1, 1, 0, 1, 0, 1, 0, 0, 1])
y_score = np.array([0.7, 0.8, 0.3, 0.4, 0.9, 0.6, 0.99, 0.1, 0.2, 0.5])
y_ = np.array([1 if 0.5 <= p else 0 for p in y_score])
y_
## array([1, 1, 0, 0, 1, 1, 1, 0, 0, 1])
confusion_matrix(y_true=y, y_pred=y_)
## array([[3, 2],
## [1, 4]])
print(classification_report(y_true=y, y_pred=y_))
## precision recall f1-score support
##
## 0 0.75 0.60 0.67 5
## 1 0.67 0.80 0.73 5
##
## accuracy 0.70 10
## macro avg 0.71 0.70 0.70 10
## weighted avg 0.71 0.70 0.70 10
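# Supplementary sketch (not in the book): the numbers in the report can be derived
# directly from the confusion matrix above ([[TN, FP], [FN, TP]] = [[3, 2], [1, 4]]).
tn, fp, fn, tp = 3, 2, 1, 4
(tp + tn) / (tp + tn + fp + fn) # accuracy = 7/10 = 0.70
tp / (tp + fp) # precision for class 1 = 4/6 (about 0.67)
tp / (tp + fn) # recall for class 1 = 4/5 = 0.80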
import numpy as np
from sklearn.metrics import (roc_curve, RocCurveDisplay,
precision_recall_curve, PrecisionRecallDisplay, auc)
y = np.array([ 0, 1, 1, 0, 1, 0, 1, 0, 0, 1])
y_score = np.array([0.7, 0.8, 0.3, 0.4, 0.9, 0.6, 0.99, 0.1, 0.2, 0.5])
y_ = np.array([1 if 0.5 <= p else 0 for p in y_score])
[sum((y == 0) & (y_ == 1)) / sum(y == 0), # FPR
sum((y == 1) & (y_ == 1)) / sum(y == 1)] # TPR
## [0.4, 0.8]
my_fpr, my_tpr, _ = roc_curve(y_true=y,
y_score=y_score,
pos_label=1) # 1 is the positive class.
RocCurveDisplay(fpr=my_fpr, tpr=my_tpr).plot()
## <sklearn.metrics._plot.roc_curve.RocCurveDisplay object at 0x7f8684401c60>
auc(x=my_fpr, y=my_tpr)
## 0.8
[sum((y == 1) & (y_ == 1)) / sum(y == 1), # Recall == TPR
sum((y == 1) & (y_ == 1)) / sum(y_ == 1)] # Precision
## [0.8, 0.6666666666666666]
my_precision, my_recall, _ = precision_recall_curve(y_true=y,
probas_pred=y_score,
pos_label=1)
PrecisionRecallDisplay(precision=my_precision, recall=my_recall).plot()
## <sklearn.metrics._plot.precision_recall_curve.PrecisionRecallDisplay object at 0x7f85737ed0f0>
auc(x=my_recall, y=my_precision)
## 0.8463095238095237
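# Supplementary sketch (not in the book): sklearn can also compute the ROC AUC
# directly from the labels and scores, without building the curve first.
from sklearn.metrics import roc_auc_score
roc_auc_score(y_true=y, y_score=y_score) # should equal the 0.8 obtained above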
import graphviz
import pandas as pd
from sklearn import tree
from sklearn.metrics import roc_curve, RocCurveDisplay, auc
from sklearn.model_selection import cross_val_score, LeaveOneOut
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
my_url = ('https://raw.githubusercontent.com/taroyabuki'
'/fromzero/master/data/titanic.csv')
my_data = pd.read_csv(my_url)
my_data.head()
## Class Sex Age Survived
## 0 1st Male Child Yes
## 1 1st Male Child Yes
## 2 1st Male Child Yes
## 3 1st Male Child Yes
## 4 1st Male Child Yes
X, y = my_data.iloc[:, 0:3], my_data.Survived
my_pipeline = Pipeline([
('ohe', OneHotEncoder(drop='first')),
('tree', tree.DecisionTreeClassifier(max_depth=2, random_state=0,
min_impurity_decrease=0.01))])
my_pipeline.fit(X, y)
## Pipeline(steps=[('ohe', OneHotEncoder(drop='first')),
## ('tree',
## DecisionTreeClassifier(max_depth=2, min_impurity_decrease=0.01,
## random_state=0))])
my_enc = my_pipeline.named_steps['ohe'] # Extract the encoder from the pipeline.
my_tree = my_pipeline.named_steps['tree'] # Extract the tree from the pipeline.
my_dot = tree.export_graphviz(
decision_tree=my_tree,
out_file=None,
feature_names=my_enc.get_feature_names() \
if hasattr(my_enc, 'get_feature_names') else my_enc.get_feature_names_out(),
class_names=my_pipeline.classes_,
filled=True)
graphviz.Source(my_dot)
## <graphviz.sources.Source object at 0x7f8573bd0700>
my_scores = cross_val_score(
my_pipeline, X, y,
cv=LeaveOneOut(),
n_jobs=-1)
my_scores.mean()
## 0.7832803271240345
tmp = pd.DataFrame(
my_pipeline.predict_proba(X),
columns=my_pipeline.classes_)
y_score = tmp.Yes
my_fpr, my_tpr, _ = roc_curve(y_true=y,
y_score=y_score,
pos_label='Yes')
my_auc = auc(x=my_fpr, y=my_tpr)
my_auc
## 0.7114886868858494
RocCurveDisplay(fpr=my_fpr, tpr=my_tpr, roc_auc=my_auc).plot()
## <sklearn.metrics._plot.roc_curve.RocCurveDisplay object at 0x7f8683e25cf0>
import matplotlib.pyplot as plt
import numpy as np
x = np.arange(-6, 6, 0.1)
y = 1 / (1 + np.exp(-x))
plt.plot(x, y)
## [<matplotlib.lines.Line2D object at 0x7f86840bd3c0>]
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, LeaveOneOut
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
my_url = ('https://raw.githubusercontent.com/taroyabuki'
'/fromzero/master/data/titanic.csv')
my_data = pd.read_csv(my_url)
X, y = my_data.iloc[:, 0:3], my_data.Survived
my_pipeline = Pipeline([('ohe', OneHotEncoder(drop='first')),
('lr', LogisticRegression(penalty='none'))])
my_pipeline.fit(X, y)
## Pipeline(steps=[('ohe', OneHotEncoder(drop='first')),
## ('lr', LogisticRegression(penalty='none'))])
my_ohe = my_pipeline.named_steps.ohe
my_lr = my_pipeline.named_steps.lr
my_lr.intercept_[0]
## 2.043878162056616
tmp = my_ohe.get_feature_names() \
if hasattr(my_ohe, 'get_feature_names') \
else my_ohe.get_feature_names_out()
pd.Series(my_lr.coef_[0],
index=tmp)
## x0_2nd -1.02
## x0_3rd -1.78
## x0_Crew -0.86
## x1_Male -2.42
## x2_Child 1.06
## dtype: float64
my_scores = cross_val_score(
my_pipeline, X, y,
cv=LeaveOneOut(),
n_jobs=-1)
my_scores.mean()
## 0.7782825988187188
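# Supplementary sketch (not in the book): a predicted probability can be reproduced
# from the intercept and coefficients above. With drop='first', the baseline is a
# 1st-class adult female; the positions below assume the coefficient order printed above.
import numpy as np
coef = pd.Series(my_lr.coef_[0], index=tmp) # same Series as printed above
z = my_lr.intercept_[0] + coef.iloc[1] + coef.iloc[3] # add x0_3rd and x1_Male
1 / (1 + np.exp(-z)) # probability of Survived == 'Yes' for a 3rd-class adult male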
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras import activations, callbacks, layers, models
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
my_url = ('https://raw.githubusercontent.com/taroyabuki'
'/fromzero/master/data/wine.csv')
tmp = pd.read_csv(my_url)
my_data = shuffle(tmp)
my_scaler = StandardScaler()
X = my_scaler.fit_transform(
my_data.drop(columns=['LPRICE2']))
y = my_data['LPRICE2']
x = np.linspace(-3, 3, 100)
plt.plot(x, activations.relu(x))
## [<matplotlib.lines.Line2D object at 0x7f84e9487ca0>]
plt.xlabel('x')
## Text(0.5, 0, 'x')
plt.ylabel('ReLU(x)')
## Text(0, 0.5, 'ReLU(x)')
my_model = models.Sequential()
my_model.add(layers.Dense(units=3, activation='relu', input_shape=[4]))
my_model.add(layers.Dense(units=1))
my_model.summary() # network summary
## Model: "sequential"
## ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
## ┃ Layer (type) ┃ Output Shape ┃ Param # ┃
## ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
## │ dense (Dense) │ (None, 3) │ 15 │
## ├─────────────────────────────────┼────────────────────────┼───────────────┤
## │ dense_1 (Dense) │ (None, 1) │ 4 │
## └─────────────────────────────────┴────────────────────────┴───────────────┘
## Total params: 19 (76.00 B)
## Trainable params: 19 (76.00 B)
## Non-trainable params: 0 (0.00 B)
my_model.compile(
loss='mse',
optimizer='rmsprop')
my_cb = callbacks.EarlyStopping(
patience=20,
restore_best_weights=True)
my_history = my_model.fit(
x=X,
y=y,
validation_split=0.25,
batch_size=10,
epochs=500,
callbacks=[my_cb],
verbose=0)
tmp = pd.DataFrame(my_history.history)
tmp.plot(xlabel='epoch')
## <Axes: xlabel='epoch'>
tmp.iloc[-1, ]
## loss 0.10
## val_loss 0.14
## Name: 383, dtype: float64
y_ = my_model.predict(X)
((y_.ravel() - y)**2).mean()
## 0.11717902360155158
import numpy as np
import pandas as pd
import statsmodels.api as sm
from keras import callbacks, layers, losses, models
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils import shuffle
tmp = sm.datasets.get_rdataset('iris', 'datasets').data
my_data = shuffle(tmp)
my_scaler = StandardScaler()
X = my_scaler.fit_transform(
my_data.drop(columns=['Species']))
my_enc = LabelEncoder()
y = my_enc.fit_transform(
my_data['Species'])
my_model = models.Sequential()
my_model.add(layers.Dense(units=3, activation='relu', input_shape=[4]))
my_model.add(layers.Dense(units=3, activation='softmax'))
my_model.compile(loss='sparse_categorical_crossentropy',
optimizer='rmsprop',
metrics=['accuracy'])
my_cb = callbacks.EarlyStopping(
patience=20,
restore_best_weights=True)
my_history = my_model.fit(
x=X,
y=y,
validation_split=0.25,
batch_size=10,
epochs=500,
callbacks=[my_cb],
verbose=0)
tmp = pd.DataFrame(my_history.history)
tmp.plot(xlabel='epoch')
## <Axes: xlabel='epoch'>
tmp.iloc[-1, ]
## accuracy 0.98
## loss 0.06
## val_accuracy 0.92
## val_loss 0.13
## Name: 434, dtype: float64
tmp = my_model.predict(X)
y_ = np.argmax(tmp, axis=-1)
(y_ == y).mean()
## 0.9666666666666667
-np.log([0.8, 0.7, 0.3, 0.8]).mean()
## 0.5017337127232719
-np.log([0.7, 0.6, 0.2, 0.7]).mean()
## 0.708403356019389
y = [2, 1, 0, 1]
y_1 = [[0.1, 0.1, 0.8],
[0.1, 0.7, 0.2],
[0.3, 0.4, 0.3],
[0.1, 0.8, 0.1]]
y_2 = [[0.1, 0.2, 0.7],
[0.2, 0.6, 0.2],
[0.2, 0.5, 0.3],
[0.2, 0.7, 0.1]]
[losses.sparse_categorical_crossentropy(y_true=y, y_pred=y_1).numpy().mean(),
losses.sparse_categorical_crossentropy(y_true=y, y_pred=y_2).numpy().mean()]
## [0.5017337, 0.70840335]
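# Supplementary sketch (not in the book): the same values follow from picking each
# true class's predicted probability and averaging the negative logs, as was done
# manually above.
import numpy as np
-np.log([y_1[i][y[i]] for i in range(len(y))]).mean() # should match 0.5017...
-np.log([y_2[i][y[i]] for i in range(len(y))]).mean() # should match 0.7084...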
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from random import sample
from keras import callbacks, layers, models
from sklearn.metrics import confusion_matrix
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
## Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
x_train.shape
## (60000, 28, 28)
np.set_printoptions(linewidth=170)
x_train[4, :, :]
## array([[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
## [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
## [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
## [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
## [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
## [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
## [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
## [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 148, 210, 253, 253, 113, 87, 148, 55, 0, 0, 0, 0, 0, 0, 0],
## [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 87, 232, 252, 253, 189, 210, 252, 252, 253, 168, 0, 0, 0, 0, 0, 0, 0],
## [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 57, 242, 252, 190, 65, 5, 12, 182, 252, 253, 116, 0, 0, 0, 0, 0, 0, 0],
## [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 96, 252, 252, 183, 14, 0, 0, 92, 252, 252, 225, 21, 0, 0, 0, 0, 0, 0, 0],
## [ 0, 0, 0, 0, 0, 0, 0, 0, 132, 253, 252, 146, 14, 0, 0, 0, 215, 252, 252, 79, 0, 0, 0, 0, 0, 0, 0, 0],
## [ 0, 0, 0, 0, 0, 0, 0, 126, 253, 247, 176, 9, 0, 0, 8, 78, 245, 253, 129, 0, 0, 0, 0, 0, 0, 0, 0, 0],
## [ 0, 0, 0, 0, 0, 0, 16, 232, 252, 176, 0, 0, 0, 36, 201, 252, 252, 169, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0],
## [ 0, 0, 0, 0, 0, 0, 22, 252, 252, 30, 22, 119, 197, 241, 253, 252, 251, 77, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
## [ 0, 0, 0, 0, 0, 0, 16, 231, 252, 253, 252, 252, 252, 226, 227, 252, 231, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
## [ 0, 0, 0, 0, 0, 0, 0, 55, 235, 253, 217, 138, 42, 24, 192, 252, 143, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
## [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 62, 255, 253, 109, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
## [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 71, 253, 252, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
## [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 253, 252, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
## [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 71, 253, 252, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
## [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 106, 253, 252, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
## [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 45, 255, 253, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
## [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 218, 252, 56, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
## [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 96, 252, 189, 42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
## [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 184, 252, 170, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0],
## [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 147, 252, 42, 0, 0, 0, 0, 0, 0, 0, 0, 0],
## [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=uint8)
plt.matshow(x_train[4, :, :])
## <matplotlib.image.AxesImage object at 0x7f83a04409a0>
y_train
## array([5, 0, 4, ..., 5, 6, 8], dtype=uint8)
x_train.min(), x_train.max()
## (0, 255)
x_train = x_train / 255
x_test = x_test / 255
my_index = sample(range(60000), 6000)
x_train = x_train[my_index, :, :]
y_train = y_train[my_index]
my_model = models.Sequential()
my_model.add(layers.Flatten(input_shape=[28, 28]))
my_model.add(layers.Dense(units=256, activation="relu"))
my_model.add(layers.Dense(units=10, activation="softmax"))
my_model.summary()
## Model: "sequential_2"
## ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
## ┃ Layer (type) ┃ Output Shape ┃ Param # ┃
## ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
## │ flatten (Flatten) │ (None, 784) │ 0 │
## ├─────────────────────────────────┼────────────────────────┼───────────────┤
## │ dense_4 (Dense) │ (None, 256) │ 200,960 │
## ├─────────────────────────────────┼────────────────────────┼───────────────┤
## │ dense_5 (Dense) │ (None, 10) │ 2,570 │
## └─────────────────────────────────┴────────────────────────┴───────────────┘
## Total params: 203,530 (795.04 KB)
## Trainable params: 203,530 (795.04 KB)
## Non-trainable params: 0 (0.00 B)
my_model.compile(loss='sparse_categorical_crossentropy',
optimizer='rmsprop',
metrics=['accuracy'])
my_cb = callbacks.EarlyStopping(patience=5, restore_best_weights=True)
my_history = my_model.fit(
x=x_train,
y=y_train,
validation_split=0.2,
batch_size=128,
epochs=20,
callbacks=[my_cb],
verbose=0)
tmp = pd.DataFrame(my_history.history)
tmp.plot(xlabel='epoch', style='o-')
## <Axes: xlabel='epoch'>
tmp = my_model.predict(x_test)
y_ = np.argmax(tmp, axis=-1)
confusion_matrix(y_true=y_test,
y_pred=y_)
## array([[ 950, 0, 2, 1, 0, 5, 17, 2, 2, 1],
## [ 0, 1117, 4, 2, 1, 1, 4, 1, 5, 0],
## [ 6, 2, 967, 10, 9, 5, 10, 5, 15, 3],
## [ 1, 1, 21, 932, 1, 29, 3, 9, 6, 7],
## [ 1, 1, 12, 2, 935, 1, 7, 2, 2, 19],
## [ 9, 1, 2, 18, 3, 835, 12, 2, 6, 4],
## [ 5, 3, 7, 3, 6, 11, 923, 0, 0, 0],
## [ 1, 12, 28, 4, 7, 2, 0, 954, 1, 19],
## [ 3, 3, 10, 25, 9, 29, 14, 4, 868, 9],
## [ 6, 9, 1, 12, 32, 13, 1, 17, 2, 916]])
(y_ == y_test).mean()
## 0.9397
my_model.evaluate(x=x_test, y=y_test)
## [0.20638760924339294, 0.9397000074386597]
x_train2d = x_train.reshape(-1, 28, 28, 1)
x_test2d = x_test.reshape(-1, 28, 28, 1)
my_model = models.Sequential()
my_model.add(layers.Conv2D(filters=32, kernel_size=3, # convolution layer
activation='relu',
input_shape=[28, 28, 1]))
my_model.add(layers.MaxPooling2D(pool_size=2)) # pooling layer
my_model.add(layers.Flatten())
my_model.add(layers.Dense(128, activation='relu'))
my_model.add(layers.Dense(10, activation='softmax'))
my_model.summary()
## Model: "sequential_3"
## ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
## ┃ Layer (type) ┃ Output Shape ┃ Param # ┃
## ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
## │ conv2d (Conv2D) │ (None, 26, 26, 32) │ 320 │
## ├─────────────────────────────────┼────────────────────────┼───────────────┤
## │ max_pooling2d (MaxPooling2D) │ (None, 13, 13, 32) │ 0 │
## ├─────────────────────────────────┼────────────────────────┼───────────────┤
## │ flatten_1 (Flatten) │ (None, 5408) │ 0 │
## ├─────────────────────────────────┼────────────────────────┼───────────────┤
## │ dense_6 (Dense) │ (None, 128) │ 692,352 │
## ├─────────────────────────────────┼────────────────────────┼───────────────┤
## │ dense_7 (Dense) │ (None, 10) │ 1,290 │
## └─────────────────────────────────┴────────────────────────┴───────────────┘
## Total params: 693,962 (2.65 MB)
## Trainable params: 693,962 (2.65 MB)
## Non-trainable params: 0 (0.00 B)
my_model.compile(loss='sparse_categorical_crossentropy',
optimizer='rmsprop',
metrics=['accuracy'])
from keras.callbacks import EarlyStopping
my_cb = EarlyStopping(patience=5,
restore_best_weights=True)
my_history = my_model.fit(
x=x_train2d,
y=y_train,
validation_split=0.2,
batch_size=128,
epochs=20,
callbacks=my_cb,
verbose=0)
tmp = pd.DataFrame(my_history.history)
tmp.plot(xlabel='epoch', style='o-')
## <Axes: xlabel='epoch'>
my_model.evaluate(x=x_test2d, y=y_test)
## [0.12661811709403992, 0.9646000266075134]
my_model = models.Sequential()
my_model.add(layers.Conv2D(filters=20, kernel_size=5, activation='relu',
input_shape=(28, 28, 1)))
my_model.add(layers.MaxPooling2D(pool_size=2, strides=2))
my_model.add(layers.Conv2D(filters=20, kernel_size=5, activation='relu'))
my_model.add(layers.MaxPooling2D(pool_size=2, strides=2))
my_model.add(layers.Dropout(rate=0.25))
my_model.add(layers.Flatten())
my_model.add(layers.Dense(500, activation='relu'))
my_model.add(layers.Dropout(rate=0.5))
my_model.add(layers.Dense(10, activation='softmax'))
my_model.compile(loss='sparse_categorical_crossentropy',
optimizer='rmsprop',
metrics=['accuracy'])
my_cb = callbacks.EarlyStopping(patience=5,
restore_best_weights=True)
my_history = my_model.fit(
x=x_train2d,
y=y_train,
validation_split=0.2,
batch_size=128,
epochs=20,
callbacks=my_cb,
verbose=0)
tmp = pd.DataFrame(my_history.history)
tmp.plot(xlabel='epoch', style='o-')
## <Axes: xlabel='epoch'>
my_model.evaluate(x=x_test2d, y=y_test)
## [0.061602503061294556, 0.9814000129699707]
y_prob = my_model.predict(x_test2d) # probability of each category
tmp = pd.DataFrame({
'y_prob': np.max(y_prob, axis=1), # maximum probability
'y_': np.argmax(y_prob, axis=1), # predicted category
'y': y_test, # true label
'id': range(len(y_test))}) # index number
tmp = tmp[tmp.y_ != tmp.y] # Keep only the misclassified cases.
my_result = tmp.sort_values('y_prob', ascending=False) # Sort by probability, descending.
my_result.head()
## y_prob y_ y id
## 3767 1.00 2 7 3767
## 9009 1.00 2 7 9009
## 1681 1.00 7 3 1681
## 6597 1.00 7 0 6597
## 2654 1.00 1 6 2654
for i in range(5):
plt.subplot(1, 5, i + 1)
ans = my_result['y'].iloc[i]
id = my_result['id'].iloc[i]
plt.title(f'{ans} ({id})')
plt.imshow(x_test[id])
plt.axis('off')
## <Axes: >
## Text(0.5, 1.0, '7 (3767)')
## <matplotlib.image.AxesImage object at 0x7f8340200ca0>
## (-0.5, 27.5, 27.5, -0.5)
## <Axes: >
## Text(0.5, 1.0, '7 (9009)')
## <matplotlib.image.AxesImage object at 0x7f83402034f0>
## (-0.5, 27.5, 27.5, -0.5)
## <Axes: >
## Text(0.5, 1.0, '3 (1681)')
## <matplotlib.image.AxesImage object at 0x7f8340255ed0>
## (-0.5, 27.5, 27.5, -0.5)
## <Axes: >
## Text(0.5, 1.0, '0 (6597)')
## <matplotlib.image.AxesImage object at 0x7f8340502740>
## (-0.5, 27.5, 27.5, -0.5)
## <Axes: >
## Text(0.5, 1.0, '6 (2654)')
## <matplotlib.image.AxesImage object at 0x7f83406e0cd0>
## (-0.5, 27.5, 27.5, -0.5)
import h2o
import pandas as pd
import tensorflow as tf
from h2o.automl import H2OAutoML
from random import sample
h2o.init()
## Checking whether there is an H2O instance running at http://localhost:54321..... not found.
## Attempting to start a local H2O server...
## Java Version: openjdk version "11.0.21" 2023-10-17; OpenJDK Runtime Environment (build 11.0.21+9-post-Ubuntu-0ubuntu122.04); OpenJDK 64-Bit Server VM (build 11.0.21+9-post-Ubuntu-0ubuntu122.04, mixed mode, sharing)
## Starting server from /opt/venv/lib/python3.10/site-packages/h2o/backend/bin/h2o.jar
## Ice root: /tmp/tmpe_p9qusv
## JVM stdout: /tmp/tmpe_p9qusv/h2o_rstudio_started_from_python.out
## JVM stderr: /tmp/tmpe_p9qusv/h2o_rstudio_started_from_python.err
## Server is running at http://127.0.0.1:54321
## Connecting to H2O server at http://127.0.0.1:54321 ... successful.
## -------------------------- ------------------------------
## H2O_cluster_uptime: 01 secs
## H2O_cluster_timezone: Etc/UTC
## H2O_data_parsing_timezone: UTC
## H2O_cluster_version: 3.46.0.1
## H2O_cluster_version_age: 10 days
## H2O_cluster_name: H2O_from_python_rstudio_nrm5ul
## H2O_cluster_total_nodes: 1
## H2O_cluster_free_memory: 15.69 Gb
## H2O_cluster_total_cores: 32
## H2O_cluster_allowed_cores: 32
## H2O_cluster_status: locked, healthy
## H2O_connection_url: http://127.0.0.1:54321
## H2O_connection_proxy: {"http": null, "https": null}
## H2O_internal_security: False
## Python_version: 3.10.12 final
## -------------------------- ------------------------------
h2o.no_progress()
# h2o.cluster().shutdown() # shut down
my_url = ('https://raw.githubusercontent.com/taroyabuki'
'/fromzero/master/data/wine.csv')
my_data = pd.read_csv(my_url)
my_frame = h2o.H2OFrame(my_data) # Convert an ordinary data frame to an H2OFrame.
# Alternatively:
my_frame = h2o.import_file(my_url, header=1) # Read the data directly.
my_frame.head(5)
## H2OFrame({'_ex': <Expr(rows <Expr()#wine.hex> slice(0, 5, 1))#py_1_sid_a906>})
# Convert back to an ordinary data frame.
h2o.as_list(my_frame).head()
## LPRICE2 WRAIN DEGREES HRAIN TIME_SV
## 0 -1.00 600 17.12 160 31
## 1 -0.45 690 16.73 80 30
## 2 -0.81 502 17.15 130 28
## 3 -1.51 420 16.13 110 26
## 4 -1.72 582 16.42 187 25
# Output omitted (it looks the same).
my_model = H2OAutoML(
max_runtime_secs=60)
my_model.train(
y='LPRICE2',
training_frame=my_frame)
my_model.leaderboard['rmse'].min()
## 0.26438711806127946
tmp = h2o.as_list(
my_model.predict(my_frame))
pd.DataFrame({
'y': my_data['LPRICE2'],
'y_': tmp['predict']}
).plot('y', 'y_', kind='scatter')
## <Axes: xlabel='y', ylabel='y_'>
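# Supplementary sketch (not in the book): RMSE of the leader model's predictions on the
# training data itself; the leaderboard value above is typically a cross-validated
# metric, so the two numbers can differ.
((my_data['LPRICE2'] - tmp['predict'])**2).mean()**0.5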
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
my_index = sample(range(60000), 6000)
x_train = x_train[my_index, :, :]
y_train = y_train[my_index]
tmp = pd.DataFrame(
x_train.reshape(-1, 28 * 28))
y = 'y'
tmp[y] = y_train
my_train = h2o.H2OFrame(tmp)
my_train[y] = my_train[y].asfactor()
tmp = pd.DataFrame(
x_test.reshape(-1, 28 * 28))
my_test = h2o.H2OFrame(tmp)
my_model = H2OAutoML(
max_runtime_secs=120)
my_model.train(
y=y,
training_frame=my_train)
my_model.leaderboard[
'mean_per_class_error'].min()
## 0.07359697813608701
tmp = h2o.as_list(
my_model.predict(my_test))
y_ = tmp.predict
(y_ == y_test).mean()
## 0.9291
import pandas as pd
pd.to_datetime('2020-01-01')
## Timestamp('2020-01-01 00:00:00')
pd.date_range(start='2021-01-01', end='2023-01-01', freq='1A')
## DatetimeIndex(['2021-12-31', '2022-12-31'], dtype='datetime64[ns]', freq='A-DEC')
pd.date_range(start='2021-01-01', end='2023-01-01', freq='1AS')
## DatetimeIndex(['2021-01-01', '2022-01-01', '2023-01-01'], dtype='datetime64[ns]', freq='AS-JAN')
pd.date_range(start='2021-01-01', end='2021-03-01', freq='2M')
## DatetimeIndex(['2021-01-31'], dtype='datetime64[ns]', freq='2M')
pd.date_range(start='2021-01-01', end='2021-03-01', freq='2MS')
## DatetimeIndex(['2021-01-01', '2021-03-01'], dtype='datetime64[ns]', freq='2MS')
pd.date_range(start='2021-01-01', end='2021-01-03', freq='1D')
## DatetimeIndex(['2021-01-01', '2021-01-02', '2021-01-03'], dtype='datetime64[ns]', freq='D')
pd.date_range(start='2021-01-01 00:00:00', end='2021-01-01 03:00:00', freq='2H')
## DatetimeIndex(['2021-01-01 00:00:00', '2021-01-01 02:00:00'], dtype='datetime64[ns]', freq='2H')
import matplotlib.pyplot as plt
import pandas as pd
from pmdarima.datasets import airpassengers
from sklearn.metrics import mean_squared_error
my_data = airpassengers.load_airpassengers()
n = len(my_data) # number of data points (144)
k = 108 # number of training data points
my_ds = pd.date_range(
start='1949/01/01',
end='1960/12/01',
freq='MS')
my_df = pd.DataFrame({
'ds': my_ds,
'x': range(n),
'y': my_data},
index=my_ds)
my_df.head()
## ds x y
## 1949-01-01 1949-01-01 0 112.00
## 1949-02-01 1949-02-01 1 118.00
## 1949-03-01 1949-03-01 2 132.00
## 1949-04-01 1949-04-01 3 129.00
## 1949-05-01 1949-05-01 4 121.00
my_train = my_df[ :k]
my_test = my_df[-(n - k): ]
y = my_test.y
plt.plot(my_train.y, label='train')
## [<matplotlib.lines.Line2D object at 0x7f8320692f20>]
plt.plot(my_test.y, label='test')
## [<matplotlib.lines.Line2D object at 0x7f83206a3730>]
plt.legend()
## <matplotlib.legend.Legend object at 0x7f83206a3ee0>
from sklearn.linear_model import LinearRegression
my_lm_model = LinearRegression()
my_lm_model.fit(my_train[['x']], my_train.y)
## LinearRegression()
X = my_test[['x']]
y_ = my_lm_model.predict(X)
mean_squared_error(y, y_)**0.5 # RMSE (test data)
## 70.63707081783771
y_ = my_lm_model.predict(my_df[['x']])
tmp = pd.DataFrame(y_,
index=my_df.index)
plt.plot(my_train.y, label='train')
## [<matplotlib.lines.Line2D object at 0x7f83204539a0>]
plt.plot(my_test.y, label='test')
## [<matplotlib.lines.Line2D object at 0x7f832045a3e0>]
plt.plot(tmp, label='model')
## [<matplotlib.lines.Line2D object at 0x7f83204751e0>]
plt.legend()
## <matplotlib.legend.Legend object at 0x7f83204765f0>
import pmdarima as pm
my_arima_model = pm.auto_arima(my_train.y, m=12, trace=True)
## Performing stepwise search to minimize aic
## ARIMA(2,1,2)(1,1,1)[12] : AIC=706.671, Time=0.49 sec
## ARIMA(0,1,0)(0,1,0)[12] : AIC=707.730, Time=0.02 sec
## ARIMA(1,1,0)(1,1,0)[12] : AIC=704.186, Time=0.05 sec
## ARIMA(0,1,1)(0,1,1)[12] : AIC=704.801, Time=0.07 sec
## ARIMA(1,1,0)(0,1,0)[12] : AIC=704.001, Time=0.03 sec
## ARIMA(1,1,0)(0,1,1)[12] : AIC=704.472, Time=0.09 sec
## ARIMA(1,1,0)(1,1,1)[12] : AIC=705.993, Time=0.17 sec
## ARIMA(2,1,0)(0,1,0)[12] : AIC=705.691, Time=0.04 sec
## ARIMA(1,1,1)(0,1,0)[12] : AIC=705.081, Time=0.04 sec
## ARIMA(0,1,1)(0,1,0)[12] : AIC=704.376, Time=0.02 sec
## ARIMA(2,1,1)(0,1,0)[12] : AIC=707.075, Time=0.06 sec
## ARIMA(1,1,0)(0,1,0)[12] intercept : AIC=705.875, Time=0.04 sec
##
## Best model: ARIMA(1,1,0)(0,1,0)[12]
## Total fit time: 1.137 seconds
y_, my_ci = my_arima_model.predict(len(my_test), # same horizon as the test data
alpha=0.05, # significance level (default)
return_conf_int=True) # also return the confidence interval
tmp = pd.DataFrame({'y': y_,
'Lo': my_ci[:, 0],
'Hi': my_ci[:, 1]},
index=my_test.index)
tmp.head()
## y Lo Hi
## 1958-01-01 345.96 327.09 364.84
## 1958-02-01 331.73 308.04 355.43
## 1958-03-01 386.79 358.52 415.06
## 1958-04-01 378.77 346.70 410.85
## 1958-05-01 385.78 350.27 421.28
mean_squared_error(y, y_)**0.5
## 22.132236754717276
plt.plot(my_train.y, label='train')
## [<matplotlib.lines.Line2D object at 0x7f83201f3c40>]
plt.plot(my_test.y, label='test')
## [<matplotlib.lines.Line2D object at 0x7f83007080a0>]
plt.plot(tmp.y, label='model')
## [<matplotlib.lines.Line2D object at 0x7f83201f7ac0>]
plt.fill_between(tmp.index,
tmp.Lo,
tmp.Hi,
alpha=0.25) # opacity
## <matplotlib.collections.PolyCollection object at 0x7f83007081f0>
plt.legend(loc='upper left')
## <matplotlib.legend.Legend object at 0x7f830070b070>
try: from fbprophet import Prophet
except ImportError: from prophet import Prophet
## Importing plotly failed. Interactive plots will not work.
my_prophet_model = Prophet(seasonality_mode='multiplicative')
my_prophet_model.fit(my_train)
## <prophet.forecaster.Prophet object at 0x7f83007ebf40>
## 15:49:47 - cmdstanpy - INFO - Chain [1] start processing
## 15:49:47 - cmdstanpy - INFO - Chain [1] done processing
tmp = my_prophet_model.predict(my_test)
tmp[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].head()
## ds yhat yhat_lower yhat_upper
## 0 1958-01-01 359.04 349.74 368.31
## 1 1958-02-01 350.52 341.42 359.40
## 2 1958-03-01 406.99 398.17 415.77
## 3 1958-04-01 398.26 389.11 407.11
## 4 1958-05-01 402.34 393.77 411.53
y_ = tmp.yhat
mean_squared_error(y, y_)**0.5
## 33.43445811794536
# my_prophet_model.plot(tmp) # when only the forecast is needed
fig = my_prophet_model.plot(tmp)
fig.axes[0].plot(my_train.ds, my_train.y)
## [<matplotlib.lines.Line2D object at 0x7f83003d40a0>]
fig.axes[0].plot(my_test.ds, my_test.y, color='red')
## [<matplotlib.lines.Line2D object at 0x7f83003d4e50>]
import numpy as np
import pandas as pd
from pca import pca
from scipy.stats import zscore
my_data = pd.DataFrame(
{'language': [ 0, 20, 20, 25, 22, 17],
'english': [ 0, 20, 40, 20, 24, 18],
'math': [100, 20, 5, 30, 17, 25],
'science': [ 0, 20, 5, 25, 16, 23],
'society': [ 0, 20, 30, 0, 21, 17]},
index= ['A', 'B', 'C', 'D', 'E', 'F'])
my_model = pca(n_components=5)
my_result = my_model.fit_transform(my_data) # run the principal component analysis
## [pca] >Extracting column labels from dataframe.
## [pca] >Extracting row labels from dataframe.
## [pca] >The PCA reduction is performed on the [5] columns of the input dataframe.
## [pca] >Fit using PCA.
## [pca] >Compute loadings and PCs.
## [pca] >Compute explained variance.
## [pca] >Outlier detection using Hotelling T2 test with alpha=[0.05] and n_components=[5]
## [pca] >Multiple test correction applied for Hotelling T2 test: [fdr_bh]
## [pca] >Outlier detection using SPE/DmodX with n_std=[3]
my_result['PC'] # principal component scores
## PC1 PC2 PC3 PC4 PC5
## A 74.91 7.01 0.36 0.08 0.00
## B -13.82 -2.75 -5.27 0.84 0.00
## C -33.71 18.42 4.88 -0.88 -0.00
## D -1.73 -17.88 7.93 -0.15 -0.00
## E -17.84 1.06 -1.65 2.31 0.00
## F -7.81 -5.86 -6.24 -2.20 0.00
my_model.biplot(legend=False)
## [pca] >Plot PC1 vs PC2 with loadings.
## (<Figure size 2500x1500 with 1 Axes>, <Axes: title={'center': '5 Principal Components explain [99.99%] of the variance'}, xlabel='PC1 (88.8% expl.var)', ylabel='PC2 (9.11% expl.var)'>)
##
## [scatterd] >INFO> Create scatterplot
my_result['loadings']
## language english math science society
## PC1 -0.21 -0.30 0.89 -0.13 -0.25
## PC2 -0.28 0.33 0.10 -0.70 0.56
## PC3 0.31 0.62 0.06 -0.34 -0.64
## PC4 0.76 -0.47 -0.01 -0.42 0.13
## PC5 -0.45 -0.45 -0.45 -0.45 -0.45
my_result['explained_var']
## array([0.888, 0.98 , 0.999, 1. , 1. ])
tmp = zscore(my_data, ddof=1) # standardization
my_result = my_model.fit_transform(
tmp)
## [pca] >Cleaning previous fitted model results..
## [pca] >Extracting column labels from dataframe.
## [pca] >Extracting row labels from dataframe.
## [pca] >The PCA reduction is performed on the [5] columns of the input dataframe.
## [pca] >Fit using PCA.
## [pca] >Compute loadings and PCs.
## [pca] >Compute explained variance.
## [pca] >Outlier detection using Hotelling T2 test with alpha=[0.05] and n_components=[5]
## [pca] >Multiple test correction applied for Hotelling T2 test: [fdr_bh]
## [pca] >Outlier detection using SPE/DmodX with n_std=[3]
my_result['PC'] # principal component scores
## PC1 PC2 PC3 PC4 PC5
## A 3.67 0.57 0.06 0.01 0.00
## B -0.65 -0.25 -0.43 0.09 -0.00
## C -1.57 1.74 0.38 -0.10 -0.00
## D -0.25 -1.64 0.68 -0.03 0.00
## E -0.89 0.11 -0.09 0.23 0.00
## F -0.32 -0.53 -0.59 -0.20 -0.00
tmp = my_data - my_data.mean()
Z = np.matrix(tmp) # without standardization
## <string>:1: PendingDeprecationWarning: the matrix subclass is not the recommended way to represent matrices or deal with linear algebra (see https://docs.scipy.org/doc/numpy/user/numpy-for-matlab-users.html). Please adjust your code to use regular ndarray.
#Z = np.matrix(tmp / my_data.std(ddof=1)) # standardize by the square root of the unbiased variance
#Z = np.matrix(tmp / my_data.std(ddof=0)) # to match pca(normalize=True)
n = len(my_data)
S = np.cov(Z, rowvar=0, ddof=0) # variance-covariance matrix
#S = Z.T @ Z / n # (same result)
vals, vecs = np.linalg.eig(S) # eigenvalues and eigenvectors
idx = np.argsort(-vals) # indices in decreasing order of eigenvalue
vals, vecs = vals[idx], vecs[:, idx] # reorder by decreasing eigenvalue
Z @ vecs # principal component scores (output omitted in the book)
## matrix([[ 7.491e+01, 7.011e+00, -3.615e-01, 7.634e-02, -1.707e-14],
## [-1.382e+01, -2.753e+00, 5.273e+00, 8.418e-01, 6.693e-15],
## [-3.371e+01, 1.842e+01, -4.876e+00, -8.820e-01, 5.864e-16],
## [-1.731e+00, -1.788e+01, -7.925e+00, -1.493e-01, 6.974e-16],
## [-1.784e+01, 1.065e+00, 1.653e+00, 2.313e+00, 1.419e-15],
## [-7.806e+00, -5.863e+00, 6.237e+00, -2.199e+00, 1.119e-14]])
vals.cumsum() / vals.sum() # cumulative proportion of variance explained
## array([0.888, 0.98 , 0.999, 1. , 1. ])
U, d, V = np.linalg.svd(Z, full_matrices=False) # singular value decomposition
W = np.diag(d)
[np.isclose(Z, U @ W @ V).all(), # check 1
np.isclose(U.T @ U, np.identity(U.shape[1])).all(), # check 2
np.isclose(V @ V.T, np.identity(V.shape[0])).all()] # check 3
## [True, True, True]
U @ W # principal component scores (output omitted in the book)
## matrix([[-7.491e+01, -7.011e+00, -3.615e-01, 7.634e-02, -6.852e-16],
## [ 1.382e+01, 2.753e+00, 5.273e+00, 8.418e-01, 8.962e-16],
## [ 3.371e+01, -1.842e+01, -4.876e+00, -8.820e-01, -5.183e-16],
## [ 1.731e+00, 1.788e+01, -7.925e+00, -1.493e-01, -5.827e-16],
## [ 1.784e+01, -1.065e+00, 1.653e+00, 2.313e+00, -1.907e-15],
## [ 7.806e+00, 5.863e+00, 6.237e+00, -2.199e+00, -1.438e-15]])
e = d ** 2 / n # eigenvalues of the variance-covariance matrix
e.cumsum() / e.sum() # cumulative proportion of variance explained
## array([0.888, 0.98 , 0.999, 1. , 1. ])
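# Supplementary sketch (not in the book): sklearn's PCA should give the same cumulative
# proportions of explained variance (it centers the data internally, matching the
# unstandardized case above).
from sklearn.decomposition import PCA
my_pca = PCA().fit(my_data)
my_pca.explained_variance_ratio_.cumsum() # should match the array above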
import pandas as pd
from scipy.cluster import hierarchy
my_data = pd.DataFrame(
{'x': [ 0, -16, 10, 10],
'y': [ 0, 0, 10, -15]},
index=['A', 'B', 'C', 'D'])
my_result = hierarchy.linkage(
my_data,
metric='euclidean', # may be omitted (default)
method='complete')
hierarchy.dendrogram(my_result,
labels=my_data.index)
## {'icoord': [[25.0, 25.0, 35.0, 35.0], [15.0, 15.0, 30.0, 30.0], [5.0, 5.0, 22.5, 22.5]], 'dcoord': [[0.0, 14.142135623730951, 14.142135623730951, 0.0], [0.0, 25.0, 25.0, 14.142135623730951], [0.0, 30.01666203960727, 30.01666203960727, 25.0]], 'ivl': ['B', 'D', 'A', 'C'], 'leaves': [1, 3, 0, 2], 'color_list': ['C1', 'C0', 'C0'], 'leaves_color_list': ['C0', 'C0', 'C1', 'C1']}
hierarchy.cut_tree(my_result, 3)
## array([[0],
## [1],
## [0],
## [2]])
# Supplement (easier to read):
my_data.assign(cluster=
hierarchy.cut_tree(my_result, 3))
## x y cluster
## A 0 0 0
## B -16 0 1
## C 10 10 0
## D 10 -15 2
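# Supplementary sketch (not in the book): scipy's fcluster is an alternative to cut_tree;
# criterion='maxclust' asks for a given number of clusters (labels start at 1 here).
hierarchy.fcluster(my_result, t=3, criterion='maxclust')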
import pandas as pd
import seaborn as sns
my_data = pd.DataFrame(
{'language': [ 0, 20, 20, 25, 22, 17],
'english': [ 0, 20, 40, 20, 24, 18],
'math': [100, 20, 5, 30, 17, 25],
'science': [ 0, 20, 5, 25, 16, 23],
'society': [ 0, 20, 30, 0, 21, 17]},
index= ['A', 'B', 'C', 'D', 'E', 'F'])
sns.clustermap(my_data, z_score=1) # standardize each column
## <seaborn.matrix.ClusterGrid object at 0x7f82c43417b0>
import pandas as pd
from sklearn.cluster import KMeans
my_data = pd.DataFrame(
{'x': [ 0, -16, 10, 10],
'y': [ 0, 0, 10, -15]},
index=['A', 'B', 'C', 'D'])
my_result = KMeans(
n_clusters=3).fit(my_data)
my_result.labels_
## array([1, 0, 1, 2], dtype=int32)
# Supplement (easier to read):
my_data.assign(
cluster=my_result.labels_)
## x y cluster
## A 0 0 1
## B -16 0 0
## C 10 10 1
## D 10 -15 2
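# Supplementary sketch (not in the book): the coordinates of the three centroids.
# Note that k-means label numbers are arbitrary and can change between runs.
my_result.cluster_centers_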
import pandas as pd
import statsmodels.api as sm
from sklearn.cluster import KMeans
iris = sm.datasets.get_rdataset('iris', 'datasets').data
my_data = iris.iloc[:, 0:4]
k = range(1, 11)
my_df = pd.DataFrame({
'k': k,
'inertia': [KMeans(k).fit(my_data).inertia_ for k in range(1, 11)]})
my_df.plot(x='k', style='o-', legend=False)
## <Axes: xlabel='k'>
import seaborn as sns
import statsmodels.api as sm
from pca import pca
from scipy.cluster import hierarchy
from scipy.stats import zscore
from sklearn.cluster import KMeans
iris = sm.datasets.get_rdataset('iris', 'datasets').data
my_data = zscore(iris.iloc[:, 0:4])
my_model = pca() # principal component analysis
my_result = my_model.fit_transform(my_data)['PC']
## [pca] >Extracting column labels from dataframe.
## [pca] >Extracting row labels from dataframe.
## [pca] >The PCA reduction is performed to capture [95.0%] explained variance using the [4] columns of the input data.
## [pca] >Fit using PCA.
## [pca] >Compute loadings and PCs.
## [pca] >Compute explained variance.
## [pca] >Number of components is [2] that covers the [95.00%] explained variance.
## [pca] >The PCA reduction is performed on the [4] columns of the input dataframe.
## [pca] >Fit using PCA.
## [pca] >Compute loadings and PCs.
## [pca] >Outlier detection using Hotelling T2 test with alpha=[0.05] and n_components=[2]
## [pca] >Multiple test correction applied for Hotelling T2 test: [fdr_bh]
## [pca] >Outlier detection using SPE/DmodX with n_std=[3]
my_result['Species'] = list(iris.Species)
# For non-hierarchical cluster analysis:
my_result['cluster'] = KMeans(n_clusters=3).fit(my_data).labels_
# For hierarchical cluster analysis:
#my_result['cluster'] = hierarchy.cut_tree(
# hierarchy.linkage(my_data, method='complete'), 3)[:,0]
sns.scatterplot(x='PC1', y='PC2', data=my_result, legend=False,
hue='cluster', # color encodes the cluster
style='Species', # marker shape encodes the species
palette='bright')
## <Axes: xlabel='PC1', ylabel='PC2'>