辻真吾・矢吹太朗『ゼロからはじめるデータサイエンス入門』(講談社, 2021) (Shingo Tsuji and Taro Yabuki, Introduction to Data Science from Scratch, Kodansha, 2021)

How to Knit in RStudio

  1. docker run -d -e PASSWORD=password -e ROOT=TRUE -p 8787:8787 -v "$(pwd)":/home/rstudio/work taroyabuki/rstudio
  2. Open http://localhost:8787 in a browser.
  3. In RStudio's Files tab, open work/r.Rmd.
  4. Run the following in the RStudio Console:
install.packages(c("knitr", "markdown", "rmarkdown"))
  5. Knit.
# This is an example of R code.
1 + 1
## [1] 2

1 Computers and Networks

1.1 Basic Computer Operations

1.2 How Networks Work

2 Environments for Data Science

2.1 Choosing an Execution Environment

2.2 The Cloud

2.3 Docker

2.4 Using the Terminal

2.5 R and Python

2.6 Using the Sample Code

1 + 1
## [1] 2
print(1 + 2)
## [1] 3
1 + 3
## [1] 4

3 R and Python

# Environment setup for Google Colaboratory
if (file.exists("/content")) {
  options(Ncpus = parallel::detectCores())
  installed_packages <- rownames(installed.packages())
  packages_to_install <- c("furrr", "keras", "proxy")
  install.packages(setdiff(packages_to_install, installed_packages))
}

3.1 Getting Started

0x10
## [1] 16
1.23e5
## [1] 123000
2 * 3
## [1] 6
10 / 3
## [1] 3.333333
10 %/% 3 # quotient
## [1] 3
10 %% 3  # remainder
## [1] 1
x <- 2
y <- 3
x * y
## [1] 6
keras::`%<-%`(c(x, y), c(20, 30)) # assign several names at once
x * y
## [1] 600
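
The %<-% operator is re-exported by keras from the zeallot package, so after attaching either package the same multiple assignment can be written infix. A minimal sketch (assuming zeallot or keras is attached; not from the book):

library(zeallot)       # or library(keras)
c(x, y) %<-% c(20, 30) # same effect as the prefix call above
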
x <- 1 + 1
# The result is not displayed at this point.

x # Evaluate the variable name.
## [1] 2
my_s <- "abcde"
nchar(my_s)
## [1] 5
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.1.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
str_c("This is ", "a", " pen.")
## [1] "This is a pen."
substr(x = my_s, start = 2, stop = 4)
## [1] "bcd"
tmp <- "%s is %s."
sprintf(tmp, "This", "a pen")
## [1] "This is a pen."
1 <= 2
## [1] TRUE
1 < 0
## [1] FALSE
0.1 + 0.1 + 0.1 == 0.3
## [1] FALSE
all.equal(0.1 + 0.1 + 0.1, 0.3)
## [1] TRUE
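
To see why 0.1 + 0.1 + 0.1 == 0.3 is FALSE, print both sides with more digits (base R only, so the output is deterministic):

print(0.1 + 0.1 + 0.1, digits = 17) # the sum is not exactly 0.3
## [1] 0.30000000000000004
print(0.3, digits = 17)             # nor is the literal 0.3
## [1] 0.29999999999999999
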
TRUE & FALSE # logical AND
## [1] FALSE
TRUE | FALSE # logical OR
## [1] TRUE
!TRUE        # logical NOT
## [1] FALSE
ifelse(3 < 5, 0, 10)
## [1] 0
getwd()
## [1] "/home/rstudio/work"
setwd("..")
getwd()
## [1] "/home/rstudio"

3.2 Functions

sqrt(4)
## [1] 2
log(100, 10)
## [1] 2
log(100)         # natural logarithm
## [1] 4.60517
# Alternatively:
log(100, exp(1)) # with the base written out
## [1] 4.60517
log10(100) # common logarithm (base 10)
## [1] 2
log2(1024) # logarithm base 2
## [1] 10
library(tidyverse)
4 %>% sqrt
## [1] 2
exp(log(5))       # conventional style
## [1] 5
# Alternatively:
5 %>% log %>% exp # style using the pipe
## [1] 5
f <- function(a, b) {
  a - b
}
f(3, 5)
## [1] -2
f <- function(a, b = 5) {
  a - b
}

f(3) # same as f(3, 5)
## [1] -2
(function(a, b) { a - b })(3, 5)
## [1] -2
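
On R 4.1 or later, the backslash lambda shorthand can replace function(); a minimal sketch of the same call:

(\(a, b) { a - b })(3, 5) # requires R >= 4.1
## [1] -2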

3.3 Collections

x <- c("foo", "bar", "baz")
length(x)
## [1] 3
x[2]
## [1] "bar"
x[2] <- "BAR"
x # Check the result.
## [1] "foo" "BAR" "baz"
x[2] <- "bar" # Restore the original value.
x[-2]
## [1] "foo" "baz"
c(x, "qux")
## [1] "foo" "bar" "baz" "qux"
x <- c(x, "qux")
x # Check the result.
## [1] "foo" "bar" "baz" "qux"
1:5
## [1] 1 2 3 4 5
seq(from = 0, to = 10, by = 2)
## [1]  0  2  4  6  8 10
seq(from = 0, to = 1, by = 0.5)
## [1] 0.0 0.5 1.0
seq(from = 0, to = 100, length.out = 5)
## [1]   0  25  50  75 100
rep(x = 10, times = 5)
## [1] 10 10 10 10 10
tmp <- c("グー", "パー", "グー", "パー")
x <- factor(tmp, levels = c("グー", "チョキ", "パー"))
x
## [1] グー パー グー パー
## Levels: グー チョキ パー
x <- c(2, 3, 5, 7)

x + 10 # addition
## [1] 12 13 15 17
x * 10 # multiplication
## [1] 20 30 50 70
x <- c(2, 3)
sin(x)
## [1] 0.9092974 0.1411200
x <- c(2,  3,   5,    7)
y <- c(1, 10, 100, 1000)
x + y
## [1]    3   13  105 1007
x * y
## [1]    2   30  500 7000
sum(x * y)
## [1] 7532
x <- c(TRUE, FALSE)
y <- c(TRUE, TRUE)
x & y
## [1]  TRUE FALSE
u <- c(1, 2, 3)
v <- c(1, 2, 3)
w <- c(1, 2, 4)

identical(u, v) # compare the whole vectors
## [1] TRUE
identical(u, w) # compare the whole vectors
## [1] FALSE
u == v          # element-wise comparison
## [1] TRUE TRUE TRUE
u == w          # element-wise comparison
## [1]  TRUE  TRUE FALSE
sum(u == w)  # number of matching elements
## [1] 2
mean(u == w) # proportion of matching elements
## [1] 0.6666667
x <- list(1, "two")
x[[2]]
## [1] "two"
x <- list("apple"  = "りんご",
          "orange" = "みかん")
x[["grape"]] <- "ぶどう"
x$apple
## [1] "りんご"
# Alternatively:
x$"apple"
## [1] "りんご"
# Alternatively:
x[["apple"]]
## [1] "りんご"
# Alternatively:
tmp <- "apple"
x[[tmp]]
## [1] "りんご"
x <- c("foo", "bar", "baz")
y <- x
y[2] <- "BAR" # Update y.
y
## [1] "foo" "BAR" "baz"
x             # x is unchanged.
## [1] "foo" "bar" "baz"

3.4 Data Frames

library(tidyverse)
my_df <- data.frame(
  name    = c("A", "B", "C", "D"),
  english = c( 60,  90,  70,  90),
  math    = c( 70,  80,  90, 100),
  gender  = c("f", "m", "m", "f"))
my_df <- tribble(
  ~name, ~english, ~math, ~gender,
  "A",         60,    70,     "f",
  "B",         90,    80,     "m",
  "C",         70,    90,     "m",
  "D",         90,   100,     "f")
head(my_df)
## # A tibble: 4 × 4
##   name  english  math gender
##   <chr>   <dbl> <dbl> <chr> 
## 1 A          60    70 f     
## 2 B          90    80 m     
## 3 C          70    90 m     
## 4 D          90   100 f
# Output omitted.
dim(my_df)  # numbers of rows and columns
## [1] 4 4
nrow(my_df) # number of rows
## [1] 4
ncol(my_df) # number of columns
## [1] 4
my_df2 <- expand.grid(
  X = c(1, 2, 3),
  Y = c(10, 100))
my_df2
##   X   Y
## 1 1  10
## 2 2  10
## 3 3  10
## 4 1 100
## 5 2 100
## 6 3 100
colnames(my_df2)
## [1] "X" "Y"
colnames(my_df2) <- c("P", "Q")
my_df2
##   P   Q
## 1 1  10
## 2 2  10
## 3 3  10
## 4 1 100
## 5 2 100
## 6 3 100
# Remainder omitted.
row.names(my_df)
## [1] "1" "2" "3" "4"
row.names(my_df2) <-
  c("a", "b", "c", "d", "e", "f")
my_df2
##   P   Q
## a 1  10
## b 2  10
## c 3  10
## d 1 100
## e 2 100
## f 3 100
# Remainder omitted.
my_df3 <- data.frame(
  english =   c( 60,  90,  70,  90),
  math    =   c( 70,  80,  90, 100),
  gender  =   c("f", "m", "m", "f"),
  row.names = c("A", "B", "C", "D"))
my_df3
##   english math gender
## A      60   70      f
## B      90   80      m
## C      70   90      m
## D      90  100      f
tmp <- data.frame(
  name    = "E",
  english =  80,
  math    =  80,
  gender  = "m")
my_df2 <- rbind(my_df, tmp)
my_df2 <- my_df %>%
  mutate(id = c(1, 2, 3, 4))
my_df3 <- my_df               # copy
my_df3["id"] <- c(1, 2, 3, 4) # update
my_df3 # Check the result (omitted).
## # A tibble: 4 × 5
##   name  english  math gender    id
##   <chr>   <dbl> <dbl> <chr>  <dbl>
## 1 A          60    70 f          1
## 2 B          90    80 m          2
## 3 C          70    90 m          3
## 4 D          90   100 f          4
my_df[1, 2]
## # A tibble: 1 × 1
##   english
##     <dbl>
## 1      60
x <- my_df[, 2]
# Alternatively:
x <- my_df$english
# Alternatively:
x <- my_df$"english"
# Alternatively:
x <- my_df[["english"]]
# Alternatively:
tmp <- "english"
x <- my_df[[tmp]]

x # Check the result (omitted).
## [1] 60 90 70 90
x <- my_df %>% select(name, math)
x <- my_df[, c(1, 3)]
x <- my_df %>%
  select(-c(english, gender))
# Alternatively:
x <- my_df[, -c(2, 4)]
x <- my_df[c(1, 3), ]
x <- my_df[-c(2, 4), ]
x <- my_df[my_df$gender == "m", ]
# Alternatively:
x <- my_df %>% filter(gender == "m")
x <- my_df[my_df$english > 80 & my_df$gender == "m", ]
# Alternatively:
x <- my_df %>% filter(english > 80 & gender == "m")
x <- my_df[my_df$english == max(my_df$english), ]
# Alternatively:
x <- my_df %>% filter(english == max(my_df$english))
my_df2 <- my_df # copy
my_df2[my_df$gender == "m", ]$gender <- "M"
my_df2
## # A tibble: 4 × 4
##   name  english  math gender
##   <chr>   <dbl> <dbl> <chr> 
## 1 A          60    70 f     
## 2 B          90    80 M     
## 3 C          70    90 M     
## 4 D          90   100 f
x <- my_df %>% arrange(english)
x <- my_df %>% arrange(-english)
x <- c(2, 3, 5, 7, 11, 13, 17, 19, 23,
       29, 31, 37)
A <- matrix(
  data = x,     # one-dimensional data
  nrow = 3,     # number of rows
  byrow = TRUE) # fill by row
A
##      [,1] [,2] [,3] [,4]
## [1,]    2    3    5    7
## [2,]   11   13   17   19
## [3,]   23   29   31   37
A <- my_df[, c(2, 3)] %>% as.matrix
A
##      english math
## [1,]      60   70
## [2,]      90   80
## [3,]      70   90
## [4,]      90  100
as.data.frame(A)
##   english math
## 1      60   70
## 2      90   80
## 3      70   90
## 4      90  100
t(A)
##         [,1] [,2] [,3] [,4]
## english   60   90   70   90
## math      70   80   90  100
t(A) %*% A
##         english  math
## english   24700 26700
## math      26700 29400
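
Base R's crossprod computes t(A) %*% A in one step; it is equivalent to the product above:

crossprod(A)
##         english  math
## english   24700 26700
## math      26700 29400
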
my_wider <- data.frame(
  day = c(25, 26, 27),
  min = c(20, 21, 15),
  max = c(24, 27, 21))
my_longer <- my_wider %>%
  pivot_longer(-day)
my_longer
## # A tibble: 6 × 3
##     day name  value
##   <dbl> <chr> <dbl>
## 1    25 min      20
## 2    25 max      24
## 3    26 min      21
## 4    26 max      27
## 5    27 min      15
## 6    27 max      21
my_longer %>% pivot_wider()
## # A tibble: 3 × 3
##     day   min   max
##   <dbl> <dbl> <dbl>
## 1    25    20    24
## 2    26    21    27
## 3    27    15    21
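
pivot_wider() works here because its defaults are names_from = name and values_from = value; spelling the arguments out makes the round trip explicit:

my_longer %>%
  pivot_wider(names_from = name,
              values_from = value)
## # A tibble: 3 × 3
##     day   min   max
##   <dbl> <dbl> <dbl>
## 1    25    20    24
## 2    26    21    27
## 3    27    15    21
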
my_longer %>%
  ggplot(aes(x = day, y = value,
             color = name)) +
  geom_point() +
  geom_line() +
  ylab("temperature") + # y軸ラベル
  scale_x_continuous(
    breaks = my_longer$day) # x軸目盛り

3.5 (Dis)similarity of One-dimensional Data

A <- c(3,   4,  5)
B <- c(3,   4, 29)
C <- c(9, -18,  8)
AB <- B - A
AC <- C - A

sum(AB^2)^0.5
## [1] 24
sum(AC^2)^0.5
## [1] 23
sum(abs(AB))
## [1] 24
sum(abs(AC))
## [1] 31
sum(A * B) /
  sum(A * A)^0.5 / sum(B * B)^0.5
## [1] 0.8169679
sum(A * C) /
  sum(A * A)^0.5 / sum(C * C)^0.5
## [1] -0.03265116
cor(A, B)
## [1] 0.8824975
cor(A, C)
## [1] -0.03266277
library(tidyverse)

my_df <- data.frame(
  x = c(3,  3,   9),
  y = c(4,  4, -18),
  z = c(5, 29,   8),
  row.names = c("A", "B", "C"))

# Euclidean distance
my_df %>% proxy::dist("Euclidean")
##    A  B
## B 24   
## C 23 31
# Manhattan distance
my_df %>% proxy::dist("Manhattan")
##    A  B
## B 24   
## C 31 49
# cosine similarity
my_df %>% proxy::simil("cosine")
##             A           B
## B  0.81696786            
## C -0.03265116  0.29342441
# correlation coefficient
my_df %>% proxy::simil("correlation")
##             A           B
## B  0.88249750            
## C -0.03266277  0.44124132
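
For the two distances, base R's stats::dist gives the same results without the proxy package (cosine similarity and the correlation coefficient still need proxy::simil):

dist(my_df, method = "euclidean")
##    A  B
## B 24   
## C 23 31
dist(my_df, method = "manhattan")
##    A  B
## B 24   
## C 31 49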

3.6 R Packages and Python Modules

library(tidyverse)

3.7 Iteration

library(tidyverse)
f1 <- function(x) {
  tmp <- runif(x)
  mean(tmp)
}

f1(10)           # sanity check
## [1] 0.5821029
replicate(n = 3, expr = f1(10))
## [1] 0.4878048 0.4695691 0.5339249
rep(x = f1(10), times = 3)
## [1] 0.6140743 0.6140743 0.6140743
v <- c(5, 10, 100)
v %>% map_dbl(f1)
## [1] 0.6447337 0.5604720 0.5270688
rep(x = 10, times = 3) %>% map_dbl(f1)
## [1] 0.6154754 0.4872310 0.4831569
# Output omitted.
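
Base R's sapply plays the same role as purrr's map_dbl here; a minimal sketch (the values differ from run to run because f1 draws random numbers):

v <- c(5, 10, 100)
sapply(v, f1) # base-R counterpart of v %>% map_dbl(f1)
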
f2 <- function(n) {
  tmp <- runif(n)
  list(x = n,
       p = mean(tmp),
       q = sd(tmp))
}

f2(10) # sanity check
## $x
## [1] 10
## 
## $p
## [1] 0.3094924
## 
## $q
## [1] 0.2527441
v <- c(5, 10, 100)
v %>% map_dfr(f2)
## # A tibble: 3 × 3
##       x     p     q
##   <dbl> <dbl> <dbl>
## 1     5 0.485 0.274
## 2    10 0.399 0.328
## 3   100 0.580 0.283
f3 <- function(x, y) {
  tmp <- runif(x, min = 1,
                  max = y + 1) %>%
    as.integer
  list(x = x,
       y = y,
       p = mean(tmp),
       q = sd(tmp))
}

f3(x = 10, y = 6) # sanity check
## $x
## [1] 10
## 
## $y
## [1] 6
## 
## $p
## [1] 3.5
## 
## $q
## [1] 1.581139
my_df <- data.frame(
  x = c(5, 10, 100,  5, 10, 100),
  y = c(6,  6,   6, 12, 12,  12))

my_df %>% pmap_dfr(f3)
## # A tibble: 6 × 4
##       x     y     p     q
##   <dbl> <dbl> <dbl> <dbl>
## 1     5     6  3     2   
## 2    10     6  2.6   1.78
## 3   100     6  3.41  1.70
## 4     5    12  7.8   4.09
## 5    10    12  6.1   3.38
## 6   100    12  6.56  3.53
library(furrr)
## Loading required package: future
plan(multisession) # setup

v <- c(5, 10, 100)
v %>% future_map_dbl(f1, .options =
  furrr_options(seed = TRUE))
## [1] 0.6562407 0.7985511 0.4760155
# Output omitted.

3.8 Miscellaneous

x <- 123
typeof(x)
## [1] "double"
?log
# Alternatively:
help(log)
v <- c(1, NA, 3)
v
## [1]  1 NA  3
is.na(v[2])
## [1] TRUE
v[2] == NA # wrong
## [1] NA
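
Because comparisons with NA yield NA, summary functions propagate it as well; na.rm = TRUE drops NA values first:

mean(v)               # NA propagates.
## [1] NA
mean(v, na.rm = TRUE) # Drop NA before computing.
## [1] 2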

4 Introduction to Statistics

# Environment setup for Google Colaboratory
if (file.exists("/content")) {
  options(Ncpus = parallel::detectCores())
  installed_packages <- rownames(installed.packages())
  packages_to_install <- c("exactci", "ggmosaic", "pastecs", "psych", "vcd")
  install.packages(setdiff(packages_to_install, installed_packages))
}

4.1 Descriptive Statistics

x <- c(165, 170, 175, 180, 185)
mean(x) # mean
## [1] 175
n <- length(x) # sample size
sum(x) / n
## [1] 175
y <- c(173, 174, 175, 176, 177)
mean(y)
## [1] 175
var(x) # variance of x
## [1] 62.5
var(y) # variance of y
## [1] 2.5
sum((x - mean(x))^2) / (n - 1)
## [1] 62.5
sd(x) # standard deviation of x
## [1] 7.905694
sd(y) # standard deviation of y
## [1] 1.581139
var(x)**0.5 # standard deviation of x
## [1] 7.905694
psych::describe(x)
##    vars n mean   sd median trimmed  mad min max range skew kurtosis   se
## X1    1 5  175 7.91    175     175 7.41 165 185    20    0    -1.91 3.54
# Alternatively:

pastecs::stat.desc(x)
##      nbr.val     nbr.null       nbr.na          min          max        range 
##    5.0000000    0.0000000    0.0000000  165.0000000  185.0000000   20.0000000 
##          sum       median         mean      SE.mean CI.mean.0.95          var 
##  875.0000000  175.0000000  175.0000000    3.5355339    9.8162158   62.5000000 
##      std.dev     coef.var 
##    7.9056942    0.0451754
quantile(x)
##   0%  25%  50%  75% 100% 
##  165  170  175  180  185
x <- c(165, 170, 175, 180, 185)

var(x)                # unbiased variance
## [1] 62.5
mean((x - mean(x))^2) # sample variance
## [1] 50
# Alternatively:
n <- length(x)
var(x) * (n - 1) / n  # sample variance
## [1] 50
sd(x)                     # sqrt of the unbiased variance
## [1] 7.905694
mean((x - mean(x))^2)^0.5 # sqrt of the sample variance
## [1] 7.071068
# Alternatively:
sd(x) * sqrt((n - 1) / n) # sqrt of the sample variance
## [1] 7.071068
sd(x) / length(x)**0.5 # standard error of the mean
## [1] 3.535534
library(tidyverse)

my_df <- data.frame(
  name    = c("A", "B", "C", "D"),
  english = c( 60,  90,  70,  90),
  math    = c( 70,  80,  90, 100),
  gender  = c("f", "m", "m", "f"))
var(my_df$english)
## [1] 225
# The result is a vector.
my_df[, c(2, 3)] %>% sapply(var)
##  english     math 
## 225.0000 166.6667
# The result is a list.
my_df[, c(2, 3)] %>% lapply(var)
## $english
## [1] 225
## 
## $math
## [1] 166.6667
# The result is a data frame.
my_df[, c(2, 3)] %>% # Take columns 2 and 3
  summarize(across(  # and summarize
    everything(),    # every column
    var))            # by its unbiased variance.
##   english     math
## 1     225 166.6667
# Alternatively:
my_df %>%              # Take the data frame
  summarize(across(    # and summarize
    where(is.numeric), # the numeric columns
    var))              # by their unbiased variance.
##   english     math
## 1     225 166.6667
# Alternatively:
my_df %>%              # Take the data frame
  summarize(across(    # and summarize
    where(is.numeric), # the numeric columns
    function(x) { var(x) })) # by their unbiased variance.
##   english     math
## 1     225 166.6667
psych::describe(my_df)
##         vars n mean    sd median trimmed   mad min max range  skew kurtosis
## name*      1 4  2.5  1.29    2.5     2.5  1.48   1   4     3  0.00    -2.08
## english    2 4 77.5 15.00   80.0    77.5 14.83  60  90    30 -0.14    -2.28
## math       3 4 85.0 12.91   85.0    85.0 14.83  70 100    30  0.00    -2.08
## gender*    4 4  1.5  0.58    1.5     1.5  0.74   1   2     1  0.00    -2.44
##           se
## name*   0.65
## english 7.50
## math    6.45
## gender* 0.29
# Alternatively:

pastecs::stat.desc(my_df)
##          name     english        math gender
## nbr.val    NA   4.0000000   4.0000000     NA
## nbr.null   NA   0.0000000   0.0000000     NA
## nbr.na     NA   0.0000000   0.0000000     NA
## min        NA  60.0000000  70.0000000     NA
## max        NA  90.0000000 100.0000000     NA
## range      NA  30.0000000  30.0000000     NA
## sum        NA 310.0000000 340.0000000     NA
## median     NA  80.0000000  85.0000000     NA
## mean       NA  77.5000000  85.0000000     NA
## SE.mean    NA   7.5000000   6.4549722     NA
## CI.mean    NA  23.8683473  20.5426026     NA
## var        NA 225.0000000 166.6666667     NA
## std.dev    NA  15.0000000  12.9099445     NA
## coef.var   NA   0.1935484   0.1518817     NA
# Remainder omitted.
table(my_df$gender)
## 
## f m 
## 2 2
my_df2 <- data.frame(
  gender = my_df$gender,
  excel = my_df$math >= 80)
table(my_df2)
##       excel
## gender FALSE TRUE
##      f     1    1
##      m     0    2
my_df %>% group_by(gender) %>%
  summarize(across(
    where(is.numeric), mean),
    .groups = "drop") # ungroup
## # A tibble: 2 × 3
##   gender english  math
##   <chr>    <dbl> <dbl>
## 1 f           75    85
## 2 m           80    85

4.2 Data Visualization

head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
hist(iris$Sepal.Length)

x <- c(10, 20, 30)
hist(x, breaks = 2) # two bins

x <- iris$Sepal.Length
tmp <- seq(min(x), max(x),
           length.out = 10)
hist(x, breaks = tmp, right = FALSE)

plot(iris$Sepal.Length,
     iris$Sepal.Width)

boxplot(iris[, -5])

library(tidyverse)
my_df <- psych::describe(iris[, -5])
my_df %>% select(mean, sd, se)
##              mean   sd   se
## Sepal.Length 5.84 0.83 0.07
## Sepal.Width  3.06 0.44 0.04
## Petal.Length 3.76 1.77 0.14
## Petal.Width  1.20 0.76 0.06
tmp <- rownames(my_df)
my_df %>% ggplot(aes(x = factor(tmp, levels = tmp), y = mean)) +
  geom_col() +
  geom_errorbar(aes(ymin = mean - se, ymax = mean + se)) +
  xlab(NULL)

my_group <- iris %>% group_by(Species)       # For each species,

my_df <- my_group %>%                        # compute each variable's mean and
  summarize(across(everything(), mean)) %>%
  pivot_longer(-Species)

tmp <- my_group %>%                          # standard error.
  summarize(across(everything(), ~ sd(.) / length(.)**0.5)) %>%
  pivot_longer(-Species)

my_df$se <- tmp$value
head(my_df)
## # A tibble: 6 × 4
##   Species    name         value     se
##   <fct>      <chr>        <dbl>  <dbl>
## 1 setosa     Sepal.Length 5.01  0.0498
## 2 setosa     Sepal.Width  3.43  0.0536
## 3 setosa     Petal.Length 1.46  0.0246
## 4 setosa     Petal.Width  0.246 0.0149
## 5 versicolor Sepal.Length 5.94  0.0730
## 6 versicolor Sepal.Width  2.77  0.0444
my_df %>%
  ggplot(aes(x = Species, y = value, fill = name)) +
  geom_col(position = "dodge") +
  geom_errorbar(aes(ymin = value - se, ymax = value + se), position = "dodge")

# mean of each variable
iris %>% pivot_longer(-Species) %>%
  ggplot(aes(x = name, y = value)) +
  geom_bar(stat = "summary", fun = mean) +
  stat_summary(geom = "errorbar", fun.data = mean_se) +
  xlab(NULL)

# mean of each variable (per species)
iris %>% pivot_longer(-Species) %>%
  ggplot(aes(x = Species, y = value, fill = name)) +
  geom_bar(stat = "summary", fun = mean, position = "dodge") +
  stat_summary(geom = "errorbar", fun.data = mean_se, position = "dodge")

my_df <- data.frame(
  Species = iris$Species,
  w_Sepal = iris$Sepal.Width > 3)
table(my_df) # contingency table
##             w_Sepal
## Species      FALSE TRUE
##   setosa         8   42
##   versicolor    42    8
##   virginica     33   17
mosaicplot(
  formula = ~ Species + w_Sepal,
  data = my_df)

library(vcd)
## Loading required package: grid
vcd::mosaic(formula = ~w_Sepal + Species, data = my_df,
            labeling = labeling_values)

curve(x^3 - x, -2, 2)

x <- iris$Sepal.Length
tmp <- seq(min(x), max(x),
           length.out = 10)
iris %>%
  ggplot(aes(x = Sepal.Length)) +
  geom_histogram(breaks = tmp,
                 closed = "left")

iris %>%
  ggplot(aes(x = Sepal.Length,
             y = Sepal.Width)) +
  geom_point()

iris %>%
  pivot_longer(-Species) %>%
  ggplot(aes(
    x = factor(name,
               levels = names(iris)),
    y = value)) +
  geom_boxplot() +
  xlab(NULL)

library(ggmosaic)
## 
## Attaching package: 'ggmosaic'
## The following objects are masked from 'package:vcd':
## 
##     mosaic, spine
my_df <- data.frame(
  Species = iris$Species,
  w_Sepal = iris$Sepal.Width > 3)
my_df %>%
  ggplot() +
  geom_mosaic(
    aes(x = product(w_Sepal, Species)))

f <- function(x) { x^3 - x }
data.frame(x = c(-2, 2)) %>%
  ggplot(aes(x = x)) +
  stat_function(fun = f)

4.3 Random Numbers

x <- sample(x = 1:6,        # values to draw from
            size = 10000,   # number of draws
            replace = TRUE) # with replacement
hist(x, breaks = 0:6) # histogram

x <- runif(min = 0,  # minimum
           max = 1,  # maximum
           n = 1000) # number of draws
hist(x)

x <- as.integer(      # convert to integers
  runif(min = 1,      # minimum
        max = 7,      # maximum + 1
        n = 1000))    # number of draws
hist(x, breaks = 0:6) # output omitted

n <- 100
p <- 0.5
r <- 10000
x <- rbinom(size = n, # number of trials
            prob = p, # probability of success
            n = r)    # number of draws
hist(x, breaks = max(x) - min(x))

r <- 10000
x <- rnorm(mean = 50, # mean
           sd = 5,    # standard deviation
           n = r)     # number of draws
hist(x, breaks = 40)
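
Results like these change on every run. To make random draws reproducible, fix the seed with set.seed; a minimal sketch:

set.seed(0); x1 <- runif(3)
set.seed(0); x2 <- runif(3)
identical(x1, x2) # the same seed yields the same draws
## [1] TRUE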

library(tidyverse)

f <- function(k) {
  n <- 10000
  tmp <- replicate(n = n, expr = g(rnorm(n =  k, sd = 3)))
  list(k = k,
       mean = mean(tmp),       # mean
       se = sd(tmp) / sqrt(n)) # standard error
}
g <- var
c(10, 20, 30) %>% map_dfr(f)
## # A tibble: 3 × 3
##       k  mean     se
##   <dbl> <dbl>  <dbl>
## 1    10  8.95 0.0420
## 2    20  9.00 0.0289
## 3    30  8.99 0.0239
g <- sd
c(5, 10, 15, 20) %>% map_dfr(f)
## # A tibble: 4 × 3
##       k  mean      se
##   <dbl> <dbl>   <dbl>
## 1     5  2.80 0.0101 
## 2    10  2.92 0.00704
## 3    15  2.95 0.00559
## 4    20  2.97 0.00488
g <- function(x) {
  n <- length(x)
  sd(x) *
    sqrt((n - 1) / 2) *
    gamma((n - 1) / 2) /
    gamma(n / 2)
}
c(10, 20, 30) %>% map_dfr(f)
## # A tibble: 3 × 3
##       k  mean      se
##   <dbl> <dbl>   <dbl>
## 1    10  3.00 0.00711
## 2    20  3.01 0.00497
## 3    30  3.01 0.00398

4.4 Statistical Inference

library(exactci)
## Loading required package: ssanv
## Loading required package: testthat
## 
## Attaching package: 'testthat'
## The following object is masked from 'package:dplyr':
## 
##     matches
## The following object is masked from 'package:purrr':
## 
##     is_null
## The following objects are masked from 'package:readr':
## 
##     edition_get, local_edition
## The following object is masked from 'package:tidyr':
## 
##     matches
library(tidyverse)

a <- 0.05                              # significance level
binom.exact(x = 2,                     # number of successes
            n = 15,                    # number of trials
            p = 4 / 10,                # probability of success (the hypothesis)
            plot = TRUE,               # plot the p-value (figure shown in the next subsection)
            conf.level = 1 - a,        # confidence level (default)
            tsmethod = "minlike",      # definition of the p-value
            alternative = "two.sided") # two-sided test (default)

## 
##  Exact two-sided binomial test (sum of minimum likelihood method)
## 
## data:  2 and 15
## number of successes = 2, number of trials = 15, p-value = 0.03646
## alternative hypothesis: true probability of success is not equal to 0.4
## 95 percent confidence interval:
##  0.0242 0.3967
## sample estimates:
## probability of success 
##              0.1333333
                                       # 'less' for a left-tailed test
                                       # 'greater' for a right-tailed test
t <- 4 / 10               # probability of success
n <- 15                   # number of trials
x <- 0:n                  # number of successes
my_pr  <- dbinom(x, n, t) # probability of exactly x successes
my_pr2 <- dbinom(2, n, t) # probability of exactly 2 successes

my_data <- data.frame(x = x,
                      probability = my_pr,
                      color = my_pr <= my_pr2) # probability at most that of exactly 2 successes

my_data %>% ggplot(aes(x = x, y = probability, color = color)) +
  geom_point(size = 3) +
  geom_linerange(aes(ymin = 0, ymax = probability)) +   # vertical lines
  geom_hline(yintercept = my_pr2) +                     # horizontal line
  theme(legend.position = "none")                       # Hide the legend.

# The result from the previous subsection (shown again)
# The code from the beginning of the previous subsection
X <- c(32.1, 26.2, 27.5, 31.8, 32.1, 31.2, 30.1, 32.4, 32.3, 29.9,
       29.6, 26.6, 31.2, 30.9, 29.3)
Y <- c(35.4, 34.6, 31.1, 32.4, 33.3, 34.7, 35.3, 34.3, 32.1, 28.3,
       33.3, 30.5, 32.6, 33.3, 32.2)

t.test(x = X, y = Y,
       conf.level = 0.95,         # confidence level (default)
       paired = TRUE,             # paired samples
       alternative = "two.sided") # two-sided test (default)
## 
##  Paired t-test
## 
## data:  X and Y
## t = -4.3694, df = 14, p-value = 0.0006416
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -3.995525 -1.364475
## sample estimates:
## mean of the differences 
##                   -2.68
                                  # 'less' for a left-tailed test
                                  # 'greater' for a right-tailed test
t.test(x = X, y = Y,
       paired = FALSE,   # not paired samples (default)
       var.equal = TRUE, # Assume equal variances; FALSE (default) if not assumed.
       alternative = "two.sided",
       conf.level = 0.95)
## 
##  Two Sample t-test
## 
## data:  X and Y
## t = -3.6821, df = 28, p-value = 0.0009785
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -4.170906 -1.189094
## sample estimates:
## mean of x mean of y 
##  30.21333  32.89333
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
                "/fromzero/master/data/smoker.csv")
my_data <- read_csv(my_url)
## Rows: 1469 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): alive, smoker
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(my_data)
## # A tibble: 6 × 2
##   alive smoker
##   <chr> <chr> 
## 1 Yes   No    
## 2 Yes   No    
## 3 Yes   No    
## 4 Yes   No    
## 5 Yes   No    
## 6 Yes   No
my_table <- table(my_data)
my_table
##      smoker
## alive  No Yes
##   No  117  54
##   Yes 950 348
chisq.test(my_table, correct = FALSE)
## 
##  Pearson's Chi-squared test
## 
## data:  my_table
## X-squared = 1.7285, df = 1, p-value = 0.1886
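
Note that correct = FALSE disables Yates' continuity correction, which chisq.test applies by default to 2-by-2 tables; with the default the statistic comes out slightly smaller (output not shown):

chisq.test(my_table) # correct = TRUE (the default) for a 2-by-2 table
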
X <- rep(0:1, c(13, 2)) # step 1
X
##  [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1
tmp <- sample(X, size = length(X), replace = TRUE) # step 2
tmp
##  [1] 1 1 0 0 0 0 0 0 0 0 1 1 0 0 0
sum(tmp) # step 3
## [1] 4
n <- 10^5
result <- replicate(n, sum(sample(X, size = length(X), replace = TRUE))) # step 4
hist(x = result, breaks = 0:15,
     right = FALSE)

quantile(result, c(0.025, 0.975))
##  2.5% 97.5% 
##     0     5

5 Preprocessing

# Environment setup for Google Colaboratory
if (file.exists("/content")) {
  options(Ncpus = parallel::detectCores())
  installed_packages <- rownames(installed.packages())
  packages_to_install <- c("caret")
  install.packages(setdiff(packages_to_install, installed_packages))
}

5.1 Reading Data

library(tidyverse)
system(str_c("wget https://raw.githubusercontent.com/taroyabuki",
             "/fromzero/master/data/exam.csv"))
my_df <- read_csv("exam.csv")
## Rows: 4 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): name, gender
## dbl (2): english, math
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Alternatively:
my_df <- read.csv("exam.csv",
  stringsAsFactors = FALSE)

my_df
##   name english math gender
## 1    A      60   70      f
## 2    B      90   80      m
## 3    C      70   90      m
## 4    D      90  100      f
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
                "/fromzero/master/data/exam.csv")
my_df <- read_csv(my_url)
## Rows: 4 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): name, gender
## dbl (2): english, math
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Alternatively:
my_df <- read.csv(my_url, stringsAsFactors = FALSE)
my_df2 <- read.csv(
  file = "exam.csv",
  stringsAsFactors = FALSE,
  row.names = 1)
my_df2
##   english math gender
## A      60   70      f
## B      90   80      m
## C      70   90      m
## D      90  100      f
my_df %>% write_csv("exam2.csv")
# Alternatively:
my_df %>% write.csv(
  file = "exam2.csv",
  row.names = FALSE)
my_df2 %>% write.csv("exam3.csv")
my_df <- read_csv(file = "exam.csv",
  locale = locale(encoding = "UTF-8"))
## Rows: 4 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): name, gender
## dbl (2): english, math
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Alternatively:
my_df <- read.csv(file = "exam.csv",
  stringsAsFactors = FALSE,
  fileEncoding = "UTF-8")
my_df %>% write_csv("exam2.csv")
# Alternatively:
my_df %>% write.csv(file = "exam2.csv", row.names = FALSE,
                    fileEncoding = "UTF-8")
my_url <- "https://taroyabuki.github.io/fromzero/exam.html"
my_tables <- xml2::read_html(my_url) %>% rvest::html_table()
my_tables
## [[1]]
## # A tibble: 5 × 5
##   X1    X2    X3      X4    X5    
##   <lgl> <chr> <chr>   <chr> <chr> 
## 1 NA    name  english math  gender
## 2 NA    A     60      70    f     
## 3 NA    B     90      80    m     
## 4 NA    C     70      90    m     
## 5 NA    D     90      100   f
tmp <- my_tables[[1]]
tmp
## # A tibble: 5 × 5
##   X1    X2    X3      X4    X5    
##   <lgl> <chr> <chr>   <chr> <chr> 
## 1 NA    name  english math  gender
## 2 NA    A     60      70    f     
## 3 NA    B     90      80    m     
## 4 NA    C     70      90    m     
## 5 NA    D     90      100   f
# Rename the columns using the data in the first row.
colnames(tmp) <- tmp[1, ]

# Remove the first row and the first column.
my_data <- tmp[-1, -1]
my_data
## # A tibble: 4 × 4
##   name  english math  gender
##   <chr> <chr>   <chr> <chr> 
## 1 A     60      70    f     
## 2 B     90      80    m     
## 3 C     70      90    m     
## 4 D     90      100   f
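
In the table scraped above, english and math are still character columns. readr's type_convert (readr is attached with the tidyverse) re-guesses the column types; a minimal sketch:

my_data <- my_data %>% type_convert() # english and math become numeric
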
library(jsonlite)
## 
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
## 
##     flatten
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
                "/fromzero/master/data/exam.json")
my_data <- fromJSON(my_url)
#my_data <- fromJSON("exam.json") # (when using a local file)
my_data
##   name english math gender
## 1    A      60   70      f
## 2    B      90   80      m
## 3    C      70   90      m
## 4    D      90  100      f
library(xml2)
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
                "/fromzero/master/data/exam.xml")
my_xml <- read_xml(my_url)      # read the XML data
#my_xml <- read_xml("exam.xml") # (when using a local file)
xml_ns(my_xml)                  # check the namespace (d1)
## d1 <-> https://www.example.net/ns/1.0
my_records <- xml_find_all(my_xml, ".//d1:record")
f <- function(record) {
  tmp <- xml_attrs(record)                    # Extract all the attributes,
  xml_children(record) %>% walk(function(e) {
    tmp[xml_name(e)] <<- xml_text(e)          # then add each child element's name and text.
  })
  tmp
}
my_data <- my_records %>% map_dfr(f)
my_data$english <- as.numeric(my_data$english)
my_data$math    <- as.numeric(my_data$math)
my_data
## # A tibble: 4 × 4
##   english  math gender name 
##     <dbl> <dbl> <chr>  <chr>
## 1      60    70 f      A    
## 2      90    80 m      B    
## 3      70    90 m      C    
## 4      90   100 f      D

5.2 Transforming Data

x1 <- c(1, 2, 3)

z1 <- scale(x1)
# Alternatively:
z1 <- (x1 - mean(x1)) / sd(x1)

z1
## [1] -1  0  1
c(mean(z1), sd(z1))
## [1] 0 1
z1 * sd(x1) + mean(x1)
## [1] 1 2 3
x2 <- c(1, 3, 5)
z2 <- (x2 - mean(x1)) / sd(x1)
c(mean(z2), sd(z2))
## [1] 1 2
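
The same train-then-apply standardization can be done with caret's preProcess, which stores the training mean and standard deviation and applies them to new data; a minimal sketch:

library(caret)
my_pp <- preProcess(data.frame(x = x1),        # learn mean and sd from x1
                    method = c("center", "scale"))
z2 <- predict(my_pp, data.frame(x = x2))$x     # standardize x2 with x1's statistics
c(mean(z2), sd(z2))
## [1] 1 2
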
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:future':
## 
##     cluster
## The following object is masked from 'package:purrr':
## 
##     lift
library(tidyverse)

my_df <- data.frame(
  id = c(1, 2, 3),
  class = as.factor(
    c("A", "B", "C")))

my_enc <- my_df %>%
  dummyVars(formula = ~ .)

my_enc %>% predict(my_df)
##   id class.A class.B class.C
## 1  1       1       0       0
## 2  2       0       1       0
## 3  3       0       0       1
my_df2 <- data.frame(
  id =    c( 4 ,  5 ,  6 ),
  class = c("B", "C", "B"))
my_enc %>% predict(my_df2)
##   id class.A class.B class.C
## 1  4       0       1       0
## 2  5       0       0       1
## 3  6       0       1       0
my_enc <- my_df %>%
  dummyVars(formula = ~ .,
            fullRank = TRUE)
my_enc %>% predict(my_df)
##   id class.B class.C
## 1  1       0       0
## 2  2       1       0
## 3  3       0       1
my_enc %>% predict(my_df2)
##   id class.B class.C
## 1  4       1       0
## 2  5       0       1
## 3  6       1       0

6 Machine Learning: Purpose, Data, and Methods

6.1 The Purpose of Machine Learning (in This Book)

6.2 Data for Machine Learning

iris
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1            5.1         3.5          1.4         0.2     setosa
## 2            4.9         3.0          1.4         0.2     setosa
## 3            4.7         3.2          1.3         0.2     setosa
## 4            4.6         3.1          1.5         0.2     setosa
## 5            5.0         3.6          1.4         0.2     setosa
## 6            5.4         3.9          1.7         0.4     setosa
## 7            4.6         3.4          1.4         0.3     setosa
## 8            5.0         3.4          1.5         0.2     setosa
## 9            4.4         2.9          1.4         0.2     setosa
## 10           4.9         3.1          1.5         0.1     setosa
## 11           5.4         3.7          1.5         0.2     setosa
## 12           4.8         3.4          1.6         0.2     setosa
## 13           4.8         3.0          1.4         0.1     setosa
## 14           4.3         3.0          1.1         0.1     setosa
## 15           5.8         4.0          1.2         0.2     setosa
## 16           5.7         4.4          1.5         0.4     setosa
## 17           5.4         3.9          1.3         0.4     setosa
## 18           5.1         3.5          1.4         0.3     setosa
## 19           5.7         3.8          1.7         0.3     setosa
## 20           5.1         3.8          1.5         0.3     setosa
## 21           5.4         3.4          1.7         0.2     setosa
## 22           5.1         3.7          1.5         0.4     setosa
## 23           4.6         3.6          1.0         0.2     setosa
## 24           5.1         3.3          1.7         0.5     setosa
## 25           4.8         3.4          1.9         0.2     setosa
## 26           5.0         3.0          1.6         0.2     setosa
## 27           5.0         3.4          1.6         0.4     setosa
## 28           5.2         3.5          1.5         0.2     setosa
## 29           5.2         3.4          1.4         0.2     setosa
## 30           4.7         3.2          1.6         0.2     setosa
## 31           4.8         3.1          1.6         0.2     setosa
## 32           5.4         3.4          1.5         0.4     setosa
## 33           5.2         4.1          1.5         0.1     setosa
## 34           5.5         4.2          1.4         0.2     setosa
## 35           4.9         3.1          1.5         0.2     setosa
## 36           5.0         3.2          1.2         0.2     setosa
## 37           5.5         3.5          1.3         0.2     setosa
## 38           4.9         3.6          1.4         0.1     setosa
## 39           4.4         3.0          1.3         0.2     setosa
## 40           5.1         3.4          1.5         0.2     setosa
## 41           5.0         3.5          1.3         0.3     setosa
## 42           4.5         2.3          1.3         0.3     setosa
## 43           4.4         3.2          1.3         0.2     setosa
## 44           5.0         3.5          1.6         0.6     setosa
## 45           5.1         3.8          1.9         0.4     setosa
## 46           4.8         3.0          1.4         0.3     setosa
## 47           5.1         3.8          1.6         0.2     setosa
## 48           4.6         3.2          1.4         0.2     setosa
## 49           5.3         3.7          1.5         0.2     setosa
## 50           5.0         3.3          1.4         0.2     setosa
## 51           7.0         3.2          4.7         1.4 versicolor
## 52           6.4         3.2          4.5         1.5 versicolor
## 53           6.9         3.1          4.9         1.5 versicolor
## 54           5.5         2.3          4.0         1.3 versicolor
## 55           6.5         2.8          4.6         1.5 versicolor
## 56           5.7         2.8          4.5         1.3 versicolor
## 57           6.3         3.3          4.7         1.6 versicolor
## 58           4.9         2.4          3.3         1.0 versicolor
## 59           6.6         2.9          4.6         1.3 versicolor
## 60           5.2         2.7          3.9         1.4 versicolor
## 61           5.0         2.0          3.5         1.0 versicolor
## 62           5.9         3.0          4.2         1.5 versicolor
## 63           6.0         2.2          4.0         1.0 versicolor
## 64           6.1         2.9          4.7         1.4 versicolor
## 65           5.6         2.9          3.6         1.3 versicolor
## 66           6.7         3.1          4.4         1.4 versicolor
## 67           5.6         3.0          4.5         1.5 versicolor
## 68           5.8         2.7          4.1         1.0 versicolor
## 69           6.2         2.2          4.5         1.5 versicolor
## 70           5.6         2.5          3.9         1.1 versicolor
## 71           5.9         3.2          4.8         1.8 versicolor
## 72           6.1         2.8          4.0         1.3 versicolor
## 73           6.3         2.5          4.9         1.5 versicolor
## 74           6.1         2.8          4.7         1.2 versicolor
## 75           6.4         2.9          4.3         1.3 versicolor
## 76           6.6         3.0          4.4         1.4 versicolor
## 77           6.8         2.8          4.8         1.4 versicolor
## 78           6.7         3.0          5.0         1.7 versicolor
## 79           6.0         2.9          4.5         1.5 versicolor
## 80           5.7         2.6          3.5         1.0 versicolor
## 81           5.5         2.4          3.8         1.1 versicolor
## 82           5.5         2.4          3.7         1.0 versicolor
## 83           5.8         2.7          3.9         1.2 versicolor
## 84           6.0         2.7          5.1         1.6 versicolor
## 85           5.4         3.0          4.5         1.5 versicolor
## 86           6.0         3.4          4.5         1.6 versicolor
## 87           6.7         3.1          4.7         1.5 versicolor
## 88           6.3         2.3          4.4         1.3 versicolor
## 89           5.6         3.0          4.1         1.3 versicolor
## 90           5.5         2.5          4.0         1.3 versicolor
## 91           5.5         2.6          4.4         1.2 versicolor
## 92           6.1         3.0          4.6         1.4 versicolor
## 93           5.8         2.6          4.0         1.2 versicolor
## 94           5.0         2.3          3.3         1.0 versicolor
## 95           5.6         2.7          4.2         1.3 versicolor
## 96           5.7         3.0          4.2         1.2 versicolor
## 97           5.7         2.9          4.2         1.3 versicolor
## 98           6.2         2.9          4.3         1.3 versicolor
## 99           5.1         2.5          3.0         1.1 versicolor
## 100          5.7         2.8          4.1         1.3 versicolor
## 101          6.3         3.3          6.0         2.5  virginica
## 102          5.8         2.7          5.1         1.9  virginica
## 103          7.1         3.0          5.9         2.1  virginica
## 104          6.3         2.9          5.6         1.8  virginica
## 105          6.5         3.0          5.8         2.2  virginica
## 106          7.6         3.0          6.6         2.1  virginica
## 107          4.9         2.5          4.5         1.7  virginica
## 108          7.3         2.9          6.3         1.8  virginica
## 109          6.7         2.5          5.8         1.8  virginica
## 110          7.2         3.6          6.1         2.5  virginica
## 111          6.5         3.2          5.1         2.0  virginica
## 112          6.4         2.7          5.3         1.9  virginica
## 113          6.8         3.0          5.5         2.1  virginica
## 114          5.7         2.5          5.0         2.0  virginica
## 115          5.8         2.8          5.1         2.4  virginica
## 116          6.4         3.2          5.3         2.3  virginica
## 117          6.5         3.0          5.5         1.8  virginica
## 118          7.7         3.8          6.7         2.2  virginica
## 119          7.7         2.6          6.9         2.3  virginica
## 120          6.0         2.2          5.0         1.5  virginica
## 121          6.9         3.2          5.7         2.3  virginica
## 122          5.6         2.8          4.9         2.0  virginica
## 123          7.7         2.8          6.7         2.0  virginica
## 124          6.3         2.7          4.9         1.8  virginica
## 125          6.7         3.3          5.7         2.1  virginica
## 126          7.2         3.2          6.0         1.8  virginica
## 127          6.2         2.8          4.8         1.8  virginica
## 128          6.1         3.0          4.9         1.8  virginica
## 129          6.4         2.8          5.6         2.1  virginica
## 130          7.2         3.0          5.8         1.6  virginica
## 131          7.4         2.8          6.1         1.9  virginica
## 132          7.9         3.8          6.4         2.0  virginica
## 133          6.4         2.8          5.6         2.2  virginica
## 134          6.3         2.8          5.1         1.5  virginica
## 135          6.1         2.6          5.6         1.4  virginica
## 136          7.7         3.0          6.1         2.3  virginica
## 137          6.3         3.4          5.6         2.4  virginica
## 138          6.4         3.1          5.5         1.8  virginica
## 139          6.0         3.0          4.8         1.8  virginica
## 140          6.9         3.1          5.4         2.1  virginica
## 141          6.7         3.1          5.6         2.4  virginica
## 142          6.9         3.1          5.1         2.3  virginica
## 143          5.8         2.7          5.1         1.9  virginica
## 144          6.8         3.2          5.9         2.3  virginica
## 145          6.7         3.3          5.7         2.5  virginica
## 146          6.7         3.0          5.2         2.3  virginica
## 147          6.3         2.5          5.0         1.9  virginica
## 148          6.5         3.0          5.2         2.0  virginica
## 149          6.2         3.4          5.4         2.3  virginica
## 150          5.9         3.0          5.1         1.8  virginica
# Remainder omitted.

6.3 Methods for Machine Learning

7 Regression 1 (Simple Regression)

# Environment setup for Google Colaboratory
if (file.exists("/content")) {
  options(Ncpus = parallel::detectCores())
  installed_packages <- rownames(installed.packages())
  packages_to_install <- c("caret", "doParallel", "pastecs")
  install.packages(setdiff(packages_to_install, installed_packages))
}

7.1 Car Stopping Distances

7.2 Examining the Data

library(caret)
library(tidyverse)
my_data <- cars
dim(my_data)
## [1] 50  2
head(my_data)
##   speed dist
## 1     4    2
## 2     4   10
## 3     7    4
## 4     7   22
## 5     8   16
## 6     9   10
options(digits = 3)
pastecs::stat.desc(my_data)
##                speed    dist
## nbr.val       50.000   50.00
## nbr.null       0.000    0.00
## nbr.na         0.000    0.00
## min            4.000    2.00
## max           25.000  120.00
## range         21.000  118.00
## sum          770.000 2149.00
## median        15.000   36.00
## mean          15.400   42.98
## SE.mean        0.748    3.64
## CI.mean.0.95   1.503    7.32
## var           27.959  664.06
## std.dev        5.288   25.77
## coef.var       0.343    0.60
my_data %>%
  ggplot(aes(x = speed, y = dist)) +
  geom_point()

7.3 Regression Analysis

library(tidyverse)

my_data <- cars
tmp <- data.frame(speed = 21.5, dist = 67)
my_data %>% ggplot(aes(x = speed, y = dist)) +
  coord_cartesian(xlim = c(4, 25), ylim = c(0, 120)) +
  geom_point() +
  stat_smooth(formula = y ~ x, method = "lm") +
  geom_pointrange(data = tmp, aes(ymin = -9, ymax = dist),  linetype = "dotted") +
  geom_pointrange(data = tmp, aes(xmin =  0, xmax = speed), linetype = "dotted")

library(caret)
library(tidyverse)
my_data <- cars
my_model <- train(form = dist ~ speed, # model formula (relation between output and input variables)
                  data = my_data,      # data
                  method = "lm")       # method
coef(my_model$finalModel)
## (Intercept)       speed 
##      -17.58        3.93
tmp <- data.frame(speed = 21.5)
my_model %>% predict(tmp)
##  1 
## 67
f <- function(x) { my_model %>% predict(data.frame(speed = x)) }
my_data %>%
  ggplot(aes(x = speed, y = dist,
             color = "data")) +
  geom_point() +
  stat_function(
    fun = f,
    mapping = aes(color = "model"))

7.4 Measures of Goodness of Fit

library(caret)
library(tidyverse)
my_data <- cars
my_model <- train(form = dist ~ speed, data = my_data, method = "lm")

y  <- my_data$dist
y_ <- my_model %>% predict(my_data)
my_data$y_ <- y_
my_data$residual <- y - y_
head(my_data)
##   speed dist    y_ residual
## 1     4    2 -1.85     3.85
## 2     4   10 -1.85    11.85
## 3     7    4  9.95    -5.95
## 4     7   22  9.95    12.05
## 5     8   16 13.88     2.12
## 6     9   10 17.81    -7.81
my_data %>%
  ggplot(aes(x = speed, y = dist)) +
  geom_point() +
  geom_line(aes(x = speed, y = y_)) +
  geom_linerange(mapping = aes(ymin = y_, ymax = dist), linetype = "dotted")

RMSE(y_, y)
## [1] 15.1
# Alternatively:
mean((my_data$residual^2))**0.5
## [1] 15.1
R2(pred = y_, obs = y,
   form = "traditional")
## [1] 0.651
R2(pred = y_, obs = y,
   form = "corr")
## [1] 0.651
# Alternatively:
summary(my_model$finalModel)$r.squared
## [1] 0.651
my_test <- my_data[1:3, ]
y  <- my_test$dist
y_ <- my_model %>% predict(my_test)

R2(pred = y_, obs = y,
   form = "traditional")
## [1] -4.5
R2(pred = y_, obs = y,
   form = "corr")
## [1] 0.0769
library(caret)
library(tidyverse)
my_data <- cars
my_idx <- c(2, 11, 27, 34, 39, 44)
my_sample <- my_data[my_idx, ]
options(warn = -1) # Suppress warnings from here on.
my_model <- train(form = dist ~ poly(speed, degree = 5, raw = TRUE),
                  data = my_sample,
                  method = "lm")
options(warn = 0)  # Show warnings from here on.

y  <- my_sample$dist
y_ <- my_model %>% predict(my_sample)
RMSE(y_, y)
## [1] 1.07e-10
R2(pred = y_, obs = y,
   form = "traditional")
## [1] 1
R2(pred = y_, obs = y,
   form = "corr")
## [1] 1
f <- function(x) { my_model %>% predict(data.frame(speed = x)) }

my_data %>%
  ggplot(aes(x = speed, y = dist, color = "data")) +
  geom_point() +
  geom_point(data = my_sample, mapping = aes(color = "sample")) +
  stat_function(fun = f, mapping = aes(color = "model")) +
  coord_cartesian(ylim = c(0, 120))

7.5 K-Nearest Neighbors

# Setup
library(caret)
library(tidyverse)
my_data <- cars

# Training
my_model <- train(form = dist ~ speed, data = my_data, method = "knn")

# Prepare for visualization
f <- function(x) { my_model %>% predict(data.frame(speed = x))}
my_data %>%
  ggplot(aes(x = speed,
             y = dist,
             color = "data")) +
  geom_point() +
  stat_function(
    fun = f,
    mapping = aes(color = "model"))

y  <- my_data$dist
y_ <- my_model %>% predict(my_data)

RMSE(y_, y)
## [1] 14
R2(pred = y_, obs = y,
   form = "traditional")
## [1] 0.7
R2(pred = y_, obs = y,
   form = "corr")
## [1] 0.702

7.6 Validation

library(caret)
library(tidyverse)
my_data <- cars
my_model <- train(form = dist ~ speed, data = my_data, method = "lm")

my_model$results
##   intercept RMSE Rsquared  MAE RMSESD RsquaredSD MAESD
## 1      TRUE 15.1    0.697 12.1   2.41      0.122  1.67
my_model <- train(form = dist ~ speed, data = my_data, method = "lm",
                  trControl = trainControl(method = "cv", number = 5))
my_model$results
##   intercept RMSE Rsquared  MAE RMSESD RsquaredSD MAESD
## 1      TRUE 15.6     0.68 12.1   2.39       0.18  1.59
my_model <- train(form = dist ~ speed, data = my_data, method = "lm",
                  trControl = trainControl(method = "LOOCV"))
my_model$results
##   intercept RMSE Rsquared  MAE
## 1      TRUE 15.7    0.622 12.1
library(doParallel)
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loading required package: iterators
## Loading required package: parallel
cl <- makeCluster(detectCores())
registerDoParallel(cl)
library(caret)
library(tidyverse)
my_data <- cars
my_model <- train(form = dist ~ speed, data = my_data, method = "lm")
y  <- my_data$dist
y_ <- my_model %>% predict(my_data)
# RMSE (training)
RMSE(y_, y)
## [1] 15.1
# coefficient of determination 1 (training)
R2(pred = y_, obs = y,
   form = "traditional")
## [1] 0.651
# coefficient of determination 6 (training)
R2(pred = y_, obs = y,
   form = "corr")
## [1] 0.651
postResample(pred = y_, obs = y)
##     RMSE Rsquared      MAE 
##   15.069    0.651   11.580
my_model <- train(form = dist ~ speed, data = my_data, method = "lm")
my_model$results
##   intercept RMSE Rsquared  MAE RMSESD RsquaredSD MAESD
## 1      TRUE 14.4    0.701 11.4   2.59     0.0923  1.78
# From left: RMSE (validation), coefficient of determination 6 (validation), MAE (validation)
my_model <- train(form = dist ~ speed, data = my_data, method = "lm",
                  trControl = trainControl(method = "LOOCV"))

# Method 1
my_model$results
##   intercept RMSE Rsquared  MAE
## 1      TRUE 15.7    0.622 12.1
# Method 2
y  <- my_model$pred$obs
y_ <- my_model$pred$pred
mean((y - y_)^2)**0.5
## [1] 15.7
mean(((y - y_)^2)**0.5)
## [1] 12.1
library(caret)
library(tidyverse)
my_data <- cars

my_lm_model <- train(form = dist ~ speed, data = my_data, method = "lm",
                     trControl = trainControl(method = "LOOCV"))

my_knn_model <- train(form = dist ~ speed, data = my_data, method = "knn",
                      tuneGrid = data.frame(k = 5),
                      trControl = trainControl(method = "LOOCV"))
my_lm_model$results$RMSE
## [1] 15.7
my_knn_model$results$RMSE
## [1] 15.8
y     <- my_data$dist
y_lm  <- my_lm_model$pred$pred
y_knn <- my_knn_model$pred$pred

my_df <- data.frame(
  lm  = (y - y_lm)^2,
  knn = (y - y_knn)^2)

head(my_df)
##       lm    knn
## 1  18.91 108.16
## 2 179.22   0.64
## 3  41.03 175.56
## 4 168.49  49.00
## 5   5.09   9.00
## 6  67.62 112.89
boxplot(my_df, ylab = "r^2")

t.test(x = my_df$lm, y = my_df$knn,
       conf.level = 0.95,
       paired = TRUE,
       alternative = "two.sided")
## 
##  Paired t-test
## 
## data:  my_df$lm and my_df$knn
## t = -0.1, df = 49, p-value = 0.9
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -53.5  47.0
## sample estimates:
## mean of the differences 
##                   -3.21

7.7 Parameter Tuning

library(caret)
library(tidyverse)
my_data <- cars
my_model <- train(form = dist ~ speed, data = my_data, method = "knn")
my_model$results
##   k RMSE Rsquared  MAE RMSESD RsquaredSD MAESD
## 1 5 16.5    0.618 13.0   3.37      0.140  2.29
## 2 7 16.9    0.601 13.0   3.45      0.130  2.59
## 3 9 17.2    0.579 13.2   3.52      0.128  2.54
my_params <- expand.grid(k = 1:15)

my_model <- train(form = dist ~ speed, data = my_data, method = "knn",
                  tuneGrid = my_params,
                  trControl = trainControl(method = "LOOCV"))
head(my_model$results)
##   k RMSE Rsquared  MAE
## 1 1 17.2    0.578 13.8
## 2 2 16.8    0.594 13.0
## 3 3 16.3    0.622 12.7
## 4 4 16.0    0.609 12.3
## 5 5 15.8    0.617 12.0
## 6 6 16.0    0.608 12.3
ggplot(my_model)

my_model$bestTune
##   k
## 5 5
my_model$results %>%
  filter(RMSE == min(RMSE))
##   k RMSE Rsquared MAE
## 1 5 15.8    0.617  12
y  <- my_data$dist
y_ <- my_model %>% predict(my_data)
RMSE(y_, y)
## [1] 14
library(caret)
library(tidyverse)
my_data <- cars

my_loocv <- function(k) {
  my_model <- train(form = dist ~ speed, data = my_data, method = "knn",
                    tuneGrid = data.frame(k = k),
                    trControl = trainControl(method = "LOOCV"))
  y  <- my_data$dist
  y_ <- my_model %>% predict(my_data)
  list(k = k,
       training = RMSE(y_, y),             # RMSE (training)
       validation = my_model$results$RMSE) # RMSE (validation)
}

my_results <- 1:15 %>% map_dfr(my_loocv)
my_results %>%
  pivot_longer(-k) %>%
  ggplot(aes(x = k, y = value,
             color = name)) +
  geom_line() + geom_point() +
  xlab("#Neighbors") + ylab("RMSE") +
  theme(legend.position = c(1, 0),
        legend.justification = c(1, 0))

# Stop the parallel cluster.
stopCluster(cl)
registerDoSEQ()

8 Regression 2 (Multiple Regression)

# Environment setup for Google Colaboratory
if (file.exists("/content")) {
  options(Ncpus = parallel::detectCores())
  installed_packages <- rownames(installed.packages())
  packages_to_install <- c("caret", "ggfortify", "glmnet", "glmnetUtils", "leaps", "neuralnet", "psych")
  install.packages(setdiff(packages_to_install, installed_packages))
}

8.1 Grape-Growing Conditions and Wine Prices

library(tidyverse)
my_url <- "http://www.liquidasset.com/winedata.html"
tmp <- read.table(file = my_url,    # source to read
                  header = TRUE,    # the first row holds the variable names
                  na.strings = ".", # string that represents missing values
                  skip = 62,        # number of lines to skip
                  nrows = 38)       # number of rows to read
psych::describe(tmp)
##         vars  n    mean     sd  median trimmed    mad     min    max  range
## OBS        1 38   19.50  11.11   19.50   19.50  14.08    1.00   38.0  37.00
## VINT       2 38 1970.50  11.11 1970.50 1970.50  14.08 1952.00 1989.0  37.00
## LPRICE2    3 27   -1.45   0.63   -1.51   -1.49   0.72   -2.29    0.0   2.29
## WRAIN      4 38  605.00 135.28  586.50  603.06 174.95  376.00  845.0 469.00
## DEGREES    5 37   16.52   0.66   16.53   16.55   0.67   14.98   17.6   2.67
## HRAIN      6 38  137.00  66.74  120.50  132.19  59.30   38.00  292.0 254.00
## TIME_SV    7 38   12.50  11.11   12.50   12.50  14.08   -6.00   31.0  37.00
##          skew kurtosis    se
## OBS      0.00    -1.30  1.80
## VINT     0.00    -1.30  1.80
## LPRICE2  0.41    -0.82  0.12
## WRAIN    0.15    -1.10 21.95
## DEGREES -0.34    -0.73  0.11
## HRAIN    0.69    -0.20 10.83
## TIME_SV  0.00    -1.30  1.80
my_data <- na.omit(tmp[, -c(1, 2)])
head(my_data)
##   LPRICE2 WRAIN DEGREES HRAIN TIME_SV
## 1  -0.999   600    17.1   160      31
## 2  -0.454   690    16.7    80      30
## 4  -0.808   502    17.1   130      28
## 6  -1.509   420    16.1   110      26
## 7  -1.717   582    16.4   187      25
## 8  -0.418   485    17.5   187      24
dim(my_data)
## [1] 27  5
my_data %>% write_csv("wine.csv")
#my_data <- read_csv("wine.csv") # when using the file created above
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
                "/fromzero/master/data/wine.csv")
my_data <- read_csv(my_url)
## Rows: 27 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (5): LPRICE2, WRAIN, DEGREES, HRAIN, TIME_SV
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

8.2 Multiple Regression Analysis

library(caret)
library(tidyverse)
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
                "/fromzero/master/data/wine.csv")
my_data <- read_csv(my_url)
## Rows: 27 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (5): LPRICE2, WRAIN, DEGREES, HRAIN, TIME_SV
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
my_model <- train(form = LPRICE2 ~ WRAIN + DEGREES + HRAIN + TIME_SV,
                  data = my_data,
                  method = "lm",
                  trControl = trainControl(method = "LOOCV"))
coef(my_model$finalModel) %>%
  as.data.frame
##                     .
## (Intercept) -12.14533
## WRAIN         0.00117
## DEGREES       0.61639
## HRAIN        -0.00386
## TIME_SV       0.02385
my_test <- data.frame(
  WRAIN = 500, DEGREES = 17,
  HRAIN = 120, TIME_SV = 2)
my_model %>% predict(my_test)
##    1 
## -1.5
y  <- my_data$LPRICE2
y_ <- my_model %>% predict(my_data)

RMSE(y_, y)
## [1] 0.259
R2(pred = y_, obs = y,
   form = "traditional")
## [1] 0.828
R2(pred = y_, obs = y,
   form = "corr")
## [1] 0.828
my_model$results
##   intercept  RMSE Rsquared   MAE
## 1      TRUE 0.323    0.736 0.277
M <- my_data[, -1] %>%
  mutate(b0 = 1) %>% as.matrix
b <- MASS::ginv(M) %*% y
matrix(b,
       dimnames = list(colnames(M)))
##              [,1]
## WRAIN     0.00117
## DEGREES   0.61639
## HRAIN    -0.00386
## TIME_SV   0.02385
## b0      -12.14533
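
As a cross-check, the pseudo-inverse solution coincides with solving the normal equations t(M) M b = t(M) y directly; the following should reproduce the coefficients above (a sketch, not from the book):

solve(t(M) %*% M, t(M) %*% y) # ordinary least squares via the normal equations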

8.3 Standardization

library(caret)
library(tidyverse)
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
                "/fromzero/master/data/wine.csv")
my_data <- read_csv(my_url)
## Rows: 27 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (5): LPRICE2, WRAIN, DEGREES, HRAIN, TIME_SV
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
my_data %>%
  mutate_if(is.numeric, scale) %>% # standardize the numeric columns
  pivot_longer(-LPRICE2) %>%
  ggplot(aes(x = name, y = value)) +
  geom_boxplot() +
  stat_summary(fun = mean, geom = "point", size = 3) +
  xlab(NULL)

my_model <- train(
  form = LPRICE2 ~ .,
  data = my_data,
  method = "lm",
  preProcess = c("center", "scale"))
coef(my_model$finalModel) %>%
  as.data.frame
##                  .
## (Intercept) -1.452
## WRAIN        0.151
## DEGREES      0.406
## HRAIN       -0.282
## TIME_SV      0.197
my_test <- data.frame(
  WRAIN = 500, DEGREES = 17,
  HRAIN = 120, TIME_SV = 2)
my_model %>% predict(my_test)
##    1 
## -1.5
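
Because only the inputs are standardized (the response is left alone), each standardized coefficient should be the raw coefficient from 8.2 multiplied by that input's standard deviation, and the prediction for my_test is unchanged. A sketch of the check (an illustration, not book code):

0.00117 * sd(my_data$WRAIN) # should be roughly 0.151, the standardized WRAIN coefficient
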

8.4 Number of Input Variables and Model Quality

library(caret)
library(tidyverse)
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
                "/fromzero/master/data/wine.csv")
my_data <- read_csv(my_url)
## Rows: 27 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (5): LPRICE2, WRAIN, DEGREES, HRAIN, TIME_SV
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
n <- nrow(my_data)
my_data2 <- my_data %>% mutate(v1 = 0:(n - 1) %% 2,
                               v2 = 0:(n - 1) %% 3)
head(my_data2)
## # A tibble: 6 × 7
##   LPRICE2 WRAIN DEGREES HRAIN TIME_SV    v1    v2
##     <dbl> <dbl>   <dbl> <dbl>   <dbl> <dbl> <dbl>
## 1  -0.999   600    17.1   160      31     0     0
## 2  -0.454   690    16.7    80      30     1     1
## 3  -0.808   502    17.2   130      28     0     2
## 4  -1.51    420    16.1   110      26     1     0
## 5  -1.72    582    16.4   187      25     0     1
## 6  -0.418   485    17.5   187      24     1     2
my_model2 <- train(form = LPRICE2 ~ ., data = my_data2, method = "lm",
                   trControl = trainControl(method = "LOOCV"))
y  <- my_data2$LPRICE2
y_ <- my_model2 %>% predict(my_data2)

RMSE(y_, y)
## [1] 0.256
my_model2$results$RMSE
## [1] 0.357
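
The junk inputs v1 and v2 improve the training RMSE slightly (0.256 vs. 0.259 in 8.2) while worsening the validation RMSE (0.357 vs. 0.323 in 8.2), which is the point of this section. To see both models side by side, one can refit without v1 and v2 (a sketch; my_model1 is an illustrative name):

my_model1 <- train(form = LPRICE2 ~ ., data = my_data, method = "lm",
                   trControl = trainControl(method = "LOOCV"))
my_model1$results$RMSE # LOOCV RMSE without v1 and v2 (about 0.32)
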

8.5 Variable Selection

library(caret)
library(tidyverse)
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
                "/fromzero/master/data/wine.csv")
my_data <- read_csv(my_url)
## Rows: 27 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (5): LPRICE2, WRAIN, DEGREES, HRAIN, TIME_SV
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
n <- nrow(my_data)
my_data2 <- my_data %>% mutate(v1 = 0:(n - 1) %% 2,
                               v2 = 0:(n - 1) %% 3)
my_model <- train(form = LPRICE2 ~ .,
                  data = my_data2,
                  method = "leapForward", # 変数増加法
                  trControl = trainControl(method = "LOOCV"),
                  tuneGrid = data.frame(nvmax = 1:6)) # 選択する変数の上限
summary(my_model$finalModel)$outmat
##          WRAIN DEGREES HRAIN TIME_SV v1  v2 
## 1  ( 1 ) " "   "*"     " "   " "     " " " "
## 2  ( 1 ) " "   "*"     "*"   " "     " " " "
## 3  ( 1 ) " "   "*"     "*"   "*"     " " " "
## 4  ( 1 ) "*"   "*"     "*"   "*"     " " " "
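
Forward selection never picks the junk inputs v1 and v2, as the output matrix shows. The subset size actually chosen by LOOCV is stored in the fitted object (a usage sketch; output not shown here):

my_model$bestTune # the nvmax selected by LOOCV
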

8.6 Supplement: Regularization

library(caret)
library(tidyverse)
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
                "/fromzero/master/data/wine.csv")
my_data <- read_csv(my_url)
## Rows: 27 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (5): LPRICE2, WRAIN, DEGREES, HRAIN, TIME_SV
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
A <- 2
B <- 0.1

my_model <- train(
  form = LPRICE2 ~ .,
  data = my_data,
  method = "glmnet",
  standardize = TRUE,
  tuneGrid = data.frame(
    lambda = A,
    alpha = B))
coef(my_model$finalModel, A)
## 5 x 1 sparse Matrix of class "dgCMatrix"
##                    s1
## (Intercept) -2.801552
## WRAIN        .       
## DEGREES      0.083291
## HRAIN       -0.000415
## TIME_SV      0.002310
my_test <- data.frame(
  WRAIN = 500, DEGREES = 17,
  HRAIN = 120, TIME_SV = 2)
my_model %>% predict(my_test)
## [1] -1.43
library(ggfortify)
library(glmnetUtils)

my_data2 <- my_data %>% scale %>%
  as.data.frame

B <- 0.1

glmnet(
  form = LPRICE2 ~ .,
  data = my_data2,
  alpha = B) %>%
  autoplot(xvar = "lambda") +
  xlab("log A ( = log lambda)") +
  theme(legend.position = c(0.15, 0.25))

As <- seq(0, 0.1, length.out = 21)
Bs <- seq(0, 0.1, length.out =  6)

my_model <- train(
  form = LPRICE2 ~ ., data = my_data, method = "glmnet", standardize = TRUE,
  trControl = trainControl(method = "LOOCV"),
  tuneGrid = expand.grid(lambda = As, alpha  = Bs))

my_model$bestTune
##   alpha lambda
## 8     0  0.035
tmp <- "B ( = alpha)"
ggplot(my_model) +
  theme(legend.position = c(0, 1), legend.justification = c(0, 1)) +
  xlab("A ( = lambda)") +
  guides(shape = guide_legend(tmp), color = guide_legend(tmp))

my_model$results %>%
  filter(RMSE == min(RMSE))
##   alpha lambda RMSE Rsquared   MAE
## 1     0  0.000 0.32    0.737 0.278
## 2     0  0.005 0.32    0.737 0.278
## 3     0  0.010 0.32    0.737 0.278
## 4     0  0.015 0.32    0.737 0.278
## 5     0  0.020 0.32    0.737 0.278
## 6     0  0.025 0.32    0.737 0.278
## 7     0  0.030 0.32    0.737 0.278
## 8     0  0.035 0.32    0.737 0.278
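
Prediction with the tuned model works as before; caret refits glmnet with the best lambda and alpha. A sketch, reusing my_test from above (output not shown here):

my_model %>% predict(my_test)
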

8.7 Neural Networks

curve(1 / (1 + exp(-x)), -6, 6)

library(caret)
library(tidyverse)
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
                "/fromzero/master/data/wine.csv")
my_data <- read_csv(my_url)
## Rows: 27 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (5): LPRICE2, WRAIN, DEGREES, HRAIN, TIME_SV
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
my_model <- train(form = LPRICE2 ~ .,
                  data = my_data,
                  method = "neuralnet",              # ニューラルネットワーク
                  preProcess = c("center", "scale"), # 標準化
                  trControl = trainControl(method = "LOOCV"))
plot(my_model$finalModel) # 訓練済ネットワークの描画
my_model$results
##   layer1 layer2 layer3  RMSE Rsquared   MAE
## 1      1      0      0 0.315    0.749 0.271
## 2      3      0      0 0.329    0.741 0.267
## 3      5      0      0 0.331    0.772 0.270
my_model <- train(
  form = LPRICE2 ~ .,
  data = my_data,
  method = "neuralnet",
  preProcess = c("center", "scale"),
  trControl = trainControl(method = "LOOCV"),
  tuneGrid = expand.grid(layer1 = 1:5,
                         layer2 = 0:2,
                         layer3 = 0))
my_model$results %>%
  filter(RMSE == min(RMSE))
##   layer1 layer2 layer3  RMSE Rsquared   MAE
## 1      3      0      0 0.298    0.784 0.225

9 Classification 1 (Multiclass Classification)

# Environment setup for Google Colaboratory
if (file.exists("/content")) {
  options(Ncpus = parallel::detectCores())
  installed_packages <- rownames(installed.packages())
  packages_to_install <- c("caret", "furrr", "psych", "randomForest", "rpart.plot", "xgboost")
  install.packages(setdiff(packages_to_install, installed_packages))
}

9.1 The Iris Data

my_data <- iris
head(my_data)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
psych::describe(my_data)
##              vars   n mean   sd median trimmed  mad min max range  skew
## Sepal.Length    1 150 5.84 0.83   5.80    5.81 1.04 4.3 7.9   3.6  0.31
## Sepal.Width     2 150 3.06 0.44   3.00    3.04 0.44 2.0 4.4   2.4  0.31
## Petal.Length    3 150 3.76 1.77   4.35    3.76 1.85 1.0 6.9   5.9 -0.27
## Petal.Width     4 150 1.20 0.76   1.30    1.18 1.04 0.1 2.5   2.4 -0.10
## Species*        5 150 2.00 0.82   2.00    2.00 1.48 1.0 3.0   2.0  0.00
##              kurtosis   se
## Sepal.Length    -0.61 0.07
## Sepal.Width      0.14 0.04
## Petal.Length    -1.42 0.14
## Petal.Width     -1.36 0.06
## Species*        -1.52 0.07

9.2 Classification with Decision Trees

library(caret)
library(tidyverse)
my_data <- iris
my_model <- train(form = Species ~ ., data = my_data, method = "rpart")
rpart.plot::rpart.plot(my_model$finalModel, extra = 1)

my_test <- tribble(
~Sepal.Length, ~Sepal.Width, ~Petal.Length, ~Petal.Width,
          5.0,          3.5,           1.5,          0.5,
          6.5,          3.0,           5.0,          2.0)

my_model %>% predict(my_test)
## [1] setosa    virginica
## Levels: setosa versicolor virginica
my_model %>% predict(my_test,
                     type = "prob")
##   setosa versicolor virginica
## 1      1     0.0000     0.000
## 2      0     0.0217     0.978
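
These probabilities are the class proportions in the leaf each observation falls into. In the usual iris tree, the leaf for Petal.Width >= 1.75 contains 45 virginica and 1 versicolor, so (a sketch of the arithmetic, not book code):

c(45 / 46, 1 / 46) # 0.978 and 0.0217, matching the output above
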

9.3 Accuracy

library(caret)
library(tidyverse)
my_data <- iris
my_model <- train(form = Species ~ ., data = my_data, method = "rpart2")
## note: only 2 possible values of the max tree depth from the initial fit.
##  Truncating the grid to 2 .
y  <- my_data$Species
y_ <- my_model %>% predict(my_data)
confusionMatrix(data = y_, reference = y)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         49         5
##   virginica       0          1        45
## 
## Overall Statistics
##                                         
##                Accuracy : 0.96          
##                  95% CI : (0.915, 0.985)
##     No Information Rate : 0.333         
##     P-Value [Acc > NIR] : <2e-16        
##                                         
##                   Kappa : 0.94          
##                                         
##  Mcnemar's Test P-Value : NA            
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                  1.000             0.980            0.900
## Specificity                  1.000             0.950            0.990
## Pos Pred Value               1.000             0.907            0.978
## Neg Pred Value               1.000             0.990            0.952
## Prevalence                   0.333             0.333            0.333
## Detection Rate               0.333             0.327            0.300
## Detection Prevalence         0.333             0.360            0.307
## Balanced Accuracy            1.000             0.965            0.945
# the rest is omitted
y  <- my_data$Species
y_ <- my_model %>% predict(my_data)
mean(y_ == y)
## [1] 0.96
my_model <- train(form = Species ~ ., data = my_data, method = "rpart2",
                  trControl = trainControl(method = "LOOCV"))
## note: only 2 possible values of the max tree depth from the initial fit.
##  Truncating the grid to 2 .
my_model$results
##   maxdepth Accuracy Kappa
## 1        1    0.333  0.00
## 2        2    0.953  0.93
my_model <- train(form = Species ~ ., data = my_data, method = "rpart2",
                  tuneGrid = data.frame(maxdepth = 1:10),
                  trControl = trainControl(method = "LOOCV"))
my_model$results %>% filter(Accuracy == max(Accuracy))
##   maxdepth Accuracy Kappa
## 1        2    0.953  0.93
# a function that, given hyperparameters, returns the accuracy (LOOCV)
my_loocv <- function(maxdepth, minbucket, minsplit) {
  my_model <- train(form = Species ~ ., data = my_data, method = "rpart2",
                    trControl = trainControl(method = "LOOCV"),
                    tuneGrid = data.frame(maxdepth = maxdepth),
                    control = rpart::rpart.control(cp = 0.01,
                                                   minbucket = minbucket,
                                                   minsplit = minsplit))
  list(maxdepth = maxdepth,
       minbucket = minbucket,
       minsplit = minsplit,
       Accuracy = my_model$results$Accuracy)
}
my_params <- expand.grid(
  maxdepth = 2:5,
  minbucket = 1:7,
  minsplit = c(2, 20))

library(furrr)
plan(multisession) # prepare for parallel processing
my_results <- my_params %>% future_pmap_dfr(my_loocv, # run
  .options = furrr_options(seed = TRUE))

my_results %>% filter(Accuracy == max(Accuracy)) # max validation accuracy
## # A tibble: 6 × 4
##   maxdepth minbucket minsplit Accuracy
##      <int>     <int>    <dbl>    <dbl>
## 1        3         5        2    0.973
## 2        4         5        2    0.973
## 3        5         5        2    0.973
## 4        3         5       20    0.973
## 5        4         5       20    0.973
## 6        5         5       20    0.973
my_model <- train(form = Species ~ ., data = my_data, method = "rpart2",
                  trControl = trainControl(method = "none"),
                  tuneGrid = data.frame(maxdepth = 3),
                  control = rpart::rpart.control(cp = 0.01,
                                                 minbucket = 5,
                                                 minsplit = 2))
rpart.plot::rpart.plot(
  my_model$finalModel, extra = 1)

9.4 Methods Using Multiple Trees

library(caret)
library(tidyverse)
my_data <- iris

my_model <- train(form = Species ~ ., data = my_data, method = "rf",
                  tuneGrid = data.frame(mtry = 2:4), # optional
                  trControl = trainControl(method = "LOOCV"))
my_model$results
##   mtry Accuracy Kappa
## 1    2     0.96  0.94
## 2    3     0.96  0.94
## 3    4     0.96  0.94
my_model <- train(
  form = Species ~ ., data = my_data, method = "xgbTree",
  tuneGrid = expand.grid(
    nrounds          = c(50, 100, 150),
    max_depth        = c(1, 2, 3),
    eta              = c(0.3, 0.4),
    gamma            = 0,
    colsample_bytree = c(0.6, 0.8),
    min_child_weight = 1,
    subsample        = c(0.5, 0.75, 1)),
  trControl = trainControl(method = "cv", number = 5)) # 5分割交差検証
my_model$results %>% filter(Accuracy == max(Accuracy)) %>% head(5) %>% t
##                        1      2       3       4        5
## eta               0.3000  0.300  0.4000  0.4000   0.3000
## max_depth         1.0000  2.000  2.0000  3.0000   2.0000
## gamma             0.0000  0.000  0.0000  0.0000   0.0000
## colsample_bytree  0.8000  0.600  0.8000  0.8000   0.6000
## min_child_weight  1.0000  1.000  1.0000  1.0000   1.0000
## subsample         0.5000  0.750  0.7500  0.5000   0.7500
## nrounds          50.0000 50.000 50.0000 50.0000 100.0000
## Accuracy          0.9533  0.953  0.9533  0.9533   0.9533
## Kappa             0.9300  0.930  0.9300  0.9300   0.9300
## AccuracySD        0.0183  0.038  0.0183  0.0183   0.0183
## KappaSD           0.0274  0.057  0.0274  0.0274   0.0274
my_model <- train(form = Species ~ ., data = my_data, method = "rf")
ggplot(varImp(my_model))

9.5 Learning from Data with Missing Values

library(caret)
library(tidyverse)
my_data <- iris

n <- nrow(my_data)
my_data$Petal.Length[0:(n - 1) %% 10 == 0] <- NA
my_data$Petal.Width[ 0:(n - 1) %% 10 == 1] <- NA

psych::describe(my_data) # variables with n = 135 have 150 - 135 = 15 missing values.
##              vars   n mean   sd median trimmed  mad min max range  skew
## Sepal.Length    1 150 5.84 0.83    5.8    5.81 1.04 4.3 7.9   3.6  0.31
## Sepal.Width     2 150 3.06 0.44    3.0    3.04 0.44 2.0 4.4   2.4  0.31
## Petal.Length    3 135 3.75 1.76    4.3    3.75 1.78 1.0 6.9   5.9 -0.27
## Petal.Width     4 135 1.20 0.77    1.3    1.18 1.04 0.1 2.5   2.4 -0.09
## Species*        5 150 2.00 0.82    2.0    2.00 1.48 1.0 3.0   2.0  0.00
##              kurtosis   se
## Sepal.Length    -0.61 0.07
## Sepal.Width      0.14 0.04
## Petal.Length    -1.41 0.15
## Petal.Width     -1.35 0.07
## Species*        -1.52 0.07
my_model <- train(
  form = Species ~ ., data = my_data, method = "rpart2",
  na.action = na.pass,         # don't stop training because of missing values.
  preProcess = "medianImpute", # impute missing values with the median.
  trControl = trainControl(method = "LOOCV"),
  tuneGrid = data.frame(maxdepth = 20),          # upper bound on tree depth
  control = rpart::rpart.control(minsplit = 2,   # condition for splitting
                                 minbucket = 1)) # condition for leaf nodes
max(my_model$results$Accuracy)
## [1] 0.927
my_model <- train(form = Species ~ ., data = my_data, method = "xgbTree",
                  na.action = na.pass,
                  trControl = trainControl(method = "cv", number = 5))
max(my_model$results$Accuracy)
## [1] 0.96

9.6 Other Classification Methods

library(caret)
library(tidyverse)
my_data <- iris

my_model <- train(form = Species ~ ., data = my_data, method = "knn",
                  trControl = trainControl(method = "LOOCV"))
my_model$results %>% filter(Accuracy == max(Accuracy))
##   k Accuracy Kappa
## 1 5    0.967  0.95
## 2 7    0.967  0.95
## 3 9    0.967  0.95
library(caret)
library(tidyverse)
my_data <- iris

my_model <- train(form = Species ~ ., data = my_data, method = "nnet",
                  preProcess = c("center", "scale"), # 標準化
                  trControl = trainControl(method = "LOOCV"),
                  trace = FALSE) # 途中経過を表示しない
my_model$results %>% filter(Accuracy == max(Accuracy))
##   size decay Accuracy Kappa
## 1    3   0.1    0.973  0.96
## 2    5   0.1    0.973  0.96

10 Classification 2 (Binary Classification)

# Environment setup for Google Colaboratory
if (file.exists("/content")) {
  options(Ncpus = parallel::detectCores())
  installed_packages <- rownames(installed.packages())
  packages_to_install <- c("caret", "PRROC", "rpart.plot")
  install.packages(setdiff(packages_to_install, installed_packages))
}

10.1 Performance Metrics for Binary Classification

y       <- c(  0,   1,   1,   0,   1,   0,    1,   0,   0,   1)
y_score <- c(0.7, 0.8, 0.3, 0.4, 0.9, 0.6, 0.99, 0.1, 0.2, 0.5)
y_ <- ifelse(0.5 <= y_score, 1, 0)
y_
##  [1] 1 1 0 0 1 1 1 0 0 1
library(caret)
confusionMatrix(data      = as.factor(y_), # predictions
                reference = as.factor(y),  # ground truth
                positive = "1",            # treat "1" as the positive class.
                mode = "everything")       # compute all the metrics.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction 0 1
##          0 3 1
##          1 2 4
##                                         
##                Accuracy : 0.7           
##                  95% CI : (0.348, 0.933)
##     No Information Rate : 0.5           
##     P-Value [Acc > NIR] : 0.172         
##                                         
##                   Kappa : 0.4           
##                                         
##  Mcnemar's Test P-Value : 1.000         
##                                         
##             Sensitivity : 0.800         
##             Specificity : 0.600         
##          Pos Pred Value : 0.667         
##          Neg Pred Value : 0.750         
##               Precision : 0.667         
##                  Recall : 0.800         
##                      F1 : 0.727         
##              Prevalence : 0.500         
##          Detection Rate : 0.400         
##    Detection Prevalence : 0.600         
##       Balanced Accuracy : 0.700         
##                                         
##        'Positive' Class : 1             
## 
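
All of these metrics follow from the four cells of the confusion matrix. A hand computation (a sketch, not book code, with "1" as the positive class as above):

TP <- sum(y == 1 & y_ == 1) # true positives (4)
FP <- sum(y == 0 & y_ == 1) # false positives (2)
FN <- sum(y == 1 & y_ == 0) # false negatives (1)
TN <- sum(y == 0 & y_ == 0) # true negatives (3)
c(Accuracy  = (TP + TN) / length(y),       # 0.7
  Precision = TP / (TP + FP),              # 0.667
  Recall    = TP / (TP + FN),              # 0.8
  F1        = 2 * TP / (2 * TP + FP + FN)) # 0.727
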

10.2 Trade-offs

library(PRROC)
library(tidyverse)

y       <- c(  0,   1,   1,   0,   1,   0,    1,   0,   0,   1)
y_score <- c(0.7, 0.8, 0.3, 0.4, 0.9, 0.6, 0.99, 0.1, 0.2, 0.5)
y_      <- ifelse(0.5 <= y_score, 1, 0)

c(sum((y == 0) & (y_ == 1)) / sum(y == 0), # FPR
  sum((y == 1) & (y_ == 1)) / sum(y == 1)) # TPR
## [1] 0.4 0.8
my_roc <- roc.curve(scores.class0 = y_score[y == 1], # scores of the true-1 cases
                    scores.class1 = y_score[y == 0], # scores of the true-0 cases
                    curve = TRUE)
my_roc %>% plot(xlab = "False Positive Rate",
                ylab = "True Positive Rate",
                legend = FALSE)

my_roc$auc
## [1] 0.8
c(sum((y == 1) & (y_ == 1)) / sum(y  == 1), # Recall == TPR
  sum((y == 1) & (y_ == 1)) / sum(y_ == 1)) # Precision
## [1] 0.800 0.667
my_pr <- pr.curve(scores.class0 = y_score[y == 1],
                  scores.class1 = y_score[y == 0],
                  curve = TRUE)
my_pr %>% plot(xlab = "Recall",
               ylab = "Precision",
               legend = FALSE)

my_pr$auc.integral
## [1] 0.847
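
Both curves are traced out by sweeping the decision threshold: each threshold yields one (FPR, TPR) or (Recall, Precision) point. A sketch of the sweep for three thresholds (not book code):

sapply(c(0.3, 0.5, 0.7), function(th) {
  y_ <- ifelse(th <= y_score, 1, 0) # classify with threshold th
  c(threshold = th,
    FPR = sum((y == 0) & (y_ == 1)) / sum(y == 0),
    TPR = sum((y == 1) & (y_ == 1)) / sum(y == 1))
})
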

10.3 Titanic

library(caret)
library(PRROC)
library(tidyverse)

my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
                "/fromzero/master/data/titanic.csv")
my_data <- read_csv(my_url)
## Rows: 2201 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Class, Sex, Age, Survived
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(my_data)
## # A tibble: 6 × 4
##   Class Sex   Age   Survived
##   <chr> <chr> <chr> <chr>   
## 1 1st   Male  Child Yes     
## 2 1st   Male  Child Yes     
## 3 1st   Male  Child Yes     
## 4 1st   Male  Child Yes     
## 5 1st   Male  Child Yes     
## 6 1st   Male  Adult No
my_model <- train(form = Survived ~ ., data = my_data, method = "rpart2",
                  tuneGrid = data.frame(maxdepth = 2),
                  trControl = trainControl(method = "LOOCV"))
rpart.plot::rpart.plot(my_model$finalModel, extra = 1)

my_model$results
##   maxdepth Accuracy Kappa
## 1        2    0.783  0.41
y <- my_data$Survived
tmp <- my_model %>% predict(newdata = my_data, type = "prob")
y_score <- tmp$Yes

my_roc <- roc.curve(scores.class0 = y_score[y == "Yes"],
                    scores.class1 = y_score[y == "No"],
                    curve = TRUE)
my_roc$auc
## [1] 0.711
my_roc %>% plot(xlab = "False Positive Rate",
                ylab = "True Positive Rate",
                legend = FALSE)

X <- my_data %>% select(Class) # qualitative input variable
y <- my_data$Survived          # output variable

options(warn = -1) # suppress warnings from here on.
my_model1 <- train(x = X, y = y, method = "rpart2",
                   tuneGrid = data.frame(maxdepth = 2),
                   trControl = trainControl(method = "LOOCV"))
options(warn = 0)  # show warnings again from here on.

rpart.plot::rpart.plot(my_model1$finalModel, extra = 1)

my_model1$results
##   maxdepth Accuracy Kappa
## 1        2    0.714 0.237
my_enc <- my_data %>% dummyVars(formula = Survived ~ Class)
my_data2 <- my_enc %>%
  predict(my_data) %>%
  as.data.frame %>%
  mutate(Survived = my_data$Survived)

my_model2 <- train(form = Survived ~ ., data = my_data2, method = "rpart2",
                   tuneGrid = data.frame(maxdepth = 2),
                   trControl = trainControl(method = "LOOCV"))
rpart.plot::rpart.plot(my_model2$finalModel, extra = 1)

my_model2$results
##   maxdepth Accuracy Kappa
## 1        2    0.714 0.237
my_model3 <- train(form = Survived ~ Class, data = my_data, method = "rpart2",
                   tuneGrid = data.frame(maxdepth = 2),
                   trControl = trainControl(method = "LOOCV"))
rpart.plot::rpart.plot(my_model3$finalModel, extra = 1)

my_model3$results
##   maxdepth Accuracy Kappa
## 1        2    0.692 0.267

10.4 Logistic Regression

curve(1 / (1 + exp(-x)), -6, 6)

library(caret)
library(PRROC)
library(tidyverse)

my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
                "/fromzero/master/data/titanic.csv")
my_data <- read_csv(my_url)
## Rows: 2201 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Class, Sex, Age, Survived
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
my_model <- train(form = Survived ~ ., data = my_data, method = "glm",
                  trControl = trainControl(method = "LOOCV"))
coef(my_model$finalModel) %>%
  as.data.frame
##                  .
## (Intercept)  2.044
## Class2nd    -1.018
## Class3rd    -1.778
## ClassCrew   -0.858
## SexMale     -2.420
## AgeChild     1.062
my_model$results
##   parameter Accuracy Kappa
## 1      none    0.778 0.445
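
The coefficients are on the log-odds scale; the logistic function turns the linear predictor into a survival probability. For the baseline case (1st class, female, adult) only the intercept remains (a sketch, not book code):

b <- coef(my_model$finalModel)
1 / (1 + exp(-b["(Intercept)"])) # about 0.89
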

11 Deep Learning and AutoML

# Environment setup for Google Colaboratory
if (file.exists("/content")) {
  options(Ncpus = parallel::detectCores())
  installed_packages <- rownames(installed.packages())
  packages_to_install <- c("h2o", "keras")
  install.packages(setdiff(packages_to_install, installed_packages))
}

11.1 Regression with Keras

library(keras)
## 
## Attaching package: 'keras'
## The following object is masked from 'package:future':
## 
##     %<-%
library(tidyverse)
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
                "/fromzero/master/data/wine.csv")
tmp <- read_csv(my_url)
## Rows: 27 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (5): LPRICE2, WRAIN, DEGREES, HRAIN, TIME_SV
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
my_data <- tmp[sample(nrow(tmp)), ]
X <- my_data %>%
  select(-LPRICE2) %>% scale
y <- my_data$LPRICE2
curve(activation_relu(x), -3, 3)
## Loaded Tensorflow version 2.7.0

my_model <- keras_model_sequential() %>%
  layer_dense(units = 3, activation = "relu", input_shape = c(4)) %>%
  layer_dense(units = 1)

summary(my_model) # overview of the network
## Model: "sequential"
## ________________________________________________________________________________
##  Layer (type)                       Output Shape                    Param #     
## ================================================================================
##  dense_1 (Dense)                    (None, 3)                       15          
##                                                                                 
##  dense (Dense)                      (None, 1)                       4           
##                                                                                 
## ================================================================================
## Total params: 19
## Trainable params: 19
## Non-trainable params: 0
## ________________________________________________________________________________
# omitted (see the Python results)
my_model %>% compile(
  loss = "mse",
  optimizer = "rmsprop")
my_cb <- callback_early_stopping(
  patience = 20,
  restore_best_weights = TRUE)
my_history <- my_model %>% fit(
    x = X,
    y = y,
    validation_split = 0.25,
    batch_size = 10,
    epochs = 500,
    callbacks = list(my_cb),
    verbose = 0)
plot(my_history)
## `geom_smooth()` using formula 'y ~ x'

my_history
## 
## Final epoch (plot to see history):
##     loss: 0.1457
## val_loss: 0.3786
y_ <- my_model %>% predict(X)
mean((y_ - y)^2)**0.5
## [1] 0.453

11.2 Classification with Keras

library(keras)
library(tidyverse)
my_data <- iris[sample(nrow(iris)), ]
X <- my_data %>%
  select(-Species) %>% scale
y <- as.integer(my_data$Species) - 1
my_model <- keras_model_sequential() %>%
  layer_dense(units = 3, activation = "relu", input_shape = c(4)) %>%
  layer_dense(units = 3, activation = "softmax")
my_model %>% compile(loss = "sparse_categorical_crossentropy",
                     optimizer = "rmsprop",
                     metrics = c("accuracy"))
my_cb <- callback_early_stopping(
  patience = 20,
  restore_best_weights = TRUE)

my_history <- my_model %>%
  fit(x = X,
    y = y,
    validation_split = 0.25,
    batch_size = 10,
    epochs = 500,
    callbacks = list(my_cb),
    verbose = 0)

plot(my_history)
## `geom_smooth()` using formula 'y ~ x'

my_history
## 
## Final epoch (plot to see history):
##         loss: 0.1077
##     accuracy: 0.9732
##     val_loss: 0.1418
## val_accuracy: 0.9474
tmp <- my_model %>% predict(X)
y_ <- apply(tmp, 1, which.max) - 1
mean(y_ == y)
## [1] 0.967
-mean(log(c(0.8, 0.7, 0.3, 0.8)))
## [1] 0.502
-mean(log(c(0.7, 0.6, 0.2, 0.7)))
## [1] 0.708
y <- c(2, 1, 0, 1)
y_1 <- list(c(0.1, 0.1, 0.8),
            c(0.1, 0.7, 0.2),
            c(0.3, 0.4, 0.3),
            c(0.1, 0.8, 0.1))
y_2 <- list(c(0.1, 0.2, 0.7),
            c(0.2, 0.6, 0.2),
            c(0.2, 0.5, 0.3),
            c(0.2, 0.7, 0.1))
c(mean(as.array(loss_sparse_categorical_crossentropy(y_true = y, y_pred = y_1))),
  mean(as.array(loss_sparse_categorical_crossentropy(y_true = y, y_pred = y_2))))
## [1] 0.502 0.708
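
loss_sparse_categorical_crossentropy picks, for each example, the predicted probability of the true class and averages the negative logs; the two hand computations above do exactly that. A sketch (not book code) making the selection explicit:

p1 <- mapply(function(p, k) p[k + 1], y_1, y) # probability assigned to each true class
-mean(log(p1)) # 0.502, as above
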

11.3 MNIST: Classifying Handwritten Digits

library(keras)
library(tidyverse)
keras::`%<-%`(c(c(x_train, y_train), c(x_test, y_test)), dataset_mnist())
dim(x_train)
## [1] 60000    28    28
x_train[5, , ]
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
##  [1,]    0    0    0    0    0    0    0    0    0     0     0     0     0
##  [2,]    0    0    0    0    0    0    0    0    0     0     0     0     0
##  [3,]    0    0    0    0    0    0    0    0    0     0     0     0     0
##  [4,]    0    0    0    0    0    0    0    0    0     0     0     0     0
##  [5,]    0    0    0    0    0    0    0    0    0     0     0     0     0
##  [6,]    0    0    0    0    0    0    0    0    0     0     0     0     0
##  [7,]    0    0    0    0    0    0    0    0    0     0     0     0     0
##  [8,]    0    0    0    0    0    0    0    0    0     0     0     0    55
##  [9,]    0    0    0    0    0    0    0    0    0     0     0    87   232
## [10,]    0    0    0    0    0    0    0    0    0     4    57   242   252
## [11,]    0    0    0    0    0    0    0    0    0    96   252   252   183
## [12,]    0    0    0    0    0    0    0    0  132   253   252   146    14
## [13,]    0    0    0    0    0    0    0  126  253   247   176     9     0
## [14,]    0    0    0    0    0    0   16  232  252   176     0     0     0
## [15,]    0    0    0    0    0    0   22  252  252    30    22   119   197
## [16,]    0    0    0    0    0    0   16  231  252   253   252   252   252
## [17,]    0    0    0    0    0    0    0   55  235   253   217   138    42
## [18,]    0    0    0    0    0    0    0    0    0     0     0     0     0
## [19,]    0    0    0    0    0    0    0    0    0     0     0     0     0
## [20,]    0    0    0    0    0    0    0    0    0     0     0     0     0
## [21,]    0    0    0    0    0    0    0    0    0     0     0     0     0
## [22,]    0    0    0    0    0    0    0    0    0     0     0     0     0
## [23,]    0    0    0    0    0    0    0    0    0     0     0     0     0
## [24,]    0    0    0    0    0    0    0    0    0     0     0     0     0
## [25,]    0    0    0    0    0    0    0    0    0     0     0     0     0
## [26,]    0    0    0    0    0    0    0    0    0     0     0     0     0
## [27,]    0    0    0    0    0    0    0    0    0     0     0     0     0
## [28,]    0    0    0    0    0    0    0    0    0     0     0     0     0
##       [,14] [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24] [,25]
##  [1,]     0     0     0     0     0     0     0     0     0     0     0     0
##  [2,]     0     0     0     0     0     0     0     0     0     0     0     0
##  [3,]     0     0     0     0     0     0     0     0     0     0     0     0
##  [4,]     0     0     0     0     0     0     0     0     0     0     0     0
##  [5,]     0     0     0     0     0     0     0     0     0     0     0     0
##  [6,]     0     0     0     0     0     0     0     0     0     0     0     0
##  [7,]     0     0     0     0     0     0     0     0     0     0     0     0
##  [8,]   148   210   253   253   113    87   148    55     0     0     0     0
##  [9,]   252   253   189   210   252   252   253   168     0     0     0     0
## [10,]   190    65     5    12   182   252   253   116     0     0     0     0
## [11,]    14     0     0    92   252   252   225    21     0     0     0     0
## [12,]     0     0     0   215   252   252    79     0     0     0     0     0
## [13,]     0     8    78   245   253   129     0     0     0     0     0     0
## [14,]    36   201   252   252   169    11     0     0     0     0     0     0
## [15,]   241   253   252   251    77     0     0     0     0     0     0     0
## [16,]   226   227   252   231     0     0     0     0     0     0     0     0
## [17,]    24   192   252   143     0     0     0     0     0     0     0     0
## [18,]    62   255   253   109     0     0     0     0     0     0     0     0
## [19,]    71   253   252    21     0     0     0     0     0     0     0     0
## [20,]     0   253   252    21     0     0     0     0     0     0     0     0
## [21,]    71   253   252    21     0     0     0     0     0     0     0     0
## [22,]   106   253   252    21     0     0     0     0     0     0     0     0
## [23,]    45   255   253    21     0     0     0     0     0     0     0     0
## [24,]     0   218   252    56     0     0     0     0     0     0     0     0
## [25,]     0    96   252   189    42     0     0     0     0     0     0     0
## [26,]     0    14   184   252   170    11     0     0     0     0     0     0
## [27,]     0     0    14   147   252    42     0     0     0     0     0     0
## [28,]     0     0     0     0     0     0     0     0     0     0     0     0
##       [,26] [,27] [,28]
##  [1,]     0     0     0
##  [2,]     0     0     0
##  [3,]     0     0     0
##  [4,]     0     0     0
##  [5,]     0     0     0
##  [6,]     0     0     0
##  [7,]     0     0     0
##  [8,]     0     0     0
##  [9,]     0     0     0
## [10,]     0     0     0
## [11,]     0     0     0
## [12,]     0     0     0
## [13,]     0     0     0
## [14,]     0     0     0
## [15,]     0     0     0
## [16,]     0     0     0
## [17,]     0     0     0
## [18,]     0     0     0
## [19,]     0     0     0
## [20,]     0     0     0
## [21,]     0     0     0
## [22,]     0     0     0
## [23,]     0     0     0
## [24,]     0     0     0
## [25,]     0     0     0
## [26,]     0     0     0
## [27,]     0     0     0
## [28,]     0     0     0
plot(as.raster(x = x_train[5, , ],
               max = max(x_train)))

head(y_train)
## [1] 5 0 4 1 9 2
c(min(x_train), max(x_train))
## [1]   0 255
x_train <- x_train / 255
x_test  <- x_test  / 255
my_index <- sample(1:60000, 6000)
x_train <- x_train[my_index, , ]
y_train <- y_train[my_index]
my_model <- keras_model_sequential() %>%
  layer_flatten(input_shape = c(28, 28)) %>%
  layer_dense(units = 256, activation = "relu") %>%
  layer_dense(units = 10, activation = "softmax")
summary(my_model)
## Model: "sequential_2"
## ________________________________________________________________________________
##  Layer (type)                       Output Shape                    Param #     
## ================================================================================
##  flatten (Flatten)                  (None, 784)                     0           
##                                                                                 
##  dense_5 (Dense)                    (None, 256)                     200960      
##                                                                                 
##  dense_4 (Dense)                    (None, 10)                      2570        
##                                                                                 
## ================================================================================
## Total params: 203,530
## Trainable params: 203,530
## Non-trainable params: 0
## ________________________________________________________________________________
# omitted (see the Python results)

my_model %>% compile(loss = "sparse_categorical_crossentropy",
                     optimizer = "rmsprop",
                     metrics = c("accuracy"))

my_cb <- callback_early_stopping(patience = 5,
                                 restore_best_weights = TRUE)
my_history <- my_model %>%
  fit(x = x_train,
      y = y_train,
      validation_split = 0.2,
      batch_size = 128,
      epochs = 20,
      callbacks = list(my_cb),
      verbose = 0)

plot(my_history)
## `geom_smooth()` using formula 'y ~ x'

tmp <- my_model %>% predict(x_test)
y_ <- apply(tmp, 1, which.max) - 1
table(y_, y_test)
##    y_test
## y_     0    1    2    3    4    5    6    7    8    9
##   0  964    0   12    4    2    9   11    3   12    9
##   1    0 1124    5    2    1    2    3   10    9    9
##   2    1    2  958   17    1    2    4   21    5    2
##   3    1    2   14  938    0   25    0    8   24   11
##   4    0    0   12    1  944    8   11    9   10   39
##   5    3    1    1   14    0  807   10    1   21    4
##   6    9    3    6    1    8   13  916    0    6    1
##   7    1    1   10   11    2    1    0  946    4    5
##   8    1    2   13   12    3   14    3    3  866    4
##   9    0    0    1   10   21   11    0   27   17  925
mean(y_ == y_test)
## [1] 0.939
my_model %>%
  evaluate(x = x_test, y = y_test)
##     loss accuracy 
##    0.208    0.939
x_train2d <- x_train %>% array_reshape(c(-1, 28, 28, 1))
x_test2d  <- x_test  %>% array_reshape(c(-1, 28, 28, 1))
my_model <- keras_model_sequential() %>%
  layer_conv_2d(filters = 32, kernel_size = 3,  # convolutional layer
                activation = "relu",
                input_shape = c(28, 28, 1)) %>%
  layer_max_pooling_2d(pool_size = 2) %>%       # pooling layer
  layer_flatten() %>%
  layer_dense(units = 128, activation = "relu") %>%
  layer_dense(units = 10, activation = "softmax")

summary(my_model)
## Model: "sequential_3"
## ________________________________________________________________________________
##  Layer (type)                       Output Shape                    Param #     
## ================================================================================
##  conv2d (Conv2D)                    (None, 26, 26, 32)              320         
##                                                                                 
##  max_pooling2d (MaxPooling2D)       (None, 13, 13, 32)              0           
##                                                                                 
##  flatten_1 (Flatten)                (None, 5408)                    0           
##                                                                                 
##  dense_7 (Dense)                    (None, 128)                     692352      
##                                                                                 
##  dense_6 (Dense)                    (None, 10)                      1290        
##                                                                                 
## ================================================================================
## Total params: 693,962
## Trainable params: 693,962
## Non-trainable params: 0
## ________________________________________________________________________________
# omitted (see the Python results)

my_model %>% compile(loss = "sparse_categorical_crossentropy",
                     optimizer = "rmsprop",
                     metrics = c("accuracy"))

my_cb <- callback_early_stopping(patience = 5,
                                 restore_best_weights = TRUE)
my_history <- my_model %>%
  fit(x = x_train2d,
      y = y_train,
      validation_split = 0.2,
      batch_size = 128,
      epochs = 20,
      callbacks = list(my_cb),
      verbose = 0)

plot(my_history)
## `geom_smooth()` using formula 'y ~ x'

my_model %>%
  evaluate(x = x_test2d, y = y_test)
##     loss accuracy 
##    0.122    0.965
my_model <- keras_model_sequential() %>%
  layer_conv_2d(filters = 20, kernel_size = 5, activation = "relu",
                input_shape = c(28, 28, 1)) %>%
  layer_max_pooling_2d(pool_size = 2, strides = 2) %>%
  layer_conv_2d(filters = 50, kernel_size = 5, activation = "relu") %>%
  layer_max_pooling_2d(pool_size = 2, strides = 2) %>%
  layer_dropout(rate = 0.25) %>%
  layer_flatten() %>%
  layer_dense(units = 500, activation = "relu") %>%
  layer_dropout(rate = 0.5) %>%
  layer_dense(units = 10, activation = "softmax")

my_model %>% compile(
  loss = "sparse_categorical_crossentropy",
  optimizer = "rmsprop",
  metrics = c("accuracy"))

my_cb <- callback_early_stopping(patience = 5,
                                 restore_best_weights = TRUE)
my_history <- my_model %>%
  fit(x = x_train2d,
      y = y_train,
      validation_split = 0.2,
      batch_size = 128,
      epochs = 20,
      callbacks = list(my_cb),
      verbose = 0)

plot(my_history)
## `geom_smooth()` using formula 'y ~ x'

my_model %>%
  evaluate(x = x_test2d, y = y_test)
##     loss accuracy 
##   0.0633   0.9825
y_prob <- my_model %>% predict(x_test2d) # probability of each class

my_result <- data.frame(
  y_prob = apply(y_prob, 1, max),           # largest probability
  y_     = apply(y_prob, 1, which.max) - 1, # predicted class
  y      = y_test,                          # ground truth
  id     = seq_len(length(y_test))) %>%     # index
  filter(y_ != y) %>%                       # keep only the misclassified cases
  arrange(desc(y_prob))                     # sort by probability, descending
head(my_result)
##   y_prob y_ y   id
## 1  1.000  5 3  450
## 2  1.000  0 9 6506
## 3  1.000  4 6 3521
## 4  1.000  7 3 1682
## 5  1.000  6 5 9730
## 6  0.999  0 5 9771
tmp <- my_result[1:5, ]$id
my_labels <- sprintf("%s (%s)",
  my_result[1:5, ]$y, tmp)
my_fig <- expand.grid(
  label = my_labels,
  y = 28:1,
  x = 1:28)
my_fig$z <- as.vector(
  x_test[tmp, , ])

my_fig %>% ggplot(
  aes(x = x, y = y, fill = z)) +
  geom_raster() +
  coord_fixed() +
  theme_void() +
  theme(legend.position = "none") +
  facet_grid(. ~ label)

11.4 AutoML

library(h2o)
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## The following objects are masked from 'package:base':
## 
##     &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
library(keras)
library(tidyverse)

h2o.init()
## 
## H2O is not running yet, starting it now...
## 
## Note:  In case of errors look at the following log files:
##     /tmp/RtmpUk6Jta/file414622b8b2/h2o_rstudio_started_from_r.out
##     /tmp/RtmpUk6Jta/file414ef06fbf/h2o_rstudio_started_from_r.err
## 
## 
## Starting H2O JVM and connecting: .. Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         1 seconds 332 milliseconds 
##     H2O cluster timezone:       Etc/UTC 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.34.0.3 
##     H2O cluster version age:    2 years, 5 months and 16 days !!! 
##     H2O cluster name:           H2O_started_from_R_rstudio_rir268 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   15.69 GB 
##     H2O cluster total cores:    32 
##     H2O cluster allowed cores:  32 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Amazon S3, XGBoost, Algos, AutoML, Core V3, TargetEncoder, Core V4 
##     R Version:                  R version 4.1.2 (2021-11-01)
## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is too old (2 years, 5 months and 16 days)!
## Please download and install the latest version from http://h2o.ai/download/
h2o.no_progress()
# h2o.shutdown(prompt = FALSE) # shut down
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
                "/fromzero/master/data/wine.csv")
my_data <- read_csv(my_url)
## Rows: 27 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (5): LPRICE2, WRAIN, DEGREES, HRAIN, TIME_SV
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
my_frame <- as.h2o(my_data) # convert an ordinary data frame to an H2OFrame.
# alternatively
my_frame <- h2o.importFile(my_url, header = TRUE) # read the data directly.
my_frame
##   LPRICE2 WRAIN DEGREES HRAIN TIME_SV
## 1  -0.999   600    17.1   160      31
## 2  -0.454   690    16.7    80      30
## 3  -0.808   502    17.1   130      28
## 4  -1.509   420    16.1   110      26
## 5  -1.717   582    16.4   187      25
## 6  -0.418   485    17.5   187      24
## 
## [27 rows x 5 columns]
# convert back to an ordinary data frame.
my_frame %>% as.data.frame %>% head
##   LPRICE2 WRAIN DEGREES HRAIN TIME_SV
## 1  -0.999   600    17.1   160      31
## 2  -0.454   690    16.7    80      30
## 3  -0.808   502    17.1   130      28
## 4  -1.509   420    16.1   110      26
## 5  -1.717   582    16.4   187      25
## 6  -0.418   485    17.5   187      24
# output omitted (it looks the same)
my_model <- h2o.automl(
    y = "LPRICE2",
    training_frame = my_frame,
    max_runtime_secs = 60)
## 
## 15:30:42.217: Skipping training of model GBM_1_AutoML_1_20240323_153039 due to exception: water.exceptions.H2OModelBuilderIllegalArgumentException: Illegal argument(s) for GBM model: GBM_1_AutoML_1_20240323_153039.  Details: ERRR on field: _min_rows: The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 27.0.
min(my_model@leaderboard$rmse)
## [1] 0.289
tmp <- my_model %>%
  predict(my_frame) %>%
  as.data.frame
y_ <- tmp$predict
y  <- my_data$LPRICE2

plot(y, y_)

keras::`%<-%`(c(c(x_train, y_train), c(x_test, y_test)), dataset_mnist())
my_index <- sample(1:60000, 6000)
x_train <- x_train[my_index, , ]
y_train <- y_train[my_index]
tmp <- x_train %>%
  array_reshape(c(-1, 28 * 28)) %>%
  as.data.frame
tmp$y <- as.factor(y_train)
my_train <- as.h2o(tmp)

tmp <- x_test %>%
  array_reshape(c(-1, 28 * 28)) %>%
  as.data.frame
my_test <- as.h2o(tmp)
my_model <- h2o.automl(
    y = "y",
    training_frame = my_train,
    max_runtime_secs = 120)
min(my_model@leaderboard$
    mean_per_class_error)
## [1] 0.076
tmp <- my_model %>%
  predict(my_test) %>% as.data.frame
y_ <- tmp$predict

mean(y_ == y_test)
## [1] 0.932

12 Time Series Forecasting

# Environment setup for Google Colaboratory
if (file.exists("/content")) {
  options(Ncpus = parallel::detectCores())
  installed_packages <- rownames(installed.packages())
  packages_to_install <- c("caret", "fable", "feasts", "prophet", "tsibble", "urca")
  packages_to_upgrade <- c("ggplot2")
  install.packages(union(setdiff(packages_to_install, installed_packages), packages_to_upgrade))
}

12.1 Date-Times and Sequences of Date-Times

as.POSIXct("2021-01-01")
## [1] "2021-01-01 UTC"
library(tsibble)
## 
## Attaching package: 'tsibble'
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, union
seq(from = 2021, to = 2023, by = 1)
## [1] 2021 2022 2023
seq(from = yearmonth("202101"), to = yearmonth("202103"), by = 2)
## Warning: `yearmonth()` may yield unexpected results.
## ℹ Please use arg `format` to supply formats.
## `yearmonth()` may yield unexpected results.
## ℹ Please use arg `format` to supply formats.
## <yearmonth[2]>
## [1] "2021 Jan" "2021 Mar"
seq(from = as.POSIXct("2021-01-01"), to = as.POSIXct("2021-01-03"), by = "1 day")
## [1] "2021-01-01 UTC" "2021-01-02 UTC" "2021-01-03 UTC"
seq(from = as.POSIXct("2021-01-01 00:00:00"),
    to   = as.POSIXct("2021-01-01 03:00:00"), by = "2 hour")
## [1] "2021-01-01 00:00:00 UTC" "2021-01-01 02:00:00 UTC"

12.2 Forecasting Time Series Data

my_data <- as.vector(AirPassengers)
n <- length(my_data) # number of observations (144)
k <- 108             # number of training observations
library(tidyverse)
library(tsibble)

my_ds <- seq(
  from = yearmonth("1949/01"),
  to   = yearmonth("1960/12"),
  by   = 1)
my_label <- rep(
  c("train", "test"),
  c(k, n - k))
my_df <- tsibble(
  ds    = my_ds,
  x     = 0:(n - 1),
  y     = my_data,
  label = my_label,
  index = ds) # specify the date-time column

head(my_df)
## # A tsibble: 6 x 4 [1M]
##         ds     x     y label
##      <mth> <int> <dbl> <chr>
## 1 1949 Jan     0   112 train
## 2 1949 Feb     1   118 train
## 3 1949 Mar     2   132 train
## 4 1949 Apr     3   129 train
## 5 1949 May     4   121 train
## 6 1949 Jun     5   135 train
my_train <- my_df[  1:k , ]
my_test  <- my_df[-(1:k), ]
y <- my_test$y
my_plot <- my_df %>%
  ggplot(aes(x = ds,
             y = y,
             color = label)) +
  geom_line()
my_plot

library(caret)
my_lm_model <- train(form = y ~ x, data = my_train, method = "lm")
y_ <- my_lm_model %>% predict(my_test)
caret::RMSE(y, y_) # RMSE (test)
## [1] 70.6
y_ <- my_lm_model %>% predict(my_df)
tmp <- my_df %>%
  mutate(y = y_, label = "model")
my_plot + geom_line(data = tmp)

library(fable)
## Loading required package: fabletools
## 
## Attaching package: 'fabletools'
## The following objects are masked from 'package:caret':
## 
##     MAE, RMSE
my_arima_model <- my_train %>% model(ARIMA(y))
my_arima_model
## # A mable: 1 x 1
##                  `ARIMA(y)`
##                     <model>
## 1 <ARIMA(1,1,0)(0,1,0)[12]>
tmp <- my_arima_model %>% forecast(h = "3 years")
head(tmp)
## # A fable: 6 x 4 [1M]
## # Key:     .model [1]
##   .model         ds           y .mean
##   <chr>       <mth>      <dist> <dbl>
## 1 ARIMA(y) 1958 Jan  N(346, 94)  346.
## 2 ARIMA(y) 1958 Feb N(332, 148)  332.
## 3 ARIMA(y) 1958 Mar N(387, 210)  387.
## 4 ARIMA(y) 1958 Apr N(379, 271)  379.
## 5 ARIMA(y) 1958 May N(386, 332)  386.
## 6 ARIMA(y) 1958 Jun N(453, 393)  453.
y_ <- tmp$.mean
caret::RMSE(y_, y)
## [1] 22.1
# if only the forecast is needed:
#tmp %>% autoplot

tmp %>% autoplot +
  geom_line(data = my_df,
            aes(x = ds,
                y = y,
                color = label))

library(prophet)
## Loading required package: Rcpp
## Loading required package: rlang
## 
## Attaching package: 'rlang'
## The following object is masked from 'package:xml2':
## 
##     as_list
## The following objects are masked from 'package:jsonlite':
## 
##     flatten, unbox
## The following objects are masked from 'package:testthat':
## 
##     is_false, is_null, is_true
## The following objects are masked from 'package:purrr':
## 
##     %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int,
##     flatten_lgl, flatten_raw, invoke, splice
my_prophet_model <- my_train %>%
  prophet(seasonality.mode = "multiplicative")
## Disabling weekly seasonality. Run prophet with weekly.seasonality=TRUE to override this.
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
tmp <- my_prophet_model %>% predict(my_test)
head(tmp[, c("ds", "yhat", "yhat_lower", "yhat_upper")])
## # A tibble: 6 × 4
##   ds                   yhat yhat_lower yhat_upper
##   <dttm>              <dbl>      <dbl>      <dbl>
## 1 1958-01-01 00:00:00  359.       350.       369.
## 2 1958-02-01 00:00:00  351.       342.       359.
## 3 1958-03-01 00:00:00  407.       398.       416.
## 4 1958-04-01 00:00:00  398.       389.       407.
## 5 1958-05-01 00:00:00  402.       393.       411.
## 6 1958-06-01 00:00:00  460.       451.       469.
y_ <- tmp$yhat
caret::RMSE(y_, y)
## [1] 33.7
# my_prophet_model %>% plot(tmp) # if only the forecast is needed

my_prophet_model %>% plot(tmp) +
  geom_line(data = my_train, aes(x = as.POSIXct(ds))) +
  geom_line(data = my_test,  aes(x = as.POSIXct(ds)), color = "red")

13 Unsupervised Learning

# Environment setup for Google Colaboratory
if (file.exists("/content")) {
  options(Ncpus = parallel::detectCores())
  installed_packages <- rownames(installed.packages())
  packages_to_install <- c("factoextra", "ggbiplot", "igraph")
  install.packages(setdiff(packages_to_install, installed_packages))
}

13.1 Principal Component Analysis

library(tidyverse)

my_data <- data.frame(
  language  = c(  0,  20,  20,  25,  22,  17),
  english   = c(  0,  20,  40,  20,  24,  18),
  math      = c(100,  20,   5,  30,  17,  25),
  science   = c(  0,  20,   5,  25,  16,  23),
  society   = c(  0,  20,  30,   0,  21,  17),
  row.names = c("A", "B", "C", "D", "E", "F"))
my_result <- my_data %>% prcomp # run the principal component analysis
my_result$x # principal component scores
##      PC1    PC2    PC3     PC4       PC5
## A -74.91  -7.01 -0.361  0.0763  2.86e-15
## B  13.82   2.75  5.273  0.8418 -2.25e-15
## C  33.71 -18.42 -4.876 -0.8820  1.19e-15
## D   1.73  17.88 -7.925 -0.1493  1.91e-16
## E  17.84  -1.06  1.653  2.3125 -3.20e-15
## F   7.81   5.86  6.237 -2.1994 -3.09e-16
my_result %>% ggbiplot::ggbiplot(
  labels = row.names(my_data),
  scale = 0)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following object is masked from 'package:purrr':
## 
##     compact
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor

my_result$rotation %>% t
##     language english     math science society
## PC1    0.207   0.304 -0.88726   0.130   0.245
## PC2    0.279  -0.325 -0.09764   0.703  -0.559
## PC3   -0.306  -0.616 -0.05635   0.338   0.640
## PC4    0.765  -0.472 -0.00765  -0.418   0.132
## PC5    0.447   0.447  0.44721   0.447   0.447
summary(my_result)
## Importance of components:
##                           PC1     PC2   PC3     PC4      PC5
## Standard deviation     38.264 12.2557 5.588 1.52970 1.23e-15
## Proportion of Variance  0.888  0.0911 0.019 0.00142 0.00e+00
## Cumulative Proportion   0.888  0.9796 0.999 1.00000 1.00e+00
my_result <- prcomp(
  x = my_data,
  scale = TRUE)       # standardization
# alternatively
my_result <- prcomp(
  x = scale(my_data)) # standardized data

my_result$x # principal component scores
##      PC1    PC2     PC3     PC4       PC5
## A -3.674 -0.569 -0.0584  0.0106  1.67e-16
## B  0.653  0.247  0.4348  0.0930 -1.47e-16
## C  1.568 -1.743 -0.3806 -0.0977 -2.38e-16
## D  0.251  1.640 -0.6754 -0.0320  4.44e-16
## E  0.886 -0.110  0.0925  0.2304  6.74e-18
## F  0.316  0.535  0.5870 -0.2043 -2.44e-16
Z  <- my_data %>% scale(scale = FALSE) %>% as.matrix # without standardization
#Z <- my_data %>% scale(scale = TRUE)  %>% as.matrix # with standardization

n <- nrow(my_data)
S <- var(Z)                          # variance-covariance matrix
#S <- t(Z) %*% Z / (n - 1)           # (same result)
tmp <- eigen(S)                      # eigenvalues and eigenvectors
Z %*% tmp$vectors                    # principal component scores (output omitted)
##     [,1]   [,2]   [,3]    [,4]      [,5]
## A  74.91  -7.01  0.361  0.0763  4.63e-15
## B -13.82   2.75 -5.273  0.8418 -8.02e-15
## C -33.71 -18.42  4.876 -0.8820  6.96e-15
## D  -1.73  17.88  7.925 -0.1493  2.86e-15
## E -17.84  -1.06 -1.653  2.3125 -1.01e-14
## F  -7.81   5.86 -6.237 -2.1994  1.91e-15
cumsum(tmp$values) / sum(tmp$values) # cumulative proportion of variance
## [1] 0.888 0.980 0.999 1.000 1.000
udv <- svd(Z) # singular value decomposition
U <- udv$u
d <- udv$d
V <- udv$v
W <- diag(d)

c(all.equal(Z, U %*% W %*% t(V), check.attributes = FALSE), # check 1
  all.equal(t(U) %*% U, diag(dim(U)[2])),                   # check 2
  all.equal(t(V) %*% V, diag(dim(V)[2])))                   # check 3
## [1] TRUE TRUE TRUE
U %*% W            # principal component scores (output omitted)
##        [,1]   [,2]   [,3]    [,4]      [,5]
## [1,] -74.91  -7.01 -0.361  0.0763 -6.85e-16
## [2,]  13.82   2.75  5.273  0.8418  8.96e-16
## [3,]  33.71 -18.42 -4.876 -0.8820 -5.18e-16
## [4,]   1.73  17.88 -7.925 -0.1493 -5.83e-16
## [5,]  17.84  -1.06  1.653  2.3125 -1.91e-15
## [6,]   7.81   5.86  6.237 -2.1994 -1.44e-15
e <- d^2 / (n - 1) # eigenvalues of the variance-covariance matrix
cumsum(e) / sum(e) # cumulative proportion of variance
## [1] 0.888 0.980 0.999 1.000 1.000
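
The two routes agree because the right singular vectors V are, up to sign, the eigenvectors of S: from Z = UWV', we get S = Z'Z / (n - 1) = V W^2 V' / (n - 1). A sketch of the check (not book code), reusing tmp from the eigendecomposition above:

all.equal(abs(V), abs(tmp$vectors), check.attributes = FALSE) # should be TRUE
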

13.2 Cluster Analysis

library(tidyverse)

my_data <- data.frame(
  x         = c(  0, -16,  10,  10),
  y         = c(  0,   0,  10, -15),
  row.names = c("A", "B", "C", "D"))

my_result <- my_data %>%
  dist("euclidian") %>% # distだけでも可
  hclust("complete")    # hclustだけでも可
my_result %>% factoextra::fviz_dend(
  k = 3, # number of clusters
  rect = TRUE, rect_fill = TRUE)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

my_result %>% factoextra::fviz_dend(
  k = 3,
  rect = TRUE, rect_fill = TRUE,
  type = "phylogenic")

my_result %>% cutree(3)
## A B C D 
## 1 2 1 3
library(tidyverse)

my_data <- data.frame(
  language  = c(  0,  20,  20,  25,  22,  17),
  english   = c(  0,  20,  40,  20,  24,  18),
  math      = c(100,  20,   5,  30,  17,  25),
  science   = c(  0,  20,   5,  25,  16,  23),
  society   = c(  0,  20,  30,   0,  21,  17),
  row.names = c("A", "B", "C", "D", "E", "F"))

try( # avoid an error that occurs in RMarkdown.
  my_data %>% scale %>%                        # standardize each column
    gplots::heatmap.2(cexRow = 1, cexCol = 1), # draw with the given label sizes.
  silent = TRUE)

library(tidyverse)

my_data <- data.frame(
  x         = c(  0, -16,  10,  10),
  y         = c(  0,   0,  10, -15),
  row.names = c("A", "B", "C", "D"))

my_result <- my_data %>% kmeans(3)
my_result$cluster
## A B C D 
## 3 2 3 1
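
kmeans starts from random initial centers, so the cluster numbers (and, with unlucky starts, the clusters themselves) can change between runs. Fixing the random seed makes the result reproducible (a sketch, not book code; the seed value is arbitrary):

set.seed(0) # fix the RNG for a reproducible clustering
(my_data %>% kmeans(3))$cluster
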
library(tidyverse)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
my_data <- iris[, -5]

f <- 2:5 %>% map(function(k) {
  my_data %>% kmeans(k) %>%
    fviz_cluster(data = my_data, geom = "point") +
    ggtitle(sprintf("k = %s", k))
})
gridExtra::grid.arrange(f[[1]], f[[2]], f[[3]], f[[4]], ncol = 2)

fviz_nbclust(my_data, kmeans, method = "wss")

library(tidyverse)
my_data <- iris[, -5] %>% scale

my_result <- prcomp(my_data)$x %>% as.data.frame # principal component analysis

# for non-hierarchical cluster analysis
my_result$cluster <- (my_data %>% scale %>% kmeans(3))$cluster %>% as.factor

# for hierarchical cluster analysis
#my_result$cluster <- my_data %>% dist %>% hclust %>% cutree(3) %>% as.factor

my_result %>%
  ggplot(aes(x = PC1, y = PC2, color = cluster)) + # color encodes the cluster.
  geom_point(shape = iris$Species) +               # shape encodes the species.
  theme(legend.position = "none")