辻真吾・矢吹太朗『ゼロからはじめるデータサイエンス入門』(講談社, 2021) (Shingo Tsuji and Taro Yabuki, Introduction to Data Science from Scratch, Kodansha, 2021)

How to Knit in RStudio

  1. docker run -d -e PASSWORD=password -e ROOT=TRUE -p 8787:8787 -v "$(pwd)":/home/rstudio/work taroyabuki/rstudio
  2. Open http://localhost:8787 in a browser.
  3. In RStudio's Files tab, open work/r.Rmd.
  4. Run the following in the RStudio Console:
install.packages(c("knitr", "markdown", "rmarkdown"))
  5. Knit.
# This is an example of R code.
1 + 1
## [1] 2

1 Computers and Networks

1.1 Basic Computer Operations

1.2 How Networks Work

2 Environments for Data Science

2.1 Choosing an Execution Environment

2.2 The Cloud

2.3 Docker

2.4 Using the Terminal

2.5 R and Python

2.6 Using the Sample Code

1 + 1
## [1] 2
print(1 + 2)
## [1] 3
1 + 3
## [1] 4

3 R and Python

# Environment setup for Google Colaboratory
if (file.exists("/content")) {
  options(Ncpus = parallel::detectCores())
  installed_packages <- rownames(installed.packages())
  packages_to_install <- c("furrr", "keras", "proxy")
  install.packages(setdiff(packages_to_install, installed_packages))
}

3.1 Getting Started

0x10
## [1] 16
1.23e5
## [1] 123000
2 * 3
## [1] 6
10 / 3
## [1] 3.333333
10 %/% 3 # quotient
## [1] 3
10 %% 3  # remainder
## [1] 1
x <- 2
y <- 3
x * y
## [1] 6
keras::`%<-%`(c(x, y), c(20, 30)) # assign several names at once
x * y
## [1] 600
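
The %<-% operator is re-exported by keras from the zeallot package, so after attaching either package the same multiple assignment can be written infix. A minimal sketch (assuming zeallot or keras is attached; not from the book):

library(zeallot)       # or library(keras)
c(x, y) %<-% c(20, 30) # same effect as the prefix call above
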
x <- 1 + 1
# The result is not displayed at this point.

x # Evaluate the variable name.
## [1] 2
my_s <- "abcde"
nchar(my_s)
## [1] 5
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.1.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
str_c("This is ", "a", " pen.")
## [1] "This is a pen."
substr(x = my_s, start = 2, stop = 4)
## [1] "bcd"
tmp <- "%s is %s."
sprintf(tmp, "This", "a pen")
## [1] "This is a pen."
1 <= 2
## [1] TRUE
1 < 0
## [1] FALSE
0.1 + 0.1 + 0.1 == 0.3
## [1] FALSE
all.equal(0.1 + 0.1 + 0.1, 0.3)
## [1] TRUE
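
To see why 0.1 + 0.1 + 0.1 == 0.3 is FALSE, print both sides with more digits (base R only, so the output is deterministic):

print(0.1 + 0.1 + 0.1, digits = 17) # the sum is not exactly 0.3
## [1] 0.30000000000000004
print(0.3, digits = 17)             # nor is the literal 0.3
## [1] 0.29999999999999999
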
TRUE & FALSE # logical AND
## [1] FALSE
TRUE | FALSE # logical OR
## [1] TRUE
!TRUE        # logical NOT
## [1] FALSE
ifelse(3 < 5, 0, 10)
## [1] 0
getwd()
## [1] "/home/rstudio/work"
setwd("..")
getwd()
## [1] "/home/rstudio"

3.2 Functions

sqrt(4)
## [1] 2
log(100, 10)
## [1] 2
log(100)         # natural logarithm
## [1] 4.60517
# Alternatively:
log(100, exp(1)) # with the base written out
## [1] 4.60517
log10(100) # common logarithm (base 10)
## [1] 2
log2(1024) # logarithm base 2
## [1] 10
library(tidyverse)
4 %>% sqrt
## [1] 2
exp(log(5))       # conventional style
## [1] 5
# Alternatively:
5 %>% log %>% exp # style using the pipe
## [1] 5
f <- function(a, b) {
  a - b
}
f(3, 5)
## [1] -2
f <- function(a, b = 5) {
  a - b
}

f(3) # same as f(3, 5)
## [1] -2
(function(a, b) { a - b })(3, 5)
## [1] -2
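
On R 4.1 or later, the backslash lambda shorthand can replace function(); a minimal sketch of the same call:

(\(a, b) { a - b })(3, 5) # requires R >= 4.1
## [1] -2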

3.3 Collections

x <- c("foo", "bar", "baz")
length(x)
## [1] 3
x[2]
## [1] "bar"
x[2] <- "BAR"
x # Check the result.
## [1] "foo" "BAR" "baz"
x[2] <- "bar" # Restore the original value.
x[-2]
## [1] "foo" "baz"
c(x, "qux")
## [1] "foo" "bar" "baz" "qux"
x <- c(x, "qux")
x # Check the result.
## [1] "foo" "bar" "baz" "qux"
1:5
## [1] 1 2 3 4 5
seq(from = 0, to = 10, by = 2)
## [1]  0  2  4  6  8 10
seq(from = 0, to = 1, by = 0.5)
## [1] 0.0 0.5 1.0
seq(from = 0, to = 100, length.out = 5)
## [1]   0  25  50  75 100
rep(x = 10, times = 5)
## [1] 10 10 10 10 10
tmp <- c("グー", "パー", "グー", "パー")
x <- factor(tmp, levels = c("グー", "チョキ", "パー"))
x
## [1] グー パー グー パー
## Levels: グー チョキ パー
x <- c(2, 3, 5, 7)

x + 10 # addition
## [1] 12 13 15 17
x * 10 # multiplication
## [1] 20 30 50 70
x <- c(2, 3)
sin(x)
## [1] 0.9092974 0.1411200
x <- c(2,  3,   5,    7)
y <- c(1, 10, 100, 1000)
x + y
## [1]    3   13  105 1007
x * y
## [1]    2   30  500 7000
sum(x * y)
## [1] 7532
x <- c(TRUE, FALSE)
y <- c(TRUE, TRUE)
x & y
## [1]  TRUE FALSE
u <- c(1, 2, 3)
v <- c(1, 2, 3)
w <- c(1, 2, 4)

identical(u, v) # compare the whole vectors
## [1] TRUE
identical(u, w) # compare the whole vectors
## [1] FALSE
u == v          # element-wise comparison
## [1] TRUE TRUE TRUE
u == w          # element-wise comparison
## [1]  TRUE  TRUE FALSE
sum(u == w)  # number of matching elements
## [1] 2
mean(u == w) # proportion of matching elements
## [1] 0.6666667
x <- list(1, "two")
x[[2]]
## [1] "two"
x <- list("apple"  = "りんご",
          "orange" = "みかん")
x[["grape"]] <- "ぶどう"
x$apple
## [1] "りんご"
# Alternatively:
x$"apple"
## [1] "りんご"
# Alternatively:
x[["apple"]]
## [1] "りんご"
# Alternatively:
tmp <- "apple"
x[[tmp]]
## [1] "りんご"
x <- c("foo", "bar", "baz")
y <- x
y[2] <- "BAR" # Update y.
y
## [1] "foo" "BAR" "baz"
x             # x is unchanged.
## [1] "foo" "bar" "baz"

3.4 Data Frames

library(tidyverse)
my_df <- data.frame(
  name    = c("A", "B", "C", "D"),
  english = c( 60,  90,  70,  90),
  math    = c( 70,  80,  90, 100),
  gender  = c("f", "m", "m", "f"))
my_df <- tribble(
  ~name, ~english, ~math, ~gender,
  "A",         60,    70,     "f",
  "B",         90,    80,     "m",
  "C",         70,    90,     "m",
  "D",         90,   100,     "f")
head(my_df)
## # A tibble: 4 × 4
##   name  english  math gender
##   <chr>   <dbl> <dbl> <chr> 
## 1 A          60    70 f     
## 2 B          90    80 m     
## 3 C          70    90 m     
## 4 D          90   100 f
# Output omitted.
dim(my_df)  # numbers of rows and columns
## [1] 4 4
nrow(my_df) # number of rows
## [1] 4
ncol(my_df) # number of columns
## [1] 4
my_df2 <- expand.grid(
  X = c(1, 2, 3),
  Y = c(10, 100))
my_df2
##   X   Y
## 1 1  10
## 2 2  10
## 3 3  10
## 4 1 100
## 5 2 100
## 6 3 100
colnames(my_df2)
## [1] "X" "Y"
colnames(my_df2) <- c("P", "Q")
my_df2
##   P   Q
## 1 1  10
## 2 2  10
## 3 3  10
## 4 1 100
## 5 2 100
## 6 3 100
# Remainder omitted.
row.names(my_df)
## [1] "1" "2" "3" "4"
row.names(my_df2) <-
  c("a", "b", "c", "d", "e", "f")
my_df2
##   P   Q
## a 1  10
## b 2  10
## c 3  10
## d 1 100
## e 2 100
## f 3 100
# Remainder omitted.
my_df3 <- data.frame(
  english =   c( 60,  90,  70,  90),
  math    =   c( 70,  80,  90, 100),
  gender  =   c("f", "m", "m", "f"),
  row.names = c("A", "B", "C", "D"))
my_df3
##   english math gender
## A      60   70      f
## B      90   80      m
## C      70   90      m
## D      90  100      f
tmp <- data.frame(
  name    = "E",
  english =  80,
  math    =  80,
  gender  = "m")
my_df2 <- rbind(my_df, tmp)
my_df2 <- my_df %>%
  mutate(id = c(1, 2, 3, 4))
my_df3 <- my_df               # copy
my_df3["id"] <- c(1, 2, 3, 4) # update
my_df3 # Check the result (omitted).
## # A tibble: 4 × 5
##   name  english  math gender    id
##   <chr>   <dbl> <dbl> <chr>  <dbl>
## 1 A          60    70 f          1
## 2 B          90    80 m          2
## 3 C          70    90 m          3
## 4 D          90   100 f          4
my_df[1, 2]
## # A tibble: 1 × 1
##   english
##     <dbl>
## 1      60
x <- my_df[, 2]
# Alternatively:
x <- my_df$english
# Alternatively:
x <- my_df$"english"
# Alternatively:
x <- my_df[["english"]]
# Alternatively:
tmp <- "english"
x <- my_df[[tmp]]

x # Check the result (omitted).
## [1] 60 90 70 90
x <- my_df %>% select(name, math)
x <- my_df[, c(1, 3)]
x <- my_df %>%
  select(-c(english, gender))
# Alternatively:
x <- my_df[, -c(2, 4)]
x <- my_df[c(1, 3), ]
x <- my_df[-c(2, 4), ]
x <- my_df[my_df$gender == "m", ]
# Alternatively:
x <- my_df %>% filter(gender == "m")
x <- my_df[my_df$english > 80 & my_df$gender == "m", ]
# Alternatively:
x <- my_df %>% filter(english > 80 & gender == "m")
x <- my_df[my_df$english == max(my_df$english), ]
# Alternatively:
x <- my_df %>% filter(english == max(my_df$english))
my_df2 <- my_df # copy
my_df2[my_df$gender == "m", ]$gender <- "M"
my_df2
## # A tibble: 4 × 4
##   name  english  math gender
##   <chr>   <dbl> <dbl> <chr> 
## 1 A          60    70 f     
## 2 B          90    80 M     
## 3 C          70    90 M     
## 4 D          90   100 f
x <- my_df %>% arrange(english)
x <- my_df %>% arrange(-english)
x <- c(2, 3, 5, 7, 11, 13, 17, 19, 23,
       29, 31, 37)
A <- matrix(
  data = x,     # one-dimensional data
  nrow = 3,     # number of rows
  byrow = TRUE) # fill by row
A
##      [,1] [,2] [,3] [,4]
## [1,]    2    3    5    7
## [2,]   11   13   17   19
## [3,]   23   29   31   37
A <- my_df[, c(2, 3)] %>% as.matrix
A
##      english math
## [1,]      60   70
## [2,]      90   80
## [3,]      70   90
## [4,]      90  100
as.data.frame(A)
##   english math
## 1      60   70
## 2      90   80
## 3      70   90
## 4      90  100
t(A)
##         [,1] [,2] [,3] [,4]
## english   60   90   70   90
## math      70   80   90  100
t(A) %*% A
##         english  math
## english   24700 26700
## math      26700 29400
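
Base R's crossprod computes t(A) %*% A in one step; it is equivalent to the product above:

crossprod(A)
##         english  math
## english   24700 26700
## math      26700 29400
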
my_wider <- data.frame(
  day = c(25, 26, 27),
  min = c(20, 21, 15),
  max = c(24, 27, 21))
my_longer <- my_wider %>%
  pivot_longer(-day)
my_longer
## # A tibble: 6 × 3
##     day name  value
##   <dbl> <chr> <dbl>
## 1    25 min      20
## 2    25 max      24
## 3    26 min      21
## 4    26 max      27
## 5    27 min      15
## 6    27 max      21
my_longer %>% pivot_wider()
## # A tibble: 3 × 3
##     day   min   max
##   <dbl> <dbl> <dbl>
## 1    25    20    24
## 2    26    21    27
## 3    27    15    21
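
pivot_wider() works here because its defaults are names_from = name and values_from = value; spelling the arguments out makes the round trip explicit:

my_longer %>%
  pivot_wider(names_from = name,
              values_from = value)
## # A tibble: 3 × 3
##     day   min   max
##   <dbl> <dbl> <dbl>
## 1    25    20    24
## 2    26    21    27
## 3    27    15    21
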
my_longer %>%
  ggplot(aes(x = day, y = value,
             color = name)) +
  geom_point() +
  geom_line() +
  ylab("temperature") + # y軸ラベル
  scale_x_continuous(
    breaks = my_longer$day) # x軸目盛り

3.5 (Dis)similarity of One-dimensional Data

A <- c(3,   4,  5)
B <- c(3,   4, 29)
C <- c(9, -18,  8)
AB <- B - A
AC <- C - A

sum(AB^2)^0.5
## [1] 24
sum(AC^2)^0.5
## [1] 23
sum(abs(AB))
## [1] 24
sum(abs(AC))
## [1] 31
sum(A * B) /
  sum(A * A)^0.5 / sum(B * B)^0.5
## [1] 0.8169679
sum(A * C) /
  sum(A * A)^0.5 / sum(C * C)^0.5
## [1] -0.03265116
cor(A, B)
## [1] 0.8824975
cor(A, C)
## [1] -0.03266277
library(tidyverse)

my_df <- data.frame(
  x = c(3,  3,   9),
  y = c(4,  4, -18),
  z = c(5, 29,   8),
  row.names = c("A", "B", "C"))

# Euclidean distance
my_df %>% proxy::dist("Euclidean")
##    A  B
## B 24   
## C 23 31
# Manhattan distance
my_df %>% proxy::dist("Manhattan")
##    A  B
## B 24   
## C 31 49
# cosine similarity
my_df %>% proxy::simil("cosine")
##             A           B
## B  0.81696786            
## C -0.03265116  0.29342441
# correlation coefficient
my_df %>% proxy::simil("correlation")
##             A           B
## B  0.88249750            
## C -0.03266277  0.44124132
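
For the two distances, base R's stats::dist gives the same results without the proxy package (cosine similarity and the correlation coefficient still need proxy::simil):

dist(my_df, method = "euclidean")
##    A  B
## B 24   
## C 23 31
dist(my_df, method = "manhattan")
##    A  B
## B 24   
## C 31 49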

3.6 R Packages and Python Modules

library(tidyverse)

3.7 Iteration

library(tidyverse)
f1 <- function(x) {
  tmp <- runif(x)
  mean(tmp)
}

f1(10)           # sanity check
## [1] 0.5821029
replicate(n = 3, expr = f1(10))
## [1] 0.4878048 0.4695691 0.5339249
rep(x = f1(10), times = 3)
## [1] 0.6140743 0.6140743 0.6140743
v <- c(5, 10, 100)
v %>% map_dbl(f1)
## [1] 0.6447337 0.5604720 0.5270688
rep(x = 10, times = 3) %>% map_dbl(f1)
## [1] 0.6154754 0.4872310 0.4831569
# Output omitted.
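
Base R's sapply plays the same role as purrr's map_dbl here; a minimal sketch (the values differ from run to run because f1 draws random numbers):

v <- c(5, 10, 100)
sapply(v, f1) # base-R counterpart of v %>% map_dbl(f1)
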
f2 <- function(n) {
  tmp <- runif(n)
  list(x = n,
       p = mean(tmp),
       q = sd(tmp))
}

f2(10) # sanity check
## $x
## [1] 10
## 
## $p
## [1] 0.3094924
## 
## $q
## [1] 0.2527441
v <- c(5, 10, 100)
v %>% map_dfr(f2)
## # A tibble: 3 × 3
##       x     p     q
##   <dbl> <dbl> <dbl>
## 1     5 0.485 0.274
## 2    10 0.399 0.328
## 3   100 0.580 0.283
f3 <- function(x, y) {
  tmp <- runif(x, min = 1,
                  max = y + 1) %>%
    as.integer
  list(x = x,
       y = y,
       p = mean(tmp),
       q = sd(tmp))
}

f3(x = 10, y = 6) # sanity check
## $x
## [1] 10
## 
## $y
## [1] 6
## 
## $p
## [1] 3.5
## 
## $q
## [1] 1.581139
my_df <- data.frame(
  x = c(5, 10, 100,  5, 10, 100),
  y = c(6,  6,   6, 12, 12,  12))

my_df %>% pmap_dfr(f3)
## # A tibble: 6 × 4
##       x     y     p     q
##   <dbl> <dbl> <dbl> <dbl>
## 1     5     6  3     2   
## 2    10     6  2.6   1.78
## 3   100     6  3.41  1.70
## 4     5    12  7.8   4.09
## 5    10    12  6.1   3.38
## 6   100    12  6.56  3.53
library(furrr)
## Loading required package: future
plan(multisession) # setup

v <- c(5, 10, 100)
v %>% future_map_dbl(f1, .options =
  furrr_options(seed = TRUE))
## [1] 0.6562407 0.7985511 0.4760155
# Output omitted.

3.8 Miscellaneous

x <- 123
typeof(x)
## [1] "double"
?log
# Alternatively:
help(log)
v <- c(1, NA, 3)
v
## [1]  1 NA  3
is.na(v[2])
## [1] TRUE
v[2] == NA # wrong
## [1] NA
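
Because comparisons with NA yield NA, summary functions propagate it as well; na.rm = TRUE drops NA values first:

mean(v)               # NA propagates.
## [1] NA
mean(v, na.rm = TRUE) # Drop NA before computing.
## [1] 2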

4 Introduction to Statistics

# Environment setup for Google Colaboratory
if (file.exists("/content")) {
  options(Ncpus = parallel::detectCores())
  installed_packages <- rownames(installed.packages())
  packages_to_install <- c("exactci", "ggmosaic", "pastecs", "psych", "vcd")
  install.packages(setdiff(packages_to_install, installed_packages))
}

4.1 Descriptive Statistics

x <- c(165, 170, 175, 180, 185)
mean(x) # mean
## [1] 175
n <- length(x) # sample size
sum(x) / n
## [1] 175
y <- c(173, 174, 175, 176, 177)
mean(y)
## [1] 175
var(x) # variance of x
## [1] 62.5
var(y) # variance of y
## [1] 2.5
sum((x - mean(x))^2) / (n - 1)
## [1] 62.5
sd(x) # standard deviation of x
## [1] 7.905694
sd(y) # standard deviation of y
## [1] 1.581139
var(x)**0.5 # standard deviation of x
## [1] 7.905694
psych::describe(x)
##    vars n mean   sd median trimmed  mad min max range skew kurtosis   se
## X1    1 5  175 7.91    175     175 7.41 165 185    20    0    -1.91 3.54
# Alternatively:

pastecs::stat.desc(x)
##      nbr.val     nbr.null       nbr.na          min          max        range 
##    5.0000000    0.0000000    0.0000000  165.0000000  185.0000000   20.0000000 
##          sum       median         mean      SE.mean CI.mean.0.95          var 
##  875.0000000  175.0000000  175.0000000    3.5355339    9.8162158   62.5000000 
##      std.dev     coef.var 
##    7.9056942    0.0451754
quantile(x)
##   0%  25%  50%  75% 100% 
##  165  170  175  180  185
x <- c(165, 170, 175, 180, 185)

var(x)                # unbiased variance
## [1] 62.5
mean((x - mean(x))^2) # sample variance
## [1] 50
# Alternatively:
n <- length(x)
var(x) * (n - 1) / n  # sample variance
## [1] 50
sd(x)                     # sqrt of the unbiased variance
## [1] 7.905694
mean((x - mean(x))^2)^0.5 # sqrt of the sample variance
## [1] 7.071068
# Alternatively:
sd(x) * sqrt((n - 1) / n) # sqrt of the sample variance
## [1] 7.071068
sd(x) / length(x)**0.5 # standard error of the mean
## [1] 3.535534
library(tidyverse)

my_df <- data.frame(
  name    = c("A", "B", "C", "D"),
  english = c( 60,  90,  70,  90),
  math    = c( 70,  80,  90, 100),
  gender  = c("f", "m", "m", "f"))
var(my_df$english)
## [1] 225
# The result is a vector.
my_df[, c(2, 3)] %>% sapply(var)
##  english     math 
## 225.0000 166.6667
# The result is a list.
my_df[, c(2, 3)] %>% lapply(var)
## $english
## [1] 225
## 
## $math
## [1] 166.6667
# The result is a data frame.
my_df[, c(2, 3)] %>% # Take columns 2 and 3
  summarize(across(  # and summarize
    everything(),    # every column
    var))            # by its unbiased variance.
##   english     math
## 1     225 166.6667
# Alternatively:
my_df %>%              # Take the data frame
  summarize(across(    # and summarize
    where(is.numeric), # the numeric columns
    var))              # by their unbiased variance.
##   english     math
## 1     225 166.6667
# Alternatively:
my_df %>%              # Take the data frame
  summarize(across(    # and summarize
    where(is.numeric), # the numeric columns
    function(x) { var(x) })) # by their unbiased variance.
##   english     math
## 1     225 166.6667
psych::describe(my_df)
##         vars n mean    sd median trimmed   mad min max range  skew kurtosis
## name*      1 4  2.5  1.29    2.5     2.5  1.48   1   4     3  0.00    -2.08
## english    2 4 77.5 15.00   80.0    77.5 14.83  60  90    30 -0.14    -2.28
## math       3 4 85.0 12.91   85.0    85.0 14.83  70 100    30  0.00    -2.08
## gender*    4 4  1.5  0.58    1.5     1.5  0.74   1   2     1  0.00    -2.44
##           se
## name*   0.65
## english 7.50
## math    6.45
## gender* 0.29
# Alternatively:

pastecs::stat.desc(my_df)
##          name     english        math gender
## nbr.val    NA   4.0000000   4.0000000     NA
## nbr.null   NA   0.0000000   0.0000000     NA
## nbr.na     NA   0.0000000   0.0000000     NA
## min        NA  60.0000000  70.0000000     NA
## max        NA  90.0000000 100.0000000     NA
## range      NA  30.0000000  30.0000000     NA
## sum        NA 310.0000000 340.0000000     NA
## median     NA  80.0000000  85.0000000     NA
## mean       NA  77.5000000  85.0000000     NA
## SE.mean    NA   7.5000000   6.4549722     NA
## CI.mean    NA  23.8683473  20.5426026     NA
## var        NA 225.0000000 166.6666667     NA
## std.dev    NA  15.0000000  12.9099445     NA
## coef.var   NA   0.1935484   0.1518817     NA
# Remainder omitted.
table(my_df$gender)
## 
## f m 
## 2 2
my_df2 <- data.frame(
  gender = my_df$gender,
  excel = my_df$math >= 80)
table(my_df2)
##       excel
## gender FALSE TRUE
##      f     1    1
##      m     0    2
my_df %>% group_by(gender) %>%
  summarize(across(
    where(is.numeric), mean),
    .groups = "drop") # ungroup
## # A tibble: 2 × 3
##   gender english  math
##   <chr>    <dbl> <dbl>
## 1 f           75    85
## 2 m           80    85

4.2 Data Visualization

head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
hist(iris$Sepal.Length)

x <- c(10, 20, 30)
hist(x, breaks = 2) # two bins

x <- iris$Sepal.Length
tmp <- seq(min(x), max(x),
           length.out = 10)
hist(x, breaks = tmp, right = FALSE)

plot(iris$Sepal.Length,
     iris$Sepal.Width)

boxplot(iris[, -5])

library(tidyverse)
my_df <- psych::describe(iris[, -5])
my_df %>% select(mean, sd, se)
##              mean   sd   se
## Sepal.Length 5.84 0.83 0.07
## Sepal.Width  3.06 0.44 0.04
## Petal.Length 3.76 1.77 0.14
## Petal.Width  1.20 0.76 0.06
tmp <- rownames(my_df)
my_df %>% ggplot(aes(x = factor(tmp, levels = tmp), y = mean)) +
  geom_col() +
  geom_errorbar(aes(ymin = mean - se, ymax = mean + se)) +
  xlab(NULL)

my_group <- iris %>% group_by(Species)       # For each species,

my_df <- my_group %>%                        # compute each variable's mean and
  summarize(across(everything(), mean)) %>%
  pivot_longer(-Species)

tmp <- my_group %>%                          # standard error.
  summarize(across(everything(), ~ sd(.) / length(.)**0.5)) %>%
  pivot_longer(-Species)

my_df$se <- tmp$value
head(my_df)
## # A tibble: 6 × 4
##   Species    name         value     se
##   <fct>      <chr>        <dbl>  <dbl>
## 1 setosa     Sepal.Length 5.01  0.0498
## 2 setosa     Sepal.Width  3.43  0.0536
## 3 setosa     Petal.Length 1.46  0.0246
## 4 setosa     Petal.Width  0.246 0.0149
## 5 versicolor Sepal.Length 5.94  0.0730
## 6 versicolor Sepal.Width  2.77  0.0444
my_df %>%
  ggplot(aes(x = Species, y = value, fill = name)) +
  geom_col(position = "dodge") +
  geom_errorbar(aes(ymin = value - se, ymax = value + se), position = "dodge")

# mean of each variable
iris %>% pivot_longer(-Species) %>%
  ggplot(aes(x = name, y = value)) +
  geom_bar(stat = "summary", fun = mean) +
  stat_summary(geom = "errorbar", fun.data = mean_se) +
  xlab(NULL)

# mean of each variable (per species)
iris %>% pivot_longer(-Species) %>%
  ggplot(aes(x = Species, y = value, fill = name)) +
  geom_bar(stat = "summary", fun = mean, position = "dodge") +
  stat_summary(geom = "errorbar", fun.data = mean_se, position = "dodge")

my_df <- data.frame(
  Species = iris$Species,
  w_Sepal = iris$Sepal.Width > 3)
table(my_df) # contingency table
##             w_Sepal
## Species      FALSE TRUE
##   setosa         8   42
##   versicolor    42    8
##   virginica     33   17
mosaicplot(
  formula = ~ Species + w_Sepal,
  data = my_df)

library(vcd)
## Loading required package: grid
vcd::mosaic(formula = ~w_Sepal + Species, data = my_df,
            labeling = labeling_values)

curve(x^3 - x, -2, 2)

x <- iris$Sepal.Length
tmp <- seq(min(x), max(x),
           length.out = 10)
iris %>%
  ggplot(aes(x = Sepal.Length)) +
  geom_histogram(breaks = tmp,
                 closed = "left")

iris %>%
  ggplot(aes(x = Sepal.Length,
             y = Sepal.Width)) +
  geom_point()

iris %>%
  pivot_longer(-Species) %>%
  ggplot(aes(
    x = factor(name,
               levels = names(iris)),
    y = value)) +
  geom_boxplot() +
  xlab(NULL)

library(ggmosaic)
## 
## Attaching package: 'ggmosaic'
## The following objects are masked from 'package:vcd':
## 
##     mosaic, spine
my_df <- data.frame(
  Species = iris$Species,
  w_Sepal = iris$Sepal.Width > 3)
my_df %>%
  ggplot() +
  geom_mosaic(
    aes(x = product(w_Sepal, Species)))

f <- function(x) { x^3 - x }
data.frame(x = c(-2, 2)) %>%
  ggplot(aes(x = x)) +
  stat_function(fun = f)

4.3 Random Numbers

x <- sample(x = 1:6,        # values to draw from
            size = 10000,   # number of draws
            replace = TRUE) # with replacement
hist(x, breaks = 0:6) # histogram

x <- runif(min = 0,  # minimum
           max = 1,  # maximum
           n = 1000) # number of draws
hist(x)

x <- as.integer(      # convert to integers
  runif(min = 1,      # minimum
        max = 7,      # maximum + 1
        n = 1000))    # number of draws
hist(x, breaks = 0:6) # output omitted

n <- 100
p <- 0.5
r <- 10000
x <- rbinom(size = n, # number of trials
            prob = p, # probability of success
            n = r)    # number of draws
hist(x, breaks = max(x) - min(x))

r <- 10000
x <- rnorm(mean = 50, # mean
           sd = 5,    # standard deviation
           n = r)     # number of draws
hist(x, breaks = 40)
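
Results like these change on every run. To make random draws reproducible, fix the seed with set.seed; a minimal sketch:

set.seed(0); x1 <- runif(3)
set.seed(0); x2 <- runif(3)
identical(x1, x2) # the same seed yields the same draws
## [1] TRUE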

library(tidyverse)

f <- function(k) {
  n <- 10000
  tmp <- replicate(n = n, expr = g(rnorm(n =  k, sd = 3)))
  list(k = k,
       mean = mean(tmp),       # mean
       se = sd(tmp) / sqrt(n)) # standard error
}
g <- var
c(10, 20, 30) %>% map_dfr(f)
## # A tibble: 3 × 3
##       k  mean     se
##   <dbl> <dbl>  <dbl>
## 1    10  8.95 0.0420
## 2    20  9.00 0.0289
## 3    30  8.99 0.0239
g <- sd
c(5, 10, 15, 20) %>% map_dfr(f)
## # A tibble: 4 × 3
##       k  mean      se
##   <dbl> <dbl>   <dbl>
## 1     5  2.80 0.0101 
## 2    10  2.92 0.00704
## 3    15  2.95 0.00559
## 4    20  2.97 0.00488
g <- function(x) {
  n <- length(x)
  sd(x) *
    sqrt((n - 1) / 2) *
    gamma((n - 1) / 2) /
    gamma(n / 2)
}
c(10, 20, 30) %>% map_dfr(f)
## # A tibble: 3 × 3
##       k  mean      se
##   <dbl> <dbl>   <dbl>
## 1    10  3.00 0.00711
## 2    20  3.01 0.00497
## 3    30  3.01 0.00398

4.4 Statistical Inference

library(exactci)
## Loading required package: ssanv
## Loading required package: testthat
## 
## Attaching package: 'testthat'
## The following object is masked from 'package:dplyr':
## 
##     matches
## The following object is masked from 'package:purrr':
## 
##     is_null
## The following objects are masked from 'package:readr':
## 
##     edition_get, local_edition
## The following object is masked from 'package:tidyr':
## 
##     matches
library(tidyverse)

a <- 0.05                              # significance level
binom.exact(x = 2,                     # number of successes
            n = 15,                    # number of trials
            p = 4 / 10,                # probability of success (the hypothesis)
            plot = TRUE,               # plot the p-value (figure shown in the next subsection)
            conf.level = 1 - a,        # confidence level (default)
            tsmethod = "minlike",      # definition of the p-value
            alternative = "two.sided") # two-sided test (default)

## 
##  Exact two-sided binomial test (sum of minimum likelihood method)
## 
## data:  2 and 15
## number of successes = 2, number of trials = 15, p-value = 0.03646
## alternative hypothesis: true probability of success is not equal to 0.4
## 95 percent confidence interval:
##  0.0242 0.3967
## sample estimates:
## probability of success 
##              0.1333333
                                       # 'less' for a left-tailed test
                                       # 'greater' for a right-tailed test
t <- 4 / 10               # probability of success
n <- 15                   # number of trials
x <- 0:n                  # number of successes
my_pr  <- dbinom(x, n, t) # probability of exactly x successes
my_pr2 <- dbinom(2, n, t) # probability of exactly 2 successes

my_data <- data.frame(x = x,
                      probability = my_pr,
                      color = my_pr <= my_pr2) # probability at most that of exactly 2 successes

my_data %>% ggplot(aes(x = x, y = probability, color = color)) +
  geom_point(size = 3) +
  geom_linerange(aes(ymin = 0, ymax = probability)) +   # vertical lines
  geom_hline(yintercept = my_pr2) +                     # horizontal line
  theme(legend.position = "none")                       # Hide the legend.

# The result from the previous subsection (shown again)
# The code from the beginning of the previous subsection
X <- c(32.1, 26.2, 27.5, 31.8, 32.1, 31.2, 30.1, 32.4, 32.3, 29.9,
       29.6, 26.6, 31.2, 30.9, 29.3)
Y <- c(35.4, 34.6, 31.1, 32.4, 33.3, 34.7, 35.3, 34.3, 32.1, 28.3,
       33.3, 30.5, 32.6, 33.3, 32.2)

t.test(x = X, y = Y,
       conf.level = 0.95,         # confidence level (default)
       paired = TRUE,             # paired samples
       alternative = "two.sided") # two-sided test (default)
## 
##  Paired t-test
## 
## data:  X and Y
## t = -4.3694, df = 14, p-value = 0.0006416
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -3.995525 -1.364475
## sample estimates:
## mean of the differences 
##                   -2.68
                                  # 'less' for a left-tailed test
                                  # 'greater' for a right-tailed test
t.test(x = X, y = Y,
       paired = FALSE,   # not paired samples (default)
       var.equal = TRUE, # Assume equal variances; FALSE (default) if not assumed.
       alternative = "two.sided",
       conf.level = 0.95)
## 
##  Two Sample t-test
## 
## data:  X and Y
## t = -3.6821, df = 28, p-value = 0.0009785
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -4.170906 -1.189094
## sample estimates:
## mean of x mean of y 
##  30.21333  32.89333
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
                "/fromzero/master/data/smoker.csv")
my_data <- read_csv(my_url)
## Rows: 1469 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): alive, smoker
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(my_data)
## # A tibble: 6 × 2
##   alive smoker
##   <chr> <chr> 
## 1 Yes   No    
## 2 Yes   No    
## 3 Yes   No    
## 4 Yes   No    
## 5 Yes   No    
## 6 Yes   No
my_table <- table(my_data)
my_table
##      smoker
## alive  No Yes
##   No  117  54
##   Yes 950 348
chisq.test(my_table, correct = FALSE)
## 
##  Pearson's Chi-squared test
## 
## data:  my_table
## X-squared = 1.7285, df = 1, p-value = 0.1886
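
Note that correct = FALSE disables Yates' continuity correction, which chisq.test applies by default to 2-by-2 tables; with the default the statistic comes out slightly smaller (output not shown):

chisq.test(my_table) # correct = TRUE (the default) for a 2-by-2 table
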
X <- rep(0:1, c(13, 2)) # step 1
X
##  [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1
tmp <- sample(X, size = length(X), replace = TRUE) # step 2
tmp
##  [1] 1 1 0 0 0 0 0 0 0 0 1 1 0 0 0
sum(tmp) # step 3
## [1] 4
n <- 10^5
result <- replicate(n, sum(sample(X, size = length(X), replace = TRUE))) # step 4
hist(x = result, breaks = 0:15,
     right = FALSE)

quantile(result, c(0.025, 0.975))
##  2.5% 97.5% 
##     0     5

5 Preprocessing

# Environment setup for Google Colaboratory
if (file.exists("/content")) {
  options(Ncpus = parallel::detectCores())
  installed_packages <- rownames(installed.packages())
  packages_to_install <- c("caret")
  install.packages(setdiff(packages_to_install, installed_packages))
}

5.1 Reading Data

library(tidyverse)
system(str_c("wget https://raw.githubusercontent.com/taroyabuki",
             "/fromzero/master/data/exam.csv"))
my_df <- read_csv("exam.csv")
## Rows: 4 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): name, gender
## dbl (2): english, math
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Alternatively:
my_df <- read.csv("exam.csv",
  stringsAsFactors = FALSE)

my_df
##   name english math gender
## 1    A      60   70      f
## 2    B      90   80      m
## 3    C      70   90      m
## 4    D      90  100      f
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
                "/fromzero/master/data/exam.csv")
my_df <- read_csv(my_url)
## Rows: 4 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): name, gender
## dbl (2): english, math
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Alternatively:
my_df <- read.csv(my_url, stringsAsFactors = FALSE)
my_df2 <- read.csv(
  file = "exam.csv",
  stringsAsFactors = FALSE,
  row.names = 1)
my_df2
##   english math gender
## A      60   70      f
## B      90   80      m
## C      70   90      m
## D      90  100      f
my_df %>% write_csv("exam2.csv")
# Alternatively:
my_df %>% write.csv(
  file = "exam2.csv",
  row.names = FALSE)
my_df2 %>% write.csv("exam3.csv")
my_df <- read_csv(file = "exam.csv",
  locale = locale(encoding = "UTF-8"))
## Rows: 4 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): name, gender
## dbl (2): english, math
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Alternatively:
my_df <- read.csv(file = "exam.csv",
  stringsAsFactors = FALSE,
  fileEncoding = "UTF-8")
my_df %>% write_csv("exam2.csv")
# Alternatively:
my_df %>% write.csv(file = "exam2.csv", row.names = FALSE,
                    fileEncoding = "UTF-8")
my_url <- "https://taroyabuki.github.io/fromzero/exam.html"
my_tables <- xml2::read_html(my_url) %>% rvest::html_table()
my_tables
## [[1]]
## # A tibble: 5 × 5
##   X1    X2    X3      X4    X5    
##   <lgl> <chr> <chr>   <chr> <chr> 
## 1 NA    name  english math  gender
## 2 NA    A     60      70    f     
## 3 NA    B     90      80    m     
## 4 NA    C     70      90    m     
## 5 NA    D     90      100   f
tmp <- my_tables[[1]]
tmp
## # A tibble: 5 × 5
##   X1    X2    X3      X4    X5    
##   <lgl> <chr> <chr>   <chr> <chr> 
## 1 NA    name  english math  gender
## 2 NA    A     60      70    f     
## 3 NA    B     90      80    m     
## 4 NA    C     70      90    m     
## 5 NA    D     90      100   f
# Rename the columns using the data in the first row.
colnames(tmp) <- tmp[1, ]

# Remove the first row and the first column.
my_data <- tmp[-1, -1]
my_data
## # A tibble: 4 × 4
##   name  english math  gender
##   <chr> <chr>   <chr> <chr> 
## 1 A     60      70    f     
## 2 B     90      80    m     
## 3 C     70      90    m     
## 4 D     90      100   f
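
In the table scraped above, english and math are still character columns. readr's type_convert (readr is attached with the tidyverse) re-guesses the column types; a minimal sketch:

my_data <- my_data %>% type_convert() # english and math become numeric
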
library(jsonlite)
## 
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
## 
##     flatten
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
                "/fromzero/master/data/exam.json")
my_data <- fromJSON(my_url)
#my_data <- fromJSON("exam.json") # (when using a local file)
my_data
##   name english math gender
## 1    A      60   70      f
## 2    B      90   80      m
## 3    C      70   90      m
## 4    D      90  100      f
library(xml2)
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
                "/fromzero/master/data/exam.xml")
my_xml <- read_xml(my_url)      # read the XML data
#my_xml <- read_xml("exam.xml") # (when using a local file)
xml_ns(my_xml)                  # check the namespace (d1)
## d1 <-> https://www.example.net/ns/1.0
my_records <- xml_find_all(my_xml, ".//d1:record")
f <- function(record) {
  tmp <- xml_attrs(record)                    # Extract all the attributes,
  xml_children(record) %>% walk(function(e) {
    tmp[xml_name(e)] <<- xml_text(e)          # then add each child element's name and text.
  })
  tmp
}
my_data <- my_records %>% map_dfr(f)
my_data$english <- as.numeric(my_data$english)
my_data$math    <- as.numeric(my_data$math)
my_data
## # A tibble: 4 × 4
##   english  math gender name 
##     <dbl> <dbl> <chr>  <chr>
## 1      60    70 f      A    
## 2      90    80 m      B    
## 3      70    90 m      C    
## 4      90   100 f      D

5.2 Transforming Data

x1 <- c(1, 2, 3)

z1 <- scale(x1)
# Alternatively:
z1 <- (x1 - mean(x1)) / sd(x1)

z1
## [1] -1  0  1
c(mean(z1), sd(z1))
## [1] 0 1
z1 * sd(x1) + mean(x1)
## [1] 1 2 3
x2 <- c(1, 3, 5)
z2 <- (x2 - mean(x1)) / sd(x1)
c(mean(z2), sd(z2))
## [1] 1 2
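
The same train-then-apply standardization can be done with caret's preProcess, which stores the training mean and standard deviation and applies them to new data; a minimal sketch:

library(caret)
my_pp <- preProcess(data.frame(x = x1),        # learn mean and sd from x1
                    method = c("center", "scale"))
z2 <- predict(my_pp, data.frame(x = x2))$x     # standardize x2 with x1's statistics
c(mean(z2), sd(z2))
## [1] 1 2
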
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:future':
## 
##     cluster
## The following object is masked from 'package:purrr':
## 
##     lift
library(tidyverse)

my_df <- data.frame(
  id = c(1, 2, 3),
  class = as.factor(
    c("A", "B", "C")))

my_enc <- my_df %>%
  dummyVars(formula = ~ .)

my_enc %>% predict(my_df)
##   id class.A class.B class.C
## 1  1       1       0       0
## 2  2       0       1       0
## 3  3       0       0       1
my_df2 <- data.frame(
  id =    c( 4 ,  5 ,  6 ),
  class = c("B", "C", "B"))
my_enc %>% predict(my_df2)
##   id class.A class.B class.C
## 1  4       0       1       0
## 2  5       0       0       1
## 3  6       0       1       0
my_enc <- my_df %>%
  dummyVars(formula = ~ .,
            fullRank = TRUE)
my_enc %>% predict(my_df)
##   id class.B class.C
## 1  1       0       0
## 2  2       1       0
## 3  3       0       1
my_enc %>% predict(my_df2)
##   id class.B class.C
## 1  4       1       0
## 2  5       0       1
## 3  6       1       0

6 Machine Learning: Purpose, Data, and Methods

6.1 The Purpose of Machine Learning (in This Book)

6.2 Data for Machine Learning

iris
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1            5.1         3.5          1.4         0.2     setosa
## 2            4.9         3.0          1.4         0.2     setosa
## 3            4.7         3.2          1.3         0.2     setosa
## 4            4.6         3.1          1.5         0.2     setosa
## 5            5.0         3.6          1.4         0.2     setosa
## 6            5.4         3.9          1.7         0.4     setosa
## 7            4.6         3.4          1.4         0.3     setosa
## 8            5.0         3.4          1.5         0.2     setosa
## 9            4.4         2.9          1.4         0.2     setosa
## 10           4.9         3.1          1.5         0.1     setosa
## 11           5.4         3.7          1.5         0.2     setosa
## 12           4.8         3.4          1.6         0.2     setosa
## 13           4.8         3.0          1.4         0.1     setosa
## 14           4.3         3.0          1.1         0.1     setosa
## 15           5.8         4.0          1.2         0.2     setosa
## 16           5.7         4.4          1.5         0.4     setosa
## 17           5.4         3.9          1.3         0.4     setosa
## 18           5.1         3.5          1.4         0.3     setosa
## 19           5.7         3.8          1.7         0.3     setosa
## 20           5.1         3.8          1.5         0.3     setosa
## 21           5.4         3.4          1.7         0.2     setosa
## 22           5.1         3.7          1.5         0.4     setosa
## 23           4.6         3.6          1.0         0.2     setosa
## 24           5.1         3.3          1.7         0.5     setosa
## 25           4.8         3.4          1.9         0.2     setosa
## 26           5.0         3.0          1.6         0.2     setosa
## 27           5.0         3.4          1.6         0.4     setosa
## 28           5.2         3.5          1.5         0.2     setosa
## 29           5.2         3.4          1.4         0.2     setosa
## 30           4.7         3.2          1.6         0.2     setosa
## 31           4.8         3.1          1.6         0.2     setosa
## 32           5.4         3.4          1.5         0.4     setosa
## 33           5.2         4.1          1.5         0.1     setosa
## 34           5.5         4.2          1.4         0.2     setosa
## 35           4.9         3.1          1.5         0.2     setosa
## 36           5.0         3.2          1.2         0.2     setosa
## 37           5.5         3.5          1.3         0.2     setosa
## 38           4.9         3.6          1.4         0.1     setosa
## 39           4.4         3.0          1.3         0.2     setosa
## 40           5.1         3.4          1.5         0.2     setosa
## 41           5.0         3.5          1.3         0.3     setosa
## 42           4.5         2.3          1.3         0.3     setosa
## 43           4.4         3.2          1.3         0.2     setosa
## 44           5.0         3.5          1.6         0.6     setosa
## 45           5.1         3.8          1.9         0.4     setosa
## 46           4.8         3.0          1.4         0.3     setosa
## 47           5.1         3.8          1.6         0.2     setosa
## 48           4.6         3.2          1.4         0.2     setosa
## 49           5.3         3.7          1.5         0.2     setosa
## 50           5.0         3.3          1.4         0.2     setosa
## 51           7.0         3.2          4.7         1.4 versicolor
## 52           6.4         3.2          4.5         1.5 versicolor
## 53           6.9         3.1          4.9         1.5 versicolor
## 54           5.5         2.3          4.0         1.3 versicolor
## 55           6.5         2.8          4.6         1.5 versicolor
## 56           5.7         2.8          4.5         1.3 versicolor
## 57           6.3         3.3          4.7         1.6 versicolor
## 58           4.9         2.4          3.3         1.0 versicolor
## 59           6.6         2.9          4.6         1.3 versicolor
## 60           5.2         2.7          3.9         1.4 versicolor
## 61           5.0         2.0          3.5         1.0 versicolor
## 62           5.9         3.0          4.2         1.5 versicolor
## 63           6.0         2.2          4.0         1.0 versicolor
## 64           6.1         2.9          4.7         1.4 versicolor
## 65           5.6         2.9          3.6         1.3 versicolor
## 66           6.7         3.1          4.4         1.4 versicolor
## 67           5.6         3.0          4.5         1.5 versicolor
## 68           5.8         2.7          4.1         1.0 versicolor
## 69           6.2         2.2          4.5         1.5 versicolor
## 70           5.6         2.5          3.9         1.1 versicolor
## 71           5.9         3.2          4.8         1.8 versicolor
## 72           6.1         2.8          4.0         1.3 versicolor
## 73           6.3         2.5          4.9         1.5 versicolor
## 74           6.1         2.8          4.7         1.2 versicolor
## 75           6.4         2.9          4.3         1.3 versicolor
## 76           6.6         3.0          4.4         1.4 versicolor
## 77           6.8         2.8          4.8         1.4 versicolor
## 78           6.7         3.0          5.0         1.7 versicolor
## 79           6.0         2.9          4.5         1.5 versicolor
## 80           5.7         2.6          3.5         1.0 versicolor
## 81           5.5         2.4          3.8         1.1 versicolor
## 82           5.5         2.4          3.7         1.0 versicolor
## 83           5.8         2.7          3.9         1.2 versicolor
## 84           6.0         2.7          5.1         1.6 versicolor
## 85           5.4         3.0          4.5         1.5 versicolor
## 86           6.0         3.4          4.5         1.6 versicolor
## 87           6.7         3.1          4.7         1.5 versicolor
## 88           6.3         2.3          4.4         1.3 versicolor
## 89           5.6         3.0          4.1         1.3 versicolor
## 90           5.5         2.5          4.0         1.3 versicolor
## 91           5.5         2.6          4.4         1.2 versicolor
## 92           6.1         3.0          4.6         1.4 versicolor
## 93           5.8         2.6          4.0         1.2 versicolor
## 94           5.0         2.3          3.3         1.0 versicolor
## 95           5.6         2.7          4.2         1.3 versicolor
## 96           5.7         3.0          4.2         1.2 versicolor
## 97           5.7         2.9          4.2         1.3 versicolor
## 98           6.2         2.9          4.3         1.3 versicolor
## 99           5.1         2.5          3.0         1.1 versicolor
## 100          5.7         2.8          4.1         1.3 versicolor
## 101          6.3         3.3          6.0         2.5  virginica
## 102          5.8         2.7          5.1         1.9  virginica
## 103          7.1         3.0          5.9         2.1  virginica
## 104          6.3         2.9          5.6         1.8  virginica
## 105          6.5         3.0          5.8         2.2  virginica
## 106          7.6         3.0          6.6         2.1  virginica
## 107          4.9         2.5          4.5         1.7  virginica
## 108          7.3         2.9          6.3         1.8  virginica
## 109          6.7         2.5          5.8         1.8  virginica
## 110          7.2         3.6          6.1         2.5  virginica
## 111          6.5         3.2          5.1         2.0  virginica
## 112          6.4         2.7          5.3         1.9  virginica
## 113          6.8         3.0          5.5         2.1  virginica
## 114          5.7         2.5          5.0         2.0  virginica
## 115          5.8         2.8          5.1         2.4  virginica
## 116          6.4         3.2          5.3         2.3  virginica
## 117          6.5         3.0          5.5         1.8  virginica
## 118          7.7         3.8          6.7         2.2  virginica
## 119          7.7         2.6          6.9         2.3  virginica
## 120          6.0         2.2          5.0         1.5  virginica
## 121          6.9         3.2          5.7         2.3  virginica
## 122          5.6         2.8          4.9         2.0  virginica
## 123          7.7         2.8          6.7         2.0  virginica
## 124          6.3         2.7          4.9         1.8  virginica
## 125          6.7         3.3          5.7         2.1  virginica
## 126          7.2         3.2          6.0         1.8  virginica
## 127          6.2         2.8          4.8         1.8  virginica
## 128          6.1         3.0          4.9         1.8  virginica
## 129          6.4         2.8          5.6         2.1  virginica
## 130          7.2         3.0          5.8         1.6  virginica
## 131          7.4         2.8          6.1         1.9  virginica
## 132          7.9         3.8          6.4         2.0  virginica
## 133          6.4         2.8          5.6         2.2  virginica
## 134          6.3         2.8          5.1         1.5  virginica
## 135          6.1         2.6          5.6         1.4  virginica
## 136          7.7         3.0          6.1         2.3  virginica
## 137          6.3         3.4          5.6         2.4  virginica
## 138          6.4         3.1          5.5         1.8  virginica
## 139          6.0         3.0          4.8         1.8  virginica
## 140          6.9         3.1          5.4         2.1  virginica
## 141          6.7         3.1          5.6         2.4  virginica
## 142          6.9         3.1          5.1         2.3  virginica
## 143          5.8         2.7          5.1         1.9  virginica
## 144          6.8         3.2          5.9         2.3  virginica
## 145          6.7         3.3          5.7         2.5  virginica
## 146          6.7         3.0          5.2         2.3  virginica
## 147          6.3         2.5          5.0         1.9  virginica
## 148          6.5         3.0          5.2         2.0  virginica
## 149          6.2         3.4          5.4         2.3  virginica
## 150          5.9         3.0          5.1         1.8  virginica
# Remainder omitted.

6.3 Methods for Machine Learning

7 Regression 1 (Simple Regression)

# Environment setup for Google Colaboratory
if (file.exists("/content")) {
  options(Ncpus = parallel::detectCores())
  installed_packages <- rownames(installed.packages())
  packages_to_install <- c("caret", "doParallel", "pastecs")
  install.packages(setdiff(packages_to_install, installed_packages))
}

7.1 Car Stopping Distances

7.2 Examining the Data

library(caret)
library(tidyverse)
my_data <- cars
dim(my_data)
## [1] 50  2
head(my_data)
##   speed dist
## 1     4    2
## 2     4   10
## 3     7    4
## 4     7   22
## 5     8   16
## 6     9   10
options(digits = 3)
pastecs::stat.desc(my_data)
##                speed    dist
## nbr.val       50.000   50.00
## nbr.null       0.000    0.00
## nbr.na         0.000    0.00
## min            4.000    2.00
## max           25.000  120.00
## range         21.000  118.00
## sum          770.000 2149.00
## median        15.000   36.00
## mean          15.400   42.98
## SE.mean        0.748    3.64
## CI.mean.0.95   1.503    7.32
## var           27.959  664.06
## std.dev        5.288   25.77
## coef.var       0.343    0.60
my_data %>%
  ggplot(aes(x = speed, y = dist)) +
  geom_point()

7.3 Regression Analysis

library(tidyverse)

my_data <- cars
tmp <- data.frame(speed = 21.5, dist = 67)
my_data %>% ggplot(aes(x = speed, y = dist)) +
  coord_cartesian(xlim = c(4, 25), ylim = c(0, 120)) +
  geom_point() +
  stat_smooth(formula = y ~ x, method = "lm") +
  geom_pointrange(data = tmp, aes(ymin = -9, ymax = dist),  linetype = "dotted") +
  geom_pointrange(data = tmp, aes(xmin =  0, xmax = speed), linetype = "dotted")

library(caret)
library(tidyverse)
my_data <- cars
my_model <- train(form = dist ~ speed, # model formula (relation between output and input variables)
                  data = my_data,      # data
                  method = "lm")       # method
coef(my_model$finalModel)
## (Intercept)       speed 
##      -17.58        3.93
tmp <- data.frame(speed = 21.5)
my_model %>% predict(tmp)
##  1 
## 67
f <- function(x) { my_model %>% predict(data.frame(speed = x)) }
my_data %>%
  ggplot(aes(x = speed, y = dist,
             color = "data")) +
  geom_point() +
  stat_function(
    fun = f,
    mapping = aes(color = "model"))

7.4 Measures of Goodness of Fit

library(caret)
library(tidyverse)
my_data <- cars
my_model <- train(form = dist ~ speed, data = my_data, method = "lm")

y  <- my_data$dist
y_ <- my_model %>% predict(my_data)
my_data$y_ <- y_
my_data$residual <- y - y_
head(my_data)
##   speed dist    y_ residual
## 1     4    2 -1.85     3.85
## 2     4   10 -1.85    11.85
## 3     7    4  9.95    -5.95
## 4     7   22  9.95    12.05
## 5     8   16 13.88     2.12
## 6     9   10 17.81    -7.81
my_data %>%
  ggplot(aes(x = speed, y = dist)) +
  geom_point() +
  geom_line(aes(x = speed, y = y_)) +
  geom_linerange(mapping = aes(ymin = y_, ymax = dist), linetype = "dotted")

RMSE(y_, y)
## [1] 15.1
# Alternatively:
mean((my_data$residual^2))**0.5
## [1] 15.1
R2(pred = y_, obs = y,
   form = "traditional")
## [1] 0.651
R2(pred = y_, obs = y,
   form = "corr")
## [1] 0.651
# Alternatively:
summary(my_model$finalModel)$r.squared
## [1] 0.651
my_test <- my_data[1:3, ]
y  <- my_test$dist
y_ <- my_model %>% predict(my_test)

R2(pred = y_, obs = y,
   form = "traditional")
## [1] -4.5
R2(pred = y_, obs = y,
   form = "corr")
## [1] 0.0769
library(caret)
library(tidyverse)
my_data <- cars
my_idx <- c(2, 11, 27, 34, 39, 44)
my_sample <- my_data[my_idx, ]
options(warn = -1) # Suppress warnings from here on.
my_model <- train(form = dist ~ poly(speed, degree = 5, raw = TRUE),
                  data = my_sample,
                  method = "lm")
options(warn = 0)  # Show warnings from here on.

y  <- my_sample$dist
y_ <- my_model %>% predict(my_sample)
RMSE(y_, y)
## [1] 1.07e-10
R2(pred = y_, obs = y,
   form = "traditional")
## [1] 1
R2(pred = y_, obs = y,
   form = "corr")
## [1] 1
f <- function(x) { my_model %>% predict(data.frame(speed = x)) }

my_data %>%
  ggplot(aes(x = speed, y = dist, color = "data")) +
  geom_point() +
  geom_point(data = my_sample, mapping = aes(color = "sample")) +
  stat_function(fun = f, mapping = aes(color = "model")) +
  coord_cartesian(ylim = c(0, 120))

7.5 K-Nearest Neighbors

# Setup
library(caret)
library(tidyverse)
my_data <- cars

# Training
my_model <- train(form = dist ~ speed, data = my_data, method = "knn")

# Prepare for visualization
f <- function(x) { my_model %>% predict(data.frame(speed = x))}
my_data %>%
  ggplot(aes(x = speed,
             y = dist,
             color = "data")) +
  geom_point() +
  stat_function(
    fun = f,
    mapping = aes(color = "model"))

y  <- my_data$dist
y_ <- my_model %>% predict(my_data)

RMSE(y_, y)
## [1] 14
R2(pred = y_, obs = y,
   form = "traditional")
## [1] 0.7
R2(pred = y_, obs = y,
   form = "corr")
## [1] 0.702

7.6 Validation

library(caret)
library(tidyverse)
my_data <- cars
my_model <- train(form = dist ~ speed, data = my_data, method = "lm")

my_model$results
##   intercept RMSE Rsquared  MAE RMSESD RsquaredSD MAESD
## 1      TRUE 15.1    0.697 12.1   2.41      0.122  1.67
my_model <- train(form = dist ~ speed, data = my_data, method = "lm",
                  trControl = trainControl(method = "cv", number = 5))
my_model$results
##   intercept RMSE Rsquared  MAE RMSESD RsquaredSD MAESD
## 1      TRUE 15.6     0.68 12.1   2.39       0.18  1.59
my_model <- train(form = dist ~ speed, data = my_data, method = "lm",
                  trControl = trainControl(method = "LOOCV"))
my_model$results
##   intercept RMSE Rsquared  MAE
## 1      TRUE 15.7    0.622 12.1
library(doParallel)
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loading required package: iterators
## Loading required package: parallel
cl <- makeCluster(detectCores())
registerDoParallel(cl)
library(caret)
library(tidyverse)
my_data <- cars
my_model <- train(form = dist ~ speed, data = my_data, method = "lm")
y  <- my_data$dist
y_ <- my_model %>% predict(my_data)
# RMSE (training)
RMSE(y_, y)
## [1] 15.1
# coefficient of determination 1 (training)
R2(pred = y_, obs = y,
   form = "traditional")
## [1] 0.651
# coefficient of determination 6 (training)
R2(pred = y_, obs = y,
   form = "corr")
## [1] 0.651
postResample(pred = y_, obs = y)
##     RMSE Rsquared      MAE 
##   15.069    0.651   11.580
my_model <- train(form = dist ~ speed, data = my_data, method = "lm")
my_model$results
##   intercept RMSE Rsquared  MAE RMSESD RsquaredSD MAESD
## 1      TRUE 14.4    0.701 11.4   2.59     0.0923  1.78
# From left: RMSE (validation), coefficient of determination 6 (validation), MAE (validation)
my_model <- train(form = dist ~ speed, data = my_data, method = "lm",
                  trControl = trainControl(method = "LOOCV"))

# Method 1
my_model$results
##   intercept RMSE Rsquared  MAE
## 1      TRUE 15.7    0.622 12.1
# Method 2
y  <- my_model$pred$obs
y_ <- my_model$pred$pred
mean((y - y_)^2)**0.5
## [1] 15.7
mean(((y - y_)^2)**0.5)
## [1] 12.1
library(caret)
library(tidyverse)
my_data <- cars

my_lm_model <- train(form = dist ~ speed, data = my_data, method = "lm",
                     trControl = trainControl(method = "LOOCV"))

my_knn_model <- train(form = dist ~ speed, data = my_data, method = "knn",
                      tuneGrid = data.frame(k = 5),
                      trControl = trainControl(method = "LOOCV"))
my_lm_model$results$RMSE
## [1] 15.7
my_knn_model$results$RMSE
## [1] 15.8
y     <- my_data$dist
y_lm  <- my_lm_model$pred$pred
y_knn <- my_knn_model$pred$pred

my_df <- data.frame(
  lm  = (y - y_lm)^2,
  knn = (y - y_knn)^2)

head(my_df)
##       lm    knn
## 1  18.91 108.16
## 2 179.22   0.64
## 3  41.03 175.56
## 4 168.49  49.00
## 5   5.09   9.00
## 6  67.62 112.89
boxplot(my_df, ylab = "r^2")

t.test(x = my_df$lm, y = my_df$knn,
       conf.level = 0.95,
       paired = TRUE,
       alternative = "two.sided")
## 
##  Paired t-test
## 
## data:  my_df$lm and my_df$knn
## t = -0.1, df = 49, p-value = 0.9
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -53.5  47.0
## sample estimates:
## mean of the differences 
##                   -3.21

7.7 Parameter Tuning

library(caret)
library(tidyverse)
my_data <- cars
my_model <- train(form = dist ~ speed, data = my_data, method = "knn")
my_model$results
##   k RMSE Rsquared  MAE RMSESD RsquaredSD MAESD
## 1 5 16.5    0.618 13.0   3.37      0.140  2.29
## 2 7 16.9    0.601 13.0   3.45      0.130  2.59
## 3 9 17.2    0.579 13.2   3.52      0.128  2.54
my_params <- expand.grid(k = 1:15)

my_model <- train(form = dist ~ speed, data = my_data, method = "knn",
                  tuneGrid = my_params,
                  trControl = trainControl(method = "LOOCV"))
head(my_model$results)
##   k RMSE Rsquared  MAE
## 1 1 17.2    0.578 13.8
## 2 2 16.8    0.594 13.0
## 3 3 16.3    0.622 12.7
## 4 4 16.0    0.609 12.3
## 5 5 15.8    0.617 12.0
## 6 6 16.0    0.608 12.3
ggplot(my_model)

my_model$bestTune
##   k
## 5 5
my_model$results %>%
  filter(RMSE == min(RMSE))
##   k RMSE Rsquared MAE
## 1 5 15.8    0.617  12
y  <- my_data$dist
y_ <- my_model %>% predict(my_data)
RMSE(y_, y)
## [1] 14
library(caret)
library(tidyverse)
my_data <- cars

my_loocv <- function(k) {
  my_model <- train(form = dist ~ speed, data = my_data, method = "knn",
                    tuneGrid = data.frame(k = k),
                    trControl = trainControl(method = "LOOCV"))
  y  <- my_data$dist
  y_ <- my_model %>% predict(my_data)
  list(k = k,
       training = RMSE(y_, y),             # RMSE (training)
       validation = my_model$results$RMSE) # RMSE (validation)
}

my_results <- 1:15 %>% map_dfr(my_loocv)
my_results %>%
  pivot_longer(-k) %>%
  ggplot(aes(x = k, y = value,
             color = name)) +
  geom_line() + geom_point() +
  xlab("#Neighbors") + ylab("RMSE") +
  theme(legend.position = c(1, 0),
        legend.justification = c(1, 0))

# Stop the parallel cluster.
stopCluster(cl)
registerDoSEQ()

8 Regression 2 (Multiple Regression)

# Environment setup for Google Colaboratory
if (file.exists("/content")) {
  options(Ncpus = parallel::detectCores())
  installed_packages <- rownames(installed.packages())
  packages_to_install <- c("caret", "ggfortify", "glmnet", "glmnetUtils", "leaps", "neuralnet", "psych")
  install.packages(setdiff(packages_to_install, installed_packages))
}

8.1 Grape-Growing Conditions and Wine Prices

library(tidyverse)
my_url <- "http://www.liquidasset.com/winedata.html"
tmp <- read.table(file = my_url,    # source to read
                  header = TRUE,    # the first row holds the variable names
                  na.strings = ".", # string that represents missing values
                  skip = 62,        # number of lines to skip
                  nrows = 38)       # number of rows to read
psych::describe(tmp)
##         vars  n    mean     sd  median trimmed    mad     min    max  range
## OBS        1 38   19.50  11.11   19.50   19.50  14.08    1.00   38.0  37.00
## VINT       2 38 1970.50  11.11 1970.50 1970.50  14.08 1952.00 1989.0  37.00
## LPRICE2    3 27   -1.45   0.63   -1.51   -1.49   0.72   -2.29    0.0   2.29
## WRAIN      4 38  605.00 135.28  586.50  603.06 174.95  376.00  845.0 469.00
## DEGREES    5 37   16.52   0.66   16.53   16.55   0.67   14.98   17.6   2.67
## HRAIN      6 38  137.00  66.74  120.50  132.19  59.30   38.00  292.0 254.00
## TIME_SV    7 38   12.50  11.11   12.50   12.50  14.08   -6.00   31.0  37.00
##          skew kurtosis    se
## OBS      0.00    -1.30  1.80
## VINT     0.00    -1.30  1.80
## LPRICE2  0.41    -0.82  0.12
## WRAIN    0.15    -1.10 21.95
## DEGREES -0.34    -0.73  0.11
## HRAIN    0.69    -0.20 10.83
## TIME_SV  0.00    -1.30  1.80
my_data <- na.omit(tmp[, -c(1, 2)])
head(my_data)
##   LPRICE2 WRAIN DEGREES HRAIN TIME_SV
## 1  -0.999   600    17.1   160      31
## 2  -0.454   690    16.7    80      30
## 4  -0.808   502    17.1   130      28
## 6  -1.509   420    16.1   110      26
## 7  -1.717   582    16.4   187      25
## 8  -0.418   485    17.5   187      24
dim(my_data)
## [1] 27  5
my_data %>% write_csv("wine.csv")
#my_data <- read_csv("wine.csv") # when using the file created above
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
                "/fromzero/master/data/wine.csv")
my_data <- read_csv(my_url)
## Rows: 27 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (5): LPRICE2, WRAIN, DEGREES, HRAIN, TIME_SV
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

8.2 Multiple Regression Analysis

library(caret)
library(tidyverse)
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
                "/fromzero/master/data/wine.csv")
my_data <- read_csv(my_url)
## Rows: 27 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (5): LPRICE2, WRAIN, DEGREES, HRAIN, TIME_SV
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
my_model <- train(form = LPRICE2 ~ WRAIN + DEGREES + HRAIN + TIME_SV,
                  data = my_data,
                  method = "lm",
                  trControl = trainControl(method = "LOOCV"))
coef(my_model$finalModel) %>%
  as.data.frame
##                     .
## (Intercept) -12.14533
## WRAIN         0.00117
## DEGREES       0.61639
## HRAIN        -0.00386
## TIME_SV       0.02385
my_test <- data.frame(
  WRAIN = 500, DEGREES = 17,
  HRAIN = 120, TIME_SV = 2)
my_model %>% predict(my_test)
##    1 
## -1.5
y  <- my_data$LPRICE2
y_ <- my_model %>% predict(my_data)

RMSE(y_, y)
## [1] 0.259
R2(pred = y_, obs = y,
   form = "traditional")
## [1] 0.828
R2(pred = y_, obs = y,
   form = "corr")
## [1] 0.828
my_model$results
##   intercept  RMSE Rsquared   MAE
## 1      TRUE 0.323    0.736 0.277
M <- my_data[, -1] %>%
  mutate(b0 = 1) %>% as.matrix
b <- MASS::ginv(M) %*% y
matrix(b,
       dimnames = list(colnames(M)))
##              [,1]
## WRAIN     0.00117
## DEGREES   0.61639
## HRAIN    -0.00386
## TIME_SV   0.02385
## b0      -12.14533
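
As a cross-check, the pseudo-inverse solution coincides with solving the normal equations t(M) M b = t(M) y directly; the following should reproduce the coefficients above (a sketch, not from the book):

solve(t(M) %*% M, t(M) %*% y) # ordinary least squares via the normal equations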

8.3 Standardization

library(caret)
library(tidyverse)
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
                "/fromzero/master/data/wine.csv")
my_data <- read_csv(my_url)
## Rows: 27 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (5): LPRICE2, WRAIN, DEGREES, HRAIN, TIME_SV
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
my_data %>%
  mutate_if(is.numeric, scale) %>% # standardize the numeric columns
  pivot_longer(-LPRICE2) %>%
  ggplot(aes(x = name, y = value)) +
  geom_boxplot() +
  stat_summary(fun = mean, geom = "point", size = 3) +
  xlab(NULL)

my_model <- train(
  form = LPRICE2 ~ .,
  data = my_data,
  method = "lm",
  preProcess = c("center", "scale"))
coef(my_model$finalModel) %>%
  as.data.frame
##                  .
## (Intercept) -1.452
## WRAIN        0.151
## DEGREES      0.406
## HRAIN       -0.282
## TIME_SV      0.197
my_test <- data.frame(
  WRAIN = 500, DEGREES = 17,
  HRAIN = 120, TIME_SV = 2)
my_model %>% predict(my_test)
##    1 
## -1.5
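
Because only the inputs are standardized (the response is left alone), each standardized coefficient should be the raw coefficient from 8.2 multiplied by that input's standard deviation, and the prediction for my_test is unchanged. A sketch of the check (an illustration, not book code):

0.00117 * sd(my_data$WRAIN) # should be roughly 0.151, the standardized WRAIN coefficient
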

8.4 Number of Input Variables and Model Quality

library(caret)
library(tidyverse)
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
                "/fromzero/master/data/wine.csv")
my_data <- read_csv(my_url)
## Rows: 27 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (5): LPRICE2, WRAIN, DEGREES, HRAIN, TIME_SV
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
n <- nrow(my_data)
my_data2 <- my_data %>% mutate(v1 = 0:(n - 1) %% 2,
                               v2 = 0:(n - 1) %% 3)
head(my_data2)
## # A tibble: 6 × 7
##   LPRICE2 WRAIN DEGREES HRAIN TIME_SV    v1    v2
##     <dbl> <dbl>   <dbl> <dbl>   <dbl> <dbl> <dbl>
## 1  -0.999   600    17.1   160      31     0     0
## 2  -0.454   690    16.7    80      30     1     1
## 3  -0.808   502    17.2   130      28     0     2
## 4  -1.51    420    16.1   110      26     1     0
## 5  -1.72    582    16.4   187      25     0     1
## 6  -0.418   485    17.5   187      24     1     2
my_model2 <- train(form = LPRICE2 ~ ., data = my_data2, method = "lm",
                   trControl = trainControl(method = "LOOCV"))
y  <- my_data2$LPRICE2
y_ <- my_model2 %>% predict(my_data2)

RMSE(y_, y)
## [1] 0.256
my_model2$results$RMSE
## [1] 0.357
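
The junk inputs v1 and v2 improve the training RMSE slightly (0.256 vs. 0.259 in 8.2) while worsening the validation RMSE (0.357 vs. 0.323 in 8.2), which is the point of this section. To see both models side by side, one can refit without v1 and v2 (a sketch; my_model1 is an illustrative name):

my_model1 <- train(form = LPRICE2 ~ ., data = my_data, method = "lm",
                   trControl = trainControl(method = "LOOCV"))
my_model1$results$RMSE # LOOCV RMSE without v1 and v2 (about 0.32)
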

8.5 Variable Selection

library(caret)
library(tidyverse)
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
                "/fromzero/master/data/wine.csv")
my_data <- read_csv(my_url)
## Rows: 27 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (5): LPRICE2, WRAIN, DEGREES, HRAIN, TIME_SV
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
n <- nrow(my_data)
my_data2 <- my_data %>% mutate(v1 = 0:(n - 1) %% 2,
                               v2 = 0:(n - 1) %% 3)
my_model <- train(form = LPRICE2 ~ .,
                  data = my_data2,
                  method = "leapForward", # 変数増加法
                  trControl = trainControl(method = "LOOCV"),
                  tuneGrid = data.frame(nvmax = 1:6)) # 選択する変数の上限
summary(my_model$finalModel)$outmat
##          WRAIN DEGREES HRAIN TIME_SV v1  v2 
## 1  ( 1 ) " "   "*"     " "   " "     " " " "
## 2  ( 1 ) " "   "*"     "*"   " "     " " " "
## 3  ( 1 ) " "   "*"     "*"   "*"     " " " "
## 4  ( 1 ) "*"   "*"     "*"   "*"     " " " "
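
Forward selection never picks the junk inputs v1 and v2, as the output matrix shows. The subset size actually chosen by LOOCV is stored in the fitted object (a usage sketch; output not shown here):

my_model$bestTune # the nvmax selected by LOOCV
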

8.6 Supplement: Regularization

library(caret)
library(tidyverse)
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
                "/fromzero/master/data/wine.csv")
my_data <- read_csv(my_url)
## Rows: 27 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (5): LPRICE2, WRAIN, DEGREES, HRAIN, TIME_SV
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
A <- 2
B <- 0.1

my_model <- train(
  form = LPRICE2 ~ .,
  data = my_data,
  method = "glmnet",
  standardize = TRUE,
  tuneGrid = data.frame(
    lambda = A,
    alpha = B))
coef(my_model$finalModel, A)
## 5 x 1 sparse Matrix of class "dgCMatrix"
##                    s1
## (Intercept) -2.801552
## WRAIN        .       
## DEGREES      0.083291
## HRAIN       -0.000415
## TIME_SV      0.002310
my_test <- data.frame(
  WRAIN = 500, DEGREES = 17,
  HRAIN = 120, TIME_SV = 2)
my_model %>% predict(my_test)
## [1] -1.43
library(ggfortify)
library(glmnetUtils)

my_data2 <- my_data %>% scale %>%
  as.data.frame

B <- 0.1

glmnet(
  form = LPRICE2 ~ .,
  data = my_data2,
  alpha = B) %>%
  autoplot(xvar = "lambda") +
  xlab("log A ( = log lambda)") +
  theme(legend.position = c(0.15, 0.25))

As <- seq(0, 0.1, length.out = 21)
Bs <- seq(0, 0.1, length.out =  6)

my_model <- train(
  form = LPRICE2 ~ ., data = my_data, method = "glmnet", standardize = TRUE,
  trControl = trainControl(method = "LOOCV"),
  tuneGrid = expand.grid(lambda = As, alpha  = Bs))

my_model$bestTune
##   alpha lambda
## 8     0  0.035
tmp <- "B ( = alpha)"
ggplot(my_model) +
  theme(legend.position = c(0, 1), legend.justification = c(0, 1)) +
  xlab("A ( = lambda)") +
  guides(shape = guide_legend(tmp), color = guide_legend(tmp))

my_model$results %>%
  filter(RMSE == min(RMSE))
##   alpha lambda RMSE Rsquared   MAE
## 1     0  0.000 0.32    0.737 0.278
## 2     0  0.005 0.32    0.737 0.278
## 3     0  0.010 0.32    0.737 0.278
## 4     0  0.015 0.32    0.737 0.278
## 5     0  0.020 0.32    0.737 0.278
## 6     0  0.025 0.32    0.737 0.278
## 7     0  0.030 0.32    0.737 0.278
## 8     0  0.035 0.32    0.737 0.278
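
Prediction with the tuned model works as before; caret refits glmnet with the best lambda and alpha. A sketch, reusing my_test from above (output not shown here):

my_model %>% predict(my_test)
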

8.7 Neural Networks

curve(1 / (1 + exp(-x)), -6, 6)

library(caret)
library(tidyverse)
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
                "/fromzero/master/data/wine.csv")
my_data <- read_csv(my_url)
## Rows: 27 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (5): LPRICE2, WRAIN, DEGREES, HRAIN, TIME_SV
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
my_model <- train(form = LPRICE2 ~ .,
                  data = my_data,
                  method = "neuralnet",              # ニューラルネットワーク
                  preProcess = c("center", "scale"), # 標準化
                  trControl = trainControl(method = "LOOCV"))
plot(my_model$finalModel) # 訓練済ネットワークの描画
my_model$results
##   layer1 layer2 layer3  RMSE Rsquared   MAE
## 1      1      0      0 0.315    0.749 0.271
## 2      3      0      0 0.329    0.741 0.267
## 3      5      0      0 0.331    0.772 0.270
my_model <- train(
  form = LPRICE2 ~ .,
  data = my_data,
  method = "neuralnet",
  preProcess = c("center", "scale"),
  trControl = trainControl(method = "LOOCV"),
  tuneGrid = expand.grid(layer1 = 1:5,
                         layer2 = 0:2,
                         layer3 = 0))
my_model$results %>%
  filter(RMSE == min(RMSE))
##   layer1 layer2 layer3  RMSE Rsquared   MAE
## 1      3      0      0 0.298    0.784 0.225

9 Classification 1 (Multiclass Classification)

# Environment setup for Google Colaboratory
if (file.exists("/content")) {
  options(Ncpus = parallel::detectCores())
  installed_packages <- rownames(installed.packages())
  packages_to_install <- c("caret", "furrr", "psych", "randomForest", "rpart.plot", "xgboost")
  install.packages(setdiff(packages_to_install, installed_packages))
}

9.1 The Iris Data

my_data <- iris
head(my_data)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
psych::describe(my_data)
##              vars   n mean   sd median trimmed  mad min max range  skew
## Sepal.Length    1 150 5.84 0.83   5.80    5.81 1.04 4.3 7.9   3.6  0.31
## Sepal.Width     2 150 3.06 0.44   3.00    3.04 0.44 2.0 4.4   2.4  0.31
## Petal.Length    3 150 3.76 1.77   4.35    3.76 1.85 1.0 6.9   5.9 -0.27
## Petal.Width     4 150 1.20 0.76   1.30    1.18 1.04 0.1 2.5   2.4 -0.10
## Species*        5 150 2.00 0.82   2.00    2.00 1.48 1.0 3.0   2.0  0.00
##              kurtosis   se
## Sepal.Length    -0.61 0.07
## Sepal.Width      0.14 0.04
## Petal.Length    -1.42 0.14
## Petal.Width     -1.36 0.06
## Species*        -1.52 0.07

9.2 Classification with Decision Trees

library(caret)
library(tidyverse)
my_data <- iris
my_model <- train(form = Species ~ ., data = my_data, method = "rpart")
rpart.plot::rpart.plot(my_model$finalModel, extra = 1)

my_test <- tribble(
~Sepal.Length, ~Sepal.Width, ~Petal.Length, ~Petal.Width,
          5.0,          3.5,           1.5,          0.5,
          6.5,          3.0,           5.0,          2.0)

my_model %>% predict(my_test)
## [1] setosa    virginica
## Levels: setosa versicolor virginica
my_model %>% predict(my_test,
                     type = "prob")
##   setosa versicolor virginica
## 1      1     0.0000     0.000
## 2      0     0.0217     0.978
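
These probabilities are the class proportions in the leaf each observation falls into. In the usual iris tree, the leaf for Petal.Width >= 1.75 contains 45 virginica and 1 versicolor, so (a sketch of the arithmetic, not book code):

c(45 / 46, 1 / 46) # 0.978 and 0.0217, matching the output above
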

9.3 Accuracy

library(caret)
library(tidyverse)
my_data <- iris
my_model <- train(form = Species ~ ., data = my_data, method = "rpart2")
## note: only 2 possible values of the max tree depth from the initial fit.
##  Truncating the grid to 2 .
y  <- my_data$Species
y_ <- my_model %>% predict(my_data)
confusionMatrix(data = y_, reference = y)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         49         5
##   virginica       0          1        45
## 
## Overall Statistics
##                                         
##                Accuracy : 0.96          
##                  95% CI : (0.915, 0.985)
##     No Information Rate : 0.333         
##     P-Value [Acc > NIR] : <2e-16        
##                                         
##                   Kappa : 0.94          
##                                         
##  Mcnemar's Test P-Value : NA            
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                  1.000             0.980            0.900
## Specificity                  1.000             0.950            0.990
## Pos Pred Value               1.000             0.907            0.978
## Neg Pred Value               1.000             0.990            0.952
## Prevalence                   0.333             0.333            0.333
## Detection Rate               0.333             0.327            0.300
## Detection Prevalence         0.333             0.360            0.307
## Balanced Accuracy            1.000             0.965            0.945
# the rest is omitted
y  <- my_data$Species
y_ <- my_model %>% predict(my_data)
mean(y_ == y)
## [1] 0.96
my_model <- train(form = Species ~ ., data = my_data, method = "rpart2",
                  trControl = trainControl(method = "LOOCV"))
## note: only 2 possible values of the max tree depth from the initial fit.
##  Truncating the grid to 2 .
my_model$results
##   maxdepth Accuracy Kappa
## 1        1    0.333  0.00
## 2        2    0.953  0.93
my_model <- train(form = Species ~ ., data = my_data, method = "rpart2",
                  tuneGrid = data.frame(maxdepth = 1:10),
                  trControl = trainControl(method = "LOOCV"))
my_model$results %>% filter(Accuracy == max(Accuracy))
##   maxdepth Accuracy Kappa
## 1        2    0.953  0.93
# a function that, given hyperparameters, returns the accuracy (LOOCV)
my_loocv <- function(maxdepth, minbucket, minsplit) {
  my_model <- train(form = Species ~ ., data = my_data, method = "rpart2",
                    trControl = trainControl(method = "LOOCV"),
                    tuneGrid = data.frame(maxdepth = maxdepth),
                    control = rpart::rpart.control(cp = 0.01,
                                                   minbucket = minbucket,
                                                   minsplit = minsplit))
  list(maxdepth = maxdepth,
       minbucket = minbucket,
       minsplit = minsplit,
       Accuracy = my_model$results$Accuracy)
}
my_params <- expand.grid(
  maxdepth = 2:5,
  minbucket = 1:7,
  minsplit = c(2, 20))

library(furrr)
plan(multisession) # prepare for parallel processing
my_results <- my_params %>% future_pmap_dfr(my_loocv, # run
  .options = furrr_options(seed = TRUE))

my_results %>% filter(Accuracy == max(Accuracy)) # max validation accuracy
## # A tibble: 6 × 4
##   maxdepth minbucket minsplit Accuracy
##      <int>     <int>    <dbl>    <dbl>
## 1        3         5        2    0.973
## 2        4         5        2    0.973
## 3        5         5        2    0.973
## 4        3         5       20    0.973
## 5        4         5       20    0.973
## 6        5         5       20    0.973
my_model <- train(form = Species ~ ., data = my_data, method = "rpart2",
                  trControl = trainControl(method = "none"),
                  tuneGrid = data.frame(maxdepth = 3),
                  control = rpart::rpart.control(cp = 0.01,
                                                 minbucket = 5,
                                                 minsplit = 2))
rpart.plot::rpart.plot(
  my_model$finalModel, extra = 1)

9.4 Methods Using Multiple Trees

library(caret)
library(tidyverse)
my_data <- iris

my_model <- train(form = Species ~ ., data = my_data, method = "rf",
                  tuneGrid = data.frame(mtry = 2:4), # optional
                  trControl = trainControl(method = "LOOCV"))
my_model$results
##   mtry Accuracy Kappa
## 1    2     0.96  0.94
## 2    3     0.96  0.94
## 3    4     0.96  0.94
my_model <- train(
  form = Species ~ ., data = my_data, method = "xgbTree",
  tuneGrid = expand.grid(
    nrounds          = c(50, 100, 150),
    max_depth        = c(1, 2, 3),
    eta              = c(0.3, 0.4),
    gamma            = 0,
    colsample_bytree = c(0.6, 0.8),
    min_child_weight = 1,
    subsample        = c(0.5, 0.75, 1)),
  trControl = trainControl(method = "cv", number = 5)) # 5分割交差検証
my_model$results %>% filter(Accuracy == max(Accuracy)) %>% head(5) %>% t
##                        1      2       3       4        5
## eta               0.3000  0.300  0.4000  0.4000   0.3000
## max_depth         1.0000  2.000  2.0000  3.0000   2.0000
## gamma             0.0000  0.000  0.0000  0.0000   0.0000
## colsample_bytree  0.8000  0.600  0.8000  0.8000   0.6000
## min_child_weight  1.0000  1.000  1.0000  1.0000   1.0000
## subsample         0.5000  0.750  0.7500  0.5000   0.7500
## nrounds          50.0000 50.000 50.0000 50.0000 100.0000
## Accuracy          0.9533  0.953  0.9533  0.9533   0.9533
## Kappa             0.9300  0.930  0.9300  0.9300   0.9300
## AccuracySD        0.0183  0.038  0.0183  0.0183   0.0183
## KappaSD           0.0274  0.057  0.0274  0.0274   0.0274
my_model <- train(form = Species ~ ., data = my_data, method = "rf")
ggplot(varImp(my_model))

9.5 Learning from Data with Missing Values

library(caret)
library(tidyverse)
my_data <- iris

n <- nrow(my_data)
my_data$Petal.Length[0:(n - 1) %% 10 == 0] <- NA
my_data$Petal.Width[ 0:(n - 1) %% 10 == 1] <- NA

psych::describe(my_data) # variables with n = 135 have 150 - 135 = 15 missing values.
##              vars   n mean   sd median trimmed  mad min max range  skew
## Sepal.Length    1 150 5.84 0.83    5.8    5.81 1.04 4.3 7.9   3.6  0.31
## Sepal.Width     2 150 3.06 0.44    3.0    3.04 0.44 2.0 4.4   2.4  0.31
## Petal.Length    3 135 3.75 1.76    4.3    3.75 1.78 1.0 6.9   5.9 -0.27
## Petal.Width     4 135 1.20 0.77    1.3    1.18 1.04 0.1 2.5   2.4 -0.09
## Species*        5 150 2.00 0.82    2.0    2.00 1.48 1.0 3.0   2.0  0.00
##              kurtosis   se
## Sepal.Length    -0.61 0.07
## Sepal.Width      0.14 0.04
## Petal.Length    -1.41 0.15
## Petal.Width     -1.35 0.07
## Species*        -1.52 0.07
my_model <- train(
  form = Species ~ ., data = my_data, method = "rpart2",
  na.action = na.pass,         # don't stop training because of missing values.
  preProcess = "medianImpute", # impute missing values with the median.
  trControl = trainControl(method = "LOOCV"),
  tuneGrid = data.frame(maxdepth = 20),          # upper bound on tree depth
  control = rpart::rpart.control(minsplit = 2,   # condition for splitting
                                 minbucket = 1)) # condition for leaf nodes
max(my_model$results$Accuracy)
## [1] 0.927
my_model <- train(form = Species ~ ., data = my_data, method = "xgbTree",
                  na.action = na.pass,
                  trControl = trainControl(method = "cv", number = 5))
max(my_model$results$Accuracy)
## [1] 0.96

9.6 Other Classification Methods

library(caret)
library(tidyverse)
my_data <- iris

my_model <- train(form = Species ~ ., data = my_data, method = "knn",
                  trControl = trainControl(method = "LOOCV"))
my_model$results %>% filter(Accuracy == max(Accuracy))
##   k Accuracy Kappa
## 1 5    0.967  0.95
## 2 7    0.967  0.95
## 3 9    0.967  0.95
library(caret)
library(tidyverse)
my_data <- iris

my_model <- train(form = Species ~ ., data = my_data, method = "nnet",
                  preProcess = c("center", "scale"), # 標準化
                  trControl = trainControl(method = "LOOCV"),
                  trace = FALSE) # 途中経過を表示しない
my_model$results %>% filter(Accuracy == max(Accuracy))
##   size decay Accuracy Kappa
## 1    3   0.1    0.973  0.96
## 2    5   0.1    0.973  0.96

10 Classification 2 (Binary Classification)

# Environment setup for Google Colaboratory
if (file.exists("/content")) {
  options(Ncpus = parallel::detectCores())
  installed_packages <- rownames(installed.packages())
  packages_to_install <- c("caret", "PRROC", "rpart.plot")
  install.packages(setdiff(packages_to_install, installed_packages))
}

10.1 Performance Metrics for Binary Classification

y       <- c(  0,   1,   1,   0,   1,   0,    1,   0,   0,   1)
y_score <- c(0.7, 0.8, 0.3, 0.4, 0.9, 0.6, 0.99, 0.1, 0.2, 0.5)
y_ <- ifelse(0.5 <= y_score, 1, 0)
y_
##  [1] 1 1 0 0 1 1 1 0 0 1
library(caret)
confusionMatrix(data      = as.factor(y_), # predictions
                reference = as.factor(y),  # ground truth
                positive = "1",            # treat "1" as the positive class.
                mode = "everything")       # compute all the metrics.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction 0 1
##          0 3 1
##          1 2 4
##                                         
##                Accuracy : 0.7           
##                  95% CI : (0.348, 0.933)
##     No Information Rate : 0.5           
##     P-Value [Acc > NIR] : 0.172         
##                                         
##                   Kappa : 0.4           
##                                         
##  Mcnemar's Test P-Value : 1.000         
##                                         
##             Sensitivity : 0.800         
##             Specificity : 0.600         
##          Pos Pred Value : 0.667         
##          Neg Pred Value : 0.750         
##               Precision : 0.667         
##                  Recall : 0.800         
##                      F1 : 0.727         
##              Prevalence : 0.500         
##          Detection Rate : 0.400         
##    Detection Prevalence : 0.600         
##       Balanced Accuracy : 0.700         
##                                         
##        'Positive' Class : 1             
## 
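
All of these metrics follow from the four cells of the confusion matrix. A hand computation (a sketch, not book code, with "1" as the positive class as above):

TP <- sum(y == 1 & y_ == 1) # true positives (4)
FP <- sum(y == 0 & y_ == 1) # false positives (2)
FN <- sum(y == 1 & y_ == 0) # false negatives (1)
TN <- sum(y == 0 & y_ == 0) # true negatives (3)
c(Accuracy  = (TP + TN) / length(y),       # 0.7
  Precision = TP / (TP + FP),              # 0.667
  Recall    = TP / (TP + FN),              # 0.8
  F1        = 2 * TP / (2 * TP + FP + FN)) # 0.727
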

10.2 Trade-offs

library(PRROC)
library(tidyverse)

y       <- c(  0,   1,   1,   0,   1,   0,    1,   0,   0,   1)
y_score <- c(0.7, 0.8, 0.3, 0.4, 0.9, 0.6, 0.99, 0.1, 0.2, 0.5)
y_      <- ifelse(0.5 <= y_score, 1, 0)

c(sum((y == 0) & (y_ == 1)) / sum(y == 0), # FPR
  sum((y == 1) & (y_ == 1)) / sum(y == 1)) # TPR
## [1] 0.4 0.8
my_roc <- roc.curve(scores.class0 = y_score[y == 1], # scores of the true-1 cases
                    scores.class1 = y_score[y == 0], # scores of the true-0 cases
                    curve = TRUE)
my_roc %>% plot(xlab = "False Positive Rate",
                ylab = "True Positive Rate",
                legend = FALSE)

my_roc$auc
## [1] 0.8
c(sum((y == 1) & (y_ == 1)) / sum(y  == 1), # Recall == TPR
  sum((y == 1) & (y_ == 1)) / sum(y_ == 1)) # Precision
## [1] 0.800 0.667
my_pr <- pr.curve(scores.class0 = y_score[y == 1],
                  scores.class1 = y_score[y == 0],
                  curve = TRUE)
my_pr %>% plot(xlab = "Recall",
               ylab = "Precision",
               legend = FALSE)

my_pr$auc.integral
## [1] 0.847
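
Both curves are traced out by sweeping the decision threshold: each threshold yields one (FPR, TPR) or (Recall, Precision) point. A sketch of the sweep for three thresholds (not book code):

sapply(c(0.3, 0.5, 0.7), function(th) {
  y_ <- ifelse(th <= y_score, 1, 0) # classify with threshold th
  c(threshold = th,
    FPR = sum((y == 0) & (y_ == 1)) / sum(y == 0),
    TPR = sum((y == 1) & (y_ == 1)) / sum(y == 1))
})
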

10.3 Titanic

library(caret)
library(PRROC)
library(tidyverse)

my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
                "/fromzero/master/data/titanic.csv")
my_data <- read_csv(my_url)
## Rows: 2201 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Class, Sex, Age, Survived
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(my_data)
## # A tibble: 6 × 4
##   Class Sex   Age   Survived
##   <chr> <chr> <chr> <chr>   
## 1 1st   Male  Child Yes     
## 2 1st   Male  Child Yes     
## 3 1st   Male  Child Yes     
## 4 1st   Male  Child Yes     
## 5 1st   Male  Child Yes     
## 6 1st   Male  Adult No
my_model <- train(form = Survived ~ ., data = my_data, method = "rpart2",
                  tuneGrid = data.frame(maxdepth = 2),
                  trControl = trainControl(method = "LOOCV"))
rpart.plot::rpart.plot(my_model$finalModel, extra = 1)

my_model$results
##   maxdepth Accuracy Kappa
## 1        2    0.783  0.41
y <- my_data$Survived
tmp <- my_model %>% predict(newdata = my_data, type = "prob")
y_score <- tmp$Yes

my_roc <- roc.curve(scores.class0 = y_score[y == "Yes"],
                    scores.class1 = y_score[y == "No"],
                    curve = TRUE)
my_roc$auc
## [1] 0.711
my_roc %>% plot(xlab = "False Positive Rate",
                ylab = "True Positive Rate",
                legend = FALSE)

X <- my_data %>% select(Class) # qualitative input variable
y <- my_data$Survived          # output variable

options(warn = -1) # suppress warnings from here on.
my_model1 <- train(x = X, y = y, method = "rpart2",
                   tuneGrid = data.frame(maxdepth = 2),
                   trControl = trainControl(method = "LOOCV"))
options(warn = 0)  # show warnings again from here on.

rpart.plot::rpart.plot(my_model1$finalModel, extra = 1)

my_model1$results
##   maxdepth Accuracy Kappa
## 1        2    0.714 0.237
my_enc <- my_data %>% dummyVars(formula = Survived ~ Class)
my_data2 <- my_enc %>%
  predict(my_data) %>%
  as.data.frame %>%
  mutate(Survived = my_data$Survived)

my_model2 <- train(form = Survived ~ ., data = my_data2, method = "rpart2",
                   tuneGrid = data.frame(maxdepth = 2),
                   trControl = trainControl(method = "LOOCV"))
rpart.plot::rpart.plot(my_model2$finalModel, extra = 1)

my_model2$results
##   maxdepth Accuracy Kappa
## 1        2    0.714 0.237
my_model3 <- train(form = Survived ~ Class, data = my_data, method = "rpart2",
                   tuneGrid = data.frame(maxdepth = 2),
                   trControl = trainControl(method = "LOOCV"))
rpart.plot::rpart.plot(my_model3$finalModel, extra = 1)

my_model3$results
##   maxdepth Accuracy Kappa
## 1        2    0.692 0.267

10.4 Logistic Regression

curve(1 / (1 + exp(-x)), -6, 6)

library(caret)
library(PRROC)
library(tidyverse)

my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
                "/fromzero/master/data/titanic.csv")
my_data <- read_csv(my_url)
## Rows: 2201 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Class, Sex, Age, Survived
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
my_model <- train(form = Survived ~ ., data = my_data, method = "glm",
                  trControl = trainControl(method = "LOOCV"))
coef(my_model$finalModel) %>%
  as.data.frame
##                  .
## (Intercept)  2.044
## Class2nd    -1.018
## Class3rd    -1.778
## ClassCrew   -0.858
## SexMale     -2.420
## AgeChild     1.062
my_model$results
##   parameter Accuracy Kappa
## 1      none    0.778 0.445
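
The coefficients are on the log-odds scale; the logistic function turns the linear predictor into a survival probability. For the baseline case (1st class, female, adult) only the intercept remains (a sketch, not book code):

b <- coef(my_model$finalModel)
1 / (1 + exp(-b["(Intercept)"])) # about 0.89
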

11 Deep Learning and AutoML

# Environment setup for Google Colaboratory
if (file.exists("/content")) {
  options(Ncpus = parallel::detectCores())
  installed_packages <- rownames(installed.packages())
  packages_to_install <- c("h2o", "keras")
  install.packages(setdiff(packages_to_install, installed_packages))
}

11.1 Regression with Keras

library(keras)
## 
## Attaching package: 'keras'
## The following object is masked from 'package:future':
## 
##     %<-%
library(tidyverse)
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
                "/fromzero/master/data/wine.csv")
tmp <- read_csv(my_url)
## Rows: 27 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (5): LPRICE2, WRAIN, DEGREES, HRAIN, TIME_SV
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
my_data <- tmp[sample(nrow(tmp)), ]
X <- my_data %>%
  select(-LPRICE2) %>% scale
y <- my_data$LPRICE2
curve(activation_relu(x), -3, 3)
## Loaded Tensorflow version 2.7.0

my_model <- keras_model_sequential() %>%
  layer_dense(units = 3, activation = "relu", input_shape = c(4)) %>%
  layer_dense(units = 1)

summary(my_model) # overview of the network
## Model: "sequential"
## ________________________________________________________________________________
##  Layer (type)                       Output Shape                    Param #     
## ================================================================================
##  dense_1 (Dense)                    (None, 3)                       15          
##                                                                                 
##  dense (Dense)                      (None, 1)                       4           
##                                                                                 
## ================================================================================
## Total params: 19
## Trainable params: 19
## Non-trainable params: 0
## ________________________________________________________________________________
# omitted (see the Python results)
my_model %>% compile(
  loss = "mse",
  optimizer = "rmsprop")
my_cb <- callback_early_stopping(
  patience = 20,
  restore_best_weights = TRUE)
my_history <- my_model %>% fit(
    x = X,
    y = y,
    validation_split = 0.25,
    batch_size = 10,
    epochs = 500,
    callbacks = list(my_cb),
    verbose = 0)
plot(my_history)
## `geom_smooth()` using formula 'y ~ x'

my_history
## 
## Final epoch (plot to see history):
##     loss: 0.1457
## val_loss: 0.3786
y_ <- my_model %>% predict(X)
mean((y_ - y)^2)**0.5
## [1] 0.453

11.2 Classification with Keras

library(keras)
library(tidyverse)
my_data <- iris[sample(nrow(iris)), ]
X <- my_data %>%
  select(-Species) %>% scale
y <- as.integer(my_data$Species) - 1
my_model <- keras_model_sequential() %>%
  layer_dense(units = 3, activation = "relu", input_shape = c(4)) %>%
  layer_dense(units = 3, activation = "softmax")
my_model %>% compile(loss = "sparse_categorical_crossentropy",
                     optimizer = "rmsprop",
                     metrics = c("accuracy"))
my_cb <- callback_early_stopping(
  patience = 20,
  restore_best_weights = TRUE)

my_history <- my_model %>%
  fit(x = X,
    y = y,
    validation_split = 0.25,
    batch_size = 10,
    epochs = 500,
    callbacks = list(my_cb),
    verbose = 0)

plot(my_history)
## `geom_smooth()` using formula 'y ~ x'

my_history
## 
## Final epoch (plot to see history):
##         loss: 0.1077
##     accuracy: 0.9732
##     val_loss: 0.1418
## val_accuracy: 0.9474
tmp <- my_model %>% predict(X)
y_ <- apply(tmp, 1, which.max) - 1
mean(y_ == y)
## [1] 0.967
-mean(log(c(0.8, 0.7, 0.3, 0.8)))
## [1] 0.502
-mean(log(c(0.7, 0.6, 0.2, 0.7)))
## [1] 0.708
y <- c(2, 1, 0, 1)
y_1 <- list(c(0.1, 0.1, 0.8),
            c(0.1, 0.7, 0.2),
            c(0.3, 0.4, 0.3),
            c(0.1, 0.8, 0.1))
y_2 <- list(c(0.1, 0.2, 0.7),
            c(0.2, 0.6, 0.2),
            c(0.2, 0.5, 0.3),
            c(0.2, 0.7, 0.1))
c(mean(as.array(loss_sparse_categorical_crossentropy(y_true = y, y_pred = y_1))),
  mean(as.array(loss_sparse_categorical_crossentropy(y_true = y, y_pred = y_2))))
## [1] 0.502 0.708
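
loss_sparse_categorical_crossentropy picks, for each example, the predicted probability of the true class and averages the negative logs; the two hand computations above do exactly that. A sketch (not book code) making the selection explicit:

p1 <- mapply(function(p, k) p[k + 1], y_1, y) # probability assigned to each true class
-mean(log(p1)) # 0.502, as above
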

11.3 MNIST: Classifying Handwritten Digits

library(keras)
library(tidyverse)
keras::`%<-%`(c(c(x_train, y_train), c(x_test, y_test)), dataset_mnist())
dim(x_train)
## [1] 60000    28    28
x_train[5, , ]
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
##  [1,]    0    0    0    0    0    0    0    0    0     0     0     0     0
##  [2,]    0    0    0    0    0    0    0    0    0     0     0     0     0
##  [3,]    0    0    0    0    0    0    0    0    0     0     0     0     0
##  [4,]    0    0    0    0    0    0    0    0    0     0     0     0     0
##  [5,]    0    0    0    0    0    0    0    0    0     0     0     0     0
##  [6,]    0    0    0    0    0    0    0    0    0     0     0     0     0
##  [7,]    0    0    0    0    0    0    0    0    0     0     0     0     0
##  [8,]    0    0    0    0    0    0    0    0    0     0     0     0    55
##  [9,]    0    0    0    0    0    0    0    0    0     0     0    87   232
## [10,]    0    0    0    0    0    0    0    0    0     4    57   242   252
## [11,]    0    0    0    0    0    0    0    0    0    96   252   252   183
## [12,]    0    0    0    0    0    0    0    0  132   253   252   146    14
## [13,]    0    0    0    0    0    0    0  126  253   247   176     9     0
## [14,]    0    0    0    0    0    0   16  232  252   176     0     0     0
## [15,]    0    0    0    0    0    0   22  252  252    30    22   119   197
## [16,]    0    0    0    0    0    0   16  231  252   253   252   252   252
## [17,]    0    0    0    0    0    0    0   55  235   253   217   138    42
## [18,]    0    0    0    0    0    0    0    0    0     0     0     0     0
## [19,]    0    0    0    0    0    0    0    0    0     0     0     0     0
## [20,]    0    0    0    0    0    0    0    0    0     0     0     0     0
## [21,]    0    0    0    0    0    0    0    0    0     0     0     0     0
## [22,]    0    0    0    0    0    0    0    0    0     0     0     0     0
## [23,]    0    0    0    0    0    0    0    0    0     0     0     0     0
## [24,]    0    0    0    0    0    0    0    0    0     0     0     0     0
## [25,]    0    0    0    0    0    0    0    0    0     0     0     0     0
## [26,]    0    0    0    0    0    0    0    0    0     0     0     0     0
## [27,]    0    0    0    0    0    0    0    0    0     0     0     0     0
## [28,]    0    0    0    0    0    0    0    0    0     0     0     0     0
##       [,14] [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24] [,25]
##  [1,]     0     0     0     0     0     0     0     0     0     0     0     0
##  [2,]     0     0     0     0     0     0     0     0     0     0     0     0
##  [3,]     0     0     0     0     0     0     0     0     0     0     0     0
##  [4,]     0     0     0     0     0     0     0     0     0     0     0     0
##  [5,]     0     0     0     0     0     0     0     0     0     0     0     0
##  [6,]     0     0     0     0     0     0     0     0     0     0     0     0
##  [7,]     0     0     0     0     0     0     0     0     0     0     0     0
##  [8,]   148   210   253   253   113    87   148    55     0     0     0     0
##  [9,]   252   253   189   210   252   252   253   168     0     0     0     0
## [10,]   190    65     5    12   182   252   253   116     0     0     0     0
## [11,]    14     0     0    92   252   252   225    21     0     0     0     0
## [12,]     0     0     0   215   252   252    79     0     0     0     0     0
## [13,]     0     8    78   245   253   129     0     0     0     0     0     0
## [14,]    36   201   252   252   169    11     0     0     0     0     0     0
## [15,]   241   253   252   251    77     0     0     0     0     0     0     0
## [16,]   226   227   252   231     0     0     0     0     0     0     0     0
## [17,]    24   192   252   143     0     0     0     0     0     0     0     0
## [18,]    62   255   253   109     0     0     0     0     0     0     0     0
## [19,]    71   253   252    21     0     0     0     0     0     0     0     0
## [20,]     0   253   252    21     0     0     0     0     0     0     0     0
## [21,]    71   253   252    21     0     0     0     0     0     0     0     0
## [22,]   106   253   252    21     0     0     0     0     0     0     0     0
## [23,]    45   255   253    21     0     0     0     0     0     0     0     0
## [24,]     0   218   252    56     0     0     0     0     0     0     0     0
## [25,]     0    96   252   189    42     0     0     0     0     0     0     0
## [26,]     0    14   184   252   170    11     0     0     0     0     0     0
## [27,]     0     0    14   147   252    42     0     0     0     0     0     0
## [28,]     0     0     0     0     0     0     0     0     0     0     0     0
##       [,26] [,27] [,28]
##  [1,]     0     0     0
##  [2,]     0     0     0
##  [3,]     0     0     0
##  [4,]     0     0     0
##  [5,]     0     0     0
##  [6,]     0     0     0
##  [7,]     0     0     0
##  [8,]     0     0     0
##  [9,]     0     0     0
## [10,]     0     0     0
## [11,]     0     0     0
## [12,]     0     0     0
## [13,]     0     0     0
## [14,]     0     0     0
## [15,]     0     0     0
## [16,]     0     0     0
## [17,]     0     0     0
## [18,]     0     0     0
## [19,]     0     0     0
## [20,]     0     0     0
## [21,]     0     0     0
## [22,]     0     0     0
## [23,]     0     0     0
## [24,]     0     0     0
## [25,]     0     0     0
## [26,]     0     0     0
## [27,]     0     0     0
## [28,]     0     0     0
plot(as.raster(x = x_train[5, , ],
               max = max(x_train)))

head(y_train)
## [1] 5 0 4 1 9 2
c(min(x_train), max(x_train))
## [1]   0 255
x_train <- x_train / 255
x_test  <- x_test  / 255
my_index <- sample(1:60000, 6000)
x_train <- x_train[my_index, , ]
y_train <- y_train[my_index]
my_model <- keras_model_sequential() %>%
  layer_flatten(input_shape = c(28, 28)) %>%
  layer_dense(units = 256, activation = "relu") %>%
  layer_dense(units = 10, activation = "softmax")
summary(my_model)
## Model: "sequential_2"
## ________________________________________________________________________________
##  Layer (type)                       Output Shape                    Param #     
## ================================================================================
##  flatten (Flatten)                  (None, 784)                     0           
##                                                                                 
##  dense_5 (Dense)                    (None, 256)                     200960      
##                                                                                 
##  dense_4 (Dense)                    (None, 10)                      2570        
##                                                                                 
## ================================================================================
## Total params: 203,530
## Trainable params: 203,530
## Non-trainable params: 0
## ________________________________________________________________________________
# omitted (see the Python results)

my_model %>% compile(loss = "sparse_categorical_crossentropy",
                     optimizer = "rmsprop",
                     metrics = c("accuracy"))

my_cb <- callback_early_stopping(patience = 5,
                                 restore_best_weights = TRUE)
my_history <- my_model %>%
  fit(x = x_train,
      y = y_train,
      validation_split = 0.2,
      batch_size = 128,
      epochs = 20,
      callbacks = list(my_cb),
      verbose = 0)

plot(my_history)
## `geom_smooth()` using formula 'y ~ x'

tmp <- my_model %>% predict(x_test)
y_ <- apply(tmp, 1, which.max) - 1
table(y_, y_test)
##    y_test
## y_     0    1    2    3    4    5    6    7    8    9
##   0  964    0   12    4    2    9   11    3   12    9
##   1    0 1124    5    2    1    2    3   10    9    9
##   2    1    2  958   17    1    2    4   21    5    2
##   3    1    2   14  938    0   25    0    8   24   11
##   4    0    0   12    1  944    8   11    9   10   39
##   5    3    1    1   14    0  807   10    1   21    4
##   6    9    3    6    1    8   13  916    0    6    1
##   7    1    1   10   11    2    1    0  946    4    5
##   8    1    2   13   12    3   14    3    3  866    4
##   9    0    0    1   10   21   11    0   27   17  925
mean(y_ == y_test)
## [1] 0.939
my_model %>%
  evaluate(x = x_test, y = y_test)
##     loss accuracy 
##    0.208    0.939
x_train2d <- x_train %>% array_reshape(c(-1, 28, 28, 1))
x_test2d  <- x_test  %>% array_reshape(c(-1, 28, 28, 1))
my_model <- keras_model_sequential() %>%
  layer_conv_2d(filters = 32, kernel_size = 3,  # convolutional layer
                activation = "relu",
                input_shape = c(28, 28, 1)) %>%
  layer_max_pooling_2d(pool_size = 2) %>%       # pooling layer
  layer_flatten() %>%
  layer_dense(units = 128, activation = "relu") %>%
  layer_dense(units = 10, activation = "softmax")

summary(my_model)
## Model: "sequential_3"
## ________________________________________________________________________________
##  Layer (type)                       Output Shape                    Param #     
## ================================================================================
##  conv2d (Conv2D)                    (None, 26, 26, 32)              320         
##                                                                                 
##  max_pooling2d (MaxPooling2D)       (None, 13, 13, 32)              0           
##                                                                                 
##  flatten_1 (Flatten)                (None, 5408)                    0           
##                                                                                 
##  dense_7 (Dense)                    (None, 128)                     692352      
##                                                                                 
##  dense_6 (Dense)                    (None, 10)                      1290        
##                                                                                 
## ================================================================================
## Total params: 693,962
## Trainable params: 693,962
## Non-trainable params: 0
## ________________________________________________________________________________
# omitted (see the Python results)

my_model %>% compile(loss = "sparse_categorical_crossentropy",
                     optimizer = "rmsprop",
                     metrics = c("accuracy"))

my_cb <- callback_early_stopping(patience = 5,
                                 restore_best_weights = TRUE)
my_history <- my_model %>%
  fit(x = x_train2d,
      y = y_train,
      validation_split = 0.2,
      batch_size = 128,
      epochs = 20,
      callbacks = list(my_cb),
      verbose = 0)

plot(my_history)
## `geom_smooth()` using formula 'y ~ x'

my_model %>%
  evaluate(x = x_test2d, y = y_test)
##     loss accuracy 
##    0.122    0.965
my_model <- keras_model_sequential() %>%
  layer_conv_2d(filters = 20, kernel_size = 5, activation = "relu",
                input_shape = c(28, 28, 1)) %>%
  layer_max_pooling_2d(pool_size = 2, strides = 2) %>%
  layer_conv_2d(filters = 50, kernel_size = 5, activation = "relu") %>%
  layer_max_pooling_2d(pool_size = 2, strides = 2) %>%
  layer_dropout(rate = 0.25) %>%
  layer_flatten() %>%
  layer_dense(units = 500, activation = "relu") %>%
  layer_dropout(rate = 0.5) %>%
  layer_dense(units = 10, activation = "softmax")

my_model %>% compile(
  loss = "sparse_categorical_crossentropy",
  optimizer = "rmsprop",
  metrics = c("accuracy"))

my_cb <- callback_early_stopping(patience = 5,
                                 restore_best_weights = TRUE)
my_history <- my_model %>%
  fit(x = x_train2d,
      y = y_train,
      validation_split = 0.2,
      batch_size = 128,
      epochs = 20,
      callbacks = list(my_cb),
      verbose = 0)

plot(my_history)
## `geom_smooth()` using formula 'y ~ x'

my_model %>%
  evaluate(x = x_test2d, y = y_test)
##     loss accuracy 
##   0.0633   0.9825
y_prob <- my_model %>% predict(x_test2d) # probability of each class

my_result <- data.frame(
  y_prob = apply(y_prob, 1, max),           # largest probability
  y_     = apply(y_prob, 1, which.max) - 1, # predicted class
  y      = y_test,                          # ground truth
  id     = seq_len(length(y_test))) %>%     # index
  filter(y_ != y) %>%                       # keep only the misclassified cases
  arrange(desc(y_prob))                     # sort by probability, descending
head(my_result)
##   y_prob y_ y   id
## 1  1.000  5 3  450
## 2  1.000  0 9 6506
## 3  1.000  4 6 3521
## 4  1.000  7 3 1682
## 5  1.000  6 5 9730
## 6  0.999  0 5 9771
tmp <- my_result[1:5, ]$id
my_labels <- sprintf("%s (%s)",
  my_result[1:5, ]$y, tmp)
my_fig <- expand.grid(
  label = my_labels,
  y = 28:1,
  x = 1:28)
my_fig$z <- as.vector(
  x_test[tmp, , ])

my_fig %>% ggplot(
  aes(x = x, y = y, fill = z)) +
  geom_raster() +
  coord_fixed() +
  theme_void() +
  theme(legend.position = "none") +
  facet_grid(. ~ label)

11.4 AutoML

library(h2o)
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## The following objects are masked from 'package:base':
## 
##     &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
library(keras)
library(tidyverse)

h2o.init()
## 
## H2O is not running yet, starting it now...
## 
## Note:  In case of errors look at the following log files:
##     /tmp/RtmpUk6Jta/file414622b8b2/h2o_rstudio_started_from_r.out
##     /tmp/RtmpUk6Jta/file414ef06fbf/h2o_rstudio_started_from_r.err
## 
## 
## Starting H2O JVM and connecting: .. Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         1 seconds 332 milliseconds 
##     H2O cluster timezone:       Etc/UTC 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.34.0.3 
##     H2O cluster version age:    2 years, 5 months and 16 days !!! 
##     H2O cluster name:           H2O_started_from_R_rstudio_rir268 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   15.69 GB 
##     H2O cluster total cores:    32 
##     H2O cluster allowed cores:  32 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Amazon S3, XGBoost, Algos, AutoML, Core V3, TargetEncoder, Core V4 
##     R Version:                  R version 4.1.2 (2021-11-01)
## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is too old (2 years, 5 months and 16 days)!
## Please download and install the latest version from http://h2o.ai/download/
h2o.no_progress()
# h2o.shutdown(prompt = FALSE) # shut down
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
                "/fromzero/master/data/wine.csv")
my_data <- read_csv(my_url)
## Rows: 27 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (5): LPRICE2, WRAIN, DEGREES, HRAIN, TIME_SV
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
my_frame <- as.h2o(my_data) # convert an ordinary data frame to an H2OFrame.
# alternatively
my_frame <- h2o.importFile(my_url, header = TRUE) # read the data directly.
my_frame
##   LPRICE2 WRAIN DEGREES HRAIN TIME_SV
## 1  -0.999   600    17.1   160      31
## 2  -0.454   690    16.7    80      30
## 3  -0.808   502    17.1   130      28
## 4  -1.509   420    16.1   110      26
## 5  -1.717   582    16.4   187      25
## 6  -0.418   485    17.5   187      24
## 
## [27 rows x 5 columns]
# convert back to an ordinary data frame.
my_frame %>% as.data.frame %>% head
##   LPRICE2 WRAIN DEGREES HRAIN TIME_SV
## 1  -0.999   600    17.1   160      31
## 2  -0.454   690    16.7    80      30
## 3  -0.808   502    17.1   130      28
## 4  -1.509   420    16.1   110      26
## 5  -1.717   582    16.4   187      25
## 6  -0.418   485    17.5   187      24
# output omitted (it looks the same)
my_model <- h2o.automl(
    y = "LPRICE2",
    training_frame = my_frame,
    max_runtime_secs = 60)
## 
## 15:30:42.217: Skipping training of model GBM_1_AutoML_1_20240323_153039 due to exception: water.exceptions.H2OModelBuilderIllegalArgumentException: Illegal argument(s) for GBM model: GBM_1_AutoML_1_20240323_153039.  Details: ERRR on field: _min_rows: The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 27.0.
min(my_model@leaderboard$rmse)
## [1] 0.289
tmp <- my_model %>%
  predict(my_frame) %>%
  as.data.frame
y_ <- tmp$predict
y  <- my_data$LPRICE2

plot(y, y_)

keras::`%<-%`(c(c(x_train, y_train), c(x_test, y_test)), dataset_mnist())
my_index <- sample(1:60000, 6000)
x_train <- x_train[my_index, , ]
y_train <- y_train[my_index]
tmp <- x_train %>%
  array_reshape(c(-1, 28 * 28)) %>%
  as.data.frame
tmp$y <- as.factor(y_train)
my_train <- as.h2o(tmp)

tmp <- x_test %>%
  array_reshape(c(-1, 28 * 28)) %>%
  as.data.frame
my_test <- as.h2o(tmp)
my_model <- h2o.automl(
    y = "y",
    training_frame = my_train,
    max_runtime_secs = 120)
min(my_model@leaderboard$
    mean_per_class_error)
## [1] 0.076
tmp <- my_model %>%
  predict(my_test) %>% as.data.frame
y_ <- tmp$predict

mean(y_ == y_test)
## [1] 0.932

12 Time Series Forecasting

# Environment setup for Google Colaboratory
if (file.exists("/content")) {
  options(Ncpus = parallel::detectCores())
  installed_packages <- rownames(installed.packages())
  packages_to_install <- c("caret", "fable", "feasts", "prophet", "tsibble", "urca")
  packages_to_upgrade <- c("ggplot2")
  install.packages(union(setdiff(packages_to_install, installed_packages), packages_to_upgrade))
}

12.1 Date-Times and Sequences of Date-Times

as.POSIXct("2021-01-01")
## [1] "2021-01-01 UTC"
library(tsibble)
## 
## Attaching package: 'tsibble'
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, union
seq(from = 2021, to = 2023, by = 1)
## [1] 2021 2022 2023
seq(from = yearmonth("202101"), to = yearmonth("202103"), by = 2)
## Warning: `yearmonth()` may yield unexpected results.
## ℹ Please use arg `format` to supply formats.
## `yearmonth()` may yield unexpected results.
## ℹ Please use arg `format` to supply formats.
## <yearmonth[2]>
## [1] "2021 Jan" "2021 Mar"
seq(from = as.POSIXct("2021-01-01"), to = as.POSIXct("2021-01-03"), by = "1 day")
## [1] "2021-01-01 UTC" "2021-01-02 UTC" "2021-01-03 UTC"
seq(from = as.POSIXct("2021-01-01 00:00:00"),
    to   = as.POSIXct("2021-01-01 03:00:00"), by = "2 hour")
## [1] "2021-01-01 00:00:00 UTC" "2021-01-01 02:00:00 UTC"

12.2 Forecasting Time Series Data

my_data <- as.vector(AirPassengers)
n <- length(my_data) # number of observations (144)
k <- 108             # number of training observations
library(tidyverse)
library(tsibble)

my_ds <- seq(
  from = yearmonth("1949/01"),
  to   = yearmonth("1960/12"),
  by   = 1)
my_label <- rep(
  c("train", "test"),
  c(k, n - k))
my_df <- tsibble(
  ds    = my_ds,
  x     = 0:(n - 1),
  y     = my_data,
  label = my_label,
  index = ds) # specify the date-time column

head(my_df)
## # A tsibble: 6 x 4 [1M]
##         ds     x     y label
##      <mth> <int> <dbl> <chr>
## 1 1949 Jan     0   112 train
## 2 1949 Feb     1   118 train
## 3 1949 Mar     2   132 train
## 4 1949 Apr     3   129 train
## 5 1949 May     4   121 train
## 6 1949 Jun     5   135 train
my_train <- my_df[  1:k , ]
my_test  <- my_df[-(1:k), ]
y <- my_test$y
my_plot <- my_df %>%
  ggplot(aes(x = ds,
             y = y,
             color = label)) +
  geom_line()
my_plot

library(caret)
my_lm_model <- train(form = y ~ x, data = my_train, method = "lm")
y_ <- my_lm_model %>% predict(my_test)
caret::RMSE(y, y_) # RMSE (test)
## [1] 70.6
y_ <- my_lm_model %>% predict(my_df)
tmp <- my_df %>%
  mutate(y = y_, label = "model")
my_plot + geom_line(data = tmp)

library(fable)
## Loading required package: fabletools
## 
## Attaching package: 'fabletools'
## The following objects are masked from 'package:caret':
## 
##     MAE, RMSE
my_arima_model <- my_train %>% model(ARIMA(y))
my_arima_model
## # A mable: 1 x 1
##                  `ARIMA(y)`
##                     <model>
## 1 <ARIMA(1,1,0)(0,1,0)[12]>
tmp <- my_arima_model %>% forecast(h = "3 years")
head(tmp)
## # A fable: 6 x 4 [1M]
## # Key:     .model [1]
##   .model         ds           y .mean
##   <chr>       <mth>      <dist> <dbl>
## 1 ARIMA(y) 1958 Jan  N(346, 94)  346.
## 2 ARIMA(y) 1958 Feb N(332, 148)  332.
## 3 ARIMA(y) 1958 Mar N(387, 210)  387.
## 4 ARIMA(y) 1958 Apr N(379, 271)  379.
## 5 ARIMA(y) 1958 May N(386, 332)  386.
## 6 ARIMA(y) 1958 Jun N(453, 393)  453.
y_ <- tmp$.mean
caret::RMSE(y_, y)
## [1] 22.1
# if only the forecast is needed:
#tmp %>% autoplot

tmp %>% autoplot +
  geom_line(data = my_df,
            aes(x = ds,
                y = y,
                color = label))

library(prophet)
## Loading required package: Rcpp
## Loading required package: rlang
## 
## Attaching package: 'rlang'
## The following object is masked from 'package:xml2':
## 
##     as_list
## The following objects are masked from 'package:jsonlite':
## 
##     flatten, unbox
## The following objects are masked from 'package:testthat':
## 
##     is_false, is_null, is_true
## The following objects are masked from 'package:purrr':
## 
##     %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int,
##     flatten_lgl, flatten_raw, invoke, splice
my_prophet_model <- my_train %>%
  prophet(seasonality.mode = "multiplicative")
## Disabling weekly seasonality. Run prophet with weekly.seasonality=TRUE to override this.
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
tmp <- my_prophet_model %>% predict(my_test)
head(tmp[, c("ds", "yhat", "yhat_lower", "yhat_upper")])
## # A tibble: 6 × 4
##   ds                   yhat yhat_lower yhat_upper
##   <dttm>              <dbl>      <dbl>      <dbl>
## 1 1958-01-01 00:00:00  359.       350.       369.
## 2 1958-02-01 00:00:00  351.       342.       359.
## 3 1958-03-01 00:00:00  407.       398.       416.
## 4 1958-04-01 00:00:00  398.       389.       407.
## 5 1958-05-01 00:00:00  402.       393.       411.
## 6 1958-06-01 00:00:00  460.       451.       469.
y_ <- tmp$yhat
caret::RMSE(y_, y)
## [1] 33.7
# my_prophet_model %>% plot(tmp) # if only the forecast is needed

my_prophet_model %>% plot(tmp) +
  geom_line(data = my_train, aes(x = as.POSIXct(ds))) +
  geom_line(data = my_test,  aes(x = as.POSIXct(ds)), color = "red")

13 Unsupervised Learning

# Environment setup for Google Colaboratory
if (file.exists("/content")) {
  options(Ncpus = parallel::detectCores())
  installed_packages <- rownames(installed.packages())
  packages_to_install <- c("factoextra", "ggbiplot", "igraph")
  install.packages(setdiff(packages_to_install, installed_packages))
}

13.1 Principal Component Analysis

library(tidyverse)

my_data <- data.frame(
  language  = c(  0,  20,  20,  25,  22,  17),
  english   = c(  0,  20,  40,  20,  24,  18),
  math      = c(100,  20,   5,  30,  17,  25),
  science   = c(  0,  20,   5,  25,  16,  23),
  society   = c(  0,  20,  30,   0,  21,  17),
  row.names = c("A", "B", "C", "D", "E", "F"))
my_result <- my_data %>% prcomp # run the principal component analysis
my_result$x # principal component scores
##      PC1    PC2    PC3     PC4       PC5
## A -74.91  -7.01 -0.361  0.0763  2.86e-15
## B  13.82   2.75  5.273  0.8418 -2.25e-15
## C  33.71 -18.42 -4.876 -0.8820  1.19e-15
## D   1.73  17.88 -7.925 -0.1493  1.91e-16
## E  17.84  -1.06  1.653  2.3125 -3.20e-15
## F   7.81   5.86  6.237 -2.1994 -3.09e-16
my_result %>% ggbiplot::ggbiplot(
  labels = row.names(my_data),
  scale = 0)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following object is masked from 'package:purrr':
## 
##     compact
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor

my_result$rotation %>% t
##     language english     math science society
## PC1    0.207   0.304 -0.88726   0.130   0.245
## PC2    0.279  -0.325 -0.09764   0.703  -0.559
## PC3   -0.306  -0.616 -0.05635   0.338   0.640
## PC4    0.765  -0.472 -0.00765  -0.418   0.132
## PC5    0.447   0.447  0.44721   0.447   0.447
summary(my_result)
## Importance of components:
##                           PC1     PC2   PC3     PC4      PC5
## Standard deviation     38.264 12.2557 5.588 1.52970 1.23e-15
## Proportion of Variance  0.888  0.0911 0.019 0.00142 0.00e+00
## Cumulative Proportion   0.888  0.9796 0.999 1.00000 1.00e+00
my_result <- prcomp(
  x = my_data,
  scale = TRUE)       # standardization
# alternatively
my_result <- prcomp(
  x = scale(my_data)) # standardized data

my_result$x # principal component scores
##      PC1    PC2     PC3     PC4       PC5
## A -3.674 -0.569 -0.0584  0.0106  1.67e-16
## B  0.653  0.247  0.4348  0.0930 -1.47e-16
## C  1.568 -1.743 -0.3806 -0.0977 -2.38e-16
## D  0.251  1.640 -0.6754 -0.0320  4.44e-16
## E  0.886 -0.110  0.0925  0.2304  6.74e-18
## F  0.316  0.535  0.5870 -0.2043 -2.44e-16
Z  <- my_data %>% scale(scale = FALSE) %>% as.matrix # without standardization
#Z <- my_data %>% scale(scale = TRUE)  %>% as.matrix # with standardization

n <- nrow(my_data)
S <- var(Z)                          # variance-covariance matrix
#S <- t(Z) %*% Z / (n - 1)           # (same result)
tmp <- eigen(S)                      # eigenvalues and eigenvectors
Z %*% tmp$vectors                    # principal component scores (output omitted)
##     [,1]   [,2]   [,3]    [,4]      [,5]
## A  74.91  -7.01  0.361  0.0763  4.63e-15
## B -13.82   2.75 -5.273  0.8418 -8.02e-15
## C -33.71 -18.42  4.876 -0.8820  6.96e-15
## D  -1.73  17.88  7.925 -0.1493  2.86e-15
## E -17.84  -1.06 -1.653  2.3125 -1.01e-14
## F  -7.81   5.86 -6.237 -2.1994  1.91e-15
cumsum(tmp$values) / sum(tmp$values) # cumulative proportion of variance
## [1] 0.888 0.980 0.999 1.000 1.000
udv <- svd(Z) # singular value decomposition
U <- udv$u
d <- udv$d
V <- udv$v
W <- diag(d)

c(all.equal(Z, U %*% W %*% t(V), check.attributes = FALSE), # check 1
  all.equal(t(U) %*% U, diag(dim(U)[2])),                   # check 2
  all.equal(t(V) %*% V, diag(dim(V)[2])))                   # check 3
## [1] TRUE TRUE TRUE
U %*% W            # principal component scores (output omitted)
##        [,1]   [,2]   [,3]    [,4]      [,5]
## [1,] -74.91  -7.01 -0.361  0.0763 -6.85e-16
## [2,]  13.82   2.75  5.273  0.8418  8.96e-16
## [3,]  33.71 -18.42 -4.876 -0.8820 -5.18e-16
## [4,]   1.73  17.88 -7.925 -0.1493 -5.83e-16
## [5,]  17.84  -1.06  1.653  2.3125 -1.91e-15
## [6,]   7.81   5.86  6.237 -2.1994 -1.44e-15
e <- d^2 / (n - 1) # eigenvalues of the variance-covariance matrix
cumsum(e) / sum(e) # cumulative proportion of variance
## [1] 0.888 0.980 0.999 1.000 1.000
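
The two routes agree because the right singular vectors V are, up to sign, the eigenvectors of S: from Z = UWV', we get S = Z'Z / (n - 1) = V W^2 V' / (n - 1). A sketch of the check (not book code), reusing tmp from the eigendecomposition above:

all.equal(abs(V), abs(tmp$vectors), check.attributes = FALSE) # should be TRUE
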

13.2 Cluster Analysis

library(tidyverse)

my_data <- data.frame(
  x         = c(  0, -16,  10,  10),
  y         = c(  0,   0,  10, -15),
  row.names = c("A", "B", "C", "D"))

my_result <- my_data %>%
  dist("euclidian") %>% # distだけでも可
  hclust("complete")    # hclustだけでも可
my_result %>% factoextra::fviz_dend(
  k = 3, # number of clusters
  rect = TRUE, rect_fill = TRUE)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

my_result %>% factoextra::fviz_dend(
  k = 3,
  rect = TRUE, rect_fill = TRUE,
  type = "phylogenic")

my_result %>% cutree(3)
## A B C D 
## 1 2 1 3
library(tidyverse)

my_data <- data.frame(
  language  = c(  0,  20,  20,  25,  22,  17),
  english   = c(  0,  20,  40,  20,  24,  18),
  math      = c(100,  20,   5,  30,  17,  25),
  science   = c(  0,  20,   5,  25,  16,  23),
  society   = c(  0,  20,  30,   0,  21,  17),
  row.names = c("A", "B", "C", "D", "E", "F"))

try( # avoid an error that occurs in RMarkdown.
  my_data %>% scale %>%                        # standardize each column
    gplots::heatmap.2(cexRow = 1, cexCol = 1), # draw with the given label sizes.
  silent = TRUE)

library(tidyverse)

my_data <- data.frame(
  x         = c(  0, -16,  10,  10),
  y         = c(  0,   0,  10, -15),
  row.names = c("A", "B", "C", "D"))

my_result <- my_data %>% kmeans(3)
my_result$cluster
## A B C D 
## 3 2 3 1
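
kmeans starts from random initial centers, so the cluster numbers (and, with unlucky starts, the clusters themselves) can change between runs. Fixing the random seed makes the result reproducible (a sketch, not book code; the seed value is arbitrary):

set.seed(0) # fix the RNG for a reproducible clustering
(my_data %>% kmeans(3))$cluster
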
library(tidyverse)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
my_data <- iris[, -5]

f <- 2:5 %>% map(function(k) {
  my_data %>% kmeans(k) %>%
    fviz_cluster(data = my_data, geom = "point") +
    ggtitle(sprintf("k = %s", k))
})
gridExtra::grid.arrange(f[[1]], f[[2]], f[[3]], f[[4]], ncol = 2)

fviz_nbclust(my_data, kmeans, method = "wss")

library(tidyverse)
my_data <- iris[, -5] %>% scale

my_result <- prcomp(my_data)$x %>% as.data.frame # principal component analysis

# for non-hierarchical cluster analysis
my_result$cluster <- (my_data %>% scale %>% kmeans(3))$cluster %>% as.factor

# for hierarchical cluster analysis
#my_result$cluster <- my_data %>% dist %>% hclust %>% cutree(3) %>% as.factor

my_result %>%
  ggplot(aes(x = PC1, y = PC2, color = cluster)) + # color encodes the cluster.
  geom_point(shape = iris$Species) +               # shape encodes the species.
  theme(legend.position = "none")