辻真吾・矢吹太朗『ゼロからはじめるデータサイエンス入門』 (Shingo Tsuji and Taro Yabuki, *Introduction to Data Science from Scratch*, Kodansha, 2021)
How to Knit in RStudio
docker run -d -e PASSWORD=password -e ROOT=TRUE -p 8787:8787 -v "$(pwd)":/home/rstudio/work taroyabuki/rstudio
install.packages(c("knitr", "markdown", "rmarkdown"))
# This is an example of R code.
1 + 1
## [1] 2
1 + 1
## [1] 2
print(1 + 2)
## [1] 3
1 + 3
## [1] 4
# Environment setup for Google Colaboratory
if (file.exists("/content")) {
options(Ncpus = parallel::detectCores())
installed_packages <- rownames(installed.packages())
packages_to_install <- c("furrr", "keras", "proxy")
install.packages(setdiff(packages_to_install, installed_packages))
}
0x10
## [1] 16
1.23e5
## [1] 123000
2 * 3
## [1] 6
10 / 3
## [1] 3.333333
10 %/% 3 # integer quotient
## [1] 3
10 %% 3 # remainder
## [1] 1
x <- 2
y <- 3
x * y
## [1] 6
keras::`%<-%`(c(x, y), c(20, 30)) # assign several variables at once
x * y
## [1] 600
x <- 1 + 1
# The result is not displayed at this point.
x # Evaluating the variable name displays it.
## [1] 2
my_s <- "abcde"
nchar(my_s)
## [1] 5
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.1.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
str_c("This is ", "a", " pen.")
## [1] "This is a pen."
substr(x = my_s, start = 2, stop = 4)
## [1] "bcd"
tmp <- "%s is %s."
sprintf(tmp, "This", "a pen")
## [1] "This is a pen."
1 <= 2
## [1] TRUE
1 < 0
## [1] FALSE
0.1 + 0.1 + 0.1 == 0.3
## [1] FALSE
all.equal(0.1 + 0.1 + 0.1, 0.3)
## [1] TRUE
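# A note not in the book: all.equal() returns a character string (not FALSE)
# when the values differ, so wrap it in isTRUE() for use in conditions.
isTRUE(all.equal(0.1 + 0.1 + 0.1, 0.3))
## [1] TRUE
isTRUE(all.equal(0.1, 0.3))
## [1] FALSE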
TRUE & FALSE # logical AND
## [1] FALSE
TRUE | FALSE # logical OR
## [1] TRUE
!TRUE # logical NOT (negation)
## [1] FALSE
ifelse(3 < 5, 0, 10)
## [1] 0
getwd()
## [1] "/home/rstudio/work"
setwd("..")
getwd()
## [1] "/home/rstudio"
sqrt(4)
## [1] 2
log(100, 10)
## [1] 2
log(100) # natural logarithm
## [1] 4.60517
# alternatively
log(100, exp(1)) # with the base written explicitly
## [1] 4.60517
log10(100) # common logarithm (base 10)
## [1] 2
log2(1024) # base-2 logarithm
## [1] 10
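# A quick check, not in the book: any base works via the change-of-base formula.
all.equal(log(1024, base = 2), log(1024) / log(2))
## [1] TRUE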
library(tidyverse)
4 %>% sqrt
## [1] 2
exp(log(5)) # the usual notation
## [1] 5
# alternatively
5 %>% log %>% exp # using the pipe
## [1] 5
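# A sketch not in the book (assumes R >= 4.1): base R's native pipe.
5 |> log() |> exp()
## [1] 5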
f <- function(a, b) {
a - b
}
f(3, 5)
## [1] -2
f <- function(a, b = 5) {
a - b
}
f(3) # same as f(3, 5)
## [1] -2
(function(a, b) { a - b })(3, 5)
## [1] -2
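# A sketch not in the book (assumes R >= 4.1): \(a, b) is shorthand for function(a, b).
(\(a, b) a - b)(3, 5)
## [1] -2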
x <- c("foo", "bar", "baz")
length(x)
## [1] 3
x[2]
## [1] "bar"
x[2] <- "BAR"
x # check the result
## [1] "foo" "BAR" "baz"
x[2] <- "bar" # restore the original value
x[-2]
## [1] "foo" "baz"
c(x, "qux")
## [1] "foo" "bar" "baz" "qux"
x <- c(x, "qux")
x # check the result
## [1] "foo" "bar" "baz" "qux"
1:5
## [1] 1 2 3 4 5
seq(from = 0, to = 10, by = 2)
## [1] 0 2 4 6 8 10
seq(from = 0, to = 1, by = 0.5)
## [1] 0.0 0.5 1.0
seq(from = 0, to = 100, length.out = 5)
## [1] 0 25 50 75 100
rep(x = 10, times = 5)
## [1] 10 10 10 10 10
tmp <- c("グー", "パー", "グー", "パー")
x <- factor(tmp, levels = c("グー", "チョキ", "パー"))
x
## [1] グー パー グー パー
## Levels: グー チョキ パー
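# Not in the book: a factor is stored as integer level codes plus a level set.
as.integer(x) # the underlying codes
## [1] 1 3 1 3
levels(x) # the defined levels (output omitted)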
x <- c(2, 3, 5, 7)
x + 10 # addition
## [1] 12 13 15 17
x * 10 # multiplication
## [1] 20 30 50 70
x <- c(2, 3)
sin(x)
## [1] 0.9092974 0.1411200
x <- c(2, 3, 5, 7)
y <- c(1, 10, 100, 1000)
x + y
## [1] 3 13 105 1007
x * y
## [1] 2 30 500 7000
sum(x * y)
## [1] 7532
x <- c(TRUE, FALSE)
y <- c(TRUE, TRUE)
x & y
## [1] TRUE FALSE
u <- c(1, 2, 3)
v <- c(1, 2, 3)
w <- c(1, 2, 4)
identical(u, v) # compare as wholes
## [1] TRUE
identical(u, w) # compare as wholes
## [1] FALSE
u == v # element-wise comparison
## [1] TRUE TRUE TRUE
u == w # element-wise comparison
## [1] TRUE TRUE FALSE
sum(u == w) # number of matching elements
## [1] 2
mean(u == w) # proportion of matching elements
## [1] 0.6666667
x <- list(1, "two")
x[[2]]
## [1] "two"
x <- list("apple" = "りんご",
"orange" = "みかん")
x[["grape"]] <- "ぶどう"
x$apple
## [1] "りんご"
# alternatively
x$"apple"
## [1] "りんご"
# alternatively
x[["apple"]]
## [1] "りんご"
# alternatively
tmp <- "apple"
x[[tmp]]
## [1] "りんご"
x <- c("foo", "bar", "baz")
y <- x
y[2] <- "BAR" # update y
y
## [1] "foo" "BAR" "baz"
x # x is unchanged.
## [1] "foo" "bar" "baz"
library(tidyverse)
my_df <- data.frame(
name = c("A", "B", "C", "D"),
english = c( 60, 90, 70, 90),
math = c( 70, 80, 90, 100),
gender = c("f", "m", "m", "f"))
my_df <- tribble(
~name, ~english, ~math, ~gender,
"A", 60, 70, "f",
"B", 90, 80, "m",
"C", 70, 90, "m",
"D", 90, 100, "f")
head(my_df)
## # A tibble: 4 × 4
## name english math gender
## <chr> <dbl> <dbl> <chr>
## 1 A 60 70 f
## 2 B 90 80 m
## 3 C 70 90 m
## 4 D 90 100 f
# output omitted
dim(my_df) # numbers of rows and columns
## [1] 4 4
nrow(my_df) # number of rows
## [1] 4
ncol(my_df) # number of columns
## [1] 4
my_df2 <- expand.grid(
X = c(1, 2, 3),
Y = c(10, 100))
my_df2
## X Y
## 1 1 10
## 2 2 10
## 3 3 10
## 4 1 100
## 5 2 100
## 6 3 100
colnames(my_df2)
## [1] "X" "Y"
colnames(my_df2) <- c("P", "Q")
my_df2
## P Q
## 1 1 10
## 2 2 10
## 3 3 10
## 4 1 100
## 5 2 100
## 6 3 100
# remainder omitted
row.names(my_df)
## [1] "1" "2" "3" "4"
row.names(my_df2) <-
c("a", "b", "c", "d", "e", "f")
my_df2
## P Q
## a 1 10
## b 2 10
## c 3 10
## d 1 100
## e 2 100
## f 3 100
# remainder omitted
my_df3 <- data.frame(
english = c( 60, 90, 70, 90),
math = c( 70, 80, 90, 100),
gender = c("f", "m", "m", "f"),
row.names = c("A", "B", "C", "D"))
my_df3
## english math gender
## A 60 70 f
## B 90 80 m
## C 70 90 m
## D 90 100 f
tmp <- data.frame(
name = "E",
english = 80,
math = 80,
gender = "m")
my_df2 <- rbind(my_df, tmp)
my_df2 <- my_df %>%
mutate(id = c(1, 2, 3, 4))
my_df3 <- my_df # copy
my_df3["id"] <- c(1, 2, 3, 4) # update
my_df3 # check the result (omitted)
## # A tibble: 4 × 5
## name english math gender id
## <chr> <dbl> <dbl> <chr> <dbl>
## 1 A 60 70 f 1
## 2 B 90 80 m 2
## 3 C 70 90 m 3
## 4 D 90 100 f 4
my_df[1, 2]
## # A tibble: 1 × 1
## english
## <dbl>
## 1 60
x <- my_df[, 2]
# alternatively
x <- my_df$english
# alternatively
x <- my_df$"english"
# alternatively
x <- my_df[["english"]]
# alternatively
tmp <- "english"
x <- my_df[[tmp]]
x # check the result (omitted)
## [1] 60 90 70 90
x <- my_df %>% select(name, math)
x <- my_df[, c(1, 3)]
x <- my_df %>%
select(-c(english, gender))
# alternatively
x <- my_df[, -c(2, 4)]
x <- my_df[c(1, 3), ]
x <- my_df[-c(2, 4), ]
x <- my_df[my_df$gender == "m", ]
# alternatively
x <- my_df %>% filter(gender == "m")
x <- my_df[my_df$english > 80 & my_df$gender == "m", ]
# alternatively
x <- my_df %>% filter(english > 80 & gender == "m")
x <- my_df[my_df$english == max(my_df$english), ]
# alternatively
x <- my_df %>% filter(english == max(my_df$english))
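# An alternative sketch, not in the book (assumes dplyr >= 1.0):
# slice_max() keeps the rows with the largest value, ties included.
x <- my_df %>% slice_max(english)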
my_df2 <- my_df # copy
my_df2[my_df$gender == "m", ]$gender <- "M"
my_df2
## # A tibble: 4 × 4
## name english math gender
## <chr> <dbl> <dbl> <chr>
## 1 A 60 70 f
## 2 B 90 80 M
## 3 C 70 90 M
## 4 D 90 100 f
x <- my_df %>% arrange(english)
x <- my_df %>% arrange(-english)
x <- c(2, 3, 5, 7, 11, 13, 17, 19, 23,
29, 31, 37)
A <- matrix(
data = x, # one-dimensional data
nrow = 3, # number of rows
byrow = TRUE) # fill by row
A
## [,1] [,2] [,3] [,4]
## [1,] 2 3 5 7
## [2,] 11 13 17 19
## [3,] 23 29 31 37
A <- my_df[, c(2, 3)] %>% as.matrix
A
## english math
## [1,] 60 70
## [2,] 90 80
## [3,] 70 90
## [4,] 90 100
as.data.frame(A)
## english math
## 1 60 70
## 2 90 80
## 3 70 90
## 4 90 100
t(A)
## [,1] [,2] [,3] [,4]
## english 60 90 70 90
## math 70 80 90 100
t(A) %*% A
## english math
## english 24700 26700
## math 26700 29400
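# Equivalent, not in the book: crossprod() computes t(A) %*% A directly.
crossprod(A)
## english math
## english 24700 26700
## math 26700 29400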
my_wider <- data.frame(
day = c(25, 26, 27),
min = c(20, 21, 15),
max = c(24, 27, 21))
my_longer <- my_wider %>%
pivot_longer(-day)
my_longer
## # A tibble: 6 × 3
## day name value
## <dbl> <chr> <dbl>
## 1 25 min 20
## 2 25 max 24
## 3 26 min 21
## 4 26 max 27
## 5 27 min 15
## 6 27 max 21
my_longer %>% pivot_wider()
## # A tibble: 3 × 3
## day min max
## <dbl> <dbl> <dbl>
## 1 25 20 24
## 2 26 21 27
## 3 27 15 21
my_longer %>%
ggplot(aes(x = day, y = value,
color = name)) +
geom_point() +
geom_line() +
ylab("temperature") + # y-axis label
scale_x_continuous(
breaks = my_longer$day) # x-axis tick marks
A <- c(3, 4, 5)
B <- c(3, 4, 29)
C <- c(9, -18, 8)
AB <- B - A
AC <- C - A
sum(AB^2)^0.5
## [1] 24
sum(AC^2)^0.5
## [1] 23
sum(abs(AB))
## [1] 24
sum(abs(AC))
## [1] 31
sum(A * B) /
sum(A * A)^0.5 / sum(B * B)^0.5
## [1] 0.8169679
sum(A * C) /
sum(A * A)^0.5 / sum(C * C)^0.5
## [1] -0.03265116
cor(A, B)
## [1] 0.8824975
cor(A, C)
## [1] -0.03266277
library(tidyverse)
my_df <- data.frame(
x = c(3, 3, 9),
y = c(4, 4, -18),
z = c(5, 29, 8),
row.names = c("A", "B", "C"))
# Euclidean distance
my_df %>% proxy::dist("Euclidean")
## A B
## B 24
## C 23 31
# Manhattan distance
my_df %>% proxy::dist("Manhattan")
## A B
## B 24
## C 31 49
# cosine similarity
my_df %>% proxy::simil("cosine")
## A B
## B 0.81696786
## C -0.03265116 0.29342441
# correlation coefficient
my_df %>% proxy::simil("correlation")
## A B
## B 0.88249750
## C -0.03266277 0.44124132
library(tidyverse)
f1 <- function(x) {
tmp <- runif(x)
mean(tmp)
}
f1(10) # sanity check
## [1] 0.5821029
replicate(n = 3, expr = f1(10))
## [1] 0.4878048 0.4695691 0.5339249
rep(x = f1(10), times = 3) # f1(10) is evaluated only once.
## [1] 0.6140743 0.6140743 0.6140743
v <- c(5, 10, 100)
v %>% map_dbl(f1)
## [1] 0.6447337 0.5604720 0.5270688
rep(x = 10, times = 3) %>% map_dbl(f1)
## [1] 0.6154754 0.4872310 0.4831569
# output omitted
f2 <- function(n) {
tmp <- runif(n)
list(x = n,
p = mean(tmp),
q = sd(tmp))
}
f2(10) # sanity check
## $x
## [1] 10
##
## $p
## [1] 0.3094924
##
## $q
## [1] 0.2527441
v <- c(5, 10, 100)
v %>% map_dfr(f2)
## # A tibble: 3 × 3
## x p q
## <dbl> <dbl> <dbl>
## 1 5 0.485 0.274
## 2 10 0.399 0.328
## 3 100 0.580 0.283
f3 <- function(x, y) {
tmp <- runif(x, min = 1,
max = y + 1) %>%
as.integer
list(x = x,
y = y,
p = mean(tmp),
q = sd(tmp))
}
f3(x = 10, y = 6) # sanity check
## $x
## [1] 10
##
## $y
## [1] 6
##
## $p
## [1] 3.5
##
## $q
## [1] 1.581139
my_df <- data.frame(
x = c(5, 10, 100, 5, 10, 100),
y = c(6, 6, 6, 12, 12, 12))
my_df %>% pmap_dfr(f3)
## # A tibble: 6 × 4
## x y p q
## <dbl> <dbl> <dbl> <dbl>
## 1 5 6 3 2
## 2 10 6 2.6 1.78
## 3 100 6 3.41 1.70
## 4 5 12 7.8 4.09
## 5 10 12 6.1 3.38
## 6 100 12 6.56 3.53
library(furrr)
## Loading required package: future
plan(multisession) # preparation
v <- c(5, 10, 100)
v %>% future_map_dbl(f1, .options =
furrr_options(seed = TRUE))
## [1] 0.6562407 0.7985511 0.4760155
# output omitted
x <- 123
typeof(x)
## [1] "double"
?log
# alternatively
help(log)
v <- c(1, NA, 3)
v
## [1] 1 NA 3
is.na(v[2])
## [1] TRUE
v[2] == NA # wrong
## [1] NA
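# Not in the book: idioms for handling NA explicitly.
sum(is.na(v)) # number of missing values
## [1] 1
mean(v, na.rm = TRUE) # mean with NA removed
## [1] 2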
# Environment setup for Google Colaboratory
if (file.exists("/content")) {
options(Ncpus = parallel::detectCores())
installed_packages <- rownames(installed.packages())
packages_to_install <- c("exactci", "ggmosaic", "pastecs", "psych", "vcd")
install.packages(setdiff(packages_to_install, installed_packages))
}
x <- c(165, 170, 175, 180, 185)
mean(x) # mean
## [1] 175
n <- length(x) # sample size
sum(x) / n
## [1] 175
y <- c(173, 174, 175, 176, 177)
mean(y)
## [1] 175
var(x) # variance of x
## [1] 62.5
var(y) # variance of y
## [1] 2.5
sum((x - mean(x))^2) / (n - 1)
## [1] 62.5
sd(x) # standard deviation of x
## [1] 7.905694
sd(y) # standard deviation of y
## [1] 1.581139
var(x)**0.5 # standard deviation of x
## [1] 7.905694
psych::describe(x)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 5 175 7.91 175 175 7.41 165 185 20 0 -1.91 3.54
# alternatively
pastecs::stat.desc(x)
## nbr.val nbr.null nbr.na min max range
## 5.0000000 0.0000000 0.0000000 165.0000000 185.0000000 20.0000000
## sum median mean SE.mean CI.mean.0.95 var
## 875.0000000 175.0000000 175.0000000 3.5355339 9.8162158 62.5000000
## std.dev coef.var
## 7.9056942 0.0451754
quantile(x)
## 0% 25% 50% 75% 100%
## 165 170 175 180 185
x <- c(165, 170, 175, 180, 185)
var(x) # unbiased variance
## [1] 62.5
mean((x - mean(x))^2) # sample variance
## [1] 50
# alternatively
n <- length(x)
var(x) * (n - 1) / n # sample variance
## [1] 50
sd(x) # √(unbiased variance)
## [1] 7.905694
mean((x - mean(x))^2)^0.5 # √(sample variance)
## [1] 7.071068
# alternatively
sd(x) * sqrt((n - 1) / n) # √(sample variance)
## [1] 7.071068
sd(x) / length(x)**0.5 # standard error
## [1] 3.535534
library(tidyverse)
my_df <- data.frame(
name = c("A", "B", "C", "D"),
english = c( 60, 90, 70, 90),
math = c( 70, 80, 90, 100),
gender = c("f", "m", "m", "f"))
var(my_df$english)
## [1] 225
# result is a vector
my_df[, c(2, 3)] %>% sapply(var)
## english math
## 225.0000 166.6667
# result is a list
my_df[, c(2, 3)] %>% lapply(var)
## $english
## [1] 225
##
## $math
## [1] 166.6667
# result is a data frame
my_df[, c(2, 3)] %>% # columns 2 and 3:
summarize(across( # for
everything(), # all of them,
var)) # the unbiased variance
## english math
## 1 225 166.6667
# alternatively
my_df %>% # from the data frame,
summarize(across( # for
where(is.numeric), # the numeric columns,
var)) # the unbiased variance
## english math
## 1 225 166.6667
# alternatively
my_df %>% # from the data frame,
summarize(across( # for
where(is.numeric), # the numeric columns,
function(x) { var(x) })) # the unbiased variance
## english math
## 1 225 166.6667
psych::describe(my_df)
## vars n mean sd median trimmed mad min max range skew kurtosis
## name* 1 4 2.5 1.29 2.5 2.5 1.48 1 4 3 0.00 -2.08
## english 2 4 77.5 15.00 80.0 77.5 14.83 60 90 30 -0.14 -2.28
## math 3 4 85.0 12.91 85.0 85.0 14.83 70 100 30 0.00 -2.08
## gender* 4 4 1.5 0.58 1.5 1.5 0.74 1 2 1 0.00 -2.44
## se
## name* 0.65
## english 7.50
## math 6.45
## gender* 0.29
# alternatively
pastecs::stat.desc(my_df)
## name english math gender
## nbr.val NA 4.0000000 4.0000000 NA
## nbr.null NA 0.0000000 0.0000000 NA
## nbr.na NA 0.0000000 0.0000000 NA
## min NA 60.0000000 70.0000000 NA
## max NA 90.0000000 100.0000000 NA
## range NA 30.0000000 30.0000000 NA
## sum NA 310.0000000 340.0000000 NA
## median NA 80.0000000 85.0000000 NA
## mean NA 77.5000000 85.0000000 NA
## SE.mean NA 7.5000000 6.4549722 NA
## CI.mean NA 23.8683473 20.5426026 NA
## var NA 225.0000000 166.6666667 NA
## std.dev NA 15.0000000 12.9099445 NA
## coef.var NA 0.1935484 0.1518817 NA
# remainder omitted
table(my_df$gender)
##
## f m
## 2 2
my_df2 <- data.frame(
gender = my_df$gender,
excel = my_df$math >= 80)
table(my_df2)
## excel
## gender FALSE TRUE
## f 1 1
## m 0 2
my_df %>% group_by(gender) %>%
summarize(across(
where(is.numeric), mean),
.groups = "drop") # ungroup
## # A tibble: 2 × 3
## gender english math
## <chr> <dbl> <dbl>
## 1 f 75 85
## 2 m 80 85
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
hist(iris$Sepal.Length)
x <- c(10, 20, 30)
hist(x, breaks = 2) # number of bins: 2
x <- iris$Sepal.Length
tmp <- seq(min(x), max(x),
length.out = 10)
hist(x, breaks = tmp, right = FALSE)
plot(iris$Sepal.Length,
iris$Sepal.Width)
boxplot(iris[, -5])
library(tidyverse)
my_df <- psych::describe(iris[, -5])
my_df %>% select(mean, sd, se)
## mean sd se
## Sepal.Length 5.84 0.83 0.07
## Sepal.Width 3.06 0.44 0.04
## Petal.Length 3.76 1.77 0.14
## Petal.Width 1.20 0.76 0.06
tmp <- rownames(my_df)
my_df %>% ggplot(aes(x = factor(tmp, levels = tmp), y = mean)) +
geom_col() +
geom_errorbar(aes(ymin = mean - se, ymax = mean + se)) +
xlab(NULL)
my_group <- iris %>% group_by(Species) # For each species,
my_df <- my_group %>% # compute the mean of each variable
summarize(across(everything(), mean)) %>%
pivot_longer(-Species)
tmp <- my_group %>% # and the standard error of each variable.
summarize(across(everything(), ~ sd(.) / length(.)**0.5)) %>%
pivot_longer(-Species)
my_df$se <- tmp$value
head(my_df)
## # A tibble: 6 × 4
## Species name value se
## <fct> <chr> <dbl> <dbl>
## 1 setosa Sepal.Length 5.01 0.0498
## 2 setosa Sepal.Width 3.43 0.0536
## 3 setosa Petal.Length 1.46 0.0246
## 4 setosa Petal.Width 0.246 0.0149
## 5 versicolor Sepal.Length 5.94 0.0730
## 6 versicolor Sepal.Width 2.77 0.0444
my_df %>%
ggplot(aes(x = Species, y = value, fill = name)) +
geom_col(position = "dodge") +
geom_errorbar(aes(ymin = value - se, ymax = value + se), position = "dodge")
# mean of each variable
iris %>% pivot_longer(-Species) %>%
ggplot(aes(x = name, y = value)) +
geom_bar(stat = "summary", fun = mean) +
stat_summary(geom = "errorbar", fun.data = mean_se) +
xlab(NULL)
# mean of each variable (by species)
iris %>% pivot_longer(-Species) %>%
ggplot(aes(x = Species, y = value, fill = name)) +
geom_bar(stat = "summary", fun = mean, position = "dodge") +
stat_summary(geom = "errorbar", fun.data = mean_se, position = "dodge")
my_df <- data.frame(
Species = iris$Species,
w_Sepal = iris$Sepal.Width > 3)
table(my_df) # contingency table
## w_Sepal
## Species FALSE TRUE
## setosa 8 42
## versicolor 42 8
## virginica 33 17
mosaicplot(
formula = ~ Species + w_Sepal,
data = my_df)
library(vcd)
## Loading required package: grid
vcd::mosaic(formula = ~w_Sepal + Species, data = my_df,
labeling = labeling_values)
curve(x^3 - x, -2, 2)
x <- iris$Sepal.Length
tmp <- seq(min(x), max(x),
length.out = 10)
iris %>%
ggplot(aes(x = Sepal.Length)) +
geom_histogram(breaks = tmp,
closed = "left")
iris %>%
ggplot(aes(x = Sepal.Length,
y = Sepal.Width)) +
geom_point()
iris %>%
pivot_longer(-Species) %>%
ggplot(aes(
x = factor(name,
levels = names(iris)),
y = value)) +
geom_boxplot() +
xlab(NULL)
library(ggmosaic)
##
## Attaching package: 'ggmosaic'
## The following objects are masked from 'package:vcd':
##
## mosaic, spine
my_df <- data.frame(
Species = iris$Species,
w_Sepal = iris$Sepal.Width > 3)
my_df %>%
ggplot() +
geom_mosaic(
aes(x = product(w_Sepal, Species)))
f <- function(x) { x^3 - x }
data.frame(x = c(-2, 2)) %>%
ggplot(aes(x = x)) +
stat_function(fun = f)
x <- sample(x = 1:6, # range
size = 10000, # number of random numbers
replace = TRUE) # with replacement
hist(x, breaks = 0:6) # histogram
x <- runif(min = 0, # minimum
max = 1, # maximum
n = 1000) # number of random numbers
hist(x)
x <- as.integer( # convert to integer
runif(min = 1, # minimum
max = 7, # maximum + 1
n = 1000)) # number of random numbers
hist(x, breaks = 0:6) # output omitted
n <- 100
p <- 0.5
r <- 10000
x <- rbinom(size = n, # number of trials
prob = p, # probability of success
n = r) # number of random numbers
hist(x, breaks = max(x) - min(x))
r <- 10000
x <- rnorm(mean = 50, # mean
sd = 5, # standard deviation
n = r) # number of random numbers
hist(x, breaks = 40)
library(tidyverse)
f <- function(k) {
n <- 10000
tmp <- replicate(n = n, expr = g(rnorm(n = k, sd = 3)))
list(k = k,
mean = mean(tmp), # mean
se = sd(tmp) / sqrt(n)) # standard error
}
g <- var
c(10, 20, 30) %>% map_dfr(f)
## # A tibble: 3 × 3
## k mean se
## <dbl> <dbl> <dbl>
## 1 10 8.95 0.0420
## 2 20 9.00 0.0289
## 3 30 8.99 0.0239
g <- sd
c(5, 10, 15, 20) %>% map_dfr(f)
## # A tibble: 4 × 3
## k mean se
## <dbl> <dbl> <dbl>
## 1 5 2.80 0.0101
## 2 10 2.92 0.00704
## 3 15 2.95 0.00559
## 4 20 2.97 0.00488
g <- function(x) {
n <- length(x)
sd(x) *
sqrt((n - 1) / 2) *
gamma((n - 1) / 2) /
gamma(n / 2)
}
c(10, 20, 30) %>% map_dfr(f)
## # A tibble: 3 × 3
## k mean se
## <dbl> <dbl> <dbl>
## 1 10 3.00 0.00711
## 2 20 3.01 0.00497
## 3 30 3.01 0.00398
library(exactci)
## Loading required package: ssanv
## Loading required package: testthat
##
## Attaching package: 'testthat'
## The following object is masked from 'package:dplyr':
##
## matches
## The following object is masked from 'package:purrr':
##
## is_null
## The following objects are masked from 'package:readr':
##
## edition_get, local_edition
## The following object is masked from 'package:tidyr':
##
## matches
library(tidyverse)
a <- 0.05 # significance level
binom.exact(x = 2, # number of wins
n = 15, # number of draws
p = 4 / 10, # hypothesized probability of winning
plot = TRUE, # plot the p-value (figure shown in the next subsection)
conf.level = 1 - a, # confidence level (default)
tsmethod = "minlike", # definition of the p-value
alternative = "two.sided") # two-sided test (default)
##
## Exact two-sided binomial test (sum of minimum likelihood method)
##
## data: 2 and 15
## number of successes = 2, number of trials = 15, p-value = 0.03646
## alternative hypothesis: true probability of success is not equal to 0.4
## 95 percent confidence interval:
## 0.0242 0.3967
## sample estimates:
## probability of success
## 0.1333333
# 'less' for a left-tailed test
# 'greater' for a right-tailed test
t <- 4 / 10 # probability of winning
n <- 15 # number of draws
x <- 0:n # number of wins
my_pr <- dbinom(x, n, t) # probability of exactly x wins
my_pr2 <- dbinom(2, n, t) # probability of exactly 2 wins
my_data <- data.frame(x = x,
probability = my_pr,
color = my_pr <= my_pr2) # TRUE if the probability is at most that of 2 wins
my_data %>% ggplot(aes(x = x, y = probability, color = color)) +
geom_point(size = 3) +
geom_linerange(aes(ymin = 0, ymax = probability)) + # vertical lines
geom_hline(yintercept = my_pr2) + # horizontal line
theme(legend.position = "none") # hide the legend
# Result from the previous subsection (shown again)
# Code from the beginning of the previous subsection
X <- c(32.1, 26.2, 27.5, 31.8, 32.1, 31.2, 30.1, 32.4, 32.3, 29.9,
29.6, 26.6, 31.2, 30.9, 29.3)
Y <- c(35.4, 34.6, 31.1, 32.4, 33.3, 34.7, 35.3, 34.3, 32.1, 28.3,
33.3, 30.5, 32.6, 33.3, 32.2)
t.test(x = X, y = Y,
conf.level = 0.95, # confidence level (default)
paired = TRUE, # paired samples
alternative = "two.sided") # two-sided test (default)
##
## Paired t-test
##
## data: X and Y
## t = -4.3694, df = 14, p-value = 0.0006416
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -3.995525 -1.364475
## sample estimates:
## mean of the differences
## -2.68
# 'less' for a left-tailed test
# 'greater' for a right-tailed test
t.test(x = X, y = Y,
paired = FALSE, # not paired samples (default)
var.equal = TRUE, # assume equal variances; FALSE (default) if not assumed
alternative = "two.sided",
conf.level = 0.95)
##
## Two Sample t-test
##
## data: X and Y
## t = -3.6821, df = 28, p-value = 0.0009785
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -4.170906 -1.189094
## sample estimates:
## mean of x mean of y
## 30.21333 32.89333
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
"/fromzero/master/data/smoker.csv")
my_data <- read_csv(my_url)
## Rows: 1469 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): alive, smoker
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(my_data)
## # A tibble: 6 × 2
## alive smoker
## <chr> <chr>
## 1 Yes No
## 2 Yes No
## 3 Yes No
## 4 Yes No
## 5 Yes No
## 6 Yes No
my_table <- table(my_data)
my_table
## smoker
## alive No Yes
## No 117 54
## Yes 950 348
chisq.test(my_table, correct = FALSE)
##
## Pearson's Chi-squared test
##
## data: my_table
## X-squared = 1.7285, df = 1, p-value = 0.1886
X <- rep(0:1, c(13, 2)) # step 1
X
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1
tmp <- sample(X, size = length(X), replace = TRUE) # step 2
tmp
## [1] 1 1 0 0 0 0 0 0 0 0 1 1 0 0 0
sum(tmp) # step 3
## [1] 4
n <- 10^5
result <- replicate(n, sum(sample(X, size = length(X), replace = TRUE))) # step 4
hist(x = result, breaks = 0:15,
right = FALSE)
quantile(result, c(0.025, 0.975))
## 2.5% 97.5%
## 0 5
# Environment setup for Google Colaboratory
if (file.exists("/content")) {
options(Ncpus = parallel::detectCores())
installed_packages <- rownames(installed.packages())
packages_to_install <- c("caret")
install.packages(setdiff(packages_to_install, installed_packages))
}
library(tidyverse)
system(str_c("wget https://raw.githubusercontent.com/taroyabuki",
"/fromzero/master/data/exam.csv"))
my_df <- read_csv("exam.csv")
## Rows: 4 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): name, gender
## dbl (2): english, math
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# alternatively
my_df <- read.csv("exam.csv",
stringsAsFactors = FALSE)
my_df
## name english math gender
## 1 A 60 70 f
## 2 B 90 80 m
## 3 C 70 90 m
## 4 D 90 100 f
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
"/fromzero/master/data/exam.csv")
my_df <- read_csv(my_url)
## Rows: 4 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): name, gender
## dbl (2): english, math
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# alternatively
my_df <- read.csv(my_url, stringsAsFactors = FALSE)
my_df2 <- read.csv(
file = "exam.csv",
stringsAsFactors = FALSE,
row.names = 1)
my_df2
## english math gender
## A 60 70 f
## B 90 80 m
## C 70 90 m
## D 90 100 f
my_df %>% write_csv("exam2.csv")
# alternatively
my_df %>% write.csv(
file = "exam2.csv",
row.names = FALSE)
my_df2 %>% write.csv("exam3.csv")
my_df <- read_csv(file = "exam.csv",
locale = locale(encoding = "UTF-8"))
## Rows: 4 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): name, gender
## dbl (2): english, math
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# alternatively
my_df <- read.csv(file = "exam.csv",
stringsAsFactors = FALSE,
fileEncoding = "UTF-8")
my_df %>% write_csv("exam2.csv")
# alternatively
my_df %>% write.csv(file = "exam2.csv", row.names = FALSE,
fileEncoding = "UTF-8")
my_url <- "https://taroyabuki.github.io/fromzero/exam.html"
my_tables <- xml2::read_html(my_url) %>% rvest::html_table()
my_tables
## [[1]]
## # A tibble: 5 × 5
## X1 X2 X3 X4 X5
## <lgl> <chr> <chr> <chr> <chr>
## 1 NA name english math gender
## 2 NA A 60 70 f
## 3 NA B 90 80 m
## 4 NA C 70 90 m
## 5 NA D 90 100 f
tmp <- my_tables[[1]]
tmp
## # A tibble: 5 × 5
## X1 X2 X3 X4 X5
## <lgl> <chr> <chr> <chr> <chr>
## 1 NA name english math gender
## 2 NA A 60 70 f
## 3 NA B 90 80 m
## 4 NA C 70 90 m
## 5 NA D 90 100 f
# Rename the columns using the data in row 1.
colnames(tmp) <- tmp[1, ]
# Delete row 1 and column 1.
my_data <- tmp[-1, -1]
my_data
## # A tibble: 4 × 4
## name english math gender
## <chr> <chr> <chr> <chr>
## 1 A 60 70 f
## 2 B 90 80 m
## 3 C 70 90 m
## 4 D 90 100 f
library(jsonlite)
##
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
##
## flatten
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
"/fromzero/master/data/exam.json")
my_data <- fromJSON(my_url)
#my_data <- fromJSON("exam.json") # (when using a local file)
my_data
## name english math gender
## 1 A 60 70 f
## 2 B 90 80 m
## 3 C 70 90 m
## 4 D 90 100 f
library(xml2)
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
"/fromzero/master/data/exam.xml")
my_xml <- read_xml(my_url) # read the XML data
#my_xml <- read_xml("exam.xml") # (when using a local file)
xml_ns(my_xml) # check the namespace (d1)
## d1 <-> https://www.example.net/ns/1.0
my_records <- xml_find_all(my_xml, ".//d1:record")
f <- function(record) {
tmp <- xml_attrs(record) # Extract all the attributes,
xml_children(record) %>% walk(function(e) {
tmp[xml_name(e)] <<- xml_text(e) # then add each child element's name and text.
})
tmp
}
my_data <- my_records %>% map_dfr(f)
my_data$english <- as.numeric(my_data$english)
my_data$math <- as.numeric(my_data$math)
my_data
## # A tibble: 4 × 4
## english math gender name
## <dbl> <dbl> <chr> <chr>
## 1 60 70 f A
## 2 90 80 m B
## 3 70 90 m C
## 4 90 100 f D
x1 <- c(1, 2, 3)
z1 <- scale(x1)
# alternatively
z1 <- (x1 - mean(x1)) / sd(x1)
z1
## [1] -1 0 1
c(mean(z1), sd(z1))
## [1] 0 1
z1 * sd(x1) + mean(x1)
## [1] 1 2 3
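# Not in the book: scale() stores the center and scale as attributes,
# which can be used to undo the standardization.
my_z <- scale(x1)
attr(my_z, "scaled:center") # mean(x1)
## [1] 2
attr(my_z, "scaled:scale") # sd(x1)
## [1] 1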
x2 <- c(1, 3, 5)
z2 <- (x2 - mean(x1)) / sd(x1)
c(mean(z2), sd(z2))
## [1] 1 2
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:future':
##
## cluster
## The following object is masked from 'package:purrr':
##
## lift
library(tidyverse)
my_df <- data.frame(
id = c(1, 2, 3),
class = as.factor(
c("A", "B", "C")))
my_enc <- my_df %>%
dummyVars(formula = ~ .)
my_enc %>% predict(my_df)
## id class.A class.B class.C
## 1 1 1 0 0
## 2 2 0 1 0
## 3 3 0 0 1
my_df2 <- data.frame(
id = c( 4 , 5 , 6 ),
class = c("B", "C", "B"))
my_enc %>% predict(my_df2)
## id class.A class.B class.C
## 1 4 0 1 0
## 2 5 0 0 1
## 3 6 0 1 0
my_enc <- my_df %>%
dummyVars(formula = ~ .,
fullRank = TRUE)
my_enc %>% predict(my_df)
## id class.B class.C
## 1 1 0 0
## 2 2 1 0
## 3 3 0 1
my_enc %>% predict(my_df2)
## id class.B class.C
## 1 4 1 0
## 2 5 0 1
## 3 6 1 0
iris
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 8 5.0 3.4 1.5 0.2 setosa
## 9 4.4 2.9 1.4 0.2 setosa
## 10 4.9 3.1 1.5 0.1 setosa
## 11 5.4 3.7 1.5 0.2 setosa
## 12 4.8 3.4 1.6 0.2 setosa
## 13 4.8 3.0 1.4 0.1 setosa
## 14 4.3 3.0 1.1 0.1 setosa
## 15 5.8 4.0 1.2 0.2 setosa
## 16 5.7 4.4 1.5 0.4 setosa
## 17 5.4 3.9 1.3 0.4 setosa
## 18 5.1 3.5 1.4 0.3 setosa
## 19 5.7 3.8 1.7 0.3 setosa
## 20 5.1 3.8 1.5 0.3 setosa
## 21 5.4 3.4 1.7 0.2 setosa
## 22 5.1 3.7 1.5 0.4 setosa
## 23 4.6 3.6 1.0 0.2 setosa
## 24 5.1 3.3 1.7 0.5 setosa
## 25 4.8 3.4 1.9 0.2 setosa
## 26 5.0 3.0 1.6 0.2 setosa
## 27 5.0 3.4 1.6 0.4 setosa
## 28 5.2 3.5 1.5 0.2 setosa
## 29 5.2 3.4 1.4 0.2 setosa
## 30 4.7 3.2 1.6 0.2 setosa
## 31 4.8 3.1 1.6 0.2 setosa
## 32 5.4 3.4 1.5 0.4 setosa
## 33 5.2 4.1 1.5 0.1 setosa
## 34 5.5 4.2 1.4 0.2 setosa
## 35 4.9 3.1 1.5 0.2 setosa
## 36 5.0 3.2 1.2 0.2 setosa
## 37 5.5 3.5 1.3 0.2 setosa
## 38 4.9 3.6 1.4 0.1 setosa
## 39 4.4 3.0 1.3 0.2 setosa
## 40 5.1 3.4 1.5 0.2 setosa
## 41 5.0 3.5 1.3 0.3 setosa
## 42 4.5 2.3 1.3 0.3 setosa
## 43 4.4 3.2 1.3 0.2 setosa
## 44 5.0 3.5 1.6 0.6 setosa
## 45 5.1 3.8 1.9 0.4 setosa
## 46 4.8 3.0 1.4 0.3 setosa
## 47 5.1 3.8 1.6 0.2 setosa
## 48 4.6 3.2 1.4 0.2 setosa
## 49 5.3 3.7 1.5 0.2 setosa
## 50 5.0 3.3 1.4 0.2 setosa
## 51 7.0 3.2 4.7 1.4 versicolor
## 52 6.4 3.2 4.5 1.5 versicolor
## 53 6.9 3.1 4.9 1.5 versicolor
## 54 5.5 2.3 4.0 1.3 versicolor
## 55 6.5 2.8 4.6 1.5 versicolor
## 56 5.7 2.8 4.5 1.3 versicolor
## 57 6.3 3.3 4.7 1.6 versicolor
## 58 4.9 2.4 3.3 1.0 versicolor
## 59 6.6 2.9 4.6 1.3 versicolor
## 60 5.2 2.7 3.9 1.4 versicolor
## 61 5.0 2.0 3.5 1.0 versicolor
## 62 5.9 3.0 4.2 1.5 versicolor
## 63 6.0 2.2 4.0 1.0 versicolor
## 64 6.1 2.9 4.7 1.4 versicolor
## 65 5.6 2.9 3.6 1.3 versicolor
## 66 6.7 3.1 4.4 1.4 versicolor
## 67 5.6 3.0 4.5 1.5 versicolor
## 68 5.8 2.7 4.1 1.0 versicolor
## 69 6.2 2.2 4.5 1.5 versicolor
## 70 5.6 2.5 3.9 1.1 versicolor
## 71 5.9 3.2 4.8 1.8 versicolor
## 72 6.1 2.8 4.0 1.3 versicolor
## 73 6.3 2.5 4.9 1.5 versicolor
## 74 6.1 2.8 4.7 1.2 versicolor
## 75 6.4 2.9 4.3 1.3 versicolor
## 76 6.6 3.0 4.4 1.4 versicolor
## 77 6.8 2.8 4.8 1.4 versicolor
## 78 6.7 3.0 5.0 1.7 versicolor
## 79 6.0 2.9 4.5 1.5 versicolor
## 80 5.7 2.6 3.5 1.0 versicolor
## 81 5.5 2.4 3.8 1.1 versicolor
## 82 5.5 2.4 3.7 1.0 versicolor
## 83 5.8 2.7 3.9 1.2 versicolor
## 84 6.0 2.7 5.1 1.6 versicolor
## 85 5.4 3.0 4.5 1.5 versicolor
## 86 6.0 3.4 4.5 1.6 versicolor
## 87 6.7 3.1 4.7 1.5 versicolor
## 88 6.3 2.3 4.4 1.3 versicolor
## 89 5.6 3.0 4.1 1.3 versicolor
## 90 5.5 2.5 4.0 1.3 versicolor
## 91 5.5 2.6 4.4 1.2 versicolor
## 92 6.1 3.0 4.6 1.4 versicolor
## 93 5.8 2.6 4.0 1.2 versicolor
## 94 5.0 2.3 3.3 1.0 versicolor
## 95 5.6 2.7 4.2 1.3 versicolor
## 96 5.7 3.0 4.2 1.2 versicolor
## 97 5.7 2.9 4.2 1.3 versicolor
## 98 6.2 2.9 4.3 1.3 versicolor
## 99 5.1 2.5 3.0 1.1 versicolor
## 100 5.7 2.8 4.1 1.3 versicolor
## 101 6.3 3.3 6.0 2.5 virginica
## 102 5.8 2.7 5.1 1.9 virginica
## 103 7.1 3.0 5.9 2.1 virginica
## 104 6.3 2.9 5.6 1.8 virginica
## 105 6.5 3.0 5.8 2.2 virginica
## 106 7.6 3.0 6.6 2.1 virginica
## 107 4.9 2.5 4.5 1.7 virginica
## 108 7.3 2.9 6.3 1.8 virginica
## 109 6.7 2.5 5.8 1.8 virginica
## 110 7.2 3.6 6.1 2.5 virginica
## 111 6.5 3.2 5.1 2.0 virginica
## 112 6.4 2.7 5.3 1.9 virginica
## 113 6.8 3.0 5.5 2.1 virginica
## 114 5.7 2.5 5.0 2.0 virginica
## 115 5.8 2.8 5.1 2.4 virginica
## 116 6.4 3.2 5.3 2.3 virginica
## 117 6.5 3.0 5.5 1.8 virginica
## 118 7.7 3.8 6.7 2.2 virginica
## 119 7.7 2.6 6.9 2.3 virginica
## 120 6.0 2.2 5.0 1.5 virginica
## 121 6.9 3.2 5.7 2.3 virginica
## 122 5.6 2.8 4.9 2.0 virginica
## 123 7.7 2.8 6.7 2.0 virginica
## 124 6.3 2.7 4.9 1.8 virginica
## 125 6.7 3.3 5.7 2.1 virginica
## 126 7.2 3.2 6.0 1.8 virginica
## 127 6.2 2.8 4.8 1.8 virginica
## 128 6.1 3.0 4.9 1.8 virginica
## 129 6.4 2.8 5.6 2.1 virginica
## 130 7.2 3.0 5.8 1.6 virginica
## 131 7.4 2.8 6.1 1.9 virginica
## 132 7.9 3.8 6.4 2.0 virginica
## 133 6.4 2.8 5.6 2.2 virginica
## 134 6.3 2.8 5.1 1.5 virginica
## 135 6.1 2.6 5.6 1.4 virginica
## 136 7.7 3.0 6.1 2.3 virginica
## 137 6.3 3.4 5.6 2.4 virginica
## 138 6.4 3.1 5.5 1.8 virginica
## 139 6.0 3.0 4.8 1.8 virginica
## 140 6.9 3.1 5.4 2.1 virginica
## 141 6.7 3.1 5.6 2.4 virginica
## 142 6.9 3.1 5.1 2.3 virginica
## 143 5.8 2.7 5.1 1.9 virginica
## 144 6.8 3.2 5.9 2.3 virginica
## 145 6.7 3.3 5.7 2.5 virginica
## 146 6.7 3.0 5.2 2.3 virginica
## 147 6.3 2.5 5.0 1.9 virginica
## 148 6.5 3.0 5.2 2.0 virginica
## 149 6.2 3.4 5.4 2.3 virginica
## 150 5.9 3.0 5.1 1.8 virginica
# 以下省略
# Environment setup for Google Colaboratory
if (file.exists("/content")) {
options(Ncpus = parallel::detectCores())
installed_packages <- rownames(installed.packages())
packages_to_install <- c("caret", "doParallel", "pastecs")
install.packages(setdiff(packages_to_install, installed_packages))
}
library(caret)
library(tidyverse)
my_data <- cars
dim(my_data)
## [1] 50 2
head(my_data)
## speed dist
## 1 4 2
## 2 4 10
## 3 7 4
## 4 7 22
## 5 8 16
## 6 9 10
options(digits = 3)
pastecs::stat.desc(my_data)
## speed dist
## nbr.val 50.000 50.00
## nbr.null 0.000 0.00
## nbr.na 0.000 0.00
## min 4.000 2.00
## max 25.000 120.00
## range 21.000 118.00
## sum 770.000 2149.00
## median 15.000 36.00
## mean 15.400 42.98
## SE.mean 0.748 3.64
## CI.mean.0.95 1.503 7.32
## var 27.959 664.06
## std.dev 5.288 25.77
## coef.var 0.343 0.60
my_data %>%
ggplot(aes(x = speed, y = dist)) +
geom_point()
library(tidyverse)
my_data <- cars
tmp <- data.frame(speed = 21.5, dist = 67)
my_data %>% ggplot(aes(x = speed, y = dist)) +
coord_cartesian(xlim = c(4, 25), ylim = c(0, 120)) +
geom_point() +
stat_smooth(formula = y ~ x, method = "lm") +
geom_pointrange(data = tmp, aes(ymin = -9, ymax = dist), linetype = "dotted") +
geom_pointrange(data = tmp, aes(xmin = 0, xmax = speed), linetype = "dotted")
library(caret)
library(tidyverse)
my_data <- cars
my_model <- train(form = dist ~ speed, # model formula (how the output relates to the input)
data = my_data, # data
method = "lm") # method
coef(my_model$finalModel)
## (Intercept) speed
## -17.58 3.93
tmp <- data.frame(speed = 21.5)
my_model %>% predict(tmp)
## 1
## 67
f <- function(x) { my_model %>% predict(data.frame(speed = x)) }
my_data %>%
ggplot(aes(x = speed, y = dist,
color = "data")) +
geom_point() +
stat_function(
fun = f,
mapping = aes(color = "model"))
library(caret)
library(tidyverse)
my_data <- cars
my_model <- train(form = dist ~ speed, data = my_data, method = "lm")
y <- my_data$dist
y_ <- my_model %>% predict(my_data)
my_data$y_ <- y_
my_data$residual <- y - y_
head(my_data)
## speed dist y_ residual
## 1 4 2 -1.85 3.85
## 2 4 10 -1.85 11.85
## 3 7 4 9.95 -5.95
## 4 7 22 9.95 12.05
## 5 8 16 13.88 2.12
## 6 9 10 17.81 -7.81
my_data %>%
ggplot(aes(x = speed, y = dist)) +
geom_point() +
geom_line(aes(x = speed, y = y_)) +
geom_linerange(mapping = aes(ymin = y_, ymax = dist), linetype = "dotted")
RMSE(y_, y)
## [1] 15.1
# alternatively
mean((my_data$residual^2))**0.5
## [1] 15.1
R2(pred = y_, obs = y,
form = "traditional")
## [1] 0.651
R2(pred = y_, obs = y,
form = "corr")
## [1] 0.651
# alternatively
summary(my_model$finalModel)$r.squared
## [1] 0.651
my_test <- my_data[1:3, ]
y <- my_test$dist
y_ <- my_model %>% predict(my_test)
R2(pred = y_, obs = y,
form = "traditional")
## [1] -4.5
R2(pred = y_, obs = y,
form = "corr")
## [1] 0.0769
library(caret)
library(tidyverse)
my_data <- cars
my_idx <- c(2, 11, 27, 34, 39, 44)
my_sample <- my_data[my_idx, ]
options(warn = -1) # Suppress warnings from here on.
my_model <- train(form = dist ~ poly(speed, degree = 5, raw = TRUE),
data = my_sample,
method = "lm")
options(warn = 0) # Show warnings from here on.
y <- my_sample$dist
y_ <- my_model %>% predict(my_sample)
RMSE(y_, y)
## [1] 1.07e-10
R2(pred = y_, obs = y,
form = "traditional")
## [1] 1
R2(pred = y_, obs = y,
form = "corr")
## [1] 1
f <- function(x) { my_model %>% predict(data.frame(speed = x)) }
my_data %>%
ggplot(aes(x = speed, y = dist, color = "data")) +
geom_point() +
geom_point(data = my_sample, mapping = aes(color = "sample")) +
stat_function(fun = f, mapping = aes(color = "model")) +
coord_cartesian(ylim = c(0, 120))
# Preparation
library(caret)
library(tidyverse)
my_data <- cars
# Training
my_model <- train(form = dist ~ speed, data = my_data, method = "knn")
# Prepare for visualization
f <- function(x) { my_model %>% predict(data.frame(speed = x))}
my_data %>%
ggplot(aes(x = speed,
y = dist,
color = "data")) +
geom_point() +
stat_function(
fun = f,
mapping = aes(color = "model"))
y <- my_data$dist
y_ <- my_model %>% predict(my_data)
RMSE(y_, y)
## [1] 14
R2(pred = y_, obs = y,
form = "traditional")
## [1] 0.7
R2(pred = y_, obs = y,
form = "corr")
## [1] 0.702
library(caret)
library(tidyverse)
my_data <- cars
my_model <- train(form = dist ~ speed, data = my_data, method = "lm")
my_model$results
## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 15.1 0.697 12.1 2.41 0.122 1.67
my_model <- train(form = dist ~ speed, data = my_data, method = "lm",
trControl = trainControl(method = "cv", number = 5))
my_model$results
## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 15.6 0.68 12.1 2.39 0.18 1.59
my_model <- train(form = dist ~ speed, data = my_data, method = "lm",
trControl = trainControl(method = "LOOCV"))
my_model$results
## intercept RMSE Rsquared MAE
## 1 TRUE 15.7 0.622 12.1
library(doParallel)
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loading required package: iterators
## Loading required package: parallel
cl <- makeCluster(detectCores())
registerDoParallel(cl)
library(caret)
library(tidyverse)
my_data <- cars
my_model <- train(form = dist ~ speed, data = my_data, method = "lm")
y <- my_data$dist
y_ <- my_model %>% predict(my_data)
# RMSE (training)
RMSE(y_, y)
## [1] 15.1
# coefficient of determination 1 (training)
R2(pred = y_, obs = y,
form = "traditional")
## [1] 0.651
# coefficient of determination 6 (training)
R2(pred = y_, obs = y,
form = "corr")
## [1] 0.651
postResample(pred = y_, obs = y)
## RMSE Rsquared MAE
## 15.069 0.651 11.580
my_model <- train(form = dist ~ speed, data = my_data, method = "lm")
my_model$results
## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 14.4 0.701 11.4 2.59 0.0923 1.78
# From left: RMSE (validation), coefficient of determination 6 (validation), MAE (validation)
my_model <- train(form = dist ~ speed, data = my_data, method = "lm",
trControl = trainControl(method = "LOOCV"))
# Method 1
my_model$results
## intercept RMSE Rsquared MAE
## 1 TRUE 15.7 0.622 12.1
# Method 2
y <- my_model$pred$obs
y_ <- my_model$pred$pred
mean((y - y_)^2)**0.5
## [1] 15.7
mean(((y - y_)^2)**0.5)
## [1] 12.1
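# Equivalently, a sketch not in the book: caret's RMSE() and MAE() helpers
# compute the same quantities.
RMSE(y_, y) # RMSE (validation)
## [1] 15.7
MAE(y_, y) # MAE (validation)
## [1] 12.1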
library(caret)
library(tidyverse)
my_data <- cars
my_lm_model <- train(form = dist ~ speed, data = my_data, method = "lm",
trControl = trainControl(method = "LOOCV"))
my_knn_model <- train(form = dist ~ speed, data = my_data, method = "knn",
tuneGrid = data.frame(k = 5),
trControl = trainControl(method = "LOOCV"))
my_lm_model$results$RMSE
## [1] 15.7
my_knn_model$results$RMSE
## [1] 15.8
y <- my_data$dist
y_lm <- my_lm_model$pred$pred
y_knn <- my_knn_model$pred$pred
my_df <- data.frame(
lm = (y - y_lm)^2,
knn = (y - y_knn)^2)
head(my_df)
## lm knn
## 1 18.91 108.16
## 2 179.22 0.64
## 3 41.03 175.56
## 4 168.49 49.00
## 5 5.09 9.00
## 6 67.62 112.89
boxplot(my_df, ylab = "r^2")
t.test(x = my_df$lm, y = my_df$knn,
conf.level = 0.95,
paired = TRUE,
alternative = "two.sided")
##
## Paired t-test
##
## data: my_df$lm and my_df$knn
## t = -0.1, df = 49, p-value = 0.9
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -53.5 47.0
## sample estimates:
## mean of the differences
## -3.21
library(caret)
library(tidyverse)
my_data <- cars
my_model <- train(form = dist ~ speed, data = my_data, method = "knn")
my_model$results
## k RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 5 16.5 0.618 13.0 3.37 0.140 2.29
## 2 7 16.9 0.601 13.0 3.45 0.130 2.59
## 3 9 17.2 0.579 13.2 3.52 0.128 2.54
my_params <- expand.grid(k = 1:15)
my_model <- train(form = dist ~ speed, data = my_data, method = "knn",
tuneGrid = my_params,
trControl = trainControl(method = "LOOCV"))
head(my_model$results)
## k RMSE Rsquared MAE
## 1 1 17.2 0.578 13.8
## 2 2 16.8 0.594 13.0
## 3 3 16.3 0.622 12.7
## 4 4 16.0 0.609 12.3
## 5 5 15.8 0.617 12.0
## 6 6 16.0 0.608 12.3
ggplot(my_model)
my_model$bestTune
## k
## 5 5
my_model$results %>%
filter(RMSE == min(RMSE))
## k RMSE Rsquared MAE
## 1 5 15.8 0.617 12
y <- my_data$dist
y_ <- my_model %>% predict(my_data)
RMSE(y_, y)
## [1] 14
library(caret)
library(tidyverse)
my_data <- cars
my_loocv <- function(k) {
my_model <- train(form = dist ~ speed, data = my_data, method = "knn",
tuneGrid = data.frame(k = k),
trControl = trainControl(method = "LOOCV"))
y <- my_data$dist
y_ <- my_model %>% predict(my_data)
list(k = k,
training = RMSE(y_, y), # RMSE (training)
validation = my_model$results$RMSE) # RMSE (validation)
}
my_results <- 1:15 %>% map_dfr(my_loocv)
my_results %>%
pivot_longer(-k) %>%
ggplot(aes(x = k, y = value,
color = name)) +
geom_line() + geom_point() +
xlab("#Neighbors") + ylab("RMSE") +
theme(legend.position = c(1, 0),
legend.justification = c(1, 0))
# Stop the parallel cluster
stopCluster(cl)
registerDoSEQ()
# Environment setup for Google Colaboratory
if (file.exists("/content")) {
options(Ncpus = parallel::detectCores())
installed_packages <- rownames(installed.packages())
packages_to_install <- c("caret", "ggfortify", "glmnet", "glmnetUtils", "leaps", "neuralnet", "psych")
install.packages(setdiff(packages_to_install, installed_packages))
}
library(tidyverse)
my_url <- "http://www.liquidasset.com/winedata.html"
tmp <- read.table(file = my_url, # what to read
header = TRUE, # the first row holds the variable names
na.strings = ".", # string that represents missing values
skip = 62, # number of lines to skip
nrows = 38) # number of rows to read
psych::describe(tmp)
## vars n mean sd median trimmed mad min max range
## OBS 1 38 19.50 11.11 19.50 19.50 14.08 1.00 38.0 37.00
## VINT 2 38 1970.50 11.11 1970.50 1970.50 14.08 1952.00 1989.0 37.00
## LPRICE2 3 27 -1.45 0.63 -1.51 -1.49 0.72 -2.29 0.0 2.29
## WRAIN 4 38 605.00 135.28 586.50 603.06 174.95 376.00 845.0 469.00
## DEGREES 5 37 16.52 0.66 16.53 16.55 0.67 14.98 17.6 2.67
## HRAIN 6 38 137.00 66.74 120.50 132.19 59.30 38.00 292.0 254.00
## TIME_SV 7 38 12.50 11.11 12.50 12.50 14.08 -6.00 31.0 37.00
## skew kurtosis se
## OBS 0.00 -1.30 1.80
## VINT 0.00 -1.30 1.80
## LPRICE2 0.41 -0.82 0.12
## WRAIN 0.15 -1.10 21.95
## DEGREES -0.34 -0.73 0.11
## HRAIN 0.69 -0.20 10.83
## TIME_SV 0.00 -1.30 1.80
my_data <- na.omit(tmp[, -c(1, 2)])
head(my_data)
## LPRICE2 WRAIN DEGREES HRAIN TIME_SV
## 1 -0.999 600 17.1 160 31
## 2 -0.454 690 16.7 80 30
## 4 -0.808 502 17.1 130 28
## 6 -1.509 420 16.1 110 26
## 7 -1.717 582 16.4 187 25
## 8 -0.418 485 17.5 187 24
dim(my_data)
## [1] 27 5
my_data %>% write_csv("wine.csv")
#my_data <- read_csv("wine.csv") # (when using the file created above)
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
"/fromzero/master/data/wine.csv")
my_data <- read_csv(my_url)
## Rows: 27 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (5): LPRICE2, WRAIN, DEGREES, HRAIN, TIME_SV
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library(caret)
library(tidyverse)
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
"/fromzero/master/data/wine.csv")
my_data <- read_csv(my_url)
## Rows: 27 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (5): LPRICE2, WRAIN, DEGREES, HRAIN, TIME_SV
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
my_model <- train(form = LPRICE2 ~ WRAIN + DEGREES + HRAIN + TIME_SV,
data = my_data,
method = "lm",
trControl = trainControl(method = "LOOCV"))
coef(my_model$finalModel) %>%
as.data.frame
## .
## (Intercept) -12.14533
## WRAIN 0.00117
## DEGREES 0.61639
## HRAIN -0.00386
## TIME_SV 0.02385
my_test <- data.frame(
WRAIN = 500, DEGREES = 17,
HRAIN = 120, TIME_SV = 2)
my_model %>% predict(my_test)
## 1
## -1.5
y <- my_data$LPRICE2
y_ <- my_model %>% predict(my_data)
RMSE(y_, y)
## [1] 0.259
R2(pred = y_, obs = y,
form = "traditional")
## [1] 0.828
R2(pred = y_, obs = y,
form = "corr")
## [1] 0.828
my_model$results
## intercept RMSE Rsquared MAE
## 1 TRUE 0.323 0.736 0.277
M <- my_data[, -1] %>%
mutate(b0 = 1) %>% as.matrix
b <- MASS::ginv(M) %*% y
matrix(b,
dimnames = list(colnames(M)))
## [,1]
## WRAIN 0.00117
## DEGREES 0.61639
## HRAIN -0.00386
## TIME_SV 0.02385
## b0 -12.14533
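# A sketch not in the book: the same coefficients via the normal equations
# (assumes t(M) %*% M is invertible).
solve(t(M) %*% M, t(M) %*% y) # matches the coefficients above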
library(caret)
library(tidyverse)
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
"/fromzero/master/data/wine.csv")
my_data <- read_csv(my_url)
## Rows: 27 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (5): LPRICE2, WRAIN, DEGREES, HRAIN, TIME_SV
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
my_data %>%
mutate_if(is.numeric, scale) %>% # standardize the numeric columns
pivot_longer(-LPRICE2) %>%
ggplot(aes(x = name, y = value)) +
geom_boxplot() +
stat_summary(fun = mean, geom = "point", size = 3) +
xlab(NULL)
my_model <- train(
form = LPRICE2 ~ .,
data = my_data,
method = "lm",
preProcess = c("center", "scale"))
coef(my_model$finalModel) %>%
as.data.frame
## .
## (Intercept) -1.452
## WRAIN 0.151
## DEGREES 0.406
## HRAIN -0.282
## TIME_SV 0.197
my_test <- data.frame(
WRAIN = 500, DEGREES = 17,
HRAIN = 120, TIME_SV = 2)
my_model %>% predict(my_test)
## 1
## -1.5
library(caret)
library(tidyverse)
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
"/fromzero/master/data/wine.csv")
my_data <- read_csv(my_url)
## Rows: 27 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (5): LPRICE2, WRAIN, DEGREES, HRAIN, TIME_SV
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
n <- nrow(my_data)
my_data2 <- my_data %>% mutate(v1 = 0:(n - 1) %% 2,
v2 = 0:(n - 1) %% 3)
head(my_data2)
## # A tibble: 6 × 7
## LPRICE2 WRAIN DEGREES HRAIN TIME_SV v1 v2
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -0.999 600 17.1 160 31 0 0
## 2 -0.454 690 16.7 80 30 1 1
## 3 -0.808 502 17.2 130 28 0 2
## 4 -1.51 420 16.1 110 26 1 0
## 5 -1.72 582 16.4 187 25 0 1
## 6 -0.418 485 17.5 187 24 1 2
my_model2 <- train(form = LPRICE2 ~ ., data = my_data2, method = "lm",
trControl = trainControl(method = "LOOCV"))
y <- my_data2$LPRICE2
y_ <- my_model2 %>% predict(my_data2)
RMSE(y_, y)
## [1] 0.256
my_model2$results$RMSE
## [1] 0.357
library(caret)
library(tidyverse)
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
"/fromzero/master/data/wine.csv")
my_data <- read_csv(my_url)
## Rows: 27 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (5): LPRICE2, WRAIN, DEGREES, HRAIN, TIME_SV
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
n <- nrow(my_data)
my_data2 <- my_data %>% mutate(v1 = 0:(n - 1) %% 2,
v2 = 0:(n - 1) %% 3)
my_model <- train(form = LPRICE2 ~ .,
data = my_data2,
method = "leapForward", # 変数増加法
trControl = trainControl(method = "LOOCV"),
tuneGrid = data.frame(nvmax = 1:6)) # 選択する変数の上限
summary(my_model$finalModel)$outmat
## WRAIN DEGREES HRAIN TIME_SV v1 v2
## 1 ( 1 ) " " "*" " " " " " " " "
## 2 ( 1 ) " " "*" "*" " " " " " "
## 3 ( 1 ) " " "*" "*" "*" " " " "
## 4 ( 1 ) "*" "*" "*" "*" " " " "
library(caret)
library(tidyverse)
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
"/fromzero/master/data/wine.csv")
my_data <- read_csv(my_url)
## Rows: 27 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (5): LPRICE2, WRAIN, DEGREES, HRAIN, TIME_SV
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
A <- 2
B <- 0.1
my_model <- train(
form = LPRICE2 ~ .,
data = my_data,
method = "glmnet",
standardize = TRUE,
tuneGrid = data.frame(
lambda = A,
alpha = B))
coef(my_model$finalModel, A)
## 5 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -2.801552
## WRAIN .
## DEGREES 0.083291
## HRAIN -0.000415
## TIME_SV 0.002310
my_test <- data.frame(
WRAIN = 500, DEGREES = 17,
HRAIN = 120, TIME_SV = 2)
my_model %>% predict(my_test)
## [1] -1.43
library(ggfortify)
library(glmnetUtils)
my_data2 <- my_data %>% scale %>%
as.data.frame
B <- 0.1
glmnet(
form = LPRICE2 ~ .,
data = my_data2,
alpha = B) %>%
autoplot(xvar = "lambda") +
xlab("log A ( = log lambda)") +
theme(legend.position = c(0.15, 0.25))
As <- seq(0, 0.1, length.out = 21)
Bs <- seq(0, 0.1, length.out = 6)
my_model <- train(
form = LPRICE2 ~ ., data = my_data, method = "glmnet", standardize = TRUE,
trControl = trainControl(method = "LOOCV"),
tuneGrid = expand.grid(lambda = As, alpha = Bs))
my_model$bestTune
## alpha lambda
## 8 0 0.035
tmp <- "B ( = alpha)"
ggplot(my_model) +
theme(legend.position = c(0, 1), legend.justification = c(0, 1)) +
xlab("A ( = lambda)") +
guides(shape = guide_legend(tmp), color = guide_legend(tmp))
my_model$results %>%
filter(RMSE == min(RMSE))
## alpha lambda RMSE Rsquared MAE
## 1 0 0.000 0.32 0.737 0.278
## 2 0 0.005 0.32 0.737 0.278
## 3 0 0.010 0.32 0.737 0.278
## 4 0 0.015 0.32 0.737 0.278
## 5 0 0.020 0.32 0.737 0.278
## 6 0 0.025 0.32 0.737 0.278
## 7 0 0.030 0.32 0.737 0.278
## 8 0 0.035 0.32 0.737 0.278
curve(1 / (1 + exp(-x)), -6, 6)
library(caret)
library(tidyverse)
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
"/fromzero/master/data/wine.csv")
my_data <- read_csv(my_url)
## Rows: 27 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (5): LPRICE2, WRAIN, DEGREES, HRAIN, TIME_SV
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
my_model <- train(form = LPRICE2 ~ .,
data = my_data,
method = "neuralnet", # ニューラルネットワーク
preProcess = c("center", "scale"), # 標準化
trControl = trainControl(method = "LOOCV"))
plot(my_model$finalModel) # 訓練済ネットワークの描画
my_model$results
## layer1 layer2 layer3 RMSE Rsquared MAE
## 1 1 0 0 0.315 0.749 0.271
## 2 3 0 0 0.329 0.741 0.267
## 3 5 0 0 0.331 0.772 0.270
my_model <- train(
form = LPRICE2 ~ .,
data = my_data,
method = "neuralnet",
preProcess = c("center", "scale"),
trControl = trainControl(method = "LOOCV"),
tuneGrid = expand.grid(layer1 = 1:5,
layer2 = 0:2,
layer3 = 0))
my_model$results %>%
filter(RMSE == min(RMSE))
## layer1 layer2 layer3 RMSE Rsquared MAE
## 1 3 0 0 0.298 0.784 0.225
# Environment setup for Google Colaboratory
if (file.exists("/content")) {
options(Ncpus = parallel::detectCores())
installed_packages <- rownames(installed.packages())
packages_to_install <- c("caret", "furrr", "psych", "randomForest", "rpart.plot", "xgboost")
install.packages(setdiff(packages_to_install, installed_packages))
}
my_data <- iris
head(my_data)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
psych::describe(my_data)
## vars n mean sd median trimmed mad min max range skew
## Sepal.Length 1 150 5.84 0.83 5.80 5.81 1.04 4.3 7.9 3.6 0.31
## Sepal.Width 2 150 3.06 0.44 3.00 3.04 0.44 2.0 4.4 2.4 0.31
## Petal.Length 3 150 3.76 1.77 4.35 3.76 1.85 1.0 6.9 5.9 -0.27
## Petal.Width 4 150 1.20 0.76 1.30 1.18 1.04 0.1 2.5 2.4 -0.10
## Species* 5 150 2.00 0.82 2.00 2.00 1.48 1.0 3.0 2.0 0.00
## kurtosis se
## Sepal.Length -0.61 0.07
## Sepal.Width 0.14 0.04
## Petal.Length -1.42 0.14
## Petal.Width -1.36 0.06
## Species* -1.52 0.07
library(caret)
library(tidyverse)
my_data <- iris
my_model <- train(form = Species ~ ., data = my_data, method = "rpart")
rpart.plot::rpart.plot(my_model$finalModel, extra = 1)
my_test <- tribble(
~Sepal.Length, ~Sepal.Width, ~Petal.Length, ~Petal.Width,
5.0, 3.5, 1.5, 0.5,
6.5, 3.0, 5.0, 2.0)
my_model %>% predict(my_test)
## [1] setosa virginica
## Levels: setosa versicolor virginica
my_model %>% predict(my_test,
type = "prob")
## setosa versicolor virginica
## 1 1 0.0000 0.000
## 2 0 0.0217 0.978
library(caret)
library(tidyverse)
my_data <- iris
my_model <- train(form = Species ~ ., data = my_data, method = "rpart2")
## note: only 2 possible values of the max tree depth from the initial fit.
## Truncating the grid to 2 .
y <- my_data$Species
y_ <- my_model %>% predict(my_data)
confusionMatrix(data = y_, reference = y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 49 5
## virginica 0 1 45
##
## Overall Statistics
##
## Accuracy : 0.96
## 95% CI : (0.915, 0.985)
## No Information Rate : 0.333
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.94
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.000 0.980 0.900
## Specificity 1.000 0.950 0.990
## Pos Pred Value 1.000 0.907 0.978
## Neg Pred Value 1.000 0.990 0.952
## Prevalence 0.333 0.333 0.333
## Detection Rate 0.333 0.327 0.300
## Detection Prevalence 0.333 0.360 0.307
## Balanced Accuracy 1.000 0.965 0.945
# remainder omitted
y <- my_data$Species
y_ <- my_model %>% predict(my_data)
mean(y_ == y)
## [1] 0.96
my_model <- train(form = Species ~ ., data = my_data, method = "rpart2",
trControl = trainControl(method = "LOOCV"))
## note: only 2 possible values of the max tree depth from the initial fit.
## Truncating the grid to 2 .
my_model$results
## maxdepth Accuracy Kappa
## 1 1 0.333 0.00
## 2 2 0.953 0.93
my_model <- train(form = Species ~ ., data = my_data, method = "rpart2",
tuneGrid = data.frame(maxdepth = 1:10),
trControl = trainControl(method = "LOOCV"))
my_model$results %>% filter(Accuracy == max(Accuracy))
## maxdepth Accuracy Kappa
## 1 2 0.953 0.93
# A function that, given parameters, returns the accuracy (LOOCV)
my_loocv <- function(maxdepth, minbucket, minsplit) {
my_model <- train(form = Species ~ ., data = my_data, method = "rpart2",
trControl = trainControl(method = "LOOCV"),
tuneGrid = data.frame(maxdepth = maxdepth),
control = rpart::rpart.control(cp = 0.01,
minbucket = minbucket,
minsplit = minsplit))
list(maxdepth = maxdepth,
minbucket = minbucket,
minsplit = minsplit,
Accuracy = my_model$results$Accuracy)
}
my_params <- expand.grid(
maxdepth = 2:5,
minbucket = 1:7,
minsplit = c(2, 20))
library(furrr)
plan(multisession) # set up parallel processing
my_results <- my_params %>% future_pmap_dfr(my_loocv, # run
.options = furrr_options(seed = TRUE))
my_results %>% filter(Accuracy == max(Accuracy)) # maximum accuracy (validation)
## # A tibble: 6 × 4
## maxdepth minbucket minsplit Accuracy
## <int> <int> <dbl> <dbl>
## 1 3 5 2 0.973
## 2 4 5 2 0.973
## 3 5 5 2 0.973
## 4 3 5 20 0.973
## 5 4 5 20 0.973
## 6 5 5 20 0.973
my_model <- train(form = Species ~ ., data = my_data, method = "rpart2",
trControl = trainControl(method = "none"),
tuneGrid = data.frame(maxdepth = 3),
control = rpart::rpart.control(cp = 0.01,
minbucket = 5,
minsplit = 2))
rpart.plot::rpart.plot(
my_model$finalModel, extra = 1)
library(caret)
library(tidyverse)
my_data <- iris
my_model <- train(form = Species ~ ., data = my_data, method = "rf",
tuneGrid = data.frame(mtry = 2:4), # optional
trControl = trainControl(method = "LOOCV"))
my_model$results
## mtry Accuracy Kappa
## 1 2 0.96 0.94
## 2 3 0.96 0.94
## 3 4 0.96 0.94
my_model <- train(
form = Species ~ ., data = my_data, method = "xgbTree",
tuneGrid = expand.grid(
nrounds = c(50, 100, 150),
max_depth = c(1, 2, 3),
eta = c(0.3, 0.4),
gamma = 0,
colsample_bytree = c(0.6, 0.8),
min_child_weight = 1,
subsample = c(0.5, 0.75, 1)),
trControl = trainControl(method = "cv", number = 5)) # 5分割交差検証
my_model$results %>% filter(Accuracy == max(Accuracy)) %>% head(5) %>% t
## 1 2 3 4 5
## eta 0.3000 0.300 0.4000 0.4000 0.3000
## max_depth 1.0000 2.000 2.0000 3.0000 2.0000
## gamma 0.0000 0.000 0.0000 0.0000 0.0000
## colsample_bytree 0.8000 0.600 0.8000 0.8000 0.6000
## min_child_weight 1.0000 1.000 1.0000 1.0000 1.0000
## subsample 0.5000 0.750 0.7500 0.5000 0.7500
## nrounds 50.0000 50.000 50.0000 50.0000 100.0000
## Accuracy 0.9533 0.953 0.9533 0.9533 0.9533
## Kappa 0.9300 0.930 0.9300 0.9300 0.9300
## AccuracySD 0.0183 0.038 0.0183 0.0183 0.0183
## KappaSD 0.0274 0.057 0.0274 0.0274 0.0274
my_model <- train(form = Species ~ ., data = my_data, method = "rf")
ggplot(varImp(my_model))
library(caret)
library(tidyverse)
my_data <- iris
n <- nrow(my_data)
my_data$Petal.Length[0:(n - 1) %% 10 == 0] <- NA
my_data$Petal.Width[ 0:(n - 1) %% 10 == 1] <- NA
psych::describe(my_data) # Variables with n = 135 have 150 - 135 = 15 missing values.
## vars n mean sd median trimmed mad min max range skew
## Sepal.Length 1 150 5.84 0.83 5.8 5.81 1.04 4.3 7.9 3.6 0.31
## Sepal.Width 2 150 3.06 0.44 3.0 3.04 0.44 2.0 4.4 2.4 0.31
## Petal.Length 3 135 3.75 1.76 4.3 3.75 1.78 1.0 6.9 5.9 -0.27
## Petal.Width 4 135 1.20 0.77 1.3 1.18 1.04 0.1 2.5 2.4 -0.09
## Species* 5 150 2.00 0.82 2.0 2.00 1.48 1.0 3.0 2.0 0.00
## kurtosis se
## Sepal.Length -0.61 0.07
## Sepal.Width 0.14 0.04
## Petal.Length -1.41 0.15
## Petal.Width -1.35 0.07
## Species* -1.52 0.07
my_model <- train(
form = Species ~ ., data = my_data, method = "rpart2",
na.action = na.pass, # do not stop training because of missing values
preProcess = "medianImpute", # impute missing values with the median
trControl = trainControl(method = "LOOCV"),
tuneGrid = data.frame(maxdepth = 20), # upper limit on tree depth
control = rpart::rpart.control(minsplit = 2, # condition for splitting
minbucket = 1)) # condition on terminal nodes
max(my_model$results$Accuracy)
## [1] 0.927
my_model <- train(form = Species ~ ., data = my_data, method = "xgbTree",
na.action = na.pass,
trControl = trainControl(method = "cv", number = 5))
max(my_model$results$Accuracy)
## [1] 0.96
library(caret)
library(tidyverse)
my_data <- iris
my_model <- train(form = Species ~ ., data = my_data, method = "knn",
trControl = trainControl(method = "LOOCV"))
my_model$results %>% filter(Accuracy == max(Accuracy))
## k Accuracy Kappa
## 1 5 0.967 0.95
## 2 7 0.967 0.95
## 3 9 0.967 0.95
library(caret)
library(tidyverse)
my_data <- iris
my_model <- train(form = Species ~ ., data = my_data, method = "nnet",
preProcess = c("center", "scale"), # 標準化
trControl = trainControl(method = "LOOCV"),
trace = FALSE) # 途中経過を表示しない
my_model$results %>% filter(Accuracy == max(Accuracy))
## size decay Accuracy Kappa
## 1 3 0.1 0.973 0.96
## 2 5 0.1 0.973 0.96
# Environment setup for Google Colaboratory
if (file.exists("/content")) {
options(Ncpus = parallel::detectCores())
installed_packages <- rownames(installed.packages())
packages_to_install <- c("caret", "PRROC", "rpart.plot")
install.packages(setdiff(packages_to_install, installed_packages))
}
y <- c( 0, 1, 1, 0, 1, 0, 1, 0, 0, 1)
y_score <- c(0.7, 0.8, 0.3, 0.4, 0.9, 0.6, 0.99, 0.1, 0.2, 0.5)
y_ <- ifelse(0.5 <= y_score, 1, 0)
y_
## [1] 1 1 0 0 1 1 1 0 0 1
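# As a quick cross-check (a sketch, not from the book), the confusion
# counts can be tabulated directly; rows are the predictions y_,
# columns are the ground truth y.
table(y_, y)
##    y
## y_  0 1
##   0 3 1
##   1 2 4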
library(caret)
confusionMatrix(data = as.factor(y_), # predictions
reference = as.factor(y), # ground truth
positive = "1", # treat "1" as the positive class.
mode = "everything") # compute all the metrics.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 3 1
## 1 2 4
##
## Accuracy : 0.7
## 95% CI : (0.348, 0.933)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 0.172
##
## Kappa : 0.4
##
## Mcnemar's Test P-Value : 1.000
##
## Sensitivity : 0.800
## Specificity : 0.600
## Pos Pred Value : 0.667
## Neg Pred Value : 0.750
## Precision : 0.667
## Recall : 0.800
## F1 : 0.727
## Prevalence : 0.500
## Detection Rate : 0.400
## Detection Prevalence : 0.600
## Balanced Accuracy : 0.700
##
## 'Positive' Class : 1
##
library(PRROC)
library(tidyverse)
y <- c( 0, 1, 1, 0, 1, 0, 1, 0, 0, 1)
y_score <- c(0.7, 0.8, 0.3, 0.4, 0.9, 0.6, 0.99, 0.1, 0.2, 0.5)
y_ <- ifelse(0.5 <= y_score, 1, 0)
c(sum((y == 0) & (y_ == 1)) / sum(y == 0), # FPR
sum((y == 1) & (y_ == 1)) / sum(y == 1)) # TPR
## [1] 0.4 0.8
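# A sketch (not from the book): sweeping the threshold over the
# observed scores yields one (FPR, TPR) pair per threshold, tracing the
# ROC curve that roc.curve draws below. Results omitted.
sapply(sort(unique(y_score)), function(th) {
y_th <- ifelse(th <= y_score, 1, 0)
c(FPR = sum((y == 0) & (y_th == 1)) / sum(y == 0),
TPR = sum((y == 1) & (y_th == 1)) / sum(y == 1))
})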
my_roc <- roc.curve(scores.class0 = y_score[y == 1], # scores of the cases labeled 1
scores.class1 = y_score[y == 0], # scores of the cases labeled 0
curve = TRUE)
my_roc %>% plot(xlab = "False Positive Rate",
ylab = "True Positive Rate",
legend = FALSE)
my_roc$auc
## [1] 0.8
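# AUC equals the probability that a randomly chosen positive case
# outscores a randomly chosen negative one (a sketch, not from the
# book; this data has no tied scores).
mean(outer(y_score[y == 1], y_score[y == 0], ">"))
## [1] 0.8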
c(sum((y == 1) & (y_ == 1)) / sum(y == 1), # Recall == TPR
sum((y == 1) & (y_ == 1)) / sum(y_ == 1)) # Precision
## [1] 0.800 0.667
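# F1 is the harmonic mean of Precision and Recall (a sketch confirming
# the value reported by confusionMatrix earlier).
recall <- sum((y == 1) & (y_ == 1)) / sum(y == 1)
precision <- sum((y == 1) & (y_ == 1)) / sum(y_ == 1)
2 * precision * recall / (precision + recall)
## [1] 0.727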
my_pr <- pr.curve(scores.class0 = y_score[y == 1],
scores.class1 = y_score[y == 0],
curve = TRUE)
my_pr %>% plot(xlab = "Recall",
ylab = "Precision",
legend = FALSE)
my_pr$auc.integral
## [1] 0.847
library(caret)
library(PRROC)
library(tidyverse)
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
"/fromzero/master/data/titanic.csv")
my_data <- read_csv(my_url)
## Rows: 2201 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Class, Sex, Age, Survived
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(my_data)
## # A tibble: 6 × 4
## Class Sex Age Survived
## <chr> <chr> <chr> <chr>
## 1 1st Male Child Yes
## 2 1st Male Child Yes
## 3 1st Male Child Yes
## 4 1st Male Child Yes
## 5 1st Male Child Yes
## 6 1st Male Adult No
my_model <- train(form = Survived ~ ., data = my_data, method = "rpart2",
tuneGrid = data.frame(maxdepth = 2),
trControl = trainControl(method = "LOOCV"))
rpart.plot::rpart.plot(my_model$finalModel, extra = 1)
my_model$results
## maxdepth Accuracy Kappa
## 1 2 0.783 0.41
y <- my_data$Survived
tmp <- my_model %>% predict(newdata = my_data, type = "prob")
y_score <- tmp$Yes
my_roc <- roc.curve(scores.class0 = y_score[y == "Yes"],
scores.class1 = y_score[y == "No"],
curve = TRUE)
my_roc$auc
## [1] 0.711
my_roc %>% plot(xlab = "False Positive Rate",
ylab = "True Positive Rate",
legend = FALSE)
X <- my_data %>% select(Class) # qualitative input variable
y <- my_data$Survived # output variable
options(warn = -1) # suppress warnings from here on.
my_model1 <- train(x = X, y = y, method = "rpart2",
tuneGrid = data.frame(maxdepth = 2),
trControl = trainControl(method = "LOOCV"))
options(warn = 0) # show warnings again from here on.
rpart.plot::rpart.plot(my_model1$finalModel, extra = 1)
my_model1$results
## maxdepth Accuracy Kappa
## 1 2 0.714 0.237
my_enc <- my_data %>% dummyVars(formula = Survived ~ Class)
my_data2 <- my_enc %>%
predict(my_data) %>%
as.data.frame %>%
mutate(Survived = my_data$Survived)
my_model2 <- train(form = Survived ~ ., data = my_data2, method = "rpart2",
tuneGrid = data.frame(maxdepth = 2),
trControl = trainControl(method = "LOOCV"))
rpart.plot::rpart.plot(my_model2$finalModel, extra = 1)
my_model2$results
## maxdepth Accuracy Kappa
## 1 2 0.714 0.237
my_model3 <- train(form = Survived ~ Class, data = my_data, method = "rpart2",
tuneGrid = data.frame(maxdepth = 2),
trControl = trainControl(method = "LOOCV"))
rpart.plot::rpart.plot(my_model3$finalModel, extra = 1)
my_model3$results
## maxdepth Accuracy Kappa
## 1 2 0.692 0.267
curve(1 / (1 + exp(-x)), -6, 6)
library(caret)
library(PRROC)
library(tidyverse)
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
"/fromzero/master/data/titanic.csv")
my_data <- read_csv(my_url)
## Rows: 2201 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Class, Sex, Age, Survived
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
my_model <- train(form = Survived ~ ., data = my_data, method = "glm",
trControl = trainControl(method = "LOOCV"))
coef(my_model$finalModel) %>%
as.data.frame
## .
## (Intercept) 2.044
## Class2nd -1.018
## Class3rd -1.778
## ClassCrew -0.858
## SexMale -2.420
## AgeChild 1.062
my_model$results
## parameter Accuracy Kappa
## 1 none 0.778 0.445
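# A sketch (not from the book): these coefficients feed the logistic
# (sigmoid) function plotted earlier. For the baseline case (Class =
# 1st, Sex = Female, Age = Adult) only the intercept remains, so,
# assuming "Yes" is the level being modeled, the predicted survival
# probability is:
1 / (1 + exp(-2.044))
## [1] 0.885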
# Environment setup for Google Colaboratory
if (file.exists("/content")) {
options(Ncpus = parallel::detectCores())
installed_packages <- rownames(installed.packages())
packages_to_install <- c("h2o", "keras")
install.packages(setdiff(packages_to_install, installed_packages))
}
library(keras)
##
## Attaching package: 'keras'
## The following object is masked from 'package:future':
##
## %<-%
library(tidyverse)
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
"/fromzero/master/data/wine.csv")
tmp <- read_csv(my_url)
## Rows: 27 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (5): LPRICE2, WRAIN, DEGREES, HRAIN, TIME_SV
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
my_data <- tmp[sample(nrow(tmp)), ]
X <- my_data %>%
select(-LPRICE2) %>% scale
y <- my_data$LPRICE2
curve(activation_relu(x), -3, 3)
## Loaded Tensorflow version 2.7.0
my_model <- keras_model_sequential() %>%
layer_dense(units = 3, activation = "relu", input_shape = c(4)) %>%
layer_dense(units = 1)
summary(my_model) # network summary
## Model: "sequential"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## dense_1 (Dense) (None, 3) 15
##
## dense (Dense) (None, 1) 4
##
## ================================================================================
## Total params: 19
## Trainable params: 19
## Non-trainable params: 0
## ________________________________________________________________________________
# Omitted (see the Python results).
my_model %>% compile(
loss = "mse",
optimizer = "rmsprop")
my_cb <- callback_early_stopping(
patience = 20,
restore_best_weights = TRUE)
my_history <- my_model %>% fit(
x = X,
y = y,
validation_split = 0.25,
batch_size = 10,
epochs = 500,
callbacks = list(my_cb),
verbose = 0)
plot(my_history)
## `geom_smooth()` using formula 'y ~ x'
my_history
##
## Final epoch (plot to see history):
## loss: 0.1457
## val_loss: 0.3786
y_ <- my_model %>% predict(X)
sqrt(mean((y_ - y)^2)) # RMSE
## [1] 0.453
library(keras)
library(tidyverse)
my_data <- iris[sample(nrow(iris)), ]
X <- my_data %>%
select(-Species) %>% scale
y <- as.integer(my_data$Species) - 1
my_model <- keras_model_sequential() %>%
layer_dense(units = 3, activation = "relu", input_shape = c(4)) %>%
layer_dense(units = 3, activation = "softmax")
my_model %>% compile(loss = "sparse_categorical_crossentropy",
optimizer = "rmsprop",
metrics = c("accuracy"))
my_cb <- callback_early_stopping(
patience = 20,
restore_best_weights = TRUE)
my_history <- my_model %>%
fit(x = X,
y = y,
validation_split = 0.25,
batch_size = 10,
epochs = 500,
callbacks = list(my_cb),
verbose = 0)
plot(my_history)
## `geom_smooth()` using formula 'y ~ x'
my_history
##
## Final epoch (plot to see history):
## loss: 0.1077
## accuracy: 0.9732
## val_loss: 0.1418
## val_accuracy: 0.9474
tmp <- my_model %>% predict(X)
y_ <- apply(tmp, 1, which.max) - 1
mean(y_ == y)
## [1] 0.967
-mean(log(c(0.8, 0.7, 0.3, 0.8)))
## [1] 0.502
-mean(log(c(0.7, 0.6, 0.2, 0.7)))
## [1] 0.708
y <- c(2, 1, 0, 1)
y_1 <- list(c(0.1, 0.1, 0.8),
c(0.1, 0.7, 0.2),
c(0.3, 0.4, 0.3),
c(0.1, 0.8, 0.1))
y_2 <- list(c(0.1, 0.2, 0.7),
c(0.2, 0.6, 0.2),
c(0.2, 0.5, 0.3),
c(0.2, 0.7, 0.1))
c(mean(as.array(loss_sparse_categorical_crossentropy(y_true = y, y_pred = y_1))),
mean(as.array(loss_sparse_categorical_crossentropy(y_true = y, y_pred = y_2))))
## [1] 0.502 0.708
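# A sketch (under the same y and y_1 as above): cross-entropy is just
# the mean negative log of the probability assigned to the true class.
p_true <- mapply(function(p, k) p[k + 1], y_1, y)
-mean(log(p_true))
## [1] 0.502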
library(keras)
library(tidyverse)
keras::`%<-%`(c(c(x_train, y_train), c(x_test, y_test)), dataset_mnist())
dim(x_train)
## [1] 60000 28 28
x_train[5, , ]
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
## [1,] 0 0 0 0 0 0 0 0 0 0 0 0 0
## [2,] 0 0 0 0 0 0 0 0 0 0 0 0 0
## [3,] 0 0 0 0 0 0 0 0 0 0 0 0 0
## [4,] 0 0 0 0 0 0 0 0 0 0 0 0 0
## [5,] 0 0 0 0 0 0 0 0 0 0 0 0 0
## [6,] 0 0 0 0 0 0 0 0 0 0 0 0 0
## [7,] 0 0 0 0 0 0 0 0 0 0 0 0 0
## [8,] 0 0 0 0 0 0 0 0 0 0 0 0 55
## [9,] 0 0 0 0 0 0 0 0 0 0 0 87 232
## [10,] 0 0 0 0 0 0 0 0 0 4 57 242 252
## [11,] 0 0 0 0 0 0 0 0 0 96 252 252 183
## [12,] 0 0 0 0 0 0 0 0 132 253 252 146 14
## [13,] 0 0 0 0 0 0 0 126 253 247 176 9 0
## [14,] 0 0 0 0 0 0 16 232 252 176 0 0 0
## [15,] 0 0 0 0 0 0 22 252 252 30 22 119 197
## [16,] 0 0 0 0 0 0 16 231 252 253 252 252 252
## [17,] 0 0 0 0 0 0 0 55 235 253 217 138 42
## [18,] 0 0 0 0 0 0 0 0 0 0 0 0 0
## [19,] 0 0 0 0 0 0 0 0 0 0 0 0 0
## [20,] 0 0 0 0 0 0 0 0 0 0 0 0 0
## [21,] 0 0 0 0 0 0 0 0 0 0 0 0 0
## [22,] 0 0 0 0 0 0 0 0 0 0 0 0 0
## [23,] 0 0 0 0 0 0 0 0 0 0 0 0 0
## [24,] 0 0 0 0 0 0 0 0 0 0 0 0 0
## [25,] 0 0 0 0 0 0 0 0 0 0 0 0 0
## [26,] 0 0 0 0 0 0 0 0 0 0 0 0 0
## [27,] 0 0 0 0 0 0 0 0 0 0 0 0 0
## [28,] 0 0 0 0 0 0 0 0 0 0 0 0 0
## [,14] [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24] [,25]
## [1,] 0 0 0 0 0 0 0 0 0 0 0 0
## [2,] 0 0 0 0 0 0 0 0 0 0 0 0
## [3,] 0 0 0 0 0 0 0 0 0 0 0 0
## [4,] 0 0 0 0 0 0 0 0 0 0 0 0
## [5,] 0 0 0 0 0 0 0 0 0 0 0 0
## [6,] 0 0 0 0 0 0 0 0 0 0 0 0
## [7,] 0 0 0 0 0 0 0 0 0 0 0 0
## [8,] 148 210 253 253 113 87 148 55 0 0 0 0
## [9,] 252 253 189 210 252 252 253 168 0 0 0 0
## [10,] 190 65 5 12 182 252 253 116 0 0 0 0
## [11,] 14 0 0 92 252 252 225 21 0 0 0 0
## [12,] 0 0 0 215 252 252 79 0 0 0 0 0
## [13,] 0 8 78 245 253 129 0 0 0 0 0 0
## [14,] 36 201 252 252 169 11 0 0 0 0 0 0
## [15,] 241 253 252 251 77 0 0 0 0 0 0 0
## [16,] 226 227 252 231 0 0 0 0 0 0 0 0
## [17,] 24 192 252 143 0 0 0 0 0 0 0 0
## [18,] 62 255 253 109 0 0 0 0 0 0 0 0
## [19,] 71 253 252 21 0 0 0 0 0 0 0 0
## [20,] 0 253 252 21 0 0 0 0 0 0 0 0
## [21,] 71 253 252 21 0 0 0 0 0 0 0 0
## [22,] 106 253 252 21 0 0 0 0 0 0 0 0
## [23,] 45 255 253 21 0 0 0 0 0 0 0 0
## [24,] 0 218 252 56 0 0 0 0 0 0 0 0
## [25,] 0 96 252 189 42 0 0 0 0 0 0 0
## [26,] 0 14 184 252 170 11 0 0 0 0 0 0
## [27,] 0 0 14 147 252 42 0 0 0 0 0 0
## [28,] 0 0 0 0 0 0 0 0 0 0 0 0
## [,26] [,27] [,28]
## [1,] 0 0 0
## [2,] 0 0 0
## [3,] 0 0 0
## [4,] 0 0 0
## [5,] 0 0 0
## [6,] 0 0 0
## [7,] 0 0 0
## [8,] 0 0 0
## [9,] 0 0 0
## [10,] 0 0 0
## [11,] 0 0 0
## [12,] 0 0 0
## [13,] 0 0 0
## [14,] 0 0 0
## [15,] 0 0 0
## [16,] 0 0 0
## [17,] 0 0 0
## [18,] 0 0 0
## [19,] 0 0 0
## [20,] 0 0 0
## [21,] 0 0 0
## [22,] 0 0 0
## [23,] 0 0 0
## [24,] 0 0 0
## [25,] 0 0 0
## [26,] 0 0 0
## [27,] 0 0 0
## [28,] 0 0 0
plot(as.raster(x = x_train[5, , ],
max = max(x_train)))
head(y_train)
## [1] 5 0 4 1 9 2
c(min(x_train), max(x_train))
## [1] 0 255
x_train <- x_train / 255
x_test <- x_test / 255
my_index <- sample(1:60000, 6000)
x_train <- x_train[my_index, , ]
y_train <- y_train[my_index]
my_model <- keras_model_sequential() %>%
layer_flatten(input_shape = c(28, 28)) %>%
layer_dense(units = 256, activation = "relu") %>%
layer_dense(units = 10, activation = "softmax")
summary(my_model)
## Model: "sequential_2"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## flatten (Flatten) (None, 784) 0
##
## dense_5 (Dense) (None, 256) 200960
##
## dense_4 (Dense) (None, 10) 2570
##
## ================================================================================
## Total params: 203,530
## Trainable params: 203,530
## Non-trainable params: 0
## ________________________________________________________________________________
# Omitted (see the Python results).
my_model %>% compile(loss = "sparse_categorical_crossentropy",
optimizer = "rmsprop",
metrics = c("accuracy"))
my_cb <- callback_early_stopping(patience = 5,
restore_best_weights = TRUE)
my_history <- my_model %>%
fit(x = x_train,
y = y_train,
validation_split = 0.2,
batch_size = 128,
epochs = 20,
callbacks = list(my_cb),
verbose = 0)
plot(my_history)
## `geom_smooth()` using formula 'y ~ x'
tmp <- my_model %>% predict(x_test)
y_ <- apply(tmp, 1, which.max) - 1
table(y_, y_test)
## y_test
## y_ 0 1 2 3 4 5 6 7 8 9
## 0 964 0 12 4 2 9 11 3 12 9
## 1 0 1124 5 2 1 2 3 10 9 9
## 2 1 2 958 17 1 2 4 21 5 2
## 3 1 2 14 938 0 25 0 8 24 11
## 4 0 0 12 1 944 8 11 9 10 39
## 5 3 1 1 14 0 807 10 1 21 4
## 6 9 3 6 1 8 13 916 0 6 1
## 7 1 1 10 11 2 1 0 946 4 5
## 8 1 2 13 12 3 14 3 3 866 4
## 9 0 0 1 10 21 11 0 27 17 925
mean(y_ == y_test)
## [1] 0.939
my_model %>%
evaluate(x = x_test, y = y_test)
## loss accuracy
## 0.208 0.939
x_train2d <- x_train %>% array_reshape(c(-1, 28, 28, 1))
x_test2d <- x_test %>% array_reshape(c(-1, 28, 28, 1))
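# (In array_reshape, -1 leaves that dimension, here the number of
# images, to be inferred from the length of the data.)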
my_model <- keras_model_sequential() %>%
layer_conv_2d(filters = 32, kernel_size = 3, # convolutional layer
activation = "relu",
input_shape = c(28, 28, 1)) %>%
layer_max_pooling_2d(pool_size = 2) %>% # pooling layer
layer_flatten() %>%
layer_dense(units = 128, activation = "relu") %>%
layer_dense(units = 10, activation = "softmax")
summary(my_model)
## Model: "sequential_3"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## conv2d (Conv2D) (None, 26, 26, 32) 320
##
## max_pooling2d (MaxPooling2D) (None, 13, 13, 32) 0
##
## flatten_1 (Flatten) (None, 5408) 0
##
## dense_7 (Dense) (None, 128) 692352
##
## dense_6 (Dense) (None, 10) 1290
##
## ================================================================================
## Total params: 693,962
## Trainable params: 693,962
## Non-trainable params: 0
## ________________________________________________________________________________
# Omitted (see the Python results).
my_model %>% compile(loss = "sparse_categorical_crossentropy",
optimizer = "rmsprop",
metrics = c("accuracy"))
my_cb <- callback_early_stopping(patience = 5,
restore_best_weights = TRUE)
my_history <- my_model %>%
fit(x = x_train2d,
y = y_train,
validation_split = 0.2,
batch_size = 128,
epochs = 20,
callbacks = list(my_cb),
verbose = 0)
plot(my_history)
## `geom_smooth()` using formula 'y ~ x'
my_model %>%
evaluate(x = x_test2d, y = y_test)
## loss accuracy
## 0.122 0.965
my_model <- keras_model_sequential() %>%
layer_conv_2d(filters = 20, kernel_size = 5, activation = "relu",
input_shape = c(28, 28, 1)) %>%
layer_max_pooling_2d(pool_size = 2, strides = 2) %>%
layer_conv_2d(filters = 50, kernel_size = 5, activation = "relu") %>%
layer_max_pooling_2d(pool_size = 2, strides = 2) %>%
layer_dropout(rate = 0.25) %>%
layer_flatten() %>%
layer_dense(units = 500, activation = "relu") %>%
layer_dropout(rate = 0.5) %>%
layer_dense(units = 10, activation = "softmax")
my_model %>% compile(
loss = "sparse_categorical_crossentropy",
optimizer = "rmsprop",
metrics = c("accuracy"))
my_cb <- callback_early_stopping(patience = 5,
restore_best_weights = TRUE)
my_history <- my_model %>%
fit(x = x_train2d,
y = y_train,
validation_split = 0.2,
batch_size = 128,
epochs = 20,
callbacks = list(my_cb),
verbose = 0)
plot(my_history)
## `geom_smooth()` using formula 'y ~ x'
my_model %>%
evaluate(x = x_test2d, y = y_test)
## loss accuracy
## 0.0633 0.9825
y_prob <- my_model %>% predict(x_test2d) # class membership probabilities
my_result <- data.frame(
y_prob = apply(y_prob, 1, max), # maximum probability
y_ = apply(y_prob, 1, which.max) - 1, # predicted class
y = y_test, # ground truth
id = seq_len(length(y_test))) %>% # index
filter(y_ != y) %>% # keep only the misclassified cases
arrange(desc(y_prob)) # sort by probability, descending
head(my_result)
## y_prob y_ y id
## 1 1.000 5 3 450
## 2 1.000 0 9 6506
## 3 1.000 4 6 3521
## 4 1.000 7 3 1682
## 5 1.000 6 5 9730
## 6 0.999 0 5 9771
tmp <- my_result[1:5, ]$id
my_labels <- sprintf("%s (%s)",
my_result[1:5, ]$y, tmp)
my_fig <- expand.grid(
label = my_labels,
y = 28:1,
x = 1:28)
my_fig$z <- as.vector(
x_test[tmp, , ])
my_fig %>% ggplot(
aes(x = x, y = y, fill = z)) +
geom_raster() +
coord_fixed() +
theme_void() +
theme(legend.position = "none") +
facet_grid(. ~ label)
library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
library(keras)
library(tidyverse)
h2o.init()
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## /tmp/RtmpUk6Jta/file414622b8b2/h2o_rstudio_started_from_r.out
## /tmp/RtmpUk6Jta/file414ef06fbf/h2o_rstudio_started_from_r.err
##
##
## Starting H2O JVM and connecting: .. Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 1 seconds 332 milliseconds
## H2O cluster timezone: Etc/UTC
## H2O data parsing timezone: UTC
## H2O cluster version: 3.34.0.3
## H2O cluster version age: 2 years, 5 months and 16 days !!!
## H2O cluster name: H2O_started_from_R_rstudio_rir268
## H2O cluster total nodes: 1
## H2O cluster total memory: 15.69 GB
## H2O cluster total cores: 32
## H2O cluster allowed cores: 32
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Amazon S3, XGBoost, Algos, AutoML, Core V3, TargetEncoder, Core V4
## R Version: R version 4.1.2 (2021-11-01)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is too old (2 years, 5 months and 16 days)!
## Please download and install the latest version from http://h2o.ai/download/
h2o.no_progress()
# h2o.shutdown(prompt = FALSE) # shut down
my_url <- str_c("https://raw.githubusercontent.com/taroyabuki",
"/fromzero/master/data/wine.csv")
my_data <- read_csv(my_url)
## Rows: 27 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (5): LPRICE2, WRAIN, DEGREES, HRAIN, TIME_SV
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
my_frame <- as.h2o(my_data) # convert an ordinary data frame into an H2OFrame.
# or
my_frame <- h2o.importFile(my_url, header = TRUE) # read the data directly.
my_frame
## LPRICE2 WRAIN DEGREES HRAIN TIME_SV
## 1 -0.999 600 17.1 160 31
## 2 -0.454 690 16.7 80 30
## 3 -0.808 502 17.1 130 28
## 4 -1.509 420 16.1 110 26
## 5 -1.717 582 16.4 187 25
## 6 -0.418 485 17.5 187 24
##
## [27 rows x 5 columns]
# Convert back to an ordinary data frame.
my_frame %>% as.data.frame %>% head
## LPRICE2 WRAIN DEGREES HRAIN TIME_SV
## 1 -0.999 600 17.1 160 31
## 2 -0.454 690 16.7 80 30
## 3 -0.808 502 17.1 130 28
## 4 -1.509 420 16.1 110 26
## 5 -1.717 582 16.4 187 25
## 6 -0.418 485 17.5 187 24
# Results omitted (they look the same).
my_model <- h2o.automl(
y = "LPRICE2",
training_frame = my_frame,
max_runtime_secs = 60)
##
## 15:30:42.217: Skipping training of model GBM_1_AutoML_1_20240323_153039 due to exception: water.exceptions.H2OModelBuilderIllegalArgumentException: Illegal argument(s) for GBM model: GBM_1_AutoML_1_20240323_153039. Details: ERRR on field: _min_rows: The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 27.0.
min(my_model@leaderboard$rmse)
## [1] 0.289
tmp <- my_model %>%
predict(my_frame) %>%
as.data.frame
y_ <- tmp$predict
y <- my_data$LPRICE2
plot(y, y_)
keras::`%<-%`(c(c(x_train, y_train), c(x_test, y_test)), dataset_mnist())
my_index <- sample(1:60000, 6000)
x_train <- x_train[my_index, , ]
y_train <- y_train[my_index]
tmp <- x_train %>%
array_reshape(c(-1, 28 * 28)) %>%
as.data.frame
tmp$y <- as.factor(y_train)
my_train <- as.h2o(tmp)
tmp <- x_test %>%
array_reshape(c(-1, 28 * 28)) %>%
as.data.frame
my_test <- as.h2o(tmp)
my_model <- h2o.automl(
y = "y",
training_frame = my_train,
max_runtime_secs = 120)
min(my_model@leaderboard$
mean_per_class_error)
## [1] 0.076
tmp <- my_model %>%
predict(my_test) %>% as.data.frame
y_ <- tmp$predict
mean(y_ == y_test)
## [1] 0.932
# Environment setup for Google Colaboratory
if (file.exists("/content")) {
options(Ncpus = parallel::detectCores())
installed_packages <- rownames(installed.packages())
packages_to_install <- c("caret", "fable", "feasts", "prophet", "tsibble", "urca")
packages_to_upgrade <- c("ggplot2")
install.packages(union(setdiff(packages_to_install, installed_packages), packages_to_upgrade))
}
as.POSIXct("2021-01-01")
## [1] "2021-01-01 UTC"
library(tsibble)
##
## Attaching package: 'tsibble'
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
seq(from = 2021, to = 2023, by = 1)
## [1] 2021 2022 2023
seq(from = yearmonth("202101"), to = yearmonth("202103"), by = 2)
## Warning: `yearmonth()` may yield unexpected results.
## ℹ Please use arg `format` to supply formats.
## `yearmonth()` may yield unexpected results.
## ℹ Please use arg `format` to supply formats.
## <yearmonth[2]>
## [1] "2021 Jan" "2021 Mar"
seq(from = as.POSIXct("2021-01-01"), to = as.POSIXct("2021-01-03"), by = "1 day")
## [1] "2021-01-01 UTC" "2021-01-02 UTC" "2021-01-03 UTC"
seq(from = as.POSIXct("2021-01-01 00:00:00"),
to = as.POSIXct("2021-01-01 03:00:00"), by = "2 hour")
## [1] "2021-01-01 00:00:00 UTC" "2021-01-01 02:00:00 UTC"
my_data <- as.vector(AirPassengers)
n <- length(my_data) # number of data points (144)
k <- 108 # number of training data points
library(tidyverse)
library(tsibble)
my_ds <- seq(
from = yearmonth("1949/01"),
to = yearmonth("1960/12"),
by = 1)
my_label <- rep(
c("train", "test"),
c(k, n - k))
my_df <- tsibble(
ds = my_ds,
x = 0:(n - 1),
y = my_data,
label = my_label,
index = ds) # specify the time index column
head(my_df)
## # A tsibble: 6 x 4 [1M]
## ds x y label
## <mth> <int> <dbl> <chr>
## 1 1949 Jan 0 112 train
## 2 1949 Feb 1 118 train
## 3 1949 Mar 2 132 train
## 4 1949 Apr 3 129 train
## 5 1949 May 4 121 train
## 6 1949 Jun 5 135 train
my_train <- my_df[ 1:k , ]
my_test <- my_df[-(1:k), ]
y <- my_test$y
my_plot <- my_df %>%
ggplot(aes(x = ds,
y = y,
color = label)) +
geom_line()
my_plot
library(caret)
my_lm_model <- train(form = y ~ x, data = my_train, method = "lm")
y_ <- my_lm_model %>% predict(my_test)
caret::RMSE(y, y_) # RMSE (test)
## [1] 70.6
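# The same value by hand (a sketch): RMSE is the root of the mean
# squared error.
sqrt(mean((y - y_)^2))
## [1] 70.6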
y_ <- my_lm_model %>% predict(my_df)
tmp <- my_df %>%
mutate(y = y_, label = "model")
my_plot + geom_line(data = tmp)
library(fable)
## Loading required package: fabletools
##
## Attaching package: 'fabletools'
## The following objects are masked from 'package:caret':
##
## MAE, RMSE
my_arima_model <- my_train %>% model(ARIMA(y))
my_arima_model
## # A mable: 1 x 1
## `ARIMA(y)`
## <model>
## 1 <ARIMA(1,1,0)(0,1,0)[12]>
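# (The selected model, ARIMA(1,1,0)(0,1,0)[12], takes one ordinary and
# one seasonal (lag-12) difference and adds a single autoregressive
# term.)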
tmp <- my_arima_model %>% forecast(h = "3 years")
head(tmp)
## # A fable: 6 x 4 [1M]
## # Key: .model [1]
## .model ds y .mean
## <chr> <mth> <dist> <dbl>
## 1 ARIMA(y) 1958 Jan N(346, 94) 346.
## 2 ARIMA(y) 1958 Feb N(332, 148) 332.
## 3 ARIMA(y) 1958 Mar N(387, 210) 387.
## 4 ARIMA(y) 1958 Apr N(379, 271) 379.
## 5 ARIMA(y) 1958 May N(386, 332) 386.
## 6 ARIMA(y) 1958 Jun N(453, 393) 453.
y_ <- tmp$.mean
caret::RMSE(y_, y)
## [1] 22.1
# If only the forecast is needed:
#tmp %>% autoplot
tmp %>% autoplot +
geom_line(data = my_df,
aes(x = ds,
y = y,
color = label))
library(prophet)
## Loading required package: Rcpp
## Loading required package: rlang
##
## Attaching package: 'rlang'
## The following object is masked from 'package:xml2':
##
## as_list
## The following objects are masked from 'package:jsonlite':
##
## flatten, unbox
## The following objects are masked from 'package:testthat':
##
## is_false, is_null, is_true
## The following objects are masked from 'package:purrr':
##
## %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int,
## flatten_lgl, flatten_raw, invoke, splice
my_prophet_model <- my_train %>%
prophet(seasonality.mode = "multiplicative")
## Disabling weekly seasonality. Run prophet with weekly.seasonality=TRUE to override this.
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
tmp <- my_prophet_model %>% predict(my_test)
head(tmp[, c("ds", "yhat", "yhat_lower", "yhat_upper")])
## # A tibble: 6 × 4
## ds yhat yhat_lower yhat_upper
## <dttm> <dbl> <dbl> <dbl>
## 1 1958-01-01 00:00:00 359. 350. 369.
## 2 1958-02-01 00:00:00 351. 342. 359.
## 3 1958-03-01 00:00:00 407. 398. 416.
## 4 1958-04-01 00:00:00 398. 389. 407.
## 5 1958-05-01 00:00:00 402. 393. 411.
## 6 1958-06-01 00:00:00 460. 451. 469.
y_ <- tmp$yhat
caret::RMSE(y_, y)
## [1] 33.7
# my_prophet_model %>% plot(tmp) # if only the forecast is needed
my_prophet_model %>% plot(tmp) +
geom_line(data = my_train, aes(x = as.POSIXct(ds))) +
geom_line(data = my_test, aes(x = as.POSIXct(ds)), color = "red")
# Environment setup for Google Colaboratory
if (file.exists("/content")) {
options(Ncpus = parallel::detectCores())
installed_packages <- rownames(installed.packages())
packages_to_install <- c("factoextra", "ggbiplot", "igraph")
install.packages(setdiff(packages_to_install, installed_packages))
}
library(tidyverse)
my_data <- data.frame(
language = c( 0, 20, 20, 25, 22, 17),
english = c( 0, 20, 40, 20, 24, 18),
math = c(100, 20, 5, 30, 17, 25),
science = c( 0, 20, 5, 25, 16, 23),
society = c( 0, 20, 30, 0, 21, 17),
row.names = c("A", "B", "C", "D", "E", "F"))
my_result <- my_data %>% prcomp # run principal component analysis
my_result$x # principal component scores
## PC1 PC2 PC3 PC4 PC5
## A -74.91 -7.01 -0.361 0.0763 2.86e-15
## B 13.82 2.75 5.273 0.8418 -2.25e-15
## C 33.71 -18.42 -4.876 -0.8820 1.19e-15
## D 1.73 17.88 -7.925 -0.1493 1.91e-16
## E 17.84 -1.06 1.653 2.3125 -3.20e-15
## F 7.81 5.86 6.237 -2.1994 -3.09e-16
my_result %>% ggbiplot::ggbiplot(
labels = row.names(my_data),
scale = 0)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:purrr':
##
## compact
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
my_result$rotation %>% t
## language english math science society
## PC1 0.207 0.304 -0.88726 0.130 0.245
## PC2 0.279 -0.325 -0.09764 0.703 -0.559
## PC3 -0.306 -0.616 -0.05635 0.338 0.640
## PC4 0.765 -0.472 -0.00765 -0.418 0.132
## PC5 0.447 0.447 0.44721 0.447 0.447
summary(my_result)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5
## Standard deviation 38.264 12.2557 5.588 1.52970 1.23e-15
## Proportion of Variance 0.888 0.0911 0.019 0.00142 0.00e+00
## Cumulative Proportion 0.888 0.9796 0.999 1.00000 1.00e+00
my_result <- prcomp(
x = my_data,
scale = TRUE) # standardization
# or
my_result <- prcomp(
x = scale(my_data)) # standardized data
my_result$x # principal component scores
## PC1 PC2 PC3 PC4 PC5
## A -3.674 -0.569 -0.0584 0.0106 1.67e-16
## B 0.653 0.247 0.4348 0.0930 -1.47e-16
## C 1.568 -1.743 -0.3806 -0.0977 -2.38e-16
## D 0.251 1.640 -0.6754 -0.0320 4.44e-16
## E 0.886 -0.110 0.0925 0.2304 6.74e-18
## F 0.316 0.535 0.5870 -0.2043 -2.44e-16
Z <- my_data %>% scale(scale = FALSE) %>% as.matrix # centering only (no standardization)
#Z <- my_data %>% scale(scale = TRUE) %>% as.matrix # with standardization
n <- nrow(my_data)
S <- var(Z) # variance-covariance matrix
#S <- t(Z) %*% Z / (n - 1) # (same result)
tmp <- eigen(S) # eigenvalues and eigenvectors
Z %*% tmp$vectors # principal component scores (results omitted)
## [,1] [,2] [,3] [,4] [,5]
## A 74.91 -7.01 0.361 0.0763 4.63e-15
## B -13.82 2.75 -5.273 0.8418 -8.02e-15
## C -33.71 -18.42 4.876 -0.8820 6.96e-15
## D -1.73 17.88 7.925 -0.1493 2.86e-15
## E -17.84 -1.06 -1.653 2.3125 -1.01e-14
## F -7.81 5.86 -6.237 -2.1994 1.91e-15
cumsum(tmp$values) / sum(tmp$values) # cumulative proportion of variance
## [1] 0.888 0.980 0.999 1.000 1.000
udv <- svd(Z) # singular value decomposition
U <- udv$u
d <- udv$d
V <- udv$v
W <- diag(d)
c(all.equal(Z, U %*% W %*% t(V), check.attributes = FALSE), # check 1
all.equal(t(U) %*% U, diag(dim(U)[2])), # check 2
all.equal(t(V) %*% V, diag(dim(V)[2]))) # check 3
## [1] TRUE TRUE TRUE
U %*% W # principal component scores (results omitted)
## [,1] [,2] [,3] [,4] [,5]
## [1,] -74.91 -7.01 -0.361 0.0763 -6.85e-16
## [2,] 13.82 2.75 5.273 0.8418 8.96e-16
## [3,] 33.71 -18.42 -4.876 -0.8820 -5.18e-16
## [4,] 1.73 17.88 -7.925 -0.1493 -5.83e-16
## [5,] 17.84 -1.06 1.653 2.3125 -1.91e-15
## [6,] 7.81 5.86 6.237 -2.1994 -1.44e-15
e <- d^2 / (n - 1) # eigenvalues of the variance-covariance matrix
cumsum(e) / sum(e) # cumulative proportion of variance
## [1] 0.888 0.980 0.999 1.000 1.000
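# A consistency sketch (not from the book): prcomp on the same centered
# data reports standard deviations equal to the square roots of these
# eigenvalues.
all.equal(prcomp(my_data)$sdev, sqrt(e))
## [1] TRUE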
library(tidyverse)
my_data <- data.frame(
x = c( 0, -16, 10, 10),
y = c( 0, 0, 10, -15),
row.names = c("A", "B", "C", "D"))
my_result <- my_data %>%
dist("euclidian") %>% # distだけでも可
hclust("complete") # hclustだけでも可
my_result %>% factoextra::fviz_dend(
k = 3, # number of clusters
rect = TRUE, rect_fill = TRUE)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
my_result %>% factoextra::fviz_dend(
k = 3,
rect = TRUE, rect_fill = TRUE,
type = "phylogenic")
my_result %>% cutree(3)
## A B C D
## 1 2 1 3
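# For reference (a sketch): the "euclidean" distances fed to hclust are
# ordinary straight-line distances, e.g. between A = (0, 0) and
# B = (-16, 0):
sqrt((0 - (-16))^2 + (0 - 0)^2)
## [1] 16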
library(tidyverse)
my_data <- data.frame(
language = c( 0, 20, 20, 25, 22, 17),
english = c( 0, 20, 40, 20, 24, 18),
math = c(100, 20, 5, 30, 17, 25),
science = c( 0, 20, 5, 25, 16, 23),
society = c( 0, 20, 30, 0, 21, 17),
row.names = c("A", "B", "C", "D", "E", "F"))
try( # work around an error that occurs when knitting R Markdown.
my_data %>% scale %>% # standardize each column
gplots::heatmap.2(cexRow = 1, cexCol = 1), # draw with the label sizes specified.
silent = TRUE)
library(tidyverse)
my_data <- data.frame(
x = c( 0, -16, 10, 10),
y = c( 0, 0, 10, -15),
row.names = c("A", "B", "C", "D"))
my_result <- my_data %>% kmeans(3)
my_result$cluster
## A B C D
## 3 2 3 1
library(tidyverse)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
my_data <- iris[, -5]
f <- 2:5 %>% map(function(k) {
my_data %>% kmeans(k) %>%
fviz_cluster(data = my_data, geom = "point") +
ggtitle(sprintf("k = %s", k))
})
gridExtra::grid.arrange(f[[1]], f[[2]], f[[3]], f[[4]], ncol = 2)
fviz_nbclust(my_data, kmeans, method = "wss")
library(tidyverse)
my_data <- iris[, -5] %>% scale
my_result <- prcomp(my_data)$x %>% as.data.frame # principal component analysis
# For non-hierarchical clustering:
my_result$cluster <- (my_data %>% kmeans(3))$cluster %>% as.factor # my_data is already standardized
# For hierarchical clustering:
#my_result$cluster <- my_data %>% dist %>% hclust %>% cutree(3) %>% as.factor
my_result %>%
ggplot(aes(x = PC1, y = PC2, color = cluster)) + # color encodes the cluster.
geom_point(shape = iris$Species) + # shape encodes the species.
theme(legend.position = "none")