独热编码–哑变量
独热编码: n种状态 转变为n列
哑变量: n种状态转变为n-1列(目的:为了防止共线性的问题)
# Build a 20-row demo data frame mixing integer, factor, ordered-factor and
# character columns; it is reused by the encoding examples further down.
# The three integer columns are random draws and no seed is set in this block,
# so their values differ from run to run.
testFrame <- data.frame(
  First = sample(1:10, 20, replace = TRUE),
  Second = sample(1:20, 20, replace = TRUE),
  Third = sample(1:10, 20, replace = TRUE),
  Fourth = factor(rep(c("=A", "=B", "=C", "=D"), 5)),
  Fifth = ordered(rep(c("=E", "=F", "=G", "=H", "=I"), 4)),
  Sixth = rep(c("=a", "=b"), 10),
  Seventh = factor(c(rep(c("=J", "=K", "=L"), 6), "=J", "=K")),
  # Spell out FALSE: `F` is an ordinary variable that can be reassigned.
  # Keeps Sixth as a plain character column.
  stringsAsFactors = FALSE
)
head(testFrame)
#> First Second Third Fourth Fifth Sixth Seventh
#> 1 8 7 8 =A =E =a =J
#> 2 9 2 5 =B =F =b =K
#> 3 7 20 10 =C =G =a =L
#> 4 4 9 3 =D =H =b =J
#> 5 1 6 2 =A =I =a =K
#> 6 1 13 1 =B =E =b =L
# Show the column types of the demo frame (int / Factor / Ord.factor / chr).
utils::str(testFrame)
#> 'data.frame': 20 obs. of 7 variables:
#> $ First : int 8 9 7 4 1 1 2 10 4 9 ...
#> $ Second : int 7 2 20 9 6 13 5 19 20 14 ...
#> $ Third : int 8 5 10 3 2 1 1 7 6 9 ...
#> $ Fourth : Factor w/ 4 levels "=A","=B","=C",..: 1 2 3 4 1 2 3 4 1 2 ...
#> $ Fifth : Ord.factor w/ 5 levels "=E"<"=F"<"=G"<..: 1 2 3 4 5 1 2 3 4 5 ...
#> $ Sixth : chr "=a" "=b" "=a" "=b" ...
#> $ Seventh: Factor w/ 3 levels "=J","=K","=L": 1 2 3 1 2 3 1 2 3 1 ...
独热编码1
###########################################################################
#### 以下涉及公式的地方,
#### 公式右边的 -1 表示去掉截距项;此时只有第一个因子变量会生成完整的独热编码(n 列),其余因子变量仍是 n-1 列的哑变量
#### 公式左边的变量(响应变量)不会出现在生成的矩阵中
###########################################################################
###### One-hot encoding, method 1: mltools::one_hot()
library(data.table)
library(magrittr)
library(mltools) # this encoder depends on data.table
# Convert iris to a data.table first, then expand the factor column.
iris_dt <- data.table(iris)
head(one_hot(iris_dt))
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species_setosa
#> 1: 5.1 3.5 1.4 0.2 1
#> 2: 4.9 3.0 1.4 0.2 1
#> 3: 4.7 3.2 1.3 0.2 1
#> 4: 4.6 3.1 1.5 0.2 1
#> 5: 5.0 3.6 1.4 0.2 1
#> 6: 5.4 3.9 1.7 0.4 1
#> Species_versicolor Species_virginica
#> 1: 0 0
#> 2: 0 0
#> 3: 0 0
#> 4: 0 0
#> 5: 0 0
#> 6: 0 0
独热编码2
###### One-hot encoding, method 2: onehot::onehot()
library(onehot) # two steps: build the encoder, then predict() emits the encoding
encoder <- onehot(iris)
x <- predict(encoder, iris)
head(x)
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species=setosa
#> [1,] 5.1 3.5 1.4 0.2 1
#> [2,] 4.9 3.0 1.4 0.2 1
#> [3,] 4.7 3.2 1.3 0.2 1
#> [4,] 4.6 3.1 1.5 0.2 1
#> [5,] 5.0 3.6 1.4 0.2 1
#> [6,] 5.4 3.9 1.7 0.4 1
#> Species=versicolor Species=virginica
#> [1,] 0 0
#> [2,] 0 0
#> [3,] 0 0
#> [4,] 0 0
#> [5,] 0 0
#> [6,] 0 0
## caret::dummyVars() follows the same build-then-predict pattern.
library(caret)
# With fullRank = TRUE the output below has n - 1 Species columns
# (versicolor, virginica).
dummy <- dummyVars(~ ., data = iris, fullRank = TRUE)
head(predict(dummy, newdata = iris))
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species.versicolor
#> 1 5.1 3.5 1.4 0.2 0
#> 2 4.9 3.0 1.4 0.2 0
#> 3 4.7 3.2 1.3 0.2 0
#> 4 4.6 3.1 1.5 0.2 0
#> 5 5.0 3.6 1.4 0.2 0
#> 6 5.4 3.9 1.7 0.4 0
#> Species.virginica
#> 1 0
#> 2 0
#> 3 0
#> 4 0
#> 5 0
#> 6 0
# Same encoder with "- 1" in the formula: the output below keeps all three
# Species columns even though fullRank = TRUE.
dummy <- dummyVars(~ . - 1, data = iris, fullRank = TRUE)
head(predict(dummy, newdata = iris))
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species.setosa
#> 1 5.1 3.5 1.4 0.2 1
#> 2 4.9 3.0 1.4 0.2 1
#> 3 4.7 3.2 1.3 0.2 1
#> 4 4.6 3.1 1.5 0.2 1
#> 5 5.0 3.6 1.4 0.2 1
#> 6 5.4 3.9 1.7 0.4 1
#> Species.versicolor Species.virginica
#> 1 0 0
#> 2 0 0
#> 3 0 0
#> 4 0 0
#> 5 0 0
#> 6 0 0
独热编码3
###### One-hot encoding, method 3: stats::model.matrix()
# "- 1" removes the intercept, so the single factor (Species) gets all of its
# level columns.
head(model.matrix(~ . - 1, data = iris))
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Speciessetosa
#> 1 5.1 3.5 1.4 0.2 1
#> 2 4.9 3.0 1.4 0.2 1
#> 3 4.7 3.2 1.3 0.2 1
#> 4 4.6 3.1 1.5 0.2 1
#> 5 5.0 3.6 1.4 0.2 1
#> 6 5.4 3.9 1.7 0.4 1
#> Speciesversicolor Speciesvirginica
#> 1 0 0
#> 2 0 0
#> 3 0 0
#> 4 0 0
#> 5 0 0
#> 6 0 0
#### The Matrix package offers the same thing, except that it returns a sparse
#### matrix (dgCMatrix), which is much friendlier for large data.
library(Matrix)
head(sparse.model.matrix(~ . - 1, data = iris))
#> 6 x 7 sparse Matrix of class "dgCMatrix"
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Speciessetosa
#> 1 5.1 3.5 1.4 0.2 1
#> 2 4.9 3.0 1.4 0.2 1
#> 3 4.7 3.2 1.3 0.2 1
#> 4 4.6 3.1 1.5 0.2 1
#> 5 5.0 3.6 1.4 0.2 1
#> 6 5.4 3.9 1.7 0.4 1
#> Speciesversicolor Speciesvirginica
#> 1 . .
#> 2 . .
#> 3 . .
#> 4 . .
#> 5 . .
#> 6 . .
独热编码4
###### One-hot encoding, method 4: qdapTools::mtabulate()
library(qdapTools)
# Applied here to a single factor vector (one column, not the whole data
# frame); the output columns are named after the factor levels.
head(mtabulate(iris$Species))
#> setosa versicolor virginica
#> 1 1 0 0
#> 2 1 0 0
#> 3 1 0 0
#> 4 1 0 0
#> 5 1 0 0
#> 6 1 0 0
# nnet::class.ind() is likewise applied to a single factor vector rather than
# a data frame; the output columns are named after the factor levels.
library(nnet)
head(class.ind(iris$Species))
#> setosa versicolor virginica
#> [1,] 1 0 0
#> [2,] 1 0 0
#> [3,] 1 0 0
#> [4,] 1 0 0
#> [5,] 1 0 0
#> [6,] 1 0 0
独热编码5
###### One-hot encoding, method 5: ade4::acm.disjonctif()
## Takes a data frame of factors only; no bare vectors or numeric columns.
library(ade4)
# drop = FALSE keeps the single Species column as a data frame. It is spelled
# FALSE rather than `F`, because `F` is a variable that can be reassigned.
acm.disjonctif(iris[, 5, drop = FALSE]) %>% head()
#> Species.setosa Species.versicolor Species.virginica
#> 1 1 0 0
#> 2 1 0 0
#> 3 1 0 0
#> 4 1 0 0
#> 5 1 0 0
#> 6 1 0 0
独热编码6
###### One-hot encoding, method 6: dummies::dummy.data.frame()
library(dummies)
# `names` selects the columns to expand and `sep` joins column name and level.
# all = TRUE keeps the untouched columns as well; set all = FALSE to return
# only the generated dummy columns. TRUE is spelled out because `T` is a
# variable that can be reassigned.
alldata <- dummy.data.frame(iris, names = c("Species"), sep = "_", all = TRUE)
alldata %>% head()
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species_setosa
#> 1 5.1 3.5 1.4 0.2 1
#> 2 4.9 3.0 1.4 0.2 1
#> 3 4.7 3.2 1.3 0.2 1
#> 4 4.6 3.1 1.5 0.2 1
#> 5 5.0 3.6 1.4 0.2 1
#> 6 5.4 3.9 1.7 0.4 1
#> Species_versicolor Species_virginica
#> 1 0 0
#> 2 0 0
#> 3 0 0
#> 4 0 0
#> 5 0 0
#> 6 0 0
# Expand every column of class "factor" into one-hot columns.
head(dummy.data.frame(iris, dummy.class = "factor"))
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Speciessetosa
#> 1 5.1 3.5 1.4 0.2 1
#> 2 4.9 3.0 1.4 0.2 1
#> 3 4.7 3.2 1.3 0.2 1
#> 4 4.6 3.1 1.5 0.2 1
#> 5 5.0 3.6 1.4 0.2 1
#> 6 5.4 3.9 1.7 0.4 1
#> Speciesversicolor Speciesvirginica
#> 1 0 0
#> 2 0 0
#> 3 0 0
#> 4 0 0
#> 5 0 0
#> 6 0 0
# Same call on the demo frame: in the output below Fourth and Seventh are
# expanded, while Fifth (ordered) and Sixth (character) are left as they are.
head(dummy.data.frame(testFrame, dummy.class = "factor"))
#> First Second Third Fourth=A Fourth=B Fourth=C Fourth=D Fifth Sixth Seventh=J
#> 1 8 7 8 1 0 0 0 =E =a 1
#> 2 9 2 5 0 1 0 0 =F =b 0
#> 3 7 20 10 0 0 1 0 =G =a 0
#> 4 4 9 3 0 0 0 1 =H =b 1
#> 5 1 6 2 1 0 0 0 =I =a 0
#> 6 1 13 1 0 1 0 0 =E =b 0
#> Seventh=K Seventh=L
#> 1 0 0
#> 2 1 0
#> 3 0 1
#> 4 0 0
#> 5 1 0
#> 6 0 1
独热编码7
###### One-hot encoding, method 7: useful::build.x() / build.y()
library(useful)
# With the intercept kept, Species is encoded as n - 1 dummy columns.
head(build.x(~ ., iris))
#> (Intercept) Sepal.Length Sepal.Width Petal.Length Petal.Width
#> 1 1 5.1 3.5 1.4 0.2
#> 2 1 4.9 3.0 1.4 0.2
#> 3 1 4.7 3.2 1.3 0.2
#> 4 1 4.6 3.1 1.5 0.2
#> 5 1 5.0 3.6 1.4 0.2
#> 6 1 5.4 3.9 1.7 0.4
#> Speciesversicolor Speciesvirginica
#> 1 0 0
#> 2 0 0
#> 3 0 0
#> 4 0 0
#> 5 0 0
#> 6 0 0
# Without the intercept ("- 1"), Species gets all three level columns.
head(build.x(~ . - 1, iris))
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Speciessetosa
#> 1 5.1 3.5 1.4 0.2 1
#> 2 4.9 3.0 1.4 0.2 1
#> 3 4.7 3.2 1.3 0.2 1
#> 4 4.6 3.1 1.5 0.2 1
#> 5 5.0 3.6 1.4 0.2 1
#> 6 5.4 3.9 1.7 0.4 1
#> Speciesversicolor Speciesvirginica
#> 1 0 0
#> 2 0 0
#> 3 0 0
#> 4 0 0
#> 5 0 0
#> 6 0 0
# build.y() returns the left-hand side of the formula as is, unmodified.
build.y(Species ~ ., data = iris)
#> [1] setosa setosa setosa setosa setosa setosa
#> [7] setosa setosa setosa setosa setosa setosa
#> [13] setosa setosa setosa setosa setosa setosa
#> [19] setosa setosa setosa setosa setosa setosa
#> [25] setosa setosa setosa setosa setosa setosa
#> [31] setosa setosa setosa setosa setosa setosa
#> [37] setosa setosa setosa setosa setosa setosa
#> [43] setosa setosa setosa setosa setosa setosa
#> [49] setosa setosa versicolor versicolor versicolor versicolor
#> [55] versicolor versicolor versicolor versicolor versicolor versicolor
#> [61] versicolor versicolor versicolor versicolor versicolor versicolor
#> [67] versicolor versicolor versicolor versicolor versicolor versicolor
#> [73] versicolor versicolor versicolor versicolor versicolor versicolor
#> [79] versicolor versicolor versicolor versicolor versicolor versicolor
#> [85] versicolor versicolor versicolor versicolor versicolor versicolor
#> [91] versicolor versicolor versicolor versicolor versicolor versicolor
#> [97] versicolor versicolor versicolor versicolor virginica virginica
#> [103] virginica virginica virginica virginica virginica virginica
#> [109] virginica virginica virginica virginica virginica virginica
#> [115] virginica virginica virginica virginica virginica virginica
#> [121] virginica virginica virginica virginica virginica virginica
#> [127] virginica virginica virginica virginica virginica virginica
#> [133] virginica virginica virginica virginica virginica virginica
#> [139] virginica virginica virginica virginica virginica virginica
#> [145] virginica virginica virginica virginica virginica virginica
#> Levels: setosa versicolor virginica
独热编码8
###### One-hot encoding, method 8: model.matrix() on the mixed-type demo frame
# Explicit predictors with the intercept kept: Fourth becomes n - 1 dummies
# and the ordered factor Fifth becomes polynomial contrasts (.L, .Q, .C, ^4).
head(model.matrix(First ~ Second + Fourth + Fifth, data = testFrame))
#> (Intercept) Second Fourth=B Fourth=C Fourth=D Fifth.L Fifth.Q
#> 1 1 7 0 0 0 -0.6324555 0.5345225
#> 2 1 2 1 0 0 -0.3162278 -0.2672612
#> 3 1 20 0 1 0 0.0000000 -0.5345225
#> 4 1 9 0 0 1 0.3162278 -0.2672612
#> 5 1 6 0 0 0 0.6324555 0.5345225
#> 6 1 13 1 0 0 -0.6324555 0.5345225
#> Fifth.C Fifth^4
#> 1 -3.162278e-01 0.1195229
#> 2 6.324555e-01 -0.4780914
#> 3 -4.095972e-16 0.7171372
#> 4 -6.324555e-01 -0.4780914
#> 5 3.162278e-01 0.1195229
#> 6 -3.162278e-01 0.1195229
# All predictors, no intercept: only the first factor (Fourth) gets the full
# set of level columns; Sixth and Seventh still lose their first level.
head(model.matrix(First ~ . - 1, data = testFrame))
#> Second Third Fourth=A Fourth=B Fourth=C Fourth=D Fifth.L Fifth.Q
#> 1 7 8 1 0 0 0 -0.6324555 0.5345225
#> 2 2 5 0 1 0 0 -0.3162278 -0.2672612
#> 3 20 10 0 0 1 0 0.0000000 -0.5345225
#> 4 9 3 0 0 0 1 0.3162278 -0.2672612
#> 5 6 2 1 0 0 0 0.6324555 0.5345225
#> 6 13 1 0 1 0 0 -0.6324555 0.5345225
#> Fifth.C Fifth^4 Sixth=b Seventh=K Seventh=L
#> 1 -3.162278e-01 0.1195229 0 0 0
#> 2 6.324555e-01 -0.4780914 1 1 0
#> 3 -4.095972e-16 0.7171372 0 0 1
#> 4 -6.324555e-01 -0.4780914 1 0 0
#> 5 3.162278e-01 0.1195229 0 1 0
#> 6 -3.162278e-01 0.1195229 1 0 1
# All predictors with the intercept: every unordered factor is encoded as
# n - 1 dummy columns.
head(model.matrix(First ~ ., data = testFrame))
#> (Intercept) Second Third Fourth=B Fourth=C Fourth=D Fifth.L Fifth.Q
#> 1 1 7 8 0 0 0 -0.6324555 0.5345225
#> 2 1 2 5 1 0 0 -0.3162278 -0.2672612
#> 3 1 20 10 0 1 0 0.0000000 -0.5345225
#> 4 1 9 3 0 0 1 0.3162278 -0.2672612
#> 5 1 6 2 0 0 0 0.6324555 0.5345225
#> 6 1 13 1 1 0 0 -0.6324555 0.5345225
#> Fifth.C Fifth^4 Sixth=b Seventh=K Seventh=L
#> 1 -3.162278e-01 0.1195229 0 0 0
#> 2 6.324555e-01 -0.4780914 1 1 0
#> 3 -4.095972e-16 0.7171372 0 0 1
#> 4 -6.324555e-01 -0.4780914 1 0 0
#> 5 3.162278e-01 0.1195229 0 1 0
#> 6 -3.162278e-01 0.1195229 1 0 1
独热编码 转变为原始变量 (即逆运算)
## Inverse operation: turn a one-hot matrix back into one label per row.
# Encode Species alone without an intercept so each level has its own column.
d <- model.matrix(~ Species - 1, data = iris)
d <- data.frame(d)
# For each row take the name of the column holding the row maximum (the 1).
# ties.method = "first" keeps the pick deterministic: max.col() breaks ties at
# random by default, which matters for rows containing more than one 1.
# Rows whose sum is below 1 decode to NA; NA_character_ keeps the result a
# character vector even when every row is NA.
# The labels are the matrix column names (e.g. "Speciessetosa").
decoded <- ifelse(
  rowSums(d) >= 1,
  names(d)[max.col(d, ties.method = "first")],
  NA_character_
)
decoded
#> 1 2 3 4
#> "Speciessetosa" "Speciessetosa" "Speciessetosa" "Speciessetosa"
#> 5 6 7 8
#> "Speciessetosa" "Speciessetosa" "Speciessetosa" "Speciessetosa"
#> 9 10 11 12
#> "Speciessetosa" "Speciessetosa" "Speciessetosa" "Speciessetosa"
#> 13 14 15 16
#> "Speciessetosa" "Speciessetosa" "Speciessetosa" "Speciessetosa"
#> 17 18 19 20
#> "Speciessetosa" "Speciessetosa" "Speciessetosa" "Speciessetosa"
#> 21 22 23 24
#> "Speciessetosa" "Speciessetosa" "Speciessetosa" "Speciessetosa"
#> 25 26 27 28
#> "Speciessetosa" "Speciessetosa" "Speciessetosa" "Speciessetosa"
#> 29 30 31 32
#> "Speciessetosa" "Speciessetosa" "Speciessetosa" "Speciessetosa"
#> 33 34 35 36
#> "Speciessetosa" "Speciessetosa" "Speciessetosa" "Speciessetosa"
#> 37 38 39 40
#> "Speciessetosa" "Speciessetosa" "Speciessetosa" "Speciessetosa"
#> 41 42 43 44
#> "Speciessetosa" "Speciessetosa" "Speciessetosa" "Speciessetosa"
#> 45 46 47 48
#> "Speciessetosa" "Speciessetosa" "Speciessetosa" "Speciessetosa"
#> 49 50 51 52
#> "Speciessetosa" "Speciessetosa" "Speciesversicolor" "Speciesversicolor"
#> 53 54 55 56
#> "Speciesversicolor" "Speciesversicolor" "Speciesversicolor" "Speciesversicolor"
#> 57 58 59 60
#> "Speciesversicolor" "Speciesversicolor" "Speciesversicolor" "Speciesversicolor"
#> 61 62 63 64
#> "Speciesversicolor" "Speciesversicolor" "Speciesversicolor" "Speciesversicolor"
#> 65 66 67 68
#> "Speciesversicolor" "Speciesversicolor" "Speciesversicolor" "Speciesversicolor"
#> 69 70 71 72
#> "Speciesversicolor" "Speciesversicolor" "Speciesversicolor" "Speciesversicolor"
#> 73 74 75 76
#> "Speciesversicolor" "Speciesversicolor" "Speciesversicolor" "Speciesversicolor"
#> 77 78 79 80
#> "Speciesversicolor" "Speciesversicolor" "Speciesversicolor" "Speciesversicolor"
#> 81 82 83 84
#> "Speciesversicolor" "Speciesversicolor" "Speciesversicolor" "Speciesversicolor"
#> 85 86 87 88
#> "Speciesversicolor" "Speciesversicolor" "Speciesversicolor" "Speciesversicolor"
#> 89 90 91 92
#> "Speciesversicolor" "Speciesversicolor" "Speciesversicolor" "Speciesversicolor"
#> 93 94 95 96
#> "Speciesversicolor" "Speciesversicolor" "Speciesversicolor" "Speciesversicolor"
#> 97 98 99 100
#> "Speciesversicolor" "Speciesversicolor" "Speciesversicolor" "Speciesversicolor"
#> 101 102 103 104
#> "Speciesvirginica" "Speciesvirginica" "Speciesvirginica" "Speciesvirginica"
#> 105 106 107 108
#> "Speciesvirginica" "Speciesvirginica" "Speciesvirginica" "Speciesvirginica"
#> 109 110 111 112
#> "Speciesvirginica" "Speciesvirginica" "Speciesvirginica" "Speciesvirginica"
#> 113 114 115 116
#> "Speciesvirginica" "Speciesvirginica" "Speciesvirginica" "Speciesvirginica"
#> 117 118 119 120
#> "Speciesvirginica" "Speciesvirginica" "Speciesvirginica" "Speciesvirginica"
#> 121 122 123 124
#> "Speciesvirginica" "Speciesvirginica" "Speciesvirginica" "Speciesvirginica"
#> 125 126 127 128
#> "Speciesvirginica" "Speciesvirginica" "Speciesvirginica" "Speciesvirginica"
#> 129 130 131 132
#> "Speciesvirginica" "Speciesvirginica" "Speciesvirginica" "Speciesvirginica"
#> 133 134 135 136
#> "Speciesvirginica" "Speciesvirginica" "Speciesvirginica" "Speciesvirginica"
#> 137 138 139 140
#> "Speciesvirginica" "Speciesvirginica" "Speciesvirginica" "Speciesvirginica"
#> 141 142 143 144
#> "Speciesvirginica" "Speciesvirginica" "Speciesvirginica" "Speciesvirginica"
#> 145 146 147 148
#> "Speciesvirginica" "Speciesvirginica" "Speciesvirginica" "Speciesvirginica"
#> 149 150
#> "Speciesvirginica" "Speciesvirginica"
# Record the R version and attached package versions used for the output above.
utils::sessionInfo()
#> R version 4.0.2 (2020-06-22)
#> Platform: x86_64-apple-darwin17.0 (64-bit)
#> Running under: macOS Mojave 10.14.5
#>
#> Matrix products: default
#> BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
#>
#> locale:
#> [1] zh_CN.UTF-8/zh_CN.UTF-8/zh_CN.UTF-8/C/zh_CN.UTF-8/zh_CN.UTF-8
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] useful_1.2.6 dummies_1.5.6 ade4_1.7-15 nnet_7.3-14
#> [5] qdapTools_1.3.5 Matrix_1.2-18 caret_6.0-86 ggplot2_3.3.2
#> [9] lattice_0.20-41 onehot_0.1.1 mltools_0.3.5 magrittr_1.5
#> [13] data.table_1.13.0
#>
#> loaded via a namespace (and not attached):
#> [1] tidyselect_1.1.0 xfun_0.17 reshape2_1.4.4
#> [4] purrr_0.3.4 splines_4.0.2 colorspace_1.4-1
#> [7] vctrs_0.3.2 generics_0.0.2 stats4_4.0.2
#> [10] htmltools_0.5.0 yaml_2.2.1 chron_2.3-56
#> [13] survival_3.1-12 prodlim_2019.11.13 rlang_0.4.7
#> [16] ModelMetrics_1.2.2.2 pillar_1.4.6 glue_1.4.1
#> [19] withr_2.2.0 foreach_1.5.0 lifecycle_0.2.0
#> [22] plyr_1.8.6 lava_1.6.7 stringr_1.4.0
#> [25] timeDate_3043.102 munsell_0.5.0 blogdown_0.20
#> [28] gtable_0.3.0 recipes_0.1.13 codetools_0.2-16
#> [31] evaluate_0.14 knitr_1.29 class_7.3-17
#> [34] Rcpp_1.0.5 scales_1.1.1 ipred_0.9-9
#> [37] digest_0.6.25 stringi_1.4.6 bookdown_0.20
#> [40] dplyr_1.0.1 grid_4.0.2 bitops_1.0-6
#> [43] tools_4.0.2 RCurl_1.98-1.2 tibble_3.0.3
#> [46] crayon_1.3.4 pkgconfig_2.0.3 ellipsis_0.3.1
#> [49] MASS_7.3-51.6 pROC_1.16.2 lubridate_1.7.9
#> [52] gower_0.2.2 rmarkdown_2.3 iterators_1.0.12
#> [55] R6_2.4.1 rpart_4.1-15 nlme_3.1-148
#> [58] compiler_4.0.2