options(width = 300)
knitr::opts_chunk$set(message = FALSE, warning = FALSE, comment = "#>", collapse = TRUE)
Reading the data
library(data.table)
train=fread("data/train.csv",na.strings = c("",NA))
test=fread("data/test.csv",na.strings = c("",NA))
# Combine train and test for joint preprocessing (an rbind of two data.tables)
combine =rbindlist(list(train,test),fill=TRUE)
#### Personally I don't recommend this: processing the test set together with the training set leaks information, so the two should really be handled separately
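If they are processed separately, any imputation statistic should be learned on the training set only and then applied to the test set. A minimal sketch of that pattern, with median imputation standing in for whatever method is actually chosen:
# learn the statistic on train only, then apply it to both sets
age_med = median(train$Age, na.rm = TRUE)
train[is.na(Age), Age := age_med]
test[is.na(Age), Age := age_med]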
Data processing
# count the missing values in each column
combine[,lapply(.SD, function(x)sum(is.na(x)))]
#> PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
#> 1: 0 418 0 0 0 263 0 0 0 1 1014 2
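The same idiom gives the missing proportion rather than the raw count:
# missing rate per column
combine[, lapply(.SD, function(x) mean(is.na(x)))]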
# the columns with missing values need treatment, and some feature engineering follows
library(zoo)   # na.spline()
library(purrr) # provides the %>% pipe used later
combine[, Age := na.spline(Age)]   # impute missing Age by spline interpolation
combine[, Fare := na.spline(Fare)] # impute the single missing Fare the same way
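Note that na.spline() interpolates along the row order, which is arbitrary here, and can extrapolate outside the observed range. A common alternative is a group median; a sketch, left commented out, using a hypothetical Age_med column:
# group-median imputation by class (hypothetical alternative, not used below)
# combine[, Age_med := ifelse(is.na(Age), median(Age, na.rm = TRUE), Age), by = Pclass]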
Cabin is missing for most passengers (1014 of 1309), so we simply drop the variable.
combine[,Cabin:=NULL]
Embarked is a character column with two missing values; fill them with the mode, or with a value typical of comparable passengers.
combine[is.na(Embarked), ] ## inspect the two rows; impute from passengers of the same class
#> PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Embarked
#> 1: 62 1 1 Icard, Miss. Amelie female 38 0 0 113572 80 <NA>
#> 2: 830 1 1 Stone, Mrs. George Nelson (Martha Evelyn) female 62 0 0 113572 80 <NA>
combine[Pclass == 1,.N,by=.(Embarked)]
#> Embarked N
#> 1: C 141
#> 2: S 177
#> 3: <NA> 2
#> 4: Q 3
# S is by far the most common port among first-class passengers, so impute "S"
combine[is.na(Embarked), Embarked := "S"]
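The mode can also be computed programmatically instead of read off the table; a sketch with a hypothetical helper:
# hypothetical helper: most frequent non-NA value
mode_of <- function(x) names(which.max(table(x)))
# equivalent imputation: combine[is.na(Embarked), Embarked := mode_of(combine[Pclass == 1, Embarked])]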
# re-check the missing counts per column
combine[,lapply(.SD, function(x)sum(is.na(x)))]
#> PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Embarked
#> 1: 0 418 0 0 0 0 0 0 0 0 0
Feature engineering
Extract the title from each name, then merge titles that share a meaning.
# extract the title from the name
library(stringr)
combine[,Name :=gsub("(.*, )|(\\..*)","",Name)]
str(combine)
#> Classes 'data.table' and 'data.frame': 1309 obs. of 11 variables:
#> $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
#> $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
#> $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
#> $ Name : chr "Mr" "Mrs" "Miss" "Mrs" ...
#> $ Sex : chr "male" "female" "female" "female" ...
#> $ Age : num 22 38 26 35 35 ...
#> $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
#> $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
#> $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
#> $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
#> $ Embarked : chr "S" "C" "S" "S" ...
#> - attr(*, ".internal.selfref")=<externalptr>
#> - attr(*, "index")= int
#> ..- attr(*, "__Pclass")= int 2 4 7 12 24 28 31 32 35 36 ...
# address(combine)
combine[,Name :=str_trim(Name)]
combine$Name %>% table()
#> .
#> Capt Col Don Dona Dr Jonkheer Lady Major Master Miss Mlle Mme Mr Mrs Ms Rev Sir the Countess
#> 1 4 1 1 8 1 1 2 61 260 2 1 757 197 2 8 1 1
## merge equivalent titles
# the French Mlle and Mme are collapsed into a single category here
combine[Name %in% c("Mlle","Mme"),Name:="Mlle"]
combine[Name %in% c("Don","Major","Sir"),Name:="Sir"]
combine[Name %in% c("Jonkheer","Dona","the Countess","Lady"),Name:="Lady"]
combine[,table(Name)]
#> Name
#> Capt Col Dr Lady Master Miss Mlle Mr Mrs Ms Rev Sir
#> 1 4 8 4 61 260 3 757 197 2 8 4
## convert the title to a factor
combine[,Name:= as.factor(Name)]
Constructing the FamilySize variable
# family size including the passenger; oddly, assigning with := errored here at first, though later attempts worked, so the $ form is kept
combine$FamilySize = combine$SibSp + combine$Parch + 1
# bin into three levels explicitly, rather than relying on character comparison after the first assignment coerces the column
combine$FamilySize = ifelse(combine$FamilySize > 6, "Large",
                     ifelse(combine$FamilySize <= 2, "Small", "Middle"))
table(combine$FamilySize)
#>
#> Large Middle Small
#> 35 249 1025
combine[,FamilySize := as.factor(FamilySize)]
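A quick sanity check, not in the original, that the bins actually separate survival rates (training rows only, where Survived is known):
combine[!is.na(Survived), .(surv_rate = mean(Survived), n = .N), by = FamilySize]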
The Age variable can be binned into categories.
## give children aged 3 and under their own bin, since small children very often survived along with their mothers
combine$Age[combine$Age <= 1] = 1 # floor Age at 1; spline imputation can produce values below 1
combine[Age<=3,Age_class := "small"]
combine[3<Age & Age<=14,Age_class := "juvenile"]
combine[14<Age & Age<=60,Age_class := "adult"]
combine[60<Age, Age_class := "old"]
table(combine$Age_class)
#>
#> adult juvenile old small
#> 1108 102 43 56
combine[,Age_class:=as.factor(Age_class)]
Convert the remaining character/ID variables to factors
# combine[,PassengerId :=as.factor(PassengerId)]
# combine[,Survived := as.factor(Survived)]
# combine[,Pclass :=as.factor(Pclass)]
# combine[,Sex := as.factor(Sex)]
fac_cols = c("PassengerId","Survived","Pclass","Sex","Embarked")
combine[,(fac_cols):=lapply(.SD, as.factor), .SDcols = fac_cols]
str(combine)
#> Classes 'data.table' and 'data.frame': 1309 obs. of 13 variables:
#> $ PassengerId: Factor w/ 1309 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
#> $ Survived : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 1 2 2 ...
#> $ Pclass : Factor w/ 3 levels "1","2","3": 3 1 3 1 3 3 1 3 3 2 ...
#> $ Name : Factor w/ 12 levels "Capt","Col","Dr",..: 8 9 6 9 8 8 8 5 9 9 ...
#> $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
#> $ Age : num 22 38 26 35 35 ...
#> $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
#> $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
#> $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
#> $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
#> $ Embarked : Factor w/ 3 levels "C","Q","S": 3 1 3 3 3 2 3 3 3 1 ...
#> $ FamilySize : Factor w/ 3 levels "Large","Middle",..: 3 3 3 3 3 3 3 2 2 3 ...
#> $ Age_class : Factor w/ 4 levels "adult","juvenile",..: 1 1 1 1 1 1 1 4 1 2 ...
#> - attr(*, ".internal.selfref")=<externalptr>
#> - attr(*, "index")= int
Splitting the data back apart
train <- combine[1:891,] %>% as.data.frame()    # rows from the original training set
test <- combine[892:1309,] %>% as.data.frame()  # rows from the original test set
Modeling
Decision tree
## decision tree model
library(rpart)
taitan_tree = rpart(Survived ~ Pclass + Name + Sex + Age_class + SibSp + Parch + Fare + Embarked + FamilySize, train, method = "class")
prediction <- predict(taitan_tree, test, type = "class")
submit=data.frame(PassengerId=test$PassengerId,Survived=prediction)
## write the submission file
# write.csv(submit,"data/submit.csv",row.names = FALSE)
# model evaluation: confusion matrix on the training set
table(train$Survived,predict(taitan_tree,train,type = "class"))
#>
#> 0 1
#> 0 509 40
#> 1 96 246
library(rpart.plot)
rpart.plot(taitan_tree)
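To summarize these confusion matrices as a single number, a small hypothetical helper computes accuracy from the table:
# hypothetical helper: accuracy = trace / total of a confusion matrix
acc <- function(tab) sum(diag(tab)) / sum(tab)
acc(table(train$Survived, predict(taitan_tree, train, type = "class"))) # about 0.85 on the training set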
Random forest
### random forest model
library(randomForest)
model_rf = randomForest(Survived ~ Pclass + Name + Sex + Age_class + SibSp + Parch + Fare + Embarked + FamilySize, train) # classification is inferred from the factor response; randomForest has no method argument
pred <- predict(model_rf, test, type = "class")
submit_rf = data.frame(PassengerId = test$PassengerId, Survived = pred)
## write the submission file
# write.csv(submit_rf,"data/submit_rf.csv",row.names = FALSE)
# model evaluation: confusion matrix on the training set
table(train$Survived,predict(model_rf,train,type = "class"))
#>
#> 0 1
#> 0 531 18
#> 1 67 275
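randomForest also reports variable importance, which shows which of the engineered features carry weight:
importance(model_rf)  # mean decrease in Gini impurity per predictor
varImpPlot(model_rf)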
The caret package
Random forest with caret
library(caret)
library(foreach)
library(doParallel)
no_cores <- detectCores() - 1 # leave one core free
cl <- makeCluster(no_cores)
registerDoParallel(cl) # register the backend so caret trains in parallel
fitControl <- trainControl(method = "repeatedcv",
number = 10,
repeats = 1)
grid_rf = expand.grid(.mtry = 11:13) # the leading dot is older caret tuneGrid style, still accepted
set.seed(825)
rf_cv_Fit1 <- train(Survived ~ Pclass + Name + Sex + Age_class + SibSp + Parch + Fare + Embarked + FamilySize,
data = train,
metric = "Kappa",
method = "rf",
trControl = fitControl, tuneGrid = grid_rf,
verbose = FALSE)
Random forest: parameters and evaluation
rf_cv_Fit1
#> Random Forest
#>
#> 891 samples
#> 9 predictor
#> 2 classes: '0', '1'
#>
#> No pre-processing
#> Resampling: Cross-Validated (10 fold, repeated 1 times)
#> Summary of sample sizes: 802, 802, 802, 802, 802, 802, ...
#> Resampling results across tuning parameters:
#>
#> mtry Accuracy Kappa
#> 11 0.8361423 0.6479535
#> 12 0.8249064 0.6235846
#> 13 0.8260175 0.6267026
#>
#> Kappa was used to select the optimal model using the largest value.
#> The final value used for the model was mtry = 11.
pred_rf_cv = predict(rf_cv_Fit1,test,type = "raw")
submit_rf_cv = data.frame(PassengerId = test$PassengerId, Survived = pred_rf_cv)
## write the submission file
# write.csv(submit_rf_cv,"data/submit_rf_cv.csv",row.names = FALSE)
# model evaluation: confusion matrix on the training set
table(train$Survived,predict(rf_cv_Fit1,train,type = "raw"))
#>
#> 0 1
#> 0 532 17
#> 1 54 288
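caret's confusionMatrix() produces the same table plus accuracy, Kappa, sensitivity and specificity in a single call:
confusionMatrix(predict(rf_cv_Fit1, train), train$Survived)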
SVM modeling
library(foreach)
library(doParallel)
no_cores <- detectCores() -1
cl<-makeCluster(no_cores)
fitControl <- trainControl(method = "repeatedcv",
number = 10,
repeats = 1)
grid_svm = expand.grid(sigma = seq(1, 100, 10), C = seq(1, 100, 10),
Weight = c(1, 100 / table(train$Survived))) # class weights: 1, plus two inverse-frequency weights
set.seed(825)
svm_cv_Fit1 <- train(Survived ~ Pclass + Name + Sex + Age_class + SibSp + Parch + Fare + Embarked + FamilySize,
data = train,
metric = "Kappa",
method = "svmRadialWeights",
trControl = fitControl, tuneGrid = grid_svm,
verbose = FALSE)
svm_cv_Fit1
#> Support Vector Machines with Class Weights
#>
#> 891 samples
#> 9 predictor
#> 2 classes: '0', '1'
#>
#> No pre-processing
#> Resampling: Cross-Validated (10 fold, repeated 1 times)
#> Summary of sample sizes: 802, 802, 802, 802, 802, 802, ...
#> Resampling results across tuning parameters:
#>
#> sigma C Weight Accuracy Kappa
#> 1 1 0.1821494 0.6476529 0.3450120
#> 1 1 0.2923977 0.6879650 0.4080850
#> 1 1 1.0000000 0.7811111 0.5336090
#> 1 11 0.1821494 0.7384270 0.4825548
#> 1 11 0.2923977 0.7676404 0.5318205
#> 1 11 1.0000000 0.7889513 0.5539126
#> 1 21 0.1821494 0.7373034 0.4800775
#> 1 21 0.2923977 0.7721348 0.5396339
#> 1 21 1.0000000 0.7878277 0.5513716
#> 1 31 0.1821494 0.7384270 0.4819020
#> 1 31 0.2923977 0.7721348 0.5396500
#> 1 31 1.0000000 0.7878277 0.5520975
#> 1 41 0.1821494 0.7384395 0.4816815
#> 1 41 0.2923977 0.7721348 0.5396500
#> 1 41 1.0000000 0.7878277 0.5528006
#> 1 51 0.1821494 0.7384519 0.4816418
#> 1 51 0.2923977 0.7687765 0.5332258
#> 1 51 1.0000000 0.7867166 0.5502534
#> 1 61 0.1821494 0.7384519 0.4816418
#> 1 61 0.2923977 0.7699001 0.5353125
#> 1 61 1.0000000 0.7889638 0.5549527
#> 1 71 0.1821494 0.7395755 0.4835483
#> 1 71 0.2923977 0.7698876 0.5354221
#> 1 71 1.0000000 0.7878527 0.5519801
#> 1 81 0.1821494 0.7373408 0.4792572
#> 1 81 0.2923977 0.7710112 0.5374317
#> 1 81 1.0000000 0.7878527 0.5519801
#> 1 91 0.1821494 0.7373408 0.4792572
#> 1 91 0.2923977 0.7698876 0.5349684
#> 1 91 1.0000000 0.7889763 0.5539713
#> 11 1 0.1821494 0.5926966 0.2634169
#> 11 1 0.2923977 0.6566167 0.3569706
#> 11 1 1.0000000 0.7464170 0.4589756
#> 11 11 0.1821494 0.7036954 0.4264619
#> 11 11 0.2923977 0.7407366 0.4872806
#> 11 11 1.0000000 0.7486642 0.4607651
#> 11 21 0.1821494 0.6958302 0.4115283
#> 11 21 0.2923977 0.7396130 0.4853630
#> 11 21 1.0000000 0.7486767 0.4596743
#> 11 31 0.1821494 0.6992010 0.4169313
#> 11 31 0.2923977 0.7373658 0.4810659
#> 11 31 1.0000000 0.7430587 0.4490506
#> 11 41 0.1821494 0.7003246 0.4189393
#> 11 41 0.2923977 0.7362422 0.4791788
#> 11 41 1.0000000 0.7430462 0.4501862
#> 11 51 0.1821494 0.7014357 0.4208375
#> 11 51 0.2923977 0.7362422 0.4791788
#> 11 51 1.0000000 0.7441698 0.4527104
#> 11 61 0.1821494 0.7014357 0.4208375
#> 11 61 0.2923977 0.7373658 0.4810659
#> 11 61 1.0000000 0.7452934 0.4553096
#> 11 71 0.1821494 0.7014357 0.4208375
#> 11 71 0.2923977 0.7385019 0.4830801
#> 11 71 1.0000000 0.7441698 0.4526016
#> 11 81 0.1821494 0.7014357 0.4208375
#> 11 81 0.2923977 0.7373783 0.4805621
#> 11 81 1.0000000 0.7441698 0.4526016
#> 11 91 0.1821494 0.7014357 0.4208375
#> 11 91 0.2923977 0.7396255 0.4843557
#> 11 91 1.0000000 0.7430462 0.4506031
#> 21 1 0.1821494 0.5915730 0.2627613
#> 21 1 0.2923977 0.6442697 0.3377452
#> 21 1 1.0000000 0.7452934 0.4553119
#> 21 11 0.1821494 0.6980774 0.4158949
#> 21 11 0.2923977 0.7351311 0.4778643
#> 21 11 1.0000000 0.7452934 0.4542599
#> 21 21 0.1821494 0.7014482 0.4214853
#> 21 21 0.2923977 0.7340075 0.4753755
#> 21 21 1.0000000 0.7441698 0.4526315
#> 21 31 0.1821494 0.7014482 0.4214853
#> 21 31 0.2923977 0.7328839 0.4728575
#> 21 31 1.0000000 0.7430462 0.4506291
#> 21 41 0.1821494 0.7014482 0.4214853
#> 21 41 0.2923977 0.7351436 0.4766543
#> 21 41 1.0000000 0.7441698 0.4538285
#> 21 51 0.1821494 0.7025718 0.4232449
#> 21 51 0.2923977 0.7362672 0.4785697
#> 21 51 1.0000000 0.7430462 0.4515215
#> 21 61 0.1821494 0.7025718 0.4226345
#> 21 61 0.2923977 0.7373908 0.4805335
#> 21 61 1.0000000 0.7430462 0.4515215
#> 21 71 0.1821494 0.7036954 0.4244115
#> 21 71 0.2923977 0.7385144 0.4831184
#> 21 71 1.0000000 0.7430462 0.4515215
#> 21 81 0.1821494 0.7059426 0.4280555
#> 21 81 0.2923977 0.7385144 0.4831184
#> 21 81 1.0000000 0.7430462 0.4515215
#> 21 91 0.1821494 0.7059426 0.4280555
#> 21 91 0.2923977 0.7385144 0.4831184
#> 21 91 1.0000000 0.7430462 0.4515215
#> 31 1 0.1821494 0.5893258 0.2595359
#> 31 1 0.2923977 0.6476654 0.3435429
#> 31 1 1.0000000 0.7441823 0.4522360
#> 31 11 0.1821494 0.6992010 0.4175620
#> 31 11 0.2923977 0.7340075 0.4758536
#> 31 11 1.0000000 0.7419351 0.4474695
#> 31 21 0.1821494 0.7003246 0.4195318
#> 31 21 0.2923977 0.7317603 0.4714543
#> 31 21 1.0000000 0.7408115 0.4464669
#> 31 31 0.1821494 0.7003246 0.4195318
#> 31 31 0.2923977 0.7340200 0.4752511
#> 31 31 1.0000000 0.7408115 0.4464669
#> 31 41 0.1821494 0.7014482 0.4206810
#> 31 41 0.2923977 0.7362672 0.4791020
#> 31 41 1.0000000 0.7396879 0.4444013
#> 31 51 0.1821494 0.7025718 0.4225303
#> 31 51 0.2923977 0.7362672 0.4791020
#> 31 51 1.0000000 0.7385643 0.4424209
#> 31 61 0.1821494 0.7025718 0.4225544
#> 31 61 0.2923977 0.7373908 0.4815624
#> 31 61 1.0000000 0.7385643 0.4424209
#> 31 71 0.1821494 0.7025718 0.4225544
#> 31 71 0.2923977 0.7373908 0.4815624
#> 31 71 1.0000000 0.7374407 0.4403604
#> 31 81 0.1821494 0.7025718 0.4225544
#> 31 81 0.2923977 0.7385144 0.4834688
#> 31 81 1.0000000 0.7363171 0.4391270
#> 31 91 0.1821494 0.7036954 0.4249922
#> 31 91 0.2923977 0.7396380 0.4853753
#> 31 91 1.0000000 0.7363171 0.4391270
#> 41 1 0.1821494 0.5893258 0.2601027
#> 41 1 0.2923977 0.6398002 0.3310217
#> 41 1 1.0000000 0.7441823 0.4522816
#> 41 11 0.1821494 0.6992010 0.4185330
#> 41 11 0.2923977 0.7306367 0.4695862
#> 41 11 1.0000000 0.7385643 0.4412077
#> 41 21 0.1821494 0.6980774 0.4159480
#> 41 21 0.2923977 0.7328964 0.4733640
#> 41 21 1.0000000 0.7385643 0.4412077
#> 41 31 0.1821494 0.6958552 0.4116192
#> 41 31 0.2923977 0.7362672 0.4791020
#> 41 31 1.0000000 0.7374407 0.4392273
#> 41 41 0.1821494 0.6969913 0.4133824
#> 41 41 0.2923977 0.7362672 0.4791020
#> 41 41 1.0000000 0.7374407 0.4392273
#> 41 51 0.1821494 0.6992385 0.4176660
#> 41 51 0.2923977 0.7373908 0.4810085
#> 41 51 1.0000000 0.7363171 0.4379938
#> 41 61 0.1821494 0.6969913 0.4135130
#> 41 61 0.2923977 0.7373908 0.4804740
#> 41 61 1.0000000 0.7363171 0.4391270
#> 41 71 0.1821494 0.6981149 0.4160191
#> 41 71 0.2923977 0.7373908 0.4811373
#> 41 71 1.0000000 0.7363171 0.4391270
#> 41 81 0.1821494 0.6992385 0.4185078
#> 41 81 0.2923977 0.7373908 0.4811373
#> 41 81 1.0000000 0.7363171 0.4391270
#> 41 91 0.1821494 0.6992385 0.4185078
#> 41 91 0.2923977 0.7373908 0.4805263
#> 41 91 1.0000000 0.7351935 0.4370889
#> 51 1 0.1821494 0.5893258 0.2601027
#> 51 1 0.2923977 0.6386767 0.3292930
#> 51 1 1.0000000 0.7430712 0.4485869
#> 51 11 0.1821494 0.6936205 0.4087871
#> 51 11 0.2923977 0.7272784 0.4641883
#> 51 11 1.0000000 0.7385643 0.4412077
#> 51 21 0.1821494 0.6913858 0.4044756
#> 51 21 0.2923977 0.7328964 0.4737335
#> 51 21 1.0000000 0.7374407 0.4392273
#> 51 31 0.1821494 0.6925094 0.4064855
#> 51 31 0.2923977 0.7351436 0.4779378
#> 51 31 1.0000000 0.7363171 0.4379938
#> 51 41 0.1821494 0.6936330 0.4082625
#> 51 41 0.2923977 0.7351436 0.4774471
#> 51 41 1.0000000 0.7363171 0.4379938
#> 51 51 0.1821494 0.6970037 0.4149275
#> 51 51 0.2923977 0.7362672 0.4793616
#> 51 51 1.0000000 0.7351935 0.4359333
#> 51 61 0.1821494 0.6981273 0.4174162
#> 51 61 0.2923977 0.7362672 0.4793616
#> 51 61 1.0000000 0.7340699 0.4338953
#> 51 71 0.1821494 0.6981273 0.4167874
#> 51 71 0.2923977 0.7385144 0.4832641
#> 51 71 1.0000000 0.7340699 0.4338953
#> 51 81 0.1821494 0.6981273 0.4167874
#> 51 81 0.2923977 0.7373908 0.4814395
#> 51 81 1.0000000 0.7351935 0.4358535
#> 51 91 0.1821494 0.7003745 0.4205404
#> 51 91 0.2923977 0.7373908 0.4814395
#> 51 91 1.0000000 0.7329463 0.4318763
#> 61 1 0.1821494 0.5826217 0.2502068
#> 61 1 0.2923977 0.6285643 0.3133638
#> 61 1 1.0000000 0.7408240 0.4441011
#> 61 11 0.1821494 0.6925094 0.4069598
#> 61 11 0.2923977 0.7284020 0.4660563
#> 61 11 1.0000000 0.7363296 0.4355851
#> 61 21 0.1821494 0.6913858 0.4046547
#> 61 21 0.2923977 0.7328964 0.4737335
#> 61 21 1.0000000 0.7363296 0.4355851
#> 61 31 0.1821494 0.6936330 0.4082625
#> 61 31 0.2923977 0.7317728 0.4713558
#> 61 31 1.0000000 0.7340824 0.4316345
#> 61 41 0.1821494 0.6947566 0.4107686
#> 61 41 0.2923977 0.7340200 0.4752675
#> 61 41 1.0000000 0.7340824 0.4315119
#> 61 51 0.1821494 0.6958801 0.4132805
#> 61 51 0.2923977 0.7351436 0.4771909
#> 61 51 1.0000000 0.7340824 0.4314321
#> 61 61 0.1821494 0.6981273 0.4169757
#> 61 61 0.2923977 0.7318227 0.4572936
#> 61 61 1.0000000 0.7329588 0.4294738
#> 61 71 0.1821494 0.6992509 0.4188250
#> 61 71 0.2923977 0.7307116 0.4542080
#> 61 71 1.0000000 0.7329463 0.4305823
#> 61 81 0.1821494 0.7003745 0.4206999
#> 61 81 0.2923977 0.7329588 0.4580270
#> 61 81 1.0000000 0.7329463 0.4305823
#> 61 91 0.1821494 0.7026217 0.4244271
#> 61 91 0.2923977 0.7329588 0.4580270
#> 61 91 1.0000000 0.7329463 0.4305823
#> 71 1 0.1821494 0.5814981 0.2485895
#> 71 1 0.2923977 0.6251935 0.3078469
#> 71 1 1.0000000 0.7419476 0.4467029
#> 71 11 0.1821494 0.6880150 0.3997900
#> 71 11 0.2923977 0.7272784 0.4639733
#> 71 11 1.0000000 0.7340824 0.4312532
#> 71 21 0.1821494 0.6902622 0.4026816
#> 71 21 0.2923977 0.7272784 0.4635862
#> 71 21 1.0000000 0.7352060 0.4343127
#> 71 31 0.1821494 0.6936330 0.4093879
#> 71 31 0.2923977 0.7328964 0.4733328
#> 71 31 1.0000000 0.7340824 0.4322522
#> 71 41 0.1821494 0.6958801 0.4132805
#> 71 41 0.2923977 0.7295880 0.4522289
#> 71 41 1.0000000 0.7329588 0.4302141
#> 71 51 0.1821494 0.6981273 0.4169757
#> 71 51 0.2923977 0.7318227 0.4563957
#> 71 51 1.0000000 0.7329588 0.4302141
#> 71 61 0.1821494 0.6992509 0.4188250
#> 71 61 0.2923977 0.7329463 0.4582203
#> 71 61 1.0000000 0.7307116 0.4255170
#> 71 71 0.1821494 0.7026217 0.4244271
#> 71 71 0.2923977 0.7340699 0.4611126
#> 71 71 1.0000000 0.7307116 0.4255170
#> 71 81 0.1821494 0.7037453 0.4263832
#> 71 81 0.2923977 0.7351935 0.4636264
#> 71 81 1.0000000 0.7307116 0.4255170
#> 71 91 0.1821494 0.7037453 0.4263832
#> 71 91 0.2923977 0.7340824 0.4614589
#> 71 91 1.0000000 0.7307116 0.4260402
#> 81 1 0.1821494 0.5792634 0.2455210
#> 81 1 0.2923977 0.6251935 0.3078469
#> 81 1 1.0000000 0.7408240 0.4446288
#> 81 11 0.1821494 0.6868914 0.3977439
#> 81 11 0.2923977 0.7261673 0.4627250
#> 81 11 1.0000000 0.7329588 0.4291927
#> 81 21 0.1821494 0.6925094 0.4076129
#> 81 21 0.2923977 0.7295256 0.4680218
#> 81 21 1.0000000 0.7329588 0.4299807
#> 81 31 0.1821494 0.6936330 0.4101861
#> 81 31 0.2923977 0.7284519 0.4509761
#> 81 31 1.0000000 0.7307116 0.4258822
#> 81 41 0.1821494 0.6970037 0.4157226
#> 81 41 0.2923977 0.7295755 0.4538531
#> 81 41 1.0000000 0.7318352 0.4281951
#> 81 51 0.1821494 0.6992509 0.4193634
#> 81 51 0.2923977 0.7318227 0.4571451
#> 81 51 1.0000000 0.7318352 0.4281951
#> 81 61 0.1821494 0.7026217 0.4251154
#> 81 61 0.2923977 0.7318352 0.4569506
#> 81 61 1.0000000 0.7318352 0.4287183
#> 81 71 0.1821494 0.7026217 0.4251154
#> 81 71 0.2923977 0.7318477 0.4563842
#> 81 71 1.0000000 0.7318352 0.4287183
#> 81 81 0.1821494 0.7048689 0.4286436
#> 81 81 0.2923977 0.7318477 0.4563842
#> 81 81 1.0000000 0.7318352 0.4287183
#> 81 91 0.1821494 0.7048689 0.4281019
#> 81 91 0.2923977 0.7318602 0.4562315
#> 81 91 1.0000000 0.7318352 0.4286214
#> 91 1 0.1821494 0.5781523 0.2440235
#> 91 1 0.2923977 0.6240699 0.3062144
#> 91 1 1.0000000 0.7408240 0.4450119
#> 91 11 0.1821494 0.6846442 0.3944491
#> 91 11 0.2923977 0.7261673 0.4627498
#> 91 11 1.0000000 0.7318352 0.4279203
#> 91 21 0.1821494 0.6913858 0.4061352
#> 91 21 0.2923977 0.7250811 0.4451708
#> 91 21 1.0000000 0.7318352 0.4279203
#> 91 31 0.1821494 0.6947566 0.4117704
#> 91 31 0.2923977 0.7284519 0.4518740
#> 91 31 1.0000000 0.7284644 0.4218471
#> 91 41 0.1821494 0.6992509 0.4193634
#> 91 41 0.2923977 0.7295880 0.4535101
#> 91 41 1.0000000 0.7284644 0.4218471
#> 91 51 0.1821494 0.7014981 0.4231977
#> 91 51 0.2923977 0.7284769 0.4504299
#> 91 51 1.0000000 0.7284644 0.4223703
#> 91 61 0.1821494 0.7037453 0.4268340
#> 91 61 0.2923977 0.7307241 0.4549168
#> 91 61 1.0000000 0.7284644 0.4223703
#> 91 71 0.1821494 0.7037453 0.4268340
#> 91 71 0.2923977 0.7318477 0.4569850
#> 91 71 1.0000000 0.7284644 0.4223703
#> 91 81 0.1821494 0.7037453 0.4268340
#> 91 81 0.2923977 0.7329838 0.4593189
#> 91 81 1.0000000 0.7295880 0.4249469
#> 91 91 0.1821494 0.7048689 0.4288071
#> 91 91 0.2923977 0.7329838 0.4593189
#> 91 91 1.0000000 0.7307116 0.4269096
#>
#> Kappa was used to select the optimal model using the largest value.
#> The final values used for the model were sigma = 1, C = 61 and Weight = 1.
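Before narrowing the grid, caret's plot method for train objects shows how Kappa varies across the grid and suggests where to zoom in:
plot(svm_cv_Fit1, metric = "Kappa")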
SVM parameter search: narrowing the range
# the best parameters above were sigma = 1, C = 61 and Weight = 1
no_cores <- detectCores() - 1
cl <- makeCluster(no_cores)
# registerDoParallel(cl) # needed on Windows; not required on macOS
grid_svm2 = expand.grid(sigma = seq(1, 10, 1), C = seq(50, 70, 1), Weight = 1)
set.seed(825)
svm_cv_Fit2 <- train(Survived ~ Pclass + Name + Sex + Age_class + SibSp + Parch + Fare + Embarked + FamilySize,
data = train,
metric = "Kappa",
method = "svmRadialWeights",
trControl = fitControl, tuneGrid = grid_svm2,
verbose = FALSE)
svm_cv_Fit2
#> Support Vector Machines with Class Weights
#>
#> 891 samples
#> 9 predictor
#> 2 classes: '0', '1'
#>
#> No pre-processing
#> Resampling: Cross-Validated (10 fold, repeated 1 times)
#> Summary of sample sizes: 802, 802, 802, 802, 802, 802, ...
#> Resampling results across tuning parameters:
#>
#> sigma C Accuracy Kappa
#> 1 50 0.7867166 0.5502534
#> 1 51 0.7867166 0.5502534
#> 1 52 0.7878402 0.5523393
#> 1 53 0.7878402 0.5523393
#> 1 54 0.7878402 0.5523393
#> 1 55 0.7878402 0.5523393
#> 1 56 0.7878402 0.5523393
#> 1 57 0.7878402 0.5523393
#> 1 58 0.7889638 0.5549527
#> 1 59 0.7889638 0.5549527
#> 1 60 0.7889638 0.5549527
#> 1 61 0.7889638 0.5549527
#> 1 62 0.7889638 0.5549527
#> 1 63 0.7889638 0.5549527
#> 1 64 0.7878527 0.5523786
#> 1 65 0.7878527 0.5523786
#> 1 66 0.7878527 0.5523786
#> 1 67 0.7878527 0.5523786
#> 1 68 0.7878527 0.5519801
#> 1 69 0.7878527 0.5519801
#> 1 70 0.7878527 0.5519801
#> 2 50 0.7822222 0.5437950
#> 2 51 0.7822222 0.5437950
#> 2 52 0.7822222 0.5437950
#> 2 53 0.7833458 0.5458818
#> 2 54 0.7833458 0.5458818
#> 2 55 0.7833458 0.5458818
#> 2 56 0.7833458 0.5458818
#> 2 57 0.7822347 0.5433076
#> 2 58 0.7822347 0.5433076
#> 2 59 0.7789014 0.5348965
#> 2 60 0.7777903 0.5321784
#> 2 61 0.7766792 0.5294305
#> 2 62 0.7755680 0.5266522
#> 2 63 0.7744569 0.5238432
#> 2 64 0.7744569 0.5238432
#> 2 65 0.7744569 0.5238432
#> 2 66 0.7744569 0.5238432
#> 2 67 0.7744569 0.5238432
#> 2 68 0.7744569 0.5238432
#> 2 69 0.7744569 0.5238432
#> 2 70 0.7744569 0.5238432
#> 3 50 0.7677653 0.5079159
#> 3 51 0.7677653 0.5079159
#> 3 52 0.7677653 0.5079159
#> 3 53 0.7677653 0.5079159
#> 3 54 0.7677653 0.5079159
#> 3 55 0.7677653 0.5079159
#> 3 56 0.7677653 0.5079159
#> 3 57 0.7677653 0.5079159
#> 3 58 0.7677653 0.5079159
#> 3 59 0.7688889 0.5099863
#> 3 60 0.7688889 0.5099863
#> 3 61 0.7688889 0.5099863
#> 3 62 0.7688889 0.5099863
#> 3 63 0.7688889 0.5099863
#> 3 64 0.7688889 0.5099863
#> 3 65 0.7688889 0.5099863
#> 3 66 0.7688889 0.5099863
#> 3 67 0.7688889 0.5099863
#> 3 68 0.7677653 0.5073891
#> 3 69 0.7666417 0.5053186
#> 3 70 0.7666417 0.5053186
#> 4 50 0.7576404 0.4904061
#> 4 51 0.7576404 0.4904061
#> 4 52 0.7565169 0.4884006
#> 4 53 0.7565169 0.4884006
#> 4 54 0.7565169 0.4884006
#> 4 55 0.7565169 0.4884006
#> 4 56 0.7565169 0.4884006
#> 4 57 0.7565169 0.4884006
#> 4 58 0.7553933 0.4862959
#> 4 59 0.7553933 0.4862959
#> 4 60 0.7553933 0.4862955
#> 4 61 0.7542697 0.4843110
#> 4 62 0.7542697 0.4843110
#> 4 63 0.7542697 0.4843110
#> 4 64 0.7542697 0.4843110
#> 4 65 0.7542697 0.4843110
#> 4 66 0.7542697 0.4843110
#> 4 67 0.7542697 0.4843110
#> 4 68 0.7542697 0.4843110
#> 4 69 0.7542697 0.4843110
#> 4 70 0.7542697 0.4843110
#> 5 50 0.7509114 0.4678279
#> 5 51 0.7509114 0.4678279
#> 5 52 0.7509114 0.4678279
#> 5 53 0.7509114 0.4678279
#> 5 54 0.7509114 0.4678279
#> 5 55 0.7509114 0.4678279
#> 5 56 0.7509114 0.4678279
#> 5 57 0.7509114 0.4678279
#> 5 58 0.7509114 0.4678279
#> 5 59 0.7509114 0.4678279
#> 5 60 0.7509114 0.4678279
#> 5 61 0.7509114 0.4678279
#> 5 62 0.7509114 0.4678279
#> 5 63 0.7509114 0.4678279
#> 5 64 0.7509114 0.4678279
#> 5 65 0.7509114 0.4678279
#> 5 66 0.7509114 0.4678279
#> 5 67 0.7509114 0.4678279
#> 5 68 0.7497878 0.4657843
#> 5 69 0.7497878 0.4657843
#> 5 70 0.7497878 0.4657843
#> 6 50 0.7464170 0.4566695
#> 6 51 0.7464170 0.4566695
#> 6 52 0.7464170 0.4566695
#> 6 53 0.7464170 0.4566695
#> 6 54 0.7464170 0.4566695
#> 6 55 0.7452934 0.4539311
#> 6 56 0.7452934 0.4539311
#> 6 57 0.7441698 0.4519093
#> 6 58 0.7441698 0.4519093
#> 6 59 0.7441698 0.4519093
#> 6 60 0.7441698 0.4519093
#> 6 61 0.7441698 0.4519093
#> 6 62 0.7441698 0.4519093
#> 6 63 0.7441698 0.4519093
#> 6 64 0.7441698 0.4519093
#> 6 65 0.7441698 0.4519093
#> 6 66 0.7441698 0.4519093
#> 6 67 0.7441698 0.4519093
#> 6 68 0.7441698 0.4519093
#> 6 69 0.7441698 0.4519093
#> 6 70 0.7441698 0.4519093
#> 7 50 0.7430462 0.4497065
#> 7 51 0.7430462 0.4497065
#> 7 52 0.7430462 0.4497065
#> 7 53 0.7430462 0.4497065
#> 7 54 0.7430462 0.4497065
#> 7 55 0.7430462 0.4497065
#> 7 56 0.7430462 0.4497065
#> 7 57 0.7430462 0.4497065
#> 7 58 0.7430462 0.4497065
#> 7 59 0.7430462 0.4497065
#> 7 60 0.7430462 0.4497065
#> 7 61 0.7419226 0.4477303
#> 7 62 0.7408115 0.4446395
#> 7 63 0.7408115 0.4446395
#> 7 64 0.7408115 0.4446395
#> 7 65 0.7408115 0.4446395
#> 7 66 0.7408115 0.4446395
#> 7 67 0.7408115 0.4446395
#> 7 68 0.7408115 0.4446395
#> 7 69 0.7396879 0.4414400
#> 7 70 0.7408115 0.4434618
#> 8 50 0.7441823 0.4508545
#> 8 51 0.7441823 0.4508545
#> 8 52 0.7441823 0.4508545
#> 8 53 0.7441823 0.4508545
#> 8 54 0.7430587 0.4488783
#> 8 55 0.7419351 0.4469231
#> 8 56 0.7419351 0.4469231
#> 8 57 0.7419351 0.4469231
#> 8 58 0.7419351 0.4469231
#> 8 59 0.7419351 0.4469231
#> 8 60 0.7419351 0.4469231
#> 8 61 0.7419351 0.4469231
#> 8 62 0.7419351 0.4469231
#> 8 63 0.7419351 0.4469231
#> 8 64 0.7419351 0.4469231
#> 8 65 0.7408115 0.4443873
#> 8 66 0.7408115 0.4443873
#> 8 67 0.7408115 0.4443873
#> 8 68 0.7408115 0.4443873
#> 8 69 0.7408115 0.4443873
#> 8 70 0.7408115 0.4443873
#> 9 50 0.7408115 0.4443873
#> 9 51 0.7408115 0.4443873
#> 9 52 0.7408115 0.4443873
#> 9 53 0.7408115 0.4443873
#> 9 54 0.7408115 0.4443873
#> 9 55 0.7408115 0.4443873
#> 9 56 0.7408115 0.4443873
#> 9 57 0.7408115 0.4443873
#> 9 58 0.7408115 0.4443873
#> 9 59 0.7408115 0.4443873
#> 9 60 0.7408115 0.4443873
#> 9 61 0.7408115 0.4443873
#> 9 62 0.7408115 0.4443873
#> 9 63 0.7408115 0.4443873
#> 9 64 0.7408115 0.4443873
#> 9 65 0.7408115 0.4443873
#> 9 66 0.7419351 0.4469115
#> 9 67 0.7419351 0.4469115
#> 9 68 0.7419351 0.4469115
#> 9 69 0.7419351 0.4469115
#> 9 70 0.7419351 0.4469115
#> 10 50 0.7408115 0.4443873
#> 10 51 0.7408115 0.4443873
#> 10 52 0.7408115 0.4443873
#> 10 53 0.7408115 0.4443873
#> 10 54 0.7419351 0.4469115
#> 10 55 0.7419351 0.4469115
#> 10 56 0.7419351 0.4469115
#> 10 57 0.7419351 0.4469115
#> 10 58 0.7419351 0.4469115
#> 10 59 0.7419351 0.4469115
#> 10 60 0.7430462 0.4500024
#> 10 61 0.7430462 0.4500024
#> 10 62 0.7430462 0.4500024
#> 10 63 0.7430462 0.4500024
#> 10 64 0.7430462 0.4500024
#> 10 65 0.7430462 0.4500024
#> 10 66 0.7430462 0.4500024
#> 10 67 0.7430462 0.4500024
#> 10 68 0.7430462 0.4500024
#> 10 69 0.7430462 0.4500024
#> 10 70 0.7430462 0.4500024
#>
#> Tuning parameter 'Weight' was held constant at a value of 1
#> Kappa was used to select the optimal model using the largest value.
#> The final values used for the model were sigma = 1, C = 58 and Weight = 1.
# narrow the grid again; the best above was sigma = 1, C = 58 and Weight = 1
no_cores <- detectCores() - 1
cl <- makeCluster(no_cores)
# registerDoParallel(cl)
grid_svm3 = expand.grid(sigma = seq(0, 1, 0.1), C = 58, Weight = 1)
set.seed(825)
svm_cv_Fit3 <- train(Survived ~ Pclass + Name + Sex + Age_class + SibSp + Parch + Fare + Embarked + FamilySize,
data = train,
metric = "Kappa",
method = "svmRadialWeights",
trControl = fitControl, tuneGrid = grid_svm3,
verbose = FALSE)
svm_cv_Fit3
#> Support Vector Machines with Class Weights
#>
#> 891 samples
#> 9 predictor
#> 2 classes: '0', '1'
#>
#> No pre-processing
#> Resampling: Cross-Validated (10 fold, repeated 1 times)
#> Summary of sample sizes: 802, 802, 802, 802, 802, 802, ...
#> Resampling results across tuning parameters:
#>
#> sigma Accuracy Kappa
#> 0.0 0.6161673 0.0000000
#> 0.1 0.8013358 0.5669074
#> 0.2 0.7979650 0.5612242
#> 0.3 0.7911985 0.5513664
#> 0.4 0.7900624 0.5519072
#> 0.5 0.7878402 0.5495903
#> 0.6 0.7889638 0.5520950
#> 0.7 0.7889638 0.5524516
#> 0.8 0.7889638 0.5537507
#> 0.9 0.7878527 0.5518741
#> 1.0 0.7889638 0.5549527
#>
#> Tuning parameter 'C' was held constant at a value of 58
#> Tuning parameter 'Weight' was held constant at a value of 1
#> Kappa was used to select the optimal model using the largest value.
#> The final values used for the model were sigma = 0.1, C = 58 and Weight = 1.
# narrow once more; the best above was sigma = 0.1, C = 58 and Weight = 1
no_cores <- detectCores() - 1
cl <- makeCluster(no_cores)
# registerDoParallel(cl)
grid_svm4 = expand.grid(sigma = seq(0, 0.2, 0.01), C = 58, Weight = 1)
set.seed(825)
svm_cv_Fit4 <- train(Survived ~ Pclass + Name + Sex + Age_class + SibSp + Parch + Fare + Embarked + FamilySize,
data = train,
metric = "Kappa",
method = "svmRadialWeights",
trControl = fitControl, tuneGrid = grid_svm4,
verbose = FALSE)
# stopImplicitCluster()
svm_cv_Fit4
#> Support Vector Machines with Class Weights
#>
#> 891 samples
#> 9 predictor
#> 2 classes: '0', '1'
#>
#> No pre-processing
#> Resampling: Cross-Validated (10 fold, repeated 1 times)
#> Summary of sample sizes: 802, 802, 802, 802, 802, 802, ...
#> Resampling results across tuning parameters:
#>
#> sigma Accuracy Kappa
#> 0.00 0.6161673 0.0000000
#> 0.01 0.8149064 0.5916957
#> 0.02 0.8114981 0.5856433
#> 0.03 0.8081273 0.5793144
#> 0.04 0.8036330 0.5700091
#> 0.05 0.7991511 0.5611674
#> 0.06 0.8002622 0.5645199
#> 0.07 0.8047191 0.5741737
#> 0.08 0.8047066 0.5738563
#> 0.09 0.8024719 0.5689492
#> 0.10 0.8013358 0.5669074
#> 0.11 0.8013358 0.5669991
#> 0.12 0.8002122 0.5643613
#> 0.13 0.8002122 0.5643613
#> 0.14 0.8013233 0.5663641
#> 0.15 0.8024469 0.5690019
#> 0.16 0.8013358 0.5668501
#> 0.17 0.8002122 0.5645959
#> 0.18 0.7979650 0.5601587
#> 0.19 0.7957179 0.5563902
#> 0.20 0.7979650 0.5612242
#>
#> Tuning parameter 'C' was held constant at a value of 58
#> Tuning parameter 'Weight' was held constant at a value of 1
#> Kappa was used to select the optimal model using the largest value.
#> The final values used for the model were sigma = 0.01, C = 58 and Weight = 1.
SVM prediction
# the best parameters were sigma = 0.01, C = 58 and Weight = 1
pred_svm_cv = predict(svm_cv_Fit4,test)
submit_svm_cv = data.frame(PassengerId = test$PassengerId, Survived = pred_svm_cv)
## write the submission file
# write.csv(submit_svm_cv,"data/submit_svm_cv.csv",row.names = FALSE)
# model evaluation: confusion matrix on the training set
table(train$Survived,predict(svm_cv_Fit4,train,type = "raw"))
#>
#> 0 1
#> 0 519 30
#> 1 98 244
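None of the clusters created above is ever shut down; once modeling is finished, the workers can be released like this:
stopCluster(cl)   # stops the most recently created cluster
registerDoSEQ()   # return foreach to sequential execution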
Kaggle scores
Below are the scores after submitting these predictions to Kaggle; they all land around 0.78.
To push the score higher, new features would need to be engineered before modeling again.
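Two hypothetical candidates in that direction, sketched in data.table: the number of passengers sharing a ticket, and the fare per person:
# hypothetical new features: ticket group size and per-person fare
combine[, TicketCount := .N, by = Ticket]
combine[, FarePerPerson := Fare / TicketCount]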