Titanic Survival Prediction (Kaggle)

options(width = 300)
knitr::opts_chunk$set(message = F,warning = F,comment = "#>",collapse = TRUE)

Reading the Data

library(data.table)
train=fread("data/train.csv",na.strings = c("",NA))
test=fread("data/test.csv",na.strings = c("",NA))
# Merge the two sets for joint preprocessing -- rbind two data.tables
combine = rbindlist(list(train, test), fill = TRUE)
#### Personally I don't recommend this: the test set should not be processed together with the training set; they should be handled separately
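If you do want to keep the two sets separate, as the comment above suggests, one option is to wrap the cleaning steps in a function and apply it to each set on its own. A minimal sketch (prep, train_clean and test_clean are made-up names; the function body only hints at the steps carried out below on combine):

# Sketch: apply the same preprocessing to train and test separately,
# so that no information from the test set leaks into the training set
prep <- function(dt) {
  dt <- copy(dt)                        # don't modify the input by reference
  dt[, Cabin := NULL]                   # same cleaning steps as shown below
  dt[is.na(Embarked), Embarked := "S"]
  dt
}
# train_clean <- prep(train); test_clean <- prep(test)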

Data Processing

# Count the missing values in each column
combine[,lapply(.SD, function(x)sum(is.na(x)))]
#>    PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
#> 1:           0      418      0    0   0 263     0     0      0    1  1014        2
# We clearly need to handle the missing columns and then derive some new features


library(zoo)
library(purrr)
combine[,Age := na.spline(Age)]   # impute Age with spline interpolation
combine[,Fare := na.spline(Fare)] # impute Fare with spline interpolation
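One caveat worth flagging (not in the original workflow): spline interpolation is unbounded, so imputed Age or Fare values can fall outside the observed range, even below zero. A quick sanity check and one possible clamp:

# Check the imputed ranges; na.spline can extrapolate to implausible values
summary(combine$Age)
summary(combine$Fare)
# One possible safeguard (an assumption, not from the original post):
combine[Age < 0, Age := 0]
combine[Fare < 0, Fare := 0]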

Since the Cabin variable is missing for far too many passengers, we simply drop it.

combine[,Cabin:=NULL]

Embarked is a character column, so we replace its missing values with the mode, or with the value of passengers of a similar type.

combine[is.na(Embarked),] ## look at the affected rows; impute from passengers with the same class and a similar fare
#>    PassengerId Survived Pclass                                      Name    Sex Age SibSp Parch Ticket Fare Embarked
#> 1:          62        1      1                       Icard, Miss. Amelie female  38     0     0 113572   80     <NA>
#> 2:         830        1      1 Stone, Mrs. George Nelson (Martha Evelyn) female  62     0     0 113572   80     <NA>
combine[Pclass == 1,.N,by=.(Embarked)]
#>    Embarked   N
#> 1:        C 141
#> 2:        S 177
#> 3:     <NA>   2
#> 4:        Q   3
# S is by far the most common port among first-class passengers, so use S

combine[is.na(Embarked), Embarked := "S"]
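The same fix can be written as a small reusable mode-imputation helper; a minimal sketch (impute_mode is a made-up name, not part of any package):

# Generic mode imputation for a categorical/character vector
impute_mode <- function(x) {
  mode_val <- names(sort(table(x), decreasing = TRUE))[1]
  x[is.na(x)] <- mode_val
  x
}
# combine[, Embarked := impute_mode(Embarked)]  # same effect as the manual fix above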

# Count the missing values in each column again
combine[,lapply(.SD, function(x)sum(is.na(x)))]
#>    PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Embarked
#> 1:           0      418      0    0   0   0     0     0      0    0        0

Feature Engineering

Extract titles from the passenger names and merge titles that mean the same thing.

# Extract the title from each name
library(stringr)
combine[,Name :=gsub("(.*, )|(\\..*)","",Name)]

str(combine)
#> Classes 'data.table' and 'data.frame':   1309 obs. of  11 variables:
#>  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
#>  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
#>  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
#>  $ Name       : chr  "Mr" "Mrs" "Miss" "Mrs" ...
#>  $ Sex        : chr  "male" "female" "female" "female" ...
#>  $ Age        : num  22 38 26 35 35 ...
#>  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
#>  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
#>  $ Ticket     : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
#>  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
#>  $ Embarked   : chr  "S" "C" "S" "S" ...
#>  - attr(*, ".internal.selfref")=<externalptr> 
#>  - attr(*, "index")= int 
#>   ..- attr(*, "__Pclass")= int  2 4 7 12 24 28 31 32 35 36 ...
# address(combine)
combine[,Name :=str_trim(Name)]

combine$Name %>% table()
#> .
#>         Capt          Col          Don         Dona           Dr     Jonkheer         Lady        Major       Master         Miss         Mlle          Mme           Mr          Mrs           Ms          Rev          Sir the Countess 
#>            1            4            1            1            8            1            1            2           61          260            2            1          757          197            2            8            1            1


## Merge equivalent titles
# treat the French Mlle and Mme as one title
#combi$appellation[combi$appellation %in% c("Mlle","Mme")] = "Mlle"
combine[Name %in% c("Mlle","Mme"),Name:="Mlle"]

combine[Name %in% c("Don","Major","Sir"),Name:="Sir"]

combine[Name %in% c("Jonkheer","Dona","the Countess","Lady"),Name:="Lady"]
combine[,table(Name)]
#> Name
#>   Capt    Col     Dr   Lady Master   Miss   Mlle     Mr    Mrs     Ms    Rev    Sir 
#>      1      4      8      4     61    260      3    757    197      2      8      4
## Convert the title to a factor
combine[,Name:= as.factor(Name)]
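Before going further it is worth checking that the merged titles actually carry signal; a quick aggregation over the labelled (training) rows, noting that Survived is still an integer 0/1 at this point:

# Survival rate by title on the training portion of the data
combine[!is.na(Survived), .(n = .N, surv_rate = mean(Survived)), by = Name][order(-surv_rate)]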

Construct the FamilySize variable

# Number of people in the family (including the passenger). Oddly, one assignment style errored here at first but worked on later attempts, so base assignment is used below

combine$FamilySize=as.numeric(combine$SibSp) + as.numeric(combine$Parch) + 1

combine$FamilySize[combine$FamilySize > 6 ] = "Large"
combine$FamilySize[combine$FamilySize <= 2] = "Small"

combine$FamilySize[combine$FamilySize > 2 & combine$FamilySize <= 6] =  "Middle" 

table(combine$FamilySize)
#> 
#>  Large Middle  Small 
#>     35    249   1025
combine[,FamilySize := as.factor(FamilySize)]
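The three-step recoding above mixes numeric and character values in the same column before converting to a factor. An equivalent one-pass version with cut() is sketched below (FamilySize2 is a throwaway name used only to verify agreement; the factor level order differs, but the groups are the same):

# Same binning in one step: (0,2] = Small, (2,6] = Middle, (6,Inf) = Large
combine[, FamilySize2 := cut(SibSp + Parch + 1,
                             breaks = c(0, 2, 6, Inf),
                             labels = c("Small", "Middle", "Large"))]
table(combine$FamilySize, combine$FamilySize2)  # the two codings should agree
combine[, FamilySize2 := NULL]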

For the Age variable, we can bin the ages into categories.

## Put children aged 1-3 in their own category, since they had a good chance of surviving together with their mothers
combine$Age[combine$Age<=1]=1
combine[Age<=3,Age_class := "small"]
combine[3<Age & Age<=14,Age_class := "juvenile"]
combine[14<Age & Age<=60,Age_class := "adult"]
combine[60<Age, Age_class := "old"]

table(combine$Age_class)
#> 
#>    adult juvenile      old    small 
#>     1108      102       43       56
combine[,Age_class:=as.factor(Age_class)]
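For this kind of multi-branch recoding, recent versions of data.table (>= 1.13.0) also offer fcase(); a sketch equivalent to the four statements above (Age_class2 is a throwaway name):

# Same binning written with fcase()
combine[, Age_class2 := fcase(
  Age <= 3,             "small",
  Age > 3  & Age <= 14, "juvenile",
  Age > 14 & Age <= 60, "adult",
  Age > 60,             "old"
)]
table(combine$Age_class, combine$Age_class2)  # should match
combine[, Age_class2 := NULL]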

Convert some character variables to factors

# combine[,PassengerId :=as.factor(PassengerId)]
# combine[,Survived := as.factor(Survived)]
# combine[,Pclass :=as.factor(Pclass)]
# combine[,Sex := as.factor(Sex)]
a = c("PassengerId","Survived","Pclass","Sex","Embarked")
combine[,(a):=lapply(.SD,function(x)as.factor(x)), .SDcols = a]
str(combine)
#> Classes 'data.table' and 'data.frame':   1309 obs. of  13 variables:
#>  $ PassengerId: Factor w/ 1309 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
#>  $ Survived   : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 1 2 2 ...
#>  $ Pclass     : Factor w/ 3 levels "1","2","3": 3 1 3 1 3 3 1 3 3 2 ...
#>  $ Name       : Factor w/ 12 levels "Capt","Col","Dr",..: 8 9 6 9 8 8 8 5 9 9 ...
#>  $ Sex        : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
#>  $ Age        : num  22 38 26 35 35 ...
#>  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
#>  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
#>  $ Ticket     : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
#>  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
#>  $ Embarked   : Factor w/ 3 levels "C","Q","S": 3 1 3 3 3 2 3 3 3 1 ...
#>  $ FamilySize : Factor w/ 3 levels "Large","Middle",..: 3 3 3 3 3 3 3 2 2 3 ...
#>  $ Age_class  : Factor w/ 4 levels "adult","juvenile",..: 1 1 1 1 1 1 1 4 1 2 ...
#>  - attr(*, ".internal.selfref")=<externalptr> 
#>  - attr(*, "index")= int

Splitting the Data

train <- combine[1:891,] %>% as.data.frame()
test <- combine[892:1309,] %>% as.data.frame()
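The Kaggle test set has no labels, so every evaluation below is done on the training data itself, which is optimistic. For a less biased estimate, one option is to hold out a validation set first; a minimal sketch with caret::createDataPartition (train_part and valid_part are made-up names and are not used in the rest of the post):

# Optional: stratified hold-out split of the labelled data
library(caret)
set.seed(123)
idx <- createDataPartition(train$Survived, p = 0.8, list = FALSE)
train_part <- train[idx, ]
valid_part <- train[-idx, ]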

Modeling

Decision Tree

## Decision tree model
library(rpart)
taitan_tree = rpart(Survived ~ Pclass + Name + Sex + Age_class + SibSp + Parch + Fare + Embarked + FamilySize, train, method = "class")
prediction <- predict(taitan_tree, test, type = "class")
submit=data.frame(PassengerId=test$PassengerId,Survived=prediction)

## Save the submission file
# write.csv(submit,"data/submit.csv",row.names = FALSE)

# Model evaluation: confusion matrix on the training set
table(train$Survived,predict(taitan_tree,train,type = "class"))
#>    
#>       0   1
#>   0 509  40
#>   1  96 246
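From the table above, training accuracy is (509 + 246) / 891 ≈ 0.85 (optimistic, since it is computed on the data the tree was fit on); the same number can be computed directly:

# Training accuracy of the decision tree
tab_tree <- table(train$Survived, predict(taitan_tree, train, type = "class"))
sum(diag(tab_tree)) / sum(tab_tree)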


library(rpart.plot)
rpart.plot(taitan_tree)

(Figure: decision tree plot of taitan_tree — kaggletaitan01)

Random Forest

### Random forest model
library(randomForest)
# Survived is a factor, so randomForest does classification automatically
model_rf = randomForest(Survived ~ Pclass + Name + Sex + Age_class + SibSp + Parch + Fare + Embarked + FamilySize, train)



pred <- predict(model_rf, test, type = "class")
submit_rf = data.frame(PassengerId = test$PassengerId, Survived = pred)

## Save the submission file
# write.csv(submit_rf,"data/submit_rf.csv",row.names = FALSE)

# Model evaluation: confusion matrix on the training set
table(train$Survived,predict(model_rf,train,type = "class"))
#>    
#>       0   1
#>   0 531  18
#>   1  67 275
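randomForest also reports variable importance, which is a quick way to see which of the engineered features the forest actually relies on; a minimal sketch:

# Variable importance of the fitted forest (mean decrease in Gini)
importance(model_rf)
varImpPlot(model_rf)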

The caret package

Random forest with caret

library(caret)
library(foreach)
library(doParallel)

no_cores <- detectCores() -1
cl<-makeCluster(no_cores)
registerDoParallel(cl)

fitControl <- trainControl(method = "repeatedcv",
                           number = 10,
                           repeats = 1)
grif_rf = expand.grid(.mtry=c(11:13))

set.seed(825)
rf_cv_Fit1 <- train(Survived ~ Pclass + Name + Sex + Age_class + SibSp + Parch + Fare + Embarked + FamilySize,
                 data = train,
                 metric ="Kappa",
                 method = "rf", 
                 trControl = fitControl,tuneGrid = grif_rf,
                 verbose = FALSE)

Random forest: tuning parameters and evaluation

rf_cv_Fit1
#> Random Forest 
#> 
#> 891 samples
#>   9 predictor
#>   2 classes: '0', '1' 
#> 
#> No pre-processing
#> Resampling: Cross-Validated (10 fold, repeated 1 times) 
#> Summary of sample sizes: 802, 802, 802, 802, 802, 802, ... 
#> Resampling results across tuning parameters:
#> 
#>   mtry  Accuracy   Kappa    
#>   11    0.8361423  0.6479535
#>   12    0.8249064  0.6235846
#>   13    0.8260175  0.6267026
#> 
#> Kappa was used to select the optimal model using the largest value.
#> The final value used for the model was mtry = 11.

pred_rf_cv = predict(rf_cv_Fit1,test,type = "raw")


submit_rf_cv = data.frame(PassengerId = test$PassengerId, Survived = pred_rf_cv)

## Save the submission file
# write.csv(submit_rf_cv,"data/submit_rf_cv.csv",row.names = FALSE)

# Model evaluation: confusion matrix on the training set
table(train$Survived,predict(rf_cv_Fit1,train,type = "raw"))
#>    
#>       0   1
#>   0 532  17
#>   1  54 288
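caret's confusionMatrix() wraps the same table together with accuracy, kappa, sensitivity and specificity; a sketch for the tuned forest (still evaluated on the training data):

# Richer training-set evaluation of the caret random forest
confusionMatrix(predict(rf_cv_Fit1, train), train$Survived, positive = "1")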

SVM

library(foreach)
library(doParallel)

no_cores <- detectCores() -1
cl<-makeCluster(no_cores)


fitControl <- trainControl(method = "repeatedcv",
                           number = 10,
                           repeats = 1)
grif_svm = expand.grid(sigma=seq(1,100,10),C=seq(1,100,10),Weight=c(1,100 / table(train$Survived)))

set.seed(825)
svm_cv_Fit1 <- train(Survived ~ Pclass + Name + Sex + Age_class + SibSp + Parch + Fare + Embarked + FamilySize,
                 data = train,
                 metric ="Kappa",
                 method = "svmRadialWeights", 
                 trControl = fitControl,tuneGrid = grif_svm,
                 verbose = FALSE)



svm_cv_Fit1
#> Support Vector Machines with Class Weights 
#> 
#> 891 samples
#>   9 predictor
#>   2 classes: '0', '1' 
#> 
#> No pre-processing
#> Resampling: Cross-Validated (10 fold, repeated 1 times) 
#> Summary of sample sizes: 802, 802, 802, 802, 802, 802, ... 
#> Resampling results across tuning parameters:
#> 
#>   sigma  C   Weight     Accuracy   Kappa    
#>    1      1  0.1821494  0.6476529  0.3450120
#>    1      1  0.2923977  0.6879650  0.4080850
#>    1      1  1.0000000  0.7811111  0.5336090
#>    1     11  0.1821494  0.7384270  0.4825548
#>    1     11  0.2923977  0.7676404  0.5318205
#>    1     11  1.0000000  0.7889513  0.5539126
#>    1     21  0.1821494  0.7373034  0.4800775
#>    1     21  0.2923977  0.7721348  0.5396339
#>    1     21  1.0000000  0.7878277  0.5513716
#>    1     31  0.1821494  0.7384270  0.4819020
#>    1     31  0.2923977  0.7721348  0.5396500
#>    1     31  1.0000000  0.7878277  0.5520975
#>    1     41  0.1821494  0.7384395  0.4816815
#>    1     41  0.2923977  0.7721348  0.5396500
#>    1     41  1.0000000  0.7878277  0.5528006
#>    1     51  0.1821494  0.7384519  0.4816418
#>    1     51  0.2923977  0.7687765  0.5332258
#>    1     51  1.0000000  0.7867166  0.5502534
#>    1     61  0.1821494  0.7384519  0.4816418
#>    1     61  0.2923977  0.7699001  0.5353125
#>    1     61  1.0000000  0.7889638  0.5549527
#>    1     71  0.1821494  0.7395755  0.4835483
#>    1     71  0.2923977  0.7698876  0.5354221
#>    1     71  1.0000000  0.7878527  0.5519801
#>    1     81  0.1821494  0.7373408  0.4792572
#>    1     81  0.2923977  0.7710112  0.5374317
#>    1     81  1.0000000  0.7878527  0.5519801
#>    1     91  0.1821494  0.7373408  0.4792572
#>    1     91  0.2923977  0.7698876  0.5349684
#>    1     91  1.0000000  0.7889763  0.5539713
#>   11      1  0.1821494  0.5926966  0.2634169
#>   11      1  0.2923977  0.6566167  0.3569706
#>   11      1  1.0000000  0.7464170  0.4589756
#>   11     11  0.1821494  0.7036954  0.4264619
#>   11     11  0.2923977  0.7407366  0.4872806
#>   11     11  1.0000000  0.7486642  0.4607651
#>   11     21  0.1821494  0.6958302  0.4115283
#>   11     21  0.2923977  0.7396130  0.4853630
#>   11     21  1.0000000  0.7486767  0.4596743
#>   11     31  0.1821494  0.6992010  0.4169313
#>   11     31  0.2923977  0.7373658  0.4810659
#>   11     31  1.0000000  0.7430587  0.4490506
#>   11     41  0.1821494  0.7003246  0.4189393
#>   11     41  0.2923977  0.7362422  0.4791788
#>   11     41  1.0000000  0.7430462  0.4501862
#>   11     51  0.1821494  0.7014357  0.4208375
#>   11     51  0.2923977  0.7362422  0.4791788
#>   11     51  1.0000000  0.7441698  0.4527104
#>   11     61  0.1821494  0.7014357  0.4208375
#>   11     61  0.2923977  0.7373658  0.4810659
#>   11     61  1.0000000  0.7452934  0.4553096
#>   11     71  0.1821494  0.7014357  0.4208375
#>   11     71  0.2923977  0.7385019  0.4830801
#>   11     71  1.0000000  0.7441698  0.4526016
#>   11     81  0.1821494  0.7014357  0.4208375
#>   11     81  0.2923977  0.7373783  0.4805621
#>   11     81  1.0000000  0.7441698  0.4526016
#>   11     91  0.1821494  0.7014357  0.4208375
#>   11     91  0.2923977  0.7396255  0.4843557
#>   11     91  1.0000000  0.7430462  0.4506031
#>   21      1  0.1821494  0.5915730  0.2627613
#>   21      1  0.2923977  0.6442697  0.3377452
#>   21      1  1.0000000  0.7452934  0.4553119
#>   21     11  0.1821494  0.6980774  0.4158949
#>   21     11  0.2923977  0.7351311  0.4778643
#>   21     11  1.0000000  0.7452934  0.4542599
#>   21     21  0.1821494  0.7014482  0.4214853
#>   21     21  0.2923977  0.7340075  0.4753755
#>   21     21  1.0000000  0.7441698  0.4526315
#>   21     31  0.1821494  0.7014482  0.4214853
#>   21     31  0.2923977  0.7328839  0.4728575
#>   21     31  1.0000000  0.7430462  0.4506291
#>   21     41  0.1821494  0.7014482  0.4214853
#>   21     41  0.2923977  0.7351436  0.4766543
#>   21     41  1.0000000  0.7441698  0.4538285
#>   21     51  0.1821494  0.7025718  0.4232449
#>   21     51  0.2923977  0.7362672  0.4785697
#>   21     51  1.0000000  0.7430462  0.4515215
#>   21     61  0.1821494  0.7025718  0.4226345
#>   21     61  0.2923977  0.7373908  0.4805335
#>   21     61  1.0000000  0.7430462  0.4515215
#>   21     71  0.1821494  0.7036954  0.4244115
#>   21     71  0.2923977  0.7385144  0.4831184
#>   21     71  1.0000000  0.7430462  0.4515215
#>   21     81  0.1821494  0.7059426  0.4280555
#>   21     81  0.2923977  0.7385144  0.4831184
#>   21     81  1.0000000  0.7430462  0.4515215
#>   21     91  0.1821494  0.7059426  0.4280555
#>   21     91  0.2923977  0.7385144  0.4831184
#>   21     91  1.0000000  0.7430462  0.4515215
#>   31      1  0.1821494  0.5893258  0.2595359
#>   31      1  0.2923977  0.6476654  0.3435429
#>   31      1  1.0000000  0.7441823  0.4522360
#>   31     11  0.1821494  0.6992010  0.4175620
#>   31     11  0.2923977  0.7340075  0.4758536
#>   31     11  1.0000000  0.7419351  0.4474695
#>   31     21  0.1821494  0.7003246  0.4195318
#>   31     21  0.2923977  0.7317603  0.4714543
#>   31     21  1.0000000  0.7408115  0.4464669
#>   31     31  0.1821494  0.7003246  0.4195318
#>   31     31  0.2923977  0.7340200  0.4752511
#>   31     31  1.0000000  0.7408115  0.4464669
#>   31     41  0.1821494  0.7014482  0.4206810
#>   31     41  0.2923977  0.7362672  0.4791020
#>   31     41  1.0000000  0.7396879  0.4444013
#>   31     51  0.1821494  0.7025718  0.4225303
#>   31     51  0.2923977  0.7362672  0.4791020
#>   31     51  1.0000000  0.7385643  0.4424209
#>   31     61  0.1821494  0.7025718  0.4225544
#>   31     61  0.2923977  0.7373908  0.4815624
#>   31     61  1.0000000  0.7385643  0.4424209
#>   31     71  0.1821494  0.7025718  0.4225544
#>   31     71  0.2923977  0.7373908  0.4815624
#>   31     71  1.0000000  0.7374407  0.4403604
#>   31     81  0.1821494  0.7025718  0.4225544
#>   31     81  0.2923977  0.7385144  0.4834688
#>   31     81  1.0000000  0.7363171  0.4391270
#>   31     91  0.1821494  0.7036954  0.4249922
#>   31     91  0.2923977  0.7396380  0.4853753
#>   31     91  1.0000000  0.7363171  0.4391270
#>   41      1  0.1821494  0.5893258  0.2601027
#>   41      1  0.2923977  0.6398002  0.3310217
#>   41      1  1.0000000  0.7441823  0.4522816
#>   41     11  0.1821494  0.6992010  0.4185330
#>   41     11  0.2923977  0.7306367  0.4695862
#>   41     11  1.0000000  0.7385643  0.4412077
#>   41     21  0.1821494  0.6980774  0.4159480
#>   41     21  0.2923977  0.7328964  0.4733640
#>   41     21  1.0000000  0.7385643  0.4412077
#>   41     31  0.1821494  0.6958552  0.4116192
#>   41     31  0.2923977  0.7362672  0.4791020
#>   41     31  1.0000000  0.7374407  0.4392273
#>   41     41  0.1821494  0.6969913  0.4133824
#>   41     41  0.2923977  0.7362672  0.4791020
#>   41     41  1.0000000  0.7374407  0.4392273
#>   41     51  0.1821494  0.6992385  0.4176660
#>   41     51  0.2923977  0.7373908  0.4810085
#>   41     51  1.0000000  0.7363171  0.4379938
#>   41     61  0.1821494  0.6969913  0.4135130
#>   41     61  0.2923977  0.7373908  0.4804740
#>   41     61  1.0000000  0.7363171  0.4391270
#>   41     71  0.1821494  0.6981149  0.4160191
#>   41     71  0.2923977  0.7373908  0.4811373
#>   41     71  1.0000000  0.7363171  0.4391270
#>   41     81  0.1821494  0.6992385  0.4185078
#>   41     81  0.2923977  0.7373908  0.4811373
#>   41     81  1.0000000  0.7363171  0.4391270
#>   41     91  0.1821494  0.6992385  0.4185078
#>   41     91  0.2923977  0.7373908  0.4805263
#>   41     91  1.0000000  0.7351935  0.4370889
#>   51      1  0.1821494  0.5893258  0.2601027
#>   51      1  0.2923977  0.6386767  0.3292930
#>   51      1  1.0000000  0.7430712  0.4485869
#>   51     11  0.1821494  0.6936205  0.4087871
#>   51     11  0.2923977  0.7272784  0.4641883
#>   51     11  1.0000000  0.7385643  0.4412077
#>   51     21  0.1821494  0.6913858  0.4044756
#>   51     21  0.2923977  0.7328964  0.4737335
#>   51     21  1.0000000  0.7374407  0.4392273
#>   51     31  0.1821494  0.6925094  0.4064855
#>   51     31  0.2923977  0.7351436  0.4779378
#>   51     31  1.0000000  0.7363171  0.4379938
#>   51     41  0.1821494  0.6936330  0.4082625
#>   51     41  0.2923977  0.7351436  0.4774471
#>   51     41  1.0000000  0.7363171  0.4379938
#>   51     51  0.1821494  0.6970037  0.4149275
#>   51     51  0.2923977  0.7362672  0.4793616
#>   51     51  1.0000000  0.7351935  0.4359333
#>   51     61  0.1821494  0.6981273  0.4174162
#>   51     61  0.2923977  0.7362672  0.4793616
#>   51     61  1.0000000  0.7340699  0.4338953
#>   51     71  0.1821494  0.6981273  0.4167874
#>   51     71  0.2923977  0.7385144  0.4832641
#>   51     71  1.0000000  0.7340699  0.4338953
#>   51     81  0.1821494  0.6981273  0.4167874
#>   51     81  0.2923977  0.7373908  0.4814395
#>   51     81  1.0000000  0.7351935  0.4358535
#>   51     91  0.1821494  0.7003745  0.4205404
#>   51     91  0.2923977  0.7373908  0.4814395
#>   51     91  1.0000000  0.7329463  0.4318763
#>   61      1  0.1821494  0.5826217  0.2502068
#>   61      1  0.2923977  0.6285643  0.3133638
#>   61      1  1.0000000  0.7408240  0.4441011
#>   61     11  0.1821494  0.6925094  0.4069598
#>   61     11  0.2923977  0.7284020  0.4660563
#>   61     11  1.0000000  0.7363296  0.4355851
#>   61     21  0.1821494  0.6913858  0.4046547
#>   61     21  0.2923977  0.7328964  0.4737335
#>   61     21  1.0000000  0.7363296  0.4355851
#>   61     31  0.1821494  0.6936330  0.4082625
#>   61     31  0.2923977  0.7317728  0.4713558
#>   61     31  1.0000000  0.7340824  0.4316345
#>   61     41  0.1821494  0.6947566  0.4107686
#>   61     41  0.2923977  0.7340200  0.4752675
#>   61     41  1.0000000  0.7340824  0.4315119
#>   61     51  0.1821494  0.6958801  0.4132805
#>   61     51  0.2923977  0.7351436  0.4771909
#>   61     51  1.0000000  0.7340824  0.4314321
#>   61     61  0.1821494  0.6981273  0.4169757
#>   61     61  0.2923977  0.7318227  0.4572936
#>   61     61  1.0000000  0.7329588  0.4294738
#>   61     71  0.1821494  0.6992509  0.4188250
#>   61     71  0.2923977  0.7307116  0.4542080
#>   61     71  1.0000000  0.7329463  0.4305823
#>   61     81  0.1821494  0.7003745  0.4206999
#>   61     81  0.2923977  0.7329588  0.4580270
#>   61     81  1.0000000  0.7329463  0.4305823
#>   61     91  0.1821494  0.7026217  0.4244271
#>   61     91  0.2923977  0.7329588  0.4580270
#>   61     91  1.0000000  0.7329463  0.4305823
#>   71      1  0.1821494  0.5814981  0.2485895
#>   71      1  0.2923977  0.6251935  0.3078469
#>   71      1  1.0000000  0.7419476  0.4467029
#>   71     11  0.1821494  0.6880150  0.3997900
#>   71     11  0.2923977  0.7272784  0.4639733
#>   71     11  1.0000000  0.7340824  0.4312532
#>   71     21  0.1821494  0.6902622  0.4026816
#>   71     21  0.2923977  0.7272784  0.4635862
#>   71     21  1.0000000  0.7352060  0.4343127
#>   71     31  0.1821494  0.6936330  0.4093879
#>   71     31  0.2923977  0.7328964  0.4733328
#>   71     31  1.0000000  0.7340824  0.4322522
#>   71     41  0.1821494  0.6958801  0.4132805
#>   71     41  0.2923977  0.7295880  0.4522289
#>   71     41  1.0000000  0.7329588  0.4302141
#>   71     51  0.1821494  0.6981273  0.4169757
#>   71     51  0.2923977  0.7318227  0.4563957
#>   71     51  1.0000000  0.7329588  0.4302141
#>   71     61  0.1821494  0.6992509  0.4188250
#>   71     61  0.2923977  0.7329463  0.4582203
#>   71     61  1.0000000  0.7307116  0.4255170
#>   71     71  0.1821494  0.7026217  0.4244271
#>   71     71  0.2923977  0.7340699  0.4611126
#>   71     71  1.0000000  0.7307116  0.4255170
#>   71     81  0.1821494  0.7037453  0.4263832
#>   71     81  0.2923977  0.7351935  0.4636264
#>   71     81  1.0000000  0.7307116  0.4255170
#>   71     91  0.1821494  0.7037453  0.4263832
#>   71     91  0.2923977  0.7340824  0.4614589
#>   71     91  1.0000000  0.7307116  0.4260402
#>   81      1  0.1821494  0.5792634  0.2455210
#>   81      1  0.2923977  0.6251935  0.3078469
#>   81      1  1.0000000  0.7408240  0.4446288
#>   81     11  0.1821494  0.6868914  0.3977439
#>   81     11  0.2923977  0.7261673  0.4627250
#>   81     11  1.0000000  0.7329588  0.4291927
#>   81     21  0.1821494  0.6925094  0.4076129
#>   81     21  0.2923977  0.7295256  0.4680218
#>   81     21  1.0000000  0.7329588  0.4299807
#>   81     31  0.1821494  0.6936330  0.4101861
#>   81     31  0.2923977  0.7284519  0.4509761
#>   81     31  1.0000000  0.7307116  0.4258822
#>   81     41  0.1821494  0.6970037  0.4157226
#>   81     41  0.2923977  0.7295755  0.4538531
#>   81     41  1.0000000  0.7318352  0.4281951
#>   81     51  0.1821494  0.6992509  0.4193634
#>   81     51  0.2923977  0.7318227  0.4571451
#>   81     51  1.0000000  0.7318352  0.4281951
#>   81     61  0.1821494  0.7026217  0.4251154
#>   81     61  0.2923977  0.7318352  0.4569506
#>   81     61  1.0000000  0.7318352  0.4287183
#>   81     71  0.1821494  0.7026217  0.4251154
#>   81     71  0.2923977  0.7318477  0.4563842
#>   81     71  1.0000000  0.7318352  0.4287183
#>   81     81  0.1821494  0.7048689  0.4286436
#>   81     81  0.2923977  0.7318477  0.4563842
#>   81     81  1.0000000  0.7318352  0.4287183
#>   81     91  0.1821494  0.7048689  0.4281019
#>   81     91  0.2923977  0.7318602  0.4562315
#>   81     91  1.0000000  0.7318352  0.4286214
#>   91      1  0.1821494  0.5781523  0.2440235
#>   91      1  0.2923977  0.6240699  0.3062144
#>   91      1  1.0000000  0.7408240  0.4450119
#>   91     11  0.1821494  0.6846442  0.3944491
#>   91     11  0.2923977  0.7261673  0.4627498
#>   91     11  1.0000000  0.7318352  0.4279203
#>   91     21  0.1821494  0.6913858  0.4061352
#>   91     21  0.2923977  0.7250811  0.4451708
#>   91     21  1.0000000  0.7318352  0.4279203
#>   91     31  0.1821494  0.6947566  0.4117704
#>   91     31  0.2923977  0.7284519  0.4518740
#>   91     31  1.0000000  0.7284644  0.4218471
#>   91     41  0.1821494  0.6992509  0.4193634
#>   91     41  0.2923977  0.7295880  0.4535101
#>   91     41  1.0000000  0.7284644  0.4218471
#>   91     51  0.1821494  0.7014981  0.4231977
#>   91     51  0.2923977  0.7284769  0.4504299
#>   91     51  1.0000000  0.7284644  0.4223703
#>   91     61  0.1821494  0.7037453  0.4268340
#>   91     61  0.2923977  0.7307241  0.4549168
#>   91     61  1.0000000  0.7284644  0.4223703
#>   91     71  0.1821494  0.7037453  0.4268340
#>   91     71  0.2923977  0.7318477  0.4569850
#>   91     71  1.0000000  0.7284644  0.4223703
#>   91     81  0.1821494  0.7037453  0.4268340
#>   91     81  0.2923977  0.7329838  0.4593189
#>   91     81  1.0000000  0.7295880  0.4249469
#>   91     91  0.1821494  0.7048689  0.4288071
#>   91     91  0.2923977  0.7329838  0.4593189
#>   91     91  1.0000000  0.7307116  0.4269096
#> 
#> Kappa was used to select the optimal model using the largest value.
#> The final values used for the model were sigma = 1, C = 61 and Weight = 1.

SVM parameter search: narrowing the range

# Best parameters from the run above: sigma = 1, C = 61 and Weight = 1.

no_cores <- detectCores() -1
cl<-makeCluster(no_cores)
#registerDoParallel(cl) # required on Windows; not needed on macOS

grif_svm2 = expand.grid(sigma=seq(1,10,1),C=seq(50,70,1),Weight=1)

set.seed(825)
svm_cv_Fit2 <- train(Survived ~ Pclass + Name + Sex + Age_class + SibSp + Parch + Fare + Embarked + FamilySize,
                 data = train,
                 metric ="Kappa",
                 method = "svmRadialWeights", 
                 trControl = fitControl,tuneGrid = grif_svm2,
                 verbose = FALSE)



svm_cv_Fit2
#> Support Vector Machines with Class Weights 
#> 
#> 891 samples
#>   9 predictor
#>   2 classes: '0', '1' 
#> 
#> No pre-processing
#> Resampling: Cross-Validated (10 fold, repeated 1 times) 
#> Summary of sample sizes: 802, 802, 802, 802, 802, 802, ... 
#> Resampling results across tuning parameters:
#> 
#>   sigma  C   Accuracy   Kappa    
#>    1     50  0.7867166  0.5502534
#>    1     51  0.7867166  0.5502534
#>    1     52  0.7878402  0.5523393
#>    1     53  0.7878402  0.5523393
#>    1     54  0.7878402  0.5523393
#>    1     55  0.7878402  0.5523393
#>    1     56  0.7878402  0.5523393
#>    1     57  0.7878402  0.5523393
#>    1     58  0.7889638  0.5549527
#>    1     59  0.7889638  0.5549527
#>    1     60  0.7889638  0.5549527
#>    1     61  0.7889638  0.5549527
#>    1     62  0.7889638  0.5549527
#>    1     63  0.7889638  0.5549527
#>    1     64  0.7878527  0.5523786
#>    1     65  0.7878527  0.5523786
#>    1     66  0.7878527  0.5523786
#>    1     67  0.7878527  0.5523786
#>    1     68  0.7878527  0.5519801
#>    1     69  0.7878527  0.5519801
#>    1     70  0.7878527  0.5519801
#>    2     50  0.7822222  0.5437950
#>    2     51  0.7822222  0.5437950
#>    2     52  0.7822222  0.5437950
#>    2     53  0.7833458  0.5458818
#>    2     54  0.7833458  0.5458818
#>    2     55  0.7833458  0.5458818
#>    2     56  0.7833458  0.5458818
#>    2     57  0.7822347  0.5433076
#>    2     58  0.7822347  0.5433076
#>    2     59  0.7789014  0.5348965
#>    2     60  0.7777903  0.5321784
#>    2     61  0.7766792  0.5294305
#>    2     62  0.7755680  0.5266522
#>    2     63  0.7744569  0.5238432
#>    2     64  0.7744569  0.5238432
#>    2     65  0.7744569  0.5238432
#>    2     66  0.7744569  0.5238432
#>    2     67  0.7744569  0.5238432
#>    2     68  0.7744569  0.5238432
#>    2     69  0.7744569  0.5238432
#>    2     70  0.7744569  0.5238432
#>    3     50  0.7677653  0.5079159
#>    3     51  0.7677653  0.5079159
#>    3     52  0.7677653  0.5079159
#>    3     53  0.7677653  0.5079159
#>    3     54  0.7677653  0.5079159
#>    3     55  0.7677653  0.5079159
#>    3     56  0.7677653  0.5079159
#>    3     57  0.7677653  0.5079159
#>    3     58  0.7677653  0.5079159
#>    3     59  0.7688889  0.5099863
#>    3     60  0.7688889  0.5099863
#>    3     61  0.7688889  0.5099863
#>    3     62  0.7688889  0.5099863
#>    3     63  0.7688889  0.5099863
#>    3     64  0.7688889  0.5099863
#>    3     65  0.7688889  0.5099863
#>    3     66  0.7688889  0.5099863
#>    3     67  0.7688889  0.5099863
#>    3     68  0.7677653  0.5073891
#>    3     69  0.7666417  0.5053186
#>    3     70  0.7666417  0.5053186
#>    4     50  0.7576404  0.4904061
#>    4     51  0.7576404  0.4904061
#>    4     52  0.7565169  0.4884006
#>    4     53  0.7565169  0.4884006
#>    4     54  0.7565169  0.4884006
#>    4     55  0.7565169  0.4884006
#>    4     56  0.7565169  0.4884006
#>    4     57  0.7565169  0.4884006
#>    4     58  0.7553933  0.4862959
#>    4     59  0.7553933  0.4862959
#>    4     60  0.7553933  0.4862955
#>    4     61  0.7542697  0.4843110
#>    4     62  0.7542697  0.4843110
#>    4     63  0.7542697  0.4843110
#>    4     64  0.7542697  0.4843110
#>    4     65  0.7542697  0.4843110
#>    4     66  0.7542697  0.4843110
#>    4     67  0.7542697  0.4843110
#>    4     68  0.7542697  0.4843110
#>    4     69  0.7542697  0.4843110
#>    4     70  0.7542697  0.4843110
#>    5     50  0.7509114  0.4678279
#>    5     51  0.7509114  0.4678279
#>    5     52  0.7509114  0.4678279
#>    5     53  0.7509114  0.4678279
#>    5     54  0.7509114  0.4678279
#>    5     55  0.7509114  0.4678279
#>    5     56  0.7509114  0.4678279
#>    5     57  0.7509114  0.4678279
#>    5     58  0.7509114  0.4678279
#>    5     59  0.7509114  0.4678279
#>    5     60  0.7509114  0.4678279
#>    5     61  0.7509114  0.4678279
#>    5     62  0.7509114  0.4678279
#>    5     63  0.7509114  0.4678279
#>    5     64  0.7509114  0.4678279
#>    5     65  0.7509114  0.4678279
#>    5     66  0.7509114  0.4678279
#>    5     67  0.7509114  0.4678279
#>    5     68  0.7497878  0.4657843
#>    5     69  0.7497878  0.4657843
#>    5     70  0.7497878  0.4657843
#>    6     50  0.7464170  0.4566695
#>    6     51  0.7464170  0.4566695
#>    6     52  0.7464170  0.4566695
#>    6     53  0.7464170  0.4566695
#>    6     54  0.7464170  0.4566695
#>    6     55  0.7452934  0.4539311
#>    6     56  0.7452934  0.4539311
#>    6     57  0.7441698  0.4519093
#>    6     58  0.7441698  0.4519093
#>    6     59  0.7441698  0.4519093
#>    6     60  0.7441698  0.4519093
#>    6     61  0.7441698  0.4519093
#>    6     62  0.7441698  0.4519093
#>    6     63  0.7441698  0.4519093
#>    6     64  0.7441698  0.4519093
#>    6     65  0.7441698  0.4519093
#>    6     66  0.7441698  0.4519093
#>    6     67  0.7441698  0.4519093
#>    6     68  0.7441698  0.4519093
#>    6     69  0.7441698  0.4519093
#>    6     70  0.7441698  0.4519093
#>    7     50  0.7430462  0.4497065
#>    7     51  0.7430462  0.4497065
#>    7     52  0.7430462  0.4497065
#>    7     53  0.7430462  0.4497065
#>    7     54  0.7430462  0.4497065
#>    7     55  0.7430462  0.4497065
#>    7     56  0.7430462  0.4497065
#>    7     57  0.7430462  0.4497065
#>    7     58  0.7430462  0.4497065
#>    7     59  0.7430462  0.4497065
#>    7     60  0.7430462  0.4497065
#>    7     61  0.7419226  0.4477303
#>    7     62  0.7408115  0.4446395
#>    7     63  0.7408115  0.4446395
#>    7     64  0.7408115  0.4446395
#>    7     65  0.7408115  0.4446395
#>    7     66  0.7408115  0.4446395
#>    7     67  0.7408115  0.4446395
#>    7     68  0.7408115  0.4446395
#>    7     69  0.7396879  0.4414400
#>    7     70  0.7408115  0.4434618
#>    8     50  0.7441823  0.4508545
#>    8     51  0.7441823  0.4508545
#>    8     52  0.7441823  0.4508545
#>    8     53  0.7441823  0.4508545
#>    8     54  0.7430587  0.4488783
#>    8     55  0.7419351  0.4469231
#>    8     56  0.7419351  0.4469231
#>    8     57  0.7419351  0.4469231
#>    8     58  0.7419351  0.4469231
#>    8     59  0.7419351  0.4469231
#>    8     60  0.7419351  0.4469231
#>    8     61  0.7419351  0.4469231
#>    8     62  0.7419351  0.4469231
#>    8     63  0.7419351  0.4469231
#>    8     64  0.7419351  0.4469231
#>    8     65  0.7408115  0.4443873
#>    8     66  0.7408115  0.4443873
#>    8     67  0.7408115  0.4443873
#>    8     68  0.7408115  0.4443873
#>    8     69  0.7408115  0.4443873
#>    8     70  0.7408115  0.4443873
#>    9     50  0.7408115  0.4443873
#>    9     51  0.7408115  0.4443873
#>    9     52  0.7408115  0.4443873
#>    9     53  0.7408115  0.4443873
#>    9     54  0.7408115  0.4443873
#>    9     55  0.7408115  0.4443873
#>    9     56  0.7408115  0.4443873
#>    9     57  0.7408115  0.4443873
#>    9     58  0.7408115  0.4443873
#>    9     59  0.7408115  0.4443873
#>    9     60  0.7408115  0.4443873
#>    9     61  0.7408115  0.4443873
#>    9     62  0.7408115  0.4443873
#>    9     63  0.7408115  0.4443873
#>    9     64  0.7408115  0.4443873
#>    9     65  0.7408115  0.4443873
#>    9     66  0.7419351  0.4469115
#>    9     67  0.7419351  0.4469115
#>    9     68  0.7419351  0.4469115
#>    9     69  0.7419351  0.4469115
#>    9     70  0.7419351  0.4469115
#>   10     50  0.7408115  0.4443873
#>   10     51  0.7408115  0.4443873
#>   10     52  0.7408115  0.4443873
#>   10     53  0.7408115  0.4443873
#>   10     54  0.7419351  0.4469115
#>   10     55  0.7419351  0.4469115
#>   10     56  0.7419351  0.4469115
#>   10     57  0.7419351  0.4469115
#>   10     58  0.7419351  0.4469115
#>   10     59  0.7419351  0.4469115
#>   10     60  0.7430462  0.4500024
#>   10     61  0.7430462  0.4500024
#>   10     62  0.7430462  0.4500024
#>   10     63  0.7430462  0.4500024
#>   10     64  0.7430462  0.4500024
#>   10     65  0.7430462  0.4500024
#>   10     66  0.7430462  0.4500024
#>   10     67  0.7430462  0.4500024
#>   10     68  0.7430462  0.4500024
#>   10     69  0.7430462  0.4500024
#>   10     70  0.7430462  0.4500024
#> 
#> Tuning parameter 'Weight' was held constant at a value of 1
#> Kappa was used to select the optimal model using the largest value.
#> The final values used for the model were sigma = 1, C = 58 and Weight = 1.

# Narrow the grid further; best parameters above: sigma = 1, C = 58 and Weight = 1.
no_cores <- detectCores() -1
cl<-makeCluster(no_cores)
# registerDoParallel(cl)

grif_svm3 = expand.grid(sigma=seq(0,1,0.1),C=58,Weight=1)

set.seed(825)
svm_cv_Fit3 <- train(Survived ~ Pclass + Name + Sex + Age_class + SibSp + Parch + Fare + Embarked + FamilySize,
                 data = train,
                 metric ="Kappa",
                 method = "svmRadialWeights", 
                 trControl = fitControl,tuneGrid = grif_svm3,
                 verbose = FALSE)



svm_cv_Fit3
#> Support Vector Machines with Class Weights 
#> 
#> 891 samples
#>   9 predictor
#>   2 classes: '0', '1' 
#> 
#> No pre-processing
#> Resampling: Cross-Validated (10 fold, repeated 1 times) 
#> Summary of sample sizes: 802, 802, 802, 802, 802, 802, ... 
#> Resampling results across tuning parameters:
#> 
#>   sigma  Accuracy   Kappa    
#>   0.0    0.6161673  0.0000000
#>   0.1    0.8013358  0.5669074
#>   0.2    0.7979650  0.5612242
#>   0.3    0.7911985  0.5513664
#>   0.4    0.7900624  0.5519072
#>   0.5    0.7878402  0.5495903
#>   0.6    0.7889638  0.5520950
#>   0.7    0.7889638  0.5524516
#>   0.8    0.7889638  0.5537507
#>   0.9    0.7878527  0.5518741
#>   1.0    0.7889638  0.5549527
#> 
#> Tuning parameter 'C' was held constant at a value of 58
#> Tuning parameter 'Weight' was held constant at a value of 1
#> Kappa was used to select the optimal model using the largest value.
#> The final values used for the model were sigma = 0.1, C = 58 and Weight = 1.

# Narrow the grid further; best parameters above: sigma = 0.1, C = 58 and Weight = 1.
no_cores <- detectCores() -1
cl<-makeCluster(no_cores)
# registerDoParallel(cl)

grif_svm4 = expand.grid(sigma=seq(0,0.2,0.01),C=58,Weight=1)

set.seed(825)
svm_cv_Fit4 <- train(Survived ~ Pclass + Name + Sex + Age_class + SibSp + Parch + Fare + Embarked + FamilySize,
                 data = train,
                 metric ="Kappa",
                 method = "svmRadialWeights", 
                 trControl = fitControl,tuneGrid = grif_svm4,
                 verbose = FALSE)

# stopImplicitCluster()

svm_cv_Fit4
#> Support Vector Machines with Class Weights 
#> 
#> 891 samples
#>   9 predictor
#>   2 classes: '0', '1' 
#> 
#> No pre-processing
#> Resampling: Cross-Validated (10 fold, repeated 1 times) 
#> Summary of sample sizes: 802, 802, 802, 802, 802, 802, ... 
#> Resampling results across tuning parameters:
#> 
#>   sigma  Accuracy   Kappa    
#>   0.00   0.6161673  0.0000000
#>   0.01   0.8149064  0.5916957
#>   0.02   0.8114981  0.5856433
#>   0.03   0.8081273  0.5793144
#>   0.04   0.8036330  0.5700091
#>   0.05   0.7991511  0.5611674
#>   0.06   0.8002622  0.5645199
#>   0.07   0.8047191  0.5741737
#>   0.08   0.8047066  0.5738563
#>   0.09   0.8024719  0.5689492
#>   0.10   0.8013358  0.5669074
#>   0.11   0.8013358  0.5669991
#>   0.12   0.8002122  0.5643613
#>   0.13   0.8002122  0.5643613
#>   0.14   0.8013233  0.5663641
#>   0.15   0.8024469  0.5690019
#>   0.16   0.8013358  0.5668501
#>   0.17   0.8002122  0.5645959
#>   0.18   0.7979650  0.5601587
#>   0.19   0.7957179  0.5563902
#>   0.20   0.7979650  0.5612242
#> 
#> Tuning parameter 'C' was held constant at a value of 58
#> Tuning parameter 'Weight' was held constant at a value of 1
#> Kappa was used to select the optimal model using the largest value.
#> The final values used for the model were sigma = 0.01, C = 58 and Weight = 1.
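Narrowing the grid by hand over several rounds, as above, works but is tedious. caret can instead sample the tuning space randomly (search = "random" plus tuneLength), assuming the svmRadialWeights model supports random search; a hedged sketch, with fitControl_rand and svm_rand as made-up names:

# Alternative to manual grid refinement: random search over the tuning space
fitControl_rand <- trainControl(method = "repeatedcv", number = 10, repeats = 1,
                                search = "random")
set.seed(825)
svm_rand <- train(Survived ~ Pclass + Name + Sex + Age_class + SibSp + Parch +
                    Fare + Embarked + FamilySize,
                  data = train, metric = "Kappa",
                  method = "svmRadialWeights",
                  trControl = fitControl_rand, tuneLength = 30)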

SVM prediction

# Best parameters overall: sigma = 0.01, C = 58 and Weight = 1.

pred_svm_cv = predict(svm_cv_Fit4, test)
submit_svm_cv = data.frame(PassengerId = test$PassengerId, Survived = pred_svm_cv)

## Save the submission file
# write.csv(submit_svm_cv,"data/submit_svm_cv.csv",row.names = FALSE)

# Model evaluation: confusion matrix on the training set
table(train$Survived,predict(svm_cv_Fit4,train,type = "raw"))
#>    
#>       0   1
#>   0 519  30
#>   1  98 244
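Since the two caret models were tuned with the same 10-fold scheme and the same seed, their cross-validated results can be compared directly with resamples(); a minimal sketch:

# Compare cross-validated performance of the tuned random forest and SVM
res <- resamples(list(rf = rf_cv_Fit1, svm = svm_cv_Fit4))
summary(res)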

Kaggle scores

Below are the scores after submitting these predictions to Kaggle; they all come in at around 0.78.

To push the score higher, we would need to engineer new features and rebuild the models.
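As one hedged example of such a feature, the Ticket column (unused above) could give a group-size signal: passengers sharing a ticket probably travelled together, which overlaps with but is not identical to FamilySize. TicketGroup and FarePerPerson below are made-up names:

# Sketch of two possible new features derived from Ticket and Fare
combine[, TicketGroup := .N, by = Ticket]
combine[, FarePerPerson := Fare / TicketGroup]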

