Titanic資料分析#2

Titanic資料分析#2

2017/3/11

今天練習用 Kaggle 給的 train data 自己做一個正確率的測試。比較 knn 或 mice 填補的值配上 cart 或 randomforest 建出來的模型何者正確率較高。  

# Read files
# NOTE(review): paths are machine-specific; adjust to your own layout.
path_1 <- "D:/R language/Kaggle/Titanic/train.csv"  # fixed: removed stray trailing space inside the path string
train <- read.csv(path_1)
path_2 <- "D:/R language/Kaggle/Titanic/test.csv"
test <- read.csv(path_2)
# Install packages only when missing, then load them
if (!requireNamespace("DMwR", quietly = TRUE)) install.packages("DMwR")
library(DMwR)          # knnImputation()
if (!requireNamespace("rpart", quietly = TRUE)) install.packages("rpart")
library(rpart)         # CART decision trees
if (!requireNamespace("randomForest", quietly = TRUE)) install.packages("randomForest")
library(randomForest)
if (!requireNamespace("mice", quietly = TRUE)) install.packages("mice")
library(mice)  # plan: split train 7:3 into training and testing data

# Build testing mechanism: split the labelled train set 70/30 so predictions
# can be scored against known Survived values.
train$Embarked[train$Embarked == ""] <- "S"  # fill the blank ports with the mode "S"
n_obs <- nrow(train)             # generalized from the hard-coded 891
split_at <- round(0.7 * n_obs)   # 70% of rows go to the training half
check_training <- train[1:split_at, ]
original_check_testing <- train[(split_at + 1):n_obs, ]  # keeps Survived for scoring
check_testing <- subset(train[(split_at + 1):n_obs, ], select = -c(Survived))
# Fill missing ages with the overall mean train age (was the magic constant 29.7)
check_testing$Age[is.na(check_testing$Age)] <- mean(train$Age, na.rm = TRUE)
# Knn imputation, then a CART tree and a random forest on the imputed data.
knn_features <- c("Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked")
knn_training <- check_training
set.seed(500)  # make the knn imputation reproducible
knn_imputeData <- knnImputation(knn_training)
# Knn + rpart
knn_fit <- rpart(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
                 knn_imputeData, method = "class")
knn_rpart_prediction <- predict(knn_fit, check_testing[, knn_features], type = "class")
# Knn + randomForest (Survived must be a factor for classification)
knn_rffit <- randomForest((as.factor(Survived)) ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
                          knn_imputeData, ntree = 100)
knn_rf_prediction <- predict(knn_rffit, check_testing[, knn_features], type = "class")
# Measure how much predictions differ when models are built on
# mice-imputed missing values.
# Mice + rpart
mice_features <- c("Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked")
check_training_fix <- check_training[, c(mice_features, "Survived")]
# Three imputed datasets (m = 3), CART-based imputation, fixed seed.
mice_imputeData <- mice(check_training_fix, m = 3, maxit = 50, method = "cart", seed = 500)
mice_imputeData_1 <- complete(mice_imputeData, 1)
mice_imputeData_2 <- complete(mice_imputeData, 2)
mice_imputeData_3 <- complete(mice_imputeData, 3)
set.seed(500)  # rpart's internal cross-validation draws random numbers
mice_fit1 <- rpart(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
                   mice_imputeData_1, method = "class")
mice_rpart_prediction1 <- predict(mice_fit1, check_testing[, mice_features], type = "class")
mice_fit2 <- rpart(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
                   mice_imputeData_2, method = "class")
mice_rpart_prediction2 <- predict(mice_fit2, check_testing[, mice_features], type = "class")
mice_fit3 <- rpart(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
                   mice_imputeData_3, method = "class")
mice_rpart_prediction3 <- predict(mice_fit3, check_testing[, mice_features], type = "class")
# Also explore whether lowering the number of trees lowers accuracy.
# Mice + randomforest
rf_features <- c("Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked")
mice_rf1 <- randomForest((as.factor(Survived)) ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
                         mice_imputeData_1, ntree = 100)
mice_rf_prediction1 <- predict(mice_rf1, check_testing[, rf_features], type = "class")
mice_rf2 <- randomForest((as.factor(Survived)) ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
                         mice_imputeData_2, ntree = 100)
mice_rf_prediction2 <- predict(mice_rf2, check_testing[, rf_features], type = "class")
# Deliberately reuses dataset 2 so that only ntree differs from mice_rf2
# (per the results table: "mice2 + randomforest(tree=50)").
mice_rf3 <- randomForest((as.factor(Survived)) ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
                         mice_imputeData_2, ntree = 50)
mice_rf_prediction3 <- predict(mice_rf3, check_testing[, rf_features], type = "class")
# Collect every prediction vector so they can all be scored in one pass.
waiting_list <- list(knn_rpart_prediction, knn_rf_prediction,
                     mice_rpart_prediction1, mice_rpart_prediction2, mice_rpart_prediction3,
                     mice_rf_prediction1, mice_rf_prediction2, mice_rf_prediction3)
# Correct function
#
# Computes the accuracy of each prediction vector against the held-out labels,
# printing each confusion matrix as a side effect.
#
# input_list : list of factor prediction vectors, one per model
# actual     : true Survived labels for the hold-out rows (defaults to the
#              global split created earlier in this script)
# n          : number of hold-out observations
#
# Returns a numeric vector of accuracies, one per element of input_list.
correct_fun <- function(input_list,
                        actual = original_check_testing$Survived,
                        n = nrow(check_testing)) {
  correct <- rep(NA_real_, length(input_list))  # preallocated locally (was global state)
  for (i in seq_along(input_list)) {            # fixed: iterate over input_list, not the global waiting_list
    confusion_matrix <- table(actual, input_list[[i]])
    # Diagonal of the confusion matrix = correctly classified observations.
    correct[i] <- sum(diag(confusion_matrix)) / n
    print(confusion_matrix)
  }
  correct  # fixed: no longer appends the stray loop index to the result
}
# Function call
correct_fun(waiting_list)

結果 (混淆矩陣,列 = 實際值,欄 = 預測值 0 / 1):

knn + cart:                      0: 158, 13;1: 31, 65
knn + randomforest:              0: 158, 13;1: 32, 64
mice1 + cart:                    0: 160, 11;1: 33, 63
mice2 + cart:                    0: 160, 11;1: 33, 63
mice3 + cart:                    0: 161, 10;1: 34, 62
mice1 + randomforest:            0: 157, 14;1: 30, 66
mice2 + randomforest(tree=100):  0: 157, 14;1: 25, 71
mice2 + randomforest(tree=50):   0: 154, 17;1: 30, 66

正確率:
knn + cart : 0.8352060 ≈ 83.52%
knn + randomforest : 0.8314607 ≈ 83.15%
mice1 + cart : 0.8352060 ≈ 83.52%
mice2 + cart : 0.8352060 ≈ 83.52%
mice3 + cart : 0.8352060 ≈ 83.52%
mice1 + randomforest : 0.8352060 ≈ 83.52%
mice2 + randomforest(tree=100) : 0.8539326 ≈ 85.39%
mice2 + randomforest(tree=50) : 0.8239700 ≈ 82.40%

< 小結 >

1. knn 與 mice 填補值做預測的差距不大
2. cart 的表現普遍較 randomforest 穩定
3. randomforest 裡 tree 數量的選擇影響大

< 可改進處 >

1. 此次建模型的資料僅 624 筆,測試資料僅 267 筆,資料數少則誤差大
2. 下次可試著找 tree 的數量為何值時正確率最高

留言

這個網誌中的熱門文章

填補遺漏值(Missing Value)方法

ggplot2 繪圖套件