# Titanic 資料分析 #3
# Read files
# Loads the Kaggle Titanic train/test CSVs from fixed local paths, attaches the
# packages used by the rest of the script, and stacks both sets into `combine`.

# stringsAsFactors = TRUE keeps text columns as factors on R >= 4.0, where the
# read.csv default changed; the later as.character(combine$Name) call suggests
# the script was written against the factor default.
path_1 <- "D:/R language/Kaggle/Titanic/train.csv"
train <- read.csv(path_1, stringsAsFactors = TRUE)
path_2 <- "D:/R language/Kaggle/Titanic/test.csv"
test <- read.csv(path_2, stringsAsFactors = TRUE)

# Install a package only when it is missing, so re-running the script does not
# re-download it every time.
# NOTE(review): DMwR appears to have been archived from CRAN, so
# install.packages("DMwR") may fail on a current R setup -- confirm.
if (!requireNamespace("DMwR", quietly = TRUE)) {
  install.packages("DMwR")
}
library(DMwR)
if (!requireNamespace("rpart", quietly = TRUE)) {
  install.packages("rpart")
}
library(rpart)

# The test set has no Survived column; add it as NA so both frames share the
# same columns and can be row-bound.
test$Survived <- NA

# Replace blank Embarked values in the training set with "S".
train$Embarked[train$Embarked == ""] <- "S"

# Training rows come first, test rows after them.
combine <- rbind(train, test)
# Fill in missing values with kNN imputation.
# NOTE(review): `combine` includes the test rows whose Survived was set to NA
# earlier in the script, so the imputation presumably fills those as well --
# confirm this is harmless for the later train/test split.
combine <- knnImputation(combine)

# Preprocessing: derive a new attribute, FamilyId, by grouping passengers who
# appear to belong to the same family.

# Family size = siblings/spouses + parents/children + the passenger.
combine$Familymemb <- combine$SibSp + combine$Parch + 1
combine$Name <- as.character(combine$Name)

# Names are split on "," and "."; the second piece is taken as the title.
combine$Title <- vapply(
  combine$Name,
  function(x) strsplit(x, split = "[,.]")[[1]][2],
  character(1),
  USE.NAMES = FALSE
)
# Remove the first space (the one left after the comma); any later space,
# as in "the Countess", is kept.
combine$Title <- sub(" ", "", combine$Title)

# Collapse rare titles into broader groups.
combine$Title[combine$Title %in% c("Mme", "Mlle")] <- "Mlle"
combine$Title[combine$Title %in% c("Capt", "Don", "Major", "Sir")] <- "Sir"
combine$Title[
  combine$Title %in% c("Dona", "Lady", "the Countess", "Jonkheer")
] <- "Lady"
combine$Title <- factor(combine$Title)

# The first piece of the split name (before the comma) is the surname.
combine$Surname <- vapply(
  combine$Name,
  function(x) strsplit(x, split = "[,.]")[[1]][1],
  character(1),
  USE.NAMES = FALSE
)

# FamilyId = family size immediately followed by surname, with no separator.
combine$FamilyId <- paste0(as.character(combine$Familymemb), combine$Surname)

# Families with two or fewer members are all put in one class.
combine$FamilyId[combine$Familymemb <= 2] <- "Small"

# Some IDs claim a family size above 2 but occur at most twice in the data;
# table() exposes them, and they are folded into "Small" as well.
familyid <- data.frame(table(combine$FamilyId))
familyid <- familyid[familyid$Freq <= 2, ]
combine$FamilyId[combine$FamilyId %in% familyid$Var1] <- "Small"
combine$FamilyId <- as.factor(combine$FamilyId)
# Split `combine` back into its training and test parts. `train` still holds
# the training rows at this point, so its row count marks the boundary
# (rows 1-891 and 892-1309 for the Kaggle Titanic files).
n_train <- nrow(train)
train <- combine[seq_len(n_train), ]
test <- combine[(n_train + 1):nrow(combine), ]

# Predict with a decision tree.
# Columns used as predictors; they mirror the right-hand side of the formula.
predictors <- c(
  "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare",
  "Embarked", "Familymemb", "Title", "FamilyId"
)
fit <- rpart(
  as.factor(Survived) ~ Pclass + Sex + Age + SibSp + Parch + Fare +
    Embarked + Familymemb + Title + FamilyId,
  data = train,
  method = "class"
)
prediction <- predict(fit, test[, predictors], type = "class")

# as.numeric() on the predicted factor yields its level codes (1, 2), so 1 is
# subtracted below to get back to 0/1.
prediction <- as.numeric(prediction)
to_submit <- cbind(PassengerId = test$PassengerId, Survived = (prediction - 1))

# NOTE(review): the output file name has no ".csv" extension -- confirm that
# is intended before uploading.
write.csv(to_submit, file = "4_preprocessing+knn+dt", row.names = FALSE)
< 成功率 > 0.79426
< 分析 >
library(rattle)
留言
張貼留言