Titanic Data Analysis #4
##Read files
path_1 <- "D:/R language/Kaggle/Titanic/train.csv"
train <- read.csv(path_1)
path_2 <- "D:/R language/Kaggle/Titanic/test.csv"
test <- read.csv(path_2)
##Install packages
install.packages("DMwR")
library(DMwR)
install.packages("randomForest")
library(randomForest)
install.packages("party")
library(party)
##Data exploration
test$Survived <- NA
combine <- rbind(train, test)
str(combine)
'data.frame': 1309 obs. of 12 variables:
$ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
$ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
$ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
$ Name : Factor w/ 1307 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
$ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
$ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
$ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
$ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
$ Ticket : Factor w/ 929 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
$ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
$ Cabin : Factor w/ 187 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
$ Embarked : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
● 12 variables, 1309 observations (891 used as train, 418 as test)
summary(combine)
● Missing values (NA's):
Age : 263
Fare : 1
Embarked : 2 blank entries
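As a quick cross-check of these counts, a minimal sketch using base R (is.na() does not catch the blank strings in Embarked, so those are counted separately):
colSums(is.na(combine))        # NA counts per column (Age, Fare, Survived)
sum(combine$Embarked == "")    # blank Embarked entries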
< Pclass >
ggplot(train, aes(x=Pclass))+geom_bar(aes(fill=as.factor(Survived)))
When Pclass = 2, the survival rate is roughly 50%.
When Pclass = 3, the proportion of deaths far exceeds the proportion of survivors.
● Pclass is one of the variables that influences survival (a numeric check follows below).
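A minimal sketch of the same column-wise proportion check used for Sex in the next subsection, applied here to Pclass:
pclass_table <- table(train$Survived, train$Pclass)
prop.table(pclass_table, 2)    # survival proportions within each Pclass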
< Sex >
ggplot(train, aes(x=Sex))+geom_bar(aes(fill=as.factor(Survived)))
sex_table <- table(train$Survived,train$Sex)
female male
0 81 468
1 233 109
prop.table(sex_table,2)
female male
0 0.2579618 0.8110919
1 0.7420382 0.1889081
● Sex is an important variable influencing Survived.
< Age >
age_plot <- train[complete.cases(train$Age),]
age_plot$age_level[age_plot$Age >= 60] <- "old"
age_plot$age_level[age_plot$Age >= 40 & age_plot$Age < 60] <- "adult"
age_plot$age_level[age_plot$Age >= 18 & age_plot$Age < 40] <- "young"
age_plot$age_level[age_plot$Age < 18] <- "children"
ggplot(age_plot, aes(x = age_level)) + geom_bar(aes(fill = as.factor(Survived)), position = "fill") + xlab("Age_level")
● Survival rates differ across age groups: children (< 18) can exceed a 50% survival rate, while the elderly (>= 60) drop to around 25%, which shows that Age is also an important variable for Survived (see the proportion check below).
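A minimal sketch of the proportions behind the filled bar chart:
prop.table(table(age_plot$Survived, age_plot$age_level), 2)    # survival proportion within each age level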
##Preprocessing
(1) Handling missing values
< Fare >
combine[is.na(combine$Fare),]
PassengerId Pclass Name Sex Age SibSp Parch Ticket
1044 3 Storey, Mr. Thomas male 60.5 0 0 3701
Fare Cabin Embarked Survived
NA S NA
summary(combine$Fare[combine$Embarked == "S" & combine$Pclass == 3])
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.000 7.854 8.050 14.440 15.900 69.550 1
fare_plot <- combine[combine$Pclass == 3 & combine$Embarked == "S",]
ggplot(fare_plot, aes(x = Fare)) + geom_density(fill = "blue")
● The mean and Q3 are very close while the max is far larger, indicating a right-skewed distribution, so we impute with the median rather than the mean.
combine$Fare[is.na(combine$Fare)] <- 8.05
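Equivalently, a minimal sketch that computes the value instead of hard-coding 8.05:
fare_med <- median(combine$Fare[combine$Pclass == 3 & combine$Embarked == "S"], na.rm = TRUE)    # 8.05
combine$Fare[is.na(combine$Fare)] <- fare_med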
< Embarked >
combine[combine$Embarked=="",]
PassengerId Survived Pclass Name Sex
62 1 1 Icard, Miss. Amelie female
830 1 1 Stone, Mrs. George Nelson (Martha Evelyn) female
Age SibSp Parch Ticket Fare Cabin Embarked
38 0 0 113572 80 B28
62 0 0 113572 80 B28
ggplot(combine, aes(x = Embarked, y = Fare)) + geom_boxplot(aes(fill = as.factor(Pclass))) + geom_hline(yintercept = 80, color = "red")
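A minimal numeric version of the same comparison, restricted to Pclass 1 (the class of the two passengers):
aggregate(Fare ~ Embarked, data = combine[combine$Pclass == 1,], FUN = median, na.rm = TRUE)    # median fare per port for Pclass 1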
● Both passengers paid a Fare of 80 in Pclass 1, and the red line at 80 matches the median fare of Pclass 1 passengers who embarked at "C", so we fill the two blank values with "C".
combine$Embarked[combine$Embarked == ""] <- "C"
< Age >
We choose kNN imputation (knnImputation() from DMwR) to fill in the missing Age values.
copy_comb <- knnImputation(combine)
combine$Age <- copy_comb$Age
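As a quick sanity check, a sketch comparing the Age summary before and after imputation:
summary(train$Age)      # original train Age (contains NAs)
summary(combine$Age)    # Age after kNN imputation (no NAs remain)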
(2) Finding hidden features
combine$FamilySize <- combine$SibSp+combine$Parch+1
ggplot( combine[1:891,], aes( x = FamilySize)) + geom_bar( aes( fill = factor( Survived)), stat= 'count', position = 'dodge')
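A minimal sketch of the proportions behind that bar chart, using only the training rows of combine:
prop.table(table(combine$Survived[1:891], combine$FamilySize[1:891]), 2)    # survival proportion by family size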
combine$Name <- as.character(combine$Name)
combine$Title <- sapply(combine$Name, FUN = function(x){strsplit(x, split = "[,.]")[[1]][2]})
combine$Title <- sub(" ", "", combine$Title)
combine$Title[combine$Title %in% c("Mme", "Mlle")] <- "Mlle"
combine$Title[combine$Title %in% c('Capt', 'Don', 'Major', 'Sir')] <- 'Sir'
combine$Title[combine$Title %in% c('Dona', 'Lady', 'the Countess', 'Jonkheer')] <- 'Lady'
combine$Title <- as.factor(combine$Title)
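A quick sketch to inspect the resulting Title groups and how they line up with Sex:
table(combine$Title, combine$Sex)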
combine$Surname <- sapply(combine$Name, FUN=function(x){strsplit(x, split="[,.]")[[1]][1]})
combine$FamilyId <- paste(combine$FamilySize, combine$Surname, sep = "")
combine$FamilyId[combine$FamilySize <= 2] <- "Small"
famid <- data.frame(table(combine$FamilyId))
famid <- famid[famid$Freq <= 2,]
combine$FamilyId[combine$FamilyId %in% famid$Var1] <- "Small"
combine$FamilyId <- as.factor(combine$FamilyId)
##Build model
First, run randomForest once.
combine$FamilyId2 <- combine$FamilyId
combine$FamilyId2 <- as.character(combine$FamilyId2)
combine$FamilyId2[combine$FamilySize <= 3] <- "Small"
combine$FamilyId2 <- as.factor(combine$FamilyId2)
Because randomForest allows at most 32 levels for a categorical predictor, we manually reduce FamilyId (61 levels) to FamilyId2 (22 levels).
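A minimal check of the factor sizes before and after the reduction:
nlevels(combine$FamilyId)     # exceeds randomForest's 32-level limit
nlevels(combine$FamilyId2)    # reduced level count that randomForest can handle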
rffit <- randomForest(as.factor(Survived) ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked + FamilySize + FamilyId2 + Title, data = combine[1:891,], importance = TRUE, ntree = 2000)
varImpPlot(rffit)
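To gauge performance without a separate validation set, a sketch that prints the out-of-bag (OOB) error estimate:
print(rffit)    # OOB error estimate and confusion matrix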
● We drop the less important variables "SibSp", "Parch", "Embarked" and "FamilySize", then predict with conditional inference trees (the model structure is similar to a decision tree; the difference lies in how splits are decided: not by maximizing purity, but by evaluating candidate splits with statistical tests).
cffit <- cforest(as.factor(Survived) ~ Pclass + Sex + Age + Fare + Title + FamilyId, data = combine[1:891,], controls = cforest_unbiased(ntree = 2000, mtry = 3))
prediction <- predict(cffit, combine[892:1309,], OOB = TRUE, type = "response")
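Finally, a minimal sketch for writing a Kaggle submission file (the file name here is just an example; the required columns are PassengerId and Survived):
submission <- data.frame(PassengerId = combine$PassengerId[892:1309], Survived = prediction)
write.csv(submission, "cforest_submission.csv", row.names = FALSE)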