Titanic資料分析 #4

4月 23, 2017

##Read files

path_1 <- "D:/R language/Kaggle/Titanic/train.csv "

train <- read.csv(path_1)

path_2 <- "D:/R language/Kaggle/Titanic/test.csv"

test <- read.csv(path_2)

##Install packages

install.packages("DMwR")

library(DMwR)

install.packages("randomForest")

library(randomForest)

install.packages("party")

library(party)

##Data explore

test$Survived <- NA
combine <- rbind(train, test)


str(combine)

'data.frame': 1309 obs. of 12 variables:
$ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
$ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
$ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
$ Name : Factor w/ 1307 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
$ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
$ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
$ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
$ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
$ Ticket : Factor w/ 929 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
$ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
$ Cabin : Factor w/ 187 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
$ Embarked : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
● 12個變數，1309筆data (891筆作為train, 418筆作為test)

summary(combine)

● NA 'S
Age : 263
Fare : 1
Embarked : 空白2

< Pclass >

ggplot(train, aes(x=Pclass))+geom_bar(aes(fill=as.factor(Survived)))

● 可看出 Pclass = 1 時存活率較高
Pclass = 2 時存活率大約50%
Pclass = 3 時死亡比率遠超過存活的比例
● Pclass為影響生存的變數之一

< Sex >

ggplot(train, aes(x=Sex))+geom_bar(aes(fill=as.factor(Survived)))

sex_table <- table(train$Survived,train$Sex)

female male
0 81 468
1 233 109

prop.table(sex_table,2)

female male
0 0.2579618 0.8110919
1 0.7420382 0.1889081

● Sex為影響Survived的重要變數

< Age >

age_plot <- train[complete.cases(train$Age),]



age_plot$age_level[age_plot$Age >= 60] <- "old"

age_plot$age_level[age_plot$Age >=40 & age_plot$Age<60] <- "adult"

age_plot$age_level[age_plot$Age >= 18 & age_plot$Age< 40] <- "young"

age_plot$age_level[age_plot$Age < 18] <- "children"



ggplot(age_plot,aes(x=age_level))+geom_bar(aes(fill=as.factor(Survived)),position="fill")+xlab("Age_level")

● 不同年齡層生存機率也不同，年紀較小(<18歲)者存活率有機會超過50%，老人(>=60歲)低至25%，表Age也是影響Survived的重要變數之一

##Preprocessing

(一)處理 missing value

< Fare >

combine[is.na(combine$Fare),]

PassengerId Pclass Name Sex Age SibSp Parch Ticket

1044 3 Storey, Mr. Thomas male 60.5 0 0 3701

Fare Cabin Embarked Survived

NA S NA

summary(combine[Embarked== "S" & Pclass==3,Fare,])

Min. 1st Qu. Median Mean 3rd Qu. Max. NA's

0.000 7.854 8.050 14.440 15.900 69.550 1

fare_plot<- combine[combine$Pclass==3 & combine$Embarked=="S",]
ggplot(fare_plot,aes(x=Fare))+geom_density(fill="blue")

● 因mean、Q3非常接近，Max值明顯較大，表示為右偏的分佈圖，選擇填補median

combine$Fare[is.na(combine$Fare)] <- 8.05

< Embarked >

combine[combine$Embarked=="",]

PassengerId Survived Pclass Name Sex
62 1 1 Icard, Miss. Amelie female
830 1 1Stone, Mrs. George Nelson (Martha Evelyn) female
Age SibSp Parch Ticket Fare Cabin Embarked
38 0 0 113572 80 B28
62 0 0 113572 80 B28

ggplot(combine,aes(x=Embarked,y=Fare))+geom_boxplot(aes(fill=as.factor(Pclass)))+geom_hline(aes(yintercept=80,color="red")

train$Embarked[train$Embarked==""] <- "C"

< Age >

選擇使用 Knn 填補 Age

copy_comb <- knnImputation(combine)

combine$Age <- copy_comb$Age

(二) 尋找隱藏 features

combine$FamilySize <- combine$SibSp+combine$Parch+1

ggplot( combine[1:891,],  aes( x = FamilySize)) + geom_bar( aes( fill = factor( Survived)), stat= 'count', position = 'dodge')

combine$Name <- as.character(combine$Name)

combine$Title <- sapply(combine$Name, FUN = function(x){strsplit(x, split = "[,.]")[[1]][2]})
combine$Title <- sub(" ","",combine$Title)
combine$Title[combine$Title %in% c("Mme","Mlle")] <- "Mlle"
combine$Title[combine$Title %in% c('Capt', 'Don', 'Major', 'Sir')] <- 'Sir'
combine$Title[combine$Title %in% c('Dona', 'Lady', 'the Countess', 'Jonkheer')] <- 'Lady'
combine$Title <- as.factor(combine$Title)


combine$Surname <- sapply(combine$Name, FUN=function(x){strsplit(x, split="[,.]")[[1]][1]})


combine$FamilyId <- paste(combine$FamilySize, combine$Surname,sep = "")
combine$FamilyId[combine$FamilySize <=2] <- "Small"
famid <- data.frame(table(combine$FamilyId))
famid <- famid[famid$Freq<=2,]
combine$FamilyId[combine$FamilyId %in% famid$Var1] <- "Small"
combine$FamilyId <- as.factor(combine$FamilyId)

##Build model

先用 randomForest 跑一次

combine$FamilyId2 <- combine$FamilyId 

combine$FamilyId2 <- as.character(combine$FamilyId2)

combine$FamilyId2[combine$FamilySize <=3] <- "Small"

combine$FamilyId2 <- as.factor(combine$FamilyId2)

因為 randomForest 限制 predictors 最高32 levels，手動降低 FamilyId(61 levels)成FamilyId(22 levels)

rffit <- randomForest(as.factor(Survived) ~ Pclass+Sex+Age+SibSp+Parch+Fare+Embarked+

FamilySize+FamilyId2+Title, combine[1:891,], importance=TRUE, ntree=2000)

varImpPlot(rffit)

● 我們刪除不重要的變數"SibSp"、"Parch"、"Embarked"、"FamilySize"，再用 conditional inference trees 作預測 ( 模型架構與tree類似，差別在於「決策衡量」方式，非以purity為目標，而是使用statistical test進行評估 )

cffit <- cforest(as.factor(Survived) ~ Pclass+Sex+Age+Fare+Title+FamilyId, train, controls = cforest_unbiased(ntree=2000, mtry=3))



prediction <- predict(cffit, test, OOB = TRUE, type="response")

搜尋此網誌

R 自學筆記