Titanic資料分析 #4

##Read files

path_1 <- "D:/R language/Kaggle/Titanic/train.csv "

train <- read.csv(path_1)

path_2 <- "D:/R language/Kaggle/Titanic/test.csv"

test <- read.csv(path_2)


##Install packages

install.packages("DMwR")
library(DMwR)
install.packages("randomForest")
library(randomForest)
install.packages("party")
library(party)


##Data explore

test$Survived <- NA
combine <- rbind(train, test)


str(combine)

'data.frame': 1309 obs. of  12 variables:
 $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
 $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
 $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
 $ Name       : Factor w/ 1307 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
 $ Sex        : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
 $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
 $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
 $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
 $ Ticket     : Factor w/ 929 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
 $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
 $ Cabin      : Factor w/ 187 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
 $ Embarked   : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
● 12個變數,1309筆data (891筆作為train, 418筆作為test)

summary(combine)
● NA 'S
Age : 263
Fare : 1
Embarked : 空白2

< Pclass >

ggplot(train, aes(x=Pclass))+geom_bar(aes(fill=as.factor(Survived)))


● 可看出 Pclass = 1 時存活率較高
             Pclass = 2 時存活率大約50%
             Pclass = 3 時死亡比率遠超過存活的比例
● Pclass為影響生存的變數之一

< Sex >

ggplot(train, aes(x=Sex))+geom_bar(aes(fill=as.factor(Survived)))



sex_table <- table(train$Survived,train$Sex)

           female male
        0     81     468
        1    233    109

prop.table(sex_table,2)

            female      male
    0 0.2579618  0.8110919
    1 0.7420382  0.1889081

● Sex為影響Survived的重要變數

< Age >

age_plot <- train[complete.cases(train$Age),]

age_plot$age_level[age_plot$Age >= 60] <- "old"
age_plot$age_level[age_plot$Age >=40 & age_plot$Age<60] <- "adult"
age_plot$age_level[age_plot$Age >= 18 & age_plot$Age< 40] <- "young"
age_plot$age_level[age_plot$Age < 18] <- "children"
ggplot(age_plot,aes(x=age_level))+geom_bar(aes(fill=as.factor(Survived)),position="fill")+xlab("Age_level")

● 不同年齡層生存機率也不同,年紀較小(<18歲)者存活率有機會超過50%,老人(>=60歲)低至25%,表Age也是影響Survived的重要變數之一

##Preprocessing

(一)處理 missing value

< Fare >

combine[is.na(combine$Fare),]
PassengerId Pclass     Name                         Sex      Age      SibSp     Parch     Ticket       
      1044          3        Storey, Mr. Thomas    male    60.5         0             0         3701              
    Fare    Cabin    Embarked    Survived
     NA                       S                 NA
summary(combine[Embarked== "S" & Pclass==3,Fare,])
   Min.   1st Qu.   Median     Mean    3rd Qu.    Max.      NA's 
  0.000    7.854     8.050      14.440    15.900    69.550       1 
fare_plot<- combine[combine$Pclass==3 & combine$Embarked=="S",]
ggplot(fare_plot,aes(x=Fare))+geom_density(fill="blue")
● 因mean、Q3非常接近,Max值明顯較大,表示為右偏的分佈圖,選擇填補median
combine$Fare[is.na(combine$Fare)] <- 8.05


< Embarked >

combine[combine$Embarked=="",]
 PassengerId  Survived  Pclass          Name                                                    Sex
      62                  1             1         Icard, Miss. Amelie                                 female
     830                 1             1Stone, Mrs. George Nelson (Martha Evelyn)   female
 Age    SibSp    Parch     Ticket     Fare     Cabin     Embarked
   38        0            0       113572       80        B28  
   62        0            0       113572       80        B28

ggplot(combine,aes(x=Embarked,y=Fare))+geom_boxplot(aes(fill=as.factor(Pclass)))+geom_hline(aes(yintercept=80,color="red")
train$Embarked[train$Embarked==""] <- "C"


< Age >

選擇使用 Knn 填補 Age
copy_comb <- knnImputation(combine)

combine$Age <- copy_comb$Age


(二) 尋找隱藏 features


combine$FamilySize <- combine$SibSp+combine$Parch+1 
ggplot( combine[1:891,],  aes( x = FamilySize)) + geom_bar( aes( fill = factor( Survived)), stat= 'count', position = 'dodge') 
combine$Name <- as.character(combine$Name)
combine$Title <- sapply(combine$Name, FUN = function(x){strsplit(x, split = "[,.]")[[1]][2]}) combine$Title <- sub(" ","",combine$Title) combine$Title[combine$Title %in% c("Mme","Mlle")] <- "Mlle" combine$Title[combine$Title %in% c('Capt', 'Don', 'Major', 'Sir')] <- 'Sir' combine$Title[combine$Title %in% c('Dona', 'Lady', 'the Countess', 'Jonkheer')] <- 'Lady' combine$Title <- as.factor(combine$Title)
combine$Surname <- sapply(combine$Name, FUN=function(x){strsplit(x, split="[,.]")[[1]][1]})
combine$FamilyId <- paste(combine$FamilySize, combine$Surname,sep = "") combine$FamilyId[combine$FamilySize <=2] <- "Small" famid <- data.frame(table(combine$FamilyId)) famid <- famid[famid$Freq<=2,] combine$FamilyId[combine$FamilyId %in% famid$Var1] <- "Small" combine$FamilyId <- as.factor(combine$FamilyId)

##Build model

先用 randomForest 跑一次

combine$FamilyId2 <- combine$FamilyId 

combine$FamilyId2 <- as.character(combine$FamilyId2)

combine$FamilyId2[combine$FamilySize <=3] <- "Small"

combine$FamilyId2 <- as.factor(combine$FamilyId2)


因為 randomForest 限制 predictors 最高32 levels,手動降低 FamilyId(61 levels)成FamilyId(22 levels)

rffit <- randomForest(as.factor(Survived) ~ Pclass+Sex+Age+SibSp+Parch+Fare+Embarked+

FamilySize+FamilyId2+Title, combine[1:891,], importance=TRUE, ntree=2000)

varImpPlot(rffit)


● 我們刪除不重要的變數"SibSp"、"Parch"、"Embarked"、"FamilySize",再用 conditional inference trees 作預測 ( 模型架構與tree類似,差別在於「決策衡量」方式,非以purity為目標,而是使用statistical test進行評估 )

cffit <- cforest(as.factor(Survived) ~ Pclass+Sex+Age+Fare+Title+FamilyId, train, controls = cforest_unbiased(ntree=2000, mtry=3))

prediction <- predict(cffit, test, OOB = TRUE, type="response")
























   
        




























留言

這個網誌中的熱門文章

填補遺漏值(Missing Value)方法

ggplot2 繪圖套件

分類演算法