본문 바로가기

머신러닝

파이썬_캐글(kaggle) 의사결정나무(decision tree)를 활용한 타이타닉 생존 예측

 

 

 

In [1]:
# pip install plotly
# pip install cufflinks
# pip install chart_studio
In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import chart_studio.plotly as py
import cufflinks as cf
cf.go_offline(connected=True)
 
In [3]:
# Load the Kaggle Titanic training set. Pclass is read as a string so it
# stays an object column instead of being parsed as a number.
train_csv = 'C:\\Users\\flyto\\Documents\\me\\kaggle\\titanic\\train.csv'
train = pd.read_csv(train_csv, dtype={'Pclass': str})
In [4]:
# Preview the first rows of the freshly loaded training data.
train.head()
Out[4]:
  PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
In [5]:
# Load the Kaggle Titanic test set with the same Pclass-as-string setting
# used for the training data, then preview it.
test_csv = 'C:\\Users\\flyto\\Documents\\me\\kaggle\\titanic\\test.csv'
test = pd.read_csv(test_csv, dtype={'Pclass': str})
test.head()
Out[5]:
  PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 892 3 Kelly, Mr. James male 34.5 0 0 330911 7.8292 NaN Q
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0 1 0 363272 7.0000 NaN S
2 894 2 Myles, Mr. Thomas Francis male 62.0 0 0 240276 9.6875 NaN Q
3 895 3 Wirz, Mr. Albert male 27.0 0 0 315154 8.6625 NaN S
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0 1 1 3101298 12.2875 NaN S
In [6]:
# List each column's dtype, sorted so columns of the same type appear together.
train.dtypes.sort_values()
Out[6]:
PassengerId      int64
Survived         int64
SibSp            int64
Parch            int64
Age            float64
Fare           float64
Pclass          object
Name            object
Sex             object
Ticket          object
Cabin           object
Embarked        object
dtype: object
In [7]:
# Preview only the object-typed (string) columns of the training data.
train.select_dtypes(include='object').head()
Out[7]:
  Pclass Name Sex Ticket Cabin Embarked
0 3 Braund, Mr. Owen Harris male A/5 21171 NaN S
1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female PC 17599 C85 C
2 3 Heikkinen, Miss. Laina female STON/O2. 3101282 NaN S
3 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 113803 C123 S
4 3 Allen, Mr. William Henry male 373450 NaN S
 

## NULL값(결측값) 처리

In [8]:
# Count nulls per training column and show only the columns that have any.
train_missing = train.isnull().sum()
train_missing[train_missing > 0]
Out[8]:
Age         177
Cabin       687
Embarked      2
dtype: int64
In [9]:
# Same null summary for the test set: per-column counts, non-zero only.
test_null_counts = test.isnull().sum()
test_null_counts[test_null_counts > 0]
Out[9]:
Age       86
Fare       1
Cabin    327
dtype: int64
In [10]:
# Replace missing cabin entries with an explicit "unknown" label in both sets.
for frame in (train, test):
    frame['Cabin'] = frame['Cabin'].fillna("unknown")
In [11]:
# Fill the missing embarkation ports with the most frequent value (mode).
most_common_port = train['Embarked'].mode()[0]
train['Embarked'] = train['Embarked'].fillna(most_common_port)
In [12]:
# The test set's missing fare is filled with the mean fare of the training set.
train_mean_fare = train['Fare'].mean()
test['Fare'] = test['Fare'].fillna(train_mean_fare)
In [13]:
def _title_of(full_name):
    """Return the honorific from a 'Surname, Title. Given names' string."""
    # Everything before the first '.' is "Surname, Title"; the title is the
    # part after the comma.
    surname_and_title = full_name.split('.')[0]
    return surname_and_title.split(',')[1].strip()


train['title'] = train['Name'].apply(_title_of)
test['title'] = test['Name'].apply(_title_of)
In [14]:
# Mean age for every (raw title, sex) combination in the training data.
train.groupby(['title', 'Sex'])['Age'].mean()
Out[14]:
title         Sex   
Capt          male      70.000000
Col           male      58.000000
Don           male      40.000000
Dr            female    49.000000
              male      40.600000
Jonkheer      male      38.000000
Lady          female    48.000000
Major         male      48.500000
Master        male       4.574167
Miss          female    21.773973
Mlle          female    24.000000
Mme           female    24.000000
Mr            male      32.368090
Mrs           female    35.898148
Ms            female    28.000000
Rev           male      43.166667
Sir           male      49.000000
the Countess  female    33.000000
Name: Age, dtype: float64
In [15]:
#test.groupby(['title','Sex']).Age.mean()
In [16]:
# Raw honorifics grouped under the coarser title each one is mapped to.
_title_groups = {
    "Officer": ["Capt", "Col", "Major", "Dr", "Rev"],
    "Royalty": ["Jonkheer", "Don", "Sir", "the Countess", "Dona", "Lady"],
    "Mrs": ["Mme", "Ms", "Mrs"],
    "Miss": ["Mlle", "Miss"],
    "Mr": ["Mr"],
    "Master": ["Master"],
}

# Flatten into the raw-title -> grouped-title lookup used by Series.map.
newtitles = {
    raw_title: grouped_title
    for grouped_title, raw_titles in _title_groups.items()
    for raw_title in raw_titles
}

# Apply the same grouping to both data sets.
for frame in (train, test):
    frame['title'] = frame['title'].map(newtitles)
In [17]:
# Mean age per (grouped title, sex); these are the values used to impute missing ages.
train.groupby(['title', 'Sex'])['Age'].mean()
Out[17]:
title    Sex   
Master   male       4.574167
Miss     female    21.804054
Mr       male      32.368090
Mrs      female    35.718182
Officer  female    49.000000
         male      46.562500
Royalty  female    40.500000
         male      42.333333
Name: Age, dtype: float64
In [18]:
# Mean training-set age per (title, Sex) group, rounded to two decimals.
_MEAN_AGE_BY_TITLE_SEX = {
    ('Master', 'male'): 4.57,
    ('Miss', 'female'): 21.8,
    ('Mr', 'male'): 32.37,
    ('Mrs', 'female'): 35.72,
    ('Officer', 'female'): 49,
    ('Officer', 'male'): 46.56,
    ('Royalty', 'female'): 40.50,
}
# Returned for every other combination (this is the Royalty/male mean).
_FALLBACK_AGE = 42.33


def newage(cols):
    """Return the passenger's age, imputing a group mean when it is missing.

    Args:
        cols: Exactly three values in the order (title, Sex, Age), e.g. a row
            of ``df[['title', 'Sex', 'Age']]`` handed over by
            ``DataFrame.apply(..., axis=1)``, or a plain tuple/list.

    Returns:
        The given Age when it is not null; otherwise the mean age stored for
        the (title, Sex) pair, or the fallback value when the pair is not in
        the table.
    """
    # Unpack by iteration instead of cols[0]/cols[1]/cols[2]: integer keys on
    # a label-indexed Series are a deprecated positional fallback in pandas,
    # whereas iterating yields the values in column order on any version.
    title, Sex, Age = cols
    if not pd.isnull(Age):
        return Age
    return _MEAN_AGE_BY_TITLE_SEX.get((title, Sex), _FALLBACK_AGE)
In [19]:
# Fill the missing training ages row by row via newage(), then list any
# columns that still contain nulls.
train['Age'] = train[['title', 'Sex', 'Age']].apply(newage, axis=1)
remaining_nulls = train.isnull().sum()
remaining_nulls[remaining_nulls > 0]
Out[19]:
Series([], dtype: int64)
In [20]:
# Fill the missing test ages with the same title/sex lookup used for train.
test['Age'] = test[['title', 'Sex', 'Age']].apply(newage, axis=1)
# Mask with the test frame's own null counts: train has no nulls left at this
# point, so a train-based mask would always come back empty.
test_remaining_nulls = test.isnull().sum()
test_remaining_nulls[test_remaining_nulls > 0]
Out[20]:
Series([], dtype: int64)
In [21]:
# Show the column names now present in the training frame.
train.columns
Out[21]:
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'title'],
      dtype='object')
In [22]:
import seaborn as sns

# One bar chart of Survived against each listed feature, laid out on a 3x3 grid.
plt.figure(figsize=[16, 9])

index = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']

for position, column in enumerate(index, start=1):
    plt.subplot(3, 3, position)
    # x/y are passed by keyword because newer seaborn releases no longer
    # accept them positionally; the keyword form works on older ones too.
    sns.barplot(x=column, y='Survived', data=train)

# Call show() once after all subplots are drawn; a bare `plt.show` only
# references the function without running it.
plt.show()
 
In [23]:
# Overlay the age histograms of survivors (green) and non-survivors (orange).
# NOTE(review): distplot is deprecated in newer seaborn releases -- confirm
# the installed version before upgrading.
survivor_ages = train.loc[train['Survived'] == 1, 'Age']
non_survivor_ages = train.loc[train['Survived'] == 0, 'Age']
sns.distplot(survivor_ages, color='green', kde=False)
sns.distplot(non_survivor_ages, color='orange', kde=False)
Out[23]:
<matplotlib.axes._subplots.AxesSubplot at 0x280e2fc79a0>
 
In [24]:
# Overlay the fare histograms of survivors (green) and non-survivors (orange).
# NOTE(review): distplot is deprecated in newer seaborn releases -- confirm
# the installed version before upgrading.
survivor_fares = train.loc[train['Survived'] == 1, 'Fare']
non_survivor_fares = train.loc[train['Survived'] == 0, 'Fare']
sns.distplot(survivor_fares, color='green', kde=False)
sns.distplot(non_survivor_fares, color='orange', kde=False)
Out[24]:
<matplotlib.axes._subplots.AxesSubplot at 0x280e71b4d00>
 
In [25]:
# Columns not used as model inputs. The list is defined once (with 'Ticket'
# listed a single time) so train and test always lose the same columns.
unused_columns = ['PassengerId', 'Name', 'Ticket', 'SibSp', 'Parch', 'Cabin']
train.drop(columns=unused_columns, inplace=True)
test.drop(columns=unused_columns, inplace=True)
In [26]:
# One-hot encode the remaining object columns of the test features.
# NOTE(review): train and test are encoded separately; their dummy columns
# only line up if both contain the same category values -- confirm.
test=pd.get_dummies(test)
In [27]:
# One-hot encode the remaining object columns of the training data.
# NOTE(review): the test frame is encoded in a separate call; the resulting
# column sets match only if both hold the same category values -- confirm.
train=pd.get_dummies(train)
In [28]:
# Preview the test features after one-hot encoding.
test.head()
Out[28]:
  Age Fare Pclass_1 Pclass_2 Pclass_3 Sex_female Sex_male Embarked_C Embarked_Q Embarked_S title_Master title_Miss title_Mr title_Mrs title_Officer title_Royalty
0 34.5 7.8292 0 0 1 0 1 0 1 0 0 0 1 0 0 0
1 47.0 7.0000 0 0 1 1 0 0 0 1 0 0 0 1 0 0
2 62.0 9.6875 0 1 0 0 1 0 1 0 0 0 1 0 0 0
3 27.0 8.6625 0 0 1 0 1 0 0 1 0 0 1 0 0 0
4 22.0 12.2875 0 0 1 1 0 0 0 1 0 0 0 1 0 0
In [29]:
# Preview the training data after one-hot encoding.
train.head()
Out[29]:
  Survived Age Fare Pclass_1 Pclass_2 Pclass_3 Sex_female Sex_male Embarked_C Embarked_Q Embarked_S title_Master title_Miss title_Mr title_Mrs title_Officer title_Royalty
0 0 22.0 7.2500 0 0 1 0 1 0 0 1 0 0 1 0 0 0
1 1 38.0 71.2833 1 0 0 1 0 1 0 0 0 0 0 1 0 0
2 1 26.0 7.9250 0 0 1 1 0 0 0 1 0 1 0 0 0 0
3 1 35.0 53.1000 1 0 0 1 0 0 0 1 0 0 0 1 0 0
4 0 35.0 8.0500 0 0 1 0 1 0 0 1 0 0 1 0 0 0
In [30]:
# Cast the target column to an integer type and show the resulting dtype.
train['Survived'] = train['Survived'].astype('int')
train['Survived'].dtype
Out[30]:
dtype('int32')
In [31]:
# Separate the feature matrix from the target column.
xtrain = train.drop("Survived", axis=1)
ytrain = train['Survived']
# train and test were one-hot encoded separately, so give the test matrix
# exactly the training columns in the same order (a dummy column that is
# absent from test is filled with 0).
xtest = test.reindex(columns=xtrain.columns, fill_value=0)
In [32]:
# Decision tree classifier
from sklearn.tree import DecisionTreeClassifier

# A fixed random_state keeps the fitted tree reproducible between runs.
dt_clf = DecisionTreeClassifier(random_state=11)
# fit() returns the estimator itself, so dt_model and dt_clf are the same object.
dt_model = dt_clf.fit(X=xtrain, y=ytrain)
# Predicted labels for the test features.
dt_pred = dt_model.predict(X=xtest)
In [33]:
## Cross-validation
from sklearn.model_selection import cross_val_score

# Mean accuracy of the decision tree over 10 folds of the training data.
scores = cross_val_score(estimator=dt_clf, X=xtrain, y=ytrain, scoring='accuracy', cv=10)
scores.mean()
Out[33]:
0.7879525593008738
In [34]:
## Cross-validation
from sklearn.model_selection import cross_val_score

# Same accuracy check with 50 folds instead of 10.
fold_count = 50
scores = cross_val_score(dt_clf, xtrain, ytrain, cv=fold_count, scoring='accuracy')
np.mean(scores)
Out[34]:
0.8005882352941175
In [36]:
# Re-read the raw test file: PassengerId was dropped from `test` earlier and
# is needed again for the submission file.
test_submit_csv = 'C:\\Users\\flyto\\Documents\\me\\kaggle\\titanic\\test.csv'
test_submit = pd.read_csv(test_submit_csv, dtype={'Pclass': str})
test_submit.head()
Out[36]:
  PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 892 3 Kelly, Mr. James male 34.5 0 0 330911 7.8292 NaN Q
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0 1 0 363272 7.0000 NaN S
2 894 2 Myles, Mr. Thomas Francis male 62.0 0 0 240276 9.6875 NaN Q
3 895 3 Wirz, Mr. Albert male 27.0 0 0 315154 8.6625 NaN S
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0 1 1 3101298 12.2875 NaN S
In [37]:
# Pair every test PassengerId with its predicted Survived label.
output = test_submit[['PassengerId']].assign(Survived=dt_pred)
output.head()
Out[37]:
  PassengerId Survived
0 892 0
1 893 0
2 894 1
3 895 1
4 896 0
In [ ]:
# Write the predictions to a CSV file in the working directory; index=False
# leaves the DataFrame index out of the file.
output.to_csv('submission_decisiontree.csv', index=False)