In [1]:
# pip install plotly
# pip install cufflinks
# pip install chart_studio
In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import chart_studio.plotly as py
import cufflinks as cf
cf.go_offline(connected=True)
In [3]:
#train=pd.read_csv('D:\\me\\mine\\python\\titanic\\train.csv',dtype={'Pclass':str})
# Read the training CSV. Pclass is forced to str instead of the dtype pandas would infer.
train_csv = 'C:\\Users\\flyto\\Documents\\me\\kaggle\\titanic\\train.csv'
train = pd.read_csv(train_csv, dtype={'Pclass': str})
In [4]:
# Preview the first rows of the freshly loaded training frame.
train.head()
Out[4]:
In [5]:
#test=pd.read_csv('D:\\me\\mine\\python\\titanic\\test.csv',dtype={'Pclass':str})
# Read the test CSV with the same Pclass-as-str override used for train, then preview it.
test_csv = 'C:\\Users\\flyto\\Documents\\me\\kaggle\\titanic\\test.csv'
test = pd.read_csv(test_csv, dtype={'Pclass': str})
test.head()
Out[5]:
In [6]:
# Column dtypes of train, sorted so columns of the same dtype are listed together.
train.dtypes.sort_values()
Out[6]:
In [7]:
# Preview only the object-dtype (string-like) columns of train.
train.select_dtypes(include='object').head()
Out[7]:
## NULL값 제거
In [8]:
# Missing-value count per train column, showing only columns that have at least one null.
train_nulls = train.isnull().sum()
train_nulls[train_nulls > 0]
Out[8]:
In [9]:
# Missing-value count per test column, showing only columns that have at least one null.
test_nulls = test.isnull().sum()
test_nulls[test_nulls > 0]
Out[9]:
In [10]:
# Missing Cabin values become the literal placeholder "unknown" in both frames.
for frame in (train, test):
    frame['Cabin'] = frame['Cabin'].fillna("unknown")
In [11]:
# Fill missing Embarked values in train with the column's most frequent value (mode).
embarked_mode = train['Embarked'].mode()[0]
train['Embarked'] = train['Embarked'].fillna(embarked_mode)
In [12]:
# Missing test fares are replaced with the mean fare computed on train.
mean_train_fare = train['Fare'].mean()
test['Fare'] = test['Fare'].fillna(mean_train_fare)
In [13]:
def _title_from_name(name):
    """Extract the title token from a passenger name.

    Takes the text before the first '.', splits it on ',' and returns the
    second piece with surrounding whitespace stripped.
    """
    before_period = name.split('.')[0]
    return before_period.split(',')[1].strip()


# Derive a 'title' column from Name in both frames using the same rule.
train['title'] = train['Name'].apply(_title_from_name)
test['title'] = test['Name'].apply(_title_from_name)
In [14]:
# Mean Age in train for every (title, Sex) pair, using the raw extracted titles.
train.groupby(['title','Sex']).Age.mean()
Out[14]:
In [15]:
#test.groupby(['title','Sex']).Age.mean()
In [16]:
# Collapse the raw titles into six groups. The mapping is written group-first and
# flattened into the raw-title -> group dict that Series.map expects.
newtitles = {
    raw_title: group
    for group, raw_titles in {
        "Officer": ["Capt", "Col", "Major", "Dr", "Rev"],
        "Royalty": ["Jonkheer", "Don", "Sir", "the Countess", "Dona", "Lady"],
        "Mrs": ["Mme", "Ms", "Mrs"],
        "Miss": ["Mlle", "Miss"],
        "Mr": ["Mr"],
        "Master": ["Master"],
    }.items()
    for raw_title in raw_titles
}

# Titles absent from the mapping come out of .map() as NaN.
train['title'] = train['title'].map(newtitles)
test['title'] = test['title'].map(newtitles)
In [17]:
# Mean Age in train for every (title, Sex) pair after the titles were consolidated.
# NOTE(review): the constants hard-coded in newage presumably come from this output — confirm.
train.groupby(['title','Sex']).Age.mean()
Out[17]:
In [18]:
def newage(cols):
    """Return the passenger's age, imputing it when missing.

    Parameters
    ----------
    cols : sequence or pandas.Series
        Row values in the order (title, Sex, Age); any further items are ignored.

    Returns
    -------
    The original Age when it is not null; otherwise a fixed fallback age chosen
    by the (title, Sex) pair, or 42.33 when the pair is not in the table.
    """
    # Unpack by iteration rather than cols[0]/cols[1]/cols[2]: integer keys on a
    # string-labelled Series rely on a positional fallback that pandas has deprecated.
    title, sex, age, *_ = cols

    if not pd.isnull(age):
        return age

    fallback_age = {
        ('Master', 'male'): 4.57,
        ('Miss', 'female'): 21.8,
        ('Mr', 'male'): 32.37,
        ('Mrs', 'female'): 35.72,
        ('Officer', 'female'): 49,
        ('Officer', 'male'): 46.56,
        ('Royalty', 'female'): 40.50,
    }
    # Any other combination (including a NaN title) takes the default.
    return fallback_age.get((title, sex), 42.33)
In [19]:
# Impute missing train ages row by row with newage, then list columns that still contain nulls.
train['Age'] = train[['title', 'Sex', 'Age']].apply(newage, axis=1)
train_remaining_nulls = train.isnull().sum()
train_remaining_nulls[train_remaining_nulls > 0]
Out[19]:
In [20]:
# Impute missing test ages row by row with newage.
test.Age = test[['title', 'Sex', 'Age']].apply(newage, axis=1)
# List test columns that still contain nulls. The mask must be built from test itself:
# a mask built from train is all False once train is clean, which hides any nulls left in test.
test_remaining_nulls = test.isnull().sum()
test_remaining_nulls[test_remaining_nulls > 0]
Out[20]:
In [21]:
# Show the current column labels of train.
train.columns
Out[21]:
In [22]:
import seaborn as sns

# One bar plot of Survived against each listed column, laid out on a 3x3 grid.
plt.figure(figsize=[16,9])
index=['Pclass', 'Sex', 'Age', 'SibSp','Parch','Ticket','Fare','Cabin', 'Embarked']
for i, n in enumerate(index, start=1):
    plt.subplot(3, 3, i)
    # x/y are passed by keyword: recent seaborn releases reject them as positional arguments.
    sns.barplot(x=n, y='Survived', data=train)
# The original `plt.show` only referenced the function; it has to be called.
plt.show()
In [23]:
# Overlay Age histograms (no KDE curve): survivors in green, non-survivors in orange.
# NOTE(review): distplot is deprecated in newer seaborn in favour of histplot — confirm the installed version.
age_survived = train.loc[train.Survived == 1, 'Age']
age_died = train.loc[train.Survived == 0, 'Age']
sns.distplot(age_survived, color='green', kde=False)
sns.distplot(age_died, color='orange', kde=False)
Out[23]:
In [24]:
# Overlay Fare histograms (no KDE curve): survivors in green, non-survivors in orange.
# NOTE(review): distplot is deprecated in newer seaborn in favour of histplot — confirm the installed version.
fare_survived = train.loc[train.Survived == 1, 'Fare']
fare_died = train.loc[train.Survived == 0, 'Fare']
sns.distplot(fare_survived, color='green', kde=False)
sns.distplot(fare_died, color='orange', kde=False)
Out[24]:
In [25]:
# Columns removed before modelling. One shared list keeps train and test in sync;
# the original repeated the list per frame and named 'Ticket' twice.
drop_cols = ['PassengerId', 'Name', 'Ticket', 'SibSp', 'Parch', 'Cabin']
# Rebind instead of inplace=True so each frame's lineage stays explicit.
train = train.drop(columns=drop_cols)
test = test.drop(columns=drop_cols)
In [26]:
# One-hot encode the remaining non-numeric columns of test.
# NOTE(review): test and train are encoded independently, so their dummy columns are only
# guaranteed to match if both frames contain the same category values — verify.
test=pd.get_dummies(test)
In [27]:
# One-hot encode the remaining non-numeric columns of train.
train=pd.get_dummies(train)
In [28]:
# Preview test after one-hot encoding.
test.head()
Out[28]:
In [29]:
# Preview train after one-hot encoding.
train.head()
Out[29]:
In [30]:
# Cast the target column to int and display the resulting dtype.
train['Survived'] = train['Survived'].astype('int')
train['Survived'].dtype
Out[30]:
In [31]:
# Feature matrix and target for training.
xtrain = train.drop("Survived", axis=1)
ytrain = train['Survived']
# train and test were one-hot encoded separately, so a category present in only one of
# them would give the two frames different columns. Reindexing test onto the training
# columns adds any missing dummy as 0, drops extras, and puts the columns in the same order.
xtest = test.reindex(columns=xtrain.columns, fill_value=0)
In [32]:
# Decision tree: fit on the training features, then predict labels for the test features.
from sklearn.tree import DecisionTreeClassifier

dt_clf = DecisionTreeClassifier(random_state=11)
dt_model = dt_clf.fit(X=xtrain, y=ytrain)
dt_pred = dt_model.predict(X=xtest)
In [33]:
## Cross-validation: mean accuracy of the decision tree over 10 folds
from sklearn.model_selection import cross_val_score

scores = cross_val_score(estimator=dt_clf, X=xtrain, y=ytrain, cv=10, scoring='accuracy')
scores.mean()
Out[33]:
In [34]:
## Cross-validation: mean accuracy of the decision tree over 50 folds
from sklearn.model_selection import cross_val_score

scores = cross_val_score(estimator=dt_clf, X=xtrain, y=ytrain, cv=50, scoring='accuracy')
scores.mean()
Out[34]:
In [36]:
# Reload the raw test CSV: PassengerId was dropped from `test` earlier and is needed for the submission.
submit_csv = 'C:\\Users\\flyto\\Documents\\me\\kaggle\\titanic\\test.csv'
test_submit = pd.read_csv(submit_csv, dtype={'Pclass': str})
test_submit.head()
Out[36]:
In [37]:
# Submission table: PassengerId from the raw test file alongside the decision-tree predictions.
output = test_submit['PassengerId'].to_frame().assign(Survived=dt_pred)
output.head()
Out[37]:
In [ ]:
# Write the submission to the current working directory, without the DataFrame index column.
output.to_csv('submission_decisiontree.csv', index=False)
'머신러닝' 카테고리의 다른 글
파이썬_쇼핑몰 고객 주문 데이터 분석 (판매 데이터 파악, 데이터 정제) (2) | 2020.09.20 |
---|---|
파이썬_캐글(kaggle) 로지스틱 회귀분석(logistic regression) 활용한 타이타닉 생존 예측 (0) | 2020.09.20 |
파이썬_캐글(kaggle) SVC(Support Vector Classification)활용한 타이타닉 생존 예측 (0) | 2020.09.20 |
파이썬_캐글(kaggle) 랜덤포레스트 활용한 타이타닉 생존 예측 (0) | 2020.09.20 |
파이썬_Titanic 생존여부 예측 (0) | 2020.09.07 |