# Python / 2022-10-12 17:39
#
# Python - decision-tree prediction of Titanic survivors

#!/usr/bin/env python
# coding: utf-8



import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier ,export_graphviz
import graphviz

# --- Exploratory data analysis ---------------------------------------------
# Check whether survival rate differs by sex within each pclass, e.g.:
#   sns.barplot(x='Pclass', y='Survived', hue='Sex', data=df1)

df1 = sns.load_dataset('titanic')

# Inspect which columns hold strings / categoricals (df1.dtypes).
# sex, embarked and deck must be converted to numbers later; list each
# column's distinct values with Series.unique() and map them to integers.
# print(df1.dtypes)
# print(df1.sex.unique)

# Median age per (sex, pclass) group — the basis for the imputation below.
df1.groupby(['sex', 'pclass']).age.agg(['median'])






# Impute missing ages with the median age of the passenger's (sex, pclass)
# group; the groups are disjoint, so each fill is independent.
for sex_val in ('male', 'female'):
    for pclass_val in (1, 2, 3):
        in_group = (df1.sex == sex_val) & (df1.pclass == pclass_val)
        group_median = df1.loc[in_group, 'age'].median()
        df1.loc[in_group & df1.age.isna(), 'age'] = group_median
df1.isna().sum()





# Derive a coarse age-bucket feature.
# Fix: the original `df1.age_new = 0` set an *attribute* on the DataFrame
# (pandas warns about this), not a column; use item assignment instead.
df1['age_new'] = 0
# Elderly survival (>= 50) vs. infant survival (<= 10); everything between
# is 'young'.
df1.loc[df1.age >= 50, 'age_new'] = 'old'
df1.loc[(df1.age < 50) & (df1.age > 10), 'age_new'] = 'young'
df1.loc[df1.age <= 10, 'age_new'] = 'baby'
# NOTE(review): the original filled df1.age here, but age has no NaN left
# after the median imputation above — the intended target was almost
# certainly the new bucket column, so fill age_new instead.
df1.age_new.fillna('N', inplace=True)

df1.isna().sum()
# embarked has a couple of missing values; 'S' is the most common port.
df1.embarked.fillna('S', inplace=True)

# deck is categorical; cast to object so the new 'N' level can be inserted.
df1.deck = df1.deck.astype('object')
df1.deck.fillna('N', inplace=True)


# Encode each categorical column by mapping every distinct value to its
# position in the column's unique() ordering (first-seen order).
for col in ['sex', 'embarked', 'age_new', 'deck']:
    code_map = {value: code for code, value in enumerate(df1[col].unique())}
    df1[col] = df1[col].map(code_map)

# Sanity check: each column should now contain only integer codes.
for col in ['sex', 'embarked', 'age_new', 'deck']:
    print(df1[col].unique())

'''Data encoding.

Two ways to turn strings into numbers: the manual mapping above, or
sklearn's LabelEncoder demonstrated here on a copy of the frame.
'''
df2 = df1.copy()
# fit_transform == fit followed by transform; the fitted encoder object is
# kept in `tiencoder`.
tiencoder = LabelEncoder()
labels = tiencoder.fit_transform(df2['sex'])

print(df2.dtypes)





df1.isna().sum()

# Drop columns that duplicate information already present in other columns
# (e.g. class/alive mirror pclass/survived in the seaborn dataset).
redundant_cols = ['class', 'alive', 'embark_town', 'who',
                  'adult_male', 'deck', 'alone']
df = df1.drop(redundant_cols, axis=1)

# Continue on two independent copies of the reduced frame.
df1 = df.copy()
df2 = df.copy()




# Feature scaling: standardize every column to zero mean / unit variance,
# then verify the resulting distribution per column.
# (Bare REPL leftovers — `df1`, `df1.std()`, `df1.mean()`, `globals()`,
# `df1_fs` — were no-op expression statements in a script and are removed.)
df1_fs = (df1 - df1.mean()) / df1.std()
print('mean: ')
print(df1_fs.mean())
print('std : ')
print(df1_fs.std())

# One distribution plot per standardized column on a 3x3 grid.
# Fix: the original stashed every Axes in globals() as ax1..axN — an
# anti-pattern; a plain local variable is sufficient.
fig = plt.figure(figsize=(10, 10))
for no, col in enumerate(df1_fs.columns):
    ax = fig.add_subplot(3, 3, no + 1)
    sns.distplot(df1_fs[col], ax=ax)  # NOTE: distplot is deprecated in modern seaborn
# plt.show()





from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler


# scaler = StandardScaler()
# scaler.fit()



'''The core machine-learning task is choosing a learning algorithm and
training it on some data.

Bad data: not enough training samples, unrepresentative samples,
low-quality data, irrelevant features.

Bad algorithms: overfitting the training data, underfitting the
training data.
'''

'''Titanic decision tree.'''
# Candidate feature set:
#   pclass, sex, age, sibsp, parch, fare
# (The manual female->1 / male->0 loop is unnecessary — sex was already
# integer-encoded above.)

# Cast the numeric feature columns to float in one pass.
for col in ['pclass', 'sex', 'sibsp', 'parch', 'fare']:
    df1[col] = df1[col].astype('float')

# Fill the remaining holes before encoding.
# NOTE(review): age should have no NaN left at this point, so filling it
# from age_new is likely a no-op — kept for parity with the original flow.
df1.age.fillna(df1.age_new, inplace=True)
df1.sibsp.fillna(df1.sibsp.mean(), inplace=True)
df1.parch.fillna(df1.parch.mean(), inplace=True)
df1.fare.fillna(0, inplace=True)

# Label-encode the model columns into df (the training frame).
# The fitted encoders are published as module globals
# (df1_survived_encoder, ...) because the prediction step later calls
# df1_survived_encoder.inverse_transform(...).
# Fix: the original pre-created seven encoders by hand, then immediately
# overwrote four of them inside this loop (the other three were never
# used) — the loop alone is sufficient.
for col in ['survived', 'age', 'pclass', 'sex']:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df1[col])
    globals()[f'df1_{col}_encoder'] = encoder





# Split features / target and train a shallow decision tree.
X = df.drop('survived', axis=1)
y = df.survived
print('-----------------------')
print(X.head(1))
print('-----------------------')
print(y.head(1))

titanic_dtclf = DecisionTreeClassifier(max_depth=2)
print(titanic_dtclf.fit(X, y))

# Export the tree for graphviz rendering.
# Fix: export_graphviz expects class_names in ascending order of the class
# labels, i.e. index 0 corresponds to survived == 0 (died). The original
# listed ['생존', '사망'] ("survived", "died"), labelling both classes
# backwards; died ('사망') must come first.
export_graphviz(titanic_dtclf, out_file='titanic1.dot',
                feature_names=X.columns,
                class_names=['사망', '생존'])

with open('./titanic1.dot', encoding='UTF8') as f:
    titanic1 = f.read()
graphviz.Source(titanic1)

'''Load the passenger list from a CSV file, then predict survival.'''



df_new = pd.read_csv("./test.csv")
new_df =pd.DataFrame(df_new)  # df_new is already a DataFrame; this re-wrap is redundant
print('-----------------')
print(new_df)
 #PassengerId  Pclass  Name     Sex   Age  SibSp  Parch Ticket Fare Cabin Embarked

# Manually binarize Sex: female -> 1, male -> 0, anything else -> NaN.
tmp = []
for each in new_df['Sex']:
    if each == 'female':
        tmp.append(1)
    elif each == 'male':
        tmp.append(0)
    else:
        tmp.append(np.nan)
new_df.drop(['Name','Ticket','Cabin','Embarked'], axis=1,inplace=True)
new_df['Sex'] = tmp
new_df['Pclass'] = new_df['Pclass'].astype('float')
new_df['Sex'] = new_df['Sex'].astype('float')
# NOTE(review): 'Sibsp' is a typo — this line CREATES a brand-new column
# 'Sibsp' instead of casting 'SibSp'. The extra column coincidentally makes
# new_df's column count line up with the training features, so "fixing" the
# typo would change the matrix fed to predict() below — confirm intent
# before touching it.
new_df['Sibsp'] = new_df['SibSp'].astype('float')
new_df['Parch'] = new_df['Parch'].astype('float')
new_df['Fare'] = new_df['Fare'].astype('float')
new_df['Age'] = new_df['Age'].astype('float')
# NOTE(review): fills missing test-set ages from the TRAINING frame's
# encoded age_new codes, aligned by positional index — presumably
# unintended; verify against the author's intent.
new_df.Age.fillna(df1.age_new, inplace=True)
# new_df = new_df[new_df['Age'].notnull(),inplace=True]
# new_df = new_df[new_df['SibSp'].notnull()]
new_df.SibSp.fillna(new_df.SibSp.mean(), inplace=True)
# new_df = new_df[new_df['Parch'].notnull()]
new_df.Parch.fillna(new_df.Parch.mean(), inplace=True)
# new_df = new_df[new_df['Fare'].notnull()]
new_df.Fare.fillna(0, inplace=True)
# Set PassengerId as the index of the test csv (left disabled):
# new_df.set_index('PassengerId', inplace=True)
# # Drop columns deemed unnecessary
# #필요 없다고 생각하는 데이터 제거 하기 


#train_pre = raw_data[['pclass','sex','age','sibsp','parch','fare']]
# Label-encode the test-set columns; the fitted encoders are published as
# module globals (new_Pclass_encoder, ...) read only by the commented
# prints below.
# NOTE(review): these encoders are fit on the TEST data, so the integer
# codes are not guaranteed to match the training-set encoding — verify.
for i in ['Pclass', 'Age','Sex','Parch']:
    globals()[f'new_{i}_encoder'] = LabelEncoder()
    globals()[f'new_{i}_encoder'].fit(new_df[i])
    new_df[i] = globals()[f'new_{i}_encoder'].transform(new_df[i])
    

# print(new_Pclass_encoder.classes_)
# print(new_Age_encoder.classes_)

# Predict survival for the test passengers and map the predictions back
# through the training-set survived encoder.
pred_result = titanic_dtclf.predict(new_df)
pred_result1 = df1_survived_encoder.inverse_transform(pred_result)
new_df['Survived']=pred_result1
print('생존자수:', (new_df['Survived'].sum()))
# Training-set accuracy, not a held-out score.
print('예측:',format(titanic_dtclf.score(X,y)))
print(new_df)
#print(df_new.dtypes) 
#new_df = pd.read_csv('test.csv')
# Keep only PassengerId, attach the predictions, and write the result file.
new_df.drop(list(new_df.columns)[1:], axis = 1, inplace=True)
new_df['Survived'] = pred_result1
new_df.set_index('PassengerId', inplace=True)
new_df.to_csv('tit5_test.csv')
#PassengerID , Pclass, Age이용 하는게 나을거 같다.  
#필요 없다고 생각 하는 데이터는 드롭 하기 

# df_new = df_new.drop(['Name','Sex','SibSp','Parch','Ticket','Fare','Cabin','Embarked'], axis=1)
#인덱스는 PassengerID로 지정해주자
'''

'''

# df_test = pd.read_csv('./test.csv')
# df_test.head()
# df_test.Cabin.value_counts()
# df_test.set_index('PassengerId', inplace=True)
# df_test.head()
# df_test['family'] = df_test.SibSp + df_test.Parch
# df_test.head()
# df_test.drop(['SibSp', 'Parch', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
# df_test.head()
# df_test.age_new = 0
# df_test.loc[df_test.Age >= 50, 'age_new'] = 'old'
# df_test.loc[(df_test.Age < 50) & (df_test.Age>=10), 'age_new'] = 'young'
# df_test.loc[df_test.Age < 10, 'age_new'] = 'baby'
# df_test.head()
# df_test.columns = (df.columns)[1:]
# df_test.head(1)
# df_test.isna().sum()
# df_test[df_test.fare.isna()]
# fare_new = float(df.loc[(df.pclass==3)&(df.sex==1)&(df.age>60)&(df.age<62)&(df.embarked==2), 'fare'].values)
# fare_new
# df_test.fare.fillna(fare_new, inplace=True)
# df_test.isna().sum()
# m1_med = df.loc[(df.sex == 1)& (df.pclass == 1), 'age'].median()
# m2_med = df.loc[(df.sex == 1)& (df.pclass == 2), 'age'].median()
# m3_med = df.loc[(df.sex == 1)& (df.pclass == 3), 'age'].median()
# f1_med = df.loc[(df.sex == 0)& (df.pclass == 1), 'age'].median()
# f2_med = df.loc[(df.sex == 0)& (df.pclass == 2), 'age'].median()
# f3_med = df.loc[(df.sex == 0)& (df.pclass == 3), 'age'].median()
# df_test.loc[(df_test.sex == 'male')&(df_test.pclass == 1)&(df_test.age.isna()), 'age'] = m1_med
# df_test.loc[(df_test.sex == 'male')&(df_test.pclass == 2)&(df_test.age.isna()), 'age'] = m2_med
# df_test.loc[(df_test.sex == 'male')&(df_test.pclass == 3)&(df_test.age.isna()), 'age'] = m3_med
# df_test.loc[(df_test.sex == 'female')&(df_test.pclass == 1)&(df_test.age.isna()), 'age'] = f1_med
# df_test.loc[(df_test.sex == 'female')&(df_test.pclass == 2)&(df_test.age.isna()), 'age'] = f2_med
# df_test.loc[(df_test.sex == 'female')&(df_test.pclass == 3)&(df_test.age.isna()), 'age'] = f3_med
# df_test.loc[df_test.age >= 50, 'age_new'] = 'old'
# df_test.loc[(df_test.age < 50) & (df_test.age>=10), 'age_new'] = 'young'
# df_test.loc[df_test.age < 10, 'age_new'] = 'baby'
# df_test.isna().sum()
# df_test.head()
# titanic_O_columns = df_test.columns[df_test.dtypes=='O']
# titanic_O_columns
# for i in titanic_O_columns:
#     globals()[f'df_test_{i}_encoder'] = LabelEncoder()
#     globals()[f'df_test_{i}_encoder'].fit(df_test[i])
#     df_test[i] = globals()[f'df_test_{i}_encoder'].transform(df_test[i])
# df_test['survived'] = titanic_dtclf.predict(df_test)
# print('생존자수:', (df_test['survived'].sum()))


'''Data split: training data + test data
y_titanic_df = df_train['Survived']
x_titanic_df =
'''
# Family size aboard = siblings/spouses + parents/children.
df1['family'] = df1['sibsp'] + df1['parch']
print(df1)


'''Use roughly 70% of the data for training and 30% for testing.

Split into training / test sets.
'''
from sklearn.model_selection import train_test_split

X = df1.drop('survived', axis=1)
y = df1.survived

# A train 70% / validation 20% / test 10% scheme was considered; only a
# 90/10 train/test split is kept here.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=12)
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

# Fit an unconstrained decision tree and predict the held-out set.
dt_clf = DecisionTreeClassifier()
dt_clf.fit(X_train, y_train)
dt_pred = dt_clf.predict(X_test)

from sklearn.metrics import accuracy_score

# Two equivalent ways to report test accuracy.
print('정확도: %.2f'% accuracy_score(y_test, dt_pred))
print('정확도:%.2f' % dt_clf.score(X_test, y_test))

'''
Evaluation: cross-validation with KFold.
'''

from sklearn.model_selection import KFold

# Five folds over the training set: each iteration trains on four fifths
# and scores on the held-out fifth.
# Fixes: removed a dead `kfold.split(X_train)` call whose generator was
# discarded, and an enumerate() whose counter was never used.
kfold = KFold(n_splits=5, shuffle=False)
scores = []
for train_idx, val_idx in kfold.split(X_train):
    dt_clf.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
    scores.append(dt_clf.score(X_train.iloc[val_idx], y_train.iloc[val_idx]))
print(scores)
print(np.mean(scores))
# (removed: blog share-button residue captured when this post was scraped)