#!/usr/bin/env python
# coding: utf-8
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
'''Exploratory data analysis
Check whether the survival rate differs by sex within each pclass, e.g.:
sns.barplot(x='pclass', y='survived', hue='sex', data=df1)
'''
df1 = sns.load_dataset('titanic')
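# A minimal EDA sketch (commented out): the barplot mentioned in the docstring
# above, using the lowercase column names of the seaborn titanic dataset.
# sns.barplot(x='pclass', y='survived', hue='sex', data=df1)
# plt.show()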
'''
Check which variables are string or category types:
df1.dtypes
They need to be converted to numeric data.
How can the sex, embarked and deck variables be turned into numbers?
Check each variable's unique values, then map them to arbitrary integer codes:
Series.unique()
'''
# print(df1.dtypes)
# print(df1.sex.unique())
df1.groupby(['sex', 'pclass']).age.agg(['median'])
# Fill missing age values with the median age of each sex/pclass group
m1_med = df1.loc[(df1.sex == 'male') & (df1.pclass == 1), 'age'].median()
m2_med = df1.loc[(df1.sex == 'male') & (df1.pclass == 2), 'age'].median()
m3_med = df1.loc[(df1.sex == 'male') & (df1.pclass == 3), 'age'].median()
f1_med = df1.loc[(df1.sex == 'female') & (df1.pclass == 1), 'age'].median()
f2_med = df1.loc[(df1.sex == 'female') & (df1.pclass == 2), 'age'].median()
f3_med = df1.loc[(df1.sex == 'female') & (df1.pclass == 3), 'age'].median()
df1.loc[(df1.sex=='male')&(df1.pclass==1)&(df1.age.isna()),'age']=m1_med
df1.loc[(df1.sex=='male')&(df1.pclass==2)&(df1.age.isna()),'age']=m2_med
df1.loc[(df1.sex=='male')&(df1.pclass==3)&(df1.age.isna()),'age']=m3_med
df1.loc[(df1.sex=='female')&(df1.pclass==1)&(df1.age.isna()),'age']=f1_med
df1.loc[(df1.sex=='female')&(df1.pclass==2)&(df1.age.isna()),'age']=f2_med
df1.loc[(df1.sex=='female')&(df1.pclass==3)&(df1.age.isna()),'age']=f3_med
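# A minimal sketch (commented out): the six explicit fills above can equivalently
# be written as a single groupby-transform over the same sex/pclass groups.
# df1['age'] = df1['age'].fillna(df1.groupby(['sex', 'pclass'])['age'].transform('median'))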
df1.isna().sum()
df1['age_new'] = 0
# Survival rate of the elderly (age 50 and over)
# Survival rate of infants (age 10 and under)
df1.loc[df1.age >= 50, 'age_new'] = 'old'
df1.loc[(df1.age < 50) & (df1.age > 10), 'age_new'] = 'young'
df1.loc[df1.age <= 10, 'age_new'] = 'baby'
# Any ages still missing (none are expected after the group-median fill above) are marked 'N'
df1.age.fillna('N', inplace=True)
df1.isna().sum()
df1.embarked.fillna('S', inplace=True)
df1.deck = df1.deck.astype('object')
df1.deck.fillna('N',inplace=True)
# Map each column's unique values to integer codes
for columns in ['sex', 'embarked', 'age_new', 'deck']:
    datas = df1[columns].unique()
    for no, data in enumerate(datas):
        df1[columns].replace(data, no, inplace=True)
# Check each column's unique values after the mapping
for columns in ['sex', 'embarked', 'age_new', 'deck']:
    print(df1[columns].unique())
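# A more idiomatic sketch of the same conversion (commented out because the
# replace loop above has already mapped the values): pandas.factorize assigns
# integer codes in order of first appearance, just like the enumerate loop.
# for col in ['sex', 'embarked', 'age_new', 'deck']:
#     df1[col] = pd.factorize(df1[col])[0]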
'''Data encoding
Two ways to convert strings into numbers
'''
# Create the encoder object
tiencoder = LabelEncoder()
df2 =df1.copy()
tiencoder.fit(df2['sex'])
labels = tiencoder.transform(df2['sex'])
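# LabelEncoder stores the fitted categories in classes_ and can reverse the
# mapping with inverse_transform; a quick round-trip check:
print(tiencoder.classes_)
print(tiencoder.inverse_transform(labels[:5]))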
print(df2.dtypes)
df1.isna().sum()
df = df1.drop(['class', 'alive', 'embark_town', 'who', 'adult_male', 'deck','alone'], axis=1)
df1 = df.copy()
df2 = df.copy()
print(df1.head())
print(df1.std())
print(df1.mean())
# Check the minimum and maximum of every variable
# Scale the features (the code below applies z-score standardisation rather than min-max scaling)
# After scaling, check each variable's distribution (mean, std, etc.)
# Plot each variable
df1_fs = (df1 - df1.mean())/df1.std()
print('mean: ')
print(df1_fs.mean())
print('std : ')
print(df1_fs.std())
fig = plt.figure(figsize=(10, 10))
for no, i in enumerate(df1_fs.columns):
    globals()[f'ax{no+1}'] = fig.add_subplot(3, 3, no + 1)
    # sns.distplot was removed in recent seaborn releases; histplot(..., kde=True) is the modern equivalent
    sns.histplot(df1_fs[i], kde=True, ax=globals()[f'ax{no+1}'])
# plt.show()
print(df1_fs.head())
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
# scaler = StandardScaler()
# scaler.fit()
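# A minimal sketch with sklearn scalers: MinMaxScaler does the min-max scaling
# mentioned in the comments above, returning a NumPy array that is wrapped back
# into a DataFrame for inspection.
scaler = MinMaxScaler()
df1_mm = pd.DataFrame(scaler.fit_transform(df1), columns=df1.columns, index=df1.index)
print(df1_mm.min())
print(df1_mm.max())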
'''The main job in machine learning is to pick a learning algorithm and train it on some data.
Bad data:
    not enough training data
    non-representative training data
    poor-quality data
    irrelevant features
Bad algorithm:
    overfitting the training data
    underfitting the training data
'''
'''Titanic decision tree'''
# Features to use
#train_pre = raw_data[['pclass','sex','age','sibsp','parch','fare']]
#df.drop(['embarked','age_new'], axis=1,inplace=True)
# tmp2 = []
# for each in df1['sex']:
# if each == 'female':
# tmp2.append(1)
# elif each == 'male':
# tmp2.append(0)
# else:
# tmp2.append(np.nan)
# df1['sex'] = tmp2
df1['pclass'] = df1['pclass'].astype('float')
df1['sex'] = df1['sex'].astype('float')
df1['sibsp'] = df1['sibsp'].astype('float')
df1['parch'] = df1['parch'].astype('float')
df1['fare'] = df1['fare'].astype('float')
# Fill any remaining missing ages from the age_new codes (by this point age should have no NaN left)
df1.age.fillna(df1.age_new, inplace=True)
# df1 = df1[df1['age'].notnull()]
df1.sibsp.fillna(df1.sibsp.mean(), inplace=True)
df1.parch.fillna(df1.parch.mean(), inplace=True)
df1.fare.fillna(0, inplace=True)
# Per-column label encoders; the loop below re-creates and fits the ones that are
# actually used (the sibsp/parch/fare encoders created here are never used)
df1_survived_encoder = LabelEncoder()
df1_sex_encoder = LabelEncoder()
df1_pclass_encoder = LabelEncoder()
df1_age_encoder = LabelEncoder()
df1_sibsp_encoder = LabelEncoder()
df1_parch_encoder = LabelEncoder()
df1_fare_encoder = LabelEncoder()
for i in ['survived', 'age', 'pclass', 'sex']:
    globals()[f'df1_{i}_encoder'] = LabelEncoder()
    globals()[f'df1_{i}_encoder'].fit(df1[i])
    df[i] = globals()[f'df1_{i}_encoder'].transform(df1[i])
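# A sketch of the same loop without the globals() indirection (commented out;
# the globals-based encoders above are what the rest of the script uses):
# encoders = {}
# for col in ['survived', 'age', 'pclass', 'sex']:
#     encoders[col] = LabelEncoder()
#     df[col] = encoders[col].fit_transform(df1[col])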
X = df.drop('survived', axis=1)
y = df.survived
print('-----------------------')
print(X.head(1))
print('-----------------------')
print(y.head(1))
titanic_dtclf = DecisionTreeClassifier(max_depth=2)
print(titanic_dtclf.fit(X, y))
# class_names must follow the sorted class labels: 0 = died, 1 = survived
export_graphviz(titanic_dtclf, out_file='titanic1.dot',
                feature_names=X.columns,
                class_names=['died', 'survived'])
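# A quick look at which features the depth-2 tree actually uses (not part of
# the original flow):
for name, importance in zip(X.columns, titanic_dtclf.feature_importances_):
    print(f'{name}: {importance:.3f}')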
with open('./titanic1.dot', encoding='UTF8') as f:
    titanic1 = f.read()
graphviz.Source(titanic1)  # renders inline in a notebook; call .render() to write an image file
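# If Graphviz is not installed, sklearn's plot_tree draws the same tree using
# only matplotlib; a minimal sketch:
from sklearn.tree import plot_tree
plt.figure(figsize=(8, 6))
plot_tree(titanic_dtclf, feature_names=list(X.columns),
          class_names=['died', 'survived'], filled=True)
# plt.show()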
'''Load the passenger list from a CSV file and predict survival'''
df_new = pd.read_csv("./test.csv")
new_df = pd.DataFrame(df_new)  # read_csv already returns a DataFrame, so this is just a copy
print('-----------------')
print(new_df)
#PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
# Encode Sex as numbers (female -> 1, male -> 0)
tmp = []
for each in new_df['Sex']:
    if each == 'female':
        tmp.append(1)
    elif each == 'male':
        tmp.append(0)
    else:
        tmp.append(np.nan)
new_df.drop(['Name','Ticket','Cabin','Embarked'], axis=1,inplace=True)
new_df['Sex'] = tmp
new_df['Pclass'] = new_df['Pclass'].astype('float')
new_df['Sex'] = new_df['Sex'].astype('float')
# Note: 'Sibsp' (lowercase p) creates a new column next to the original 'SibSp'
new_df['Sibsp'] = new_df['SibSp'].astype('float')
new_df['Parch'] = new_df['Parch'].astype('float')
new_df['Fare'] = new_df['Fare'].astype('float')
new_df['Age'] = new_df['Age'].astype('float')
# Missing test ages are filled from the training frame's age_new codes, aligned by index
new_df.Age.fillna(df1.age_new, inplace=True)
# new_df = new_df[new_df['Age'].notnull(),inplace=True]
# new_df = new_df[new_df['SibSp'].notnull()]
new_df.SibSp.fillna(new_df.SibSp.mean(), inplace=True)
# new_df = new_df[new_df['Parch'].notnull()]
new_df.Parch.fillna(new_df.Parch.mean(), inplace=True)
# new_df = new_df[new_df['Fare'].notnull()]
new_df.Fare.fillna(0, inplace=True)
# Set PassengerId as the index of the test CSV
# new_df.set_index('PassengerId', inplace=True)
# Drop the columns considered unnecessary
#train_pre = raw_data[['pclass','sex','age','sibsp','parch','fare']]
for i in ['Pclass', 'Age', 'Sex', 'Parch']:
    globals()[f'new_{i}_encoder'] = LabelEncoder()
    globals()[f'new_{i}_encoder'].fit(new_df[i])
    new_df[i] = globals()[f'new_{i}_encoder'].transform(new_df[i])
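# Note: fitting fresh LabelEncoders on the test set can assign codes that do not
# match the training encoding. A more consistent sketch (commented out, and
# assuming every test value also appears in the training data) would reuse the
# encoders fitted above, e.g.:
# new_df['Pclass'] = df1_pclass_encoder.transform(new_df['Pclass'])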
# print(new_Pclass_encoder.classes_)
# print(new_Age_encoder.classes_)
# Predict on the test frame; its column names/order differ from the training X,
# so sklearn may warn about feature names here
pred_result = titanic_dtclf.predict(new_df)
pred_result1 = df1_survived_encoder.inverse_transform(pred_result)
new_df['Survived'] = pred_result1
print('Predicted number of survivors:', new_df['Survived'].sum())
print('Training accuracy:', titanic_dtclf.score(X, y))
print(new_df)
#print(df_new.dtypes)
#new_df = pd.read_csv('test.csv')
new_df.drop(list(new_df.columns)[1:], axis = 1, inplace=True)
new_df['Survived'] = pred_result1
new_df.set_index('PassengerId', inplace=True)
new_df.to_csv('tit5_test.csv')
# It would probably be better to use PassengerId, Pclass and Age
# Drop the columns considered unnecessary
# df_new = df_new.drop(['Name','Sex','SibSp','Parch','Ticket','Fare','Cabin','Embarked'], axis=1)
# Set PassengerId as the index
# df_test = pd.read_csv('./test.csv')
# df_test.head()
# df_test.Cabin.value_counts()
# df_test.set_index('PassengerId', inplace=True)
# df_test.head()
# df_test['family'] = df_test.SibSp + df_test.Parch
# df_test.head()
# df_test.drop(['SibSp', 'Parch', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
# df_test.head()
# df_test.age_new = 0
# df_test.loc[df_test.Age >= 50, 'age_new'] = 'old'
# df_test.loc[(df_test.Age < 50) & (df_test.Age>=10), 'age_new'] = 'young'
# df_test.loc[df_test.Age < 10, 'age_new'] = 'baby'
# df_test.head()
# df_test.columns = (df.columns)[1:]
# df_test.head(1)
# df_test.isna().sum()
# df_test[df_test.fare.isna()]
# fare_new = float(df.loc[(df.pclass==3)&(df.sex==1)&(df.age>60)&(df.age<62)&(df.embarked==2), 'fare'].values)
# fare_new
# df_test.fare.fillna(fare_new, inplace=True)
# df_test.isna().sum()
# m1_med = df.loc[(df.sex == 1)& (df.pclass == 1), 'age'].median()
# m2_med = df.loc[(df.sex == 1)& (df.pclass == 2), 'age'].median()
# m3_med = df.loc[(df.sex == 1)& (df.pclass == 3), 'age'].median()
# f1_med = df.loc[(df.sex == 0)& (df.pclass == 1), 'age'].median()
# f2_med = df.loc[(df.sex == 0)& (df.pclass == 2), 'age'].median()
# f3_med = df.loc[(df.sex == 0)& (df.pclass == 3), 'age'].median()
# df_test.loc[(df_test.sex == 'male')&(df_test.pclass == 1)&(df_test.age.isna()), 'age'] = m1_med
# df_test.loc[(df_test.sex == 'male')&(df_test.pclass == 2)&(df_test.age.isna()), 'age'] = m2_med
# df_test.loc[(df_test.sex == 'male')&(df_test.pclass == 3)&(df_test.age.isna()), 'age'] = m3_med
# df_test.loc[(df_test.sex == 'female')&(df_test.pclass == 1)&(df_test.age.isna()), 'age'] = f1_med
# df_test.loc[(df_test.sex == 'female')&(df_test.pclass == 2)&(df_test.age.isna()), 'age'] = f2_med
# df_test.loc[(df_test.sex == 'female')&(df_test.pclass == 3)&(df_test.age.isna()), 'age'] = f3_med
# df_test.loc[df_test.age >= 50, 'age_new'] = 'old'
# df_test.loc[(df_test.age < 50) & (df_test.age>=10), 'age_new'] = 'young'
# df_test.loc[df_test.age < 10, 'age_new'] = 'baby'
# df_test.isna().sum()
# df_test.head()
# titanic_O_columns = df_test.columns[df_test.dtypes=='O']
# titanic_O_columns
# for i in titanic_O_columns:
# globals()[f'df_test_{i}_encoder'] = LabelEncoder()
# globals()[f'df_test_{i}_encoder'].fit(df_test[i])
# df_test[i] = globals()[f'df_test_{i}_encoder'].transform(df_test[i])
# df_test['survived'] = titanic_dtclf.predict(df_test)
# print('Number of survivors:', df_test['survived'].sum())
'''Split the data: training data + test data
y_titanic_df = df_train['Survived']
x_titanic_df =
'''
df1['family'] = df1.sibsp + df1.parch
print(df1)
'''Use roughly 70% of the data for training and 30% for testing.
Split into training data / test data.
'''
from sklearn.model_selection import train_test_split
X = df1.drop('survived', axis=1)
y = df1.survived
#X_train,X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=12)
#print(len(X_train.index))
#train 70% / validation 20% / test 10%
X_train,X_test, y_train, y_test = train_test_split(X,y,test_size=0.1, random_state=12)
#X_train,X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.22, random_state=12)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
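# With a binary target it is often worth stratifying the split so the survived
# ratio is preserved in train and test; a sketch (commented out, not used below):
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=12, stratify=y)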
dt_clf = DecisionTreeClassifier()
dt_clf.fit(X_train, y_train)
dt_pred = dt_clf.predict(X_test)
from sklearn.metrics import accuracy_score
print('Accuracy: %.2f' % accuracy_score(y_test, dt_pred))
print('Accuracy: %.2f' % dt_clf.score(X_test, y_test))
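# A slightly richer evaluation than plain accuracy: the confusion matrix and
# per-class precision/recall on the held-out test split.
from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(y_test, dt_pred))
print(classification_report(y_test, dt_pred))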
'''
Evaluation and application
Cross-validation with KFold
'''
from sklearn.model_selection import KFold
# Build 5 train/validation splits
kfold = KFold(n_splits=5, shuffle=False)
# kfold.split(X_train) yields (train indices, validation indices) pairs for each fold
scores = []
for no, (train_idx, val_idx) in enumerate(kfold.split(X_train)):
    X_train_k = X_train.iloc[train_idx]
    y_train_k = y_train.iloc[train_idx]
    X_val_k = X_train.iloc[val_idx]
    y_val_k = y_train.iloc[val_idx]
    dt_clf.fit(X_train_k, y_train_k)
    scores.append(dt_clf.score(X_val_k, y_val_k))
print(scores)
print(np.mean(scores))
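# The manual KFold loop above is essentially equivalent to sklearn's
# cross_val_score; a short sketch over the same five splits:
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(dt_clf, X_train, y_train, cv=kfold)
print(cv_scores, np.mean(cv_scores))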