'''
Why has BOOSTING lagged behind BAGGING so far?
- Too much computation
- Too many parameters to tune
- bagging  -> lends itself to distributed computing
- boosting -> hard to distribute (trees are built sequentially)

Key GradientBoostingClassifier parameters to tune:
- loss
- learning_rate
- n_estimators
- subsample
'''
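# A minimal sketch of the tuning surface described above: the four
# GradientBoostingClassifier parameters just listed, with placeholder
# values (not recommendations). Recent scikit-learn names the default
# loss 'log_loss'; older versions called it 'deviance'.
from sklearn.ensemble import GradientBoostingClassifier
gbm_sketch = GradientBoostingClassifier(
    loss='log_loss',      # classification loss to optimise
    learning_rate=0.1,    # shrinkage applied to each tree's contribution
    n_estimators=100,     # number of boosting stages
    subsample=1.0,        # values < 1.0 give stochastic gradient boosting
)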
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import mean_squared_error
from sklearn.tree import plot_tree
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')
# Load the Kaggle Titanic train/test sets
df = pd.read_csv('./titanic/train.csv')
df_kg = pd.read_csv('./titanic/test.csv')
df1 = df.copy()
df1.set_index('PassengerId', inplace=True)
df_kg.set_index('PassengerId', inplace=True)
df1.drop(['Name'], axis=1, inplace=True)
df_kg.drop(['Name'], axis=1, inplace=True)
# Handle missing values (test-set gaps are filled with train-set medians)
df1.Cabin.fillna('N', inplace=True)
df1.Embarked.fillna('S', inplace=True)
df1.Age.fillna(df1.Age.median(), inplace=True)
df_kg.Cabin.fillna('N', inplace=True)
df_kg.Fare.fillna(df1.Fare.median(), inplace=True)
df_kg.Age.fillna(df1.Age.median(), inplace=True)
# Keep only the deck letter (first character) of Cabin
df1.Cabin = df1.Cabin.apply(lambda x: x[0])
df_kg.Cabin = df_kg.Cabin.apply(lambda x: x[0])
# Encoding: label-encode every object/category/bool column.
# Fit each encoder on the union of train and test values so both
# frames share one consistent label -> integer mapping (fitting a
# separate encoder per frame would assign inconsistent codes).
obj_columns = df1.columns[(df1.dtypes == 'O') | (df1.dtypes == 'category') | (df1.dtypes == 'bool')]
encoders = {}
for col in obj_columns:
    encoders[col] = LabelEncoder()
    encoders[col].fit(pd.concat([df1[col], df_kg[col]]))
    df1[col] = encoders[col].transform(df1[col])
    df_kg[col] = encoders[col].transform(df_kg[col])
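# Optional sanity check: each fitted LabelEncoder lists its original
# labels in encoded order via classes_. For example, 'Sex' encodes
# alphabetically: female -> 0, male -> 1.
for col, enc in encoders.items():
    print(col, list(enc.classes_)[:5])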
# Split features / target
X = df1.drop('Survived', axis=1)
y = df1.Survived
# Hold out a validation set
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=19)
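# Note: with only 10% held out (about 90 rows) the class balance can
# drift between splits. A hedged variant (an assumption, not the
# original code): stratify=y keeps the Survived ratio equal in both
# splits. Shown with throwaway names so it does not replace the split above.
Xs_train, Xs_val, ys_train, ys_val = train_test_split(
    X, y, test_size=0.1, random_state=19, stratify=y)
print(ys_train.mean(), ys_val.mean())  # survival rates should now match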
# Grid for RandomForestClassifier
parameters = {
    'n_estimators': [100, 200, 300],
    'random_state': [0, 10, 20, 30],
    'max_depth': [1, 3, 5, 7, 9],
    'min_samples_split': [5, 10],
    'criterion': ['gini', 'entropy']
}
# Grid for GradientBoostingClassifier
params = {
    'n_estimators': [50, 100, 200, 300],
    'learning_rate': [0.05, 1, 2, 3]
}
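# Why boosting feels expensive (see the note at the top): the search
# cost is the product of the grid sizes times the CV folds. With the
# cv=5 used below, the two searches run:
rf_fits = 3 * 4 * 5 * 2 * 2 * 5   # 240 RF candidates x 5 folds = 1200 fits
gbm_fits = 4 * 4 * 5              # 16 GBM candidates x 5 folds = 80 fits
print(rf_fits, gbm_fits)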
# Best RF params found by a previous grid-search run:
# {'criterion': 'entropy', 'max_depth': 9, 'min_samples_split': 10, 'n_estimators': 200, 'random_state': 20}
# Baseline models with default hyperparameters
rd_clf = RandomForestClassifier()
rd_clf.fit(X_train, y_train)
print('rf ', rd_clf.score(X_val, y_val))
gbm_clf = GradientBoostingClassifier()
gbm_clf.fit(X_train, y_train)
print('gbm', gbm_clf.score(X_val, y_val))
# Grid search for GradientBoosting
grid_dt = GridSearchCV(gbm_clf, param_grid=params, cv=5, n_jobs=-1)
grid_dt.fit(X_train, y_train)
print(grid_dt.best_params_)
print(grid_dt.best_estimator_.score(X_val, y_val))  # evaluate on held-out data, not train
# Grid search for RandomForest
rd_dt = GridSearchCV(rd_clf, param_grid=parameters, cv=5, n_jobs=-1)
rd_dt.fit(X_train, y_train)
print(rd_dt.best_params_)
print(rd_dt.best_estimator_.score(X_val, y_val))
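# best_score_ reports the mean cross-validated accuracy of the winning
# parameter set; a fairer comparison than any training-set score.
print('gbm best CV score:', grid_dt.best_score_)
print('rf  best CV score:', rd_dt.best_score_)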
# Re-run the tuned GBM search on all labelled data, predict the test
# set, and write a Kaggle submission
grid_dt.fit(X, y)
kg_up = df_kg.copy()
kg_up['Survived'] = grid_dt.predict(df_kg)
kg_up.Survived.to_csv('kaggle_upload_Grid3.csv')
# Same with the default RandomForest
rd_clf.fit(X, y)
kg_up = df_kg.copy()
kg_up['Survived'] = rd_clf.predict(df_kg)
kg_up.Survived.to_csv('kaggle_upload_rd_clf.csv')
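# Quick sanity check of a submission file before uploading. The Titanic
# test set has 418 rows and Kaggle expects PassengerId and Survived
# columns; this assumes pandas >= 1.0, where Series.to_csv writes the
# header by default.
check = pd.read_csv('kaggle_upload_rd_clf.csv')
print(check.shape)   # expected: (418, 2)
print(check.head())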