HGBM을 사용할 경우,
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
을 해줘야 임포트에러가 발생하지 않는다.
ImportError: cannot import name 'HistGradientBoostingClassifier' from 'sklearn.ensemble' (C:\Users\Hoon\anaconda3\lib\site-packages\sklearn\ensemble\__init__.py)
또한, XGB를 사용할 경우 Warning 메시지가 출력되는데
WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
xgb_clf = XGBClassifier(eval_metric='mlogloss', use_label_encoder=False) <-- 이런 식으로 eval_metric을 명시해 주면 Warning 메시지가 출력되지 않는다.
SNS 타이타닉 예측 HGBM , XGB 학습 , 파라미터 튜닝 -- > 검증은 캐글
import xgboost as xgb
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.tree import plot_tree
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')
# Load the Kaggle Titanic train/test data. PassengerId becomes the index so
# it is preserved for the submission CSVs written at the end of the script.
df = pd.read_csv('./titanic/train.csv')
df_kg = pd.read_csv('./titanic/test.csv')
df1 = df.copy()
df1.set_index('PassengerId', inplace=True)
df_kg.set_index('PassengerId', inplace=True)
# Name is free text with no direct use in this model.
df1.drop(['Name'], axis=1, inplace=True)
df_kg.drop(['Name'], axis=1, inplace=True)
# FIX: removed the original `df_kg.Cabin.astype(str)` — its result was never
# assigned, so it was a no-op; missing Cabins are handled by fillna below.
# Missing-value handling
df1.Cabin.fillna('N', inplace=True)
df1.Embarked.fillna('S', inplace=True)
df1.Age.fillna(df1.Age.median(), inplace=True)
df_kg.Cabin.fillna('N', inplace=True)
# Impute the test set with TRAIN medians so no test-set statistics leak in.
df_kg.Fare.fillna(df1.Fare.median(), inplace=True)
df_kg.Age.fillna(df1.Age.median(), inplace=True)
# Keep only the deck letter (first character) of the Cabin code.
df1.Cabin = df1.Cabin.apply(lambda x: x[0])
df_kg.Cabin = df_kg.Cabin.apply(lambda x: x[0])
# Encoding: label-encode every object/category/bool column.
# FIX: the original fit one LabelEncoder on df1 and a *separate* one on
# df_kg, so the same category (e.g. a Cabin deck letter) could map to
# different integers in train and test, corrupting the test-set features.
# Fit a single encoder per column on the union of both frames so the
# mapping is consistent and unseen test labels cannot crash transform().
og_columns = df1.columns[(df1.dtypes=='O')|(df1.dtypes=='category')|(df1.dtypes=='bool')]
for i in og_columns:
    enc = LabelEncoder()
    enc.fit(pd.concat([df1[i], df_kg[i]]).astype(str))
    df1[i] = enc.transform(df1[i].astype(str))
    df_kg[i] = enc.transform(df_kg[i].astype(str))
    # Keep the original global names pointing at the (now shared) encoder
    # for backward compatibility with any later code that references them.
    globals()[f'df1_{i}_encoder'] = enc
    globals()[f'df_kg_{i}_encoder'] = enc
# Split features (X) from the Survived target (y).
X = df1.drop('Survived', axis=1)
y = df1.Survived
# Hold out 10% of the training data as a validation set.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=19)
# FIX: 'mlogloss' is the multi-class log-loss; for the default binary
# 'binary:logistic' objective the matching metric is 'logloss' (this is
# exactly what the XGBoost >= 1.3 warning asks you to set explicitly).
# use_label_encoder=False silences the deprecated label-encoder warning.
xgb_clf = XGBClassifier(eval_metric='logloss', use_label_encoder=False)
xgb_clf.fit(X_train, y_train)
print('xgb', xgb_clf.score(X_val, y_val))
hgbm_clf = HistGradientBoostingClassifier()
hgbm_clf.fit(X_train, y_train)
print('hgbm', hgbm_clf.score(X_val, y_val))
def _export_predictions(model, out_path):
    """Refit `model` on the full training data and write a Kaggle
    submission CSV (PassengerId index + predicted Survived column).

    Returns the test frame with the added Survived column.
    """
    model.fit(X, y)
    submission = df_kg.copy()
    submission['Survived'] = model.predict(df_kg)
    submission.Survived.to_csv(out_path)
    return submission

# Same fit/predict/export sequence as before, deduplicated; kg_up ends up
# holding the XGB submission frame, exactly as in the original script.
kg_up = _export_predictions(hgbm_clf, 'kaggle_upload_hgbm_clf.csv')
kg_up = _export_predictions(xgb_clf, 'kaggle_upload_xgb_clf.csv')
XGBClassifier 점수가 더 잘 나온다.
여태까지 제일 잘 나온 점수는,
GridSearchCV에 rd_clf(랜덤포레스트 분류기)를 이용할 경우,
# Hyper-parameter grid for a random-forest GridSearchCV run.
# NOTE(review): `rd_clf` is not defined anywhere in this file — presumably a
# RandomForestClassifier instance from an earlier session; confirm before
# running this snippet.
parameters = {
'n_estimators':[100,200,300],
'random_state':[0,10,20,30],
'max_depth':[1,3,5,7,9],
'min_samples_split':[5,10],
'criterion':['gini','entropy']
}
# 5-fold grid search over the full parameter grid, using all CPU cores.
rd_dt = GridSearchCV(rd_clf, param_grid=parameters ,cv = 5, n_jobs = -1)
rd_dt.fit(X_train,y_train)
print(rd_dt.best_params_)
# NOTE(review): this reports accuracy on the TRAINING split, which is
# optimistic; rd_dt.best_score_ (cross-validated) or a validation-set score
# would be a fairer estimate.
print(rd_dt.best_estimator_.score(X_train, y_train))
'Python' 카테고리의 다른 글

| 제목 | 날짜 |
|---|---|
| Python - 다중공선성, 차원축소, 군집 (0) | 2022.10.24 |
| Python - 모델 Stacking (0) | 2022.10.22 |
| Python - Grid Search Hyper parameter (0) | 2022.10.19 |
| Python - 랜덤포레스트 iris (0) | 2022.10.19 |
| Python - 머신러닝 모델 평가 [최종] (0) | 2022.10.17 |