●SelectFromModel
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from sklearn.feature_selection import SelectFromModel

model = XGBRegressor(random_state=123, n_estimators=1000,
                     learning_rate=0.1, max_depth=6, gamma=1)
model.fit(x_train, y_train, early_stopping_rounds=200,
          eval_set=[(x_train, y_train), (x_test, y_test)],
          eval_metric='rmse')

result = model.score(x_test, y_test)
print('r2 : ', result)

y_predict = model.predict(x_test)
acc = r2_score(y_test, y_predict)
print('final test score : ', acc)

# One importance value per feature; each is reused below as a selection threshold.
print(model.feature_importances_)
thresholds = model.feature_importances_
print("=========== SelectFromModel ===============")
for thresh in thresholds:
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_x_train = selection.transform(x_train)
    select_x_test = selection.transform(x_test)
    print(select_x_train.shape, select_x_test.shape)

    selection_model = XGBRegressor(n_jobs=-1,
                                   random_state=123,
                                   n_estimators=1000,
                                   learning_rate=0.1,
                                   max_depth=6,
                                   gamma=1)
    selection_model.fit(select_x_train, y_train)
    y_predict = selection_model.predict(select_x_test)
    score = r2_score(y_test, y_predict)
    print("Thresh=%.3f, n=%d, R2: %.2f%%"
          % (thresh, select_x_train.shape[1], score*100))
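The loop prints one score per candidate threshold. A small follow-up sketch (hypothetical, not in the original code) can track the best threshold and its score while iterating:

# Hedged sketch: record the best (threshold, score) pair from the loop above.
best_thresh, best_score = None, -float('inf')
for thresh in thresholds:
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_x_train = selection.transform(x_train)
    select_x_test = selection.transform(x_test)
    selection_model = XGBRegressor(n_jobs=-1, random_state=123)
    selection_model.fit(select_x_train, y_train)
    score = r2_score(y_test, selection_model.predict(select_x_test))
    if score > best_score:
        best_thresh, best_score = thresh, score
print('best threshold: %.3f, best R2: %.4f' % (best_thresh, best_score))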
●Optuna
import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
'''
optuna.trial.Trial.suggest_categorical() : picks a value from a given list of choices.
optuna.trial.Trial.suggest_int() : picks an integer within a range.
optuna.trial.Trial.suggest_float() : picks a floating-point value within a range.
optuna.trial.Trial.suggest_uniform() : picks a value from a uniform distribution over a range.
optuna.trial.Trial.suggest_discrete_uniform() : picks a value from a discrete uniform distribution over a range.
optuna.trial.Trial.suggest_loguniform() : picks a value from a log-uniform distribution over a range.
(The last three are deprecated in Optuna 3.x; suggest_float covers them via its log and step arguments.)
'''
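A minimal toy objective (hypothetical, only to show the suggest API in action; the parameter names and ranges are made up):

# Hedged sketch of the suggest API on a dummy objective.
def toy_objective(trial):
    booster = trial.suggest_categorical('booster', ['gbtree', 'dart'])
    depth = trial.suggest_int('depth', 2, 10)
    lr = trial.suggest_float('lr', 1e-3, 1.0, log=True)    # replaces suggest_loguniform
    sub = trial.suggest_float('sub', 0.5, 1.0, step=0.1)   # replaces suggest_discrete_uniform
    return depth * lr * sub  # dummy score, just so the study is runnable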
def objectiveCAT(trial: Trial, x_train, y_train, x_test, y_test):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 4000),
        'depth': trial.suggest_int('depth', 8, 16),
        'fold_permutation_block': trial.suggest_int('fold_permutation_block', 1, 256),
        # CatBoost requires learning_rate > 0, so the lower bound is strictly positive.
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1),
        'od_pval': trial.suggest_float('od_pval', 0, 1),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0, 4),
        'random_state': trial.suggest_int('random_state', 1, 2000)
    }
    model = CatBoostClassifier(**param)
    model.fit(x_train, y_train, verbose=True)
    score = accuracy_score(y_test, model.predict(x_test))
    return score
study = optuna.create_study(direction='maximize', sampler=TPESampler())
# Train on the training split only; passing the full x, y here would leak
# the test rows into training.
study.optimize(lambda trial: objectiveCAT(trial, x_train, y_train, x_test, y_test),
               n_trials=5)
print('Best trial : score {},\nparams {}'.format(study.best_trial.value,
                                                 study.best_trial.params))

# The optuna.visualization plots are plotly figures, so they are displayed
# with .show() rather than matplotlib's plt.show().
optuna.visualization.plot_param_importances(study).show()
optuna.visualization.plot_optimization_history(study).show()
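Once the study finishes, the best parameters can be fed back into a final model. A minimal follow-up sketch (hypothetical, not in the original code):

# Hedged sketch: refit a final model with the study's best parameters.
best_model = CatBoostClassifier(**study.best_trial.params)
best_model.fit(x_train, y_train, verbose=False)
print('final accuracy :', accuracy_score(y_test, best_model.predict(x_test)))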
●Code practice
import numpy as np
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
tf.random.set_seed(77)
datasets = fetch_california_housing()
x = datasets['data']
y = datasets.target
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, shuffle=True, random_state=42
)
n_splits = 11
random_state = 42
kfold = KFold(n_splits=n_splits, shuffle=True,
              random_state=random_state)
scaler = MinMaxScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
from xgboost import XGBRegressor
model = XGBRegressor(random_state=123,
                     n_estimators=1000,
                     learning_rate=0.1,
                     max_depth=6,
                     gamma=1)
# Note: early_stopping_rounds equal to n_estimators can never trigger before
# training finishes, so it effectively disables early stopping here.
model.fit(x_train, y_train,
          early_stopping_rounds=1000,
          eval_set=[(x_train, y_train), (x_test, y_test)],
          eval_metric='rmse')
score = cross_val_score(model, x_train, y_train, cv=kfold)
y_predict = cross_val_predict(model, x_test, y_test, cv=kfold)
r2 = r2_score(y_test, y_predict)
print('cv pred r2 : ', r2)
from sklearn.feature_selection import SelectFromModel
thresholds = model.feature_importances_
for thresh in thresholds:
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_x_train = selection.transform(x_train)
    select_x_test = selection.transform(x_test)
    print(select_x_train.shape, select_x_test.shape)

    selection_model = XGBRegressor()
    selection_model.fit(select_x_train, y_train)
    y_predict = selection_model.predict(select_x_test)
    score = r2_score(y_test, y_predict)
    print("Thresh=%.3f, n=%d, R2: %.2f%%"
          % (thresh, select_x_train.shape[1], score*100))
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import tensorflow as tf
tf.random.set_seed(77)
datasets = load_iris()
x = datasets['data']
y = datasets.target
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, shuffle=True, random_state=42
)
n_splits = 11
random_state = 42
kfold = KFold(n_splits=n_splits, shuffle=True,
              random_state=random_state)
scaler = MinMaxScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
from xgboost import XGBClassifier
model = XGBClassifier()
model.fit(x_train, y_train, early_stopping_rounds=100,
          eval_set=[(x_train, y_train), (x_test, y_test)],
          eval_metric='merror')
score = cross_val_score(model, x_train, y_train, cv=kfold)
y_predict = cross_val_predict(model, x_test, y_test, cv=kfold)
acc = accuracy_score(y_test, y_predict)
print('cv pred acc : ', acc)
from sklearn.feature_selection import SelectFromModel
thresholds = model.feature_importances_
for thresh in thresholds:
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_x_train = selection.transform(x_train)
    select_x_test = selection.transform(x_test)
    print(select_x_train.shape, select_x_test.shape)

    selection_model = XGBClassifier()
    selection_model.fit(select_x_train, y_train)
    y_predict = selection_model.predict(select_x_test)
    score = accuracy_score(y_test, y_predict)
    print("Thresh=%.3f, n=%d, ACC: %.2f%%"
          % (thresh, select_x_train.shape[1], score*100))
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
tf.random.set_seed(77)
datasets = fetch_california_housing()
x = datasets['data']
y = datasets.target
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, shuffle=True, random_state=42
)
n_splits = 5
random_state = 42
kfold = KFold(n_splits=n_splits, shuffle=True,
              random_state=random_state)
scaler = MinMaxScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler
from catboost import CatBoostRegressor
def objectiveCAT(trial: Trial, x_train, y_train, x_test, y_test):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 4000),
        'depth': trial.suggest_int('depth', 8, 16),
        'fold_permutation_block': trial.suggest_int('fold_permutation_block', 1, 256),
        # CatBoost requires learning_rate > 0, so the lower bound is strictly positive.
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1),
        'od_pval': trial.suggest_float('od_pval', 0, 1),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0, 4),
        'random_state': trial.suggest_int('random_state', 1, 2000)
    }
    model = CatBoostRegressor(**param)
    model.fit(x_train, y_train, verbose=True)
    # r2_score expects (y_true, y_pred) in that order; swapping them changes the result.
    score = r2_score(y_test, model.predict(x_test))
    return score
study = optuna.create_study(direction='maximize', sampler=TPESampler())
study.optimize(lambda trial: objectiveCAT(trial, x_train, y_train, x_test, y_test),
               n_trials=5)
print('Best trial : score {},\nparams {}'.format(study.best_trial.value,
                                                 study.best_trial.params))

# These are plotly figures, so they are displayed with .show().
optuna.visualization.plot_param_importances(study).show()
optuna.visualization.plot_optimization_history(study).show()
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, RobustScaler
from sklearn.preprocessing import QuantileTransformer, PowerTransformer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings('ignore')
import time
datasets = load_iris()
x, y = datasets.data, datasets.target
print(x.shape, y.shape)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, shuffle=True, random_state=72, train_size=0.8
)
sts = StandardScaler()
mms = MinMaxScaler()
mas = MaxAbsScaler()
rbs = RobustScaler()
qtf = QuantileTransformer()
ptf1 = PowerTransformer(method='yeo-johnson')
ptf2 = PowerTransformer(method='box-cox')   # box-cox requires strictly positive inputs
scalers = [sts, mms, mas, rbs, qtf, ptf1, ptf2]
for scaler in scalers:
    # Scale from the raw split each time; re-assigning x_train in place would
    # stack the scalers cumulatively, and box-cox would then fail on negative
    # values produced by earlier scalers.
    x_train_scaled = scaler.fit_transform(x_train)
    x_test_scaled = scaler.transform(x_test)
    model = RandomForestClassifier()
    model.fit(x_train_scaled, y_train)
    y_predict = model.predict(x_test_scaled)
    result = accuracy_score(y_test, y_predict)
    scale_name = scaler.__class__.__name__
    print('{0} result : {1:.4f}'.format(scale_name, result))
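The same comparison can also be written with a scikit-learn Pipeline, which bundles each scaler with the model so the transform is always fit on the training split only. A minimal sketch using the same data and scaler list as above:

# Hedged sketch: scaler comparison via sklearn Pipeline.
from sklearn.pipeline import make_pipeline

for scaler in scalers:
    pipe = make_pipeline(scaler, RandomForestClassifier())
    pipe.fit(x_train, y_train)
    print('{0} result : {1:.4f}'.format(scaler.__class__.__name__,
                                        pipe.score(x_test, y_test)))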