Python/AI

AI (SelectFromModel / Optuna)

아리빠 2023. 5. 17. 16:33

●SelectFromModel

SelectFromModel keeps only the features whose importance meets a given threshold. Below, each feature importance from a trained XGBRegressor is tried in turn as that threshold.

from xgboost import XGBRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import r2_score

#2. Model
model = XGBRegressor(random_state=123, n_estimators=1000,
                     learning_rate=0.1, max_depth=6, gamma=1)

#3. Training
model.fit(x_train, y_train, early_stopping_rounds=200,
          eval_set=[(x_train, y_train), (x_test, y_test)],
          eval_metric='rmse')

#4. Evaluation & prediction
result = model.score(x_test, y_test)
print('r2 : ', result)

y_predict = model.predict(x_test)
r2 = r2_score(y_test, y_predict)
print('final test r2 : ', r2)

print(model.feature_importances_)
thresholds = model.feature_importances_

print("=========== SelectFromModel ===============")
for thresh in thresholds:
    # Keep only the features whose importance is >= thresh
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_x_train = selection.transform(x_train)
    select_x_test = selection.transform(x_test)
    print(select_x_train.shape, select_x_test.shape)

    # Retrain a fresh model on the reduced feature set
    selection_model = XGBRegressor(n_jobs=-1,
                                   random_state=123,
                                   n_estimators=1000,
                                   learning_rate=0.1,
                                   max_depth=6,
                                   gamma=1)
    selection_model.fit(select_x_train, y_train)
    y_predict = selection_model.predict(select_x_test)
    score = r2_score(y_test, y_predict)
    print("Thresh=%.3f, n=%d, R2:%.2f%%"
          % (thresh, select_x_train.shape[1], score*100))
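
SelectFromModel also accepts string thresholds such as 'median' or 'mean', and it drops straight into a scikit-learn Pipeline. A minimal sketch (not from the original code; reuses x_train/y_train from above):

from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from xgboost import XGBRegressor

pipe = Pipeline([
    # keep the features whose importance is above the median importance
    ('select', SelectFromModel(XGBRegressor(random_state=123), threshold='median')),
    ('model', XGBRegressor(random_state=123)),
])
pipe.fit(x_train, y_train)
print('r2 : ', pipe.score(x_test, y_test))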

●Optuna

Optuna is a hyperparameter optimization framework: an objective function returns a score for each trial, and a sampler (TPE below) proposes the next set of parameters to try.

import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
'''
optuna.trial.Trial.suggest_categorical()      : picks a value from a given list of choices.
optuna.trial.Trial.suggest_int()              : picks an integer within a range.
optuna.trial.Trial.suggest_float()            : picks a float within a range.
optuna.trial.Trial.suggest_uniform()          : picks a value from a uniform distribution over a range.
optuna.trial.Trial.suggest_discrete_uniform() : picks a value from a discrete uniform distribution over a range.
optuna.trial.Trial.suggest_loguniform()       : picks a value from a log-uniform distribution over a range.
(In recent Optuna versions the last three are deprecated in favor of suggest_float(..., log=True) or suggest_float(..., step=...).)
'''
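
A minimal sketch of the suggest API (the parameter names and the toy return value are made up for illustration):

def objective(trial):
    booster = trial.suggest_categorical('booster', ['gbtree', 'dart'])  # pick one from a list
    depth = trial.suggest_int('depth', 2, 10)                           # integer in [2, 10]
    lr = trial.suggest_float('lr', 1e-4, 1e-1, log=True)                # float sampled on a log scale
    # toy objective: smaller is better, so the study would use direction='minimize'
    return (depth - 6) ** 2 + lr + (1 if booster == 'dart' else 0)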
def objectiveCAT(trial: Trial, x_train, y_train, x_test, y_test):
    param = {
        'n_estimators' : trial.suggest_int('n_estimators', 500, 4000),
        'depth' : trial.suggest_int('depth', 8, 16),
        'fold_permutation_block' : trial.suggest_int('fold_permutation_block', 1, 256),
        'learning_rate' : trial.suggest_float('learning_rate', 0, 1),
        'od_pval' : trial.suggest_float('od_pval', 0, 1),
        'l2_leaf_reg' : trial.suggest_float('l2_leaf_reg', 0, 4),
        'random_state' : trial.suggest_int('random_state', 1, 2000)
    }
    # Build and train the model
    model = CatBoostClassifier(**param)
    CAT_model = model.fit(x_train, y_train, verbose=True)
    # Check model performance
    score = accuracy_score(y_test, CAT_model.predict(x_test))
    return score
# Optimize in the direction that maximizes the score (accuracy here)
# TPESampler : Sampler using TPE (Tree-structured Parzen Estimator) algorithm
study = optuna.create_study(direction='maximize', sampler=TPESampler())

# If n_trials is not specified, optimization runs indefinitely
study.optimize(lambda trial : objectiveCAT(trial, x_train, y_train, x_test, y_test), n_trials=5)
print('Best trial : score {}, \nparams {}'.format(study.best_trial.value, study.best_trial.params))

# Graph of the relative importance of each hyperparameter (returns a plotly figure)
optuna.visualization.plot_param_importances(study).show()
# History of the optimization process
optuna.visualization.plot_optimization_history(study).show()
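
Once the study finishes, study.best_params holds the winning parameter set and can be fed straight back into the model for a final fit. A short sketch reusing the names above:

best_model = CatBoostClassifier(**study.best_params)
best_model.fit(x_train, y_train, verbose=False)
print('final test acc : ', accuracy_score(y_test, best_model.predict(x_test)))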

●Code practice

import numpy as np
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
tf.random.set_seed(77) # fix the random seed for the weights

#1. Data
datasets = fetch_california_housing()
x = datasets['data']
y = datasets.target

x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, shuffle=True, random_state=42
)

# kfold
n_splits = 11
random_state = 42
kfold = KFold(n_splits=n_splits, shuffle=True, 
              random_state=random_state)

# Apply scaler
scaler = MinMaxScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

#2. Model
from xgboost import XGBRegressor
model = XGBRegressor(random_state=123, 
                    n_estimators=1000,
                    learning_rate = 0.1, 
                    max_depth = 6, 
                    gamma= 1)

#3. Training
model.fit(x_train, y_train,
          early_stopping_rounds=1000,
          eval_set = [(x_train, y_train), (x_test, y_test)],
          eval_metric='rmse')
          # eval_metric for regression  : rmse, mae, rmsle...
          #   binary classification     : error, auc, logloss...
          #   multiclass classification : merror, mlogloss...

#4. Evaluation & prediction
score = cross_val_score(model, 
                        x_train, y_train, 
                        cv=kfold) # cv : cross validation
y_predict = cross_val_predict(model,
                              x_test, y_test,
                              cv=kfold)
r2 = r2_score(y_test, y_predict)
print('cv pred r2 : ', r2)

# cv pred r2 :  0.7788353667042689

# SelectFromModel
from sklearn.feature_selection import SelectFromModel

thresholds = model.feature_importances_

for thresh in thresholds:
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_x_train = selection.transform(x_train)
    select_x_test = selection.transform(x_test)
    print(select_x_train.shape, select_x_test.shape)

    selection_model = XGBRegressor()
    selection_model.fit(select_x_train, y_train)
    y_predict = selection_model.predict(select_x_test)
    score = r2_score(y_test, y_predict)
    print("Thresh=%.3f, n=%d, R2:%.2f%%"
        %(thresh, select_x_train.shape[1], score*100))
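
The loop above only prints each score; to actually keep the winning subset, it can track the best threshold and the selected column mask. A sketch using the names already defined:

best = {'score': -np.inf, 'thresh': None, 'mask': None}
for thresh in thresholds:
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_x_train = selection.transform(x_train)
    select_x_test = selection.transform(x_test)
    selection_model = XGBRegressor()
    selection_model.fit(select_x_train, y_train)
    score = r2_score(y_test, selection_model.predict(select_x_test))
    if score > best['score']:
        best = {'score': score, 'thresh': thresh, 'mask': selection.get_support()}
print('best thresh=%.3f, r2=%.4f, columns=%s' % (best['thresh'], best['score'], best['mask']))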

import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler, StandardScaler

import tensorflow as tf
tf.random.set_seed(77) # fix the random seed for the weights

#1. Data
datasets = load_iris()
x = datasets['data']
y = datasets.target

x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, shuffle=True, random_state=42
)

# kfold
n_splits = 11
random_state = 42
kfold = KFold(n_splits=n_splits, shuffle=True, 
              random_state=random_state)

# Apply scaler
scaler = MinMaxScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

#2. Model
from xgboost import XGBClassifier
model = XGBClassifier()

#3. Training
model.fit(x_train, y_train, early_stopping_rounds=100,
          eval_set = [(x_train, y_train), (x_test, y_test)],
          eval_metric='merror')

#4. Evaluation & prediction
score = cross_val_score(model, 
                        x_train, y_train, 
                        cv=kfold) # cv : cross validation
# print('cv acc : ', score)
y_predict = cross_val_predict(model,
                              x_test, y_test,
                              cv=kfold)
# print('cv pred : ', y_predict)
acc = accuracy_score(y_test, y_predict)
print('cv pred acc : ', acc)

# cv pred acc :  0.8666666666666667

# SelectFromModel
from sklearn.feature_selection import SelectFromModel

thresholds = model.feature_importances_

for thresh in thresholds:
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_x_train = selection.transform(x_train)
    select_x_test = selection.transform(x_test)
    print(select_x_train.shape, select_x_test.shape)

    selection_model = XGBClassifier()
    selection_model.fit(select_x_train, y_train)
    y_predict = selection_model.predict(select_x_test)
    score = accuracy_score(y_test, y_predict)
    print("Thresh=%.3f, n=%d, ACC:%.2f%%"
        %(thresh, select_x_train.shape[1], score*100))

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
tf.random.set_seed(77) # fix the random seed for the weights

#1. Data
datasets = fetch_california_housing()
x = datasets['data']
y = datasets.target

x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, shuffle=True, random_state=42
)

# kfold
n_splits = 5
random_state = 42
kfold = KFold(n_splits=n_splits, shuffle=True, 
              random_state=random_state)    # KFold for regression / StratifiedKFold for classification
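# Note: for a classification target, StratifiedKFold keeps the class ratios
# consistent across folds; a drop-in alternative (left commented out, since
# this script is regression) would be:
# from sklearn.model_selection import StratifiedKFold
# kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)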

# Apply scaler
scaler = MinMaxScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler
from sklearn.metrics import mean_absolute_error
from catboost import CatBoostRegressor
import matplotlib.pyplot as plt

def objectiveCAT(trial: Trial, x_train, y_train, x_test, y_test):
    param = {
        'n_estimators' : trial.suggest_int('n_estimators', 500, 4000),
        'depth' : trial.suggest_int('depth', 8, 16),
        'fold_permutation_block' : trial.suggest_int('fold_permutation_block', 1, 256),
        'learning_rate' : trial.suggest_float('learning_rate', 0, 1),
        'od_pval' : trial.suggest_float('od_pval', 0, 1),
        'l2_leaf_reg' : trial.suggest_float('l2_leaf_reg', 0, 4),
        'random_state' :trial.suggest_int('random_state', 1, 2000)
    }
    # Build and train the model
    model = CatBoostRegressor(**param)
    CAT_model = model.fit(x_train, y_train, verbose=True)
    # Check model performance
    score = r2_score(y_test, CAT_model.predict(x_test))
    return score

# Optimize in the direction that maximizes R2
# TPESampler : Sampler using TPE (Tree-structured Parzen Estimator) algorithm.
study = optuna.create_study(direction='maximize', sampler=TPESampler())

# If n_trials is not specified, optimization runs indefinitely
study.optimize(lambda trial : objectiveCAT(trial, x_train, y_train, x_test, y_test), n_trials=5)
print('Best trial : score {}, \nparams {}'.format(study.best_trial.value,
                                                  study.best_trial.params))

# Graph of the relative importance of each hyperparameter (returns a plotly figure)
optuna.visualization.plot_param_importances(study).show()
# History of the optimization process
optuna.visualization.plot_optimization_history(study).show()

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, RobustScaler
from sklearn.preprocessing import QuantileTransformer, PowerTransformer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings('ignore')
import time


#1. Data
datasets = load_iris()
x, y = datasets.data, datasets.target
print(x.shape, y.shape)     # (150, 4) (150,)


x_train, x_test, y_train, y_test = train_test_split(
    x, y, shuffle=True, random_state=72, train_size=0.8
)

# Scaling
sts = StandardScaler() 
mms = MinMaxScaler()
mas = MaxAbsScaler()
rbs = RobustScaler()
qtf = QuantileTransformer()                     # QuantileTransformer maps the data onto the given quantiles.
                                                # The default is 1,000 quantiles; change it via the n_quantiles parameter.
ptf1 = PowerTransformer(method='yeo-johnson')   # 'yeo-johnson' works with positive and negative values
ptf2 = PowerTransformer(method='box-cox')       # 'box-cox' works only with strictly positive values

scalers = [sts, mms, mas, rbs, qtf, ptf1, ptf2]
for scaler in scalers:
    # Note: x_train/x_test are overwritten each pass, so every scaler after the
    # first is fit on data that the previous scalers already transformed.
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)
    model = RandomForestClassifier()
    model.fit(x_train, y_train)
    y_predict = model.predict(x_test)
    result = accuracy_score(y_test, y_predict)
    scale_name = scaler.__class__.__name__
    print('{0} result : {1:.4f}'.format(scale_name, result))
    

#=================================== Results ==================================#
# StandardScaler result : 1.0000
# MinMaxScaler result : 1.0000
# MaxAbsScaler result : 1.0000
# RobustScaler result : 1.0000
# QuantileTransformer result : 1.0000
# PowerTransformer result : 1.0000
# ValueError: The Box-Cox transformation can only be applied to strictly positive data
# (the chained scaling above leaves zeros/negative values in the data, so Box-Cox fails)
#==============================================================================#
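
Because each scaler in the loop is fit on the output of the previous one, Box-Cox eventually receives zeros or negative values and raises the error above. A sketch that evaluates every scaler independently on the untouched split (assumes x_train/x_test are fresh from train_test_split):

for scaler in scalers:
    try:
        x_tr = scaler.fit_transform(x_train)   # fit on the original training split
        x_te = scaler.transform(x_test)
        model = RandomForestClassifier()
        model.fit(x_tr, y_train)
        result = accuracy_score(y_test, model.predict(x_te))
        print('{0} result : {1:.4f}'.format(scaler.__class__.__name__, result))
    except ValueError as e:                    # e.g. Box-Cox on non-positive data
        print('{0} failed : {1}'.format(scaler.__class__.__name__, e))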