import re
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.experimental import enable_halving_search_cv  # noqa: F401 -- enables HalvingGridSearchCV import below
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit, train_test_split, HalvingGridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
def split_camel_case(cc_string):
"""
>>> split_camel_case('HTTP2Service')
['HTTP2', 'Service']
>>> split_camel_case('CellRangeA1Z99')
['Cell', 'Range', 'A1', 'Z99']
>>> split_camel_case('customerID')
['customer', 'ID']
"""
return re.split(r'(?<=\d)(?=\D)|(?<=[^A-Z\d])(?=[A-Z\d])|(?<!^)(?=[A-Z][a-z])', cc_string)
def camel_to_snake_case(cc_string):
""" camel_to_snake_case('customerID') -> 'customer_id' """
return '_'.join(split_camel_case(cc_string)).lower()
SEED = 42
CACHE_DIR = '_cache-ml-telecom-users' # `None` to disable
df_raw = pd.read_csv('data/telecom_users.csv', index_col=0, na_values=[' '])
df = df_raw.copy()
df = df.drop(columns='customerID') # (1)
df = df.fillna(0) # (2)
#df = df.sort_index() # (3)
yes_no_columns = [
'Partner',
'Dependents',
'PhoneService',
'MultipleLines',
'OnlineSecurity',
'OnlineBackup',
'DeviceProtection',
'TechSupport',
'StreamingTV',
'StreamingMovies',
'PaperlessBilling',
'Churn',
]
df[yes_no_columns] = df[yes_no_columns] == 'Yes' # (4)
df.SeniorCitizen = df.SeniorCitizen == 1
df = pd.get_dummies(df, dtype=bool) # (5)
df = df.drop(columns=['gender_Female'])
df['InternetService_No'] = ~df['InternetService_No']
column_names_mapping = {
**dict(zip(df_raw.columns, df_raw.columns.map(camel_to_snake_case))),
'gender_Male': 'is_male',
'InternetService_DSL': 'internet_dsl',
'InternetService_Fiber optic': 'internet_fiber',
'InternetService_No': 'internet_service',
'Contract_Month-to-month': 'contract_one_month',
'Contract_One year': 'contract_one_year',
'Contract_Two year': 'contract_two_year',
'PaymentMethod_Bank transfer (automatic)': 'pay_auto_transfer',
'PaymentMethod_Credit card (automatic)': 'pay_auto_credit',
'PaymentMethod_Electronic check': 'pay_check_email',
'PaymentMethod_Mailed check': 'pay_check_mail',
}
df = df.rename(columns=column_names_mapping) # (6)
middle_columns = df.columns.drop(['churn', 'tenure', 'monthly_charges', 'total_charges'])
df = df[['churn', *middle_columns, 'tenure', 'monthly_charges', 'total_charges']] # reorder columns
df.head(10).T
 | 1869 | 4528 | 6344 | 6739 | 432 | 2215 | 5260 | 6001 | 1480 | 5137 |
---|---|---|---|---|---|---|---|---|---|---|
churn | False | False | True | False | False | False | False | False | False | False |
senior_citizen | False | False | True | False | False | False | False | False | False | True |
partner | True | False | True | False | False | True | False | False | False | False |
dependents | True | False | False | False | False | False | False | False | False | False |
phone_service | True | True | True | True | True | False | True | False | False | True |
multiple_lines | True | False | True | False | False | False | True | False | False | True |
online_security | False | False | False | False | True | True | True | False | False | True |
online_backup | False | True | False | False | False | False | False | False | False | True |
device_protection | False | True | False | False | True | True | False | False | True | True |
tech_support | False | False | False | False | False | True | False | False | True | True |
streaming_tv | False | True | False | False | False | False | False | False | False | True |
streaming_movies | False | False | False | True | False | True | True | False | False | True |
paperless_billing | False | True | True | True | False | True | True | True | False | True |
is_male | True | False | False | True | True | False | False | False | True | True |
internet_dsl | False | False | False | True | True | True | False | True | True | False |
internet_fiber | False | True | True | False | False | False | True | False | False | True |
internet_service | False | True | True | True | True | True | True | True | True | True |
contract_one_month | False | True | True | True | True | False | True | True | False | True |
contract_one_year | False | False | False | False | False | False | False | False | True | False |
contract_two_year | True | False | False | False | False | True | False | False | False | False |
pay_auto_transfer | False | False | True | False | False | True | False | False | False | False |
pay_auto_credit | True | True | False | False | False | False | False | False | False | False |
pay_check_email | False | False | False | True | True | False | True | False | False | True |
pay_check_mail | False | False | False | False | False | False | False | True | True | False |
tenure | 72 | 44 | 38 | 4 | 2 | 70 | 33 | 1 | 39 | 55 |
monthly_charges | 24.1 | 88.15 | 74.95 | 55.9 | 53.45 | 49.85 | 90.65 | 24.9 | 35.55 | 116.5 |
total_charges | 1734.65 | 3973.2 | 2869.85 | 238.5 | 119.5 | 3370.2 | 2989.6 | 24.9 | 1309.15 | 6382.55 |
#df_shuffled = df.sample(frac=1, random_state=SEED)
X = df.drop('churn', axis=1)
y = df.churn
from sklearn.model_selection import TimeSeriesSplit
cross_val_score(GradientBoostingClassifier(random_state=SEED), X, y, n_jobs=-1,
                cv=TimeSeriesSplit(4), scoring='roc_auc')
array([0.84280425, 0.8279874 , 0.83119981, 0.83810758])
def drop_duplicates(iterable):
""" Drop duplicates from the iterable, keeping the order. Return the result as a list. """
return list(dict.fromkeys(iterable))
def new_sf_param_grids(**kwargs):
"""
Create semi-fixed parameter grids suited for sklearn's GridSearchCV function. In semi-fixed grid
all parameters except one are fixed (single value list).
>>> param_grid = new_sf_param_grids(
... learning_rate=[0.1, 0.001, 0.01, 1.0],
... loss=['deviance', 'exponential'],
... max_depth=[3, 1, 6],)
>>> param_grid
[{'learning_rate': [0.1], 'loss': ['deviance'], 'max_depth': [3]},
{'learning_rate': [0.001, 0.01, 1.0], 'loss': ['deviance'], 'max_depth': [3]},
{'learning_rate': [0.1], 'loss': ['exponential'], 'max_depth': [3]},
{'learning_rate': [0.1], 'loss': ['deviance'], 'max_depth': [1, 6]}]
>>> len(list(sklearn.model_selection.ParameterGrid(param_grid)))
7
"""
    defaults = {param: values[:1] for param, values in kwargs.items()}
param_grid = [defaults]
for param, values in kwargs.items():
unique_values = drop_duplicates(values)
if len(unique_values) > 1:
param_grid.append({**defaults, param: unique_values[1:]})
return param_grid
def print_score_improvement(sf_cv_results_, score_key='mean_test_score'):
"""
>>> print_score_improvement(sf_grid_search.cv_results_)
Default score improvement: 0.830807 -> 0.835398 (+0.004591)
"""
default = sf_cv_results_[score_key][0]
best = max(sf_cv_results_[score_key])
print(f'Default score improvement: {default:.6f} -> {best:.6f} (+{best - default:.6f})')
def sort_alphanum(s: pd.Series, num_first=True):
    """
    Sort a mixed-type series (strings + numerics + objects) with respect to numeric order.
    Compare:
    1. Lexicographically sorted strings: '1', '10', '100', '2', '3'
    2. Numerically sorted strings: '1', '2', '3', '10', '100'
    Numeric values, including numeric strings, are ordered numerically; non-numeric values
    are ordered by their string representations.
    >>> sort_alphanum(pd.Series(['10', '2', 'b', '1', 'a'])).tolist()
    ['1', '2', '10', 'a', 'b']
    """
    s = pd.Series(s)
    numeric_or_nan = pd.to_numeric(s, errors='coerce')
    num_idx = numeric_or_nan.sort_values().dropna().index
    alpha_idx = s[numeric_or_nan.isna()].astype(str).sort_values().index
    return s[num_idx.append(alpha_idx)] if num_first else s[alpha_idx.append(num_idx)]
def rotate_tick_labels(axis, rotation):
ha = 'center' if rotation == 0 else 'left' if rotation < 0 else 'right'
return axis.set_xticklabels(axis.get_xticklabels(), rotation=rotation, ha=ha)
def plot_sf_cv_results(sf_cv_results, defaults_index=0, score_column='mean_test_score', num_cols=3, ax_size=(4, 2),
sort_x=True, skip_single_value=True):
"""
Plot the results of semi-fixed grid search.
Parameters
----------
results_grid : dict or DataFrame
Either raw value of `cv_results_` property of the fitted GridSearchCV() object or DataFrame(cv_results_).
defaults_index : int, default=0
Which row corresponds to default hyperparameter values.
score_column : str, default='mean_test_score'
Which column contains the score.
num_cols : int, default=3
Number of columns for subplots arrangement.
ax_size : tuple, default=(4, 2)
Size of each subplot.
log_scale : bool, list of str, default 'auto'
Apply log scale to x-axis (value).
If True, apply to all subplots with numerical x-axis (int or float).
If a list, apply to subplots specified by parameter names (as in cv_results_).
If 'auto', try to apply log scale automatically where appropriate.
sort_x : bool, default=True
Sort values on x-axis.
skip_single_value : bool, default=True
Skip plots with single x value.
"""
if isinstance(sf_cv_results, pd.DataFrame):
results_grid = sf_cv_results.copy()
else:
results_grid = pd.DataFrame(sf_cv_results)
# Prepare data for plotting
param_columns = []
for col in results_grid.columns:
if col.startswith('param_'):
results_grid[col] = results_grid[col].astype(str)
if skip_single_value and results_grid[col].nunique(dropna=False) < 2:
continue
else:
param_columns.append(col)
    if len(param_columns) == 0:
        warnings.warn('No data to plot. Try: `skip_single_value=False`')
        return None
default_results = results_grid.loc[defaults_index, :]
default_score = default_results[score_column]
# Prepare axes
num_axes = len(param_columns)
num_rows = int(np.ceil(num_axes / num_cols))
figsize = np.array(ax_size) * (num_cols, num_rows)
fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=figsize, sharey=True, squeeze=False)
for ax in axes.ravel()[num_axes:]:
ax.remove()
# Colors
infinite_prop_cycler = plt.rcParams['axes.prop_cycle']()
# For each `param_...` column in cv_results_
for ax, param_name, prop in zip(axes.ravel(), param_columns, infinite_prop_cycler):
non_default_idx = results_grid[param_name] != default_results[param_name]
# In case of GridSearchCV cv_results_ `.head` (first entry) or `.tail` (last entry) is the same.
# In case of HalvingGridSearchCV `.tail` (entry from the latest iteration) should be used.
points = results_grid[non_default_idx].groupby(param_name).tail(1).append(default_results)\
.rename(columns={param_name: 'x', score_column: 'y'})[['x', 'y']]
if sort_x:
points = points.loc[sort_alphanum(points.x).index]
# Value-Score plot
ax.grid(zorder=0)
ax.plot(points.x, points.y, 'o-', lw=2, zorder=20, c=prop['color'])
# Default value
color = '#db0000'
default_value = default_results[param_name]
ax.axvline(default_value, c=color, lw=1, ls='--', zorder=10)
ax.axhline(default_score, c=color, lw=1, ls='--', zorder=10)
ax.scatter([default_value], [default_score],
s=60, zorder=30, label='default', facecolor='w', edgecolor=color)
# Illegal values
points_y_nan = points[points.y.isna()]
ax.scatter(points_y_nan.x, [default_score]*len(points_y_nan), marker='x',
s=40, zorder=40, c=color, label='illegal')
# Axis settings
ax.set_xlabel(param_name)
if ax in axes[:, 0]:
ax.set_ylabel('score')
ax.set_frame_on(False)
axes[0][min(num_axes, num_cols)-1].legend(loc='upper left', bbox_to_anchor=(1, 1), title='Parameter value')
offset = (results_grid[score_column].max() - default_score)*.9
if offset > 0:
axes[0][0].set_ylim(bottom=default_score - offset, top=results_grid[score_column].max() + offset)
plt.tight_layout(pad=3)
return axes
def plot_hyperparam_score(estimator, sf_grid, limit_samples=None, offset_samples=None, title=None, **kwargs):
#cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=SEED)
with warnings.catch_warnings():
warnings.simplefilter('ignore')
# sf_grid_search = HalvingGridSearchCV(estimator, sf_grid, scoring='roc_auc', n_jobs=-1, cv=4,
# error_score=np.nan, random_state=SEED) \
# .fit(X[offset_samples:][:limit_samples], y[offset_samples:][:limit_samples])
sf_grid_search = GridSearchCV(estimator, sf_grid, scoring='roc_auc', n_jobs=-1, cv=4,
error_score=np.nan) \
.fit(X[offset_samples:][:limit_samples], y[offset_samples:][:limit_samples])
results = sf_grid_search.cv_results_
# Restore parameters order as in grid
param_keys = [f'param_{name}' for name in sf_grid[0].keys()]
non_param_keys = [k for k in results.keys() if k not in param_keys]
results = {k: results[k] for k in (param_keys + non_param_keys)}
print_score_improvement(results)
axes = plot_sf_cv_results(results, **kwargs)
if title is not None:
plt.suptitle(str(title)[:80], size=20, y=1.0)
return sf_grid_search, axes
import sklearn
import sklearn.discriminant_analysis
import sklearn.ensemble
import sklearn.gaussian_process
import sklearn.linear_model
import sklearn.naive_bayes
import sklearn.neighbors
import sklearn.neural_network
import sklearn.svm
import sklearn.tree
import xgboost
from sklearn.preprocessing import RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import StackingClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
classifiers = {
"QDA" : sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis(),
"AdaBoost" : sklearn.ensemble.AdaBoostClassifier(),
"ExtraTrees" : sklearn.ensemble.ExtraTreesClassifier(),
"GradientBoosting" : sklearn.ensemble.GradientBoostingClassifier(),
"RandomForest" : sklearn.ensemble.RandomForestClassifier(),
"GaussianProcess" : sklearn.gaussian_process.GaussianProcessClassifier(copy_X_train=False, n_jobs=-1),
"LogisticRegression" : sklearn.linear_model.LogisticRegression(n_jobs=-1),
"SGD" : sklearn.linear_model.SGDClassifier(loss='log'),
"GaussianNB" : sklearn.naive_bayes.GaussianNB(), # <-------------- does not have hyperparameters
"MLP" : sklearn.neural_network.MLPClassifier(),
"KNeighbors" : sklearn.neighbors.KNeighborsClassifier(n_jobs=-1),
"SVC" : sklearn.svm.SVC(probability=True),
"NuSVC" : sklearn.svm.NuSVC(probability=True),
"DecisionTree" : sklearn.tree.DecisionTreeClassifier(),
"XGB" : xgboost.XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_jobs=-1,
verbosity=0),
}
numeric_columns = df.columns[df.dtypes.map(lambda t: np.issubdtype(t, np.floating) or np.issubdtype(t, np.integer))]
preprocessor = ColumnTransformer(
remainder='passthrough',
transformers=[('scaler', RobustScaler(), numeric_columns)]
)
# Classifier -> Pipeline
for k, clf in classifiers.items():
if 'random_state' in clf.get_params():
clf.set_params(random_state=SEED)
classifiers[k] = Pipeline([('pre', preprocessor), ('clf', clf)], memory=CACHE_DIR)
# QuadraticDiscriminantAnalysis()
sf_grid = new_sf_param_grids(
clf__reg_param=[.1, .0, 1.0, .1, .01, .001, .0001]#
)
plot_hyperparam_score(classifiers['QDA'], sf_grid);
Default score improvement: 0.835603 -> 0.835603 (+0.000000)
# AdaBoostClassifier()
sf_grid = new_sf_param_grids(
clf__random_state=[0, 1, 2, 3, 4, 5, 6],
clf__n_estimators=[50, 10, 25, 100, 200],
clf__learning_rate=[.5, 1.0, .1, .5, 2.],#
)
plot_hyperparam_score(classifiers['AdaBoost'], sf_grid);
Default score improvement: 0.844780 -> 0.844780 (+0.000000)
# ExtraTreesClassifier()
sf_grid = new_sf_param_grids(
clf__random_state=[3, 0, 1, 2, 3, 4, 5, 6],
clf__n_estimators=[1000, 50, 200, 1000],
# # clf__criterion=["gini", "entropy"],
# clf__criterion=["entropy"],
# # clf__max_depth=[6, None, 1, 3],
# # clf__min_samples_split=[2, 4, 20, 100, .001, .01, .1, .5],
# # clf__min_samples_split=[.01],
# # clf__min_samples_leaf=[1, 2, 10, 100, .001, .01, .1, .5],
# # clf__min_weight_fraction_leaf=[.0, .1, .5],
# # clf__max_features=['auto', 'sqrt', 'log2', None, 1, 10],
# # clf__max_leaf_nodes=[None, 2, 10, 100, 1000],
# # clf__min_impurity_decrease=[.0, .01, .1],
# # clf__bootstrap=[True],
# # clf__oob_score=[False, True],
# clf__class_weight=[None, 'balanced', 'balanced_subsample'],
# clf__ccp_alpha=[.001, .0, .1, .01, .001, .0001],#
clf__max_samples=[1000, None, .1, .5, 10, 50, 100, 500, 1000, 2000],#
# clf__max_samples=[1000],#
clf__min_samples_split=[.01],
# clf__max_features=[10],
clf__bootstrap=[True],
# clf__max_samples=[.5],
)
plot_hyperparam_score(classifiers['ExtraTrees'], sf_grid);
Default score improvement: 0.841103 -> 0.842785 (+0.001682)
The first plot (`param_random_state`) shows that, with these settings, the classifier is not stable.
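One way to quantify this instability is to score the same pipeline over several seeds; a minimal sketch, assuming the `X`, `y`, and `preprocessor` defined above:

# Illustrative only: mean CV AUC per seed, then the spread across seeds.
seed_scores = [
    cross_val_score(
        Pipeline([('pre', preprocessor),
                  ('clf', sklearn.ensemble.ExtraTreesClassifier(random_state=seed))]),
        X, y, cv=4, scoring='roc_auc', n_jobs=-1).mean()
    for seed in range(7)
]
print(f'AUC std across seeds: {np.std(seed_scores):.4f}')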
# GradientBoostingClassifier()
sf_grid = new_sf_param_grids(
clf__random_state=[0, 1, 2, 3, 4, 5, 6],
clf__loss=['deviance', 'exponential'],
clf__learning_rate=[0.1, 1.0, 0.01, 0.001, 2.0, 10.0],
clf__n_estimators=[100, 50, 200, 1000, 5000],
#clf__subsample=[1.0, .9, .5, .1],
clf__criterion=['friedman_mse', 'mse'],
clf__min_samples_split=[2, 4, 20, 100, .001, .01, .1, .5],
clf__min_samples_leaf=[.1, 1, 2, 10, 100, .001, .01, .1, .5],#
clf__min_weight_fraction_leaf=[.0, .1, .2, .5],
clf__max_depth=[3, 1, 2, 6, 10],
clf__min_impurity_decrease=[.0, .01, .1],
clf__init=[None, 'zero'],
#clf__max_features=[None, 'auto', 'sqrt', 'log2', 1, 10, .1, .5, .9],
clf__max_leaf_nodes=[None, 2, 5, 10],
clf__n_iter_no_change=[None, 1, 10, 100],
clf__tol=[1e-4, 1e-3, 1e-5],
clf__ccp_alpha=[.0, .1, .01, .001],
)
plot_hyperparam_score(classifiers['GradientBoosting'], sf_grid);
Default score improvement: 0.847148 -> 0.847232 (+0.000084)
# RandomForestClassifier()
sf_grid = new_sf_param_grids(
clf__random_state=[3, 0, 1, 2, 3, 4, 5, 6],
# clf__n_estimators=[1000, 100, 50, 200, 1000],
# clf__criterion=['entropy', 'gini', 'entropy'],#
# # clf__max_depth=[None, 1, 2, 3, 6, 10],
# clf__max_depth=[6],
# # clf__min_samples_split=[2, 4, 20, 100, .001, .01, .1, .5],
# # clf__min_samples_leaf=[1, 2, 10, 100, .001, .01, .1, .5],
# # clf__min_weight_fraction_leaf=[.0, .1, .2, .5],
# # clf__max_features=['auto', 'sqrt', 'log2', None, 1, 10, .1, .5, .9],
# # clf__max_leaf_nodes=[None, 2, 5, 10, 100, 1000],
# # clf__min_impurity_decrease=[.0, .01, .1],
# # clf__bootstrap=[True, False],
# # clf__oob_score=[False, True],
# # clf__class_weight=[None, 'balanced', 'balanced_subsample'],
# clf__class_weight=['balanced'],
# clf__ccp_alpha=[.001, .0, .1, .01, .001, .0001],#
# #clf__max_samples=[None, .01, .1, .5, .9, 1, 10],
clf__max_depth=[6],
clf__n_estimators=[5000],
clf__max_samples=[.1],
clf__min_samples_leaf=[2],
clf__min_samples_split=[.0001],
)
plot_hyperparam_score(classifiers['RandomForest'], sf_grid);
Default score improvement: 0.845574 -> 0.846040 (+0.000466)
# GaussianProcessClassifier()
sf_grid = new_sf_param_grids(
random_state=[0],
kernel=[
sklearn.gaussian_process.kernels.RBF(length_scale=1.0, length_scale_bounds="fixed"),
sklearn.gaussian_process.kernels.Matern(length_scale=1.0, nu=1.5, length_scale_bounds="fixed"),
sklearn.gaussian_process.kernels.RationalQuadratic(length_scale=1.0, alpha=1.0,
length_scale_bounds="fixed", alpha_bounds="fixed"),
sklearn.gaussian_process.kernels.ExpSineSquared(length_scale=1.0, periodicity=1.0,
length_scale_bounds="fixed", periodicity_bounds="fixed"),
sklearn.gaussian_process.kernels.DotProduct(sigma_0=1.0, sigma_0_bounds="fixed"),
]
)
plot_hyperparam_score(classifiers['GaussianProcess'], sf_grid, limit_samples=None)
plt.xticks(rotation=-15, ha='left')
pass
Default score improvement: 0.538889 -> 0.995807 (+0.456918)
Hyperparameter optimization is impractical for GaussianProcessClassifier(): the space of kernel functions is effectively unbounded (kernels can be combined), and fitting has $O(n^3)$ computational complexity.
An example of a combined kernel, taken from the scikit-learn User Guide:
34.4**2 * RBF(length_scale=41.8) + 3.27**2 * RBF(length_scale=180) * ExpSineSquared(length_scale=1.44, periodicity=1)
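The same kernel can be built directly with the kernel API (a sketch; the constants come from the User Guide example above, and scalar factors are wrapped into ConstantKernel automatically):

from sklearn.gaussian_process.kernels import RBF, ExpSineSquared
combined_kernel = (34.4**2 * RBF(length_scale=41.8)
                   + 3.27**2 * RBF(length_scale=180) * ExpSineSquared(length_scale=1.44, periodicity=1))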
# LogisticRegression()
sf_grid = new_sf_param_grids(
clf__random_state=[0, 1, 2, 3, 4, 5, 6],
clf__penalty=['elasticnet', 'l2', 'l1', 'elasticnet', None],#
clf__tol=[.001, 1e-4, 1e-5, 1e-3],#
clf__C=[2.0, 1.0, .1, .01, 2.0, 10.0, 100.0],#
clf__class_weight=[None, 'balanced'],
clf__solver=['saga', 'lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'],#
clf__max_iter=[1000, 100, 1000, 10000],#
clf__l1_ratio=[1.0, .0, .5, 1.0], # only available with penalty='elasticnet'
)
_dev_null, axes = plot_hyperparam_score(classifiers['LogisticRegression'], sf_grid, limit_samples=None)
rotate_tick_labels(axes[1][-1], -15)
pass
Default score improvement: 0.843245 -> 0.843245 (+0.000000)
%%time
# SGDClassifier(learning_rate='optimal')
sf_grid = new_sf_param_grids(
clf__random_state=[0, 1, 2, 3, 4, 5, 6],
clf__penalty=['elasticnet'],
clf__loss=['log', 'modified_huber', 'squared_hinge', 'perceptron'],
clf__alpha=[.01, .0001, .00001, .001, .01, .1],#
clf__l1_ratio=[0.15, 0.0, 1.0, 0.5, 0.75],
clf__max_iter=[1000, 10, 100, 10000],
clf__tol=[1e-3, 1e-2, 1e-4, 1e-5],
clf__shuffle=[True, False],
clf__learning_rate=['adaptive', 'optimal', 'constant', 'invscaling', 'adaptive'],#
clf__eta0=[.1, .01, .001, .1, 1.0, 10.0],#
clf__early_stopping=[True, False, True],#
clf__n_iter_no_change=[100, 5, 1, 50, 100, 1000],#
clf__class_weight=['balanced', None, 'balanced'],#
)
_dev_null, axis = plot_hyperparam_score(classifiers['SGD'], sf_grid)
rotate_tick_labels(axis[0][1], -15)
pass
Default score improvement: 0.839453 -> 0.842915 (+0.003462)
Wall time: 2min 19s
Observations:
- The score depends on `random_state`, which affects the choice of the starting point for gradient descent.
- `penalty='elasticnet'` is equivalent to `l1` when `l1_ratio=1` and to `l2` when `l1_ratio=0`.
- Both a constant step size (`learning_rate='constant'`) and several strategies for dynamically adjusting the step size during training are available (see the `param_clf__learning_rate` plot).
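A quick, hedged way to check the elasticnet endpoints (assuming `X` and `y` from above; the scores of each pair should essentially coincide):

# elasticnet mixes L1 and L2: l1_ratio=1 -> pure L1, l1_ratio=0 -> pure L2
for l1_ratio, pure_penalty in [(1.0, 'l1'), (0.0, 'l2')]:
    enet = sklearn.linear_model.SGDClassifier(loss='log', penalty='elasticnet',
                                              l1_ratio=l1_ratio, random_state=0)
    pure = sklearn.linear_model.SGDClassifier(loss='log', penalty=pure_penalty, random_state=0)
    print(pure_penalty,
          cross_val_score(enet, X, y, cv=4, scoring='roc_auc').mean(),
          cross_val_score(pure, X, y, cv=4, scoring='roc_auc').mean())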
# MLPClassifier()
sf_grid = new_sf_param_grids(
random_state=[0, 1, 2, 3, 4, 5, 6],
hidden_layer_sizes=[(100,), (10,), (50,), (200,), (10, 10), (5, 5, 5), (4, 4, 4, 4)],
activation=['relu', 'identity', 'logistic', 'tanh'],
solver=['adam', 'lbfgs', 'sgd'],
alpha=[1e-4, 0, 1e-5, 1e-3, 1e-2],
batch_size=[200, 20, 100, 400, 2000],
learning_rate_init=[.001, .00001, .0001, .01, .1, 1.],
max_iter=[200, 100, 400],
shuffle=[True, False],
tol=[1e-3, 1e-2, 1e-4, .1],
early_stopping=[False, True],
beta_1=[.9, .9999, .999, .99, .8],
beta_2=[.999, .9999, .9, .8],
n_iter_no_change=[10, 1, 50, 100],
)
_dev_null, axis = plot_hyperparam_score(classifiers['MLP'], sf_grid);
rotate_tick_labels(axis[0][1], -30)
pass
Default score improvement: 0.951363 -> 0.990566 (+0.039203)
The main hyperparameter, `hidden_layer_sizes`, controls the configuration of the network's hidden layers. Since the number of possible configurations is unbounded, exhaustive grid search is ineffective here.
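When the space of configurations is unbounded, sampling it is more practical than enumerating it; a minimal sketch with RandomizedSearchCV (the candidate architectures below are illustrative, not taken from the grid above):

from sklearn.model_selection import RandomizedSearchCV
arch_space = {'clf__hidden_layer_sizes': [(width,) * depth
                                          for width in (10, 50, 100, 200)
                                          for depth in (1, 2, 3)]}
mlp_search = RandomizedSearchCV(classifiers['MLP'], arch_space, n_iter=5,
                                scoring='roc_auc', cv=4, n_jobs=-1, random_state=SEED)
# mlp_search.fit(X, y)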
# KNeighborsClassifier()
sf_grid = new_sf_param_grids(
n_neighbors=[5, 4, 2, 1, 6, 10, 100, 1000, 3000],
weights=['uniform', 'distance'],
algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
leaf_size=[30, 15, 60],
p=[2, 1, 10],
)
_dev_null, axis = plot_hyperparam_score(classifiers['KNeighbors'], sf_grid);
rotate_tick_labels(axis[0][0], -30)
pass
Default score improvement: 0.961845 -> 0.972746 (+0.010901)
# SVC()
sf_grid = new_sf_param_grids(
random_state=[0, 1, 2, 3, 4],
C=[1.0, .1, 10],
kernel=['rbf', 'linear', 'poly', 'sigmoid'],
gamma=['scale', 'auto', .1, .5, .9],
shrinking=[True, False],
tol=[1e-3, 1e-2, 1e-4],
class_weight=[None, 'balanced'],
max_iter=[-1, 10, 100],
)
_dev_null, axis = plot_hyperparam_score(classifiers['SVC'], sf_grid);
rotate_tick_labels(axis[0][2], -15)
pass
Default score improvement: 0.973585 -> 0.996226 (+0.022642)
# NuSVC()
sf_grid = new_sf_param_grids(
clf__random_state=[0, 1, 2, 3, 4],
clf__nu=[.5, .1, .25, .75, 1.],
#clf__kernel=['rbf', 'linear', 'poly', 'sigmoid'],
clf__kernel=['linear'],
#clf__gamma=['scale', 'auto', .1, .5, .9],
clf__gamma=['auto'],
#clf__shrinking=[True, False],
#clf__tol=[1e-3, 1e-2, 1e-4],
#clf__class_weight=[None, 'balanced'],
clf__class_weight=['balanced'],
#clf__max_iter=[-1, 10, 100, 1000],
)
_dev_null, axis = plot_hyperparam_score(classifiers['NuSVC'], sf_grid);
#rotate_tick_labels(axis[0][2], -15)
pass
Default score improvement: 0.830539 -> 0.830539 (+0.000000)
# DecisionTreeClassifier()
sf_grid = new_sf_param_grids(
clf__random_state=[0, 1, 2, 3, 4, 5, 6],
clf__criterion=['gini', 'entropy'],
clf__splitter=['best', 'random'],
clf__max_depth=[None, 1, 2, 3, 6, 10],
clf__min_samples_split=[2, 4, 20, 100, .001, .01, .1, .5],
clf__min_samples_leaf=[1, 2, 10, 100, .001, .01, .1, .5],
clf__min_weight_fraction_leaf=[.0, .1, .2, .5, 1.0],
clf__max_features=[None, 'auto', 'sqrt', 'log2', 1, 10, .1, .5, .9],
clf__max_leaf_nodes=[None, 1, 2, 5, 10],
clf__min_impurity_decrease=[.0, .1, .01],
clf__class_weight=[None, 'balanced'],
clf__ccp_alpha=[.0, .1, .01, .001, .0001, 1., 10],
)
plot_hyperparam_score(classifiers['DecisionTree'], sf_grid)
pass
Default score improvement: 0.654751 -> 0.834899 (+0.180148)
%%time
# XGBClassifier()
sf_grid = new_sf_param_grids(
clf__random_state=[0, 1, 2, 3, 4],
clf__booster=['gbtree', 'gblinear', 'dart'],
clf__learning_rate=[.1, .001, .01, 1.0],
clf__gamma=[0, .01, .1, 1.0, 10],
clf__max_depth=[3, 1, 2, 6],
clf__min_child_weight=[1, 0, .1, 10, 100],
clf__max_delta_step=[0, .01, .1, 1, 2, 10],
clf__subsample=[1],
clf__colsample_bytree=[1],
clf__colsample_bylevel=[1],
clf__colsample_bynode=[1],
clf__reg_lambda=[1, 0, .01, .1, 2, 10, 100, 1000],
clf__reg_alpha=[0, .01, .1, 1, 2, 10, 100],
clf__tree_method=['auto', 'exact', 'approx', 'hist'],
clf__scale_pos_weight=[1, 0, 10, 100],
clf__num_parallel_tree=[1, 2, 10],
clf__n_estimators=[100, 10, 50, 200, 1000],
clf__eval_metric=['logloss', 'error', 'auc'],
)
hgs, axes = plot_hyperparam_score(classifiers['XGB'], sf_grid)
pass
Default score improvement: 0.844702 -> 0.847068 (+0.002367)
Wall time: 1min 20s
Default and boundary values of the parameters for `XGBClassifier()`.
# https://stackoverflow.com/a/52321479/
xgb_defaults = dict(
# General
booster='gbtree', # set: {'gbtree', 'gblinear', 'dart'}
verbosity=1, # set: {0, 1, 2, 3}
# Tree Booster
learning_rate=0.1, # float: [0, 1], alias: eta
gamma=0, # float: [0, inf), alias: min_split_loss
max_depth=3, # int
min_child_weight=1, # float: [0, inf)
max_delta_step=0, # int: [0, inf)
subsample=1, # float: (0, 1]
colsample_bytree=1, # float: (0, 1]
colsample_bylevel=1, # float: (0, 1]
colsample_bynode=1, # float: (0, 1]
reg_lambda=1, # float: [0, inf), alias: lambda
reg_alpha=0, # float: [0, inf), alias: alpha
tree_method='auto', # set: {'auto', 'exact', 'approx', 'hist', 'gpu_hist'}
scale_pos_weight=1, # float: [0, inf)
num_parallel_tree=1, # int: [1, inf)
monotone_constraints='()', # string
interaction_constraints='', # string, example: '[[0, 1], [2, 3, 4]]'
# Learning Task Parameters
objective='binary:logistic', # https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters
base_score=0.5,
random_state=0, # int, alias: seed
# Sklearn API specific
n_estimators=100, # int: [1, inf), boosting rounds, alias: num_round
n_jobs=1, # int: [1, inf)
missing=np.nan,
importance_type='gain', # set: {'gain', 'weight', 'cover', 'total_gain', 'total_cover'}
use_label_encoder=True, # should be False, deprecated
eval_metric='logloss', # set: {'logloss', 'error', 'auc', ...}
)
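Because these defaults drift between xgboost releases, here is a hedged sanity check against the installed version (it simply reports any mismatching keys):

actual = xgboost.XGBClassifier().get_params()
diff = {k: (v, actual.get(k))
        for k, v in xgb_defaults.items()
        if actual.get(k) != v and not (pd.isna(v) and pd.isna(actual.get(k)))}
print(diff or 'all defaults match the installed version')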