Hyperparameter optimization with semi-fixed grid search

In a semi-fixed grid, all hyperparameters except one are pinned to a chosen baseline value, so the search scans one parameter at a time around that baseline instead of enumerating the full Cartesian product.

Contents:

  1. Step-by-step optimization example
  2. Example grids for 14 classifiers
In [11]:
import re
import warnings

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.experimental import enable_halving_search_cv

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit, train_test_split, HalvingGridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
In [6]:
def split_camel_case(cc_string):
    """
    >>> split_camel_case('HTTP2Service')
    ['HTTP2', 'Service']
    
    >>> split_camel_case('CellRangeA1Z99')
    ['Cell', 'Range', 'A1', 'Z99']
    
    >>> split_camel_case('customerID')
    ['customer', 'ID']
    """
    return re.split(r'(?<=\d)(?=\D)|(?<=[^A-Z\d])(?=[A-Z\d])|(?<!^)(?=[A-Z][a-z])', cc_string)

def camel_to_snake_case(cc_string):
    """ camel_to_snake_case('customerID') -> 'customer_id' """
    return '_'.join(split_camel_case(cc_string)).lower()
In [392]:
SEED = 42
CACHE_DIR = '_cache-ml-telecom-users'  # `None` to disable
In [393]:
df_raw = pd.read_csv('data/telecom_users.csv', index_col=0, na_values=[' '])
df = df_raw.copy()
df = df.drop(columns='customerID')  # (1) drop the identifier column
df = df.fillna(0)                   # (2) blank values were read as NaN (na_values=[' ']), fill with 0
#df = df.sort_index()                # (3) optional: restore original row order

yes_no_columns = [
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'PaperlessBilling',
    'Churn',
]

df[yes_no_columns] = df[yes_no_columns] == 'Yes'  # (4) Yes/No columns -> booleans
df.SeniorCitizen = df.SeniorCitizen == 1
df = pd.get_dummies(df, dtype=bool)               # (5) one-hot encode the remaining categorical columns
df = df.drop(columns=['gender_Female'])           # drop one dummy of the binary pair
df['InternetService_No'] = ~df['InternetService_No']  # invert: True now means "has internet service"

column_names_mapping = {
    **dict(zip(df_raw.columns, df_raw.columns.map(camel_to_snake_case))),
    'gender_Male': 'is_male',
    'InternetService_DSL': 'internet_dsl',
    'InternetService_Fiber optic': 'internet_fiber',
    'InternetService_No': 'internet_service',
    'Contract_Month-to-month': 'contract_one_month',
    'Contract_One year': 'contract_one_year',
    'Contract_Two year': 'contract_two_year',
    'PaymentMethod_Bank transfer (automatic)': 'pay_auto_transfer',
    'PaymentMethod_Credit card (automatic)': 'pay_auto_credit',
    'PaymentMethod_Electronic check': 'pay_check_email',
    'PaymentMethod_Mailed check': 'pay_check_mail',
}

df = df.rename(columns=column_names_mapping)   # (6) rename everything to snake_case
middle_columns = df.columns.drop(['churn', 'tenure', 'monthly_charges', 'total_charges'])
df = df[['churn', *middle_columns, 'tenure', 'monthly_charges', 'total_charges']]  # reorder columns
df.head(10).T
Out[393]:
1869 4528 6344 6739 432 2215 5260 6001 1480 5137
churn False False True False False False False False False False
senior_citizen False False True False False False False False False True
partner True False True False False True False False False False
dependents True False False False False False False False False False
phone_service True True True True True False True False False True
multiple_lines True False True False False False True False False True
online_security False False False False True True True False False True
online_backup False True False False False False False False False True
device_protection False True False False True True False False True True
tech_support False False False False False True False False True True
streaming_tv False True False False False False False False False True
streaming_movies False False False True False True True False False True
paperless_billing False True True True False True True True False True
is_male True False False True True False False False True True
internet_dsl False False False True True True False True True False
internet_fiber False True True False False False True False False True
internet_service False True True True True True True True True True
contract_one_month False True True True True False True True False True
contract_one_year False False False False False False False False True False
contract_two_year True False False False False True False False False False
pay_auto_transfer False False True False False True False False False False
pay_auto_credit True True False False False False False False False False
pay_check_email False False False True True False True False False True
pay_check_mail False False False False False False False True True False
tenure 72 44 38 4 2 70 33 1 39 55
monthly_charges 24.1 88.15 74.95 55.9 53.45 49.85 90.65 24.9 35.55 116.5
total_charges 1734.65 3973.2 2869.85 238.5 119.5 3370.2 2989.6 24.9 1309.15 6382.55
In [395]:
#df_shuffled = df.sample(frac=1, random_state=SEED)
X = df.drop('churn', axis=1)
y = df.churn
In [53]:
from sklearn.model_selection import TimeSeriesSplit
In [58]:
cross_val_score(GradientBoostingClassifier(random_state=SEED), X, y, n_jobs=-1,
                cv=TimeSeriesSplit(4), scoring='roc_auc')
Out[58]:
array([0.84280425, 0.8279874 , 0.83119981, 0.83810758])
In [251]:
def drop_duplicates(iterable):
    """ Drop duplicates from the iterable, keeping the order. Return the result as a list. """
    return list(dict.fromkeys(iterable))


def new_sf_param_grids(**kwargs):
    """
    Create semi-fixed parameter grids suited for sklearn's GridSearchCV function. In semi-fixed grid
    all parameters except one are fixed (single value list).
    
    >>> param_grid = new_sf_param_grids(
    ...     learning_rate=[0.1, 0.001, 0.01, 1.0],
    ...     loss=['deviance', 'exponential'],
    ...     max_depth=[3, 1, 6],)
    
    >>> param_grid
    [{'learning_rate': [0.1], 'loss': ['deviance'], 'max_depth': [3]},
     {'learning_rate': [0.001, 0.01, 1.0], 'loss': ['deviance'], 'max_depth': [3]},
     {'learning_rate': [0.1], 'loss': ['exponential'], 'max_depth': [3]},
     {'learning_rate': [0.1], 'loss': ['deviance'], 'max_depth': [1, 6]}]
    
    >>> len(list(sklearn.model_selection.ParameterGrid(param_grid)))
    7
    """
    defaults = {param:values[:1] for param, values in kwargs.items()}
    param_grid = [defaults]
    for param, values in kwargs.items():
        unique_values = drop_duplicates(values)
        if len(unique_values) > 1:
            param_grid.append({**defaults, param: unique_values[1:]})
    return param_grid


def print_score_improvement(sf_cv_results_, score_key='mean_test_score'):
    """
    >>> print_score_improvement(sf_grid_search.cv_results_)
    Default score improvement: 0.830807 -> 0.835398 (+0.004591)
    """
    default = sf_cv_results_[score_key][0]
    best = np.nanmax(sf_cv_results_[score_key])  # NaN-safe: illegal values score NaN (error_score=np.nan)
    print(f'Default score improvement: {default:.6f} -> {best:.6f} (+{best - default:.6f})')
    

def sort_alphanum(s: pd.Series, num_first=True):
    """
    Sort a mixed-type series (strings + numerics + objects) respecting numeric order.
    1. Lexicographically sorted strings: '1', '10', '100', '2', '3'
    2. Numerically sorted strings: '1', '2', '3', '10', '100'
    Numeric values, including numeric strings, are ordered numerically; non-numeric
    values are ordered by their string representations.
    """
    s = pd.Series(s)
    numeric_or_nan = pd.to_numeric(s, errors='coerce')
    
    num_idx = numeric_or_nan.sort_values().dropna().index
    alpha_idx = s[numeric_or_nan.isna()].astype(str).sort_values().index
    return s[num_idx.append(alpha_idx)] if num_first else s[alpha_idx.append(num_idx)]


def rotate_tick_labels(axis, rotation):
    ha = 'center' if rotation == 0 else 'left' if rotation < 0 else 'right'
    return axis.set_xticklabels(axis.get_xticklabels(), rotation=rotation, ha=ha)


def plot_sf_cv_results(sf_cv_results, defaults_index=0, score_column='mean_test_score', num_cols=3, ax_size=(4, 2),
                       sort_x=True, skip_single_value=True):
    """
    Plot the results of semi-fixed grid search.
    
    Parameters
    ----------
    sf_cv_results : dict or DataFrame
        Either the raw `cv_results_` attribute of a fitted GridSearchCV() object or DataFrame(cv_results_).
        
    defaults_index : int, default=0
        Which row corresponds to default hyperparameter values.
    
    score_column : str, default='mean_test_score'
        Which column contains the score.
        
    num_cols : int, default=3
        Number of columns for subplots arrangement.
        
    ax_size : tuple, default=(4, 2)
        Size of each subplot.
        
    sort_x : bool, default=True
        Sort values on x-axis.
        
    skip_single_value : bool, default=True
        Skip plots with single x value.
    """
    if isinstance(sf_cv_results, pd.DataFrame):
        results_grid = sf_cv_results.copy()
    else:
        results_grid = pd.DataFrame(sf_cv_results)
    
    # Prepare data for plotting
    param_columns = []
    for col in results_grid.columns:
        if col.startswith('param_'):
            results_grid[col] = results_grid[col].astype(str)
            if skip_single_value and results_grid[col].nunique(dropna=False) < 2:
                continue
            else:
                param_columns.append(col)
    
    if len(param_columns) == 0:
        warnings.warn('No data to plot. Try: `skip_single_value=False`')
    
    default_results = results_grid.loc[defaults_index, :]
    default_score = default_results[score_column]
    
    # Prepare axes
    num_axes = len(param_columns)
    num_rows = int(np.ceil(num_axes / num_cols))
    figsize = np.array(ax_size) * (num_cols, num_rows)
    fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=figsize, sharey=True, squeeze=False)
    
    for ax in axes.ravel()[num_axes:]:
        ax.remove()
    
    # Colors
    infinite_prop_cycler = plt.rcParams['axes.prop_cycle']()
    
    # For each `param_...` column in cv_results_
    for ax, param_name, prop in zip(axes.ravel(), param_columns, infinite_prop_cycler):
        
        non_default_idx = results_grid[param_name] != default_results[param_name]
        
        # In case of GridSearchCV cv_results_ `.head` (first entry) or `.tail` (last entry) is the same.
        #   In case of HalvingGridSearchCV `.tail` (entry from the latest iteration) should be used.
        points = pd.concat([results_grid[non_default_idx].groupby(param_name).tail(1),
                            default_results.to_frame().T]) \
            .rename(columns={param_name: 'x', score_column: 'y'})[['x', 'y']]
        if sort_x:
            points = points.loc[sort_alphanum(points.x).index]
        
        # Value-Score plot
        ax.grid(zorder=0)
        ax.plot(points.x, points.y, 'o-', lw=2, zorder=20, c=prop['color'])
        
        # Default value
        color = '#db0000'
        default_value = default_results[param_name]
        ax.axvline(default_value, c=color, lw=1, ls='--', zorder=10)
        ax.axhline(default_score, c=color, lw=1, ls='--', zorder=10)
        ax.scatter([default_value], [default_score],
                   s=60, zorder=30, label='default', facecolor='w', edgecolor=color)
        
        # Illegal values
        points_y_nan = points[points.y.isna()]
        ax.scatter(points_y_nan.x, [default_score]*len(points_y_nan), marker='x',
                   s=40, zorder=40, c=color, label='illegal')
        
        # Axis settings
        ax.set_xlabel(param_name)
        if ax in axes[:, 0]:
            ax.set_ylabel('score')
        ax.set_frame_on(False)
        
    axes[0][min(num_axes, num_cols)-1].legend(loc='upper left', bbox_to_anchor=(1, 1), title='Parameter value')
    offset = (results_grid[score_column].max() - default_score)*.9
    if offset > 0:
        axes[0][0].set_ylim(bottom=default_score - offset, top=results_grid[score_column].max() + offset)
    plt.tight_layout(pad=3)
    
    return axes
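To illustrate the savings (a sketch reusing the example from the new_sf_param_grids docstring): the full Cartesian product contains 4 * 2 * 3 = 24 candidates, while the semi-fixed grid scans only 1 + 3 + 1 + 2 = 7.

from sklearn.model_selection import ParameterGrid

full_grid = {'learning_rate': [0.1, 0.001, 0.01, 1.0],
             'loss': ['deviance', 'exponential'],
             'max_depth': [3, 1, 6]}

len(list(ParameterGrid(full_grid)))                        # 24 candidates
len(list(ParameterGrid(new_sf_param_grids(**full_grid))))  # 7 candidates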
In [174]:
def plot_hyperparam_score(estimator, sf_grid, limit_samples=None, offset_samples=None, title=None, **kwargs):
    #cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=SEED)
    
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
#         sf_grid_search = HalvingGridSearchCV(estimator, sf_grid, scoring='roc_auc', n_jobs=-1, cv=4,
#                                              error_score=np.nan, random_state=SEED) \
#             .fit(X[offset_samples:][:limit_samples], y[offset_samples:][:limit_samples])
        sf_grid_search = GridSearchCV(estimator, sf_grid, scoring='roc_auc', n_jobs=-1, cv=4,
                                      error_score=np.nan) \
            .fit(X[offset_samples:][:limit_samples], y[offset_samples:][:limit_samples])
    
    results = sf_grid_search.cv_results_
    
    # Restore parameters order as in grid
    param_keys = [f'param_{name}' for name in sf_grid[0].keys()]
    non_param_keys = [k for k in results.keys() if k not in param_keys]
    results = {k: results[k] for k in (param_keys + non_param_keys)}
    
    print_score_improvement(results)
    axes = plot_sf_cv_results(results, **kwargs)
    if title is not None:
        plt.suptitle(str(title)[:80], size=20, y=1.0)
    return sf_grid_search, axes

Grids for the classifiers

In [19]:
import sklearn
import sklearn.discriminant_analysis
import sklearn.ensemble
import sklearn.gaussian_process
import sklearn.linear_model
import sklearn.naive_bayes
import sklearn.neural_network
import sklearn.neighbors
import sklearn.svm
import sklearn.tree

import xgboost
In [61]:
from sklearn.preprocessing import RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.model_selection import cross_val_score

from sklearn.ensemble import StackingClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
In [396]:
classifiers = {
    "QDA"                : sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis(),
    "AdaBoost"           : sklearn.ensemble.AdaBoostClassifier(),
    "ExtraTrees"         : sklearn.ensemble.ExtraTreesClassifier(),
    "GradientBoosting"   : sklearn.ensemble.GradientBoostingClassifier(),
    "RandomForest"       : sklearn.ensemble.RandomForestClassifier(),
    "GaussianProcess"    : sklearn.gaussian_process.GaussianProcessClassifier(copy_X_train=False, n_jobs=-1),
    "LogisticRegression" : sklearn.linear_model.LogisticRegression(n_jobs=-1),
    "SGD"                : sklearn.linear_model.SGDClassifier(loss='log'),
    "GaussianNB"         : sklearn.naive_bayes.GaussianNB(),  # <-------------- does not have hyperparameters
    "MLP"                : sklearn.neural_network.MLPClassifier(),
    "KNeighbors"         : sklearn.neighbors.KNeighborsClassifier(n_jobs=-1),
    "SVC"                : sklearn.svm.SVC(probability=True),
    "NuSVC"              : sklearn.svm.NuSVC(probability=True),
    "DecisionTree"       : sklearn.tree.DecisionTreeClassifier(),
    "XGB"                : xgboost.XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_jobs=-1,
                                                 verbosity=0),
}

numeric_columns = df.columns[df.dtypes.map(lambda t: np.issubdtype(t, np.floating) or np.issubdtype(t, np.integer))]

preprocessor = ColumnTransformer(
    remainder='passthrough',
    transformers=[('scaler', RobustScaler(), numeric_columns)]
)

# Classifier -> Pipeline
for k, clf in classifiers.items():
    if 'random_state' in clf.get_params():
        clf.set_params(random_state=SEED)
    classifiers[k] = Pipeline([('pre', preprocessor), ('clf', clf)], memory=CACHE_DIR)
In [307]:
# QuadraticDiscriminantAnalysis()

sf_grid = new_sf_param_grids(
    clf__reg_param=[.1, .0, 1.0, .1, .01, .001, .0001]#
)
plot_hyperparam_score(classifiers['QDA'], sf_grid);
Default score improvement: 0.835603 -> 0.835603 (+0.000000)
In [312]:
# AdaBoostClassifier()

sf_grid = new_sf_param_grids(
    clf__random_state=[0, 1, 2, 3, 4, 5, 6],
    clf__n_estimators=[50, 10, 25, 100, 200],
    clf__learning_rate=[.5, 1.0, .1, .5, 2.],#
)
plot_hyperparam_score(classifiers['AdaBoost'], sf_grid);
Default score improvement: 0.844780 -> 0.844780 (+0.000000)
In [391]:
# ExtraTreesClassifier()

sf_grid = new_sf_param_grids(
    clf__random_state=[3, 0, 1, 2, 3, 4, 5, 6],
    clf__n_estimators=[1000, 50, 200, 1000],
# #    clf__criterion=["gini", "entropy"],
#     clf__criterion=["entropy"],
# #    clf__max_depth=[6, None, 1, 3],
# #    clf__min_samples_split=[2, 4, 20, 100, .001, .01, .1, .5],
# #    clf__min_samples_split=[.01],
# #     clf__min_samples_leaf=[1, 2, 10, 100, .001, .01, .1, .5],
# #     clf__min_weight_fraction_leaf=[.0, .1, .5],
# #    clf__max_features=['auto', 'sqrt', 'log2', None, 1, 10],
# #     clf__max_leaf_nodes=[None, 2, 10, 100, 1000],
# #     clf__min_impurity_decrease=[.0, .01, .1],
# #     clf__bootstrap=[True],
# #     clf__oob_score=[False, True],
#    clf__class_weight=[None, 'balanced', 'balanced_subsample'],
#     clf__ccp_alpha=[.001, .0, .1, .01, .001, .0001],#
    clf__max_samples=[1000, None, .1, .5, 10, 50, 100, 500, 1000, 2000],#
#     clf__max_samples=[1000],#
    clf__min_samples_split=[.01],
#     clf__max_features=[10],
    clf__bootstrap=[True],
#     clf__max_samples=[.5],
)
plot_hyperparam_score(classifiers['ExtraTrees'], sf_grid);
Default score improvement: 0.841103 -> 0.842785 (+0.001682)

The first plot (param_random_state) shows that the classifier is unstable with these settings.
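
One way to quantify this instability (a sketch, not part of the original run; clone keeps the shared pipeline in classifiers untouched) is to score the pipeline under several seeds and inspect the spread:

from sklearn.base import clone

seed_scores = [
    cross_val_score(clone(classifiers['ExtraTrees']).set_params(clf__random_state=seed),
                    X, y, cv=4, scoring='roc_auc', n_jobs=-1).mean()
    for seed in range(7)
]
print(f'mean={np.mean(seed_scores):.4f}, std across seeds={np.std(seed_scores):.4f}')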

In [314]:
# GradientBoostingClassifier()

sf_grid = new_sf_param_grids(
    clf__random_state=[0, 1, 2, 3, 4, 5, 6],
    clf__loss=['deviance', 'exponential'],
    clf__learning_rate=[0.1, 1.0, 0.01, 0.001, 2.0, 10.0],
    clf__n_estimators=[100, 50, 200, 1000, 5000],
    #clf__subsample=[1.0, .9, .5, .1],
    clf__criterion=['friedman_mse', 'mse'],
    clf__min_samples_split=[2, 4, 20, 100, .001, .01, .1, .5],
    clf__min_samples_leaf=[.1, 1, 2, 10, 100, .001, .01, .1, .5],#
    clf__min_weight_fraction_leaf=[.0, .1, .2, .5],
    clf__max_depth=[3, 1, 2, 6, 10],
    clf__min_impurity_decrease=[.0, .01, .1],
    clf__init=[None, 'zero'],
    #clf__max_features=[None, 'auto', 'sqrt', 'log2', 1, 10, .1, .5, .9],
    clf__max_leaf_nodes=[None, 2, 5, 10],
    clf__n_iter_no_change=[None, 1, 10, 100],
    clf__tol=[1e-4, 1e-3, 1e-5],
    clf__ccp_alpha=[.0, .1, .01, .001],
)
plot_hyperparam_score(classifiers['GradientBoosting'], sf_grid);
Default score improvement: 0.847148 -> 0.847232 (+0.000084)
In [367]:
# RandomForestClassifier()

sf_grid = new_sf_param_grids(
    clf__random_state=[3, 0, 1, 2, 3, 4, 5, 6],
#     clf__n_estimators=[1000, 100, 50, 200, 1000],
#     clf__criterion=['entropy', 'gini', 'entropy'],#
# #    clf__max_depth=[None, 1, 2, 3, 6, 10],
#     clf__max_depth=[6],
# #     clf__min_samples_split=[2, 4, 20, 100, .001, .01, .1, .5],
# #     clf__min_samples_leaf=[1, 2, 10, 100, .001, .01, .1, .5],
# #     clf__min_weight_fraction_leaf=[.0, .1, .2, .5],
# #     clf__max_features=['auto', 'sqrt', 'log2', None, 1, 10, .1, .5, .9],
# #     clf__max_leaf_nodes=[None, 2, 5, 10, 100, 1000],
# #     clf__min_impurity_decrease=[.0, .01, .1],
# #     clf__bootstrap=[True, False],
# #     clf__oob_score=[False, True],
# #     clf__class_weight=[None, 'balanced', 'balanced_subsample'],
#     clf__class_weight=['balanced'],
#     clf__ccp_alpha=[.001, .0, .1, .01, .001, .0001],#
#     #clf__max_samples=[None, .01, .1, .5, .9, 1, 10],
    clf__max_depth=[6],
    clf__n_estimators=[5000],
    clf__max_samples=[.1],
    clf__min_samples_leaf=[2],
    clf__min_samples_split=[.0001],
)
plot_hyperparam_score(classifiers['RandomForest'], sf_grid);
Default score improvement: 0.845574 -> 0.846040 (+0.000466)
In [94]:
# GaussianProcessClassifier()

sf_grid = new_sf_param_grids(
    random_state=[0],
    kernel=[
        sklearn.gaussian_process.kernels.RBF(length_scale=1.0, length_scale_bounds="fixed"),
        sklearn.gaussian_process.kernels.Matern(length_scale=1.0, nu=1.5, length_scale_bounds="fixed"),
        sklearn.gaussian_process.kernels.RationalQuadratic(length_scale=1.0, alpha=1.0,
                                                           length_scale_bounds="fixed", alpha_bounds="fixed"),
        sklearn.gaussian_process.kernels.ExpSineSquared(length_scale=1.0, periodicity=1.0,
                                                        length_scale_bounds="fixed", periodicity_bounds="fixed"),
        sklearn.gaussian_process.kernels.DotProduct(sigma_0=1.0, sigma_0_bounds="fixed"),
    ]
)
plot_hyperparam_score(classifiers['GaussianProcess'], sf_grid, limit_samples=None)
plt.xticks(rotation=-15, ha='left')
pass
Default score improvement: 0.538889 -> 0.995807 (+0.456918)

Hyperparameter optimization is impractical for GaussianProcessClassifier(): the space of kernel functions is effectively unbounded (kernels can be combined arbitrarily), and the computational complexity is high, $O(n^3)$.

An example of a combined kernel, taken from the scikit-learn User Guide:

34.4**2 * RBF(length_scale=41.8) + 3.27**2 * RBF(length_scale=180) * ExpSineSquared(length_scale=1.44, periodicity=1)
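The same expression can be built with sklearn's kernel arithmetic, where + creates a Sum kernel, * a Product kernel, and scalar factors are wrapped into a ConstantKernel automatically (a sketch of the construction only, not used in the search above):

from sklearn.gaussian_process.kernels import RBF, ExpSineSquared

combined_kernel = (34.4**2 * RBF(length_scale=41.8)
                   + 3.27**2 * RBF(length_scale=180) * ExpSineSquared(length_scale=1.44, periodicity=1))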

In [306]:
# LogisticRegression()

sf_grid = new_sf_param_grids(
    clf__random_state=[0, 1, 2, 3, 4, 5, 6],
    clf__penalty=['elasticnet', 'l2', 'l1', 'elasticnet', None],#
    clf__tol=[.001, 1e-4, 1e-5, 1e-3],#
    clf__C=[2.0, 1.0, .1, .01, 2.0, 10.0, 100.0],#
    clf__class_weight=[None, 'balanced'],
    clf__solver=['saga', 'lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'],#
    clf__max_iter=[1000, 100, 1000, 10000],#
    clf__l1_ratio=[1.0, .0, .5, 1.0],  # only available with penalty='elasticnet'
)
_dev_null, axes = plot_hyperparam_score(classifiers['LogisticRegression'], sf_grid, limit_samples=None)
rotate_tick_labels(axes[1][-1], -15)
pass
Default score improvement: 0.843245 -> 0.843245 (+0.000000)
In [301]:
%%time

# SGDClassifier(learning_rate='optimal')

sf_grid = new_sf_param_grids(
    clf__random_state=[0, 1, 2, 3, 4, 5, 6],
    clf__penalty=['elasticnet'],
    clf__loss=['log', 'modified_huber', 'squared_hinge', 'perceptron'],
    clf__alpha=[.01, .0001, .00001, .001, .01, .1],#
    clf__l1_ratio=[0.15, 0.0, 1.0, 0.5, 0.75],
    clf__max_iter=[1000, 10, 100, 10000],
    clf__tol=[1e-3, 1e-2, 1e-4, 1e-5],
    clf__shuffle=[True, False],
    clf__learning_rate=['adaptive', 'optimal', 'constant', 'invscaling', 'adaptive'],#
    clf__eta0=[.1, .01, .001, .1, 1.0, 10.0],#
    clf__early_stopping=[True, False, True],#
    clf__n_iter_no_change=[100, 5, 1, 50, 100, 1000],#
    clf__class_weight=['balanced', None, 'balanced'],#
)
_dev_null, axis = plot_hyperparam_score(classifiers['SGD'], sf_grid)
rotate_tick_labels(axis[0][1], -15)
pass
Default score improvement: 0.839453 -> 0.842915 (+0.003462)
Wall time: 2min 19s
  1. Classification quality varies noticeably with random_state, which affects the starting point of gradient descent.
  2. penalty='elasticnet' is equivalent to l1 when l1_ratio=1 and to l2 when l1_ratio=0.
  3. The gradient descent step size (eta) sets the learning rate and is one of the most important parameters of this algorithm.
  4. This implementation supports both a constant step size, learning_rate='constant', and several strategies for adapting the step size during training, as the param_clf__learning_rate plot shows (see the sketch below).
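For reference, the two step-size regimes from note 4 differ only in the learning_rate / eta0 settings (a sketch; both parameters appear in the grid above):

from sklearn.linear_model import SGDClassifier

constant_step = SGDClassifier(loss='log', learning_rate='constant', eta0=0.1)  # eta stays at eta0
adaptive_step = SGDClassifier(loss='log', learning_rate='adaptive', eta0=0.1)  # eta is divided by 5 when the score stops improving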
In [102]:
# MLPClassifier()

sf_grid = new_sf_param_grids(
    random_state=[0, 1, 2, 3, 4, 5, 6],
    hidden_layer_sizes=[(100,), (10,), (50,), (200,), (10, 10), (5, 5, 5), (4, 4, 4, 4)],
    activation=['relu', 'identity', 'logistic', 'tanh'],
    solver=['adam', 'lbfgs', 'sgd'],
    alpha=[1e-4, 0, 1e-5, 1e-3, 1e-2],
    batch_size=[200, 20, 100, 400, 2000],
    learning_rate_init=[.001, .00001, .0001, .01, .1, 1.],
    max_iter=[200, 100, 400],
    shuffle=[True, False],
    tol=[1e-3, 1e-2, 1e-4, .1],
    early_stopping=[False, True],
    beta_1=[.9, .9999, .999, .99, .8],
    beta_2=[.999, .9999, .9, .8],
    n_iter_no_change=[10, 1, 50, 100],
)
_dev_null, axis = plot_hyperparam_score(classifiers['MLP'], sf_grid);
rotate_tick_labels(axis[0][1], -30)
pass
Default score improvement: 0.951363 -> 0.990566 (+0.039203)

The main hyperparameter, hidden_layer_sizes, defines the hidden-layer configuration of the neural network. Since the number of possible configurations is unbounded, exhaustive search is ineffective here.
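
One alternative when the space cannot be enumerated (a sketch under that assumption, not part of the original run; the candidate pool below is hypothetical) is to sample architectures with RandomizedSearchCV:

from sklearn.model_selection import RandomizedSearchCV

layer_candidates = [(width,) * depth for width in (10, 50, 100, 200) for depth in (1, 2, 3)]
mlp_search = RandomizedSearchCV(
    sklearn.neural_network.MLPClassifier(random_state=SEED),
    {'hidden_layer_sizes': layer_candidates},
    n_iter=6, scoring='roc_auc', cv=4, random_state=SEED, n_jobs=-1,
)
# mlp_search.fit(X, y)  # then inspect mlp_search.cv_results_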

In [104]:
# KNeighborsClassifier()

sf_grid = new_sf_param_grids(
    n_neighbors=[5, 4, 2, 1, 6, 10, 100, 1000, 3000],
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    leaf_size=[30, 15, 60],
    p=[2, 1, 10],
)
_dev_null, axis = plot_hyperparam_score(classifiers['KNeighbors'], sf_grid);
rotate_tick_labels(axis[0][0], -30)
pass
Default score improvement: 0.961845 -> 0.972746 (+0.010901)
In [108]:
# SVC()

sf_grid = new_sf_param_grids(
    random_state=[0, 1, 2, 3, 4],
    C=[1.0, .1, 10],
    kernel=['rbf', 'linear', 'poly', 'sigmoid'],
    gamma=['scale', 'auto', .1, .5, .9],
    shrinking=[True, False],
    tol=[1e-3, 1e-2, 1e-4],
    class_weight=[None, 'balanced'],
    max_iter=[-1, 10, 100],
)
_dev_null, axis = plot_hyperparam_score(classifiers['SVC'], sf_grid);
rotate_tick_labels(axis[0][2], -15)
pass
Default score improvement: 0.973585 -> 0.996226 (+0.022642)
In [400]:
# NuSVC()

sf_grid = new_sf_param_grids(
    clf__random_state=[0, 1, 2, 3, 4],
    clf__nu=[.5, .1, .25, .75, 1.],
    #clf__kernel=['rbf', 'linear', 'poly', 'sigmoid'],
    clf__kernel=['linear'],
    #clf__gamma=['scale', 'auto', .1, .5, .9],
    clf__gamma=['auto'],
    #clf__shrinking=[True, False],
    #clf__tol=[1e-3, 1e-2, 1e-4],
    #clf__class_weight=[None, 'balanced'],
    clf__class_weight=['balanced'],
    #clf__max_iter=[-1, 10, 100, 1000],
)
_dev_null, axis = plot_hyperparam_score(classifiers['NuSVC'], sf_grid);
#rotate_tick_labels(axis[0][2], -15)
pass
Default score improvement: 0.830539 -> 0.830539 (+0.000000)
In [279]:
# DecisionTreeClassifier()

sf_grid = new_sf_param_grids(
    clf__random_state=[0, 1, 2, 3, 4, 5, 6],
    clf__criterion=['gini', 'entropy'],
    clf__splitter=['best', 'random'],
    clf__max_depth=[None, 1, 2, 3, 6, 10],
    clf__min_samples_split=[2, 4, 20, 100, .001, .01, .1, .5],
    clf__min_samples_leaf=[1, 2, 10, 100, .001, .01, .1, .5],
    clf__min_weight_fraction_leaf=[.0, .1, .2, .5, 1.0],
    clf__max_features=[None, 'auto', 'sqrt', 'log2', 1, 10, .1, .5, .9],
    clf__max_leaf_nodes=[None, 1, 2, 5, 10],
    clf__min_impurity_decrease=[.0, .1, .01],
    clf__class_weight=[None, 'balanced'],
    clf__ccp_alpha=[.0, .1, .01, .001, .0001, 1., 10],
)
plot_hyperparam_score(classifiers['DecisionTree'], sf_grid)
pass
Default score improvement: 0.654751 -> 0.834899 (+0.180148)
In [280]:
%%time
# XGBClassifier()

sf_grid = new_sf_param_grids(
    clf__random_state=[0, 1, 2, 3, 4],
    clf__booster=['gbtree', 'gblinear', 'dart'],
    clf__learning_rate=[.1, .001, .01, 1.0],
    clf__gamma=[0, .01, .1, 1.0, 10],
    clf__max_depth=[3, 1, 2, 6],
    clf__min_child_weight=[1, 0, .1, 10, 100],
    clf__max_delta_step=[0, .01, .1, 1, 2, 10],
    clf__subsample=[1],
    clf__colsample_bytree=[1],
    clf__colsample_bylevel=[1],
    clf__colsample_bynode=[1],
    clf__reg_lambda=[1, 0, .01, .1, 2, 10, 100, 1000],
    clf__reg_alpha=[0, .01, .1, 1, 2, 10, 100],
    clf__tree_method=['auto', 'exact', 'approx', 'hist'],
    clf__scale_pos_weight=[1, 0, 10, 100],
    clf__num_parallel_tree=[1, 2, 10],
    clf__n_estimators=[100, 10, 50, 200, 1000],
    clf__eval_metric=['logloss', 'error', 'auc'],
)
hgs, axes = plot_hyperparam_score(classifiers['XGB'], sf_grid)
pass
Default score improvement: 0.844702 -> 0.847068 (+0.002367)
Wall time: 1min 20s

xgboost.XGBClassifier() default parameter values

Default values and valid ranges for XGBClassifier() parameters.

In [183]:
# https://stackoverflow.com/a/52321479/

xgb_defaults = dict(
    # General
    booster='gbtree',             # set: {'gbtree', 'gblinear', 'dart'}
    verbosity=1,                  # set: {0, 1, 2, 3}

    # Tree Booster
    learning_rate=0.1,            # float: [0, 1], alias: eta
    gamma=0,                      # float: [0, inf), alias: min_split_loss
    max_depth=3,                  # int
    min_child_weight=1,           # float: [0, inf)
    max_delta_step=0,             # int: [0, inf)
    subsample=1,                  # float: (0, 1]
    colsample_bytree=1,           # float: (0, 1]
    colsample_bylevel=1,          # float: (0, 1]
    colsample_bynode=1,           # float: (0, 1]
    reg_lambda=1,                 # float: [0, inf), alias: lambda
    reg_alpha=0,                  # float: [0, inf), alias: alpha
    tree_method='auto',           # set: {'auto', 'exact', 'approx', 'hist', 'gpu_hist'}
    scale_pos_weight=1,           # float: [0, inf)
    num_parallel_tree=1,          # int: [1, inf)
    monotone_constraints='()',    # string
    interaction_constraints='',   # string, example: '[[0, 1], [2, 3, 4]]'

    # Learning Task Parameters
    objective='binary:logistic',  # https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters
    base_score=0.5,
    random_state=0,               # int, alias: seed

    # Sklearn API specific
    n_estimators=100,             # int: [1, inf), boosting rounds, alias: num_round
    n_jobs=1,                     # int: [1, inf)
    missing=np.nan,
    importance_type='gain',       # set: {'gain', 'weight', 'cover', 'total_gain', 'total_cover'}
    use_label_encoder=True,       # should be False, deprecated
    eval_metric='logloss',        # set: {'logloss', 'error', 'auc', ...}
)
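These values can be passed back verbatim to make the defaults explicit, e.g. as a fixed baseline for a semi-fixed grid (a sketch):

xgb_baseline = xgboost.XGBClassifier(**xgb_defaults)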