Hyperparameter optimization with semi-fixed grid search

In a semi-fixed grid, all hyperparameters except one are pinned to a chosen baseline value, so the search scans one parameter at a time around that baseline instead of enumerating the full Cartesian product.

Contents:

  1. Step-by-step optimization example
  2. Example grids for 14 classifiers
In [11]:
import re
import warnings

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.experimental import enable_halving_search_cv

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit, train_test_split, HalvingGridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
In [6]:
def split_camel_case(cc_string):
    """
    >>> split_camel_case('HTTP2Service')
    ['HTTP2', 'Service']
    
    >>> split_camel_case('CellRangeA1Z99')
    ['Cell', 'Range', 'A1', 'Z99']
    
    >>> split_camel_case('customerID')
    ['customer', 'ID']
    """
    return re.split(r'(?<=\d)(?=\D)|(?<=[^A-Z\d])(?=[A-Z\d])|(?<!^)(?=[A-Z][a-z])', cc_string)

def camel_to_snake_case(cc_string):
    """ camel_to_snake_case('customerID') -> 'customer_id' """
    return '_'.join(split_camel_case(cc_string)).lower()
In [392]:
SEED = 42
CACHE_DIR = '_cache-ml-telecom-users'  # `None` to disable
In [393]:
df_raw = pd.read_csv('data/telecom_users.csv', index_col=0, na_values=[' '])
df = df_raw.copy()
df = df.drop(columns='customerID')  # (1) drop the identifier column
df = df.fillna(0)                   # (2) blank values were read as NaN (na_values=[' ']), fill with 0
#df = df.sort_index()                # (3) optional: restore original row order

yes_no_columns = [
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'PaperlessBilling',
    'Churn',
]

df[yes_no_columns] = df[yes_no_columns] == 'Yes'  # (4) Yes/No columns -> booleans
df.SeniorCitizen = df.SeniorCitizen == 1
df = pd.get_dummies(df, dtype=bool)               # (5) one-hot encode the remaining categorical columns
df = df.drop(columns=['gender_Female'])           # drop one dummy of the binary pair
df['InternetService_No'] = ~df['InternetService_No']  # invert: True now means "has internet service"

column_names_mapping = {
    **dict(zip(df_raw.columns, df_raw.columns.map(camel_to_snake_case))),
    'gender_Male': 'is_male',
    'InternetService_DSL': 'internet_dsl',
    'InternetService_Fiber optic': 'internet_fiber',
    'InternetService_No': 'internet_service',
    'Contract_Month-to-month': 'contract_one_month',
    'Contract_One year': 'contract_one_year',
    'Contract_Two year': 'contract_two_year',
    'PaymentMethod_Bank transfer (automatic)': 'pay_auto_transfer',
    'PaymentMethod_Credit card (automatic)': 'pay_auto_credit',
    'PaymentMethod_Electronic check': 'pay_check_email',
    'PaymentMethod_Mailed check': 'pay_check_mail',
}

df = df.rename(columns=column_names_mapping)   # (6) rename everything to snake_case
middle_columns = df.columns.drop(['churn', 'tenure', 'monthly_charges', 'total_charges'])
df = df[['churn', *middle_columns, 'tenure', 'monthly_charges', 'total_charges']]  # reorder columns
df.head(10).T
Out[393]:
1869 4528 6344 6739 432 2215 5260 6001 1480 5137
churn False False True False False False False False False False
senior_citizen False False True False False False False False False True
partner True False True False False True False False False False
dependents True False False False False False False False False False
phone_service True True True True True False True False False True
multiple_lines True False True False False False True False False True
online_security False False False False True True True False False True
online_backup False True False False False False False False False True
device_protection False True False False True True False False True True
tech_support False False False False False True False False True True
streaming_tv False True False False False False False False False True
streaming_movies False False False True False True True False False True
paperless_billing False True True True False True True True False True
is_male True False False True True False False False True True
internet_dsl False False False True True True False True True False
internet_fiber False True True False False False True False False True
internet_service False True True True True True True True True True
contract_one_month False True True True True False True True False True
contract_one_year False False False False False False False False True False
contract_two_year True False False False False True False False False False
pay_auto_transfer False False True False False True False False False False
pay_auto_credit True True False False False False False False False False
pay_check_email False False False True True False True False False True
pay_check_mail False False False False False False False True True False
tenure 72 44 38 4 2 70 33 1 39 55
monthly_charges 24.1 88.15 74.95 55.9 53.45 49.85 90.65 24.9 35.55 116.5
total_charges 1734.65 3973.2 2869.85 238.5 119.5 3370.2 2989.6 24.9 1309.15 6382.55
In [395]:
#df_shuffled = df.sample(frac=1, random_state=SEED)
X = df.drop('churn', axis=1)
y = df.churn
In [53]:
from sklearn.model_selection import TimeSeriesSplit
In [58]:
cross_val_score(GradientBoostingClassifier(random_state=SEED), X, y, n_jobs=-1,
                cv=TimeSeriesSplit(4), scoring='roc_auc')
Out[58]:
array([0.84280425, 0.8279874 , 0.83119981, 0.83810758])
In [251]:
def drop_duplicates(iterable):
    """ Drop duplicates from the iterable, keeping the order. Return the result as a list. """
    return list(dict.fromkeys(iterable))


def new_sf_param_grids(**kwargs):
    """
    Create semi-fixed parameter grids suited for sklearn's GridSearchCV function. In semi-fixed grid
    all parameters except one are fixed (single value list).
    
    >>> param_grid = new_sf_param_grids(
    ...     learning_rate=[0.1, 0.001, 0.01, 1.0],
    ...     loss=['deviance', 'exponential'],
    ...     max_depth=[3, 1, 6],)
    
    >>> param_grid
    [{'learning_rate': [0.1], 'loss': ['deviance'], 'max_depth': [3]},
     {'learning_rate': [0.001, 0.01, 1.0], 'loss': ['deviance'], 'max_depth': [3]},
     {'learning_rate': [0.1], 'loss': ['exponential'], 'max_depth': [3]},
     {'learning_rate': [0.1], 'loss': ['deviance'], 'max_depth': [1, 6]}]
    
    >>> len(list(sklearn.model_selection.ParameterGrid(param_grid)))
    7
    """
    defaults = {param:values[:1] for param, values in kwargs.items()}
    param_grid = [defaults]
    for param, values in kwargs.items():
        unique_values = drop_duplicates(values)
        if len(unique_values) > 1:
            param_grid.append({**defaults, param: unique_values[1:]})
    return param_grid


def print_score_improvement(sf_cv_results_, score_key='mean_test_score'):
    """
    >>> print_score_improvement(sf_grid_search.cv_results_)
    Default score improvement: 0.830807 -> 0.835398 (+0.004591)
    """
    default = sf_cv_results_[score_key][0]
    best = np.nanmax(sf_cv_results_[score_key])  # NaN-safe: illegal values score NaN (error_score=np.nan)
    print(f'Default score improvement: {default:.6f} -> {best:.6f} (+{best - default:.6f})')
    

def sort_alphanum(s: pd.Series, num_first=True):
    """
    Sort a mixed-type series (strings + numerics + objects) respecting numeric order.
    1. Lexicographically sorted strings: '1', '10', '100', '2', '3'
    2. Numerically sorted strings: '1', '2', '3', '10', '100'
    Numeric values, including numeric strings, are ordered numerically; non-numeric
    values are ordered by their string representations.
    """
    s = pd.Series(s)
    numeric_or_nan = pd.to_numeric(s, errors='coerce')
    
    num_idx = numeric_or_nan.sort_values().dropna().index
    alpha_idx = s[numeric_or_nan.isna()].astype(str).sort_values().index
    return s[num_idx.append(alpha_idx)] if num_first else s[alpha_idx.append(num_idx)]


def rotate_tick_labels(axis, rotation):
    ha = 'center' if rotation == 0 else 'left' if rotation < 0 else 'right'
    return axis.set_xticklabels(axis.get_xticklabels(), rotation=rotation, ha=ha)


def plot_sf_cv_results(sf_cv_results, defaults_index=0, score_column='mean_test_score', num_cols=3, ax_size=(4, 2),
                       sort_x=True, skip_single_value=True):
    """
    Plot the results of semi-fixed grid search.
    
    Parameters
    ----------
    sf_cv_results : dict or DataFrame
        Either the raw `cv_results_` attribute of a fitted GridSearchCV() object or DataFrame(cv_results_).
        
    defaults_index : int, default=0
        Which row corresponds to default hyperparameter values.
    
    score_column : str, default='mean_test_score'
        Which column contains the score.
        
    num_cols : int, default=3
        Number of columns for subplots arrangement.
        
    ax_size : tuple, default=(4, 2)
        Size of each subplot.
        
    sort_x : bool, default=True
        Sort values on x-axis.
        
    skip_single_value : bool, default=True
        Skip plots with single x value.
    """
    if isinstance(sf_cv_results, pd.DataFrame):
        results_grid = sf_cv_results.copy()
    else:
        results_grid = pd.DataFrame(sf_cv_results)
    
    # Prepare data for plotting
    param_columns = []
    for col in results_grid.columns:
        if col.startswith('param_'):
            results_grid[col] = results_grid[col].astype(str)
            if skip_single_value and results_grid[col].nunique(dropna=False) < 2:
                continue
            else:
                param_columns.append(col)
    
    if len(param_columns) == 0:
        warnings.warn('No data to plot. Try: `skip_single_value=False`')
    
    default_results = results_grid.loc[defaults_index, :]
    default_score = default_results[score_column]
    
    # Prepare axes
    num_axes = len(param_columns)
    num_rows = int(np.ceil(num_axes / num_cols))
    figsize = np.array(ax_size) * (num_cols, num_rows)
    fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=figsize, sharey=True, squeeze=False)
    
    for ax in axes.ravel()[num_axes:]:
        ax.remove()
    
    # Colors
    infinite_prop_cycler = plt.rcParams['axes.prop_cycle']()
    
    # For each `param_...` column in cv_results_
    for ax, param_name, prop in zip(axes.ravel(), param_columns, infinite_prop_cycler):
        
        non_default_idx = results_grid[param_name] != default_results[param_name]
        
        # In case of GridSearchCV cv_results_ `.head` (first entry) or `.tail` (last entry) is the same.
        #   In case of HalvingGridSearchCV `.tail` (entry from the latest iteration) should be used.
        points = pd.concat([results_grid[non_default_idx].groupby(param_name).tail(1),
                            default_results.to_frame().T]) \
            .rename(columns={param_name: 'x', score_column: 'y'})[['x', 'y']]
        if sort_x:
            points = points.loc[sort_alphanum(points.x).index]
        
        # Value-Score plot
        ax.grid(zorder=0)
        ax.plot(points.x, points.y, 'o-', lw=2, zorder=20, c=prop['color'])
        
        # Default value
        color = '#db0000'
        default_value = default_results[param_name]
        ax.axvline(default_value, c=color, lw=1, ls='--', zorder=10)
        ax.axhline(default_score, c=color, lw=1, ls='--', zorder=10)
        ax.scatter([default_value], [default_score],
                   s=60, zorder=30, label='default', facecolor='w', edgecolor=color)
        
        # Illegal values
        points_y_nan = points[points.y.isna()]
        ax.scatter(points_y_nan.x, [default_score]*len(points_y_nan), marker='x',
                   s=40, zorder=40, c=color, label='illegal')
        
        # Axis settings
        ax.set_xlabel(param_name)
        if ax in axes[:, 0]:
            ax.set_ylabel('score')
        ax.set_frame_on(False)
        
    axes[0][min(num_axes, num_cols)-1].legend(loc='upper left', bbox_to_anchor=(1, 1), title='Parameter value')
    offset = (results_grid[score_column].max() - default_score)*.9
    if offset > 0:
        axes[0][0].set_ylim(bottom=default_score - offset, top=results_grid[score_column].max() + offset)
    plt.tight_layout(pad=3)
    
    return axes
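To illustrate the savings (a sketch reusing the example from the new_sf_param_grids docstring): the full Cartesian product contains 4 * 2 * 3 = 24 candidates, while the semi-fixed grid scans only 1 + 3 + 1 + 2 = 7.

from sklearn.model_selection import ParameterGrid

full_grid = {'learning_rate': [0.1, 0.001, 0.01, 1.0],
             'loss': ['deviance', 'exponential'],
             'max_depth': [3, 1, 6]}

len(list(ParameterGrid(full_grid)))                        # 24 candidates
len(list(ParameterGrid(new_sf_param_grids(**full_grid))))  # 7 candidates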
In [174]:
def plot_hyperparam_score(estimator, sf_grid, limit_samples=None, offset_samples=None, title=None, **kwargs):
    #cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=SEED)
    
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
#         sf_grid_search = HalvingGridSearchCV(estimator, sf_grid, scoring='roc_auc', n_jobs=-1, cv=4,
#                                              error_score=np.nan, random_state=SEED) \
#             .fit(X[offset_samples:][:limit_samples], y[offset_samples:][:limit_samples])
        sf_grid_search = GridSearchCV(estimator, sf_grid, scoring='roc_auc', n_jobs=-1, cv=4,
                                      error_score=np.nan) \
            .fit(X[offset_samples:][:limit_samples], y[offset_samples:][:limit_samples])
    
    results = sf_grid_search.cv_results_
    
    # Restore parameters order as in grid
    param_keys = [f'param_{name}' for name in sf_grid[0].keys()]
    non_param_keys = [k for k in results.keys() if k not in param_keys]
    results = {k: results[k] for k in (param_keys + non_param_keys)}
    
    print_score_improvement(results)
    axes = plot_sf_cv_results(results, **kwargs)
    if title is not None:
        plt.suptitle(str(title)[:80], size=20, y=1.0)
    return sf_grid_search, axes

Grids for the classifiers

In [19]:
import sklearn
import sklearn.discriminant_analysis
import sklearn.ensemble
import sklearn.gaussian_process
import sklearn.linear_model
import sklearn.naive_bayes
import sklearn.neural_network
import sklearn.neighbors
import sklearn.svm
import sklearn.tree

import xgboost
In [61]:
from sklearn.preprocessing import RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.model_selection import cross_val_score

from sklearn.ensemble import StackingClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
In [396]:
classifiers = {
    "QDA"                : sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis(),
    "AdaBoost"           : sklearn.ensemble.AdaBoostClassifier(),
    "ExtraTrees"         : sklearn.ensemble.ExtraTreesClassifier(),
    "GradientBoosting"   : sklearn.ensemble.GradientBoostingClassifier(),
    "RandomForest"       : sklearn.ensemble.RandomForestClassifier(),
    "GaussianProcess"    : sklearn.gaussian_process.GaussianProcessClassifier(copy_X_train=False, n_jobs=-1),
    "LogisticRegression" : sklearn.linear_model.LogisticRegression(n_jobs=-1),
    "SGD"                : sklearn.linear_model.SGDClassifier(loss='log'),
    "GaussianNB"         : sklearn.naive_bayes.GaussianNB(),  # <-------------- does not have hyperparameters
    "MLP"                : sklearn.neural_network.MLPClassifier(),
    "KNeighbors"         : sklearn.neighbors.KNeighborsClassifier(n_jobs=-1),
    "SVC"                : sklearn.svm.SVC(probability=True),
    "NuSVC"              : sklearn.svm.NuSVC(probability=True),
    "DecisionTree"       : sklearn.tree.DecisionTreeClassifier(),
    "XGB"                : xgboost.XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_jobs=-1,
                                                 verbosity=0),
}

numeric_columns = df.columns[df.dtypes.map(lambda t: np.issubdtype(t, np.floating) or np.issubdtype(t, np.integer))]

preprocessor = ColumnTransformer(
    remainder='passthrough',
    transformers=[('scaler', RobustScaler(), numeric_columns)]
)

# Classifier -> Pipeline
for k, clf in classifiers.items():
    if 'random_state' in clf.get_params():
        clf.set_params(random_state=SEED)
    classifiers[k] = Pipeline([('pre', preprocessor), ('clf', clf)], memory=CACHE_DIR)
In [307]:
# QuadraticDiscriminantAnalysis()

sf_grid = new_sf_param_grids(
    clf__reg_param=[.1, .0, 1.0, .1, .01, .001, .0001]#
)
plot_hyperparam_score(classifiers['QDA'], sf_grid);
Default score improvement: 0.835603 -> 0.835603 (+0.000000)
In [312]:
# AdaBoostClassifier()

sf_grid = new_sf_param_grids(
    clf__random_state=[0, 1, 2, 3, 4, 5, 6],
    clf__n_estimators=[50, 10, 25, 100, 200],
    clf__learning_rate=[.5, 1.0, .1, .5, 2.],#
)
plot_hyperparam_score(classifiers['AdaBoost'], sf_grid);
Default score improvement: 0.844780 -> 0.844780 (+0.000000)
In [391]:
# ExtraTreesClassifier()

sf_grid = new_sf_param_grids(
    clf__random_state=[3, 0, 1, 2, 3, 4, 5, 6],
    clf__n_estimators=[1000, 50, 200, 1000],
# #    clf__criterion=["gini", "entropy"],
#     clf__criterion=["entropy"],
# #    clf__max_depth=[6, None, 1, 3],
# #    clf__min_samples_split=[2, 4, 20, 100, .001, .01, .1, .5],
# #    clf__min_samples_split=[.01],
# #     clf__min_samples_leaf=[1, 2, 10, 100, .001, .01, .1, .5],
# #     clf__min_weight_fraction_leaf=[.0, .1, .5],
# #    clf__max_features=['auto', 'sqrt', 'log2', None, 1, 10],
# #     clf__max_leaf_nodes=[None, 2, 10, 100, 1000],
# #     clf__min_impurity_decrease=[.0, .01, .1],
# #     clf__bootstrap=[True],
# #     clf__oob_score=[False, True],
#    clf__class_weight=[None, 'balanced', 'balanced_subsample'],
#     clf__ccp_alpha=[.001, .0, .1, .01, .001, .0001],#
    clf__max_samples=[1000, None, .1, .5, 10, 50, 100, 500, 1000, 2000],#
#     clf__max_samples=[1000],#
    clf__min_samples_split=[.01],
#     clf__max_features=[10],
    clf__bootstrap=[True],
#     clf__max_samples=[.5],
)
plot_hyperparam_score(classifiers['ExtraTrees'], sf_grid);
Default score improvement: 0.841103 -> 0.842785 (+0.001682)

The first plot (param_random_state) shows that the classifier is unstable with these settings.
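
One way to quantify this instability (a sketch, not part of the original run; clone keeps the shared pipeline in classifiers untouched) is to score the pipeline under several seeds and inspect the spread:

from sklearn.base import clone

seed_scores = [
    cross_val_score(clone(classifiers['ExtraTrees']).set_params(clf__random_state=seed),
                    X, y, cv=4, scoring='roc_auc', n_jobs=-1).mean()
    for seed in range(7)
]
print(f'mean={np.mean(seed_scores):.4f}, std across seeds={np.std(seed_scores):.4f}')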

In [314]:
# GradientBoostingClassifier()

sf_grid = new_sf_param_grids(
    clf__random_state=[0, 1, 2, 3, 4, 5, 6],
    clf__loss=['deviance', 'exponential'],
    clf__learning_rate=[0.1, 1.0, 0.01, 0.001, 2.0, 10.0],
    clf__n_estimators=[100, 50, 200, 1000, 5000],
    #clf__subsample=[1.0, .9, .5, .1],
    clf__criterion=['friedman_mse', 'mse'],
    clf__min_samples_split=[2, 4, 20, 100, .001, .01, .1, .5],
    clf__min_samples_leaf=[.1, 1, 2, 10, 100, .001, .01, .1, .5],#
    clf__min_weight_fraction_leaf=[.0, .1, .2, .5],
    clf__max_depth=[3, 1, 2, 6, 10],
    clf__min_impurity_decrease=[.0, .01, .1],
    clf__init=[None, 'zero'],
    #clf__max_features=[None, 'auto', 'sqrt', 'log2', 1, 10, .1, .5, .9],
    clf__max_leaf_nodes=[None, 2, 5, 10],
    clf__n_iter_no_change=[None, 1, 10, 100],
    clf__tol=[1e-4, 1e-3, 1e-5],
    clf__ccp_alpha=[.0, .1, .01, .001],
)
plot_hyperparam_score(classifiers['GradientBoosting'], sf_grid);
Default score improvement: 0.847148 -> 0.847232 (+0.000084)
In [367]:
# RandomForestClassifier()

sf_grid = new_sf_param_grids(
    clf__random_state=[3, 0, 1, 2, 3, 4, 5, 6],
#     clf__n_estimators=[1000, 100, 50, 200, 1000],
#     clf__criterion=['entropy', 'gini', 'entropy'],#
# #    clf__max_depth=[None, 1, 2, 3, 6, 10],
#     clf__max_depth=[6],
# #     clf__min_samples_split=[2, 4, 20, 100, .001, .01, .1, .5],
# #     clf__min_samples_leaf=[1, 2, 10, 100, .001, .01, .1, .5],
# #     clf__min_weight_fraction_leaf=[.0, .1, .2, .5],
# #     clf__max_features=['auto', 'sqrt', 'log2', None, 1, 10, .1, .5, .9],
# #     clf__max_leaf_nodes=[None, 2, 5, 10, 100, 1000],
# #     clf__min_impurity_decrease=[.0, .01, .1],
# #     clf__bootstrap=[True, False],
# #     clf__oob_score=[False, True],
# #     clf__class_weight=[None, 'balanced', 'balanced_subsample'],
#     clf__class_weight=['balanced'],
#     clf__ccp_alpha=[.001, .0, .1, .01, .001, .0001],#
#     #clf__max_samples=[None, .01, .1, .5, .9, 1, 10],
    clf__max_depth=[6],
    clf__n_estimators=[5000],
    clf__max_samples=[.1],
    clf__min_samples_leaf=[2],
    clf__min_samples_split=[.0001],
)
plot_hyperparam_score(classifiers['RandomForest'], sf_grid);
Default score improvement: 0.845574 -> 0.846040 (+0.000466)
In [94]:
# GaussianProcessClassifier()

sf_grid = new_sf_param_grids(
    random_state=[0],
    kernel=[
        sklearn.gaussian_process.kernels.RBF(length_scale=1.0, length_scale_bounds="fixed"),
        sklearn.gaussian_process.kernels.Matern(length_scale=1.0, nu=1.5, length_scale_bounds="fixed"),
        sklearn.gaussian_process.kernels.RationalQuadratic(length_scale=1.0, alpha=1.0,
                                                           length_scale_bounds="fixed", alpha_bounds="fixed"),
        sklearn.gaussian_process.kernels.ExpSineSquared(length_scale=1.0, periodicity=1.0,
                                                        length_scale_bounds="fixed", periodicity_bounds="fixed"),
        sklearn.gaussian_process.kernels.DotProduct(sigma_0=1.0, sigma_0_bounds="fixed"),
    ]
)
plot_hyperparam_score(classifiers['GaussianProcess'], sf_grid, limit_samples=None)
plt.xticks(rotation=-15, ha='left')
pass
Default score improvement: 0.538889 -> 0.995807 (+0.456918)

Hyperparameter optimization is impractical for GaussianProcessClassifier(): the space of kernel functions is effectively unbounded (kernels can be combined arbitrarily), and the computational complexity is high, $O(n^3)$.

An example of a combined kernel, taken from the scikit-learn User Guide:

34.4**2 * RBF(length_scale=41.8) + 3.27**2 * RBF(length_scale=180) * ExpSineSquared(length_scale=1.44, periodicity=1)
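The same expression can be built with sklearn's kernel arithmetic, where + creates a Sum kernel, * a Product kernel, and scalar factors are wrapped into a ConstantKernel automatically (a sketch of the construction only, not used in the search above):

from sklearn.gaussian_process.kernels import RBF, ExpSineSquared

combined_kernel = (34.4**2 * RBF(length_scale=41.8)
                   + 3.27**2 * RBF(length_scale=180) * ExpSineSquared(length_scale=1.44, periodicity=1))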

In [306]:
# LogisticRegression()

sf_grid = new_sf_param_grids(
    clf__random_state=[0, 1, 2, 3, 4, 5, 6],
    clf__penalty=['elasticnet', 'l2', 'l1', 'elasticnet', None],#
    clf__tol=[.001, 1e-4, 1e-5, 1e-3],#
    clf__C=[2.0, 1.0, .1, .01, 2.0, 10.0, 100.0],#
    clf__class_weight=[None, 'balanced'],
    clf__solver=['saga', 'lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'],#
    clf__max_iter=[1000, 100, 1000, 10000],#
    clf__l1_ratio=[1.0, .0, .5, 1.0],  # only available with penalty='elasticnet'
)
_dev_null, axes = plot_hyperparam_score(classifiers['LogisticRegression'], sf_grid, limit_samples=None)
rotate_tick_labels(axes[1][-1], -15)
pass
Default score improvement: 0.843245 -> 0.843245 (+0.000000)
In [301]:
%%time

# SGDClassifier(learning_rate='optimal')

sf_grid = new_sf_param_grids(
    clf__random_state=[0, 1, 2, 3, 4, 5, 6],
    clf__penalty=['elasticnet'],
    clf__loss=['log', 'modified_huber', 'squared_hinge', 'perceptron'],
    clf__alpha=[.01, .0001, .00001, .001, .01, .1],#
    clf__l1_ratio=[0.15, 0.0, 1.0, 0.5, 0.75],
    clf__max_iter=[1000, 10, 100, 10000],
    clf__tol=[1e-3, 1e-2, 1e-4, 1e-5],
    clf__shuffle=[True, False],
    clf__learning_rate=['adaptive', 'optimal', 'constant', 'invscaling', 'adaptive'],#
    clf__eta0=[.1, .01, .001, .1, 1.0, 10.0],#
    clf__early_stopping=[True, False, True],#
    clf__n_iter_no_change=[100, 5, 1, 50, 100, 1000],#
    clf__class_weight=['balanced', None, 'balanced'],#
)
_dev_null, axis = plot_hyperparam_score(classifiers['SGD'], sf_grid)
rotate_tick_labels(axis[0][1], -15)
pass
Default score improvement: 0.839453 -> 0.842915 (+0.003462)
Wall time: 2min 19s
  1. Classification quality varies noticeably with random_state, which affects the starting point of gradient descent.
  2. penalty='elasticnet' is equivalent to l1 when l1_ratio=1 and to l2 when l1_ratio=0.
  3. The gradient descent step size (eta) sets the learning rate and is one of the most important parameters of this algorithm.
  4. This implementation supports both a constant step size, learning_rate='constant', and several strategies for adapting the step size during training, as the param_clf__learning_rate plot shows (see the sketch below).
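For reference, the two step-size regimes from note 4 differ only in the learning_rate / eta0 settings (a sketch; both parameters appear in the grid above):

from sklearn.linear_model import SGDClassifier

constant_step = SGDClassifier(loss='log', learning_rate='constant', eta0=0.1)  # eta stays at eta0
adaptive_step = SGDClassifier(loss='log', learning_rate='adaptive', eta0=0.1)  # eta is divided by 5 when the score stops improving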
In [102]:
# MLPClassifier()

sf_grid = new_sf_param_grids(
    random_state=[0, 1, 2, 3, 4, 5, 6],
    hidden_layer_sizes=[(100,), (10,), (50,), (200,), (10, 10), (5, 5, 5), (4, 4, 4, 4)],
    activation=['relu', 'identity', 'logistic', 'tanh'],
    solver=['adam', 'lbfgs', 'sgd'],
    alpha=[1e-4, 0, 1e-5, 1e-3, 1e-2],
    batch_size=[200, 20, 100, 400, 2000],
    learning_rate_init=[.001, .00001, .0001, .01, .1, 1.],
    max_iter=[200, 100, 400],
    shuffle=[True, False],
    tol=[1e-3, 1e-2, 1e-4, .1],
    early_stopping=[False, True],
    beta_1=[.9, .9999, .999, .99, .8],
    beta_2=[.999, .9999, .9, .8],
    n_iter_no_change=[10, 1, 50, 100],
)
_dev_null, axis = plot_hyperparam_score(classifiers['MLP'], sf_grid);
rotate_tick_labels(axis[0][1], -30)
pass
Default score improvement: 0.951363 -> 0.990566 (+0.039203)

The main hyperparameter, hidden_layer_sizes, defines the hidden-layer configuration of the neural network. Since the number of possible configurations is unbounded, exhaustive search is ineffective here.
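
One alternative when the space cannot be enumerated (a sketch under that assumption, not part of the original run; the candidate pool below is hypothetical) is to sample architectures with RandomizedSearchCV:

from sklearn.model_selection import RandomizedSearchCV

layer_candidates = [(width,) * depth for width in (10, 50, 100, 200) for depth in (1, 2, 3)]
mlp_search = RandomizedSearchCV(
    sklearn.neural_network.MLPClassifier(random_state=SEED),
    {'hidden_layer_sizes': layer_candidates},
    n_iter=6, scoring='roc_auc', cv=4, random_state=SEED, n_jobs=-1,
)
# mlp_search.fit(X, y)  # then inspect mlp_search.cv_results_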

In [104]:
# KNeighborsClassifier()

sf_grid = new_sf_param_grids(
    n_neighbors=[5, 4, 2, 1, 6, 10, 100, 1000, 3000],
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    leaf_size=[30, 15, 60],
    p=[2, 1, 10],
)
_dev_null, axis = plot_hyperparam_score(classifiers['KNeighbors'], sf_grid);
rotate_tick_labels(axis[0][0], -30)
pass
Default score improvement: 0.961845 -> 0.972746 (+0.010901)
In [108]:
# SVC()

sf_grid = new_sf_param_grids(
    random_state=[0, 1, 2, 3, 4],
    C=[1.0, .1, 10],
    kernel=['rbf', 'linear', 'poly', 'sigmoid'],
    gamma=['scale', 'auto', .1, .5, .9],
    shrinking=[True, False],
    tol=[1e-3, 1e-2, 1e-4],
    class_weight=[None, 'balanced'],
    max_iter=[-1, 10, 100],
)
_dev_null, axis = plot_hyperparam_score(classifiers['SVC'], sf_grid);
rotate_tick_labels(axis[0][2], -15)
pass
Default score improvement: 0.973585 -> 0.996226 (+0.022642)
In [400]:
# NuSVC()

sf_grid = new_sf_param_grids(
    clf__random_state=[0, 1, 2, 3, 4],
    clf__nu=[.5, .1, .25, .75, 1.],
    #clf__kernel=['rbf', 'linear', 'poly', 'sigmoid'],
    clf__kernel=['linear'],
    #clf__gamma=['scale', 'auto', .1, .5, .9],
    clf__gamma=['auto'],
    #clf__shrinking=[True, False],
    #clf__tol=[1e-3, 1e-2, 1e-4],
    #clf__class_weight=[None, 'balanced'],
    clf__class_weight=['balanced'],
    #clf__max_iter=[-1, 10, 100, 1000],
)
_dev_null, axis = plot_hyperparam_score(classifiers['NuSVC'], sf_grid);
#rotate_tick_labels(axis[0][2], -15)
pass
Default score improvement: 0.830539 -> 0.830539 (+0.000000)
In [279]:
# DecisionTreeClassifier()

sf_grid = new_sf_param_grids(
    clf__random_state=[0, 1, 2, 3, 4, 5, 6],
    clf__criterion=['gini', 'entropy'],
    clf__splitter=['best', 'random'],
    clf__max_depth=[None, 1, 2, 3, 6, 10],
    clf__min_samples_split=[2, 4, 20, 100, .001, .01, .1, .5],
    clf__min_samples_leaf=[1, 2, 10, 100, .001, .01, .1, .5],
    clf__min_weight_fraction_leaf=[.0, .1, .2, .5, 1.0],
    clf__max_features=[None, 'auto', 'sqrt', 'log2', 1, 10, .1, .5, .9],
    clf__max_leaf_nodes=[None, 1, 2, 5, 10],
    clf__min_impurity_decrease=[.0, .1, .01],
    clf__class_weight=[None, 'balanced'],
    clf__ccp_alpha=[.0, .1, .01, .001, .0001, 1., 10],
)
plot_hyperparam_score(classifiers['DecisionTree'], sf_grid)
pass
Default score improvement: 0.654751 -> 0.834899 (+0.180148)
In [280]:
%%time
# XGBClassifier()

sf_grid = new_sf_param_grids(
    clf__random_state=[0, 1, 2, 3, 4],
    clf__booster=['gbtree', 'gblinear', 'dart'],
    clf__learning_rate=[.1, .001, .01, 1.0],
    clf__gamma=[0, .01, .1, 1.0, 10],
    clf__max_depth=[3, 1, 2, 6],
    clf__min_child_weight=[1, 0, .1, 10, 100],
    clf__max_delta_step=[0, .01, .1, 1, 2, 10],
    clf__subsample=[1],
    clf__colsample_bytree=[1],
    clf__colsample_bylevel=[1],
    clf__colsample_bynode=[1],
    clf__reg_lambda=[1, 0, .01, .1, 2, 10, 100, 1000],
    clf__reg_alpha=[0, .01, .1, 1, 2, 10, 100],
    clf__tree_method=['auto', 'exact', 'approx', 'hist'],
    clf__scale_pos_weight=[1, 0, 10, 100],
    clf__num_parallel_tree=[1, 2, 10],
    clf__n_estimators=[100, 10, 50, 200, 1000],
    clf__eval_metric=['logloss', 'error', 'auc'],
)
hgs, axes = plot_hyperparam_score(classifiers['XGB'], sf_grid)
pass
Default score improvement: 0.844702 -> 0.847068 (+0.002367)
Wall time: 1min 20s

xgboost.XGBClassifier() default parameter values

Default values and valid ranges for XGBClassifier() parameters.

In [183]:
# https://stackoverflow.com/a/52321479/

xgb_defaults = dict(
    # General
    booster='gbtree',             # set: {'gbtree', 'gblinear', 'dart'}
    verbosity=1,                  # set: {0, 1, 2, 3}

    # Tree Booster
    learning_rate=0.1,            # float: [0, 1], alias: eta
    gamma=0,                      # float: [0, inf), alias: min_split_loss
    max_depth=3,                  # int
    min_child_weight=1,           # float: [0, inf)
    max_delta_step=0,             # int: [0, inf)
    subsample=1,                  # float: (0, 1]
    colsample_bytree=1,           # float: (0, 1]
    colsample_bylevel=1,          # float: (0, 1]
    colsample_bynode=1,           # float: (0, 1]
    reg_lambda=1,                 # float: [0, inf), alias: lambda
    reg_alpha=0,                  # float: [0, inf), alias: alpha
    tree_method='auto',           # set: {'auto', 'exact', 'approx', 'hist', 'gpu_hist'}
    scale_pos_weight=1,           # float: [0, inf)
    num_parallel_tree=1,          # int: [1, inf)
    monotone_constraints='()',    # string
    interaction_constraints='',   # string, example: '[[0, 1], [2, 3, 4]]'

    # Learning Task Parameters
    objective='binary:logistic',  # https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters
    base_score=0.5,
    random_state=0,               # int, alias: seed

    # Sklearn API specific
    n_estimators=100,             # int: [1, inf), boosting rounds, alias: num_round
    n_jobs=1,                     # int: [1, inf)
    missing=np.nan,
    importance_type='gain',       # set: {'gain', 'weight', 'cover', 'total_gain', 'total_cover'}
    use_label_encoder=True,       # should be False, deprecated
    eval_metric='logloss',        # set: {'logloss', 'error', 'auc', ...}
)
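These values can be passed back verbatim to make the defaults explicit, e.g. as a fixed baseline for a semi-fixed grid (a sketch):

xgb_baseline = xgboost.XGBClassifier(**xgb_defaults)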