sklearn StackingClassifier和样本权重

3

我有一个类似下面这样的堆叠(stacking)工作流程:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import StackingClassifier
from sklearn.pipeline import make_pipeline
import xgboost as xgb

# Synthetic binary-classification data with one random weight per sample.
X = np.random.random(size=(1000, 5))
y = np.random.choice([0, 1], 1000)
w = np.random.random(size=(1000,))

scaler = StandardScaler()
log_reg = LogisticRegression()

# Hyper-parameters for the XGBoost meta-learner.
params = {
    'n_estimators': 10,
    'max_depth': 3,
    'learning_rate': 0.1
}

# Base learner: standardize the features, then fit a logistic regression.
log_reg_pipe = make_pipeline(
    scaler,
    log_reg
)

# The stacker is wrapped in an outer pipeline on purpose: the real problem
# has additional preprocessing steps in front of it.
stack_pipe = make_pipeline(
    StackingClassifier(
        # Fixed: this previously referenced the undefined name
        # `lr_stack_pipe`, which raised NameError; the base pipeline
        # defined above is `log_reg_pipe`.
        estimators=[('lr', log_reg_pipe)],
        final_estimator=xgb.XGBClassifier(**params),
        passthrough=True,
        cv=2
    )
)

我希望能够在xgboost中传递样本权重。我的问题是如何在最终的估算器中设置样本权重?

我尝试过

stack_pipe.fit(X, y, sample_weights=w),但会出现错误。

ValueError: Pipeline.fit does not accept the sample_weights parameter. You can pass parameters to specific steps of your pipeline using the stepname__parameter format, e.g. `Pipeline.fit(X, y, logisticregression__sample_weight=sample_weight)`

错误信息非常直观,不是吗?为什么你要在单个步骤“StackingClassifier”周围加上管道呢? - Ben Reiniger
这是一个实际问题的最小可重现示例;我将其包围在管道中,因为真正的问题有一些预处理。显然,我已经尝试访问final_estimator__sample_weight,但它不起作用。 - Jonathan
问题在于StackingClassifier不允许您将拟合参数传递给基学习器。 StackingClassifier.fit仅具有sample_weight参数,但它会将这些权重传递给_每个_基学习器,这不是您要求的。 无论如何,由于您的基学习器实际上是一个管道,而管道的fit不能直接接受sample_weight,因此也会出现您报告的错误。 如果您希望所有基学习器都获得样本权重,则我可以想到一些解决方法;否则,除了复制StackingClassifier.fit的逻辑之外,我看不到简单的修补办法。 - Ben Reiniger
2个回答

5

我最近也意识到,堆叠估计器无法处理带样本权重的管道。我通过对scikit-learn中的StackingRegressor和StackingClassifier类进行子类化并重写其fit()方法来解决这个问题,以便更好地处理管道。请看以下内容:

"""Implement StackingClassifier that can handle sample-weighted Pipelines."""

from sklearn.ensemble import StackingRegressor, StackingClassifier
from copy import deepcopy

import numpy as np
from joblib import Parallel

from sklearn.base import clone
from sklearn.base import is_classifier, is_regressor

from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import check_cv

from sklearn.utils import Bunch
from sklearn.utils.fixes import delayed

from sklearn.pipeline import Pipeline

# Conventional name for the last (predictor) step of a Pipeline that is used
# as a base estimator.
ESTIMATOR_NAME_IN_PIPELINE = 'estimator'

def new_fit_single_estimator(estimator, X, y, sample_weight=None,
                             message_clsname=None, message=None):
    """Fit ``estimator`` on ``(X, y)`` and return it, forwarding sample weights.

    Parameters
    ----------
    estimator : estimator object
        Object exposing ``fit``. It is fitted in place.
    X, y : array-like
        Training data and targets, handed to ``estimator.fit`` unchanged.
    sample_weight : array-like or None, default=None
        When None, ``fit`` is called without any weight argument. Otherwise
        the weights are passed as ``sample_weight`` or, for a ``Pipeline``,
        as ``<last step name>__sample_weight`` so they reach the final step.
    message_clsname, message : ignored
        Accepted for signature compatibility only; not used in the body.

    Returns
    -------
    estimator : the same object that was passed in, after fitting.

    Raises
    ------
    TypeError
        With an explanatory message when ``fit`` rejects the ``sample_weight``
        keyword; any other ``TypeError`` is re-raised untouched.
    """
    # Unweighted case: nothing to route.
    if sample_weight is None:
        estimator.fit(X, y)
        return estimator

    try:
        if not isinstance(estimator, Pipeline):
            estimator.fit(X, y, sample_weight=sample_weight)
        else:
            # Pipelines only accept step-prefixed fit parameters, so address
            # the weights to the last step by its name.
            last_step_name = estimator.steps[-1][0]
            routed = {last_step_name + '__sample_weight': sample_weight}
            estimator.fit(X, y, **routed)
    except TypeError as exc:
        # Only translate the "fit() does not take sample_weight" failure.
        if "unexpected keyword argument 'sample_weight'" not in str(exc):
            raise
        raise TypeError(
            "Underlying estimator {} does not support sample weights."
            .format(estimator.__class__.__name__)
        ) from exc
    return estimator


class FlexibleStackingClassifier(StackingClassifier):
    """StackingClassifier whose ``fit`` can deliver sample weights to Pipelines.

    The body of ``fit`` is adapted from scikit-learn's stacking ``fit``; the
    differences are that sample weights are routed to the final step of any
    ``Pipeline`` base estimator, both for the full-data fit and for the
    cross-validated predictions that feed the final estimator.
    """

    def __init__(self, estimators, final_estimator=None, *, cv=None,
                 n_jobs=None, passthrough=False, verbose=0):
        super().__init__(
            estimators=estimators,
            final_estimator=final_estimator,
            cv=cv,
            n_jobs=n_jobs,
            passthrough=passthrough,
            verbose=verbose
        )

    @staticmethod
    def _sample_weight_fit_params(estimator, sample_weight):
        """Return the fit kwargs that deliver ``sample_weight`` to ``estimator``.

        Returns None when there are no weights. For a ``Pipeline`` the key is
        prefixed with the name of its last step; any other estimator receives
        a plain ``sample_weight`` key.
        """
        if sample_weight is None:
            return None
        if isinstance(estimator, Pipeline):
            return {estimator.steps[-1][0] + '__sample_weight': sample_weight}
        return {'sample_weight': sample_weight}

    def fit(self, X, y, sample_weight=None):
        """Fit the estimators.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training vectors, where `n_samples` is the number of samples and
            `n_features` is the number of features.
        y : array-like of shape (n_samples,)
            Target values.
        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights. If None, then samples are equally weighted.
            Note that this is supported only if all underlying estimators
            support sample weights. When not None, `sample_weight` is passed
            to all underlying estimators and to the final estimator.

        Returns
        -------
        self : object
        """
        # Local imports: the surrounding module does not import these names.
        from sklearn.preprocessing import LabelEncoder
        from sklearn.utils.multiclass import check_classification_targets

        # StackingClassifier.fit encodes the labels before delegating to the
        # shared stacking logic. Because this override replaces that method,
        # the encoding has to be reproduced here; otherwise `classes_` and
        # the encoder used by the inherited `predict` are never set.
        check_classification_targets(y)
        label_encoder = LabelEncoder().fit(y)
        # NOTE(review): the private attribute read by the parent's `predict`
        # is `_le` in older scikit-learn releases and `_label_encoder` in
        # newer ones; both are set so either lookup succeeds. Confirm against
        # the installed version.
        self._le = self._label_encoder = label_encoder
        self.classes_ = label_encoder.classes_
        y = label_encoder.transform(y)

        # all_estimators contains all estimators, the one to be fitted and the
        # 'drop' string.
        names, all_estimators = self._validate_estimators()
        self._validate_final_estimator()

        stack_method = [self.stack_method] * len(all_estimators)

        # Fit the base estimators on the whole training data. Those
        # base estimators will be used in transform, predict, and
        # predict_proba. They are exposed publicly.
        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(new_fit_single_estimator)(clone(est), X, y, sample_weight)
            for est in all_estimators if est != 'drop'
        )

        self.named_estimators_ = Bunch()
        est_fitted_idx = 0
        for name_est, org_est in zip(names, all_estimators):
            if org_est != 'drop':
                self.named_estimators_[name_est] = self.estimators_[
                    est_fitted_idx]
                est_fitted_idx += 1
            else:
                self.named_estimators_[name_est] = 'drop'

        # To train the meta-classifier using the most data as possible, we use
        # a cross-validation to obtain the output of the stacked estimators.

        # To ensure that the data provided to each estimator are the same, we
        # need to set the random state of the cv if there is one and we need to
        # take a copy.
        cv = check_cv(self.cv, y=y, classifier=is_classifier(self))
        if hasattr(cv, 'random_state') and cv.random_state is None:
            cv.random_state = np.random.RandomState()

        self.stack_method_ = [
            self._method_name(name, est, meth)
            for name, est, meth in zip(names, all_estimators, stack_method)
        ]
        # The fit-parameter key is derived per estimator, mirroring
        # new_fit_single_estimator, so plain estimators and Pipelines with any
        # final step name receive the weights during cross-validation.
        predictions = Parallel(n_jobs=self.n_jobs)(
            delayed(cross_val_predict)(
                clone(est), X, y, cv=deepcopy(cv),
                method=meth, n_jobs=self.n_jobs,
                fit_params=self._sample_weight_fit_params(est, sample_weight),
                verbose=self.verbose)
            for est, meth in zip(all_estimators, self.stack_method_)
            if est != 'drop'
        )

        # Only not None or not 'drop' estimators will be used in transform.
        # Remove the None from the method as well.
        self.stack_method_ = [
            meth for (meth, est) in zip(self.stack_method_, all_estimators)
            if est != 'drop'
        ]

        X_meta = self._concatenate_predictions(X, predictions)
        new_fit_single_estimator(self.final_estimator_, X_meta, y,
                                 sample_weight=sample_weight)

        return self


class FlexibleStackingRegressor(StackingRegressor):
    """StackingRegressor whose ``fit`` can deliver sample weights to Pipelines.

    The body of ``fit`` is adapted from scikit-learn's stacking ``fit``; the
    differences are that sample weights are routed to the final step of any
    ``Pipeline`` base estimator, both for the full-data fit and for the
    cross-validated predictions that feed the final estimator.
    """

    def __init__(self, estimators, final_estimator=None, *, cv=None,
                 n_jobs=None, passthrough=False, verbose=0):
        super().__init__(
            estimators=estimators,
            final_estimator=final_estimator,
            cv=cv,
            n_jobs=n_jobs,
            passthrough=passthrough,
            verbose=verbose
        )

    @staticmethod
    def _sample_weight_fit_params(estimator, sample_weight):
        """Return the fit kwargs that deliver ``sample_weight`` to ``estimator``.

        Returns None when there are no weights. For a ``Pipeline`` the key is
        prefixed with the name of its last step; any other estimator receives
        a plain ``sample_weight`` key.
        """
        if sample_weight is None:
            return None
        if isinstance(estimator, Pipeline):
            return {estimator.steps[-1][0] + '__sample_weight': sample_weight}
        return {'sample_weight': sample_weight}

    def fit(self, X, y, sample_weight=None):
        """Fit the estimators.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training vectors, where `n_samples` is the number of samples and
            `n_features` is the number of features.
        y : array-like of shape (n_samples,)
            Target values.
        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights. If None, then samples are equally weighted.
            Note that this is supported only if all underlying estimators
            support sample weights. When not None, `sample_weight` is passed
            to all underlying estimators and to the final estimator.

        Returns
        -------
        self : object
        """
        # all_estimators contains all estimators, the one to be fitted and the
        # 'drop' string.
        names, all_estimators = self._validate_estimators()
        self._validate_final_estimator()

        stack_method = [self.stack_method] * len(all_estimators)

        # Fit the base estimators on the whole training data. Those
        # base estimators will be used in transform, predict, and
        # predict_proba. They are exposed publicly.
        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(new_fit_single_estimator)(clone(est), X, y, sample_weight)
            for est in all_estimators if est != 'drop'
        )

        self.named_estimators_ = Bunch()
        est_fitted_idx = 0
        for name_est, org_est in zip(names, all_estimators):
            if org_est != 'drop':
                self.named_estimators_[name_est] = self.estimators_[
                    est_fitted_idx]
                est_fitted_idx += 1
            else:
                self.named_estimators_[name_est] = 'drop'

        # To train the meta-regressor using the most data as possible, we use
        # a cross-validation to obtain the output of the stacked estimators.

        # To ensure that the data provided to each estimator are the same, we
        # need to set the random state of the cv if there is one and we need to
        # take a copy.
        cv = check_cv(self.cv, y=y, classifier=is_classifier(self))
        if hasattr(cv, 'random_state') and cv.random_state is None:
            cv.random_state = np.random.RandomState()

        self.stack_method_ = [
            self._method_name(name, est, meth)
            for name, est, meth in zip(names, all_estimators, stack_method)
        ]
        # The fit-parameter key is derived per estimator, mirroring
        # new_fit_single_estimator, so plain estimators and Pipelines with any
        # final step name receive the weights during cross-validation.
        predictions = Parallel(n_jobs=self.n_jobs)(
            delayed(cross_val_predict)(
                clone(est), X, y, cv=deepcopy(cv),
                method=meth, n_jobs=self.n_jobs,
                fit_params=self._sample_weight_fit_params(est, sample_weight),
                verbose=self.verbose)
            for est, meth in zip(all_estimators, self.stack_method_)
            if est != 'drop'
        )

        # Only not None or not 'drop' estimators will be used in transform.
        # Remove the None from the method as well.
        self.stack_method_ = [
            meth for (meth, est) in zip(self.stack_method_, all_estimators)
            if est != 'drop'
        ]

        X_meta = self._concatenate_predictions(X, predictions)
        new_fit_single_estimator(self.final_estimator_, X_meta, y,
                                 sample_weight=sample_weight)

        return self

我同时提供了回归器和分类器的版本,但您似乎只需要使用分类器子类。

但是请注意:各个管道中最后一步的估计器必须使用同一个名称,并且该名称必须与上面定义的ESTIMATOR_NAME_IN_PIPELINE常量一致,否则代码将无法工作。例如,下面使用类定义脚本中定义的同一名称,正确地定义了一个Pipeline实例:

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import TweedieRegressor
from sklearn.feature_selection import VarianceThreshold

# Every step is named explicitly; the final predictor step is called
# 'estimator', the same value as ESTIMATOR_NAME_IN_PIPELINE.
validly_named_pipeline = Pipeline(steps=[
    ('variance_threshold', VarianceThreshold()),
    ('scaler', StandardScaler()),
    ('estimator', TweedieRegressor()),
])

这不是理想的情况,但现在它是我所拥有的,并且应该可以正常工作。

编辑:仅仅为了明确,当我重写fit()方法时,我只是从Scikit社区中复制并粘贴了代码,并做出了必要的更改,其中只涉及几行代码。因此,大部分粘贴的代码不是我的原创作品,而是Scikit开发人员的。


1
非常好!谢谢!! - Jonathan

2

对于您的情况,由于您有一个嵌套的管道,在传递参数时,这里是您必须使用的关键字。

# Iterating the params dict yields its keys, i.e. every parameter name.
list(stack_pipe.get_params())

['memory',
 'steps',
 'verbose',
 'stackingclassifier',
 'stackingclassifier__cv',
 'stackingclassifier__estimators',
 'stackingclassifier__final_estimator__objective',
 'stackingclassifier__final_estimator__use_label_encoder',
 'stackingclassifier__final_estimator__base_score',
 'stackingclassifier__final_estimator__booster',
 'stackingclassifier__final_estimator__colsample_bylevel',
 'stackingclassifier__final_estimator__colsample_bynode',
 'stackingclassifier__final_estimator__colsample_bytree',
 'stackingclassifier__final_estimator__gamma',
 'stackingclassifier__final_estimator__gpu_id',
 'stackingclassifier__final_estimator__importance_type',
 'stackingclassifier__final_estimator__interaction_constraints',
 'stackingclassifier__final_estimator__learning_rate',
 'stackingclassifier__final_estimator__max_delta_step',
 'stackingclassifier__final_estimator__max_depth',
 'stackingclassifier__final_estimator__min_child_weight',
 'stackingclassifier__final_estimator__missing',
 'stackingclassifier__final_estimator__monotone_constraints',
 'stackingclassifier__final_estimator__n_estimators',
 'stackingclassifier__final_estimator__n_jobs',
 'stackingclassifier__final_estimator__num_parallel_tree',
 'stackingclassifier__final_estimator__random_state',
 'stackingclassifier__final_estimator__reg_alpha',
 'stackingclassifier__final_estimator__reg_lambda',
 'stackingclassifier__final_estimator__scale_pos_weight',
 'stackingclassifier__final_estimator__subsample',
 'stackingclassifier__final_estimator__tree_method',
 'stackingclassifier__final_estimator__validate_parameters',
 'stackingclassifier__final_estimator__verbosity',
 'stackingclassifier__final_estimator',
 'stackingclassifier__n_jobs',
 'stackingclassifier__passthrough',
 'stackingclassifier__stack_method',
 'stackingclassifier__verbose',
 'stackingclassifier__lr',
 'stackingclassifier__lr__memory',
 'stackingclassifier__lr__steps',
 'stackingclassifier__lr__verbose',
 'stackingclassifier__lr__standardscaler',
 'stackingclassifier__lr__logisticregression',
 'stackingclassifier__lr__standardscaler__copy',
 'stackingclassifier__lr__standardscaler__with_mean',
 'stackingclassifier__lr__standardscaler__with_std',
 'stackingclassifier__lr__logisticregression__C',
 'stackingclassifier__lr__logisticregression__class_weight',
 'stackingclassifier__lr__logisticregression__dual',
 'stackingclassifier__lr__logisticregression__fit_intercept',
 'stackingclassifier__lr__logisticregression__intercept_scaling',
 'stackingclassifier__lr__logisticregression__l1_ratio',
 'stackingclassifier__lr__logisticregression__max_iter',
 'stackingclassifier__lr__logisticregression__multi_class',
 'stackingclassifier__lr__logisticregression__n_jobs',
 'stackingclassifier__lr__logisticregression__penalty',
 'stackingclassifier__lr__logisticregression__random_state',
 'stackingclassifier__lr__logisticregression__solver',
 'stackingclassifier__lr__logisticregression__tol',
 'stackingclassifier__lr__logisticregression__verbose',
 'stackingclassifier__lr__logisticregression__warm_start']

仔细查看后,final_estimator中没有出现sample_weight键。您可能需要检查原始API以查看是否已弃用或重命名。


3
sample_weight是一个拟合参数(与数据相关),而不是“属性”参数,因此您不应该在get_params()中看到它。 - Ben Reiniger

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接