自定义转换器以添加额外列。

3

我正在尝试将我的Lambda函数复制到我的流水线中

def determine_healthy(_list):
    """Return True for a non-smoker whose BMI lies within 18.5-24.9.

    ``_list`` is any row-like object that can be indexed with the keys
    ``'smoker'`` and ``'bmi'`` (for example a DataFrame row or a dict).
    The smoker test is a containment check (``'no' in value``) and both
    BMI bounds are inclusive.
    """
    # Guard clause: a 'smoker' value without 'no' is never healthy, and
    # 'bmi' is not looked up at all in that case.
    if 'no' not in _list['smoker']:
        return False
    bmi = _list['bmi']
    return bool(bmi >= 18.5 and bmi <= 24.9)

# Row-wise apply (axis=1): every row of df is passed to determine_healthy and
# the returned value is stored in a new 'healthy' column.
df['healthy'] = df.apply(lambda row: determine_healthy(row), axis=1)

问题出现在我将它集成到我的流水线中时,我不确定问题是否在于添加了一个附加列 'healthy'。当我尝试转换我的X_train时,就会抛出下面的错误。
from sklearn.base import BaseEstimator, TransformerMixin

class HealthyAttributeAdder(BaseEstimator, TransformerMixin):
    """Transformer intended to append a 'healthy' column to a DataFrame.

    As written, transform() raises TypeError before it produces any
    output; see the BUG note inside transform().
    """

    def __init__(self, items=None):
        # Use a fresh list per instance rather than a shared mutable default.
        if items is None: items = []
        # self.l is never read anywhere in this class.
        # NOTE(review): the parameter `items` is stored under a different
        # attribute name (`l`); confirm that BaseEstimator.get_params()/clone()
        # still work with that naming.
        self.l = items
    def fit(self, X , y=None):
        # Nothing is learned from the data; the instance is returned as-is.
        return self
    def transform(self, X):
        """Build a frame made of X's columns plus a computed 'healthy' column.

        X must expose ``.columns`` and ``.apply`` (a pandas DataFrame).
        """
        #X = X.copy()
        temp_cols = X.columns.to_list()
        # BUG: list.append() mutates the list in place and returns None, so
        # this rebinding sets temp_cols to None and the len(temp_cols) call on
        # the next line raises
        # "TypeError: object of type 'NoneType' has no len()".
        temp_cols = temp_cols.append('healthy')
        # Maps positional column numbers 0..n to the intended column names.
        new_cols = {k:v for k,v in zip(range(len(temp_cols)),temp_cols)}
        # Row-wise evaluation of the health rule.
        healthy = X.apply(lambda row: determine_healthy(row), axis=1)
        # Stack X and the new values side by side as an array, wrap the result
        # in a DataFrame, then restore the column names via new_cols.
        combined_df = pd.DataFrame(np.c_[X, healthy]).rename(columns=new_cols)
        return combined_df

# Columns routed to StandardScaler and to OneHotEncoder respectively.
# Note that cat_col lists 'healthy', the column HealthyAttributeAdder is
# supposed to create.
num_col = ['age','bmi']
cat_col = ['sex', 'smoker','region','children','healthy']
# Split the target off; pop() removes 'charges' from df and returns it.
y = df.pop('charges')
X = df 
all_col = X.columns
# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X,y , test_size=0.2, random_state = 42)

# Every transformer listed here is handed columns selected from the same
# input frame. The ValueError traceback further down shows that the input
# has no 'healthy' column when the pipeline is fitted, so the 'ohe' entry's
# request for cat_col cannot be satisfied.
transform_pipeline = ColumnTransformer([
    ('healthy', HealthyAttributeAdder(), all_col),
    ('ss', StandardScaler(), num_col),
    ('ohe', OneHotEncoder(drop='first'), cat_col),
])

# Preprocessing followed by a Lasso regressor.
price_pipeline = Pipeline([
    ('transform', transform_pipeline),
    ('lasso',Lasso())
])

# Running the custom transformer on its own; this is the call that produces
# the TypeError traceback shown below.
health_transform = HealthyAttributeAdder()
health_transform.fit_transform(X_train)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_19796/500623650.py in <module>
----> 1 health_transform.fit_transform(X_train)

~\Venv\hdbtest\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
    850         if y is None:
    851             # fit method of arity 1 (unsupervised transformation)
--> 852             return self.fit(X, **fit_params).transform(X)
    853         else:
    854             # fit method of arity 2 (supervised transformation)

~\AppData\Local\Temp/ipykernel_19796/3713134512.py in transform(self, X)
     11         temp_cols = X.columns.to_list()
     12         temp_cols = temp_cols.append('healthy')
---> 13         new_cols = {k:v for k,v in zip(range(len(temp_cols)),temp_cols)}
     14         healthy = X.apply(lambda row: determine_healthy(row), axis=1)
     15         combined_df = pd.DataFrame(np.c_[X, healthy]).rename(columns=new_cols)

TypeError: object of type 'NoneType' has no len()

在使用它进行预测时出现错误:

# fit() is where the failure occurs: the traceback below ends in
# "ValueError: A given column is not a column of the dataframe", caused by
# KeyError: 'healthy'. predict() is never reached.
price_pipeline.fit(X_train,y_train)
y_pred = price_pipeline.predict(X_test)
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
~\Venv\hdbtest\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
   3360             try:
-> 3361                 return self._engine.get_loc(casted_key)
   3362             except KeyError as err:

~\Venv\hdbtest\lib\site-packages\pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()

~\Venv\hdbtest\lib\site-packages\pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'healthy'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
~\Venv\hdbtest\lib\site-packages\sklearn\utils\__init__.py in _get_column_indices(X, key)
    432             for col in columns:
--> 433                 col_idx = all_columns.get_loc(col)
    434                 if not isinstance(col_idx, numbers.Integral):

~\Venv\hdbtest\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
   3362             except KeyError as err:
-> 3363                 raise KeyError(key) from err
   3364 

KeyError: 'healthy'

The above exception was the direct cause of the following exception:

ValueError                                Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_19796/993407432.py in <module>
----> 1 price_pipeline.fit(X_train,y_train)
      2 y_pred = price_pipeline.predict(X_test)

~\Venv\hdbtest\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
    388         """
    389         fit_params_steps = self._check_fit_params(**fit_params)
--> 390         Xt = self._fit(X, y, **fit_params_steps)
    391         with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
    392             if self._final_estimator != "passthrough":

~\Venv\hdbtest\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params_steps)
    346                 cloned_transformer = clone(transformer)
    347             # Fit or load from cache the current transformer
--> 348             X, fitted_transformer = fit_transform_one_cached(
    349                 cloned_transformer,
    350                 X,

~\Venv\hdbtest\lib\site-packages\joblib\memory.py in __call__(self, *args, **kwargs)
    347 
    348     def __call__(self, *args, **kwargs):
--> 349         return self.func(*args, **kwargs)
    350 
    351     def call_and_shelve(self, *args, **kwargs):

~\Venv\hdbtest\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    891     with _print_elapsed_time(message_clsname, message):
    892         if hasattr(transformer, "fit_transform"):
--> 893             res = transformer.fit_transform(X, y, **fit_params)
    894         else:
    895             res = transformer.fit(X, y, **fit_params).transform(X)

~\Venv\hdbtest\lib\site-packages\sklearn\compose\_column_transformer.py in fit_transform(self, X, y)
    670         self._check_n_features(X, reset=True)
    671         self._validate_transformers()
--> 672         self._validate_column_callables(X)
    673         self._validate_remainder(X)
    674 

~\Venv\hdbtest\lib\site-packages\sklearn\compose\_column_transformer.py in _validate_column_callables(self, X)
    350                 columns = columns(X)
    351             all_columns.append(columns)
--> 352             transformer_to_input_indices[name] = _get_column_indices(X, columns)
    353 
    354         self._columns = all_columns

~\Venv\hdbtest\lib\site-packages\sklearn\utils\__init__.py in _get_column_indices(X, key)
    439 
    440         except KeyError as e:
--> 441             raise ValueError("A given column is not a column of the dataframe") from e
    442 
    443         return column_indices

ValueError: A given column is not a column of the dataframe
1个回答

1
第一个问题实际上与ColumnTransformer的使用无关,而是由于您的HealthyAttributeAdder类中transform方法实现中的错误导致的。
为了获得一致的结果,您应该将这一行
temp_cols = temp_cols.append('healthy')

改为

temp_cols.append('healthy')

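实际上,问题就像这里所描述的那样:list.append() 会就地修改列表并返回 None,因此把它的返回值重新赋给 temp_cols 会使 temp_cols 变为 None,随后的 len(temp_cols) 就会抛出 TypeError。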
另一方面,当您切换到ColumnTransformer时,问题就像这里和这里所描述的那样(您还会发现其他相关的帖子)。也就是说,ColumnTransformer是将其各个转换器并行地应用于X_train数据集的;因此,在对您的分类特征进行独热编码时,OneHotEncoder被要求转换cat_col中列出的'healthy'列,但X_train中并不存在该列。
解决该问题的可能方法是定义一个单独的流水线来处理HealthyAttributeAdder,并将其应用于您的ColumnTransformer实例transform_pipeline之前。
class HealthyAttributeAdder(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends a 'healthy' column to a DataFrame.

    The new column holds the result of ``determine_healthy`` evaluated on
    each row. It is meant to run as its own pipeline step *before* any
    ColumnTransformer that refers to the 'healthy' column.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is nothing to learn."""
        return self

    def transform(self, X):
        """Return a new DataFrame equal to ``X`` plus a 'healthy' column.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the columns that ``determine_healthy`` reads.

        Returns
        -------
        pandas.DataFrame
            A copy of ``X`` with 'healthy' appended as the last column.
        """
        healthy = X.apply(determine_healthy, axis=1)
        # assign() builds a new frame, so the caller's X is left untouched.
        # Unlike stacking through a NumPy array, it keeps every column's
        # dtype and the original index instead of collapsing everything to
        # object dtype on a fresh RangeIndex.
        return X.assign(healthy=healthy)

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import Lasso

# Column-wise preprocessing only. The 'healthy' adder is deliberately left
# out of this ColumnTransformer (old entry kept below for comparison) and is
# run as an earlier pipeline step instead.
transform_pipeline = ColumnTransformer([
    #('healthy', HealthyAttributeAdder(), all_col),
    ('ss', StandardScaler(), num_col),
    ('ohe', OneHotEncoder(drop='first'), cat_col),
])

# Separate pipeline whose single step adds the 'healthy' column.
healthy_pipeline = Pipeline([
    ('healthy', HealthyAttributeAdder())                          
])

# Step order matters: 'add_healthy' runs first, so the frame reaching
# 'transform' already contains the 'healthy' column listed in cat_col.
price_pipeline = Pipeline([
    ('add_healthy', healthy_pipeline),                     
    ('transform', transform_pipeline),
    ('lasso',Lasso())
])

# Fit on the training split, then predict on the held-out split.
price_pipeline.fit(X_train,y_train)
y_pred = price_pipeline.predict(X_test)

因此,你的price_pipeline的第一步(add_healthy)会先向X_train添加'healthy'列;然后这个转换后的X_train将被并行传递给StandardScaler()和OneHotEncoder(),这样OneHotEncoder()在对'healthy'列进行独热编码时就不会再遇到任何问题。


1
我已经编辑了代码以反映 temp_cols.append('healthy') 并按照您的建议将 healthy_pipeline 插入为单独的管道,它运行得非常好。谢谢! - Randy Chng
如果代码中存在多个问题,而 OP 只是发布了代码并讨论了一个错误,那么这不是 Stack Overflow 上适当的问题,也不应该尝试回答。如果它甚至是一个问题(参见 https://meta.stackoverflow.com/questions/284236),那么它就是多个不相关的问题(每个问题都有一个),因此“需要更多关注”。如果这些问题是已经被回答过的常见问题(根据您的链接),那么情况就更糟了:重复问题也不应该被回答。请阅读答案并帮助保持网站的清洁。 - Karl Knechtel

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接