class CarDataset(DataSet):
    """Cleaned used-car price-regression dataset loaded from a CSV export.

    Drops non-predictive columns, filters implausible rows, encodes binary
    flags, one-hot encodes the categoricals, standardizes X, and produces
    train/validation/test splits. The target Y is the raw ``price`` column.
    """

    def __init__(self, csv_file, current_year=2019):
        """Load and prepare the dataset.

        Parameters
        ----------
        csv_file : str
            Path to the raw CSV export.
        current_year : int, optional
            Reference year for the ``age`` feature and the
            registration-year filter (previously hard-coded to 2019).
        """
        # Columns that carry no predictive signal for the price.
        df = pd.read_csv(csv_file).drop(
            ["dateCrawled", "name", "abtest", "dateCreated",
             "nrOfPictures", "postalCode", "lastSeen"], axis=1)
        # Keep only private sellers (drop "gewerblich" = commercial) and
        # actual offers (drop "Gesuch" = wanted-ads); the columns become
        # constant afterwards, so drop them too.
        df = df.drop(df[df["seller"] == "gewerblich"].index).drop(["seller"], axis=1)
        df = df.drop(df[df["offerType"] == "Gesuch"].index).drop(["offerType"], axis=1)
        # Discard rows with missing categorical values instead of imputing.
        for col in ("vehicleType", "notRepairedDamage", "model", "fuelType"):
            df = df[df[col].notnull()]
        # Filter implausible outliers (price in euros, power in PS).
        df = df[(df["price"] > 100) & (df["price"] < 100000)]
        df = df[(df["monthOfRegistration"] > 0) & (df["monthOfRegistration"] < 13)]
        df = df[(df["yearOfRegistration"] < current_year) & (df["yearOfRegistration"] > 1950)]
        df = df[(df["powerPS"] > 20) & (df["powerPS"] < 550)]
        # Binary flags. "ja" (yes) -> car has unrepaired damage.
        df["hasDamage"] = np.where(df["notRepairedDamage"] == "ja", 1, 0)
        # BUG FIX: the flag is named "automatic" but was previously set to 1
        # for gearbox == "manuell" (manual); it now matches its name.
        df["automatic"] = np.where(df["gearbox"] == "automatik", 1, 0)
        # NOTE(review): encodes 0 = "benzin" (petrol), 1 = everything else;
        # the accompanying text claims 0 = diesel — confirm intended meaning.
        df["fuel"] = np.where(df["fuelType"] == "benzin", 0, 1)
        # Car age in months relative to current_year.
        df["age"] = (current_year - df["yearOfRegistration"]) * 12 + df["monthOfRegistration"]
        # Raw columns are now fully represented by the engineered features.
        df = df.drop(["notRepairedDamage", "gearbox", "fuelType",
                      "yearOfRegistration", "monthOfRegistration"], axis=1)
        # One-hot encode the remaining categoricals.
        df = pd.get_dummies(df, columns=["vehicleType", "model", "brand"])
        self.df = df
        # Target stays in raw euros. NOTE(review): scaling or
        # log-transforming the target usually helps gradient-based
        # regressors converge (raw-euro MSE is on the order of 1e7).
        self.Y = self.df["price"].values
        self.X = self.df.drop(["price"], axis=1).values
        # Standardize features to zero mean / unit variance.
        scaler = StandardScaler()
        self.X = scaler.fit_transform(self.X)
        # 25% held out for test, then 25% of the remainder for validation.
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
            self.X, self.Y, test_size=0.25, random_state=0)
        self.x_train, self.x_valid, self.y_train, self.y_valid = train_test_split(
            self.x_train, self.y_train, test_size=0.25, random_state=0)

    def get_input_shape(self):
        """Return the model input shape: all columns except the target."""
        return (len(self.df.columns) - 1,)  # e.g. (303,)
这将产生以下准备好的数据集:
price powerPS kilometer hasDamage automatic fuel age vehicleType_andere vehicleType_bus vehicleType_cabrio vehicleType_coupe ... brand_rover brand_saab brand_seat brand_skoda brand_smart brand_subaru brand_suzuki brand_toyota brand_trabant brand_volkswagen brand_volvo
3 1500 75 150000 0 1 0 222 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1 0
4 3600 69 90000 0 1 1 139 0 0 0 0 ... 0 0 0 1 0 0 0 0 0 0 0
5 650 102 150000 1 1 0 298 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0
6 2200 109 150000 0 1 0 188 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0 0
10 2000 105 150000 0 1 0 192 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0
[5 rows x 304 columns]
- `hasDamage` 是一个标志(0或1),表示汽车是否有未修复的损坏;
- `automatic` 是一个标志(0或1),表示汽车是手动挡还是自动挡;
- `fuel` 为0表示柴油,为1表示汽油;
- `age` 是汽车的车龄,以月为单位。

`brand`、`model` 和 `vehicleType` 这几列将使用 `df = pd.get_dummies(df, columns = ["vehicleType", "model", "brand"])` 进行独热(one-hot)编码。
此外,我使用 `StandardScaler` 来转换X值。现在数据集包含303个X列,以及作为Y的 `price` 列。使用这个数据集,普通的 `LinearRegression` 在训练集和测试集上都能得到约0.7的分数。现在我尝试了用 Keras 的深度学习方法,但无论我怎么做,`mse` 和损失值都居高不下,模型似乎什么也学不到:
# Build a small fully-connected regression network: 303 -> 20 -> 20 -> 1.
inputs = Input(dataset.get_input_shape())  # (303, )
hidden = Dense(20)(inputs)
hidden = Activation("relu", name="relu_1")(hidden)
hidden = Dense(20)(hidden)
hidden = Activation("relu", name="relu_2")(hidden)
outputs = Dense(1, name="Output")(hidden)

model = Model(inputs=[inputs], outputs=[outputs])
# Plain MSE regression; the 'mse' metric intentionally mirrors the loss.
model.compile(loss="mse", optimizer=optimizer(lr=learning_rate), metrics=['mse'])
model.summary()

# Decay the learning rate slightly whenever validation loss stalls, and
# stop early (restoring the best weights) after 5 stagnant epochs.
callbacks = [
    ReduceLROnPlateau(monitor="val_loss", factor=0.95,
                      verbose=self.verbose, patience=1),
    EarlyStopping(monitor='val_loss', patience=5, min_delta=0.01,
                  restore_best_weights=True, verbose=self.verbose),
]

model.fit(x=dataset.x_train,
          y=dataset.y_train,
          verbose=1,
          batch_size=128,
          epochs=200,
          validation_data=[dataset.x_valid, dataset.y_valid],
          callbacks=callbacks)

# Final held-out evaluation.
score = model.evaluate(dataset.x_test, dataset.y_test, verbose=1)
print("Model score: {}".format(score))
而总结/训练看起来像这样(学习率为3e-4
):
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_1 (InputLayer) (None, 6) 0
_________________________________________________________________
dense_1 (Dense) (None, 20) 140
_________________________________________________________________
relu_1 (Activation) (None, 20) 0
_________________________________________________________________
dense_2 (Dense) (None, 20) 420
_________________________________________________________________
relu_2 (Activation) (None, 20) 0
_________________________________________________________________
Output (Dense) (None, 1) 21
=================================================================
Total params: 581
Trainable params: 581
Non-trainable params: 0
_________________________________________________________________
Train on 182557 samples, validate on 60853 samples
Epoch 1/200
182557/182557 [==============================] - 2s 13us/step - loss: 110046953.4602 - mean_squared_error: 110046953.4602 - acc: 0.0000e+00 - val_loss: 107416331.4062 - val_mean_squared_error: 107416331.4062 - val_acc: 0.0000e+00
Epoch 2/200
182557/182557 [==============================] - 2s 11us/step - loss: 97859920.3050 - mean_squared_error: 97859920.3050 - acc: 0.0000e+00 - val_loss: 85956634.8803 - val_mean_squared_error: 85956634.8803 - val_acc: 1.6433e-05
Epoch 3/200
182557/182557 [==============================] - 2s 12us/step - loss: 70531052.0493 - mean_squared_error: 70531052.0493 - acc: 2.1911e-05 - val_loss: 54933938.6787 - val_mean_squared_error: 54933938.6787 - val_acc: 3.2866e-05
Epoch 4/200
182557/182557 [==============================] - 2s 11us/step - loss: 42639802.3204 - mean_squared_error: 42639802.3204 - acc: 3.2866e-05 - val_loss: 32645940.6536 - val_mean_squared_error: 32645940.6536 - val_acc: 1.3146e-04
Epoch 5/200
182557/182557 [==============================] - 2s 11us/step - loss: 28282909.0699 - mean_squared_error: 28282909.0699 - acc: 1.4242e-04 - val_loss: 25315220.7446 - val_mean_squared_error: 25315220.7446 - val_acc: 9.8598e-05
Epoch 6/200
182557/182557 [==============================] - 2s 11us/step - loss: 24279169.5270 - mean_squared_error: 24279169.5270 - acc: 3.8344e-05 - val_loss: 23420569.2554 - val_mean_squared_error: 23420569.2554 - val_acc: 9.8598e-05
Epoch 7/200
182557/182557 [==============================] - 2s 11us/step - loss: 22874003.0459 - mean_squared_error: 22874003.0459 - acc: 9.8599e-05 - val_loss: 22380401.0622 - val_mean_squared_error: 22380401.0622 - val_acc: 1.6433e-05
...
Epoch 197/200
182557/182557 [==============================] - 2s 12us/step - loss: 13828827.1595 - mean_squared_error: 13828827.1595 - acc: 3.3414e-04 - val_loss: 14123447.1746 - val_mean_squared_error: 14123447.1746 - val_acc: 3.1223e-04
Epoch 00197: ReduceLROnPlateau reducing learning rate to 0.00020950120233464986.
Epoch 198/200
182557/182557 [==============================] - 2s 13us/step - loss: 13827193.5994 - mean_squared_error: 13827193.5994 - acc: 2.4102e-04 - val_loss: 14116898.8054 - val_mean_squared_error: 14116898.8054 - val_acc: 1.6433e-04
Epoch 00198: ReduceLROnPlateau reducing learning rate to 0.00019902614221791736.
Epoch 199/200
182557/182557 [==============================] - 2s 12us/step - loss: 13823582.4300 - mean_squared_error: 13823582.4300 - acc: 3.3962e-04 - val_loss: 14108715.5067 - val_mean_squared_error: 14108715.5067 - val_acc: 4.1083e-04
Epoch 200/200
182557/182557 [==============================] - 2s 11us/step - loss: 13820568.7721 - mean_squared_error: 13820568.7721 - acc: 3.1223e-04 - val_loss: 14106001.7681 - val_mean_squared_error: 14106001.7681 - val_acc: 2.3006e-04
60853/60853 [==============================] - 1s 18us/step
Model score: [14106001.790199332, 14106001.790199332, 0.00023006260989597883]
我在机器学习方面还是个初学者。我的方法中有什么明显的错误吗?我做错了什么?
(评论补充)验证损失从
val_loss: 98661204.1644 - val_mean_squared_error: 98661204.1644 - val_acc: 6.5732e-05
开始,经过200个epochs后下降到 val_loss: 8097733.0068 - val_mean_squared_error: 8097733.0068 - val_acc: 6.5732e-04
。我还使用了ReduceLROnPlateau
,在第128个epoch后启动。值得注意的是,我的学习率初始值为3e-4
。 所以看起来并没有帮助太多?对我来说,它更像是我在设置/数据/模型方面存在一些更大的问题,因此目前只是产生垃圾? - user826955