类型错误:DataLoader 找到无效类型:<class 'numpy.ndarray'>
大家好,我遇到了困难,找不到解决办法,请帮忙。
程序在 train_fn() 函数处遇到了错误。
train.py
from sklearn.preprocessing import StandardScaler
import joblib
from tqdm import tqdm
import pandas as pd
import numpy as np
import torch_geometric.transforms as T
import torch
import torch.optim as optim
# from torch_geometric.data import DataLoader
from torch_geometric.loader import DataLoader
from model import *
from Constant import *
import os
print(os.getcwd())
# path = '/home/ktcodes/jktModel/data/a09'
path = './data/a09'
e2e_emb = joblib.load(f'{path}/e2e_emb.pkl.zip')
c2c_emb = joblib.load(f'{path}/c2c_emb.pkl.zip')
skill_prob = joblib.load(f'{path}/skill_prob.pkl.zip')
filtered_skill_prob = {}
channel = 10
for i, skill_id in enumerate(skill_prob.index):
if len(skill_prob[skill_id])>= channel:
filtered_skill_prob[skill_id] = skill_prob[skill_id]
joblib.dump(filtered_skill_prob, f'{path}/filtered_skill_prob.pkl.zip')
# normalization
scaler = StandardScaler()
all_c_v = []
for k,v in c2c_emb.items():
all_c_v.extend(list(v.numpy()))
all_c_v = scaler.fit_transform(np.array(all_c_v).reshape(-1,1))
all_c_v1 = {}
for i, (k,v) in enumerate(c2c_emb.items()):
all_c_v1[k] = all_c_v[i*10:(i+1)*10].reshape(-1,)
all_e_v = {}
for skill,qu_embs in e2e_emb.items():
q_num = qu_embs.shape[0]
temp_all_v = qu_embs.numpy().reshape(-1,)
temp_all_v = scaler.fit_transform(np.array(temp_all_v).reshape(-1,1))
all_e_v[skill] = temp_all_v.reshape(-1,10)
skill_emb = {}
for skill in tqdm(filtered_skill_prob.keys()):
temp_c = (np.array(all_c_v1[skill]))
temp_e = np.array(np.mean(all_e_v[skill], axis=0))
skill_emb[skill] = np.append(temp_c, temp_e)
prob_emb = {}
for skill in tqdm(filtered_skill_prob.keys()):
for i, prob in enumerate(filtered_skill_prob[skill]):
temp_c = (np.array(all_c_v1[skill]))
temp_e = (np.array(all_e_v[skill][i]))
new_emb = np.append(temp_c, temp_e)
if prob in prob_emb.keys():
prob_emb[prob] = np.row_stack((prob_emb[prob], new_emb)).squeeze().astype(np.int32)
# print(prob_emb[prob].shape)
else: prob_emb[prob] = new_emb
for prob in tqdm(prob_emb.keys()):
if len(prob_emb[prob].shape) > 1:
prob_emb[prob] = np.mean(prob_emb[prob], axis=0)
# Train/Test data
read_col = ['order_id', 'assignment_id', 'user_id', 'assistment_id', 'problem_id', 'correct',
'sequence_id', 'base_sequence_id', 'skill_id', 'skill_name', 'original']
target = 'correct'
# read in the data
df = pd.read_csv(f'{path}/skill_builder_data.csv', low_memory=False, encoding="ISO-8859-1")[read_col]
df = df.sort_values(['order_id', 'user_id'])
# delete empty skill_id
df = df.dropna(subset=['skill_id'])
df = df[~df['skill_id'].isin(['noskill'])]
df.skill_id = df.skill_id.astype('int')
print('After removing empty skill_id, records number %d' % len(df))
# delete scaffolding problems
df = df[df['original'].isin([1])]
print('After removing scaffolding problems, records number %d' % len(df))
#delete the users whose interaction number is less than min_inter_num
min_inter_num = 3
users = df.groupby(['user_id'], as_index=True)
delete_users = []
for u in users:
if len(u[1]) < min_inter_num:
delete_users.append(u[0])
print('deleted user number based min-inters %d' % len(delete_users))
df = df[~df['user_id'].isin(delete_users)]
df = df[['user_id', 'problem_id', 'skill_id', 'correct']]
print('After deleting some users, records number %d' % len(df))
# print('features: ', df['assistment_id'].unique(), df['answer_type'].unique())
df = df[df['skill_id'].isin(filtered_skill_prob.keys())]
df['skill_cat'] = df['skill_id'].astype('category').cat.codes
df['e_emb'] = df['problem_id'].apply(lambda r: prob_emb[r])
df['c_emb'] = df['skill_id'].apply(lambda r: skill_emb[r])
group_c = df[['user_id', 'c_emb', 'correct']].groupby('user_id').apply(lambda r: (np.array(r['c_emb'].tolist()).squeeze(), r['correct'].values))
train_group_c = group_c.sample(frac=0.8, random_state=2020)
test_group_c = group_c[~group_c.index.isin(train_group_c.index)]
joblib.dump(train_group_c, f'{path}/train_group_c.pkl.zip')
joblib.dump(test_group_c, f'{path}/test_group_c.pkl.zip')
# print(type(train_group_c))
# # print(train_group_c.values)
# userid = train_group_c.index
# print(userid)
# q, qa = train_group_c[userid[0]]
# print(q, qa)
train_dataset = DKTDataset(train_group_c, max_seq=MAX_SEQ)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_dataset = DKTDataset(test_group_c, max_seq=MAX_SEQ)
valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DKT(input_dim, hidden_dim, layer_dim, output_dim, device)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.BCEWithLogitsLoss()
scheduler = torch.optim.lr_scheduler.OneCycleLR(
optimizer, max_lr=MAX_LEARNING_RATE, steps_per_epoch=len(train_dataloader), epochs=EPOCHS
)
model.to(device)
criterion.to(device)
for epoch in (range(EPOCHS)):
# there
loss, acc, auc = train_fn(model, train_dataloader, optimizer, criterion, device)
# print("epoch - {}/{} train: - {:.3f} acc - {:.3f} auc - {:.3f}".format(epoch+1, EPOCHS, loss, acc, auc))
loss, acc, pre, rec, f1, auc = valid_fn(model, valid_dataloader, criterion, device)
res = "epoch - {}/{} valid: - {:.3f} acc - {:.3f} pre - {:.3f} rec - {:.3f} f1 - {:3f} auc - {:.3f}".format(epoch+1, EPOCHS, loss, acc, pre, rec, f1, auc)
print(res)
程序不会进入这个函数:
def train_fn(model, dataloader, optimizer, criterion, scheduler=None, device="cpu"):
print('enter...')
print("dataloader", type(dataloader))
model.train()
train_loss = []
num_corrects = 0
num_total = 0
labels = []
outs = []
for x_emb, q_next, y in (dataloader):
x = x_emb.to(device).float()
y = y.to(device).float()
q_next = q_next.to(device).float()
out = model(x, q_next).squeeze().astype(np.int32)#[:, :-1]
loss = criterion(out, y)
loss.backward()
optimizer.step()
# scheduler.step()
train_loss.append(loss.item())
target_mask = (q_next!=0).unique(dim=2).squeeze().astype(np.int32)
# target_mask = (y!=-1)
filtered_out = torch.masked_select(out, target_mask)
filtered_label = torch.masked_select(y, target_mask)
filtered_pred = (torch.sigmoid(filtered_out) >= 0.5).long()
num_corrects = num_corrects + (filtered_pred == filtered_label).sum().item()
num_total = num_total + len(filtered_label)
labels.extend(filtered_label.view(-1).data.cpu().numpy())
outs.extend(filtered_pred.view(-1).data.cpu().numpy())
acc = num_corrects / num_total
auc = roc_auc_score(labels, outs)
loss = np.mean(train_loss)
return loss, acc, auc
错误信息:
TypeError Traceback (most recent call last)
~/kt/jktModel/embedding_dkt.py in <module>
145 for epoch in (range(EPOCHS)):
146 print("ashkdgjggvnskaj")
--> 147 loss, acc, auc = train_fn(model, train_dataloader, optimizer, criterion, device)
148 # print("epoch - {}/{} train: - {:.3f} acc - {:.3f} auc - {:.3f}".format(epoch+1, EPOCHS, loss, acc, auc))
149 loss, acc, pre, rec, f1, auc = valid_fn(model, valid_dataloader, criterion, device)
~/kt/jktModel/model.py in train_fn(model, dataloader, optimizer, criterion, scheduler, device)
110 model.train()
111 train_loss = []
--> 112 num_corrects = 0
113 num_total = 0
114 labels = []
~/anaconda3/envs/dkt/lib/python3.8/site-packages/torch/utils/data/dataloader.py in __next__(self)
519 if self._sampler_iter is None:
520 self._reset()
--> 521 data = self._next_data()
522 self._num_yielded += 1
523 if self._dataset_kind == _DatasetKind.Iterable and \
~/anaconda3/envs/dkt/lib/python3.8/site-packages/torch/utils/data/dataloader.py in _next_data(self)
559 def _next_data(self):
560 index = self._next_index() # may raise StopIteration
--> 561 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
562 if self._pin_memory:
563 data = _utils.pin_memory.pin_memory(data)
~/anaconda3/envs/dkt/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
50 else:
51 data = self.dataset[possibly_batched_index]
---> 52 return self.collate_fn(data)
~/anaconda3/envs/dkt/lib/python3.8/site-packages/torch_geometric/loader/dataloader.py in __call__(self, batch)
32 return type(elem)(*(self(s) for s in zip(*batch)))
33 elif isinstance(elem, Sequence) and not isinstance(elem, str):
---> 34 return [self(s) for s in zip(*batch)]
35
36 raise TypeError(f'DataLoader found invalid type: {type(elem)}')
~/anaconda3/envs/dkt/lib/python3.8/site-packages/torch_geometric/loader/dataloader.py in <listcomp>(.0)
32 return type(elem)(*(self(s) for s in zip(*batch)))
33 elif isinstance(elem, Sequence) and not isinstance(elem, str):
---> 34 return [self(s) for s in zip(*batch)]
35
36 raise TypeError(f'DataLoader found invalid type: {type(elem)}')
~/anaconda3/envs/dkt/lib/python3.8/site-packages/torch_geometric/loader/dataloader.py in __call__(self, batch)
34 return [self(s) for s in zip(*batch)]
35
---> 36 raise TypeError(f'DataLoader found invalid type: {type(elem)}')
37
38 def collate(self, batch): # Deprecated...
TypeError: DataLoader found invalid type: <class 'numpy.ndarray'>
我一点头绪也没有。这困扰我已经几天了。