PyTorch“在DataLoader工作进程0中捕获IndexError”，“IndexError：数组的索引太多”。

Question

PyTorch“在DataLoader工作进程0中捕获IndexError”，“IndexError：数组的索引太多”。

python machine-learning deep-learning pytorch

4

我正在尝试根据PyTorch的“微调目标检测”官方教程实现一个检测模型。使用少量数据（10张图像）似乎已经成功了。但是，我将整个数据集上传到Drive并检查了索引-数据-标签对应关系。在我的设置中没有不匹配的项目，我已经解决了该部分中的所有错误。（我从GDrive标签中删除了额外的项目）

class SomeDataset(torch.utils.data.Dataset):
    """Object-detection dataset pairing each image with a text label file.

    Files in ``<root_path>/images`` and ``<root_path>/labels`` are listed and
    sorted independently, then paired by position, so the two directories
    must contain matching, identically ordered entries.

    Each label file is read as space-delimited rows. Columns 0-7 are used as
    interleaved x/y coordinates (presumably the four corners of a polygon --
    confirm against the annotation format) and column 8 as the class name,
    which is mapped to its index in the module-level ``classnames`` sequence.
    """

    def __init__(self, root_path, transforms):
        """Index the image and label files found under ``root_path``.

        Args:
            root_path: Directory containing ``images`` and ``labels``
                sub-directories.
            transforms: Callable invoked as ``transforms(img, target)`` and
                returning the transformed pair, or ``None`` to skip it.
        """
        self.root_path = root_path
        self.transforms = transforms
        # Sort both listings so that position ``i`` in one list corresponds
        # to position ``i`` in the other.
        self.imgs = sorted(os.listdir(os.path.join(root_path, "images")))
        self.labels = sorted(os.listdir(os.path.join(root_path, "labels")))

    def __getitem__(self, idx):
        """Load the image and detection target stored at position ``idx``.

        Args:
            idx: Position in the sorted image/label listings.

        Returns:
            A tuple ``(img, target)`` where ``img`` is the RGB image (after
            ``transforms``, if any) and ``target`` is a dict with the keys
            ``boxes``, ``labels``, ``image_id``, ``area`` and ``iscrowd``.
        """
        # Load the image and locate its label file.
        img_path = os.path.join(self.root_path, "images", self.imgs[idx])
        label_path = os.path.join(self.root_path, "labels", self.labels[idx])

        img = Image.open(img_path).convert("RGB")

        # ndmin=2 keeps the result two-dimensional even when the file holds a
        # single row. Without it a one-object file comes back as a 1-D array,
        # shape[0] becomes the number of columns, and the 2-D slicing below
        # raises "IndexError: too many indices for array".
        label_data = np.loadtxt(label_path, dtype=str, delimiter=' ', ndmin=2)
        print(f"{len(label_data)} is the length of label data")
        # An empty file yields an array with no elements; treat it as having
        # zero objects regardless of the exact empty shape NumPy reports.
        num_objs = label_data.shape[0] if label_data.size else 0
        if num_objs != 0:
            print(f"number of objects {num_objs}")
            # Replace every class name by its index in ``classnames`` so the
            # whole array can be converted to numbers afterwards.
            for i, label_name in enumerate(classnames):
                label_data[label_data == label_name] = i

            # ``np.float`` was removed from NumPy; float64 is the same type.
            label_data = label_data.astype(np.float64)
            print(f"label data {label_data}")
            xs = label_data[:, 0:8:2]
            ys = label_data[:, 1:8:2]

            # Axis-aligned bounding box enclosing the coordinates of a row.
            x_min = np.min(xs, axis=1)[..., np.newaxis]
            x_max = np.max(xs, axis=1)[..., np.newaxis]
            y_min = np.min(ys, axis=1)[..., np.newaxis]
            y_max = np.max(ys, axis=1)[..., np.newaxis]

            boxes = np.hstack((x_min, y_min, x_max, y_max))

            labels = label_data[:, 8]
        else:
            # No annotation: emit one dummy box carrying label 0.
            boxes = [[0, 0, 1, 1]]
            labels = [0]
            num_objs = 1

        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        labels = torch.as_tensor(labels, dtype=torch.int64)

        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        # Every instance is marked as "not crowd".
        iscrowd = torch.zeros((num_objs,), dtype=torch.int64)

        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["image_id"] = image_id
        target["area"] = area
        target["iscrowd"] = iscrowd

        if self.transforms is not None:
            img, target = self.transforms(img, target)

        return img, target

    def __len__(self):
        """Return the number of image files found in the ``images`` folder."""
        return len(self.imgs)

我的主方法如下所示，

def main():
    """Build the datasets, loaders, model and optimizer, then run training.

    Returns:
        The model after ``num_epochs`` calls to ``train_one_epoch``.
    """
    # Use the GPU when CUDA is available, otherwise fall back to the CPU.
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    # 16 classes in total: background plus the object classes.
    num_classes = 16

    # Training and validation datasets with their transformations.
    dataset = SomeDataset('trainImages', get_transform(train=True))
    print(f"{len(dataset)} number of images in training dataset")
    # NOTE(review): the validation set is also built with train=True
    # transforms -- confirm this is intended.
    dataset_validation = SomeDataset('valImages', get_transform(train=True))
    print(f"{len(dataset_validation)} number of images in validation dataset")

    # Data loaders for both splits.
    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=20,
        shuffle=True,
        num_workers=4,
        collate_fn=utils.collate_fn,
    )
    # Created but not consumed below (the evaluation step is disabled).
    data_loader_val = torch.utils.data.DataLoader(
        dataset_validation,
        batch_size=10,
        shuffle=False,
        num_workers=4,
        collate_fn=utils.collate_fn,
    )

    # Build the detector and move it to the selected device.
    model = get_rcnn(num_classes)
    model.to(device)

    # Optimise only the parameters that require gradients.
    trainable_params = list(
        filter(lambda p: p.requires_grad, model.parameters())
    )
    optimizer = torch.optim.Adam(trainable_params, lr=0.0005)
    # Step-wise learning-rate schedule: step_size=3, gamma=0.1.
    lr_scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer, step_size=3, gamma=0.1
    )

    # Train for 5 epochs.
    num_epochs = 5
    for epoch in range(num_epochs):
        # One pass over the training loader, then advance the LR schedule.
        train_one_epoch(
            model, optimizer, data_loader, device, epoch, print_freq=100
        )
        lr_scheduler.step()

    print("That's it!")
    return model

当我运行我的代码时，它会运行几个数据（例如其中的10个），然后停止并输出此错误。

IndexError: Caught IndexError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/_utils/worker.py", line 178, in _worker_loop
    data = fetcher.fetch(index)
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "<ipython-input-114-e0ccd94603fd>", line 31, in __getitem__
    xs = label_data[:,0:8:2];
IndexError: too many indices for array

错误的调用栈从model = main()进入train_one_epoch()，并继续向下延伸。我不明白为什么会这样。另外，下面是数据集中的一个样本实例，供参考。

(<PIL.Image.Image image mode=RGB size=1024x1024 at 0x7F46FC0A94A8>, {'boxes': tensor([[ 628.,    6.,  644.,   26.],
    [ 633.,   50.,  650.,   65.],
    [ 620.,   27.,  637.,   44.],
    [ 424.,  193.,  442.,  207.],
    [ 474.,  188.,  496.,  204.],
    [ 383.,  226.,  398.,  236.],
    [ 399.,  218.,  418.,  231.],
    [  42.,  189.,   63.,  203.],
    [ 106.,  159.,  129.,  169.],
    [ 273.,   17.,  287.,   34.],
    [ 225.,  961.,  234.,  980.],
    [ 220., 1004.,  230., 1024.]]), 'labels': tensor([5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5]), 'image_id': tensor([0]), 'area': tensor([320., 255., 289., 252., 352., 150., 247., 294., 230., 238., 171., 200.]), 'iscrowd': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])})

- CrmXao

3个回答

1

使用np.loadtxt()方法时，请确保添加参数ndmin=2。否则，即使标签文件中只有一个对象，num_objs的值也会变成10。

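这是因为当文件中只有1个对象（一行）时，np.loadtxt()会返回一维数组，其shape[0]等于列数（10列），于是被误认为有10个对象；随后的二维索引label_data[:,0:8:2]便会触发“too many indices for array”错误。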

ndmin=2可确保np.loadtxt()方法的输出始终是二维数组，而不会退化为一维的行向量或列向量。

- CrmXao

0

主要原因可能是内存不足（不是GPU内存）。检查内存和交换空间是否被使用。如果是，那么就是内存问题。您可以使用小批量、少量num_workers、扩展交换空间等方法来减少内存负载。任何能减轻内存负担的方法都会有所帮助。如何在Ubuntu上添加交换空间

- Mahsa Hassankashi

网页内容由stack overflow 提供, 点击上面的

可以查看英文原文，
原文链接

- mktplus · Accepted Answer

在尝试使用批大小为8的Dataloader训练785长度的数据集时，我遇到了相同的问题。

使数据集长度能够被批大小整除解决了这个问题。