使用CUDA Thrust进行向量的替换/合并操作

Question

使用CUDA Thrust进行向量的替换/合并操作

c++vectorcudathrust

3

我有两个操作，用CUDA Thrust来操作设备向量中的元素。哪些方法可以更高效地实现这两个函数？

批量替换向量中的部分值为另一个向量的值。示例如下：
```
arr1 = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
arr2 = [1, 1, 1, 2, 2, 2]
// 批量替换 {4, 5, 6} 和 {10, 11, 12} 后，得到：
arr1 = [1, 2, 3, 1, 1, 1, 7, 8, 9, 2, 2, 2]
```
在我的情况下，总是满足 size(arr1) / size(arr2) = 2。

我们从索引 1 * batch 和 3 * batch 开始替换 arr1 中的值。

大多数情况下，我需要替换奇数批次的项目。同时也需要处理一般批次的情况。
合并两个向量，交替插入元素。

类似问题已经在如何交替合并两个向量? 中提出，但是该问题是针对 R 语言的。
```
arr1 = [1, 2, 3, 4, 5, 6]
arr2 = [1, 1, 1, 2, 2, 2]
// 合并 arr1 和 arr2 后，得到：
arr3 = [1, 2, 3, 1, 1, 1, 4, 5, 6, 2, 2, 2]
```
可能可以使用 replace_copy_if 函数，但是我不知道如何与 fancy 迭代器结合使用。此外，一些博客显示 replace_copy_if 比 copy_if 函数更慢。

- Chris

你获取（1）的偏移量是以什么格式呢？是一个偏移量向量 [1, 3] 吗？ - undefined

我不明白replace_copy_if（或任何其他replace算法）在这里如何有帮助，因为它们总是用单个值替换，而不是使用另一个向量中的不同条目进行替换。 - undefined

1个回答

网页内容由stack overflow 提供, 点击上面的

可以查看英文原文，
原文链接

- paleonix · Accepted Answer

这个操作将`arr2`中的值分散到`arr1`中，因此我们可以使用`thrust::scatter`。可以使用`thrust::counting_iterator`（类似于`std::ranges::views::iota`）的`thrust::transform_iterator`计算值被分散到的索引。对于一般情况下批处理索引另外给定为输入的`thrust::device_vector`，你可以使用：

```cpp const auto indices_ptr = indices.data();

const auto scatter_indices_it = thrust::make_transform_iterator( thrust::make_counting_iterator(0), [batch, indices_ptr] __host__ __device__ (int idx) { const int batch_id = idx / batch; const int elem_id = idx % batch; return indices_ptr[batch_id] * batch + elem_id; }); ```

而在只想要分散到奇数批次的特殊情况下，你应该使用：

```cpp const auto scatter_indices_it = thrust::make_transform_iterator( thrust::make_counting_iterator(0), [batch] __host__ __device__ (int idx) { const int batch_id = idx / batch; const int elem_id = idx % batch; return (batch_id * 2 + 1) * batch + elem_id; }); ```

`scatter`操作本身很简单：

```cpp thrust::scatter( arr2.cbegin(), arr2.cend(), scatter_indices_it, arr1.begin()); ```

有多种可能的方法来完成这个问题。我首先想到的是使用`thrust::merge_by_key`将两个向量合并，其中键使用与上述分散索引类似的方案生成：

```cpp const auto merge_keys1_it = thrust::make_transform_iterator( thrust::make_counting_iterator(0), [batch] __host__ __device__ (int idx) { const int batch_id = idx / batch; return batch_id * 2; }); const auto merge_keys2_it = thrust::make_transform_iterator( thrust::make_counting_iterator(0), [batch] __host__ __device__ (int idx) { const int batch_id = idx / batch; return batch_id * 2 + 1; }); thrust::merge_by_key( merge_keys1_it, merge_keys1_it + arr1.size(), merge_keys2_it, merge_keys2_it + arr2.size(), arr1.cbegin(), arr2.cbegin(), thrust::make_discard_iterator(), arr3.begin()); ```

这种方法可以工作，而且相对优雅，但由于合并算法的复杂性和操作的规律性，可能不是最理想的性能选择。更高效的方法可能是创建一个复杂的迭代器来处理整个交错操作：

```cpp const auto arr1_ptr = arr1.data(); const auto arr2_ptr = arr2.data();

const auto batch_interleave_it = thrust::make_transform_iterator( thrust::make_counting_iterator(0), [batch, arr1_ptr, arr2_ptr] __host__ __device__ (int idx) -> int { const int batch_id = idx / batch; const int batch_el = idx % batch; const int in_idx = (batch_id / 2) * batch + batch_el; const bool even_batch = (batch_id % 2 == 0); return even_batch ? arr1_ptr[in_idx] : arr2_ptr[in_idx]; }); ```

现在，可以将该迭代器传递给管道中的下一个算法进行内核融合，或者使用以begin和end迭代器为参数的构造函数来初始化一个新向量。如果未来要多次使用交错的`arr3`（读取），则应该将其放入一个新的向量中，而不是重用迭代器，因为如果`batch`不是warp大小（32）的倍数，迭代器无法进行连续的全局内存访问。

完整源代码：

由于使用了设备lambda函数，nvcc 需要 -extended-lambda。自 CUDA 12 版本起，出于某种原因，还需要 -rdc=true。

#include <iostream>
#include <iterator>

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>

#include <thrust/scatter.h>
#include <thrust/merge.h>

#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/discard_iterator.h>

template <typename T>
void print(const thrust::device_vector<T> &vec) {
    thrust::host_vector<int> h_vec(vec);
    std::ostream_iterator<int> out_it(std::cout, ", ");
    thrust::copy(h_vec.cbegin(), h_vec.cend(),
                 out_it);
    std::cout << '\n';
}

void part1() {
    constexpr int h_arr1[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
    constexpr int h_arr2[] = {1, 1, 1, 2, 2, 2};
    constexpr int batch = 3;

    thrust::device_vector<int> arr1(std::cbegin(h_arr1), std::cend(h_arr1));
    thrust::device_vector<int> arr2(std::cbegin(h_arr2), std::cend(h_arr2));
    
    assert(arr1.size() == 2 * arr2.size());

#if 1
    // specialized version where arr2 is scattered to "uneven"
    // batches
    const auto scatter_indices_it = thrust::make_transform_iterator(
        thrust::make_counting_iterator(0),
        [batch]
        __host__ __device__ (int idx) {
            const int batch_id = idx / batch;
            const int elem_id = idx % batch;
            return (batch_id * 2 + 1) * batch + elem_id;
        });
#else
    // more general version where arr2 is scattered to batches given
    // in indices
    constexpr int h_indices[] = {1, 3};
    thrust::device_vector<int> indices(
        std::cbegin(h_indices), std::cend(h_indices));
    const auto indices_ptr = indices.data();

    const auto scatter_indices_it = thrust::make_transform_iterator(
        thrust::make_counting_iterator(0),
        [batch, indices_ptr]
        __host__ __device__ (int idx) {
            const int batch_id = idx / batch;
            const int elem_id = idx % batch;
            return indices_ptr[batch_id] * batch + elem_id;
        });
#endif
    
    thrust::scatter(
        arr2.cbegin(), arr2.cend(),
        scatter_indices_it,
        arr1.begin());

    print(arr1);
}

void part2() {
    constexpr int h_arr1[] = {1, 2, 3, 4, 5, 6};
    constexpr int h_arr2[] = {1, 1, 1, 2, 2, 2};
    constexpr int batch = 3;

    thrust::device_vector<int> arr1(std::cbegin(h_arr1), std::cend(h_arr1));
    thrust::device_vector<int> arr2(std::cbegin(h_arr2), std::cend(h_arr2));
    const auto out_size = arr1.size() + arr2.size();

    assert(arr1.size() == arr2.size());

#if 1
    const auto arr1_ptr = arr1.data();
    const auto arr2_ptr = arr2.data();

    const auto batch_interleave_it = thrust::make_transform_iterator(
        thrust::make_counting_iterator(0),
        [batch, arr1_ptr, arr2_ptr]
        __host__ __device__ (int idx) -> int {
            const int batch_id = idx / batch;
            const int batch_el = idx % batch;
            const int in_idx = (batch_id / 2) * batch + batch_el;
            const bool even_batch = (batch_id % 2 == 0);
            return even_batch ? arr1_ptr[in_idx] : arr2_ptr[in_idx];
        });

    // only put into vector if really needed, better to feed the
    // iterator directly to the next operation for kernel fusion
    thrust::device_vector<int> arr3(
        batch_interleave_it, batch_interleave_it + out_size);
#else
    thrust::device_vector<int> arr3(out_size);

    const auto merge_keys1_it = thrust::make_transform_iterator(
        thrust::make_counting_iterator(0),
        [batch]
        __host__ __device__ (int idx) {
            const int batch_id = idx / batch;
            return batch_id * 2;
        });
    const auto merge_keys2_it = thrust::make_transform_iterator(
        thrust::make_counting_iterator(0),
        [batch]
        __host__ __device__ (int idx) {
            const int batch_id = idx / batch;
            return batch_id * 2 + 1;
        });
    
    thrust::merge_by_key(
        merge_keys1_it, merge_keys1_it + arr1.size(),
        merge_keys2_it, merge_keys2_it + arr2.size(),
        arr1.cbegin(),
        arr2.cbegin(),
        thrust::make_discard_iterator(),
        arr3.begin());
#endif

    print(arr3);
}

int main(void) {
    part1();
    part2();
}