这个操作将`arr2`中的值分散到`arr1`中,因此我们可以使用`thrust::scatter`。可以使用`thrust::counting_iterator`(类似于`std::ranges::views::iota`)的`thrust::transform_iterator`计算值被分散到的索引。对于一般情况下批处理索引另外给定为输入的`thrust::device_vector`,你可以使用:
```cpp
const auto indices_ptr = indices.data();
const auto scatter_indices_it = thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
[batch, indices_ptr]
__host__ __device__ (int idx) {
const int batch_id = idx / batch;
const int elem_id = idx % batch;
return indices_ptr[batch_id] * batch + elem_id;
});
```
而在只想要分散到奇数批次的特殊情况下,你应该使用:
```cpp
const auto scatter_indices_it = thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
[batch]
__host__ __device__ (int idx) {
const int batch_id = idx / batch;
const int elem_id = idx % batch;
return (batch_id * 2 + 1) * batch + elem_id;
});
```
`scatter`操作本身很简单:
```cpp
thrust::scatter(
arr2.cbegin(), arr2.cend(),
scatter_indices_it,
arr1.begin());
```
有多种可能的方法来完成这个问题。我首先想到的是使用`thrust::merge_by_key`将两个向量合并,其中键使用与上述分散索引类似的方案生成:
```cpp
const auto merge_keys1_it = thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
[batch]
__host__ __device__ (int idx) {
const int batch_id = idx / batch;
return batch_id * 2;
});
const auto merge_keys2_it = thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
[batch]
__host__ __device__ (int idx) {
const int batch_id = idx / batch;
return batch_id * 2 + 1;
});
thrust::merge_by_key(
merge_keys1_it, merge_keys1_it + arr1.size(),
merge_keys2_it, merge_keys2_it + arr2.size(),
arr1.cbegin(),
arr2.cbegin(),
thrust::make_discard_iterator(),
arr3.begin());
```
这种方法可以工作,而且相对优雅,但由于合并算法的复杂性和操作的规律性,可能不是最理想的性能选择。更高效的方法可能是创建一个复杂的迭代器来处理整个交错操作:
```cpp
const auto arr1_ptr = arr1.data();
const auto arr2_ptr = arr2.data();
const auto batch_interleave_it = thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
[batch, arr1_ptr, arr2_ptr]
__host__ __device__ (int idx) -> int {
const int batch_id = idx / batch;
const int batch_el = idx % batch;
const int in_idx = (batch_id / 2) * batch + batch_el;
const bool even_batch = (batch_id % 2 == 0);
return even_batch ? arr1_ptr[in_idx] : arr2_ptr[in_idx];
});
```
现在,可以将该迭代器传递给管道中的下一个算法进行内核融合,或者使用以
begin
和
end
迭代器为参数的构造函数来初始化一个新向量。如果未来要多次使用交错的`arr3`(读取),则应该将其放入一个新的向量中,而不是重用迭代器,因为如果`batch`不是warp大小(32)的倍数,迭代器无法进行连续的全局内存访问。
完整源代码:
由于使用了设备lambda函数,nvcc
需要 -extended-lambda
。自 CUDA 12 版本起,出于某种原因,还需要 -rdc=true
。
#include <iostream>
#include <iterator>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/scatter.h>
#include <thrust/merge.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/discard_iterator.h>
template <typename T>
void print(const thrust::device_vector<T> &vec) {
thrust::host_vector<int> h_vec(vec);
std::ostream_iterator<int> out_it(std::cout, ", ");
thrust::copy(h_vec.cbegin(), h_vec.cend(),
out_it);
std::cout << '\n';
}
void part1() {
constexpr int h_arr1[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
constexpr int h_arr2[] = {1, 1, 1, 2, 2, 2};
constexpr int batch = 3;
thrust::device_vector<int> arr1(std::cbegin(h_arr1), std::cend(h_arr1));
thrust::device_vector<int> arr2(std::cbegin(h_arr2), std::cend(h_arr2));
assert(arr1.size() == 2 * arr2.size());
#if 1
const auto scatter_indices_it = thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
[batch]
__host__ __device__ (int idx) {
const int batch_id = idx / batch;
const int elem_id = idx % batch;
return (batch_id * 2 + 1) * batch + elem_id;
});
#else
constexpr int h_indices[] = {1, 3};
thrust::device_vector<int> indices(
std::cbegin(h_indices), std::cend(h_indices));
const auto indices_ptr = indices.data();
const auto scatter_indices_it = thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
[batch, indices_ptr]
__host__ __device__ (int idx) {
const int batch_id = idx / batch;
const int elem_id = idx % batch;
return indices_ptr[batch_id] * batch + elem_id;
});
#endif
thrust::scatter(
arr2.cbegin(), arr2.cend(),
scatter_indices_it,
arr1.begin());
print(arr1);
}
void part2() {
constexpr int h_arr1[] = {1, 2, 3, 4, 5, 6};
constexpr int h_arr2[] = {1, 1, 1, 2, 2, 2};
constexpr int batch = 3;
thrust::device_vector<int> arr1(std::cbegin(h_arr1), std::cend(h_arr1));
thrust::device_vector<int> arr2(std::cbegin(h_arr2), std::cend(h_arr2));
const auto out_size = arr1.size() + arr2.size();
assert(arr1.size() == arr2.size());
#if 1
const auto arr1_ptr = arr1.data();
const auto arr2_ptr = arr2.data();
const auto batch_interleave_it = thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
[batch, arr1_ptr, arr2_ptr]
__host__ __device__ (int idx) -> int {
const int batch_id = idx / batch;
const int batch_el = idx % batch;
const int in_idx = (batch_id / 2) * batch + batch_el;
const bool even_batch = (batch_id % 2 == 0);
return even_batch ? arr1_ptr[in_idx] : arr2_ptr[in_idx];
});
thrust::device_vector<int> arr3(
batch_interleave_it, batch_interleave_it + out_size);
#else
thrust::device_vector<int> arr3(out_size);
const auto merge_keys1_it = thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
[batch]
__host__ __device__ (int idx) {
const int batch_id = idx / batch;
return batch_id * 2;
});
const auto merge_keys2_it = thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
[batch]
__host__ __device__ (int idx) {
const int batch_id = idx / batch;
return batch_id * 2 + 1;
});
thrust::merge_by_key(
merge_keys1_it, merge_keys1_it + arr1.size(),
merge_keys2_it, merge_keys2_it + arr2.size(),
arr1.cbegin(),
arr2.cbegin(),
thrust::make_discard_iterator(),
arr3.begin());
#endif
print(arr3);
}
int main(void) {
part1();
part2();
}
[1, 3]
吗? - undefinedreplace_copy_if
(或任何其他replace
算法)在这里如何有帮助,因为它们总是用单个值替换,而不是使用另一个向量中的不同条目进行替换。 - undefined