以下两个函数有什么区别?
__device__ inline void comparator_volatile(volatile float &A, volatile float &B, uint dir) {
float t;
if ((A > B) == dir) {
t = A;
A = B;
B = t;
}
}
__device__ inline void comparator(float &A, float &B, uint dir) {
float t;
if ((A > B) == dir) {
t = A;
A = B;
B = t;
}
__threadfence();
}
有人能帮我吗?
我在基于CUDA SDK版本的一些不同版本中实现了BitonicSort
。
对于原子版本(bitonicSortAtomic
),我尝试在__syncblocks_atomic
中使用__threadfence()
来维护内存一致性。但它不起作用(输出不正确)。我不得不调用comparator_volatile
而不是comparator
,然后我得到了正确的结果。有什么想法吗?
BitonicSort
基准测试:
// (C) Copyright 2013, University of Illinois. All Rights Reserved
#include <stdlib.h>
#include <stdio.h>
#include "parboil.h"
#define THREADS 256
#define BLOCKS 32
#define NUM_VALS 2*THREADS*BLOCKS
__device__ volatile int mutex = 0;
__device__ inline void __syncblocks_atomic(int goal) {
__syncthreads();
// __threadfence();
int tx = threadIdx.x;
if (tx == 0) {
atomicAdd((int *)&mutex, 1);
while(g_mutex != goal) {}
}
__syncthreads();
}
__device__ inline void comparator(float &A, float &B, uint dir) {
float t;
if ((A > B) == dir) {
t = A;
A = B;
B = t;
}
}
__device__ inline void comparator_volatile(volatile float &A, volatile float &B, uint dir) {
float t;
if ((A > B) == dir) {
t = A;
A = B;
B = t;
}
}
#ifdef NAIVE
__global__ void bitonicSortNaive(float *src, int stride, int size) {
unsigned int tid = threadIdx.x + blockDim.x * blockIdx.x;
uint dir = (tid & (size / 2)) == 0;
unsigned int pos = 2*tid - (tid & (stride - 1));
comparator(src[pos], src[pos+stride], dir);
}
#endif
#ifdef ATOMIC
__global__ void bitonicSortAtomic(float *src, int length) {
uint numBlocks = gridDim.x * gridDim.y * gridDim.z;
uint goalVal = 0;
uint tid = threadIdx.x + blockDim.x * blockIdx.x;
for(uint size=2; size<=length; size<<=1) {
for(uint stride=size>>1; stride>0; stride=stride>>1) {
uint dir = (tid & (size / 2)) == 0;
uint pos = 2*tid - (tid & (stride - 1));
comparator_volatile(src[pos], src[pos+stride], dir);
if(stride>THREADS || (stride==1 && size>=THREADS)) {
goalVal += numBlocks;
__syncblocks_atomic(goalVal);
}
else
__syncthreads();
} // end for stride
} // end for size
}
#endif
int main() {
printf("[BENCH] Bitonic Sort %d elements\n", NUM_VALS);
printf("[BENCH] Xuhao Chen <cxh@illinois.edu>\n");
#ifdef NAIVE
printf("[BENCH] Naive version\n");
#endif
#ifdef ATOMIC
printf("[BENCH] Atomic Barrier\n");
#endif
float *values = (float*) malloc( NUM_VALS * sizeof(float));
array_init(values, NUM_VALS);
float *dev_values;
size_t size = NUM_VALS * sizeof(float);
cudaMalloc((void**) &dev_values, size);
cudaMemcpy(dev_values, values, size, cudaMemcpyHostToDevice);
dim3 blocks(BLOCKS,1);
dim3 threads(THREADS,1);
cudaDeviceSynchronize();
#ifdef NAIVE
int j, k;
for (k = 2; k <= NUM_VALS; k <<= 1) {
for (j=k>>1; j>0; j=j>>1) {
bitonicSortNaive<<<blocks, threads>>>(dev_values, j, k);
}
}
#endif
#ifdef ATOMIC
bitonicSortAtomic<<<blocks, threads>>>(dev_values, NUM_VALS);
#endif
cudaDeviceSynchronize();
cudaMemcpy(values, dev_values, size, cudaMemcpyDeviceToHost);
cudaFree(dev_values);
free(values);
}
__syncblocks_atomic
是一个用于实现全局屏障的函数。由于存在块间通信,因此我必须保持数据一致性。
__syncthreads()
,它作为同一块中线程的屏障。每个线程应分配到一个固定位置,并且应该知道在每次迭代时要做什么。因此,您不应该使用易失性变量,而只需使用__syncthreads()
,因为您不仅希望线程停止,直到其所做的修改可见,而且您还需要其他线程等待,直到它们都完成了迭代。 - Giulio Franco