あれ、warp suffle を使った方が遅い???
ある CUDA 本に warp suffle を使った sum reduction プログラムが乗っていたのだが、GTX 780 で実行した結果が Tesla K40 とまったく違うのでメモ。
#include <stdio.h> #include <stdlib.h> #define WORK_SIZE 16384 // THREAD_SIZE <= 1024 でなければならない #define THREAD_SIZE 256 #define BLOCK_SIZE (WORK_SIZE / THREAD_SIZE) /** * This macro checks return value of the CUDA runtime call and exits * the application if the call failed. */ #define CUDA_CHECK_RETURN(value) { \ cudaError_t _m_cudaStat = value; \ if (_m_cudaStat != cudaSuccess) { \ fprintf(stderr, "Error %s at line %d in file %s\n", \ cudaGetErrorString(_m_cudaStat), __LINE__, __FILE__); \ exit(1); \ } } __device__ __inline__ int warpReduce(int sum) { sum += __shfl_xor(sum, 16); sum += __shfl_xor(sum, 8); sum += __shfl_xor(sum, 4); sum += __shfl_xor(sum, 2); sum += __shfl_xor(sum, 1); return sum; } __global__ void reduce_by_shfl(int *in, int *out) { unsigned int laneId = threadIdx.x % warpSize; unsigned int warpId = threadIdx.x / warpSize; unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; __shared__ int psum[THREAD_SIZE / 32]; int sum = in[idx]; // warp 内の reduction を行う sum = warpReduce(sum); if (laneId == 0) { psum[warpId] = sum; } __syncthreads(); // block 内の reduction を行う sum = (threadIdx.x < (THREAD_SIZE / 32))?psum[threadIdx.x]:0; if (warpId == 0) sum = warpReduce(sum); if (threadIdx.x == 0) out[blockIdx.x] = sum; } __global__ void reduce_by_smem(int *in, int *out) { unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; __shared__ int psum[THREAD_SIZE]; psum[threadIdx.x] = in[idx]; __syncthreads(); int i = THREAD_SIZE / 2; while (i > 0) { if (threadIdx.x < i) { psum[threadIdx.x] += psum[threadIdx.x + i]; } __syncthreads(); i >>= 1; } if (threadIdx.x == 0) out[blockIdx.x] = psum[0]; } __global__ void warmup(int *in, int *out) { } int reduce_cpu(int *in) { int i, sum = 0; for (i = 0; i < WORK_SIZE; ++i) { sum += in[i]; } return sum; } /** * Host function that prepares data array and passes it to the CUDA kernel. */ int main(void) { int *d = NULL, *d_out = NULL; int i, k; int idata[WORK_SIZE], odata[BLOCK_SIZE]; for (i = 0; i < WORK_SIZE; i++) idata[i] = i; CUDA_CHECK_RETURN(cudaMalloc((void**) &d, sizeof(int) * WORK_SIZE)); CUDA_CHECK_RETURN(cudaMalloc((void **)&d_out, sizeof(int) * BLOCK_SIZE)); CUDA_CHECK_RETURN( cudaMemcpy(d, idata, sizeof(int) * WORK_SIZE, cudaMemcpyHostToDevice)); // warm up warmup<<<BLOCK_SIZE, THREAD_SIZE>>>(d, d_out); CUDA_CHECK_RETURN(cudaThreadSynchronize()); // Wait for the GPU launched work to complete CUDA_CHECK_RETURN(cudaGetLastError()); CUDA_CHECK_RETURN(cudaMemcpy(odata, d_out, sizeof(int) * BLOCK_SIZE, cudaMemcpyDeviceToHost)); for (k = 0; k < 10; ++k) { reduce_by_smem<<<BLOCK_SIZE, THREAD_SIZE>>>(d, d_out); CUDA_CHECK_RETURN(cudaThreadSynchronize()); // Wait for the GPU launched work to complete CUDA_CHECK_RETURN(cudaGetLastError()); } CUDA_CHECK_RETURN(cudaMemcpy(odata, d_out, sizeof(int) * BLOCK_SIZE, cudaMemcpyDeviceToHost)); int sum = 0; for (i = 0; i < BLOCK_SIZE; i++) { sum += odata[i]; } printf("smem sum: %d\n", sum); if (reduce_cpu(idata) == sum) { printf("matched\n"); } else { printf("unmatched\n"); } for (k = 0; k < 10; ++k) { reduce_by_shfl<<<BLOCK_SIZE, THREAD_SIZE>>>(d, d_out); CUDA_CHECK_RETURN(cudaThreadSynchronize()); // Wait for the GPU launched work to complete CUDA_CHECK_RETURN(cudaGetLastError()); } CUDA_CHECK_RETURN(cudaMemcpy(odata, d_out, sizeof(int) * BLOCK_SIZE, cudaMemcpyDeviceToHost)); sum = 0; for (i = 0; i < BLOCK_SIZE; i++) { sum += odata[i]; } printf("shlf sum: %d\n", sum); if (reduce_cpu(idata) == sum) { printf("matched\n"); } else { printf("unmatched\n"); } CUDA_CHECK_RETURN(cudaFree((void*) d)); CUDA_CHECK_RETURN(cudaDeviceReset()); return 0; }
で、nvprof をかませてみると
Time(%) Time Calls Avg Min Max Name
53.30% 103.87us 10 10.387us 10.304us 10.880us reduce_by_shfl(int*, int*)
41.36% 80.607us 10 8.0600us 7.9680us 8.2880us reduce_by_smem(int*, int*)
warp suffle を使った方が遅いじゃん!!
どこがいけなかったのだろうか...