読者です 読者をやめる 読者になる 読者になる

徒然なる日々を送るソフトウェアデベロッパーの記録(2)

技術上思ったことや感じたことを気ままに記録していくブログです。さくらから移設しました。

あれ、warp shuffle を使った方が遅い???

ある CUDA 本に warp shuffle を使った sum reduction プログラムが載っていたのだが、GTX 780 で実行した結果が Tesla K40 とまったく違うのでメモ。

#include <stdio.h>
#include <stdlib.h>

// Total number of int elements to reduce. Must be a multiple of
// THREAD_SIZE so that BLOCK_SIZE divides evenly (no tail handling below).
#define WORK_SIZE 16384
// THREAD_SIZE must be <= 1024 (CUDA's per-block thread limit)
#define THREAD_SIZE 256
#define BLOCK_SIZE (WORK_SIZE / THREAD_SIZE)

/**
 * This macro checks the return value of a CUDA runtime call and exits
 * the application if the call failed.
 *
 * Wrapped in do { ... } while (0) so the macro expands to exactly one
 * statement and composes safely with if/else — the original bare-brace
 * form made `if (x) CUDA_CHECK_RETURN(y); else ...` a syntax error.
 */
#define CUDA_CHECK_RETURN(value) do {										\
	cudaError_t _m_cudaStat = value;										\
	if (_m_cudaStat != cudaSuccess) {										\
		fprintf(stderr, "Error %s at line %d in file %s\n",					\
				cudaGetErrorString(_m_cudaStat), __LINE__, __FILE__);		\
		exit(1);															\
	} } while (0)

/**
 * Butterfly (XOR) sum reduction across the 32 lanes of a warp.
 * After the call, every lane holds the sum of all 32 lanes' inputs.
 *
 * Uses __shfl_xor_sync: the mask-less __shfl_xor is deprecated and was
 * removed for Volta (SM70) and later, where independent thread scheduling
 * makes implicit warp synchrony unsafe. The full mask 0xffffffff assumes
 * all 32 lanes of the warp reach this call — true for both call sites in
 * this file (whole warps execute the reduction).
 */
__device__ __inline__ int warpReduce(int sum) {
#pragma unroll
	for (int offset = 16; offset > 0; offset >>= 1) {
		sum += __shfl_xor_sync(0xffffffff, sum, offset);
	}
	return sum;
}

/**
 * Per-block sum reduction built on warp shuffles.
 *
 * Expects a 1-D launch with THREAD_SIZE threads per block; `in` must hold
 * at least gridDim.x * blockDim.x ints. Writes one partial sum per block
 * to out[blockIdx.x].
 *
 * Stage 1: each warp reduces its 32 inputs; lane 0 publishes the result
 * to shared memory. Stage 2: warp 0 reduces the per-warp partials.
 */
__global__ void reduce_by_shfl(int *in, int *out) {
	const unsigned int lane = threadIdx.x % warpSize;
	const unsigned int warp = threadIdx.x / warpSize;
	const unsigned int gid = blockIdx.x * blockDim.x + threadIdx.x;
	__shared__ int partial[THREAD_SIZE / 32];

	// Stage 1: intra-warp reduction.
	int v = warpReduce(in[gid]);
	if (lane == 0) {
		partial[warp] = v;
	}
	__syncthreads();

	// Stage 2: warp 0 reduces the per-warp partial sums; lanes beyond
	// the partial count contribute the additive identity 0.
	v = (threadIdx.x < (THREAD_SIZE / 32)) ? partial[threadIdx.x] : 0;
	if (warp == 0) v = warpReduce(v);
	if (threadIdx.x == 0) out[blockIdx.x] = v;
}

/**
 * Per-block sum reduction using a classic shared-memory tree.
 *
 * Expects a 1-D launch with THREAD_SIZE threads per block; `in` must hold
 * at least gridDim.x * blockDim.x ints. Writes one partial sum per block
 * to out[blockIdx.x].
 */
__global__ void reduce_by_smem(int *in, int *out) {
	const unsigned int gid = blockIdx.x * blockDim.x + threadIdx.x;
	__shared__ int scratch[THREAD_SIZE];

	// Stage this block's slice of the input in shared memory.
	scratch[threadIdx.x] = in[gid];
	__syncthreads();

	// Tree reduction: the active half adds in the upper half, then the
	// range is halved. The barrier sits outside the divergent branch so
	// every thread in the block reaches it.
	for (int stride = THREAD_SIZE / 2; stride > 0; stride >>= 1) {
		if (threadIdx.x < stride) {
			scratch[threadIdx.x] += scratch[threadIdx.x + stride];
		}
		__syncthreads();
	}

	if (threadIdx.x == 0) out[blockIdx.x] = scratch[0];
}

/**
 * Intentionally empty kernel, launched once before the timed runs so that
 * one-time CUDA context/module initialization cost is not attributed to
 * the kernels being measured. Both parameters are unused.
 */
__global__ void warmup(int *in, int *out) {
}

/**
 * Host-side reference reduction: returns the sum of the first WORK_SIZE
 * elements of `in`. Used to validate the GPU results.
 */
int reduce_cpu(int *in) {
	int total = 0;
	for (int k = 0; k < WORK_SIZE; ++k)
		total += in[k];
	return total;
}

/**
 * Host function that prepares data array and passes it to the CUDA kernel.
 */
/**
 * Host driver: fills an input array with 0..WORK_SIZE-1, runs each
 * reduction kernel ten times, sums the per-block partials on the host,
 * and validates both results against the CPU reference.
 *
 * Fixes over the original: the deprecated cudaThreadSynchronize() is
 * replaced by cudaDeviceSynchronize(); d_out is now freed (it leaked);
 * the output label "shlf" is corrected to "shfl"; a dead device-to-host
 * copy after the warmup launch is removed.
 */
int main(void) {
	int *d = NULL, *d_out = NULL;
	int i, k;
	int idata[WORK_SIZE], odata[BLOCK_SIZE];

	for (i = 0; i < WORK_SIZE; i++)
		idata[i] = i;

	CUDA_CHECK_RETURN(cudaMalloc((void **) &d, sizeof(int) * WORK_SIZE));
	CUDA_CHECK_RETURN(cudaMalloc((void **) &d_out, sizeof(int) * BLOCK_SIZE));
	CUDA_CHECK_RETURN(
			cudaMemcpy(d, idata, sizeof(int) * WORK_SIZE, cudaMemcpyHostToDevice));

	// Warm up: absorb one-time context/initialization overhead so it does
	// not distort the timed kernel runs below.
	warmup<<<BLOCK_SIZE, THREAD_SIZE>>>(d, d_out);
	CUDA_CHECK_RETURN(cudaDeviceSynchronize());	// Wait for the GPU launched work to complete
	CUDA_CHECK_RETURN(cudaGetLastError());

	// --- shared-memory reduction ---
	for (k = 0; k < 10; ++k) {
		reduce_by_smem<<<BLOCK_SIZE, THREAD_SIZE>>>(d, d_out);

		CUDA_CHECK_RETURN(cudaDeviceSynchronize());	// Wait for the GPU launched work to complete
		CUDA_CHECK_RETURN(cudaGetLastError());
	}
	CUDA_CHECK_RETURN(cudaMemcpy(odata, d_out, sizeof(int) * BLOCK_SIZE, cudaMemcpyDeviceToHost));

	// Final host-side pass over the per-block partial sums.
	int sum = 0;
	for (i = 0; i < BLOCK_SIZE; i++) {
		sum += odata[i];
	}
	printf("smem sum: %d\n", sum);

	if (reduce_cpu(idata) == sum) {
		printf("matched\n");
	} else {
		printf("unmatched\n");
	}

	// --- warp-shuffle reduction ---
	for (k = 0; k < 10; ++k) {
		reduce_by_shfl<<<BLOCK_SIZE, THREAD_SIZE>>>(d, d_out);

		CUDA_CHECK_RETURN(cudaDeviceSynchronize());	// Wait for the GPU launched work to complete
		CUDA_CHECK_RETURN(cudaGetLastError());
	}
	CUDA_CHECK_RETURN(cudaMemcpy(odata, d_out, sizeof(int) * BLOCK_SIZE, cudaMemcpyDeviceToHost));

	sum = 0;
	for (i = 0; i < BLOCK_SIZE; i++) {
		sum += odata[i];
	}
	printf("shfl sum: %d\n", sum);	// label fixed: was misspelled "shlf"

	if (reduce_cpu(idata) == sum) {
		printf("matched\n");
	} else {
		printf("unmatched\n");
	}

	// Release both device buffers (d_out was previously leaked).
	CUDA_CHECK_RETURN(cudaFree((void *) d));
	CUDA_CHECK_RETURN(cudaFree((void *) d_out));
	CUDA_CHECK_RETURN(cudaDeviceReset());

	return 0;
}

で、nvprof をかませてみると

Time(%) Time Calls Avg Min Max Name
53.30% 103.87us 10 10.387us 10.304us 10.880us reduce_by_shfl(int*, int*)
41.36% 80.607us 10 8.0600us 7.9680us 8.2880us reduce_by_smem(int*, int*)

warp shuffle を使った方が遅いじゃん!!
どこがいけなかったのだろうか...