#include "cuda_runtime.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <memory.h>
/*
 * Fills a 3x4 row-major int matrix so that every element holds its row index
 * (row 0 -> 0, row 1 -> 1, row 2 -> 2).
 *
 * Launch layout: threadIdx.y selects the column; the selected thread then
 * walks all three rows of that column. threadIdx.x and threadIdx.z are not
 * used for addressing, so only the (x == 0, z == 0) thread of each column
 * stores -- the remaining threads would merely repeat the identical writes.
 * Threads whose threadIdx.y lies outside the 4 columns do nothing.
 *
 * d_int: device buffer with room for at least 3*4 ints.
 */
__global__ static void kernel(int *d_int){
    const int kRows = 3;
    const int kCols = 4;
    const int col = threadIdx.y;

    /* Drop duplicate writers. */
    if(threadIdx.x != 0 || threadIdx.z != 0){
        return;
    }
    /* A column index >= kCols would alias the next row (conflicting values)
       and, in the last row, run past the end of the 3*4 buffer. */
    if(col >= kCols){
        return;
    }

    for(int row = 0; row < kRows; row++){
        d_int[row*kCols + col] = row;
    }
}
/* Reports a failed CUDA runtime call on stderr.
   Returns true when `status` is cudaSuccess, false otherwise. */
static bool cudaSucceeded(cudaError_t status, const char *what){
    if(status != cudaSuccess){
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        return false;
    }
    return true;
}

/*
 * Allocates a 3x4 int matrix on host and device, zeroes both, launches
 * `kernel` on the device copy, copies the result back and prints it row by
 * row. Waits for a key press before exiting.
 *
 * Returns 0 on success, 1 if an allocation, the launch or a copy failed.
 */
int main(){
    const size_t bytes = sizeof(int)*3*4;   /* size of the 3x4 matrix */
    int gridsize = 1;
    dim3 blocksize(3,4);
    int *h_int = NULL;
    int *d_int = NULL;
    int exitCode = 1;

    h_int = (int *)malloc(bytes);
    if(h_int == NULL){
        fprintf(stderr, "malloc failed\n");
    }
    else if(cudaSucceeded(cudaMalloc((void **)&d_int, bytes), "cudaMalloc") &&
            cudaSucceeded(cudaMemset(d_int, 0, bytes), "cudaMemset")){
        memset(h_int, 0, bytes);

        kernel<<<gridsize,blocksize>>>(d_int);

        /* A launch reports configuration errors only via cudaGetLastError();
           the blocking cudaMemcpy below then surfaces any execution error. */
        if(cudaSucceeded(cudaGetLastError(), "kernel launch") &&
           cudaSucceeded(cudaMemcpy(h_int, d_int, bytes, cudaMemcpyDeviceToHost),
                         "cudaMemcpy")){
            for(int i = 0; i < 3; i++){
                for(int j = 0; j < 4; j++){
                    printf("%d ", h_int[i*4 + j]);
                }
                printf("\n");
            }
            printf("\n");
            exitCode = 0;
        }
    }

    /* Both release calls accept a null pointer, so no extra guards are needed. */
    cudaFree(d_int);
    free(h_int);

    getchar();   /* keep the console open until a key is pressed */
    return exitCode;
}
// Execution result: