saber - Separación de números pares e impares en CUDA

numeros pares e impares del 1 al 100 (1)

Tengo un arreglo (array) de números como {1,2,3,4,5,6,7,8,9,10} y quiero separar los números pares de los impares, de esta forma:

even = {2,4,6,8,10}

odd = {1,3,5,7,9}

Conozco las operaciones atómicas de CUDA y también sé que la salida no debe verse afectada por condiciones de carrera. No quiero usar operaciones atómicas. ¿Cómo puedo lograr esto sin recurrir a las funciones atómicas?

CÓDIGO:

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

// Abort with a readable message when a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Kernel that executes on the CUDA device.
 *
 * Splits total[0..N) into its even and odd values without atomic operations.
 * The thread handling element idx counts how many earlier elements have the
 * same parity; that count is the element's slot in the matching output array.
 * Every thread therefore writes to a distinct slot (no race), and the input
 * order is preserved within each output array.
 *
 * Parameters:
 *   total - input values (read only); each value is truncated to int to test parity
 *   even  - output for even values; must hold at least N elements
 *   odd   - output for odd values; must hold at least N elements
 *   N     - number of elements in total
 *
 * Launch: 1D grid with at least N threads in total; no shared memory is used.
 * Each thread reads up to idx earlier elements, so the work is quadratic in N;
 * for large inputs a scan-based partition (e.g. thrust::partition_copy) is the
 * better choice.
 */
__global__ void square_array(float *total, float *even, float *odd, int N)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= N) return;  // surplus threads at the grid tail have nothing to do

    float value = total[idx];
    bool value_is_even = (((int)value) % 2) == 0;

    // Rank of this element among the elements of the same parity before it.
    int slot = 0;
    for (int i = 0; i < idx; i++) {
        bool other_is_even = (((int)total[i]) % 2) == 0;
        if (other_is_even == value_is_even) slot++;
    }

    if (value_is_even)
        even[slot] = value;
    else
        odd[slot] = value;
}

// Main routine that executes on the host.
int main(void)
{
    float *total_h, *even_h, *odd_h;   // Host arrays
    float *total_d, *even_d, *odd_d;   // Device arrays
    const int N = 10;                  // Number of elements in arrays
    size_t size = N * sizeof(float);

    // Allocate arrays on host
    total_h = (float *)malloc(size);
    even_h  = (float *)malloc(size);
    odd_h   = (float *)malloc(size);
    if (total_h == NULL || even_h == NULL || odd_h == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        return EXIT_FAILURE;
    }

    // Allocate arrays on device; the outputs start zeroed so unused slots print as 0
    CUDA_CHECK(cudaMalloc((void **)&total_d, size));
    CUDA_CHECK(cudaMalloc((void **)&even_d, size));
    CUDA_CHECK(cudaMemset(even_d, 0, size));
    CUDA_CHECK(cudaMalloc((void **)&odd_d, size));
    CUDA_CHECK(cudaMemset(odd_d, 0, size));

    // Initialize host array (1..N) and copy it to the CUDA device
    for (int i = 0; i < N; i++) total_h[i] = (float)i + 1;
    CUDA_CHECK(cudaMemcpy(total_d, total_h, size, cudaMemcpyHostToDevice));

    // Do calculation on device; ceil-division so every element gets a thread
    const int threads = 256;
    const int blocks = (N + threads - 1) / threads;
    square_array<<<blocks, threads>>>(total_d, even_d, odd_d, N);
    CUDA_CHECK(cudaGetLastError());  // launch errors are not reported by the launch itself

    // Retrieve results from device (blocking copies also wait for the kernel)
    CUDA_CHECK(cudaMemcpy(even_h, even_d, size, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(odd_h, odd_d, size, cudaMemcpyDeviceToHost));

    // Print results
    printf("total Numbers\n");
    for (int i = 0; i < N; i++) printf("%f\n", total_h[i]);
    printf("EVEN Numbers\n");
    for (int i = 0; i < N; i++) printf("%f\n", even_h[i]);
    printf("ODD Numbers\n");
    for (int i = 0; i < N; i++) printf("%f\n", odd_h[i]);

    // Cleanup
    free(total_h);
    free(even_h);
    free(odd_h);
    CUDA_CHECK(cudaFree(total_d));
    CUDA_CHECK(cudaFree(even_d));
    CUDA_CHECK(cudaFree(odd_d));
    return 0;
}

SALIDA:

Como sugirió Jared Hoberock, es mucho más fácil usar el eficiente algoritmo de partición disponible en CUDA Thrust que desarrollar una rutina de partición propia. A continuación se muestra un ejemplo completo y funcional.

#include <cstdio>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/partition.h>
#include <thrust/pair.h>
#include <thrust/execution_policy.h>

// Predicate: true when x is even. Callable from both host and device code.
struct is_even
{
    __host__ __device__ bool operator()(const int &x) const
    {
        return (x % 2) == 0;
    }
};

// Fills a vector with 0..N-1, splits it on the device into even and odd
// values with thrust::partition_copy, and prints both groups.
int main()
{
    const int N = 10;

    thrust::host_vector<int> h_data(N);
    for (int i = 0; i < N; i++) h_data[i] = i;

    thrust::device_vector<int> d_data(h_data);

    // Either output may receive up to N values, so size both for the worst
    // case and trim afterwards (N/2 would be too small whenever N is odd).
    thrust::device_vector<int> d_evens(N);
    thrust::device_vector<int> d_odds(N);

    // partition_copy returns the end of each output range, i.e. how many
    // values landed in each vector.
    typedef thrust::device_vector<int>::iterator DeviceIter;
    thrust::pair<DeviceIter, DeviceIter> ends =
        thrust::partition_copy(d_data.begin(), d_data.end(),
                               d_evens.begin(), d_odds.begin(),
                               is_even());
    d_evens.resize(ends.first - d_evens.begin());
    d_odds.resize(ends.second - d_odds.begin());

    // Copy each result back in one transfer instead of one transfer per element.
    thrust::host_vector<int> h_evens(d_evens);
    thrust::host_vector<int> h_odds(d_odds);

    printf("Even numbers\n");
    for (int i = 0; i < (int)h_evens.size(); i++) {
        int val = h_evens[i];
        printf("evens[%i] = %i\n", i, val);
    }

    printf("Odd numbers\n");
    for (int i = 0; i < (int)h_odds.size(); i++) {
        int val = h_odds[i];
        printf("odds[%i] = %i\n", i, val);
    }

    return 0;
}