Elemento por elemento multiplicación vectorial con CUDA

complex-numbers cublas (2)

Si lo que está tratando de lograr es un producto simple basado en elementos con números complejos, parece que está haciendo algunos pasos adicionales en su kernel multiplyElementwise que aumentan el uso del registro. Lo que intentas calcular es:

f0[i].x = a*c - b*d; f0[i].y = a*d + b*c;

desde (a + ib)*(c + id) = (a*c - b*d) + i(a*d + b*c) . Al usar su multiplicación compleja mejorada, está intercambiando 1 multiplicación por 3 adiciones y algunos registros adicionales. Si esto puede justificarse o no, puede depender del hardware que esté utilizando. Por ejemplo, si su hardware es compatible con FMA (Fused Multiply-Add), ese tipo de optimización puede no ser eficiente. Debe considerar leer este documento: " Precisión y rendimiento: punto flotante y cumplimiento IEEE 754 para GPU NVIDIA ", que también aborda el tema de la precisión de coma flotante.

Aún así, deberías considerar usar Thrust . Esta biblioteca ofrece muchas herramientas de alto nivel para operar tanto en el host como en los vectores del dispositivo. Puede ver una larga lista de ejemplos aquí: https://github.com/thrust/thrust/tree/master/examples . Esto haría tu vida mucho más fácil.

CÓDIGO ACTUALIZADO

En su caso, podría usar este ejemplo y adaptarlo a algo como esto:

#include <thrust/host_vector.h> #include <thrust/device_vector.h> #include <time.h> struct ElementWiseProductBasic : public thrust::binary_function<float2,float2,float2> { __host__ __device__ float2 operator()(const float2& v1, const float2& v2) const { float2 res; res.x = v1.x * v2.x - v1.y * v2.y; res.y = v1.x * v2.y + v1.y * v2.x; return res; } }; /** * See: http://www.embedded.com/design/embedded/4007256/Digital-Signal-Processing-Tricks--Fast-multiplication-of-complex-numbers%5D */ struct ElementWiseProductModified : public thrust::binary_function<float2,float2,float2> { __host__ __device__ float2 operator()(const float2& v1, const float2& v2) const { float2 res; float a, b, c, d, k; a = v1.x; b = v1.y; c = v2.x; d = v2.y; k = a * (c + d); d = d * (a + b); c = c * (b - a); res.x = k -d; res.y = k + c; return res; } }; int get_random_int(int min, int max) { return min + (rand() % (int)(max - min + 1)); } thrust::host_vector<float2> init_vector(const size_t N) { thrust::host_vector<float2> temp(N); for(size_t i = 0; i < N; i++) { temp[i].x = get_random_int(0, 10); temp[i].y = get_random_int(0, 10); } return temp; } int main(void) { const size_t N = 100000; const bool compute_basic_product = true; const bool compute_modified_product = true; srand(time(NULL)); thrust::host_vector<float2> h_A = init_vector(N); thrust::host_vector<float2> h_B = init_vector(N); thrust::device_vector<float2> d_A = h_A; thrust::device_vector<float2> d_B = h_B; thrust::host_vector<float2> h_result(N); thrust::host_vector<float2> h_result_modified(N); if (compute_basic_product) { thrust::device_vector<float2> d_result(N); thrust::transform(d_A.begin(), d_A.end(), d_B.begin(), d_result.begin(), ElementWiseProductBasic()); h_result = d_result; } if (compute_modified_product) { thrust::device_vector<float2> d_result_modified(N); thrust::transform(d_A.begin(), d_A.end(), d_B.begin(), d_result_modified.begin(), ElementWiseProductModified()); h_result_modified = d_result_modified; } std::cout << std::fixed; for (size_t i = 0; i < 4; i++) { float2 a = h_A[i]; float2 b = h_B[i]; std::cout << "(" << a.x << "," << a.y << ")"; std::cout << " * "; std::cout << "(" << b.x << "," << b.y << ")"; if (compute_basic_product) { float2 prod = h_result[i]; std::cout << " = "; std::cout << "(" << prod.x << "," << prod.y << ")"; } if (compute_modified_product) { float2 prod_modified = h_result_modified[i]; std::cout << " = "; std::cout << "(" << prod_modified.x << "," << prod_modified.y << ")"; } std::cout << std::endl; } return 0; }

Esto devuelve:

(6.000000,5.000000) * (0.000000,1.000000) = (-5.000000,6.000000) (3.000000,2.000000) * (0.000000,4.000000) = (-8.000000,12.000000) (2.000000,10.000000) * (10.000000,4.000000) = (-20.000000,108.000000) (4.000000,8.000000) * (10.000000,9.000000) = (-32.000000,116.000000)

A continuación, puede comparar los tiempos de las dos estrategias de multiplicación diferentes y elegir lo mejor con su hardware.

He construido un kernel rudimentario en CUDA para hacer una multiplicación vectorial vectorial de dos vectores complejos. El código del kernel se inserta debajo ( multiplyElementwise ). Funciona bien, pero dado que noté que otras operaciones aparentemente sencillas (como escalar un vector) están optimizadas en bibliotecas como CUBLAS o CULA, me preguntaba si es posible reemplazar mi código por una llamada a la biblioteca. Para mi sorpresa, ni CUBLAS ni CULA tienen esta opción, traté de fingir haciendo que uno de los vectores fuera la diagonal de un producto de vector matricial en diagonal, pero el resultado fue muy lento.

Como último recurso, traté de optimizar este código yo mismo (consulte multiplyElementwiseFast continuación) cargando los dos vectores en la memoria compartida y luego trabajando desde allí, pero eso fue más lento que mi código original.

Entonces mis preguntas:

¿Hay una biblioteca que haga multiplicaciones vectorial-vectoriales por elementos?
Si no, ¿puedo acelerar mi código ( multiplyElementwise )?

¡Cualquier ayuda sería muy apreciada!

__global__ void multiplyElementwise(cufftComplex* f0, cufftComplex* f1, int size) { const int i = blockIdx.x*blockDim.x + threadIdx.x; if (i < size) { float a, b, c, d; a = f0[i].x; b = f0[i].y; c = f1[i].x; d = f1[i].y; float k; k = a * (c + d); d = d * (a + b); c = c * (b - a); f0[i].x = k - d; f0[i].y = k + c; } } __global__ void multiplyElementwiseFast(cufftComplex* f0, cufftComplex* f1, int size) { const int i = blockIdx.x*blockDim.x + threadIdx.x; if (i < 4*size) { const int N = 256; const int thId = threadIdx.x / 4; const int rem4 = threadIdx.x % 4; const int i4 = i / 4; __shared__ float a[N]; __shared__ float b[N]; __shared__ float c[N]; __shared__ float d[N]; __shared__ float Re[N]; __shared__ float Im[N]; if (rem4 == 0) { a[thId] = f0[i4].x; Re[thId] = 0.f; } if (rem4 == 1) { b[thId] = f0[i4].y; Im[thId] = 0.f; } if (rem4 == 2) c[thId] = f1[i4].x; if (rem4 == 0) d[thId] = f1[i4].y; __syncthreads(); if (rem4 == 0) atomicAdd(&(Re[thId]), a[thId]*c[thId]); if (rem4 == 1) atomicAdd(&(Re[thId]), -b[thId]*d[thId]); if (rem4 == 2) atomicAdd(&(Im[thId]), b[thId]*c[thId]); if (rem4 == 3) atomicAdd(&(Im[thId]), a[thId]*d[thId]); __syncthreads(); if (rem4 == 0) f0[i4].x = Re[thId]; if (rem4 == 1) f0[i4].y = Im[thId]; } }

Puedes usar cublasZdgmm.

cublasStatus_t cublasZdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m, int n, const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx, cuDoubleComplex *C, int ldc)