performance - una - ¿Cómo normalizar columnas de matriz en CUDA con un rendimiento máximo?

orden de una matriz (3)

¿Cómo normalizar efectivamente las columnas de matriz en CUDA?

Mi matriz se almacena en la columna principal y el tamaño típico es 2000x200.

La operación se puede representar en el siguiente código de matlab.

A = rand(2000,200); A = exp(A); A = A./repmat(sum(A,1), [size(A,1) 1]);

¿Puede esto hacerse de manera efectiva por Thrust, cuBLAS y / o cuNPP?

Una implementación rápida que incluye 4 kernels se muestra de la siguiente manera.

Preguntándose si esto se puede hacer en 1 o 2 kernels para mejorar el rendimiento, especialmente para el paso de suma de columnas implementado por cublasDgemv ().

#include <cuda.h> #include <curand.h> #include <cublas_v2.h> #include <thrust/device_vector.h> #include <thrust/device_ptr.h> #include <thrust/transform.h> #include <thrust/iterator/constant_iterator.h> #include <math.h> struct Exp { __host__ __device__ void operator()(double& x) { x = exp(x); } }; struct Inv { __host__ __device__ void operator()(double& x) { x = (double) 1.0 / x; } }; int main() { cudaDeviceSetCacheConfig(cudaFuncCachePreferShared); cublasHandle_t hd; curandGenerator_t rng; cublasCreate(&hd); curandCreateGenerator(&rng, CURAND_RNG_PSEUDO_DEFAULT); const size_t m = 2000, n = 200; const double c1 = 1.0; const double c0 = 0.0; thrust::device_vector<double> A(m * n); thrust::device_vector<double> sum(1 * n); thrust::device_vector<double> one(m * n, 1.0); double* pA = thrust::raw_pointer_cast(&A[0]); double* pSum = thrust::raw_pointer_cast(&sum[0]); double* pOne = thrust::raw_pointer_cast(&one[0]); for (int i = 0; i < 100; i++) { curandGenerateUniformDouble(rng, pA, A.size()); thrust::for_each(A.begin(), A.end(), Exp()); cublasDgemv(hd, CUBLAS_OP_T, m, n, &c1, pA, m, pOne, 1, &c0, pSum, 1); thrust::for_each(sum.begin(), sum.end(), Inv()); cublasDdgmm(hd, CUBLAS_SIDE_RIGHT, m, n, pA, m, pSum, 1, pA, m); } curandDestroyGenerator(rng); cublasDestroy(hd); return 0; }

Debería poder fusionar la primera operación para for_each actividad con la llamada cublasSgemv en una sola llamada a la función reduce_by_key . Si define / redefine functors como:

struct Accessor : public thrust::unary_function<int,int> { int lda; __host__ __device__ Accessor(int _lda) : lda(_lda) {}; __host__ __device__ int operator()(const int& idx) { return idx/lda; } }; struct Exp : public thrust::unary_function<double,double> { __host__ __device__ double operator()(const double& x) { return exp(x); } }; struct Inv : public thrust::unary_function<double,double> { __host__ __device__ double operator()(const double& x) { return double(1.0) / x; } };

A continuación, puede calcular la salida normalizada como

Accessor columns(m); thrust::reduce_by_key( thrust::make_transform_iterator(thrust::make_counting_iterator(int(0)), columns), thrust::make_transform_iterator(thrust::make_counting_iterator(int(m*n)), columns), thrust::make_transform_iterator(A.begin(), Exp()), thrust::make_discard_iterator(), sum.begin()); thrust::for_each(sum.begin(), sum.end(), Inv()); cublasDdgmm(hd, CUBLAS_SIDE_RIGHT, m, n, pA, m, pSum, 1, pA, m);

[descargo de responsabilidad: todo el código escrito en el navegador y no probado, usar bajo su propio riesgo]

Además de reducir el número de llamadas al kernel, el uso de iteradores sofisticados elimina la necesidad de una gran matriz de unidades que reduzca la huella de memoria y el número total de transacciones de memoria para realizar las operaciones de suma y exponenciación.

Puede usar ArrayFire de la siguiente manera

array A = randu(2000, 2000); A = exp(A); A /= tile(sum(A, 0), A.dims(0), 1);

Podrías hacer esto con empuje también. Pero si vas a trabajar con matrices (en lugar de vectores simples), tendrías que hacerlo en un bucle for que no sería tan eficiente.

DESCARGO DE RESPONSABILIDAD Soy desarrollador en Accelereyes, trabajando en arrayfire.

EDITAR Estoy trabajando en la generación de nuevos puntos de referencia según lo solicitado.

EDITAR Encontramos errores de rendimiento para exp en nuestro código debido a este punto de referencia. Lo estamos revisando y corrigiendo.

Comparé el rendimiento de 3 enfoques en M2090 con CUDA 5.0.

[173.179 us] implementación de cublas como se muestra en la pregunta
[733.734 us] implementación Thrust pura con thrust::reduce_by_key de @talonmies
[1.508 ms] implementación Thrust pura con thrust::inclusive_scan_by_key

Puede observarse que,

cublas tiene el más alto rendimiento en este caso;
both thrust::reduce_by_key & thrust::inclusive_scan_by_key thrust::reduce_by_key varios kernels, lo que thrust::reduce_by_key una sobrecarga adicional;
thrust::inclusive_scan_by_key escribe muchos más datos en DRAM en comparación con thrust::reduce_by_key , lo que puede ser una de las razones del mayor tiempo del kernel;
la principal diferencia de rendimiento entre los cublos y el enfoque de empuje es la suma de la columna de la matriz. el empuje es más lento posiblemente debido a que thrust::reduce_by_key está diseñado para hacer reducción en segmentos con longitud de variante, pero cublas_gemv() solo puede aplicarse a segmentos de longitud fija (fila / col).

Cuando la matriz A es lo suficientemente grande como para ignorar el inicio del kernel, la aproximación de los cublas sigue siendo la mejor. El resultado del perfil en A_ {20,000 x 2,000} se muestra de la siguiente manera.

La fusión de la primera operación para for_each operación con la llamada cublasSgemv indicada por @talonmies puede mejorar aún más el rendimiento, pero creo que se debe usar kernel escrito a mano en lugar de thrust::reduce_by_key .

El código para los 3 enfoques se muestra de la siguiente manera.

#include <cuda.h> #include <curand.h> #include <cublas_v2.h> #include <thrust/device_vector.h> #include <thrust/device_ptr.h> #include <thrust/transform.h> #include <thrust/reduce.h> #include <thrust/scan.h> #include <thrust/iterator/counting_iterator.h> #include <thrust/iterator/transform_iterator.h> #include <thrust/iterator/discard_iterator.h> #include <thrust/iterator/permutation_iterator.h> #include <math.h> struct Exp: public thrust::unary_function<double, double> { __host__ __device__ double operator()(double x) { return exp(x); } }; struct Inv: public thrust::unary_function<double, double> { __host__ __device__ double operator()(double x) { return (double) 1.0 / x; } }; template<typename T> struct MulC: public thrust::unary_function<T, T> { T C; __host__ __device__ MulC(T c) : C(c) { } __host__ __device__ T operator()(T x) { return x * C; } }; template<typename T> struct line2col: public thrust::unary_function<T, T> { T C; __host__ __device__ line2col(T C) : C(C) { } __host__ __device__ T operator()(T i) { return i / C; } }; int main() { cudaDeviceSetCacheConfig(cudaFuncCachePreferShared); cublasHandle_t hd; curandGenerator_t rng; cublasCreate(&hd); curandCreateGenerator(&rng, CURAND_RNG_PSEUDO_DEFAULT); const size_t m = 2000, n = 200; const double c1 = 1.0; const double c0 = 0.0; thrust::device_vector<double> A(m * n); thrust::device_vector<double> B(m * n); thrust::device_vector<double> C(m * n); thrust::device_vector<double> sum1(1 * n); thrust::device_vector<double> sum2(1 * n); thrust::device_vector<double> one(m * n, 1); double* pA = thrust::raw_pointer_cast(&A[0]); double* pB = thrust::raw_pointer_cast(&B[0]); double* pSum1 = thrust::raw_pointer_cast(&sum1[0]); double* pSum2 = thrust::raw_pointer_cast(&sum2[0]); double* pOne = thrust::raw_pointer_cast(&one[0]); curandGenerateUniformDouble(rng, pA, A.size()); const int count = 2; for (int i = 0; i < count; i++) { thrust::transform(A.begin(), A.end(), B.begin(), Exp()); cublasDgemv(hd, CUBLAS_OP_T, m, n, &c1, pB, m, pOne, 1, &c0, pSum1, 1); thrust::transform(sum1.begin(), sum1.end(), sum1.begin(), Inv()); cublasDdgmm(hd, CUBLAS_SIDE_RIGHT, m, n, pB, m, pSum2, 1, pB, m); } for (int i = 0; i < count; i++) { thrust::reduce_by_key( thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(m)), thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(m)) + A.size(), thrust::make_transform_iterator(A.begin(), Exp()), thrust::make_discard_iterator(), sum2.begin()); thrust::transform( A.begin(), A.end(), thrust::make_permutation_iterator( sum2.begin(), thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(m))), C.begin(), thrust::divides<double>()); } for (int i = 0; i < count; i++) { thrust::inclusive_scan_by_key( thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(m)), thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(m)) + A.size(), thrust::make_transform_iterator(A.begin(), Exp()), C.begin()); thrust::copy( thrust::make_permutation_iterator( C.begin() + m - 1, thrust::make_transform_iterator(thrust::make_counting_iterator(0), MulC<int>(m))), thrust::make_permutation_iterator( C.begin() + m - 1, thrust::make_transform_iterator(thrust::make_counting_iterator(0), MulC<int>(m))) + n, sum2.begin()); thrust::transform( A.begin(), A.end(), thrust::make_permutation_iterator( sum2.begin(), thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(m))), C.begin(), thrust::divides<double>()); } curandDestroyGenerator(rng); cublasDestroy(hd); return 0; }