C++中如何使用CUDA进行高性能大规模矩阵乘法运算

本文首发于个人博客https://kezunlin.me/post/ad5c5bd9/，欢迎阅读最新内容！

Using cublasSgemm for large matrix multiplication on the GPU in C++

Guide

code

demo.cu

#include <cuda_runtime.h>
#include <cublas.h>
#include <cublas_api.h>
#include <cublas_v2.h>

bool CompareFeatureMtoN_gpu(float * featureM, float * featureN, float * result, 
    int count_m, int count_n, int size, int gpu_id) {
    float *dev_featureM = 0;
    float *dev_featureN = 0;
    float *dev_result = 0;
    const float alpha = 1, beta = 0;
    cublasHandle_t handle;
    cudaError_t cudaStatus;

    cudaStatus = cudaSetDevice(gpu_id);
    if (cudaStatus != cudaSuccess) {
        printf("cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?\n");
        goto out;
    }
    cublasCreate(&handle);

    cudaStatus = cudaMalloc((void**)&dev_featureM, count_m * size * sizeof(float));
    if (cudaStatus != cudaSuccess) {
        printf("%s, line %d, cudaMalloc failed!\n", __func__, __LINE__);
        goto out;
    }
    cudaStatus = cudaMalloc((void**)&dev_featureN, count_n * size * sizeof(float));
    if (cudaStatus != cudaSuccess) {
        printf("%s, line %d, cudaMalloc failed!\n", __func__, __LINE__);
        goto out;
    }
    cudaStatus = cudaMalloc((void**)&dev_result, count_m * count_n * sizeof(float));
    if (cudaStatus != cudaSuccess) {
        printf("%s, line %d, cudaMalloc failed!\n", __func__, __LINE__);
        goto out;
    }

    cudaStatus = cudaMemcpy(dev_featureM, featureM, count_m * size * sizeof(float), 
        cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        printf("%s, line %d, cudaMalloc failed!\n", __func__, __LINE__);
        goto out;
    }
    cudaStatus = cudaMemcpy(dev_featureN, featureN, count_n * size * sizeof(float), 
        cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        printf("%s, line %d, cudaMalloc failed!\n", __func__, __LINE__);
        goto out;
    }

    /*
    
    CUBLAS assumes that the matrix in the device is stored in column major:

    " where α and β are scalars, and A , B and C are matrices stored in column-major 
    format with dimensions op ( A ) m × k , op ( B ) k × n and C m × n , respectively. 
     
     Also, for matrix A


     // Multiply the arrays A and B on GPU and save the result in C (coloum-major)
      // C(m,n) = A(m,k) * B(k,n)

      cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
     */

    cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, count_n, count_m, size, 
        &alpha, dev_featureN, size, dev_featureM, size, &beta, dev_result, count_n);
    cudaStatus = cudaThreadSynchronize();

    cudaStatus = cudaMemcpy(result, dev_result, count_m * count_n  * sizeof(float), 
        cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        printf("%s, line %d, cudaMemcpy failed!\n", __func__, __LINE__);
        goto out;
    }

out:
    if(dev_featureM) cudaFree(dev_featureM);
    if(dev_featureN) cudaFree(dev_featureN);
    if(dev_result) cudaFree(dev_result);
    cublasDestroy(handle);
    return cudaStatus == cudaSuccess;
}

 style="display:block; text-align:center;"
 data-ad-layout="in-article"
 data-ad-format="fluid"
 data-ad-client="ca-pub-5653382914441020"
 data-ad-slot="7925631830"></ins>

 (adsbygoogle = window.adsbygoogle || []).push({});

</script>

usage

demo.cu

/**
 * Small end-to-end example for CompareFeatureMtoN_gpu on GPU 0.
 *
 * A holds 3 feature vectors and B holds 2 feature vectors, each of length 10.
 * The call computes the dot product of every row of A with every row of B
 * (i.e. A * B^T), giving a 3 x 2 row-major result:
 *
 *     [a1]                    [10, 35]
 *     [a2]  *  [b1 b2]^T  =   [10, 35]
 *     [a3]                    [10, 35]
 *
 * The values are printed on one line; on failure an error message is
 * printed instead.
 */
void test_feature_compare()
{
    const std::vector<float> f1{0,1,2,3,4,5,6,7,8,9};
    const std::vector<float> f2{1,1,1,1,1,0,0,0,0,0}, f22{0,0,0,0,0,1,1,1,1,1};

    // A: 3 rows of 10 values; B: 2 rows of 10 values.
    const std::vector<std::vector<float>> A{f1, f1, f1};
    const std::vector<std::vector<float>> B{f2, f22};

    // Derive the dimensions from the data instead of hard-coding them.
    const int m = static_cast<int>(A.size());
    const int n = static_cast<int>(B.size());
    const int dim = static_cast<int>(f1.size());
    const int gpu_id = 0;

    // Flatten each set of vectors into one contiguous row-major buffer.
    std::vector<float> feature_m;
    feature_m.reserve(static_cast<size_t>(m) * dim);
    for (const auto& row : A)
        feature_m.insert(feature_m.end(), row.begin(), row.end());

    std::vector<float> feature_n;
    feature_n.reserve(static_cast<size_t>(n) * dim);
    for (const auto& row : B)
        feature_n.insert(feature_n.end(), row.begin(), row.end());

    printf("m = %d, n= %d, size= %d \n", m, n, dim); // 3, 2, 10

    std::vector<float> result(static_cast<size_t>(m) * n, 0.0f);
    if (!CompareFeatureMtoN_gpu(feature_m.data(), feature_n.data(), result.data(),
            m, n, dim, gpu_id)) {
        // Do not print a result buffer the GPU never filled in.
        printf("CompareFeatureMtoN_gpu failed!\n");
        return;
    }

    for (float value : result)
        printf("%f ", value);
    printf("\n");
}

output

m = 3, n= 2, size= 10 
10.000000 35.000000 10.000000 35.000000 10.000000 35.000000

Reference

History

20191015: created.

Copyright

Post author: kezunlin
Post link: https://kezunlin.me/post/ad5c5bd9/
Copyright Notice: All articles in this blog are licensed under CC BY-NC-SA 3.0 unless otherwise stated.

C++中如何使用CUDA进行高性能大规模矩阵乘法运算

Guide

code

usage

Reference

History

Copyright

kezunlin

引用和评论

C++ 中 VS 项目引入公共配置文件

Visual Studio Code (VS Code) – C/C++ 入门

AI处理器组合

想从事嵌入式软件，有推荐的吗？

程序员如何利用周末提升自己

大厂面试必考！C++ 多态底层原理 + 虚函数表，5 分钟看懂！

现在纠结于到底是学stm32好还是Arduino好？