本文首发于个人博客https://kezunlin.me/post/ad5c5bd9/,欢迎阅读最新内容!
cublasSgemm for large matrix multiplication on gpu in C++
<!--more-->
Guide
- Part 1: cpp cuda programming tutorial
- Part 2: cuda activation kernels
- Part 3: cublasSgemm for large matrix multiplication on gpu
code
demo.cu
#include <cuda_runtime.h>
#include <cublas.h>
#include <cublas_api.h>
#include <cublas_v2.h>
/**
 * Computes every pairwise dot product between two sets of feature vectors
 * on the GPU with a single cublasSgemm call.
 *
 * @param featureM  host buffer holding count_m vectors of `size` floats each,
 *                  stored back to back (row-major, count_m x size).
 * @param featureN  host buffer holding count_n vectors of `size` floats each,
 *                  stored back to back (row-major, count_n x size).
 * @param result    host buffer with room for count_m * count_n floats. On
 *                  success result[i * count_n + j] is the dot product of
 *                  vector i of featureM and vector j of featureN.
 * @param count_m   number of vectors in featureM (>= 0).
 * @param count_n   number of vectors in featureN (>= 0).
 * @param size      number of floats per vector (>= 0).
 * @param gpu_id    CUDA device ordinal passed to cudaSetDevice.
 * @return true when the result was produced, false on invalid arguments or
 *         on any CUDA / cuBLAS failure (a diagnostic is printed to stdout).
 *
 * Device buffers and the cuBLAS handle are created and released on every
 * call.
 */
bool CompareFeatureMtoN_gpu(float * featureM, float * featureN, float * result,
                            int count_m, int count_n, int size, int gpu_id) {
    // ---- argument validation (all early returns happen before any GPU work) ----
    if (count_m < 0 || count_n < 0 || size < 0) {
        printf("%s, line %d, invalid dimensions (count_m=%d, count_n=%d, size=%d)\n",
               __func__, __LINE__, count_m, count_n, size);
        return false;
    }
    if (count_m == 0 || count_n == 0) {
        // The result matrix has no elements: nothing to compute.
        return true;
    }
    if (result == nullptr) {
        printf("%s, line %d, result buffer is null\n", __func__, __LINE__);
        return false;
    }
    if (size == 0) {
        // Dot products over empty vectors are all zero; cuBLAS would reject
        // the zero leading dimension, so fill the result on the host.
        const size_t total = static_cast<size_t>(count_m) * static_cast<size_t>(count_n);
        for (size_t i = 0; i < total; ++i) {
            result[i] = 0.0f;
        }
        return true;
    }
    if (featureM == nullptr || featureN == nullptr) {
        printf("%s, line %d, input feature buffer is null\n", __func__, __LINE__);
        return false;
    }

    // Byte counts are computed in size_t: count * size overflows int for
    // large matrices.
    const size_t bytes_m = static_cast<size_t>(count_m) * static_cast<size_t>(size) * sizeof(float);
    const size_t bytes_n = static_cast<size_t>(count_n) * static_cast<size_t>(size) * sizeof(float);
    const size_t bytes_result =
        static_cast<size_t>(count_m) * static_cast<size_t>(count_n) * sizeof(float);

    // Every variable is initialised before the first `goto out` so the
    // cleanup code never observes an indeterminate value.
    float *dev_featureM = nullptr;
    float *dev_featureN = nullptr;
    float *dev_result = nullptr;
    const float alpha = 1.0f, beta = 0.0f;
    cublasHandle_t handle = nullptr;
    bool handle_created = false;
    bool ok = false;
    cudaError_t cudaStatus = cudaSuccess;
    cublasStatus_t blasStatus = CUBLAS_STATUS_SUCCESS;

    cudaStatus = cudaSetDevice(gpu_id);
    if (cudaStatus != cudaSuccess) {
        printf("cudaSetDevice failed! Do you have a CUDA-capable GPU installed?\n");
        goto out;
    }

    blasStatus = cublasCreate(&handle);
    if (blasStatus != CUBLAS_STATUS_SUCCESS) {
        printf("%s, line %d, cublasCreate failed! (status %d)\n",
               __func__, __LINE__, static_cast<int>(blasStatus));
        goto out;
    }
    handle_created = true;

    cudaStatus = cudaMalloc((void**)&dev_featureM, bytes_m);
    if (cudaStatus != cudaSuccess) {
        printf("%s, line %d, cudaMalloc failed! (%s)\n",
               __func__, __LINE__, cudaGetErrorString(cudaStatus));
        goto out;
    }
    cudaStatus = cudaMalloc((void**)&dev_featureN, bytes_n);
    if (cudaStatus != cudaSuccess) {
        printf("%s, line %d, cudaMalloc failed! (%s)\n",
               __func__, __LINE__, cudaGetErrorString(cudaStatus));
        goto out;
    }
    cudaStatus = cudaMalloc((void**)&dev_result, bytes_result);
    if (cudaStatus != cudaSuccess) {
        printf("%s, line %d, cudaMalloc failed! (%s)\n",
               __func__, __LINE__, cudaGetErrorString(cudaStatus));
        goto out;
    }

    cudaStatus = cudaMemcpy(dev_featureM, featureM, bytes_m, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        printf("%s, line %d, cudaMemcpy failed! (%s)\n",
               __func__, __LINE__, cudaGetErrorString(cudaStatus));
        goto out;
    }
    cudaStatus = cudaMemcpy(dev_featureN, featureN, bytes_n, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        printf("%s, line %d, cudaMemcpy failed! (%s)\n",
               __func__, __LINE__, cudaGetErrorString(cudaStatus));
        goto out;
    }

    /*
     * cuBLAS treats every matrix as column-major:
     *   C(m,n) = alpha * op(A)(m,k) * op(B)(k,n) + beta * C(m,n)
     *   cublasSgemm(handle, opA, opB, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
     *
     * A row-major (rows x cols) buffer read as column-major is its transpose
     * (cols x rows) with leading dimension `cols`. Therefore:
     *   - dev_featureN is seen as (size x count_n); CUBLAS_OP_T turns it into
     *     (count_n x size).
     *   - dev_featureM is seen as (size x count_m) and is used as is.
     *   - C is column-major (count_n x count_m) with ldc = count_n, which is
     *     exactly the row-major (count_m x count_n) layout the caller reads.
     */
    blasStatus = cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, count_n, count_m, size,
                             &alpha, dev_featureN, size, dev_featureM, size,
                             &beta, dev_result, count_n);
    if (blasStatus != CUBLAS_STATUS_SUCCESS) {
        printf("%s, line %d, cublasSgemm failed! (status %d)\n",
               __func__, __LINE__, static_cast<int>(blasStatus));
        goto out;
    }

    // cudaThreadSynchronize is deprecated; cudaDeviceSynchronize is its
    // replacement. Checking the status here attributes asynchronous GEMM
    // failures to the GEMM instead of to the copy below.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        printf("%s, line %d, cudaDeviceSynchronize failed! (%s)\n",
               __func__, __LINE__, cudaGetErrorString(cudaStatus));
        goto out;
    }

    cudaStatus = cudaMemcpy(result, dev_result, bytes_result, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        printf("%s, line %d, cudaMemcpy failed! (%s)\n",
               __func__, __LINE__, cudaGetErrorString(cudaStatus));
        goto out;
    }

    ok = true;

out:
    if (dev_featureM) cudaFree(dev_featureM);
    if (dev_featureN) cudaFree(dev_featureN);
    if (dev_result) cudaFree(dev_result);
    // Only destroy a handle that cublasCreate actually produced.
    if (handle_created) cublasDestroy(handle);
    return ok;
}
<script async src="https://pagead2.googlesyndication.com/pagead/js/adsbygoogle.js"></script>
<!-- kzl in-article ad -->
<ins class="adsbygoogle"
style="display:block; text-align:center;"
data-ad-layout="in-article"
data-ad-format="fluid"
data-ad-client="ca-pub-5653382914441020"
data-ad-slot="7925631830"></ins>
<script>
(adsbygoogle = window.adsbygoogle || []).push({});
</script>
usage
demo.cu
/**
 * Small end-to-end demo of CompareFeatureMtoN_gpu.
 *
 * Builds three identical 10-element vectors (A) and two 10-element selector
 * vectors (B), computes all A[i] . B[j] dot products on GPU 0 and prints the
 * m * n results on one line.
 */
void test_feature_compare()
{
    /*
     *   A (3 x 10)         B (2 x 10)
     *   [a1]               [b1]
     *   [a2]   dot each    [b2]
     *   [a3]
     *
     *   expected result (3 x 2, row-major):
     *   [10, 35]
     *   [10, 35]
     *   [10, 35]
     */
    const std::vector<float> f1{0,1,2,3,4,5,6,7,8,9};
    const std::vector<float> f2{1,1,1,1,1,0,0,0,0,0}, f22{0,0,0,0,0,1,1,1,1,1};

    // 3 * 10
    const std::vector<std::vector<float>> A{f1, f1, f1};
    // 2 * 10
    const std::vector<std::vector<float>> B{f2, f22};

    int m = 3;
    int n = 2;
    int dim = 10;
    int gpu_id = 0;

    // Flatten the nested vectors into contiguous row-major buffers; vectors
    // own the memory, so every exit path releases it.
    std::vector<float> feature_m;
    feature_m.reserve(static_cast<size_t>(m) * dim);
    for (const auto& row : A) {
        feature_m.insert(feature_m.end(), row.begin(), row.end());
    }

    std::vector<float> feature_n;
    feature_n.reserve(static_cast<size_t>(n) * dim);
    for (const auto& row : B) {
        feature_n.insert(feature_n.end(), row.begin(), row.end());
    }

    printf("m = %d, n= %d, size= %d \n", m, n, dim); // 3, 2, 10

    std::vector<float> result(static_cast<size_t>(m) * n, 0.0f);
    if (!CompareFeatureMtoN_gpu(feature_m.data(), feature_n.data(), result.data(),
                                m, n, dim, gpu_id)) {
        // Do not print the result buffer when the GPU computation failed.
        printf("CompareFeatureMtoN_gpu failed\n");
        return;
    }

    // Print all m * n entries rather than a hard-coded count.
    for (float value : result)
        printf("%f ", value);
}
output
m = 3, n= 2, size= 10
10.000000 35.000000 10.000000 35.000000 10.000000 35.000000
Reference
History
- 20191015: created.
Copyright
- Post author: kezunlin
- Post link: https://kezunlin.me/post/ad5c5bd9/
- Copyright Notice: All articles in this blog are licensed under CC BY-NC-SA 3.0 unless stating additionally.
**粗体** _斜体_ [链接](http://example.com) `代码` - 列表 > 引用
。你还可以使用@
来通知其他用户。