Writing matrix multiplication C = A x B in CUDA (matrix dimensions: A: M x K, B: K x N, C: M x N). You can of course write your own kernel, but it will not match the efficiency of cuBLAS, which ships with CUDA. The one thing to watch out for when using cuBLAS is storage order: matrices on the CPU side are stored row-major, while cuBLAS assumes column-major storage, so handing a row-major buffer to cuBLAS amounts to transposing the original matrix. Since transposing twice gives back the original matrix, the GPU computation has to evaluate C^T = (A x B)^T = B^T x A^T for the final result to come out right, where ^T denotes the matrix transpose. The principle is explained very clearly and intuitively in this blog post: https://blog.csdn.net/HaoBBNuanMM/article/details/103054357. I did not understand this difference at first, kept getting wrong results, and suspected the data copies were broken, until I found that post and everything became clear. The implementation code (matrix_multiply_cublas.cu) follows:
#include <algorithm>
#include <chrono>
#include <iostream>
#include <iterator>
#include <numeric>
#include <random>
#include <vector>
#include <cuda_runtime.h>
#include <cublas_v2.h>
// For units of time such as h, min, s, ms, us, ns
using namespace std::literals::chrono_literals;
namespace {
constexpr size_t M = 400;
constexpr size_t N = 500;
constexpr size_t K = 600;
} // namespace
int main() {
// Initialize CUBLAS
cublasHandle_t handle;
cublasStatus_t status = cublasCreate(&handle);
if (status != CUBLAS_STATUS_SUCCESS) {
std::cerr << "!!!! CUBLAS initialization error\n";
return EXIT_FAILURE;
}
// Initialize the host input matrices.
std::vector<float> mat_a(M * K, 0.0f);
std::vector<float> mat_b(K * N, 0.0f);
// Fill the matrices with random numbers
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<int> dis(0, 100000);
auto rand_num = [&dis, &gen]() { return dis(gen); };
std::generate(mat_a.begin(), mat_a.end(), rand_num);
std::generate(mat_b.begin(), mat_b.end(), rand_num);
// Show the test matrix data.
if (M == 2 && N == 4 && K == 3) {
// A = 1 2 3
//     4 5 6
std::iota(mat_a.begin(), mat_a.end(), 1.0f);
// B = 1  2  3  4
//     5  6  7  8
//     9 10 11 12
std::iota(mat_b.begin(), mat_b.end(), 1.0f);
// C = AB = 38 44  50  56
//          83 98 113 128
std::cout << "A = \n";
for (size_t i = 0; i < M * K; ++i) {
std::cout << mat_a[i] << '\t';
if ((i + 1) % K == 0) {
std::cout << '\n';
}
}
std::cout << "B = \n";
for (size_t i = 0; i < K * N; ++i) {
std::cout << mat_b[i] << '\t';
if ((i + 1) % N == 0) {
std::cout << '\n';
}
}
}
// Allocate device memory for the matrices
float *device_mat_a = nullptr;
float *device_mat_b = nullptr;
float *device_mat_c = nullptr;
if (cudaMalloc(reinterpret_cast<void **>(&device_mat_a),
M * K * sizeof(float)) != cudaSuccess) {
std::cerr << "!!!! device memory allocation error (allocate A)\n";
return EXIT_FAILURE;
}
if (cudaMalloc(reinterpret_cast<void **>(&device_mat_b),
K * N * sizeof(float)) != cudaSuccess) {
std::cerr << "!!!! device memory allocation error (allocate B)\n";
cudaFree(device_mat_a);
return EXIT_FAILURE;
}
if (cudaMalloc(reinterpret_cast<void **>(&device_mat_c),
M * N * sizeof(float)) != cudaSuccess) {
std::cerr << "!!!! device memory allocation error (allocate C)\n";
cudaFree(device_mat_a);
cudaFree(device_mat_b);
return EXIT_FAILURE;
}
// Initialize the device matrices with the host matrices.
cudaMemcpy(device_mat_a, mat_a.data(), M * K * sizeof(float),
cudaMemcpyHostToDevice);
cudaMemcpy(device_mat_b, mat_b.data(), K * N * sizeof(float),
cudaMemcpyHostToDevice);
// Free up host memories.
mat_a.clear();
mat_a.shrink_to_fit();
mat_b.clear();
mat_b.shrink_to_fit();
// Performs operation using cublas
// C = alpha * transa(A)*transb(B) + beta * C
// `transa` indicates whether the matrix A is transposed or not.
// `transb` indicates whether the matrix B is transposed or not.
// A: m x k
// B: k x n
// C: m x n
// LDA, LDB, LDC are the leading dimensions of the three matrices,
// respectively.
// If C = A x B is calculated, there is alpha = 1.0, beta = 0.0,
// transa = CUBLAS_OP_N, transb = CUBLAS_OP_N
// cublasStatus_t cublasSgemm(cublasHandle_t handle,
// cublasOperation_t transa, cublasOperation_t
// transb, int m, int n, int k, const float *alpha,
// const float *A, int lda,
// const float *B, int ldb,
// const float *beta,
// float *C, int ldc);
float alpha = 1.0f;
float beta = 0.0f;
// Note that cublas is column-major! We need to swap the operand order!
// In CPU: C = A x B
// But in GPU: CT = (A x B)T = BT x AT, T means transpose
status =
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha,
device_mat_b, N, device_mat_a, K, &beta, device_mat_c, N);
if (status != CUBLAS_STATUS_SUCCESS) {
std::cerr << "!!!! cublas kernel execution error.\n";
cudaFree(device_mat_a);
cudaFree(device_mat_b);
cudaFree(device_mat_c);
return EXIT_FAILURE;
}
// Read back the result from device memory.
std::vector<float> mat_c(M * N, 0.0f);
cudaMemcpy(mat_c.data(), device_mat_c, M * N * sizeof(float),
cudaMemcpyDeviceToHost);
// Show the test results.
if (M == 2 && N == 4 && K == 3) {
std::cout << "C = AB = \n";
for (size_t i = 0; i < M * N; ++i) {
std::cout << mat_c[i] << '\t';
if ((i + 1) % N == 0) {
std::cout << '\n';
}
}
}
// Memory clean up
cudaFree(device_mat_a);
cudaFree(device_mat_b);
cudaFree(device_mat_c);
// Shutdown cublas
cublasDestroy(handle);
return 0;
}
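One caveat about the listing: the cudaMemcpy calls do not check their return values. A small error-checking helper, sketched below as an optional addition (it is not part of the program above), makes such failures visible immediately:
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
// Wrap a CUDA runtime call and abort with a readable message on failure.
#define CHECK_CUDA(call)                                              \
  do {                                                                \
    cudaError_t err_ = (call);                                        \
    if (err_ != cudaSuccess) {                                        \
      std::fprintf(stderr, "CUDA error '%s' at %s:%d\n",              \
                   cudaGetErrorString(err_), __FILE__, __LINE__);     \
      std::exit(EXIT_FAILURE);                                        \
    }                                                                 \
  } while (0)
// Hypothetical usage, replacing one of the unchecked copies above:
// CHECK_CUDA(cudaMemcpy(device_mat_a, mat_a.data(), M * K * sizeof(float),
//                       cudaMemcpyHostToDevice));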
In the code above, regarding the CUBLAS function:
cublasStatus_t cublasSgemm(cublasHandle_t handle,
cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k,
const float *alpha, const float *A, int lda, const float *B, int ldb,
const float *beta, float *C, int ldc);
the explanation is as follows. The function actually evaluates the formula:
C = alpha * transa(A) * transb(B) + beta * C
with matrix dimensions A: m x k, B: k x n, C: m x n.
To compute the plain product C = AB we therefore need alpha = 1.0, beta = 0.0, and transa = transb = CUBLAS_OP_N (i.e. neither matrix is transposed).
The parameters have the following meanings:
- handle: the cublas library handle (i.e. a pointer);
- transa, transb: whether matrices A and B are transposed (including conjugate/Hermitian transpose);
- m, n, k: the matrix dimensions;
- alpha, beta: the coefficients in the formula;
- A, B, C: the matrix data pointers, which must be flat one-dimensional pointers;
- lda, ldb, ldc: the leading dimensions of A, B, C, i.e. the number of contiguously stored elements per column in each data pointer. Because CUBLAS stores matrix elements column-major, the contiguous runs for A, B, C are m, k, m elements long respectively (think about why; the small sketch after this list makes it concrete), and these are the values to pass as lda, ldb, ldc.
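To make the storage-order reasoning concrete, here is a tiny host-only demonstration (illustrative only, not part of the program above): the same six-element buffer, read row-major as a 2 x 3 matrix and column-major with leading dimension 3 as a 3 x 2 matrix, yields the original matrix and its transpose.
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  constexpr std::size_t M = 2, K = 3;
  std::vector<float> a = {1, 2, 3, 4, 5, 6};  // row-major A, M x K

  std::cout << "Row-major view (A, " << M << " x " << K << "):\n";
  for (std::size_t i = 0; i < M; ++i) {
    for (std::size_t j = 0; j < K; ++j) {
      std::cout << a[i * K + j] << ' ';  // A(i, j) at offset i * K + j
    }
    std::cout << '\n';
  }

  std::cout << "Column-major view, ld = " << K << " (A^T, " << K << " x " << M << "):\n";
  for (std::size_t r = 0; r < K; ++r) {
    for (std::size_t c = 0; c < M; ++c) {
      std::cout << a[c * K + r] << ' ';  // element (r, c) at offset c * ld + r
    }
    std::cout << '\n';
  }
  return 0;
}
This is exactly why the cublasSgemm call above can pass device_mat_a with leading dimension K and have cuBLAS see it as A^T.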
As mentioned earlier, the call in our code actually evaluates the formula:
C^T = (A x B)^T = B^T x A^T
with matrix dimensions B^T: N x K, A^T: K x M, C^T: N x M, so the call arguments are:
cublasHandle_t handle: handle
cublasOperation_t transa: CUBLAS_OP_N
cublasOperation_t transb: CUBLAS_OP_N
int m: N
int n: M
int k: K
const float *alpha: &alpha
const float *A: device_mat_b
int lda: N
const float *B: device_mat_a
int ldb: K
const float *beta: &beta
float *C: device_mat_c
int ldc: N
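Because this column-major bookkeeping is easy to get wrong, a quick sanity check is to compare the result copied back from the device against a naive CPU product. The following is only a sketch of such a check (it is not in the program above, and it assumes mat_a and mat_b are kept around instead of being cleared before the gemm call):
#include <cmath>
#include <cstddef>
#include <vector>
// Naive O(m*n*k) reference product; all matrices are row-major on the host.
bool verify(const std::vector<float> &mat_a, const std::vector<float> &mat_b,
            const std::vector<float> &mat_c, std::size_t m, std::size_t n,
            std::size_t k) {
  for (std::size_t i = 0; i < m; ++i) {
    for (std::size_t j = 0; j < n; ++j) {
      float ref = 0.0f;
      for (std::size_t p = 0; p < k; ++p) {
        ref += mat_a[i * k + p] * mat_b[p * n + j];
      }
      // Allow a relative tolerance for floating-point rounding differences.
      if (std::fabs(mat_c[i * n + j] - ref) > 1e-3f * std::fabs(ref) + 1e-3f) {
        return false;
      }
    }
  }
  return true;
}
// Hypothetical usage after the device-to-host copy:
// if (!verify(mat_a, mat_b, mat_c, M, N, K)) { std::cerr << "mismatch\n"; }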
To generate random numbers I use the C++11 facilities:
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<int> dis(0, 100000);
auto rand_num = [&dis, &gen]() { return dis(gen); };
std::generate(mat_a.begin(), mat_a.end(), rand_num);
std::generate(mat_b.begin(), mat_b.end(), rand_num);
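Note that the distribution above yields integers that are implicitly converted to float. If real-valued entries are preferred, a std::uniform_real_distribution can be dropped in the same way (just a variation, not what the code above does):
std::uniform_real_distribution<float> dis(0.0f, 1.0f);
auto rand_num = [&dis, &gen]() { return dis(gen); };
std::generate(mat_a.begin(), mat_a.end(), rand_num);
std::generate(mat_b.begin(), mat_b.end(), rand_num);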
The test data is generated with the C++ STL algorithm std::iota:
std::iota(mat_a.begin(), mat_a.end(), 1.0f);
std::iota(mat_b.begin(), mat_b.end(), 1.0f);
For host memory I use std::vector directly rather than allocating with the C function malloc; the code that releases the memory is:
mat_a.clear();
mat_a.shrink_to_fit();
mat_b.clear();
mat_b.shrink_to_fit();
Please get comfortable with this C++ style of coding.
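Note that clear() alone does not release the allocated capacity, and shrink_to_fit() is only a non-binding request. A commonly used equivalent is to swap with an empty temporary (an alternative idiom, not what the code above does):
std::vector<float>().swap(mat_a);  // releases the old buffer immediately
std::vector<float>().swap(mat_b);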
The CMake build file CMakeLists.txt looks like this:
cmake_minimum_required(VERSION 3.0.0)
# Set the project name and its version
project(matrix_multiply VERSION 0.1.0)
# Set the c++17 standard
set(CMAKE_CXX_STANDARD 17)
# Set the -O3 optimization level with debug information
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -O3 -g")
# Honor <PackageName>_ROOT variables and silence the CMP0074 policy warning
cmake_policy(SET CMP0074 NEW)
# Find CUDA
find_package(CUDA REQUIRED)
# These flags embed debugging information for both host and device code
set(CUDA_NVCC_FLAGS -G -g)
# Generate a cuda executable file.
cuda_add_executable(${PROJECT_NAME} ${PROJECT_NAME}.cu)
target_link_libraries(${PROJECT_NAME} ${CUDA_cublas_LIBRARY})
The inline comments above explain what each directive in the configuration does.
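For reference, find_package(CUDA) and cuda_add_executable are deprecated in newer CMake releases, which treat CUDA as a first-class language. A rough equivalent of the file above, assuming CMake 3.18 or newer (this is an alternative sketch, not the configuration the article uses), would be:
cmake_minimum_required(VERSION 3.18)
project(matrix_multiply VERSION 0.1.0 LANGUAGES CXX CUDA)
# C++17 for both host and device code
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
# Provides imported targets such as CUDA::cublas
find_package(CUDAToolkit REQUIRED)
add_executable(${PROJECT_NAME} matrix_multiply_cublas.cu)
target_link_libraries(${PROJECT_NAME} PRIVATE CUDA::cublas)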
Build instructions:
- Building with cmake:
mkdir build && cd build
# Generate the Makefile
cmake ..
# Build the target (or simply run: make)
cmake --build .
- To compile directly with nvcc, the command is:
nvcc -O3 -G -g -std=c++17 matrix_multiply_cublas.cu -lcublas -o matrix_multiply_cublas
The options in the command above mean: -O3 enables O3-level optimization, -G embeds debug information in the device code, -g embeds debug information in the host code, -std=c++17 selects the C++17 standard, and -lcublas links against the cublas library.
Note: debug information makes CUDA device code dramatically slower; do not add the -G option unless you actually need to debug.
- The GCC compiler must be version 9.1 or newer (Ubuntu 20.04 images from 2021 onward ship GCC 9.3 by default).
With the constants changed to M = 2, N = 4, K = 3, the test output is as follows:
./matrix_multiply_cublas
A =
1 2 3
4 5 6
B =
1 2 3 4
5 6 7 8
9 10 11 12
C = AB =
38 44 50 56
83 98 113 128
With the constants changed to M = 400, N = 500, K = 600, the nvprof performance profile is as follows:
nvprof ./matrix_multiply_cublas
==216602== NVPROF is profiling process 216602, command: ./matrix_multiply
==216602== Profiling application: ./matrix_multiply
==216602== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 42.78% 184.93us 2 92.463us 75.519us 109.41us [CUDA memcpy HtoD]
42.44% 183.49us 1 183.49us 183.49us 183.49us volta_sgemm_128x32_nn
14.50% 62.687us 1 62.687us 62.687us 62.687us [CUDA memcpy DtoH]
0.28% 1.2160us 1 1.2160us 1.2160us 1.2160us [CUDA memset]
API calls: 99.77% 670.35ms 8 83.794ms 2.8420us 352.22ms cudaFree
0.08% 547.51us 3 182.50us 58.392us 347.88us cudaMemcpy
0.06% 406.58us 297 1.3680us 81ns 98.280us cuDeviceGetAttribute
0.05% 315.62us 6 52.602us 3.5290us 117.89us cudaMalloc
0.02% 113.83us 751 151ns 88ns 772ns cuGetProcAddress
0.01% 92.881us 3 30.960us 15.184us 60.794us cuDeviceGetName
0.00% 14.385us 1 14.385us 14.385us 14.385us cudaLaunchKernel
0.00% 11.389us 18 632ns 269ns 5.8650us cudaEventCreateWithFlags
0.00% 10.808us 1 10.808us 10.808us 10.808us cuDeviceGetPCIBusId
0.00% 6.9020us 2 3.4510us 524ns 6.3780us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
0.00% 6.3610us 4 1.5900us 628ns 2.7490us cudaDeviceSynchronize
0.00% 5.6670us 1 5.6670us 5.6670us 5.6670us cudaMemsetAsync
0.00% 5.5610us 18 308ns 226ns 947ns cudaEventDestroy
0.00% 5.3820us 3 1.7940us 321ns 3.5870us cudaGetDevice
0.00% 4.2940us 5 858ns 202ns 2.6630us cuDeviceGetCount
0.00% 4.0470us 14 289ns 171ns 960ns cudaDeviceGetAttribute
0.00% 2.6340us 2 1.3170us 1.2010us 1.4330us cuInit
0.00% 2.1070us 1 2.1070us 2.1070us 2.1070us cudaEventRecord
0.00% 1.9590us 1 1.9590us 1.9590us 1.9590us cudaEventQuery
0.00% 1.7620us 3 587ns 197ns 1.1320us cudaStreamGetCaptureInfo
0.00% 1.7510us 4 437ns 101ns 1.2370us cuDeviceGet
0.00% 1.2910us 3 430ns 219ns 777ns cuDeviceTotalMem
0.00% 1.0030us 1 1.0030us 1.0030us 1.0030us cudaGetSymbolAddress
0.00% 944ns 3 314ns 133ns 649ns cuDeviceGetUuid
0.00% 282ns 2 141ns 127ns 155ns cuDriverGetVersion
0.00% 236ns 1 236ns 236ns 236ns cudaGetLastError
0.00% 100ns 1 100ns 100ns 100ns cuModuleGetLoadingMode
This concludes the article on implementing matrix multiplication with cuBLAS.