文件名 |
优化方法 |
gFLOPs |
峰值占比 |
线程数 |
---|---|---|---|---|
MMult1.h |
无任何优化 |
0.24gflops |
2.1% |
1 |
MMult2.h |
一次计算4个元素 |
0.24gflops |
2.1% |
1 |
MMult_1x4_3.h |
一次计算4个元素 |
0.24gflops |
2.1% |
1 |
MMult_1x4_4.h |
一次计算4个元素 |
0.24gflops |
2.1% |
1 |
MMult_1x4_5.h |
一次计算4个元素(将4个循环合并为1个) |
0.25gflops |
2.2% |
1 |
MMult_1x4_7.h |
一次计算4个元素(在寄存器中累加C的元素,并对B的元素使用寄存器),用指针来寻址A中的元素 |
0.98gflops |
9.0% |
1 |
MMult_1x4_8.h |
在MMult_1x4_7的基础上,将内层循环按因子4展开(展开因子的选择相对任意) |
1.1gflops |
10% |
1 |
MMult1:
/*
 * AddDot -- add a k-term dot product to *gamma, one product at a time.
 *
 *   k     : number of terms.
 *   x     : first operand; read as the consecutive elements x[0] .. x[k-1].
 *   incx  : not named directly in this body.  The Y() macro is defined
 *           outside this excerpt and presumably uses incx as the stride
 *           of y -- confirm against the macro definition.
 *   y     : second operand; only reached through the Y() macro.
 *   gamma : in/out scalar; it is re-read and re-written for every term.
 */
void AddDot( int k, float *x, int incx, float *y, float *gamma )
{
  int term = 0;

  while ( term < k ) {
    *gamma = *gamma + x[term] * Y(term);
    term++;
  }
}
/*
 * MY_MMult1 -- baseline version: update C one element at a time.
 *
 * For every (row, col) of the m x n result, AddDot() adds the inner product
 * of row `row` of A with column `col` of B (k terms) to C(row, col).
 *
 * a/lda, b/ldb and c/ldc are not named directly here; they are consumed by
 * the A(), B() and C() indexing macros, which are defined outside this
 * excerpt.
 */
void MY_MMult1( int m, int n, int k, float *a, int lda,
                float *b, int ldb,
                float *c, int ldc )
{
  for ( int col = 0; col < n; col++ ) {      /* columns of C */
    for ( int row = 0; row < m; row++ ) {    /* rows of C */
      AddDot( k, &A( row, 0 ), lda, &B( 0, col ), &C( row, col ) );
    }
  }
}
MMult2:
我们一次计算C矩阵的一个元素,这个时候需要遍历A矩阵的一行和B矩阵的一列并做乘加运算。如果我们一次计算C矩阵的4个元素,那么我们可以每次遍历A矩阵的一行和B矩阵的四列.
用B矩阵中当前元素的地址+1,+2,+3来快速索引B矩阵中的下一个元素:
/*
 * AddDot -- accumulate k products x[p] * Y(p) into *gamma.
 *
 * x is read with unit stride.  y and incx are only reached through the Y()
 * macro (defined outside this excerpt; presumably y indexed with stride
 * incx -- confirm).  *gamma is updated in memory on every iteration.
 */
void AddDot( int k, float *x, int incx, float *y, float *gamma )
{
  const float *x_elem = x;

  for ( int p = 0; p < k; ++p, ++x_elem ) {
    *gamma += *x_elem * Y(p);
  }
}
/*
 * MY_MMult2 -- update C four columns at a time.
 *
 * For each row i, four AddDot() calls update C(i,j) .. C(i,j+3) using the
 * same row of A and four neighbouring columns of B.
 *
 * The unrolled loop only covers the largest multiple of 4 that fits in n;
 * any remaining 1..3 columns are handled one at a time afterwards, so the
 * routine no longer reads or writes past column n-1 when n % 4 != 0.
 * Results are unchanged when n is a multiple of 4.
 *
 * a/lda, b/ldb and c/ldc are consumed by the A(), B() and C() indexing
 * macros, which are defined outside this excerpt.
 */
void MY_MMult2( int m, int n, int k, float *a, int lda,
                float *b, int ldb,
                float *c, int ldc )
{
  int i, j;
  const int n4 = n - n % 4;   /* columns covered by the unrolled loop */

  for ( j=0; j<n4; j+=4 ){    /* Loop over the columns of C, unrolled by 4 */
    for ( i=0; i<m; i+=1 ){   /* Loop over the rows of C */
      AddDot( k, &A( i,0 ), lda, &B( 0,j ), &C( i,j ) );
      AddDot( k, &A( i,0 ), lda, &B( 0,j+1 ), &C( i,j+1 ) );
      AddDot( k, &A( i,0 ), lda, &B( 0,j+2 ), &C( i,j+2 ) );
      AddDot( k, &A( i,0 ), lda, &B( 0,j+3 ), &C( i,j+3 ) );
    }
  }

  /* Cleanup: the last n % 4 columns, one column per iteration. */
  for ( ; j<n; j+=1 ){
    for ( i=0; i<m; i+=1 ){
      AddDot( k, &A( i,0 ), lda, &B( 0,j ), &C( i,j ) );
    }
  }
}
MMult_1x4_3:
将上一步里四次乘加操作独立出来作为单独函数,然后在这个单独函数调用4次上一轮的乘法函数。在主程序中只调用一次单独函数:
/*
 * AddDot -- add the k products x[p] * Y(p), in increasing p, to *gamma.
 *
 * x is read with unit stride; y and incx are only used via the Y() macro,
 * whose definition is outside this excerpt (presumably a strided access
 * into y -- confirm).  The running sum lives in *gamma itself.
 */
void AddDot( int k, float *x, int incx, float *y, float *gamma )
{
  int idx;

  for ( idx = 0; idx != k && idx < k; idx++ ) {
    *gamma = x[idx] * Y(idx) + *gamma;
  }
}
/*
 * AddDot1x4 -- update the four elements C(0,0), C(0,1), C(0,2), C(0,3).
 *
 * Each element receives the k-term inner product of row 0 of A with the
 * matching column (0..3) of B, computed by a separate AddDot() call.
 * ldb and ldc are not named here; they are presumably consumed by the
 * B() and C() macros defined outside this excerpt.
 */
void AddDot1x4( int k, float *a, int lda, float *b, int ldb, float *c, int ldc )
{
  /* The same row of A is shared by all four inner products. */
  float *a_row = &A( 0, 0 );

  AddDot( k, a_row, lda, &B( 0, 0 ), &C( 0, 0 ) );
  AddDot( k, a_row, lda, &B( 0, 1 ), &C( 0, 1 ) );
  AddDot( k, a_row, lda, &B( 0, 2 ), &C( 0, 2 ) );
  AddDot( k, a_row, lda, &B( 0, 3 ), &C( 0, 3 ) );
}
/*
 * MY_MMult_1x4_3 -- update C in 1x4 blocks via AddDot1x4().
 *
 * Each AddDot1x4() call updates C(i,j) .. C(i,j+3).  The blocked loop only
 * covers the largest multiple of 4 that fits in n; any remaining 1..3
 * columns are updated one element at a time with AddDot(), so nothing is
 * read or written past column n-1 when n % 4 != 0.  Results are unchanged
 * when n is a multiple of 4.
 *
 * a, b and c are consumed by the A(), B() and C() indexing macros, which
 * are defined outside this excerpt.
 */
void MY_MMult_1x4_3( int m, int n, int k, float *a, int lda,
                     float *b, int ldb,
                     float *c, int ldc )
{
  int i, j;
  const int n4 = n - n % 4;   /* columns covered by the 1x4 kernel */

  for ( j=0; j<n4; j+=4 ){    /* Loop over the columns of C, unrolled by 4 */
    for ( i=0; i<m; i+=1 ){   /* Loop over the rows of C */
      AddDot1x4( k, &A( i,0 ), lda, &B( 0,j ), ldb, &C( i,j ), ldc );
    }
  }

  /* Cleanup: the last n % 4 columns, one element of C per AddDot() call. */
  for ( ; j<n; j+=1 ){
    for ( i=0; i<m; i+=1 ){
      AddDot( k, &A( i,0 ), lda, &B( 0,j ), &C( i,j ) );
    }
  }
}
MMult_1x4_4:
将上一步中独立出来的AddDot函数的循环体直接内联到AddDot1x4中,把两个乘加函数合并成一个:
/*
 * AddDot1x4 -- update C(0,0), C(0,1), C(0,2), C(0,3) with the inner loops
 * written out in place.
 *
 * There is one separate k-iteration loop per element; each loop adds
 * A(0,p) * B(p,col) directly into C(0,col) in memory.
 * All pointer/stride parameters are consumed by the A(), B() and C()
 * macros, which are defined outside this excerpt.
 */
void AddDot1x4( int k, float *a, int lda, float *b, int ldb, float *c, int ldc )
{
  /* column 0 */
  for ( int p = 0; p < k; ++p ) {
    C( 0, 0 ) = C( 0, 0 ) + A( 0, p ) * B( p, 0 );
  }

  /* column 1 */
  for ( int p = 0; p < k; ++p ) {
    C( 0, 1 ) = C( 0, 1 ) + A( 0, p ) * B( p, 1 );
  }

  /* column 2 */
  for ( int p = 0; p < k; ++p ) {
    C( 0, 2 ) = C( 0, 2 ) + A( 0, p ) * B( p, 2 );
  }

  /* column 3 */
  for ( int p = 0; p < k; ++p ) {
    C( 0, 3 ) = C( 0, 3 ) + A( 0, p ) * B( p, 3 );
  }
}
void MY_MMult_1x4_4( int m, int n, int k, float *a, int lda,
float *b, int ldb,
float *c, int ldc )
{
int i, j;
for ( j=0; j<n; j+=4 ){ /* Loop over the columns of C, unrolled by 4 */
for ( i=0; i<m; i+=1 ){ /* Loop over the rows of C */
AddDot1x4( k, &A( i,0 ), lda, &B( 0,j ), ldb, &C( i,j ), ldc );
}
}
}
MMult_1x4_5:
将上一步的乘加函数里,4个for循环合并成1个:
/*
 * AddDot1x4 -- update C(0,0), C(0,1), C(0,2), C(0,3) in a single pass
 * over p.
 *
 * On every step p, all four elements receive A(0,p) * B(p,col).  A(0,p) is
 * re-read for each of the four updates and the sums are kept in C itself.
 * All pointer/stride parameters are consumed by the A(), B() and C()
 * macros, which are defined outside this excerpt.
 */
void AddDot1x4( int k, float *a, int lda, float *b, int ldb, float *c, int ldc )
{
  int step = 0;

  while ( step < k ) {
    C( 0, 0 ) = C( 0, 0 ) + A( 0, step ) * B( step, 0 );
    C( 0, 1 ) = C( 0, 1 ) + A( 0, step ) * B( step, 1 );
    C( 0, 2 ) = C( 0, 2 ) + A( 0, step ) * B( step, 2 );
    C( 0, 3 ) = C( 0, 3 ) + A( 0, step ) * B( step, 3 );
    ++step;
  }
}
void MY_MMult_1x4_5( int m, int n, int k, float *a, int lda,
float *b, int ldb,
float *c, int ldc )
{
int i, j;
for ( j=0; j<n; j+=4 ){ /* Loop over the columns of C, unrolled by 4 */
for ( i=0; i<m; i+=1 ){ /* Loop over the rows of C */
AddDot1x4( k, &A( i,0 ), lda, &B( 0,j ), ldb, &C( i,j ), ldc );
}
}
}
MMult_1x4_6:
将A和C矩阵中元素送入寄存器进行计算:
/*
 * AddDot1x4 -- update C(0,0), C(0,1), C(0,2), C(0,3) using local
 * accumulators.
 *
 * The driver passes c = &C(i,j), so these are C(i,j) .. C(i,j+3) of the
 * full matrix.  The four partial sums start at zero and are kept in local
 * variables; A(0,p) is loaded once per step and reused for all four
 * products.  The sums are added to C only after the loop finishes.
 * All pointer/stride parameters are consumed by the A(), B() and C()
 * macros, which are defined outside this excerpt.
 */
void AddDot1x4( int k, float *a, int lda, float *b, int ldb, float *c, int ldc )
{
  float sum_col0 = 0.0f;
  float sum_col1 = 0.0f;
  float sum_col2 = 0.0f;
  float sum_col3 = 0.0f;

  for ( int p = 0; p < k; p++ ) {
    const float a_0p = A( 0, p );

    sum_col0 += a_0p * B( p, 0 );
    sum_col1 += a_0p * B( p, 1 );
    sum_col2 += a_0p * B( p, 2 );
    sum_col3 += a_0p * B( p, 3 );
  }

  /* Single write-back per element of C. */
  C( 0, 0 ) += sum_col0;
  C( 0, 1 ) += sum_col1;
  C( 0, 2 ) += sum_col2;
  C( 0, 3 ) += sum_col3;
}
void MY_MMult_1x4_6( int m, int n, int k, float *a, int lda,
float *b, int ldb,
float *c, int ldc )
{
int i, j;
for ( j=0; j<n; j+=4 ){ /* Loop over the columns of C, unrolled by 4 */
for ( i=0; i<m; i+=1 ){ /* Loop over the rows of C */
AddDot1x4( k, &A( i,0 ), lda, &B( 0,j ), ldb, &C( i,j ), ldc );
}
}
}
MMult_1x4_7:
将B和C矩阵中元素送入寄存器计算,将A矩阵的元素用指针进行索引(这里要注意A和B到底是行主序还是列主序):
/*
 * AddDot1x4 -- despite the name, updates a 4x1 strip of C:
 * C(0,0), C(1,0), C(2,0), C(3,0).
 *
 * B(p,0) is loaded once per step and reused for all four products.  The
 * four rows of A are walked through plain pointers starting at &A(r,0);
 * element p of such a pointer is used as A(r,p), which relies on the
 * elements of a row of A being contiguous (presumably a row-major A() macro
 * -- the macro is defined outside this excerpt, confirm).  Partial sums
 * start at zero in locals and are added to C after the loop.
 */
void AddDot1x4( int k, float *a, int lda, float *b, int ldb, float *c, int ldc )
{
  float *a_row0 = &A( 0, 0 );
  float *a_row1 = &A( 1, 0 );
  float *a_row2 = &A( 2, 0 );
  float *a_row3 = &A( 3, 0 );

  float sum_row0 = 0.0f;
  float sum_row1 = 0.0f;
  float sum_row2 = 0.0f;
  float sum_row3 = 0.0f;

  for ( int p = 0; p < k; p++ ) {
    const float b_p0 = B( p, 0 );

    sum_row0 += b_p0 * a_row0[p];
    sum_row1 += b_p0 * a_row1[p];
    sum_row2 += b_p0 * a_row2[p];
    sum_row3 += b_p0 * a_row3[p];
  }

  /* Single write-back per element of C. */
  C( 0, 0 ) += sum_row0;
  C( 1, 0 ) += sum_row1;
  C( 2, 0 ) += sum_row2;
  C( 3, 0 ) += sum_row3;
}
void MY_MMult_1x4_7( int m, int n, int k, float *a, int lda,
float *b, int ldb,
float *c, int ldc )
{
int i, j;
for ( j=0; j<n; j+=1 ){ /* Loop over the columns of C, unrolled by 4 */
for ( i=0; i<m; i+=4 ){ /* Loop over the rows of C */
AddDot1x4( k, &A( i,0 ), lda, &B( 0,j ), ldb, &C( i,j ), ldc );
}
}
}
MMult_1x4_8:
将上一步中“取出B矩阵中的元素并存放到寄存器中”的循环按因子4展开:
/*
 * AddDot1x4 -- despite the name, updates a 4x1 strip of C:
 * C(0,0), C(1,0), C(2,0), C(3,0), with the p loop unrolled by 4.
 *
 * B(p,0) is loaded once per step and reused for all four products.  The
 * four rows of A are walked with post-incremented pointers starting at
 * &A(r,0), which relies on the elements of a row of A being contiguous
 * (presumably a row-major A() macro -- the macro is defined outside this
 * excerpt, confirm).
 *
 * The unrolled loop only covers the largest multiple of 4 that fits in k.
 * The remaining 0..3 steps run in a one-step-at-a-time loop, so the routine
 * no longer reads B(p+1..p+3, 0) and A past index k-1 when k % 4 != 0.
 * Products are still added in increasing p, so results are unchanged when
 * k is a multiple of 4.
 */
void AddDot1x4( int k, float *a, int lda, float *b, int ldb, float *c, int ldc )
{
  int p;
  const int k4 = k - k % 4;   /* steps covered by the unrolled loop */
  register float
    /* partial sums for C(0,0), C(1,0), C(2,0), C(3,0) */
    c_00_reg, c_10_reg, c_20_reg, c_30_reg,
    b_0p_reg;
  float
    *ap0_pntr, *ap1_pntr, *ap2_pntr, *ap3_pntr;

  ap0_pntr = &A( 0, 0 );
  ap1_pntr = &A( 1, 0 );
  ap2_pntr = &A( 2, 0 );
  ap3_pntr = &A( 3, 0 );

  c_00_reg = 0.0f;
  c_10_reg = 0.0f;
  c_20_reg = 0.0f;
  c_30_reg = 0.0f;

  for ( p=0; p<k4; p+=4 ){
    b_0p_reg = B( p, 0 );
    c_00_reg += b_0p_reg * *ap0_pntr++;
    c_10_reg += b_0p_reg * *ap1_pntr++;
    c_20_reg += b_0p_reg * *ap2_pntr++;
    c_30_reg += b_0p_reg * *ap3_pntr++;

    b_0p_reg = B( p+1, 0 );
    c_00_reg += b_0p_reg * *ap0_pntr++;
    c_10_reg += b_0p_reg * *ap1_pntr++;
    c_20_reg += b_0p_reg * *ap2_pntr++;
    c_30_reg += b_0p_reg * *ap3_pntr++;

    b_0p_reg = B( p+2, 0 );
    c_00_reg += b_0p_reg * *ap0_pntr++;
    c_10_reg += b_0p_reg * *ap1_pntr++;
    c_20_reg += b_0p_reg * *ap2_pntr++;
    c_30_reg += b_0p_reg * *ap3_pntr++;

    b_0p_reg = B( p+3, 0 );
    c_00_reg += b_0p_reg * *ap0_pntr++;
    c_10_reg += b_0p_reg * *ap1_pntr++;
    c_20_reg += b_0p_reg * *ap2_pntr++;
    c_30_reg += b_0p_reg * *ap3_pntr++;
  }

  /* Remainder: the last k % 4 steps, one at a time. */
  for ( ; p<k; p++ ){
    b_0p_reg = B( p, 0 );
    c_00_reg += b_0p_reg * *ap0_pntr++;
    c_10_reg += b_0p_reg * *ap1_pntr++;
    c_20_reg += b_0p_reg * *ap2_pntr++;
    c_30_reg += b_0p_reg * *ap3_pntr++;
  }

  /* Single write-back per element of C. */
  C( 0, 0 ) += c_00_reg;
  C( 1, 0 ) += c_10_reg;
  C( 2, 0 ) += c_20_reg;
  C( 3, 0 ) += c_30_reg;
}
void MY_MMult_1x4_8( int m, int n, int k, float *a, int lda,
float *b, int ldb,
float *c, int ldc )
{
int i, j;
for ( j=0; j<n; j+=1 ){ /* Loop over the columns of C, unrolled by 4 */
for ( i=0; i<m; i+=4 ){ /* Loop over the rows of C */
AddDot1x4( k, &A( i,0 ), lda, &B( 0,j ), ldb, &C( i,j ), ldc );
}
}
}
到了这里,关于矩阵乘法优化:1x4矩阵块的各种优化方法的文章就介绍完了。如果您还想了解更多内容,请在右上角搜索TOY模板网以前的文章或继续浏览下面的相关文章,希望大家以后多多支持TOY模板网!