MMult_4x4_3.h |
一次计算C中的4x4小块 |
0.24gflops |
2.1% |
1 |
MMult_4x4_4.h |
一次计算C中的4x4小块 |
0.24gflops |
2.1% |
1 |
MMult_4x4_5.h |
一次计算C中的4x4小块,将16个循环合并一个 |
0.25gflops |
2.2% |
1 |
MMult_4x4_6.h |
一次计算C中的4x4小块(我们在寄存器中累加C的元素,并对a的元素使用寄存器) |
1.75gflops |
16.0% |
1 |
MMult_4x4_7.h |
在MMult_4x4_6的基础上用指针来寻址B中的元素 |
1.75gflops |
16.0% |
1 |
MMult_4x4_8.h |
使用更多的寄存器 |
1.75gflops |
16.0% |
1 |
MMult_4x4_10.h |
NEON指令集优化 |
2.6gflops |
23.8% |
1 |
MMult_4x4_11.h |
NEON指令集优化, 并且为了保持较小问题规模所获得的性能,我们分块矩阵C(以及相应的A和B) |
2.6gflops |
23.8% |
1 |
MMult_4x4_13.h |
NEON指令集优化, 对矩阵A和B进行Pack,这样就可以连续访问内存 |
2.6gflops |
23.8% |
1 |
MMult_4x4_3:
一次计算C矩阵的16个元素:
void AddDot( int k, float *x, int incx, float *y, float *gamma )
{
int p;
for ( p=0; p<k; p++ ){
*gamma += x[p] * Y(p);
}
}
void AddDot4x4( int k, float *a, int lda, float *b, int ldb, float *c, int ldc )
{
/* First row */
AddDot( k, &A( 0, 0 ), lda, &B( 0, 0 ), &C( 0, 0 ) );
AddDot( k, &A( 0, 0 ), lda, &B( 0, 1 ), &C( 0, 1 ) );
AddDot( k, &A( 0, 0 ), lda, &B( 0, 2 ), &C( 0, 2 ) );
AddDot( k, &A( 0, 0 ), lda, &B( 0, 3 ), &C( 0, 3 ) );
/* Second row */
AddDot( k, &A( 1, 0 ), lda, &B( 0, 0 ), &C( 1, 0 ) );
AddDot( k, &A( 1, 0 ), lda, &B( 0, 1 ), &C( 1, 1 ) );
AddDot( k, &A( 1, 0 ), lda, &B( 0, 2 ), &C( 1, 2 ) );
AddDot( k, &A( 1, 0 ), lda, &B( 0, 3 ), &C( 1, 3 ) );
/* Third row */
AddDot( k, &A( 2, 0 ), lda, &B( 0, 0 ), &C( 2, 0 ) );
AddDot( k, &A( 2, 0 ), lda, &B( 0, 1 ), &C( 2, 1 ) );
AddDot( k, &A( 2, 0 ), lda, &B( 0, 2 ), &C( 2, 2 ) );
AddDot( k, &A( 2, 0 ), lda, &B( 0, 3 ), &C( 2, 3 ) );
/* Four row */
AddDot( k, &A( 3, 0 ), lda, &B( 0, 0 ), &C( 3, 0 ) );
AddDot( k, &A( 3, 0 ), lda, &B( 0, 1 ), &C( 3, 1 ) );
AddDot( k, &A( 3, 0 ), lda, &B( 0, 2 ), &C( 3, 2 ) );
AddDot( k, &A( 3, 0 ), lda, &B( 0, 3 ), &C( 3, 3 ) );
}
void MY_MMult_4x4_3( int m, int n, int k, float *a, int lda,
float *b, int ldb,
float *c, int ldc )
{
int i, j;
for ( j=0; j<n; j+=4 ){ /* Loop over the columns of C, unrolled by 4 */
for ( i=0; i<m; i+=4 ){ /* Loop over the rows of C */
AddDot4x4( k, &A( i,0 ), lda, &B( 0,j ), ldb, &C( i,j ), ldc );
}
}
}
MMult_4x4_4:
将上一步的矩阵元素直接乘加计算,合并成循环操作。将两个计算函数合并成一个。并显式地将C矩阵中各个索引的元素作为计算结果:
void AddDot4x4( int k, float *a, int lda, float *b, int ldb, float *c, int ldc )
{
int p;
/* First row */
// AddDot( k, &A( 0, 0 ), lda, &B( 0, 0 ), &C( 0, 0 ) );
for ( p=0; p<k; p++ ){
C( 0, 0 ) += A( 0, p ) * B( p, 0 );
}
// AddDot( k, &A( 0, 0 ), lda, &B( 0, 1 ), &C( 0, 1 ) );
for ( p=0; p<k; p++ ){
C( 0, 1 ) += A( 0, p ) * B( p, 1 );
}
// AddDot( k, &A( 0, 0 ), lda, &B( 0, 2 ), &C( 0, 2 ) );
for ( p=0; p<k; p++ ){
C( 0, 2 ) += A( 0, p ) * B( p, 2 );
}
// AddDot( k, &A( 0, 0 ), lda, &B( 0, 3 ), &C( 0, 3 ) );
for ( p=0; p<k; p++ ){
C( 0, 3 ) += A( 0, p ) * B( p, 3 );
}
/* Second row */
// AddDot( k, &A( 1, 0 ), lda, &B( 0, 0 ), &C( 1, 0 ) );
for ( p=0; p<k; p++ ){
C( 1, 0 ) += A( 1, p ) * B( p, 0 );
}
// AddDot( k, &A( 1, 0 ), lda, &B( 0, 1 ), &C( 1, 1 ) );
for ( p=0; p<k; p++ ){
C( 1, 1 ) += A( 1, p ) * B( p, 1 );
}
// AddDot( k, &A( 1, 0 ), lda, &B( 0, 2 ), &C( 1, 2 ) );
for ( p=0; p<k; p++ ){
C( 1, 2 ) += A( 1, p ) * B( p, 2 );
}
// AddDot( k, &A( 1, 0 ), lda, &B( 0, 3 ), &C( 1, 3 ) );
for ( p=0; p<k; p++ ){
C( 1, 3 ) += A( 1, p ) * B( p, 3 );
}
/* Third row */
// AddDot( k, &A( 2, 0 ), lda, &B( 0, 0 ), &C( 2, 0 ) );
for ( p=0; p<k; p++ ){
C( 2, 0 ) += A( 2, p ) * B( p, 0 );
}
// AddDot( k, &A( 2, 0 ), lda, &B( 0, 1 ), &C( 2, 1 ) );
for ( p=0; p<k; p++ ){
C( 2, 1 ) += A( 2, p ) * B( p, 1 );
}
// AddDot( k, &A( 2, 0 ), lda, &B( 0, 2 ), &C( 2, 2 ) );
for ( p=0; p<k; p++ ){
C( 2, 2 ) += A( 2, p ) * B( p, 2 );
}
// AddDot( k, &A( 2, 0 ), lda, &B( 0, 3 ), &C( 2, 3 ) );
for ( p=0; p<k; p++ ){
C( 2, 3 ) += A( 2, p ) * B( p, 3 );
}
/* Four row */
// AddDot( k, &A( 3, 0 ), lda, &B( 0, 0 ), &C( 3, 0 ) );
for ( p=0; p<k; p++ ){
C( 3, 0 ) += A( 3, p ) * B( p, 0 );
}
// AddDot( k, &A( 3, 0 ), lda, &B( 0, 1 ), &C( 3, 1 ) );
for ( p=0; p<k; p++ ){
C( 3, 1 ) += A( 3, p ) * B( p, 1 );
}
// AddDot( k, &A( 3, 0 ), lda, &B( 0, 2 ), &C( 3, 2 ) );
for ( p=0; p<k; p++ ){
C( 3, 2 ) += A( 3, p ) * B( p, 2 );
}
// AddDot( k, &A( 3, 0 ), lda, &B( 0, 3 ), &C( 3, 3 ) );
for ( p=0; p<k; p++ ){
C( 3, 3 ) += A( 3, p ) * B( p, 3 );
}
}
void MY_MMult_4x4_4( int m, int n, int k, float *a, int lda,
float *b, int ldb,
float *c, int ldc )
{
int i, j;
for ( j=0; j<n; j+=4 ){ /* Loop over the columns of C, unrolled by 4 */
for ( i=0; i<m; i+=4 ){ /* Loop over the rows of C */
AddDot4x4( k, &A( i,0 ), lda, &B( 0,j ), ldb, &C( i,j ), ldc );
}
}
}
MMult_4x4_5:
上一步一次for循环计算1个C中元素,这一步一次for循环计算4个C中元素:
void AddDot4x4( int k, float *a, int lda, float *b, int ldb, float *c, int ldc )
{
int p;
for ( p=0; p<k; p++ ){
/* First row */
C( 0, 0 ) += A( 0, p ) * B( p, 0 );
C( 0, 1 ) += A( 0, p ) * B( p, 1 );
C( 0, 2 ) += A( 0, p ) * B( p, 2 );
C( 0, 3 ) += A( 0, p ) * B( p, 3 );
/* Second row */
C( 1, 0 ) += A( 1, p ) * B( p, 0 );
C( 1, 1 ) += A( 1, p ) * B( p, 1 );
C( 1, 2 ) += A( 1, p ) * B( p, 2 );
C( 1, 3 ) += A( 1, p ) * B( p, 3 );
/* Third row */
C( 2, 0 ) += A( 2, p ) * B( p, 0 );
C( 2, 1 ) += A( 2, p ) * B( p, 1 );
C( 2, 2 ) += A( 2, p ) * B( p, 2 );
C( 2, 3 ) += A( 2, p ) * B( p, 3 );
/* Fourth row */
C( 3, 0 ) += A( 3, p ) * B( p, 0 );
C( 3, 1 ) += A( 3, p ) * B( p, 1 );
C( 3, 2 ) += A( 3, p ) * B( p, 2 );
C( 3, 3 ) += A( 3, p ) * B( p, 3 );
}
}
void MY_MMult_4x4_5( int m, int n, int k, float *a, int lda,
float *b, int ldb,
float *c, int ldc )
{
int i, j;
for ( j=0; j<n; j+=4 ){ /* Loop over the columns of C, unrolled by 4 */
for ( i=0; i<m; i+=4 ){ /* Loop over the rows of C */
AddDot4x4( k, &A( i,0 ), lda, &B( 0,j ), ldb, &C( i,j ), ldc );
}
}
}
MMult_4x4_6:
将A和C矩阵中元素送入寄存器来优化计算。每次只计算1个C中元素:
void AddDot4x4( int k, float *a, int lda, float *b, int ldb, float *c, int ldc )
{
int p;
register float
c_00_reg, c_01_reg, c_02_reg, c_03_reg,
c_10_reg, c_11_reg, c_12_reg, c_13_reg,
c_20_reg, c_21_reg, c_22_reg, c_23_reg,
c_30_reg, c_31_reg, c_32_reg, c_33_reg,
a_0p_reg,
a_1p_reg,
a_2p_reg,
a_3p_reg;
c_00_reg = 0.0; c_01_reg = 0.0; c_02_reg = 0.0; c_03_reg = 0.0;
c_10_reg = 0.0; c_11_reg = 0.0; c_12_reg = 0.0; c_13_reg = 0.0;
c_20_reg = 0.0; c_21_reg = 0.0; c_22_reg = 0.0; c_23_reg = 0.0;
c_30_reg = 0.0; c_31_reg = 0.0; c_32_reg = 0.0; c_33_reg = 0.0;
for ( p=0; p<k; p++ ){
a_0p_reg = A( 0, p );
a_1p_reg = A( 1, p );
a_2p_reg = A( 2, p );
a_3p_reg = A( 3, p );
/* First row */
c_00_reg += a_0p_reg * B( p, 0 );
c_01_reg += a_0p_reg * B( p, 1 );
c_02_reg += a_0p_reg * B( p, 2 );
c_03_reg += a_0p_reg * B( p, 3 );
/* Second row */
c_10_reg += a_1p_reg * B( p, 0 );
c_11_reg += a_1p_reg * B( p, 1 );
c_12_reg += a_1p_reg * B( p, 2 );
c_13_reg += a_1p_reg * B( p, 3 );
/* Third row */
c_20_reg += a_2p_reg * B( p, 0 );
c_21_reg += a_2p_reg * B( p, 1 );
c_22_reg += a_2p_reg * B( p, 2 );
c_23_reg += a_2p_reg * B( p, 3 );
/* Four row */
c_30_reg += a_3p_reg * B( p, 0 );
c_31_reg += a_3p_reg * B( p, 1 );
c_32_reg += a_3p_reg * B( p, 2 );
c_33_reg += a_3p_reg * B( p, 3 );
}
C( 0, 0 ) += c_00_reg; C( 0, 1 ) += c_01_reg; C( 0, 2 ) += c_02_reg; C( 0, 3 ) += c_03_reg;
C( 1, 0 ) += c_10_reg; C( 1, 1 ) += c_11_reg; C( 1, 2 ) += c_12_reg; C( 1, 3 ) += c_13_reg;
C( 2, 0 ) += c_20_reg; C( 2, 1 ) += c_21_reg; C( 2, 2 ) += c_22_reg; C( 2, 3 ) += c_23_reg;
C( 3, 0 ) += c_30_reg; C( 3, 1 ) += c_31_reg; C( 3, 2 ) += c_32_reg; C( 3, 3 ) += c_33_reg;
}
void MY_MMult_4x4_6( int m, int n, int k, float *a, int lda,
float *b, int ldb,
float *c, int ldc )
{
int i, j;
for ( j=0; j<n; j+=4 ){ /* Loop over the columns of C, unrolled by 4 */
for ( i=0; i<m; i+=4 ){ /* Loop over the rows of C */
AddDot4x4( k, &A( i,0 ), lda, &B( 0,j ), ldb, &C( i,j ), ldc );
}
}
}
MMult_4x4_7:
将A和C矩阵中元素送入寄存器,并用指针来索引B矩阵中元素,使用索引++来获取B矩阵中下一个元素:
void AddDot4x4( int k, float *a, int lda, float *b, int ldb, float *c, int ldc )
{
int p;
register float
c_00_reg, c_01_reg, c_02_reg, c_03_reg,
c_10_reg, c_11_reg, c_12_reg, c_13_reg,
c_20_reg, c_21_reg, c_22_reg, c_23_reg,
c_30_reg, c_31_reg, c_32_reg, c_33_reg,
a_0p_reg,
a_1p_reg,
a_2p_reg,
a_3p_reg;
float
/* Point to the current elements in the four columns of B */
*b_p0_pntr, *b_p1_pntr, *b_p2_pntr, *b_p3_pntr;
c_00_reg = 0.0; c_01_reg = 0.0; c_02_reg = 0.0; c_03_reg = 0.0;
c_10_reg = 0.0; c_11_reg = 0.0; c_12_reg = 0.0; c_13_reg = 0.0;
c_20_reg = 0.0; c_21_reg = 0.0; c_22_reg = 0.0; c_23_reg = 0.0;
c_30_reg = 0.0; c_31_reg = 0.0; c_32_reg = 0.0; c_33_reg = 0.0;
for ( p=0; p<k; p++ ){
a_0p_reg = A( 0, p );
a_1p_reg = A( 1, p );
a_2p_reg = A( 2, p );
a_3p_reg = A( 3, p );
b_p0_pntr = &B( p, 0 );
b_p1_pntr = &B( p, 1 );
b_p2_pntr = &B( p, 2 );
b_p3_pntr = &B( p, 3 );
/* First row */
c_00_reg += a_0p_reg * *b_p0_pntr;
c_01_reg += a_0p_reg * *b_p1_pntr;
c_02_reg += a_0p_reg * *b_p2_pntr;
c_03_reg += a_0p_reg * *b_p3_pntr;
/* Second row */
c_10_reg += a_1p_reg * *b_p0_pntr;
c_11_reg += a_1p_reg * *b_p1_pntr;
c_12_reg += a_1p_reg * *b_p2_pntr;
c_13_reg += a_1p_reg * *b_p3_pntr;
/* Third row */
c_20_reg += a_2p_reg * *b_p0_pntr;
c_21_reg += a_2p_reg * *b_p1_pntr;
c_22_reg += a_2p_reg * *b_p2_pntr;
c_23_reg += a_2p_reg * *b_p3_pntr;
/* Four row */
c_30_reg += a_3p_reg * *b_p0_pntr++;
c_31_reg += a_3p_reg * *b_p1_pntr++;
c_32_reg += a_3p_reg * *b_p2_pntr++;
c_33_reg += a_3p_reg * *b_p3_pntr++;
}
C( 0, 0 ) += c_00_reg; C( 0, 1 ) += c_01_reg; C( 0, 2 ) += c_02_reg; C( 0, 3 ) += c_03_reg;
C( 1, 0 ) += c_10_reg; C( 1, 1 ) += c_11_reg; C( 1, 2 ) += c_12_reg; C( 1, 3 ) += c_13_reg;
C( 2, 0 ) += c_20_reg; C( 2, 1 ) += c_21_reg; C( 2, 2 ) += c_22_reg; C( 2, 3 ) += c_23_reg;
C( 3, 0 ) += c_30_reg; C( 3, 1 ) += c_31_reg; C( 3, 2 ) += c_32_reg; C( 3, 3 ) += c_33_reg;
}
void MY_MMult_4x4_7( int m, int n, int k, float *a, int lda,
float *b, int ldb,
float *c, int ldc )
{
int i, j;
for ( j=0; j<n; j+=4 ){ /* Loop over the columns of C, unrolled by 4 */
for ( i=0; i<m; i+=4 ){ /* Loop over the rows of C */
AddDot4x4( k, &A( i,0 ), lda, &B( 0,j ), ldb, &C( i,j ), ldc );
}
}
}
MMult_4x4_8:文章来源:https://www.toymoban.com/news/detail-556296.html
将A,B,C矩阵中元素都送入寄存器中来加速计算:文章来源地址https://www.toymoban.com/news/detail-556296.html
void AddDot4x4( int k, float *a, int lda, float *b, int ldb, float *c, int ldc )
{
int p;
c_00_reg, c_01_reg, c_02_reg, c_03_reg,
c_10_reg, c_11_reg, c_12_reg, c_13_reg,
c_20_reg, c_21_reg, c_22_reg, c_23_reg,
c_30_reg, c_31_reg, c_32_reg, c_33_reg,
a_0p_reg,
a_1p_reg,
a_2p_reg,
a_3p_reg,
b_p0_reg,
b_p1_reg,
b_p2_reg,
b_p3_reg;
float
/* Point to the current elements in the four rows of A */
*a_0p_pntr, *a_1p_pntr, *a_2p_pntr, *a_3p_pntr;
a_0p_pntr = &A( 0, 0);
a_1p_pntr = &A( 1, 0);
a_2p_pntr = &A( 2, 0);
a_3p_pntr = &A( 3, 0);
c_00_reg = 0.0; c_01_reg = 0.0; c_02_reg = 0.0; c_03_reg = 0.0;
c_10_reg = 0.0; c_11_reg = 0.0; c_12_reg = 0.0; c_13_reg = 0.0;
c_20_reg = 0.0; c_21_reg = 0.0; c_22_reg = 0.0; c_23_reg = 0.0;
c_30_reg = 0.0; c_31_reg = 0.0; c_32_reg = 0.0; c_33_reg = 0.0;
for ( p=0; p<k; p++ ){
a_0p_reg = *a_0p_pntr++;
a_1p_reg = *a_1p_pntr++;
a_2p_reg = *a_2p_pntr++;
a_3p_reg = *a_3p_pntr++;
b_p0_reg = B( p, 0);
b_p1_reg = B( p, 1);
b_p2_reg = B( p, 2);
b_p3_reg = B( p, 3);
/* First row */
c_00_reg += a_0p_reg * b_p0_reg;
c_01_reg += a_0p_reg * b_p1_reg;
c_02_reg += a_0p_reg * b_p2_reg;
c_03_reg += a_0p_reg * b_p3_reg;
/* Second row */
c_10_reg += a_1p_reg * b_p0_reg;
c_11_reg += a_1p_reg * b_p1_reg;
c_12_reg += a_1p_reg * b_p2_reg;
c_13_reg += a_1p_reg * b_p3_reg;
/* Third row */
c_20_reg += a_2p_reg * b_p0_reg;
c_21_reg += a_2p_reg * b_p1_reg;
c_22_reg += a_2p_reg * b_p2_reg;
c_23_reg += a_2p_reg * b_p3_reg;
/* Four row */
c_30_reg += a_3p_reg * b_p0_reg;
c_31_reg += a_3p_reg * b_p1_reg;
c_32_reg += a_3p_reg * b_p2_reg;
c_33_reg += a_3p_reg * b_p3_reg;
}
C( 0, 0 ) += c_00_reg; C( 0, 1 ) += c_01_reg; C( 0, 2 ) += c_02_reg; C( 0, 3 ) += c_03_reg;
C( 1, 0 ) += c_10_reg; C( 1, 1 ) += c_11_reg; C( 1, 2 ) += c_12_reg; C( 1, 3 ) += c_13_reg;
C( 2, 0 ) += c_20_reg; C( 2, 1 ) += c_21_reg; C( 2, 2 ) += c_22_reg; C( 2, 3 ) += c_23_reg;
C( 3, 0 ) += c_30_reg; C( 3, 1 ) += c_31_reg; C( 3, 2 ) += c_32_reg; C( 3, 3 ) += c_33_reg;
}
void MY_MMult_4x4_8( int m, int n, int k, float *a, int lda,
float *b, int ldb,
float *c, int ldc )
{
int i, j;
for ( j=0; j<n; j+=4 ){ /* Loop over the columns of C, unrolled by 4 */
for ( i=0; i<m; i+=4 ){ /* Loop over the rows of C */
AddDot4x4( k, &A( i,0 ), lda, &B( 0,j ), ldb, &C( i,j ), ldc );
}
}
}
到了这里,关于矩阵乘法优化:4x4矩阵块优化方法的文章就介绍完了。如果您还想了解更多内容,请在右上角搜索TOY模板网以前的文章或继续浏览下面的相关文章,希望大家以后多多支持TOY模板网!