目录
1. 分帧
1.1 非整齐分帧
1.2 整齐分帧
2. 示例代码
1. 分帧
问题1:总帧数如何计算?
记符号N为语音总长度,FRAME_LEN为帧长,OVERLAP_LEN为帧与帧之间的重叠部分,STEP_LEN为帧移(步长)。则总帧数N_Frames计算如下:
$$N\_Frames=\left\lfloor\frac{N-OVERLAP\_LEN}{FRAME\_LEN-OVERLAP\_LEN}\right\rfloor$$
因为OVERLAP_LEN=FRAME_LEN-STEP_LEN
所以有:
$$N\_Frames=\left\lfloor\frac{N-FRAME\_LEN}{STEP\_LEN}\right\rfloor+1$$
N-FRAME_LEN可以看成一个常数。因此帧移越大,重叠部分就越小,总帧数就越少;反之亦然。当帧移取最大值、即等于帧长(STEP_LEN=FRAME_LEN)的时候,式子变为:
$$N\_Frames=\left\lfloor\frac{N}{FRAME\_LEN}\right\rfloor$$
此时就变成了均分了,而不是重叠分割了。
重叠分割的情况下,帧移的选择也影响整段语音分帧的结果,而实时语音处理中一般取N为256的整数倍。
问题2:关于输入和输出
重叠输入,按STEP_LEN输出。
1.1 非整齐分帧
非整齐分帧的情况下,有数据丢失。首先是输入,取帧数据的时候块尾的一部分点没法取到,其次是考虑到输出。例:N=2048,FRAME_LEN=256,当STEP_LEN=28+8=36时,总帧数N_Frames=⌊1792/36⌋+1=49+1=50(除法向下取整),最后一帧的起始位置:49x36=1764,取最后一帧256个点,结果是1764+256=2020,即最后2048-2020=28个点没法取到。
1.2 整齐分帧
例:N=2048,FRAME_LEN=256,当STEP_LEN=128时,可以分为15帧(帧索引是0~14),刚好能把所有的点都取到,最后一个帧的起始位置:14x128=1792,再取一帧长256,刚好就是2048。由此类推,以下帧移,都能整齐分帧:
14x128=1792, 28x64=1792, 56x32=1792, 112x16=1792, 224x8=1792
上面每个乘式中,第一个数是最后一帧的帧索引(即N_Frames-1,例如STEP_LEN=128时共15帧,最后一帧索引为14),第二个数就是帧移STEP_LEN。帧移越小,分的帧越多,计算量也就越大。一般取50%的重叠,即帧移和帧重叠各占帧长的50%。
注:最后一帧全部输出即可。
2. 示例代码
#if 1
#define _CRT_SECURE_NO_WARNINGS
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <sys/time.h>
#include "baselib.h"
#include "win_fun.h"
typedef unsigned char uint8_t;
#define FRAME_LEN (256)
#define STEP_LEN (128)
//#define STEP_LEN (64)
//#define STEP_LEN (32)
//#define STEP_LEN (16)
//#define STEP_LEN (8)
#define OVERLAP_LEN (FRAME_LEN-STEP_LEN)
#define BLK_INPUT_LEN (4096)
//#define BLK_INPUT_LEN (8192)
/*
 * Analysis window applied to every frame before the FFT, one coefficient per
 * sample of a FRAME_LEN (256) point frame.
 * The values rise from 0.08 at the edges to ~1.0 at the centre and fall back
 * to 0.08; they are consistent with a Hamming window
 * (0.54 - 0.46*cos(2*pi*n/255)) -- NOTE(review): presumably generated by
 * win_fun.h, confirm.
 * No coefficient is zero (minimum 0.08), which is what allows voice_frame()
 * to undo the window by dividing by win[i].
 */
static const float win[FRAME_LEN]={
0.080000, 0.080140, 0.080558, 0.081256, 0.082232, 0.083487, 0.085018, 0.086825, 0.088908, 0.091264, 0.093893, 0.096793, 0.099962, 0.103398, 0.107099, 0.111063, 0.115287, 0.119769, 0.124506, 0.129496, 0.134734, 0.140219, 0.145946, 0.151913, 0.158115, 0.164549, 0.171211, 0.178097, 0.185203, 0.192524, 0.200056, 0.207794,
0.215734, 0.223871, 0.232200, 0.240716, 0.249413, 0.258287, 0.267332, 0.276542, 0.285912, 0.295437, 0.305110, 0.314925, 0.324878, 0.334960, 0.345168, 0.355493, 0.365931, 0.376474, 0.387117, 0.397852, 0.408674, 0.419575, 0.430550, 0.441591, 0.452691, 0.463845, 0.475045, 0.486285, 0.497557, 0.508854, 0.520171, 0.531500,
0.542834, 0.554166, 0.565489, 0.576797, 0.588083, 0.599340, 0.610560, 0.621738, 0.632866, 0.643938, 0.654946, 0.665885, 0.676747, 0.687527, 0.698216, 0.708810, 0.719301, 0.729684, 0.739951, 0.750097, 0.760115, 0.770000, 0.779745, 0.789345, 0.798793, 0.808084, 0.817212, 0.826172, 0.834958, 0.843565, 0.851988, 0.860222,
0.868261, 0.876100, 0.883736, 0.891163, 0.898377, 0.905373, 0.912148, 0.918696, 0.925015, 0.931100, 0.936947, 0.942554, 0.947916, 0.953030, 0.957894, 0.962504, 0.966857, 0.970952, 0.974785, 0.978353, 0.981656, 0.984690, 0.987455, 0.989948, 0.992168, 0.994113, 0.995782, 0.997175, 0.998290, 0.999128, 0.999686, 0.999965,
0.999965, 0.999686, 0.999128, 0.998290, 0.997175, 0.995782, 0.994113, 0.992168, 0.989948, 0.987455, 0.984690, 0.981656, 0.978353, 0.974785, 0.970952, 0.966857, 0.962504, 0.957894, 0.953030, 0.947916, 0.942554, 0.936947, 0.931100, 0.925015, 0.918696, 0.912148, 0.905373, 0.898377, 0.891163, 0.883736, 0.876100, 0.868261,
0.860222, 0.851988, 0.843565, 0.834958, 0.826172, 0.817212, 0.808084, 0.798793, 0.789345, 0.779745, 0.770000, 0.760115, 0.750097, 0.739951, 0.729684, 0.719302, 0.708810, 0.698216, 0.687527, 0.676747, 0.665885, 0.654946, 0.643938, 0.632866, 0.621738, 0.610560, 0.599340, 0.588083, 0.576797, 0.565489, 0.554166, 0.542833,
0.531500, 0.520171, 0.508854, 0.497557, 0.486285, 0.475045, 0.463845, 0.452692, 0.441591, 0.430550, 0.419575, 0.408674, 0.397852, 0.387117, 0.376474, 0.365931, 0.355493, 0.345168, 0.334960, 0.324877, 0.314925, 0.305110, 0.295437, 0.285912, 0.276542, 0.267331, 0.258287, 0.249413, 0.240716, 0.232200, 0.223871, 0.215734,
0.207794, 0.200056, 0.192524, 0.185203, 0.178097, 0.171211, 0.164549, 0.158115, 0.151913, 0.145947, 0.140219, 0.134734, 0.129496, 0.124506, 0.119769, 0.115287, 0.111063, 0.107099, 0.103398, 0.099962, 0.096793, 0.093893, 0.091265, 0.088908, 0.086825, 0.085018, 0.083487, 0.082232, 0.081256, 0.080558, 0.080140, 0.080000,
};
//float winGain;
//int zero_cnt;
//int zero_idx;
/*
 * Processes one block frame by frame (definition follows main()).
 *   x         input samples of the block (read only by the implementation)
 *   Nframes   number of overlapping frames to cut from the block
 *   xout      output samples; receives STEP_LEN samples per frame and the
 *             whole FRAME_LEN samples of the last frame
 *   blk_index index of the block within the file (currently unused by the body)
 */
void voice_frame(short *x, int Nframes, short *xout, int blk_index);
/*
 * Framing demo driver.
 *
 * Reads the whole of "80k.pcm" (16-bit little-endian PCM), splits it into
 * blocks of BLK_INPUT_LEN bytes, runs voice_frame() on each block while
 * timing the call, and writes the processed samples to "output80k.pcm".
 * The output file has the same length as the input; a trailing partial block
 * (length not a multiple of BLK_INPUT_LEN) is not processed and is written
 * as zeros.
 *
 * Returns 0 on success and -1 on any I/O or allocation failure. All buffers
 * and streams are released on every path.
 */
int main(void)
{
    int rc = -1;
    FILE *input_ptr = NULL;
    FILE *output_ptr = NULL;
    uint8_t *all_in_dat = NULL;   /* raw bytes of the input file */
    uint8_t *all_out_dat = NULL;  /* raw bytes of the output file */
    short *in_dat = NULL;         /* decoded samples of the current block */
    short *out_dat = NULL;        /* processed samples of the current block */
    struct timeval start, end;
    long inputdata_length;
    size_t total_bytes;
    size_t count;
    int in_dat_len = BLK_INPUT_LEN / 2; /* two bytes per 16-bit sample */
    int out_dat_len = in_dat_len;
    int all_block;
    int Nframes;
    int block_index;
    int i;

    /* "rb": PCM is binary data; text mode would translate bytes on Windows. */
    input_ptr = fopen("80k.pcm", "rb");
    if (!input_ptr) {
        printf("open input stream fail\n");
        goto cleanup;
    }

    if (fseek(input_ptr, 0, SEEK_END) != 0) {
        printf("seek input stream fail\n");
        goto cleanup;
    }
    inputdata_length = ftell(input_ptr);
    if (inputdata_length < 0) {
        printf("tell input stream fail\n");
        goto cleanup;
    }
    printf("inputdata_length:%ld\n", inputdata_length);
    rewind(input_ptr);
    total_bytes = (size_t)inputdata_length;

    /* Allocate at least one byte so an empty file is not mistaken for OOM. */
    all_in_dat = calloc(total_bytes ? total_bytes : 1, sizeof *all_in_dat);
    all_out_dat = calloc(total_bytes ? total_bytes : 1, sizeof *all_out_dat);
    in_dat = calloc((size_t)in_dat_len, sizeof *in_dat);
    out_dat = calloc((size_t)out_dat_len, sizeof *out_dat);
    if (!all_in_dat || !all_out_dat || !in_dat || !out_dat) {
        printf("alloc buffer fail\n");
        goto cleanup;
    }

    count = fread(all_in_dat, sizeof *all_in_dat, total_bytes, input_ptr);
    printf("count:%ld\n", (long)count);
    if (count != total_bytes) {
        printf("read input stream fail\n");
        goto cleanup;
    }

    printf("in_dat_len:%d\n", in_dat_len);

    all_block = (int)(inputdata_length / BLK_INPUT_LEN);
    printf("all_block:%d\n", all_block);

    /* Number of FRAME_LEN frames, STEP_LEN apart, that fit in one block. */
    Nframes = (in_dat_len - FRAME_LEN) / STEP_LEN + 1;
    printf("a block can div to Nframes:%d\n", Nframes);

    for (block_index = 0; block_index < all_block; block_index++) {
        const uint8_t *blk_in = all_in_dat + (size_t)block_index * BLK_INPUT_LEN;
        uint8_t *blk_out = all_out_dat + (size_t)block_index * BLK_INPUT_LEN;
        long timeuse;

        /* Decode little-endian byte pairs into 16-bit samples. */
        for (i = 0; i < in_dat_len; i++) {
            in_dat[i] = (short)(blk_in[2 * i + 1] << 8 | blk_in[2 * i]);
        }

        /* Process the block and report how long the call took. */
        gettimeofday(&start, NULL);
        voice_frame(in_dat, Nframes, out_dat, block_index);
        gettimeofday(&end, NULL);
        timeuse = 1000000L * (long)(end.tv_sec - start.tv_sec)
                + (long)(end.tv_usec - start.tv_usec);
        printf("time=%.10fs\n", (double)timeuse / 1000000);

#if 0
        /*
         * Diagnostic for non-aligned framing (e.g. STEP_LEN=40): the tail of
         * the block is never written, so count the zero samples left in a
         * pre-cleared output buffer.
         */
        {
            int zero_cnt = 0;

            memset(out_dat, 0, (size_t)out_dat_len * sizeof *out_dat);
            voice_frame(in_dat, Nframes, out_dat, block_index);
            for (i = 0; i < out_dat_len; i++) {
                if (out_dat[i] != 0)
                    continue;
                if (zero_cnt > 5)
                    zero_cnt++;
                else if (i + 2 < out_dat_len && out_dat[i + 1] == 0 && out_dat[i + 2] == 0)
                    zero_cnt++;
            }
            if (zero_cnt != 0)
                printf("zero_idx:%d, zero_cnt:%d\n", out_dat_len - zero_cnt, zero_cnt);
        }
#endif
#if 0
        /* Pass-through experiment: bypass the processing result. */
        for (i = 0; i < in_dat_len; i++)
            out_dat[i] = in_dat[i];
#endif

        /*
         * Encode the samples back to little-endian bytes. Going through
         * unsigned short avoids right-shifting a negative value.
         */
        for (i = 0; i < out_dat_len; i++) {
            unsigned short bits = (unsigned short)out_dat[i];

            blk_out[2 * i] = (uint8_t)(bits & 0xff);            /* low 8 bits */
            blk_out[2 * i + 1] = (uint8_t)((bits >> 8) & 0xff); /* high 8 bits */
        }
    }

    output_ptr = fopen("output80k.pcm", "wb");
    if (!output_ptr) {
        printf("open output stream fail\n");
        goto cleanup;
    }
    if (fwrite(all_out_dat, sizeof *all_out_dat, total_bytes, output_ptr) != total_bytes) {
        printf("write output stream fail\n");
        goto cleanup;
    }
    rc = 0;

cleanup:
    free(all_in_dat);
    free(all_out_dat);
    free(in_dat);
    free(out_dat);
    if (input_ptr)
        fclose(input_ptr);
    /* fclose flushes buffered output, so its failure means a bad file. */
    if (output_ptr && fclose(output_ptr) != 0) {
        printf("close output stream fail\n");
        rc = -1;
    }
    return rc;
}
/* Per-frame scratch buffers used by voice_frame(). */
float one_in_dat[FRAME_LEN];   /* real part: windowed frame in, transform result out */
float one_in_dat_i[FRAME_LEN]; /* imaginary part passed to baselib_fft */
float res_dat[FRAME_LEN];      /* frame after the window has been divided out */
float one_out_dat[STEP_LEN];   /* samples emitted for a non-final frame */

/*
 * Converts a sample normalized to [-1, 1) back to 16-bit PCM.
 * Rounds to nearest instead of truncating (a round trip that yields
 * 99.99997 must give 100, not 99) and saturates, because converting an
 * out-of-range floating value to short is undefined behaviour.
 */
static short frame_to_pcm16(float normalized)
{
    double scaled = normalized * 32768.0;

    if (scaled != scaled) /* NaN */
        return 0;
    if (scaled >= 32767.0)
        return 32767;
    if (scaled <= -32768.0)
        return -32768;
    return (short)(scaled < 0.0 ? scaled - 0.5 : scaled + 0.5);
}

/*
 * Cuts one block into overlapping frames and processes them one by one.
 *
 * For each of the Nframes frames (FRAME_LEN samples, STEP_LEN apart) the
 * samples are normalized, multiplied by win[], passed through baselib_fft
 * twice (direction 1 then -1 -- presumably forward and inverse transform,
 * confirm against baselib.h) and divided by win[] again. Every frame except
 * the last contributes its first STEP_LEN samples to xout; the last frame is
 * written out in full.
 *
 *   x         input samples; at least (Nframes-1)*STEP_LEN+FRAME_LEN are read
 *   Nframes   number of frames; nothing is done when it is <= 0
 *   xout      output samples; the same index range as x is written
 *   blk_index index of the block in the stream (not used yet)
 */
void voice_frame(short *x, int Nframes, short *xout, int blk_index)
{
    int i, k, n;

    (void)blk_index; /* reserved for first-block initialisation */

    for (n = 0, k = 0; n < Nframes; n++, k += STEP_LEN) {
        /* input: normalize and apply the window */
        for (i = 0; i < FRAME_LEN; i++) {
            one_in_dat[i] = (x[k + i] / 32768.0) * win[i];
        }

        /*
         * process: memset counts bytes, so the whole array size is needed to
         * clear all FRAME_LEN floats of the imaginary part.
         */
        memset(one_in_dat_i, 0, sizeof one_in_dat_i);
        baselib_fft(one_in_dat, one_in_dat_i, 1, FRAME_LEN);
        baselib_fft(one_in_dat, one_in_dat_i, -1, FRAME_LEN);

        /* undo the window; win[] has no zero coefficient */
        for (i = 0; i < FRAME_LEN; i++) {
            res_dat[i] = one_in_dat[i] / win[i];
        }

        /* output: the last frame is emitted completely */
        if (n == Nframes - 1) {
            for (i = 0; i < FRAME_LEN; i++) {
                xout[k + i] = frame_to_pcm16(res_dat[i]);
            }
            return;
        }

        /* output: other frames emit only their first STEP_LEN samples */
        for (i = 0; i < STEP_LEN; i++) {
            one_out_dat[i] = res_dat[i];
            xout[k + i] = frame_to_pcm16(one_out_dat[i]);
        }
    }
}
#else
/*Linux平台下,计算一段代码的运行时长*/
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include "cmath.h"
/*
 * Workload used for the timing demo: calls Pow(s, s) for every s = outer+inner
 * with outer in [0, 5000) and inner in [0, 500), and returns the value of the
 * last call converted to int.
 */
int int_cal()
{
    int outer = 0;
    int result = 0;

    while (outer < 5000) {
        int inner;

        for (inner = 0; inner < 500; inner++) {
            int operand = outer + inner;

            result = Pow(operand, operand);
        }
        outer++;
    }
    return result;
}
/* Prints the time between two gettimeofday() readings in seconds. */
static void print_elapsed(struct timeval from, struct timeval to)
{
    int elapsed_us = 1000000 * ( to.tv_sec - from.tv_sec ) + to.tv_usec - from.tv_usec;

    printf("time=%.3fs\n", (double)elapsed_us/1000000);
}

/*
 * Timing demo: measures a 2-second sleep first, then one run of int_cal(),
 * printing the elapsed wall-clock time after each.
 */
int main()
{
    struct timeval t_begin, t_end;

    /* first measurement: a sleep of known length */
    gettimeofday(&t_begin, NULL);
    sleep(2);
    gettimeofday(&t_end, NULL);
    print_elapsed(t_begin, t_end);

    /* second measurement: the computation under test */
    gettimeofday(&t_begin, NULL);
    int_cal();
    gettimeofday(&t_end, NULL);
    print_elapsed(t_begin, t_end);
}
#endif
注:代码中依次进行数据归一化、加窗,然后是FFT/IFFT。输入是一个长为80k字节(81920字节)的音频,分块送入算法再输出(每块2k输入,2k输出,即每块2048个采样点、4096字节)。
X86平台上运行的结果:
inputdata_length:81920 count:81920 in_dat_len:2048 all_block:20 a block can div to Nframes:15 time=0.0002560000s time=0.0002500000s time=0.0002510000s time=0.0002500000s time=0.0002500000s time=0.0002500000s time=0.0002490000s time=0.0002670000s time=0.0002500000s time=0.0002500000s time=0.0002500000s time=0.0002500000s time=0.0002500000s time=0.0002500000s time=0.0002500000s time=0.0002500000s time=0.0002500000s time=0.0002500000s time=0.0002500000s time=0.0002500000s |
每个块的处理大概0.25ms(约0.3ms)就结束运行。
输入输出音频对比:
如果不进行归一化,运行时间会略有减少(见下面的运行结果)。代码修改:
/*
 * Converts a value already in 16-bit sample scale to short.
 * Rounds to nearest instead of truncating and saturates, because converting
 * an out-of-range floating value to short is undefined behaviour.
 */
static short raw_to_pcm16(float value)
{
    if (value != value) /* NaN */
        return 0;
    if (value >= 32767.0f)
        return 32767;
    if (value <= -32768.0f)
        return -32768;
    return (short)(value < 0.0f ? value - 0.5f : value + 0.5f);
}

/*
 * Variant of voice_frame() without normalization: samples stay in 16-bit
 * scale instead of being divided by 32768 on input and multiplied on output.
 *
 * For each of the Nframes frames (FRAME_LEN samples, STEP_LEN apart) the
 * samples are multiplied by win[], passed through baselib_fft twice
 * (direction 1 then -1 -- presumably forward and inverse transform, confirm
 * against baselib.h) and divided by win[] again. Every frame except the last
 * contributes its first STEP_LEN samples to xout; the last frame is written
 * out in full.
 *
 *   x         input samples; at least (Nframes-1)*STEP_LEN+FRAME_LEN are read
 *   Nframes   number of frames; nothing is done when it is <= 0
 *   xout      output samples; the same index range as x is written
 *   blk_index index of the block in the stream (not used yet)
 */
void voice_frame(short *x, int Nframes, short *xout, int blk_index)
{
    int i, k, n;

    (void)blk_index; /* reserved for first-block initialisation */

    for (n = 0, k = 0; n < Nframes; n++, k += STEP_LEN) {
        /* input: apply the window, no normalization */
        for (i = 0; i < FRAME_LEN; i++) {
            one_in_dat[i] = x[k + i] * win[i];
        }

        /*
         * process: memset counts bytes, so the whole array size is needed to
         * clear all FRAME_LEN floats of the imaginary part.
         */
        memset(one_in_dat_i, 0, sizeof one_in_dat_i);
        baselib_fft(one_in_dat, one_in_dat_i, 1, FRAME_LEN);
        baselib_fft(one_in_dat, one_in_dat_i, -1, FRAME_LEN);

        /* undo the window */
        for (i = 0; i < FRAME_LEN; i++) {
            res_dat[i] = one_in_dat[i] / win[i];
        }

        /* output: the last frame is emitted completely */
        if (n == Nframes - 1) {
            for (i = 0; i < FRAME_LEN; i++) {
                xout[k + i] = raw_to_pcm16(res_dat[i]);
            }
            return;
        }

        /* output: other frames emit only their first STEP_LEN samples */
        for (i = 0; i < STEP_LEN; i++) {
            one_out_dat[i] = res_dat[i];
            xout[k + i] = raw_to_pcm16(one_out_dat[i]);
        }
    }
}
运行结果:
inputdata_length:81920 count:81920 in_dat_len:2048 all_block:20 a block can div to Nframes:15 time=0.0002430000s time=0.0002390000s time=0.0002380000s time=0.0002380000s time=0.0002380000s time=0.0002380000s time=0.0002660000s time=0.0002620000s time=0.0002380000s time=0.0002380000s time=0.0002380000s time=0.0002380000s time=0.0002380000s time=0.0002380000s time=0.0002390000s time=0.0002380000s time=0.0002570000s time=0.0002390000s time=0.0002380000s time=0.0002380000s
每个块的处理大概0.24ms(约0.2ms)就结束运行。有了统计运行时间的手段,就可以对算法进行时间上以及空间上的优化。
(文章来源地址:https://www.toymoban.com/news/detail-606237.html)
到了这里,关于语音分帧简述的文章就介绍完了。如果您还想了解更多内容,请在右上角搜索TOY模板网以前的文章或继续浏览下面的相关文章,希望大家以后多多支持TOY模板网!