/*
 * Builds a smoothed masking curve from a power spectrum.
 *
 * The spectrum `ps` is first folded into per-band energies with
 * filterbank_compute_bank. Each band then receives a decayed share of its
 * lower neighbour (forward pass, decay_high) and of its upper neighbour
 * (backward pass, decay_low), and the result is expanded back to one value
 * per frequency bin with filterbank_compute_psd and written to `mask`.
 *
 * bank : filter bank describing the bin-to-band mapping
 * ps   : input power spectrum (read for indices 0 .. bank.len - 1)
 * mask : output curve (written for indices 0 .. bank.len - 1)
 */
public static void filterbank_psy_smooth(SpeexFilterBank bank, float[] ps, float[] mask)
{
    /* Low freq slope: 14 dB/Bark */
    /* High freq slope: 9 dB/Bark */
    /* Noise vs tone: 5 dB difference */

    /* Scratch band energies. Sized from the bank itself instead of the former
     * hard-coded 100 entries, which overflowed for banks with more than 100 bands. */
    var bark = new float[bank.nb_banks];

    /* Assumes 1/3 Bark resolution */
    float decay_low = 0.34145f;
    float decay_high = 0.50119f;

    filterbank_compute_bank(bank, ps, bark);

    /* Forward pass: every band picks up energy spread from the band below it. */
    for (int i = 1; i < bank.nb_banks; i++)
    {
        /*float decay_high = 13-1.6*log10(bark[i-1]);
         * decay_high = pow(10,(-decay_high/30.f));*/
        bark[i] = bark[i] + decay_high * bark[i - 1];
    }

    /* Backward pass: every band picks up energy spread from the band above it. */
    for (int i = bank.nb_banks - 2; i >= 0; i--)
    {
        bark[i] = bark[i] + decay_low * bark[i + 1];
    }

    filterbank_compute_psd(bank, bark, mask);
}
/*
 * Expands per-band values back to one value per frequency bin.
 *
 * For every bin the output is the sum of the bin's left band value weighted by
 * filter_left and its right band value weighted by filter_right.
 *
 * bank : filter bank providing bank_left/bank_right indices and filter weights
 * mel  : per-band input values, indexed by band
 * ps   : per-bin output, written for indices 0 .. bank.len - 1
 */
public static void filterbank_compute_psd(SpeexFilterBank bank, float[] mel, float[] ps)
{
    for (int bin = 0; bin < bank.len; bin++)
    {
        /* Contribution of the lower (left) band. */
        int band = bank.bank_left[bin];
        ps[bin] = mel[band] * bank.filter_left[bin];

        /* Add the contribution of the upper (right) band. */
        band = bank.bank_right[bin];
        ps[bin] += mel[band] * bank.filter_right[bin];
    }
}
/*
 * Expands per-band values, stored in `mel` starting at `meloffset`, back to one
 * value per frequency bin.
 *
 * Same interpolation as filterbank_compute_psd, but expressed with the
 * MULT16_16 / PSHR32 / EXTRACT16 helpers and reading the band values from an
 * offset inside `mel`.
 *
 * bank      : filter bank providing band indices and filter weights
 * mel       : array holding the band values at mel[meloffset + band]
 * meloffset : index of band 0 inside `mel`
 * ps        : per-bin output, written for indices 0 .. bank.len - 1
 */
public static void filterbank_compute_psd16(SpeexFilterBank bank, float[] mel, int meloffset, float[] ps)
{
    for (int bin = 0; bin < bank.len; bin++)
    {
        int leftBand = bank.bank_left[bin];
        int rightBand = bank.bank_right[bin];

        /* Weighted sum of the two neighbouring band values. */
        float acc = MULT16_16(mel[meloffset + leftBand], bank.filter_left[bin]);
        acc += MULT16_16(mel[meloffset + rightBand], bank.filter_right[bin]);

        ps[bin] = EXTRACT16(PSHR32(acc, 15));
    }
}
/*
 * Analysis stage for one input frame.
 *
 * Assembles the analysis frame from the saved input tail (st.inbuf) followed by
 * the new samples in `x`, saves the tail of `x` for the next call, applies the
 * window, runs the forward FFT into st.ft, computes the power spectrum into
 * st.ps[0 .. N-1] and finally appends the filter-bank energies at st.ps[N ..].
 *
 * x : input samples; st.frame_size values are read
 */
private void preprocess_analysis(short[] x)
{
    int N = st.ps_size;
    int N3 = 2 * N - st.frame_size;
    int N4 = st.frame_size - N3;
    float[] ps = st.ps;

    /* 'Build' input frame: saved overlap first ... */
    for (int k = 0; k < N3; k++)
    {
        st.frame[k] = st.inbuf[k];
    }
    /* ... followed by the new samples. */
    for (int k = 0; k < st.frame_size; k++)
    {
        st.frame[N3 + k] = x[k];
    }

    /* Update inbuf with the last N3 samples of this frame */
    for (int k = 0; k < N3; k++)
    {
        st.inbuf[k] = x[N4 + k];
    }

    /* Windowing */
    for (int k = 0; k < 2 * N; k++)
    {
        st.frame[k] = MULT16_16_Q15(st.frame[k], st.window[k]);
    }

    /* Perform FFT */
    st.fft.DoFft(st.frame, 0, st.ft, 0);

    /* Power spectrum: bin 0 comes from st.ft[0] alone, the others from the pair (2k-1, 2k) */
    ps[0] = MULT16_16(st.ft[0], st.ft[0]);
    for (int k = 1; k < N; k++)
    {
        ps[k] = MULT16_16(st.ft[2 * k - 1], st.ft[2 * k - 1])
              + MULT16_16(st.ft[2 * k], st.ft[2 * k]);
    }
    for (int k = 0; k < N; k++)
    {
        st.ps[k] = PSHR32(st.ps[k], 2 * frame_shift);
    }

    /* Band energies are stored right after the N spectrum bins of the same array. */
    SpeexFilterBank.filterbank_compute_bank32(st.bank, ps, ps, N);
}
/*
 * Folds a per-bin power spectrum into per-band energies.
 *
 * Each bin contributes to its left and right band with the corresponding
 * filter weight; every band total is then multiplied by bank.scaling.
 *
 * bank : filter bank providing band indices, filter weights and scaling
 * ps   : per-bin input, read for indices 0 .. bank.len - 1
 * mel  : per-band output, written for indices 0 .. bank.nb_banks - 1
 */
public static void filterbank_compute_bank(SpeexFilterBank bank, float[] ps, float[] mel)
{
    /* Start from empty bands. */
    for (int band = 0; band < bank.nb_banks; band++)
    {
        mel[band] = 0;
    }

    /* Distribute every bin over its two neighbouring bands. */
    for (int bin = 0; bin < bank.len; bin++)
    {
        int target = bank.bank_left[bin];
        mel[target] += bank.filter_left[bin] * ps[bin];

        target = bank.bank_right[bin];
        mel[target] += bank.filter_right[bin] * ps[bin];
    }

    /* Apply the per-band normalisation factor. */
    for (int band = 0; band < bank.nb_banks; band++)
    {
        mel[band] *= bank.scaling[band];
    }
}
/*
 * Folds a per-bin power spectrum into per-band energies stored at an offset.
 *
 * Works like filterbank_compute_bank, but writes the bands to
 * mel[meloffset + band], uses MULT16_32_P15 for the weighting, and does not
 * apply bank.scaling.
 *
 * bank      : filter bank providing band indices and filter weights
 * ps        : per-bin input, read for indices 0 .. bank.len - 1
 * mel       : output array; bands are written starting at `meloffset`
 * meloffset : index of band 0 inside `mel`
 */
public static void filterbank_compute_bank32(SpeexFilterBank bank, float[] ps, float[] mel, int meloffset)
{
    /* Clear the destination bands. */
    for (int band = 0; band < bank.nb_banks; band++)
    {
        mel[meloffset + band] = 0;
    }

    /* Accumulate every bin into its left and right band. */
    for (int bin = 0; bin < bank.len; bin++)
    {
        int target = bank.bank_left[bin];
        mel[meloffset + target] += MULT16_32_P15(bank.filter_left[bin], ps[bin]);

        target = bank.bank_right[bin];
        mel[meloffset + target] += MULT16_32_P15(bank.filter_right[bin], ps[bin]);
    }

    /* Think I can safely disable normalisation that for fixed-point (and probably float as well) */
    /*for (i=0;i<bank.nb_banks;i++)
     *    mel[i] = MULT16_32_P15(Q15(bank.scaling[i]),mel[i]);
     */
}
/*
 * Initialises the preprocessor state `st` for the given frame size and
 * sampling rate: stores the sizes, applies the default settings, creates the
 * filter bank and FFT, allocates all working buffers, builds the analysis /
 * synthesis window, and seeds the noise, SNR, gain and AGC variables.
 *
 * frameSize     : number of samples per processed frame
 * sampling_rate : sampling rate used for the filter bank, the loudness
 *                 weighting and the AGC step limits
 */
private void speex_preprocess_state_init(int frameSize, int sampling_rate)
{
    st.frame_size = frameSize;

    /* The spectrum size is taken directly from the frame size (no rounding is performed here) */
    st.ps_size = st.frame_size;

    int N = st.ps_size;
    int N3 = 2 * N - st.frame_size;
    int N4 = st.frame_size - N3;

    st.sampling_rate = sampling_rate;

    /* Default settings */
    // st.denoise_enabled = true;
    st.vad_enabled = false;
    st.dereverb_enabled = false;
    st.reverb_decay = 0;
    st.noise_suppress = NOISE_SUPPRESS_DEFAULT;
    st.echo_suppress = ECHO_SUPPRESS_DEFAULT;
    st.echo_suppress_active = ECHO_SUPPRESS_ACTIVE_DEFAULT;
    st.speech_prob_start = SPEECH_PROB_START_DEFAULT;
    st.speech_prob_continue = SPEECH_PROB_CONTINUE_DEFAULT;

    st.echo_state = null;

    st.nbands = NB_BANDS;
    int M = st.nbands;
    st.bank = SpeexFilterBank.filterbank_new(M, sampling_rate, N, 1);

    /* Time-domain / FFT buffers hold 2*N values */
    int fftLen = 2 * N;
    st.frame = new float[fftLen];
    st.window = new float[fftLen];
    st.ft = new float[fftLen];

    /* Spectral buffers hold N bins followed by M filter-bank bands */
    int specLen = N + M;
    st.ps = new float[specLen];
    st.noise = new float[specLen];
    st.echo_noise = new float[specLen];
    st.residual_echo = new float[specLen];
    st.reverb_estimate = new float[specLen];
    st.old_ps = new float[specLen];
    st.prior = new float[specLen];
    st.post = new float[specLen];
    st.gain = new float[specLen];
    st.gain2 = new float[specLen];
    st.gain_floor = new float[specLen];
    st.zeta = new float[specLen];

    /* Per-bin buffers */
    st.S = new float[N];
    st.Smin = new float[N];
    st.Stmp = new float[N];
    st.update_prob = new int[N];

    /* Overlap buffers */
    st.inbuf = new float[N3];
    st.outbuf = new float[N3];

    /* Window: conj_window fills the first 2*N3 entries, the rest is set to one */
    conj_window(st.window, 2 * N3);
    for (int k = 2 * N3; k < fftLen; k++)
    {
        st.window[k] = Q15_ONE;
    }

    /* When N4 > 0, move the second half of the window up by N4 and fill the gap with ones */
    if (N4 > 0)
    {
        for (int k = N3 - 1; k >= 0; k--)
        {
            st.window[k + N3 + N4] = st.window[k + N3];
            st.window[k + N3] = 1;
        }
    }

    /* Initial noise / SNR / gain values for all bins and bands */
    for (int k = 0; k < specLen; k++)
    {
        st.noise[k] = QCONST32(1.0f, NOISE_SHIFT);
        st.reverb_estimate[k] = 0;
        st.old_ps[k] = 1;
        st.gain[k] = Q15_ONE;
        st.post[k] = SHL16(1.0f, SNR_SHIFT);
        st.prior[k] = SHL16(1.0f, SNR_SHIFT);
    }

    for (int k = 0; k < N; k++)
    {
        st.update_prob[k] = 1;
    }

    for (int k = 0; k < N3; k++)
    {
        st.inbuf[k] = 0;
        st.outbuf[k] = 0;
    }

    /* AGC setup */
    // st.agc_enabled = false;
    st.agc_level = 8000;
    st.loudness_weight = new float[N];
    for (int k = 0; k < N; k++)
    {
        /* Frequency associated with bin k */
        float ff = (k) * .5f * sampling_rate / (N);

        /*st.loudness_weight[i] = .5f*(1.0f/(1.0f+ff/8000.0f))+1.0f*Math.Exp(-.5f*(ff-3800.0f)*(ff-3800.0f)/9e5f);*/
        float weight = (float)(.35f - .35f * ff / 16000.0f + .73f * Math.Exp(-.5f * (ff - 3800) * (ff - 3800) / 9e5f));
        if (weight < .01f)
        {
            weight = .01f;
        }

        /* The stored value is the square of the (floored) weight */
        st.loudness_weight[k] = weight;
        st.loudness_weight[k] *= st.loudness_weight[k];
    }

    /*st.loudness = pow(AMP_SCALE*st.agc_level,LOUDNESS_EXP);*/
    st.loudness = 1e-15f;
    st.agc_gain = 1;
    st.max_gain = 30;
    st.max_increase_step = (float)Math.Exp(0.11513f * 12.0f * st.frame_size / st.sampling_rate);
    st.max_decrease_step = (float)Math.Exp(-0.11513f * 40.0f * st.frame_size / st.sampling_rate);
    st.prev_loudness = 1;
    st.init_max = 1;

    st.was_speech = false;

    st.fft = new SpeexFft(fftLen);

    st.nb_adapt = 0;
    st.min_count = 0;
}
/*
 * Processes one frame of audio in place.
 *
 * Steps, in order: optional residual-echo estimate, analysis
 * (preprocess_analysis), noise estimate update, a posteriori / a priori SNR,
 * frame speech probability, gain floor, per-band and per-bin gains, gain
 * application on the spectrum, optional AGC, inverse FFT, synthesis window and
 * overlap-add back into `x`.
 *
 * x : st.frame_size input samples; overwritten with the processed samples.
 *
 * Returns 1 when VAD is disabled, or when VAD is enabled and the frame is
 * classified as speech; returns 0 when VAD is enabled and the frame is not
 * classified as speech.
 *
 * NOTE(review): `ps` and `config` are used below without a local declaration;
 * presumably they are members of the enclosing class (with `ps` referring to
 * the same array as st.ps) - confirm against the rest of the file.
 */
public int speex_preprocess_run(short[] x)
{
    int N = st.ps_size;
    int N3 = 2 * N - st.frame_size;
    int N4 = st.frame_size - N3;

    st.nb_adapt++;
    if (st.nb_adapt > 20000)
    {
        st.nb_adapt = 20000;
    }
    st.min_count++;

    /* Noise adaptation rate: 1/nb_adapt at start-up, never below .03 */
    float beta = Math.Max(QCONST16(.03f, 15), DIV32_16(Q15_ONE, st.nb_adapt));
    float beta_1 = Q15_ONE - beta;
    int M = st.nbands;

    /* Deal with residual echo if provided */
    if (st.echo_state != null && config.EnableAec)
    {
        // SpeexEchoCanceller.speex_echo_get_residual(st.echo_state, st.residual_echo, N);
        st.echo_state.GetResidual(st.residual_echo);

        /* If there are NaNs or ridiculous values, it'll show up in the DC and we just reset everything to zero */
        if (!(st.residual_echo[0] >= 0 && st.residual_echo[0] < N * 1e9f))
        {
            for (int i = 0; i < N; i++)
            {
                st.residual_echo[i] = 0;
            }
        }
        for (int i = 0; i < N; i++)
        {
            st.echo_noise[i] = Math.Max(MULT16_32_Q15(QCONST16(.6f, 15), st.echo_noise[i]), st.residual_echo[i]);
        }
        SpeexFilterBank.filterbank_compute_bank32(st.bank, st.echo_noise, st.echo_noise, N);
    }
    else
    {
        for (int i = 0; i < N + M; i++)
        {
            st.echo_noise[i] = 0;
        }
    }

    preprocess_analysis(x);

    update_noise_prob();

    /* Update the noise estimate for the frequencies where it can be */
    for (int i = 0; i < N; i++)
    {
        if (st.update_prob[i] == 0 || st.ps[i] < PSHR32(st.noise[i], NOISE_SHIFT))
        {
            st.noise[i] = Math.Max(EXTEND32(0), MULT16_32_Q15(beta_1, st.noise[i]) + MULT16_32_Q15(beta, SHL32(st.ps[i], NOISE_SHIFT)));
        }
    }
    SpeexFilterBank.filterbank_compute_bank32(st.bank, st.noise, st.noise, N);

    /* Special case for first frame */
    if (st.nb_adapt == 1)
    {
        for (int i = 0; i < N + M; i++)
        {
            st.old_ps[i] = ps[i];
        }
    }

    /* Compute a posteriori SNR */
    for (int i = 0; i < N + M; i++)
    {
        /* Total noise estimate including residual echo and reverberation */
        float tot_noise = ADD32(ADD32(ADD32(EXTEND32(1), PSHR32(st.noise[i], NOISE_SHIFT)), st.echo_noise[i]), st.reverb_estimate[i]);

        /* A posteriori SNR = ps/noise - 1*/
        st.post[i] = SUB16(DIV32_16_Q8(ps[i], tot_noise), QCONST16(1.0f, SNR_SHIFT));
        st.post[i] = Math.Min(st.post[i], QCONST16(100.0f, SNR_SHIFT));

        /* Computing update gamma = .1 + .9*(old/(old+noise))^2 */
        float gamma = QCONST16(.1f, 15) + MULT16_16_Q15(QCONST16(.89f, 15), SQR16_Q15(DIV32_16_Q15(st.old_ps[i], ADD32(st.old_ps[i], tot_noise))));

        /* A priori SNR update = gamma*max(0,post) + (1-gamma)*old/noise */
        st.prior[i] = EXTRACT16(PSHR32(ADD32(MULT16_16(gamma, Math.Max(0, st.post[i])), MULT16_16(Q15_ONE - gamma, DIV32_16_Q8(st.old_ps[i], tot_noise))), 15));
        st.prior[i] = Math.Min(st.prior[i], QCONST16(100.0f, SNR_SHIFT));
    }

    /*print_vec(st.post, N+M, "");*/

    /* Recursive average of the a priori SNR. A bit smoothed for the psd components */
    st.zeta[0] = PSHR32(ADD32(MULT16_16(QCONST16(.7f, 15), st.zeta[0]), MULT16_16(QCONST16(.3f, 15), st.prior[0])), 15);
    for (int i = 1; i < N - 1; i++)
    {
        st.zeta[i] = PSHR32(ADD32(ADD32(ADD32(MULT16_16(QCONST16(.7f, 15), st.zeta[i]), MULT16_16(QCONST16(.15f, 15), st.prior[i])), MULT16_16(QCONST16(.075f, 15), st.prior[i - 1])), MULT16_16(QCONST16(.075f, 15), st.prior[i + 1])), 15);
    }
    for (int i = N - 1; i < N + M; i++)
    {
        st.zeta[i] = PSHR32(ADD32(MULT16_16(QCONST16(.7f, 15), st.zeta[i]), MULT16_16(QCONST16(.3f, 15), st.prior[i])), 15);
    }

    /* Speech probability of presence for the entire frame is based on the average filterbank a priori SNR */
    float Zframe = 0;
    for (int i = N; i < N + M; i++)
    {
        Zframe = ADD32(Zframe, EXTEND32(st.zeta[i]));
    }
    float Pframe = QCONST16(.1f, 15) + MULT16_16_Q15(QCONST16(.899f, 15), qcurve(DIV32_16(Zframe, st.nbands)));

    /* Blend the idle and active echo suppression settings according to the frame speech probability */
    float effectiveEchoSuppress = EXTRACT16(PSHR32(ADD32(MULT16_16(SUB16(Q15_ONE, Pframe), st.echo_suppress), MULT16_16(Pframe, st.echo_suppress_active)), 15));

    compute_gain_floor(st.noise_suppress, (int)effectiveEchoSuppress, st.noise, N, st.echo_noise, N, st.gain_floor, N, M);

    /* Compute Ephraim & Malah gain speech probability of presence for each critical band (Bark scale)
     * Technically this is actually wrong because the EM gain assumes a slightly different probability
     * distribution */
    for (int i = N; i < N + M; i++)
    {
        /* Wiener filter gain */
        float priorRatio = PDIV32_16(SHL32(EXTEND32(st.prior[i]), 15), ADD16(st.prior[i], SHL32(1, SNR_SHIFT)));

        /* See EM and Cohen papers*/
        float theta = MULT16_32_P15(priorRatio, QCONST32(1.0f, EXPIN_SHIFT) + SHL32(EXTEND32(st.post[i]), EXPIN_SHIFT - SNR_SHIFT));

        /* Gain from hypergeometric function */
        float MM = hypergeom_gain(theta);

        /* Gain with bound */
        st.gain[i] = EXTRACT16(Math.Min(Q15_ONE, MULT16_32_Q15(priorRatio, MM)));

        /* Save old Bark power spectrum */
        st.old_ps[i] = MULT16_32_P15(QCONST16(.2f, 15), st.old_ps[i]) + MULT16_32_P15(MULT16_16_P15(QCONST16(.8f, 15), SQR16_Q15(st.gain[i])), ps[i]);

        /* A priori probability of speech presence based on Bark sub-band alone */
        float P1 = QCONST16(.199f, 15) + MULT16_16_Q15(QCONST16(.8f, 15), qcurve(st.zeta[i]));

        /* Speech absence a priori probability (considering sub-band and frame) */
        float q = Q15_ONE - MULT16_16_Q15(Pframe, P1);

        st.gain2[i] = 1 / (1.0f + (q / (1.0f - q)) * (1 + st.prior[i]) * (float)Math.Exp(-theta));
    }

    /* Convert the EM gains and speech prob to linear frequency */
    SpeexFilterBank.filterbank_compute_psd16(st.bank, st.gain2, N, st.gain2);
    SpeexFilterBank.filterbank_compute_psd16(st.bank, st.gain, N, st.gain);

    /* Linear gain resolution (best) */
    SpeexFilterBank.filterbank_compute_psd16(st.bank, st.gain_floor, N, st.gain_floor);

    /* Compute gain according to the Ephraim-Malah algorithm -- linear frequency */
    for (int i = 0; i < N; i++)
    {
        /* Wiener filter gain */
        float priorRatio = PDIV32_16(SHL32(EXTEND32(st.prior[i]), 15), ADD16(st.prior[i], SHL32(1, SNR_SHIFT)));
        float theta = MULT16_32_P15(priorRatio, QCONST32(1.0f, EXPIN_SHIFT) + SHL32(EXTEND32(st.post[i]), EXPIN_SHIFT - SNR_SHIFT));

        /* Optimal estimator for loudness domain */
        float MM = hypergeom_gain(theta);

        /* EM gain with bound */
        float g = EXTRACT16(Math.Min(Q15_ONE, MULT16_32_Q15(priorRatio, MM)));

        /* Interpolated speech probability of presence */
        float p = st.gain2[i];

        /* Constrain the gain to be close to the Bark scale gain */
        if (MULT16_16_Q15(QCONST16(.333f, 15), g) > st.gain[i])
        {
            g = MULT16_16(3, st.gain[i]);
        }
        st.gain[i] = g;

        /* Save old power spectrum */
        st.old_ps[i] = MULT16_32_P15(QCONST16(.2f, 15), st.old_ps[i]) + MULT16_32_P15(MULT16_16_P15(QCONST16(.8f, 15), SQR16_Q15(st.gain[i])), ps[i]);

        /* Apply gain floor */
        if (st.gain[i] < st.gain_floor[i])
        {
            st.gain[i] = st.gain_floor[i];
        }

        /* Exponential decay model for reverberation (unused) */
        /*st.reverb_estimate[i] = st.reverb_decay*st.reverb_estimate[i] + st.reverb_decay*st.reverb_level*st.gain[i]*st.gain[i]*st.ps[i];*/

        /* Take into account speech probability of presence (loudness domain MMSE estimator) */
        /* gain2 = [p*sqrt(gain)+(1-p)*sqrt(gain _floor) ]^2 */
        float tmp = MULT16_16_P15(p, (float)Math.Sqrt(SHL32(EXTEND32(st.gain[i]), 15))) + MULT16_16_P15(SUB16(Q15_ONE, p), (float)Math.Sqrt(SHL32(EXTEND32(st.gain_floor[i]), 15)));
        st.gain2[i] = SQR16_Q15(tmp);

        /* Use this if you want a log-domain MMSE estimator instead */
        /*st.gain2[i] = pow(st.gain[i], p) * pow(st.gain_floor[i],1.0f-p);*/
    }

    /* If noise suppression is off, don't apply the gain (but then why call this in the first place!) */
    if (!st.config.EnableDenoise)
    {
        for (int i = 0; i < N + M; i++)
        {
            st.gain2[i] = Q15_ONE;
        }
    }

    /* Apply computed gain */
    for (int i = 1; i < N; i++)
    {
        st.ft[2 * i - 1] = MULT16_16_P15(st.gain2[i], st.ft[2 * i - 1]);
        st.ft[2 * i] = MULT16_16_P15(st.gain2[i], st.ft[2 * i]);
    }
    st.ft[0] = MULT16_16_P15(st.gain2[0], st.ft[0]);
    st.ft[2 * N - 1] = MULT16_16_P15(st.gain2[N - 1], st.ft[2 * N - 1]);

    if (st.config.EnableAgc)
    {
        speex_compute_agc(Pframe, st.ft);
    }

    /* Inverse FFT with 1/N scaling */
    st.fft.DoIfft(st.ft, 0, st.frame, 0);

    /* Scale back to original (lower) amplitude */
    // for (i = 0; i < 2 * N; i++)
    //     st.frame[i] = PSHR16(st.frame[i], st.frame_shift);

    /*FIXME: This *will* not work for fixed-point */
    if (st.config.EnableAgc)
    {
        /* Scale the whole frame down when its peak exceeds 28000 */
        float maxSample = 0;
        for (int i = 0; i < 2 * N; i++)
        {
            if (Math.Abs(st.frame[i]) > maxSample)
            {
                maxSample = Math.Abs(st.frame[i]);
            }
        }
        if (maxSample > 28000.0f)
        {
            float damp = 28000.0f / maxSample;
            for (int i = 0; i < 2 * N; i++)
            {
                st.frame[i] *= damp;
            }
        }
    }

    /* Synthesis window (for WOLA) */
    for (int i = 0; i < 2 * N; i++)
    {
        st.frame[i] = MULT16_16_Q15(st.frame[i], st.window[i]);
    }

    /* Perform overlap and add.
     * The float result is clamped to the 16-bit range before the cast: converting an
     * out-of-range float to short does not saturate, so without the clamp an overshoot
     * could come out as a sample of the wrong sign/magnitude. In-range values are
     * converted exactly as before (truncation toward zero). */
    for (int i = 0; i < N3; i++)
    {
        float mixed = st.outbuf[i] + st.frame[i];
        x[i] = (short)Math.Max(-32768.0f, Math.Min(32767.0f, mixed));
    }
    for (int i = 0; i < N4; i++)
    {
        float direct = st.frame[N3 + i];
        x[N3 + i] = (short)Math.Max(-32768.0f, Math.Min(32767.0f, direct));
    }

    /* Update outbuf */
    for (int i = 0; i < N3; i++)
    {
        st.outbuf[i] = st.frame[st.frame_size + i];
    }

    /* FIXME: This VAD is a kludge */
    st.speech_prob = Pframe;
    if (st.vad_enabled)
    {
        /* Hysteresis: a higher threshold to enter speech than to stay in it */
        if (st.speech_prob > st.speech_prob_start || (st.was_speech && st.speech_prob > st.speech_prob_continue))
        {
            st.was_speech = true;
            return 1;
        }
        else
        {
            st.was_speech = false;
            return 0;
        }
    }
    else
    {
        return 1;
    }
}
/*
 * Releases a filter bank.
 *
 * Does nothing: the bank's arrays are left to the garbage collector.
 *
 * bank : filter bank to release (not accessed)
 */
public static void filterbank_destroy(SpeexFilterBank bank)
{
    /* Intentionally empty - no-op due to GC. */
}
/*
 * Creates a filter bank that maps `len` frequency bins onto `banks` bands.
 *
 * Each bin's frequency is converted with toBARK; the bin is then assigned to
 * two adjacent bands (bank_left / bank_right) with complementary weights
 * (filter_left = Q15_ONE - val, filter_right = val). Finally, scaling[band] is
 * set to Q15_ONE divided by the sum of all weights assigned to that band.
 *
 * banks    : number of bands
 * sampling : sampling rate
 * len      : number of frequency bins
 * type     : not used by this implementation
 *
 * Returns the newly created filter bank.
 */
public static SpeexFilterBank filterbank_new(int banks, float sampling, int len, int type)
{
    /* Frequency step between bins, and the converted-scale range split into banks-1 intervals */
    float binStep = DIV32(SHL32(sampling, 15), MULT16_16(2, len));
    float topMel = toBARK(EXTRACT16(sampling / 2));
    float bandStep = PDIV32(topMel, banks - 1);

    var bank = new SpeexFilterBank();
    bank.nb_banks = banks;
    bank.len = len;
    bank.bank_left = new int[len];
    bank.bank_right = new int[len];
    bank.filter_left = new float[len];
    bank.filter_right = new float[len];
    /* Think I can safely disable normalisation that for fixed-point (and probably float as well) */
    bank.scaling = new float[banks];

    for (int bin = 0; bin < len; bin++)
    {
        float binFreq = EXTRACT16(MULT16_32_P15(bin, binStep));
        float binMel = toBARK(binFreq);
        if (binMel > topMel)
        {
            /* Remaining bins keep their default (zero) indices and weights */
            break;
        }

        float weight;
        int lower = (int)(Math.Floor(binMel / bandStep));
        if (lower > banks - 2)
        {
            /* Past the last interval: pin to the top pair of bands, all weight on the upper one */
            lower = banks - 2;
            weight = Q15_ONE;
        }
        else
        {
            /* Position of the bin inside its interval */
            weight = DIV32_16(binMel - lower * bandStep, EXTRACT16(PSHR32(bandStep, 15)));
        }
        int upper = lower + 1;

        bank.bank_left[bin] = lower;
        bank.filter_left[bin] = SUB16(Q15_ONE, weight);
        bank.bank_right[bin] = upper;
        bank.filter_right[bin] = weight;
    }

    /* Think I can safely disable normalisation for fixed-point (and probably float as well) */
    for (int band = 0; band < bank.nb_banks; band++)
    {
        bank.scaling[band] = 0;
    }
    /* Sum the weights that land on each band ... */
    for (int bin = 0; bin < bank.len; bin++)
    {
        int band = bank.bank_left[bin];
        bank.scaling[band] += bank.filter_left[bin];
        band = bank.bank_right[bin];
        bank.scaling[band] += bank.filter_right[bin];
    }
    /* ... and store the reciprocal as the band's scaling factor */
    for (int band = 0; band < bank.nb_banks; band++)
    {
        bank.scaling[band] = Q15_ONE / (bank.scaling[band]);
    }

    return bank;
}