/// <summary>
/// Long Term Prediction Unquantification.
/// </summary>
/// <remarks>
/// Abstract contract only. The parameter notes below are inferred from the parameter
/// names and are assumptions to be confirmed against the concrete overrides.
/// </remarks>
/// <param name="exc">Excitation buffer (presumably updated in place - confirm).</param>
/// <param name="es">Offset into <paramref name="exc"/> (presumably the start of the current subframe - confirm).</param>
/// <param name="start">Smallest pitch value (presumably the decoded index is relative to it - confirm).</param>
/// <param name="pitch_coef">Pitch coefficient; usage is implementation specific.</param>
/// <param name="nsf">Number of samples in the subframe.</param>
/// <param name="gain_val">Output array (presumably receives the decoded pitch gains - confirm).</param>
/// <param name="bits">Bit stream the parameters are read from.</param>
/// <param name="count_lost">Count of lost packets (presumably 0 means no loss - confirm).</param>
/// <param name="subframe_offset">Offset of the subframe; usage is implementation specific.</param>
/// <param name="last_pitch_gain">Pitch gain from a previous frame; usage is implementation specific.</param>
/// <returns>pitch</returns>
public abstract int Unquant(
    float[] exc, int es, int start, float pitch_coef, int nsf, float[] gain_val,
    Bits bits, int count_lost, int subframe_offset, float last_pitch_gain);
/// <summary>
/// Long Term Prediction Quantification.
/// </summary>
/// <remarks>
/// Abstract contract only. The parameter notes below are inferred from the parameter
/// names and are assumptions to be confirmed against the concrete overrides.
/// </remarks>
/// <param name="target">Target vector.</param>
/// <param name="sw">Weighted signal buffer (presumably used for the open-loop search - confirm).</param>
/// <param name="sws">Offset into <paramref name="sw"/>.</param>
/// <param name="ak">LPCs for this subframe.</param>
/// <param name="awk1">Weighted LPCs #1 for this subframe.</param>
/// <param name="awk2">Weighted LPCs #2 for this subframe.</param>
/// <param name="exc">Excitation buffer (presumably written in place - confirm).</param>
/// <param name="es">Offset into <paramref name="exc"/>.</param>
/// <param name="start">Smallest pitch value allowed.</param>
/// <param name="end">Largest pitch value allowed.</param>
/// <param name="pitch_coef">Pitch coefficient; usage is implementation specific.</param>
/// <param name="p">Number of LPC coefficients.</param>
/// <param name="nsf">Number of samples in the subframe.</param>
/// <param name="bits">Bit stream the parameters are written to.</param>
/// <param name="exc2">Second excitation buffer; usage is implementation specific.</param>
/// <param name="e2s">Offset into <paramref name="exc2"/>.</param>
/// <param name="r">Response vector; usage is implementation specific.</param>
/// <param name="complexity">Encoder complexity setting.</param>
/// <returns>pitch</returns>
public abstract int Quant(
    float[] target, float[] sw, int sws, float[] ak, float[] awk1, float[] awk2,
    float[] exc, int es, int start, int end, float pitch_coef, int p, int nsf,
    Bits bits, float[] exc2, int e2s, float[] r, int complexity);
/// <summary>
/// Encode the given input signal.
/// </summary>
/// <remarks>
/// The input is split into two sub-bands; the low band is handed to <c>lowenc</c> and the
/// high band is encoded here. Before returning, <paramref name="ins0"/> is overwritten
/// with <c>2 * (y0 - y1)</c>, the signal rebuilt from both bands.
/// </remarks>
/// <param name="bits">Bit stream that receives the encoded parameters.</param>
/// <param name="ins0">Input samples; overwritten with the reconstructed signal.</param>
/// <returns>
/// 0 when the low-band encoder reports mode 0 (the <c>dtx</c> path), otherwise 1.
/// </returns>
public virtual int Encode(Bits bits, float[] ins0)
{
    int i;
    float[] mem, innov, syn_resp;
    float[] low_pi_gain, low_exc, low_innov;
    int dtx;
    /* Compute the two sub-bands by filtering with h0 and h1 */
    NSpeex.Filters.Qmf_decomp(ins0, NSpeex.Codebook_Constants.h0, x0d, x1d, fullFrameSize, NSpeex.SbCodec.QMF_ORDER, h0_mem);
    /* Encode the narrowband part */
    lowenc.Encode(bits, x0d);
    /* High-band buffering / sync with low band */
    for (i = 0; i < windowSize - frameSize; i++)
    {
        high[i] = high[frameSize + i];
    }
    for (i = 0; i < frameSize; i++)
    {
        high[windowSize - frameSize + i] = x1d[i];
    }
    /* Slide the excitation history back by one frame */
    System.Array.Copy(excBuf, frameSize, excBuf, 0, bufSize - frameSize);
    low_pi_gain = lowenc.PiGain;
    low_exc = lowenc.Exc;
    low_innov = lowenc.Innov;
    int low_mode = lowenc.Mode;
    /* Low-band mode 0 selects the dtx path: the high band is not transmitted */
    if (low_mode == 0)
    {
        dtx = 1;
    }
    else
    {
        dtx = 0;
    }
    /* Start encoding the high-band */
    for (i = 0; i < windowSize; i++)
    {
        buf[i] = high[i] * window[i];
    }
    /* Compute auto-correlation */
    NSpeex.Lpc.Autocorr(buf, autocorr, lpcSize + 1, windowSize);
    autocorr[0] += 1; /* prevents NANs */
    autocorr[0] *= lpc_floor; /* Noise floor in auto-correlation domain */
    /* Lag windowing: equivalent to filtering in the power-spectrum domain */
    for (i = 0; i < lpcSize + 1; i++)
    {
        autocorr[i] *= lagWindow[i];
    }
    /* Levinson-Durbin */
    NSpeex.Lpc.Wld(lpc, autocorr, rc, lpcSize); // tmperr
    /* Shift the coefficients up by one slot so that lpc[0] can hold the leading 1 */
    System.Array.Copy(lpc, 0, lpc, 1, lpcSize);
    lpc[0] = 1;
    /* LPC to LSPs (x-domain) transform; retried with different search parameters on failure */
    int roots = NSpeex.Lsp.Lpc2lsp(lpc, lpcSize, lsp, 15, 0.2f);
    if (roots != lpcSize)
    {
        roots = NSpeex.Lsp.Lpc2lsp(lpc, lpcSize, lsp, 11, 0.02f);
        if (roots != lpcSize)
        {
            /*
             * If we can't find all LSP's, do some damage control and use a
             * flat filter
             */
            for (i = 0; i < lpcSize; i++)
            {
                lsp[i] = (float)System.Math.Cos(System.Math.PI * ((float)(i + 1)) / (lpcSize + 1));
            }
        }
    }
    /* x-domain to angle domain */
    for (i = 0; i < lpcSize; i++)
    {
        lsp[i] = (float)System.Math.Acos(lsp[i]);
    }
    /* NOTE(review): lsp_dist is accumulated here but not read again in this method */
    float lsp_dist = 0;
    for (i = 0; i < lpcSize; i++)
    {
        lsp_dist += (old_lsp[i] - lsp[i]) * (old_lsp[i] - lsp[i]);
    }
    /* VBR stuff */
    if ((vbr_enabled != 0 || vad_enabled != 0) && dtx == 0)
    {
        float e_low = 0, e_high = 0;
        float ratio;
        if (abr_enabled != 0)
        {
            float qual_change = 0;
            if (abr_drift2 * abr_drift > 0)
            {
                /*
                 * Only adapt if long-term and short-term drift are the same
                 * sign
                 */
                qual_change = -.00001f * abr_drift / (1 + abr_count);
                if (qual_change > .1f)
                {
                    qual_change = .1f;
                }
                if (qual_change < -.1f)
                {
                    qual_change = -.1f;
                }
            }
            /* Apply the change and keep vbr_quality inside [0, 10] */
            vbr_quality += qual_change;
            if (vbr_quality > 10)
            {
                vbr_quality = 10;
            }
            if (vbr_quality < 0)
            {
                vbr_quality = 0;
            }
        }
        /* Energies of the low-band and high-band signals over one frame */
        for (i = 0; i < frameSize; i++)
        {
            e_low += x0d[i] * x0d[i];
            e_high += high[i] * high[i];
        }
        /* Log energy ratio, clamped to [-4, 2] below */
        ratio = (float)Math.Log((1 + e_high) / (1 + e_low));
        relative_quality = lowenc.RelativeQuality;
        if (ratio < -4)
        {
            ratio = -4;
        }
        if (ratio > 2)
        {
            ratio = 2;
        }
        /* if (ratio>-2) */
        if (vbr_enabled != 0)
        {
            int modeid;
            modeid = nb_modes - 1;
            relative_quality += 1.0f * (ratio + 2);
            if (relative_quality < -1)
            {
                relative_quality = -1;
            }
            /* Walk down from the highest mode until relative_quality reaches the threshold */
            while (modeid != 0)
            {
                int v1;
                float thresh;
                v1 = (int)Math.Floor(vbr_quality);
                if (v1 == 10)
                {
                    /* Last table column: there is no v1 + 1 entry to interpolate with */
                    thresh = NSpeex.Vbr.hb_thresh[modeid][v1];
                }
                else
                {
                    /* Linear interpolation between the two neighbouring quality columns */
                    thresh = (vbr_quality - v1) * NSpeex.Vbr.hb_thresh[modeid][v1 + 1] + (1 + v1 - vbr_quality) * NSpeex.Vbr.hb_thresh[modeid][v1];
                }
                if (relative_quality >= thresh)
                {
                    break;
                }
                modeid--;
            }
            Mode = modeid;
            if (abr_enabled != 0)
            {
                int bitrate;
                bitrate = BitRate;
                /* Track the deviation of the current bitrate from abr_enabled */
                abr_drift += (bitrate - abr_enabled);
                abr_drift2 = .95f * abr_drift2 + .05f * (bitrate - abr_enabled);
                abr_count += 1.0f;
            }
        }
        else
        {
            /* VAD only */
            int modeid_0;
            if (relative_quality < 2.0d)
            {
                modeid_0 = 1;
            }
            else
            {
                modeid_0 = submodeSelect;
            }
            /* speex_encoder_ctl(state, SPEEX_SET_MODE, &mode); */
            submodeID = modeid_0;
        }
        /* fprintf (stderr, "%f %f\n", ratio, low_qual); */
    }
    /* One flag bit set to 1, followed by the high-band submode id (0 on the dtx path) */
    bits.Pack(1, 1);
    if (dtx != 0)
    {
        bits.Pack(0, NSpeex.SbCodec.SB_SUBMODE_BITS);
    }
    else
    {
        bits.Pack(submodeID, NSpeex.SbCodec.SB_SUBMODE_BITS);
    }
    /* If null mode (no transmission), just set a couple things to zero */
    if (dtx != 0 || submodes[submodeID] == null)
    {
        for (i = 0; i < frameSize; i++)
        {
            excBuf[excIdx + i] = swBuf[i] = NSpeex.NbCodec.VERY_SMALL;
        }
        for (i = 0; i < lpcSize; i++)
        {
            mem_sw[i] = 0;
        }
        first = 1;
        /* Final signal synthesis from excitation */
        NSpeex.Filters.Iir_mem2(excBuf, excIdx, interp_qlpc, high, 0, subframeSize, lpcSize, mem_sp);
        /* Reconstruct the original */
        filters.Fir_mem_up(x0d, NSpeex.Codebook_Constants.h0, y0, fullFrameSize, NSpeex.SbCodec.QMF_ORDER, g0_mem);
        filters.Fir_mem_up(high, NSpeex.Codebook_Constants.h1, y1, fullFrameSize, NSpeex.SbCodec.QMF_ORDER, g1_mem);
        for (i = 0; i < fullFrameSize; i++)
        {
            ins0[i] = 2 * (y0[i] - y1[i]);
        }
        /* 0 on the dtx path, 1 when the selected submode is simply null */
        if (dtx != 0)
        {
            return(0);
        }
        else
        {
            return(1);
        }
    }
    /* LSP quantization */
    submodes[submodeID].LsqQuant.Quant(lsp, qlsp, lpcSize, bits);
    /* When the first flag is set there is no usable history: seed it with the current LSPs */
    if (first != 0)
    {
        for (i = 0; i < lpcSize; i++)
        {
            old_lsp[i] = lsp[i];
        }
        for (i = 0; i < lpcSize; i++)
        {
            old_qlsp[i] = qlsp[i];
        }
    }
    mem = new float[lpcSize];
    syn_resp = new float[subframeSize];
    innov = new float[subframeSize];
    for (int sub = 0; sub < nbSubframes; sub++)
    {
        float tmp, filter_ratio;
        int exc, sp, sw, resp;
        int offset;
        float rl, rh, eh = 0, el = 0;
        int fold;
        /* Offsets of this subframe inside high (sp), excBuf (exc), res (resp) and swBuf (sw) */
        offset = subframeSize * sub;
        sp = offset;
        exc = excIdx + offset;
        resp = offset;
        sw = offset;
        /* LSP interpolation (quantized and unquantized) */
        tmp = (1.0f + sub) / nbSubframes;
        for (i = 0; i < lpcSize; i++)
        {
            interp_lsp[i] = (1 - tmp) * old_lsp[i] + tmp * lsp[i];
        }
        for (i = 0; i < lpcSize; i++)
        {
            interp_qlsp[i] = (1 - tmp) * old_qlsp[i] + tmp * qlsp[i];
        }
        NSpeex.Lsp.Enforce_margin(interp_lsp, lpcSize, .05f);
        NSpeex.Lsp.Enforce_margin(interp_qlsp, lpcSize, .05f);
        /* Compute interpolated LPCs (quantized and unquantized) */
        for (i = 0; i < lpcSize; i++)
        {
            interp_lsp[i] = (float)System.Math.Cos(interp_lsp[i]);
        }
        for (i = 0; i < lpcSize; i++)
        {
            interp_qlsp[i] = (float)System.Math.Cos(interp_qlsp[i]);
        }
        m_lsp.Lsp2lpc(interp_lsp, interp_lpc, lpcSize);
        m_lsp.Lsp2lpc(interp_qlsp, interp_qlpc, lpcSize);
        NSpeex.Filters.Bw_lpc(gamma1, interp_lpc, bw_lpc1, lpcSize);
        NSpeex.Filters.Bw_lpc(gamma2, interp_lpc, bw_lpc2, lpcSize);
        /*
         * Compute mid-band (4000 Hz for wideband) response of low-band and
         * high-band filters
         */
        rl = rh = 0;
        tmp = 1;
        pi_gain[sub] = 0;
        /* rh is the alternating-sign sum of the coefficients, pi_gain[sub] the plain sum */
        for (i = 0; i <= lpcSize; i++)
        {
            rh += tmp * interp_qlpc[i];
            tmp = -tmp;
            pi_gain[sub] += interp_qlpc[i];
        }
        rl = low_pi_gain[sub];
        rl = 1 / (Math.Abs(rl) + .01f);
        rh = 1 / (Math.Abs(rh) + .01f);
        /* Compute ratio, will help predict the gain */
        filter_ratio = Math.Abs(.01f + rh) / (.01f + Math.Abs(rl));
        /* NOTE(review): fold is forced to 0 right below and is not read afterwards */
        fold = (filter_ratio < 5) ? 1 : 0;
        /* printf ("filter_ratio %f\n", filter_ratio); */
        fold = 0;
        /* Compute "real excitation" */
        NSpeex.Filters.Fir_mem2(high, sp, interp_qlpc, excBuf, exc, subframeSize, lpcSize, mem_sp2);
        /* Compute energy of low-band and high-band excitation */
        for (i = 0; i < subframeSize; i++)
        {
            eh += excBuf[exc + i] * excBuf[exc + i];
        }
        if (submodes[submodeID].Innovation == null)
        {
            /*
             * 1 for spectral
             * folding
             * excitation, 0 for
             * stochastic
             */
            float g;
            /* speex_bits_pack(bits, 1, 1); */
            for (i = 0; i < subframeSize; i++)
            {
                el += low_innov[offset + i] * low_innov[offset + i];
            }
            /*
             * Gain to use if we want to use the low-band excitation for
             * high-band
             */
            g = eh / (.01f + el);
            g = (float)Math.Sqrt(g);
            g *= filter_ratio;
            /* print_vec(&g, 1, "gain factor"); */
            /* Gain quantization */
            {
                /* Log-domain index, clamped to the 5-bit range [0, 31] */
                int quant = (int)Math.Floor(.5d + 10 + 8.0d * Math.Log((g + .0001d)));
                /* speex_warning_int("tata", quant); */
                if (quant < 0)
                {
                    quant = 0;
                }
                if (quant > 31)
                {
                    quant = 31;
                }
                bits.Pack(quant, 5);
                g = (float)(.1d * Math.Exp(quant / 9.4d));
            }
            /* printf ("folding gain: %f\n", g); */
            g /= filter_ratio;
        }
        else
        {
            float gc, scale, scale_1;
            for (i = 0; i < subframeSize; i++)
            {
                el += low_exc[offset + i] * low_exc[offset + i];
            }
            /* speex_bits_pack(bits, 0, 1); */
            gc = (float)(Math.Sqrt(1 + eh) * filter_ratio / Math.Sqrt((1 + el) * subframeSize));
            {
                /* Log-domain index, clamped to the 4-bit range [0, 15] */
                int qgc = (int)Math.Floor(.5d + 3.7d * (Math.Log(gc) + 2));
                if (qgc < 0)
                {
                    qgc = 0;
                }
                if (qgc > 15)
                {
                    qgc = 15;
                }
                bits.Pack(qgc, 4);
                /* Continue with the de-quantized gain */
                gc = (float)Math.Exp((1 / 3.7d) * qgc - 2);
            }
            scale = gc * (float)Math.Sqrt(1 + el) / filter_ratio;
            scale_1 = 1 / scale;
            /* Put a unit impulse in the excitation slot to obtain syn_resp */
            for (i = 0; i < subframeSize; i++)
            {
                excBuf[exc + i] = 0;
            }
            excBuf[exc] = 1;
            NSpeex.Filters.Syn_percep_zero(excBuf, exc, interp_qlpc, bw_lpc1, bw_lpc2, syn_resp, subframeSize, lpcSize);
            /* Reset excitation */
            for (i = 0; i < subframeSize; i++)
            {
                excBuf[exc + i] = 0;
            }
            /*
             * Compute zero response (ringing) of A(z/g1) / ( A(z/g2) *
             * Aq(z) )
             */
            /* The filters below run on the scratch copy mem, not on mem_sp / mem_sw themselves */
            for (i = 0; i < lpcSize; i++)
            {
                mem[i] = mem_sp[i];
            }
            NSpeex.Filters.Iir_mem2(excBuf, exc, interp_qlpc, excBuf, exc, subframeSize, lpcSize, mem);
            for (i = 0; i < lpcSize; i++)
            {
                mem[i] = mem_sw[i];
            }
            NSpeex.Filters.Filter_mem2(excBuf, exc, bw_lpc1, bw_lpc2, res, resp, subframeSize, lpcSize, mem, 0);
            /* Compute weighted signal */
            for (i = 0; i < lpcSize; i++)
            {
                mem[i] = mem_sw[i];
            }
            NSpeex.Filters.Filter_mem2(high, sp, bw_lpc1, bw_lpc2, swBuf, sw, subframeSize, lpcSize, mem, 0);
            /* Compute target signal */
            for (i = 0; i < subframeSize; i++)
            {
                target[i] = swBuf[sw + i] - res[resp + i];
            }
            for (i = 0; i < subframeSize; i++)
            {
                excBuf[exc + i] = 0;
            }
            /* Normalise the target by the quantized gain */
            for (i = 0; i < subframeSize; i++)
            {
                target[i] *= scale_1;
            }
            /* Reset excitation */
            for (i = 0; i < subframeSize; i++)
            {
                innov[i] = 0;
            }
            /* print_vec(target, st->subframeSize, "\ntarget"); */
            submodes[submodeID].Innovation.Quantify(target, interp_qlpc, bw_lpc1, bw_lpc2, lpcSize, subframeSize, innov, 0, syn_resp, bits, (complexity + 1) >> 1);
            /* print_vec(target, st->subframeSize, "after"); */
            for (i = 0; i < subframeSize; i++)
            {
                excBuf[exc + i] += innov[i] * scale;
            }
            if (submodes[submodeID].DoubleCodebook != 0)
            {
                /* Second codebook pass: target scaled by 2.5, result scaled back by 1 / 2.5 */
                float[] innov2 = new float[subframeSize];
                for (i = 0; i < subframeSize; i++)
                {
                    innov2[i] = 0;
                }
                for (i = 0; i < subframeSize; i++)
                {
                    target[i] *= 2.5f;
                }
                submodes[submodeID].Innovation.Quantify(target, interp_qlpc, bw_lpc1, bw_lpc2, lpcSize, subframeSize, innov2, 0, syn_resp, bits, (complexity + 1) >> 1);
                for (i = 0; i < subframeSize; i++)
                {
                    innov2[i] *= (float)(scale * (1 / 2.5d));
                }
                for (i = 0; i < subframeSize; i++)
                {
                    excBuf[exc + i] += innov2[i];
                }
            }
        }
        /* Keep the previous memory */
        for (i = 0; i < lpcSize; i++)
        {
            mem[i] = mem_sp[i];
        }
        /* Final signal synthesis from excitation */
        NSpeex.Filters.Iir_mem2(excBuf, exc, interp_qlpc, high, sp, subframeSize, lpcSize, mem_sp);
        /*
         * Compute weighted signal again, from synthesized speech (not sure
         * it's the right thing)
         */
        NSpeex.Filters.Filter_mem2(high, sp, bw_lpc1, bw_lpc2, swBuf, sw, subframeSize, lpcSize, mem_sw, 0);
    }
    // #ifndef RELEASE
    /* Reconstruct the original */
    filters.Fir_mem_up(x0d, NSpeex.Codebook_Constants.h0, y0, fullFrameSize, NSpeex.SbCodec.QMF_ORDER, g0_mem);
    filters.Fir_mem_up(high, NSpeex.Codebook_Constants.h1, y1, fullFrameSize, NSpeex.SbCodec.QMF_ORDER, g1_mem);
    for (i = 0; i < fullFrameSize; i++)
    {
        ins0[i] = 2 * (y0[i] - y1[i]);
    }
    // #endif
    /* Remember this frame's LSPs for the interpolation done on the next call */
    for (i = 0; i < lpcSize; i++)
    {
        old_lsp[i] = lsp[i];
    }
    for (i = 0; i < lpcSize; i++)
    {
        old_qlsp[i] = qlsp[i];
    }
    first = 0;
    return(1);
}
/// <summary>
/// Long Term Prediction Quantification (3Tap).
/// </summary>
/// <remarks>
/// Picks up to <c>min(complexity, 10)</c> open-loop pitch candidates, runs the 3-tap
/// gain search on each one and keeps the candidate with the smallest error. The winning
/// pitch (relative to <paramref name="start"/>) and gain codebook index are packed into
/// <paramref name="bits"/>, and the matching excitation is written to
/// <paramref name="exc"/> at <paramref name="es"/>.
/// </remarks>
/// <param name="target">Target vector.</param>
/// <param name="sw">Signal buffer handed to the open-loop pitch search.</param>
/// <param name="sws">Offset into <paramref name="sw"/>.</param>
/// <param name="ak">LPCs for this subframe.</param>
/// <param name="awk1">Weighted LPCs #1 for this subframe.</param>
/// <param name="awk2">Weighted LPCs #2 for this subframe.</param>
/// <param name="exc">Excitation buffer; <paramref name="nsf"/> samples are overwritten.</param>
/// <param name="es">Offset into <paramref name="exc"/> where the subframe starts.</param>
/// <param name="start">Smallest pitch value allowed.</param>
/// <param name="end">Largest pitch value allowed.</param>
/// <param name="pitch_coef">Not used by this implementation.</param>
/// <param name="p">Number of LPC coefficients.</param>
/// <param name="nsf">Number of samples in the subframe.</param>
/// <param name="bits">Bit stream that receives the pitch and gain indices.</param>
/// <param name="exc2">Excitation history passed through to the gain search.</param>
/// <param name="e2s">Offset into <paramref name="exc2"/>.</param>
/// <param name="r">Response vector passed through to the gain search.</param>
/// <param name="complexity">Number of candidates to evaluate (capped at 10).</param>
/// <returns>
/// The selected pitch, i.e. the value written to the bit stream plus
/// <paramref name="start"/>; <paramref name="start"/> itself when no search is done.
/// </returns>
public sealed override int Quant(
    float[] target, float[] sw, int sws, float[] ak, float[] awk1, float[] awk2,
    float[] exc, int es, int start, int end, float pitch_coef, int p, int nsf,
    Bits bits, float[] exc2, int e2s, float[] r, int complexity)
{
    int i, j;

    int N = complexity;
    if (N > 10)
    {
        N = 10;
    }

    // Guard before allocating: a non-positive candidate count (or an empty pitch
    // range) means there is nothing to search, so emit zero indices and silence.
    if (N <= 0 || end < start)
    {
        bits.Pack(0, pitch_bits);
        bits.Pack(0, gain_bits);
        for (i = 0; i < nsf; i++)
        {
            exc[es + i] = 0;
        }
        return(start);
    }

    int[] nbest = new int[N];
    float[] gains = new float[N];

    // Never ask for more candidates than there are pitch values in [start, end].
    if (N > end - start + 1)
    {
        N = end - start + 1;
    }

    int[] cdbk_index = new int[1];
    float[] best_exc = new float[nsf];
    int best_pitch = 0;
    int best_gain_index = 0;
    float best_err = -1; // negative means "no candidate accepted yet"

    NSpeex.Ltp.Open_loop_nbest_pitch(sw, sws, start, end, nsf, nbest, gains, N);

    for (i = 0; i < N; i++)
    {
        int pitch = nbest[i];
        for (j = 0; j < nsf; j++)
        {
            exc[es + j] = 0;
        }
        float err = Pitch_gain_search_3tap(target, ak, awk1, awk2, exc, es, pitch, p, nsf, bits, exc2, e2s, r, cdbk_index);
        if (err < best_err || best_err < 0)
        {
            for (j = 0; j < nsf; j++)
            {
                best_exc[j] = exc[es + j];
            }
            best_err = err;
            best_pitch = pitch;
            best_gain_index = cdbk_index[0];
        }
    }

    bits.Pack(best_pitch - start, pitch_bits);
    bits.Pack(best_gain_index, gain_bits);
    for (i = 0; i < nsf; i++)
    {
        exc[es + i] = best_exc[i];
    }

    // Return the pitch that was actually encoded, not the last candidate tried.
    return(best_pitch);
}
/// <summary>
/// Finds the best quantized 3-tap pitch predictor by analysis by synthesis.
/// </summary>
/// <remarks>
/// Side effects: the instance field <c>e</c> is re-allocated and filled with the three
/// shifted excitation vectors, the instance field <c>gain</c> receives the three decoded
/// gains of the winning codebook entry, and <paramref name="nsf"/> samples of
/// <paramref name="exc"/> are overwritten.
/// </remarks>
/// <param name="target">Target vector</param>
/// <param name="ak">LPCs for this subframe</param>
/// <param name="awk1">Weighted LPCs #1 for this subframe</param>
/// <param name="awk2">Weighted LPCs #2 for this subframe</param>
/// <param name="exc">Excitation</param>
/// <param name="es">Offset into <paramref name="exc"/> where the result is written</param>
/// <param name="pitch">Pitch value</param>
/// <param name="p">Number of LPC coeffs</param>
/// <param name="nsf">Number of samples in subframe</param>
/// <param name="bits">Not used by this method</param>
/// <param name="exc2">Buffer the past excitation samples are read from</param>
/// <param name="e2s">Offset into <paramref name="exc2"/> of the current subframe</param>
/// <param name="r">Response vector used to derive the filtered vectors for the two later taps from the first one</param>
/// <param name="cdbk_index">Element 0 receives the index of the selected gain codebook entry</param>
/// <returns>Squared error between the target and the filtered 3-tap prediction</returns>
private float Pitch_gain_search_3tap(float[] target, float[] ak, float[] awk1, float[] awk2, float[] exc, int es, int pitch, int p, int nsf, Bits bits, float[] exc2, int e2s, float[] r, int[] cdbk_index)
{
    int i, j;
    float[] corr = new float[3];
    float[][] A = CreateJaggedArray<float>(3, 3);
    int gain_cdbk_size = 1 << gain_bits;
    float[][] x = CreateJaggedArray<float>(3, nsf);
    e = CreateJaggedArray<float>(3, nsf);

    // Build the three shifted excitation vectors e[i] (lags pitch+1-i) and their
    // filtered versions x[i]. Only x[2] is filtered directly; x[1] and x[0] are
    // obtained from the previous one by a one-sample shift plus a correction with r.
    for (i = 2; i >= 0; i--)
    {
        int pp = pitch + 1 - i;
        for (j = 0; j < nsf; j++)
        {
            if (j - pp < 0)
            {
                e[i][j] = exc2[e2s + j - pp];
            }
            else if (j - pp - pitch < 0)
            {
                e[i][j] = exc2[e2s + j - pp - pitch];
            }
            else
            {
                e[i][j] = 0;
            }
        }

        if (i == 2)
        {
            NSpeex.Filters.Syn_percep_zero(e[i], 0, ak, awk1, awk2, x[i], nsf, p);
        }
        else
        {
            for (j = 0; j < nsf - 1; j++)
            {
                x[i][j + 1] = x[i + 1][j];
            }
            x[i][0] = 0;
            for (j = 0; j < nsf; j++)
            {
                x[i][j] += e[i][0] * r[j];
            }
        }
    }

    // Correlations with the target and the symmetric 3x3 matrix of inner products.
    for (i = 0; i < 3; i++)
    {
        corr[i] = NSpeex.Ltp.Inner_prod(x[i], 0, target, 0, nsf);
    }
    for (i = 0; i < 3; i++)
    {
        for (j = 0; j <= i; j++)
        {
            A[i][j] = A[j][i] = NSpeex.Ltp.Inner_prod(x[i], 0, x[j], 0, nsf);
        }
    }

    {
        // Flattened terms of the score: C[0..2] linear, C[3..5] cross, C[6..8] square.
        float[] C = new float[9];
        int best_cdbk = 0;
        float best_sum = 0;
        C[0] = corr[2];
        C[1] = corr[1];
        C[2] = corr[0];
        C[3] = A[1][2];
        C[4] = A[0][1];
        C[5] = A[0][2];
        C[6] = A[2][2];
        C[7] = A[1][1];
        C[8] = A[0][0];

        // Exhaustive search: keep the codebook entry with the largest score.
        for (i = 0; i < gain_cdbk_size; i++)
        {
            float sum = 0;
            int ptr = 3 * i;
            // Stored codebook values are mapped to gains as value / 64 + 0.5.
            float g0 = 0.015625f * gain_cdbk[ptr] + .5f;
            float g1 = 0.015625f * gain_cdbk[ptr + 1] + .5f;
            float g2 = 0.015625f * gain_cdbk[ptr + 2] + .5f;
            sum += C[0] * g0;
            sum += C[1] * g1;
            sum += C[2] * g2;
            sum -= C[3] * g0 * g1;
            sum -= C[4] * g2 * g1;
            sum -= C[5] * g2 * g0;
            sum -= .5f * C[6] * g0 * g0;
            sum -= .5f * C[7] * g1 * g1;
            sum -= .5f * C[8] * g2 * g2;

            // The first entry is always accepted so best_sum starts from a real score.
            if (sum > best_sum || i == 0)
            {
                best_sum = sum;
                best_cdbk = i;
            }
        }

        gain[0] = 0.015625f * gain_cdbk[best_cdbk * 3] + .5f;
        gain[1] = 0.015625f * gain_cdbk[best_cdbk * 3 + 1] + .5f;
        gain[2] = 0.015625f * gain_cdbk[best_cdbk * 3 + 2] + .5f;
        cdbk_index[0] = best_cdbk;
    }

    // Excitation contribution of the selected predictor.
    for (i = 0; i < nsf; i++)
    {
        exc[es + i] = gain[0] * e[2][i] + gain[1] * e[1][i] + gain[2] * e[0][i];
    }

    // Squared error of the filtered prediction against the target.
    float err2 = 0;
    for (i = 0; i < nsf; i++)
    {
        float residual = target[i] - gain[2] * x[0][i] - gain[1] * x[1][i] - gain[0] * x[2][i];
        err2 += residual * residual;
    }
    return(err2);
}
/// <summary>
/// Long Term Prediction Unquantification (3Tap).
/// </summary>
/// <remarks>
/// Reads the pitch index and the gain codebook index from <paramref name="bits"/>,
/// decodes the three tap gains into the instance field <c>gain</c>, and writes the
/// resulting excitation into <paramref name="exc"/>. When <paramref name="count_lost"/>
/// is non-zero and the pitch exceeds <paramref name="subframe_offset"/>, the gains are
/// scaled down so that their combined magnitude does not exceed a limit derived from
/// <paramref name="last_pitch_gain"/>.
/// </remarks>
/// <param name="exc">Excitation buffer; earlier samples are read and <paramref name="nsf"/> samples are overwritten.</param>
/// <param name="es">Offset into <paramref name="exc"/> where the subframe starts.</param>
/// <param name="start">Value added to the unpacked pitch index.</param>
/// <param name="pitch_coef">Not used by this implementation.</param>
/// <param name="nsf">Number of samples in the subframe.</param>
/// <param name="gain_val">Elements 0..2 receive the (possibly attenuated) tap gains.</param>
/// <param name="bits">Bit stream the indices are read from.</param>
/// <param name="count_lost">Non-zero enables the gain limiting; values of 4 or more use a tighter limit.</param>
/// <param name="subframe_offset">Gain limiting is only applied when the pitch is larger than this value.</param>
/// <param name="last_pitch_gain">Basis for the gain limit (capped at 0.95).</param>
/// <returns>The decoded pitch (unpacked index plus <paramref name="start"/>).</returns>
public sealed override int Unquant(
    float[] exc, int es, int start, float pitch_coef, int nsf, float[] gain_val,
    Bits bits, int count_lost, int subframe_offset, float last_pitch_gain)
{
    int i, pitch, gain_index;

    pitch = bits.Unpack(pitch_bits);
    pitch += start;
    gain_index = bits.Unpack(gain_bits);

    // Stored codebook values are mapped to gains as value / 64 + 0.5.
    gain[0] = 0.015625f * (float)gain_cdbk[gain_index * 3] + .5f;
    gain[1] = 0.015625f * (float)gain_cdbk[gain_index * 3 + 1] + .5f;
    gain[2] = 0.015625f * (float)gain_cdbk[gain_index * 3 + 2] + .5f;

    if (count_lost != 0 && pitch > subframe_offset)
    {
        float gain_sum = Math.Abs(gain[1]);
        float tmp = (count_lost < 4) ? last_pitch_gain : 0.4f * last_pitch_gain;
        if (tmp > .95f)
        {
            tmp = .95f;
        }

        // Side taps: a positive gain counts fully, a negative one at half magnitude.
        if (gain[0] > 0)
        {
            gain_sum += gain[0];
        }
        else
        {
            gain_sum -= .5f * gain[0];
        }
        if (gain[2] > 0)
        {
            gain_sum += gain[2];
        }
        else
        {
            // Fixed: this branch previously subtracted gain[0] a second time.
            gain_sum -= .5f * gain[2];
        }

        if (gain_sum > tmp)
        {
            float fact = tmp / gain_sum;
            for (i = 0; i < 3; i++)
            {
                gain[i] *= fact;
            }
        }
    }

    gain_val[0] = gain[0];
    gain_val[1] = gain[1];
    gain_val[2] = gain[2];

    // Build the three shifted copies of the past excitation (lags pitch+1-i).
    for (i = 0; i < 3; i++)
    {
        int j, tmp1, tmp2, pp = pitch + 1 - i;

        // tmp1: samples available one lag back; tmp2: samples available two lags back.
        tmp1 = nsf;
        if (tmp1 > pp)
        {
            tmp1 = pp;
        }
        tmp2 = nsf;
        if (tmp2 > pp + pitch)
        {
            tmp2 = pp + pitch;
        }

        for (j = 0; j < tmp1; j++)
        {
            e[i][j] = exc[es + j - pp];
        }
        for (j = tmp1; j < tmp2; j++)
        {
            e[i][j] = exc[es + j - pp - pitch];
        }
        for (j = tmp2; j < nsf; j++)
        {
            e[i][j] = 0;
        }
    }

    for (i = 0; i < nsf; i++)
    {
        exc[es + i] = gain[0] * e[2][i] + gain[1] * e[1][i] + gain[2] * e[0][i];
    }
    return(pitch);
}