/// <summary>
/// Puts a Silk VAD state into its starting condition: clears it, seeds the per-band
/// noise bias / noise level / inverse noise level, sets the frame counter and the
/// smoothed energy-to-noise ratios.
/// </summary>
/// <param name="psSilk_VAD">O  Silk VAD state to initialize. Must not be null</param>
/// <returns>Always 0 (success)</returns>
internal static int silk_VAD_Init(SilkVADState psSilk_VAD)
{
    /* Start from a cleared state */
    psSilk_VAD.Reset();

    /* Noise bias per band: approximately pink noise (psd proportional to the inverse */
    /* of frequency), i.e. the base bias divided by the 1-based band number, floored at 1 */
    for (int band = 0; band < SilkConstants.VAD_N_BANDS; band++)
    {
        int divided = Inlines.silk_DIV32_16(SilkConstants.VAD_NOISE_LEVELS_BIAS, (short)(band + 1));
        psSilk_VAD.NoiseLevelBias[band] = Inlines.silk_max_32(divided, 1);
    }

    /* Noise level starts at 100 times the bias; its inverse is kept alongside it */
    for (int band = 0; band < SilkConstants.VAD_N_BANDS; band++)
    {
        int initialLevel = Inlines.silk_MUL(100, psSilk_VAD.NoiseLevelBias[band]);
        psSilk_VAD.NL[band] = initialLevel;
        psSilk_VAD.inv_NL[band] = Inlines.silk_DIV32(int.MaxValue, initialLevel);
    }

    /* Frame counter start value (silk_VAD_GetNoiseLevels derives its minimum */
    /* smoothing coefficient from this counter) */
    psSilk_VAD.counter = 15;

    /* Smoothed energy-to-noise ratio per band */
    for (int band = 0; band < SilkConstants.VAD_N_BANDS; band++)
    {
        psSilk_VAD.NrgRatioSmth_Q8[band] = 100 * 256; /* 100 * 256 --> 20 dB SNR */
    }

    return 0;
}
/// <summary>
/// Computes the speech activity level in Q8 for one input frame.
/// Besides <c>speech_activity_Q8</c>, this writes <c>input_tilt_Q15</c> and
/// <c>input_quality_bands_Q15</c> on the encoder state, and updates the VAD state
/// (filter bank states, HP state, last-subframe energies, noise levels via
/// silk_VAD_GetNoiseLevels, and the smoothed energy-to-noise ratios).
/// </summary>
/// <param name="psEncC">I/O  Encoder state</param>
/// <param name="pIn">I    PCM input</param>
/// <param name="pIn_ptr">I    Index into pIn; passed together with pIn to the first analysis filter bank (presumably the position of the first sample of the frame - confirm against callers)</param>
/// <returns>Always 0 (success)</returns>
internal static int silk_VAD_GetSA_Q8(
    SilkChannelEncoder psEncC,
    short[] pIn,
    int pIn_ptr)
{
    int[] bandEnergy = new int[SilkConstants.VAD_N_BANDS];
    int[] nrgToNoiseRatio_Q8 = new int[SilkConstants.VAD_N_BANDS];
    int[] bandStart = new int[SilkConstants.VAD_N_BANDS];
    SilkVADState vad = psEncC.sVAD;
    int frameLength = psEncC.frame_length;

    /* Safety checks */
    Inlines.OpusAssert(SilkConstants.VAD_N_BANDS == 4);
    Inlines.OpusAssert(SilkConstants.MAX_FRAME_LENGTH >= frameLength);
    Inlines.OpusAssert(frameLength <= 512);
    Inlines.OpusAssert(frameLength == 8 * Inlines.silk_RSHIFT(frameLength, 3));

    /***********************/
    /* Filter and Decimate */
    /***********************/
    int halfLength = Inlines.silk_RSHIFT(frameLength, 1);
    int quarterLength = Inlines.silk_RSHIFT(frameLength, 2);
    int eighthLength = Inlines.silk_RSHIFT(frameLength, 3);

    /* Layout of the four bands inside the work buffer (L = frame_length):
     *
     *   start :    0        L/8       3L/8       L/2       3L/4
     *   end   :   L/8      3L/8       L/2       3L/4       5L/4
     *          [0-1 kHz |  temp.  | 1-2 kHz | 2-4 kHz | 4-8 kHz ]
     *
     * They're arranged to allow the minimal ( frame_length / 4 ) extra
     * scratch space during the downsampling process */
    bandStart[0] = 0;
    bandStart[1] = eighthLength + quarterLength;
    bandStart[2] = bandStart[1] + eighthLength;
    bandStart[3] = bandStart[2] + quarterLength;
    short[] X = new short[bandStart[3] + halfLength];

    /* 0-8 kHz to 0-4 kHz and 4-8 kHz */
    Filters.silk_ana_filt_bank_1(pIn, pIn_ptr, vad.AnaState, X, X, bandStart[3], frameLength);

    /* 0-4 kHz to 0-2 kHz and 2-4 kHz */
    Filters.silk_ana_filt_bank_1(X, 0, vad.AnaState1, X, X, bandStart[2], halfLength);

    /* 0-2 kHz to 0-1 kHz and 1-2 kHz */
    Filters.silk_ana_filt_bank_1(X, 0, vad.AnaState2, X, X, bandStart[1], quarterLength);

    /*********************************************/
    /* HP filter on lowest band (differentiator) */
    /*********************************************/
    /* Runs backwards so that each sample is halved exactly once before it is */
    /* subtracted from its successor */
    int lastIndex = eighthLength - 1;
    X[lastIndex] = (short)(Inlines.silk_RSHIFT(X[lastIndex], 1));
    short nextHpState = X[lastIndex];
    for (int n = lastIndex; n > 0; n--)
    {
        X[n - 1] = (short)(Inlines.silk_RSHIFT(X[n - 1], 1));
        X[n] -= X[n - 1];
    }
    X[0] -= vad.HPstate;
    vad.HPstate = nextHpState;

    /*************************************/
    /* Calculate the energy in each band */
    /*************************************/
    int subframeEnergy = 0;
    for (int band = 0; band < SilkConstants.VAD_N_BANDS; band++)
    {
        /* Decimated frame length of this (non-uniformly divided) band */
        int bandLength = Inlines.silk_RSHIFT(
            frameLength,
            Inlines.silk_min_int(SilkConstants.VAD_N_BANDS - band, SilkConstants.VAD_N_BANDS - 1));

        /* Length of one internal subframe, and where the current one starts in X */
        int subframeLength = Inlines.silk_RSHIFT(bandLength, SilkConstants.VAD_INTERNAL_SUBFRAMES_LOG2);
        int subframeStart = bandStart[band];

        /* Start from the energy of the previous call's last subframe */
        bandEnergy[band] = vad.XnrgSubfr[band];

        for (int sub = 0; sub < SilkConstants.VAD_INTERNAL_SUBFRAMES; sub++)
        {
            subframeEnergy = 0;
            for (int n = 0; n < subframeLength; n++)
            {
                /* The energy will be less than dec_subframe_length * ( silk_int16_MIN / 8 ) ^ 2. */
                /* Therefore we can accumulate with no risk of overflow (unless dec_subframe_length > 128) */
                int scaledSample = Inlines.silk_RSHIFT(X[subframeStart + n], 3);
                subframeEnergy = Inlines.silk_SMLABB(subframeEnergy, scaledSample, scaledSample);

                /* Safety check */
                Inlines.OpusAssert(subframeEnergy >= 0);
            }

            /* Add/saturate; the final (look-ahead) subframe only counts half */
            bool isLookAhead = !(sub < SilkConstants.VAD_INTERNAL_SUBFRAMES - 1);
            int contribution = isLookAhead ? Inlines.silk_RSHIFT(subframeEnergy, 1) : subframeEnergy;
            bandEnergy[band] = Inlines.silk_ADD_POS_SAT32(bandEnergy[band], contribution);

            subframeStart += subframeLength;
        }

        /* Remember the last subframe's full energy for the next call */
        vad.XnrgSubfr[band] = subframeEnergy;
    }

    /********************/
    /* Noise estimation */
    /********************/
    silk_VAD_GetNoiseLevels(bandEnergy, vad);

    /***********************************************/
    /* Signal-plus-noise to noise ratio estimation */
    /***********************************************/
    int snrSumSquares = 0;
    int inputTilt = 0;
    for (int band = 0; band < SilkConstants.VAD_N_BANDS; band++)
    {
        int bandSpeechEnergy = bandEnergy[band] - vad.NL[band];
        if (bandSpeechEnergy <= 0)
        {
            /* Energy not above the noise level: ratio of 1.0 in Q8, no SNR/tilt contribution */
            nrgToNoiseRatio_Q8[band] = 256;
            continue;
        }

        /* Divide, with sufficient resolution: scale the numerator up when its top */
        /* bits are free, otherwise scale the denominator down */
        nrgToNoiseRatio_Q8[band] = ((bandEnergy[band] & 0xFF800000) == 0)
            ? Inlines.silk_DIV32(Inlines.silk_LSHIFT(bandEnergy[band], 8), vad.NL[band] + 1)
            : Inlines.silk_DIV32(bandEnergy[band], Inlines.silk_RSHIFT(vad.NL[band], 8) + 1);

        /* Convert to log domain */
        int bandSnr_Q7 = Inlines.silk_lin2log(nrgToNoiseRatio_Q8[band]) - 8 * 128;

        /* Sum-of-squares */
        snrSumSquares = Inlines.silk_SMLABB(snrSumSquares, bandSnr_Q7, bandSnr_Q7); /* Q14 */

        /* Tilt measure */
        if (bandSpeechEnergy < ((int)1 << 20))
        {
            /* Scale down SNR value for small subband speech energies */
            bandSnr_Q7 = Inlines.silk_SMULWB(
                Inlines.silk_LSHIFT(Inlines.silk_SQRT_APPROX(bandSpeechEnergy), 6),
                bandSnr_Q7);
        }
        inputTilt = Inlines.silk_SMLAWB(inputTilt, tiltWeights[band], bandSnr_Q7);
    }

    /* Mean-of-squares */
    snrSumSquares = Inlines.silk_DIV32_16(snrSumSquares, SilkConstants.VAD_N_BANDS); /* Q14 */

    /* Root-mean-square approximation, scale to dBs */
    int snr_dB_Q7 = (short)(3 * Inlines.silk_SQRT_APPROX(snrSumSquares)); /* Q7 */

    /*********************************/
    /* Speech Probability Estimation */
    /*********************************/
    int speechActivity_Q15 = Sigmoid.silk_sigm_Q15(
        Inlines.silk_SMULWB(SilkConstants.VAD_SNR_FACTOR_Q16, snr_dB_Q7) - SilkConstants.VAD_NEGATIVE_OFFSET_Q5);

    /**************************/
    /* Frequency Tilt Measure */
    /**************************/
    psEncC.input_tilt_Q15 = Inlines.silk_LSHIFT(Sigmoid.silk_sigm_Q15(inputTilt) - 16384, 1);

    /**************************************************/
    /* Scale the sigmoid output based on power levels */
    /**************************************************/
    /* Accumulate signal-without-noise energies, higher frequency bands have more weight */
    int weightedSpeechEnergy = 0;
    for (int band = 0; band < SilkConstants.VAD_N_BANDS; band++)
    {
        weightedSpeechEnergy += (band + 1) * Inlines.silk_RSHIFT(bandEnergy[band] - vad.NL[band], 4);
    }

    /* frame_length == 10 * fs_kHz, i.e. a 10 ms frame if fs_kHz is the sample rate in kHz */
    bool frameIsTenTimesFs = frameLength == 10 * psEncC.fs_kHz;

    /* Power scaling */
    if (weightedSpeechEnergy <= 0)
    {
        speechActivity_Q15 = Inlines.silk_RSHIFT(speechActivity_Q15, 1);
    }
    else if (weightedSpeechEnergy < 32768)
    {
        int shift = frameIsTenTimesFs ? 16 : 15;
        weightedSpeechEnergy = Inlines.silk_LSHIFT_SAT32(weightedSpeechEnergy, shift);

        /* square-root */
        weightedSpeechEnergy = Inlines.silk_SQRT_APPROX(weightedSpeechEnergy);
        speechActivity_Q15 = Inlines.silk_SMULWB(32768 + weightedSpeechEnergy, speechActivity_Q15);
    }

    /* Copy the resulting speech activity in Q8, capped at byte.MaxValue */
    psEncC.speech_activity_Q8 = Inlines.silk_min_int(Inlines.silk_RSHIFT(speechActivity_Q15, 7), byte.MaxValue);

    /***********************************/
    /* Energy Level and SNR estimation */
    /***********************************/
    /* Smoothing coefficient, proportional to the squared speech activity */
    int smoothCoef_Q16 = Inlines.silk_SMULWB(
        SilkConstants.VAD_SNR_SMOOTH_COEF_Q18,
        Inlines.silk_SMULWB(speechActivity_Q15, speechActivity_Q15));

    if (frameIsTenTimesFs)
    {
        smoothCoef_Q16 >>= 1;
    }

    for (int band = 0; band < SilkConstants.VAD_N_BANDS; band++)
    {
        /* compute smoothed energy-to-noise ratio per band */
        vad.NrgRatioSmth_Q8[band] = Inlines.silk_SMLAWB(
            vad.NrgRatioSmth_Q8[band],
            nrgToNoiseRatio_Q8[band] - vad.NrgRatioSmth_Q8[band],
            smoothCoef_Q16);

        /* signal to noise ratio in dB per band */
        int smoothedSnr_Q7 = 3 * (Inlines.silk_lin2log(vad.NrgRatioSmth_Q8[band]) - 8 * 128);

        /* quality = sigmoid( 0.25 * ( SNR_dB - 16 ) ); */
        psEncC.input_quality_bands_Q15[band] = Sigmoid.silk_sigm_Q15(Inlines.silk_RSHIFT(smoothedSnr_Q7 - 16 * 128, 4));
    }

    return 0;
}
/// <summary>
/// Noise level estimation. For every band, the inverse of the (biased) subband energy is
/// smoothed into the stored inverse noise level, and the noise level is obtained by
/// inverting that value again. Updates <c>NL</c>, <c>inv_NL</c> and <c>counter</c> of the
/// VAD state.
/// </summary>
/// <param name="pX">I    subband energies [VAD_N_BANDS]</param>
/// <param name="psSilk_VAD">I/O  Pointer to Silk VAD state</param>
internal static void silk_VAD_GetNoiseLevels(
    int[] pX,
    SilkVADState psSilk_VAD)
{
    int k;
    int nl, nrg, inv_nrg;
    int coef, min_coef;

    /* Initially faster smoothing: while the frame counter is below 1000 the smoothing */
    /* coefficient gets a lower bound that shrinks as the counter grows */
    bool inStartupPhase = psSilk_VAD.counter < 1000; /* 1000 = 20 sec */
    if (inStartupPhase)
    {
        min_coef = Inlines.silk_DIV32_16(short.MaxValue, (short)(Inlines.silk_RSHIFT(psSilk_VAD.counter, 4) + 1));
    }
    else
    {
        min_coef = 0;
    }

    for (k = 0; k < SilkConstants.VAD_N_BANDS; k++)
    {
        /* Get old noise level estimate for current band */
        nl = psSilk_VAD.NL[k];
        Inlines.OpusAssert(nl >= 0);

        /* Add bias */
        nrg = Inlines.silk_ADD_POS_SAT32(pX[k], psSilk_VAD.NoiseLevelBias[k]);
        Inlines.OpusAssert(nrg > 0);

        /* Invert energies */
        inv_nrg = Inlines.silk_DIV32(int.MaxValue, nrg);
        Inlines.OpusAssert(inv_nrg >= 0);

        /* Less update when subband energy is high */
        if (nrg > Inlines.silk_LSHIFT(nl, 3))
        {
            /* Energy more than 8x the noise level: smallest update */
            coef = SilkConstants.VAD_NOISE_LEVEL_SMOOTH_COEF_Q16 >> 3;
        }
        else if (nrg < nl)
        {
            /* Energy below the noise level: full update */
            coef = SilkConstants.VAD_NOISE_LEVEL_SMOOTH_COEF_Q16;
        }
        else
        {
            /* In between: scale the (doubled) coefficient by inv_nrg * nl */
            coef = Inlines.silk_SMULWB(Inlines.silk_SMULWW(inv_nrg, nl), SilkConstants.VAD_NOISE_LEVEL_SMOOTH_COEF_Q16 << 1);
        }

        /* Initially faster smoothing */
        coef = Inlines.silk_max_int(coef, min_coef);

        /* Smooth inverse energies */
        psSilk_VAD.inv_NL[k] = Inlines.silk_SMLAWB(psSilk_VAD.inv_NL[k], inv_nrg - psSilk_VAD.inv_NL[k], coef);
        Inlines.OpusAssert(psSilk_VAD.inv_NL[k] >= 0);

        /* Compute noise level by inverting again */
        nl = Inlines.silk_DIV32(int.MaxValue, psSilk_VAD.inv_NL[k]);
        Inlines.OpusAssert(nl >= 0);

        /* Limit noise levels (guarantee 7 bits of head room) */
        nl = Inlines.silk_min(nl, 0x00FFFFFF);

        /* Store as part of state */
        psSilk_VAD.NL[k] = nl;
    }

    /* Increment frame counter, but only while it is still below 1000. Its value is only */
    /* read here to test "counter < 1000" and to derive min_coef, so it does not need to */
    /* grow further. Counting without bound would eventually wrap the int to a negative */
    /* value (or throw in a checked build); a negative counter would re-enter the startup */
    /* branch with a divisor that can be zero or negative after the (short) cast. */
    if (inStartupPhase)
    {
        psSilk_VAD.counter++;
    }
}