/// <summary>
/// Chooses the effective encoder frame size in samples.
/// When analysis is enabled and the application requested a variable frame
/// duration, the framesize optimizer picks the power-of-two multiple of the
/// 2.5 ms subframe (Fs / 400) that best fits the signal; otherwise the
/// requested size is validated/snapped by frame_size_select().
/// </summary>
/// <typeparam name="T">Sample type of the input signal (short or float)</typeparam>
/// <param name="analysis_pcm">Input signal used by the framesize analysis</param>
/// <param name="analysis_pcm_ptr">Offset of the first sample in analysis_pcm</param>
/// <param name="frame_size">Requested frame size in samples</param>
/// <param name="variable_duration">Requested framesize policy</param>
/// <param name="C">Number of channels</param>
/// <param name="Fs">Sampling rate in Hz</param>
/// <param name="bitrate_bps">Target bitrate in bits per second</param>
/// <param name="delay_compensation">Encoder look-ahead in samples</param>
/// <param name="downmix">Callback used to fold the input down for analysis</param>
/// <param name="subframe_mem">Persistent per-subframe energy state for the optimizer</param>
/// <param name="analysis_enabled">Whether the framesize analysis may run</param>
/// <returns>The frame size to encode, or -1 if the resulting size is invalid</returns>
internal static int compute_frame_size <T>(T[] analysis_pcm, int analysis_pcm_ptr, int frame_size, OpusFramesize variable_duration, int C, int Fs, int bitrate_bps, int delay_compensation, Downmix.downmix_func <T> downmix, float[] subframe_mem, bool analysis_enabled )
{
    if (analysis_enabled && variable_duration == OpusFramesize.OPUS_FRAMESIZE_VARIABLE && frame_size >= Fs / 200)
    {
        // The optimizer returns the best LM (log2 of the 2.5 ms subframe
        // multiplier). The original dead store "int LM = 3;" was removed:
        // the value was always overwritten by this call.
        int LM = optimize_framesize(analysis_pcm, analysis_pcm_ptr, frame_size, C, Fs, bitrate_bps, 0, subframe_mem, delay_compensation, downmix);

        // Never exceed the caller-provided buffer: shrink LM until the
        // chosen size fits in the requested frame_size.
        while ((Fs / 400 << LM) > frame_size)
        {
            LM--;
        }

        frame_size = (Fs / 400 << LM);
    }
    else
    {
        frame_size = frame_size_select(frame_size, variable_duration, Fs);
    }

    // frame_size_select() signals an unsupported size with a negative value.
    if (frame_size < 0)
    {
        return -1;
    }

    return frame_size;
}
/// <summary>
/// Feeds the input signal through the tonality analyser in 480-sample slices,
/// then fetches the aggregated analysis result for the frame about to be encoded.
/// If no input is provided, only the cached analysis info is retrieved.
/// </summary>
/// <typeparam name="T">Sample type of the input signal (short or float)</typeparam>
internal static void run_analysis <T>(TonalityAnalysisState analysis, CeltMode celt_mode, T[] analysis_pcm, int analysis_pcm_ptr, int analysis_frame_size, int frame_size, int c1, int c2, int C, int Fs, int lsb_depth, Downmix.downmix_func <T> downmix, AnalysisInfo analysis_info)
{
    if (analysis_pcm != null)
    {
        // Clamp so the analysis buffer cannot overflow or wrap around.
        analysis_frame_size = Inlines.IMIN((OpusConstants.DETECT_SIZE - 5) * Fs / 100, analysis_frame_size);

        int read_pos = analysis.analysis_offset;
        int samples_left = analysis_frame_size - read_pos;

        // Always run at least one slice; tonality_analysis() itself limits
        // each call to at most 480 new samples.
        do
        {
            tonality_analysis(analysis, celt_mode, analysis_pcm, analysis_pcm_ptr, Inlines.IMIN(480, samples_left), read_pos, c1, c2, C, lsb_depth, downmix);
            read_pos += 480;
            samples_left -= 480;
        } while (samples_left > 0);

        // Remember how far ahead of the encoded frame the analysis has read.
        analysis.analysis_offset = analysis_frame_size - frame_size;
    }

    analysis_info.valid = 0;
    tonality_get_info(analysis, analysis_info, frame_size);
}
/// <summary>
/// Selects the best dynamic frame duration by measuring the energy variation
/// of consecutive 2.5 ms (Fs / 400) subframes and running a Viterbi search
/// (transient_viterbi) over candidate frame-size sequences.
/// </summary>
/// <typeparam name="T">Sample type of the input signal (short or float)</typeparam>
/// <param name="x">Input signal</param>
/// <param name="x_ptr">Offset of the first sample in x</param>
/// <param name="len">Number of available samples</param>
/// <param name="C">Number of channels</param>
/// <param name="bitrate">Target bitrate in bits per second</param>
/// <param name="tonality">Tonality estimate used to weight the cost function</param>
/// <param name="mem">Persistent subframe-energy state carried between calls (3 entries)</param>
/// <param name="buffering">Extra buffered samples (CELT delay), 0 in restricted-lowdelay</param>
/// <returns>The chosen LM (log2 of the subframe multiplier)</returns>
internal static int optimize_framesize <T>(T[] x, int x_ptr, int len, int C, int Fs, int bitrate, int tonality, float[] mem, int buffering, Downmix.downmix_func <T> downmix)
{
    int N;
    int i;
    // e holds subframe energies; e_1 their reciprocals (precomputed for the
    // Viterbi cost evaluation). The extra slots hold carried-over state and
    // the duplicated trailing entry below.
    float[] e = new float[MAX_DYNAMIC_FRAMESIZE + 4];
    float[] e_1 = new float[MAX_DYNAMIC_FRAMESIZE + 3];
    int memx;
    int bestLM = 0;
    int subframe;
    int pos;
    int offset;
    int[] sub;

    subframe = Fs / 400; // 2.5 ms worth of samples
    sub = new int[subframe];
    // Slot 0 always carries the energy state from the previous call.
    e[0] = mem[0];
    e_1[0] = 1.0f / (CeltConstants.EPSILON + mem[0]);
    if (buffering != 0)
    {
        /* Consider the CELT delay when not in restricted-lowdelay */
        /* We assume the buffering is between 2.5 and 5 ms */
        offset = 2 * subframe - buffering;
        Inlines.OpusAssert(offset >= 0 && offset <= subframe);
        len -= offset;
        // With buffering, two more carried-over energy slots are used.
        e[1] = mem[1];
        e_1[1] = 1.0f / (CeltConstants.EPSILON + mem[1]);
        e[2] = mem[2];
        e_1[2] = 1.0f / (CeltConstants.EPSILON + mem[2]);
        pos = 3;
    }
    else
    {
        pos = 1;
        offset = 0;
    }

    N = Inlines.IMIN(len / subframe, MAX_DYNAMIC_FRAMESIZE);

    /* Just silencing a warning, it's really initialized later */
    memx = 0;
    for (i = 0; i < N; i++)
    {
        float tmp;
        int tmpx;
        int j;
        tmp = CeltConstants.EPSILON;

        // Fold the current subframe down to mono (c2 = -2 selects full downmix).
        downmix(x, x_ptr, sub, 0, subframe, i * subframe + offset, 0, -2, C);
        if (i == 0)
        {
            memx = sub[0];
        }

        // Energy of the first difference; memx chains across subframes so
        // the difference is continuous between iterations.
        for (j = 0; j < subframe; j++)
        {
            tmpx = sub[j];
            tmp += (tmpx - memx) * (float)(tmpx - memx);
            memx = tmpx;
        }

        e[i + pos] = tmp;
        e_1[i + pos] = 1.0f / tmp;
    }

    /* Hack to get 20 ms working with APPLICATION_AUDIO
     * The real problem is that the corresponding memory needs to use 1.5 ms
     * from this frame and 1 ms from the next frame */
    // Note: relies on i == N after the loop; duplicates the last energy.
    e[i + pos] = e[i + pos - 1];
    if (buffering != 0)
    {
        N = Inlines.IMIN(MAX_DYNAMIC_FRAMESIZE, N + 2);
    }

    // Cost weight scales with channel count and tonality; rate is per-subframe.
    bestLM = transient_viterbi(e, e_1, N, (int)((1.0f + .5f * tonality) * (60 * C + 40)), bitrate / 400);
    // Save the energies that the next call will need as its carried-over state.
    mem[0] = e[1 << bestLM];
    if (buffering != 0)
    {
        mem[1] = e[(1 << bestLM) + 1];
        mem[2] = e[(1 << bestLM) + 2];
    }
    return(bestLM);
}
/// <summary>
/// Encodes one frame of a multistream (e.g. surround) signal by running each
/// coupled or mono stream through its own Opus encoder, then concatenating the
/// per-stream packets with self-delimiting framing via the repacketizer.
/// </summary>
/// <typeparam name="T">Sample type of the input signal (short or float)</typeparam>
/// <param name="copy_channel_in">Callback that extracts one channel from the interleaved input</param>
/// <param name="pcm">Interleaved input signal</param>
/// <param name="pcm_ptr">Offset of the first sample in pcm</param>
/// <param name="analysis_frame_size">Frame size requested by the caller (pre-analysis)</param>
/// <param name="data">Output packet buffer</param>
/// <param name="data_ptr">Offset into the output buffer</param>
/// <param name="max_data_bytes">Capacity of the output buffer</param>
/// <param name="lsb_depth">Signal depth in bits</param>
/// <param name="downmix">Callback used to fold the input down for analysis</param>
/// <param name="float_api">Nonzero when called through the float API</param>
/// <returns>Total packet size in bytes, or a negative OpusError code</returns>
internal int opus_multistream_encode_native <T> (
    opus_copy_channel_in_func <T> copy_channel_in,
    T[] pcm,
    int pcm_ptr,
    int analysis_frame_size,
    byte[] data,
    int data_ptr,
    int max_data_bytes,
    int lsb_depth,
    Downmix.downmix_func <T> downmix,
    int float_api )
{
    int Fs;
    int s;
    int encoder_ptr;
    int tot_size;
    short[] buf;
    int[] bandSMR;
    byte[] tmp_data = new byte[MS_FRAME_TMP];
    OpusRepacketizer rp = new OpusRepacketizer();
    int vbr;
    CeltMode celt_mode;
    int[] bitrates = new int[256];
    int[] bandLogE = new int[42];
    int[] mem = null;
    int[] preemph_mem = null;
    int frame_size;
    int rate_sum;
    int smallest_packet;

    if (this.surround != 0)
    {
        preemph_mem = this.preemph_mem;
        mem = this.window_mem;
    }

    // All sub-encoders share the same rate/VBR/mode settings; read them from
    // the first one.
    encoder_ptr = 0;
    Fs = this.encoders[encoder_ptr].SampleRate;
    vbr = this.encoders[encoder_ptr].UseVBR ? 1 : 0;
    celt_mode = this.encoders[encoder_ptr].GetCeltMode();

    {
        int delay_compensation;
        int channels;

        channels = this.layout.nb_streams + this.layout.nb_coupled_streams;
        delay_compensation = this.encoders[encoder_ptr].Lookahead;
        delay_compensation -= Fs / 400;
        // Resolve the actual frame size (may be chosen dynamically when
        // variable duration is requested).
        frame_size = CodecHelpers.compute_frame_size(pcm, pcm_ptr, analysis_frame_size, this.variable_duration, channels, Fs, this.bitrate_bps, delay_compensation, downmix, this.subframe_mem, this.encoders[encoder_ptr].analysis.enabled);
    }

    if (400 * frame_size < Fs)
    {
        return(OpusError.OPUS_BAD_ARG);
    }

    /* Validate frame_size before using it to allocate stack space.
     * This mirrors the checks in opus_encode[_float](). */
    if (400 * frame_size != Fs && 200 * frame_size != Fs && 100 * frame_size != Fs &&
        50 * frame_size != Fs && 25 * frame_size != Fs && 50 * frame_size != 3 * Fs)
    {
        return(OpusError.OPUS_BAD_ARG);
    }

    /* Smallest packet the encoder can produce: 2 bytes per stream except the
     * last (1-byte TOC plus 1-byte self-delimiting length). */
    smallest_packet = this.layout.nb_streams * 2 - 1;
    if (max_data_bytes < smallest_packet)
    {
        return(OpusError.OPUS_BUFFER_TOO_SMALL);
    }

    buf = new short[2 * frame_size];
    bandSMR = new int[21 * this.layout.nb_channels];
    if (this.surround != 0)
    {
        // Per-band signal-to-mask ratios used later as an energy mask per stream.
        surround_analysis(celt_mode, pcm, pcm_ptr, bandSMR, mem, preemph_mem, frame_size, 120, this.layout.nb_channels, Fs, copy_channel_in);
    }

    /* Compute bitrate allocation between streams (this could be a lot better) */
    rate_sum = surround_rate_allocation(bitrates, frame_size);

    if (vbr == 0)
    {
        // CBR: cap the packet size to what the target bitrate allows.
        if (this.bitrate_bps == OpusConstants.OPUS_AUTO)
        {
            max_data_bytes = Inlines.IMIN(max_data_bytes, 3 * rate_sum / (3 * 8 * Fs / frame_size));
        }
        else if (this.bitrate_bps != OpusConstants.OPUS_BITRATE_MAX)
        {
            max_data_bytes = Inlines.IMIN(max_data_bytes, Inlines.IMAX(smallest_packet, 3 * this.bitrate_bps / (3 * 8 * Fs / frame_size)));
        }
    }

    // First pass: distribute bitrate and (in surround mode) pick a bandwidth
    // for each stream based on its per-channel equivalent rate.
    for (s = 0; s < this.layout.nb_streams; s++)
    {
        OpusEncoder enc = this.encoders[encoder_ptr];
        encoder_ptr += 1;
        enc.Bitrate = (bitrates[s]);
        if (this.surround != 0)
        {
            int equiv_rate;
            equiv_rate = this.bitrate_bps;
            if (frame_size * 50 < Fs)
            {
                // Penalize the extra per-packet overhead of short frames.
                equiv_rate -= 60 * (Fs / frame_size - 50) * this.layout.nb_channels;
            }

            if (equiv_rate > 10000 * this.layout.nb_channels)
            {
                enc.Bandwidth = (OpusBandwidth.OPUS_BANDWIDTH_FULLBAND);
            }
            else if (equiv_rate > 7000 * this.layout.nb_channels)
            {
                enc.Bandwidth = (OpusBandwidth.OPUS_BANDWIDTH_SUPERWIDEBAND);
            }
            else if (equiv_rate > 5000 * this.layout.nb_channels)
            {
                enc.Bandwidth = (OpusBandwidth.OPUS_BANDWIDTH_WIDEBAND);
            }
            else
            {
                enc.Bandwidth = (OpusBandwidth.OPUS_BANDWIDTH_NARROWBAND);
            }

            if (s < this.layout.nb_coupled_streams)
            {
                /* To preserve the spatial image, force stereo CELT on coupled streams */
                enc.ForceMode = (OpusMode.MODE_CELT_ONLY);
                enc.ForceChannels = (2);
            }
        }
    }

    encoder_ptr = 0;
    /* Counting ToC */
    tot_size = 0;
    // Second pass: actually encode each stream and append its (self-delimited)
    // packet to the output.
    for (s = 0; s < this.layout.nb_streams; s++)
    {
        OpusEncoder enc;
        int len;
        int curr_max;
        int c1, c2;

        rp.Reset();
        enc = this.encoders[encoder_ptr];
        if (s < this.layout.nb_coupled_streams)
        {
            // Coupled stream: interleave the left/right channels into buf.
            int i;
            int left, right;
            left = OpusMultistream.get_left_channel(this.layout, s, -1);
            right = OpusMultistream.get_right_channel(this.layout, s, -1);
            copy_channel_in(buf, 0, 2, pcm, pcm_ptr, this.layout.nb_channels, left, frame_size);
            copy_channel_in(buf, 1, 2, pcm, pcm_ptr, this.layout.nb_channels, right, frame_size);
            encoder_ptr += 1;
            if (this.surround != 0)
            {
                for (i = 0; i < 21; i++)
                {
                    bandLogE[i] = bandSMR[21 * left + i];
                    bandLogE[21 + i] = bandSMR[21 * right + i];
                }
            }

            c1 = left;
            c2 = right;
        }
        else
        {
            // Mono stream: copy the single channel into buf.
            int i;
            int chan = OpusMultistream.get_mono_channel(this.layout, s, -1);
            copy_channel_in(buf, 0, 1, pcm, pcm_ptr, this.layout.nb_channels, chan, frame_size);
            encoder_ptr += 1;
            if (this.surround != 0)
            {
                for (i = 0; i < 21; i++)
                {
                    bandLogE[i] = bandSMR[21 * chan + i];
                }
            }

            c1 = chan;
            c2 = -1;
        }

        if (this.surround != 0)
        {
            enc.SetEnergyMask(bandLogE);
        }

        /* number of bytes left (+Toc) */
        curr_max = max_data_bytes - tot_size;
        /* Reserve one byte for the last stream and two for the others */
        curr_max -= Inlines.IMAX(0, 2 * (this.layout.nb_streams - s - 1) - 1);
        curr_max = Inlines.IMIN(curr_max, MS_FRAME_TMP);
        /* Repacketizer will add one or two bytes for self-delimited frames */
        if (s != this.layout.nb_streams - 1)
        {
            curr_max -= curr_max > 253 ? 2 : 1;
        }

        if (vbr == 0 && s == this.layout.nb_streams - 1)
        {
            // CBR: the last stream absorbs whatever budget remains.
            enc.Bitrate = (curr_max * (8 * Fs / frame_size));
        }

        len = enc.opus_encode_native(buf, 0, frame_size, tmp_data, 0, curr_max, lsb_depth, pcm, pcm_ptr, analysis_frame_size, c1, c2, this.layout.nb_channels, downmix, float_api);
        if (len < 0)
        {
            return(len);
        }

        /* We need to use the repacketizer to add the self-delimiting lengths
         * while taking into account the fact that the encoder can now return
         * more than one frame at a time (e.g. 60 ms CELT-only) */
        rp.AddPacket(tmp_data, 0, len);
        // All streams except the last are self-delimited; in CBR the last
        // stream is padded to fill the packet.
        len = rp.opus_repacketizer_out_range_impl(0, rp.GetNumFrames(), data, data_ptr, max_data_bytes - tot_size,
            (s != this.layout.nb_streams - 1) ? 1 : 0,
            (vbr == 0 && s == this.layout.nb_streams - 1) ? 1 : 0);
        data_ptr += len;
        tot_size += len;
    }

    return(tot_size);
}
/// <summary>
/// Runs one 20 ms step of the tonality/music-vs-speech analysis: windows the
/// buffered input, takes a 960-point FFT, derives per-bin tonality from phase
/// continuity, computes band energies and BFCC-style features, feeds them to
/// the MLP, and updates the speech/music HMM state. Results are written into
/// the next slot of tonal.info.
/// </summary>
/// <typeparam name="T">The type of signal being handled (either short or float) - changes based on which API is used</typeparam>
/// <param name="tonal">Persistent analysis state (updated in place)</param>
/// <param name="celt_mode">CELT mode; supplies the FFT configuration</param>
/// <param name="x">Input signal</param>
/// <param name="x_ptr">Offset of the first sample in x</param>
/// <param name="len">Number of new samples available</param>
/// <param name="offset">Read position within x</param>
/// <param name="c1">First channel to analyze</param>
/// <param name="c2">Second channel, or a negative sentinel for mono/downmix</param>
/// <param name="C">Channel count of the input</param>
/// <param name="lsb_depth">Signal depth in bits (sets the noise floor)</param>
/// <param name="downmix">Callback that folds the input into the analysis buffer</param>
internal static void tonality_analysis <T>(TonalityAnalysisState tonal, CeltMode celt_mode, T[] x, int x_ptr, int len, int offset, int c1, int c2, int C, int lsb_depth, Downmix.downmix_func <T> downmix)
{
    int i, b;
    FFTState kfft;
    int[] input;
    int[] output;
    int N = 480, N2 = 240; // FFT half-size and quarter-size (960-point FFT)
    // Per-bin phase history: angle, first and second differences.
    float[] A = tonal.angle;
    float[] dA = tonal.d_angle;
    float[] d2A = tonal.d2_angle;
    float[] tonality;
    float[] noisiness;
    float[] band_tonality = new float[OpusConstants.NB_TBANDS];
    float[] logE = new float[OpusConstants.NB_TBANDS];
    float[] BFCC = new float[8];
    float[] features = new float[25];
    float frame_tonality;
    float max_frame_tonality;
    /*float tw_sum=0;*/
    float frame_noisiness;
    float pi4 = (float)(M_PI * M_PI * M_PI * M_PI);
    float slope = 0;
    float frame_stationarity;
    float relativeE;
    float[] frame_probs = new float[2];
    // Adaptation rates that start fast and slow down as tonal.count grows.
    float alpha, alphaE, alphaE2;
    float frame_loudness;
    float bandwidth_mask;
    int bandwidth = 0;
    float maxE = 0;
    float noise_floor;
    int remaining;
    AnalysisInfo info; //[porting note] pointer

    tonal.last_transition++;
    alpha = 1.0f / Inlines.IMIN(20, 1 + tonal.count);
    alphaE = 1.0f / Inlines.IMIN(50, 1 + tonal.count);
    alphaE2 = 1.0f / Inlines.IMIN(1000, 1 + tonal.count);

    if (tonal.count < 4)
    {
        // Too little history: stay neutral between speech and music.
        tonal.music_prob = 0.5f;
    }

    kfft = celt_mode.mdct.kfft[0];
    if (tonal.count == 0)
    {
        tonal.mem_fill = 240;
    }

    // Append up to one analysis buffer's worth of new samples.
    downmix(x, x_ptr, tonal.inmem, tonal.mem_fill, Inlines.IMIN(len, OpusConstants.ANALYSIS_BUF_SIZE - tonal.mem_fill), offset, c1, c2, C);

    if (tonal.mem_fill + len < OpusConstants.ANALYSIS_BUF_SIZE)
    {
        tonal.mem_fill += len;
        /* Don't have enough to update the analysis */
        return;
    }

    // Claim the next slot of the circular info buffer for this frame's result.
    info = tonal.info[tonal.write_pos++];
    if (tonal.write_pos >= OpusConstants.DETECT_SIZE)
    {
        tonal.write_pos -= OpusConstants.DETECT_SIZE;
    }

    input = new int[960];
    output = new int[960];
    tonality = new float[240];
    noisiness = new float[240];
    // Apply the (symmetric) analysis window while packing real samples into
    // the interleaved complex FFT input.
    for (i = 0; i < N2; i++)
    {
        float w = Tables.analysis_window[i];
        input[2 * i] = (int)(w * tonal.inmem[i]);
        input[2 * i + 1] = (int)(w * tonal.inmem[N2 + i]);
        input[(2 * (N - i - 1))] = (int)(w * tonal.inmem[N - i - 1]);
        input[(2 * (N - i - 1)) + 1] = (int)(w * tonal.inmem[N + N2 - i - 1]);
    }

    // Slide the buffer: keep the last 240 samples as overlap, then refill.
    Arrays.MemMoveInt(tonal.inmem, OpusConstants.ANALYSIS_BUF_SIZE - 240, 0, 240);
    remaining = len - (OpusConstants.ANALYSIS_BUF_SIZE - tonal.mem_fill);
    downmix(x, x_ptr, tonal.inmem, 240, remaining, offset + OpusConstants.ANALYSIS_BUF_SIZE - tonal.mem_fill, c1, c2, C);
    tonal.mem_fill = 240 + remaining;

    KissFFT.opus_fft(kfft, input, output);

    // Per-bin tonality from phase predictability: a tonal bin's phase advances
    // linearly, so its second difference (mod 1 cycle) stays near zero.
    for (i = 1; i < N2; i++)
    {
        float X1r, X2r, X1i, X2i;
        float angle, d_angle, d2_angle;
        float angle2, d_angle2, d2_angle2;
        float mod1, mod2, avg_mod;
        X1r = (float)output[2 * i] + output[2 * (N - i)];
        X1i = (float)output[(2 * i) + 1] - output[2 * (N - i) + 1];
        X2r = (float)output[(2 * i) + 1] + output[2 * (N - i) + 1];
        X2i = (float)output[2 * (N - i)] - output[2 * i];

        angle = (float)(.5f / M_PI) * fast_atan2f(X1i, X1r);
        d_angle = angle - A[i];
        d2_angle = d_angle - dA[i];

        angle2 = (float)(.5f / M_PI) * fast_atan2f(X2i, X2r);
        d_angle2 = angle2 - angle;
        d2_angle2 = d_angle2 - d_angle;

        // Wrap the second difference to [-0.5, 0.5) cycles.
        mod1 = d2_angle - (float)Math.Floor(0.5f + d2_angle);
        noisiness[i] = Inlines.ABS16(mod1);
        mod1 *= mod1;
        mod1 *= mod1;

        mod2 = d2_angle2 - (float)Math.Floor(0.5f + d2_angle2);
        noisiness[i] += Inlines.ABS16(mod2);
        mod2 *= mod2;
        mod2 *= mod2;

        avg_mod = .25f * (d2A[i] + 2.0f * mod1 + mod2);
        tonality[i] = 1.0f / (1.0f + 40.0f * 16.0f * pi4 * avg_mod) - .015f;

        A[i] = angle2;
        dA[i] = d_angle2;
        d2A[i] = mod2;
    }

    frame_tonality = 0;
    max_frame_tonality = 0;
    /*tw_sum = 0;*/
    info.activity = 0;
    frame_noisiness = 0;
    frame_stationarity = 0;
    if (tonal.count == 0)
    {
        // Initialize the per-band energy envelope trackers.
        for (b = 0; b < OpusConstants.NB_TBANDS; b++)
        {
            tonal.lowE[b] = 1e10f;
            tonal.highE[b] = -1e10f;
        }
    }

    relativeE = 0;
    frame_loudness = 0;
    // Aggregate per-band energy, tonality, noisiness and stationarity.
    for (b = 0; b < OpusConstants.NB_TBANDS; b++)
    {
        float E = 0, tE = 0, nE = 0;
        float L1, L2;
        float stationarity;
        for (i = Tables.tbands[b]; i < Tables.tbands[b + 1]; i++)
        {
            float binE = output[2 * i] * (float)output[2 * i] + output[2 * (N - i)] * (float)output[2 * (N - i)]
                         + output[2 * i + 1] * (float)output[2 * i + 1] + output[2 * (N - i) + 1] * (float)output[2 * (N - i) + 1];
            /* FIXME: It's probably best to change the BFCC filter initial state instead */
            binE *= 5.55e-17f;
            E += binE;
            tE += binE * tonality[i];
            nE += binE * 2.0f * (.5f - noisiness[i]);
        }

        tonal.E[tonal.E_count][b] = E;
        frame_noisiness += nE / (1e-15f + E);
        frame_loudness += (float)Math.Sqrt(E + 1e-10f);
        logE[b] = (float)Math.Log(E + 1e-10f);
        // Slow-moving min/max envelope of the band log-energy.
        tonal.lowE[b] = Inlines.MIN32(logE[b], tonal.lowE[b] + 0.01f);
        tonal.highE[b] = Inlines.MAX32(logE[b], tonal.highE[b] - 0.1f);
        if (tonal.highE[b] < tonal.lowE[b] + 1.0f)
        {
            // Keep at least 1 unit of dynamic range to avoid division blow-up.
            tonal.highE[b] += 0.5f;
            tonal.lowE[b] -= 0.5f;
        }

        relativeE += (logE[b] - tonal.lowE[b]) / (1e-15f + tonal.highE[b] - tonal.lowE[b]);

        // Stationarity from the L1/L2 ratio of recent frame energies.
        L1 = L2 = 0;
        for (i = 0; i < OpusConstants.NB_FRAMES; i++)
        {
            L1 += (float)Math.Sqrt(tonal.E[i][b]);
            L2 += tonal.E[i][b];
        }

        stationarity = Inlines.MIN16(0.99f, L1 / (float)Math.Sqrt(1e-15 + OpusConstants.NB_FRAMES * L2));
        stationarity *= stationarity;
        stationarity *= stationarity;
        frame_stationarity += stationarity;
        /*band_tonality[b] = tE/(1e-15+E)*/
        band_tonality[b] = Inlines.MAX16(tE / (1e-15f + E), stationarity * tonal.prev_band_tonality[b]);
        frame_tonality += band_tonality[b];
        if (b >= OpusConstants.NB_TBANDS - OpusConstants.NB_TONAL_SKIP_BANDS)
        {
            // Sliding sum: drop the band that leaves the tonality window.
            frame_tonality -= band_tonality[b - OpusConstants.NB_TBANDS + OpusConstants.NB_TONAL_SKIP_BANDS];
        }

        max_frame_tonality = Inlines.MAX16(max_frame_tonality, (1.0f + .03f * (b - OpusConstants.NB_TBANDS)) * frame_tonality);
        slope += band_tonality[b] * (b - 8);
        tonal.prev_band_tonality[b] = band_tonality[b];
    }

    // Bandwidth detection: find the highest band that is clearly "active".
    bandwidth_mask = 0;
    bandwidth = 0;
    maxE = 0;
    noise_floor = 5.7e-4f / (1 << (Inlines.IMAX(0, lsb_depth - 8)));
    noise_floor *= 1 << (15 + CeltConstants.SIG_SHIFT);
    noise_floor *= noise_floor;
    for (b = 0; b < OpusConstants.NB_TOT_BANDS; b++)
    {
        float E = 0;
        int band_start, band_end;
        /* Keep a margin of 300 Hz for aliasing */
        band_start = Tables.extra_bands[b];
        band_end = Tables.extra_bands[b + 1];
        for (i = band_start; i < band_end; i++)
        {
            float binE = output[2 * i] * (float)output[2 * i] + output[2 * (N - i)] * (float)output[2 * (N - i)]
                         + output[2 * i + 1] * (float)output[2 * i + 1] + output[2 * (N - i) + 1] * (float)output[2 * (N - i) + 1];
            E += binE;
        }

        maxE = Inlines.MAX32(maxE, E);
        tonal.meanE[b] = Inlines.MAX32((1 - alphaE2) * tonal.meanE[b], E);
        E = Inlines.MAX32(E, tonal.meanE[b]);
        /* Use a simple follower with 13 dB/Bark slope for spreading function */
        bandwidth_mask = Inlines.MAX32(.05f * bandwidth_mask, E);
        /* Consider the band "active" only if all these conditions are met:
         * 1) less than 10 dB below the simple follower
         * 2) less than 90 dB below the peak band (maximal masking possible considering
         *    both the ATH and the loudness-dependent slope of the spreading function)
         * 3) above the PCM quantization noise floor */
        if (E > .1 * bandwidth_mask && E * 1e9f > maxE && E > noise_floor * (band_end - band_start))
        {
            bandwidth = b;
        }
    }

    if (tonal.count <= 2)
    {
        // Not enough history to trust the detector: assume full bandwidth.
        bandwidth = 20;
    }

    frame_loudness = 20 * (float)Math.Log10(frame_loudness);
    tonal.Etracker = Inlines.MAX32(tonal.Etracker - .03f, frame_loudness);
    tonal.lowECount *= (1 - alphaE);
    if (frame_loudness < tonal.Etracker - 30)
    {
        tonal.lowECount += alphaE;
    }

    // DCT of the band log-energies gives cepstral (BFCC-like) coefficients.
    for (i = 0; i < 8; i++)
    {
        float sum = 0;
        for (b = 0; b < 16; b++)
        {
            sum += Tables.dct_table[i * 16 + b] * logE[b];
        }

        BFCC[i] = sum;
    }

    frame_stationarity /= OpusConstants.NB_TBANDS;
    relativeE /= OpusConstants.NB_TBANDS;
    if (tonal.count < 10)
    {
        relativeE = 0.5f;
    }

    frame_noisiness /= OpusConstants.NB_TBANDS;
    info.activity = frame_noisiness + (1 - frame_noisiness) * relativeE;
    frame_tonality = (max_frame_tonality / (OpusConstants.NB_TBANDS - OpusConstants.NB_TONAL_SKIP_BANDS));
    frame_tonality = Inlines.MAX16(frame_tonality, tonal.prev_tonality * .8f);
    tonal.prev_tonality = frame_tonality;

    slope /= 8 * 8;
    info.tonality_slope = slope;

    tonal.E_count = (tonal.E_count + 1) % OpusConstants.NB_FRAMES;
    tonal.count++;
    info.tonality = frame_tonality;

    // Build the MLP feature vector from cepstral coefficients and their
    // temporal derivatives (tonal.mem holds the last 4 BFCC frames).
    for (i = 0; i < 4; i++)
    {
        features[i] = -0.12299f * (BFCC[i] + tonal.mem[i + 24]) + 0.49195f * (tonal.mem[i] + tonal.mem[i + 16]) + 0.69693f * tonal.mem[i + 8] - 1.4349f * tonal.cmean[i];
    }

    for (i = 0; i < 4; i++)
    {
        tonal.cmean[i] = (1 - alpha) * tonal.cmean[i] + alpha * BFCC[i];
    }

    for (i = 0; i < 4; i++)
    {
        features[4 + i] = 0.63246f * (BFCC[i] - tonal.mem[i + 24]) + 0.31623f * (tonal.mem[i] - tonal.mem[i + 16]);
    }

    for (i = 0; i < 3; i++)
    {
        features[8 + i] = 0.53452f * (BFCC[i] + tonal.mem[i + 24]) - 0.26726f * (tonal.mem[i] + tonal.mem[i + 16]) - 0.53452f * tonal.mem[i + 8];
    }

    if (tonal.count > 5)
    {
        for (i = 0; i < 9; i++)
        {
            tonal.std[i] = (1 - alpha) * tonal.std[i] + alpha * features[i] * features[i];
        }
    }

    // Shift the 4-frame BFCC history and insert the current frame.
    for (i = 0; i < 8; i++)
    {
        tonal.mem[i + 24] = tonal.mem[i + 16];
        tonal.mem[i + 16] = tonal.mem[i + 8];
        tonal.mem[i + 8] = tonal.mem[i];
        tonal.mem[i] = BFCC[i];
    }

    for (i = 0; i < 9; i++)
    {
        features[11 + i] = (float)Math.Sqrt(tonal.std[i]);
    }

    features[20] = info.tonality;
    features[21] = info.activity;
    features[22] = frame_stationarity;
    features[23] = info.tonality_slope;
    features[24] = tonal.lowECount;

    mlp.mlp_process(Tables.net, features, frame_probs);
    // Map MLP output from [-1, 1] to [0, 1].
    frame_probs[0] = .5f * (frame_probs[0] + 1);
    /* Curve fitting between the MLP probability and the actual probability */
    frame_probs[0] = .01f + 1.21f * frame_probs[0] * frame_probs[0] - .23f * (float)Math.Pow(frame_probs[0], 10);
    /* Probability of active audio (as opposed to silence) */
    frame_probs[1] = .5f * frame_probs[1] + .5f;
    /* Consider that silence has a 50-50 probability. */
    frame_probs[0] = frame_probs[1] * frame_probs[0] + (1 - frame_probs[1]) * .5f;

    /*printf("%f %f ", frame_probs[0], frame_probs[1]);*/
    {
        /* Probability of state transition */
        float tau;
        /* Represents independence of the MLP probabilities, where
         * beta=1 means fully independent. */
        float beta;
        /* Denormalized probability of speech (p0) and music (p1) after update */
        float p0, p1;
        /* Probabilities for "all speech" and "all music" */
        float s0, m0;
        /* Probability sum for renormalisation */
        float psum;
        /* Instantaneous probability of speech and music, with beta pre-applied. */
        float speech0;
        float music0;

        /* One transition every 3 minutes of active audio */
        tau = .00005f * frame_probs[1];
        beta = .05f;
        //if (1)
        {
            /* Adapt beta based on how "unexpected" the new prob is */
            float p, q;
            p = Inlines.MAX16(.05f, Inlines.MIN16(.95f, frame_probs[0]));
            q = Inlines.MAX16(.05f, Inlines.MIN16(.95f, tonal.music_prob));
            beta = .01f + .05f * Inlines.ABS16(p - q) / (p * (1 - q) + q * (1 - p));
        }

        /* p0 and p1 are the probabilities of speech and music at this frame
         * using only information from previous frame and applying the
         * state transition model */
        p0 = (1 - tonal.music_prob) * (1 - tau) + tonal.music_prob * tau;
        p1 = tonal.music_prob * (1 - tau) + (1 - tonal.music_prob) * tau;
        /* We apply the current probability with exponent beta to work around
         * the fact that the probability estimates aren't independent. */
        p0 *= (float)Math.Pow(1 - frame_probs[0], beta);
        p1 *= (float)Math.Pow(frame_probs[0], beta);
        /* Normalise the probabilities to get the Marokv probability of music. */
        tonal.music_prob = p1 / (p0 + p1);
        info.music_prob = tonal.music_prob;

        /* This chunk of code deals with delayed decision. */
        psum = 1e-20f;
        /* Instantaneous probability of speech and music, with beta pre-applied. */
        speech0 = (float)Math.Pow(1 - frame_probs[0], beta);
        music0 = (float)Math.Pow(frame_probs[0], beta);
        if (tonal.count == 1)
        {
            tonal.pspeech[0] = 0.5f;
            tonal.pmusic[0] = 0.5f;
        }

        /* Updated probability of having only speech (s0) or only music (m0),
         * before considering the new observation. */
        s0 = tonal.pspeech[0] + tonal.pspeech[1];
        m0 = tonal.pmusic[0] + tonal.pmusic[1];
        /* Updates s0 and m0 with instantaneous probability. */
        tonal.pspeech[0] = s0 * (1 - tau) * speech0;
        tonal.pmusic[0] = m0 * (1 - tau) * music0;
        /* Propagate the transition probabilities */
        for (i = 1; i < OpusConstants.DETECT_SIZE - 1; i++)
        {
            tonal.pspeech[i] = tonal.pspeech[i + 1] * speech0;
            tonal.pmusic[i] = tonal.pmusic[i + 1] * music0;
        }

        /* Probability that the latest frame is speech, when all the previous ones were music. */
        tonal.pspeech[OpusConstants.DETECT_SIZE - 1] = m0 * tau * speech0;
        /* Probability that the latest frame is music, when all the previous ones were speech. */
        tonal.pmusic[OpusConstants.DETECT_SIZE - 1] = s0 * tau * music0;

        /* Renormalise probabilities to 1 */
        for (i = 0; i < OpusConstants.DETECT_SIZE; i++)
        {
            psum += tonal.pspeech[i] + tonal.pmusic[i];
        }

        psum = 1.0f / psum;
        for (i = 0; i < OpusConstants.DETECT_SIZE; i++)
        {
            tonal.pspeech[i] *= psum;
            tonal.pmusic[i] *= psum;
        }

        psum = tonal.pmusic[0];
        for (i = 1; i < OpusConstants.DETECT_SIZE; i++)
        {
            psum += tonal.pspeech[i];
        }

        /* Estimate our confidence in the speech/music decisions */
        if (frame_probs[1] > .75)
        {
            if (tonal.music_prob > .9)
            {
                float adapt;
                adapt = 1.0f / (++tonal.music_confidence_count);
                tonal.music_confidence_count = Inlines.IMIN(tonal.music_confidence_count, 500);
                tonal.music_confidence += adapt * Inlines.MAX16(-.2f, frame_probs[0] - tonal.music_confidence);
            }

            if (tonal.music_prob < .1)
            {
                float adapt;
                adapt = 1.0f / (++tonal.speech_confidence_count);
                tonal.speech_confidence_count = Inlines.IMIN(tonal.speech_confidence_count, 500);
                tonal.speech_confidence += adapt * Inlines.MIN16(.2f, frame_probs[0] - tonal.speech_confidence);
            }
        }
        else
        {
            // No active audio: fall back to default confidences if unset.
            if (tonal.music_confidence_count == 0)
            {
                tonal.music_confidence = .9f;
            }

            if (tonal.speech_confidence_count == 0)
            {
                tonal.speech_confidence = .1f;
            }
        }
    }

    // Reset the frames-since-transition counter whenever the decision flips.
    if (tonal.last_music != ((tonal.music_prob > .5f) ? 1 : 0))
    {
        tonal.last_transition = 0;
    }

    tonal.last_music = (tonal.music_prob > .5f) ? 1 : 0;
    info.bandwidth = bandwidth;
    info.noisiness = frame_noisiness;
    info.valid = 1;
}