internal static void run_analysis <T>(TonalityAnalysisState analysis, CeltMode celt_mode, T[] analysis_pcm, int analysis_pcm_ptr, int analysis_frame_size, int frame_size, int c1, int c2, int C, int Fs, int lsb_depth, Downmix.downmix_func <T> downmix, AnalysisInfo analysis_info) { int offset; int pcm_len; if (analysis_pcm != null) { /* Avoid overflow/wrap-around of the analysis buffer */ analysis_frame_size = Inlines.IMIN((OpusConstants.DETECT_SIZE - 5) * Fs / 100, analysis_frame_size); pcm_len = analysis_frame_size - analysis.analysis_offset; offset = analysis.analysis_offset; do { tonality_analysis(analysis, celt_mode, analysis_pcm, analysis_pcm_ptr, Inlines.IMIN(480, pcm_len), offset, c1, c2, C, lsb_depth, downmix); offset += 480; pcm_len -= 480; } while (pcm_len > 0); analysis.analysis_offset = analysis_frame_size; analysis.analysis_offset -= frame_size; } analysis_info.valid = 0; tonality_get_info(analysis, analysis_info, frame_size); }
internal static void tonality_analysis_init(TonalityAnalysisState tonal) { tonal.Reset(); }
/// <summary> /// /// </summary> /// <typeparam name="T">The type of signal being handled (either short or float) - changes based on which API is used</typeparam> /// <param name="tonal"></param> /// <param name="celt_mode"></param> /// <param name="x"></param> /// <param name="len"></param> /// <param name="offset"></param> /// <param name="c1"></param> /// <param name="c2"></param> /// <param name="C"></param> /// <param name="lsb_depth"></param> /// <param name="downmix"></param> internal static void tonality_analysis <T>(TonalityAnalysisState tonal, CeltMode celt_mode, T[] x, int x_ptr, int len, int offset, int c1, int c2, int C, int lsb_depth, Downmix.downmix_func <T> downmix) { int i, b; FFTState kfft; int[] input; int[] output; int N = 480, N2 = 240; float[] A = tonal.angle; float[] dA = tonal.d_angle; float[] d2A = tonal.d2_angle; float[] tonality; float[] noisiness; float[] band_tonality = new float[OpusConstants.NB_TBANDS]; float[] logE = new float[OpusConstants.NB_TBANDS]; float[] BFCC = new float[8]; float[] features = new float[25]; float frame_tonality; float max_frame_tonality; /*float tw_sum=0;*/ float frame_noisiness; float pi4 = (float)(M_PI * M_PI * M_PI * M_PI); float slope = 0; float frame_stationarity; float relativeE; float[] frame_probs = new float[2]; float alpha, alphaE, alphaE2; float frame_loudness; float bandwidth_mask; int bandwidth = 0; float maxE = 0; float noise_floor; int remaining; AnalysisInfo info; //[porting note] pointer tonal.last_transition++; alpha = 1.0f / Inlines.IMIN(20, 1 + tonal.count); alphaE = 1.0f / Inlines.IMIN(50, 1 + tonal.count); alphaE2 = 1.0f / Inlines.IMIN(1000, 1 + tonal.count); if (tonal.count < 4) { tonal.music_prob = 0.5f; } kfft = celt_mode.mdct.kfft[0]; if (tonal.count == 0) { tonal.mem_fill = 240; } downmix(x, x_ptr, tonal.inmem, tonal.mem_fill, Inlines.IMIN(len, OpusConstants.ANALYSIS_BUF_SIZE - tonal.mem_fill), offset, c1, c2, C); if (tonal.mem_fill + len < OpusConstants.ANALYSIS_BUF_SIZE) { tonal.mem_fill += len; /* Don't have enough to update the analysis */ return; } info = tonal.info[tonal.write_pos++]; if (tonal.write_pos >= OpusConstants.DETECT_SIZE) { tonal.write_pos -= OpusConstants.DETECT_SIZE; } input = new int[960]; output = new int[960]; tonality = new float[240]; noisiness = new float[240]; for (i = 0; i < N2; i++) { float w = Tables.analysis_window[i]; input[2 * i] = (int)(w * tonal.inmem[i]); input[2 * i + 1] = (int)(w * tonal.inmem[N2 + i]); input[(2 * (N - i - 1))] = (int)(w * tonal.inmem[N - i - 1]); input[(2 * (N - i - 1)) + 1] = (int)(w * tonal.inmem[N + N2 - i - 1]); } Arrays.MemMoveInt(tonal.inmem, OpusConstants.ANALYSIS_BUF_SIZE - 240, 0, 240); remaining = len - (OpusConstants.ANALYSIS_BUF_SIZE - tonal.mem_fill); downmix(x, x_ptr, tonal.inmem, 240, remaining, offset + OpusConstants.ANALYSIS_BUF_SIZE - tonal.mem_fill, c1, c2, C); tonal.mem_fill = 240 + remaining; KissFFT.opus_fft(kfft, input, output); for (i = 1; i < N2; i++) { float X1r, X2r, X1i, X2i; float angle, d_angle, d2_angle; float angle2, d_angle2, d2_angle2; float mod1, mod2, avg_mod; X1r = (float)output[2 * i] + output[2 * (N - i)]; X1i = (float)output[(2 * i) + 1] - output[2 * (N - i) + 1]; X2r = (float)output[(2 * i) + 1] + output[2 * (N - i) + 1]; X2i = (float)output[2 * (N - i)] - output[2 * i]; angle = (float)(.5f / M_PI) * fast_atan2f(X1i, X1r); d_angle = angle - A[i]; d2_angle = d_angle - dA[i]; angle2 = (float)(.5f / M_PI) * fast_atan2f(X2i, X2r); d_angle2 = angle2 - angle; d2_angle2 = d_angle2 - d_angle; mod1 = d2_angle - (float)Math.Floor(0.5f + d2_angle); noisiness[i] = Inlines.ABS16(mod1); mod1 *= mod1; mod1 *= mod1; mod2 = d2_angle2 - (float)Math.Floor(0.5f + d2_angle2); noisiness[i] += Inlines.ABS16(mod2); mod2 *= mod2; mod2 *= mod2; avg_mod = .25f * (d2A[i] + 2.0f * mod1 + mod2); tonality[i] = 1.0f / (1.0f + 40.0f * 16.0f * pi4 * avg_mod) - .015f; A[i] = angle2; dA[i] = d_angle2; d2A[i] = mod2; } frame_tonality = 0; max_frame_tonality = 0; /*tw_sum = 0;*/ info.activity = 0; frame_noisiness = 0; frame_stationarity = 0; if (tonal.count == 0) { for (b = 0; b < OpusConstants.NB_TBANDS; b++) { tonal.lowE[b] = 1e10f; tonal.highE[b] = -1e10f; } } relativeE = 0; frame_loudness = 0; for (b = 0; b < OpusConstants.NB_TBANDS; b++) { float E = 0, tE = 0, nE = 0; float L1, L2; float stationarity; for (i = Tables.tbands[b]; i < Tables.tbands[b + 1]; i++) { float binE = output[2 * i] * (float)output[2 * i] + output[2 * (N - i)] * (float)output[2 * (N - i)] + output[2 * i + 1] * (float)output[2 * i + 1] + output[2 * (N - i) + 1] * (float)output[2 * (N - i) + 1]; /* FIXME: It's probably best to change the BFCC filter initial state instead */ binE *= 5.55e-17f; E += binE; tE += binE * tonality[i]; nE += binE * 2.0f * (.5f - noisiness[i]); } tonal.E[tonal.E_count][b] = E; frame_noisiness += nE / (1e-15f + E); frame_loudness += (float)Math.Sqrt(E + 1e-10f); logE[b] = (float)Math.Log(E + 1e-10f); tonal.lowE[b] = Inlines.MIN32(logE[b], tonal.lowE[b] + 0.01f); tonal.highE[b] = Inlines.MAX32(logE[b], tonal.highE[b] - 0.1f); if (tonal.highE[b] < tonal.lowE[b] + 1.0f) { tonal.highE[b] += 0.5f; tonal.lowE[b] -= 0.5f; } relativeE += (logE[b] - tonal.lowE[b]) / (1e-15f + tonal.highE[b] - tonal.lowE[b]); L1 = L2 = 0; for (i = 0; i < OpusConstants.NB_FRAMES; i++) { L1 += (float)Math.Sqrt(tonal.E[i][b]); L2 += tonal.E[i][b]; } stationarity = Inlines.MIN16(0.99f, L1 / (float)Math.Sqrt(1e-15 + OpusConstants.NB_FRAMES * L2)); stationarity *= stationarity; stationarity *= stationarity; frame_stationarity += stationarity; /*band_tonality[b] = tE/(1e-15+E)*/ band_tonality[b] = Inlines.MAX16(tE / (1e-15f + E), stationarity * tonal.prev_band_tonality[b]); frame_tonality += band_tonality[b]; if (b >= OpusConstants.NB_TBANDS - OpusConstants.NB_TONAL_SKIP_BANDS) { frame_tonality -= band_tonality[b - OpusConstants.NB_TBANDS + OpusConstants.NB_TONAL_SKIP_BANDS]; } max_frame_tonality = Inlines.MAX16(max_frame_tonality, (1.0f + .03f * (b - OpusConstants.NB_TBANDS)) * frame_tonality); slope += band_tonality[b] * (b - 8); tonal.prev_band_tonality[b] = band_tonality[b]; } bandwidth_mask = 0; bandwidth = 0; maxE = 0; noise_floor = 5.7e-4f / (1 << (Inlines.IMAX(0, lsb_depth - 8))); noise_floor *= 1 << (15 + CeltConstants.SIG_SHIFT); noise_floor *= noise_floor; for (b = 0; b < OpusConstants.NB_TOT_BANDS; b++) { float E = 0; int band_start, band_end; /* Keep a margin of 300 Hz for aliasing */ band_start = Tables.extra_bands[b]; band_end = Tables.extra_bands[b + 1]; for (i = band_start; i < band_end; i++) { float binE = output[2 * i] * (float)output[2 * i] + output[2 * (N - i)] * (float)output[2 * (N - i)] + output[2 * i + 1] * (float)output[2 * i + 1] + output[2 * (N - i) + 1] * (float)output[2 * (N - i) + 1]; E += binE; } maxE = Inlines.MAX32(maxE, E); tonal.meanE[b] = Inlines.MAX32((1 - alphaE2) * tonal.meanE[b], E); E = Inlines.MAX32(E, tonal.meanE[b]); /* Use a simple follower with 13 dB/Bark slope for spreading function */ bandwidth_mask = Inlines.MAX32(.05f * bandwidth_mask, E); /* Consider the band "active" only if all these conditions are met: * 1) less than 10 dB below the simple follower * 2) less than 90 dB below the peak band (maximal masking possible considering * both the ATH and the loudness-dependent slope of the spreading function) * 3) above the PCM quantization noise floor */ if (E > .1 * bandwidth_mask && E * 1e9f > maxE && E > noise_floor * (band_end - band_start)) { bandwidth = b; } } if (tonal.count <= 2) { bandwidth = 20; } frame_loudness = 20 * (float)Math.Log10(frame_loudness); tonal.Etracker = Inlines.MAX32(tonal.Etracker - .03f, frame_loudness); tonal.lowECount *= (1 - alphaE); if (frame_loudness < tonal.Etracker - 30) { tonal.lowECount += alphaE; } for (i = 0; i < 8; i++) { float sum = 0; for (b = 0; b < 16; b++) { sum += Tables.dct_table[i * 16 + b] * logE[b]; } BFCC[i] = sum; } frame_stationarity /= OpusConstants.NB_TBANDS; relativeE /= OpusConstants.NB_TBANDS; if (tonal.count < 10) { relativeE = 0.5f; } frame_noisiness /= OpusConstants.NB_TBANDS; info.activity = frame_noisiness + (1 - frame_noisiness) * relativeE; frame_tonality = (max_frame_tonality / (OpusConstants.NB_TBANDS - OpusConstants.NB_TONAL_SKIP_BANDS)); frame_tonality = Inlines.MAX16(frame_tonality, tonal.prev_tonality * .8f); tonal.prev_tonality = frame_tonality; slope /= 8 * 8; info.tonality_slope = slope; tonal.E_count = (tonal.E_count + 1) % OpusConstants.NB_FRAMES; tonal.count++; info.tonality = frame_tonality; for (i = 0; i < 4; i++) { features[i] = -0.12299f * (BFCC[i] + tonal.mem[i + 24]) + 0.49195f * (tonal.mem[i] + tonal.mem[i + 16]) + 0.69693f * tonal.mem[i + 8] - 1.4349f * tonal.cmean[i]; } for (i = 0; i < 4; i++) { tonal.cmean[i] = (1 - alpha) * tonal.cmean[i] + alpha * BFCC[i]; } for (i = 0; i < 4; i++) { features[4 + i] = 0.63246f * (BFCC[i] - tonal.mem[i + 24]) + 0.31623f * (tonal.mem[i] - tonal.mem[i + 16]); } for (i = 0; i < 3; i++) { features[8 + i] = 0.53452f * (BFCC[i] + tonal.mem[i + 24]) - 0.26726f * (tonal.mem[i] + tonal.mem[i + 16]) - 0.53452f * tonal.mem[i + 8]; } if (tonal.count > 5) { for (i = 0; i < 9; i++) { tonal.std[i] = (1 - alpha) * tonal.std[i] + alpha * features[i] * features[i]; } } for (i = 0; i < 8; i++) { tonal.mem[i + 24] = tonal.mem[i + 16]; tonal.mem[i + 16] = tonal.mem[i + 8]; tonal.mem[i + 8] = tonal.mem[i]; tonal.mem[i] = BFCC[i]; } for (i = 0; i < 9; i++) { features[11 + i] = (float)Math.Sqrt(tonal.std[i]); } features[20] = info.tonality; features[21] = info.activity; features[22] = frame_stationarity; features[23] = info.tonality_slope; features[24] = tonal.lowECount; mlp.mlp_process(Tables.net, features, frame_probs); frame_probs[0] = .5f * (frame_probs[0] + 1); /* Curve fitting between the MLP probability and the actual probability */ frame_probs[0] = .01f + 1.21f * frame_probs[0] * frame_probs[0] - .23f * (float)Math.Pow(frame_probs[0], 10); /* Probability of active audio (as opposed to silence) */ frame_probs[1] = .5f * frame_probs[1] + .5f; /* Consider that silence has a 50-50 probability. */ frame_probs[0] = frame_probs[1] * frame_probs[0] + (1 - frame_probs[1]) * .5f; /*printf("%f %f ", frame_probs[0], frame_probs[1]);*/ { /* Probability of state transition */ float tau; /* Represents independence of the MLP probabilities, where * beta=1 means fully independent. */ float beta; /* Denormalized probability of speech (p0) and music (p1) after update */ float p0, p1; /* Probabilities for "all speech" and "all music" */ float s0, m0; /* Probability sum for renormalisation */ float psum; /* Instantaneous probability of speech and music, with beta pre-applied. */ float speech0; float music0; /* One transition every 3 minutes of active audio */ tau = .00005f * frame_probs[1]; beta = .05f; //if (1) { /* Adapt beta based on how "unexpected" the new prob is */ float p, q; p = Inlines.MAX16(.05f, Inlines.MIN16(.95f, frame_probs[0])); q = Inlines.MAX16(.05f, Inlines.MIN16(.95f, tonal.music_prob)); beta = .01f + .05f * Inlines.ABS16(p - q) / (p * (1 - q) + q * (1 - p)); } /* p0 and p1 are the probabilities of speech and music at this frame * using only information from previous frame and applying the * state transition model */ p0 = (1 - tonal.music_prob) * (1 - tau) + tonal.music_prob * tau; p1 = tonal.music_prob * (1 - tau) + (1 - tonal.music_prob) * tau; /* We apply the current probability with exponent beta to work around * the fact that the probability estimates aren't independent. */ p0 *= (float)Math.Pow(1 - frame_probs[0], beta); p1 *= (float)Math.Pow(frame_probs[0], beta); /* Normalise the probabilities to get the Marokv probability of music. */ tonal.music_prob = p1 / (p0 + p1); info.music_prob = tonal.music_prob; /* This chunk of code deals with delayed decision. */ psum = 1e-20f; /* Instantaneous probability of speech and music, with beta pre-applied. */ speech0 = (float)Math.Pow(1 - frame_probs[0], beta); music0 = (float)Math.Pow(frame_probs[0], beta); if (tonal.count == 1) { tonal.pspeech[0] = 0.5f; tonal.pmusic[0] = 0.5f; } /* Updated probability of having only speech (s0) or only music (m0), * before considering the new observation. */ s0 = tonal.pspeech[0] + tonal.pspeech[1]; m0 = tonal.pmusic[0] + tonal.pmusic[1]; /* Updates s0 and m0 with instantaneous probability. */ tonal.pspeech[0] = s0 * (1 - tau) * speech0; tonal.pmusic[0] = m0 * (1 - tau) * music0; /* Propagate the transition probabilities */ for (i = 1; i < OpusConstants.DETECT_SIZE - 1; i++) { tonal.pspeech[i] = tonal.pspeech[i + 1] * speech0; tonal.pmusic[i] = tonal.pmusic[i + 1] * music0; } /* Probability that the latest frame is speech, when all the previous ones were music. */ tonal.pspeech[OpusConstants.DETECT_SIZE - 1] = m0 * tau * speech0; /* Probability that the latest frame is music, when all the previous ones were speech. */ tonal.pmusic[OpusConstants.DETECT_SIZE - 1] = s0 * tau * music0; /* Renormalise probabilities to 1 */ for (i = 0; i < OpusConstants.DETECT_SIZE; i++) { psum += tonal.pspeech[i] + tonal.pmusic[i]; } psum = 1.0f / psum; for (i = 0; i < OpusConstants.DETECT_SIZE; i++) { tonal.pspeech[i] *= psum; tonal.pmusic[i] *= psum; } psum = tonal.pmusic[0]; for (i = 1; i < OpusConstants.DETECT_SIZE; i++) { psum += tonal.pspeech[i]; } /* Estimate our confidence in the speech/music decisions */ if (frame_probs[1] > .75) { if (tonal.music_prob > .9) { float adapt; adapt = 1.0f / (++tonal.music_confidence_count); tonal.music_confidence_count = Inlines.IMIN(tonal.music_confidence_count, 500); tonal.music_confidence += adapt * Inlines.MAX16(-.2f, frame_probs[0] - tonal.music_confidence); } if (tonal.music_prob < .1) { float adapt; adapt = 1.0f / (++tonal.speech_confidence_count); tonal.speech_confidence_count = Inlines.IMIN(tonal.speech_confidence_count, 500); tonal.speech_confidence += adapt * Inlines.MIN16(.2f, frame_probs[0] - tonal.speech_confidence); } } else { if (tonal.music_confidence_count == 0) { tonal.music_confidence = .9f; } if (tonal.speech_confidence_count == 0) { tonal.speech_confidence = .1f; } } } if (tonal.last_music != ((tonal.music_prob > .5f) ? 1 : 0)) { tonal.last_transition = 0; } tonal.last_music = (tonal.music_prob > .5f) ? 1 : 0; info.bandwidth = bandwidth; info.noisiness = frame_noisiness; info.valid = 1; }
internal static void tonality_get_info(TonalityAnalysisState tonal, AnalysisInfo info_out, int len) { int pos; int curr_lookahead; float psum; int i; pos = tonal.read_pos; curr_lookahead = tonal.write_pos - tonal.read_pos; if (curr_lookahead < 0) { curr_lookahead += OpusConstants.DETECT_SIZE; } if (len > 480 && pos != tonal.write_pos) { pos++; if (pos == OpusConstants.DETECT_SIZE) { pos = 0; } } if (pos == tonal.write_pos) { pos--; } if (pos < 0) { pos = OpusConstants.DETECT_SIZE - 1; } info_out.Assign(tonal.info[pos]); tonal.read_subframe += len / 120; while (tonal.read_subframe >= 4) { tonal.read_subframe -= 4; tonal.read_pos++; } if (tonal.read_pos >= OpusConstants.DETECT_SIZE) { tonal.read_pos -= OpusConstants.DETECT_SIZE; } /* Compensate for the delay in the features themselves. * FIXME: Need a better estimate the 10 I just made up */ curr_lookahead = Inlines.IMAX(curr_lookahead - 10, 0); psum = 0; /* Summing the probability of transition patterns that involve music at * time (DETECT_SIZE-curr_lookahead-1) */ for (i = 0; i < OpusConstants.DETECT_SIZE - curr_lookahead; i++) { psum += tonal.pmusic[i]; } for (; i < OpusConstants.DETECT_SIZE; i++) { psum += tonal.pspeech[i]; } psum = psum * tonal.music_confidence + (1 - psum) * tonal.speech_confidence; /*printf("%f %f %f\n", psum, info_out.music_prob, info_out.tonality);*/ info_out.music_prob = psum; }