/// <summary> /// Encode frame with Silk /// Note: if prefillFlag is set, the input must contain 10 ms of audio, irrespective of what /// encControl.payloadSize_ms is set to /// </summary> /// <param name="psEnc">I/O State</param> /// <param name="encControl">I Control status</param> /// <param name="samplesIn">I Speech sample input vector</param> /// <param name="nSamplesIn">I Number of samples in input vector</param> /// <param name="psRangeEnc">I/O Compressor data structure</param> /// <param name="nBytesOut">I/O Number of bytes in payload (input: Max bytes)</param> /// <param name="prefillFlag">I Flag to indicate prefilling buffers no coding</param> /// <returns>error code</returns> internal static int silk_Encode( SilkEncoder psEnc, EncControlState encControl, short[] samplesIn, int nSamplesIn, EntropyCoder psRangeEnc, BoxedValueInt nBytesOut, int prefillFlag) { int ret = SilkError.SILK_NO_ERROR; int n, i, nBits, flags, tmp_payloadSize_ms = 0, tmp_complexity = 0; int nSamplesToBuffer, nSamplesToBufferMax, nBlocksOf10ms; int nSamplesFromInput = 0, nSamplesFromInputMax; int speech_act_thr_for_switch_Q8; int TargetRate_bps, channelRate_bps, LBRR_symbol, sum; int[] MStargetRates_bps = new int[2]; short[] buf; int transition, curr_block, tot_blocks; nBytesOut.Val = 0; if (encControl.reducedDependency != 0) { psEnc.state_Fxx[0].first_frame_after_reset = 1; psEnc.state_Fxx[1].first_frame_after_reset = 1; } psEnc.state_Fxx[0].nFramesEncoded = psEnc.state_Fxx[1].nFramesEncoded = 0; /* Check values in encoder control structure */ ret += encControl.check_control_input(); if (ret != SilkError.SILK_NO_ERROR) { Inlines.OpusAssert(false); return(ret); } encControl.switchReady = 0; if (encControl.nChannelsInternal > psEnc.nChannelsInternal) { /* Mono . Stereo transition: init state of second channel and stereo state */ ret += SilkEncoder.silk_init_encoder(psEnc.state_Fxx[1]); Arrays.MemSetShort(psEnc.sStereo.pred_prev_Q13, 0, 2); Arrays.MemSetShort(psEnc.sStereo.sSide, 0, 2); psEnc.sStereo.mid_side_amp_Q0[0] = 0; psEnc.sStereo.mid_side_amp_Q0[1] = 1; psEnc.sStereo.mid_side_amp_Q0[2] = 0; psEnc.sStereo.mid_side_amp_Q0[3] = 1; psEnc.sStereo.width_prev_Q14 = 0; psEnc.sStereo.smth_width_Q14 = (short)(((int)((1.0f) * ((long)1 << (14)) + 0.5)) /*Inlines.SILK_CONST(1.0f, 14)*/); if (psEnc.nChannelsAPI == 2) { psEnc.state_Fxx[1].resampler_state.Assign(psEnc.state_Fxx[0].resampler_state); Array.Copy(psEnc.state_Fxx[0].In_HP_State, psEnc.state_Fxx[1].In_HP_State, 2); } } transition = ((encControl.payloadSize_ms != psEnc.state_Fxx[0].PacketSize_ms) || (psEnc.nChannelsInternal != encControl.nChannelsInternal)) ? 1 : 0; psEnc.nChannelsAPI = encControl.nChannelsAPI; psEnc.nChannelsInternal = encControl.nChannelsInternal; nBlocksOf10ms = Inlines.silk_DIV32(100 * nSamplesIn, encControl.API_sampleRate); tot_blocks = (nBlocksOf10ms > 1) ? nBlocksOf10ms >> 1 : 1; curr_block = 0; if (prefillFlag != 0) { /* Only accept input length of 10 ms */ if (nBlocksOf10ms != 1) { Inlines.OpusAssert(false); return(SilkError.SILK_ENC_INPUT_INVALID_NO_OF_SAMPLES); } /* Reset Encoder */ for (n = 0; n < encControl.nChannelsInternal; n++) { ret += SilkEncoder.silk_init_encoder(psEnc.state_Fxx[n]); Inlines.OpusAssert(ret == SilkError.SILK_NO_ERROR); } tmp_payloadSize_ms = encControl.payloadSize_ms; encControl.payloadSize_ms = 10; tmp_complexity = encControl.complexity; encControl.complexity = 0; for (n = 0; n < encControl.nChannelsInternal; n++) { psEnc.state_Fxx[n].controlled_since_last_payload = 0; psEnc.state_Fxx[n].prefillFlag = 1; } } else { /* Only accept input lengths that are a multiple of 10 ms */ if (nBlocksOf10ms * encControl.API_sampleRate != 100 * nSamplesIn || nSamplesIn < 0) { Inlines.OpusAssert(false); return(SilkError.SILK_ENC_INPUT_INVALID_NO_OF_SAMPLES); } /* Make sure no more than one packet can be produced */ if (1000 * (int)nSamplesIn > encControl.payloadSize_ms * encControl.API_sampleRate) { Inlines.OpusAssert(false); return(SilkError.SILK_ENC_INPUT_INVALID_NO_OF_SAMPLES); } } TargetRate_bps = Inlines.silk_RSHIFT32(encControl.bitRate, encControl.nChannelsInternal - 1); for (n = 0; n < encControl.nChannelsInternal; n++) { /* Force the side channel to the same rate as the mid */ int force_fs_kHz = (n == 1) ? psEnc.state_Fxx[0].fs_kHz : 0; ret += psEnc.state_Fxx[n].silk_control_encoder(encControl, TargetRate_bps, psEnc.allowBandwidthSwitch, n, force_fs_kHz); if (ret != SilkError.SILK_NO_ERROR) { Inlines.OpusAssert(false); return(ret); } if (psEnc.state_Fxx[n].first_frame_after_reset != 0 || transition != 0) { for (i = 0; i < psEnc.state_Fxx[0].nFramesPerPacket; i++) { psEnc.state_Fxx[n].LBRR_flags[i] = 0; } } psEnc.state_Fxx[n].inDTX = psEnc.state_Fxx[n].useDTX; } Inlines.OpusAssert(encControl.nChannelsInternal == 1 || psEnc.state_Fxx[0].fs_kHz == psEnc.state_Fxx[1].fs_kHz); /* Input buffering/resampling and encoding */ nSamplesToBufferMax = 10 * nBlocksOf10ms * psEnc.state_Fxx[0].fs_kHz; nSamplesFromInputMax = Inlines.silk_DIV32_16(nSamplesToBufferMax * psEnc.state_Fxx[0].API_fs_Hz, (short)(psEnc.state_Fxx[0].fs_kHz * 1000)); buf = new short[nSamplesFromInputMax]; int samplesIn_ptr = 0; while (true) { nSamplesToBuffer = psEnc.state_Fxx[0].frame_length - psEnc.state_Fxx[0].inputBufIx; nSamplesToBuffer = Inlines.silk_min(nSamplesToBuffer, nSamplesToBufferMax); nSamplesFromInput = Inlines.silk_DIV32_16(nSamplesToBuffer * psEnc.state_Fxx[0].API_fs_Hz, psEnc.state_Fxx[0].fs_kHz * 1000); /* Resample and write to buffer */ if (encControl.nChannelsAPI == 2 && encControl.nChannelsInternal == 2) { int id = psEnc.state_Fxx[0].nFramesEncoded; for (n = 0; n < nSamplesFromInput; n++) { buf[n] = samplesIn[samplesIn_ptr + (2 * n)]; } /* Making sure to start both resamplers from the same state when switching from mono to stereo */ if (psEnc.nPrevChannelsInternal == 1 && id == 0) { //silk_memcpy(&psEnc.state_Fxx[1].resampler_state, &psEnc.state_Fxx[0].resampler_state, sizeof(psEnc.state_Fxx[1].resampler_state)); psEnc.state_Fxx[1].resampler_state.Assign(psEnc.state_Fxx[0].resampler_state); } ret += Resampler.silk_resampler( psEnc.state_Fxx[0].resampler_state, psEnc.state_Fxx[0].inputBuf, psEnc.state_Fxx[0].inputBufIx + 2, buf, 0, nSamplesFromInput); psEnc.state_Fxx[0].inputBufIx += nSamplesToBuffer; nSamplesToBuffer = psEnc.state_Fxx[1].frame_length - psEnc.state_Fxx[1].inputBufIx; nSamplesToBuffer = Inlines.silk_min(nSamplesToBuffer, 10 * nBlocksOf10ms * psEnc.state_Fxx[1].fs_kHz); for (n = 0; n < nSamplesFromInput; n++) { buf[n] = samplesIn[samplesIn_ptr + (2 * n) + 1]; } ret += Resampler.silk_resampler( psEnc.state_Fxx[1].resampler_state, psEnc.state_Fxx[1].inputBuf, psEnc.state_Fxx[1].inputBufIx + 2, buf, 0, nSamplesFromInput); psEnc.state_Fxx[1].inputBufIx += nSamplesToBuffer; } else if (encControl.nChannelsAPI == 2 && encControl.nChannelsInternal == 1) { /* Combine left and right channels before resampling */ for (n = 0; n < nSamplesFromInput; n++) { sum = samplesIn[samplesIn_ptr + (2 * n)] + samplesIn[samplesIn_ptr + (2 * n) + 1]; buf[n] = (short)Inlines.silk_RSHIFT_ROUND(sum, 1); } ret += Resampler.silk_resampler( psEnc.state_Fxx[0].resampler_state, psEnc.state_Fxx[0].inputBuf, psEnc.state_Fxx[0].inputBufIx + 2, buf, 0, nSamplesFromInput); /* On the first mono frame, average the results for the two resampler states */ if (psEnc.nPrevChannelsInternal == 2 && psEnc.state_Fxx[0].nFramesEncoded == 0) { ret += Resampler.silk_resampler( psEnc.state_Fxx[1].resampler_state, psEnc.state_Fxx[1].inputBuf, psEnc.state_Fxx[1].inputBufIx + 2, buf, 0, nSamplesFromInput); for (n = 0; n < psEnc.state_Fxx[0].frame_length; n++) { psEnc.state_Fxx[0].inputBuf[psEnc.state_Fxx[0].inputBufIx + n + 2] = (short)(Inlines.silk_RSHIFT(psEnc.state_Fxx[0].inputBuf[psEnc.state_Fxx[0].inputBufIx + n + 2] + psEnc.state_Fxx[1].inputBuf[psEnc.state_Fxx[1].inputBufIx + n + 2], 1)); } } psEnc.state_Fxx[0].inputBufIx += nSamplesToBuffer; } else { Inlines.OpusAssert(encControl.nChannelsAPI == 1 && encControl.nChannelsInternal == 1); Array.Copy(samplesIn, samplesIn_ptr, buf, 0, nSamplesFromInput); ret += Resampler.silk_resampler( psEnc.state_Fxx[0].resampler_state, psEnc.state_Fxx[0].inputBuf, psEnc.state_Fxx[0].inputBufIx + 2, buf, 0, nSamplesFromInput); psEnc.state_Fxx[0].inputBufIx += nSamplesToBuffer; } samplesIn_ptr += (nSamplesFromInput * encControl.nChannelsAPI); nSamplesIn -= nSamplesFromInput; /* Default */ psEnc.allowBandwidthSwitch = 0; /* Silk encoder */ if (psEnc.state_Fxx[0].inputBufIx >= psEnc.state_Fxx[0].frame_length) { /* Enough data in input buffer, so encode */ Inlines.OpusAssert(psEnc.state_Fxx[0].inputBufIx == psEnc.state_Fxx[0].frame_length); Inlines.OpusAssert(encControl.nChannelsInternal == 1 || psEnc.state_Fxx[1].inputBufIx == psEnc.state_Fxx[1].frame_length); /* Deal with LBRR data */ if (psEnc.state_Fxx[0].nFramesEncoded == 0 && prefillFlag == 0) { /* Create space at start of payload for VAD and FEC flags */ byte[] iCDF = { 0, 0 }; iCDF[0] = (byte)(256 - Inlines.silk_RSHIFT(256, (psEnc.state_Fxx[0].nFramesPerPacket + 1) * encControl.nChannelsInternal)); psRangeEnc.enc_icdf(0, iCDF, 8); /* Encode any LBRR data from previous packet */ /* Encode LBRR flags */ for (n = 0; n < encControl.nChannelsInternal; n++) { LBRR_symbol = 0; for (i = 0; i < psEnc.state_Fxx[n].nFramesPerPacket; i++) { LBRR_symbol |= Inlines.silk_LSHIFT(psEnc.state_Fxx[n].LBRR_flags[i], i); } psEnc.state_Fxx[n].LBRR_flag = (sbyte)(LBRR_symbol > 0 ? 1 : 0); if (LBRR_symbol != 0 && psEnc.state_Fxx[n].nFramesPerPacket > 1) { psRangeEnc.enc_icdf(LBRR_symbol - 1, Tables.silk_LBRR_flags_iCDF_ptr[psEnc.state_Fxx[n].nFramesPerPacket - 2], 8); } } /* Code LBRR indices and excitation signals */ for (i = 0; i < psEnc.state_Fxx[0].nFramesPerPacket; i++) { for (n = 0; n < encControl.nChannelsInternal; n++) { if (psEnc.state_Fxx[n].LBRR_flags[i] != 0) { int condCoding; if (encControl.nChannelsInternal == 2 && n == 0) { Stereo.silk_stereo_encode_pred(psRangeEnc, psEnc.sStereo.predIx[i]); /* For LBRR data there's no need to code the mid-only flag if the side-channel LBRR flag is set */ if (psEnc.state_Fxx[1].LBRR_flags[i] == 0) { Stereo.silk_stereo_encode_mid_only(psRangeEnc, psEnc.sStereo.mid_only_flags[i]); } } /* Use conditional coding if previous frame available */ if (i > 0 && psEnc.state_Fxx[n].LBRR_flags[i - 1] != 0) { condCoding = SilkConstants.CODE_CONDITIONALLY; } else { condCoding = SilkConstants.CODE_INDEPENDENTLY; } EncodeIndices.silk_encode_indices(psEnc.state_Fxx[n], psRangeEnc, i, 1, condCoding); EncodePulses.silk_encode_pulses(psRangeEnc, psEnc.state_Fxx[n].indices_LBRR[i].signalType, psEnc.state_Fxx[n].indices_LBRR[i].quantOffsetType, psEnc.state_Fxx[n].pulses_LBRR[i], psEnc.state_Fxx[n].frame_length); } } } /* Reset LBRR flags */ for (n = 0; n < encControl.nChannelsInternal; n++) { Arrays.MemSetInt(psEnc.state_Fxx[n].LBRR_flags, 0, SilkConstants.MAX_FRAMES_PER_PACKET); } psEnc.nBitsUsedLBRR = psRangeEnc.tell(); } HPVariableCutoff.silk_HP_variable_cutoff(psEnc.state_Fxx); /* Total target bits for packet */ nBits = Inlines.silk_DIV32_16(Inlines.silk_MUL(encControl.bitRate, encControl.payloadSize_ms), 1000); /* Subtract bits used for LBRR */ if (prefillFlag == 0) { nBits -= psEnc.nBitsUsedLBRR; } /* Divide by number of uncoded frames left in packet */ nBits = Inlines.silk_DIV32_16(nBits, psEnc.state_Fxx[0].nFramesPerPacket); /* Convert to bits/second */ if (encControl.payloadSize_ms == 10) { TargetRate_bps = Inlines.silk_SMULBB(nBits, 100); } else { TargetRate_bps = Inlines.silk_SMULBB(nBits, 50); } /* Subtract fraction of bits in excess of target in previous frames and packets */ TargetRate_bps -= Inlines.silk_DIV32_16(Inlines.silk_MUL(psEnc.nBitsExceeded, 1000), TuningParameters.BITRESERVOIR_DECAY_TIME_MS); if (prefillFlag == 0 && psEnc.state_Fxx[0].nFramesEncoded > 0) { /* Compare actual vs target bits so far in this packet */ int bitsBalance = psRangeEnc.tell() - psEnc.nBitsUsedLBRR - nBits * psEnc.state_Fxx[0].nFramesEncoded; TargetRate_bps -= Inlines.silk_DIV32_16(Inlines.silk_MUL(bitsBalance, 1000), TuningParameters.BITRESERVOIR_DECAY_TIME_MS); } /* Never exceed input bitrate */ TargetRate_bps = Inlines.silk_LIMIT(TargetRate_bps, encControl.bitRate, 5000); /* Convert Left/Right to Mid/Side */ if (encControl.nChannelsInternal == 2) { BoxedValueSbyte midOnlyFlagBoxed = new BoxedValueSbyte(psEnc.sStereo.mid_only_flags[psEnc.state_Fxx[0].nFramesEncoded]); Stereo.silk_stereo_LR_to_MS(psEnc.sStereo, psEnc.state_Fxx[0].inputBuf, 2, psEnc.state_Fxx[1].inputBuf, 2, psEnc.sStereo.predIx[psEnc.state_Fxx[0].nFramesEncoded], midOnlyFlagBoxed, MStargetRates_bps, TargetRate_bps, psEnc.state_Fxx[0].speech_activity_Q8, encControl.toMono, psEnc.state_Fxx[0].fs_kHz, psEnc.state_Fxx[0].frame_length); psEnc.sStereo.mid_only_flags[psEnc.state_Fxx[0].nFramesEncoded] = midOnlyFlagBoxed.Val; if (midOnlyFlagBoxed.Val == 0) { /* Reset side channel encoder memory for first frame with side coding */ if (psEnc.prev_decode_only_middle == 1) { psEnc.state_Fxx[1].sShape.Reset(); psEnc.state_Fxx[1].sPrefilt.Reset(); psEnc.state_Fxx[1].sNSQ.Reset(); Arrays.MemSetShort(psEnc.state_Fxx[1].prev_NLSFq_Q15, 0, SilkConstants.MAX_LPC_ORDER); Arrays.MemSetInt(psEnc.state_Fxx[1].sLP.In_LP_State, 0, 2); psEnc.state_Fxx[1].prevLag = 100; psEnc.state_Fxx[1].sNSQ.lagPrev = 100; psEnc.state_Fxx[1].sShape.LastGainIndex = 10; psEnc.state_Fxx[1].prevSignalType = SilkConstants.TYPE_NO_VOICE_ACTIVITY; psEnc.state_Fxx[1].sNSQ.prev_gain_Q16 = 65536; psEnc.state_Fxx[1].first_frame_after_reset = 1; } psEnc.state_Fxx[1].silk_encode_do_VAD(); } else { psEnc.state_Fxx[1].VAD_flags[psEnc.state_Fxx[0].nFramesEncoded] = 0; } if (prefillFlag == 0) { Stereo.silk_stereo_encode_pred(psRangeEnc, psEnc.sStereo.predIx[psEnc.state_Fxx[0].nFramesEncoded]); if (psEnc.state_Fxx[1].VAD_flags[psEnc.state_Fxx[0].nFramesEncoded] == 0) { Stereo.silk_stereo_encode_mid_only(psRangeEnc, psEnc.sStereo.mid_only_flags[psEnc.state_Fxx[0].nFramesEncoded]); } } } else { /* Buffering */ Array.Copy(psEnc.sStereo.sMid, psEnc.state_Fxx[0].inputBuf, 2); Array.Copy(psEnc.state_Fxx[0].inputBuf, psEnc.state_Fxx[0].frame_length, psEnc.sStereo.sMid, 0, 2); } psEnc.state_Fxx[0].silk_encode_do_VAD(); /* Encode */ for (n = 0; n < encControl.nChannelsInternal; n++) { int maxBits, useCBR; /* Handling rate constraints */ maxBits = encControl.maxBits; if (tot_blocks == 2 && curr_block == 0) { maxBits = maxBits * 3 / 5; } else if (tot_blocks == 3) { if (curr_block == 0) { maxBits = maxBits * 2 / 5; } else if (curr_block == 1) { maxBits = maxBits * 3 / 4; } } useCBR = (encControl.useCBR != 0 && curr_block == tot_blocks - 1) ? 1 : 0; if (encControl.nChannelsInternal == 1) { channelRate_bps = TargetRate_bps; } else { channelRate_bps = MStargetRates_bps[n]; if (n == 0 && MStargetRates_bps[1] > 0) { useCBR = 0; /* Give mid up to 1/2 of the max bits for that frame */ maxBits -= encControl.maxBits / (tot_blocks * 2); } } if (channelRate_bps > 0) { int condCoding; psEnc.state_Fxx[n].silk_control_SNR(channelRate_bps); /* Use independent coding if no previous frame available */ if (psEnc.state_Fxx[0].nFramesEncoded - n <= 0) { condCoding = SilkConstants.CODE_INDEPENDENTLY; } else if (n > 0 && psEnc.prev_decode_only_middle != 0) { /* If we skipped a side frame in this packet, we don't * need LTP scaling; the LTP state is well-defined. */ condCoding = SilkConstants.CODE_INDEPENDENTLY_NO_LTP_SCALING; } else { condCoding = SilkConstants.CODE_CONDITIONALLY; } ret += psEnc.state_Fxx[n].silk_encode_frame(nBytesOut, psRangeEnc, condCoding, maxBits, useCBR); Inlines.OpusAssert(ret == SilkError.SILK_NO_ERROR); } psEnc.state_Fxx[n].controlled_since_last_payload = 0; psEnc.state_Fxx[n].inputBufIx = 0; psEnc.state_Fxx[n].nFramesEncoded++; } psEnc.prev_decode_only_middle = psEnc.sStereo.mid_only_flags[psEnc.state_Fxx[0].nFramesEncoded - 1]; /* Insert VAD and FEC flags at beginning of bitstream */ if (nBytesOut.Val > 0 && psEnc.state_Fxx[0].nFramesEncoded == psEnc.state_Fxx[0].nFramesPerPacket) { flags = 0; for (n = 0; n < encControl.nChannelsInternal; n++) { for (i = 0; i < psEnc.state_Fxx[n].nFramesPerPacket; i++) { flags = Inlines.silk_LSHIFT(flags, 1); flags |= (int)psEnc.state_Fxx[n].VAD_flags[i]; } flags = Inlines.silk_LSHIFT(flags, 1); flags |= (int)psEnc.state_Fxx[n].LBRR_flag; } if (prefillFlag == 0) { psRangeEnc.enc_patch_initial_bits((uint)flags, (uint)((psEnc.state_Fxx[0].nFramesPerPacket + 1) * encControl.nChannelsInternal)); } /* Return zero bytes if all channels DTXed */ if (psEnc.state_Fxx[0].inDTX != 0 && (encControl.nChannelsInternal == 1 || psEnc.state_Fxx[1].inDTX != 0)) { nBytesOut.Val = 0; } psEnc.nBitsExceeded += nBytesOut.Val * 8; psEnc.nBitsExceeded -= Inlines.silk_DIV32_16(Inlines.silk_MUL(encControl.bitRate, encControl.payloadSize_ms), 1000); psEnc.nBitsExceeded = Inlines.silk_LIMIT(psEnc.nBitsExceeded, 0, 10000); /* Update flag indicating if bandwidth switching is allowed */ speech_act_thr_for_switch_Q8 = Inlines.silk_SMLAWB(((int)((TuningParameters.SPEECH_ACTIVITY_DTX_THRES) * ((long)1 << (8)) + 0.5)) /*Inlines.SILK_CONST(TuningParameters.SPEECH_ACTIVITY_DTX_THRES, 8)*/, ((int)(((1 - TuningParameters.SPEECH_ACTIVITY_DTX_THRES) / TuningParameters.MAX_BANDWIDTH_SWITCH_DELAY_MS) * ((long)1 << (16 + 8)) + 0.5)) /*Inlines.SILK_CONST((1 - TuningParameters.SPEECH_ACTIVITY_DTX_THRES) / TuningParameters.MAX_BANDWIDTH_SWITCH_DELAY_MS, 16 + 8)*/, psEnc.timeSinceSwitchAllowed_ms); if (psEnc.state_Fxx[0].speech_activity_Q8 < speech_act_thr_for_switch_Q8) { psEnc.allowBandwidthSwitch = 1; psEnc.timeSinceSwitchAllowed_ms = 0; } else { psEnc.allowBandwidthSwitch = 0; psEnc.timeSinceSwitchAllowed_ms += encControl.payloadSize_ms; } } if (nSamplesIn == 0) { break; } } else { break; } curr_block++; } psEnc.nPrevChannelsInternal = encControl.nChannelsInternal; encControl.allowBandwidthSwitch = psEnc.allowBandwidthSwitch; encControl.inWBmodeWithoutVariableLP = (psEnc.state_Fxx[0].fs_kHz == 16 && psEnc.state_Fxx[0].sLP.mode == 0) ? 1 : 0; encControl.internalSampleRate = Inlines.silk_SMULBB(psEnc.state_Fxx[0].fs_kHz, 1000); encControl.stereoWidth_Q14 = encControl.toMono != 0 ? 0 : psEnc.sStereo.smth_width_Q14; if (prefillFlag != 0) { encControl.payloadSize_ms = tmp_payloadSize_ms; encControl.complexity = tmp_complexity; for (n = 0; n < encControl.nChannelsInternal; n++) { psEnc.state_Fxx[n].controlled_since_last_payload = 0; psEnc.state_Fxx[n].prefillFlag = 0; } } return(ret); }
/* Decode a frame */ internal static int silk_Decode( /* O Returns error code */ SilkDecoder psDec, /* I/O State */ DecControlState decControl, /* I/O Control Structure */ int lostFlag, /* I 0: no loss, 1 loss, 2 decode fec */ int newPacketFlag, /* I Indicates first decoder call for this packet */ EntropyCoder psRangeDec, /* I/O Compressor data structure */ short[] samplesOut, /* O Decoded output speech vector */ int samplesOut_ptr, out int nSamplesOut /* O Number of samples decoded */ ) { int i, n, decode_only_middle = 0, ret = SilkError.SILK_NO_ERROR; int LBRR_symbol; BoxedValueInt nSamplesOutDec = new BoxedValueInt(); short[] samplesOut_tmp; int[] samplesOut_tmp_ptrs = new int[2]; short[] samplesOut1_tmp_storage1; short[] samplesOut1_tmp_storage2; short[] samplesOut2_tmp; int[] MS_pred_Q13 = new int[] { 0, 0 }; short[] resample_out; int resample_out_ptr; SilkChannelDecoder[] channel_state = psDec.channel_state; int has_side; int stereo_to_mono; int delay_stack_alloc; nSamplesOut = 0; Inlines.OpusAssert(decControl.nChannelsInternal == 1 || decControl.nChannelsInternal == 2); /**********************************/ /* Test if first frame in payload */ /**********************************/ if (newPacketFlag != 0) { for (n = 0; n < decControl.nChannelsInternal; n++) { channel_state[n].nFramesDecoded = 0; /* Used to count frames in packet */ } } /* If Mono . Stereo transition in bitstream: init state of second channel */ if (decControl.nChannelsInternal > psDec.nChannelsInternal) { ret += channel_state[1].silk_init_decoder(); } stereo_to_mono = (decControl.nChannelsInternal == 1 && psDec.nChannelsInternal == 2 && (decControl.internalSampleRate == 1000 * channel_state[0].fs_kHz)) ? 1 : 0; if (channel_state[0].nFramesDecoded == 0) { for (n = 0; n < decControl.nChannelsInternal; n++) { int fs_kHz_dec; if (decControl.payloadSize_ms == 0) { /* Assuming packet loss, use 10 ms */ channel_state[n].nFramesPerPacket = 1; channel_state[n].nb_subfr = 2; } else if (decControl.payloadSize_ms == 10) { channel_state[n].nFramesPerPacket = 1; channel_state[n].nb_subfr = 2; } else if (decControl.payloadSize_ms == 20) { channel_state[n].nFramesPerPacket = 1; channel_state[n].nb_subfr = 4; } else if (decControl.payloadSize_ms == 40) { channel_state[n].nFramesPerPacket = 2; channel_state[n].nb_subfr = 4; } else if (decControl.payloadSize_ms == 60) { channel_state[n].nFramesPerPacket = 3; channel_state[n].nb_subfr = 4; } else { Inlines.OpusAssert(false); return(SilkError.SILK_DEC_INVALID_FRAME_SIZE); } fs_kHz_dec = (decControl.internalSampleRate >> 10) + 1; if (fs_kHz_dec != 8 && fs_kHz_dec != 12 && fs_kHz_dec != 16) { Inlines.OpusAssert(false); return(SilkError.SILK_DEC_INVALID_SAMPLING_FREQUENCY); } ret += channel_state[n].silk_decoder_set_fs(fs_kHz_dec, decControl.API_sampleRate); } } if (decControl.nChannelsAPI == 2 && decControl.nChannelsInternal == 2 && (psDec.nChannelsAPI == 1 || psDec.nChannelsInternal == 1)) { Arrays.MemSetShort(psDec.sStereo.pred_prev_Q13, 0, 2); Arrays.MemSetShort(psDec.sStereo.sSide, 0, 2); channel_state[1].resampler_state.Assign(channel_state[0].resampler_state); } psDec.nChannelsAPI = decControl.nChannelsAPI; psDec.nChannelsInternal = decControl.nChannelsInternal; if (decControl.API_sampleRate > (int)SilkConstants.MAX_API_FS_KHZ * 1000 || decControl.API_sampleRate < 8000) { ret = SilkError.SILK_DEC_INVALID_SAMPLING_FREQUENCY; return(ret); } if (lostFlag != DecoderAPIFlag.FLAG_PACKET_LOST && channel_state[0].nFramesDecoded == 0) { /* First decoder call for this payload */ /* Decode VAD flags and LBRR flag */ for (n = 0; n < decControl.nChannelsInternal; n++) { for (i = 0; i < channel_state[n].nFramesPerPacket; i++) { channel_state[n].VAD_flags[i] = psRangeDec.dec_bit_logp(1); } channel_state[n].LBRR_flag = psRangeDec.dec_bit_logp(1); } /* Decode LBRR flags */ for (n = 0; n < decControl.nChannelsInternal; n++) { Arrays.MemSetInt(channel_state[n].LBRR_flags, 0, SilkConstants.MAX_FRAMES_PER_PACKET); if (channel_state[n].LBRR_flag != 0) { if (channel_state[n].nFramesPerPacket == 1) { channel_state[n].LBRR_flags[0] = 1; } else { LBRR_symbol = psRangeDec.dec_icdf(Tables.silk_LBRR_flags_iCDF_ptr[channel_state[n].nFramesPerPacket - 2], 8) + 1; for (i = 0; i < channel_state[n].nFramesPerPacket; i++) { channel_state[n].LBRR_flags[i] = Inlines.silk_RSHIFT(LBRR_symbol, i) & 1; } } } } if (lostFlag == DecoderAPIFlag.FLAG_DECODE_NORMAL) { /* Regular decoding: skip all LBRR data */ for (i = 0; i < channel_state[0].nFramesPerPacket; i++) { for (n = 0; n < decControl.nChannelsInternal; n++) { if (channel_state[n].LBRR_flags[i] != 0) { short[] pulses = new short[SilkConstants.MAX_FRAME_LENGTH]; int condCoding; if (decControl.nChannelsInternal == 2 && n == 0) { Stereo.silk_stereo_decode_pred(psRangeDec, MS_pred_Q13); if (channel_state[1].LBRR_flags[i] == 0) { BoxedValueInt decodeOnlyMiddleBoxed = new BoxedValueInt(decode_only_middle); Stereo.silk_stereo_decode_mid_only(psRangeDec, decodeOnlyMiddleBoxed); decode_only_middle = decodeOnlyMiddleBoxed.Val; } } /* Use conditional coding if previous frame available */ if (i > 0 && (channel_state[n].LBRR_flags[i - 1] != 0)) { condCoding = SilkConstants.CODE_CONDITIONALLY; } else { condCoding = SilkConstants.CODE_INDEPENDENTLY; } DecodeIndices.silk_decode_indices(channel_state[n], psRangeDec, i, 1, condCoding); DecodePulses.silk_decode_pulses(psRangeDec, pulses, channel_state[n].indices.signalType, channel_state[n].indices.quantOffsetType, channel_state[n].frame_length); } } } } } /* Get MS predictor index */ if (decControl.nChannelsInternal == 2) { if (lostFlag == DecoderAPIFlag.FLAG_DECODE_NORMAL || (lostFlag == DecoderAPIFlag.FLAG_DECODE_LBRR && channel_state[0].LBRR_flags[channel_state[0].nFramesDecoded] == 1)) { Stereo.silk_stereo_decode_pred(psRangeDec, MS_pred_Q13); /* For LBRR data, decode mid-only flag only if side-channel's LBRR flag is false */ if ((lostFlag == DecoderAPIFlag.FLAG_DECODE_NORMAL && channel_state[1].VAD_flags[channel_state[0].nFramesDecoded] == 0) || (lostFlag == DecoderAPIFlag.FLAG_DECODE_LBRR && channel_state[1].LBRR_flags[channel_state[0].nFramesDecoded] == 0)) { BoxedValueInt decodeOnlyMiddleBoxed = new BoxedValueInt(decode_only_middle); Stereo.silk_stereo_decode_mid_only(psRangeDec, decodeOnlyMiddleBoxed); decode_only_middle = decodeOnlyMiddleBoxed.Val; } else { decode_only_middle = 0; } } else { for (n = 0; n < 2; n++) { MS_pred_Q13[n] = psDec.sStereo.pred_prev_Q13[n]; } } } /* Reset side channel decoder prediction memory for first frame with side coding */ if (decControl.nChannelsInternal == 2 && decode_only_middle == 0 && psDec.prev_decode_only_middle == 1) { Arrays.MemSetShort(psDec.channel_state[1].outBuf, 0, SilkConstants.MAX_FRAME_LENGTH + 2 * SilkConstants.MAX_SUB_FRAME_LENGTH); Arrays.MemSetInt(psDec.channel_state[1].sLPC_Q14_buf, 0, SilkConstants.MAX_LPC_ORDER); psDec.channel_state[1].lagPrev = 100; psDec.channel_state[1].LastGainIndex = 10; psDec.channel_state[1].prevSignalType = SilkConstants.TYPE_NO_VOICE_ACTIVITY; psDec.channel_state[1].first_frame_after_reset = 1; } /* Check if the temp buffer fits into the output PCM buffer. If it fits, * we can delay allocating the temp buffer until after the SILK peak stack * usage. We need to use a < and not a <= because of the two extra samples. */ delay_stack_alloc = (decControl.internalSampleRate * decControl.nChannelsInternal < decControl.API_sampleRate * decControl.nChannelsAPI) ? 1 : 0; if (delay_stack_alloc != 0) { samplesOut_tmp = samplesOut; samplesOut_tmp_ptrs[0] = samplesOut_ptr; samplesOut_tmp_ptrs[1] = samplesOut_ptr + channel_state[0].frame_length + 2; } else { samplesOut1_tmp_storage1 = new short[decControl.nChannelsInternal * (channel_state[0].frame_length + 2)]; samplesOut_tmp = samplesOut1_tmp_storage1; samplesOut_tmp_ptrs[0] = 0; samplesOut_tmp_ptrs[1] = channel_state[0].frame_length + 2; } if (lostFlag == DecoderAPIFlag.FLAG_DECODE_NORMAL) { has_side = (decode_only_middle == 0) ? 1 : 0; } else { has_side = (psDec.prev_decode_only_middle == 0 || (decControl.nChannelsInternal == 2 && lostFlag == DecoderAPIFlag.FLAG_DECODE_LBRR && channel_state[1].LBRR_flags[channel_state[1].nFramesDecoded] == 1)) ? 1 : 0; } /* Call decoder for one frame */ for (n = 0; n < decControl.nChannelsInternal; n++) { if (n == 0 || (has_side != 0)) { int FrameIndex; int condCoding; FrameIndex = channel_state[0].nFramesDecoded - n; /* Use independent coding if no previous frame available */ if (FrameIndex <= 0) { condCoding = SilkConstants.CODE_INDEPENDENTLY; } else if (lostFlag == DecoderAPIFlag.FLAG_DECODE_LBRR) { condCoding = (channel_state[n].LBRR_flags[FrameIndex - 1] != 0) ? SilkConstants.CODE_CONDITIONALLY : SilkConstants.CODE_INDEPENDENTLY; } else if (n > 0 && (psDec.prev_decode_only_middle != 0)) { /* If we skipped a side frame in this packet, we don't * need LTP scaling; the LTP state is well-defined. */ condCoding = SilkConstants.CODE_INDEPENDENTLY_NO_LTP_SCALING; } else { condCoding = SilkConstants.CODE_CONDITIONALLY; } ret += channel_state[n].silk_decode_frame(psRangeDec, samplesOut_tmp, samplesOut_tmp_ptrs[n] + 2, nSamplesOutDec, lostFlag, condCoding); } else { Arrays.MemSetWithOffset <short>(samplesOut_tmp, 0, samplesOut_tmp_ptrs[n] + 2, nSamplesOutDec.Val); } channel_state[n].nFramesDecoded++; } if (decControl.nChannelsAPI == 2 && decControl.nChannelsInternal == 2) { /* Convert Mid/Side to Left/Right */ Stereo.silk_stereo_MS_to_LR(psDec.sStereo, samplesOut_tmp, samplesOut_tmp_ptrs[0], samplesOut_tmp, samplesOut_tmp_ptrs[1], MS_pred_Q13, channel_state[0].fs_kHz, nSamplesOutDec.Val); } else { /* Buffering */ Array.Copy(psDec.sStereo.sMid, 0, samplesOut_tmp, samplesOut_tmp_ptrs[0], 2); Array.Copy(samplesOut_tmp, samplesOut_tmp_ptrs[0] + nSamplesOutDec.Val, psDec.sStereo.sMid, 0, 2); } /* Number of output samples */ nSamplesOut = Inlines.silk_DIV32(nSamplesOutDec.Val * decControl.API_sampleRate, Inlines.silk_SMULBB(channel_state[0].fs_kHz, 1000)); /* Set up pointers to temp buffers */ if (decControl.nChannelsAPI == 2) { samplesOut2_tmp = new short[nSamplesOut]; resample_out = samplesOut2_tmp; resample_out_ptr = 0; } else { resample_out = samplesOut; resample_out_ptr = samplesOut_ptr; } if (delay_stack_alloc != 0) { samplesOut1_tmp_storage2 = new short[decControl.nChannelsInternal * (channel_state[0].frame_length + 2)]; Array.Copy(samplesOut, samplesOut_ptr, samplesOut1_tmp_storage2, 0, decControl.nChannelsInternal * (channel_state[0].frame_length + 2)); samplesOut_tmp = samplesOut1_tmp_storage2; samplesOut_tmp_ptrs[0] = 0; samplesOut_tmp_ptrs[1] = channel_state[0].frame_length + 2; } for (n = 0; n < Inlines.silk_min(decControl.nChannelsAPI, decControl.nChannelsInternal); n++) { /* Resample decoded signal to API_sampleRate */ ret += Resampler.silk_resampler(channel_state[n].resampler_state, resample_out, resample_out_ptr, samplesOut_tmp, samplesOut_tmp_ptrs[n] + 1, nSamplesOutDec.Val); /* Interleave if stereo output and stereo stream */ if (decControl.nChannelsAPI == 2) { int nptr = samplesOut_ptr + n; for (i = 0; i < nSamplesOut; i++) { samplesOut[nptr + 2 * i] = resample_out[resample_out_ptr + i]; } } } /* Create two channel output from mono stream */ if (decControl.nChannelsAPI == 2 && decControl.nChannelsInternal == 1) { if (stereo_to_mono != 0) { /* Resample right channel for newly collapsed stereo just in case * we weren't doing collapsing when switching to mono */ ret += Resampler.silk_resampler(channel_state[1].resampler_state, resample_out, resample_out_ptr, samplesOut_tmp, samplesOut_tmp_ptrs[0] + 1, nSamplesOutDec.Val); for (i = 0; i < nSamplesOut; i++) { samplesOut[samplesOut_ptr + 1 + 2 * i] = resample_out[resample_out_ptr + i]; } } else { for (i = 0; i < nSamplesOut; i++) { samplesOut[samplesOut_ptr + 1 + 2 * i] = samplesOut[samplesOut_ptr + 2 * i]; } } } /* Export pitch lag, measured at 48 kHz sampling rate */ if (channel_state[0].prevSignalType == SilkConstants.TYPE_VOICED) { int[] mult_tab = { 6, 4, 3 }; decControl.prevPitchLag = channel_state[0].lagPrev * mult_tab[(channel_state[0].fs_kHz - 8) >> 2]; } else { decControl.prevPitchLag = 0; } if (lostFlag == DecoderAPIFlag.FLAG_PACKET_LOST) { /* On packet loss, remove the gain clamping to prevent having the energy "bounce back" * if we lose packets when the energy is going down */ for (i = 0; i < psDec.nChannelsInternal; i++) { psDec.channel_state[i].LastGainIndex = 10; } } else { psDec.prev_decode_only_middle = decode_only_middle; } return(ret); }