// Builds the full list of candidate subframe-encoding tasks for every
// (frame, channel) pair — LPC orders for each analysis window, an optional
// Constant candidate, the configured Fixed prediction orders, and Verbatim
// filler entries to pad the count per channel — then uploads the array to
// the GPU asynchronously on the task's stream.
unsafe void initializeSubframeTasks(int blocksize, int channelsCount, int nFrames, FlaCudaTask task)
{
    task.nResidualTasks = 0;
    // At most 32 LPC orders are tried per window (kernel limit, presumably —
    // TODO confirm against the cubin kernels).
    task.nTasksPerWindow = Math.Min(32, eparams.orders_per_window);
    // windows * LPC orders + 1 verbatim + optional constant + fixed orders.
    task.nResidualTasksPerChannel = _windowcount * task.nTasksPerWindow + 1 + (eparams.do_constant ? 1 : 0) + eparams.max_fixed_order - eparams.min_fixed_order;
    // Round up to a multiple of 8 so the estimate-residual kernels can use
    // fixed thread-block shapes (see threads_y selection in estimate_residual).
    if (task.nResidualTasksPerChannel >= 4)
        task.nResidualTasksPerChannel = (task.nResidualTasksPerChannel + 7) & ~7;
    task.nAutocorTasksPerChannel = _windowcount;
    for (int iFrame = 0; iFrame < nFrames; iFrame++)
    {
        for (int ch = 0; ch < channelsCount; ch++)
        {
            for (int iWindow = 0; iWindow < _windowcount; iWindow++)
            {
                // LPC tasks: one per candidate order (1..nTasksPerWindow) per window.
                for (int order = 0; order < task.nTasksPerWindow; order++)
                {
                    task.ResidualTasks[task.nResidualTasks].type = (int)SubframeType.LPC;
                    task.ResidualTasks[task.nResidualTasks].channel = ch;
                    // ch == 3 appears to be the side channel when stereo
                    // decorrelation is active; it carries one extra bit —
                    // TODO confirm against the channel-decorrelation kernel.
                    task.ResidualTasks[task.nResidualTasks].obits = (int)bits_per_sample + (channels == 2 && ch == 3 ? 1 : 0);
                    task.ResidualTasks[task.nResidualTasks].abits = task.ResidualTasks[task.nResidualTasks].obits;
                    task.ResidualTasks[task.nResidualTasks].blocksize = blocksize;
                    task.ResidualTasks[task.nResidualTasks].residualOrder = order + 1;
                    // Samples for each channel live MAX_BLOCKSIZE apart in the device buffer.
                    task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * blocksize;
                    task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs;
                    task.nResidualTasks++;
                }
            }
            // Constant frames
            if (eparams.do_constant)
            {
                task.ResidualTasks[task.nResidualTasks].type = (int)SubframeType.Constant;
                task.ResidualTasks[task.nResidualTasks].channel = ch;
                task.ResidualTasks[task.nResidualTasks].obits = (int)bits_per_sample + (channels == 2 && ch == 3 ? 1 : 0);
                task.ResidualTasks[task.nResidualTasks].abits = task.ResidualTasks[task.nResidualTasks].obits;
                task.ResidualTasks[task.nResidualTasks].blocksize = blocksize;
                task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * blocksize;
                task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs;
                // A constant subframe is modeled as order-1 prediction with coefficient 1.
                task.ResidualTasks[task.nResidualTasks].residualOrder = 1;
                task.ResidualTasks[task.nResidualTasks].shift = 0;
                task.ResidualTasks[task.nResidualTasks].coefs[0] = 1;
                task.nResidualTasks++;
            }
            // Fixed prediction: standard FLAC fixed predictor coefficients per order.
            for (int order = eparams.min_fixed_order; order <= eparams.max_fixed_order; order++)
            {
                task.ResidualTasks[task.nResidualTasks].type = (int)SubframeType.Fixed;
                task.ResidualTasks[task.nResidualTasks].channel = ch;
                task.ResidualTasks[task.nResidualTasks].obits = (int)bits_per_sample + (channels == 2 && ch == 3 ? 1 : 0);
                task.ResidualTasks[task.nResidualTasks].abits = task.ResidualTasks[task.nResidualTasks].obits;
                task.ResidualTasks[task.nResidualTasks].blocksize = blocksize;
                task.ResidualTasks[task.nResidualTasks].residualOrder = order;
                task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * blocksize;
                task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs;
                task.ResidualTasks[task.nResidualTasks].shift = 0;
                // Binomial coefficients of the fixed predictors, stored highest lag first
                // in coefs[order-1]..coefs[0].
                switch (order)
                {
                    case 0:
                        break;
                    case 1:
                        task.ResidualTasks[task.nResidualTasks].coefs[0] = 1;
                        break;
                    case 2:
                        task.ResidualTasks[task.nResidualTasks].coefs[1] = 2;
                        task.ResidualTasks[task.nResidualTasks].coefs[0] = -1;
                        break;
                    case 3:
                        task.ResidualTasks[task.nResidualTasks].coefs[2] = 3;
                        task.ResidualTasks[task.nResidualTasks].coefs[1] = -3;
                        task.ResidualTasks[task.nResidualTasks].coefs[0] = 1;
                        break;
                    case 4:
                        task.ResidualTasks[task.nResidualTasks].coefs[3] = 4;
                        task.ResidualTasks[task.nResidualTasks].coefs[2] = -6;
                        task.ResidualTasks[task.nResidualTasks].coefs[1] = 4;
                        task.ResidualTasks[task.nResidualTasks].coefs[0] = -1;
                        break;
                }
                task.nResidualTasks++;
            }
            // Filler: pad each channel's task list to exactly nResidualTasksPerChannel
            // entries with Verbatim placeholders so per-channel indexing stays uniform.
            while ((task.nResidualTasks % task.nResidualTasksPerChannel) != 0)
            {
                task.ResidualTasks[task.nResidualTasks].type = (int)SubframeType.Verbatim;
                task.ResidualTasks[task.nResidualTasks].channel = ch;
                task.ResidualTasks[task.nResidualTasks].obits = (int)bits_per_sample + (channels == 2 && ch == 3 ? 1 : 0);
                task.ResidualTasks[task.nResidualTasks].abits = task.ResidualTasks[task.nResidualTasks].obits;
                task.ResidualTasks[task.nResidualTasks].blocksize = blocksize;
                task.ResidualTasks[task.nResidualTasks].residualOrder = 0;
                task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * blocksize;
                task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs;
                task.ResidualTasks[task.nResidualTasks].shift = 0;
                task.nResidualTasks++;
            }
        }
    }
    // Guard against overrunning the pinned host/device task buffer.
    if (sizeof(FlaCudaSubframeTask) * task.nResidualTasks > task.residualTasksLen)
        throw new Exception("oops");
    cuda.CopyHostToDeviceAsync(task.cudaResidualTasks, task.residualTasksPtr, (uint)(sizeof(FlaCudaSubframeTask) * task.nResidualTasks), task.stream);
    task.frameSize = blocksize;
}
// One-time lazy initialization: creates the CUDA context, loads the embedded
// flacuda.cubin module, opens/initializes the output stream with the FLAC
// header, and allocates the GPU task objects and the device window buffer.
// Safe to call repeatedly; does nothing once 'inited' is set.
public unsafe void InitTasks()
{
    bool doMidside = channels == 2 && eparams.do_midside;
    // With mid/side enabled, left/right/mid/side candidates are all evaluated,
    // doubling the channel count the GPU works on.
    int channelCount = doMidside ? 2 * channels : channels;
    if (!inited)
    {
        cuda = new CUDA(true, InitializationFlags.None);
        cuda.CreateContext(0, CUCtxFlags.SchedAuto);
        // The cubin is carried as an embedded resource next to this type.
        using (Stream cubin = GetType().Assembly.GetManifestResourceStream(GetType(), "flacuda.cubin"))
        using (StreamReader sr = new StreamReader(cubin))
            cuda.LoadModule(new ASCIIEncoding().GetBytes(sr.ReadToEnd()));
        //cuda.LoadModule(System.IO.Path.Combine(Environment.CurrentDirectory, "flacuda.cubin"));
        if (_IO == null)
            _IO = new FileStream(_path, FileMode.Create, FileAccess.Write, FileShare.Read);
        int header_size = flake_encode_init();
        _IO.Write(header, 0, header_size);
        // Remember where audio frames start so seek-table offsets can be
        // made relative to the first frame (see write_result).
        if (_IO.CanSeek)
            first_frame_offset = _IO.Position;
        // Two tasks for double-buffering GPU work, plus optional CPU worker tasks.
        task1 = new FlaCudaTask(cuda, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify);
        task2 = new FlaCudaTask(cuda, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify);
        if (_settings.CPUThreads > 0)
        {
            cpu_tasks = new FlaCudaTask[_settings.CPUThreads];
            for (int i = 0; i < cpu_tasks.Length; i++)
                cpu_tasks[i] = new FlaCudaTask(cuda, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify);
        }
        // Device buffer for all LPC analysis windows.
        cudaWindow = cuda.Allocate((uint)sizeof(float) * FlaCudaWriter.MAX_BLOCKSIZE * 2 * lpc.MAX_LPC_WINDOWS);
        inited = true;
    }
}
// Pipelines encoding: kicks off GPU work for the next batch (task1) while the
// previous batch (task2) is finished, processed (on CPU workers if enabled)
// and written out; then swaps task1/task2 for the next call.
public unsafe void do_output_frames(int nFrames)
{
    send_to_GPU(task1, nFrames, eparams.block_size);
    // Make sure the previous batch's GPU work is complete before consuming it.
    if (task2.frameCount > 0)
        cuda.SynchronizeStream(task2.stream);
    run_GPU_task(task1);
    if (task2.frameCount > 0)
    {
        if (cpu_tasks != null)
        {
            // Rotate task2 into the CPU worker ring: take the oldest finished
            // worker task out, hand task2 to a worker, and write the finished one.
            wait_for_cpu_task();
            FlaCudaTask ttmp = cpu_tasks[oldest_cpu_task];
            cpu_tasks[oldest_cpu_task] = task2;
            task2 = ttmp;
            start_cpu_task();
            oldest_cpu_task = (oldest_cpu_task + 1) % cpu_tasks.Length;
            if (task2.frameCount > 0)
                write_result(task2);
        }
        else
        {
            process_result(task2);
            write_result(task2);
        }
    }
    int bs = eparams.block_size * nFrames;
    samplesInBuffer -= bs;
    // Move any leftover (not yet encoded) samples to the front of the buffer
    // that will become task1 after the swap below.
    if (samplesInBuffer > 0)
        AudioSamples.MemCpy(((byte*)task2.samplesBytesPtr), ((byte*)task1.samplesBytesPtr) + bs * _pcm.BlockAlign, samplesInBuffer * _pcm.BlockAlign);
    FlaCudaTask tmp = task1;
    task1 = task2;
    task2 = tmp;
    task1.frameCount = 0;
}
// Converts a task's GPU results into the final FLAC bitstream: encodes each
// frame, optionally decodes it back to verify losslessness against the
// original samples, and records provisional seek-table entries (offsets are
// relative to this task's output; write_result rebases them).
unsafe void process_result(FlaCudaTask task)
{
    bool doMidside = channels == 2 && eparams.do_midside;
    int channelCount = doMidside ? 2 * channels : channels;
    long iSample = 0;
    long iByte = 0;
    task.frame.writer.Reset();
    task.frame.writer_offset = 0;
    for (int iFrame = 0; iFrame < task.frameCount; iFrame++)
    {
        //if (0 != eparams.variable_block_size && 0 == (task.blocksize & 7) && task.blocksize >= 128)
        //    fs = encode_frame_vbs();
        //else
        // In variable-block-size mode frames are numbered by starting sample.
        int fn = task.frameNumber + (eparams.variable_block_size > 0 ? (int)iSample : iFrame);
        int fs = encode_frame(doMidside, channelCount, iFrame, task, fn);
        if (task.verify != null)
        {
            // Decode the freshly written frame and compare sample-by-sample
            // against the untouched interleaved input.
            int decoded = task.verify.DecodeFrame(task.frame.writer.Buffer, task.frame.writer_offset, fs);
            if (decoded != fs || task.verify.Remaining != task.frameSize)
                throw new Exception("validation failed! frame size mismatch");
            fixed (int* r = task.verify.Samples)
            {
                for (int ch = 0; ch < channels; ch++)
                {
                    short* res = ((short*)task.samplesBytesPtr) + iFrame * channels * task.frameSize + ch;
                    int* smp = r + ch * Flake.MAX_BLOCKSIZE;
                    for (int i = task.frameSize; i > 0; i--)
                    {
                        //if (AudioSamples.MemCmp(s + iFrame * task.frameSize + ch * FlaCudaWriter.MAX_BLOCKSIZE, r + ch * Flake.MAX_BLOCKSIZE, task.frameSize))
                        if (*res != *(smp++))
                            throw new Exception(string.Format("validation failed! iFrame={0}, ch={1}", iFrame, ch));
                        res += channels;
                    }
                }
            }
        }
        if (seek_table != null && _IO.CanSeek)
        {
            for (int sp = 0; sp < seek_table.Length; sp++)
            {
                // Skip entries already filled; entries are assumed sorted by
                // sample number, so we can stop past this frame's range.
                if (seek_table[sp].framesize != 0)
                    continue;
                if (seek_table[sp].number >= task.framePos + iSample + task.frameSize)
                    break;
                if (seek_table[sp].number >= task.framePos + iSample)
                {
                    seek_table[sp].number = task.framePos + iSample;
                    // Offset within this task's output; rebased in write_result.
                    seek_table[sp].offset = iByte;
                    seek_table[sp].framesize = task.frameSize;
                }
            }
        }
        //Array.Copy(task.frame.buffer, 0, task.outputBuffer, iByte, fs);
        iSample += task.frameSize;
        iByte += fs;
    }
    task.outputSize = (int)iByte;
    // Sanity check: accumulated frame sizes must equal total bytes written.
    if (iByte != task.frame.writer.Length)
        throw new Exception("invalid length");
}
// Flushes an encoded task's output to the stream, rebasing the seek-table
// entries that fall inside the flushed sample range from task-relative byte
// offsets to offsets relative to the first audio frame.
unsafe void write_result(FlaCudaTask task)
{
    int sampleCount = task.frameSize * task.frameCount;
    if (seek_table != null && _IO.CanSeek)
    {
        // Bytes already written before this task's output begins.
        long rebase = _IO.Position - first_frame_offset;
        long rangeEnd = task.framePos + sampleCount;
        for (int sp = 0; sp < seek_table.Length; sp++)
        {
            if (seek_table[sp].number >= rangeEnd)
                break;
            if (seek_table[sp].number < task.framePos)
                continue;
            seek_table[sp].offset += rebase;
        }
    }
    _IO.Write(task.outputBuffer, 0, task.outputSize);
    _position += sampleCount;
    _totalSize += task.outputSize;
}
// Prepares a task for GPU processing: records the batch geometry and frame
// numbering, advances the global frame/sample counters, and starts the
// asynchronous upload of the interleaved 16-bit input samples to the device.
// (Removed the unused doMidside/channelsCount locals — dead code.)
unsafe void send_to_GPU(FlaCudaTask task, int nFrames, int blocksize)
{
    // A blocksize change invalidates the cached subframe-task list;
    // run_GPU_task rebuilds it when nResidualTasks == 0.
    if (blocksize != task.frameSize)
        task.nResidualTasks = 0;
    task.frameCount = nFrames;
    task.frameSize = blocksize;
    // Variable-block-size mode numbers frames by starting sample position,
    // fixed mode by frame index.
    task.frameNumber = eparams.variable_block_size > 0 ? frame_pos : frame_count;
    task.framePos = frame_pos;
    frame_count += nFrames;
    frame_pos += nFrames * blocksize;
    cuda.CopyHostToDeviceAsync(task.cudaSamplesBytes, task.samplesBytesPtr, (uint)(sizeof(short) * channels * blocksize * nFrames), task.stream);
}
// Runs one batch on the GPU: (re)computes the LPC analysis windows when the
// frame size changed, rebuilds the subframe task list if needed, then launches
// the residual-estimation kernel pipeline.
unsafe void run_GPU_task(FlaCudaTask task)
{
    bool doMidside = channels == 2 && eparams.do_midside;
    int channelsCount = doMidside ? 2 * channels : channels;
    // Windows depend on frame size; recompute and re-upload only when it changes.
    // Frames of <= 4 samples skip LPC entirely (see estimate_residual).
    if (task.frameSize != _windowsize && task.frameSize > 4)
        fixed (float* window = windowBuffer)
        {
            _windowsize = task.frameSize;
            _windowcount = 0;
            // Each call appends its window only if that function is enabled;
            // _windowcount is advanced by calculate_window (defined elsewhere —
            // presumably; TODO confirm).
            calculate_window(window, lpc.window_welch, WindowFunction.Welch);
            calculate_window(window, lpc.window_flattop, WindowFunction.Flattop);
            calculate_window(window, lpc.window_tukey, WindowFunction.Tukey);
            calculate_window(window, lpc.window_hann, WindowFunction.Hann);
            calculate_window(window, lpc.window_bartlett, WindowFunction.Bartlett);
            if (_windowcount == 0)
                throw new Exception("invalid windowfunction");
            // Synchronous copy: the window buffer must be on-device before
            // any stream launches below use it.
            cuda.CopyHostToDevice<float>(cudaWindow, windowBuffer);
        }
    if (task.nResidualTasks == 0)
        initializeSubframeTasks(task.frameSize, channelsCount, max_frames, task);
    estimate_residual(task, channelsCount);
}
// Encodes one frame of a batch into the task's bit writer: wires each
// subframe to its sample/residual buffers, applies the GPU's method choices,
// finalizes residuals on the CPU where required, and writes header,
// subframes and footer. Returns the encoded frame size in bytes.
unsafe int encode_frame(bool doMidside, int channelCount, int iFrame, FlaCudaTask task, int current_frame_number)
{
    task.frame.InitSize(task.frameSize, eparams.variable_block_size != 0);
    // frame_number temporarily holds the batch-local frame index; callees
    // (select_best_methods/unpack_samples) use it for buffer offsets. The real
    // stream frame number is restored below before the header is written.
    task.frame.frame_number = iFrame;
    task.frame.ch_mode = ChannelMode.NotStereo;
    fixed (int* smp = task.samplesBuffer)
    {
        for (int ch = 0; ch < channelCount; ch++)
            task.frame.subframes[ch].Init(
                smp + ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * task.frameSize,
                ((int*)task.residualBufferPtr) + ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * task.frameSize,
                // ch == 3 appears to be the side channel and needs one extra
                // bit — TODO confirm against decorrelation kernel.
                _pcm.BitsPerSample + (doMidside && ch == 3 ? 1 : 0), 0);
        select_best_methods(task.frame, channelCount, iFrame, task);
        //unpack_samples(task);
        encode_residual(task);
        //task.frame.writer.Reset();
        task.frame.frame_number = current_frame_number;
        task.frame.writer_offset = task.frame.writer.Length;
        output_frame_header(task.frame);
        output_subframes(task.frame);
        output_frame_footer(task.frame);
        if (task.frame.writer.Length - task.frame.writer_offset >= max_frame_size)
            throw new Exception("buffer overflow");
        return task.frame.writer.Length - task.frame.writer_offset;
    }
}
/// <summary>
/// Copy channel-interleaved input samples into separate subframes,
/// applying each subframe's wasted-bits shift and the stereo decorrelation
/// implied by the frame's channel mode.
/// </summary>
/// <param name="task">Task whose samplesBytesPtr holds the interleaved 16-bit input.</param>
/// <param name="count">Number of samples per channel to unpack (may be fewer than the frame size).</param>
unsafe void unpack_samples(FlaCudaTask task, int count)
{
    // frame_number holds the batch-local frame index at this point
    // (set by encode_frame before it calls into encode_residual).
    int iFrame = task.frame.frame_number;
    short* src = ((short*)task.samplesBytesPtr) + iFrame * channels * task.frameSize;
    switch (task.frame.ch_mode)
    {
        case ChannelMode.NotStereo:
            for (int ch = 0; ch < channels; ch++)
            {
                int* s = task.frame.subframes[ch].samples;
                int wbits = (int)task.frame.subframes[ch].wbits;
                for (int i = 0; i < count; i++)
                    // BUG FIX: this used 'src[...] >>= wbits' (compound
                    // assignment), which also wrote the shifted value back into
                    // the shared input buffer — corrupting it for the verify
                    // pass in process_result and for the leftover-sample copy.
                    // A plain shift leaves the source untouched.
                    s[i] = src[i * channels + ch] >> wbits;
            }
            break;
        case ChannelMode.LeftRight:
            {
                int* left = task.frame.subframes[0].samples;
                int* right = task.frame.subframes[1].samples;
                int lwbits = (int)task.frame.subframes[0].wbits;
                int rwbits = (int)task.frame.subframes[1].wbits;
                for (int i = 0; i < count; i++)
                {
                    int l = *(src++);
                    int r = *(src++);
                    left[i] = l >> lwbits;
                    right[i] = r >> rwbits;
                }
                break;
            }
        case ChannelMode.LeftSide:
            {
                int* left = task.frame.subframes[0].samples;
                int* right = task.frame.subframes[1].samples;
                int lwbits = (int)task.frame.subframes[0].wbits;
                int rwbits = (int)task.frame.subframes[1].wbits;
                for (int i = 0; i < count; i++)
                {
                    int l = *(src++);
                    int r = *(src++);
                    left[i] = l >> lwbits;
                    // Side channel = left - right.
                    right[i] = (l - r) >> rwbits;
                }
                break;
            }
        case ChannelMode.RightSide:
            {
                int* left = task.frame.subframes[0].samples;
                int* right = task.frame.subframes[1].samples;
                int lwbits = (int)task.frame.subframes[0].wbits;
                int rwbits = (int)task.frame.subframes[1].wbits;
                for (int i = 0; i < count; i++)
                {
                    int l = *(src++);
                    int r = *(src++);
                    left[i] = (l - r) >> lwbits;
                    right[i] = r >> rwbits;
                }
                break;
            }
        case ChannelMode.MidSide:
            {
                int* left = task.frame.subframes[0].samples;
                int* right = task.frame.subframes[1].samples;
                int lwbits = (int)task.frame.subframes[0].wbits;
                int rwbits = (int)task.frame.subframes[1].wbits;
                for (int i = 0; i < count; i++)
                {
                    int l = *(src++);
                    int r = *(src++);
                    // Mid = (left + right) / 2 (extra shift by 1), side = left - right.
                    left[i] = (l + r) >> (1 + lwbits);
                    right[i] = (l - r) >> rwbits;
                }
                break;
            }
    }
}
// Configures and launches the full GPU kernel pipeline for one batch:
// channel decorrelation -> (wasted bits) -> autocorrelation -> LPC solve
// (and/or lattice) -> coefficient quantization -> residual estimation ->
// best-method selection, plus (GPU-only mode) residual encoding and Rice
// partitioning; finally queues async copies of the results back to the host.
// All launches go on task.stream; nothing here blocks.
unsafe void estimate_residual(FlaCudaTask task, int channelsCount)
{
    // Tiny frames are encoded verbatim/fixed on the CPU; no GPU work needed.
    if (task.frameSize <= 4)
        return;
    //int autocorPartSize = (2 * 256 - eparams.max_prediction_order) & ~15;
    // Fixed partition size for the autocorrelation kernel (480 samples).
    int autocorPartSize = 32 * 15;
    int autocorPartCount = (task.frameSize + autocorPartSize - 1) / autocorPartSize;
    if (autocorPartCount > maxAutocorParts)
        throw new Exception("internal error");
    // threads_y must divide nResidualTasksPerChannel so each block handles a
    // whole number of tasks; the padding in initializeSubframeTasks makes the
    // count a multiple of 8 when >= 4.
    int threads_y;
    if (task.nResidualTasksPerChannel < 4)
        threads_y = 8;
    else if (task.nResidualTasksPerChannel >= 4 && task.nResidualTasksPerChannel <= 8)
        threads_y = task.nResidualTasksPerChannel;
    else if ((task.nResidualTasksPerChannel % 8) == 0)
        threads_y = 8;
    else if ((task.nResidualTasksPerChannel % 7) == 0)
        threads_y = 7;
    else if ((task.nResidualTasksPerChannel % 6) == 0)
        threads_y = 6;
    else if ((task.nResidualTasksPerChannel % 5) == 0)
        threads_y = 5;
    else if ((task.nResidualTasksPerChannel % 4) == 0)
        threads_y = 4;
    else
        throw new Exception("invalid LPC order");
    int residualPartSize = 32 * threads_y;
    int residualPartCount = (task.frameSize + residualPartSize - 1) / residualPartSize;
    if (residualPartCount > maxResidualParts)
        throw new Exception("invalid combination of block size and LPC order");
    // Clamp the Rice partition order so each partition holds at least 16 samples.
    int max_porder = get_max_p_order(eparams.max_partition_order, task.frameSize, eparams.max_prediction_order);
    int calcPartitionPartSize = task.frameSize >> max_porder;
    while (calcPartitionPartSize < 16 && max_porder > 0)
    {
        calcPartitionPartSize <<= 1;
        max_porder--;
    }
    int calcPartitionPartCount = (calcPartitionPartSize >= 128) ? 1 : (256 / calcPartitionPartSize);
    // Pick kernel variants by geometry: stereo decorrelation when all four
    // stereo candidates are evaluated, partition kernel by partition size,
    // residual estimator by max prediction order.
    CUfunction cudaChannelDecorr = channels == 2 ? (channelsCount == 4 ? task.cudaStereoDecorr : task.cudaChannelDecorr2) : task.cudaChannelDecorr;
    CUfunction cudaCalcPartition = calcPartitionPartSize >= 128 ? task.cudaCalcLargePartition : calcPartitionPartSize == 16 && task.frameSize >= 256 ? task.cudaCalcPartition16 : task.cudaCalcPartition;
    CUfunction cudaEstimateResidual = task.nResidualTasksPerChannel < 4 ? task.cudaEstimateResidual1 : eparams.max_prediction_order <= 8 ? task.cudaEstimateResidual8 : eparams.max_prediction_order <= 12 ? task.cudaEstimateResidual12 : task.cudaEstimateResidual;

    // Kernel parameter blocks are laid out manually as packed 32-bit words
    // (driver-API style); offsets and sizes must match the cubin signatures.
    cuda.SetParameter(cudaChannelDecorr, 0 * sizeof(uint), (uint)task.cudaSamples.Pointer);
    cuda.SetParameter(cudaChannelDecorr, 1 * sizeof(uint), (uint)task.cudaSamplesBytes.Pointer);
    cuda.SetParameter(cudaChannelDecorr, 2 * sizeof(uint), (uint)MAX_BLOCKSIZE);
    cuda.SetParameterSize(cudaChannelDecorr, sizeof(uint) * 3U);
    cuda.SetFunctionBlockShape(cudaChannelDecorr, 256, 1, 1);

    cuda.SetParameter(task.cudaFindWastedBits, 0 * sizeof(uint), (uint)task.cudaResidualTasks.Pointer);
    cuda.SetParameter(task.cudaFindWastedBits, 1 * sizeof(uint), (uint)task.cudaSamples.Pointer);
    cuda.SetParameter(task.cudaFindWastedBits, 2 * sizeof(uint), (uint)task.nResidualTasksPerChannel);
    cuda.SetParameterSize(task.cudaFindWastedBits, sizeof(uint) * 3U);
    cuda.SetFunctionBlockShape(task.cudaFindWastedBits, 256, 1, 1);

    cuda.SetParameter(task.cudaComputeAutocor, 0, (uint)task.cudaAutocorOutput.Pointer);
    cuda.SetParameter(task.cudaComputeAutocor, 1 * sizeof(uint), (uint)task.cudaSamples.Pointer);
    cuda.SetParameter(task.cudaComputeAutocor, 2 * sizeof(uint), (uint)cudaWindow.Pointer);
    cuda.SetParameter(task.cudaComputeAutocor, 3 * sizeof(uint), (uint)task.cudaResidualTasks.Pointer);
    cuda.SetParameter(task.cudaComputeAutocor, 4 * sizeof(uint), (uint)eparams.max_prediction_order);
    cuda.SetParameter(task.cudaComputeAutocor, 5 * sizeof(uint), (uint)task.nAutocorTasksPerChannel - 1);
    cuda.SetParameter(task.cudaComputeAutocor, 6 * sizeof(uint), (uint)task.nResidualTasksPerChannel);
    cuda.SetParameterSize(task.cudaComputeAutocor, 7U * sizeof(uint));
    cuda.SetFunctionBlockShape(task.cudaComputeAutocor, 32, 8, 1);

    cuda.SetParameter(task.cudaComputeLPC, 0, (uint)task.cudaResidualTasks.Pointer);
    cuda.SetParameter(task.cudaComputeLPC, 1 * sizeof(uint), (uint)task.nResidualTasksPerChannel);
    cuda.SetParameter(task.cudaComputeLPC, 2 * sizeof(uint), (uint)task.cudaAutocorOutput.Pointer);
    cuda.SetParameter(task.cudaComputeLPC, 3 * sizeof(uint), (uint)eparams.max_prediction_order);
    cuda.SetParameter(task.cudaComputeLPC, 4 * sizeof(uint), (uint)task.cudaLPCData.Pointer);
    cuda.SetParameter(task.cudaComputeLPC, 5 * sizeof(uint), (uint)_windowcount);
    cuda.SetParameter(task.cudaComputeLPC, 6 * sizeof(uint), (uint)autocorPartCount);
    cuda.SetParameterSize(task.cudaComputeLPC, 7U * sizeof(uint));
    cuda.SetFunctionBlockShape(task.cudaComputeLPC, 32, 1, 1);

    cuda.SetParameter(task.cudaComputeLPCLattice, 0, (uint)task.cudaResidualTasks.Pointer);
    cuda.SetParameter(task.cudaComputeLPCLattice, 1 * sizeof(uint), (uint)task.nResidualTasksPerChannel);
    cuda.SetParameter(task.cudaComputeLPCLattice, 2 * sizeof(uint), (uint)task.cudaSamples.Pointer);
    cuda.SetParameter(task.cudaComputeLPCLattice, 3 * sizeof(uint), (uint)_windowcount);
    cuda.SetParameter(task.cudaComputeLPCLattice, 4 * sizeof(uint), (uint)eparams.max_prediction_order);
    cuda.SetParameter(task.cudaComputeLPCLattice, 5 * sizeof(uint), (uint)task.cudaLPCData.Pointer);
    cuda.SetParameterSize(task.cudaComputeLPCLattice, 6U * sizeof(uint));
    cuda.SetFunctionBlockShape(task.cudaComputeLPCLattice, 256, 1, 1);

    cuda.SetParameter(task.cudaQuantizeLPC, 0, (uint)task.cudaResidualTasks.Pointer);
    cuda.SetParameter(task.cudaQuantizeLPC, 1 * sizeof(uint), (uint)task.nResidualTasksPerChannel);
    cuda.SetParameter(task.cudaQuantizeLPC, 2 * sizeof(uint), (uint)task.nTasksPerWindow);
    cuda.SetParameter(task.cudaQuantizeLPC, 3 * sizeof(uint), (uint)task.cudaLPCData.Pointer);
    cuda.SetParameter(task.cudaQuantizeLPC, 4 * sizeof(uint), (uint)eparams.max_prediction_order);
    cuda.SetParameter(task.cudaQuantizeLPC, 5 * sizeof(uint), (uint)eparams.lpc_min_precision_search);
    cuda.SetParameter(task.cudaQuantizeLPC, 6 * sizeof(uint), (uint)(eparams.lpc_max_precision_search - eparams.lpc_min_precision_search));
    cuda.SetParameterSize(task.cudaQuantizeLPC, 7U * sizeof(uint));
    cuda.SetFunctionBlockShape(task.cudaQuantizeLPC, 32, 4, 1);

    cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 0, (uint)task.cudaResidualOutput.Pointer);
    cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 1, (uint)task.cudaSamples.Pointer);
    cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 2, (uint)task.cudaResidualTasks.Pointer);
    cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 3, (uint)eparams.max_prediction_order);
    cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 4, (uint)residualPartSize);
    cuda.SetParameterSize(cudaEstimateResidual, 5U * sizeof(uint));
    cuda.SetFunctionBlockShape(cudaEstimateResidual, 32, threads_y, 1);

    cuda.SetParameter(task.cudaChooseBestMethod, 0 * sizeof(uint), (uint)task.cudaResidualTasks.Pointer);
    cuda.SetParameter(task.cudaChooseBestMethod, 1 * sizeof(uint), (uint)task.cudaResidualOutput.Pointer);
    cuda.SetParameter(task.cudaChooseBestMethod, 2 * sizeof(uint), (uint)residualPartSize);
    cuda.SetParameter(task.cudaChooseBestMethod, 3 * sizeof(uint), (uint)residualPartCount);
    cuda.SetParameter(task.cudaChooseBestMethod, 4 * sizeof(uint), (uint)task.nResidualTasksPerChannel);
    cuda.SetParameterSize(task.cudaChooseBestMethod, 5U * sizeof(uint));
    cuda.SetFunctionBlockShape(task.cudaChooseBestMethod, 32, 8, 1);

    cuda.SetParameter(task.cudaCopyBestMethod, 0, (uint)task.cudaBestResidualTasks.Pointer);
    cuda.SetParameter(task.cudaCopyBestMethod, 1 * sizeof(uint), (uint)task.cudaResidualTasks.Pointer);
    cuda.SetParameter(task.cudaCopyBestMethod, 2 * sizeof(uint), (uint)task.nResidualTasksPerChannel);
    cuda.SetParameterSize(task.cudaCopyBestMethod, sizeof(uint) * 3U);
    cuda.SetFunctionBlockShape(task.cudaCopyBestMethod, 64, 1, 1);

    cuda.SetParameter(task.cudaCopyBestMethodStereo, 0, (uint)task.cudaBestResidualTasks.Pointer);
    cuda.SetParameter(task.cudaCopyBestMethodStereo, 1 * sizeof(uint), (uint)task.cudaResidualTasks.Pointer);
    cuda.SetParameter(task.cudaCopyBestMethodStereo, 2 * sizeof(uint), (uint)task.nResidualTasksPerChannel);
    cuda.SetParameterSize(task.cudaCopyBestMethodStereo, sizeof(uint) * 3U);
    cuda.SetFunctionBlockShape(task.cudaCopyBestMethodStereo, 64, 1, 1);

    cuda.SetParameter(task.cudaEncodeResidual, 0, (uint)task.cudaResidual.Pointer);
    cuda.SetParameter(task.cudaEncodeResidual, 1 * sizeof(uint), (uint)task.cudaSamples.Pointer);
    cuda.SetParameter(task.cudaEncodeResidual, 2 * sizeof(uint), (uint)task.cudaBestResidualTasks.Pointer);
    cuda.SetParameterSize(task.cudaEncodeResidual, sizeof(uint) * 3U);
    cuda.SetFunctionBlockShape(task.cudaEncodeResidual, residualPartSize, 1, 1);

    cuda.SetParameter(cudaCalcPartition, 0, (uint)task.cudaPartitions.Pointer);
    cuda.SetParameter(cudaCalcPartition, 1 * sizeof(uint), (uint)task.cudaResidual.Pointer);
    cuda.SetParameter(cudaCalcPartition, 2 * sizeof(uint), (uint)task.cudaSamples.Pointer);
    cuda.SetParameter(cudaCalcPartition, 3 * sizeof(uint), (uint)task.cudaBestResidualTasks.Pointer);
    cuda.SetParameter(cudaCalcPartition, 4 * sizeof(uint), (uint)max_porder);
    cuda.SetParameter(cudaCalcPartition, 5 * sizeof(uint), (uint)calcPartitionPartSize);
    cuda.SetParameter(cudaCalcPartition, 6 * sizeof(uint), (uint)calcPartitionPartCount);
    cuda.SetParameterSize(cudaCalcPartition, 7U * sizeof(uint));
    cuda.SetFunctionBlockShape(cudaCalcPartition, 16, 16, 1);

    cuda.SetParameter(task.cudaSumPartition, 0, (uint)task.cudaPartitions.Pointer);
    cuda.SetParameter(task.cudaSumPartition, 1 * sizeof(uint), (uint)max_porder);
    cuda.SetParameterSize(task.cudaSumPartition, 2U * sizeof(uint));
    cuda.SetFunctionBlockShape(task.cudaSumPartition, Math.Max(32, 1 << (max_porder - 1)), 1, 1);

    cuda.SetParameter(task.cudaFindRiceParameter, 0, (uint)task.cudaRiceParams.Pointer);
    cuda.SetParameter(task.cudaFindRiceParameter, 1 * sizeof(uint), (uint)task.cudaPartitions.Pointer);
    cuda.SetParameter(task.cudaFindRiceParameter, 2 * sizeof(uint), (uint)max_porder);
    cuda.SetParameterSize(task.cudaFindRiceParameter, 3U * sizeof(uint));
    cuda.SetFunctionBlockShape(task.cudaFindRiceParameter, 32, 8, 1);

    cuda.SetParameter(task.cudaFindPartitionOrder, 0, (uint)task.cudaBestRiceParams.Pointer);
    cuda.SetParameter(task.cudaFindPartitionOrder, 1 * sizeof(uint), (uint)task.cudaBestResidualTasks.Pointer);
    cuda.SetParameter(task.cudaFindPartitionOrder, 2 * sizeof(uint), (uint)task.cudaRiceParams.Pointer);
    cuda.SetParameter(task.cudaFindPartitionOrder, 3 * sizeof(uint), (uint)max_porder);
    cuda.SetParameterSize(task.cudaFindPartitionOrder, 4U * sizeof(uint));
    cuda.SetFunctionBlockShape(task.cudaFindPartitionOrder, 256, 1, 1);

    // issue work to the GPU
    cuda.LaunchAsync(cudaChannelDecorr, (task.frameCount * task.frameSize + 255) / 256, channels == 2 ? 1 : channels, task.stream);
    if (eparams.do_wasted)
        cuda.LaunchAsync(task.cudaFindWastedBits, channelsCount * task.frameCount, 1, task.stream);
    // The lattice LPC path handles small frames / low orders; the classic
    // autocorrelation path still runs when more than one window is in use.
    bool lattice = do_lattice && task.frameSize <= 512 && eparams.max_prediction_order <= 12;
    if (!lattice || _windowcount > 1)
    {
        cuda.LaunchAsync(task.cudaComputeAutocor, autocorPartCount, task.nAutocorTasksPerChannel * channelsCount * task.frameCount, task.stream);
        cuda.LaunchAsync(task.cudaComputeLPC, task.nAutocorTasksPerChannel, channelsCount * task.frameCount, task.stream);
    }
    if (lattice)
        cuda.LaunchAsync(task.cudaComputeLPCLattice, 1, channelsCount * task.frameCount, task.stream);
    cuda.LaunchAsync(task.cudaQuantizeLPC, task.nAutocorTasksPerChannel, channelsCount * task.frameCount, task.stream);
    cuda.LaunchAsync(cudaEstimateResidual, residualPartCount, task.nResidualTasksPerChannel * channelsCount * task.frameCount / (task.nResidualTasksPerChannel < 4 ? 1 : threads_y), task.stream);
    cuda.LaunchAsync(task.cudaChooseBestMethod, 1, channelsCount * task.frameCount, task.stream);
    if (channels == 2 && channelsCount == 4)
        cuda.LaunchAsync(task.cudaCopyBestMethodStereo, 1, task.frameCount, task.stream);
    else
        cuda.LaunchAsync(task.cudaCopyBestMethod, 1, channels * task.frameCount, task.stream);
    if (_settings.GPUOnly)
    {
        // GPU-only mode also encodes residuals and computes Rice partitions on
        // the device, so the CPU only has to assemble the bitstream.
        int bsz = calcPartitionPartCount * calcPartitionPartSize;
        if (cudaCalcPartition.Pointer == task.cudaCalcLargePartition.Pointer)
            cuda.LaunchAsync(task.cudaEncodeResidual, residualPartCount, channels * task.frameCount, task.stream);
        cuda.LaunchAsync(cudaCalcPartition, (task.frameSize + bsz - 1) / bsz, channels * task.frameCount, task.stream);
        if (max_porder > 0)
            cuda.LaunchAsync(task.cudaSumPartition, Flake.MAX_RICE_PARAM + 1, channels * task.frameCount, task.stream);
        cuda.LaunchAsync(task.cudaFindRiceParameter, ((2 << max_porder) + 31) / 32, channels * task.frameCount, task.stream);
        //if (max_porder > 0)
        // need to run even if max_porder==0 just to calculate the final frame size
        cuda.LaunchAsync(task.cudaFindPartitionOrder, 1, channels * task.frameCount, task.stream);
        cuda.CopyDeviceToHostAsync(task.cudaResidual, task.residualBufferPtr, (uint)(sizeof(int) * MAX_BLOCKSIZE * channels), task.stream);
        cuda.CopyDeviceToHostAsync(task.cudaBestRiceParams, task.bestRiceParamsPtr, (uint)(sizeof(int) * (1 << max_porder) * channels * task.frameCount), task.stream);
        task.max_porder = max_porder;
    }
    cuda.CopyDeviceToHostAsync(task.cudaBestResidualTasks, task.bestResidualTasksPtr, (uint)(sizeof(FlaCudaSubframeTask) * channels * task.frameCount), task.stream);
}
// Applies the GPU's per-channel winning tasks to a FlacFrame: determines the
// stereo decorrelation mode from which candidate channels won, then copies
// each winner's parameters (type, order, quantized coefficients, wasted bits,
// Rice data) into the frame's subframes, defaulting to Verbatim.
unsafe void select_best_methods(FlacFrame frame, int channelsCount, int iFrame, FlaCudaTask task)
{
    if (channelsCount == 4 && channels == 2)
    {
        // Candidate channel indices appear to be 0=left, 1=right, 2=mid,
        // 3=side (matches copyBestMethodStereo output — TODO confirm).
        if (task.BestResidualTasks[iFrame * 2].channel == 0 && task.BestResidualTasks[iFrame * 2 + 1].channel == 1)
            frame.ch_mode = ChannelMode.LeftRight;
        else if (task.BestResidualTasks[iFrame * 2].channel == 0 && task.BestResidualTasks[iFrame * 2 + 1].channel == 3)
            frame.ch_mode = ChannelMode.LeftSide;
        else if (task.BestResidualTasks[iFrame * 2].channel == 3 && task.BestResidualTasks[iFrame * 2 + 1].channel == 1)
            frame.ch_mode = ChannelMode.RightSide;
        else if (task.BestResidualTasks[iFrame * 2].channel == 2 && task.BestResidualTasks[iFrame * 2 + 1].channel == 3)
            frame.ch_mode = ChannelMode.MidSide;
        else
            throw new Exception("internal error: invalid stereo mode");
        frame.SwapSubframes(0, task.BestResidualTasks[iFrame * 2].channel);
        frame.SwapSubframes(1, task.BestResidualTasks[iFrame * 2 + 1].channel);
    }
    else
        frame.ch_mode = channels != 2 ? ChannelMode.NotStereo : ChannelMode.LeftRight;
    for (int ch = 0; ch < channels; ch++)
    {
        int index = ch + iFrame * channels;
        frame.subframes[ch].best.residual = ((int*)task.residualBufferPtr) + task.BestResidualTasks[index].residualOffs;
        // Default to Verbatim; replaced below when the GPU found something smaller.
        frame.subframes[ch].best.type = SubframeType.Verbatim;
        frame.subframes[ch].best.size = (uint)(frame.subframes[ch].obits * frame.blocksize);
        frame.subframes[ch].wbits = 0;
        if (task.BestResidualTasks[index].size < 0)
            throw new Exception("internal error");
        if (frame.blocksize > Math.Max(4, eparams.max_prediction_order) && frame.subframes[ch].best.size > task.BestResidualTasks[index].size)
        {
            frame.subframes[ch].best.type = (SubframeType)task.BestResidualTasks[index].type;
            frame.subframes[ch].best.size = (uint)task.BestResidualTasks[index].size;
            frame.subframes[ch].best.order = task.BestResidualTasks[index].residualOrder;
            frame.subframes[ch].best.cbits = task.BestResidualTasks[index].cbits;
            frame.subframes[ch].best.shift = task.BestResidualTasks[index].shift;
            frame.subframes[ch].obits -= task.BestResidualTasks[index].wbits;
            frame.subframes[ch].wbits = task.BestResidualTasks[index].wbits;
            frame.subframes[ch].best.rc.porder = task.BestResidualTasks[index].porder;
            // GPU stores coefficients highest lag first; reverse into the
            // order the CPU-side encoder expects.
            for (int i = 0; i < task.BestResidualTasks[index].residualOrder; i++)
                frame.subframes[ch].best.coefs[i] = task.BestResidualTasks[index].coefs[task.BestResidualTasks[index].residualOrder - 1 - i];
            if (_settings.GPUOnly && (frame.subframes[ch].best.type == SubframeType.Fixed || frame.subframes[ch].best.type == SubframeType.LPC))
            {
                // In GPU-only mode the Rice parameters were computed on the
                // device; copy this subframe's slice into the frame.
                int* riceParams = ((int*)task.bestRiceParamsPtr) + (index << task.max_porder);
                fixed (int* dstParams = frame.subframes[ch].best.rc.rparams)
                    AudioSamples.MemCpy(dstParams, riceParams, (1 << frame.subframes[ch].best.rc.porder));
                //for (int i = 0; i < (1 << frame.subframes[ch].best.rc.porder); i++)
                //    frame.subframes[ch].best.rc.rparams[i] = riceParams[i];
            }
        }
    }
}
// Finalizes residuals for the frame's chosen subframe methods on the CPU.
// Input samples are unpacked lazily: only the first up-to-32 samples (warmup)
// at first, the full frame only when some subframe actually needs it.
// If a subframe ends up larger than Verbatim, it is demoted to Verbatim
// (in release builds; DEBUG builds throw instead).
unsafe void encode_residual(FlaCudaTask task)
{
    bool unpacked = false;
    // Warmup samples are always needed to emit subframe headers.
    unpack_samples(task, Math.Min(32, task.frameSize));
    for (int ch = 0; ch < channels; ch++)
    {
        switch (task.frame.subframes[ch].best.type)
        {
            case SubframeType.Constant:
                break;
            case SubframeType.Verbatim:
                if (!unpacked)
                    unpack_samples(task, task.frameSize);
                unpacked = true;
                break;
            case SubframeType.Fixed:
                // In GPU-only mode the device already produced residual and
                // Rice data, so nothing to do here.
                if (!_settings.GPUOnly)
                {
                    if (!unpacked)
                        unpack_samples(task, task.frameSize);
                    unpacked = true;
                    encode_residual_fixed(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples, task.frame.blocksize, task.frame.subframes[ch].best.order);
                    int pmin = get_max_p_order(eparams.min_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order);
                    int pmax = get_max_p_order(eparams.max_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order);
                    // warmup samples + subframe header bits.
                    uint bits = (uint)(task.frame.subframes[ch].best.order * task.frame.subframes[ch].obits) + 6;
                    task.frame.subframes[ch].best.size = bits + calc_rice_params(task.frame.subframes[ch].best.rc, pmin, pmax, task.frame.subframes[ch].best.residual, (uint)task.frame.blocksize, (uint)task.frame.subframes[ch].best.order);
                }
                break;
            case SubframeType.LPC:
                fixed (int* coefs = task.frame.subframes[ch].best.coefs)
                {
                    // Sum of |coefs| shifted by sample width bounds the widest
                    // intermediate product; overflow of 32 bits forces the
                    // 64-bit (long) residual path, and also forces CPU
                    // re-encoding even in GPU-only mode.
                    ulong csum = 0;
                    for (int i = task.frame.subframes[ch].best.order; i > 0; i--)
                        csum += (ulong)Math.Abs(coefs[i - 1]);
                    if ((csum << task.frame.subframes[ch].obits) >= 1UL << 32 || !_settings.GPUOnly)
                    {
                        if (!unpacked)
                            unpack_samples(task, task.frameSize);
                        unpacked = true;
                        if ((csum << task.frame.subframes[ch].obits) >= 1UL << 32)
                            lpc.encode_residual_long(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples, task.frame.blocksize, task.frame.subframes[ch].best.order, coefs, task.frame.subframes[ch].best.shift);
                        else
                            lpc.encode_residual(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples, task.frame.blocksize, task.frame.subframes[ch].best.order, coefs, task.frame.subframes[ch].best.shift);
                        int pmin = get_max_p_order(eparams.min_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order);
                        int pmax = get_max_p_order(eparams.max_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order);
                        // warmup + precision/shift headers + quantized coefficients + subframe header.
                        uint bits = (uint)(task.frame.subframes[ch].best.order * task.frame.subframes[ch].obits) + 4 + 5 + (uint)task.frame.subframes[ch].best.order * (uint)task.frame.subframes[ch].best.cbits + 6;
                        //uint oldsize = task.frame.subframes[ch].best.size;
                        task.frame.subframes[ch].best.size = bits + calc_rice_params(task.frame.subframes[ch].best.rc, pmin, pmax, task.frame.subframes[ch].best.residual, (uint)task.frame.blocksize, (uint)task.frame.subframes[ch].best.order);
                        //if (task.frame.subframes[ch].best.size > task.frame.subframes[ch].obits * (uint)task.frame.blocksize &&
                        //    oldsize <= task.frame.subframes[ch].obits * (uint)task.frame.blocksize)
                        //    throw new Exception("oops");
                    }
                }
                break;
        }
        // Safety net: never emit a subframe larger than plain Verbatim.
        // NOTE(review): in DEBUG builds the throw makes the fallback below
        // unreachable — presumably intentional, to surface estimator bugs.
        if (task.frame.subframes[ch].best.size > task.frame.subframes[ch].obits * task.frame.blocksize)
        {
#if DEBUG
            throw new Exception("larger than verbatim");
#endif
            task.frame.subframes[ch].best.type = SubframeType.Verbatim;
            task.frame.subframes[ch].best.size = (uint)(task.frame.subframes[ch].obits * task.frame.blocksize);
            if (!unpacked)
                unpack_samples(task, task.frameSize);
            unpacked = true;
        }
    }
}