output_subframe_fixed(FLACCLTask task, FlacSubframeInfo sub, int index) { FlacFrame frame = task.frame; // warm-up samples for (int i = 0; i < sub.best.order; i++) frame.writer.writebits_signed(sub.obits, sub.samples[i]); // residual output_residual(task, sub, sub.obits * sub.best.order, index); }
/// <summary>
/// Submits <paramref name="nFrames"/> frames from the current input buffer to the GPU,
/// then collects and writes out the batch that was submitted on the previous call
/// (the two tasks are swapped at the end of every call).
/// </summary>
/// <param name="nFrames">Number of frames of <c>eparams.block_size</c> samples to submit.</param>
public unsafe void do_output_frames(int nFrames)
{
    // Kick off the new batch first so the GPU is busy while the old one is finished.
    send_to_GPU(task1, nFrames, eparams.block_size);
    run_GPU_task(task1);

    if (task2.frameCount > 0)
    {
        task2.openCLCQ.Finish();

        if (cpu_tasks == null)
        {
            // No worker tasks: encode and write on this thread.
            process_result(task2);
            write_result(task2);
        }
        else
        {
            // Hand the finished GPU task to the oldest CPU slot and take back
            // whatever that slot was holding.
            wait_for_cpu_task();
            FLACCLTask previous = cpu_tasks[oldest_cpu_task];
            cpu_tasks[oldest_cpu_task] = task2;
            task2 = previous;
            start_cpu_task();
            oldest_cpu_task = (oldest_cpu_task + 1) % cpu_tasks.Length;

            if (task2.frameCount > 0)
            {
                write_result(task2);
            }
        }
    }

    // Carry samples that were not part of this batch over to the other task's buffer.
    int consumed = eparams.block_size * nFrames;
    samplesInBuffer -= consumed;
    if (samplesInBuffer > 0)
    {
        byte* leftover = ((byte*)task1.clSamplesBytesPtr) + consumed * _pcm.BlockAlign;
        AudioSamples.MemCpy(((byte*)task2.clSamplesBytesPtr), leftover, samplesInBuffer * _pcm.BlockAlign);
    }

    // Swap roles: the task just submitted becomes the pending one.
    FLACCLTask submitted = task1;
    task1 = task2;
    task2 = submitted;
    task1.frameCount = 0;
}
/// <summary>
/// Writes the residual section of a subframe: the 2-bit coding method, the 4-bit
/// partition order, and then the residual data, which is either copied from the
/// GPU's pre-encoded output or Rice-coded here partition by partition.
/// </summary>
/// <param name="task">Task that owns the frame being written.</param>
/// <param name="sub">Subframe whose selected encoding is emitted.</param>
/// <param name="offs0">Bit count of the subframe data preceding the residual; with 6 added
/// it must equal the header length reported by the GPU.</param>
/// <param name="index">Index of the subframe in <c>task.BestResidualTasks</c>.</param>
unsafe void output_residual(FLACCLTask task, FlacSubframeInfo sub, int offs0, int index)
{
    FlacFrame frame = task.frame;
    int partitionOrder = sub.best.rc.porder;

    frame.writer.writebits(2, sub.best.rc.coding_method);
    frame.writer.writebits(4, partitionOrder);

    if (!task.UseGPURice)
    {
        int partitionSize = frame.blocksize >> partitionOrder;
        int partitionCount = 1 << partitionOrder;
        int done = sub.best.order;

        fixed (byte* fixbuf = frame.writer.Buffer)
        {
            for (int part = 0; part < partitionCount; part++)
            {
                int riceParam = sub.best.rc.rparams[part];
                frame.writer.writebits(4 + sub.best.rc.coding_method, riceParam);

                // The first partition is shortened by the warm-up samples.
                int capacity = part == 0 ? partitionSize - sub.best.order : partitionSize;
                int cnt = Math.Min(capacity, frame.blocksize - done);
                frame.writer.write_rice_block_signed(fixbuf, riceParam, sub.best.residual + done, cnt);
                done += cnt;
            }
        }
        return;
    }

    // GPU already produced the bitstream; sanity-check its bookkeeping before copying.
    int len = task.BestResidualTasks[index].size - task.BestResidualTasks[index].headerLen;
    int pos = task.BestResidualTasks[index].encodingOffset;

    if (task.BestResidualTasks[index].size != (int)sub.best.size)
    {
        throw new Exception("Encoding offset mismatch");
    }
    if (task.BestResidualTasks[index].headerLen != offs0 + 6)
    {
        throw new Exception("Encoding offset mismatch");
    }
    if (pos % 8 != frame.writer.BitLength % 8)
    {
        throw new Exception("Encoding offset mismatch");
    }

    frame.writer.writeints(len, pos, (byte*)task.clRiceOutputPtr);
}
/// <summary>
/// Writes a task's encoded output to the stream and advances the position counters.
/// When a seek table is in use and the stream is seekable, the offsets of seek points
/// falling inside this task's sample range are shifted by the current distance from
/// the first frame.
/// </summary>
/// <param name="task">Task whose <c>outputBuffer</c> is written.</param>
unsafe void write_result(FLACCLTask task)
{
    int samplesInTask = task.frameSize * task.frameCount;

    if (seek_table != null && _IO.CanSeek)
    {
        int sp = 0;
        // Stop at the first seek point located past the end of this task.
        while (sp < seek_table.Length && seek_table[sp].number < task.framePos + samplesInTask)
        {
            if (seek_table[sp].number >= task.framePos)
            {
                seek_table[sp].offset += _IO.Position - first_frame_offset;
            }
            sp++;
        }
    }

    _IO.Write(task.outputBuffer, 0, task.outputSize);
    _position += samplesInTask;
    _totalSize += task.outputSize;
}
/// <summary>
/// One-time initialization: creates the OpenCL context, compiles <c>flac.cl</c> with the
/// defines derived from the encoder settings, opens the output stream if necessary,
/// writes the stream header and allocates the GPU (and optional CPU) tasks.
/// Subsequent calls are no-ops once <c>inited</c> is set.
/// </summary>
/// <exception cref="Exception">No OpenCL platform is available, or the platform named in
/// the settings cannot be found.</exception>
public unsafe void InitTasks()
{
    bool doMidside = channels == 2 && eparams.do_midside;
    int channelCount = doMidside ? 2 * channels : channels;

    if (inited)
        return;

    if (OpenCL.NumberOfPlatforms < 1)
        throw new Exception("no opencl platforms found");

    int groupSize = _settings.DeviceType == OpenCLDeviceType.CPU ? 1 : _settings.GroupSize;
    OCLMan = new OpenCLManager();
    // Attempt to save binaries after compilation, as well as load precompiled binaries
    // to avoid compilation.
    OCLMan.AttemptUseBinaries = true;
    // Attempt to compile sources. When false, only the precompiled binary directory is
    // scanned for a binary matching the source name, Defines and BuildOptions.
    OCLMan.AttemptUseSource = true;
    // Image support is not needed here, so devices without it are not filtered out.
    OCLMan.RequireImageSupport = false;
    // The BuildOptions string is passed directly to clBuild.
    OCLMan.BuildOptions = "";
    // Sources live next to the assembly; compiled binaries go to the per-user data folder.
    OCLMan.SourcePath = System.IO.Path.GetDirectoryName(GetType().Assembly.Location);
    OCLMan.BinaryPath = System.IO.Path.Combine(System.IO.Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.LocalApplicationData), "CUE Tools"), "OpenCL");

    // Resolve the platform by name when one was requested; otherwise use platform 0.
    int platformId = 0;
    if (_settings.Platform != null)
    {
        platformId = -1;
        string platforms = "";
        for (int i = 0; i < OpenCL.NumberOfPlatforms; i++)
        {
            var platform = OpenCL.GetPlatform(i);
            platforms += " \"" + platform.Name + "\"";
            if (platform.Name.Equals(_settings.Platform, StringComparison.InvariantCultureIgnoreCase))
            {
                platformId = i;
                break;
            }
        }
        if (platformId < 0)
            throw new Exception("unknown platform \"" + _settings.Platform + "\". Platforms available:" + platforms);
    }
    OCLMan.CreateDefaultContext(platformId, (DeviceType)_settings.DeviceType);

    this.framesPerTask = (int)OCLMan.Context.Devices[0].MaxComputeUnits * Math.Max(1, _settings.TaskSize / channels);

    // GPU-only mode additionally requires the extended local atomics extension.
    bool UseGPUOnly = _settings.GPUOnly && OCLMan.Context.Devices[0].Extensions.Contains("cl_khr_local_int32_extended_atomics");
    bool UseGPURice = UseGPUOnly && _settings.DoRice;

    if (_blocksize == 0)
    {
        if (eparams.block_size == 0)
            eparams.block_size = select_blocksize(sample_rate, eparams.block_time_ms);
        _blocksize = eparams.block_size;
    }
    else
        eparams.block_size = _blocksize;

    // Smallest power of two that is >= block_size.
    int maxBS = 1 << (BitReader.log2i(eparams.block_size - 1) + 1);

    // The Defines string gets prepended to all compiled sources and is used to pass
    // configuration to the kernels.
    OCLMan.Defines =
        "#define MAX_ORDER " + eparams.max_prediction_order.ToString() + "\n" +
        "#define GROUP_SIZE " + groupSize.ToString() + "\n" +
        "#define FLACCL_VERSION \"" + vendor_string + "\"\n" +
        (UseGPUOnly ? "#define DO_PARTITIONS\n" : "") +
        (UseGPURice ? "#define DO_RICE\n" : "") +
        "#define BITS_PER_SAMPLE " + PCM.BitsPerSample + "\n" +
        "#define MAX_BLOCKSIZE " + maxBS + "\n" +
        "#define MAX_CHANNELS " + PCM.ChannelCount + "\n" +
#if DEBUG
        "#define DEBUG\n" +
#endif
        (_settings.DeviceType == OpenCLDeviceType.CPU ? "#define FLACCL_CPU\n" : "") +
        _settings.Defines + "\n";

    var exts = new string[] { "cl_khr_local_int32_base_atomics", "cl_khr_local_int32_extended_atomics", "cl_khr_fp64", "cl_amd_fp64" };
    foreach (string extension in exts)
        if (OCLMan.Context.Devices[0].Extensions.Contains(extension))
        {
            OCLMan.Defines += "#pragma OPENCL EXTENSION " + extension + ": enable\n";
            OCLMan.Defines += "#define HAVE_" + extension + "\n";
        }

    try
    {
        openCLProgram = OCLMan.CompileFile("flac.cl");
    }
    catch (OpenCLBuildException ex)
    {
        // Not used at runtime; presumably kept so the build log is visible in a debugger.
        string buildLog = ex.BuildLogs[0];
        // Rethrow without resetting the stack trace ("throw ex;" would discard it).
        throw;
    }

#if TTTTKJHSKJH
    var openCLPlatform = OpenCL.GetPlatform(0);
    openCLContext = openCLPlatform.CreateDefaultContext();
    using (Stream kernel = GetType().Assembly.GetManifestResourceStream(GetType(), "flac.cl"))
    using (StreamReader sr = new StreamReader(kernel))
        openCLProgram = openCLContext.CreateProgramWithSource(sr.ReadToEnd());
    try
    {
        openCLProgram.Build();
    }
    catch (OpenCLException)
    {
        string buildLog = openCLProgram.GetBuildLog(openCLProgram.Devices[0]);
        throw;
    }
#endif

    if (_IO == null)
        _IO = new FileStream(_path, FileMode.Create, FileAccess.Write, FileShare.Read);
    int header_size = flake_encode_init();
    _IO.Write(header, 0, header_size);
    _totalSize += header_size;
    if (_IO.CanSeek)
        first_frame_offset = _IO.Position;

    task1 = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, this, groupSize, UseGPUOnly, UseGPURice);
    task2 = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, this, groupSize, UseGPUOnly, UseGPURice);
    if (_settings.CPUThreads > 0)
    {
        cpu_tasks = new FLACCLTask[_settings.CPUThreads];
        for (int i = 0; i < cpu_tasks.Length; i++)
            cpu_tasks[i] = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, this, groupSize, UseGPUOnly, UseGPURice);
    }
    inited = true;
}
/// <summary>
/// Builds the table of candidate subframe encodings (LPC orders per window, optional
/// constant, fixed predictors) for every frame and channel of a task, together with the
/// list of task indices selected for estimation, and uploads both to the device unless
/// the task uses mapped memory.
/// </summary>
/// <param name="blocksize">Frame size in samples.</param>
/// <param name="channelsCount">Number of channels to analyse (4 when stereo decorrelation
/// candidates are included).</param>
/// <param name="nFrames">Number of frames the table is built for.</param>
/// <param name="task">Task whose buffers are filled.</param>
/// <exception cref="Exception">No window function is enabled, more than 32 candidates per
/// channel are requested, or the task table does not fit in the task's buffer.</exception>
unsafe void initializeSubframeTasks(int blocksize, int channelsCount, int nFrames, FLACCLTask task)
{
    // Per-channel stride: frames are packed back to back, block size rounded up to a multiple of 4.
    task.channelSize = ((blocksize + 3) & ~3) * nFrames;
    task.frameSize = blocksize;
    task.nWindowFunctions = 0;
    if (task.frameSize > 4)
    {
        calculate_window(task, lpc.window_welch, WindowFunction.Welch);
        calculate_window(task, lpc.window_flattop, WindowFunction.Flattop);
        calculate_window(task, lpc.window_tukey, WindowFunction.Tukey);
        calculate_window(task, lpc.window_hann, WindowFunction.Hann);
        calculate_window(task, lpc.window_bartlett, WindowFunction.Bartlett);
        if (task.nWindowFunctions == 0)
            throw new Exception("invalid windowfunction");
        if (!task.UseMappedMemory)
            task.openCLCQ.EnqueueWriteBuffer(task.clWindowFunctions, false, 0, sizeof(float) * task.nWindowFunctions * task.frameSize, task.clWindowFunctionsPtr);
    }

    task.nResidualTasks = 0;
    task.nTasksPerWindow = Math.Min(32, eparams.orders_per_window);
    task.nResidualTasksPerChannel = task.nWindowFunctions * task.nTasksPerWindow
        + (eparams.do_constant ? 1 : 0)
        + Math.Max(0, 1 + eparams.max_fixed_order - eparams.min_fixed_order);
    if (task.nResidualTasksPerChannel > 32)
        throw new Exception("too many tasks");
    if (channels == 2 && channelsCount == 4)
        task.nEstimateTasksPerChannel = Math.Min(eparams.orders_per_channel, task.nResidualTasksPerChannel);
    else
        task.nEstimateTasksPerChannel = task.nResidualTasksPerChannel;

    // Verify capacity BEFORE filling the table (the original only checked afterwards),
    // so an undersized buffer is reported before any entry is written.
    long requiredBytes = (long)sizeof(FLACCLSubframeTask) * nFrames * channelsCount * task.nResidualTasksPerChannel;
    if (requiredBytes > task.residualTasksLen)
        throw new Exception("oops");

    int* selectedTasks = (int*)task.clSelectedTasksPtr;
    int codingMethod = PCM.BitsPerSample > 16 ? 1 : 0;

    for (int iFrame = 0; iFrame < nFrames; iFrame++)
    {
        for (int ch = 0; ch < channelsCount; ch++)
        {
            int slot = iFrame * channelsCount + ch;
            // Channel 3 of a stereo stream gets one extra bit per sample.
            int obits = (int)bits_per_sample + (channels == 2 && ch == 3 ? 1 : 0);
            int samplesOffs = ch * task.channelSize + iFrame * blocksize;

            // Selected-task list: LPC candidates are interleaved across windows so that
            // a truncated list still covers every window.
            for (int j = 0; j < task.nEstimateTasksPerChannel; j++)
            {
                int k = j;
                if (j < task.nWindowFunctions * task.nTasksPerWindow && task.nWindowFunctions > 1)
                    k = (j % task.nWindowFunctions) * task.nTasksPerWindow + (j / task.nWindowFunctions);
                selectedTasks[slot * task.nEstimateTasksPerChannel + j] = slot * task.nResidualTasksPerChannel + k;
            }

            // LPC tasks: orders 1..nTasksPerWindow for each window function.
            for (int iWindow = 0; iWindow < task.nWindowFunctions; iWindow++)
            {
                for (int order = 0; order < task.nTasksPerWindow; order++)
                {
                    int t = task.nResidualTasks;
                    task.ResidualTasks[t].type = (int)SubframeType.LPC;
                    task.ResidualTasks[t].channel = ch;
                    task.ResidualTasks[t].obits = obits;
                    task.ResidualTasks[t].abits = task.ResidualTasks[t].obits;
                    task.ResidualTasks[t].blocksize = blocksize;
                    task.ResidualTasks[t].residualOrder = order + 1;
                    task.ResidualTasks[t].samplesOffs = samplesOffs;
                    task.ResidualTasks[t].residualOffs = task.ResidualTasks[t].samplesOffs;
                    task.ResidualTasks[t].wbits = 0;
                    task.ResidualTasks[t].coding_method = codingMethod;
                    task.ResidualTasks[t].size = task.ResidualTasks[t].obits * blocksize;
                    task.nResidualTasks++;
                }
            }

            // Constant frames
            if (eparams.do_constant)
            {
                int t = task.nResidualTasks;
                task.ResidualTasks[t].type = (int)SubframeType.Constant;
                task.ResidualTasks[t].channel = ch;
                task.ResidualTasks[t].obits = obits;
                task.ResidualTasks[t].abits = task.ResidualTasks[t].obits;
                task.ResidualTasks[t].blocksize = blocksize;
                task.ResidualTasks[t].samplesOffs = samplesOffs;
                task.ResidualTasks[t].residualOffs = task.ResidualTasks[t].samplesOffs;
                task.ResidualTasks[t].wbits = 0;
                task.ResidualTasks[t].coding_method = codingMethod;
                task.ResidualTasks[t].size = task.ResidualTasks[t].obits * blocksize;
                task.ResidualTasks[t].residualOrder = 1;
                task.ResidualTasks[t].shift = 0;
                task.ResidualTasks[t].coefs[0] = 1;
                task.nResidualTasks++;
            }

            // Fixed prediction
            for (int order = eparams.min_fixed_order; order <= eparams.max_fixed_order; order++)
            {
                int t = task.nResidualTasks;
                task.ResidualTasks[t].type = (int)SubframeType.Fixed;
                task.ResidualTasks[t].channel = ch;
                task.ResidualTasks[t].obits = obits;
                task.ResidualTasks[t].abits = task.ResidualTasks[t].obits;
                task.ResidualTasks[t].blocksize = blocksize;
                task.ResidualTasks[t].residualOrder = order;
                task.ResidualTasks[t].samplesOffs = samplesOffs;
                task.ResidualTasks[t].residualOffs = task.ResidualTasks[t].samplesOffs;
                task.ResidualTasks[t].wbits = 0;
                task.ResidualTasks[t].coding_method = codingMethod;
                task.ResidualTasks[t].size = task.ResidualTasks[t].obits * blocksize;
                task.ResidualTasks[t].shift = 0;
                // Fixed predictor coefficients, stored oldest sample first.
                switch (order)
                {
                    case 0:
                        break;
                    case 1:
                        task.ResidualTasks[t].coefs[0] = 1;
                        break;
                    case 2:
                        task.ResidualTasks[t].coefs[1] = 2;
                        task.ResidualTasks[t].coefs[0] = -1;
                        break;
                    case 3:
                        task.ResidualTasks[t].coefs[2] = 3;
                        task.ResidualTasks[t].coefs[1] = -3;
                        task.ResidualTasks[t].coefs[0] = 1;
                        break;
                    case 4:
                        task.ResidualTasks[t].coefs[3] = 4;
                        task.ResidualTasks[t].coefs[2] = -6;
                        task.ResidualTasks[t].coefs[1] = 4;
                        task.ResidualTasks[t].coefs[0] = -1;
                        break;
                }
                task.nResidualTasks++;
            }
        }
    }

    // Consistency check on the number of entries actually produced.
    if (sizeof(FLACCLSubframeTask) * task.nResidualTasks > task.residualTasksLen)
        throw new Exception("oops");

    if (!task.UseMappedMemory)
    {
        task.openCLCQ.EnqueueWriteBuffer(task.clResidualTasks, false, 0, sizeof(FLACCLSubframeTask) * task.nResidualTasks, task.clResidualTasksPtr);
        task.openCLCQ.EnqueueWriteBuffer(task.clSelectedTasks, false, 0, sizeof(int) * (nFrames * channelsCount * task.nEstimateTasksPerChannel), task.clSelectedTasksPtr);
    }
}
/// <summary>
/// Encodes every frame of a finished task into the task's frame writer, optionally
/// verifies each frame by decoding it and comparing against the original input bytes,
/// and records seek points that fall inside the encoded frames.
/// On return <c>task.outputSize</c> holds the total number of bytes produced.
/// </summary>
/// <param name="task">Task whose GPU results are turned into FLAC frames.</param>
/// <exception cref="Exception">Verification failed, the sample format is unsupported by
/// the verifier (neither 16 nor 24 bits), or the writer length disagrees with the sum of
/// the frame sizes.</exception>
unsafe void process_result(FLACCLTask task)
{
    bool doMidside = channels == 2 && eparams.do_midside;
    int channelCount = doMidside ? 2 * channels : channels;

    long iSample = 0;
    long iByte = 0;
    task.frame.writer.Reset();
    task.frame.writer_offset = 0;
    for (int iFrame = 0; iFrame < task.frameCount; iFrame++)
    {
        // Frame number as stored in the stream: a sample position for variable
        // block size, otherwise a running frame index.
        int fn = task.frameNumber + (eparams.variable_block_size > 0 ? (int)iSample : iFrame);
        int fs = encode_frame(doMidside, channelCount, iFrame, task, fn);

        if (task.verify != null)
        {
            int decoded = task.verify.DecodeFrame(task.frame.writer.Buffer, task.frame.writer_offset, fs);
            if (decoded != fs || task.verify.Remaining != task.frameSize)
                throw new Exception(string.Format("validation failed! frame size mismatch, iFrame={0}, decoded=={1}, fs=={2}", fn, decoded, fs));
            fixed (int* r = task.verify.Samples)
            {
                for (int ch = 0; ch < channels; ch++)
                {
                    // Original interleaved input for this frame/channel, stepped by BlockAlign.
                    byte* res = ((byte*)task.clSamplesBytesPtr) + PCM.BlockAlign * iFrame * task.frameSize + ch * (PCM.BlockAlign / channels);
                    int* smp = r + ch * Flake.MAX_BLOCKSIZE;
                    int ba = PCM.BlockAlign;
                    if (PCM.BitsPerSample == 16)
                    {
                        for (int i = task.frameSize; i > 0; i--)
                        {
                            int ress = *(short*)res;
                            if (ress != *(smp++))
                                throw new Exception(string.Format("validation failed! iFrame={0}, ch={1}", fn, ch));
                            res += ba;
                        }
                    }
                    else if (PCM.BitsPerSample == 24)
                    {
                        for (int i = task.frameSize; i > 0; i--)
                        {
                            // Assemble in the top 24 bits, then shift down to sign-extend.
                            int ress = (((int)res[0] << 8) + ((int)res[1] << 16) + ((int)res[2] << 24)) >> (8);
                            if (ress != *(smp++))
                                // Report the stream frame number, consistent with the other
                                // validation messages (previously the task-local index).
                                throw new Exception(string.Format("validation failed! iFrame={0}, ch={1}", fn, ch));
                            res += ba;
                        }
                    }
                    else
                        throw new Exception("Invalid BPS");
                }
            }
        }

        if (seek_table != null && _IO.CanSeek)
        {
            for (int sp = 0; sp < seek_table.Length; sp++)
            {
                // Already resolved by an earlier frame.
                if (seek_table[sp].framesize != 0)
                    continue;
                if (seek_table[sp].number >= task.framePos + iSample + task.frameSize)
                    break;
                if (seek_table[sp].number >= task.framePos + iSample)
                {
                    // Snap the seek point to the start of this frame; the offset is
                    // relative to the task output here.
                    seek_table[sp].number = task.framePos + iSample;
                    seek_table[sp].offset = iByte;
                    seek_table[sp].framesize = task.frameSize;
                }
            }
        }

        iSample += task.frameSize;
        iByte += fs;
    }
    task.outputSize = (int)iByte;
    if (iByte != task.frame.writer.Length)
        throw new Exception("invalid length");
}
/// <summary>
/// Encodes one frame of a task into the task's frame writer, appending after whatever
/// the writer already contains.
/// </summary>
/// <param name="doMidside">Whether stereo decorrelation candidates are present.</param>
/// <param name="channelCount">Number of analysed channels (subframes to initialise).</param>
/// <param name="iFrame">Index of the frame inside the task.</param>
/// <param name="task">Task providing samples, residuals and the frame writer.</param>
/// <param name="current_frame_number">Frame number written to the frame header.</param>
/// <returns>Number of bytes appended to the writer for this frame.</returns>
/// <exception cref="Exception">The frame is not smaller than <c>max_frame_size</c>.</exception>
unsafe int encode_frame(bool doMidside, int channelCount, int iFrame, FLACCLTask task, int current_frame_number)
{
    FlacFrame frame = task.frame;

    frame.InitSize(task.frameSize, eparams.variable_block_size != 0);
    // Task-local index for now: unpack_samples reads it to locate the input bytes.
    frame.frame_number = iFrame;
    frame.ch_mode = ChannelMode.NotStereo;

    fixed (int* smp = task.samplesBuffer)
    {
        int* residuals = (int*)task.clResidualPtr;
        for (int ch = 0; ch < channelCount; ch++)
        {
            int offset = ch * task.channelSize + iFrame * task.frameSize;
            int bits = _pcm.BitsPerSample + (doMidside && ch == 3 ? 1 : 0);
            frame.subframes[ch].Init(smp + offset, residuals + offset, bits, 0);
        }

        encode_residual(task, channelCount, iFrame);

        // From here on the header must carry the stream-level frame number.
        frame.frame_number = current_frame_number;
        frame.writer_offset = frame.writer.Length;

        output_frame_header(frame);
        output_subframes(task, iFrame);
        output_frame_footer(frame);

        int written = frame.writer.Length - frame.writer_offset;
        if (written >= max_frame_size)
        {
            throw new Exception("buffer overflow");
        }
        return written;
    }
}
/// <summary>
/// Prepares a task for a new batch of frames: records frame count, size and position,
/// advances the encoder's running frame counters, and uploads the input sample bytes
/// to the device unless the task uses mapped memory.
/// </summary>
/// <param name="task">Task to fill in.</param>
/// <param name="nFrames">Number of frames in the batch.</param>
/// <param name="blocksize">Frame size in samples.</param>
unsafe void send_to_GPU(FLACCLTask task, int nFrames, int blocksize)
{
    // A changed block size invalidates the task table; zeroing the count makes
    // run_GPU_task rebuild it.
    if (blocksize != task.frameSize)
        task.nResidualTasks = 0;

    task.frameCount = nFrames;
    task.frameSize = blocksize;
    // Variable block size streams number frames by sample position, fixed ones by index.
    task.frameNumber = eparams.variable_block_size > 0 ? frame_pos : frame_count;
    task.framePos = frame_pos;
    frame_count += nFrames;
    frame_pos += nFrames * blocksize;

    if (!task.UseMappedMemory)
        task.openCLCQ.EnqueueWriteBuffer(task.clSamplesBytes, false, 0, PCM.BlockAlign * blocksize * nFrames, task.clSamplesBytesPtr);
}
/// <summary>
/// Converts channel-interleaved little-endian 24-bit input into the per-subframe sample
/// arrays, applying the frame's channel mode and each subframe's wasted-bits shift.
/// </summary>
/// <param name="task">Task whose frame subframes receive the samples.</param>
/// <param name="srcptr">First input byte of the frame.</param>
/// <param name="count">Number of samples per channel to convert.</param>
unsafe void unpack_samples_24(FLACCLTask task, byte* srcptr, int count)
{
    switch (task.frame.ch_mode)
    {
        case ChannelMode.NotStereo:
            for (int ch = 0; ch < channels; ch++)
            {
                int* dst = task.frame.subframes[ch].samples;
                int shift = 8 + (int)task.frame.subframes[ch].wbits;
                byte* p = srcptr + ch * 3;
                for (int i = 0; i < count; i++)
                {
                    // Build the value in the top 24 bits so the shift sign-extends it.
                    dst[i] = ((p[0] << 8) | (p[1] << 16) | (p[2] << 24)) >> shift;
                    p += PCM.BlockAlign;
                }
            }
            break;

        // The stereo modes below consume exactly six bytes per left/right sample pair.
        case ChannelMode.LeftRight:
            {
                int* first = task.frame.subframes[0].samples;
                int* second = task.frame.subframes[1].samples;
                int firstShift = (int)task.frame.subframes[0].wbits;
                int secondShift = (int)task.frame.subframes[1].wbits;
                for (int i = 0; i < count; i++, srcptr += 6)
                {
                    int l = ((srcptr[0] << 8) | (srcptr[1] << 16) | (srcptr[2] << 24)) >> 8;
                    int r = ((srcptr[3] << 8) | (srcptr[4] << 16) | (srcptr[5] << 24)) >> 8;
                    first[i] = l >> firstShift;
                    second[i] = r >> secondShift;
                }
                break;
            }

        case ChannelMode.LeftSide:
            {
                int* first = task.frame.subframes[0].samples;
                int* second = task.frame.subframes[1].samples;
                int firstShift = (int)task.frame.subframes[0].wbits;
                int secondShift = (int)task.frame.subframes[1].wbits;
                for (int i = 0; i < count; i++, srcptr += 6)
                {
                    int l = ((srcptr[0] << 8) | (srcptr[1] << 16) | (srcptr[2] << 24)) >> 8;
                    int r = ((srcptr[3] << 8) | (srcptr[4] << 16) | (srcptr[5] << 24)) >> 8;
                    first[i] = l >> firstShift;
                    second[i] = (l - r) >> secondShift;
                }
                break;
            }

        case ChannelMode.RightSide:
            {
                int* first = task.frame.subframes[0].samples;
                int* second = task.frame.subframes[1].samples;
                int firstShift = (int)task.frame.subframes[0].wbits;
                int secondShift = (int)task.frame.subframes[1].wbits;
                for (int i = 0; i < count; i++, srcptr += 6)
                {
                    int l = ((srcptr[0] << 8) | (srcptr[1] << 16) | (srcptr[2] << 24)) >> 8;
                    int r = ((srcptr[3] << 8) | (srcptr[4] << 16) | (srcptr[5] << 24)) >> 8;
                    first[i] = (l - r) >> firstShift;
                    second[i] = r >> secondShift;
                }
                break;
            }

        case ChannelMode.MidSide:
            {
                int* first = task.frame.subframes[0].samples;
                int* second = task.frame.subframes[1].samples;
                int firstShift = (int)task.frame.subframes[0].wbits;
                int secondShift = (int)task.frame.subframes[1].wbits;
                for (int i = 0; i < count; i++, srcptr += 6)
                {
                    int l = ((srcptr[0] << 8) | (srcptr[1] << 16) | (srcptr[2] << 24)) >> 8;
                    int r = ((srcptr[3] << 8) | (srcptr[4] << 16) | (srcptr[5] << 24)) >> 8;
                    first[i] = (l + r) >> (1 + firstShift);
                    second[i] = (l - r) >> secondShift;
                }
                break;
            }
    }
}
/// <summary>
/// Locates the input bytes of the task's current frame (taken from
/// <c>task.frame.frame_number</c>) and unpacks them into the subframe sample arrays
/// using the converter matching the input sample width.
/// </summary>
/// <param name="task">Task holding the input bytes and the destination frame.</param>
/// <param name="count">Number of samples per channel to unpack.</param>
/// <exception cref="Exception">The input is neither 16 nor 24 bits per sample.</exception>
unsafe void unpack_samples(FLACCLTask task, int count)
{
    int frameIndex = task.frame.frame_number;
    byte* frameStart = ((byte*)task.clSamplesBytesPtr) + frameIndex * task.frameSize * PCM.BlockAlign;

    switch (PCM.BitsPerSample)
    {
        case 16:
            unpack_samples_16(task, frameStart, count);
            break;
        case 24:
            unpack_samples_24(task, frameStart, count);
            break;
        default:
            throw new Exception("Invalid BPS");
    }
}
/// <summary>
/// Converts channel-interleaved 16-bit input into the per-subframe sample arrays,
/// applying the frame's channel mode and each subframe's wasted-bits shift.
/// </summary>
/// <param name="task">Task whose frame subframes receive the samples.</param>
/// <param name="srcptr">First input byte of the frame.</param>
/// <param name="count">Number of samples per channel to convert.</param>
unsafe void unpack_samples_16(FLACCLTask task, byte * srcptr, int count)
{
    short* input = (short*)srcptr;

    switch (task.frame.ch_mode)
    {
        case ChannelMode.NotStereo:
            for (int ch = 0; ch < channels; ch++)
            {
                int* dst = task.frame.subframes[ch].samples;
                int shift = (int)task.frame.subframes[ch].wbits;
                short* p = input + ch;
                for (int i = 0; i < count; i++)
                {
                    dst[i] = *p >> shift;
                    p += channels;
                }
            }
            break;

        // The stereo modes below read two consecutive shorts per sample pair.
        case ChannelMode.LeftRight:
            {
                int* first = task.frame.subframes[0].samples;
                int* second = task.frame.subframes[1].samples;
                int firstShift = (int)task.frame.subframes[0].wbits;
                int secondShift = (int)task.frame.subframes[1].wbits;
                for (int i = 0; i < count; i++)
                {
                    int l = input[2 * i];
                    int r = input[2 * i + 1];
                    first[i] = l >> firstShift;
                    second[i] = r >> secondShift;
                }
                break;
            }

        case ChannelMode.LeftSide:
            {
                int* first = task.frame.subframes[0].samples;
                int* second = task.frame.subframes[1].samples;
                int firstShift = (int)task.frame.subframes[0].wbits;
                int secondShift = (int)task.frame.subframes[1].wbits;
                for (int i = 0; i < count; i++)
                {
                    int l = input[2 * i];
                    int r = input[2 * i + 1];
                    first[i] = l >> firstShift;
                    second[i] = (l - r) >> secondShift;
                }
                break;
            }

        case ChannelMode.RightSide:
            {
                int* first = task.frame.subframes[0].samples;
                int* second = task.frame.subframes[1].samples;
                int firstShift = (int)task.frame.subframes[0].wbits;
                int secondShift = (int)task.frame.subframes[1].wbits;
                for (int i = 0; i < count; i++)
                {
                    int l = input[2 * i];
                    int r = input[2 * i + 1];
                    first[i] = (l - r) >> firstShift;
                    second[i] = r >> secondShift;
                }
                break;
            }

        case ChannelMode.MidSide:
            {
                int* first = task.frame.subframes[0].samples;
                int* second = task.frame.subframes[1].samples;
                int firstShift = (int)task.frame.subframes[0].wbits;
                int secondShift = (int)task.frame.subframes[1].wbits;
                for (int i = 0; i < count; i++)
                {
                    int l = input[2 * i];
                    int r = input[2 * i + 1];
                    first[i] = (l + r) >> (1 + firstShift);
                    second[i] = (l - r) >> secondShift;
                }
                break;
            }
    }
}
/// <summary>
/// Enqueues the task's kernels, unless the frame size is 4 samples or fewer.
/// </summary>
/// <param name="task">Task whose kernels are enqueued.</param>
/// <param name="channelsCount">Not used by this method.</param>
unsafe void estimate_residual(FLACCLTask task, int channelsCount)
{
    if (task.frameSize <= 4)
    {
        return;
    }
    task.EnqueueKernels();
}
/// <summary>
/// Turns the best-task results of one frame into the frame's final subframe encodings:
/// picks the stereo channel mode, copies the chosen predictor parameters into each
/// subframe, unpacks the input samples, and (when not in GPU-only mode) recomputes the
/// residual, Rice parameters and exact size on the CPU.
/// </summary>
/// <param name="task">Task holding <c>BestResidualTasks</c> and the frame being built.</param>
/// <param name="channelsCount">Number of analysed channels; 4 means stereo decorrelation candidates exist.</param>
/// <param name="iFrame">Index of the frame inside the task.</param>
/// <exception cref="Exception">The best tasks describe an unknown stereo combination, report a
/// negative size, or report a size that disagrees with the measured one.</exception>
unsafe void encode_residual(FLACCLTask task, int channelsCount, int iFrame)
{
    FlacFrame frame = task.frame;
    if (channelsCount == 4 && channels == 2 && frame.blocksize > 4)
    {
        // The channel numbers of the two best tasks identify the stereo mode:
        // (0,1) left/right, (0,3) left/side, (3,1) side/right, (2,3) mid/side.
        if (task.BestResidualTasks[iFrame * 2].channel == 0 && task.BestResidualTasks[iFrame * 2 + 1].channel == 1)
            frame.ch_mode = ChannelMode.LeftRight;
        else if (task.BestResidualTasks[iFrame * 2].channel == 0 && task.BestResidualTasks[iFrame * 2 + 1].channel == 3)
            frame.ch_mode = ChannelMode.LeftSide;
        else if (task.BestResidualTasks[iFrame * 2].channel == 3 && task.BestResidualTasks[iFrame * 2 + 1].channel == 1)
            frame.ch_mode = ChannelMode.RightSide;
        else if (task.BestResidualTasks[iFrame * 2].channel == 2 && task.BestResidualTasks[iFrame * 2 + 1].channel == 3)
            frame.ch_mode = ChannelMode.MidSide;
        else
            throw new Exception("internal error: invalid stereo mode");
        // Move the chosen channels into subframe slots 0 and 1.
        frame.SwapSubframes(0, task.BestResidualTasks[iFrame * 2].channel);
        frame.SwapSubframes(1, task.BestResidualTasks[iFrame * 2 + 1].channel);
    }
    else
        frame.ch_mode = channels != 2 ? ChannelMode.NotStereo : ChannelMode.LeftRight;

    // By default only the warm-up samples are needed; raised to the whole frame below
    // whenever a subframe needs every sample on the CPU.
    int toUnpack = Math.Min(task.frameSize, eparams.max_prediction_order);

    // calculate wbits before unpacking samples.
    for (int ch = 0; ch < channels; ch++)
    {
        int index = ch + iFrame * channels;
        // Start from a verbatim subframe and replace it if the best task is smaller.
        frame.subframes[ch].best.residual = ((int*)task.clResidualPtr) + task.BestResidualTasks[index].residualOffs;
        frame.subframes[ch].best.type = SubframeType.Verbatim;
        frame.subframes[ch].best.size = (uint)(frame.subframes[ch].obits * frame.blocksize);
        frame.subframes[ch].wbits = 0;

        if (frame.blocksize > Math.Max(4, eparams.max_prediction_order))
        {
            if (task.BestResidualTasks[index].size < 0)
                throw new Exception("internal error");
            if (frame.subframes[ch].best.size > task.BestResidualTasks[index].size && (SubframeType)task.BestResidualTasks[index].type != SubframeType.Verbatim)
            {
                frame.subframes[ch].best.type = (SubframeType)task.BestResidualTasks[index].type;
                frame.subframes[ch].best.size = (uint)task.BestResidualTasks[index].size;
                frame.subframes[ch].best.order = task.BestResidualTasks[index].residualOrder;
                frame.subframes[ch].best.cbits = task.BestResidualTasks[index].cbits;
                frame.subframes[ch].best.shift = task.BestResidualTasks[index].shift;
                // Wasted bits reduce the effective sample width.
                frame.subframes[ch].obits -= task.BestResidualTasks[index].wbits;
                frame.subframes[ch].wbits = task.BestResidualTasks[index].wbits;
                // Coefficients are copied in reverse order relative to the task's layout.
                for (int i = 0; i < task.BestResidualTasks[index].residualOrder; i++)
                    frame.subframes[ch].best.coefs[i] = task.BestResidualTasks[index].coefs[task.BestResidualTasks[index].residualOrder - 1 - i];
                frame.subframes[ch].best.rc.porder = task.BestResidualTasks[index].porder;
                frame.subframes[ch].best.rc.coding_method = task.BestResidualTasks[index].coding_method;
                if (task.UseGPUOnly && !task.UseGPURice)
                {
                    // Partitioning was done on the GPU but Rice coding is not: fetch the
                    // Rice parameters and cross-check the reported size.
                    if (frame.subframes[ch].best.type == SubframeType.Fixed || frame.subframes[ch].best.type == SubframeType.LPC)
                    {
                        int* riceParams = ((int*)task.clBestRiceParamsPtr) + (index << task.max_porder);
                        fixed (int* dstParams = frame.subframes[ch].best.rc.rparams)
                            AudioSamples.MemCpy(dstParams, riceParams, (1 << frame.subframes[ch].best.rc.porder));
                    }
                    uint real_size = measure_subframe(frame, frame.subframes[ch]);
                    if (real_size != task.frame.subframes[ch].best.size)
                        throw new Exception("size reported incorrectly");
                }
            }
            else
            {
                // Verbatim kept: with GPU Rice coding the reported size must match the verbatim size.
                if (task.UseGPURice && frame.subframes[ch].best.size != task.BestResidualTasks[index].size)
                    throw new Exception("size reported incorrectly");
            }
        }
        // Verbatim always needs all samples; LPC/Fixed need them when the CPU computes the residual.
        if (task.frame.subframes[ch].best.type == SubframeType.Verbatim)
            toUnpack = task.frameSize;
        if (task.frame.subframes[ch].best.type == SubframeType.LPC && !task.UseGPUOnly)
            toUnpack = task.frameSize;
        if (task.frame.subframes[ch].best.type == SubframeType.Fixed && !task.UseGPUOnly)
            toUnpack = task.frameSize;
    }
    unpack_samples(task, toUnpack);
    for (int ch = 0; ch < channels; ch++)
    {
        int index = ch + iFrame * channels;
        switch (task.frame.subframes[ch].best.type)
        {
            case SubframeType.Constant:
                break;
            case SubframeType.Verbatim:
                break;
            case SubframeType.Fixed:
                if (!task.UseGPUOnly)
                {
                    // CPU path: compute the residual and choose Rice parameters here.
                    encode_residual_fixed(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples,
                        task.frame.blocksize, task.frame.subframes[ch].best.order);

                    int pmin = get_max_p_order(eparams.min_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order);
                    int pmax = get_max_p_order(eparams.max_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order);
                    calc_rice_params(task.frame.subframes[ch].best.rc, pmin, pmax, task.frame.subframes[ch].best.residual, (uint)task.frame.blocksize, (uint)task.frame.subframes[ch].best.order, PCM.BitsPerSample > 16 ? 1 : 0);
                }
                break;
            case SubframeType.LPC:
                if (!task.UseGPUOnly)
                {
                    fixed (int* coefs = task.frame.subframes[ch].best.coefs)
                    {
                        // Inputs wider than 16 bits use the "long" residual routine.
                        if (PCM.BitsPerSample > 16)
                            lpc.encode_residual_long(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples, task.frame.blocksize, task.frame.subframes[ch].best.order, coefs, task.frame.subframes[ch].best.shift);
                        else
                            lpc.encode_residual(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples, task.frame.blocksize, task.frame.subframes[ch].best.order, coefs, task.frame.subframes[ch].best.shift);
                    }

                    int pmin = get_max_p_order(eparams.min_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order);
                    int pmax = get_max_p_order(eparams.max_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order);
                    calc_rice_params(task.frame.subframes[ch].best.rc, pmin, pmax, task.frame.subframes[ch].best.residual, (uint)task.frame.blocksize, (uint)task.frame.subframes[ch].best.order, PCM.BitsPerSample > 16 ? 1 : 0);
                }
                break;
        }
        if (!task.UseGPUOnly)
        {
            // Measure the real size and fall back to verbatim if the prediction did not pay off.
            task.frame.subframes[ch].best.size = measure_subframe(task.frame, task.frame.subframes[ch]);
            if (task.frame.subframes[ch].best.size > task.frame.subframes[ch].obits * task.frame.blocksize)
            {
                task.frame.subframes[ch].best.type = SubframeType.Verbatim;
                task.frame.subframes[ch].best.size = (uint)(task.frame.subframes[ch].obits * task.frame.blocksize);
            }
        }
    }
}
output_subframe_lpc(FLACCLTask task, FlacSubframeInfo sub, int index)
{
    // Writes the body of an LPC subframe: warm-up samples, predictor description,
    // then the residual.
    FlacFrame frame = task.frame;

    // warm-up samples: the first `order` samples, each written as a signed obits-wide value
    for (int i = 0; i < sub.best.order; i++)
        frame.writer.writebits_signed(sub.obits, sub.samples[i]);

    // LPC coefficients: 4 bits of (cbits - 1), a 5-bit signed shift,
    // then every coefficient as a signed cbits-wide value
    frame.writer.writebits(4, sub.best.cbits - 1);
    frame.writer.writebits_signed(5, sub.best.shift);
    for (int i = 0; i < sub.best.order; i++)
        frame.writer.writebits_signed(sub.best.cbits, sub.best.coefs[i]);

    // residual; the third argument is the number of bits written above
    // (order * (obits + cbits) + 4 + 5), which output_residual checks against
    // the header length reported for GPU-encoded residuals
    output_residual(task, sub, (sub.obits + sub.best.cbits) * sub.best.order + 9, index);
}
/// <summary>
/// Starts GPU processing of a task. The subframe task table is (re)built first when the
/// task has none, sized for <c>framesPerTask</c> frames.
/// </summary>
/// <param name="task">Task previously prepared by <c>send_to_GPU</c>.</param>
unsafe void run_GPU_task(FLACCLTask task)
{
    // Stereo input with mid/side enabled is analysed as four channels.
    int channelsCount = channels;
    if (channels == 2 && eparams.do_midside)
    {
        channelsCount = 2 * channels;
    }

    if (task.nResidualTasks == 0)
    {
        initializeSubframeTasks(task.frameSize, channelsCount, framesPerTask, task);
    }

    estimate_residual(task, channelsCount);
}
/// <summary>
/// Writes every subframe of the task's current frame: the subframe header
/// (padding bit, 6-bit type code, wasted-bits flag and count) followed by the
/// type-specific body.
/// </summary>
/// <param name="task">Task owning the frame and its writer.</param>
/// <param name="iFrame">Index of the frame inside the task; used to derive the
/// best-task index handed to the fixed/LPC writers.</param>
unsafe void output_subframes(FLACCLTask task, int iFrame)
{
    FlacFrame frame = task.frame;
    int firstIndex = iFrame * channels;

    for (int ch = 0; ch < channels; ch++)
    {
        FlacSubframeInfo sub = frame.subframes[ch];

        // Type code: the predictor order is folded into the low bits for Fixed and LPC.
        int typeCode = (int)sub.best.type;
        if (sub.best.type == SubframeType.Fixed)
        {
            typeCode |= sub.best.order;
        }
        if (sub.best.type == SubframeType.LPC)
        {
            typeCode |= sub.best.order - 1;
        }

        frame.writer.writebits(1, 0);
        frame.writer.writebits(6, typeCode);

        // Wasted bits: a flag, then the value 1 written in wbits bits.
        int wastedFlag = sub.wbits != 0 ? 1 : 0;
        frame.writer.writebits(1, wastedFlag);
        if (sub.wbits > 0)
        {
            frame.writer.writebits((int)sub.wbits, 1);
        }

        int index = firstIndex + ch;
        switch (sub.best.type)
        {
            case SubframeType.Constant:
                output_subframe_constant(frame, sub);
                break;
            case SubframeType.Verbatim:
                output_subframe_verbatim(frame, sub);
                break;
            case SubframeType.Fixed:
                output_subframe_fixed(task, sub, index);
                break;
            case SubframeType.LPC:
                output_subframe_lpc(task, sub, index);
                break;
        }
    }
}
/// <summary>
/// Appends one window function table to the task's window buffer if that window is
/// enabled in the encoder parameters and the table limit has not been reached.
/// </summary>
/// <param name="task">Task whose window buffer is extended.</param>
/// <param name="func">Generator that fills a buffer with <c>task.frameSize</c> window values.</param>
/// <param name="flag">Window flag tested against <c>eparams.window_function</c>.</param>
unsafe void calculate_window(FLACCLTask task, window_function func, WindowFunction flag)
{
    bool enabled = (eparams.window_function & flag) != 0;
    if (!enabled || task.nWindowFunctions == lpc.MAX_LPC_WINDOWS)
    {
        return;
    }

    // Tables are stored back to back, frameSize floats each.
    float* destination = ((float*)task.clWindowFunctionsPtr) + task.nWindowFunctions * task.frameSize;
    func(destination, task.frameSize);
    task.nWindowFunctions++;
}