/// <summary> /// Retrieves the compressed stream from the encoder state that was previously used in one of the encoder functions.<para/> /// Note: Synchronizes on stream. For async use IntPtr! /// </summary> /// <param name="stream">CUDA stream where all the required device operations will be placed.</param> /// <returns>Byte array containing the data.</returns> public byte[] RetrieveBitstream(CudaStream stream) { SizeT size = new SizeT(); res = NvJpegNativeMethods.nvjpegEncodeRetrieveBitstream(_nvJpeg.Handle, _state, IntPtr.Zero, ref size, stream.Stream); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nvjpegEncodeRetrieveBitstream", res)); if (res != nvjpegStatus.Success) { throw new NvJpegException(res); } byte[] data = new byte[size]; GCHandle handle = GCHandle.Alloc(data, GCHandleType.Pinned); try { IntPtr ptr = handle.AddrOfPinnedObject(); res = NvJpegNativeMethods.nvjpegEncodeRetrieveBitstream(_nvJpeg.Handle, _state, ptr, ref size, stream.Stream); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nvjpegEncodeRetrieveBitstream", res)); stream.Synchronize(); } finally { handle.Free(); } if (res != nvjpegStatus.Success) { throw new NvJpegException(res); } return(data); }
private void FinishBinding(CUdeviceptr output) { m_ifft.Exec(m_tempBlock.GetDevicePtr(m_owner, m_secondFFTOffset), output); float factor = 1.0f; if (NormalizeOutput) { //ZXC m_dotKernel.RunAsync(m_stream, m_tempBlock, 0, output, output, m_inputSize, /* distributed: */ 0); m_dotKernel.RunAsync(m_stream, m_tempBlock, output, output, m_inputSize); m_tempBlock.SafeCopyToHost(0, 1); if (m_tempBlock.Host[0] > 0.000001f) { factor /= (float)(Math.Sqrt(m_tempBlock.Host[0])); } } else { factor = 1.0f / Denominator; } if (factor != 1.0f) { m_normalKernel.Run(0, 0, factor, 0, output, output, m_inputSize); } else { m_stream.Synchronize(); } }
/// <summary> /// Decodes a single image, and writes the decoded image in the desired format to the output buffers. This function is asynchronous with respect to the host. All GPU tasks for this function will be submitted to the provided stream.<para/> /// Note: Synchronizes on stream. For async use IntPtr! /// </summary> /// <param name="data">the encoded data.</param> /// <param name="output_format">Format in which the decoded output will be saved.</param> /// <param name="destination">Pointer to the structure that describes the output destination. This structure should be on the host (CPU), but the pointers in this structure should be pointing to the device (i.e., GPU) memory.</param> /// <param name="stream">The CUDA stream where all of the GPU work will be submitted.</param> public void Decode(byte[] data, nvjpegOutputFormat output_format, ref nvjpegImage destination, CudaStream stream) { GCHandle handle = GCHandle.Alloc(data, GCHandleType.Pinned); try { IntPtr ptr = handle.AddrOfPinnedObject(); res = NvJpegNativeMethods.nvjpegDecode(_nvJpeg.Handle, _state, ptr, data.Length, output_format, ref destination, stream.Stream); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nvjpegDecode", res)); stream.Synchronize(); } finally { handle.Free(); } if (res != nvjpegStatus.Success) { throw new NvJpegException(res); } }
static void Main(string[] args) { try { if (args.Length > 0) { deviceID = int.Parse(args[0]); } } catch (Exception ex) { Logger.Log(LogLevel.Error, "Device ID parse error"); } try { if (args.Length > 1) { port = int.Parse(args[1]); Comms.ConnectToMaster(port); } else { TEST = true; Logger.CopyToConsole = true; CGraph.ShowCycles = true; } } catch (Exception ex) { Logger.Log(LogLevel.Error, "Master connection error"); } try { if (args.Length > 3) { gpuCount = int.Parse(args[3]); fastCuda = gpuCount <= (Environment.ProcessorCount / 2); if (fastCuda) { Logger.Log(LogLevel.Info, "Using single GPU blocking mode"); } } } catch { } if (TEST) { currentJob = nextJob = new Job() { jobID = 0, k0 = 0xf4956dc403730b01L, k1 = 0xe6d45de39c2a5a3eL, k2 = 0xcbf626a8afee35f6L, k3 = 0x4307b94b1a0c9980L, pre_pow = TestPrePow, timestamp = DateTime.Now }; } else { currentJob = nextJob = new Job() { jobID = 0, k0 = 0xf4956dc403730b01L, k1 = 0xe6d45de39c2a5a3eL, k2 = 0xcbf626a8afee35f6L, k3 = 0x4307b94b1a0c9980L, pre_pow = TestPrePow, timestamp = DateTime.Now }; if (!Comms.IsConnected()) { Console.WriteLine("Master connection failed, aborting"); Logger.Log(LogLevel.Error, "No master connection, exitting!"); return; } if (deviceID < 0) { int devCnt = CudaContext.GetDeviceCount(); GpuDevicesMessage gpum = new GpuDevicesMessage() { devices = new List <GpuDevice>(devCnt) }; for (int i = 0; i < devCnt; i++) { string name = CudaContext.GetDeviceName(i); var info = CudaContext.GetDeviceInfo(i); gpum.devices.Add(new GpuDevice() { deviceID = i, name = name, memory = info.TotalGlobalMemory }); } //Console.WriteLine(devCnt); Comms.gpuMsg = gpum; Comms.SetEvent(); //Console.WriteLine("event fired"); Task.Delay(1000).Wait(); //Console.WriteLine("closing"); Comms.Close(); return; } } try { var assembly = Assembly.GetEntryAssembly(); var resourceStream = assembly.GetManifestResourceStream("CudaSolver.kernel_x64.ptx"); ctx = new CudaContext(deviceID, !fastCuda ? (CUCtxFlags.BlockingSync | CUCtxFlags.MapHost) : CUCtxFlags.MapHost); meanSeedA = ctx.LoadKernelPTX(resourceStream, "FluffySeed2A"); meanSeedA.BlockDimensions = 128; meanSeedA.GridDimensions = 2048; meanSeedA.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanSeedB = ctx.LoadKernelPTX(resourceStream, "FluffySeed2B"); meanSeedB.BlockDimensions = 128; meanSeedB.GridDimensions = 2048; meanSeedB.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanSeedB_4 = ctx.LoadKernelPTX(resourceStream, "FluffySeed2B"); meanSeedB_4.BlockDimensions = 128; meanSeedB_4.GridDimensions = 1024; meanSeedB_4.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanRound = ctx.LoadKernelPTX(resourceStream, "FluffyRound"); meanRound.BlockDimensions = 512; meanRound.GridDimensions = 4096; meanRound.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanRound_2 = ctx.LoadKernelPTX(resourceStream, "FluffyRound"); meanRound_2.BlockDimensions = 512; meanRound_2.GridDimensions = 2048; meanRound_2.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanRoundJoin = ctx.LoadKernelPTX(resourceStream, "FluffyRound_J"); meanRoundJoin.BlockDimensions = 512; meanRoundJoin.GridDimensions = 4096; meanRoundJoin.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanTail = ctx.LoadKernelPTX(resourceStream, "FluffyTail"); meanTail.BlockDimensions = 1024; meanTail.GridDimensions = 4096; meanTail.PreferredSharedMemoryCarveout = CUshared_carveout.MaxL1; meanRecover = ctx.LoadKernelPTX(resourceStream, "FluffyRecovery"); meanRecover.BlockDimensions = 256; meanRecover.GridDimensions = 2048; meanRecover.PreferredSharedMemoryCarveout = CUshared_carveout.MaxL1; } catch (Exception ex) { Logger.Log(LogLevel.Error, "Unable to create kernels: " + ex.Message); Task.Delay(500).Wait(); Comms.Close(); return; } try { d_buffer = new CudaDeviceVariable <ulong>(BUFFER_SIZE_U32); d_bufferMid = new CudaDeviceVariable <ulong>(d_buffer.DevicePointer + (BUFFER_SIZE_B * 8)); d_bufferB = new CudaDeviceVariable <ulong>(d_buffer.DevicePointer + (BUFFER_SIZE_A * 8)); d_indexesA = new CudaDeviceVariable <uint>(INDEX_SIZE * 2); d_indexesB = new CudaDeviceVariable <uint>(INDEX_SIZE * 2); Array.Clear(h_indexesA, 0, h_indexesA.Length); Array.Clear(h_indexesB, 0, h_indexesA.Length); d_indexesA = h_indexesA; d_indexesB = h_indexesB; streamPrimary = new CudaStream(CUStreamFlags.NonBlocking); streamSecondary = new CudaStream(CUStreamFlags.NonBlocking); } catch (Exception ex) { Task.Delay(200).Wait(); Logger.Log(LogLevel.Error, $"Out of video memory! Only {ctx.GetFreeDeviceMemorySize()} free"); Task.Delay(500).Wait(); Comms.Close(); return; } try { AllocateHostMemory(true, ref h_a, ref hAligned_a, 1024 * 1024 * 32); } catch (Exception ex) { Logger.Log(LogLevel.Error, "Unable to create pinned memory."); Task.Delay(500).Wait(); Comms.Close(); return; } int loopCnt = 0; while (!Comms.IsTerminated) { try { if (!TEST && (Comms.nextJob.pre_pow == null || Comms.nextJob.pre_pow == "" || Comms.nextJob.pre_pow == TestPrePow)) { Logger.Log(LogLevel.Info, string.Format("Waiting for job....")); Task.Delay(1000).Wait(); continue; } if (!TEST && ((currentJob.pre_pow != Comms.nextJob.pre_pow) || (currentJob.origin != Comms.nextJob.origin))) { currentJob = Comms.nextJob; currentJob.timestamp = DateTime.Now; } if (!TEST && (currentJob.timestamp.AddMinutes(30) < DateTime.Now) && Comms.lastIncoming.AddMinutes(30) < DateTime.Now) { Logger.Log(LogLevel.Info, string.Format("Job too old...")); Task.Delay(1000).Wait(); continue; } // test runs only once if (TEST && loopCnt++ > 100) { Comms.IsTerminated = true; } Solution s; while (graphSolutions.TryDequeue(out s)) { meanRecover.SetConstantVariable <ulong>("recovery", s.GetUlongEdges()); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanRecover.RunAsync(streamPrimary.Stream, s.job.k0, s.job.k1, s.job.k2, s.job.k3, d_indexesB.DevicePointer); streamPrimary.Synchronize(); s.nonces = new uint[40]; d_indexesB.CopyToHost(s.nonces, 0, 0, 40 * 4); s.nonces = s.nonces.OrderBy(n => n).ToArray(); lock (Comms.graphSolutionsOut) { Comms.graphSolutionsOut.Enqueue(s); } Comms.SetEvent(); } uint[] count; do { if (!TEST && ((currentJob.pre_pow != Comms.nextJob.pre_pow) || (currentJob.origin != Comms.nextJob.origin))) { currentJob = Comms.nextJob; currentJob.timestamp = DateTime.Now; } currentJob = currentJob.Next(); Logger.Log(LogLevel.Debug, string.Format("GPU NV{4}:Trimming #{4}: {0} {1} {2} {3}", currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, currentJob.jobID, deviceID)); timer.Restart(); d_indexesA.MemsetAsync(0, streamPrimary.Stream); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanSeedA.RunAsync(streamPrimary.Stream, currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, d_bufferMid.DevicePointer, d_indexesB.DevicePointer); meanSeedB_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, 0); meanSeedB_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_A * 8) / 4) * 1, d_indexesB.DevicePointer, d_indexesA.DevicePointer, 16); meanSeedB_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_A * 8) / 4) * 2, d_indexesB.DevicePointer, d_indexesA.DevicePointer, 32); meanSeedB_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_A * 8) / 4) * 3, d_indexesB.DevicePointer, d_indexesA.DevicePointer, 48); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanRound_2.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer + ((BUFFER_SIZE_A * 8) / 4) * 2, d_bufferB.DevicePointer, d_indexesA.DevicePointer + (2048 * 4), d_indexesB.DevicePointer + (4096 * 4), DUCK_EDGES_A, DUCK_EDGES_B / 2); meanRound_2.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer - (BUFFER_SIZE_B * 8), d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_A, DUCK_EDGES_B / 2); d_indexesA.MemsetAsync(0, streamPrimary.Stream); meanRoundJoin.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer - (BUFFER_SIZE_B * 8), d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 2); //d_indexesA.MemsetAsync(0, streamPrimary.Stream); //meanRound.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B, DUCK_EDGES_B / 2); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 2); d_indexesA.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 2); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 4); for (int i = 0; i < trimRounds; i++) { d_indexesA.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B / 4, DUCK_EDGES_B / 4); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 4, DUCK_EDGES_B / 4); } d_indexesA.MemsetAsync(0, streamPrimary.Stream); meanTail.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer); ctx.Synchronize(); streamPrimary.Synchronize(); count = new uint[2]; d_indexesA.CopyToHost(count, 0, 0, 8); if (count[0] > 4194304) { // trouble count[0] = 4194304; // log } hAligned_a.AsyncCopyFromDevice(d_buffer.DevicePointer, 0, 0, count[0] * 8, streamPrimary.Stream); streamPrimary.Synchronize(); System.Runtime.InteropServices.Marshal.Copy(hAligned_a.PinnedHostPointer, h_a, 0, ((int)count[0] * 8) / sizeof(int)); timer.Stop(); currentJob.solvedAt = DateTime.Now; currentJob.trimTime = timer.ElapsedMilliseconds; //Console.WriteLine("Trimmed in {0}ms to {1} edges", timer.ElapsedMilliseconds, count[0]); Logger.Log(LogLevel.Info, string.Format("GPU NV{2}: Trimmed in {0}ms to {1} edges, h {3}", timer.ElapsedMilliseconds, count[0], deviceID, currentJob.height)); }while((currentJob.height != Comms.nextJob.height) && (!Comms.IsTerminated) && (!TEST)); if (TEST) { //Console.WriteLine("Trimmed in {0}ms to {1} edges", timer.ElapsedMilliseconds, count[0]); CGraph cg = FinderBag.GetFinder(); if (cg == null) { continue; } cg.SetEdges(h_a, (int)count[0]); cg.SetHeader(currentJob); //currentJob = currentJob.Next(); Task.Factory.StartNew(() => { Stopwatch sw = new Stopwatch(); sw.Start(); if (count[0] < 200000) { try { if (findersInFlight++ < 3) { Stopwatch cycleTime = new Stopwatch(); cycleTime.Start(); cg.FindSolutions(graphSolutions); cycleTime.Stop(); AdjustTrims(cycleTime.ElapsedMilliseconds); if (graphSolutions.Count > 0) { solutions++; } } else { Logger.Log(LogLevel.Warning, "CPU overloaded!"); } } catch (Exception ex) { Logger.Log(LogLevel.Error, "Cycle finder error" + ex.Message); } finally { findersInFlight--; FinderBag.ReturnFinder(cg); } } sw.Stop(); if (++trims % 50 == 0) { Console.ForegroundColor = ConsoleColor.Green; Console.WriteLine("SOLS: {0}/{1} - RATE: {2:F1}", solutions, trims, (float)trims / solutions); Console.ResetColor(); } //Console.WriteLine("Finder completed in {0}ms on {1} edges with {2} solution(s)", sw.ElapsedMilliseconds, count[0], graphSolutions.Count); //Console.WriteLine("Duped edges: {0}", cg.dupes); Logger.Log(LogLevel.Info, string.Format("Finder completed in {0}ms on {1} edges with {2} solution(s) and {3} dupes", sw.ElapsedMilliseconds, count[0], graphSolutions.Count, cg.dupes)); }); //h_indexesA = d_indexesA; //h_indexesB = d_indexesB; //var sumA = h_indexesA.Sum(e => e); //var sumB = h_indexesB.Sum(e => e); ; } else { CGraph cg = FinderBag.GetFinder(); cg.SetEdges(h_a, (int)count[0]); cg.SetHeader(currentJob); Task.Factory.StartNew(() => { if (count[0] < 200000) { try { if (findersInFlight++ < 3) { Stopwatch cycleTime = new Stopwatch(); cycleTime.Start(); cg.FindSolutions(graphSolutions); cycleTime.Stop(); AdjustTrims(cycleTime.ElapsedMilliseconds); if (graphSolutions.Count > 0) { solutions++; } } else { Logger.Log(LogLevel.Warning, "CPU overloaded!"); } } catch (Exception ex) { Logger.Log(LogLevel.Error, "Cycle finder crashed: " + ex.Message); } finally { findersInFlight--; FinderBag.ReturnFinder(cg); } } }); } } catch (Exception ex) { Logger.Log(LogLevel.Error, "Critical error in main cuda loop " + ex.Message); Task.Delay(5000).Wait(); } } // clean up try { Task.Delay(500).Wait(); Comms.Close(); d_buffer.Dispose(); d_indexesA.Dispose(); d_indexesB.Dispose(); streamPrimary.Dispose(); streamSecondary.Dispose(); hAligned_a.Dispose(); if (ctx != null) { ctx.Dispose(); } } catch { } }
static void Main(string[] args) { try { if (args.Length == 1 && args[0].ToLower().Contains("fidelity")) { string[] fseg = args[0].Split(':'); deviceID = int.Parse(fseg[1]); nonce = Int64.Parse(fseg[2]) - 1; range = int.Parse(fseg[3]); QTEST = true; } else { if (args.Length > 0) { deviceID = int.Parse(args[0]); } } } catch (Exception ex) { Logger.Log(LogLevel.Error, "Device ID parse error: " + ex.Message); } try { if (args.Length > 0) { deviceID = int.Parse(args[0]); } } catch (Exception ex) { Logger.Log(LogLevel.Error, "Device ID parse error"); } try { if (args.Length > 1) { port = int.Parse(args[1]); Comms.ConnectToMaster(port); } else { TEST = true; Logger.CopyToConsole = true; CGraph.ShowCycles = true; } } catch (Exception ex) { Logger.Log(LogLevel.Error, "Master connection error"); } try { if (args.Length > 3) { gpuCount = int.Parse(args[3]); fastCuda = gpuCount <= (Environment.ProcessorCount / 2); if (fastCuda) { Logger.Log(LogLevel.Info, "Using single GPU blocking mode"); } } } catch { } if (TEST) { currentJob = nextJob = new Job() { jobID = 0, k0 = 0xf4956dc403730b01L, k1 = 0xe6d45de39c2a5a3eL, k2 = 0xcbf626a8afee35f6L, k3 = 0x4307b94b1a0c9980L, pre_pow = TestPrePow, timestamp = DateTime.Now }; } else { currentJob = nextJob = new Job() { jobID = 0, k0 = 0xf4956dc403730b01L, k1 = 0xe6d45de39c2a5a3eL, k2 = 0xcbf626a8afee35f6L, k3 = 0x4307b94b1a0c9980L, pre_pow = TestPrePow, timestamp = DateTime.Now }; if (!Comms.IsConnected()) { Console.WriteLine("Master connection failed, aborting"); Logger.Log(LogLevel.Error, "No master connection, exitting!"); return; } if (deviceID < 0) { int devCnt = CudaContext.GetDeviceCount(); GpuDevicesMessage gpum = new GpuDevicesMessage() { devices = new List <GpuDevice>(devCnt) }; for (int i = 0; i < devCnt; i++) { string name = CudaContext.GetDeviceName(i); var info = CudaContext.GetDeviceInfo(i); gpum.devices.Add(new GpuDevice() { deviceID = i, name = name, memory = info.TotalGlobalMemory }); } //Console.WriteLine(devCnt); Comms.gpuMsg = gpum; Comms.SetEvent(); //Console.WriteLine("event fired"); Task.Delay(1000).Wait(); //Console.WriteLine("closing"); Comms.Close(); return; } } try { var assembly = Assembly.GetEntryAssembly(); var resourceStream = assembly.GetManifestResourceStream("CudaSolver.kernel_x64.ptx"); ctx = new CudaContext(deviceID, /*!fastCuda ? (CUCtxFlags.BlockingSync | CUCtxFlags.MapHost) :*/ CUCtxFlags.MapHost); string pow = new StreamReader(resourceStream).ReadToEnd(); //pow = File.ReadAllText(@"kernel_x64.ptx"); Turing = ctx.GetDeviceInfo().MaxSharedMemoryPerMultiprocessor == 65536; using (var s = GenerateStreamFromString(pow)) { if (!Turing) { meanSeedA = ctx.LoadKernelPTX(s, "FluffySeed4K", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)40 }); meanSeedA.BlockDimensions = 512; meanSeedA.GridDimensions = 1024; meanSeedA.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanRound = ctx.LoadKernelPTX(s, "FluffyRound_A2", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)40 }); meanRound.BlockDimensions = 512; meanRound.GridDimensions = 4096; meanRound.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanRound_4 = ctx.LoadKernelPTX(s, "FluffyRound_A1", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)32 }); meanRound_4.BlockDimensions = 1024; meanRound_4.GridDimensions = 1024; meanRound_4.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanRoundJoin = ctx.LoadKernelPTX(s, "FluffyRound_A3", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)32 }); meanRoundJoin.BlockDimensions = 1024; meanRoundJoin.GridDimensions = 4096; meanRoundJoin.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanTail = ctx.LoadKernelPTX(s, "FluffyTail"); meanTail.BlockDimensions = 1024; meanTail.GridDimensions = 4096; meanTail.PreferredSharedMemoryCarveout = CUshared_carveout.MaxL1; meanRecover = ctx.LoadKernelPTX(s, "FluffyRecovery"); meanRecover.BlockDimensions = 256; meanRecover.GridDimensions = 2048; meanRecover.PreferredSharedMemoryCarveout = CUshared_carveout.MaxL1; } else { meanSeedA = ctx.LoadKernelPTX(s, "FluffySeed4K", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)64 }); meanSeedA.BlockDimensions = 512; meanSeedA.GridDimensions = 1024; meanSeedA.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanRound = ctx.LoadKernelPTX(s, "FluffyRound_C2", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)32 }); meanRound.BlockDimensions = 1024; meanRound.GridDimensions = 4096; meanRound.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanRound_4 = ctx.LoadKernelPTX(s, "FluffyRound_C1", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)64 }); meanRound_4.BlockDimensions = 1024; meanRound_4.GridDimensions = 1024; meanRound_4.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanRoundJoin = ctx.LoadKernelPTX(s, "FluffyRound_C3", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)32 }); meanRoundJoin.BlockDimensions = 1024; meanRoundJoin.GridDimensions = 4096; meanRoundJoin.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanTail = ctx.LoadKernelPTX(s, "FluffyTail"); meanTail.BlockDimensions = 1024; meanTail.GridDimensions = 4096; meanTail.PreferredSharedMemoryCarveout = CUshared_carveout.MaxL1; meanRecover = ctx.LoadKernelPTX(s, "FluffyRecovery"); meanRecover.BlockDimensions = 256; meanRecover.GridDimensions = 2048; meanRecover.PreferredSharedMemoryCarveout = CUshared_carveout.MaxL1; } } } catch (Exception ex) { Logger.Log(LogLevel.Error, "Unable to create kernels: " + ex.Message); Task.Delay(500).Wait(); Comms.Close(); return; } try { d_buffer = new CudaDeviceVariable <ulong>(BUFFER_SIZE_U32 * (temp ? 8 : 1)); d_bufferMid = new CudaDeviceVariable <ulong>(d_buffer.DevicePointer + (BUFFER_SIZE_B * 2)); d_bufferB = new CudaDeviceVariable <ulong>(d_buffer.DevicePointer + (BUFFER_SIZE_B * 8)); d_indexesA = new CudaDeviceVariable <uint>(INDEX_SIZE); d_indexesB = new CudaDeviceVariable <uint>(INDEX_SIZE); d_aux = new CudaDeviceVariable <uint>(512); Array.Clear(h_indexesA, 0, h_indexesA.Length); Array.Clear(h_indexesB, 0, h_indexesA.Length); d_indexesA = h_indexesA; d_indexesB = h_indexesB; streamPrimary = new CudaStream(CUStreamFlags.NonBlocking); } catch (Exception ex) { Task.Delay(200).Wait(); Logger.Log(LogLevel.Error, $"Mem alloc exception. Out of video memory? {ctx.GetFreeDeviceMemorySize()} free"); Task.Delay(500).Wait(); Comms.Close(); return; } try { AllocateHostMemory(true, ref h_a, ref hAligned_a, 1024 * 1024 * 32); } catch (Exception ex) { Logger.Log(LogLevel.Error, "Unable to create pinned memory."); Task.Delay(500).Wait(); Comms.Close(); return; } int loopCnt = 0; while (!Comms.IsTerminated) { try { if (!TEST && (Comms.nextJob.pre_pow == null || Comms.nextJob.pre_pow == "" || Comms.nextJob.pre_pow == TestPrePow)) { Logger.Log(LogLevel.Info, string.Format("Waiting for job....")); Task.Delay(1000).Wait(); continue; } if (!TEST && ((currentJob.pre_pow != Comms.nextJob.pre_pow) || (currentJob.origin != Comms.nextJob.origin))) { currentJob = Comms.nextJob; currentJob.timestamp = DateTime.Now; } if (!TEST && (currentJob.timestamp.AddMinutes(30) < DateTime.Now) && Comms.lastIncoming.AddMinutes(30) < DateTime.Now) { Logger.Log(LogLevel.Info, string.Format("Job too old...")); Task.Delay(1000).Wait(); continue; } // test runs only once if (TEST && ++loopCnt >= range) { Comms.IsTerminated = true; } Solution s; while (graphSolutions.TryDequeue(out s)) { meanRecover.SetConstantVariable <ulong>("recovery", s.GetUlongEdges()); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanRecover.RunAsync(streamPrimary.Stream, s.job.k0, s.job.k1, s.job.k2, s.job.k3, d_indexesB.DevicePointer); streamPrimary.Synchronize(); s.nonces = new uint[32]; d_indexesB.CopyToHost(s.nonces, 0, 0, 32 * 4); s.nonces = s.nonces.OrderBy(n => n).ToArray(); //fidelity = (32-cycles_found / graphs_searched) * 32 solutions++; s.fidelity = ((double)solutions / (double)trims) * 32.0; //Console.WriteLine(s.fidelity.ToString("0.000")); if (Comms.IsConnected()) { Comms.graphSolutionsOut.Enqueue(s); Comms.SetEvent(); } if (QTEST) { Console.ForegroundColor = ConsoleColor.Red; Console.WriteLine($"Solution for nonce {s.job.nonce}: {string.Join(' ', s.nonces)}"); Console.ResetColor(); } } if (QTEST) { currentJob = currentJob.NextSequential(ref nonce); Console.WriteLine($"Nonce: {nonce} K0: {currentJob.k0:X} K1: {currentJob.k1:X} K2: {currentJob.k2:X} K3: {currentJob.k3:X}"); } else { currentJob = currentJob.Next(); } Logger.Log(LogLevel.Debug, string.Format("GPU NV{4}:Trimming #{4}: {0} {1} {2} {3}", currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, currentJob.jobID, deviceID)); timer.Restart(); d_indexesA.MemsetAsync(0, streamPrimary.Stream); d_indexesB.MemsetAsync(0, streamPrimary.Stream); d_aux.MemsetAsync(0, streamPrimary.Stream); meanSeedA.RunAsync(streamPrimary.Stream, currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, d_bufferMid.DevicePointer, d_indexesB.DevicePointer, 0); meanSeedA.RunAsync(streamPrimary.Stream, currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, d_bufferMid.DevicePointer + ((BUFFER_SIZE_A * 8) / 4 / 4) * 1, d_indexesB.DevicePointer + (4096 * 4), EDGE_SEG); meanSeedA.RunAsync(streamPrimary.Stream, currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, d_bufferMid.DevicePointer + ((BUFFER_SIZE_A * 8) / 4 / 4) * 2, d_indexesB.DevicePointer + (4096 * 8), EDGE_SEG * 2); meanSeedA.RunAsync(streamPrimary.Stream, currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, d_bufferMid.DevicePointer + ((BUFFER_SIZE_A * 8) / 4 / 4) * 3, d_indexesB.DevicePointer + (4096 * 12), EDGE_SEG * 3); meanRound_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_A / 4, DUCK_EDGES_B / 4, 0); meanRound_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_B * 8) / 4) * 1, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_A / 4, DUCK_EDGES_B / 4, 1024); meanRound_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_B * 8) / 4) * 2, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_A / 4, DUCK_EDGES_B / 4, 2048); meanRound_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_B * 8) / 4) * 3, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_A / 4, DUCK_EDGES_B / 4, 3072); //streamPrimary.Synchronize(); //h_indexesA = d_indexesA; //h_indexesB = d_indexesB; //var sumA = h_indexesA.Sum(e => e); //var sumB = h_indexesB.Sum(e => e); //streamPrimary.Synchronize(); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanRoundJoin.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_B * 8) / 4) * 1, d_buffer.DevicePointer + ((BUFFER_SIZE_B * 8) / 4) * 2, d_buffer.DevicePointer + ((BUFFER_SIZE_B * 8) / 4) * 3, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 4, DUCK_EDGES_B / 2); d_indexesA.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 2, 0, d_aux.DevicePointer); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 2, 1, d_aux.DevicePointer); d_indexesA.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 2, 2, d_aux.DevicePointer); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 4, 3, d_aux.DevicePointer); for (int i = 0; i < (TEST ? 80 : trimRounds); i++) //for (int i = 0; i < 85; i++) { d_indexesA.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B / 4, DUCK_EDGES_B / 4, i * 2 + 4, d_aux.DevicePointer); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 4, DUCK_EDGES_B / 4, i * 2 + 5, d_aux.DevicePointer); } d_indexesA.MemsetAsync(0, streamPrimary.Stream); meanTail.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer); Task.Delay((int)lastTrimMs).Wait(); streamPrimary.Synchronize(); uint[] count = new uint[2]; d_indexesA.CopyToHost(count, 0, 0, 8); if (count[0] > 131071) { // trouble count[0] = 131071; // log } hAligned_a.AsyncCopyFromDevice(d_buffer.DevicePointer, 0, 0, count[0] * 8, streamPrimary.Stream); streamPrimary.Synchronize(); System.Runtime.InteropServices.Marshal.Copy(hAligned_a.PinnedHostPointer, h_a, 0, ((int)count[0] * 8) / sizeof(int)); trims++; timer.Stop(); lastTrimMs = (long)Math.Min(Math.Max((float)timer.ElapsedMilliseconds * 0.9f, 50), 500); currentJob.solvedAt = DateTime.Now; currentJob.trimTime = timer.ElapsedMilliseconds; //Console.WriteLine("Trimmed in {0}ms to {1} edges", timer.ElapsedMilliseconds, count[0]); Logger.Log(LogLevel.Info, string.Format("GPU NV{2}: Trimmed in {0}ms to {1} edges", timer.ElapsedMilliseconds, count[0], deviceID)); FinderBag.RunFinder(TEST, ref trims, count[0], h_a, currentJob, graphSolutions, timer); if (trims % 50 == 0 && TEST) { Console.ForegroundColor = ConsoleColor.Green; Console.WriteLine("SOLS: {0}/{1} - RATE: {2:F1}", solutions, trims, (float)trims / solutions); Console.ResetColor(); } /* * if (TEST) * { * //Console.WriteLine("Trimmed in {0}ms to {1} edges", timer.ElapsedMilliseconds, count[0]); * * CGraph cg = FinderBag.GetFinder(); * cg.SetEdges(h_a, (int)count[0]); * cg.SetHeader(currentJob); * * //currentJob = currentJob.Next(); * * Task.Factory.StartNew(() => * { * Stopwatch sw = new Stopwatch(); * sw.Start(); * * if (count[0] < 131071) * { * try * { * if (findersInFlight++ < 3) * { * Stopwatch cycleTime = new Stopwatch(); * cycleTime.Start(); * cg.FindSolutions(graphSolutions); * cycleTime.Stop(); * AdjustTrims(cycleTime.ElapsedMilliseconds); * //if (graphSolutions.Count > 0) solutions++; * } * else * Logger.Log(LogLevel.Warning, "CPU overloaded!"); * } * catch (Exception ex) * { * Logger.Log(LogLevel.Error, "Cycle finder error" + ex.Message); * } * finally * { * FinderBag.ReturnFinder(cg); * findersInFlight--; * } * } * * sw.Stop(); * * if (trims % 50 == 0) * { * Console.ForegroundColor = ConsoleColor.Green; * Console.WriteLine("SOLS: {0}/{1} - RATE: {2:F1}", solutions, trims, (float)trims/solutions ); * Console.ResetColor(); * } * //Console.WriteLine("Finder completed in {0}ms on {1} edges with {2} solution(s)", sw.ElapsedMilliseconds, count[0], graphSolutions.Count); * //Console.WriteLine("Duped edges: {0}", cg.dupes); * if (!QTEST) * Logger.Log(LogLevel.Info, string.Format("Finder completed in {0}ms on {1} edges with {2} solution(s) and {3} dupes", sw.ElapsedMilliseconds, count[0], graphSolutions.Count, cg.dupes)); * }); * * //h_indexesA = d_indexesA; * //h_indexesB = d_indexesB; * * //var sumA = h_indexesA.Sum(e => e); * //var sumB = h_indexesB.Sum(e => e); * * ; * } * else * { * CGraph cg = FinderBag.GetFinder(); * cg.SetEdges(h_a, (int)count[0]); * cg.SetHeader(currentJob); * * Task.Factory.StartNew(() => * { * if (count[0] < 131071) * { * try * { * if (findersInFlight++ < 3) * { * Stopwatch cycleTime = new Stopwatch(); * cycleTime.Start(); * cg.FindSolutions(graphSolutions); * cycleTime.Stop(); * AdjustTrims(cycleTime.ElapsedMilliseconds); * } * else * Logger.Log(LogLevel.Warning, "CPU overloaded!"); * } * catch (Exception ex) * { * Logger.Log(LogLevel.Warning, "Cycle finder crashed: " + ex.Message); * } * finally * { * FinderBag.ReturnFinder(cg); * findersInFlight--; * } * } * }); * } * */ } catch (Exception ex) { Logger.Log(LogLevel.Error, "Critical error in main cuda loop " + ex.Message); Task.Delay(500).Wait(); break; } } // clean up try { Task.Delay(500).Wait(); Comms.Close(); d_buffer.Dispose(); d_indexesA.Dispose(); d_indexesB.Dispose(); d_aux.Dispose(); streamPrimary.Dispose(); streamSecondary.Dispose(); hAligned_a.Dispose(); if (ctx != null) { ctx.Dispose(); } } catch { } }