private void KernelRun(CudaStream stream, CUdeviceptr outputPtr, CUdeviceptr input1Ptr, CUdeviceptr input2Ptr, int size, bool async = false) { CUdeviceptr bufferPtr = m_buffer.GetDevicePtr(m_nGPU); outputPtr = outputPtr + outOffset * m_outTypeSize; if (this.size > 0) { size = this.size; } int segmentedFlag = Convert.ToInt32(segmented); int distributedFlag = Convert.ToInt32(distributed); if (async) { m_kernels[threadCount].RunAsync(stream, outputPtr, outOffset, input1Ptr, input2Ptr, bufferPtr, size, segmentedFlag, distributedFlag); } else { m_kernels[threadCount].Run(outputPtr, outOffset, input1Ptr, input2Ptr, bufferPtr, size, segmentedFlag, distributedFlag); } ResetParameters(); }
/// <summary> /// Draws a single element of a guess at the provided position /// </summary> /// <param name="s">[async] stream that the kernels will execute in</param> /// <param name="guess">element of a guess to draw</param> /// <param name="position">the positiion (0,1,2,...) of the element inside the guess vector</param> /// <param name="xGroupOffset">X pixel offset of the first element of the guess vector</param> /// <param name="yGroupOffset">Y pixel offset of the first element of the guess vector</param> private void DrawGuessAtPosition(CudaStream s, float guess, int position, int xGroupOffset, int yGroupOffset) { MyCudaTexture textureCircleRim = Owner.m_textureCircleRim; MyCudaTexture textureCircleMask = Owner.m_textureCircleMask; int xOffset = xGroupOffset + position * (GFX_GUESS_SPACER + textureCircleRim.SizeInPixels.x); int yOffset = yGroupOffset; Color guessColor = Owner.GetGuessColor((int)Math.Round(guess)); // color int yDiv = 10; Debug.Assert(textureCircleMask.SizeInPixels.y % yDiv == 0); m_MaskedColorKernel.SetupExecution(new dim3(textureCircleMask.SizeInPixels.x, yDiv, 1), new dim3(textureCircleMask.SizeInPixels.y / yDiv, TARGET_VALUES_PER_PIXEL)); m_MaskedColorKernel.RunAsync(s, Owner.VisualOutput, Owner.VisualWidth, Owner.VisualHeight, xOffset, yOffset, textureCircleMask.BitmapPtr, textureCircleMask.SizeInPixels.x, textureCircleMask.SizeInPixels.y, guessColor.R / 255.0f, guessColor.G / 255.0f, guessColor.B / 255.0f); // circle rim Debug.Assert(textureCircleRim.SizeInPixels.y % yDiv == 0); m_RgbaTextureKernel.SetupExecution(new dim3(textureCircleRim.SizeInPixels.x, yDiv, 1), new dim3(textureCircleRim.SizeInPixels.y / yDiv, TARGET_VALUES_PER_PIXEL)); m_RgbaTextureKernel.RunAsync(s, Owner.VisualOutput, Owner.VisualWidth, Owner.VisualHeight, xOffset, yOffset, textureCircleRim.BitmapPtr, textureCircleRim.SizeInPixels.x, textureCircleRim.SizeInPixels.y); }
public CudaError LaunchKernelWithStreamBinding( CudaStream stream, CudaKernel kernel, int gridDimX, int gridDimY, int gridDimZ, int blockDimX, int blockDimY, int blockDimZ, int sharedMemSizeInBytes, IntPtr args, IntPtr kernelArgs) { var binding = stream.BindScoped(); var result = LaunchKernel( kernel.FunctionPtr, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemSizeInBytes, stream.StreamPtr, args, kernelArgs); binding.Recover(); return(result); }
public override void Init(int nGPU) { switch (Mode) { case HashMapperMode.Simple: break; case HashMapperMode.LocalitySensitive: MyMemoryManager.Instance.ClearGlobalVariable(Owner.GlobalVariableName, Owner.GPU); // Only values are the modulo and and integer divisor (placing into bins) Owner.Temp.SafeCopyToHost(0, 2); Owner.Temp.Host[0] = 2f; Owner.Temp.Host[1] = 2f / InternalBinCount; Owner.Temp.SafeCopyToDevice(0, 2); break; default: throw new ArgumentOutOfRangeException(); } _combineVectorsKernel = MyKernelFactory.Instance.Kernel(nGPU, @"common\CombineVectorsKernel", "CombineTwoVectorsKernelVarSize"); _combineVectorsKernel.SetupExecution(Owner.InputSize); _hashKernel = MyKernelFactory.Instance.Kernel(nGPU, @"VSA\Mappers", "GetIndices_ImplicitSeed"); _hashKernel.SetupExecution(Owner.Output.Count); _noHashKernel = MyKernelFactory.Instance.Kernel(nGPU, @"VSA\Mappers", "GetIndices_NoHashing"); _noHashKernel.SetupExecution(Owner.Output.Count); m_stream = new CudaStream(); }
private CudaStreamProvider() { for (var i = 0; i < StreamsPerThread; i++) { m_streamPool[i] = new CudaStream(); } }
private static void RunCudaTest(SparseMatrix A, SparseMatrix B) { // NOTE: for Hermitian matrices, the storage has to be transposed, which is // done by the solver (unless 3rd argument transpose = false). using (var stream = new CudaStream()) using (var solver = new CudaCholesky(stream, B)) { TestRandomSymmetric(solver, B, "CUDA Cholesky"); ReportGpuTime(solver.FactorizationTime); } using (var stream = new CudaStream()) using (var solver = new CudaQR(stream, A)) { TestRandom(solver, A, "CUDA QR"); ReportGpuTime(solver.FactorizationTime); } using (var stream = new CudaStream()) using (var solver = new CudaQR(stream, B)) { TestRandomSymmetric(solver, B, "CUDA QR"); ReportGpuTime(solver.FactorizationTime); } }
/// <summary> /// Transforms the <paramref name="output"/> vector into a vector of indices with properties specified by the parameters. /// </summary> /// <param name="input">The vector to transform.</param> /// <param name="output">The memory to contain the results.</param> /// <param name="misc">A vector containing the range to modulate to as the first value (typically 2f because dot product ranges from [-1,1]) /// and the bin size in this modulated space (typically <paramref name="misc"/>[0] / internalBinCount) as the second value.</param> /// <param name="offsets">The random offsets for each <paramref name="output"/> value (typically uniform random numbers in [0, <paramref name="misc"/>[0].</param> /// <param name="vectorSize">The length of the <paramref name="output"/> vector.</param> /// <param name="outputBinCount">The range into which the internal bins will be scattered.</param> /// <param name="seed">The seed used for the scattering the internal bins.</param> /// <param name="combineVectorsKernel">The kernel used for addition, modulo and integer division.</param> /// <param name="hashKernel">The kernel used for scattering the internal bins.</param> /// <param name="noHashKernel"></param> /// <param name="doHashMapping"></param> /// <param name="internalBinCount"></param> /// <param name="stream"></param> public static void GetIndices( CUdeviceptr input, CUdeviceptr output, CUdeviceptr misc, CUdeviceptr?offsets, int vectorSize, int outputBinCount, int seed, MyCudaKernel combineVectorsKernel, MyCudaKernel hashKernel, MyCudaKernel noHashKernel, bool doHashMapping, int internalBinCount, CudaStream stream) { Debug.Assert(vectorSize > 0, "Invalid vector size"); Debug.Assert(outputBinCount > 1, "Requires at least 2 output bins"); Debug.Assert(combineVectorsKernel != null && hashKernel != null, "Missing kernels"); // Values are in [-1, 1] if they were normalized if (offsets != null) { // Offset to [-1, 3] combineVectorsKernel.RunAsync(stream, input, offsets.Value, output, (int)MyJoin.MyJoinOperation.Addition, vectorSize, vectorSize); } // Modulate to [0, 2] combineVectorsKernel.RunAsync(stream, output, misc, output, (int)MyJoin.MyJoinOperation.Modulo, vectorSize, 1); // Transform to integers in [0, InternalBinCount - 1] combineVectorsKernel.RunAsync(stream, output, misc + sizeof(float), output, (int)MyJoin.MyJoinOperation.Division_int, vectorSize, 1); if (doHashMapping) { hashKernel.RunAsync(stream, output, output, vectorSize, vectorSize, outputBinCount, seed); } else { noHashKernel.RunAsync(stream, output, output, vectorSize, internalBinCount); } }
// TODO(Premek): Add more copy functions to be consistent with Async variants. (And cleanup the API in general.) #region Async copy device to device public virtual void CopyFromMemoryBlockAsync(MyMemoryBlock <T> source, int srcOffset, int destOffset, int count, CudaStream stream = null) { Device[Owner.GPU].AsyncCopyToDevice( source.GetDevice(Owner.GPU), srcOffset * ESize, destOffset * ESize, count * ESize, MyKernelFactory.GetCuStreamOrDefault(stream)); }
/// <summary> /// Retrieves the compressed stream from the encoder state that was previously used in one of the encoder functions.<para/> /// Note: Synchronizes on stream. For async use IntPtr! /// </summary> /// <param name="stream">CUDA stream where all the required device operations will be placed.</param> /// <returns>Byte array containing the data.</returns> public byte[] RetrieveBitstream(CudaStream stream) { SizeT size = new SizeT(); res = NvJpegNativeMethods.nvjpegEncodeRetrieveBitstream(_nvJpeg.Handle, _state, IntPtr.Zero, ref size, stream.Stream); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nvjpegEncodeRetrieveBitstream", res)); if (res != nvjpegStatus.Success) { throw new NvJpegException(res); } byte[] data = new byte[size]; GCHandle handle = GCHandle.Alloc(data, GCHandleType.Pinned); try { IntPtr ptr = handle.AddrOfPinnedObject(); res = NvJpegNativeMethods.nvjpegEncodeRetrieveBitstream(_nvJpeg.Handle, _state, ptr, ref size, stream.Stream); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nvjpegEncodeRetrieveBitstream", res)); stream.Synchronize(); } finally { handle.Free(); } if (res != nvjpegStatus.Success) { throw new NvJpegException(res); } return(data); }
/// <summary> /// Works just like DrawGuessAtPosition, only for evaluations. /// </summary> private void DrawEvaluationAtPosition(CudaStream s, MyMastermindWorld.EvaluationKind evaluation, int position, int xGroupOffset, int yGroupOffset, int evaluationsPerRow) { MyCudaTexture texture; switch (evaluation) { case MyMastermindWorld.EvaluationKind.Bull: texture = Owner.m_textureBull; break; case MyMastermindWorld.EvaluationKind.Cow: texture = Owner.m_textureCow; break; case MyMastermindWorld.EvaluationKind.Miss: default: texture = Owner.m_textureMiss; break; } int xOffset = xGroupOffset + (position % evaluationsPerRow) * (GFX_EVALUATION_SPACER + texture.SizeInPixels.x); int yOffset = yGroupOffset + (position / evaluationsPerRow) * (GFX_EVALUATION_SPACER + texture.SizeInPixels.y); int yDiv = 10; Debug.Assert(texture.SizeInPixels.y % yDiv == 0); m_RgbaTextureKernel.SetupExecution(new dim3(texture.SizeInPixels.x, yDiv, 1), new dim3(texture.SizeInPixels.y / yDiv, TARGET_VALUES_PER_PIXEL)); m_RgbaTextureKernel.RunAsync(s, Owner.VisualOutput, Owner.VisualWidth, Owner.VisualHeight, xOffset, yOffset, texture.BitmapPtr, texture.SizeInPixels.x, texture.SizeInPixels.y); }
internal CudaError LaunchKernelWithStreamBinding( CudaStream stream, CudaKernel kernel, RuntimeKernelConfig config, IntPtr args, IntPtr kernelArgs) { var binding = stream.BindScoped(); var result = LaunchKernel( kernel.FunctionPtr, config.GridDim.X, config.GridDim.Y, config.GridDim.Z, config.GroupDim.X, config.GroupDim.Y, config.GroupDim.Z, config.SharedMemoryConfig.DynamicArraySize, stream.StreamPtr, args, kernelArgs); binding.Recover(); return(result); }
/// <summary> Runs the kernel in asynchronous mode. </summary> /// <param name="stream">If the stream is null, the default per-thread stream is used.</param> /// <param name="args">MyMemoryBlock arguments are automatically converted to device pointers.</param> public void RunAsync(CudaStream stream, params object[] args) { CheckExecutionSetup(); ConvertMemoryBlocksToDevicePtrs(args); m_kernel.RunAsync(MyKernelFactory.GetCuStreamOrDefault(stream), args); }
public virtual void CopyToMemoryBlockAsync(MyMemoryBlock <T> destination, int srcOffset, int destOffset, int count, CudaStream stream = null) { destination.GetDevice(Owner.GPU).AsyncCopyToDevice( Device[Owner.GPU], srcOffset * ESize, destOffset * ESize, count * ESize, MyKernelFactory.GetCuStreamOrDefault(stream)); }
private static void RunCudaTest(SparseMatrix A, SparseMatrix B) { // NOTE: for real symmetric matrices, there's no need to transpose the storage, so // we call the solver constructor with 3rd argument transpose = false. using (var stream = new CudaStream()) using (var solver = new CudaCholesky(stream, B, false)) { TestRandomSymmetric(solver, B, "CUDA Cholesky"); ReportGpuTime(solver.FactorizationTime); } using (var stream = new CudaStream()) using (var solver = new CudaQR(stream, A)) { TestRandom(solver, A, "CUDA QR"); ReportGpuTime(solver.FactorizationTime); } using (var stream = new CudaStream()) using (var solver = new CudaQR(stream, B, false)) { TestRandomSymmetric(solver, B, "CUDA QR"); ReportGpuTime(solver.FactorizationTime); } }
public MyFourierBinder(MyWorkingNode owner, int inputSize, MyMemoryBlock <float> tempBlock) : base(owner, inputSize, tempBlock) { m_stream = new CudaStream(); m_fft = new CudaFFTPlan1D(inputSize, cufftType.R2C, 1); m_fft.SetStream(m_stream.Stream); m_ifft = new CudaFFTPlan1D(inputSize, cufftType.C2R, 1); m_ifft.SetStream(m_stream.Stream); m_mulkernel = MyKernelFactory.Instance.Kernel(owner.GPU, @"Common\CombineVectorsKernel", "MulComplexElementWise"); m_mulkernel.SetupExecution(inputSize + 1); m_involutionKernel = MyKernelFactory.Instance.Kernel(owner.GPU, @"Common\CombineVectorsKernel", "InvolveVector"); m_involutionKernel.SetupExecution(inputSize - 1); m_inversionKernel = MyKernelFactory.Instance.Kernel(owner.GPU, @"Transforms\InvertValuesKernel", "InvertLengthComplexKernel"); m_inversionKernel.SetupExecution(inputSize); m_dotKernel = MyKernelFactory.Instance.KernelProduct <float>(owner, owner.GPU, ProductMode.f_DotProduct_f); m_normalKernel = MyKernelFactory.Instance.Kernel(owner.GPU, @"Transforms\TransformKernels", "PolynomialFunctionKernel"); m_normalKernel.SetupExecution(inputSize); m_firstFFTOffset = 0; m_secondFFTOffset = (inputSize + 1) * 2; m_tempOffset = (inputSize + 1) * 4; Denominator = inputSize; }
public MyPermutationBinder(MyWorkingNode owner, int inputSize, MyMemoryBlock <float> tempBlock) : base(owner, inputSize, tempBlock) { m_stream = new CudaStream(); m_binaryPermKernel = MyKernelFactory.Instance.Kernel(owner.GPU, @"Common\CombineVectorsKernel", "CombineTwoVectorsKernel"); m_binaryPermKernel.SetupExecution(inputSize); }
private GpuContext() { CudaContext = new CudaContext(Global.CudaDeviceId); BlasContext = new CudaBlas(); KernelManager = new KernelManager(this); Methods = new TensorMethods(this); Stream = new CudaStream(); }
/// <summary> /// Register a cuda context for reference via <see cref="GetContextForDeviceId"/> (used for context restoration). /// </summary> /// <param name="context"></param> internal static void RegisterContext(CudaContext context, CudaStream stream) { if (!RegisteredContexts.Contains(context)) { RegisteredContexts.Add(context); RegisteredStreamsByContext.Add(context, stream); } }
// finishing async operations on the device public void DecodeJpegDevice(ref nvjpegImage destination, CudaStream cudaStream) { res = NvJpegNativeMethods.nvjpegDecodeJpegDevice(_nvJpeg.Handle, _decoder.Decoder, _state, ref destination, cudaStream.Stream); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nvjpegDecodeJpegDevice", res)); if (res != nvjpegStatus.Success) { throw new NvJpegException(res); } }
static TensorOpGpu() { _CudaContext = new CudaContext(0, true); _CudaBlasHandle = new CudaBlas(); _CudaStream = new CudaStream(); _CudnnContext = new CudaDNNContext(); _KernelLoader = new KernelLoader(); }
// copies huffman tables from parsed stream. should require same scans structure public void CopyHuffmanTables(EncoderParams encoderParams, JpegStream jpeg, CudaStream stream) { res = NvJpegNativeMethods.nvjpegEncoderParamsCopyHuffmanTables(_state, encoderParams.Params, jpeg.Stream, stream.Stream); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nvjpegEncoderParamsCopyHuffmanTables", res)); if (res != nvjpegStatus.Success) { throw new NvJpegException(res); } }
public void SetOptimizedHuffman(int optimized, CudaStream stream) { res = NvJpegNativeMethods.nvjpegEncoderParamsSetOptimizedHuffman(_params, optimized, stream.Stream); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nvjpegEncoderParamsSetOptimizedHuffman", res)); if (res != nvjpegStatus.Success) { throw new NvJpegException(res); } }
public void SetSamplingFactors(nvjpegChromaSubsampling chroma_subsampling, CudaStream stream) { res = NvJpegNativeMethods.nvjpegEncoderParamsSetSamplingFactors(_params, chroma_subsampling, stream.Stream); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nvjpegEncoderParamsSetSamplingFactors", res)); if (res != nvjpegStatus.Success) { throw new NvJpegException(res); } }
/// <summary> /// This function sets the stream to be used by the cudnn library to execute its routines. /// </summary> /// <param name="stream">the stream to be used by the library.</param> public void SetStream(CudaStream stream) { res = CudaDNNNativeMethods.cudnnSetStream(_handle, stream.Stream); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cudnnSetStream", res)); if (res != cudnnStatus.Success) { throw new CudaDNNException(res); } }
public void SetEncoding(nvjpegJpegEncoding etype, CudaStream stream) { res = NvJpegNativeMethods.nvjpegEncoderParamsSetEncoding(_params, etype, stream.Stream); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nvjpegEncoderParamsSetEncoding", res)); if (res != nvjpegStatus.Success) { throw new NvJpegException(res); } }
/// <summary> /// </summary> internal EncoderState(NvJpeg nvJpeg, CudaStream stream) { _nvJpeg = nvJpeg; _state = new nvjpegEncoderState(); res = NvJpegNativeMethods.nvjpegEncoderStateCreate(nvJpeg.Handle, ref _state, stream.Stream); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nvjpegEncoderStateCreate", res)); if (res != nvjpegStatus.Success) { throw new NvJpegException(res); } }
/// <summary> /// Retrieves the compressed stream from the encoder state that was previously used in one of the encoder functions. /// </summary> /// <param name="ptr">Pointer to the buffer in the host memory where the compressed stream will be stored. Can be NULL</param> /// <param name="length">input buffer size.</param> /// <param name="stream">CUDA stream where all the required device operations will be placed.</param> /// <returns>On return the NVJPEG library will give the actual compressed stream size in this value.</returns> public SizeT RetrieveBitstream(IntPtr ptr, SizeT length, CudaStream stream) { res = NvJpegNativeMethods.nvjpegEncodeRetrieveBitstream(_nvJpeg.Handle, _state, ptr, ref length, stream.Stream); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nvjpegEncodeRetrieveBitstream", res)); if (res != nvjpegStatus.Success) { throw new NvJpegException(res); } return(length); }
public void InitializeData(int count) { this.count = count; IntPtr hostPointer = IntPtr.Zero; var res = DriverAPINativeMethods.MemoryManagement.cuMemAllocHost_v2(ref hostPointer, count * sizeof(ResultPoint)); if (res != CUResult.Success) { throw new CudaException(res); } hostBuffer = (ResultPoint *)hostPointer; deviceBuffer = new CudaDeviceVariable <ResultPoint>(count); for (int i = 0; i < count; i++) { hostBuffer[i].X = (uint)i; hostBuffer[i].Y = (uint)i; } defaultStream = new CudaStream(); res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2( deviceBuffer.DevicePointer, hostPointer, deviceBuffer.SizeInBytes, defaultStream.Stream); if (res != CUResult.Success) { throw new CudaException(res); } IntPtr secondHostPointer = IntPtr.Zero; res = DriverAPINativeMethods.MemoryManagement.cuMemAllocHost_v2(ref secondHostPointer, count * sizeof(ResultPoint)); if (res != CUResult.Success) { throw new CudaException(res); } secondHostBuffer = (ResultPoint *)secondHostPointer; secondDeviceBuffer = new CudaDeviceVariable <ResultPoint>(count); for (int i = 0; i < count; i++) { secondHostBuffer[i].X = (uint)i; secondHostBuffer[i].Y = (uint)i; } defaultStream = new CudaStream(); res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2( secondDeviceBuffer.DevicePointer, secondHostPointer, secondDeviceBuffer.SizeInBytes, defaultStream.Stream); if (res != CUResult.Success) { throw new CudaException(res); } }
private CudnnContext(CudnnHandle handle, CudaStream stream) { if (handle.Pointer == IntPtr.Zero) { throw new ArgumentException("handle"); } Contract.EndContractBlock(); this.handle = handle; this.stream = stream; }
public static CudnnContext Create(CudaStream stream = null) { CudnnHandle handle = default(CudnnHandle); Invoke(() => CudnnNativeMethods.cudnnCreate(out handle)); Contract.Assume(handle.Pointer != IntPtr.Zero); if (stream != null) { Invoke(() => CudnnNativeMethods.cudnnSetStream(handle, stream.Stream)); } return(new CudnnContext(handle, stream)); }
/// <summary> /// Create new dense solve instance using stream stream /// </summary> /// <param name="stream"></param> public CudaSolveDense(CudaStream stream) : this() { SetStream(stream); }
/// <summary> /// Create new sparse solve instance using stream stream /// </summary> /// <param name="stream"></param> public CudaSolveSparse(CudaStream stream) : this() { SetStream(stream); }
/// <summary> /// This function sets the stream to be used by the cudnn library to execute its routines. /// </summary> /// <param name="stream">the stream to be used by the library.</param> public void SetStream(CudaStream stream) { res = CudaDNNNativeMethods.cudnnSetStream(_handle, stream.Stream); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cudnnSetStream", res)); if (res != cudnnStatus.Success) throw new CudaDNNException(res); }