public override float InferenceTraining(CudaDeviceVariable <float> input) { float error = 0; switch (_norm) { case Norm.L1: //derivative of cost-function. Hier L1-Norm: _res.CopyToDevice(input); _groundTrouthData.Sub(input, _dx); _dx.Threshold_GTVal(0, 1); _dx.Threshold_LTVal(0, -1); _dx.DivC(_batch * _inChannels * _inWidth * _inHeight); _groundTrouthData.Sub(input, _temp); _temp.Abs(); _temp.Sum(_summedError, _buffer); error = _summedError; error = error / _batch / _inChannels / _inWidth / _inHeight; break; case Norm.L2: //derivative of cost-function. Hier L2-Norm: _res.CopyToDevice(input); _groundTrouthData.Sub(input, _dx); _dx.DivC(_batch * _inChannels * _inWidth * _inHeight); _groundTrouthData.Sub(input, _temp); _temp.Sqr(); _temp.Sum(_summedError, _buffer); error = _summedError; error = error / _batch / _inChannels / _inWidth / _inHeight; break; case Norm.MSSSIM: _res.CopyToDevice(input); _kernelMSSSIML1.RunSafe(input, _groundTrouthData, _msssiml1, _dx, _inChannels, _batch, 1.0f); _msssiml1.Sum(_summedError, _buffer); error = _summedError; error = error / _batch / _inChannels; break; case Norm.Mix: _res.CopyToDevice(input); _kernelMSSSIML1.RunSafe(input, _groundTrouthData, _msssiml1, _dx, _inChannels, _batch, 0.84f); _msssiml1.Sum(_summedError, _buffer); error = _summedError; break; default: break; } return(error); }
public override void InitRandomWeight(Random rand) { // Xavier weight filling * _outChannels float wconv1 = (float)Math.Sqrt(3.0f / (_filterX * _filterY * _inChannels)); float[] w = new float[_weights.Size]; float[] b = new float[_bias.Size]; // Randomize network for (int i = 0; i < _weights.Size; i++) { w[i] = (float)((rand.NextDouble() * 2.0 - 1.0) * wconv1); } for (int i = 0; i < _bias.Size; i++) { b[i] = (float)((rand.NextDouble() * 2.0 - 1.0) * wconv1); } _weights.CopyToDevice(w); _bias.CopyToDevice(b); switch (_activation) { case Activation.PRelu: _aRelu.Set(0.25f); break; case Activation.LeakyRelu: _aRelu.Set(0.25f); break; default: break; } base.InitRandomWeight(rand); }
public void Run(DistanceOperation operation, CudaDeviceVariable <float> A, int sizeA, CudaDeviceVariable <float> B, int sizeB, CudaDeviceVariable <float> result, int sizeRes) { if (!ValidateAtRun(operation)) { return; } switch (operation) { case DistanceOperation.DotProd: //ZXC m_dotKernel.Run(result.DevicePointer, 0, A.DevicePointer, B.DevicePointer, sizeA, 0); m_dotKernel.Run(result.DevicePointer, A.DevicePointer, B.DevicePointer, sizeA); break; case DistanceOperation.CosDist: //ZXC m_cosKernel.Run(result.DevicePointer, 0, A.DevicePointer, B.DevicePointer, sizeA, 0); m_cosKernel.Run(result.DevicePointer, A.DevicePointer, B.DevicePointer, sizeA); break; case DistanceOperation.EuclidDist: float res = RunReturn(operation, A, sizeA, B, sizeB); result.CopyToDevice(res); break; case DistanceOperation.EuclidDistSquared: m_combineVecsKernel.SetupExecution(sizeA); m_combineVecsKernel.Run(A.DevicePointer, B.DevicePointer, m_temp, (int)MyJoin.MyJoinOperation.Subtraction, sizeA); //ZXC m_dotKernel.Run(result.DevicePointer, 0, m_temp, m_temp, m_temp.Count, 0); m_dotKernel.Run(result.DevicePointer, m_temp, m_temp); break; case DistanceOperation.HammingDist: m_combineVecsKernel.SetupExecution(sizeA); m_combineVecsKernel.Run(A.DevicePointer, B.DevicePointer, m_temp, (int)MyJoin.MyJoinOperation.Equal, sizeA); //ZXC m_reduceSumKernel.Run(result.DevicePointer, m_temp, m_temp.Count, 0, 0, 1, /*distributed = false*/0); // reduction to a single number m_reduceSumKernel.Run(result.DevicePointer, m_temp); float fDist = 0; // to transform number of matches to a number of differences result.CopyToHost(ref fDist); fDist = m_temp.Count - fDist; result.CopyToDevice(fDist); break; case DistanceOperation.HammingSim: m_combineVecsKernel.SetupExecution(sizeA); m_combineVecsKernel.Run(A.DevicePointer, B.DevicePointer, m_temp, (int)MyJoin.MyJoinOperation.Equal, sizeA); //ZXC m_reduceSumKernel.Run(result.DevicePointer, m_temp, m_temp.Count, 0, 0, 1, /*distributed = false*/0); // reduction to a single number m_reduceSumKernel.Run(result.DevicePointer, m_temp); // take the single number (number of different bits) and convert it to Hamming Similarity: // a number in range <0,1> that says how much the vectors are similar float fSim = 0; result.CopyToHost(ref fSim); fSim = fSim / m_temp.Count; result.CopyToDevice(fSim); break; } }
/* * ----------------------- * METHOD CYCLE * ----------------------- */ private void runMethodCycle(bool mustBeUpdated) { m_cycleKernel.SetConstantVariable("D_NB_CURVES", Count); m_cycleKernel.SetConstantVariable("D_TEXTURE_WIDTH", TextureWidth); m_cycleKernel.SetConstantVariable("D_PLOTAREA_WIDTH", m_plotAreaWidth); m_cycleKernel.SetConstantVariable("D_PLOTAREA_HEIGHT", m_plotAreaHeight); m_cycleKernel.SetConstantVariable("D_PLOTAREA_OFFSET_X", m_plotAreaOffsetX); m_cycleKernel.SetConstantVariable("D_MIN_VALUE", m_plotCurrentValueMin); m_cycleKernel.SetConstantVariable("D_MAX_VALUE", m_plotCurrentValueMax); int currentColumn = m_currentSamplingTimeStep % m_plotAreaWidth; if (Stride == 1) { m_valuesHistory.CopyToDevice(Target.GetDevicePtr(this), Offset * sizeof(float), currentColumn * Count * sizeof(float), Count * sizeof(float)); } else { //not really happy with this one for (int i = 0; i < Count; i++) { m_valuesHistory.CopyToDevice(Target.GetDevicePtr(this), (i * Stride + Offset) * sizeof(float), (currentColumn * Count + i) * sizeof(float), sizeof(float)); } } if (mustBeUpdated) { // Draw curves int nbColumnsToDraw = Math.Min(m_currentSamplingTimeStep + 1, m_plotAreaWidth); m_cycleKernel.SetupExecution(nbColumnsToDraw * m_plotAreaHeight); m_cycleKernel.Run( VBODevicePointer, 5, nbColumnsToDraw, m_valuesHistory.DevicePointer ); } else { // Draw only the needed columns m_cycleKernel.SetupExecution(m_plotAreaHeight); m_cycleKernel.Run( VBODevicePointer, currentColumn, 1, m_valuesHistory.DevicePointer ); } // Draw a vertical grey line m_verticalLineKernel.SetupExecution(m_plotAreaHeight); m_verticalLineKernel.Run( VBODevicePointer, m_plotAreaOffsetX + (currentColumn + 1) % m_plotAreaWidth, TextureWidth, m_plotAreaHeight ); }
internal MinMax FindMinAndMax(CudaDeviceVariable <float> a, int size) { if (size > 0) { var ptr = a; while (size > BLOCK_DIM2) { var bufferSize = (size / BLOCK_DIM2) + 1; using (var minBlock = new CudaDeviceVariable <float>(bufferSize)) using (var maxBlock = new CudaDeviceVariable <float>(bufferSize)) { minBlock.Memset(0); maxBlock.Memset(0); _Use(_findMinAndMax, size, k => k.Run(BLOCK_DIM2, ptr.DevicePointer, size, minBlock.DevicePointer, maxBlock.DevicePointer)); if (ptr != a) { ptr.Dispose(); } var minTest = new float[bufferSize]; var maxText = new float[bufferSize]; minBlock.CopyToHost(minTest); maxBlock.CopyToHost(maxText); size = bufferSize * 2; ptr = new CudaDeviceVariable <float>(size); ptr.CopyToDevice(minBlock, 0, 0, bufferSize * sizeof(float)); ptr.CopyToDevice(maxBlock, 0, bufferSize * sizeof(float), bufferSize * sizeof(float)); var test = new float[size]; ptr.CopyToHost(test); } } var data = new float[size]; ptr.CopyToHost(data); float min = float.MaxValue, max = float.MinValue; for (var i = 0; i < size; i++) { var val = data[i]; if (val > max) { max = val; } if (val < min) { min = val; } } return(new MinMax(min, max)); } return(MinMax.Empty); }
public static float2[] calculateCudaFFT(float[] h_dataIn) { CudaContext cntxt = new CudaContext(); //Caution: Array sizes matter! Based on CUFFFT-Documentation... int size_real = h_dataIn.Length; int size_complex = (int)Math.Floor(size_real / 2.0) + 1; //Crating FFT Plan CudaFFTPlanMany fftPlan = new CudaFFTPlanMany(1, new int[] { size_real }, 1, cufftType.R2C); //Size of d_data must be padded for inplace R2C transforms: size_complex * 2 and not size_real CudaDeviceVariable <float> d_data = new CudaDeviceVariable <float>(size_complex * 2); //device allocation and host have different sizes, why the amount of data must be given explicitly for copying: d_data.CopyToDevice(h_dataIn, 0, 0, size_real * sizeof(float)); //executa plan fftPlan.Exec(d_data.DevicePointer, TransformDirection.Forward); //Output to host, either as float2 or float, but array sizes must be right! float2[] h_dataOut = new float2[size_complex]; float[] h_dataOut2 = new float[size_complex * 2]; d_data.CopyToHost(h_dataOut); d_data.CopyToHost(h_dataOut2); fftPlan.Dispose(); return(h_dataOut); }
private T[] RunKernel <T>(Action <T[]> method, T[] parameters) where T : struct { var methodInfo = method.Method; string[] kernels; string llvmIr, ptxIr; var ptx = CudaSharp.CudaSharp.Translate(out kernels, out llvmIr, out ptxIr, "sm_20", methodInfo); Console.WriteLine(llvmIr); Console.WriteLine(ptxIr); var kernel = _context.LoadKernelPTX(ptx, kernels[0]); var maxThreads = kernel.MaxThreadsPerBlock; if (parameters.Length <= maxThreads) { kernel.BlockDimensions = parameters.Length; kernel.GridDimensions = 1; } else { kernel.BlockDimensions = maxThreads; kernel.GridDimensions = parameters.Length / maxThreads; if ((kernel.BlockDimensions * kernel.GridDimensions) != parameters.Length) { throw new Exception(string.Format("Invalid parameters size (must be <= {0} or a multiple of {0}", maxThreads)); } } var gpuMem = new CudaDeviceVariable <T>(parameters.Length); gpuMem.CopyToDevice(parameters); kernel.Run(gpuMem.DevicePointer); gpuMem.CopyToHost(parameters); gpuMem.Dispose(); return(parameters); }
protected override void Execute() { if (m_qMatrix != null && MatrixSizeOK()) { m_kernel.SetupExecution(TextureWidth * TextureHeight); m_kernel.Run( m_plotValues.DevicePointer, m_actionIndices.DevicePointer, m_actionLabels.DevicePointer, numOfActions, LABEL_PIXEL_WIDTH, LABEL_PIXEL_WIDTH, m_minUtilityValue, MaxUtilityValue, m_qMatrix.GetLength(0), m_qMatrix.GetLength(1), VBODevicePointer); if (ViewMode == ViewMethod.Orbit_3D) { m_vertexKernel.SetupExecution(m_qMatrix.Length); m_vertexKernel.Run( m_plotValues.DevicePointer, 0.1f, m_qMatrix.GetLength(0), m_qMatrix.GetLength(1), MaxUtilityValue, VertexVBODevicePointer); } } float[,] lastQMatrix = m_qMatrix; Target.ReadTwoDimensions(ref m_qMatrix, ref m_qMatrixActions, XAxisVariableIndex, YAxisVariableIndex, ApplyInnerScaling); if (lastQMatrix != m_qMatrix) { TriggerReset(); } else if (m_qMatrix != null && MatrixSizeOK()) { m_plotValues.CopyToDevice(m_qMatrix); m_actionIndices.CopyToDevice(m_qMatrixActions); } }
public override void RestoreValues(Stream stream) { BinaryReader br = new BinaryReader(stream); float[] w = new float[_weights.Size]; float[] b = new float[_bias.Size]; for (int i = 0; i < _weights.Size; i++) { w[i] = br.ReadSingle(); } for (int i = 0; i < _bias.Size; i++) { b[i] = br.ReadSingle(); } if (_activation == Activation.PRelu || _activation == Activation.LeakyRelu) { float[] a = new float[_aRelu.Size]; for (int i = 0; i < _aRelu.Size; i++) { a[i] = br.ReadSingle(); } _aRelu.CopyToDevice(a); } _weights.CopyToDevice(w); _bias.CopyToDevice(b); base.RestoreValues(stream); }
// perform Fast Fourier transform private float[] PerformFFT(float[] input) { int size_real = input.Length; int size_complex = (int)Math.Floor(size_real / 2.0) + 1; CudaFFTPlanMany fftPlan = new CudaFFTPlanMany(1, new int[] { size_real }, 1, cufftType.R2C); // size of d_data must be padded for inplace R2C transforms: size_complex * 2 and not size_real CudaDeviceVariable <float> fft = new CudaDeviceVariable <float>(size_complex * 2); // device allocation and host have different sizes, why the amount of data must be given explicitly for copying: fft.CopyToDevice(input, 0, 0, size_real * sizeof(float)); // executa plan fftPlan.Exec(fft.DevicePointer, TransformDirection.Forward); // output to host as float2 float2[] output = new float2[size_complex]; fft.CopyToHost(output); // cleanup fft.Dispose(); fftPlan.Dispose(); // squared magnitude of complex output as a result float[] result = new float[output.Length]; for (int i = 0; i < output.Length; i++) { result[i] = (float)Math.Sqrt((output[i].x * output[i].x) + (output[i].y * output[i].y)); } return(result); }
public SingularValueDecomposition Svd() { Debug.Assert(IsValid); var solver = _cuda.Solver; // find the size of the required buffer var bufferSize = solver.GesvdBufferSizeFloat(_rows, _columns); var mn = Math.Min(_rows, _columns); // allocate output buffers var s = new CudaDeviceVariable <float>(mn); var u = new CudaDeviceVariable <float>(_rows * _rows); var vt = new CudaDeviceVariable <float>(_columns * _columns); // call cusolver to find the SVD try { using (var buffer = new CudaDeviceVariable <float>(bufferSize)) using (var devInfo = new CudaDeviceVariable <int>(1)) using (var rwork = new CudaDeviceVariable <float>(mn)) using (var a = new CudaDeviceVariable <float>(_rows * _columns)) { a.CopyToDevice(_data); solver.Gesvd('A', 'A', _rows, _columns, a, _rows, s, u, _rows, vt, _columns, buffer, bufferSize, rwork, devInfo); return(new SingularValueDecomposition( new GpuMatrix(_cuda, _rows, _rows, u), new GpuVector(_cuda, mn, s), new GpuMatrix(_cuda, _columns, _columns, vt) )); } }catch { s.Dispose(); u.Dispose(); vt.Dispose(); throw; } }
public GpuMatrix(CudaProvider cuda, int rows, int columns, Func <int, int, float> init) { _cuda = cuda; _rows = rows; _columns = columns; cuda.Register(this); var count = rows * columns; var data = new float[count]; for (var j = 0; j < columns; j++) { for (var i = 0; i < rows; i++) { data[j * rows + i] = init(i, j); } } _data = new CudaDeviceVariable <float>(count); _data.CopyToDevice(data); #if DEBUG if (_id == _badAlloc) { Debugger.Break(); } #endif }
public static void blaa() { int num = 10; //NewContext creation CudaContext cntxt = new CudaContext(); //Module loading from precompiled .ptx in a project output folder CUmodule cumodule = cntxt.LoadModule("kernel.ptx"); //_Z9addKernelPf - function name, can be found in *.ptx file CudaKernel addWithCuda = new CudaKernel("_Z9addKernelPf", cumodule, cntxt); //Create device array for data CudaDeviceVariable <float> vec1_device = new CudaDeviceVariable <float>(num); //Create arrays with data float[] vec1 = new float[num]; //Copy data to device vec1_device.CopyToDevice(vec1); //Set grid and block dimensions addWithCuda.GridDimensions = new dim3(8, 1, 1); addWithCuda.BlockDimensions = new dim3(512, 1, 1); //Run the kernel addWithCuda.Run( vec1_device.DevicePointer); //Copy data from device vec1_device.CopyToHost(vec1); }
public void BackwardData(CudnnFilterDescriptor filter, double[] filterData, CudnnTensorDescriptor diffTensor, double[] diffData, CudnnConvolutionDescriptor convolution, CudnnTensorDescriptor gradient, double[] gradientData, CudnnAccumulateResult accumulate) { Contract.Requires(filter != null); Contract.Requires(filterData != null); Contract.Requires(diffTensor != null); Contract.Requires(diffData != null); Contract.Requires(convolution != null); Contract.Requires(gradient != null); Contract.Requires(gradientData != null); ThrowIfNotInitialized(); CheckIfCompatible(CudnnType.Double, filter, diffTensor, gradient); using (var filterDataGpu = new CudaDeviceVariable <double>(filterData.Length)) using (var diffDataGpu = new CudaDeviceVariable <double>(diffData.Length)) using (var gradientDataGpu = new CudaDeviceVariable <double>(gradientData.Length)) { filterDataGpu.CopyToDevice(filterData); diffDataGpu.CopyToDevice(diffData); Invoke(() => CudnnNativeMethods.cudnnConvolutionBackwardData(handle, filter.Handle, filterDataGpu.DevicePointer, diffTensor.Handle, diffDataGpu.DevicePointer, convolution.Handle, gradient.Handle, gradientDataGpu.DevicePointer, accumulate)); gradientDataGpu.CopyToHost(gradientData); } }
public void Backward(CudnnSoftmaxAlgorithm algorithm, CudnnSoftmaxMode mode, CudnnTensorDescriptor srcTensor, float[] srcData, CudnnTensorDescriptor srcDiffTensor, float[] srcDiffData, CudnnTensorDescriptor destDiffTensor, float[] destDiffData) { Contract.Requires(srcTensor != null); Contract.Requires(srcData != null); Contract.Requires(srcDiffTensor != null); Contract.Requires(srcDiffData != null); Contract.Requires(destDiffTensor != null); Contract.Requires(destDiffData != null); ThrowIfNotInitialized(); CheckIfCompatible(CudnnType.Float, srcTensor, srcDiffTensor, destDiffTensor); using (var srcDataGpu = new CudaDeviceVariable <float>(srcData.Length)) using (var srcDiffDataGpu = new CudaDeviceVariable <float>(srcDiffData.Length)) using (var destDiffDataGpu = new CudaDeviceVariable <float>(destDiffData.Length)) { srcDataGpu.CopyToDevice(srcData); srcDiffDataGpu.CopyToDevice(srcDiffData); Invoke(() => CudnnNativeMethods.cudnnSoftmaxBackward(handle, algorithm, mode, srcTensor.Handle, srcDataGpu.DevicePointer, srcDiffTensor.Handle, srcDiffDataGpu.DevicePointer, destDiffTensor.Handle, destDiffDataGpu.DevicePointer)); destDiffDataGpu.CopyToHost(destDiffData); } }
public void Backward(CudnnPoolingDescriptor pooling, CudnnTensorDescriptor srcTensor, float[] srcData, CudnnTensorDescriptor srcDiffTensor, float[] srcDiffData, CudnnTensorDescriptor destTensor, float[] destData, CudnnTensorDescriptor destDiffTensor, float[] destDiffData) { Contract.Requires(pooling != null); Contract.Requires(srcTensor != null); Contract.Requires(srcData != null); Contract.Requires(destTensor != null); Contract.Requires(destData != null); Contract.Requires(srcDiffTensor != null); Contract.Requires(srcDiffData != null); Contract.Requires(destDiffTensor != null); Contract.Requires(destDiffData != null); ThrowIfNotInitialized(); CheckIfCompatible(CudnnType.Float, srcTensor, srcDiffTensor, destTensor, destDiffTensor); using (var srcDataGpu = new CudaDeviceVariable<float>(srcData.Length)) using (var srcDiffDataGpu = new CudaDeviceVariable<float>(srcDiffData.Length)) using (var destDataGpu = new CudaDeviceVariable<float>(destData.Length)) using (var destDiffDataGpu = new CudaDeviceVariable<float>(destDiffData.Length)) { srcDataGpu.CopyToDevice(srcData); srcDiffDataGpu.CopyToDevice(srcDiffData); destDataGpu.CopyToDevice(destData); Invoke(() => CudnnNativeMethods.cudnnPoolingBackward(handle, pooling.Handle, srcTensor.Handle, srcDataGpu.DevicePointer, srcDiffTensor.Handle, srcDiffDataGpu.DevicePointer, destTensor.Handle, destDataGpu.DevicePointer, destDiffTensor.Handle, destDiffDataGpu.DevicePointer)); destDiffDataGpu.CopyToHost(destDiffData); } }
internal void TensorConvertToMatrix(IReadOnlyList <IDeviceMemoryPtr> matrixList, int tensorRows, int tensorColumns, int matrixRows, int matrixColumns, IDeviceMemoryPtr ret) { using (var devicePtr = new CudaDeviceVariable <CUdeviceptr>(matrixList.Count)) { devicePtr.CopyToDevice(matrixList.Select(m => m.DevicePointer).ToArray()); _Use(_tensorConvertToMatrix, matrixRows, matrixColumns, k => k.Run(0, devicePtr.DevicePointer, ret.DevicePointer, tensorRows, tensorColumns, matrixRows, matrixColumns)); } }
void InitCoordinatesAndVelocity() { m_d_PointsCoordinates = new CudaDeviceVariable <float>(3 * Target.MAX_CELLS); float[] m_h_pointsCoordinates = new float[3 * Target.MAX_CELLS]; for (int c = 0; c < m_h_pointsCoordinates.Length; c++) { m_h_pointsCoordinates[c] = COORDINATES_MIN + (COORDINATES_MAX - COORDINATES_MIN) * (float)randomNumber.NextDouble(); //m_h_pointsCoordinates[c] = 1.00f; } m_d_PointsCoordinates.CopyToDevice(m_h_pointsCoordinates); m_d_Velocity = new CudaDeviceVariable <float>(3 * Target.MAX_CELLS); float[] m_h_velocity = new float[3 * Target.MAX_CELLS]; for (int c = 0; c < m_h_velocity.Length; c++) { m_h_velocity[c] = 0.00f; } m_d_Velocity.CopyToDevice(m_h_velocity); m_d_CubeOperation = new CudaDeviceVariable <float>(6 * 4 * 3); int[] operationMask = new int[6 * 4 * 3] { -1, -1, +1, -1, -1, -1, +1, -1, -1, +1, -1, +1, -1, +1, +1, -1, -1, +1, +1, -1, +1, +1, +1, +1, -1, +1, -1, -1, -1, -1, -1, -1, +1, -1, +1, +1, -1, +1, -1, -1, -1, -1, +1, -1, -1, +1, +1, -1, +1, +1, -1, +1, -1, -1, +1, -1, +1, +1, +1, +1, -1, +1, +1, -1, +1, -1, +1, +1, -1, +1, +1, +1 }; float[] m_h_CubeOperation = new float[6 * 4 * 3]; for (int i = 0; i < operationMask.Length; i++) { m_h_CubeOperation[i] = (float)operationMask[i]; } m_d_CubeOperation.CopyToDevice(m_h_CubeOperation); m_d_CubeTexCoordinates = new CudaDeviceVariable <float>(6 * 4 * 2); int[] texCoordinates = new int[6 * 4 * 2] { 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1 }; float[] m_h_CubeTexCoordinates = new float[6 * 4 * 2]; for (int i = 0; i < texCoordinates.Length; i++) { m_h_CubeTexCoordinates[i] = (float)texCoordinates[i]; } m_d_CubeTexCoordinates.CopyToDevice(m_h_CubeTexCoordinates); m_computeCubes2Kernel.SetConstantVariable("operationMaskConstant", m_h_CubeOperation); m_computeCubes2Kernel.SetConstantVariable("cubeTexCoordinatesConstant", m_h_CubeTexCoordinates); }
internal CudaDeviceVariable <CUdeviceptr> GetDeviceMemoryPtr() { var ret = new CudaDeviceVariable <CUdeviceptr>(Count); ret.CopyToDevice(_ptr); return(ret); }
public void Backward(CudnnPoolingDescriptor pooling, CudnnTensorDescriptor srcTensor, double[] srcData, CudnnTensorDescriptor srcDiffTensor, double[] srcDiffData, CudnnTensorDescriptor destTensor, double[] destData, CudnnTensorDescriptor destDiffTensor, double[] destDiffData) { Contract.Requires(pooling != null); Contract.Requires(srcTensor != null); Contract.Requires(srcData != null); Contract.Requires(destTensor != null); Contract.Requires(destData != null); Contract.Requires(srcDiffTensor != null); Contract.Requires(srcDiffData != null); Contract.Requires(destDiffTensor != null); Contract.Requires(destDiffData != null); ThrowIfNotInitialized(); CheckIfCompatible(CudnnType.Double, srcTensor, srcDiffTensor, destTensor, destDiffTensor); using (var srcDataGpu = new CudaDeviceVariable <double>(srcData.Length)) using (var srcDiffDataGpu = new CudaDeviceVariable <double>(srcDiffData.Length)) using (var destDataGpu = new CudaDeviceVariable <double>(destData.Length)) using (var destDiffDataGpu = new CudaDeviceVariable <double>(destDiffData.Length)) { srcDataGpu.CopyToDevice(srcData); srcDiffDataGpu.CopyToDevice(srcDiffData); destDataGpu.CopyToDevice(destData); Invoke(() => CudnnNativeMethods.cudnnPoolingBackward(handle, pooling.Handle, srcTensor.Handle, srcDataGpu.DevicePointer, srcDiffTensor.Handle, srcDiffDataGpu.DevicePointer, destTensor.Handle, destDataGpu.DevicePointer, destDiffTensor.Handle, destDiffDataGpu.DevicePointer)); destDiffDataGpu.CopyToHost(destDiffData); } }
protected override void Execute() { if (firstExec) { firstExec = false; m_kernel_test.SetupExecution(TextureHeight * TextureWidth); m_kernel_test.Run(TextureWidth, TextureHeight, VBODevicePointer); } if (Mode == BoxPlotObserverMinMaxMode.Dynamic) { MinValue = Target.Output.Host.Min(); MaxValue = Target.Output.Host.Max(); } float range = MaxValue - MinValue; int[] box = new int[Target.Output.Count]; for (int i = 0; i < Target.Output.Count; i++) { box[i] = (int)((Target.Output.Host[i] - MinValue) / range * (float)(boxPlotHeight - 1)); } m_box = new CudaDeviceVariable <int>(Target.Output.Count); m_box.CopyToDevice(box); for (int i = 0; i < Target.OutputRowsN; i++) { int xpos = i / rowHintV; int ypos = i % rowHintV; drawBoxPlotAtPostion(m_streams[i], i * 5, horizontalGapV + xpos * (horizontalGapV + boxPlotWidth), verticalGapV + ypos * (verticalGapV + boxPlotHeight)); } Target.Output.SafeCopyToDevice(); }
public IVector Column(int index) { Debug.Assert(IsValid); var ret = new CudaDeviceVariable <float>(_rows); ret.CopyToDevice(_data, index * _rows * sizeof(float), 0, _rows * sizeof(float)); return(new GpuVector(_cuda, _rows, ret)); }
public IVector Clone() { Debug.Assert(IsValid); var data = new CudaDeviceVariable <float>(_size); data.CopyToDevice(_data); return(new GpuVector(_cuda, _size, data)); }
private void assignRandoms() { var hostPhis1 = RandomGenerator.GetInstance().RandomVector(DimensionsCount * ParticlesCount, 0, 1); var hostPhis2 = RandomGenerator.GetInstance().RandomVector(DimensionsCount * ParticlesCount, 0, 1); _phis1.CopyToDevice(hostPhis1); _phis2.CopyToDevice(hostPhis2); }
public IMatrix ToRowMatrix(int numRows = 1) { Debug.Assert(IsValid); var ret = new CudaDeviceVariable <float>(_data.Size); ret.CopyToDevice(_data); return(new GpuMatrix(_cuda, 1, _size, ret)); }
public IMatrix Clone() { Debug.Assert(IsValid); var ret = new CudaDeviceVariable <float>(_rows * _columns); ret.CopyToDevice(_data); return(new GpuMatrix(_cuda, _rows, _columns, ret)); }
internal CudaDeviceVariable <float> PointwiseDivide(CudaDeviceVariable <float> a, CudaDeviceVariable <float> b, int size) { CudaDeviceVariable <float> ret = new CudaDeviceVariable <float>(size); ret.CopyToDevice(b); _Use(_pointwiseDivide, size, k => k.Run(BLOCK_DIM2 * sizeof(float), a.DevicePointer, ret.DevicePointer, size)); return(ret); }
public void CopyFrom(IVector vector) { Debug.Assert(vector.IsValid); var other = (GpuVector)vector; Debug.Assert(other._size == _size); _data.CopyToDevice(other._data); }
public IVector GetColumnSegment(int columnIndex, int rowIndex, int length) { Debug.Assert(IsValid); var ret = new CudaDeviceVariable <float>(length); ret.CopyToDevice(_data, ((columnIndex * _rows) + rowIndex) * sizeof(float), 0, length * sizeof(float)); return(new GpuVector(_cuda, length, ret)); }
public DeviceMemoryPtrList(IReadOnlyList <IReadOnlyList <IDeviceMemoryPtr> > data) { _ptrList = data.Select(d => { var ptr = new CudaDeviceVariable <CUdeviceptr>(d.Count); ptr.CopyToDevice(d.Select(m => m.DevicePointer).ToArray()); return(ptr); }).ToList(); _ptrToPtrList = new CudaDeviceVariable <CUdeviceptr>(data.Count); _ptrToPtrList.CopyToDevice(_ptrList.Select(m => m.DevicePointer).ToArray()); }
public override float Inference(CudaDeviceVariable <float> input) { _input = input; NPPImage_32fC1 tempConv = new NPPImage_32fC1(_tempConvolution.DevicePointer, InWidth, InHeight, InWidth * sizeof(float)); for (int outLayer = 0; outLayer < OutChannels; outLayer++) { SizeT offsetOut = outLayer * OutWidth * OutHeight * sizeof(float); CUdeviceptr ptrWithOffsetOut = _z.DevicePointer + offsetOut; NPPImage_32fC1 imgOut = new NPPImage_32fC1(ptrWithOffsetOut, OutWidth, OutHeight, OutWidth * sizeof(float)); imgOut.Set(0); for (int inLayer = 0; inLayer < InChannels; inLayer++) { SizeT offsetIn = inLayer * InWidth * InHeight * sizeof(float); CUdeviceptr ptrWithOffsetIn = _input.DevicePointer + offsetIn; NPPImage_32fC1 imgIn = new NPPImage_32fC1(ptrWithOffsetIn, InWidth, InHeight, InWidth * sizeof(float)); imgIn.SetRoi(_filterX / 2, _filterY / 2, InWidth - _filterX + 1, InHeight - _filterY + 1); SizeT offsetFilter = (outLayer * InChannels * _filterX * _filterY + inLayer * _filterX * _filterY) * sizeof(float); CudaDeviceVariable <float> filter = new CudaDeviceVariable <float>(_weights.DevicePointer + offsetFilter, false, _filterX * _filterY * sizeof(float)); imgIn.Filter(tempConv, filter, new NppiSize(_filterX, _filterY), new NppiPoint(_filterX / 2, _filterY / 2)); imgOut.Add(tempConv); } imgOut.Add(bHost[outLayer]); } switch (_activation) { case Activation.None: _y.CopyToDevice(_z); break; case Activation.Relu: //_aRelu is set to 0! _KernelPReluForward.RunSafe(_z, _aRelu, _y, _outWidth * _outHeight, _outChannels, _batch); break; case Activation.PRelu: _KernelPReluForward.RunSafe(_z, _aRelu, _y, _outWidth * _outHeight, _outChannels, _batch); break; case Activation.LeakyRelu: _KernelPReluForward.RunSafe(_z, _aRelu, _y, _outWidth * _outHeight, _outChannels, _batch); break; default: break; } return(_nextLayer.Inference(_y)); }
static void Test(byte[] ptxFile) { const int size = 16; var context = new CudaContext(); var kernel = context.LoadKernelPTX(ptxFile, "kernel"); var memory = context.AllocateMemory(4 * size); var gpuMemory = new CudaDeviceVariable<int>(memory); var cpuMemory = new int[size]; for (var i = 0; i < size; i++) cpuMemory[i] = i - 2; gpuMemory.CopyToDevice(cpuMemory); kernel.BlockDimensions = 4; kernel.GridDimensions = 4; kernel.Run(memory); gpuMemory.CopyToHost(cpuMemory); for (var i = 0; i < size; i++) Console.WriteLine("{0} = {1}", i, cpuMemory[i]); }
public void BackwardBias(CudnnTensorDescriptor srcTensor, double[] srcData, CudnnTensorDescriptor destTensor, double[] destData, CudnnAccumulateResult accumulate) { Contract.Requires(srcTensor != null); Contract.Requires(srcData != null); Contract.Requires(destTensor != null); Contract.Requires(destData != null); ThrowIfNotInitialized(); CheckIfCompatible(CudnnType.Double, srcTensor, destTensor); using (var srcDataGpu = new CudaDeviceVariable<double>(srcData.Length)) using (var destDataGpu = new CudaDeviceVariable<double>(destData.Length)) { srcDataGpu.CopyToDevice(srcData); Invoke(() => CudnnNativeMethods.cudnnConvolutionBackwardBias(handle, srcTensor.Handle, srcDataGpu.DevicePointer, destTensor.Handle, destDataGpu.DevicePointer, accumulate)); destDataGpu.CopyToHost(destData); } }
public void Run(DistanceOperation operation, CudaDeviceVariable<float> A, int sizeA, CudaDeviceVariable<float> B, int sizeB, CudaDeviceVariable<float> result, int sizeRes) { if (!ValidateAtRun(operation)) return; switch (operation) { case DistanceOperation.DotProd: //ZXC m_dotKernel.Run(result.DevicePointer, 0, A.DevicePointer, B.DevicePointer, sizeA, 0); m_dotKernel.Run(result.DevicePointer, A.DevicePointer, B.DevicePointer, sizeA); break; case DistanceOperation.CosDist: //ZXC m_cosKernel.Run(result.DevicePointer, 0, A.DevicePointer, B.DevicePointer, sizeA, 0); m_cosKernel.Run(result.DevicePointer, A.DevicePointer, B.DevicePointer, sizeA); break; case DistanceOperation.EuclidDist: float res = RunReturn(operation, A, sizeA, B, sizeB); result.CopyToDevice(res); break; case DistanceOperation.EuclidDistSquared: m_combineVecsKernel.SetupExecution(sizeA); m_combineVecsKernel.Run(A.DevicePointer, B.DevicePointer, m_temp, (int)MyJoin.MyJoinOperation.Subtraction, sizeA); //ZXC m_dotKernel.Run(result.DevicePointer, 0, m_temp, m_temp, m_temp.Count, 0); m_dotKernel.Run(result.DevicePointer, m_temp, m_temp); break; case DistanceOperation.HammingDist: m_combineVecsKernel.SetupExecution(sizeA); m_combineVecsKernel.Run(A.DevicePointer, B.DevicePointer, m_temp, (int)MyJoin.MyJoinOperation.Equal, sizeA); //ZXC m_reduceSumKernel.Run(result.DevicePointer, m_temp, m_temp.Count, 0, 0, 1, /*distributed = false*/0); // reduction to a single number m_reduceSumKernel.Run(result.DevicePointer, m_temp); float fDist = 0; // to transform number of matches to a number of differences result.CopyToHost(ref fDist); fDist = m_temp.Count - fDist; result.CopyToDevice(fDist); break; case DistanceOperation.HammingSim: m_combineVecsKernel.SetupExecution(sizeA); m_combineVecsKernel.Run(A.DevicePointer, B.DevicePointer, m_temp, (int)MyJoin.MyJoinOperation.Equal, sizeA); //ZXC m_reduceSumKernel.Run(result.DevicePointer, m_temp, m_temp.Count, 0, 0, 1, /*distributed = false*/0); // reduction to a single number m_reduceSumKernel.Run(result.DevicePointer, m_temp); // take the single number (number of different bits) and convert it to Hamming Similarity: // a number in range <0,1> that says how much the vectors are similar float fSim = 0; result.CopyToHost(ref fSim); fSim = fSim / m_temp.Count; result.CopyToDevice(fSim); break; } }
static void Main(string[] args) { int N = 275; float[] h_A; float[] h_B; float[] h_C; float[] h_C_ref; CudaDeviceVariable<float> d_A; CudaDeviceVariable<float> d_B; CudaDeviceVariable<float> d_C; float alpha = 1.0f; float beta = 0.0f; int n2 = N * N; int i; float error_norm; float ref_norm; float diff; CudaBlas handle; /* Initialize CUBLAS */ Console.WriteLine("simpleCUBLAS test running."); handle = new CudaBlas(); /* Allocate host memory for the matrices */ h_A = new float[n2]; h_B = new float[n2]; //h_C = new float[n2]; h_C_ref = new float[n2]; Random rand = new Random(0); /* Fill the matrices with test data */ for (i = 0; i < n2; i++) { h_A[i] = (float)rand.NextDouble(); h_B[i] = (float)rand.NextDouble(); //h_C[i] = (float)rand.NextDouble(); } /* Allocate device memory for the matrices */ d_A = new CudaDeviceVariable<float>(n2); d_B = new CudaDeviceVariable<float>(n2); d_C = new CudaDeviceVariable<float>(n2); /* Initialize the device matrices with the host matrices */ d_A.CopyToDevice(h_A); d_B.CopyToDevice(h_B); //d_C.CopyToDevice(h_C); /* Performs operation using plain C code */ simple_sgemm(N, alpha, h_A, h_B, beta, h_C_ref); /* Performs operation using cublas */ handle.Gemm(Operation.NonTranspose, Operation.NonTranspose, N, N, N, alpha, d_A, N, d_B, N, beta, d_C, N); /* Allocate host memory for reading back the result from device memory */ h_C = d_C; /* Check result against reference */ error_norm = 0; ref_norm = 0; for (i = 0; i < n2; ++i) { diff = h_C_ref[i] - h_C[i]; error_norm += diff * diff; ref_norm += h_C_ref[i] * h_C_ref[i]; } error_norm = (float)Math.Sqrt((double)error_norm); ref_norm = (float)Math.Sqrt((double)ref_norm); if (Math.Abs(ref_norm) < 1e-7) { Console.WriteLine("!!!! reference norm is 0"); return; } /* Memory clean up */ d_A.Dispose(); d_B.Dispose(); d_C.Dispose(); /* Shutdown */ handle.Dispose(); if (error_norm / ref_norm < 1e-6f) { Console.WriteLine("simpleCUBLAS test passed."); return; } else { Console.WriteLine("simpleCUBLAS test failed."); return; } }
public void BackwardFilter(CudnnTensorDescriptor srcTensor, double[] srcData, CudnnTensorDescriptor diffTensor, double[] diffData, CudnnConvolutionDescriptor convolution, CudnnFilterDescriptor gradient, double[] gradientData, CudnnAccumulateResult accumulate) { Contract.Requires(srcTensor != null); Contract.Requires(srcData != null); Contract.Requires(diffTensor != null); Contract.Requires(diffData != null); Contract.Requires(convolution != null); Contract.Requires(gradient != null); Contract.Requires(gradientData != null); ThrowIfNotInitialized(); CheckIfCompatible(CudnnType.Double, srcTensor, diffTensor, gradient); using (var srcDataGpu = new CudaDeviceVariable<double>(srcData.Length)) using (var diffDataGpu = new CudaDeviceVariable<double>(diffData.Length)) using (var gradientDataGpu = new CudaDeviceVariable<double>(gradientData.Length)) { srcDataGpu.CopyToDevice(srcData); diffDataGpu.CopyToDevice(diffData); Invoke(() => CudnnNativeMethods.cudnnConvolutionBackwardFilter(handle, srcTensor.Handle, srcDataGpu.DevicePointer, diffTensor.Handle, diffDataGpu.DevicePointer, convolution.Handle, gradient.Handle, gradientDataGpu.DevicePointer, accumulate)); gradientDataGpu.CopyToHost(gradientData); } }
public void Forward(CudnnTensorDescriptor srcTensor, float[] srcData, CudnnFilterDescriptor filter, float[] filterData, CudnnConvolutionDescriptor convolution, CudnnTensorDescriptor destTensor, float[] destData, CudnnAccumulateResult accumulate) { Contract.Requires(srcTensor != null); Contract.Requires(srcData != null); Contract.Requires(filter != null); Contract.Requires(filterData != null); Contract.Requires(convolution != null); Contract.Requires(destTensor != null); Contract.Requires(destData != null); ThrowIfNotInitialized(); CheckIfCompatible(CudnnType.Float, srcTensor, destTensor, filter); using (var srcDataGpu = new CudaDeviceVariable<float>(srcData.Length)) using (var filterDataGpu = new CudaDeviceVariable<float>(filterData.Length)) using (var destDataGpu = new CudaDeviceVariable<float>(destData.Length)) { srcDataGpu.CopyToDevice(srcData); filterDataGpu.CopyToDevice(filterData); Invoke(() => CudnnNativeMethods.cudnnConvolutionForward(handle, srcTensor.Handle, srcDataGpu.DevicePointer, filter.Handle, filterDataGpu.DevicePointer, convolution.Handle, destTensor.Handle, destDataGpu.DevicePointer, accumulate)); destDataGpu.CopyToHost(destData); } }
public void Backward(CudnnSoftmaxAlgorithm algorithm, CudnnSoftmaxMode mode, CudnnTensorDescriptor srcTensor, double[] srcData, CudnnTensorDescriptor srcDiffTensor, double[] srcDiffData, CudnnTensorDescriptor destDiffTensor, double[] destDiffData) { Contract.Requires(srcTensor != null); Contract.Requires(srcData != null); Contract.Requires(srcDiffTensor != null); Contract.Requires(srcDiffData != null); Contract.Requires(destDiffTensor != null); Contract.Requires(destDiffData != null); ThrowIfNotInitialized(); CheckIfCompatible(CudnnType.Double, srcTensor, srcDiffTensor, destDiffTensor); using (var srcDataGpu = new CudaDeviceVariable<double>(srcData.Length)) using (var srcDiffDataGpu = new CudaDeviceVariable<double>(srcDiffData.Length)) using (var destDiffDataGpu = new CudaDeviceVariable<double>(destDiffData.Length)) { srcDataGpu.CopyToDevice(srcData); srcDiffDataGpu.CopyToDevice(srcDiffData); Invoke(() => CudnnNativeMethods.cudnnSoftmaxBackward(handle, algorithm, mode, srcTensor.Handle, srcDataGpu.DevicePointer, srcDiffTensor.Handle, srcDiffDataGpu.DevicePointer, destDiffTensor.Handle, destDiffDataGpu.DevicePointer)); destDiffDataGpu.CopyToHost(destDiffData); } }
static void Main(string[] args) { int SIGNAL_SIZE = 50; int FILTER_KERNEL_SIZE = 11; Console.WriteLine("[simpleCUFFT] is starting..."); var assembly = Assembly.GetExecutingAssembly(); var resourceName = "simpleCUFFT.simpleCUFFTKernel.ptx"; CudaContext ctx = new CudaContext(0); CudaKernel ComplexPointwiseMulAndScale; string[] liste = assembly.GetManifestResourceNames(); using (Stream stream = assembly.GetManifestResourceStream(resourceName)) { ComplexPointwiseMulAndScale = ctx.LoadKernelPTX(stream, "ComplexPointwiseMulAndScale"); } // Allocate host memory for the signal cuFloatComplex[] h_signal = new cuFloatComplex[SIGNAL_SIZE]; //we use cuFloatComplex for complex multiplaction in reference host code... Random rand = new Random(0); // Initialize the memory for the signal for (int i = 0; i < SIGNAL_SIZE; ++i) { h_signal[i].real = (float)rand.NextDouble(); h_signal[i].imag = 0; } // Allocate host memory for the filter cuFloatComplex[] h_filter_kernel = new cuFloatComplex[FILTER_KERNEL_SIZE]; // Initialize the memory for the filter for (int i = 0; i < FILTER_KERNEL_SIZE; ++i) { h_filter_kernel[i].real = (float)rand.NextDouble(); h_filter_kernel[i].imag = 0; } // Pad signal and filter kernel cuFloatComplex[] h_padded_signal = null; cuFloatComplex[] h_padded_filter_kernel = null; int new_size = PadData(h_signal, ref h_padded_signal, SIGNAL_SIZE, h_filter_kernel, ref h_padded_filter_kernel, FILTER_KERNEL_SIZE); int mem_size = (int)cuFloatComplex.SizeOf * new_size; // Allocate device memory for signal CudaDeviceVariable<cuFloatComplex> d_signal = new CudaDeviceVariable<cuFloatComplex>(new_size); // Copy host memory to device d_signal.CopyToDevice(h_padded_signal); // Allocate device memory for filter kernel CudaDeviceVariable<cuFloatComplex> d_filter_kernel = new CudaDeviceVariable<cuFloatComplex>(new_size); // Copy host memory to device d_filter_kernel.CopyToDevice(h_padded_filter_kernel); // CUFFT plan simple API CudaFFTPlan1D plan = new CudaFFTPlan1D(new_size, cufftType.C2C, 1); // Transform signal and kernel Console.WriteLine("Transforming signal cufftExecC2C"); plan.Exec(d_signal.DevicePointer, TransformDirection.Forward); plan.Exec(d_filter_kernel.DevicePointer, TransformDirection.Forward); // Multiply the coefficients together and normalize the result Console.WriteLine("Launching ComplexPointwiseMulAndScale<<< >>>"); ComplexPointwiseMulAndScale.BlockDimensions = 256; ComplexPointwiseMulAndScale.GridDimensions = 32; ComplexPointwiseMulAndScale.Run(d_signal.DevicePointer, d_filter_kernel.DevicePointer, new_size, 1.0f / new_size); // Transform signal back Console.WriteLine("Transforming signal back cufftExecC2C"); plan.Exec(d_signal.DevicePointer, TransformDirection.Inverse); // Copy device memory to host cuFloatComplex[] h_convolved_signal = d_signal; // Allocate host memory for the convolution result cuFloatComplex[] h_convolved_signal_ref = new cuFloatComplex[SIGNAL_SIZE]; // Convolve on the host Convolve(h_signal, SIGNAL_SIZE, h_filter_kernel, FILTER_KERNEL_SIZE, h_convolved_signal_ref); // check result bool bTestResult = sdkCompareL2fe(h_convolved_signal_ref, h_convolved_signal, 1e-5f); //Destroy CUFFT context plan.Dispose(); // cleanup memory d_filter_kernel.Dispose(); d_signal.Dispose(); ctx.Dispose(); if (bTestResult) { Console.WriteLine("Test Passed"); } else { Console.WriteLine("Test Failed"); } }