/// <summary>
/// Computes the gradient with respect to the convolution input on the GPU and
/// accumulates it into <paramref name="inputGradients"/>.
/// </summary>
/// <remarks>
/// The kernel writes one partial sum per (result element, block) pair into a 2D buffer;
/// the partial sums are then reduced on the host and ADDED to the values already stored in
/// <c>inputGradients.Values</c> (the existing contents are not cleared here).
/// </remarks>
/// <param name="gradient">Gradient tensor copied to the device as the first kernel input.</param>
/// <param name="rotKernels">Kernel tensor copied to the device as the second kernel input
/// (presumably already rotated by the caller, judging by the name - confirm).</param>
/// <param name="stride">Stride forwarded to the kernel.</param>
/// <param name="paddingX">Horizontal padding forwarded to the kernel.</param>
/// <param name="paddingY">Vertical padding forwarded to the kernel.</param>
/// <param name="inputGradients">Tensor whose <c>Values</c> receive the accumulated result.</param>
public override void Conv2DInputGradient(Tensor gradient, Tensor rotKernels, int stride, int paddingX, int paddingY, Tensor inputGradients)
{
    GpuShape[] shapes = new[]
    {
        new GpuShape(gradient.Shape),
        new GpuShape(rotKernels.Shape),
        new GpuShape(inputGradients.Shape),
        new GpuShape(rotKernels.Width, rotKernels.Height, 1, rotKernels.BatchSize)
    };

    int threadsRequiredPerResultElem = rotKernels.BatchSize * rotKernels.Height * rotKernels.Width;

    // Evaluated once and reused for both the host buffer and the launch grid so the two
    // can never disagree (assumes GetBlocksNum has no side effects).
    int blocksPerResultElem = GetBlocksNum(threadsRequiredPerResultElem);
    float[,] resultPartials = new float[inputGradients.Length, blocksPerResultElem];

    try
    {
        float[] devGradient = Gpu.CopyToDevice(gradient.Values);
        float[] devRotKernels = Gpu.CopyToDevice(rotKernels.Values);
        GpuShape[] devShapes = Gpu.CopyToDevice(shapes);
        float[,] devResultPartials = Gpu.Allocate(resultPartials);

        // simulate
        //GpuConv2DInputGradient(GetSimulatedThread(blockSize, new dim3(bx, by, bz), new dim3(tx, ty, tz)), gradient.Values, rotKernels.Values, resultPartials, shapes, paddingX, paddingY, stride);

        Gpu.Launch(new dim3(inputGradients.Length, blocksPerResultElem), THREADS_PER_BLOCK)
           .GpuConv2DInputGradient(devGradient, devRotKernels, devResultPartials, devShapes, paddingX, paddingY, stride);
        Gpu.Synchronize();

        Gpu.CopyFromDevice(devResultPartials, resultPartials);
    }
    finally
    {
        // Release the device buffers even when a copy, the launch or the synchronisation throws.
        Gpu.FreeAll();
    }

    // Host-side reduction of the per-block partial sums.
    for (int k = 0; k < resultPartials.GetLength(0); ++k)
    {
        for (int partialId = 0; partialId < resultPartials.GetLength(1); ++partialId)
        {
            inputGradients.Values[k] += resultPartials[k, partialId];
        }
    }
}
/// <summary>
/// Makes sure the device buffer <c>m_DevOverdBs</c> is sized for <paramref name="sampleCount"/> samples.
/// </summary>
/// <remarks>
/// Nothing happens when the buffer was already allocated for the same sample count.
/// Otherwise the current GPU resources are disposed, a new float buffer is allocated and
/// the new sample count is remembered.
/// </remarks>
/// <param name="sampleCount">Number of float elements the device buffer must hold.</param>
private void AllocateGpuResources(int sampleCount)
{
    bool needsReallocation = m_SampleCount != sampleCount;
    if (needsReallocation)
    {
        // Drop whatever was allocated for the previous size before allocating again.
        DisposeGpuResources();

        m_DevOverdBs = Gpu.Allocate<float>(sampleCount);
        m_SampleCount = sampleCount;
    }
}
/// <summary>
/// Compares <paramref name="target"/> with <paramref name="source"/> element by element on the GPU
/// and reports how many evaluated elements fall outside the relative tolerance band.
/// </summary>
/// <remarks>
/// An element pair is zeroed in both arrays unless both values are greater than
/// <paramref name="ThreshholdTol"/> (for a non-negative threshold). An element is counted when
/// its filtered source value is greater than zero, and it fails when the filtered target value
/// lies outside <c>[(1 - tolerance) * source, (1 + tolerance) * source]</c>.
/// The input arrays themselves are not modified; filtering happens on device copies.
/// </remarks>
/// <param name="source">Reference values.</param>
/// <param name="target">Values compared against <paramref name="source"/>; must have the same length.</param>
/// <param name="tolerance">Relative tolerance used to build the accepted band around each source value.</param>
/// <param name="ThreshholdTol">Cut-off below (or at) which values are excluded from the comparison.</param>
/// <returns>
/// A tuple whose first item is the number of failed elements and whose second item is the number
/// of evaluated elements. Empty input yields <c>(0, 0)</c> without touching the GPU.
/// </returns>
/// <exception cref="ArgumentNullException">Either array is <c>null</c>.</exception>
/// <exception cref="ArgumentException">The arrays differ in length.</exception>
public Tuple<int, int> CompareAbsoluteOpt(double[] source, double[] target, double tolerance, double ThreshholdTol)
{
    System.Diagnostics.Debug.WriteLine("starting an absolute comparison on GPU");

    if (source == null)
    {
        throw new ArgumentNullException("source");
    }

    if (target == null)
    {
        throw new ArgumentNullException("target");
    }

    if (source.Length != target.Length)
    {
        throw new ArgumentException("The source and target lengths need to match");
    }

    int dimension = source.Length;
    if (dimension == 0)
    {
        // Nothing to evaluate, so nothing can fail; skip device allocation entirely.
        System.Diagnostics.Debug.WriteLine("finished an absolute comparison on GPU");
        return new Tuple<int, int>(0, 0);
    }

    // NOTE(review): the threshold is applied as an absolute value. If a cut-off relative to the
    // maximum source value (source.Max() * ThreshholdTol) was intended, confirm and change it here.
    double epsilon = ThreshholdTol;
    double zero = 0.0;
    double lowMultiplier = 1 - tolerance;
    double highMultiplier = 1 + tolerance;

    int failed = 0;
    int isCounted = 0;

    Gpu gpu = Gpu.Default;

    // Declared outside the try block so the finally block can release whatever was allocated.
    double[] sourceOnGPU = null;
    double[] targetOnGPU = null;
    double[] sourceOnGPULow = null;
    double[] sourceOnGPUHigh = null;
    double[] isCountedArray = null;
    double[] isGTtol = null;

    try
    {
        sourceOnGPU = gpu.Allocate(source);
        targetOnGPU = gpu.Allocate(target);
        isCountedArray = gpu.Allocate<double>(dimension);
        sourceOnGPULow = gpu.Allocate<double>(dimension);
        sourceOnGPUHigh = gpu.Allocate<double>(dimension);
        isGTtol = gpu.Allocate<double>(dimension);

        // filter doses below threshold
        // TODO: should failure be -1?
        gpu.For(0, dimension, i => sourceOnGPU[i] = (sourceOnGPU[i] > epsilon) ? sourceOnGPU[i] : zero);
        gpu.For(0, dimension, i => targetOnGPU[i] = (targetOnGPU[i] > epsilon) ? targetOnGPU[i] : zero);

        // Propagate the filtering across both arrays so a pair is kept only when both sides passed.
        gpu.For(0, dimension, i => sourceOnGPU[i] = (targetOnGPU[i] > epsilon) ? sourceOnGPU[i] : zero);
        gpu.For(0, dimension, i => targetOnGPU[i] = (sourceOnGPU[i] > epsilon) ? targetOnGPU[i] : zero);

        // 1.0 marks an element that takes part in the comparison.
        gpu.For(0, dimension, i => isCountedArray[i] = (sourceOnGPU[i] > zero) ? 1.0 : zero);

        // Lower and upper bound of the accepted band around each source value.
        gpu.For(0, dimension, i => sourceOnGPULow[i] = lowMultiplier * sourceOnGPU[i]);
        gpu.For(0, dimension, i => sourceOnGPUHigh[i] = highMultiplier * sourceOnGPU[i]);

        // Stores 1 where the target value lies outside the accepted band, 0 otherwise.
        gpu.For(0, dimension, i => isGTtol[i] = (targetOnGPU[i] < sourceOnGPULow[i] || targetOnGPU[i] > sourceOnGPUHigh[i]) ? 1 : 0);

        isCounted = (int)gpu.Sum(isCountedArray);
        failed = (int)gpu.Sum(isGTtol);
    }
    finally
    {
        // Release every buffer that was actually allocated, also when a kernel or reduction throws.
        foreach (double[] deviceBuffer in new[] { sourceOnGPU, targetOnGPU, sourceOnGPULow, sourceOnGPUHigh, isCountedArray, isGTtol })
        {
            if (deviceBuffer != null)
            {
                Gpu.Free(deviceBuffer);
            }
        }
    }

    System.Diagnostics.Debug.WriteLine("finished an absolute comparison on GPU");
    //gpu.Dispose();
    return new Tuple<int, int>(failed, isCounted);
}
/// <summary>
/// Builds the full pairwise cosine-distance matrix of <paramref name="dataset"/> using
/// one parallel iteration per (row, column) pair.
/// </summary>
/// <remarks>
/// Each entry is <c>max(0, 1 - dot(a, b) / sqrt(|a|^2 * |b|^2))</c>. The inner loop runs over
/// the length of the first vector of the pair, so all vectors are expected to have at least
/// that many elements.
/// NOTE(review): the kernel body reads the <c>dataset</c> argument itself; the explicit device
/// copy made here is only allocated and freed. Confirm whether the kernel should read the
/// device copy instead.
/// </remarks>
/// <param name="gpu">GPU instance used for allocation and the parallel loop.</param>
/// <param name="dataset">Jagged array holding one vector per row.</param>
/// <returns>A square matrix with one distance per ordered pair of vectors.</returns>
private static double[,] CosineSimilarityGpu(Gpu gpu, double[][] dataset)
{
    int vectorCount = dataset.Length;
    int pairCount = vectorCount * vectorCount;

    var deviceDataset = gpu.Allocate(dataset);

    // Allocate directly on gpu.
    var deviceDistances = gpu.Allocate<double>(vectorCount, vectorCount);

    gpu.For(0, pairCount, flatIndex =>
    {
        // Unflatten the 1D work index into a (row, column) pair.
        int row = flatIndex / vectorCount;
        int col = flatIndex % vectorCount;

        double dot = 0;
        double squaredNormRow = 0;
        double squaredNormCol = 0;

        for (int component = 0; component < dataset[row].Length; component++)
        {
            dot = dot + (dataset[row][component] * dataset[col][component]);
            squaredNormRow = squaredNormRow + (dataset[row][component] * dataset[row][component]);
            squaredNormCol = squaredNormCol + (dataset[col][component] * dataset[col][component]);
        }

        double cosine = dot / Math.Sqrt(squaredNormRow * squaredNormCol);

        // Clamp at zero so rounding noise cannot produce a negative distance.
        deviceDistances[row, col] = Math.Max(0, 1 - cosine);
    });

    // Gpu -> Cpu.
    var hostDistances = new double[vectorCount, vectorCount];
    Gpu.Copy(deviceDistances, hostDistances);

    // Release gpu memory.
    Gpu.Free(deviceDataset);
    Gpu.Free(deviceDistances);

    return hostDistances;
}
//public override void Add(Tensor t1, Tensor t2, Tensor result)
//{
//    int threadsRequired = result.Length;

//    float[] devT1 = Gpu.CopyToDevice(t1.Values);
//    float[] devT2 = Gpu.CopyToDevice(t2.Values);
//    float[] devResult = Gpu.Allocate(result.Values);

//    Gpu.Launch(GetBlocksNum(threadsRequired), THREADS_PER_BLOCK).GpuAdd(devT1, devT2, devResult);
//    Gpu.Synchronize();

//    Gpu.CopyFromDevice(devResult, result.Values);
//    Gpu.FreeAll();
//}

//public override void Sub(Tensor t1, Tensor t2, Tensor result)
//{
//    int threadsRequired = result.Length;

//    float[] devT1 = Gpu.CopyToDevice(t1.Values);
//    float[] devT2 = Gpu.CopyToDevice(t2.Values);
//    float[] devResult = Gpu.Allocate(result.Values);

//    Gpu.Launch(GetBlocksNum(threadsRequired), THREADS_PER_BLOCK).GpuSub(devT1, devT2, devResult);
//    Gpu.Synchronize();

//    Gpu.CopyFromDevice(devResult, result.Values);
//    Gpu.FreeAll();
//}

//public override void Mul(Tensor t1, Tensor t2, Tensor result)
//{
//    int threadsRequired = result.BatchSize * t1.Depth * t1.Height * t2.Width;
//    GpuShape[] shapes = new [] { new GpuShape(t1.Shape), new GpuShape(t2.Shape), new GpuShape(result.Shape) };

//    float[] devT1 = Gpu.CopyToDevice(t1.Values);
//    float[] devT2 = Gpu.CopyToDevice(t2.Values);
//    float[] devResult = Gpu.Allocate(result.Values);
//    GpuShape[] devShapes = Gpu.CopyToDevice(shapes);

//    Gpu.Launch(GetBlocksNum(threadsRequired), THREADS_PER_BLOCK).GpuMul(devT1, devT2, devResult, devShapes);
//    Gpu.Synchronize();

//    Gpu.CopyFromDevice(devResult, result.Values);
//    Gpu.FreeAll();
//}

/// <summary>
/// Runs the 2D convolution kernel on the GPU and copies the outcome into
/// <c>result.Values</c>.
/// </summary>
/// <param name="t">Input tensor; its values and shape are copied to the device.</param>
/// <param name="kernels">Convolution kernels; their values and shape are copied to the device.</param>
/// <param name="stride">Stride forwarded to the kernel.</param>
/// <param name="paddingX">Horizontal padding forwarded to the kernel.</param>
/// <param name="paddingY">Vertical padding forwarded to the kernel.</param>
/// <param name="result">Output tensor; its shape is sent to the device and its values are
/// overwritten with the data copied back from the device.</param>
public override void Conv2D(Tensor t, Tensor kernels, int stride, int paddingX, int paddingY, Tensor result)
{
    // One thread per (input batch item, kernel, output x, output y) combination.
    int threadsRequired = t.BatchSize * kernels.BatchSize * result.Width * result.Height;
    GpuShape[] shapes = new[] { new GpuShape(t.Shape), new GpuShape(kernels.Shape), new GpuShape(result.Shape) };

    try
    {
        float[] devT = Gpu.CopyToDevice(t.Values);
        float[] devKernels = Gpu.CopyToDevice(kernels.Values);
        float[] devResult = Gpu.Allocate(result.Values);
        GpuShape[] devShapes = Gpu.CopyToDevice(shapes);

        Gpu.Launch(GetBlocksNum(threadsRequired), THREADS_PER_BLOCK)
           .GpuConv2D(devT, devKernels, devResult, devShapes, paddingX, paddingY, stride);
        Gpu.Synchronize();

        Gpu.CopyFromDevice(devResult, result.Values);
    }
    finally
    {
        // Release the device buffers even when a copy, the launch or the synchronisation throws.
        Gpu.FreeAll();
    }
}
/// <summary>
/// Returns the cached 2D device buffer <c>m_DevNormSamples</c>, allocating it on first use.
/// </summary>
/// <remarks>
/// NOTE(review): once the buffer exists it is returned as is, whatever dimensions are
/// requested - presumably the field is reset elsewhere when the sizes change; confirm.
/// </remarks>
/// <param name="channels">First dimension used when the buffer has to be allocated.</param>
/// <param name="sampleCount">Second dimension used when the buffer has to be allocated.</param>
/// <returns>The existing or newly allocated buffer.</returns>
private float[,] GetDevNormSamples(int channels, int sampleCount)
{
    if (m_DevNormSamples == null)
    {
        m_DevNormSamples = Gpu.Allocate<float>(channels, sampleCount);
    }

    return m_DevNormSamples;
}
/// <summary>
/// Returns the cached device output buffer <c>m_DevOutputSamples</c> as a <typeparamref name="T"/>
/// array, allocating it on first use.
/// </summary>
/// <remarks>
/// NOTE(review): an existing buffer is returned without checking <paramref name="length"/>,
/// and the cast throws if it was previously allocated for a different element type.
/// </remarks>
/// <typeparam name="T">Element type of the buffer.</typeparam>
/// <param name="length">Element count used when the buffer has to be allocated.</param>
/// <returns>The existing or newly allocated buffer, cast to <c>T[]</c>.</returns>
private T[] GetDevOutputSamples<T>(int length) where T : struct
{
    if (m_DevOutputSamples == null)
    {
        m_DevOutputSamples = Gpu.Allocate<T>(length);
    }

    return (T[])m_DevOutputSamples;
}