/// <summary>
/// Runs the <c>GpuConv2DInputGradient</c> kernel on the device and adds its per-block
/// partial results into <paramref name="inputGradients"/>.
/// </summary>
/// <remarks>
/// The kernel writes one partial value per (result element, block) pair; the partials are
/// summed on the host. Results are accumulated with <c>+=</c>, so whatever
/// <c>inputGradients.Values</c> already holds is kept and added to.
/// </remarks>
/// <param name="gradient">Tensor whose values and shape are uploaded as the first kernel input.</param>
/// <param name="rotKernels">Tensor whose values and shape are uploaded as the second kernel input
/// (presumably the rotated convolution kernels - confirm against the caller).</param>
/// <param name="stride">Forwarded unchanged to the kernel.</param>
/// <param name="paddingX">Forwarded unchanged to the kernel.</param>
/// <param name="paddingY">Forwarded unchanged to the kernel.</param>
/// <param name="inputGradients">Tensor whose <c>Values</c> receive the accumulated result.</param>
public override void Conv2DInputGradient(Tensor gradient, Tensor rotKernels, int stride, int paddingX, int paddingY, Tensor inputGradients)
{
    // Shape descriptors handed to the kernel. The fourth entry reuses the kernel's
    // width/height/batch size with a fixed 1 in the third slot.
    GpuShape[] shapes = new[]
    {
        new GpuShape(gradient.Shape),
        new GpuShape(rotKernels.Shape),
        new GpuShape(inputGradients.Shape),
        new GpuShape(rotKernels.Width, rotKernels.Height, 1, rotKernels.BatchSize)
    };

    int threadsRequiredPerResultElem = rotKernels.BatchSize * rotKernels.Height * rotKernels.Width;

    // Computed once so the host buffer and the launch grid can never disagree on the
    // number of partials per result element.
    int blocksPerResultElem = GetBlocksNum(threadsRequiredPerResultElem);
    float[,] resultPartials = new float[inputGradients.Length, blocksPerResultElem];

    try
    {
        float[] devGradient = Gpu.CopyToDevice(gradient.Values);
        float[] devRotKernels = Gpu.CopyToDevice(rotKernels.Values);
        GpuShape[] devShapes = Gpu.CopyToDevice(shapes);
        float[,] devResultPartials = Gpu.Allocate(resultPartials);

        // simulate
        //GpuConv2DInputGradient(GetSimulatedThread(blockSize, new dim3(bx, by, bz), new dim3(tx, ty, tz)), gradient.Values, rotKernels.Values, resultPartials, shapes, paddingX, paddingY, stride);

        Gpu.Launch(new dim3(inputGradients.Length, blocksPerResultElem), THREADS_PER_BLOCK)
           .GpuConv2DInputGradient(devGradient, devRotKernels, devResultPartials, devShapes, paddingX, paddingY, stride);
        Gpu.Synchronize();

        Gpu.CopyFromDevice(devResultPartials, resultPartials);
    }
    finally
    {
        // Release device buffers even when upload, launch or copy-back throws;
        // otherwise every failed call would leak its allocations.
        Gpu.FreeAll();
    }

    // Host-side reduction: fold every partial into its result element, in index order.
    int resultElems = resultPartials.GetLength(0);
    int partialsPerElem = resultPartials.GetLength(1);
    for (int k = 0; k < resultElems; ++k)
    {
        for (int partialId = 0; partialId < partialsPerElem; ++partialId)
        {
            inputGradients.Values[k] += resultPartials[k, partialId];
        }
    }
}
// NOTE: the element-wise Add/Sub/Mul GPU overrides below are currently disabled (commented out).

//public override void Add(Tensor t1, Tensor t2, Tensor result)
//{
//    int threadsRequired = result.Length;
//    float[] devT1 = Gpu.CopyToDevice(t1.Values);
//    float[] devT2 = Gpu.CopyToDevice(t2.Values);
//    float[] devResult = Gpu.Allocate(result.Values);

//    Gpu.Launch(GetBlocksNum(threadsRequired), THREADS_PER_BLOCK).GpuAdd(devT1, devT2, devResult);
//    Gpu.Synchronize();

//    Gpu.CopyFromDevice(devResult, result.Values);
//    Gpu.FreeAll();
//}

//public override void Sub(Tensor t1, Tensor t2, Tensor result)
//{
//    int threadsRequired = result.Length;
//    float[] devT1 = Gpu.CopyToDevice(t1.Values);
//    float[] devT2 = Gpu.CopyToDevice(t2.Values);
//    float[] devResult = Gpu.Allocate(result.Values);

//    Gpu.Launch(GetBlocksNum(threadsRequired), THREADS_PER_BLOCK).GpuSub(devT1, devT2, devResult);
//    Gpu.Synchronize();

//    Gpu.CopyFromDevice(devResult, result.Values);
//    Gpu.FreeAll();
//}

//public override void Mul(Tensor t1, Tensor t2, Tensor result)
//{
//    int threadsRequired = result.BatchSize * t1.Depth * t1.Height * t2.Width;
//    GpuShape[] shapes = new [] { new GpuShape(t1.Shape), new GpuShape(t2.Shape), new GpuShape(result.Shape) };

//    float[] devT1 = Gpu.CopyToDevice(t1.Values);
//    float[] devT2 = Gpu.CopyToDevice(t2.Values);
//    float[] devResult = Gpu.Allocate(result.Values);
//    GpuShape[] devShapes = Gpu.CopyToDevice(shapes);

//    Gpu.Launch(GetBlocksNum(threadsRequired), THREADS_PER_BLOCK).GpuMul(devT1, devT2, devResult, devShapes);
//    Gpu.Synchronize();

//    Gpu.CopyFromDevice(devResult, result.Values);
//    Gpu.FreeAll();
//}

/// <summary>
/// Runs the <c>GpuConv2D</c> kernel on the device and copies its output into
/// <c>result.Values</c>.
/// </summary>
/// <remarks>
/// One thread is requested per combination of input batch item, kernel and output
/// position (<c>t.BatchSize * kernels.BatchSize * result.Width * result.Height</c>).
/// The copy-back writes over the current contents of <c>result.Values</c>.
/// </remarks>
/// <param name="t">Tensor whose values and shape are uploaded as the first kernel input.</param>
/// <param name="kernels">Tensor whose values and shape are uploaded as the second kernel input.</param>
/// <param name="stride">Forwarded unchanged to the kernel.</param>
/// <param name="paddingX">Forwarded unchanged to the kernel.</param>
/// <param name="paddingY">Forwarded unchanged to the kernel.</param>
/// <param name="result">Tensor whose <c>Values</c> array sizes the device output buffer and receives the result.</param>
public override void Conv2D(Tensor t, Tensor kernels, int stride, int paddingX, int paddingY, Tensor result)
{
    int threadsRequired = t.BatchSize * kernels.BatchSize * result.Width * result.Height;

    // Shape descriptors handed to the kernel, in the order: input, kernels, output.
    GpuShape[] shapes = new[]
    {
        new GpuShape(t.Shape),
        new GpuShape(kernels.Shape),
        new GpuShape(result.Shape)
    };

    try
    {
        float[] devT = Gpu.CopyToDevice(t.Values);
        float[] devKernels = Gpu.CopyToDevice(kernels.Values);
        float[] devResult = Gpu.Allocate(result.Values);
        GpuShape[] devShapes = Gpu.CopyToDevice(shapes);

        Gpu.Launch(GetBlocksNum(threadsRequired), THREADS_PER_BLOCK)
           .GpuConv2D(devT, devKernels, devResult, devShapes, paddingX, paddingY, stride);
        Gpu.Synchronize();

        Gpu.CopyFromDevice(devResult, result.Values);
    }
    finally
    {
        // Release device buffers even when upload, launch or copy-back throws;
        // otherwise every failed call would leak its allocations.
        Gpu.FreeAll();
    }
}