/// <summary>
/// Runs <c>ForwardKernel</c> on the GPU for the given input and returns the values read back from the device.
/// </summary>
/// <param name="x">Input array; <c>x.Data</c> backs the input buffer and <c>x.BatchCount</c> sizes the output.</param>
/// <returns>
/// The value produced by <c>NdArray.Convert</c> for the output values, using shape <c>{ OutputCount }</c>
/// and the batch count of <paramref name="x"/>.
/// </returns>
protected override NdArray NeedPreviousForwardGpu(NdArray x)
{
    // The output starts default-initialised when NoBias is set, otherwise as whatever GetBiasedValue returns.
    Real[] output;
    if (NoBias)
    {
        output = new Real[OutputCount * x.BatchCount];
    }
    else
    {
        output = GetBiasedValue(x.BatchCount);
    }

    using (var inputBuffer = new ComputeBuffer<Real>(Weaver.Context, ComputeMemoryFlags.None, x.Data))
    {
        using (var outputBuffer = new ComputeBuffer<Real>(Weaver.Context, ComputeMemoryFlags.None, output))
        {
            // gpuW is not declared in this method; it resolves to a member defined outside this block.
            ForwardKernel.SetMemoryArgument(0, inputBuffer);
            ForwardKernel.SetMemoryArgument(1, gpuW);
            ForwardKernel.SetMemoryArgument(2, outputBuffer);
            ForwardKernel.SetValueArgument(3, OutputCount);
            ForwardKernel.SetValueArgument(4, InputCount);

            // Work size handed to Execute: OutputCount by batch count.
            long[] workSize = { OutputCount, x.BatchCount };
            Weaver.CommandQueue.Execute(ForwardKernel, null, workSize, null, null);
            Weaver.CommandQueue.Flush();

            // Original author's note: "for less cpu use. this is 65% of computation time (10.4ms on 1080ti)."
            // NOTE(review): ASleep is defined elsewhere; the argument is presumably milliseconds - confirm.
            ASleep(6.5);

            Weaver.CommandQueue.Finish();
            Weaver.CommandQueue.ReadFromBuffer(outputBuffer, ref output, true, null);
        }
    }

    var shape = new[] { OutputCount };
    return NdArray.Convert(output, shape, x.BatchCount, this);
}
/// <summary>
/// Uploads the input, the weights and the initial output to device buffers, runs <c>ForwardKernel</c>,
/// and returns the output values read back from the device.
/// </summary>
/// <param name="x">Input array; <c>x.Data</c> is copied into the input buffer and <c>x.BatchCount</c> sizes the output.</param>
/// <returns>
/// The value produced by <c>NdArray.Convert</c> for the output values, using shape <c>{ OutputCount }</c>
/// and the batch count of <paramref name="x"/>.
/// </returns>
protected override NdArray NeedPreviousForwardGpu(NdArray x)
{
    const ComputeMemoryFlags readOnlyCopy = ComputeMemoryFlags.ReadOnly | ComputeMemoryFlags.CopyHostPointer;
    const ComputeMemoryFlags readWriteCopy = ComputeMemoryFlags.ReadWrite | ComputeMemoryFlags.CopyHostPointer;

    // The output starts default-initialised when NoBias is set, otherwise as whatever GetBiasedValue returns.
    Real[] output;
    if (NoBias)
    {
        output = new Real[OutputCount * x.BatchCount];
    }
    else
    {
        output = GetBiasedValue(x.BatchCount);
    }

    using (var inputBuffer = new ComputeBuffer<Real>(Weaver.Context, readOnlyCopy, x.Data))
    {
        using (var weightBuffer = new ComputeBuffer<Real>(Weaver.Context, readOnlyCopy, Weight.Data))
        {
            using (var outputBuffer = new ComputeBuffer<Real>(Weaver.Context, readWriteCopy, output))
            {
                ForwardKernel.SetMemoryArgument(0, inputBuffer);
                ForwardKernel.SetMemoryArgument(1, weightBuffer);
                ForwardKernel.SetMemoryArgument(2, outputBuffer);
                ForwardKernel.SetValueArgument(3, OutputCount);
                ForwardKernel.SetValueArgument(4, InputCount);

                // Work size handed to Execute: OutputCount by batch count.
                long[] workSize = { OutputCount, x.BatchCount };
                Weaver.CommandQueue.Execute(ForwardKernel, null, workSize, null, null);

                // Wait for the queue to drain before reading the output back into the host array.
                Weaver.CommandQueue.Finish();
                Weaver.CommandQueue.ReadFromBuffer(outputBuffer, ref output, true, null);
            }
        }
    }

    var shape = new[] { OutputCount };
    return NdArray.Convert(output, shape, x.BatchCount, this);
}
/// <summary>
/// Computes the forward pass for a single input: through the base implementation when
/// <c>IsParallel</c> is false, otherwise by running <c>ForwardKernel</c> on the OpenCL command queue.
/// </summary>
/// <remarks>
/// The device buffers are created with <c>CopyHostPointer</c> instead of <c>UseHostPointer</c>.
/// With <c>UseHostPointer</c> the device keeps referring to the host memory for the whole lifetime
/// of the buffer, but the managed arrays passed here are not kept pinned for that long, so the
/// garbage collector is free to move them while the kernel runs.
/// NOTE(review): this assumes the <c>ComputeBuffer</c> array constructor pins the array only while
/// the buffer is being created (as Cloo's does) - confirm against the binding in use.
/// </remarks>
/// <param name="x">Input array; <c>x.Data</c> is copied into the input buffer and <c>x.BatchCount</c> sizes the output.</param>
/// <returns>
/// The base implementation's result when <c>IsParallel</c> is false; otherwise the value produced by
/// <c>NdArray.Convert</c> for the output values, using shape <c>{ OutputCount }</c> and the batch count of <paramref name="x"/>.
/// </returns>
public override NdArray SingleInputForward(NdArray x)
{
    // Flag check: use the base implementation when IsParallel is false.
    if (!IsParallel)
    {
        return base.SingleInputForward(x);
    }

    // The output starts default-initialised when NoBias is set, otherwise as whatever GetBiasedValue returns.
    Real[] y = this.NoBias ? new Real[OutputCount * x.BatchCount] : GetBiasedValue(x.BatchCount);

    // Copy (rather than alias) the host arrays so the device never depends on unpinned managed memory.
    const ComputeMemoryFlags readOnlyCopy = ComputeMemoryFlags.ReadOnly | ComputeMemoryFlags.CopyHostPointer;
    const ComputeMemoryFlags readWriteCopy = ComputeMemoryFlags.ReadWrite | ComputeMemoryFlags.CopyHostPointer;

    using (ComputeBuffer<Real> gpuX = new ComputeBuffer<Real>(OpenCL.Context, readOnlyCopy, x.Data))
    using (ComputeBuffer<Real> gpuW = new ComputeBuffer<Real>(OpenCL.Context, readOnlyCopy, this.Weight.Data))
    using (ComputeBuffer<Real> gpuY = new ComputeBuffer<Real>(OpenCL.Context, readWriteCopy, y))
    {
        ForwardKernel.SetMemoryArgument(0, gpuX);
        ForwardKernel.SetMemoryArgument(1, gpuW);
        ForwardKernel.SetMemoryArgument(2, gpuY);
        ForwardKernel.SetValueArgument(3, this.OutputCount);
        ForwardKernel.SetValueArgument(4, this.InputCount);

        // Work size handed to Execute: OutputCount by batch count.
        OpenCL.CommandQueue.Execute
        (
            ForwardKernel,
            null,
            new long[] { OutputCount, x.BatchCount },
            null,
            null
        );

        // Wait for the queue to drain before reading the output back into the host array.
        OpenCL.CommandQueue.Finish();
        OpenCL.CommandQueue.ReadFromBuffer(gpuY, ref y, true, null);
    }

    return NdArray.Convert(y, new[] { OutputCount }, x.BatchCount, this);
}
/// <summary>
/// Runs <c>ForwardKernel</c> using the buffers exposed by the input, weight and output arrays,
/// and returns a new <see cref="NdArray"/> built around the output array.
/// </summary>
/// <param name="x">Input array; <c>x.Data</c> supplies the input buffer and <c>x.BatchCount</c> sizes the output.</param>
/// <returns>
/// A new <see cref="NdArray"/> constructed from the "y" array, a "y.Grad" array of the same length,
/// shape <c>{ OutputCount }</c>, the batch count of <paramref name="x"/> and this instance.
/// </returns>
protected override NdArray NeedPreviousForwardGpu(NdArray x)
{
    const ComputeMemoryFlags arrayFlags = ComputeMemoryFlags.ReadWrite | ComputeMemoryFlags.CopyHostPointer;

    // Initial output values: default-initialised when NoBias is set, otherwise whatever GetBiasedValue returns.
    var initialValues = NoBias
        ? new Real[OutputCount * x.BatchCount]
        : GetBiasedValue(x.BatchCount);

    var output = GetArray("y", initialValues.Length, arrayFlags);
    output.Write(initialValues);

    // Switch all three arrays to the GPU device type before asking them for their buffers.
    x.Data.Switch(Common.ComputeDeviceTypes.Gpu);
    Weight.Data.Switch(Common.ComputeDeviceTypes.Gpu);
    output.Switch(Common.ComputeDeviceTypes.Gpu);

    var inputBuffer = x.Data.GetBuffer();
    var weightBuffer = Weight.Data.GetBuffer();
    var outputBuffer = output.GetBuffer();

    ForwardKernel.SetMemoryArgument(0, inputBuffer);
    ForwardKernel.SetMemoryArgument(1, weightBuffer);
    ForwardKernel.SetMemoryArgument(2, outputBuffer);
    ForwardKernel.SetValueArgument(3, OutputCount);
    ForwardKernel.SetValueArgument(4, InputCount);

    // Work size handed to Execute: OutputCount by batch count.
    long[] workSize = { OutputCount, x.BatchCount };
    Weaver.CommandQueue.Execute(ForwardKernel, null, workSize, null, null);
    Weaver.CommandQueue.Flush();
    Weaver.CommandQueue.Finish();

    // No explicit read-back here: the output array itself is handed to the returned NdArray.
    var gradient = GetArray("y.Grad", output.Length, arrayFlags);
    var shape = new[] { OutputCount };
    return new NdArray(output, gradient, shape, x.BatchCount, this);
}
/// <summary>
/// Computes the forward pass for a single input: through the base implementation when
/// <c>IsParallel</c> is false, otherwise by running <c>ForwardKernel</c> on the OpenCL command queue.
/// </summary>
/// <remarks>
/// The input, weight and bias buffers are created with <c>CopyHostPointer</c> instead of
/// <c>UseHostPointer</c>. With <c>UseHostPointer</c> the device keeps referring to the host memory
/// for the whole lifetime of the buffer, but the managed arrays passed here are not kept pinned for
/// that long; the temporary bias array created when <c>NoBias</c> is set is not even referenced
/// after the buffer has been constructed, so it could be collected while the kernel still uses it.
/// NOTE(review): this assumes the <c>ComputeBuffer</c> array constructor pins the array only while
/// the buffer is being created (as Cloo's does) - confirm against the binding in use.
/// </remarks>
/// <param name="input">
/// Input array; <c>Shape[1]</c> and <c>Shape[2]</c> are used as the input height and width,
/// and <c>BatchCount</c> sizes the output.
/// </param>
/// <returns>
/// The base implementation's result when <c>IsParallel</c> is false; otherwise the value produced by
/// <c>NdArray.Convert</c> for the output values, using shape <c>{ OutputCount, outputHeight, outputWidth }</c>
/// and the batch count of <paramref name="input"/>.
/// </returns>
public override NdArray SingleInputForward(NdArray input)
{
    // Flag check: use the base implementation when IsParallel is false.
    if (!IsParallel)
    {
        return base.SingleInputForward(input);
    }

    // Output extent per axis: (inputSize - 1) * stride + kernelSize - 2 * pad.
    int outputHeight = (input.Shape[1] - 1) * this.StrideY + this.KernelHeight - this.PadY * 2;
    int outputWidth = (input.Shape[2] - 1) * this.StrideX + this.KernelWidth - this.PadX * 2;

    Real[] result = new Real[input.BatchCount * this.OutputCount * outputWidth * outputHeight];

    // Copy (rather than alias) the host arrays so the device never depends on unpinned managed memory.
    const ComputeMemoryFlags readOnlyCopy = ComputeMemoryFlags.ReadOnly | ComputeMemoryFlags.CopyHostPointer;

    using (ComputeBuffer<Real> gpuX = new ComputeBuffer<Real>(OpenCL.Context, readOnlyCopy, input.Data))
    using (ComputeBuffer<Real> gpuW = new ComputeBuffer<Real>(OpenCL.Context, readOnlyCopy, this.Weight.Data))
    using (ComputeBuffer<Real> gpub = new ComputeBuffer<Real>(OpenCL.Context, readOnlyCopy, this.NoBias ? new Real[OutputCount] : this.Bias.Data))
    using (ComputeBuffer<Real> gpuY = new ComputeBuffer<Real>(OpenCL.Context, ComputeMemoryFlags.WriteOnly | ComputeMemoryFlags.AllocateHostPointer, result.Length))
    {
        ForwardKernel.SetMemoryArgument(0, gpuX);
        ForwardKernel.SetMemoryArgument(1, gpuW);
        ForwardKernel.SetMemoryArgument(2, gpub);
        ForwardKernel.SetMemoryArgument(3, gpuY);
        ForwardKernel.SetValueArgument(4, input.Shape[1]);
        ForwardKernel.SetValueArgument(5, input.Shape[2]);
        ForwardKernel.SetValueArgument(6, input.Length);
        ForwardKernel.SetValueArgument(7, outputWidth);
        ForwardKernel.SetValueArgument(8, outputHeight);
        ForwardKernel.SetValueArgument(9, this.StrideX);
        ForwardKernel.SetValueArgument(10, this.StrideY);
        ForwardKernel.SetValueArgument(11, this.PadX);
        ForwardKernel.SetValueArgument(12, this.PadY);
        ForwardKernel.SetValueArgument(13, this.KernelHeight);
        ForwardKernel.SetValueArgument(14, this.KernelWidth);
        ForwardKernel.SetValueArgument(15, this.OutputCount);
        ForwardKernel.SetValueArgument(16, this.InputCount);

        // Work size handed to Execute: (batch count * OutputCount) by output height by output width.
        OpenCL.CommandQueue.Execute
        (
            ForwardKernel,
            null,
            new long[] { input.BatchCount * OutputCount, outputHeight, outputWidth },
            null,
            null
        );

        // Wait for the queue to drain before reading the output back into the host array.
        OpenCL.CommandQueue.Finish();
        OpenCL.CommandQueue.ReadFromBuffer(gpuY, ref result, true, null);
    }

    return NdArray.Convert(result, new[] { this.OutputCount, outputHeight, outputWidth }, input.BatchCount, this);
}
/// <summary>
/// Uploads the input, weights and bias to device buffers, runs <c>ForwardKernel</c>,
/// and returns the output values read back from the device.
/// </summary>
/// <param name="input">
/// Input array; <c>Shape[1]</c> and <c>Shape[2]</c> are used as the input height and width,
/// and <c>BatchCount</c> sizes the output.
/// </param>
/// <returns>
/// The value produced by <c>NdArray.Convert</c> for the output values, using shape
/// <c>{ OutputCount, outputHeight, outputWidth }</c> and the batch count of <paramref name="input"/>.
/// </returns>
protected override NdArray NeedPreviousForwardGpu(NdArray input)
{
    const ComputeMemoryFlags readOnlyCopy = ComputeMemoryFlags.ReadOnly | ComputeMemoryFlags.CopyHostPointer;
    const ComputeMemoryFlags writeOnlyAlloc = ComputeMemoryFlags.WriteOnly | ComputeMemoryFlags.AllocateHostPointer;

    // Output extent per axis: floor((inputSize - kernelSize + 2 * pad) / stride) + 1.
    // The 2.0 literal forces the division to happen in floating point before the floor.
    double paddedSpanY = input.Shape[1] - _kHeight + _padY * 2.0;
    int outputHeight = (int)Math.Floor(paddedSpanY / _strideY) + 1;

    double paddedSpanX = input.Shape[2] - _kWidth + _padX * 2.0;
    int outputWidth = (int)Math.Floor(paddedSpanX / _strideX) + 1;

    Real[] output = new Real[OutputCount * outputHeight * outputWidth * input.BatchCount];

    using (var inputBuffer = new ComputeBuffer<Real>(Weaver.Context, readOnlyCopy, input.Data))
    {
        using (var weightBuffer = new ComputeBuffer<Real>(Weaver.Context, readOnlyCopy, Weight.Data))
        {
            // TODO (marker carried over from the original source; no further detail was given there)
            // A default-initialised bias of OutputCount elements stands in when NoBias is set.
            Real[] biasValues;
            if (NoBias)
            {
                biasValues = new Real[OutputCount];
            }
            else
            {
                biasValues = (Real[])Bias.Data;
            }

            using (var biasBuffer = new ComputeBuffer<Real>(Weaver.Context, readOnlyCopy, biasValues))
            {
                using (var outputBuffer = new ComputeBuffer<Real>(Weaver.Context, writeOnlyAlloc, output.Length))
                {
                    ForwardKernel.SetMemoryArgument(0, inputBuffer);
                    ForwardKernel.SetMemoryArgument(1, weightBuffer);
                    ForwardKernel.SetMemoryArgument(2, biasBuffer);
                    ForwardKernel.SetMemoryArgument(3, outputBuffer);
                    ForwardKernel.SetValueArgument(4, input.Shape[1]);
                    ForwardKernel.SetValueArgument(5, input.Shape[2]);
                    ForwardKernel.SetValueArgument(6, input.Length);
                    ForwardKernel.SetValueArgument(7, outputWidth);
                    ForwardKernel.SetValueArgument(8, outputHeight);
                    ForwardKernel.SetValueArgument(9, _strideX);
                    ForwardKernel.SetValueArgument(10, _strideY);
                    ForwardKernel.SetValueArgument(11, _padX);
                    ForwardKernel.SetValueArgument(12, _padY);
                    ForwardKernel.SetValueArgument(13, _kHeight);
                    ForwardKernel.SetValueArgument(14, _kWidth);
                    ForwardKernel.SetValueArgument(15, OutputCount);
                    ForwardKernel.SetValueArgument(16, InputCount);

                    // Work size handed to Execute: (batch count * OutputCount) by output height by output width.
                    long[] workSize = { input.BatchCount * OutputCount, outputHeight, outputWidth };
                    Weaver.CommandQueue.Execute(ForwardKernel, null, workSize, null, null);

                    // Wait for the queue to drain before reading the output back into the host array.
                    Weaver.CommandQueue.Finish();
                    Weaver.CommandQueue.ReadFromBuffer(outputBuffer, ref output, true, null);
                }
            }
        }
    }

    var shape = new[] { OutputCount, outputHeight, outputWidth };
    return NdArray.Convert(output, shape, input.BatchCount, this);
}
/// <summary>
/// Uploads the input, weights and bias to device buffers, runs <c>ForwardKernel</c>,
/// and returns the output values read back from the device.
/// </summary>
/// <param name="input">
/// Input array; <c>Shape[1]</c> and <c>Shape[2]</c> are used as the input height and width,
/// and <c>BatchCount</c> sizes the output.
/// </param>
/// <returns>
/// The value produced by <c>NdArray.Convert</c> for the output values, using shape
/// <c>{ OutputCount, outputHeight, outputWidth }</c> and the batch count of <paramref name="input"/>.
/// </returns>
protected override NdArray NeedPreviousForwardGpu([NotNull] NdArray input)
{
    const ComputeMemoryFlags readOnlyCopy = ComputeMemoryFlags.ReadOnly | ComputeMemoryFlags.CopyHostPointer;
    const ComputeMemoryFlags writeOnlyAlloc = ComputeMemoryFlags.WriteOnly | ComputeMemoryFlags.AllocateHostPointer;

    // Output extent per axis: (inputSize - 1) * subSample + kernelSize - 2 * trim.
    int outputHeight = (input.Shape[1] - 1) * _subSampleY + _kHeight - _trimY * 2;
    int outputWidth = (input.Shape[2] - 1) * _subSampleX + _kWidth - _trimX * 2;

    Real[] output = new Real[input.BatchCount * OutputCount * outputWidth * outputHeight];

    // A default-initialised bias of OutputCount elements stands in when NoBias is set.
    using (var inputBuffer = new ComputeBuffer<Real>(Weaver.Context, readOnlyCopy, input.Data))
    using (var weightBuffer = new ComputeBuffer<Real>(Weaver.Context, readOnlyCopy, Weight.Data))
    using (var biasBuffer = new ComputeBuffer<Real>(Weaver.Context, readOnlyCopy, NoBias ? new Real[OutputCount] : Bias.Data))
    using (var outputBuffer = new ComputeBuffer<Real>(Weaver.Context, writeOnlyAlloc, output.Length))
    {
        ForwardKernel.SetMemoryArgument(0, inputBuffer);
        ForwardKernel.SetMemoryArgument(1, weightBuffer);
        ForwardKernel.SetMemoryArgument(2, biasBuffer);
        ForwardKernel.SetMemoryArgument(3, outputBuffer);
        ForwardKernel.SetValueArgument(4, input.Shape[1]);
        ForwardKernel.SetValueArgument(5, input.Shape[2]);
        ForwardKernel.SetValueArgument(6, input.Length);
        ForwardKernel.SetValueArgument(7, outputWidth);
        ForwardKernel.SetValueArgument(8, outputHeight);
        ForwardKernel.SetValueArgument(9, _subSampleX);
        ForwardKernel.SetValueArgument(10, _subSampleY);
        ForwardKernel.SetValueArgument(11, _trimX);
        ForwardKernel.SetValueArgument(12, _trimY);
        ForwardKernel.SetValueArgument(13, _kHeight);
        ForwardKernel.SetValueArgument(14, _kWidth);
        ForwardKernel.SetValueArgument(15, OutputCount);
        ForwardKernel.SetValueArgument(16, InputCount);

        // Work size handed to Execute: (batch count * OutputCount) by output height by output width.
        long[] workSize = { input.BatchCount * OutputCount, outputHeight, outputWidth };
        Weaver.CommandQueue.Execute(ForwardKernel, null, workSize, null, null);

        // Wait for the queue to drain before reading the output back into the host array.
        Weaver.CommandQueue.Finish();
        Weaver.CommandQueue.ReadFromBuffer(outputBuffer, ref output, true, null);
    }

    var shape = new[] { OutputCount, outputHeight, outputWidth };
    return NdArray.Convert(output, shape, input.BatchCount, this);
}