/// <summary>
/// Sets up the binder: creates the forward and inverse cuFFT plans, obtains the CUDA kernels
/// this class launches, and initialises the three offset fields and <c>Denominator</c>.
/// </summary>
/// <param name="owner">Node forwarded to the base constructor; its <c>GPU</c> value is passed to the kernel factories.</param>
/// <param name="inputSize">Vector length; sizes the FFT plans, the kernel launches and the offsets.</param>
/// <param name="tempBlock">Memory block forwarded to the base constructor.</param>
public MyFourierBinder(MyWorkingNode owner, int inputSize, MyMemoryBlock<float> tempBlock)
    : base(owner, inputSize, tempBlock)
{
    const string combineVectorsModule = @"Common\CombineVectorsKernel";
    const string transformModule = @"Transforms\TransformKernels";

    // Several launch sizes and all offsets are expressed in terms of inputSize + 1.
    int paddedSize = inputSize + 1;

    // Single-batch real-to-complex and complex-to-real plans of length inputSize.
    m_fft = new CudaFFTPlan1D(inputSize, cufftType.R2C, 1);
    m_ifft = new CudaFFTPlan1D(inputSize, cufftType.C2R, 1);

    var factory = MyKernelFactory.Instance;

    m_mulkernel = factory.Kernel(owner.GPU, combineVectorsModule, "MulComplexElementWise");
    m_mulkernel.SetupExecution(paddedSize);

    m_involutionKernel = factory.Kernel(owner.GPU, combineVectorsModule, "InvolveVector");
    m_involutionKernel.SetupExecution(inputSize - 1);

    m_inversionKernel = factory.Kernel(owner.GPU, @"Transforms\InvertValuesKernel", "InvertLengthComplexKernel");
    m_inversionKernel.SetupExecution(inputSize);

    m_dotKernel = MyReductionFactory.Kernel(owner.GPU, MyReductionFactory.Mode.f_DotProduct_f);

    m_normalKernel = factory.Kernel(owner.GPU, transformModule, "PolynomialFunctionKernel");
    m_normalKernel.SetupExecution(inputSize);

    // Offsets are 0, 2 * (inputSize + 1) and 4 * (inputSize + 1);
    // presumably positions inside tempBlock - confirm against the code that uses them.
    m_firstFFTOffset = 0;
    m_secondFFTOffset = paddedSize * 2;
    m_tempOffset = paddedSize * 4;

    Denominator = inputSize;
}
/// <summary>
/// Filters <paramref name="x"/> with the impulse response <paramref name="h"/> on the GPU using
/// one forward D2Z FFT per input, the <c>ComplexMultCUDA</c> kernel on the two spectra
/// (presumably an element-wise complex product - confirm in kernel.ptx) and one inverse Z2D FFT.
/// </summary>
/// <param name="x">Input samples. Must not be longer than <c>N + h.Count - 1</c>, otherwise the host copy throws.</param>
/// <param name="h">Impulse response samples.</param>
/// <param name="N">Block size; the transform length is <c>N + h.Count - 1</c>.</param>
/// <returns>
/// <c>x.Count</c> samples of the result starting at index 500, converted to <c>float</c>;
/// <c>null</c> when transforming the impulse response fails (the error is logged).
/// </returns>
public List<float> hypotesis(List<double> x, List<double> h, int N)
{
    int length = N + h.Count - 1;

    // Zero-padded host copies of both inputs, built before any GPU resource is created
    // so that an oversized input cannot leave a context behind.
    double[] paddedH = new double[length];
    double[] paddedX = new double[length];
    h.CopyTo(paddedH, 0);
    x.CopyTo(paddedX, 0);

    string path = Path.GetDirectoryName(mv.plugins[0].filename);

    // Every GPU resource is scoped by a using statement so it is released on all exit
    // paths, including the early "return null" and any propagated exception.
    using (CudaContext ctx = new CudaContext())
    {
        CudaKernel kernel = ctx.LoadKernel(path + "\\kernel.ptx", "ComplexMultCUDA");
        kernel.GridDimensions = (int)Math.Ceiling((double)length / 1024);
        kernel.BlockDimensions = 1024;

        using (CudaDeviceVariable<double> d_x = new CudaDeviceVariable<double>(length))
        using (CudaDeviceVariable<double2> d_X = new CudaDeviceVariable<double2>(length))
        using (CudaDeviceVariable<double> d_h = new CudaDeviceVariable<double>(length))
        using (CudaDeviceVariable<double2> d_H = new CudaDeviceVariable<double2>(length))
        using (CudaDeviceVariable<double> d_y = new CudaDeviceVariable<double>(length))
        using (CudaFFTPlan1D planForward = new CudaFFTPlan1D(length, cufftType.D2Z, 1))
        using (CudaFFTPlan1D planInverse = new CudaFFTPlan1D(length, cufftType.Z2D, 1))
        {
            try
            {
                // Upload into the buffer that already exists instead of allocating a second one.
                d_h.CopyToDevice(paddedH);
                planForward.Exec(d_h.DevicePointer, d_H.DevicePointer, TransformDirection.Forward);
            }
            catch (Exception exp)
            {
                mainView.log(exp, "CUDA error: Impulse response FFT", this);
                return null;
            }

            try
            {
                d_x.CopyToDevice(paddedX);
                planForward.Exec(d_x.DevicePointer, d_X.DevicePointer);
                kernel.Run(d_H.DevicePointer, d_X.DevicePointer, length);
                planInverse.Exec(d_X.DevicePointer, d_y.DevicePointer);
            }
            catch (Exception exp)
            {
                // Best effort: the failure is logged and whatever d_y holds is still returned.
                mainView.log(exp, "Cuda error: kernel run cuda error", this);
            }

            // The implicit conversion downloads the device buffer into a new host array.
            double[] hostY = d_y;

            // NOTE(review): the fixed start index 500 differs from the h.Count / 2 used by the
            // sibling filters and requires 500 + x.Count <= N + h.Count - 1 - confirm it is intended.
            return Array.ConvertAll<double, float>(hostY, d => (float)d).ToList().GetRange(500, x.Count);
        }
    }
}
/// <summary>
/// Filters <paramref name="x"/> with <paramref name="h"/> on the GPU using in-place Z2Z FFTs:
/// both zero-padded inputs are transformed, the <c>ComplexMultCUDA</c> kernel is run on the two
/// spectra (presumably an element-wise product - confirm in kernel.ptx) and the signal buffer
/// is transformed back.
/// </summary>
/// <param name="x">Input samples.</param>
/// <param name="h">Filter coefficients.</param>
/// <returns>
/// <c>x.Count</c> real parts of the result starting at index <c>h.Count / 2</c>,
/// or <c>null</c> when uploading the data to the device fails.
/// </returns>
public List<float> CUDA_FIR(List<float> x, List<double> h)
{
    int length = x.Count + h.Count - 1;

    // Host buffers in complex layout; only the real part is filled, the rest stays zero.
    double2[] hostX = new double2[length];
    double2[] hostH = new double2[length];
    for (int i = 0; i < x.Count; i++)
    {
        hostX[i].x = x[i];
    }
    for (int i = 0; i < h.Count; i++)
    {
        hostH[i].x = h[i];
    }

    using (CudaContext ctx = new CudaContext())
    using (CudaFFTPlan1D plan1D = new CudaFFTPlan1D(length, cufftType.Z2Z, 1))
    {
        CudaKernel kernel = ctx.LoadKernel("kernel.ptx", "ComplexMultCUDA");
        kernel.GridDimensions = (int)Math.Ceiling((double)length / 1024);
        kernel.BlockDimensions = 1024;

        CudaDeviceVariable<double2> d_x = null;
        CudaDeviceVariable<double2> d_h = null;
        try
        {
            try
            {
                // The implicit conversions allocate device memory and upload the arrays.
                d_x = hostX;
                d_h = hostH;
            }
            catch (Exception)
            {
                // Callers treat a null result as "GPU upload failed".
                return null;
            }

            plan1D.Exec(d_x.DevicePointer, TransformDirection.Forward);
            plan1D.Exec(d_h.DevicePointer, TransformDirection.Forward);
            kernel.Run(d_h.DevicePointer, d_x.DevicePointer, length);
            plan1D.Exec(d_x.DevicePointer, TransformDirection.Inverse);

            // Download the in-place result.
            double2[] hostY = d_x;
            return hostY.Select(data => (float)data.x).ToList().GetRange(h.Count / 2, x.Count);
        }
        finally
        {
            // Release device memory on every exit path, before the context goes away.
            if (d_x != null)
            {
                d_x.Dispose();
            }
            if (d_h != null)
            {
                d_h.Dispose();
            }
        }
    }
}
/// <summary>
/// Creates the forward and inverse cuFFT plans and obtains the four CUDA kernels used by this
/// object, sizing every plan and launch from <c>Owner.InputSize</c>.
/// </summary>
/// <param name="nGPU">Value handed to the kernel factory for every kernel (presumably the GPU index).</param>
public override void Init(int nGPU)
{
    const string combineVectorsModule = @"Common\CombineVectorsKernel";
    const string transformModule = @"Transforms\TransformKernels";

    int size = Owner.InputSize;

    // Single-batch real-to-complex and complex-to-real plans of length InputSize.
    fft = new CudaFFTPlan1D(size, cufftType.R2C, 1);
    ifft = new CudaFFTPlan1D(size, cufftType.C2R, 1);

    var factory = MyKernelFactory.Instance;

    m_kernel = factory.Kernel(nGPU, combineVectorsModule, "MulComplexElementWise");
    m_kernel.SetupExecution(size + 1);

    m_involutionKernel = factory.Kernel(nGPU, combineVectorsModule, "InvolveVector");
    m_involutionKernel.SetupExecution(size - 1);

    m_linearCombKernel = factory.Kernel(nGPU, transformModule, "LinearCombinationKernel");
    m_linearCombKernel.SetupExecution(size);

    m_normalKernel = factory.Kernel(nGPU, transformModule, "PolynomialFunctionKernel");
    m_normalKernel.SetupExecution(size);
}
/// <summary>
/// Builds a <c>Resolution x Resolution</c> test pattern (value <c>i + 2 * j</c> with its mean removed),
/// runs a batch of <c>Resolution</c> forward C2C FFTs of length <c>Resolution</c> over the
/// contiguous buffer (one transform per row), passes the spectrum through the
/// <c>cu_ArrayInversion</c> kernel and stores the real and imaginary parts in <c>FFTData2D</c>.
/// </summary>
public void cuFFTreconstruct()
{
    int total = Resolution * Resolution;

    // The using statements guarantee that device memory, the plan and the context are
    // released even when a CUDA call throws.
    using (CudaContext ctx = new CudaContext(0))
    {
        ManagedCuda.BasicTypes.CUmodule cumodule = ctx.LoadModule("kernel.ptx");
        CudaKernel cuKernel = new CudaKernel("cu_ArrayInversion", cumodule, ctx);

        float2[] fData = new float2[total];
        float2[] result = new float2[total];
        FFTData2D = new float[Resolution, Resolution, 2];

        using (CudaDeviceVariable<float2> devData = new CudaDeviceVariable<float2>(total))
        using (CudaDeviceVariable<float2> copy_devData = new CudaDeviceVariable<float2>(total))
        {
            // First pass: fill the pattern and accumulate its sum.
            double sum = 0.0;
            for (int i = 0; i < Resolution; i++)
            {
                for (int j = 0; j < Resolution; j++)
                {
                    int index = i * Resolution + j;
                    fData[index].x = i + j * 2;
                    fData[index].y = 0.0f;
                    sum += fData[index].x;
                }
            }

            // Second pass: subtract the mean from every real part.
            float mean = (float)(sum / (double)total);
            for (int index = 0; index < fData.Length; index++)
            {
                fData[index].x = fData[index].x - mean;
            }

            devData.CopyToDevice(fData);

            using (CudaFFTPlan1D plan1D = new CudaFFTPlan1D(Resolution, cufftType.C2C, Resolution))
            {
                plan1D.Exec(devData.DevicePointer, TransformDirection.Forward);

                // NOTE(review): the grid's x extent uses integer division, so Resolution is
                // presumably expected to be a multiple of cuda_blockNum - confirm.
                cuKernel.GridDimensions = new ManagedCuda.VectorTypes.dim3(Resolution / cuda_blockNum, Resolution, 1);
                cuKernel.BlockDimensions = new ManagedCuda.VectorTypes.dim3(cuda_blockNum, 1, 1);
                cuKernel.Run(devData.DevicePointer, copy_devData.DevicePointer, Resolution);

                copy_devData.CopyToHost(result);
            }
        }

        // Unpack the interleaved complex result into the [row, column, re/im] array.
        for (int i = 0; i < Resolution; i++)
        {
            for (int j = 0; j < Resolution; j++)
            {
                FFTData2D[i, j, 0] = result[i * Resolution + j].x;
                FFTData2D[i, j, 1] = result[i * Resolution + j].y;
            }
        }

        CudaContext.ProfilerStop();
    }
}
/// <summary>
/// simpleCUFFT sample entry point: convolves a random signal with a random filter on the GPU
/// (forward C2C FFT of both, <c>ComplexPointwiseMulAndScale</c> kernel, inverse FFT), repeats the
/// convolution on the host with <c>Convolve</c>, and prints whether the two results agree.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    const int signalSize = 50;
    const int filterKernelSize = 11;

    Console.WriteLine("[simpleCUFFT] is starting...");

    var assembly = Assembly.GetExecutingAssembly();
    var resourceName = "simpleCUFFT.simpleCUFFTKernel.ptx";

    bool bTestResult;

    // Context, device buffers and plan are scoped so they are released even if a CUDA call throws.
    using (CudaContext ctx = new CudaContext(0))
    {
        CudaKernel ComplexPointwiseMulAndScale;
        using (Stream stream = assembly.GetManifestResourceStream(resourceName))
        {
            // GetManifestResourceStream returns null when the resource is missing; fail with a clear message.
            if (stream == null)
            {
                throw new InvalidOperationException("Embedded resource '" + resourceName + "' was not found.");
            }
            ComplexPointwiseMulAndScale = ctx.LoadKernelPTX(stream, "ComplexPointwiseMulAndScale");
        }

        // Fixed seed keeps the sample reproducible; the signal is drawn before the filter.
        Random rand = new Random(0);

        // Host memory for the signal (real part random, imaginary part zero).
        cuFloatComplex[] h_signal = new cuFloatComplex[signalSize];
        for (int i = 0; i < signalSize; ++i)
        {
            h_signal[i].real = (float)rand.NextDouble();
            h_signal[i].imag = 0;
        }

        // Host memory for the filter, initialised the same way.
        cuFloatComplex[] h_filter_kernel = new cuFloatComplex[filterKernelSize];
        for (int i = 0; i < filterKernelSize; ++i)
        {
            h_filter_kernel[i].real = (float)rand.NextDouble();
            h_filter_kernel[i].imag = 0;
        }

        // Pad signal and filter kernel; PadData returns the common padded length.
        cuFloatComplex[] h_padded_signal = null;
        cuFloatComplex[] h_padded_filter_kernel = null;
        int new_size = PadData(h_signal, ref h_padded_signal, signalSize,
                               h_filter_kernel, ref h_padded_filter_kernel, filterKernelSize);

        using (CudaDeviceVariable<cuFloatComplex> d_signal = new CudaDeviceVariable<cuFloatComplex>(new_size))
        using (CudaDeviceVariable<cuFloatComplex> d_filter_kernel = new CudaDeviceVariable<cuFloatComplex>(new_size))
        using (CudaFFTPlan1D plan = new CudaFFTPlan1D(new_size, cufftType.C2C, 1))
        {
            d_signal.CopyToDevice(h_padded_signal);
            d_filter_kernel.CopyToDevice(h_padded_filter_kernel);

            // Transform signal and kernel in place.
            Console.WriteLine("Transforming signal cufftExecC2C");
            plan.Exec(d_signal.DevicePointer, TransformDirection.Forward);
            plan.Exec(d_filter_kernel.DevicePointer, TransformDirection.Forward);

            // Multiply the coefficients together and normalize the result by 1 / new_size.
            Console.WriteLine("Launching ComplexPointwiseMulAndScale<<< >>>");
            ComplexPointwiseMulAndScale.BlockDimensions = 256;
            ComplexPointwiseMulAndScale.GridDimensions = 32;
            ComplexPointwiseMulAndScale.Run(d_signal.DevicePointer, d_filter_kernel.DevicePointer, new_size, 1.0f / new_size);

            // Transform signal back.
            Console.WriteLine("Transforming signal back cufftExecC2C");
            plan.Exec(d_signal.DevicePointer, TransformDirection.Inverse);

            // The implicit conversion copies the device buffer into a new host array.
            cuFloatComplex[] h_convolved_signal = d_signal;

            // Reference convolution on the host.
            cuFloatComplex[] h_convolved_signal_ref = new cuFloatComplex[signalSize];
            Convolve(h_signal, signalSize, h_filter_kernel, filterKernelSize, h_convolved_signal_ref);

            bTestResult = sdkCompareL2fe(h_convolved_signal_ref, h_convolved_signal, 1e-5f);
        }
    }

    if (bTestResult)
    {
        Console.WriteLine("Test Passed");
    }
    else
    {
        Console.WriteLine("Test Failed");
    }
}
/// <summary>
/// Filters a long signal on the GPU in overlapping segments. The input is prefixed with
/// <c>h.Count - 1</c> zeros and cut into segments of length <c>N + h.Count - 1</c> that start every
/// <c>N</c> samples; each segment goes through a forward D2Z FFT, the <c>ComplexMultCUDA</c> kernel
/// together with the transformed impulse response (presumably a spectrum product - confirm in
/// kernel.ptx) and an inverse Z2D FFT. The segments are merged by <c>OverlapSave</c>.
/// </summary>
/// <param name="xx">Input samples.</param>
/// <param name="h">Impulse response samples.</param>
/// <param name="N">Number of new input samples consumed per segment.</param>
/// <returns>
/// <c>xx.Count</c> samples of the merged result starting at index <c>h.Count / 2</c>,
/// or <c>null</c> when transforming the impulse response fails (the error is logged).
/// </returns>
public List<float> hypotesis_long_save(List<double> xx, List<double> h, int N)
{
    int overlap = h.Count - 1;
    int segmentLength = N + overlap;

    // Segment count; the tiny epsilon is kept exactly as before so the count does not change.
    int n = (int)Math.Ceiling((double)(xx.Count + 0.000000000001) / N);

    // Input prefixed with `overlap` zeros and zero-padded to n * N + overlap samples
    // (equal to n * segmentLength - (n - 1) * overlap).
    double[] padded = new double[n * N + overlap];
    xx.CopyTo(padded, overlap);

    double[][] temp_y = new double[n][];
    double[] hostH = new double[segmentLength];
    double[] hostSegment = new double[segmentLength];
    h.CopyTo(hostH, 0);

    string path = Path.GetDirectoryName(mv.plugins[0].filename);

    // All GPU resources are scoped so they are released on every exit path, including
    // the early "return null".
    using (CudaContext ctx = new CudaContext())
    {
        CudaKernel kernel = ctx.LoadKernel(path + "\\kernel.ptx", "ComplexMultCUDA");
        kernel.GridDimensions = (int)Math.Ceiling((double)segmentLength / 1024);
        kernel.BlockDimensions = 1024;

        using (CudaDeviceVariable<double> d_h = new CudaDeviceVariable<double>(segmentLength))
        using (CudaDeviceVariable<double2> d_H = new CudaDeviceVariable<double2>(segmentLength))
        using (CudaDeviceVariable<double> d_x = new CudaDeviceVariable<double>(segmentLength))
        using (CudaDeviceVariable<double2> d_X = new CudaDeviceVariable<double2>(segmentLength))
        using (CudaFFTPlan1D planForward = new CudaFFTPlan1D(segmentLength, cufftType.D2Z, 1))
        using (CudaFFTPlan1D planInverse = new CudaFFTPlan1D(segmentLength, cufftType.Z2D, 1))
        {
            try
            {
                // Upload into the existing buffer instead of allocating a second one.
                d_h.CopyToDevice(hostH);
                planForward.Exec(d_h.DevicePointer, d_H.DevicePointer, TransformDirection.Forward);
            }
            catch (Exception exp)
            {
                mainView.log(exp, "CUDA error: Impulse response FFT", this);
                return null;
            }

            for (int g = 0; g < n; g++)
            {
                // Segment g starts at g * N in the padded input, so consecutive segments
                // share `overlap` samples.
                Array.Copy(padded, g * N, hostSegment, 0, segmentLength);

                try
                {
                    // The same device buffers are reused for every segment.
                    d_x.CopyToDevice(hostSegment);
                    planForward.Exec(d_x.DevicePointer, d_X.DevicePointer);
                    kernel.Run(d_H.DevicePointer, d_X.DevicePointer, segmentLength);
                    planInverse.Exec(d_X.DevicePointer, d_x.DevicePointer);
                }
                catch (Exception exp)
                {
                    // Best effort: the failure is logged and the segment is still downloaded.
                    mainView.log(exp, "Cuda error: kernel run cuda error", this);
                }

                // The implicit conversion downloads the device buffer into a new host array.
                temp_y[g] = d_x;
            }
        }
    }

    return OverlapSave(temp_y, h.Count, segmentLength).GetRange(h.Count / 2, xx.Count);
}
/// <summary>
/// Filters a long signal on the GPU block by block. Each block takes up to <c>N</c> input samples,
/// is zero-padded to <c>N + h.Count - 1</c>, and goes through a forward D2Z FFT, the
/// <c>ComplexMultCUDA</c> kernel together with the transformed impulse response (presumably a
/// spectrum product - confirm in kernel.ptx) and an inverse Z2D FFT. The blocks are merged by
/// <c>OverlapAdd</c>.
/// </summary>
/// <param name="x">Input samples.</param>
/// <param name="h">Impulse response samples.</param>
/// <param name="N">Number of input samples per block.</param>
/// <returns>
/// <c>x.Count</c> samples of the merged result starting at index <c>h.Count / 2</c>,
/// or <c>null</c> when transforming the impulse response fails (the error is logged).
/// </returns>
public List<float> hypotesis_long(List<double> x, List<double> h, int N)
{
    int segmentLength = N + h.Count - 1;

    // Each block consumes N input samples, so ceil(x.Count / N) blocks are needed to cover
    // the whole input. The ratio of padded lengths can be smaller than that and would drop
    // the tail of the signal; it is kept only as a lower bound (it matters for empty input).
    int paddedRatioBlocks = (int)Math.Ceiling((double)(x.Count + h.Count - 1) / segmentLength);
    int blocks = Math.Max(paddedRatioBlocks, (int)Math.Ceiling((double)x.Count / N));

    double[][] temp_y = new double[blocks][];
    double[] hostH = new double[segmentLength];
    double[] hostSegment = new double[segmentLength];
    h.CopyTo(hostH, 0);

    string path = Path.GetDirectoryName(mv.plugins[0].filename);

    // All GPU resources are scoped so they are released on every exit path.
    using (CudaContext ctx = new CudaContext())
    {
        CudaKernel kernel = ctx.LoadKernel(path + "\\kernel.ptx", "ComplexMultCUDA");
        kernel.GridDimensions = (int)Math.Ceiling((double)segmentLength / 1024);
        kernel.BlockDimensions = 1024;

        using (CudaDeviceVariable<double> d_h = new CudaDeviceVariable<double>(segmentLength))
        using (CudaDeviceVariable<double2> d_H = new CudaDeviceVariable<double2>(segmentLength))
        using (CudaDeviceVariable<double> d_x = new CudaDeviceVariable<double>(segmentLength))
        using (CudaDeviceVariable<double2> d_X = new CudaDeviceVariable<double2>(segmentLength))
        using (CudaFFTPlan1D planForward = new CudaFFTPlan1D(segmentLength, cufftType.D2Z, 1))
        using (CudaFFTPlan1D planInverse = new CudaFFTPlan1D(segmentLength, cufftType.Z2D, 1))
        {
            try
            {
                // Upload into the existing buffer instead of allocating a second one.
                d_h.CopyToDevice(hostH);
                planForward.Exec(d_h.DevicePointer, d_H.DevicePointer, TransformDirection.Forward);
            }
            catch (Exception exp)
            {
                mainView.log(exp, "CUDA error: Impulse response FFT", this);
                return null;
            }

            for (int g = 0; g < blocks; g++)
            {
                int offset = N * g;
                int count = Math.Min(N, x.Count - offset);

                x.CopyTo(offset, hostSegment, 0, count);
                // The host buffer is reused, so a short final block must not keep samples
                // of the previous block behind its own data.
                Array.Clear(hostSegment, count, segmentLength - count);

                try
                {
                    // One device buffer pair is reused for all blocks instead of allocating
                    // a new one per iteration.
                    d_x.CopyToDevice(hostSegment);
                    planForward.Exec(d_x.DevicePointer, d_X.DevicePointer);
                    kernel.Run(d_H.DevicePointer, d_X.DevicePointer, segmentLength);
                    planInverse.Exec(d_X.DevicePointer, d_x.DevicePointer);
                }
                catch (Exception exp)
                {
                    // Best effort: the failure is logged and the block is still downloaded.
                    mainView.log(exp, "Cuda error: kernel run cuda error", this);
                }

                // The implicit conversion downloads the device buffer into a new host array.
                temp_y[g] = d_x;
            }
        }
    }

    return OverlapAdd(temp_y, h.Count).GetRange(h.Count / 2, x.Count);
}
/// <summary>
/// Filters a long signal on the GPU in blocks of 2,000,000 samples using in-place Z2Z FFTs.
/// Each block is zero-padded to <c>N + h.Count - 1</c>, transformed, passed through the
/// <c>ComplexMultCUDA</c> kernel together with the transformed filter (presumably a spectrum
/// product - confirm in kernel.ptx) and transformed back. The blocks are merged by <c>OverlapAdd</c>.
/// </summary>
/// <param name="x">Input samples.</param>
/// <param name="h">Filter coefficients.</param>
/// <returns>
/// <c>x.Count</c> samples of the merged result starting at index <c>h.Count / 2</c>,
/// or <c>null</c> when uploading data to the device fails (the error is logged).
/// </returns>
public List<float> CUDA_FIR_long(List<float> x, List<double> h)
{
    int N = 2000000;
    int segmentLength = N + h.Count - 1;

    // Each block consumes N input samples, so ceil(x.Count / N) blocks are needed to cover
    // the whole input. The ratio of padded lengths can be smaller than that and would drop
    // the tail of the signal; it is kept only as a lower bound (it matters for empty input).
    int paddedRatioBlocks = (int)Math.Ceiling((double)(x.Count + h.Count - 1) / segmentLength);
    int blocks = Math.Max(paddedRatioBlocks, (int)Math.Ceiling((double)x.Count / N));

    string path = Path.GetDirectoryName(mv.plugins[0].filename);

    double2[][] temp_y = new double2[blocks][];
    double2[] hostSegment = new double2[segmentLength];

    // Filter in complex layout: real part filled, everything else zero.
    double2[] hostH = new double2[segmentLength];
    for (int i = 0; i < h.Count; i++)
    {
        hostH[i].x = h[i];
    }

    using (CudaContext ctx = new CudaContext())
    using (CudaFFTPlan1D plan1D = new CudaFFTPlan1D(segmentLength, cufftType.Z2Z, 1))
    {
        CudaKernel kernel = ctx.LoadKernel(path + "\\kernel.ptx", "ComplexMultCUDA");
        kernel.GridDimensions = (int)Math.Ceiling((double)segmentLength / 1024);
        kernel.BlockDimensions = 1024;

        CudaDeviceVariable<double2> d_h = null;
        CudaDeviceVariable<double2> d_x = null;
        try
        {
            try
            {
                // The implicit conversion allocates device memory and uploads the array.
                d_h = hostH;
            }
            catch (Exception e)
            {
                mainView.log(e, "cuda alloc data error", this);
                return null;
            }

            plan1D.Exec(d_h.DevicePointer, TransformDirection.Forward);

            for (int g = 0; g < blocks; g++)
            {
                // Rebuild the zero-padded block in the reusable host buffer.
                Array.Clear(hostSegment, 0, segmentLength);
                int offset = g * N;
                for (int i = 0; (offset + i) < x.Count && i < N; i++)
                {
                    hostSegment[i].x = x[offset + i];
                }

                try
                {
                    // One device buffer is allocated lazily and reused for every block.
                    if (d_x == null)
                    {
                        d_x = new CudaDeviceVariable<double2>(segmentLength);
                    }
                    d_x.CopyToDevice(hostSegment);
                }
                catch (Exception e)
                {
                    mainView.log(e, "cuda alloc data error", this);
                    return null;
                }

                try
                {
                    plan1D.Exec(d_x.DevicePointer, TransformDirection.Forward);
                    kernel.Run(d_h.DevicePointer, d_x.DevicePointer, segmentLength);
                    plan1D.Exec(d_x.DevicePointer, TransformDirection.Inverse);
                }
                catch (Exception exp)
                {
                    // Best effort: the failure is logged and the block is still downloaded.
                    mainView.log(exp, "kernel run cuda error", this);
                }

                // The implicit conversion downloads the device buffer into a new host array.
                temp_y[g] = d_x;
            }
        }
        finally
        {
            // Release device memory on every exit path, before the context goes away.
            if (d_x != null)
            {
                d_x.Dispose();
            }
            if (d_h != null)
            {
                d_h.Dispose();
            }
        }
    }

    return OverlapAdd(temp_y, h.Count).GetRange(h.Count / 2, x.Count);
}