public static float2[] calculateCudaFFT(float[] h_dataIn) { CudaContext cntxt = new CudaContext(); //Caution: Array sizes matter! Based on CUFFFT-Documentation... int size_real = h_dataIn.Length; int size_complex = (int)Math.Floor(size_real / 2.0) + 1; //Crating FFT Plan CudaFFTPlanMany fftPlan = new CudaFFTPlanMany(1, new int[] { size_real }, 1, cufftType.R2C); //Size of d_data must be padded for inplace R2C transforms: size_complex * 2 and not size_real CudaDeviceVariable <float> d_data = new CudaDeviceVariable <float>(size_complex * 2); //device allocation and host have different sizes, why the amount of data must be given explicitly for copying: d_data.CopyToDevice(h_dataIn, 0, 0, size_real * sizeof(float)); //executa plan fftPlan.Exec(d_data.DevicePointer, TransformDirection.Forward); //Output to host, either as float2 or float, but array sizes must be right! float2[] h_dataOut = new float2[size_complex]; float[] h_dataOut2 = new float[size_complex * 2]; d_data.CopyToHost(h_dataOut); d_data.CopyToHost(h_dataOut2); fftPlan.Dispose(); return(h_dataOut); }
public void Run(DistanceOperation operation, CudaDeviceVariable <float> A, int sizeA, CudaDeviceVariable <float> B, int sizeB, CudaDeviceVariable <float> result, int sizeRes) { if (!ValidateAtRun(operation)) { return; } switch (operation) { case DistanceOperation.DotProd: //ZXC m_dotKernel.Run(result.DevicePointer, 0, A.DevicePointer, B.DevicePointer, sizeA, 0); m_dotKernel.Run(result.DevicePointer, A.DevicePointer, B.DevicePointer, sizeA); break; case DistanceOperation.CosDist: //ZXC m_cosKernel.Run(result.DevicePointer, 0, A.DevicePointer, B.DevicePointer, sizeA, 0); m_cosKernel.Run(result.DevicePointer, A.DevicePointer, B.DevicePointer, sizeA); break; case DistanceOperation.EuclidDist: float res = RunReturn(operation, A, sizeA, B, sizeB); result.CopyToDevice(res); break; case DistanceOperation.EuclidDistSquared: m_combineVecsKernel.SetupExecution(sizeA); m_combineVecsKernel.Run(A.DevicePointer, B.DevicePointer, m_temp, (int)MyJoin.MyJoinOperation.Subtraction, sizeA); //ZXC m_dotKernel.Run(result.DevicePointer, 0, m_temp, m_temp, m_temp.Count, 0); m_dotKernel.Run(result.DevicePointer, m_temp, m_temp); break; case DistanceOperation.HammingDist: m_combineVecsKernel.SetupExecution(sizeA); m_combineVecsKernel.Run(A.DevicePointer, B.DevicePointer, m_temp, (int)MyJoin.MyJoinOperation.Equal, sizeA); //ZXC m_reduceSumKernel.Run(result.DevicePointer, m_temp, m_temp.Count, 0, 0, 1, /*distributed = false*/0); // reduction to a single number m_reduceSumKernel.Run(result.DevicePointer, m_temp); float fDist = 0; // to transform number of matches to a number of differences result.CopyToHost(ref fDist); fDist = m_temp.Count - fDist; result.CopyToDevice(fDist); break; case DistanceOperation.HammingSim: m_combineVecsKernel.SetupExecution(sizeA); m_combineVecsKernel.Run(A.DevicePointer, B.DevicePointer, m_temp, (int)MyJoin.MyJoinOperation.Equal, sizeA); //ZXC m_reduceSumKernel.Run(result.DevicePointer, m_temp, m_temp.Count, 0, 0, 1, /*distributed = false*/0); // reduction to a single number m_reduceSumKernel.Run(result.DevicePointer, m_temp); // take the single number (number of different bits) and convert it to Hamming Similarity: // a number in range <0,1> that says how much the vectors are similar float fSim = 0; result.CopyToHost(ref fSim); fSim = fSim / m_temp.Count; result.CopyToDevice(fSim); break; } }
internal MinMax FindMinAndMax(CudaDeviceVariable <float> a, int size) { if (size > 0) { var ptr = a; while (size > BLOCK_DIM2) { var bufferSize = (size / BLOCK_DIM2) + 1; using (var minBlock = new CudaDeviceVariable <float>(bufferSize)) using (var maxBlock = new CudaDeviceVariable <float>(bufferSize)) { minBlock.Memset(0); maxBlock.Memset(0); _Use(_findMinAndMax, size, k => k.Run(BLOCK_DIM2, ptr.DevicePointer, size, minBlock.DevicePointer, maxBlock.DevicePointer)); if (ptr != a) { ptr.Dispose(); } var minTest = new float[bufferSize]; var maxText = new float[bufferSize]; minBlock.CopyToHost(minTest); maxBlock.CopyToHost(maxText); size = bufferSize * 2; ptr = new CudaDeviceVariable <float>(size); ptr.CopyToDevice(minBlock, 0, 0, bufferSize * sizeof(float)); ptr.CopyToDevice(maxBlock, 0, bufferSize * sizeof(float), bufferSize * sizeof(float)); var test = new float[size]; ptr.CopyToHost(test); } } var data = new float[size]; ptr.CopyToHost(data); float min = float.MaxValue, max = float.MinValue; for (var i = 0; i < size; i++) { var val = data[i]; if (val > max) { max = val; } if (val < min) { min = val; } } return(new MinMax(min, max)); } return(MinMax.Empty); }
// Update is called once per frame void Update() { if (wave) { _wall_x = (float)(wall_x + 8 * Math.Sin(t * 1.2f)); t += dt; } for (int i = 0; i < iter; ++i) { // rho cudaKernel[0].Run(d_pPBF_particle.DevicePointer, d_np.DevicePointer, h, NUM_OF_P); // lambda cudaKernel[1].Run(d_pPBF_particle.DevicePointer, d_np.DevicePointer, h, NUM_OF_P); // update x cudaKernel[2].Run(d_pPBF_particle.DevicePointer, d_np.DevicePointer, h, NUM_OF_P, _wall_x, wall_z); } // apply v and x :: F_ext cudaKernel[3].Run(d_pPBF_particle.DevicePointer, d_pPBF_pos.DevicePointer, d_pPBF_col.DevicePointer, g, dt, NUM_OF_P); d_pPBF_pos.CopyToHost(h_pPBF_pos); d_pPBF_col.CopyToHost(h_pPBF_col); //Debug.Log("Pos:" + h_pPBF_pos[10]); posBuf.SetData(h_pPBF_pos); colBuf.SetData(h_pPBF_col); }
public void Backward(CudnnSoftmaxAlgorithm algorithm, CudnnSoftmaxMode mode, CudnnTensorDescriptor srcTensor, float[] srcData, CudnnTensorDescriptor srcDiffTensor, float[] srcDiffData, CudnnTensorDescriptor destDiffTensor, float[] destDiffData) { Contract.Requires(srcTensor != null); Contract.Requires(srcData != null); Contract.Requires(srcDiffTensor != null); Contract.Requires(srcDiffData != null); Contract.Requires(destDiffTensor != null); Contract.Requires(destDiffData != null); ThrowIfNotInitialized(); CheckIfCompatible(CudnnType.Float, srcTensor, srcDiffTensor, destDiffTensor); using (var srcDataGpu = new CudaDeviceVariable <float>(srcData.Length)) using (var srcDiffDataGpu = new CudaDeviceVariable <float>(srcDiffData.Length)) using (var destDiffDataGpu = new CudaDeviceVariable <float>(destDiffData.Length)) { srcDataGpu.CopyToDevice(srcData); srcDiffDataGpu.CopyToDevice(srcDiffData); Invoke(() => CudnnNativeMethods.cudnnSoftmaxBackward(handle, algorithm, mode, srcTensor.Handle, srcDataGpu.DevicePointer, srcDiffTensor.Handle, srcDiffDataGpu.DevicePointer, destDiffTensor.Handle, destDiffDataGpu.DevicePointer)); destDiffDataGpu.CopyToHost(destDiffData); } }
public void Backward(CudnnPoolingDescriptor pooling, CudnnTensorDescriptor srcTensor, double[] srcData, CudnnTensorDescriptor srcDiffTensor, double[] srcDiffData, CudnnTensorDescriptor destTensor, double[] destData, CudnnTensorDescriptor destDiffTensor, double[] destDiffData) { Contract.Requires(pooling != null); Contract.Requires(srcTensor != null); Contract.Requires(srcData != null); Contract.Requires(destTensor != null); Contract.Requires(destData != null); Contract.Requires(srcDiffTensor != null); Contract.Requires(srcDiffData != null); Contract.Requires(destDiffTensor != null); Contract.Requires(destDiffData != null); ThrowIfNotInitialized(); CheckIfCompatible(CudnnType.Double, srcTensor, srcDiffTensor, destTensor, destDiffTensor); using (var srcDataGpu = new CudaDeviceVariable <double>(srcData.Length)) using (var srcDiffDataGpu = new CudaDeviceVariable <double>(srcDiffData.Length)) using (var destDataGpu = new CudaDeviceVariable <double>(destData.Length)) using (var destDiffDataGpu = new CudaDeviceVariable <double>(destDiffData.Length)) { srcDataGpu.CopyToDevice(srcData); srcDiffDataGpu.CopyToDevice(srcDiffData); destDataGpu.CopyToDevice(destData); Invoke(() => CudnnNativeMethods.cudnnPoolingBackward(handle, pooling.Handle, srcTensor.Handle, srcDataGpu.DevicePointer, srcDiffTensor.Handle, srcDiffDataGpu.DevicePointer, destTensor.Handle, destDataGpu.DevicePointer, destDiffTensor.Handle, destDiffDataGpu.DevicePointer)); destDiffDataGpu.CopyToHost(destDiffData); } }
public void BackwardData(CudnnFilterDescriptor filter, double[] filterData, CudnnTensorDescriptor diffTensor, double[] diffData, CudnnConvolutionDescriptor convolution, CudnnTensorDescriptor gradient, double[] gradientData, CudnnAccumulateResult accumulate) { Contract.Requires(filter != null); Contract.Requires(filterData != null); Contract.Requires(diffTensor != null); Contract.Requires(diffData != null); Contract.Requires(convolution != null); Contract.Requires(gradient != null); Contract.Requires(gradientData != null); ThrowIfNotInitialized(); CheckIfCompatible(CudnnType.Double, filter, diffTensor, gradient); using (var filterDataGpu = new CudaDeviceVariable <double>(filterData.Length)) using (var diffDataGpu = new CudaDeviceVariable <double>(diffData.Length)) using (var gradientDataGpu = new CudaDeviceVariable <double>(gradientData.Length)) { filterDataGpu.CopyToDevice(filterData); diffDataGpu.CopyToDevice(diffData); Invoke(() => CudnnNativeMethods.cudnnConvolutionBackwardData(handle, filter.Handle, filterDataGpu.DevicePointer, diffTensor.Handle, diffDataGpu.DevicePointer, convolution.Handle, gradient.Handle, gradientDataGpu.DevicePointer, accumulate)); gradientDataGpu.CopyToHost(gradientData); } }
// perform Fast Fourier transform private float[] PerformFFT(float[] input) { int size_real = input.Length; int size_complex = (int)Math.Floor(size_real / 2.0) + 1; CudaFFTPlanMany fftPlan = new CudaFFTPlanMany(1, new int[] { size_real }, 1, cufftType.R2C); // size of d_data must be padded for inplace R2C transforms: size_complex * 2 and not size_real CudaDeviceVariable <float> fft = new CudaDeviceVariable <float>(size_complex * 2); // device allocation and host have different sizes, why the amount of data must be given explicitly for copying: fft.CopyToDevice(input, 0, 0, size_real * sizeof(float)); // executa plan fftPlan.Exec(fft.DevicePointer, TransformDirection.Forward); // output to host as float2 float2[] output = new float2[size_complex]; fft.CopyToHost(output); // cleanup fft.Dispose(); fftPlan.Dispose(); // squared magnitude of complex output as a result float[] result = new float[output.Length]; for (int i = 0; i < output.Length; i++) { result[i] = (float)Math.Sqrt((output[i].x * output[i].x) + (output[i].y * output[i].y)); } return(result); }
private T[] RunKernel <T>(Action <T[]> method, T[] parameters) where T : struct { var methodInfo = method.Method; string[] kernels; string llvmIr, ptxIr; var ptx = CudaSharp.CudaSharp.Translate(out kernels, out llvmIr, out ptxIr, "sm_20", methodInfo); Console.WriteLine(llvmIr); Console.WriteLine(ptxIr); var kernel = _context.LoadKernelPTX(ptx, kernels[0]); var maxThreads = kernel.MaxThreadsPerBlock; if (parameters.Length <= maxThreads) { kernel.BlockDimensions = parameters.Length; kernel.GridDimensions = 1; } else { kernel.BlockDimensions = maxThreads; kernel.GridDimensions = parameters.Length / maxThreads; if ((kernel.BlockDimensions * kernel.GridDimensions) != parameters.Length) { throw new Exception(string.Format("Invalid parameters size (must be <= {0} or a multiple of {0}", maxThreads)); } } var gpuMem = new CudaDeviceVariable <T>(parameters.Length); gpuMem.CopyToDevice(parameters); kernel.Run(gpuMem.DevicePointer); gpuMem.CopyToHost(parameters); gpuMem.Dispose(); return(parameters); }
public static void blaa() { int num = 10; //NewContext creation CudaContext cntxt = new CudaContext(); //Module loading from precompiled .ptx in a project output folder CUmodule cumodule = cntxt.LoadModule("kernel.ptx"); //_Z9addKernelPf - function name, can be found in *.ptx file CudaKernel addWithCuda = new CudaKernel("_Z9addKernelPf", cumodule, cntxt); //Create device array for data CudaDeviceVariable <float> vec1_device = new CudaDeviceVariable <float>(num); //Create arrays with data float[] vec1 = new float[num]; //Copy data to device vec1_device.CopyToDevice(vec1); //Set grid and block dimensions addWithCuda.GridDimensions = new dim3(8, 1, 1); addWithCuda.BlockDimensions = new dim3(512, 1, 1); //Run the kernel addWithCuda.Run( vec1_device.DevicePointer); //Copy data from device vec1_device.CopyToHost(vec1); }
public void Backward(CudnnSoftmaxAlgorithm algorithm, CudnnSoftmaxMode mode, CudnnTensorDescriptor srcTensor, float[] srcData, CudnnTensorDescriptor srcDiffTensor, float[] srcDiffData, CudnnTensorDescriptor destDiffTensor, float[] destDiffData) { Contract.Requires(srcTensor != null); Contract.Requires(srcData != null); Contract.Requires(srcDiffTensor != null); Contract.Requires(srcDiffData != null); Contract.Requires(destDiffTensor != null); Contract.Requires(destDiffData != null); ThrowIfNotInitialized(); CheckIfCompatible(CudnnType.Float, srcTensor, srcDiffTensor, destDiffTensor); using (var srcDataGpu = new CudaDeviceVariable<float>(srcData.Length)) using (var srcDiffDataGpu = new CudaDeviceVariable<float>(srcDiffData.Length)) using (var destDiffDataGpu = new CudaDeviceVariable<float>(destDiffData.Length)) { srcDataGpu.CopyToDevice(srcData); srcDiffDataGpu.CopyToDevice(srcDiffData); Invoke(() => CudnnNativeMethods.cudnnSoftmaxBackward(handle, algorithm, mode, srcTensor.Handle, srcDataGpu.DevicePointer, srcDiffTensor.Handle, srcDiffDataGpu.DevicePointer, destDiffTensor.Handle, destDiffDataGpu.DevicePointer)); destDiffDataGpu.CopyToHost(destDiffData); } }
public IIndexableVector AsIndexable() { Debug.Assert(IsValid); var data = new float[_size]; _data.CopyToHost(data); return(_cuda.NumericsProvider.Create(_size, i => data[i]).AsIndexable()); }
public IIndexableMatrix AsIndexable() { Debug.Assert(IsValid); var data = new float[_rows * _columns]; _data.CopyToHost(data); return(_cuda.NumericsProvider.Create(_rows, _columns, (j, k) => data[k * _rows + j]).AsIndexable()); }
internal void CopyFromDeviceToHost() { if (!_initialisedInContext) { InitialiseCudaBuffer(copyHostToDevice: false); } _cudaBuffer.CopyToHost(_data, _cudaZero, _cudaOffsetBytes, _cudaLengthBytes); }
static void Main(string[] args) { InitKernels(); int n = 0x1000000; float[] xx = new float[n]; float[] yy = new float[n]; float[] zz = new float[n]; int[] tt = new int[n]; float L = 512; float Lx = L, Ly = L, Lz = L; CudaDeviceVariable <float> gpu_lengths = new float[] { Lx, Ly, Lz }; for (int i = 0; i < n; i++) { xx[i] = ((float)(i & 0x0000FF) / (float)0x000100 - 0.5F) * Lx; yy[i] = ((float)(i & 0x00FF00) / (float)0x010000 - 0.5F) * Ly; zz[i] = ((float)(i & 0xFF0000) / (float)0x1000000 - 0.5F) * Lz; tt[i] = 1; } SimulationMolecules simulationMolecules = new SimulationMolecules(xx, yy, zz, tt); for (int i = 0; i < 10; i++) { Console.WriteLine($"{i} = ({simulationMolecules.x[i]},{simulationMolecules.y[i]},{simulationMolecules.z[i]})"); } int ii = 0x808080; Console.WriteLine($"{ii} = ({simulationMolecules.x[ii]},{simulationMolecules.y[ii]},{simulationMolecules.z[ii]})"); Interactions interactions = new Interactions(); interactions.SetLJParameters(1, 1, 1.5F, 1.5F); float[] cachedEnergies = new float[n / THREADS_PER_BLOCK + 1]; CudaDeviceVariable <float> gpu_energies = cachedEnergies; energyOfExistingMolecule.Run(n, simulationMolecules.gpu_x.DevicePointer, simulationMolecules.gpu_y.DevicePointer, simulationMolecules.gpu_z.DevicePointer, simulationMolecules.gpu_types.DevicePointer, interactions.SigmaPointer(), interactions.EpsilonPointer(), gpu_lengths.DevicePointer, 2.5F, 0x808080, gpu_energies.DevicePointer ); gpu_energies.CopyToHost(cachedEnergies); double e = cachedEnergies.Sum(); Console.WriteLine($"Energy = {e}"); Console.Read(); }
private static void CopyCudaVariableToHost <T>( CudaDeviceVariable <T> variable, BufferTarget bufferTarget, uint buffer) where T : struct { GL.BindBuffer(bufferTarget, buffer); IntPtr ptr = GL.MapBuffer(bufferTarget, BufferAccess.WriteOnly); variable.CopyToHost(ptr); GL.UnmapBuffer(bufferTarget); GL.BindBuffer(bufferTarget, 0); }
public void TransferUpdatedStateToHost() { if (cz_stride != 0) { // transfer to host dcz and icz g_dcz.CopyToHost(c_dcz, 0, 0, sizeof(double) * cz_stride * FP_DATA_SIZE_CZ); // pmax, tmax g_icz.CopyToHost(c_icz, 0, 0, sizeof(int) * cz_stride * 4); // cz_failed // infer failed state, pmax[] and tmax[] Parallel.For(0, mc.nonFailedCZs.Length, i => { CZ cz = mc.nonFailedCZs[i]; if (!cz.failed) { bool tentative_fail = c_icz[i + cz_stride * (TENTATIVE_FAILED_OFFSET_CZ)] == 0 ? false : true; if (tentative_fail) { cz.failed = true; } for (int j = 0; j < 3; j++) { cz.pmax[j] = c_dcz[i + cz_stride * (TENTATIVE_PMAX_OFFSET_CZ + j)]; cz.tmax[j] = c_dcz[i + cz_stride * (TENTATIVE_TMAX_OFFSET_CZ + j)]; } cz.avgDn = c_dcz[i + cz_stride * DELTA_N_OFFSET_CZ]; cz.avgDt = c_dcz[i + cz_stride * DELTA_T_OFFSET_CZ]; cz.avgTn = c_dcz[i + cz_stride * T_N_OFFSET_CZ]; cz.avgTt = c_dcz[i + cz_stride * T_T_OFFSET_CZ]; if (cz.maxAvgDn < cz.avgDn) { cz.maxAvgDn = cz.avgDn; } if (cz.maxAvgDt < cz.avgDt) { cz.maxAvgDt = cz.avgDt; } } }); } // copy back elastic forces (should be zero on rigid objects) g_dn.CopyToHost(c_dn, sizeof(double) * nd_stride * F_OFFSET, sizeof(double) * nd_stride * F_OFFSET, sizeof(double) * nd_stride * 3); Parallel.For(0, mc.allNodes.Length, i => { Node nd = mc.allNodes[i]; nd.fx = c_dn[i + nd_stride * (F_OFFSET + 0)]; nd.fy = c_dn[i + nd_stride * (F_OFFSET + 1)]; nd.fz = c_dn[i + nd_stride * (F_OFFSET + 2)]; }); }
public cuDoubleComplex[] PerformFFT(cuDoubleComplex[] data, int n, TransformDirection direction) { f_plan = new CudaFFTPlan2D(n, n, cufftType.Z2Z); CudaDeviceVariable <cuDoubleComplex> d_signal = new CudaDeviceVariable <cuDoubleComplex>(n * n); CudaDeviceVariable <cuDoubleComplex> o_signal = new CudaDeviceVariable <cuDoubleComplex>(n * n); d_signal.CopyToDevice(data); f_plan.Exec(d_signal.DevicePointer, o_signal.DevicePointer, direction); cuDoubleComplex[] result = new cuDoubleComplex[n * n]; o_signal.CopyToHost(result); d_signal.Dispose(); return(result); }
static void Test(byte[] ptxFile) { const int size = 16; var context = new CudaContext(); var kernel = context.LoadKernelPTX(ptxFile, "kernel"); var memory = context.AllocateMemory(4 * size); var gpuMemory = new CudaDeviceVariable<int>(memory); var cpuMemory = new int[size]; for (var i = 0; i < size; i++) cpuMemory[i] = i - 2; gpuMemory.CopyToDevice(cpuMemory); kernel.BlockDimensions = 4; kernel.GridDimensions = 4; kernel.Run(memory); gpuMemory.CopyToHost(cpuMemory); for (var i = 0; i < size; i++) Console.WriteLine("{0} = {1}", i, cpuMemory[i]); }
public void BackwardBias(CudnnTensorDescriptor srcTensor, double[] srcData, CudnnTensorDescriptor destTensor, double[] destData, CudnnAccumulateResult accumulate) { Contract.Requires(srcTensor != null); Contract.Requires(srcData != null); Contract.Requires(destTensor != null); Contract.Requires(destData != null); ThrowIfNotInitialized(); CheckIfCompatible(CudnnType.Double, srcTensor, destTensor); using (var srcDataGpu = new CudaDeviceVariable <double>(srcData.Length)) using (var destDataGpu = new CudaDeviceVariable <double>(destData.Length)) { srcDataGpu.CopyToDevice(srcData); Invoke(() => CudnnNativeMethods.cudnnConvolutionBackwardBias(handle, srcTensor.Handle, srcDataGpu.DevicePointer, destTensor.Handle, destDataGpu.DevicePointer, accumulate)); destDataGpu.CopyToHost(destData); } }
public override void Proccess() { // run CUDA method myKernel.Run( result_dev.DevicePointer, resultCalc_dev.DevicePointer, input1_dev.DevicePointer, input2_dev.DevicePointer, input3_dev.DevicePointer, input4_dev.DevicePointer, DataGenerator.InputCount, DataGenerator.Width, DataGenerator.Height ); // copy return to host result_dev.CopyToHost(resultsBytes); resultCalc_dev.CopyToHost(calculatables); }
public void BackwardBias(CudnnTensorDescriptor srcTensor, double[] srcData, CudnnTensorDescriptor destTensor, double[] destData, CudnnAccumulateResult accumulate) { Contract.Requires(srcTensor != null); Contract.Requires(srcData != null); Contract.Requires(destTensor != null); Contract.Requires(destData != null); ThrowIfNotInitialized(); CheckIfCompatible(CudnnType.Double, srcTensor, destTensor); using (var srcDataGpu = new CudaDeviceVariable<double>(srcData.Length)) using (var destDataGpu = new CudaDeviceVariable<double>(destData.Length)) { srcDataGpu.CopyToDevice(srcData); Invoke(() => CudnnNativeMethods.cudnnConvolutionBackwardBias(handle, srcTensor.Handle, srcDataGpu.DevicePointer, destTensor.Handle, destDataGpu.DevicePointer, accumulate)); destDataGpu.CopyToHost(destData); } }
public ResultPoint[] FindPoints(byte[] baseImage, byte[] nextImage, ResultPoint[] points, int searchDelta, int subsetDelta, int BitmapWidth, int BitmapHeight, int PointsinX, int PointsinY) { baseImageBuffer = new CudaDeviceVariable <byte>(BitmapWidth * BitmapHeight); baseImageBuffer.CopyToDevice(baseImage); nextImageBuffer = new CudaDeviceVariable <byte>(BitmapWidth * BitmapHeight); nextImageBuffer.CopyToDevice(nextImage); pointsBuffer = new CudaDeviceVariable <ResultPoint>(PointsinX * PointsinY); pointsBuffer.CopyToDevice(points); kernel.BlockDimensions = new ManagedCuda.VectorTypes.dim3(PointsinX, 1, 1); kernel.GridDimensions = new ManagedCuda.VectorTypes.dim3(PointsinY, 1, 1); kernel.Run(new object[] { baseImageBuffer.DevicePointer, nextImageBuffer.DevicePointer, pointsBuffer.DevicePointer, searchDelta, subsetDelta, BitmapWidth, BitmapHeight, PointsinX, PointsinY }); var result = new ResultPoint[PointsinX * PointsinY]; pointsBuffer.CopyToHost(result); return(result); }
/* * int length, * float3* currentPositionH, float3* currentVelocityH, float3* currentAccelerationH, float* currentLifeTimeH, float4* startColorH, float4* endColorH, float* startSizeH, float* endSizeH, float* startLifeTimeH, * int* realCount, int* desiredCount, float3 aroundPosition, float4 startColor, float4 endColor, float startSize, float endSize, float startLifeTime */ public void GenerateParticles(int desiredCount, Vector3 aroundPosition, Vector4 startColor, Vector4 endColor, float startSize, float endSize, float startLifeTime) { List <object> parameters = new List <object>(); parameters.Add(randomIndex_D.DevicePointer); parameters.Add(renderer.particleMesh.maxParticles); foreach (var r in resources) { r.Map(); parameters.Add(r.GetMappedPointer()); } CudaDeviceVariable <int> realCount_D = 0; parameters.Add(realCount_D.DevicePointer); CudaDeviceVariable <int> desiredCount_D = desiredCount; parameters.Add(desiredCount_D.DevicePointer); parameters.Add(Conv(aroundPosition)); parameters.Add(Conv(startColor)); parameters.Add(Conv(endColor)); parameters.Add(startSize); parameters.Add(endSize); parameters.Add(startLifeTime); object[] arrParams = parameters.ToArray(); generateParticles.Run(arrParams); foreach (var r in resources) { r.UnMap(); } int realCount = 0; realCount_D.CopyToHost(ref realCount); //Debug.Info("desiredCount=" + desiredCount + " realCount=" + realCount); }
// Update is called once per frame void Update() { // cudaKernel[0].Run(d_pos.DevicePointer, d_M_index.DevicePointer, d_M.DevicePointer, h, NUM_OF_P, NP); for (int i = 0; i < iter; ++i) { cudaKernel[1].Run(d_pos.DevicePointer, d_delta_p.DevicePointer, d_M_index.DevicePointer, d_M.DevicePointer, NUM_OF_P, NP); cudaKernel[2].Run(d_pos.DevicePointer, d_delta_p.DevicePointer, NUM_OF_P, NP); } cudaKernel[3].Run(d_pos.DevicePointer, d_ppos.DevicePointer, S_flyAway, S_floorFliction, dt, NUM_OF_P); d_ppos.CopyToHost(h_pos); posBuf.SetData(h_pos); colBuf.SetData(h_col); // Print Screen if (S_saveImages) { ScreenCapture.CaptureScreenshot("images/" + string.Format("{0:00000}", capture) + ".png"); } ++capture; }
public void Forward(CudnnTensorDescriptor srcTensor, float[] srcData, CudnnFilterDescriptor filter, float[] filterData, CudnnConvolutionDescriptor convolution, CudnnTensorDescriptor destTensor, float[] destData, CudnnAccumulateResult accumulate) { Contract.Requires(srcTensor != null); Contract.Requires(srcData != null); Contract.Requires(filter != null); Contract.Requires(filterData != null); Contract.Requires(convolution != null); Contract.Requires(destTensor != null); Contract.Requires(destData != null); ThrowIfNotInitialized(); CheckIfCompatible(CudnnType.Float, srcTensor, destTensor, filter); using (var srcDataGpu = new CudaDeviceVariable <float>(srcData.Length)) using (var filterDataGpu = new CudaDeviceVariable <float>(filterData.Length)) using (var destDataGpu = new CudaDeviceVariable <float>(destData.Length)) { srcDataGpu.CopyToDevice(srcData); filterDataGpu.CopyToDevice(filterData); Invoke(() => CudnnNativeMethods.cudnnConvolutionForward(handle, srcTensor.Handle, srcDataGpu.DevicePointer, filter.Handle, filterDataGpu.DevicePointer, convolution.Handle, destTensor.Handle, destDataGpu.DevicePointer, accumulate)); destDataGpu.CopyToHost(destData); } }
public void Run(DistanceOperation operation, CudaDeviceVariable<float> A, int sizeA, CudaDeviceVariable<float> B, int sizeB, CudaDeviceVariable<float> result, int sizeRes) { if (!ValidateAtRun(operation)) return; switch (operation) { case DistanceOperation.DotProd: //ZXC m_dotKernel.Run(result.DevicePointer, 0, A.DevicePointer, B.DevicePointer, sizeA, 0); m_dotKernel.Run(result.DevicePointer, A.DevicePointer, B.DevicePointer, sizeA); break; case DistanceOperation.CosDist: //ZXC m_cosKernel.Run(result.DevicePointer, 0, A.DevicePointer, B.DevicePointer, sizeA, 0); m_cosKernel.Run(result.DevicePointer, A.DevicePointer, B.DevicePointer, sizeA); break; case DistanceOperation.EuclidDist: float res = RunReturn(operation, A, sizeA, B, sizeB); result.CopyToDevice(res); break; case DistanceOperation.EuclidDistSquared: m_combineVecsKernel.SetupExecution(sizeA); m_combineVecsKernel.Run(A.DevicePointer, B.DevicePointer, m_temp, (int)MyJoin.MyJoinOperation.Subtraction, sizeA); //ZXC m_dotKernel.Run(result.DevicePointer, 0, m_temp, m_temp, m_temp.Count, 0); m_dotKernel.Run(result.DevicePointer, m_temp, m_temp); break; case DistanceOperation.HammingDist: m_combineVecsKernel.SetupExecution(sizeA); m_combineVecsKernel.Run(A.DevicePointer, B.DevicePointer, m_temp, (int)MyJoin.MyJoinOperation.Equal, sizeA); //ZXC m_reduceSumKernel.Run(result.DevicePointer, m_temp, m_temp.Count, 0, 0, 1, /*distributed = false*/0); // reduction to a single number m_reduceSumKernel.Run(result.DevicePointer, m_temp); float fDist = 0; // to transform number of matches to a number of differences result.CopyToHost(ref fDist); fDist = m_temp.Count - fDist; result.CopyToDevice(fDist); break; case DistanceOperation.HammingSim: m_combineVecsKernel.SetupExecution(sizeA); m_combineVecsKernel.Run(A.DevicePointer, B.DevicePointer, m_temp, (int)MyJoin.MyJoinOperation.Equal, sizeA); //ZXC m_reduceSumKernel.Run(result.DevicePointer, m_temp, m_temp.Count, 0, 0, 1, /*distributed = false*/0); // reduction to a single number m_reduceSumKernel.Run(result.DevicePointer, m_temp); // take the single number (number of different bits) and convert it to Hamming Similarity: // a number in range <0,1> that says how much the vectors are similar float fSim = 0; result.CopyToHost(ref fSim); fSim = fSim / m_temp.Count; result.CopyToDevice(fSim); break; } }
public void cuFFTreconstruct() { CudaContext ctx = new CudaContext(0); ManagedCuda.BasicTypes.CUmodule cumodule = ctx.LoadModule("kernel.ptx"); CudaKernel cuKernel = new CudaKernel("cu_ArrayInversion", cumodule, ctx); float2[] fData = new float2[Resolution * Resolution]; float2[] result = new float2[Resolution * Resolution]; FFTData2D = new float[Resolution, Resolution, 2]; CudaDeviceVariable <float2> devData = new CudaDeviceVariable <float2>(Resolution * Resolution); CudaDeviceVariable <float2> copy_devData = new CudaDeviceVariable <float2>(Resolution * Resolution); int i, j; Random rnd = new Random(); double avrg = 0.0; for (i = 0; i < Resolution; i++) { for (j = 0; j < Resolution; j++) { fData[i * Resolution + j].x = i + j * 2; avrg += fData[i * Resolution + j].x; fData[i * Resolution + j].y = 0.0f; } } avrg = avrg / (double)(Resolution * Resolution); for (i = 0; i < Resolution; i++) { for (j = 0; j < Resolution; j++) { fData[(i * Resolution + j)].x = fData[(i * Resolution + j)].x - (float)avrg; } } devData.CopyToDevice(fData); CudaFFTPlan1D plan1D = new CudaFFTPlan1D(Resolution, cufftType.C2C, Resolution); plan1D.Exec(devData.DevicePointer, TransformDirection.Forward); cuKernel.GridDimensions = new ManagedCuda.VectorTypes.dim3(Resolution / cuda_blockNum, Resolution, 1); cuKernel.BlockDimensions = new ManagedCuda.VectorTypes.dim3(cuda_blockNum, 1, 1); cuKernel.Run(devData.DevicePointer, copy_devData.DevicePointer, Resolution); copy_devData.CopyToHost(result); for (i = 0; i < Resolution; i++) { for (j = 0; j < Resolution; j++) { FFTData2D[i, j, 0] = result[i * Resolution + j].x; FFTData2D[i, j, 1] = result[i * Resolution + j].y; } } //Clean up devData.Dispose(); copy_devData.Dispose(); plan1D.Dispose(); CudaContext.ProfilerStop(); ctx.Dispose(); }
void SolveInteractions() { ComputeBuffer positionBuffer = new ComputeBuffer(Pedestrians.Length + 1, 8); ComputeBuffer velocityBuffer = new ComputeBuffer(Pedestrians.Length + 1, 8); int j = 0; for (j = 0; j < Pedestrians.Length; ++j) { Positions[j] = Pedestrians[j].GetComponent <PedestrianController>().position; Velocities[j] = Pedestrians[j].GetComponent <PedestrianController>().velocity; } Positions[j] = new Vector2(0, 0); Velocities[j] = new Vector2(0, 0); positionBuffer.SetData(Positions); AgentViewMaterial.SetBuffer("positionBuffer", positionBuffer); velocityBuffer.SetData(Velocities); AgentViewMaterial.SetBuffer("velocityBuffer", velocityBuffer); //1. First Render From Every Pedestrians PoV for (int i = 0; i < Pedestrians.Length; ++i) { Pedestrians[i].GetComponent <PedestrianController>().SolveInteraction(); } //2. Now Process the Resultant Texture if (RenderTargetTex && RenderTarget.IsCreated()) { RenderTexture.active = RenderTarget; RenderTargetTex.ReadPixels(new Rect(0, 0, RenderTarget.width, RenderTarget.height), 0, 0); RenderTargetTex.Apply(); // Launch CUDA Kernel uint textureSize = (uint)(RenderTargetTex.width * agentViewTexHeight); int threadsPerBlock = 1024; float threadsPerBlockInv = 1.0f / (float)threadsPerBlock; int blocksPerGrid = (int)((textureSize + threadsPerBlock - 1) * threadsPerBlockInv); /*******************************************************************/ /************************copyReductionKernel************************/ /*******************************************************************/ dim3 block = new dim3(threadsPerBlock, 1, 1); dim3 grid = new dim3(blocksPerGrid, 1, 1); uint shMemeSize = (uint)(block.x * 5 * sizeof(float)); // size of shared memory uint offset, currentNumData, currentNumBlocks; h_idata = RenderTargetTex.GetPixels(); for (int i = 0; i < h_idata.Length; i++) { h_idata_float4[i] = new float4(h_idata[i].r, h_idata[i].g, h_idata[i].b, h_idata[i].a); } d_idata.CopyToDevice(h_idata_float4); for (int i = pedStart; i < pedStop; i++) { offset = (uint)(i - pedStart) * (textureSize); currentNumData = textureSize; currentNumBlocks = (uint)blocksPerGrid; grid.x = currentNumBlocks; cudaKernel[0].BlockDimensions = block; cudaKernel[0].GridDimensions = grid; cudaKernel[0].DynamicSharedMemory = shMemeSize; cudaKernel[0].Run(d_idata.DevicePointer, d_odata.DevicePointer, 5, textureSize, offset); //Debug.Log("1: CUDA kernel launch with " + blocksPerGrid + " blocks of " + threadsPerBlock + " threads\n"); /*******************************************************************/ /**************************reductionKernel**************************/ /*******************************************************************/ currentNumData = currentNumBlocks; currentNumBlocks = (uint)((currentNumData + threadsPerBlock - 1) * threadsPerBlockInv); for (; currentNumData > 1;) { // perform reduction to get one pixel grid.x = currentNumBlocks; cudaKernel[1].BlockDimensions = block; cudaKernel[1].GridDimensions = grid; cudaKernel[1].DynamicSharedMemory = shMemeSize; cudaKernel[1].Run(d_odata.DevicePointer, 5, currentNumData); //Debug.Log("2: CUDA kernel launch with " + blocksPerGrid + " blocks of " + threadsPerBlock + " threads\n"); currentNumData = currentNumBlocks; currentNumBlocks = (uint)((currentNumData + threadsPerBlock - 1) * threadsPerBlockInv); } d_result_data.CopyToDevice(d_odata, 0, 5 * i * sizeof(float), 5 * sizeof(float)); } d_result_data.CopyToHost(h_result_data); RenderTexture.active = null; } //3. Now Calculate Agent Veclocity Based on Results from Step 2. for (int i = 0; i < Pedestrians.Length; ++i) { float thetaMax = 0; float thetaMin = 0; float ttcMin = 0; float dttcMin = 0; bool goFirstMin = true; bool goFirstMax = true; computeParams(i, ref thetaMax, ref thetaMin, ref ttcMin, ref dttcMin, ref goFirstMin, ref goFirstMax); // compute new velocity Pedestrians[i].GetComponent <PedestrianController>().updateVelocity(thetaMin, thetaMax, ttcMin, dttcMin, goFirstMin, goFirstMax); } }
static void Main(string[] args) { // NOTE: You need to change this location to match your own machine. Console.ForegroundColor = ConsoleColor.Red; Console.WriteLine("NOTE: You must change the kernel location before running this project so it matches your own environment."); Console.ResetColor(); System.Threading.Thread.Sleep(500); string path = @"X:\MachineLearning\CUDAGraph-2\CUDAGraph_Kernel\Debug\kernel.cu.ptx"; CudaContext ctx = new CudaContext(); CUmodule module = ctx.LoadModule(path); kernel = new CudaKernel("kernel", module, ctx); // This tells the kernel to allocate a lot of threads for the Gpu. kernel.BlockDimensions = THREADS_PER_BLOCK; kernel.GridDimensions = VECTOR_SIZE / THREADS_PER_BLOCK + 1;; // Now let's load the kernel! // Create the topology. int[] topology = new int[] { 1, 200, 200, 100, 1 }; int height = topology.Length; int width = 0; for (int i = 0; i < topology.Length; i++) { if (width < topology[i]) { width = topology[i]; } } // Launch! float[] res = new float[height * width]; for (int i = 0; i < 10; i++) { float[] matrix = new float[height * width]; float[] weights = new float[height * width]; Random rand = new Random(424242); for (int y = 0; y < height; y++) { for (int x = 0; x < width; x++) { matrix[y * width + x] = (y == 0 && x < topology[y]) ? 1.0f : 0; weights[y * width + x] = (x < topology[y]) ? (float)(rand.NextDouble() - rand.NextDouble()) : 0; } } // Load the kernel with some variables. CudaDeviceVariable <int> cuda_topology = topology; CudaDeviceVariable <float> cuda_membank = matrix; CudaDeviceVariable <float> cuda_weights = weights; Stopwatch sw = new Stopwatch(); sw.Start(); kernel.Run(cuda_topology.DevicePointer, cuda_membank.DevicePointer, cuda_weights.DevicePointer, height, width); cuda_membank.CopyToHost(res); sw.Stop(); Console.ForegroundColor = ConsoleColor.Green; Console.WriteLine("{0} ticks to compute -> {1}", sw.ElapsedTicks, res[0]); Console.ResetColor(); } Console.ReadKey(); }
public void CopyToHost(float[] target) { _data.CopyToHost(target); }
public void BackwardFilter(CudnnTensorDescriptor srcTensor, double[] srcData, CudnnTensorDescriptor diffTensor, double[] diffData, CudnnConvolutionDescriptor convolution, CudnnFilterDescriptor gradient, double[] gradientData, CudnnAccumulateResult accumulate) { Contract.Requires(srcTensor != null); Contract.Requires(srcData != null); Contract.Requires(diffTensor != null); Contract.Requires(diffData != null); Contract.Requires(convolution != null); Contract.Requires(gradient != null); Contract.Requires(gradientData != null); ThrowIfNotInitialized(); CheckIfCompatible(CudnnType.Double, srcTensor, diffTensor, gradient); using (var srcDataGpu = new CudaDeviceVariable<double>(srcData.Length)) using (var diffDataGpu = new CudaDeviceVariable<double>(diffData.Length)) using (var gradientDataGpu = new CudaDeviceVariable<double>(gradientData.Length)) { srcDataGpu.CopyToDevice(srcData); diffDataGpu.CopyToDevice(diffData); Invoke(() => CudnnNativeMethods.cudnnConvolutionBackwardFilter(handle, srcTensor.Handle, srcDataGpu.DevicePointer, diffTensor.Handle, diffDataGpu.DevicePointer, convolution.Handle, gradient.Handle, gradientDataGpu.DevicePointer, accumulate)); gradientDataGpu.CopyToHost(gradientData); } }
public void Forward(CudnnTensorDescriptor srcTensor, float[] srcData, CudnnFilterDescriptor filter, float[] filterData, CudnnConvolutionDescriptor convolution, CudnnTensorDescriptor destTensor, float[] destData, CudnnAccumulateResult accumulate) { Contract.Requires(srcTensor != null); Contract.Requires(srcData != null); Contract.Requires(filter != null); Contract.Requires(filterData != null); Contract.Requires(convolution != null); Contract.Requires(destTensor != null); Contract.Requires(destData != null); ThrowIfNotInitialized(); CheckIfCompatible(CudnnType.Float, srcTensor, destTensor, filter); using (var srcDataGpu = new CudaDeviceVariable<float>(srcData.Length)) using (var filterDataGpu = new CudaDeviceVariable<float>(filterData.Length)) using (var destDataGpu = new CudaDeviceVariable<float>(destData.Length)) { srcDataGpu.CopyToDevice(srcData); filterDataGpu.CopyToDevice(filterData); Invoke(() => CudnnNativeMethods.cudnnConvolutionForward(handle, srcTensor.Handle, srcDataGpu.DevicePointer, filter.Handle, filterDataGpu.DevicePointer, convolution.Handle, destTensor.Handle, destDataGpu.DevicePointer, accumulate)); destDataGpu.CopyToHost(destData); } }
public void Backward(CudnnPoolingDescriptor pooling, CudnnTensorDescriptor srcTensor, double[] srcData, CudnnTensorDescriptor srcDiffTensor, double[] srcDiffData, CudnnTensorDescriptor destTensor, double[] destData, CudnnTensorDescriptor destDiffTensor, double[] destDiffData) { Contract.Requires(pooling != null); Contract.Requires(srcTensor != null); Contract.Requires(srcData != null); Contract.Requires(destTensor != null); Contract.Requires(destData != null); Contract.Requires(srcDiffTensor != null); Contract.Requires(srcDiffData != null); Contract.Requires(destDiffTensor != null); Contract.Requires(destDiffData != null); ThrowIfNotInitialized(); CheckIfCompatible(CudnnType.Double, srcTensor, srcDiffTensor, destTensor, destDiffTensor); using (var srcDataGpu = new CudaDeviceVariable<double>(srcData.Length)) using (var srcDiffDataGpu = new CudaDeviceVariable<double>(srcDiffData.Length)) using (var destDataGpu = new CudaDeviceVariable<double>(destData.Length)) using (var destDiffDataGpu = new CudaDeviceVariable<double>(destDiffData.Length)) { srcDataGpu.CopyToDevice(srcData); srcDiffDataGpu.CopyToDevice(srcDiffData); destDataGpu.CopyToDevice(destData); Invoke(() => CudnnNativeMethods.cudnnPoolingBackward(handle, pooling.Handle, srcTensor.Handle, srcDataGpu.DevicePointer, srcDiffTensor.Handle, srcDiffDataGpu.DevicePointer, destTensor.Handle, destDataGpu.DevicePointer, destDiffTensor.Handle, destDiffDataGpu.DevicePointer)); destDiffDataGpu.CopyToHost(destDiffData); } }
public static void SaveJpeg(string aFilename, int aQuality, Bitmap aImage) { if (aImage.PixelFormat != System.Drawing.Imaging.PixelFormat.Format24bppRgb) { throw new ArgumentException("Only three channel color images are supported."); } if (aImage.Width % 16 != 0 || aImage.Height % 16 != 0) { throw new ArgumentException("The provided bitmap must have a height and width of a multiple of 16."); } JPEGCompression compression = new JPEGCompression(); NPPImage_8uC3 src = new NPPImage_8uC3(aImage.Width, aImage.Height); NPPImage_8uC1 srcY = new NPPImage_8uC1(aImage.Width, aImage.Height); NPPImage_8uC1 srcCb = new NPPImage_8uC1(aImage.Width / 2, aImage.Height / 2); NPPImage_8uC1 srcCr = new NPPImage_8uC1(aImage.Width / 2, aImage.Height / 2); src.CopyToDevice(aImage); //System.Drawing.Bitmap is ordered BGR not RGB //The NPP routine BGR to YCbCR outputs the values in clamped range, following the YCbCr standard. //But JPEG uses unclamped values ranging all from [0..255], thus use our own color matrix: float[,] BgrToYCbCr = new float[3, 4] {{0.114f, 0.587f, 0.299f, 0}, {0.5f, -0.33126f, -0.16874f, 128}, {-0.08131f, -0.41869f, 0.5f, 128}}; src.ColorTwist(BgrToYCbCr); //Reduce size of of Cb and Cr channel src.Copy(srcY, 2); srcY.Resize(srcCr, 0.5, 0.5, InterpolationMode.SuperSampling); src.Copy(srcY, 1); srcY.Resize(srcCb, 0.5, 0.5, InterpolationMode.SuperSampling); src.Copy(srcY, 0); FrameHeader oFrameHeader = new FrameHeader(); oFrameHeader.nComponents = 3; oFrameHeader.nHeight = (ushort)aImage.Height; oFrameHeader.nSamplePrecision = 8; oFrameHeader.nWidth = (ushort)aImage.Width; oFrameHeader.aComponentIdentifier = new byte[] { 1, 2, 3 }; oFrameHeader.aSamplingFactors = new byte[] { 34, 17, 17 }; //Y channel is twice the sice of Cb/Cr channel oFrameHeader.aQuantizationTableSelector = new byte[] { 0, 1, 1 }; //Get quantization tables from JPEG standard with quality scaling QuantizationTable[] aQuantizationTables = new QuantizationTable[2]; aQuantizationTables[0] = new QuantizationTable(QuantizationTable.QuantizationType.Luminance, aQuality); aQuantizationTables[1] = new QuantizationTable(QuantizationTable.QuantizationType.Chroma, aQuality); CudaDeviceVariable<byte>[] pdQuantizationTables = new CudaDeviceVariable<byte>[2]; pdQuantizationTables[0] = aQuantizationTables[0].aTable; pdQuantizationTables[1] = aQuantizationTables[1].aTable; //Get Huffman tables from JPEG standard HuffmanTable[] aHuffmanTables = new HuffmanTable[4]; aHuffmanTables[0] = new HuffmanTable(HuffmanTable.HuffmanType.LuminanceDC); aHuffmanTables[1] = new HuffmanTable(HuffmanTable.HuffmanType.ChromaDC); aHuffmanTables[2] = new HuffmanTable(HuffmanTable.HuffmanType.LuminanceAC); aHuffmanTables[3] = new HuffmanTable(HuffmanTable.HuffmanType.ChromaAC); //Set header ScanHeader oScanHeader = new ScanHeader(); oScanHeader.nA = 0; oScanHeader.nComponents = 3; oScanHeader.nSe = 63; oScanHeader.nSs = 0; oScanHeader.aComponentSelector = new byte[] { 1, 2, 3 }; oScanHeader.aHuffmanTablesSelector = new byte[] { 0, 17, 17 }; NPPImage_16sC1[] apdDCT = new NPPImage_16sC1[3]; NPPImage_8uC1[] apDstImage = new NPPImage_8uC1[3]; NppiSize[] aDstSize = new NppiSize[3]; aDstSize[0] = new NppiSize(srcY.Width, srcY.Height); aDstSize[1] = new NppiSize(srcCb.Width, srcCb.Height); aDstSize[2] = new NppiSize(srcCr.Width, srcCr.Height); // Compute channel sizes as stored in the output JPEG (8x8 blocks & MCU block layout) NppiSize oDstImageSize = new NppiSize(); float frameWidth = (float)Math.Floor((float)oFrameHeader.nWidth); float frameHeight = (float)Math.Floor((float)oFrameHeader.nHeight); oDstImageSize.width = (int)Math.Max(1.0f, frameWidth); oDstImageSize.height = (int)Math.Max(1.0f, frameHeight); //Console.WriteLine("Output Size: " + oDstImageSize.width + "x" + oDstImageSize.height + "x" + (int)(oFrameHeader.nComponents)); apDstImage[0] = srcY; apDstImage[1] = srcCb; apDstImage[2] = srcCr; int nMCUBlocksH = 0; int nMCUBlocksV = 0; // Compute channel sizes as stored in the JPEG (8x8 blocks & MCU block layout) for (int i = 0; i < oFrameHeader.nComponents; ++i) { nMCUBlocksV = Math.Max(nMCUBlocksV, oFrameHeader.aSamplingFactors[i] >> 4); nMCUBlocksH = Math.Max(nMCUBlocksH, oFrameHeader.aSamplingFactors[i] & 0x0f); } for (int i = 0; i < oFrameHeader.nComponents; ++i) { NppiSize oBlocks = new NppiSize(); NppiSize oBlocksPerMCU = new NppiSize(oFrameHeader.aSamplingFactors[i] & 0x0f, oFrameHeader.aSamplingFactors[i] >> 4); oBlocks.width = (int)Math.Ceiling((oFrameHeader.nWidth + 7) / 8 * (float)(oBlocksPerMCU.width) / nMCUBlocksH); oBlocks.width = DivUp(oBlocks.width, oBlocksPerMCU.width) * oBlocksPerMCU.width; oBlocks.height = (int)Math.Ceiling((oFrameHeader.nHeight + 7) / 8 * (float)(oBlocksPerMCU.height) / nMCUBlocksV); oBlocks.height = DivUp(oBlocks.height, oBlocksPerMCU.height) * oBlocksPerMCU.height; // Allocate Memory apdDCT[i] = new NPPImage_16sC1(oBlocks.width * 64, oBlocks.height); } /*************************** * * Output * ***************************/ // Forward DCT for (int i = 0; i < 3; ++i) { compression.DCTQuantFwd8x8LS(apDstImage[i], apdDCT[i], aDstSize[i], pdQuantizationTables[oFrameHeader.aQuantizationTableSelector[i]]); } // Huffman Encoding CudaDeviceVariable<byte> pdScan = new CudaDeviceVariable<byte>(BUFFER_SIZE); int nScanLength = 0; int nTempSize = JPEGCompression.EncodeHuffmanGetSize(aDstSize[0], 3); CudaDeviceVariable<byte> pJpegEncoderTemp = new CudaDeviceVariable<byte>(nTempSize); NppiEncodeHuffmanSpec[] apHuffmanDCTableEnc = new NppiEncodeHuffmanSpec[3]; NppiEncodeHuffmanSpec[] apHuffmanACTableEnc = new NppiEncodeHuffmanSpec[3]; for (int i = 0; i < 3; ++i) { apHuffmanDCTableEnc[i] = JPEGCompression.EncodeHuffmanSpecInitAlloc(aHuffmanTables[(oScanHeader.aHuffmanTablesSelector[i] >> 4)].aCodes, NppiHuffmanTableType.nppiDCTable); apHuffmanACTableEnc[i] = JPEGCompression.EncodeHuffmanSpecInitAlloc(aHuffmanTables[(oScanHeader.aHuffmanTablesSelector[i] & 0x0f) + 2].aCodes, NppiHuffmanTableType.nppiACTable); } JPEGCompression.EncodeHuffmanScan(apdDCT, 0, oScanHeader.nSs, oScanHeader.nSe, oScanHeader.nA >> 4, oScanHeader.nA & 0x0f, pdScan, ref nScanLength, apHuffmanDCTableEnc, apHuffmanACTableEnc, aDstSize, pJpegEncoderTemp); for (int i = 0; i < 3; ++i) { JPEGCompression.EncodeHuffmanSpecFree(apHuffmanDCTableEnc[i]); JPEGCompression.EncodeHuffmanSpecFree(apHuffmanACTableEnc[i]); } // Write JPEG to byte array, as in original sample code byte[] pDstOutput = new byte[BUFFER_SIZE]; int pos = 0; oFrameHeader.nWidth = (ushort)oDstImageSize.width; oFrameHeader.nHeight = (ushort)oDstImageSize.height; writeMarker(0x0D8, pDstOutput, ref pos); writeJFIFTag(pDstOutput, ref pos); writeQuantizationTable(aQuantizationTables[0], pDstOutput, ref pos); writeQuantizationTable(aQuantizationTables[1], pDstOutput, ref pos); writeFrameHeader(oFrameHeader, pDstOutput, ref pos); writeHuffmanTable(aHuffmanTables[0], pDstOutput, ref pos); writeHuffmanTable(aHuffmanTables[1], pDstOutput, ref pos); writeHuffmanTable(aHuffmanTables[2], pDstOutput, ref pos); writeHuffmanTable(aHuffmanTables[3], pDstOutput, ref pos); writeScanHeader(oScanHeader, pDstOutput, ref pos); pdScan.CopyToHost(pDstOutput, 0, pos, nScanLength); pos += nScanLength; writeMarker(0x0D9, pDstOutput, ref pos); FileStream fs = new FileStream(aFilename, FileMode.Create, FileAccess.Write); fs.Write(pDstOutput, 0, pos); fs.Close(); //cleanup: fs.Dispose(); pJpegEncoderTemp.Dispose(); pdScan.Dispose(); apdDCT[2].Dispose(); apdDCT[1].Dispose(); apdDCT[0].Dispose(); pdQuantizationTables[1].Dispose(); pdQuantizationTables[0].Dispose(); srcCr.Dispose(); srcCb.Dispose(); srcY.Dispose(); src.Dispose(); compression.Dispose(); }
private void Generate(CudaKernel kernelPositionWeight, int width, int height, int depth) { int count = width * height * depth; int widthD = width - 1; int heightD = height - 1; int depthD = depth - 1; int countDecremented = widthD * heightD * depthD; dim3 blockDimensions = new dim3(8, 8, 8); dim3 gridDimensions = new dim3((int)Math.Ceiling(width / 8.0), (int)Math.Ceiling(height / 8.0), (int)Math.Ceiling(depth / 8.0)); dim3 gridDimensionsDecremented = new dim3((int)Math.Ceiling(widthD / 8.0), (int)Math.Ceiling(heightD / 8.0), (int)Math.Ceiling(depthD / 8.0)); CUDANoiseCube noiseCube = new CUDANoiseCube(); CudaArray3D noiseArray = noiseCube.GenerateUniformArray(16, 16, 16); CudaTextureArray3D noiseTexture = new CudaTextureArray3D(kernelPositionWeight, "noiseTexture", CUAddressMode.Wrap, CUFilterMode.Linear, CUTexRefSetFlags.NormalizedCoordinates, noiseArray); CudaDeviceVariable<Voxel> voxelsDev = new CudaDeviceVariable<Voxel>(count); kernelPositionWeight.BlockDimensions = blockDimensions; typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelPositionWeight, gridDimensions); kernelPositionWeight.Run(voxelsDev.DevicePointer, width, height, depth); kernelNormalAmbient.BlockDimensions = blockDimensions; typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelNormalAmbient, gridDimensions); kernelNormalAmbient.Run(voxelsDev.DevicePointer, width, height, depth, container.Settings.AmbientRayWidth, container.Settings.AmbientSamplesCount); int nearestW = NearestPowerOfTwo(widthD); int nearestH = NearestPowerOfTwo(heightD); int nearestD = NearestPowerOfTwo(depthD); int nearestCount = nearestW * nearestH * nearestD; CudaDeviceVariable<int> trisCountDevice = new CudaDeviceVariable<int>(nearestCount); trisCountDevice.Memset(0); CudaDeviceVariable<int> offsetsDev = new CudaDeviceVariable<int>(countDecremented); kernelMarchingCubesCases.BlockDimensions = blockDimensions; typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelMarchingCubesCases, gridDimensionsDecremented); kernelMarchingCubesCases.Run(voxelsDev.DevicePointer, width, height, depth, offsetsDev.DevicePointer, trisCountDevice.DevicePointer, nearestW, nearestH, nearestD); CudaDeviceVariable<int> prefixSumsDev = prefixScan.PrefixSumArray(trisCountDevice, nearestCount); int lastTrisCount = 0; trisCountDevice.CopyToHost(ref lastTrisCount, (nearestCount - 1) * sizeof(int)); int lastPrefixSum = 0; prefixSumsDev.CopyToHost(ref lastPrefixSum, (nearestCount - 1) * sizeof(int)); int totalVerticesCount = (lastTrisCount + lastPrefixSum) * 3; if (totalVerticesCount > 0) { if (container.Geometry != null) container.Geometry.Dispose(); container.VertexCount = totalVerticesCount; container.Geometry = new Buffer(graphicsDevice, new BufferDescription() { BindFlags = BindFlags.VertexBuffer, CpuAccessFlags = CpuAccessFlags.None, OptionFlags = ResourceOptionFlags.None, SizeInBytes = Marshal.SizeOf(typeof(VoxelMeshVertex)) * totalVerticesCount, Usage = ResourceUsage.Default }); CudaDirectXInteropResource directResource = new CudaDirectXInteropResource(container.Geometry.ComPointer, CUGraphicsRegisterFlags.None, CudaContext.DirectXVersion.D3D11, CUGraphicsMapResourceFlags.None); kernelMarchingCubesVertices.BlockDimensions = blockDimensions; typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelMarchingCubesVertices, gridDimensionsDecremented); directResource.Map(); kernelMarchingCubesVertices.Run(directResource.GetMappedPointer(), voxelsDev.DevicePointer, prefixSumsDev.DevicePointer, offsetsDev.DevicePointer, width, height, depth, nearestW, nearestH, nearestD); directResource.UnMap(); directResource.Dispose(); } else { container.VertexCount = 0; if (container.Geometry != null) container.Geometry.Dispose(); } noiseCube.Dispose(); prefixSumsDev.Dispose(); trisCountDevice.Dispose(); offsetsDev.Dispose(); noiseArray.Dispose(); noiseTexture.Dispose(); voxelsDev.Dispose(); }
static void Main(string[] args) { try { if (args.Length == 1 && args[0].ToLower().Contains("fidelity")) { string[] fseg = args[0].Split(':'); deviceID = int.Parse(fseg[1]); nonce = Int64.Parse(fseg[2]) - 1; range = int.Parse(fseg[3]); QTEST = true; } else { if (args.Length > 0) { deviceID = int.Parse(args[0]); } } } catch (Exception ex) { Logger.Log(LogLevel.Error, "Device ID parse error: " + ex.Message); } try { if (args.Length > 0) { deviceID = int.Parse(args[0]); } } catch (Exception ex) { Logger.Log(LogLevel.Error, "Device ID parse error"); } try { if (args.Length > 1) { port = int.Parse(args[1]); Comms.ConnectToMaster(port); } else { TEST = true; Logger.CopyToConsole = true; CGraph.ShowCycles = true; } } catch (Exception ex) { Logger.Log(LogLevel.Error, "Master connection error"); } try { if (args.Length > 3) { gpuCount = int.Parse(args[3]); fastCuda = gpuCount <= (Environment.ProcessorCount / 2); if (fastCuda) { Logger.Log(LogLevel.Info, "Using single GPU blocking mode"); } } } catch { } if (TEST) { currentJob = nextJob = new Job() { jobID = 0, k0 = 0xf4956dc403730b01L, k1 = 0xe6d45de39c2a5a3eL, k2 = 0xcbf626a8afee35f6L, k3 = 0x4307b94b1a0c9980L, pre_pow = TestPrePow, timestamp = DateTime.Now }; } else { currentJob = nextJob = new Job() { jobID = 0, k0 = 0xf4956dc403730b01L, k1 = 0xe6d45de39c2a5a3eL, k2 = 0xcbf626a8afee35f6L, k3 = 0x4307b94b1a0c9980L, pre_pow = TestPrePow, timestamp = DateTime.Now }; if (!Comms.IsConnected()) { Console.WriteLine("Master connection failed, aborting"); Logger.Log(LogLevel.Error, "No master connection, exitting!"); return; } if (deviceID < 0) { int devCnt = CudaContext.GetDeviceCount(); GpuDevicesMessage gpum = new GpuDevicesMessage() { devices = new List <GpuDevice>(devCnt) }; for (int i = 0; i < devCnt; i++) { string name = CudaContext.GetDeviceName(i); var info = CudaContext.GetDeviceInfo(i); gpum.devices.Add(new GpuDevice() { deviceID = i, name = name, memory = info.TotalGlobalMemory }); } //Console.WriteLine(devCnt); Comms.gpuMsg = gpum; Comms.SetEvent(); //Console.WriteLine("event fired"); Task.Delay(1000).Wait(); //Console.WriteLine("closing"); Comms.Close(); return; } } try { var assembly = Assembly.GetEntryAssembly(); var resourceStream = assembly.GetManifestResourceStream("CudaSolver.kernel_x64.ptx"); ctx = new CudaContext(deviceID, /*!fastCuda ? (CUCtxFlags.BlockingSync | CUCtxFlags.MapHost) :*/ CUCtxFlags.MapHost); string pow = new StreamReader(resourceStream).ReadToEnd(); //pow = File.ReadAllText(@"kernel_x64.ptx"); Turing = ctx.GetDeviceInfo().MaxSharedMemoryPerMultiprocessor == 65536; using (var s = GenerateStreamFromString(pow)) { if (!Turing) { meanSeedA = ctx.LoadKernelPTX(s, "FluffySeed4K", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)40 }); meanSeedA.BlockDimensions = 512; meanSeedA.GridDimensions = 1024; meanSeedA.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanRound = ctx.LoadKernelPTX(s, "FluffyRound_A2", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)40 }); meanRound.BlockDimensions = 512; meanRound.GridDimensions = 4096; meanRound.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanRound_4 = ctx.LoadKernelPTX(s, "FluffyRound_A1", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)32 }); meanRound_4.BlockDimensions = 1024; meanRound_4.GridDimensions = 1024; meanRound_4.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanRoundJoin = ctx.LoadKernelPTX(s, "FluffyRound_A3", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)32 }); meanRoundJoin.BlockDimensions = 1024; meanRoundJoin.GridDimensions = 4096; meanRoundJoin.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanTail = ctx.LoadKernelPTX(s, "FluffyTail"); meanTail.BlockDimensions = 1024; meanTail.GridDimensions = 4096; meanTail.PreferredSharedMemoryCarveout = CUshared_carveout.MaxL1; meanRecover = ctx.LoadKernelPTX(s, "FluffyRecovery"); meanRecover.BlockDimensions = 256; meanRecover.GridDimensions = 2048; meanRecover.PreferredSharedMemoryCarveout = CUshared_carveout.MaxL1; } else { meanSeedA = ctx.LoadKernelPTX(s, "FluffySeed4K", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)64 }); meanSeedA.BlockDimensions = 512; meanSeedA.GridDimensions = 1024; meanSeedA.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanRound = ctx.LoadKernelPTX(s, "FluffyRound_C2", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)32 }); meanRound.BlockDimensions = 1024; meanRound.GridDimensions = 4096; meanRound.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanRound_4 = ctx.LoadKernelPTX(s, "FluffyRound_C1", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)64 }); meanRound_4.BlockDimensions = 1024; meanRound_4.GridDimensions = 1024; meanRound_4.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanRoundJoin = ctx.LoadKernelPTX(s, "FluffyRound_C3", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)32 }); meanRoundJoin.BlockDimensions = 1024; meanRoundJoin.GridDimensions = 4096; meanRoundJoin.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanTail = ctx.LoadKernelPTX(s, "FluffyTail"); meanTail.BlockDimensions = 1024; meanTail.GridDimensions = 4096; meanTail.PreferredSharedMemoryCarveout = CUshared_carveout.MaxL1; meanRecover = ctx.LoadKernelPTX(s, "FluffyRecovery"); meanRecover.BlockDimensions = 256; meanRecover.GridDimensions = 2048; meanRecover.PreferredSharedMemoryCarveout = CUshared_carveout.MaxL1; } } } catch (Exception ex) { Logger.Log(LogLevel.Error, "Unable to create kernels: " + ex.Message); Task.Delay(500).Wait(); Comms.Close(); return; } try { d_buffer = new CudaDeviceVariable <ulong>(BUFFER_SIZE_U32 * (temp ? 8 : 1)); d_bufferMid = new CudaDeviceVariable <ulong>(d_buffer.DevicePointer + (BUFFER_SIZE_B * 2)); d_bufferB = new CudaDeviceVariable <ulong>(d_buffer.DevicePointer + (BUFFER_SIZE_B * 8)); d_indexesA = new CudaDeviceVariable <uint>(INDEX_SIZE); d_indexesB = new CudaDeviceVariable <uint>(INDEX_SIZE); d_aux = new CudaDeviceVariable <uint>(512); Array.Clear(h_indexesA, 0, h_indexesA.Length); Array.Clear(h_indexesB, 0, h_indexesA.Length); d_indexesA = h_indexesA; d_indexesB = h_indexesB; streamPrimary = new CudaStream(CUStreamFlags.NonBlocking); } catch (Exception ex) { Task.Delay(200).Wait(); Logger.Log(LogLevel.Error, $"Mem alloc exception. Out of video memory? {ctx.GetFreeDeviceMemorySize()} free"); Task.Delay(500).Wait(); Comms.Close(); return; } try { AllocateHostMemory(true, ref h_a, ref hAligned_a, 1024 * 1024 * 32); } catch (Exception ex) { Logger.Log(LogLevel.Error, "Unable to create pinned memory."); Task.Delay(500).Wait(); Comms.Close(); return; } int loopCnt = 0; while (!Comms.IsTerminated) { try { if (!TEST && (Comms.nextJob.pre_pow == null || Comms.nextJob.pre_pow == "" || Comms.nextJob.pre_pow == TestPrePow)) { Logger.Log(LogLevel.Info, string.Format("Waiting for job....")); Task.Delay(1000).Wait(); continue; } if (!TEST && ((currentJob.pre_pow != Comms.nextJob.pre_pow) || (currentJob.origin != Comms.nextJob.origin))) { currentJob = Comms.nextJob; currentJob.timestamp = DateTime.Now; } if (!TEST && (currentJob.timestamp.AddMinutes(30) < DateTime.Now) && Comms.lastIncoming.AddMinutes(30) < DateTime.Now) { Logger.Log(LogLevel.Info, string.Format("Job too old...")); Task.Delay(1000).Wait(); continue; } // test runs only once if (TEST && ++loopCnt >= range) { Comms.IsTerminated = true; } Solution s; while (graphSolutions.TryDequeue(out s)) { meanRecover.SetConstantVariable <ulong>("recovery", s.GetUlongEdges()); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanRecover.RunAsync(streamPrimary.Stream, s.job.k0, s.job.k1, s.job.k2, s.job.k3, d_indexesB.DevicePointer); streamPrimary.Synchronize(); s.nonces = new uint[32]; d_indexesB.CopyToHost(s.nonces, 0, 0, 32 * 4); s.nonces = s.nonces.OrderBy(n => n).ToArray(); //fidelity = (32-cycles_found / graphs_searched) * 32 solutions++; s.fidelity = ((double)solutions / (double)trims) * 32.0; //Console.WriteLine(s.fidelity.ToString("0.000")); if (Comms.IsConnected()) { Comms.graphSolutionsOut.Enqueue(s); Comms.SetEvent(); } if (QTEST) { Console.ForegroundColor = ConsoleColor.Red; Console.WriteLine($"Solution for nonce {s.job.nonce}: {string.Join(' ', s.nonces)}"); Console.ResetColor(); } } if (QTEST) { currentJob = currentJob.NextSequential(ref nonce); Console.WriteLine($"Nonce: {nonce} K0: {currentJob.k0:X} K1: {currentJob.k1:X} K2: {currentJob.k2:X} K3: {currentJob.k3:X}"); } else { currentJob = currentJob.Next(); } Logger.Log(LogLevel.Debug, string.Format("GPU NV{4}:Trimming #{4}: {0} {1} {2} {3}", currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, currentJob.jobID, deviceID)); timer.Restart(); d_indexesA.MemsetAsync(0, streamPrimary.Stream); d_indexesB.MemsetAsync(0, streamPrimary.Stream); d_aux.MemsetAsync(0, streamPrimary.Stream); meanSeedA.RunAsync(streamPrimary.Stream, currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, d_bufferMid.DevicePointer, d_indexesB.DevicePointer, 0); meanSeedA.RunAsync(streamPrimary.Stream, currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, d_bufferMid.DevicePointer + ((BUFFER_SIZE_A * 8) / 4 / 4) * 1, d_indexesB.DevicePointer + (4096 * 4), EDGE_SEG); meanSeedA.RunAsync(streamPrimary.Stream, currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, d_bufferMid.DevicePointer + ((BUFFER_SIZE_A * 8) / 4 / 4) * 2, d_indexesB.DevicePointer + (4096 * 8), EDGE_SEG * 2); meanSeedA.RunAsync(streamPrimary.Stream, currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, d_bufferMid.DevicePointer + ((BUFFER_SIZE_A * 8) / 4 / 4) * 3, d_indexesB.DevicePointer + (4096 * 12), EDGE_SEG * 3); meanRound_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_A / 4, DUCK_EDGES_B / 4, 0); meanRound_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_B * 8) / 4) * 1, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_A / 4, DUCK_EDGES_B / 4, 1024); meanRound_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_B * 8) / 4) * 2, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_A / 4, DUCK_EDGES_B / 4, 2048); meanRound_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_B * 8) / 4) * 3, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_A / 4, DUCK_EDGES_B / 4, 3072); //streamPrimary.Synchronize(); //h_indexesA = d_indexesA; //h_indexesB = d_indexesB; //var sumA = h_indexesA.Sum(e => e); //var sumB = h_indexesB.Sum(e => e); //streamPrimary.Synchronize(); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanRoundJoin.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_B * 8) / 4) * 1, d_buffer.DevicePointer + ((BUFFER_SIZE_B * 8) / 4) * 2, d_buffer.DevicePointer + ((BUFFER_SIZE_B * 8) / 4) * 3, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 4, DUCK_EDGES_B / 2); d_indexesA.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 2, 0, d_aux.DevicePointer); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 2, 1, d_aux.DevicePointer); d_indexesA.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 2, 2, d_aux.DevicePointer); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 4, 3, d_aux.DevicePointer); for (int i = 0; i < (TEST ? 80 : trimRounds); i++) //for (int i = 0; i < 85; i++) { d_indexesA.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B / 4, DUCK_EDGES_B / 4, i * 2 + 4, d_aux.DevicePointer); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 4, DUCK_EDGES_B / 4, i * 2 + 5, d_aux.DevicePointer); } d_indexesA.MemsetAsync(0, streamPrimary.Stream); meanTail.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer); Task.Delay((int)lastTrimMs).Wait(); streamPrimary.Synchronize(); uint[] count = new uint[2]; d_indexesA.CopyToHost(count, 0, 0, 8); if (count[0] > 131071) { // trouble count[0] = 131071; // log } hAligned_a.AsyncCopyFromDevice(d_buffer.DevicePointer, 0, 0, count[0] * 8, streamPrimary.Stream); streamPrimary.Synchronize(); System.Runtime.InteropServices.Marshal.Copy(hAligned_a.PinnedHostPointer, h_a, 0, ((int)count[0] * 8) / sizeof(int)); trims++; timer.Stop(); lastTrimMs = (long)Math.Min(Math.Max((float)timer.ElapsedMilliseconds * 0.9f, 50), 500); currentJob.solvedAt = DateTime.Now; currentJob.trimTime = timer.ElapsedMilliseconds; //Console.WriteLine("Trimmed in {0}ms to {1} edges", timer.ElapsedMilliseconds, count[0]); Logger.Log(LogLevel.Info, string.Format("GPU NV{2}: Trimmed in {0}ms to {1} edges", timer.ElapsedMilliseconds, count[0], deviceID)); FinderBag.RunFinder(TEST, ref trims, count[0], h_a, currentJob, graphSolutions, timer); if (trims % 50 == 0 && TEST) { Console.ForegroundColor = ConsoleColor.Green; Console.WriteLine("SOLS: {0}/{1} - RATE: {2:F1}", solutions, trims, (float)trims / solutions); Console.ResetColor(); } /* * if (TEST) * { * //Console.WriteLine("Trimmed in {0}ms to {1} edges", timer.ElapsedMilliseconds, count[0]); * * CGraph cg = FinderBag.GetFinder(); * cg.SetEdges(h_a, (int)count[0]); * cg.SetHeader(currentJob); * * //currentJob = currentJob.Next(); * * Task.Factory.StartNew(() => * { * Stopwatch sw = new Stopwatch(); * sw.Start(); * * if (count[0] < 131071) * { * try * { * if (findersInFlight++ < 3) * { * Stopwatch cycleTime = new Stopwatch(); * cycleTime.Start(); * cg.FindSolutions(graphSolutions); * cycleTime.Stop(); * AdjustTrims(cycleTime.ElapsedMilliseconds); * //if (graphSolutions.Count > 0) solutions++; * } * else * Logger.Log(LogLevel.Warning, "CPU overloaded!"); * } * catch (Exception ex) * { * Logger.Log(LogLevel.Error, "Cycle finder error" + ex.Message); * } * finally * { * FinderBag.ReturnFinder(cg); * findersInFlight--; * } * } * * sw.Stop(); * * if (trims % 50 == 0) * { * Console.ForegroundColor = ConsoleColor.Green; * Console.WriteLine("SOLS: {0}/{1} - RATE: {2:F1}", solutions, trims, (float)trims/solutions ); * Console.ResetColor(); * } * //Console.WriteLine("Finder completed in {0}ms on {1} edges with {2} solution(s)", sw.ElapsedMilliseconds, count[0], graphSolutions.Count); * //Console.WriteLine("Duped edges: {0}", cg.dupes); * if (!QTEST) * Logger.Log(LogLevel.Info, string.Format("Finder completed in {0}ms on {1} edges with {2} solution(s) and {3} dupes", sw.ElapsedMilliseconds, count[0], graphSolutions.Count, cg.dupes)); * }); * * //h_indexesA = d_indexesA; * //h_indexesB = d_indexesB; * * //var sumA = h_indexesA.Sum(e => e); * //var sumB = h_indexesB.Sum(e => e); * * ; * } * else * { * CGraph cg = FinderBag.GetFinder(); * cg.SetEdges(h_a, (int)count[0]); * cg.SetHeader(currentJob); * * Task.Factory.StartNew(() => * { * if (count[0] < 131071) * { * try * { * if (findersInFlight++ < 3) * { * Stopwatch cycleTime = new Stopwatch(); * cycleTime.Start(); * cg.FindSolutions(graphSolutions); * cycleTime.Stop(); * AdjustTrims(cycleTime.ElapsedMilliseconds); * } * else * Logger.Log(LogLevel.Warning, "CPU overloaded!"); * } * catch (Exception ex) * { * Logger.Log(LogLevel.Warning, "Cycle finder crashed: " + ex.Message); * } * finally * { * FinderBag.ReturnFinder(cg); * findersInFlight--; * } * } * }); * } * */ } catch (Exception ex) { Logger.Log(LogLevel.Error, "Critical error in main cuda loop " + ex.Message); Task.Delay(500).Wait(); break; } } // clean up try { Task.Delay(500).Wait(); Comms.Close(); d_buffer.Dispose(); d_indexesA.Dispose(); d_indexesB.Dispose(); d_aux.Dispose(); streamPrimary.Dispose(); streamSecondary.Dispose(); hAligned_a.Dispose(); if (ctx != null) { ctx.Dispose(); } } catch { } }
protected override void Execute() { if (!initializedFlag) { m_d_Force = new CudaDeviceVariable <float>(Target.MAX_CELLS * 3); m_d_ActiveConnectionsCount = new CudaDeviceVariable <int>(1); m_d_CenterOfGravity = new CudaDeviceVariable <float>(3); //initialize vertices position InitCoordinatesAndVelocity(); initializedFlag = true; ViewMode = ViewMethod.Orbit_3D; float translationValue = 0.50f * (COORDINATES_MAX - COORDINATES_MIN); m_Translation = new Vector3(-translationValue, 0, -translationValue); m_zeroTextureKernel.SetupExecution(TextureHeight * TextureWidth); m_zeroTextureKernel.Run( VBODevicePointer, TextureHeight * TextureWidth ); } if (TRANSLATE_TO_CENTER == Option.True) { m_centerOfGravityKernel.SetupExecution(1); m_centerOfGravityKernel.Run( m_d_PointsCoordinates.DevicePointer, m_d_CenterOfGravity.DevicePointer, Target.ActivityFlag, Target.MAX_CELLS ); float[] m_h_centerOfGravity = new float[3]; m_d_CenterOfGravity.CopyToHost(m_h_centerOfGravity); m_Translation = new Vector3(-m_h_centerOfGravity[0], -m_h_centerOfGravity[1], -m_h_centerOfGravity[2]); m_Connections.Translation = m_Translation; m_ReferenceFields.Translation = m_Translation; m_WinnerOne.Translation = m_Translation; m_WinnerTwo.Translation = m_Translation; } // PHYSICS PART // set forces to zero m_setForcesToZeroKernel.SetupExecution(Target.MAX_CELLS * 3); m_setForcesToZeroKernel.Run( m_d_Force.DevicePointer, Target.MAX_CELLS ); // spring force computation m_springKernel.SetupExecution(Target.MAX_CELLS); m_springKernel.Run( Target.ActivityFlag, Target.ConnectionMatrix, m_d_PointsCoordinates.DevicePointer, SPRING_STRENGTH, m_d_Force.DevicePointer, Target.MAX_CELLS ); // repulsion force computation m_repulsionKernel.SetupExecution(Target.MAX_CELLS); m_repulsionKernel.Run( REPULSION, REPULSION_DISTANCE, m_d_Force.DevicePointer, m_d_PointsCoordinates.DevicePointer, Target.ActivityFlag, Target.MAX_CELLS ); // applying forces to the points m_useForceKernel.SetupExecution(Target.MAX_CELLS * 3); m_useForceKernel.Run( m_d_Force.DevicePointer, FORCE_FACTOR, m_d_PointsCoordinates.DevicePointer, Target.MAX_CELLS ); // GRAPHICS PART // COPY AND PROCESS TEXTURE m_copyAndProcessTextureKernel.SetupExecution(Target.ReferenceVector.Count); m_copyAndProcessTextureKernel.Run( Target.ReferenceVector, Target.INPUT_SIZE, Target.Input.ColumnHint, TextureWidth, VBODevicePointer, Target.MAX_CELLS, Target.ReferenceVector.Count ); // CONNECTIONS m_d_ActiveConnectionsCount.CopyToDevice(0); m_copyConnectionsCoordinatesKernel.SetupExecution(Target.MAX_CELLS * Target.MAX_CELLS); m_copyConnectionsCoordinatesKernel.Run( Target.ConnectionMatrix, m_d_PointsCoordinates.DevicePointer, VertexVBODevicePointer, m_d_ActiveConnectionsCount.DevicePointer, Target.MAX_CELLS ); m_d_ActiveConnectionsCount.CopyToHost(m_h_ActiveConnectionsCount); m_Connections.VertexCount = 2 * m_h_ActiveConnectionsCount[0]; // REFERENCE VECTORS (CUBES) /* * m_computeCubesKernel.m_kernel.SetupExecution(Target.MAX_CELLS * ); * * * .Run( * m_computeCubesKernel, * m_d_PointsCoordinates.DevicePointer, * VertexVBODevicePointer, * m_ReferenceFields.VertexOffset, * TEXTURE_SIDE, * Target.ActivityFlag, * Target.Input.ColumnHint, * Target.MAX_CELLS * ); */ /* * m_cubeCoordinatesKernel.m_kernel.SetupExecution(Target.MAX_CELLS * 72 * ); * * .Run( * m_cubeCoordinatesKernel, * VertexVBODevicePointer, * m_d_CubeOperation.DevicePointer, * m_ReferenceFields.VertexOffset, * Target.ActivityFlag, * TEXTURE_SIDE, * m_d_PointsCoordinates.DevicePointer, * Target.MAX_CELLS * ); * * m_cubeTextureKernel.m_kernel.SetupExecution(Target.MAX_CELLS * 48 * ); * * .Run( * m_cubeTextureKernel, * VertexVBODevicePointer, * m_ReferenceFields.TexCoordOffset, * m_d_CubeTexCoordinates.DevicePointer, * TEXTURE_SIDE, * Target.Input.ColumnHint, * Target.ActivityFlag, * Target.MAX_CELLS * ); */ m_computeCubes2Kernel.SetupExecution(Target.MAX_CELLS * 6); m_computeCubes2Kernel.Run( m_d_PointsCoordinates.DevicePointer, VertexVBODevicePointer, m_ReferenceFields.VertexOffset, TEXTURE_SIDE, m_d_CubeOperation.DevicePointer, m_d_CubeTexCoordinates.DevicePointer, Target.ActivityFlag, (float)Target.Input.ColumnHint, Target.MAX_CELLS ); /* * m_computeQuadsKernel.m_kernel.SetupExecution( * Target.MAX_CELLS * ); * * m_computeQuadsKernel.Run( * m_d_PointsCoordinates.DevicePointer, * VertexVBODevicePointer, * m_ReferenceFields.VertexOffset, * TEXTURE_SIDE, * Target.ActivityFlag, * Target.Input.ColumnHint, * Target.MAX_CELLS * ); */ m_winnersKernel.SetupExecution(Target.MAX_CELLS); m_winnersKernel.Run( Target.WinnerOne, VertexVBODevicePointer, m_WinnerOne.VertexOffset, m_d_PointsCoordinates.DevicePointer, TEXTURE_SIDE, Target.MAX_CELLS ); m_winnersKernel.SetupExecution(Target.MAX_CELLS); m_winnersKernel.Run( Target.WinnerTwo, VertexVBODevicePointer, m_WinnerTwo.VertexOffset, m_d_PointsCoordinates.DevicePointer, TEXTURE_SIDE, Target.MAX_CELLS ); if (ONE_SHOT_RESTART == Option.True) { initializedFlag = false; TriggerReset(); ONE_SHOT_RESTART = Option.False; } }
static void Main(string[] args) { try { if (args.Length > 0) { deviceID = int.Parse(args[0]); } } catch (Exception ex) { Logger.Log(LogLevel.Error, "Device ID parse error"); } try { if (args.Length > 1) { port = int.Parse(args[1]); Comms.ConnectToMaster(port); } else { TEST = true; Logger.CopyToConsole = true; CGraph.ShowCycles = true; } } catch (Exception ex) { Logger.Log(LogLevel.Error, "Master connection error"); } try { if (args.Length > 3) { gpuCount = int.Parse(args[3]); fastCuda = gpuCount <= (Environment.ProcessorCount / 2); if (fastCuda) { Logger.Log(LogLevel.Info, "Using single GPU blocking mode"); } } } catch { } if (TEST) { currentJob = nextJob = new Job() { jobID = 0, k0 = 0xf4956dc403730b01L, k1 = 0xe6d45de39c2a5a3eL, k2 = 0xcbf626a8afee35f6L, k3 = 0x4307b94b1a0c9980L, pre_pow = TestPrePow, timestamp = DateTime.Now }; } else { currentJob = nextJob = new Job() { jobID = 0, k0 = 0xf4956dc403730b01L, k1 = 0xe6d45de39c2a5a3eL, k2 = 0xcbf626a8afee35f6L, k3 = 0x4307b94b1a0c9980L, pre_pow = TestPrePow, timestamp = DateTime.Now }; if (!Comms.IsConnected()) { Console.WriteLine("Master connection failed, aborting"); Logger.Log(LogLevel.Error, "No master connection, exitting!"); return; } if (deviceID < 0) { int devCnt = CudaContext.GetDeviceCount(); GpuDevicesMessage gpum = new GpuDevicesMessage() { devices = new List <GpuDevice>(devCnt) }; for (int i = 0; i < devCnt; i++) { string name = CudaContext.GetDeviceName(i); var info = CudaContext.GetDeviceInfo(i); gpum.devices.Add(new GpuDevice() { deviceID = i, name = name, memory = info.TotalGlobalMemory }); } //Console.WriteLine(devCnt); Comms.gpuMsg = gpum; Comms.SetEvent(); //Console.WriteLine("event fired"); Task.Delay(1000).Wait(); //Console.WriteLine("closing"); Comms.Close(); return; } } try { var assembly = Assembly.GetEntryAssembly(); var resourceStream = assembly.GetManifestResourceStream("CudaSolver.kernel_x64.ptx"); ctx = new CudaContext(deviceID, !fastCuda ? (CUCtxFlags.BlockingSync | CUCtxFlags.MapHost) : CUCtxFlags.MapHost); meanSeedA = ctx.LoadKernelPTX(resourceStream, "FluffySeed2A"); meanSeedA.BlockDimensions = 128; meanSeedA.GridDimensions = 2048; meanSeedA.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanSeedB = ctx.LoadKernelPTX(resourceStream, "FluffySeed2B"); meanSeedB.BlockDimensions = 128; meanSeedB.GridDimensions = 2048; meanSeedB.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanSeedB_4 = ctx.LoadKernelPTX(resourceStream, "FluffySeed2B"); meanSeedB_4.BlockDimensions = 128; meanSeedB_4.GridDimensions = 1024; meanSeedB_4.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanRound = ctx.LoadKernelPTX(resourceStream, "FluffyRound"); meanRound.BlockDimensions = 512; meanRound.GridDimensions = 4096; meanRound.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanRound_2 = ctx.LoadKernelPTX(resourceStream, "FluffyRound"); meanRound_2.BlockDimensions = 512; meanRound_2.GridDimensions = 2048; meanRound_2.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanRoundJoin = ctx.LoadKernelPTX(resourceStream, "FluffyRound_J"); meanRoundJoin.BlockDimensions = 512; meanRoundJoin.GridDimensions = 4096; meanRoundJoin.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanTail = ctx.LoadKernelPTX(resourceStream, "FluffyTail"); meanTail.BlockDimensions = 1024; meanTail.GridDimensions = 4096; meanTail.PreferredSharedMemoryCarveout = CUshared_carveout.MaxL1; meanRecover = ctx.LoadKernelPTX(resourceStream, "FluffyRecovery"); meanRecover.BlockDimensions = 256; meanRecover.GridDimensions = 2048; meanRecover.PreferredSharedMemoryCarveout = CUshared_carveout.MaxL1; } catch (Exception ex) { Logger.Log(LogLevel.Error, "Unable to create kernels: " + ex.Message); Task.Delay(500).Wait(); Comms.Close(); return; } try { d_buffer = new CudaDeviceVariable <ulong>(BUFFER_SIZE_U32); d_bufferMid = new CudaDeviceVariable <ulong>(d_buffer.DevicePointer + (BUFFER_SIZE_B * 8)); d_bufferB = new CudaDeviceVariable <ulong>(d_buffer.DevicePointer + (BUFFER_SIZE_A * 8)); d_indexesA = new CudaDeviceVariable <uint>(INDEX_SIZE * 2); d_indexesB = new CudaDeviceVariable <uint>(INDEX_SIZE * 2); Array.Clear(h_indexesA, 0, h_indexesA.Length); Array.Clear(h_indexesB, 0, h_indexesA.Length); d_indexesA = h_indexesA; d_indexesB = h_indexesB; streamPrimary = new CudaStream(CUStreamFlags.NonBlocking); streamSecondary = new CudaStream(CUStreamFlags.NonBlocking); } catch (Exception ex) { Task.Delay(200).Wait(); Logger.Log(LogLevel.Error, $"Out of video memory! Only {ctx.GetFreeDeviceMemorySize()} free"); Task.Delay(500).Wait(); Comms.Close(); return; } try { AllocateHostMemory(true, ref h_a, ref hAligned_a, 1024 * 1024 * 32); } catch (Exception ex) { Logger.Log(LogLevel.Error, "Unable to create pinned memory."); Task.Delay(500).Wait(); Comms.Close(); return; } int loopCnt = 0; while (!Comms.IsTerminated) { try { if (!TEST && (Comms.nextJob.pre_pow == null || Comms.nextJob.pre_pow == "" || Comms.nextJob.pre_pow == TestPrePow)) { Logger.Log(LogLevel.Info, string.Format("Waiting for job....")); Task.Delay(1000).Wait(); continue; } if (!TEST && ((currentJob.pre_pow != Comms.nextJob.pre_pow) || (currentJob.origin != Comms.nextJob.origin))) { currentJob = Comms.nextJob; currentJob.timestamp = DateTime.Now; } if (!TEST && (currentJob.timestamp.AddMinutes(30) < DateTime.Now) && Comms.lastIncoming.AddMinutes(30) < DateTime.Now) { Logger.Log(LogLevel.Info, string.Format("Job too old...")); Task.Delay(1000).Wait(); continue; } // test runs only once if (TEST && loopCnt++ > 100) { Comms.IsTerminated = true; } Solution s; while (graphSolutions.TryDequeue(out s)) { meanRecover.SetConstantVariable <ulong>("recovery", s.GetUlongEdges()); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanRecover.RunAsync(streamPrimary.Stream, s.job.k0, s.job.k1, s.job.k2, s.job.k3, d_indexesB.DevicePointer); streamPrimary.Synchronize(); s.nonces = new uint[40]; d_indexesB.CopyToHost(s.nonces, 0, 0, 40 * 4); s.nonces = s.nonces.OrderBy(n => n).ToArray(); lock (Comms.graphSolutionsOut) { Comms.graphSolutionsOut.Enqueue(s); } Comms.SetEvent(); } uint[] count; do { if (!TEST && ((currentJob.pre_pow != Comms.nextJob.pre_pow) || (currentJob.origin != Comms.nextJob.origin))) { currentJob = Comms.nextJob; currentJob.timestamp = DateTime.Now; } currentJob = currentJob.Next(); Logger.Log(LogLevel.Debug, string.Format("GPU NV{4}:Trimming #{4}: {0} {1} {2} {3}", currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, currentJob.jobID, deviceID)); timer.Restart(); d_indexesA.MemsetAsync(0, streamPrimary.Stream); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanSeedA.RunAsync(streamPrimary.Stream, currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, d_bufferMid.DevicePointer, d_indexesB.DevicePointer); meanSeedB_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, 0); meanSeedB_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_A * 8) / 4) * 1, d_indexesB.DevicePointer, d_indexesA.DevicePointer, 16); meanSeedB_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_A * 8) / 4) * 2, d_indexesB.DevicePointer, d_indexesA.DevicePointer, 32); meanSeedB_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_A * 8) / 4) * 3, d_indexesB.DevicePointer, d_indexesA.DevicePointer, 48); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanRound_2.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer + ((BUFFER_SIZE_A * 8) / 4) * 2, d_bufferB.DevicePointer, d_indexesA.DevicePointer + (2048 * 4), d_indexesB.DevicePointer + (4096 * 4), DUCK_EDGES_A, DUCK_EDGES_B / 2); meanRound_2.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer - (BUFFER_SIZE_B * 8), d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_A, DUCK_EDGES_B / 2); d_indexesA.MemsetAsync(0, streamPrimary.Stream); meanRoundJoin.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer - (BUFFER_SIZE_B * 8), d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 2); //d_indexesA.MemsetAsync(0, streamPrimary.Stream); //meanRound.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B, DUCK_EDGES_B / 2); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 2); d_indexesA.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 2); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 4); for (int i = 0; i < trimRounds; i++) { d_indexesA.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B / 4, DUCK_EDGES_B / 4); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 4, DUCK_EDGES_B / 4); } d_indexesA.MemsetAsync(0, streamPrimary.Stream); meanTail.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer); ctx.Synchronize(); streamPrimary.Synchronize(); count = new uint[2]; d_indexesA.CopyToHost(count, 0, 0, 8); if (count[0] > 4194304) { // trouble count[0] = 4194304; // log } hAligned_a.AsyncCopyFromDevice(d_buffer.DevicePointer, 0, 0, count[0] * 8, streamPrimary.Stream); streamPrimary.Synchronize(); System.Runtime.InteropServices.Marshal.Copy(hAligned_a.PinnedHostPointer, h_a, 0, ((int)count[0] * 8) / sizeof(int)); timer.Stop(); currentJob.solvedAt = DateTime.Now; currentJob.trimTime = timer.ElapsedMilliseconds; //Console.WriteLine("Trimmed in {0}ms to {1} edges", timer.ElapsedMilliseconds, count[0]); Logger.Log(LogLevel.Info, string.Format("GPU NV{2}: Trimmed in {0}ms to {1} edges, h {3}", timer.ElapsedMilliseconds, count[0], deviceID, currentJob.height)); }while((currentJob.height != Comms.nextJob.height) && (!Comms.IsTerminated) && (!TEST)); if (TEST) { //Console.WriteLine("Trimmed in {0}ms to {1} edges", timer.ElapsedMilliseconds, count[0]); CGraph cg = FinderBag.GetFinder(); if (cg == null) { continue; } cg.SetEdges(h_a, (int)count[0]); cg.SetHeader(currentJob); //currentJob = currentJob.Next(); Task.Factory.StartNew(() => { Stopwatch sw = new Stopwatch(); sw.Start(); if (count[0] < 200000) { try { if (findersInFlight++ < 3) { Stopwatch cycleTime = new Stopwatch(); cycleTime.Start(); cg.FindSolutions(graphSolutions); cycleTime.Stop(); AdjustTrims(cycleTime.ElapsedMilliseconds); if (graphSolutions.Count > 0) { solutions++; } } else { Logger.Log(LogLevel.Warning, "CPU overloaded!"); } } catch (Exception ex) { Logger.Log(LogLevel.Error, "Cycle finder error" + ex.Message); } finally { findersInFlight--; FinderBag.ReturnFinder(cg); } } sw.Stop(); if (++trims % 50 == 0) { Console.ForegroundColor = ConsoleColor.Green; Console.WriteLine("SOLS: {0}/{1} - RATE: {2:F1}", solutions, trims, (float)trims / solutions); Console.ResetColor(); } //Console.WriteLine("Finder completed in {0}ms on {1} edges with {2} solution(s)", sw.ElapsedMilliseconds, count[0], graphSolutions.Count); //Console.WriteLine("Duped edges: {0}", cg.dupes); Logger.Log(LogLevel.Info, string.Format("Finder completed in {0}ms on {1} edges with {2} solution(s) and {3} dupes", sw.ElapsedMilliseconds, count[0], graphSolutions.Count, cg.dupes)); }); //h_indexesA = d_indexesA; //h_indexesB = d_indexesB; //var sumA = h_indexesA.Sum(e => e); //var sumB = h_indexesB.Sum(e => e); ; } else { CGraph cg = FinderBag.GetFinder(); cg.SetEdges(h_a, (int)count[0]); cg.SetHeader(currentJob); Task.Factory.StartNew(() => { if (count[0] < 200000) { try { if (findersInFlight++ < 3) { Stopwatch cycleTime = new Stopwatch(); cycleTime.Start(); cg.FindSolutions(graphSolutions); cycleTime.Stop(); AdjustTrims(cycleTime.ElapsedMilliseconds); if (graphSolutions.Count > 0) { solutions++; } } else { Logger.Log(LogLevel.Warning, "CPU overloaded!"); } } catch (Exception ex) { Logger.Log(LogLevel.Error, "Cycle finder crashed: " + ex.Message); } finally { findersInFlight--; FinderBag.ReturnFinder(cg); } } }); } } catch (Exception ex) { Logger.Log(LogLevel.Error, "Critical error in main cuda loop " + ex.Message); Task.Delay(5000).Wait(); } } // clean up try { Task.Delay(500).Wait(); Comms.Close(); d_buffer.Dispose(); d_indexesA.Dispose(); d_indexesB.Dispose(); streamPrimary.Dispose(); streamSecondary.Dispose(); hAligned_a.Dispose(); if (ctx != null) { ctx.Dispose(); } } catch { } }
static void Main(string[] args) { string filename = "vectorAdd_kernel.cu"; //we assume the file is in the same folder... string fileToCompile = File.ReadAllText(filename); CudaRuntimeCompiler rtc = new CudaRuntimeCompiler(fileToCompile, "vectorAdd_kernel"); rtc.Compile(args); string log = rtc.GetLogAsString(); Console.WriteLine(log); byte[] ptx = rtc.GetPTX(); rtc.Dispose(); CudaContext ctx = new CudaContext(0); CudaKernel vectorAdd = ctx.LoadKernelPTX(ptx, "vectorAdd"); // Print the vector length to be used, and compute its size int numElements = 50000; SizeT size = numElements * sizeof(float); Console.WriteLine("[Vector addition of {0} elements]", numElements); // Allocate the host input vector A float[] h_A = new float[numElements]; // Allocate the host input vector B float[] h_B = new float[numElements]; // Allocate the host output vector C float[] h_C = new float[numElements]; Random rand = new Random(0); // Initialize the host input vectors for (int i = 0; i < numElements; ++i) { h_A[i] = (float)rand.NextDouble(); h_B[i] = (float)rand.NextDouble(); } Console.WriteLine("Allocate and copy input data from the host memory to the CUDA device\n"); // Allocate the device input vector A and copy to device CudaDeviceVariable<float> d_A = h_A; // Allocate the device input vector B and copy to device CudaDeviceVariable<float> d_B = h_B; // Allocate the device output vector C CudaDeviceVariable<float> d_C = new CudaDeviceVariable<float>(numElements); // Launch the Vector Add CUDA Kernel int threadsPerBlock = 256; int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; Console.WriteLine("CUDA kernel launch with {0} blocks of {1} threads\n", blocksPerGrid, threadsPerBlock); vectorAdd.BlockDimensions = new dim3(threadsPerBlock,1, 1); vectorAdd.GridDimensions = new dim3(blocksPerGrid, 1, 1); vectorAdd.Run(d_A.DevicePointer, d_B.DevicePointer, d_C.DevicePointer, numElements); // Copy the device result vector in device memory to the host result vector // in host memory. Console.WriteLine("Copy output data from the CUDA device to the host memory\n"); d_C.CopyToHost(h_C); // Verify that the result vector is correct for (int i = 0; i < numElements; ++i) { if (Math.Abs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) { Console.WriteLine("Result verification failed at element {0}!\n", i); return; } } Console.WriteLine("Test PASSED\n"); // Free device global memory d_A.Dispose(); d_B.Dispose(); d_C.Dispose(); ctx.Dispose(); Console.WriteLine("Done\n"); }