private static void SimInit() { Console.WriteLine("Deserializing class"); CudafyModule km = CudafyModule.TryDeserialize(typeof(Program).Name); Console.WriteLine("Got: " + km); var tvc = km == null ? false : km.TryVerifyChecksums(); Console.WriteLine("TVC: " + tvc); if (km == null || !tvc) { Console.WriteLine("Serializing"); km = CudafyTranslator.Cudafy(typeof(Program)); km.Serialize(); } Console.WriteLine("Requesting device"); _gpu = CudafyHost.GetDevice(eGPUType.Cuda); if (_gpu == null) { _gpu = CudafyHost.GetDevice(eGPUType.OpenCL); if (_gpu == null) { _gpu = CudafyHost.GetDevice(eGPUType.Emulator); if (_gpu == null) { Console.WriteLine("No device found!"); return; } } else Console.WriteLine("Got OpenCL Device: " + _gpu.DeviceId); } else Console.WriteLine("Got CUDA Device: " + _gpu.DeviceId); Console.WriteLine("Loading module"); _gpu.LoadModule(km); }
/// <summary> /// Removes the specified GPGPU from the cache. /// </summary> /// <param name="gpu">The gpu.</param> /// <returns>True if gpu was removed, else false.</returns> public static bool RemoveDevice(GPGPU gpu) { List <GPGPU> gpus = GPGPUs.Values.Where(v => v == gpu).ToList(); bool removed = gpus.Count > 0; List <string> names = new List <string>(); for (int i = 0; i < gpus.Count; i++) { gpus[i].Dispose(); foreach (var v in GPGPUs) { if (v.Value == gpu) { names.Add(v.Key); } } } foreach (var s in names.Distinct()) { GPGPUs.Remove(s); } return(removed); }
public override bool CanAccessPeer(GPGPU peer) { lock (_peerAccessLock) { return(peer != this && peer is EmulatedGPU); } }
static Bitmap Render(GPGPU gpu, int frameNum) { uint[,] deviceImage = gpu.Allocate <uint>(width, height); float[] pX1_gpu = gpu.CopyToDevice <float>(pX1); float[] pY1_gpu = gpu.CopyToDevice <float>(pY1); float[] pZ1_gpu = gpu.CopyToDevice <float>(pZ1); float[] colorPosition_gpu = gpu.CopyToDevice <float>(colorPosition); float[] currentTime_gpu = gpu.CopyToDevice <float>(currentTime); dim3 threadsPerBlock = new dim3(8, 8); dim3 numBlocks = new dim3(width / threadsPerBlock.x, height / threadsPerBlock.y); gpu.Launch(numBlocks, threadsPerBlock).renderKernel(deviceImage, pX1_gpu, pY1_gpu, pZ1_gpu, colorPosition_gpu, currentTime_gpu); uint[,] finalImage = new uint[width, height]; gpu.CopyFromDevice <uint>(deviceImage, finalImage); gpu.Free(deviceImage); gpu.Free(pX1_gpu); gpu.Free(pY1_gpu); gpu.Free(pZ1_gpu); gpu.Free(colorPosition_gpu); gpu.Free(currentTime_gpu); GCHandle pixels = GCHandle.Alloc(finalImage, GCHandleType.Pinned); Bitmap bmp = new Bitmap(width, height, width * sizeof(int), PixelFormat.Format32bppRgb, pixels.AddrOfPinnedObject()); bmp.Save("spring" + frameNum + ".png"); pixels.Free(); return(bmp); }
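The renderKernel dispatched above is not included in this listing. In Cudafy, such a kernel is a static method marked with [Cudafy] whose first parameter is a GThread and whose remaining parameters mirror the launch arguments. A minimal placeholder consistent with that call (the shading logic here is purely illustrative, not the author's kernel) could look like this:

using Cudafy;

public static class ParticleKernels
{
    // Skeleton matching gpu.Launch(numBlocks, threadsPerBlock).renderKernel(image, pX1, pY1, pZ1, colorPosition, currentTime).
    [Cudafy]
    public static void renderKernel(GThread thread, uint[,] image, float[] pX1, float[] pY1, float[] pZ1, float[] colorPosition, float[] currentTime)
    {
        // One thread per pixel: 8x8 threads per block, (width/8) x (height/8) blocks.
        int x = thread.blockIdx.x * thread.blockDim.x + thread.threadIdx.x;
        int y = thread.blockIdx.y * thread.blockDim.y + thread.threadIdx.y;
        if (x >= image.GetLength(0) || y >= image.GetLength(1))
            return;
        // Placeholder colour; the real kernel would shade the particles at (pX1, pY1, pZ1) here.
        image[x, y] = 0xFF000000;
    }
}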
public void InitGPU() { // Workaround for a bug in Cudafy when resolving the Program Files path. if (Environment.Is64BitOperatingSystem) { Environment.SetEnvironmentVariable("ProgramFiles", "C:\\Program Files\\"); } if (Gpu == null) { Gpu = CudafyHost.GetDevice(_gpuType, 0); //Blas = GPGPUBLAS.Create(Gpu); if (_gpuType == eGPUType.Cuda) { Blas = new SharpBLAS(Gpu); Rand = GPGPURAND.Create(Gpu, curandRngType.CURAND_RNG_PSEUDO_DEFAULT); Rand.SetPseudoRandomGeneratorSeed((ulong)RandomHelpers.Next(9999)); } CudafyTranslator.GenerateDebug = true; Debug.WriteLine("CUDA workdir = " + CudafyTranslator.WorkingDirectory); Console.WriteLine("Recompile module"); CudafyTranslator.Language = eLanguage.Cuda; var km = CudafyTranslator.Cudafy(); km.Serialize(); Gpu.LoadModule(km); } }
static void Main(string[] args) { CudafyModule km = CudafyTranslator.Cudafy(); GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId); gpu.LoadModule(km); int numFrames = numberOfSeconds * framesPerSecond; InitializeParticles(); File.WriteAllText("length.txt", numFrames.ToString()); for (int i = 0; i < numFrames; i++) { DateTime frameStart = DateTime.Now; Simulate(gpu); Bitmap frame = Render(gpu, i); TimeSpan frameTime = DateTime.Now - frameStart; Console.WriteLine("Frame " + i + " complete. Time: " + frameTime.TotalMilliseconds + "ms"); } }
public void SetUp() { //CudafyModes.Architecture = eArchitecture.sm_30; _gpu = CudafyHost.GetDevice(eArchitecture.sm_30, CudafyModes.DeviceId); Assert.IsFalse(_gpu is OpenCLDevice, "OpenCL devices are not supported."); _cm = CudafyModule.TryDeserialize(); if (_cm == null || !_cm.TryVerifyChecksums()) { _cm = CudafyTranslator.Cudafy(eArchitecture.sm_30); Console.WriteLine(_cm.CompilerOutput); _cm.TrySerialize(); } _gpu.LoadModule(_cm); inputIntArray = new int[] { 0x17, 0x01, 0x7f, 0xd1, 0xfe, 0x23, 0x2c, 0xa0, 0x00, 0xcf, 0xaa, 0x7a, 0x35, 0xf4, 0x04, 0xbc, 0xe9, 0x6d, 0xb2, 0x55, 0xb0, 0xc8, 0x10, 0x49, 0x76, 0x17, 0x92, 0xab, 0xf3, 0xf2, 0xab, 0xcb }; // arbitrary values d_inputIntArray = _gpu.CopyToDevice(inputIntArray); d_outputIntArray = _gpu.Allocate <int>(WARP_SIZE); gpuIntResult = new int[WARP_SIZE]; cpuIntResult = new int[WARP_SIZE]; inputFloatArray = new float[] { 1.7f, -37.03f, 2147.6436f, -0.1f, 7.7f, 99.99f, -809.142f, -0.1115f, 1.0f, 2.0f, 3.0f, 5.0f, 7.5f, 0.1001f, 11.119f, -9.0f, 7749.9847f, -860249.118843f, 0.0f, -2727745.586215f, 12.0f, -11.0f, 77.77f, 22.0f, 377.1112f, -377.1112f, 0.12345f, -0.12345f, 0.11111f, -0.11111f, 700000f, -14f }; // arbitrary values d_inputFloatArray = _gpu.CopyToDevice(inputFloatArray); d_outputFloatArray = _gpu.Allocate <float>(WARP_SIZE); gpuFloatResult = new float[WARP_SIZE]; cpuFloatResult = new float[WARP_SIZE]; }
public static float[] CallGPU() { CudafyModes.Target = eGPUType.OpenCL; CudafyModes.DeviceId = 0; CudafyTranslator.Language = eLanguage.OpenCL; CudafyModule km = CudafyTranslator.Cudafy(ePlatform.Auto, eArchitecture.OpenCL, typeof(GPU)); GPGPU gpu = CudafyHost.GetDevice(eGPUType.OpenCL, 0); gpu.LoadModule(km); km.Serialize(); float[] input = Utils.GenerateRandomVector(); float[,,] NN = Utils.GenerateRandomMatrix().AsSingleDimension(); float[] output = new float[Utils.N]; Stopwatch gpuSW = new Stopwatch(); gpuSW.Start(); float[] dev_output = gpu.Allocate <float>(output); float[] dev_input = gpu.CopyToDevice(input); float[,,] dev_NN = gpu.CopyToDevice(NN); gpu.Launch(Utils.GRID_SIZE, Utils.BLOCK_SIZE).CalculateNeuralNetwork(dev_input, dev_NN, dev_output); gpu.CopyFromDevice(dev_output, output); gpu.FreeAll(); gpuSW.Stop(); Console.WriteLine("GPU: " + gpuSW.ElapsedMilliseconds); return(output); }
public static void MyFirstBlasEmulatorTest() { Console.WriteLine("MyTest()"); // Get GPU device CudafyModes.Target = eGPUType.Emulator; GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target); // Create GPGPUBLAS (CUBLAS Wrapper) using (GPGPUBLAS blas = GPGPUBLAS.Create(gpu)) { const int N = 100; float[] a = new float[N]; float[] b = new float[N]; float[] c = new float[N]; float alpha = -1; float beta = 0; float[] device_a = gpu.CopyToDevice(a); float[] device_b = gpu.CopyToDevice(b); float[] device_c = gpu.CopyToDevice(c); int m = 10; int n = 10; int k = 10; cublasOperation Op = cublasOperation.N; blas.GEMM(m, k, n, alpha, device_a, device_b, beta, device_c, Op); gpu.CopyFromDevice <float>(device_c, c); } }
public static void Execute() { CudafyModule km = CudafyTranslator.Cudafy(typeof(ParamsStruct), typeof(ImpliedVolatile)); _gpu = CudafyHost.GetDevice(CudafyModes.Target); _gpu.LoadModule(km); ParamsStruct[] host_par = new ParamsStruct[1]; ParamsStruct[] result = new ParamsStruct[1]; host_par[0].OP = 96.95; host_par[0].Price = 1332.24; host_par[0].Strike = 1235; host_par[0].TD = 31; host_par[0].R = 0.0001355; host_par[0].Q = 0.0166; host_par[0].N = 100;// 1000; host_par[0].kind = 1; ParamsStruct[] dev_par = _gpu.CopyToDevice(host_par); float[] PA = _gpu.Allocate<float>(1001); _gpu.Launch(1,1, "impliedVolatile", dev_par, PA); _gpu.CopyFromDevice(dev_par, 0, result, 0, 1); Console.WriteLine("I={0}, B={1}", result[0].i, result[0].B); //Console.ReadKey(); }
public void SetUp() { _gpu = CudafyHost.CreateDevice(CudafyModes.Target); _hostInput = new double[N * BATCH]; _hostInputCplx = new ComplexD[N * BATCH]; _hostOutput = new double[N * BATCH]; _hostOutputCplx = new ComplexD[N * BATCH]; _devInput = _gpu.Allocate(_hostInput); _devInputCplx = _gpu.Allocate(_hostInputCplx); _devInter = _gpu.Allocate<double>(N * 2 * BATCH); _devInterCplx = _gpu.Allocate<ComplexD>(N * BATCH); _devOutput = _gpu.Allocate(_hostOutput); _devOutputCplx = _gpu.Allocate(_hostOutputCplx); _fft = GPGPUFFT.Create(_gpu); for (int b = 0; b < BATCH; b++) { for (int i = 0; i < N; i++) { ComplexD cf = new ComplexD(); cf.x = (double)((10.0F * Math.Sin(100 * 2 * Math.PI * i / N * Math.PI / 180))); cf.y = (double)((10.0F * Math.Sin(200 * 2 * Math.PI * i / N * Math.PI / 180))); _hostInput[i + b * N] = cf.x; _hostInputCplx[i + b * N] = cf; } } }
public Layer(GPUModule gpuModule, Layer previousLayer = null,int size = 0, string id = "", int miniBatchSize = Int32.MinValue) { if (previousLayer != null) MinibatchSize = previousLayer.MinibatchSize; if (miniBatchSize != Int32.MinValue) MinibatchSize = miniBatchSize; LayerIndex = IdCounter++; Id = id; if (String.IsNullOrEmpty(Id)) { Id = "ID" + LayerIndex.ToString().PadLeft(2, '0'); } _gpuModule = gpuModule; _gpu = _gpuModule.Gpu; PreviousLayer = previousLayer; if (size != 0) { this.Size = size; AddArray(ArrayName.Outputs, MinibatchSize, this.Size); } if ((previousLayer != null) && (size > 0)) { AddArray(ArrayName.Gradients, MinibatchSize, size); } }
public static void primaGPU() { CudafyModule modul_kernel = CudafyTranslator.Cudafy(); GPGPU vga = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId); vga.LoadModule(modul_kernel); Stopwatch waktu = new Stopwatch(); waktu.Start(); int[] list_cpu = new int[KONSTANTA_THREAD]; int[] list_cpy = new int[KONSTANTA_THREAD]; int[] list = vga.Allocate <int>(KONSTANTA_THREAD); vga.Launch(KONSTANTA_THREAD, 1).ModulAtomic(list); vga.CopyFromDevice(list, list_cpy); vga.FreeAll(); int index = 0; for (int z = 0; z < list_cpy.Length; z++) { if (list_cpy[z] != -1) { list_cpu[index] = list_cpy[z]; //Console.WriteLine(list_cpu[index]); index++; } } waktu.Stop(); TimeSpan ts = waktu.Elapsed; String total = ts.TotalSeconds.ToString(); Console.WriteLine("Total GPU ------ {0} seconds", total); }
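The ModulAtomic kernel referenced above is not part of this listing. Judging from the host code, which launches one single-thread block per candidate number and then discards every element equal to -1, a sketch of a matching [Cudafy] kernel might look like the following (the trial-division primality test is an assumption inferred from the method name primaGPU):

using Cudafy;

public static class PrimaKernels
{
    // Launched as vga.Launch(KONSTANTA_THREAD, 1).ModulAtomic(list): one block per candidate number.
    [Cudafy]
    public static void ModulAtomic(GThread thread, int[] list)
    {
        int n = thread.blockIdx.x;
        int isPrime = n > 1 ? 1 : 0;
        for (int d = 2; d * d <= n; d++)
        {
            if (n % d == 0) { isPrime = 0; break; }
        }
        // The host loop keeps only entries different from -1.
        if (isPrime == 1)
            list[n] = n;
        else
            list[n] = -1;
    }
}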
public static void cudaTranspose(ref MathNet.Numerics.LinearAlgebra.Double.DenseMatrix dm) { GPGPU gpu = CudafyHost.GetDevice(eGPUType.Cuda); GPGPUBLAS blas = GPGPUBLAS.Create(gpu); int cols = dm.ColumnCount, rows = dm.RowCount; int restRows = rows - cols; //double[] a = dm.Storage.ToColumnMajorArray(); double[] a = dm.SubMatrix(0, cols, 0, cols).Storage.ToColumnMajorArray(); double[] b = dm.SubMatrix(cols, restRows, 0, cols).Storage.ToColumnMajorArray(); dm = null; double[] a_d = gpu.CopyToDevice <double>(a); a = null; double[] c_d = gpu.Allocate <double>(cols * cols); double[] x_d = gpu.CopyToDevice <double>(new double[] { 1 }); blas.GEMV(cols, cols, 1, c_d, x_d, 0, x_d, Cudafy.Maths.BLAS.Types.cublasOperation.T); a = new double[cols * rows]; gpu.CopyFromDevice <double>(c_d, 0, a, 0, cols * cols); gpu.FreeAll(); a_d = gpu.CopyToDevice <double>(b); b = null; c_d = gpu.Allocate <double>(restRows * cols); x_d = gpu.CopyToDevice <double>(new double[] { 1 }); blas.GEMV(restRows, cols, 1, c_d, x_d, 0, x_d, Cudafy.Maths.BLAS.Types.cublasOperation.T); gpu.CopyFromDevice <double>(c_d, 0, a, cols * cols, restRows * cols); gpu.FreeAll(); dm = new MathNet.Numerics.LinearAlgebra.Double.DenseMatrix(cols, rows, a); }
public static void eksekusi() { CudafyModule kernel_modul = CudafyTranslator.Cudafy(); GPGPU vga = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId); vga.LoadModule(kernel_modul); Stopwatch waktu = new Stopwatch(); waktu.Start(); int[] array_vga = vga.Allocate <int>(KONSTANTA_THREAD); int[] array_hasil = new int[KONSTANTA_THREAD]; //long[] matriks1 = vga.Allocate<long>(KONSTANTA_THREAD); //long[] matriks2 = vga.Allocate<long>(KONSTANTA_THREAD); //long[] matriks3 = vga.Allocate<long>(KONSTANTA_THREAD); vga.Launch(KONSTANTA_THREAD, 1).fungsiAtomic(array_vga); vga.CopyFromDevice(array_vga, array_hasil); vga.FreeAll(); //for (int z = 0; z < array_hasil.Length; z++) //{ // Console.WriteLine("Extracted result----" + array_hasil[z]); //} waktu.Stop(); TimeSpan ts = waktu.Elapsed; String total = ts.TotalMilliseconds.ToString(); Console.WriteLine("Total VGA ------ > " + total + " ms"); }
public static bool TestGpuDoublePrecision(int DeviceId) { if (DeviceId >= CudafyHost.GetDeviceCount(eGPUType.OpenCL)) { return(false); } try { CudafyModes.Target = eGPUType.OpenCL; CudafyTranslator.Language = eLanguage.OpenCL; CudafyModule km = CudafyTranslator.Cudafy(); GPGPU gpu = CudafyHost.GetDevice(eGPUType.OpenCL, DeviceId); gpu.LoadModule(km); double c; double[] dev_c = gpu.Allocate <double>(); gpu.Launch().add_double(2.5d, 7.5d, dev_c); gpu.CopyFromDevice(dev_c, out c); gpu.Free(dev_c); return(c == 10.0d); } catch { return(false); } }
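The add_double kernel used by TestGpuDoublePrecision is not shown here. Given that the test passes 2.5 and 7.5 and expects 10.0 back through a one-element device buffer, a minimal sketch of a matching kernel would be:

using Cudafy;

public static class DoublePrecisionKernels
{
    // Matches gpu.Launch().add_double(2.5d, 7.5d, dev_c): a single thread, two scalar inputs, one-element output.
    [Cudafy]
    public static void add_double(GThread thread, double a, double b, double[] c)
    {
        c[0] = a + b;
    }
}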
public static int MA(int[,] A, int[,] B, int[,] C, GPGPU gpu, int maxThreadBlockSize, int Size) { // allocate the memory on the GPU int[,] GPU_A = gpu.Allocate<int>(A); int[,] GPU_B = gpu.Allocate<int>(B); int[,] GPU_C = gpu.Allocate<int>(C); // copy the arrays 'a' and 'b' to the GPU gpu.CopyToDevice(A, GPU_A); gpu.CopyToDevice(B, GPU_B); dim3 threadsPerBlock; // find the number of threads and blocks if (Size < maxThreadBlockSize) { threadsPerBlock = new dim3(Size, Size); } else { threadsPerBlock = new dim3(maxThreadBlockSize, maxThreadBlockSize); } dim3 block = new dim3(Size, Size); // launch GPU_MA gpu.Launch(block, threadsPerBlock, "GPU_MA", GPU_A, GPU_B, GPU_C, Size); // copy the array 'c' back from the GPU to the CPU gpu.CopyFromDevice(GPU_C, C); gpu.Free(GPU_A); gpu.Free(GPU_B); gpu.Free(GPU_C); return 1; }
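GPU_MA itself is not part of this listing. Based on the name and the way MA is called (square int matrices A and B in, C out), a sketch of an element-wise matrix-addition kernel consistent with that launch might be the following; the bounds check also guards against the oversized Size x Size grid used above. The addition body is an assumption, not the author's code:

using Cudafy;

public static class MatrixKernels
{
    // Matches gpu.Launch(block, threadsPerBlock, "GPU_MA", GPU_A, GPU_B, GPU_C, Size).
    [Cudafy]
    public static void GPU_MA(GThread thread, int[,] a, int[,] b, int[,] c, int size)
    {
        int col = thread.blockIdx.x * thread.blockDim.x + thread.threadIdx.x;
        int row = thread.blockIdx.y * thread.blockDim.y + thread.threadIdx.y;
        if (row < size && col < size)
        {
            c[row, col] = a[row, col] + b[row, col];
        }
    }
}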
public void Test_TwoThreadCopy() { _gpu = CudafyHost.GetDevice(eGPUType.Cuda); _gpuuintBufferIn3 = _gpu.Allocate(_uintBufferIn1); _gpuuintBufferIn4 = _gpu.Allocate(_uintBufferIn1); _gpu.EnableMultithreading(); bool j1 = false; bool j2 = false; for (int i = 0; i < 10; i++) { Console.WriteLine(i); SetInputs(); ClearOutputs(); Thread t1 = new Thread(Test_TwoThreadCopy_Thread1); Thread t2 = new Thread(Test_TwoThreadCopy_Thread2); t1.Start(); t2.Start(); j1 = t1.Join(10000); j2 = t2.Join(10000); if (!j1 || !j2) { break; } } _gpu.DisableMultithreading(); _gpu.FreeAll(); Assert.IsTrue(j1); Assert.IsTrue(j2); }
public void ExeTestKernel() { GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target, 0); eArchitecture arch = gpu.GetArchitecture(); CudafyModule km = CudafyTranslator.Cudafy(arch); gpu.LoadModule(km); int[] host_results = new int[N]; // Either assign a new block of memory to hold results on device var dev_results = gpu.Allocate <int>(N); // Or fill your array with values first and then for (int i = 0; i < N; i++) { host_results[i] = i * 3; } // Copy array with ints to device var dev_filled_results = gpu.CopyToDevice(host_results); // 64*16 = 1024 threads per block (which is max for sm_30) dim3 threadsPerBlock = new dim3(64, 16); // 8*8 = 64 blocks per grid , just for show so you get varying numbers dim3 blocksPerGrid = new dim3(8, 8); //var threadsPerBlock = 1024; // this will only give you blockDim.x = 1024, .y = 0, .z = 0 //var blocksPerGrid = 1; // just for show gpu.Launch(blocksPerGrid, threadsPerBlock, "GenerateRipples", dev_results, dev_filled_results); gpu.CopyFromDevice(dev_results, host_results); }
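Only the signature of GenerateRipples can be inferred from the launch above; the kernel itself is not shown. A placeholder consistent with the call (the body here is purely illustrative and stands in for whatever ripple computation the real kernel performs) would be:

using Cudafy;

public static class RippleKernels
{
    // Signature implied by gpu.Launch(blocksPerGrid, threadsPerBlock, "GenerateRipples", dev_results, dev_filled_results).
    [Cudafy]
    public static void GenerateRipples(GThread thread, int[] results, int[] filledResults)
    {
        // Flatten the 2D grid/block indices into one linear element index.
        int x = thread.blockIdx.x * thread.blockDim.x + thread.threadIdx.x;
        int y = thread.blockIdx.y * thread.blockDim.y + thread.threadIdx.y;
        int width = thread.blockDim.x * thread.gridDim.x;
        int tid = x + y * width;
        if (tid < results.Length)
        {
            // Illustrative only; the real kernel derives a ripple pattern from the inputs.
            results[tid] = filledResults[tid];
        }
    }
}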
public static uint[] Evaluate(ulong[] hands, int numCards) { // Translates this class to CUDA C and then compiles CudafyModule km = CudafyTranslator.Cudafy();//eArchitecture.sm_20); // Get the first GPU and load the module GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId); gpu.LoadModule(km); int blockSize = 256; int blockx = hands.Length / blockSize; if (hands.Length % blockSize != 0) { blockx++; } ulong[] dev_hands = gpu.Allocate <ulong>(hands.Length); uint[] dev_ranks = gpu.Allocate <uint>(hands.Length); gpu.CopyToDevice(hands, dev_hands); gpu.StartTimer(); gpu.Launch(blockx, blockSize).evaluate(dev_hands, numCards, hands.Length, dev_ranks); var ts = gpu.StopTimer(); uint[] toReturn = new uint[hands.Length]; gpu.CopyFromDevice(dev_ranks, toReturn); return(toReturn); }
public static void ClearGpuArray(GPGPU gpu, int[] gpuArray, int size) { var array = new int[size]; Array.Clear(array, 0, array.Length); gpu.CopyToDevice(array, gpuArray); }
public void SetUp() { _gpu = CudafyHost.GetDevice(); _sparse = GPGPUSPARSE.Create(_gpu); _blas = GPGPUBLAS.Create(_gpu); _solver = new Solver(_gpu, _blas, _sparse); }
/// <summary> /// Copies any reference type fields (e.g. arrays) of the object to the device. /// </summary> private static void CopyReferenceTypeFieldsToDevice <T>(GPGPU gpu, T hostObject) { var fields = DeviceClassHelper.GetFieldsStandardLayout(hostObject.GetType()); foreach (FieldInfo field in fields) { object fieldValue = field.GetValue(hostObject); // Ignore if CudafyIgnore if (field.GetCustomAttributes(typeof(CudafyIgnoreAttribute), true).Count() > 0) { continue; } // Only copy field to the device if this is not already done. if (field.FieldType.IsArray && !deviceObjectFromHostObject[gpu].ContainsKey(fieldValue)) { // If the elements of the array are value types, then we make a deep copy. Otherwise, we create an array of // pointers to the objects. if (field.FieldType.GetElementType().IsValueType) { Array hostArray = (Array)fieldValue; CopyArrayToDevice(gpu, hostArray); } else { CopyArrayOfReferenceTypeToDevice(gpu, (Array)fieldValue); } } else if (!field.FieldType.IsValueType && !deviceObjectFromHostObject[gpu].ContainsKey(fieldValue)) { CreateDeviceObject(gpu, fieldValue); } } }
private static void AssignPointerFields(GPGPU gpu, object hostObject, object deviceObject, List <FieldMapping> pointerFields) { foreach (FieldMapping mapping in pointerFields) { object fieldValue = mapping.HostObjectField.GetValue(hostObject); // Get the IntPtr to the device memory for the array. var devicePointer = TryGetDeviceMemoryFromHostObject(gpu, fieldValue); if (devicePointer == null) { throw new ArgumentException("No device memory allocated for field " + mapping.Name); } // The device object contains this pointer. mapping.DeviceObjectField.SetValue(deviceObject, devicePointer.Pointer); // If the field is an array then set the dimension fields too. if (mapping is ArrayFieldMapping) { ArrayFieldMapping arrayFieldMapping = (ArrayFieldMapping)mapping; Array array = fieldValue as Array; for (int i = 0; i < arrayFieldMapping.ArrayRank; ++i) { arrayFieldMapping.DeviceObjectDimensionFields[i].SetValue(deviceObject, array.GetLength(i)); } } } }
public static void Example2(GPGPU gpu) { ArrayView view1 = new ArrayView(); ArrayView view2 = new ArrayView(); float[] data = Enumerable.Range(0, 1000).Select(t => (float)t).ToArray(); // Two views of the array, simply applying an offset to the array; could slice instead for example. view1.CreateView(data, 100); view2.CreateView(data, 200); for (int i = 0; i < 1000; ++i) { data[i] = data[i] * 10f; } // Should copy the 'large' array to the device only once; this is referenced by each ArrayView instance. var dev_view1 = DeviceClassHelper.CreateDeviceObject(gpu, view1); var dev_view2 = DeviceClassHelper.CreateDeviceObject(gpu, view2); var dev_result = gpu.Allocate <float>(5); var hostResult = new float[5]; gpu.Launch(1, 1).Test2(dev_view1, dev_view2, dev_result); gpu.CopyFromDevice(dev_result, hostResult); bool pass = (hostResult[0] == 1050f && hostResult[1] == 7f); Console.WriteLine(pass ? "Pass" : "Fail"); }
public static void Execute() { CudafyModule km = CudafyModule.TryDeserialize(); if (km == null || !km.TryVerifyChecksums()) { km = CudafyTranslator.Cudafy(typeof(Generic <ushort, ushort>), typeof(SimpleGeneric)); km.Serialize(); } GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target); gpu.LoadModule(km); var input = new Generic <ushort, ushort>(); input.A = 187; int[] devoutput = gpu.Allocate <int>(1); gpu.Launch(1, 1, "Kernel", input, devoutput); int output; gpu.CopyFromDevice(devoutput, out output); Console.WriteLine("Simple Generic: " + ((output == 1) ? "PASSED" : "FAILED")); }
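The "Kernel" method loaded from the SimpleGeneric type is not reproduced here. Given that the host sets input.A = 187 and treats an output of 1 as a pass, a sketch of a matching kernel could be the following (the exact check is an assumption; Generic<ushort, ushort> is the struct type from the snippet above):

using Cudafy;

public static class SimpleGenericKernels
{
    // Matches gpu.Launch(1, 1, "Kernel", input, devoutput) with a Generic<ushort, ushort> argument.
    [Cudafy]
    public static void Kernel(GThread thread, Generic<ushort, ushort> input, int[] output)
    {
        // Report 1 when the struct arrived on the device intact.
        output[0] = 0;
        if (input.A == 187)
            output[0] = 1;
    }
}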
public void Test_copyOnHost() { int len = 35687; int[] bufa = new int[len]; int[] bufb = new int[len]; Random r = new Random(); for (int i = 0; i < len; i++) { bufa[i] = r.Next() + 1; } IntPtr ha = _gpu.HostAllocate <int>(len); ha.Write(bufa, 0, 0, len); IntPtr hb = _gpu.HostAllocate <int>(len); GPGPU.CopyMemory(hb, ha, (uint)len * sizeof(int)); hb.Read(bufb, 0, 0, len); for (int i = 0; i < len; i++) { Assert.True(bufa[i] == bufb[i]); Assert.False(bufa[i] == 0); } _gpu.HostFreeAll(); }
/// <summary> /// Creates a SPARSE wrapper based on the specified gpu. Note only CudaGPU is supported. /// </summary> /// <param name="gpu">The gpu.</param> /// <returns></returns> public static GPGPUSPARSE Create(GPGPU gpu) { if (gpu is CudaGPU) return new CudaSPARSE(gpu); else throw new NotImplementedException(gpu.ToString()); }
/// <summary> /// Invokes and executes a single elementary kernel function selected by name /// </summary> /// <param name="function"></param> public static void Execute(string function) { Debug.Assert(_indexes1.Last() == _sequencies1.Length); Debug.Assert(_indexes2.Last() == _sequencies2.Length); CudafyModule km = CudafyTranslator.Cudafy(); GPGPU gpu = CudafyHost.GetDevice(); gpu.LoadModule(km); // copy the index and sequence arrays to the GPU int[] devIndexes1 = gpu.CopyToDevice(_indexes1); int[] devIndexes2 = gpu.CopyToDevice(_indexes2); int[] devSequencies1 = gpu.CopyToDevice(_sequencies1); int[] devSequencies2 = gpu.CopyToDevice(_sequencies2); int[,] devMatrix = gpu.Allocate(_matrix); int rows = _matrix.GetLength(0); int columns = _matrix.GetLength(1); dim3 gridSize = Math.Min(15, (int)Math.Pow((double)rows * columns, 0.33333333333)); dim3 blockSize = Math.Min(15, (int)Math.Pow((double)rows * columns, 0.33333333333)); gpu.Launch(gridSize, blockSize, function, devSequencies1, devIndexes1, devSequencies2, devIndexes2, devMatrix); // copy the result matrix back from the GPU to the CPU gpu.CopyFromDevice(devMatrix, _matrix); // free the memory allocated on the GPU gpu.FreeAll(); }
public void Test_TwoThreadTwoGPU() { _gpu0 = CudafyHost.CreateDevice(CudafyModes.Target, 0); _gpu1 = CudafyHost.CreateDevice(CudafyModes.Target, 1); _gpu0.EnableMultithreading(); _gpu1.EnableMultithreading(); bool j1 = false; bool j2 = false; for (int i = 0; i < 10; i++) { Console.WriteLine(i); Thread t1 = new Thread(Test_TwoThreadTwoGPU_Thread0); Thread t2 = new Thread(Test_TwoThreadTwoGPU_Thread1); t1.Start(); t2.Start(); j1 = t1.Join(10000); j2 = t2.Join(10000); if (!j1 || !j2) { break; } } _gpu0.DisableMultithreading(); _gpu0.FreeAll(); _gpu1.DisableMultithreading(); _gpu1.FreeAll(); Assert.IsTrue(j1); Assert.IsTrue(j2); }
public void SetUp() { _gpu = CudafyHost.CreateDevice(CudafyModes.Target); Console.WriteLine("CUDA driver version={0}", _gpu.GetDriverVersion()); _fft = GPGPUFFT.Create(_gpu); _hostInput = new float[N * BATCH]; _hostInputCplx = new ComplexF[N * BATCH]; _hostOutput = new float[N * BATCH]; _hostOutputCplx = new ComplexF[N * BATCH]; _devInput = _gpu.Allocate(_hostInput); _devInputCplx = _gpu.Allocate(_hostInputCplx); _devInter = _gpu.Allocate<float>(N * 2 * BATCH); _devInterCplx = _gpu.Allocate<ComplexF>(N * BATCH); _devOutput = _gpu.Allocate(_hostOutput); _devOutputCplx = _gpu.Allocate(_hostOutputCplx); Console.WriteLine("CUFFT version={0}", _fft.GetVersion()); for (int b = 0; b < BATCH; b++) { for (int i = 0; i < N; i++) { ComplexF cf = new ComplexF(); cf.x = (float)((10.0F * Math.Sin(100 * 2 * Math.PI * i / N * Math.PI / 180))); cf.y = (float)((10.0F * Math.Sin(200 * 2 * Math.PI * i / N * Math.PI / 180))); _hostInput[i + b * N] = cf.x; _hostInputCplx[i + b * N] = cf; } } }
public void SetUp() { _gpu = CudafyHost.CreateDevice(CudafyModes.Target); Console.WriteLine(_gpu.GetDriverVersion()); _fft = GPGPUFFT.Create(_gpu); _hostInput = new float[N * BATCH]; _hostInputCplx = new ComplexF[N * BATCH]; _hostOutput = new float[N * BATCH]; _hostOutputCplx = new ComplexF[N * BATCH]; _devInput = _gpu.Allocate(_hostInput); _devInputCplx = _gpu.Allocate(_hostInputCplx); _devInter = _gpu.Allocate <float>(N * 2 * BATCH); _devInterCplx = _gpu.Allocate <ComplexF>(N * BATCH); _devOutput = _gpu.Allocate(_hostOutput); _devOutputCplx = _gpu.Allocate(_hostOutputCplx); Console.WriteLine(_fft.GetVersion()); for (int b = 0; b < BATCH; b++) { for (int i = 0; i < N; i++) { ComplexF cf = new ComplexF(); cf.x = (float)((10.0F * Math.Sin(100 * 2 * Math.PI * i / N * Math.PI / 180))); cf.y = (float)((10.0F * Math.Sin(200 * 2 * Math.PI * i / N * Math.PI / 180))); _hostInput[i + b * N] = cf.x; _hostInputCplx[i + b * N] = cf; } } }
public void SetUp() { _gpu = CudafyHost.GetDevice(CudafyModes.Architecture, CudafyModes.DeviceId); _byteBufferIn = new byte[N]; _byteBufferOut = new byte[N]; _sbyteBufferIn = new sbyte[N]; _sbyteBufferOut = new sbyte[N]; _ushortBufferIn = new ushort[N]; _ushortBufferOut = new ushort[N]; _uintBufferIn = new uint[N]; _uintBufferOut = new uint[N]; _ulongBufferIn = new ulong[N]; _ulongBufferOut = new ulong[N]; _cplxDBufferIn = new ComplexD[N]; _cplxDBufferOut = new ComplexD[N]; _cplxFBufferIn = new ComplexF[N]; _cplxFBufferOut = new ComplexF[N]; SetInputs(); ClearOutputsAndGPU(); }
// // http://stackoverflow.com/questions/18628447/cudafy-throws-an-exception-while-testing // private static void BlasSample(int deviceId) { CudafyModes.Target = eGPUType.Emulator; GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target, deviceId); CudafyModes.DeviceId = deviceId; eArchitecture arch = gpu.GetArchitecture(); CudafyModule km = CudafyTranslator.Cudafy(arch); gpu.LoadModule(km); GPGPUBLAS blas = GPGPUBLAS.Create(gpu); const int N = 100; float[] a = new float[N]; float[] b = new float[N]; float[] c = new float[N]; float alpha = -1; float beta = 0; float[] device_a = gpu.CopyToDevice(a); float[] device_b = gpu.CopyToDevice(b); float[] device_c = gpu.CopyToDevice(c); int m = 10; int n = 10; int k = 10; cublasOperation Op = cublasOperation.N; blas.GEMM(m, k, n, alpha, device_a, device_b, beta, device_c, Op); throw new NotImplementedException(); }
/// <summary> /// Invokes and executes the kernel that checks whether the array is sorted /// </summary> public static void ExecuteSorted(int direction = 1) { CudafyModule km = CudafyTranslator.Cudafy(); GPGPU gpu = CudafyHost.GetDevice(); gpu.LoadModule(km); int[] devA = gpu.Allocate(_a); int[] devB = gpu.Allocate(_b); int[] devC = gpu.Allocate(_c); int[] devD = gpu.Allocate(D); gpu.CopyToDevice(_a, devA); gpu.Launch(1, 1).Split(devA, devB, devC, _middle); gpu.Launch(_gridSize, _blockSize).Sorted(devA, devB, devC, devD, 0, direction); gpu.Launch(1, 1).Sorted(devA, devB, devC, devD, 1, direction); gpu.CopyFromDevice(devD, D); // free the memory allocated on the GPU gpu.FreeAll(); }
/// <summary> /// Creates a BLAS wrapper based on the specified gpu. /// </summary> /// <param name="gpu">The gpu.</param> /// <returns></returns> public static GPGPUBLAS Create(GPGPU gpu) { if (gpu is CudaGPU) return new CudaBLAS(gpu); else return new HostBLAS(gpu); //throw new NotImplementedException(gpu.ToString()); }
public GPU_func() { km = CudafyTranslator.Cudafy(); gpu = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId); gpu.LoadModule(km); GPU_prop = gpu.GetDeviceProperties(); }
public void SetUp() { _gpu = CudafyHost.CreateDevice(CudafyModes.Target); _blas = GPGPUBLAS.Create(_gpu); _hostInput = new float[ciROWS, ciCOLS]; _hostInput2 = new float[ciROWS, ciCOLS]; _hostOutput = new float[ciROWS, ciCOLS]; _devPtr = _gpu.Allocate<float>(_hostInput); _devPtr2 = _gpu.Allocate<float>(_hostOutput); }
public void SetUp() { //var x = CompilerHelper.Create(ePlatform.x64, eArchitecture.OpenCL, eCudafyCompileMode.Default); var y = CompilerHelper.Create(ePlatform.x64, CudafyModes.Architecture, eCudafyCompileMode.DynamicParallelism); _cm = CudafyTranslator.Cudafy(new CompileProperties[] {y}, this.GetType()); Console.WriteLine(_cm.CompilerOutput); _cm.Serialize(); _gpu = CudafyHost.GetDevice(y.Architecture, CudafyModes.DeviceId); _gpu.LoadModule(_cm); }
public void SetUp() { _gpu = CudafyHost.CreateDevice(CudafyModes.Target); _blas = GPGPUBLAS.Create(_gpu); Console.Write("BLAS Version={0}", _blas.GetVersion()); _hostInput1 = new float[ciN]; _hostInput2 = new float[ciN]; _hostOutput1 = new float[ciN]; _hostOutput2 = new float[ciN]; _devPtr1 = _gpu.Allocate<float>(_hostInput1); _devPtr2 = _gpu.Allocate<float>(_hostOutput1); }
internal CudaRAND(GPGPU gpu, curandRngType rng_type) { _gpu = gpu; if (IntPtr.Size == 8) { _driver = new CURANDDriver64(); } else { _driver = new CURANDDriver32(); } }
public void Initialize(int bytes) { CudafyModule km = CudafyTranslator.Cudafy(); _gpu = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId); _gpu.LoadModule(km); _dev_bitmap = _gpu.Allocate<byte>(bytes); _blocks = new dim3(DIM / 16, DIM / 16); _threads = new dim3(16, 16); }
public void SetUp() { CudafyTranslator.GenerateDebug = true; _cm = CudafyModule.TryDeserialize(); _gpu = CudafyHost.GetDevice(CudafyModes.Architecture, CudafyModes.DeviceId); if (_cm == null || !_cm.TryVerifyChecksums()) { _cm = CudafyTranslator.Cudafy(_gpu.GetArchitecture(), this.GetType(), (_gpu is OpenCLDevice) ? null : typeof(StringConstClass)); _cm.TrySerialize(); } _gpu.LoadModule(_cm); }
internal CudaRAND(GPGPU gpu, curandRngType rng_type) { _gpu = gpu; if (IntPtr.Size == 8) { _driver = new CURANDDriver64(); } else { throw new NotSupportedException(); //_driver = new CURANDDriver32(); } }
public virtual void SetUp() { _gpu = CudafyHost.GetDevice(CudafyModes.Architecture, CudafyModes.DeviceId); var types = new List<Type>(); types.Add(this.GetType()); types.Add(typeof(MathSingleTest)); SupportsDouble = _gpu.GetDeviceProperties().SupportsDoublePrecision; if (SupportsDouble) types.Add(typeof(MathDoubleTest)); _cm = CudafyTranslator.Cudafy(CudafyModes.Architecture, types.ToArray()); Debug.WriteLine(_cm.SourceCode); _gpu.LoadModule(_cm); }
public Solver(GPGPU gpu, GPGPUBLAS blas, GPGPUSPARSE sparse) { this.gpu = gpu; this.blas = blas; this.sparse = sparse; var km = CudafyModule.TryDeserialize(); if (km == null || !km.TryVerifyChecksums()) { km = CudafyTranslator.Cudafy(); km.TrySerialize(); } gpu.LoadModule(km); }
public void SetUp() { _gpu = CudafyHost.GetDevice(CudafyModes.Architecture, CudafyModes.DeviceId); _cm = CudafyModule.TryDeserialize(); if (_cm == null || !_cm.TryVerifyChecksums()) { _cm = CudafyTranslator.Cudafy(CudafyModes.Architecture);//typeof(PrimitiveStruct), typeof(BasicFunctionTests)); Console.WriteLine(_cm.CompilerOutput); _cm.TrySerialize(); } _gpu.LoadModule(_cm); }
public void SetUp() { _gpu = CudafyHost.CreateDevice(CudafyModes.Target); _sparse = GPGPUSPARSE.Create(_gpu); _hiMatrixMN = new double[M * N]; _hiMatrixMN2 = new double[M * N]; _hoMatrixMN = new double[M * N]; _hoPerVector = new int[M]; _hoPerVector2 = new int[N]; _diPerVector2 = _gpu.Allocate(_hoPerVector2); _diMatrixMN = _gpu.Allocate(_hiMatrixMN); _diMatrixMN2 = _gpu.Allocate(_hiMatrixMN2); _diPerVector = _gpu.Allocate(_hoPerVector); }
public void SetUp() { _gpu = CudafyHost.GetDevice(); _sparse = GPGPUSPARSE.Create(_gpu); hiMatrixMN = new double[M * N]; hiMatrixMK = new double[M * K]; hiMatrixKM = new double[K * M]; hiMatrixKN = new double[K * N]; hiMatrixNN = new double[N * N]; hiVectorXM = new double[M]; hiVectorXN = new double[N]; hiVectorYM = new double[M]; hiVectorYN = new double[N]; gpuResultM = new double[M]; gpuResultN = new double[N]; gpuResultMN = new double[M * N]; }
public GpuRenderer() { var availableOpenCLDevices = CudafyHost.GetDeviceProperties(eGPUType.OpenCL); if (availableOpenCLDevices.Any() == false) { throw new Exception("No OpenCL devices found..."); } var device = availableOpenCLDevices.First(); Module = CudafyTranslator.Cudafy(eArchitecture.OpenCL12); var blockSide = Enumerable .Range(1, 15) .Reverse() .First(count => count * count <= device.MaxThreadsPerBlock); BlockSize = new dim3(blockSide, blockSide); // Initialize gpu and load the module (avoids reloading every time) gpu = CudafyHost.GetDevice(eGPUType.OpenCL); gpu.LoadModule(Module); }
public void SetUp() { _gpu = CudafyHost.GetDevice(CudafyModes.Target); _blas = GPGPUBLAS.Create(_gpu); hiMatrixAMM = new double[M * M]; hiMatrixANN = new double[N * N]; hiMatrixAMK = new double[M * K]; hiMatrixAKM = new double[K * M]; hiMatrixBMN = new double[M * N]; hiMatrixBKN = new double[K * N]; hiMatrixBNK = new double[N * K]; hiMatrixBMK = new double[M * K]; hiMatrixBKM = new double[K * M]; hiMatrixCMN = new double[M * N]; hiMatrixCKN = new double[K * N]; hiMatrixCMK = new double[M * K]; hiMatrixCMM = new double[M * M]; gpuResultMN = new double[M * N]; gpuResultMM = new double[M * M]; }
public void SetUp() { _gpu = CudafyHost.GetDevice(CudafyModes.Target); _blas = GPGPUBLAS.Create(_gpu); Console.Write("BLAS Version={0}", _blas.GetVersion()); // Initialize CPU Buffer hiMatrixA = new double[M * N]; hiMatrixANN = new double[N * N]; hiMatrixACBC = new double[(KL + KU + 1) * N]; hiMatrixASCBC = new double[(K + 1) * N]; hiMatrixAPS = new double[(N * (N + 1)) / 2]; hiVectorXM = new double[M]; hiVectorXN = new double[N]; hiVectorYM = new double[M]; hiVectorYN = new double[N]; gpuResultM = new double[M]; gpuResultN = new double[N]; gpuResultMN = new double[M * N]; gpuResultNN = new double[N * N]; gpuResultP = new double[(N * (N + 1)) / 2]; }
public static void Execute() { _gpu = CudafyHost.GetDevice(eGPUType.Cuda); CudafyModule km = CudafyTranslator.Cudafy(ePlatform.Auto, _gpu.GetArchitecture(), typeof(TextInsertion)); Console.WriteLine(km.CompilerOutput); _gpu.LoadModule(km); int[] data = new int[64]; int[] data_d = _gpu.CopyToDevice(data); int[] res_d = _gpu.Allocate(data); int[] res = new int[64]; _gpu.Launch(1, 1, "AHybridMethod", data_d, res_d); _gpu.CopyFromDevice(data_d, res); for(int i = 0; i < 64; i++) if (data[i] != res[i]) { Console.WriteLine("Failed"); break; } }
public static void Execute() { _gpu = CudafyHost.GetDevice(eGPUType.Cuda); CudafyModule km = CudafyTranslator.Cudafy(ePlatform.Auto, _gpu.GetArchitecture(), typeof(SIMDFunctions)); //CudafyModule km = CudafyTranslator.Cudafy(ePlatform.Auto, eArchitecture.sm_12, typeof(SIMDFunctions)); _gpu.LoadModule(km); int w = 1024; int h = 1024; for (int loop = 0; loop < 3; loop++) { uint[] a = new uint[w * h]; Fill(a); uint[] dev_a = _gpu.CopyToDevice(a); uint[] b = new uint[w * h]; Fill(b); uint[] dev_b = _gpu.CopyToDevice(b); uint[] c = new uint[w * h]; uint[] dev_c = _gpu.Allocate(c); _gpu.StartTimer(); _gpu.Launch(h, w, "SIMDFunctionTest", dev_a, dev_b, dev_c); _gpu.CopyFromDevice(dev_c, c); float time = _gpu.StopTimer(); Console.WriteLine("Time: {0}", time); if (loop == 0) { bool passed = true; GThread thread = new GThread(1, 1, null); for (int i = 0; i < w * h; i++) { uint exp = thread.vadd2(a[i], b[i]); if (exp != c[i]) passed = false; } Console.WriteLine("Test {0}", passed ? "passed. " : "failed!"); } _gpu.FreeAll(); } }
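The SIMDFunctionTest kernel is not shown, but the host-side verification above (thread.vadd2(a[i], b[i]) compared against c[i]) pins down what it must compute per element. A matching sketch, assuming the straightforward one-element-per-thread layout implied by the launch:

using Cudafy;

public static class SIMDKernels
{
    // Matches _gpu.Launch(h, w, "SIMDFunctionTest", dev_a, dev_b, dev_c): h blocks of w threads, one element each.
    [Cudafy]
    public static void SIMDFunctionTest(GThread thread, uint[] a, uint[] b, uint[] c)
    {
        int i = thread.blockIdx.x * thread.blockDim.x + thread.threadIdx.x;
        if (i < c.Length)
        {
            // vadd2 adds the two 16-bit halves of each operand independently.
            c[i] = thread.vadd2(a[i], b[i]);
        }
    }
}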
public void SetUp() { _gpu = CudafyHost.GetDevice(); _sparse = GPGPUSPARSE.Create(_gpu); _hiVectorX = new float[N]; _hiVectorY = new float[N]; _hoVectorY = new float[N]; FillBufferSparse(_hiVectorX, out NNZ); FillBuffer(_hiVectorY); _hiIndicesX = new int[NNZ]; _hoValsX = new float[NNZ]; _hiValsX = new float[NNZ]; GetSparseIndex(_hiVectorX, _hiValsX, _hiIndicesX); _diValsX = _gpu.Allocate(_hiValsX); _diIndicesX = _gpu.Allocate(_hiIndicesX); _diVectorY = _gpu.Allocate(_hiVectorY); }