/// <summary>
/// Translates the kernel types, loads the resulting module on device 0 of the
/// configured target and fills the device-side buffer fields.
/// </summary>
private void initGPU()
{
    // Types handed to the translator, in the original order.
    var kernelTypes = new[]
    {
        typeof(Population),
        typeof(UserUpdate),
        typeof(Fitness),
        typeof(FitnessParameter),
        typeof(PredictionPerformances),
        typeof(Experiment),
        typeof(SimOptions)
    };
    CudafyModule translated = CudafyTranslator.Cudafy(kernelTypes);

    // First device of the configured target receives the module.
    gpu = CudafyHost.GetDevice(CudafyModes.Target, 0);
    gpu.LoadModule(translated);

    // Buffers that are only allocated on the device (no host data is copied).
    dev_fitnesses = gpu.Allocate<float>(fs);
    dev_fitnessParams = gpu.Allocate<FitnessParameter>(options.NumberOfIndividuals);

    // Research data copied from the host to the device.
    dev_groundTruth = gpu.CopyToDevice(researchData.GroundTruth);
    dev_userTrust = gpu.CopyToDevice(researchData.UserTrusts);
    dev_updates = gpu.CopyToDevice(researchData.Updates);

    // Fitness data copied from the host to the device.
    dev_fitnessData = gpu.CopyToDevice(fitnessData);
}
/// <summary>
/// Returns a module for <paramref name="types"/>, reusing a serialized copy under
/// <c>AudioKernelCacheRoot</c> when one deserializes and passes its checksum check,
/// and otherwise translating for OpenCL and writing the cache file.
/// </summary>
/// <param name="types">Types to translate; also stored in the module's <c>Tag</c>.</param>
/// <returns>The cached or freshly translated module.</returns>
private static CudafyModule GetCudafyModule(Type[] types)
{
    // The cache file name is the comma-joined list of type names plus ".cdfy".
    var typeNames = types.Select(t => t.ToString());
    var cachePath = Path.Combine(AudioKernelCacheRoot, string.Join(",", typeNames));
    var filename = cachePath + ".cdfy";

    var km = CudafyModule.TryDeserialize(filename);
    var cacheUsable = km != null && km.TryVerifyChecksums();

    if (!cacheUsable)
    {
        // Cache miss (or stale checksums): translate and persist for next time.
        km = CudafyTranslator.Cudafy(eArchitecture.OpenCL, types);
        Directory.CreateDirectory(AudioKernelCacheRoot);
        km.Serialize(filename);
    }

    km.Tag = types;
    return km;
}
/// <summary>
/// Runs the "SIMDFunctionTest" kernel three times on a CUDA device, printing the
/// elapsed time of each run, and checks the first run against
/// <c>GThread.vadd2</c> computed on the host.
/// </summary>
public static void Execute()
{
    _gpu = CudafyHost.GetDevice(eGPUType.Cuda);
    CudafyModule module = CudafyTranslator.Cudafy(ePlatform.Auto, Program.testArchitecture, typeof(SIMDFunctions));
    _gpu.LoadModule(module);

    int width = 1024;
    int height = 1024;
    int elementCount = width * height;

    for (int pass = 0; pass < 3; pass++)
    {
        // Two filled inputs and one output buffer.
        uint[] a = new uint[elementCount];
        Fill(a);
        uint[] dev_a = _gpu.CopyToDevice(a);

        uint[] b = new uint[elementCount];
        Fill(b);
        uint[] dev_b = _gpu.CopyToDevice(b);

        uint[] c = new uint[elementCount];
        uint[] dev_c = _gpu.Allocate(c);

        // The timed section covers the launch and the copy back to the host.
        _gpu.StartTimer();
        _gpu.Launch(height, width, "SIMDFunctionTest", dev_a, dev_b, dev_c);
        _gpu.CopyFromDevice(dev_c, c);
        float time = _gpu.StopTimer();
        Console.WriteLine("Time: {0}", time);

        // Only the first pass is verified element by element.
        if (pass == 0)
        {
            GThread thread = new GThread(1, 1, null);
            bool passed = true;
            for (int i = 0; i < elementCount; i++)
            {
                uint expected = thread.vadd2(a[i], b[i]);
                passed &= expected == c[i];
            }

            Console.WriteLine("Test {0}", passed ? "passed. " : "failed!");
        }

        _gpu.FreeAll();
    }
}
/// <summary>
/// Unloads the device's modules, re-translates <c>MandelComputerCUDA</c> with the
/// translator language set to OpenCL, loads the new module and clears the
/// recompile flag. Progress is written to the console.
/// </summary>
private void recompileCUDAModule()
{
    Console.Write("(Re)compiling OpenCL module... ");

    // Make sure there is a module instance to reset.
    _cudaModule = _cudaModule ?? new CudafyModule();

    _cudaDevice.UnloadModules();
    _cudaModule.Reset();

    CudafyTranslator.Language = eLanguage.OpenCL;
    _cudaModule = CudafyTranslator.Cudafy(typeof(MandelComputerCUDA));
    _cudaDevice.LoadModule(_cudaModule);
    _cudaNeedsRecompile = false;

    // Green "DONE!", then the console colour is set to gray.
    Console.ForegroundColor = ConsoleColor.Green;
    Console.WriteLine("DONE!");
    Console.ForegroundColor = ConsoleColor.Gray;
}
/// <summary>
/// Obtains a module for <c>Program</c> (from its serialized form when that passes
/// the checksum check, otherwise by translating and serializing it), picks a
/// device by trying CUDA, then OpenCL, then the emulator, and loads the module.
/// Returns without loading anything when no device is obtained.
/// </summary>
private static void SimInit()
{
    Console.WriteLine("Deserializing class");
    CudafyModule km = CudafyModule.TryDeserialize(typeof(Program).Name);
    Console.WriteLine("Got: " + km);

    bool checksumsOk = km != null && km.TryVerifyChecksums();
    Console.WriteLine("TVC: " + checksumsOk);

    if (!checksumsOk)
    {
        Console.WriteLine("Serializing");
        km = CudafyTranslator.Cudafy(typeof(Program));
        km.Serialize();
    }

    Console.WriteLine("Requesting device");
    _gpu = CudafyHost.GetDevice(eGPUType.Cuda);
    if (_gpu != null)
    {
        Console.WriteLine("Got CUDA Device: " + _gpu.DeviceId);
    }
    else
    {
        _gpu = CudafyHost.GetDevice(eGPUType.OpenCL);
        if (_gpu != null)
        {
            Console.WriteLine("Got OpenCL Device: " + _gpu.DeviceId);
        }
        else
        {
            // Last resort; nothing is printed when the emulator is obtained.
            _gpu = CudafyHost.GetDevice(eGPUType.Emulator);
            if (_gpu == null)
            {
                Console.WriteLine("No deivce found!");
                return;
            }
        }
    }

    Console.WriteLine("Loading module");
    _gpu.LoadModule(km);
}
/// <summary>
/// Loads the <c>OpenCLTestClass</c> module (from its serialized form when valid,
/// otherwise freshly translated), prints its source code and runs the
/// <c>popVect</c> kernel over the values 0..N-1.
/// </summary>
static void popcTest()
{
    var module = CudafyModule.TryDeserialize(typeof(OpenCLTestClass).Name);
    bool cacheValid = module != null && module.TryVerifyChecksums();
    if (!cacheValid)
    {
        module = CudafyTranslator.Cudafy(CudafyModes.Architecture, typeof(OpenCLTestClass));
        module.TrySerialize();
    }

    Console.WriteLine(module.SourceCode);

    GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target);
    gpu.LoadModule(module);

    // Host input: v[i] = i.
    uint[] v = new uint[N];
    for (int index = 0; index < N; index++)
    {
        v[index] = (uint)index;
    }

    // Device output buffer, then device copy of the input.
    int[] c = new int[N];
    int[] dev_c = gpu.Allocate<int>(c);
    uint[] dev_v = gpu.CopyToDevice(v);

    gpu.Launch(1, N).popVect(dev_v, dev_c);

    // Results are copied back to the host but not displayed.
    gpu.CopyFromDevice(dev_c, c);

    // Free the memory allocated on the GPU.
    gpu.FreeAll();
}
/// <summary>
/// Selects the target from <paramref name="archi"/>, obtains the module (by
/// translating, by compiling "cuda.cu", or by deserializing), makes sure the
/// module lists a "calc_r" function, and loads it on device 0 of the target.
/// </summary>
/// <param name="archi">Architecture; Emulator selects the emulator target and
/// values at or above OpenCL select the OpenCL target.</param>
/// <param name="hasSdk">When false the module is deserialized from a file named
/// after <paramref name="archi"/>.</param>
/// <param name="generate">When true (with <paramref name="hasSdk"/>) the module
/// is translated and serialized.</param>
public static void init(eArchitecture archi = eArchitecture.sm_20, bool hasSdk = false, bool generate = false)
{
    // Target selection; any other architecture leaves CudafyModes.Target untouched.
    if (archi == eArchitecture.Emulator)
    {
        CudafyModes.Target = eGPUType.Emulator;
    }
    else if (archi >= eArchitecture.OpenCL)
    {
        CudafyModes.Target = eGPUType.OpenCL;
    }

    if (!hasSdk)
    {
        // No SDK: load a previously serialized module.
        // NOTE(review): this reads "<archi>" while the generate branch writes
        // "bespoke_<archi>" - confirm the differing names are intentional.
        km = CudafyModule.Deserialize(archi.ToString());
    }
    else if (generate || CudafyModes.Target != eGPUType.Cuda)
    {
        // NOTE(review): the language is switched to OpenCL when the target IS
        // Cuda; this looks inverted - confirm against the intended behaviour.
        if (CudafyModes.Target == eGPUType.Cuda)
        {
            CudafyTranslator.Language = eLanguage.OpenCL;
        }

        km = CudafyTranslator.Cudafy(archi);
        km.Serialize("bespoke_" + archi);
    }
    else
    {
        // Cuda target without regeneration: compile the existing source file.
        km = new CudafyModule();
        km.SourceCode = System.IO.File.ReadAllText("cuda.cu");
        km.Compile(eGPUCompiler.CudaNvcc);
    }

    // Register "calc_r" by hand when the module does not already list it.
    bool needsCalcR = !generate && !km.Functions.ContainsKey("calc_r");
    if (needsCalcR)
    {
        var calcR = new KernelMethodInfo(
            typeof(RuneCalc),
            typeof(RuneCalc).GetMethod("calc_r"),
            eKernelMethodType.Global,
            false,
            eCudafyDummyBehaviour.Default,
            km);
        km.Functions.Add("calc_r", calcR);
    }

    gpu = CudafyHost.GetDevice(CudafyModes.Target, 0);
    gpu.LoadModule(km);
}
/// <summary>
/// Launches <c>setup_kernel</c> once and <c>generate_kernel</c> ten times on a
/// 64x64 launch, sums the per-thread results and prints the resulting fraction.
/// </summary>
public static void Basics()
{
    const int cellCount = 64 * 64;

    CudafyModule module = CudafyTranslator.Cudafy(CudafyModes.Architecture);
    Console.WriteLine(module.CompilerOutput);

    GPGPU gpu = CudafyHost.GetDevice();
    gpu.LoadModule(module);

    RandStateXORWOW[] devStates = gpu.Allocate<RandStateXORWOW>(cellCount);
    int[] devResults = gpu.Allocate<int>(cellCount);
    int[] hostResults = new int[cellCount];
    gpu.Set(devResults);

#if !NET35
    gpu.Launch(64, 64).setup_kernel(devStates);
    for (int round = 0; round < 10; round++)
    {
        gpu.Launch(64, 64).generate_kernel(devStates, devResults);
    }
#else
    gpu.Launch(64, 64, "setup_kernel", devStates);
    for (int round = 0; round < 10; round++)
    {
        gpu.Launch(64, 64, "generate_kernel", devStates, devResults);
    }
#endif

    gpu.CopyFromDevice(devResults, hostResults);

    // Add up every per-thread counter.
    int total = 0;
    foreach (int count in hostResults)
    {
        total += count;
    }

    Console.WriteLine("Fraction with low bit set was {0}", (float)total / (64.0f * 64.0f * 100000.0f * 10.0f));
    gpu.FreeAll();
}
/// <summary>
/// Runs the <c>add</c> and <c>sub</c> kernels with the arguments 2 and 7 and
/// prints each result after copying it back from a one-element device buffer.
/// </summary>
public static void Execute()
{
    CudafyModule module = CudafyTranslator.Cudafy();
    GPGPU device = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId);
    device.LoadModule(module);

    // Device storage for a single Int32 result, reused by both kernels.
    int[] devResult = device.Allocate<int>();
    int result = 0;

    device.Launch().add(2, 7, devResult); // or device.Launch(1, 1, "add", 2, 7, devResult);
    device.CopyFromDevice(devResult, out result);
    Console.WriteLine("2 + 7 = {0}", result);

    device.Launch().sub(2, 7, devResult);
    device.CopyFromDevice(devResult, out result);
    Console.WriteLine("2 - 7 = {0}", result);

    device.Free(devResult);
}
/// <summary>
/// Forces the CUDA target, creates and activates a context with the
/// <c>MapHost</c> flag on the underlying CUDA.NET object, and loads the
/// <c>CudafyRBFSlicedEllpackKernel</c> module (from its serialized form when
/// valid, otherwise translated and serialized).
/// </summary>
private void InitCudaModule()
{
    cufy.CudafyModes.Target = cufy.eGPUType.Cuda;
    gpu = CudafyHost.GetDevice(CudafyModes.Target);

    // Work on the wrapped CUDA.NET instance to create the context.
    cuGPU = (CUDA)((CudaGPU)gpu).CudaDotNet;
    var context = cuGPU.CreateContext(0, CUCtxFlags.MapHost);
    cuGPU.SetCurrentContext(context);

    module = CudafyModule.TryDeserialize(moduleName);
    bool cacheValid = module != null && module.TryVerifyChecksums();
    if (!cacheValid)
    {
        module = CudafyTranslator.Cudafy(typeof(CudafyRBFSlicedEllpackKernel));
        module.Serialize();
    }

    gpu.LoadModule(module);
}
/// <summary>
/// Sets up GPU access, copies the precursor values to the device and sizes the
/// host and device output buffers from the device's maximum grid size and the
/// peptide length range.
/// </summary>
/// <param name="potentialPrecursors">Values copied to the device.</param>
/// <param name="maxPeptideLength">Upper peptide length.</param>
/// <param name="minPeptideLength">Lower peptide length.</param>
public ProteinDigest(double[] potentialPrecursors, int maxPeptideLength, int minPeptideLength)
{
    // GPU access.
    CudafyModule module = CudafyTranslator.Cudafy();
    gpu = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId);
    gpu.LoadModule(module);
    dev_prec = gpu.CopyToDevice(potentialPrecursors);

    // Sizing information.
    GPGPUProperties deviceInfo = gpu.GetDeviceProperties();
    maxGridSize = deviceInfo.MaxGridSize.x;
    maxPeptideSize = maxPeptideLength;
    peptideArraySize = maxPeptideLength - minPeptideLength + 1;
    minPeptideSize = minPeptideLength;

    // One slot per (grid index, peptide length) pair, on host and device.
    var slotCount = maxGridSize * peptideArraySize;
    outputStart = new int[slotCount];
    dev_outputStart = gpu.Allocate<int>(slotCount);
}
/// <summary>
/// Adds two N-element vectors (a[i] = i, b[i] = i * i) with the <c>add</c>
/// kernel and prints every sum.
/// </summary>
public static void Execute()
{
    CudafyModule module = CudafyTranslator.Cudafy();
    GPGPU device = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId);
    device.LoadModule(module);

    int[] a = new int[N];
    int[] b = new int[N];
    int[] c = new int[N];

    // Output buffer on the GPU.
    int[] dev_c = device.Allocate<int>(c);

    // Host inputs.
    for (int index = 0; index < N; index++)
    {
        a[index] = index;
        b[index] = index * index;
    }

    // Inputs go to the GPU, the kernel runs, the result comes back.
    int[] dev_a = device.CopyToDevice(a);
    int[] dev_b = device.CopyToDevice(b);
    device.Launch(1, N).add(dev_a, dev_b, dev_c);
    device.CopyFromDevice(dev_c, c);

    for (int index = 0; index < N; index++)
    {
        Console.WriteLine("{0} + {1} = {2}", a[index], b[index], c[index]);
    }

    // Free the memory allocated on the GPU.
    device.FreeAll();
}
/// <summary>
/// Allocates the host matrices for a network with <c>glebokosc + 2</c> layers,
/// runs <c>TworzNeurony</c> and <c>CzyscMacierze</c>, translates this type for
/// sm_30 with debug generation enabled and copies the host arrays to the device.
/// </summary>
/// <param name="rozmiarWejsca">Stored in <c>iloscWejsc</c>.</param>
/// <param name="rozmiarWyjscia">Stored in <c>iloscWyjsc</c>; also the length of <c>odpowiedz</c>.</param>
/// <param name="glebokosc">Layer count minus two.</param>
public BEP_CUDA(int rozmiarWejsca, int rozmiarWyjscia, int glebokosc)
{
    const int width = 32; // fixed second/third dimension of every matrix

    iloscWejsc = rozmiarWejsca;
    iloscWyjsc = rozmiarWyjscia;
    iloscWarstw = glebokosc + 2;

    var layers = iloscWarstw;
    var layersPlusOne = iloscWarstw + 1;

    // Host-side storage.
    neurony = new float[layers, width, width];
    macierzDelt = new float[layers, width];
    macierzWyjsc = new float[layersPlusOne, width];
    macierzWejsc = new float[layersPlusOne, width];
    macierzSum = new float[layers, width];
    iloscNeuronowWWarstwie = new int[layers];
    wyjscia = new int[layersPlusOne];
    odpowiedz = new float[rozmiarWyjscia];
    numerWarstwy = new int[1];
    stala = new float[1];
    iloscWejscWWarstwie = new int[layers];

    // These helpers run after all host arrays exist.
    TworzNeurony();
    CzyscMacierze();

    // Translate and load the kernels.
    CudafyTranslator.GenerateDebug = true;
    km = CudafyTranslator.Cudafy(eArchitecture.sm_30, typeof(BEP_CUDA));
    gpu = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId);
    gpu.LoadModule(km);

    // Device copies of the host arrays.
    neuronyGPU = gpu.CopyToDevice(neurony);
    macierzDeltGPU = gpu.CopyToDevice(macierzDelt);
    macierzWejscGPU = gpu.CopyToDevice(macierzWejsc);
    macierzWyjscGPU = gpu.CopyToDevice(macierzWyjsc);
    macierzSumGPU = gpu.CopyToDevice(macierzSum);
    wyjsciaGPU = gpu.CopyToDevice(wyjscia);
    warstwyGPU = gpu.CopyToDevice(iloscNeuronowWWarstwie);
    iloscWejscWWarstwieGPU = gpu.CopyToDevice(iloscWejscWWarstwie);
}
/// <summary>
/// Runs "SyncThreadCountKernel" over 128 random values in [0, 16) and compares
/// its output with the number of inputs equal to 1, counted on the host.
/// </summary>
public static void Execute()
{
    CudafyModule module = CudafyTranslator.Cudafy(Program.testArchitecture);
    GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target, 0);
    gpu.LoadModule(module);

    const int count = 128;
    var random = new Random();
    var input = new int[count];
    int expectedOutput = 0;

    // Generate the inputs and count the ones in a single pass.
    for (var i = 0; i < count; i++)
    {
        input[i] = random.Next(16);
        if (input[i] == 1)
        {
            expectedOutput++;
        }
    }

    var devInput = gpu.Allocate<int>(count);
    var devOutput = gpu.Allocate<int>(1);
    gpu.CopyToDevice(input, devInput);

    gpu.Launch(1, count, "SyncThreadCountKernel", devInput, devOutput);

    // Copy the single result value back from the GPU.
    int output = 0;
    gpu.CopyFromDevice(devOutput, out output);

    gpu.Free(devInput);
    gpu.Free(devOutput);

    Console.WriteLine("SyncThreadCount: {0}", output);
    Console.WriteLine("Expected: {0} \t{1}", expectedOutput, expectedOutput == output ? "PASSED" : "FAILED");
}
/// <summary>
/// Translates the kernels for the device's own architecture, launches
/// <c>GpuFindPathDistance</c> and returns the shortest distance found across
/// the per-block answers together with the load and run times.
/// </summary>
/// <returns>An <c>AnswerBetter</c> holding the best distance, its permutation
/// number and both timings in milliseconds.</returns>
internal override Answer GetAnswer()
{
    var loadTimer = Stopwatch.StartNew();
    using (var gpu = CudafyHost.GetDevice())
    {
        var arch = gpu.GetDeviceProperties().Capability.GetArchitecture();
        var module = CudafyTranslator.Cudafy(ePlatform.x64, arch);
        gpu.LoadModule(module);
        LoadTime = loadTimer.ElapsedMilliseconds;

        var runTimer = Stopwatch.StartNew();

        var gpuLatLong = gpu.CopyToDevice(_latLong.ToArray());
        var answer = new AnswerStruct[_blocksPerGrid];
        var gpuAnswer = gpu.Allocate(answer);

        gpu.SafeLaunch(_blocksPerGrid, _threadsPerBlock, GpuFindPathDistance, (int)_permutations, gpuLatLong, gpuAnswer);
        gpu.Synchronize();
        gpu.CopyFromDevice(gpuAnswer, answer);
        gpu.FreeAll();

        // Pick the smallest distance; the first one wins on ties.
        var bestDistance = float.MaxValue;
        var bestPermutation = 0;
        foreach (var candidate in answer)
        {
            if (candidate.distance < bestDistance)
            {
                bestDistance = candidate.distance;
                bestPermutation = candidate.pathNo;
            }
        }

        return new AnswerBetter
        {
            Distance = bestDistance,
            Permutation = bestPermutation,
            msLoadTime = LoadTime,
            msRunTime = runTimer.ElapsedMilliseconds
        };
    }
}
/// <summary>
/// Copies a small int array to the CUDA device, stores that buffer's device
/// pointer inside a <c>TestStruct</c>, runs <c>kernelTest</c> and prints the
/// struct's <c>value</c> field after copying it back.
/// </summary>
/// <param name="args">Unused.</param>
private unsafe static void Main(string[] args)
{
    GPGPU gpuCuda = CudafyHost.GetDevice(eGPUType.Cuda, 0);
    gpuCuda.LoadModule(CudafyTranslator.Cudafy());

    // Device copy of the integers the struct will point at.
    int[] hostInts = new[] { 1, 8, 3 };
    int[] devInts = gpuCuda.CopyToDevice(hostInts);

    // Record the raw device address of that buffer in the struct.
    TestStruct[] hostStructs = new TestStruct[1];
    hostStructs[0] = new TestStruct();
    DevicePtrEx deviceMemory = gpuCuda.GetDeviceMemory(devInts);
    hostStructs[0].dataPointer = deviceMemory.Pointer.ToInt64();

    TestStruct[] devStructs = gpuCuda.Allocate(hostStructs);
    gpuCuda.CopyToDevice(hostStructs, devStructs);

    gpuCuda.Launch().kernelTest(devStructs, devInts);
    gpuCuda.CopyFromDevice(devStructs, hostStructs);

    Console.WriteLine(hostStructs[0].value);
    Console.ReadKey();
}
/// <summary>
/// Translates the kernels, selects the first target type (CUDA, then OpenCL,
/// then the emulator) that reports at least one device, and loads the module
/// on the first device of that type that accepts it.
/// </summary>
/// <returns>0 once a device has loaded the module.</returns>
/// <exception cref="CtkCudafyCannotUseException">No target type reports any device.</exception>
/// <exception cref="Exception">Every device failed with a compile error; the
/// last such error is available as <c>InnerException</c>.</exception>
public int Init()
{
    this.m_km = CudafyTranslator.Cudafy();

    // Probe the target types in order of preference. CudafyModes.Target is
    // left on the last probed type, exactly as before.
    var preferredTargets = new[] { eGPUType.Cuda, eGPUType.OpenCL, eGPUType.Emulator };
    int tgCount = 0;
    foreach (var target in preferredTargets)
    {
        CudafyModes.Target = target;
        tgCount = CudafyHost.GetDeviceCount(CudafyModes.Target);
        if (tgCount > 0)
        {
            break;
        }
    }

    if (tgCount <= 0)
    {
        // The message text means "Cannot use Cudafy".
        throw new CtkCudafyCannotUseException("無法使用Cudafy");
    }

    // Try each device in turn; a compile failure moves on to the next device,
    // but the failure is remembered so it can be reported if none succeeds.
    Exception lastCompileError = null;
    for (int idx = 0; idx < tgCount; idx++)
    {
        try
        {
            this.m_gpu = CudafyHost.GetDevice(CudafyModes.Target, idx);
            this.m_gpu.LoadModule(Km);
            return 0;
        }
        catch (Cudafy.CudafyCompileException ex)
        {
            lastCompileError = ex;
        }
    }

    throw new Exception("Cudafy buidling fail.", lastCompileError);
}
/// <summary>
/// Translates and loads the kernels, launches <c>GpuFindPathDistance</c> over
/// the city coordinates and returns the shortest distance found across the
/// per-block answers together with the load and run times.
/// </summary>
/// <returns>An <c>Answer</c> holding the best distance, its permutation number
/// and both timings in milliseconds.</returns>
internal static Answer GpuTsp()
{
    var loadTimer = Stopwatch.StartNew();
    using (var gpu = CudafyHost.GetDevice())
    {
        var module = CudafyTranslator.Cudafy();
        gpu.LoadModule(module);
        LoadTime = loadTimer.ElapsedMilliseconds;

        var runTimer = Stopwatch.StartNew();

        var gpuLatitudes = gpu.CopyToDevice(_latitudes.ToArray());
        var gpuLongitudes = gpu.CopyToDevice(_longitudes.ToArray());
        var answer = new AnswerStruct[_blocksPerGrid];
        var gpuAnswer = gpu.Allocate(answer);

        gpu.SafeLaunch(_blocksPerGrid, _threadsPerBlock, GpuFindPathDistance, (int)_permutations, _cities, gpuLatitudes, gpuLongitudes, gpuAnswer);
        gpu.Synchronize();
        gpu.CopyFromDevice(gpuAnswer, answer);

        // Pick the smallest distance; the first one wins on ties.
        var bestDistance = float.MaxValue;
        var bestPermutation = 0;
        foreach (var candidate in answer)
        {
            if (candidate.distance < bestDistance)
            {
                bestDistance = candidate.distance;
                bestPermutation = candidate.pathNo;
            }
        }

        return new Answer
        {
            Distance = bestDistance,
            Permutation = bestPermutation,
            msLoadTime = LoadTime,
            msRunTime = runTimer.ElapsedMilliseconds
        };
    }
}
/// <summary>
/// Stores the viewport, selects OpenCL device 0, translates and loads the
/// kernels and then initialises the device pointers.
/// </summary>
/// <param name="passedViewport">Picture box stored in <c>viewport</c>.</param>
/// <returns>
/// <c>true</c> when the module was loaded and the device pointers initialised;
/// <c>false</c> when no device of the target type exists or any step throws.
/// </returns>
public static bool InitGPU(PictureBox passedViewport)
{
    viewport = passedViewport;

    CudafyModes.Target = eGPUType.OpenCL; // To use CUDA instead, change this enum
    CudafyModes.DeviceId = 0;
    CudafyTranslator.Language = CudafyModes.Target == eGPUType.OpenCL ? eLanguage.OpenCL : eLanguage.Cuda;

    CudafyModule km = null;
    try
    {
        int deviceCount = CudafyHost.GetDeviceCount(CudafyModes.Target);
        if (deviceCount == 0)
        {
            Console.WriteLine("No suitable {0} devices found.", CudafyModes.Target);
            return false;
        }

        gpu = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId);
        Console.WriteLine("Device Name: {0}", gpu.GetDeviceProperties(false).Name);
        var result = gpu.GetDeviceProperties(true); // diagnostic data

        km = CudafyTranslator.Cudafy();
        gpu.LoadModule(km);
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex);

        // km is still null when the failure happened before translation
        // finished (device query, device creation or Cudafy() itself), so the
        // source dump must be guarded or this handler throws on its own.
        if (km != null)
        {
            Console.WriteLine(km.SourceCode);
        }

        Debugger.Break();
        return false;
    }

    InitDevicePointers();
    return true;
}
/// <summary>
/// Calls and executes a single elementary function, selected by its name.
/// </summary>
/// <param name="function">Name of the kernel function passed to both launches.</param>
public static void Execute(string function)
{
    CudafyModule module = CudafyTranslator.Cudafy();
    GPGPU device = CudafyHost.GetDevice();
    device.LoadModule(module);

    // Device buffers sized like the host arrays; only _a is copied over.
    int[] devA = device.Allocate(_a);
    int[] devB = device.Allocate(_b);
    int[] devC = device.Allocate(_c);
    int[] devD = device.Allocate(D);
    device.CopyToDevice(_a, devA);

    // First launch uses the configured grid/block size with a trailing 1,
    // the second launch uses a single thread with a trailing 2.
    device.Launch(_gridSize, _blockSize, function, devA, devB, devC, devD, 1);
    device.Launch(1, 1, function, devA, devB, devC, devD, 2);

    device.CopyFromDevice(devD, D);

    // Free the memory allocated on the GPU.
    device.FreeAll();
}
/// <summary>
/// Translates the kernels for <c>ARCH</c>, obtains the configured device and
/// loads the module, marking the GPU as enabled and not busy on success.
/// </summary>
/// <returns>
/// <c>true</c> on success; <c>false</c> when CUDA is not available or any step
/// throws (the exception text is then stored in <c>errorMessage</c>).
/// </returns>
public static bool cudaEnable()
{
    if (isCudaAvailable())
    {
        try
        {
            CudafyModule module = CudafyTranslator.Cudafy(ARCH);
            Console.WriteLine("Translator OK");

            gpu = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId);
            Console.WriteLine("GPU OK");

            gpu.LoadModule(module);
            enabled = true;
            busy = false;
            return true;
        }
        catch (Exception ex)
        {
            errorMessage = ex.ToString();
        }
    }

    return false;
}
/// <summary>
/// Picks the first OpenCL device, translates the kernels for OpenCL 1.2,
/// chooses the largest square block (side at most 15) whose thread count fits
/// the device limit, and loads the module once up front.
/// </summary>
/// <exception cref="Exception">No OpenCL devices are reported.</exception>
public GpuRenderer()
{
    var openClDevices = CudafyHost.GetDeviceProperties(eGPUType.OpenCL);
    if (!openClDevices.Any())
    {
        throw new Exception("No OpenCL devices found...");
    }

    var device = openClDevices.First();
    Module = CudafyTranslator.Cudafy(eArchitecture.OpenCL12);

    // Largest side in 1..15 with side * side <= MaxThreadsPerBlock.
    var blockSide = Enumerable
        .Range(1, 15)
        .Last(side => side * side <= device.MaxThreadsPerBlock);
    BlockSize = new dim3(blockSide, blockSide);

    // Initialize the gpu and load the module here so it is not reloaded every time.
    gpu = CudafyHost.GetDevice(eGPUType.OpenCL);
    gpu.LoadModule(Module);
}
/// <summary>
/// Translates the kernels for the device's architecture and launches
/// "GenerateRipples" with an 8x8 grid of 64x16-thread blocks, copying the
/// result buffer back into a host array.
/// </summary>
/// <remarks>
/// Both device buffers created here are released before returning, so repeated
/// calls do not accumulate device allocations.
/// </remarks>
public void ExeTestKernel()
{
    GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target, 0);
    eArchitecture arch = gpu.GetArchitecture();
    CudafyModule km = CudafyTranslator.Cudafy(arch);
    gpu.LoadModule(km);

    int[] host_results = new int[N];

    // Either assign a new block of memory to hold results on the device...
    int[] dev_results = gpu.Allocate<int>(N);
    int[] dev_filled_results = null;
    try
    {
        // ...or fill the host array with values first and copy it over.
        for (int i = 0; i < N; i++)
        {
            host_results[i] = i * 3;
        }

        dev_filled_results = gpu.CopyToDevice(host_results);

        // 64 * 16 = 1024 threads per block, 8 * 8 = 64 blocks per grid,
        // i.e. 65536 threads in total.
        // NOTE(review): presumably N is 65536 (one thread per int) - confirm.
        dim3 threadsPerBlock = new dim3(64, 16);
        dim3 blocksPerGrid = new dim3(8, 8);

        gpu.Launch(blocksPerGrid, threadsPerBlock, "GenerateRipples", dev_results, dev_filled_results);
        gpu.CopyFromDevice(dev_results, host_results);
    }
    finally
    {
        // Release only the buffers owned by this call; other allocations on
        // the same device are left alone.
        gpu.Free(dev_results);
        if (dev_filled_results != null)
        {
            gpu.Free(dev_filled_results);
        }
    }
}
/// <summary>
/// Runs <c>VectorAdd</c> on CUDA device 0 with two random input vectors and a
/// constant-memory vector of twos, then asserts that every output equals
/// (input1 + input2) * input3.
/// </summary>
static void TempOpenCLVectorAddTest()
{
    int[] first = new int[N];
    int[] second = new int[N];
    int[] factors = new int[N];
    int[] results = new int[N];

    // Random operands in [0, 128) and a constant factor of 2.
    Random rand = new Random();
    for (int index = 0; index < N; index++)
    {
        first[index] = rand.Next(128);
        second[index] = rand.Next(128);
        factors[index] = 2;
    }

    GPGPU gpu = CudafyHost.GetDevice(eGPUType.Cuda, 0);
    Console.WriteLine(gpu.GetDeviceProperties().Name);

    CudafyTranslator.Language = eLanguage.Cuda;
    var module = CudafyTranslator.Cudafy(CudafyModes.Architecture, typeof(OpenCLTestClass));
    Console.WriteLine(module.SourceCode);
    gpu.LoadModule(module);

    int[] devFirst = gpu.CopyToDevice(first);
    int[] devSecond = gpu.CopyToDevice(second);
    gpu.CopyToConstantMemory(factors, OpenCLTestClass.ConstantMemory);
    int[] devResults = gpu.Allocate<int>(N);

#warning Work group and local size mess! http://stackoverflow.com/questions/7996537/cl-invalid-work-group-size-error-should-be-solved-though
    gpu.Launch(2, 512).VectorAdd(devFirst, devSecond, devResults);
    gpu.CopyFromDevice(devResults, 0, results, 0, N);

    for (int index = 0; index < N; index++)
    {
        int expected = (first[index] + second[index]) * factors[index];
        Assert.AreEqual(expected, results[index], string.Format("Error at {0}", index));
    }
}
/// <summary>
/// Passes a <c>ValueA</c> whose nested <c>ValueB</c> holds 56 to
/// "StructTestKernel" and reports whether the kernel's output equals 56.
/// </summary>
public static void Execute()
{
    // Reuse the serialized module when it is present and its checksums verify.
    var module = CudafyModule.TryDeserialize();
    bool cacheValid = module != null && module.TryVerifyChecksums();
    if (!cacheValid)
    {
        module = CudafyTranslator.Cudafy(ePlatform.Auto, eArchitecture.sm_20, typeof(ValueB), typeof(ValueA), typeof(StructTest));
        module.Serialize();
    }

    GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target, 0);
    gpu.LoadModule(module);

    var value = new ValueA();
    value.valueB = new ValueB();
    value.valueB.value = 56;

    var devOutput = gpu.Allocate<int>(1);
    gpu.Launch(1, 1, "StructTestKernel", value, devOutput);

    // Copy the single output value back from the GPU, then release the buffer.
    int output;
    gpu.CopyFromDevice(devOutput, out output);
    gpu.Free(devOutput);

    string verdict = 56 == output ? "PASSED" : "FAILED";
    Console.WriteLine("Expected: {0} \t{1}", 56, verdict);
}
/// <summary>
/// Translates <c>TextInsertion</c> for the CUDA device's architecture, prints
/// the compiler output, launches "AHybridMethod" with a single thread and
/// prints "Failed" on the first mismatch between the host data and the values
/// read back from the device.
/// </summary>
public static void Execute()
{
    _gpu = CudafyHost.GetDevice(eGPUType.Cuda);
    CudafyModule module = CudafyTranslator.Cudafy(ePlatform.Auto, _gpu.GetArchitecture(), typeof(TextInsertion));
    Console.WriteLine(module.CompilerOutput);
    _gpu.LoadModule(module);

    int[] data = new int[64];
    int[] res = new int[64];
    int[] data_d = _gpu.CopyToDevice(data);
    int[] res_d = _gpu.Allocate(data);

    _gpu.Launch(1, 1, "AHybridMethod", data_d, res_d);

    // NOTE(review): the input buffer (data_d) is read back here, not the
    // output buffer (res_d) - confirm this is what the check is meant to compare.
    _gpu.CopyFromDevice(data_d, res);

    for (int index = 0; index < 64; index++)
    {
        if (data[index] != res[index])
        {
            Console.WriteLine("Failed");
            break;
        }
    }
}
/// <summary>
/// For every device of every target type (CUDA, OpenCL, emulator), tries to
/// translate <c>Primitives</c> in each language and load the module on that
/// device. Unsupported combinations are skipped via the caught exceptions.
/// </summary>
private void InitializeGPUs()
{
    var candidateTargets = new eGPUType[] { eGPUType.Cuda, eGPUType.OpenCL, eGPUType.Emulator };
    var candidateLanguages = new eLanguage[] { eLanguage.Cuda, eLanguage.OpenCL };

    foreach (eGPUType target in candidateTargets)
    {
        try
        {
            int deviceCount = CudafyHost.GetDeviceCount(target);
            for (int deviceIndex = 0; deviceIndex < deviceCount; deviceIndex++)
            {
                GPGPU device = CudafyHost.GetDevice(target, deviceIndex);
                GPGPUProperties deviceProperties = device.GetDeviceProperties(true);
                CudafyModes.Target = target;

                foreach (eLanguage language in candidateLanguages)
                {
                    // Use a random filename to prevent conflict on default temp file when multithreading (unit tests)
                    string tempInputFile = Path.GetRandomFileName();
                    try
                    {
                        CudafyTranslator.Language = language;
                        CompileProperties settings = CompilerHelper.Create(
                            ePlatform.Auto,
                            eArchitecture.Unknown,
                            eCudafyCompileMode.Default,
                            CudafyTranslator.WorkingDirectory,
                            CudafyTranslator.GenerateDebug);
                        settings.InputFile = tempInputFile;

                        // If this line fails with NCrunch/Unit tests, there probably is a new version of Cudafy.NET
                        // and it needs to be registered in the GAC like this: gacutil -i Cudafy.NET.dll
                        CudafyModule module = CudafyTranslator.Cudafy(settings, typeof(Primitives));

                        if (!device.IsModuleLoaded(module.Name))
                        {
                            device.LoadModule(module);
                        }

                        device.EnableMultithreading();

                        // The display name is still built although the code that
                        // registered it is disabled.
                        string gpuName = deviceProperties.Name.Trim() + " - " + target.ToString() + " - " + language.ToString();
                    }
                    catch (CudafyCompileException)
                    {
                        // Language not supported
                    }
                    finally
                    {
                        File.Delete(tempInputFile);

                        // ncrunch: no coverage start
                    }
                }
            }
        }
        catch (DllNotFoundException)
        {
        }
        catch (InvalidOperationException)
        {
            // Language not supported
        }
        catch (Cloo.ComputeException)
        {
            // Language not supported
        }

        // ncrunch: no coverage end
    }
}
/// <summary>
/// Builds a 256-bin histogram of a random byte block with <c>histo_kernel</c>,
/// prints the elapsed time and the histogram sum, and verifies the bins by
/// subtracting the counts computed on the CPU.
/// </summary>
/// <returns>
/// 0 after the run; -1 when the device is a CUDA GPU with compute capability
/// below 1.2.
/// </returns>
public static int Execute()
{
    CudafyModule module = CudafyTranslator.Cudafy();
    GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId);

    bool lacksAtomics = gpu is CudaGPU && gpu.GetDeviceProperties().Capability < new Version(1, 2);
    if (lacksAtomics)
    {
        Console.WriteLine("Compute capability 1.2 or higher required for atomics.");
        return -1;
    }

    gpu.LoadModule(module);
    byte[] buffer = big_random_block(SIZE);

    // The advanced property query needs cudart.dll; fall back to the basic one.
    GPGPUProperties prop;
    try
    {
        prop = gpu.GetDeviceProperties(true);
    }
    catch (DllNotFoundException)
    {
        prop = gpu.GetDeviceProperties(false);
    }

    // The timer starts here so that the transfers are included in the
    // measurement, not only the kernel.
    gpu.StartTimer();

    byte[] dev_buffer = gpu.CopyToDevice(buffer);
    uint[] dev_histo = gpu.Allocate<uint>(256);
    gpu.Set(dev_histo);

    // Launch with twice the multiprocessor count; 16 is used when the count is reported as 0.
    int blocks = prop.MultiProcessorCount;
    if (blocks == 0)
    {
        blocks = 16;
    }

    Console.WriteLine("Processors: {0}", blocks);
    gpu.Launch(blocks * 2, 256).histo_kernel(dev_buffer, SIZE, dev_histo);

    uint[] histo = new uint[256];
    gpu.CopyFromDevice(dev_histo, histo);

    float elapsedTime = gpu.StopTimer();
    Console.WriteLine("Time to generate: {0} ms", elapsedTime);

    long histoCount = 0;
    foreach (uint bin in histo)
    {
        histoCount += bin;
    }

    Console.WriteLine("Histogram Sum: {0}", histoCount);

    // Subtract the CPU counts; every bin should end at zero.
    for (int i = 0; i < SIZE; i++)
    {
        histo[buffer[i]]--;
    }

    for (int bin = 0; bin < 256; bin++)
    {
        if (histo[bin] != 0)
        {
            Console.WriteLine("Failure at {0}!", bin);
        }
    }

    gpu.FreeAll();
    return 0;
}
/// <summary>
/// Fills the shared <c>km</c> and <c>gpu</c> members: translates the kernels,
/// obtains the device selected by <c>CudafyModes.Target</c> and
/// <c>CudafyModes.DeviceId</c>, and loads the translated module on it.
/// </summary>
public static void prepareGPU()
{
    // Translate with the translator's parameterless overload.
    km = CudafyTranslator.Cudafy();

    // Device chosen through the global CudafyModes settings.
    gpu = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId);

    // The module must be loaded before any kernel can be launched on gpu.
    gpu.LoadModule(km);
}
/// <summary>
/// Builds the transition and output tables for <paramref name="keys"/> on the
/// host, copies them and the input text to the device, runs the <c>Dot</c>
/// kernel and returns the per-character result buffer.
/// </summary>
/// <param name="keys">Keys handed to the <c>StringSearch</c> constructor.</param>
/// <param name="I">Input text to scan.</param>
/// <param name="n">Not used by this method.</param>
/// <returns>A char array with one entry per input character, as written by the kernel.</returns>
public static char[] Execute(String[] keys, string I, int n)
{
    GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target, 0);
    eArchitecture arch = gpu.GetArchitecture();
    CudafyModule module = CudafyTranslator.Cudafy(arch);
    gpu.LoadModule(module);

    // Host-side preparation is timed; the elapsed value is not read here.
    Stopwatch prepTimer = new Stopwatch();
    prepTimer.Start();

    StringSearch search = new StringSearch(keys);
    string alphabet = "ABCDEFGHI*KLMN*PQRST*VWXYZ";
    int alpha = alphabet.Length;

    // Transition table, initialised to -1 before the search object fills it in.
    int[,] table1 = new int[StringSearch.nodeCount, alpha];
    for (int node = 0; node < StringSearch.nodeCount; node++)
    {
        for (int symbol = 0; symbol < alpha; symbol++)
        {
            table1[node, symbol] = -1;
        }
    }

    search.build_table1(table1, search._root);

    char[] input = I.ToCharArray();
    int length = I.Length;
    I = "";

    int[] output_table = new int[StringSearch.nodeCount];
    search.build_tableO(output_table, search._root);

    // Replace the search object with an empty one, as in the original flow.
    search = new StringSearch();
    char[] matched_result = new char[length];
    prepTimer.Stop();

    gpu.SetCurrentContext();

    // Shape templates for the device allocations.
    int[] outputShape = new int[StringSearch.nodeCount];
    int[,] tableShape = new int[StringSearch.nodeCount, alpha];

    int[,] table1_d = gpu.Allocate<int>(tableShape);
    int[] output_table_d = gpu.Allocate<int>(outputShape);
    char[] matched_result_d = gpu.Allocate<char>(length);
    char[] input_d = gpu.Allocate<char>(length);
    int[] input_length_d = gpu.Allocate<int>(1);
    int[] input_length = { length };

    gpu.CopyToDevice(table1, table1_d);
    gpu.CopyToDevice(output_table, output_table_d);
    gpu.CopyToDevice(matched_result, matched_result_d);
    gpu.CopyToDevice(input, input_d);
    gpu.CopyToDevice(input_length, input_length_d);

    // Enough blocks of N threads to cover the whole input.
    int block = (int)Math.Ceiling((double)length / N);
    gpu.Launch(block, N).Dot(table1_d, output_table_d, matched_result_d, input_d, input_length_d);

    gpu.CopyFromDevice(matched_result_d, matched_result);
    gpu.FreeAll();
    return matched_result;
}