/// <summary>
/// Private constructor: stores the shared <c>JITER.Singleton</c> in <c>_converter</c>
/// and assigns a freshly constructed <c>BUFFERS</c> object to <c>Buffer</c>.
/// </summary>
private Parallel()
{
    this._converter = JITER.Singleton;
    this.Buffer = new BUFFERS();

    // Disabled in the original source; retained for reference only:
    //InitCuda();
    // var ok = GC.TryStartNoGCRegion(200000000);
}
/// <summary>
/// Command-line entry point. Allocates the BCL heap, calls the BCL/file-system
/// initialisers, registers the core library and the assembly named by
/// <c>args[0]</c> through <c>GfsAddFile</c>, then prints that assembly's metadata.
/// </summary>
/// <param name="args">Command-line arguments; <c>args[0]</c> is the path of the assembly to print.</param>
static void Main(string[] args)
{
    // Report a usage error instead of failing later with IndexOutOfRangeException on args[0].
    if (args == null || args.Length == 0)
    {
        Console.Error.WriteLine("Usage: <program> <assembly-path>");
        Environment.ExitCode = 1;
        return;
    }

    unsafe
    {
        // The value is unused; the property is still read first, exactly as before.
        // NOTE(review): presumably this forces JITER initialisation before any buffer is created - confirm.
        _ = JITER.Singleton;

        BUFFERS buffers = new BUFFERS();

        int the_size = 536870912; // 512 MiB
        IntPtr b = buffers.New(the_size);
        RUNTIME.BclPtr = b;
        RUNTIME.BclPtrSize = (ulong)the_size;

        int max_threads = 1;
        IntPtr b2 = buffers.New(sizeof(int*));
        InitTheBcl(b, the_size, 2 * 16777216, max_threads, b2);
        InitFileSystem();

        // Set up corlib.dll in file system.
        AddFileToGfs(buffers, RUNTIME.FindCoreLib());

        InitializeBCL1();
        InitializeBCL2();
    }

    // Open assembly and print contents.
    {
        // A separate BUFFERS instance is used for this file, as in the original code.
        string assem = AddFileToGfs(new BUFFERS(), args[0]);
        var meta = BclGetMeta(assem);
        BclPrintMeta(meta);
    }
}

/// <summary>
/// Reads the whole file at <paramref name="full_path"/> and passes its bytes to
/// <c>GfsAddFile</c> under the file's name without directory.
/// </summary>
/// <param name="buffers">Allocator used for the name copy and the 4-byte result slot.</param>
/// <param name="full_path">Path of the file to read.</param>
/// <returns>The file name (no directory part) the bytes were registered under.</returns>
private static string AddFileToGfs(BUFFERS buffers, string full_path)
{
    string file_name = Path.GetFileName(full_path);

    // File.ReadAllBytes keeps reading until the entire file is in memory; a single
    // Stream.Read call is allowed to return fewer bytes than requested.
    byte[] file_bytes = File.ReadAllBytes(full_path);

    // The array is pinned and the handle is deliberately not freed here, matching the
    // original behaviour. NOTE(review): presumably GfsAddFile keeps using this address
    // after it returns - confirm before ever releasing the handle.
    GCHandle file_bytes_handle = GCHandle.Alloc(file_bytes, GCHandleType.Pinned);
    IntPtr file_bytes_ptr = file_bytes_handle.AddrOfPinnedObject();

    // Copy the ANSI name (plus its terminator) into a BUFFERS allocation, then release
    // the temporary unmanaged string, which the original code leaked.
    // NOTE(review): assumes BUFFERS.Cp has finished copying when it returns - confirm.
    IntPtr name_in_buffer;
    IntPtr name_hglobal = Marshal.StringToHGlobalAnsi(file_name);
    try
    {
        name_in_buffer = buffers.New(file_name.Length + 1);
        BUFFERS.Cp(name_in_buffer, name_hglobal, file_name.Length + 1);
    }
    finally
    {
        Marshal.FreeHGlobal(name_hglobal);
    }

    IntPtr result_slot = buffers.New(sizeof(int));
    GfsAddFile(name_in_buffer, file_bytes_ptr, file_bytes.LongLength, result_slot);
    return file_name;
}
/// <summary>
/// Compiles the method behind <paramref name="simpleKernel"/> and launches it with the CUDA driver API.
/// </summary>
/// <remarks>
/// Steps visible in this block, in order:
/// 1. If the "import-only" option is on, only JustImport(simpleKernel) is called and the method returns.
/// 2. The kernel's assembly is read with StickyReadModule and the MethodInfo is imported as a MethodReference.
/// 3. "compile" phase: Compile, SetModule, StoreJits, then GetCudaFunction yields the kernel's CUfunction.
/// 4. "deep copy" phase: the parameter count (plus one when the basic block has a "this") must be 1 or 2,
///    otherwise an Exception is thrown; when there is a "this", simpleKernel.Target is passed to
///    buffer.AddDataStructure and the returned pointer becomes the first kernel argument.
/// 5. "kernel cctor set up" phase: each method returned by ConstructCctorOrder is launched with no kernel
///    parameters on a tiling built by MakeLinearTiling(1, ...), followed by cuCtxSynchronize.
/// 6. "kernel call" phase: the two argument slots are pinned and the kernel is launched on a tiling built
///    from number_of_threads.
/// NOTE(review): the end of this method (result check after the launch, and whatever catch/finally releases
/// handle1 and handle2) is not visible in this excerpt - confirm against the complete file.
/// </remarks>
/// <param name="number_of_threads">Value handed to MakeLinearTiling to size the kernel launch.</param>
/// <param name="simpleKernel">Kernel delegate; its Method is compiled and its Target is the kernel's "this".</param>
public static void For(int number_of_threads, SimpleKernel simpleKernel) { if (Campy.Utils.Options.IsOn("import-only")) { JustImport(simpleKernel); return; } GCHandle handle1 = default(GCHandle); GCHandle handle2 = default(GCHandle); try { unsafe { System.Reflection.MethodInfo method_info = simpleKernel.Method; String kernel_assembly_file_name = method_info.DeclaringType.Assembly.Location; Mono.Cecil.ModuleDefinition md = Campy.Meta.StickyReadMod.StickyReadModule( kernel_assembly_file_name, new ReaderParameters { ReadSymbols = true }); MethodReference method_reference = md.ImportReference(method_info); CUfunction ptr_to_kernel = default(CUfunction); CUmodule module = default(CUmodule); Campy.Utils.TimePhase.Time("compile ", () => { IntPtr image = Singleton._compiler.Compile(method_reference, simpleKernel.Target); module = Singleton._compiler.SetModule(method_reference, image); Singleton._compiler.StoreJits(module); ptr_to_kernel = Singleton._compiler.GetCudaFunction(method_reference, module); }); RUNTIME.BclCheckHeap(); BUFFERS buffer = Singleton.Buffer; IntPtr kernel_target_object = IntPtr.Zero; Campy.Utils.TimePhase.Time("deep copy ", () => { int count = simpleKernel.Method.GetParameters().Length; var bb = Singleton._compiler.GetBasicBlock(method_reference); if (bb.HasThis) { count++; } if (!(count == 1 || count == 2)) { throw new Exception("Expecting at least one parameter for kernel."); } if (bb.HasThis) { kernel_target_object = buffer.AddDataStructure(simpleKernel.Target); } }); Campy.Utils.TimePhase.Time("kernel cctor set up", () => { // For each cctor, run on GPU. // Construct dependency graph of methods. List <MethodReference> order_list = COMPILER.Singleton.ConstructCctorOrder(); // Finally, call cctors. 
foreach (var bb in order_list) { if (Campy.Utils.Options.IsOn("trace-cctors")) { System.Console.WriteLine("Executing cctor " + bb.FullName); } var cctor = Singleton._compiler.GetCudaFunction(bb, module); var res = CUresult.CUDA_SUCCESS; Campy.Utils.CudaHelpers.MakeLinearTiling(1, out Campy.Utils.CudaHelpers.dim3 tile_size, out Campy.Utils.CudaHelpers.dim3 tiles); res = Cuda.cuLaunchKernel( cctor, tiles.x, tiles.y, tiles.z, // grid has one block. tile_size.x, tile_size.y, tile_size.z, // n threads. 0, // no shared memory default(CUstream), (IntPtr)IntPtr.Zero, (IntPtr)IntPtr.Zero ); CudaHelpers.CheckCudaError(res); res = Cuda.cuCtxSynchronize(); // Make sure it's copied back to host. CudaHelpers.CheckCudaError(res); } }); if (Campy.Utils.Options.IsOn("trace-cctors")) { System.Console.WriteLine("Done with cctors"); } Campy.Utils.TimePhase.Time("kernel call ", () => { IntPtr[] parm1 = new IntPtr[1]; IntPtr[] parm2 = new IntPtr[1]; parm1[0] = kernel_target_object; parm2[0] = buffer.New(BUFFERS.SizeOf(typeof(int))); IntPtr[] x1 = parm1; handle1 = GCHandle.Alloc(x1, GCHandleType.Pinned); IntPtr pointer1 = handle1.AddrOfPinnedObject(); IntPtr[] x2 = parm2; handle2 = GCHandle.Alloc(x2, GCHandleType.Pinned); IntPtr pointer2 = handle2.AddrOfPinnedObject(); IntPtr[] kp = new IntPtr[] { pointer1, pointer2 }; var res = CUresult.CUDA_SUCCESS; fixed(IntPtr * kernelParams = kp) { Campy.Utils.CudaHelpers.MakeLinearTiling(number_of_threads, out Campy.Utils.CudaHelpers.dim3 tile_size, out Campy.Utils.CudaHelpers.dim3 tiles); //MakeLinearTiling(1, out dim3 tile_size, out dim3 tiles); res = Cuda.cuLaunchKernel( ptr_to_kernel, tiles.x, tiles.y, tiles.z, // grid has one block. tile_size.x, tile_size.y, tile_size.z, // n threads. 0, // no shared memory default(CUstream), (IntPtr)kernelParams, (IntPtr)IntPtr.Zero ); }
/// <summary>
/// Private constructor: stores the shared <c>COMPILER.Singleton</c> in <c>_compiler</c>
/// and assigns a freshly constructed <c>BUFFERS</c> object to <c>Buffer</c>.
/// </summary>
private Parallel()
{
    this._compiler = COMPILER.Singleton;
    this.Buffer = new BUFFERS();
}
/// <summary>
/// Compiles the method behind <paramref name="simpleKernel"/>, sets up its two kernel arguments,
/// launches it through <c>cuLaunchKernel</c>, waits for the context to finish, and then calls
/// <c>SynchDataStructures</c> on the buffer.
/// </summary>
/// <param name="number_of_threads">Value handed to MakeLinearTiling to derive the grid and block dimensions.</param>
/// <param name="simpleKernel">Kernel delegate; its Method is compiled and its Target becomes the first kernel argument.</param>
/// <exception cref="Exception">
/// Thrown when the kernel's parameter count (plus one if it has a "this") is neither 1 nor 2.
/// Any exception raised inside is written to the console and rethrown unchanged.
/// </exception>
public static void For(int number_of_threads, SimpleKernel simpleKernel)
{
    GCHandle handle1 = default(GCHandle);
    GCHandle handle2 = default(GCHandle);
    try
    {
        unsafe
        {
            //////// COMPILE KERNEL INTO GPU CODE ///////
            /////////////////////////////////////////////
            var stopwatch_cuda_compile = Stopwatch.StartNew();
            IntPtr image = Singleton()._converter.Compile(simpleKernel.Method, simpleKernel.Target);
            CUfunction ptr_to_kernel = Singleton()._converter.GetCudaFunction(simpleKernel.Method, image);
            stopwatch_cuda_compile.Stop();
            var elapse_cuda_compile = stopwatch_cuda_compile.Elapsed;

            RUNTIME.CheckHeap();

            //////// COPY DATA INTO GPU /////////////////
            /////////////////////////////////////////////
            var stopwatch_deep_copy_to = Stopwatch.StartNew();

            BUFFERS buffer = Singleton().Buffer;

            // Set up parameters. An instance ("this") counts as one parameter.
            int count = simpleKernel.Method.GetParameters().Length;
            var bb = Singleton()._converter.GetBasicBlock(simpleKernel.Method);
            if (bb.HasThis)
            {
                count++;
            }
            if (!(count == 1 || count == 2))
            {
                throw new Exception("Expecting at least one parameter for kernel.");
            }

            // Each kernel argument lives in its own one-element array so that its address can be pinned.
            IntPtr[] parm1 = new IntPtr[1];
            IntPtr[] parm2 = new IntPtr[1];

            // The method really should have a "this" because it's a closure object.
            // When it does not, the first argument stays IntPtr.Zero.
            if (bb.HasThis)
            {
                RUNTIME.CheckHeap();
                parm1[0] = buffer.AddDataStructure(simpleKernel.Target);
            }

            // Second argument: a buffer allocation sized for one int.
            parm2[0] = buffer.New(BUFFERS.SizeOf(typeof(int)));

            // Stop and read the deep-copy stopwatch itself. The original restarted it and then
            // reported the compile stopwatch's elapsed time as the deep-copy time.
            stopwatch_deep_copy_to.Stop();
            var elapse_deep_copy_to = stopwatch_deep_copy_to.Elapsed;

            //////// LAUNCH KERNEL //////////////////////
            /////////////////////////////////////////////
            var stopwatch_call_kernel = Stopwatch.StartNew();

            // Pin the argument arrays; the handles are released in the finally block below.
            handle1 = GCHandle.Alloc(parm1, GCHandleType.Pinned);
            IntPtr pointer1 = handle1.AddrOfPinnedObject();
            handle2 = GCHandle.Alloc(parm2, GCHandleType.Pinned);
            IntPtr pointer2 = handle2.AddrOfPinnedObject();

            RUNTIME.CheckHeap();

            // kernelParams is an array holding the address of each argument's storage.
            IntPtr[] kp = new IntPtr[] { pointer1, pointer2 };
            CUresult res;
            fixed (IntPtr* kernelParams = kp)
            {
                Campy.Utils.CudaHelpers.MakeLinearTiling(
                    number_of_threads,
                    out Campy.Utils.CudaHelpers.dim3 tile_size,
                    out Campy.Utils.CudaHelpers.dim3 tiles);

                res = Cuda.cuLaunchKernel(
                    ptr_to_kernel,
                    tiles.x, tiles.y, tiles.z,                // grid dimensions
                    tile_size.x, tile_size.y, tile_size.z,    // block dimensions
                    0,                                        // no shared memory
                    default(CUstream),
                    (IntPtr)kernelParams,
                    (IntPtr)IntPtr.Zero);
            }
            CudaHelpers.CheckCudaError(res);

            // Block until the launched work has completed before timing stops and data is read back.
            res = Cuda.cuCtxSynchronize();
            CudaHelpers.CheckCudaError(res);
            stopwatch_call_kernel.Stop();
            var elapse_call_kernel = stopwatch_call_kernel.Elapsed;

            if (Campy.Utils.Options.IsOn("jit_trace"))
            {
                System.Console.WriteLine("cuda compile " + elapse_cuda_compile);
                System.Console.WriteLine("deep copy in " + elapse_deep_copy_to);
                System.Console.WriteLine("cuda kernel " + elapse_call_kernel);
            }

            //////// COPY DATA BACK /////////////////////
            /////////////////////////////////////////////
            RUNTIME.CheckHeap();
            var stopwatch_deep_copy_back = Stopwatch.StartNew();
            buffer.SynchDataStructures();
            stopwatch_deep_copy_back.Stop();
            RUNTIME.CheckHeap();
            var elapse_deep_copy_back = stopwatch_deep_copy_back.Elapsed;

            if (Campy.Utils.Options.IsOn("jit_trace"))
            {
                System.Console.WriteLine("deep copy out " + elapse_deep_copy_back);
            }
        }
    }
    catch (Exception e)
    {
        Console.WriteLine(e);
        // Bare rethrow keeps the original stack trace; "throw e;" would reset it to this frame.
        throw;
    }
    finally
    {
        // Release whichever pins were actually taken; an exception may occur before either is allocated.
        if (handle1.IsAllocated)
        {
            handle1.Free();
        }
        if (handle2.IsAllocated)
        {
            handle2.Free();
        }
    }
}