/// <summary>
/// Runs a kernel through an external <c>mpirun.exe</c> process and returns the bytes the process
/// wrote to standard output, reinterpreted as an array of <typeparamref name="T"/>.
/// </summary>
/// <typeparam name="T">
/// Element type of the result. The raw output is copied with <c>Buffer.BlockCopy</c>, which only
/// accepts arrays of primitive types.
/// </typeparam>
/// <param name="kernelBinary">Bytes written to a temporary file; the path of that file is handed to mpirun as the program to launch.</param>
/// <param name="function">Forwarded verbatim as a command-line argument to the launched program.</param>
/// <param name="bufferSize">Forwarded as a command-line argument; how the launched program interprets it is not visible here.</param>
/// <param name="loaderParams">Supplies <c>ProcessCount</c>, used as mpirun's <c>-n</c> value.</param>
/// <param name="kernelParams">Extra arguments; each one is appended to the command line using its <c>ToString()</c> result.</param>
/// <returns>
/// The captured standard output as <typeparamref name="T"/> elements. When the output length is not a
/// multiple of the element size, the last element is only partially filled (remaining bytes stay zero).
/// </returns>
protected T[] InternalExecuteMPI<T>(byte[] kernelBinary, String function, int bufferSize, ParallelTaskParams loaderParams, params Object[] kernelParams) where T : struct
{
    TriggerCheckpoint(ParallelExecutionCheckpointType.CheckpointStart);

    String binaryPath = Path.GetTempFileName();
    T[] result;

    try
    {
        File.WriteAllBytes(binaryPath, kernelBinary);

        TriggerCheckpoint(ParallelExecutionCheckpointType.CheckpointPlatformInit);

        String mpiDirectory = SystemArchitecture.ProgramFolder(ArchitectureType.x86, @"OpenMPI*");

        // The process object owns OS handles; dispose it deterministically.
        using (Process mpirunProcess = new Process())
        {
            mpirunProcess.StartInfo.CreateNoWindow = true;
            mpirunProcess.StartInfo.UseShellExecute = false;
            mpirunProcess.StartInfo.RedirectStandardOutput = true;
            mpirunProcess.StartInfo.EnvironmentVariables["PATH"] += @";" + mpiDirectory + @"\bin";
            mpirunProcess.StartInfo.FileName = mpiDirectory + @"\bin\mpirun.exe";

            TriggerCheckpoint(ParallelExecutionCheckpointType.CheckpointKernelBuild);

            String arguments = String.Format("-n {0} \"{1}\" {2} {3} {4}",
                loaderParams.ProcessCount,
                ShortPath(binaryPath),
                TypeName(typeof(T)),
                function,
                bufferSize);

            // A caller may pass an explicit null for the params array; treat it as "no extra arguments".
            if (kernelParams != null)
            {
                foreach (Object param in kernelParams)
                {
                    arguments += " " + param.ToString();
                }
            }

            mpirunProcess.StartInfo.Arguments = arguments;

            TriggerCheckpoint(ParallelExecutionCheckpointType.CheckpointDeviceWrite);

            mpirunProcess.Start();

            TriggerCheckpoint(ParallelExecutionCheckpointType.CheckpointKernelExecute);

            byte[] processOutput;
            using (MemoryStream resultStream = new MemoryStream())
            {
                // Blocks until the child closes its standard output.
                mpirunProcess.StandardOutput.BaseStream.CopyTo(resultStream);
                processOutput = resultStream.ToArray();
            }

            TriggerCheckpoint(ParallelExecutionCheckpointType.CheckpointDeviceRead);

            // Integer ceiling division: float has a 24-bit mantissa, so the former
            // Math.Ceiling((float)length / size) could under-allocate for outputs above ~16 MiB
            // and make the BlockCopy below throw.
            int elementSize = Marshal.SizeOf(typeof(T));
            int elementCount = (int)(((long)processOutput.Length + elementSize - 1) / elementSize);

            result = new T[elementCount];
            Buffer.BlockCopy(processOutput, 0, result, 0, processOutput.Length);

            // Make sure mpirun has really terminated before the temporary binary is removed.
            mpirunProcess.WaitForExit();
        }
    }
    finally
    {
        // Best-effort cleanup of the temporary binary; a failure to delete must not
        // mask the result or an exception that is already propagating.
        try
        {
            File.Delete(binaryPath);
        }
        catch (IOException)
        {
        }
        catch (UnauthorizedAccessException)
        {
        }
    }

    TriggerCheckpoint(ParallelExecutionCheckpointType.CheckpointPlatformDeinit);

    return result;
}
/// <summary>
/// Builds the given OpenCL source, runs one kernel from it on the queue selected by
/// <paramref name="loaderParams"/>, and reads the result buffer back to the host.
/// </summary>
/// <typeparam name="T">Element type of the result buffer.</typeparam>
/// <param name="source">OpenCL source text; the line <c>#define OpenCL</c> is prepended before compilation.</param>
/// <param name="function">Name of the kernel to create from the built program.</param>
/// <param name="bufferSize">Number of <typeparamref name="T"/> elements in the result buffer (device and host side).</param>
/// <param name="loaderParams">Supplies <c>OpenCLDevice</c> (queue selection) and <c>GlobalWorkers</c> (two-dimensional global work size).</param>
/// <param name="kernelParams">Additional kernel arguments; they are converted by <c>WrapDeviceVariables</c> and bound after the result buffer.</param>
/// <returns>The host copy of the result buffer, which is bound as kernel argument 0.</returns>
protected T[] InternalExecuteOpencl<T>(String source, String function, int bufferSize, ParallelTaskParams loaderParams, params Object[] kernelParams) where T : struct
{
    TriggerCheckpoint(ParallelExecutionCheckpointType.CheckpointStart);

    ComputeCommandQueue queue = QueueWithDevice(loaderParams.OpenCLDevice);

    TriggerCheckpoint(ParallelExecutionCheckpointType.CheckpointPlatformInit);

    String updatedSource = "#define OpenCL\r\n" + source;
    T[] resultBuffer;

    // Program, result buffer and kernel are created here, so they are released here as well;
    // previously none of them was ever disposed.
    using (ComputeProgram program = new ComputeProgram(queue.Context, updatedSource))
    {
        program.Build(new ComputeDevice[] { queue.Device }, null, null, IntPtr.Zero);

        TriggerCheckpoint(ParallelExecutionCheckpointType.CheckpointKernelBuild);

        resultBuffer = new T[bufferSize];

        using (ComputeBuffer<T> resultBufferVar = new ComputeBuffer<T>(queue.Context, ComputeMemoryFlags.WriteOnly, bufferSize))
        {
            // Argument 0 is always the result buffer; caller-supplied parameters follow in order.
            // NOTE(review): the objects returned by WrapDeviceVariables are not released here because
            // their ownership is not visible from this method - confirm whether they should be disposed too.
            List<ComputeMemory> vars = new List<ComputeMemory>();
            vars.Add(resultBufferVar);
            vars.AddRange(WrapDeviceVariables(kernelParams, queue.Context));

            using (ComputeKernel kernel = program.CreateKernel(function))
            {
                for (int i = 0; i < vars.Count; i++)
                {
                    kernel.SetMemoryArgument(i, vars[i]);
                }

                TriggerCheckpoint(ParallelExecutionCheckpointType.CheckpointDeviceWrite);

                long[] workersGlobal = new long[2] { loaderParams.GlobalWorkers.Width, loaderParams.GlobalWorkers.Height };
                queue.Execute(kernel, null, workersGlobal, null, null);

                TriggerCheckpoint(ParallelExecutionCheckpointType.CheckpointKernelExecute);

                // The read is requested as non-blocking (third argument is false); the Finish() call
                // below is what guarantees the host array is populated before it is returned.
                queue.ReadFromBuffer<T>(resultBufferVar, ref resultBuffer, false, null);

                TriggerCheckpoint(ParallelExecutionCheckpointType.CheckpointDeviceRead);

                queue.Finish();
            }
        }
    }

    TriggerCheckpoint(ParallelExecutionCheckpointType.CheckpointPlatformDeinit);

    return resultBuffer;
}
/// <summary>
/// Loads a kernel from a PTX image into the CUDA context selected by <paramref name="loaderParams"/>,
/// runs it, and returns the contents of the result device buffer.
/// </summary>
/// <typeparam name="T">Element type of the result buffer.</typeparam>
/// <param name="kernelBinary">Kernel image passed to <c>LoadKernelPTX</c>.</param>
/// <param name="function">Name of the kernel function to load from the image.</param>
/// <param name="bufferSize">Number of <typeparamref name="T"/> elements allocated for the result buffer on the device.</param>
/// <param name="loaderParams">Supplies <c>CudaDevice</c>, <c>BlockSize</c> and <c>GridSize</c> (the latter two as width/height pairs).</param>
/// <param name="kernelParams">Additional kernel arguments; they are converted by <c>WrapDeviceVariables</c> and passed after the result buffer pointer.</param>
/// <returns>The host copy of the result buffer, whose device pointer is passed as the first kernel argument.</returns>
protected T[] InternalExecuteCuda<T>(byte[] kernelBinary, String function, int bufferSize, ParallelTaskParams loaderParams, params Object[] kernelParams) where T : struct
{
    TriggerCheckpoint(ParallelExecutionCheckpointType.CheckpointStart);

    CudaContext context = ContextWithDevice(loaderParams.CudaDevice);

    TriggerCheckpoint(ParallelExecutionCheckpointType.CheckpointPlatformInit);

    // No separate build step is performed on this path; the checkpoint is still raised.
    TriggerCheckpoint(ParallelExecutionCheckpointType.CheckpointKernelBuild);

    // Item1 is the value handed to the kernel, Item2 the disposable that owns it (null when nothing has to be released).
    List<Tuple<Object, IDisposable>> vars = new List<Tuple<Object, IDisposable>>();
    T[] resultBuffer;

    try
    {
        CudaDeviceVariable<T> resultBufferVar = new CudaDeviceVariable<T>(bufferSize);

        // Register the buffer for cleanup before doing anything else that can throw.
        vars.Add(new Tuple<Object, IDisposable>(resultBufferVar.DevicePointer, resultBufferVar));
        resultBufferVar.Memset(0);

        vars.AddRange(WrapDeviceVariables(kernelParams, true));

        TriggerCheckpoint(ParallelExecutionCheckpointType.CheckpointDeviceWrite);

        // NOTE(review): the loaded kernel is never explicitly unloaded from the context here - confirm whether that is intended.
        CudaKernel kernel = context.LoadKernelPTX(kernelBinary, function);
        kernel.BlockDimensions = new dim3(loaderParams.BlockSize.Width, loaderParams.BlockSize.Height);
        kernel.GridDimensions = new dim3(loaderParams.GridSize.Width, loaderParams.GridSize.Height);

        List<Object> kernelArguments = new List<Object>(vars.Count);
        foreach (Tuple<Object, IDisposable> entry in vars)
        {
            kernelArguments.Add(entry.Item1);
        }

        kernel.Run(kernelArguments.ToArray());

        TriggerCheckpoint(ParallelExecutionCheckpointType.CheckpointKernelExecute);

        // Conversion from the device variable to T[]; presumably this copies device memory back to the host - confirm.
        resultBuffer = resultBufferVar;

        TriggerCheckpoint(ParallelExecutionCheckpointType.CheckpointDeviceRead);
    }
    finally
    {
        // Release device resources on failure as well; previously an exception anywhere above skipped the cleanup.
        foreach (Tuple<Object, IDisposable> entry in vars)
        {
            if (entry.Item2 != null)
            {
                entry.Item2.Dispose();
            }
        }
    }

    TriggerCheckpoint(ParallelExecutionCheckpointType.CheckpointPlatformDeinit);

    return resultBuffer;
}