/// <summary>
/// Measures single-precision matrix-multiply throughput on one GPU.
/// Three random 576 x 576 operands are packed back to back in a single device
/// buffer, opcuda_sgemm is issued 100 times, the device is synchronized, and the
/// resulting rate (GF/sec) is stored in benchmarks.gpu_sgemm_performance_dev[dev].
/// </summary>
/// <param name="dev">Device index handed to opcuda_set_device; also the slot written in the per-device result array (allocated with ndev entries).</param>
/// <exception cref="ExecutionEngineException">opcuda_cublas_init or opcuda_shutdown returned a non-zero status.</exception>
/// <exception cref="System.Exception">The host-to-device copy returned a non-zero status.</exception>
public void run_benchmark_gpu_sgemm(uint dev)
{
    log.Add("running sgemm, device " + dev);

    Random rg = new Random();
    int d = 96 * 6;
    int fsz = 3 * d * d; // A, B and C, each d x d, stored contiguously
    float[] host_float_buf = new float[fsz];
    for (int a = 0; a < host_float_buf.Length; a++)
        host_float_buf[a] = (float)(0.01 * rg.Next(-1000, 1000));

    int status = opcuda_cublas_init();
    if (status != 0) throw new ExecutionEngineException("opcuda_cublas_init failed, status " + status);
    opcuda_set_device(dev);

    double gigaflops_per_second;
    uint device_float_buf_ptr = opcuda_mem_alloc((uint)(host_float_buf.Length * sizeof(float)));
    try
    {
        // Device addresses of the three operands inside the single allocation.
        uint aptr = device_float_buf_ptr;
        uint bptr = (uint)(device_float_buf_ptr + d * d * sizeof(float));
        uint cptr = (uint)(device_float_buf_ptr + 2 * d * d * sizeof(float));

        unsafe
        {
            fixed (float* bufp = &host_float_buf[0])
            {
                status = opcuda_memcpy_h2d(device_float_buf_ptr, (IntPtr)bufp, (uint)(fsz * sizeof(float)));
                if (status != 0) throw new System.Exception("opcuda_memcpy_h2d failed, status " + status);
            }
        }

        CStopWatch sw = new CStopWatch();
        sw.Reset();
        int niter = 100;
        for (int iter = 0; iter < niter; iter++)
        {
            opcuda_sgemm(d, d, d, 1, aptr, d, bptr, d, 0, cptr, d);
        }
        // The stopwatch is read only after the device has been synchronized.
        opcuda_thread_synchronize();
        double time1 = sw.Peek();

        // 2 * d^3 floating-point operations per d x d x d product.
        double nflops = 2d * niter * (double)d * (double)d * (double)d;
        gigaflops_per_second = nflops / (1000000000d * time1);
    }
    finally
    {
        // Previously the buffer leaked whenever the upload check threw.
        opcuda_mem_free_device(device_float_buf_ptr);
    }

    // Single checked shutdown; the former second, unchecked call was redundant.
    status = opcuda_shutdown();
    if (status != 0) throw new ExecutionEngineException("opcuda_shutdown failed, status " + status);

    if (benchmarks == null) benchmarks = new SBenchmarks();
    if (benchmarks.gpu_sgemm_performance_dev == null) benchmarks.gpu_sgemm_performance_dev = new double[ndev];
    benchmarks.gpu_sgemm_performance_dev[dev] = gigaflops_per_second;
    log.Add("performance: " + String.Format("{0:0.0}", gigaflops_per_second) + " GF/sec");
}
/// <summary>
/// CPU counterpart of the sgemm benchmark: fills one managed buffer with three
/// random 576 x 576 operands, calls opc_sgemm on them 100 times and stores the
/// resulting rate (GF/sec) in benchmarks.cpu_sgemm_performance.
/// </summary>
public void run_benchmark_cpu_sgemm()
{
    log.Add("running sgemm on the cpu");

    Random rng = new Random();
    int dim = 96 * 6;
    int operandSize = dim * dim;

    // One array holds all three operands back to back.
    float[] operands = new float[3 * operandSize];
    int idx = 0;
    while (idx < operands.Length)
    {
        operands[idx] = (float)(0.01 * rng.Next(-1000, 1000));
        idx++;
    }

    CStopWatch timer = new CStopWatch();
    timer.Reset();
    int passes = 100;
    unsafe
    {
        // Pin the array once and derive the three operand pointers from its base.
        fixed (float* basep = &operands[0])
        {
            float* ap = basep;
            float* bp = basep + operandSize;
            float* cp = basep + 2 * operandSize;
            int pass = 0;
            while (pass < passes)
            {
                opc_sgemm(ap, bp, cp, dim, dim, dim, dim, dim, dim);
                pass++;
            }
        }
    }
    double elapsed = timer.Peek();

    // 2 * dim^3 floating-point operations per product.
    double flops = 2d * passes * (double)dim * (double)dim * (double)dim;
    double gflopsPerSecond = flops / (1000000000d * elapsed);

    if (benchmarks == null)
    {
        benchmarks = new SBenchmarks();
    }
    benchmarks.cpu_sgemm_performance = gflopsPerSecond;
}
/// <summary>
/// Benchmarks the GPU random-number kernel invoked through opcuda_mt_benchmark.
/// Two timings are taken over 20 batches of 4096 * 25 scenarios each: one where
/// every batch is followed by a device-to-host copy, and one without any copy.
/// Results (million evaluations per second) go to
/// benchmarks.gpu_mt_with_copy_performance_dev[dev] and
/// benchmarks.gpu_mt_no_copy_performance_dev[dev].
/// </summary>
/// <param name="dev">Device index handed to opcuda_set_device; also the slot written in the per-device result arrays (allocated with ndev entries).</param>
/// <exception cref="ExecutionEngineException">opcuda_cublas_init or opcuda_shutdown returned a non-zero status.</exception>
public void run_benchmark_gpu_mt(uint dev)
{
    // The original message said "sglv1f" (copied from another benchmark).
    log.Add("running mt, device " + dev);
    int nscen_per_batch = 4096 * 25;
    int nbatches = 20;

    int status = opcuda_cublas_init();
    if (status != 0) throw new ExecutionEngineException("opcuda_cublas_init failed, status " + status);
    opcuda_set_device(dev);
    opcuda_mc_load_mt_gpu();

    // Seed every generator reported by opcuda_mc_nrng with a random non-negative int.
    Random rand = new Random();
    int nrng = opcuda_mc_nrng();
    CArray host_seed_rg = new CArray(nrng, EType.int_t, EMemorySpace.host, null, "host_seed_rg");
    unsafe
    {
        int* seed_rg = (int*)host_seed_rg.hptr;
        for (int rg = 0; rg < nrng; rg++)
        {
            seed_rg[rg] = (int)(rand.NextDouble() * int.MaxValue);
        }
    }
    CArray device_rgstatus = new CArray(opcuda_mc_status_sz(), EType.int_t, EMemorySpace.device, null, "mcbuf._device_rgstatus");
    unsafe
    {
        opcuda_mc_setseed(host_seed_rg.hptr, device_rgstatus.ptr);
    }

    CArray device_unif_s = new CArray(nscen_per_batch, EType.float_t, EMemorySpace.device, null, "device_unif_s");
    CArray host_unif_s = new CArray(nscen_per_batch, EType.float_t, EMemorySpace.host, null, "host_unif_s");

    if (benchmarks == null) benchmarks = new SBenchmarks();

    // Phase 1: generation followed by a device-to-host copy after every batch.
    CStopWatch sw = new CStopWatch();
    sw.Reset();
    unsafe
    {
        for (int b = 0; b < nbatches; b++)
        {
            opcuda_mt_benchmark(device_rgstatus.ptr, device_unif_s.ptr, nscen_per_batch);
            // NOTE(review): both arrays are declared EType.float_t but the copy size uses
            // sizeof(short), i.e. 2 bytes per element. Left unchanged because the kernel's
            // output format is not visible here -- confirm whether sizeof(float) was intended.
            opcuda_memcpy_d2h(device_unif_s.ptr, host_unif_s.hptr, (uint)(sizeof(short) * host_unif_s.length));
        }
    }
    opcuda_thread_synchronize();
    double time = sw.Peek();
    double nevals = (double)nbatches * (double)nscen_per_batch;
    double milion_evals_per_second = nevals / (1000000 * time);

    if (benchmarks.gpu_mt_with_copy_performance_dev == null) benchmarks.gpu_mt_with_copy_performance_dev = new double[ndev];
    benchmarks.gpu_mt_with_copy_performance_dev[dev] = milion_evals_per_second;
    log.Add("mt performance (with copy): " + String.Format("{0:0.0}", milion_evals_per_second) + " milion eval/sec");

    // Phase 2: generation only. The original called opcuda_shutdown() before this
    // loop although the loop still uses the device buffers; shutdown now happens
    // exactly once, after all device work is finished.
    sw.Reset();
    unsafe
    {
        for (int b = 0; b < nbatches; b++)
        {
            opcuda_mt_benchmark(device_rgstatus.ptr, device_unif_s.ptr, nscen_per_batch);
        }
    }
    opcuda_thread_synchronize();
    time = sw.Peek();
    nevals = (double)nbatches * (double)nscen_per_batch;
    milion_evals_per_second = nevals / (1000000 * time);

    status = opcuda_shutdown();
    if (status != 0) throw new ExecutionEngineException("opcuda_shutdown failed, status " + status);

    if (benchmarks.gpu_mt_no_copy_performance_dev == null) benchmarks.gpu_mt_no_copy_performance_dev = new double[ndev];
    benchmarks.gpu_mt_no_copy_performance_dev[dev] = milion_evals_per_second;
    log.Add("mt performance (no copy): " + String.Format("{0:0.0}", milion_evals_per_second) + " milion eval/sec");
}
/// <summary>
/// CPU benchmark for a CSVModel named "DCSV1F" built on a 64 x 8 S2DGrid.
/// Times exe_mc_plan (reported as GF/sec from model.cpu_nflops) and
/// host_d_mc_run1f with an empty evaluator (reported as million evaluations
/// per second); both figures are stored in <c>benchmarks</c> and logged.
/// </summary>
public void run_benchmark_cpu_dcsv1f()
{
    log.Add("running dcsv1f on the cpu");

    int ni = 10;
    double S0 = 100;
    int nscen_per_batch = 4096 * 25;
    int nbatches = 100;
    int nx = 64;
    int nr = 8;
    int r0 = 4;
    TimeSpan dt0 = TimeSpan.FromDays(1);
    DateTime today = DateTime.Today;

    // 40 dates, one every 7 days starting a week from today.
    DateTime[] t_k = new DateTime[40];
    for (int week = 1; week <= t_k.Length; week++)
    {
        t_k[week - 1] = today.AddDays(7 * week);
    }

    // ni dates, one every 30 days starting 30 days from today.
    DateTime[] t_i = new DateTime[ni];
    for (int period = 1; period <= ni; period++)
    {
        t_i[period - 1] = today.AddDays(30 * period);
    }

    double[] xpivot_p = new double[] { 1, 70, 90, 110, 140, 200, 300 };
    double[] xgridspacing_p = new double[] { 5, 2.5, 1, 2.5, 5, 7.5, 10 };
    double[] rval_r = new double[] { 0.55, 0.75, 0.90, 1.0, 1.1, 1.3, 1.6, 2.0 };

    OPModel.Types.CDevice device = new OPModel.Types.CDevice(EFloatingPointPrecision.bit64, EFloatingPointUnit.host, 0);
    OPModel.Types.S2DGrid grid = new OPModel.Types.S2DGrid(device, nx, nr, today, t_i, dt0, S0, xpivot_p, xgridspacing_p, rval_r, r0);

    // Row order: vol, lowbeta, highbeta, volvol, volmrr, volmrl, jumpsz_minus, jumpsz_plus.
    double[] tau_row = new double[] { .25, 0.5, 0.5, 0.5, 0.5, 3, -3.0, 0.0 };
    double[,] taumatrix_ccol = new double[8, 4];
    // Only columns 0..2 are populated; column 3 keeps its default zeros.
    for (int col = 0; col < 3; col++)
    {
        for (int row = 0; row < tau_row.Length; row++)
        {
            taumatrix_ccol[row, col] = tau_row[row];
        }
    }

    // Flat 5% rate; each discount factor compounds the previous one over the
    // day count between consecutive t_i dates (the first starts at grid.today).
    double[] ir_i = new double[ni];
    double[] df_i = new double[ni];
    for (int i = 0; i < ni; i++)
    {
        ir_i[i] = 0.05;
        DateTime periodStart = (i == 0) ? grid.today : t_i[i - 1];
        double periodFactor = Math.Exp(-ir_i[i] * (t_i[i] - periodStart).Days / 365.25);
        df_i[i] = (i == 0) ? periodFactor : df_i[i - 1] * periodFactor;
    }

    CStopWatch timer = new CStopWatch();
    CSVModel model = new CSVModel(grid, "DCSV1F", df_i);
    model.mkgen(taumatrix_ccol, null);
    model.make_mc_plan(nscen_per_batch, nbatches, t_k);
    model.reset_flop_counter();

    // The first reading is taken and discarded, exactly as before; only
    // exe_mc_plan falls inside the timed region.
    double elapsed = timer.Peek();
    timer.Reset();
    model.exe_mc_plan();
    elapsed = timer.Peek();

    double flops = model.cpu_nflops;
    double gflopsPerSecond = flops / (1000000000d * elapsed);
    if (benchmarks == null)
    {
        benchmarks = new SBenchmarks();
    }
    benchmarks.cpu_dcsv1f_blas_performance = gflopsPerSecond;
    log.Add("blas performance: " + String.Format("{0:0.0}", gflopsPerSecond) + " GF/sec");

    CMCEvaluator evaluator = new emptyEvaluator();
    double[] payoff_a = new double[nscen_per_batch * model.mcplan.nth];
    model.host_d_mc_init();
    timer.Reset();
    unsafe
    {
        model.host_d_mc_run1f(payoff_a, evaluator);
    }
    elapsed = timer.Peek();

    double evalCount = (double)t_k.Length * (double)nbatches * (double)nscen_per_batch;
    double mevalsPerSecond = evalCount / (1000000 * elapsed);
    benchmarks.cpu_dcsv1f_mc_performance = mevalsPerSecond;
    log.Add("mc performance: " + String.Format("{0:0.0}", mevalsPerSecond) + " milion eval/sec");
}
/// <summary>
/// CPU Mersenne Twister benchmark: queues 2000 batches of 4096 * 25 scenarios,
/// runs host_d_mc_mt_func over them through a CJobQueue using one worker per
/// logical processor, and stores the rate (million evaluations per second)
/// in benchmarks.cpu_mt_performance.
/// </summary>
public void run_benchmark_cpu_mt()
{
    log.Add("running the Mersenne Twister benchmark on the CPU");

    int scenariosPerBatch = 4096 * 25;
    int batchCount = 2000;
    int workers = Environment.ProcessorCount;

    OPModel.Types.CRangen.mtinit(workers);

    // One output slice of scenariosPerBatch floats per worker.
    float[] uniforms = new float[scenariosPerBatch * workers];
    OPModel.Types.CJobQueue queue = new OPModel.Types.CJobQueue();

    // Every job receives the same buffer and sizes; only the batch index differs.
    object[] jobArgs = new object[batchCount];
    int batch = 0;
    while (batch < batchCount)
    {
        host_d_mc_mt_func_input job = new host_d_mc_mt_func_input();
        job.host_unif_scen_th = uniforms;
        job.nscen_per_batch = scenariosPerBatch;
        job.batch = batch;
        job.nth = workers;
        jobArgs[batch] = job;
        batch++;
    }

    CStopWatch timer = new CStopWatch();
    timer.Reset();
    queue.Exec(host_d_mc_mt_func, null, jobArgs, workers);
    double elapsed = timer.Peek();

    double evalCount = (double)batchCount * (double)scenariosPerBatch;
    double mevalsPerSecond = evalCount / (1000000 * elapsed);

    if (benchmarks == null)
    {
        benchmarks = new SBenchmarks();
    }
    benchmarks.cpu_mt_performance = mevalsPerSecond;
    log.Add("mt performance: " + String.Format("{0:0.0}", mevalsPerSecond) + " milion eval/sec");

    // Drop the local reference to the buffer and request a collection before returning.
    uniforms = null;
    System.GC.Collect();
}
/// <summary>
/// CPU benchmark for a CLVModel named "DCLV1F" built on a 128-point S1DGrid.
/// Times exe_mc_plan (reported as GF/sec from model.cpu_nflops) and
/// host_d_mc_run1f with an empty evaluator (reported as million evaluations
/// per second); both figures are stored in <c>benchmarks</c> and logged.
/// </summary>
public void run_benchmark_cpu_dclv1f()
{
    log.Add("running dclv1f on the cpu");

    int ni = 10;
    double S0 = 100;
    int nscen_per_batch = 4096 * 25;
    int nbatches = 120;
    int nx = 128;
    TimeSpan dt0 = TimeSpan.FromDays(1);
    DateTime today = DateTime.Today;

    // 40 dates, one every 7 days starting a week from today.
    DateTime[] t_k = new DateTime[40];
    for (int week = 1; week <= t_k.Length; week++)
    {
        t_k[week - 1] = today.AddDays(7 * week);
    }

    // ni dates, one every 30 days starting 30 days from today.
    DateTime[] t_i = new DateTime[ni];
    for (int period = 1; period <= ni; period++)
    {
        t_i[period - 1] = today.AddDays(30 * period);
    }

    double[] xpivot_p = new double[] { 1, 70, 90, 110, 140, 200, 300 };
    double[] xgridspacing_p = new double[] { 5, 2.5, 1, 2.5, 5, 7.5, 10 };

    OPModel.Types.CDevice device = new OPModel.Types.CDevice(EFloatingPointPrecision.bit64, EFloatingPointUnit.host, 0);
    OPModel.Types.S1DGrid grid = new OPModel.Types.S1DGrid(device, nx, today, t_i, dt0, S0, xpivot_p, xgridspacing_p);

    double Sigma0 = 0.25;
    double beta_i = 1;
    double[] ir_i = new double[ni];
    double[] df_i = new double[ni];
    double[][] SDrift_i_y = new double[ni][];
    double[][] SVol_i_y = new double[ni][];
    for (int i = 0; i < ni; i++)
    {
        // Flat 5% rate; discount factors compound over the day count between dates.
        ir_i[i] = 0.05;
        DateTime periodStart = (i == 0) ? grid.today : t_i[i - 1];
        double periodFactor = Math.Exp(-ir_i[i] * (t_i[i] - periodStart).Days / 365.25);
        df_i[i] = (i == 0) ? periodFactor : df_i[i - 1] * periodFactor;

        // Per-period drift and volatility rows, one entry per grid point.
        double[] drift_y = new double[grid.d];
        double[] vol_y = new double[grid.d];
        SDrift_i_y[i] = drift_y;
        SVol_i_y[i] = vol_y;
        for (int y = 0; y < grid.d; y++)
        {
            drift_y[y] = ir_i[i] * grid.host_d_xval(y);
            vol_y[y] = Sigma0 * grid.host_d_xval(grid.y0) * Math.Pow(grid.host_d_xval(y) / grid.host_d_xval(grid.y0), beta_i);
        }
    }

    CStopWatch timer = new CStopWatch();
    timer.Reset();
    CLVModel model = new CLVModel(grid, "DCLV1F");
    model.set_discount_curve(df_i);
    model.mkgen(SDrift_i_y, SVol_i_y);
    model.make_mc_plan(nscen_per_batch, nbatches, t_k);
    model.reset_flop_counter();

    // The setup reading is taken and discarded, exactly as before; only
    // exe_mc_plan falls inside the timed region.
    double elapsed = timer.Peek();
    timer.Reset();
    model.exe_mc_plan();
    elapsed = timer.Peek();

    double flops = model.cpu_nflops;
    double gflopsPerSecond = flops / (1000000000d * elapsed);
    if (benchmarks == null)
    {
        benchmarks = new SBenchmarks();
    }
    benchmarks.cpu_dclv1f_blas_performance = gflopsPerSecond;
    log.Add("blas performance: " + String.Format("{0:0.0}", gflopsPerSecond) + " GF/sec");

    CMCEvaluator evaluator = new emptyEvaluator();
    model.host_d_mc_init();
    double[] payoff_a = new double[nscen_per_batch * model.mcplan.nth];
    timer.Reset();
    unsafe
    {
        model.host_d_mc_run1f(payoff_a, evaluator);
    }
    elapsed = timer.Peek();

    double evalCount = (double)t_k.Length * (double)nbatches * (double)nscen_per_batch;
    double mevalsPerSecond = evalCount / (1000000 * elapsed);
    benchmarks.cpu_dclv1f_mc_performance = mevalsPerSecond;
    log.Add("mc performance: " + String.Format("{0:0.0}", mevalsPerSecond) + " milion eval/sec");

    // NOTE(review): this CPU-only benchmark never calls opcuda_cublas_init itself;
    // the shutdown call is preserved as-is -- confirm whether it is still needed.
    opcuda_shutdown();
}
/// <summary>
/// Runs the opcuda_benchmark_shared_peak kernel 100 times on four device buffers
/// of block_dim * nblocks floats each and stores the resulting rate (GF/sec) in
/// benchmarks.shared_peak_performance_dev[dev].
/// </summary>
/// <param name="dev">Device index handed to opcuda_set_device; also the slot written in the per-device result array (allocated with ndev entries).</param>
/// <exception cref="ExecutionEngineException">opcuda_cublas_init returned a non-zero status.</exception>
/// <exception cref="System.Exception">A host/device copy returned a non-zero status.</exception>
public void run_benchmark_shared_peak(uint dev)
{
    log.Add("running shared_peak, device " + dev);
    CStopWatch sw = new CStopWatch();
    sw.Reset();

    int status = opcuda_cublas_init();
    if (status != 0) throw new ExecutionEngineException("opcuda_cublas_init failed, status " + status);
    opcuda_set_device(dev);

    int nblocks = 128;
    int block_dim = 64;

    // Host arrays are larger (128 * nblocks) than the device buffers; only the
    // first block_dim * nblocks floats of each are ever transferred.
    float[] x1 = new float[128 * nblocks];
    float[] y1 = new float[128 * nblocks];
    float[] x2 = new float[128 * nblocks];
    float[] y2 = new float[128 * nblocks];

    // NOTE(review): only the first block_dim entries get random unit vectors; the
    // remaining entries stay zero. Confirm this is the intended kernel input.
    Random rg = new Random();
    for (int i = 0; i < block_dim; i++)
    {
        double alpha = (float)rg.Next(-1000, 1000) / (float)1000;
        x1[i] = (float)Math.Cos(alpha);
        y1[i] = (float)Math.Sin(alpha);
        double beta = (float)rg.Next(-1000, 1000) / (float)1000;
        x2[i] = (float)Math.Cos(beta);
        y2[i] = (float)Math.Sin(beta);
    }

    // Every device buffer and every transfer uses this one size.
    uint nbytes = (uint)(block_dim * nblocks * sizeof(float));
    uint x1ptr = opcuda_mem_alloc(nbytes);
    uint y1ptr = opcuda_mem_alloc(nbytes);
    uint x2ptr = opcuda_mem_alloc(nbytes);
    uint y2ptr = opcuda_mem_alloc(nbytes);

    double gigaflops_per_second;
    try
    {
        unsafe
        {
            fixed (float* x1p = &x1[0])
            {
                fixed (float* y1p = &y1[0])
                {
                    status = opcuda_memcpy_h2d(x1ptr, (IntPtr)x1p, nbytes);
                    if (status != 0) throw new System.Exception("opcuda_memcpy_h2d(x1) failed, status " + status);
                    status = opcuda_memcpy_h2d(y1ptr, (IntPtr)y1p, nbytes);
                    if (status != 0) throw new System.Exception("opcuda_memcpy_h2d(y1) failed, status " + status);
                }
            }
            fixed (float* x2p = &x2[0])
            {
                fixed (float* y2p = &y2[0])
                {
                    status = opcuda_memcpy_h2d(x2ptr, (IntPtr)x2p, nbytes);
                    if (status != 0) throw new System.Exception("opcuda_memcpy_h2d(x2) failed, status " + status);
                    status = opcuda_memcpy_h2d(y2ptr, (IntPtr)y2p, nbytes);
                    if (status != 0) throw new System.Exception("opcuda_memcpy_h2d(y2) failed, status " + status);
                }
            }
        }
        opcuda_thread_synchronize();

        int niter = 100;
        sw.Reset();
        double time0 = sw.Peek();
        for (int iter = 0; iter < niter; iter++)
        {
            opcuda_benchmark_shared_peak(x1ptr, y1ptr, x2ptr, y2ptr);
        }
        opcuda_thread_synchronize();
        double time1 = sw.Peek();

        unsafe
        {
            fixed (float* x1p = &x1[0])
            {
                fixed (float* y1p = &y1[0])
                {
                    // Fixed: these two copies used 128 * nblocks floats, twice the size
                    // of the device allocations they read from.
                    status = opcuda_memcpy_d2h(x1ptr, (IntPtr)x1p, nbytes);
                    if (status != 0) throw new System.Exception("opcuda_memcpy_d2h(x1) failed, status " + status);
                    status = opcuda_memcpy_d2h(y1ptr, (IntPtr)y1p, nbytes);
                    if (status != 0) throw new System.Exception("opcuda_memcpy_d2h(y1) failed, status " + status);
                }
            }
            fixed (float* x2p = &x2[0])
            {
                fixed (float* y2p = &y2[0])
                {
                    status = opcuda_memcpy_d2h(x2ptr, (IntPtr)x2p, nbytes);
                    if (status != 0) throw new System.Exception("opcuda_memcpy_d2h(x2) failed, status " + status);
                    status = opcuda_memcpy_d2h(y2ptr, (IntPtr)y2p, nbytes);
                    if (status != 0) throw new System.Exception("opcuda_memcpy_d2h(y2) failed, status " + status);
                }
            }
        }

        // Operation count as assumed by the original formula (4 * 8 * 300 per
        // element per iteration). NOTE(review): constants come from the kernel,
        // which is not visible here.
        double nflops = (double)niter * 4 * 8 * 300d * nblocks * (double)block_dim;
        gigaflops_per_second = nflops / (1000000000d * (time1 - time0));
    }
    finally
    {
        // Fixed: the four device buffers were never released.
        opcuda_mem_free_device(x1ptr);
        opcuda_mem_free_device(y1ptr);
        opcuda_mem_free_device(x2ptr);
        opcuda_mem_free_device(y2ptr);
    }

    if (benchmarks == null) benchmarks = new SBenchmarks();
    if (benchmarks.shared_peak_performance_dev == null) benchmarks.shared_peak_performance_dev = new double[ndev];
    benchmarks.shared_peak_performance_dev[dev] = gigaflops_per_second;

    // As in the original, the shutdown status is not treated as fatal here.
    opcuda_shutdown();
}
/// <summary>
/// GPU benchmark for a CLVModel named "SGLV1F" (32-bit precision, device FPU)
/// built on a 128-point S1DGrid. Times exe_mc_plan (GF/sec from model.gpu_nflops)
/// and device_mc_run1f with an empty evaluator (million evaluations per second);
/// both figures are stored per device in <c>benchmarks</c> and logged.
/// </summary>
/// <param name="dev">Device index used to build the CDevice; also the slot written in the per-device result arrays (allocated with ndev entries).</param>
public void run_benchmark_gpu_sglv1f(uint dev)
{
    log.Add("running sglv1f, device " + dev);

    int ni = 10;
    double S0 = 100;
    int nscen_per_batch = 4096 * 25;
    int nbatches = 20;
    int nx = 128;
    TimeSpan dt0 = TimeSpan.FromDays(1);
    DateTime today = DateTime.Today;

    // 40 dates, one every 7 days starting a week from today.
    DateTime[] t_k = new DateTime[40];
    for (int week = 1; week <= t_k.Length; week++)
    {
        t_k[week - 1] = today.AddDays(7 * week);
    }

    // ni dates, one every 30 days starting 30 days from today.
    DateTime[] t_i = new DateTime[ni];
    for (int period = 1; period <= ni; period++)
    {
        t_i[period - 1] = today.AddDays(30 * period);
    }

    double[] xpivot_p = new double[] { 1, 70, 90, 110, 140, 200, 300 };
    double[] xgridspacing_p = new double[] { 5, 2.5, 1, 2.5, 5, 7.5, 10 };

    OPModel.Types.CDevice device = new OPModel.Types.CDevice(EFloatingPointPrecision.bit32, EFloatingPointUnit.device, dev);
    OPModel.Types.S1DGrid grid = new OPModel.Types.S1DGrid(device, nx, today, t_i, dt0, S0, xpivot_p, xgridspacing_p);

    double Sigma0 = 0.25;
    double beta_i = 1;
    double[] ir_i = new double[ni];
    double[] df_i = new double[ni];
    double[][] SDrift_i_y = new double[ni][];
    double[][] SVol_i_y = new double[ni][];
    for (int i = 0; i < ni; i++)
    {
        // Flat 5% rate; discount factors compound over the day count between dates.
        ir_i[i] = 0.05;
        DateTime periodStart = (i == 0) ? grid.today : t_i[i - 1];
        double periodFactor = Math.Exp(-ir_i[i] * (t_i[i] - periodStart).Days / 365.25);
        df_i[i] = (i == 0) ? periodFactor : df_i[i - 1] * periodFactor;

        // Per-period drift and volatility rows, one entry per grid point.
        double[] drift_y = new double[grid.d];
        double[] vol_y = new double[grid.d];
        SDrift_i_y[i] = drift_y;
        SVol_i_y[i] = vol_y;
        for (int y = 0; y < grid.d; y++)
        {
            drift_y[y] = ir_i[i] * grid.host_d_xval(y);
            vol_y[y] = Sigma0 * grid.host_d_xval(grid.y0) * Math.Pow(grid.host_d_xval(y) / grid.host_d_xval(grid.y0), beta_i);
        }
    }

    CStopWatch timer = new CStopWatch();
    timer.Reset();
    CLVModel model = new CLVModel(grid, "SGLV1F");
    model.set_discount_curve(df_i);
    model.mkgen(SDrift_i_y, SVol_i_y);
    model.make_mc_plan(nscen_per_batch, nbatches, t_k);
    model.reset_flop_counter();

    // Timed region 1: plan execution, read after the device has synchronized.
    timer.Reset();
    model.exe_mc_plan();
    model.device_thread_synchronize();
    double elapsed = timer.Peek();

    double flops = model.gpu_nflops;
    double gflopsPerSecond = flops / (1000000000d * elapsed);
    if (benchmarks == null)
    {
        benchmarks = new SBenchmarks();
    }
    if (benchmarks.gpu_sglv1f_blas_performance_dev == null)
    {
        benchmarks.gpu_sglv1f_blas_performance_dev = new double[ndev];
    }
    benchmarks.gpu_sglv1f_blas_performance_dev[dev] = gflopsPerSecond;
    log.Add("blas performance: " + String.Format("{0:0.0}", gflopsPerSecond) + " GF/sec");

    CMCEvaluator evaluator = new emptyEvaluator();
    double[] pdf_y = new double[grid.d];
    if (model.device_mc_init() == 1)
    {
        // NOTE(review): this path returns without calling opcuda_shutdown -- confirm
        // whether the device should be shut down before leaving.
        MessageBox.Show("device_mc_init() failed", "OPBench", MessageBoxButtons.OK);
        return;
    }

    // Timed region 2: the Monte Carlo run.
    timer.Reset();
    unsafe
    {
        model.device_mc_run1f(pdf_y, evaluator);
    }
    model.device_thread_synchronize();
    elapsed = timer.Peek();

    double evalCount = (double)t_k.Length * (double)nbatches * (double)nscen_per_batch;
    double mevalsPerSecond = evalCount / (1000000 * elapsed);

    int status = opcuda_shutdown();
    if (status != 0)
    {
        throw new ExecutionEngineException();
    }

    if (benchmarks == null)
    {
        benchmarks = new SBenchmarks();
    }
    if (benchmarks.gpu_sglv1f_mc_performance_dev == null)
    {
        benchmarks.gpu_sglv1f_mc_performance_dev = new double[ndev];
    }
    benchmarks.gpu_sglv1f_mc_performance_dev[dev] = mevalsPerSecond;
    log.Add("mc performance: " + String.Format("{0:0.0}", mevalsPerSecond) + " milion eval/sec");

    // NOTE(review): second shutdown call, result ignored; preserved from the original.
    opcuda_shutdown();
}
/// <summary>
/// Benchmarks the batched opcuda_sgemv4 kernel on one GPU.
/// Builds 20 random 576 x 576 matrices and a block Z of 2 * nz columns, picks for
/// each matrix a random set of (input column, output column) pairs, packs
/// per-block argument records into a device int buffer, validates opcuda_sgemv3
/// (matrix 0 only) and opcuda_sgemv4 (all matrices) against opc_sgemv on the host,
/// then times 100 opcuda_sgemv4 launches. The rate (GF/sec) is stored in
/// benchmarks.gpu_sgemv4_performance_dev[dev].
/// </summary>
/// <param name="dev">Device index handed to opcuda_set_device; also the slot written in the per-device result array (allocated with ndev entries).</param>
/// <exception cref="ExecutionEngineException">opcuda_cublas_init or opcuda_shutdown returned a non-zero status.</exception>
/// <exception cref="System.Exception">A host/device copy failed, column selection was inconsistent, or the device result differs from the host reference beyond tolerance.</exception>
public void run_benchmark_gpu_sgemv4(uint dev)
{
    log.Add("running sgemv4, device " + dev);
    int ni = 20;
    Random rg = new Random();
    int d = 96 * 6;
    int nz = 1600;

    int[] ncol_i = new int[ni];
    int[][] col0_i_c = new int[ni][];
    int[][] col1_i_c = new int[ni][];
    int[] col0_c = new int[2 * nz];
    int[] col1_c = new int[2 * nz];
    bool[] todo_v = new bool[2 * ni * nz]; // only indices below 2 * nz are used

    // For every matrix draw up to 60 column pairs. Each column of Z is used at
    // most once overall: todo_v marks columns that were already handed out.
    for (int i = 0; i < ni; i++)
    {
        ncol_i[i] = 0;
        for (int iter = 0; iter < 60; iter++)
        {
            if (rg.Next(2) == 1)
            {
                int v0 = 0, v1 = 0;

                // Walk x + 1 free slots forward (wrapping) to pick the first column.
                int x = rg.Next(2 * nz);
                for (int k = 0; k <= x; k++)
                {
                    v0 += 1;
                    if (v0 >= 2 * nz) v0 -= 2 * nz;
                    while (todo_v[v0])
                    {
                        v0 += 1;
                        if (v0 >= 2 * nz) v0 -= 2 * nz;
                    }
                }
                if (todo_v[v0]) throw new System.Exception("column selection picked an already used column");
                todo_v[v0] = true;

                // Same walk for the second column of the pair.
                x = rg.Next(2 * nz);
                for (int k = 0; k <= x; k++)
                {
                    v1 += 1;
                    if (v1 >= 2 * nz) v1 -= 2 * nz;
                    while (todo_v[v1])
                    {
                        v1 += 1;
                        if (v1 >= 2 * nz) v1 -= 2 * nz;
                    }
                }
                if (todo_v[v1]) throw new System.Exception("column selection picked an already used column");
                todo_v[v1] = true;

                // Columns are stored as float offsets into Z.
                col0_c[ncol_i[i]] = v0 * d;
                col1_c[ncol_i[i]] = v1 * d;
                ncol_i[i] += 1;
            }
        }

        // Column lists are padded to a multiple of 16 entries with the value 2 * nz * d.
        col0_i_c[i] = new int[16 * ((ncol_i[i] + 15) / 16)];
        col1_i_c[i] = new int[16 * ((ncol_i[i] + 15) / 16)];
        int c;
        for (c = 0; c < ncol_i[i]; c++)
        {
            col0_i_c[i][c] = col0_c[c];
            col1_i_c[i][c] = col1_c[c];
        }
        for (; c < col0_i_c[i].Length; c++)
        {
            col0_i_c[i][c] = 2 * nz * d;
            col1_i_c[i][c] = 2 * nz * d;
        }
    }

    // Float buffer layout: Z (2 * nz columns of d floats), then the ni matrices, plus one spare float.
    int fsz = ni * d * d + 2 * d * nz + 1;
    float[] host_float_buf = new float[fsz];
    for (int a = 0; a < host_float_buf.Length; a++)
        host_float_buf[a] = (float)(0.01 * rg.Next(-1000, 1000));

    int status = opcuda_cublas_init();
    if (status != 0) throw new ExecutionEngineException("opcuda_cublas_init failed, status " + status);
    opcuda_set_device(dev);

    uint device_float_buf_ptr = opcuda_mem_alloc((uint)(host_float_buf.Length * sizeof(float)));
    uint device_int_buf_ptr = 0;
    bool int_buf_allocated = false;
    double gigaflops_per_second;
    try
    {
        unsafe
        {
            fixed (float* bufp = &host_float_buf[0])
            {
                status = opcuda_memcpy_h2d(device_float_buf_ptr, (IntPtr)bufp, (uint)(fsz * sizeof(float)));
                if (status != 0) throw new System.Exception("opcuda_memcpy_h2d failed, status " + status);
            }
        }

        // One block per 64 rows x 16 column pairs of every matrix.
        int nblocks = 0;
        for (int i = 0; i < ni; i++)
        {
            int blockDim_x, blockDim_y;
            blockDim_x = (d + 63) / 64;
            blockDim_y = (ncol_i[i] + 15) / 16;
            nblocks += blockDim_x * blockDim_y;
        }

        int bid = 0;
        int[] i_bid = new int[nblocks];
        int[] host_blockIdx_x_bid = new int[nblocks];
        int[] host_blockIdx_y_bid = new int[nblocks];
        for (int i = 0; i < ni; i++)
        {
            int blockDim_x, blockDim_y;
            blockDim_x = (d + 63) / 64;
            blockDim_y = (ncol_i[i] + 15) / 16;
            for (int bx = 0; bx < blockDim_x; bx++)
            {
                for (int by = 0; by < blockDim_y; by++)
                {
                    i_bid[bid] = i;
                    host_blockIdx_x_bid[bid] = bx;
                    host_blockIdx_y_bid[bid] = by;
                    bid += 1;
                }
            }
        }

        // Device addresses of Z and of every matrix inside the float buffer.
        long ptr = (long)device_float_buf_ptr;
        uint[] A_i = new uint[ni];
        uint Z;
        Z = (uint)ptr;
        ptr += 2 * nz * d * sizeof(float);
        for (int i = 0; i < ni; i++)
        {
            A_i[i] = (uint)ptr;
            ptr += d * d * sizeof(float);
        }

        // Int buffer layout: nblocks record pointers, nblocks records of nargs
        // entries, then the padded column lists (col0 then col1) of every matrix.
        int nargs = 10;
        int isz = (nargs + 1) * nblocks;
        for (int i = 0; i < ni; i++)
        {
            isz += 16 * 2 * ((ncol_i[i] + 15) / 16);
        }
        device_int_buf_ptr = opcuda_mem_alloc((uint)(isz * sizeof(int)));
        int_buf_allocated = true;

        uint[] col0_v_ptr_i = new uint[ni];
        uint[] col1_v_ptr_i = new uint[ni];
        long colptr = device_int_buf_ptr + (nargs + 1) * nblocks * sizeof(uint);
        for (int i = 0; i < ni; i++)
        {
            col0_v_ptr_i[i] = (uint)colptr;
            colptr += 16 * ((ncol_i[i] + 15) / 16) * sizeof(uint);
            col1_v_ptr_i[i] = (uint)colptr;
            colptr += 16 * ((ncol_i[i] + 15) / 16) * sizeof(uint);
        }

        uint[] host_int_buf = new uint[isz];
        for (bid = 0; bid < nblocks; bid++)
        {
            int offset = nblocks + nargs * bid;
            host_int_buf[bid] = (uint)(device_int_buf_ptr + offset * sizeof(uint)); // device address of this block's record
            int i = i_bid[bid];
            host_int_buf[offset + 0] = (uint)i;                        // matrix index
            host_int_buf[offset + 1] = (uint)host_blockIdx_x_bid[bid]; // block index x
            host_int_buf[offset + 2] = (uint)host_blockIdx_y_bid[bid]; // block index y
            host_int_buf[offset + 3] = (uint)d;                        // m
            host_int_buf[offset + 4] = (uint)nz;                       // n
            host_int_buf[offset + 5] = (uint)ncol_i[i];                // number of column pairs
            host_int_buf[offset + 6] = A_i[i];                         // device address of the matrix
            host_int_buf[offset + 7] = Z;                              // device address of Z
            host_int_buf[offset + 8] = col0_v_ptr_i[i];                // device address of the col0 list
            host_int_buf[offset + 9] = col1_v_ptr_i[i];                // device address of the col1 list
        }

        int coffset = nblocks + nargs * nblocks;
        for (int i = 0; i < ni; i++)
        {
            int c = 0;
            int bufsz = 16 * ((ncol_i[i] + 15) / 16);
            for (c = 0; c < ncol_i[i]; c++)
            {
                host_int_buf[coffset + c] = (uint)col0_i_c[i][c];
            }
            for (; c < bufsz; c++)
            {
                host_int_buf[coffset + c] = (uint)(2 * nz * d);
            }
            coffset += bufsz;
            for (c = 0; c < ncol_i[i]; c++)
            {
                host_int_buf[coffset + c] = (uint)col1_i_c[i][c];
            }
            for (; c < bufsz; c++)
            {
                host_int_buf[coffset + c] = (uint)(2 * nz * d);
            }
            coffset += bufsz;
        }

        float[] host_float_buf2 = new float[fsz];
        uint[] host_int_buf2 = new uint[isz];

        // Upload both buffers and read them straight back (transfer round trip).
        unsafe
        {
            fixed (float* bufp = &host_float_buf[0])
            {
                status = opcuda_memcpy_h2d(device_float_buf_ptr, (IntPtr)bufp, (uint)(fsz * sizeof(float)));
                if (status != 0) throw new System.Exception("opcuda_memcpy_h2d (float buffer) failed, status " + status);
            }
            fixed (uint* bufp = &host_int_buf[0])
            {
                status = opcuda_memcpy_h2d(device_int_buf_ptr, (IntPtr)bufp, (uint)(isz * sizeof(uint)));
                if (status != 0) throw new System.Exception("opcuda_memcpy_h2d (int buffer) failed, status " + status);
            }
            fixed (float* bufp = &host_float_buf2[0])
            {
                status = opcuda_memcpy_d2h(device_float_buf_ptr, (IntPtr)bufp, (uint)(fsz * sizeof(float)));
                if (status != 0) throw new System.Exception("opcuda_memcpy_d2h (float buffer) failed, status " + status);
            }
            fixed (uint* bufp = &host_int_buf2[0])
            {
                status = opcuda_memcpy_d2h(device_int_buf_ptr, (IntPtr)bufp, (uint)(isz * sizeof(uint)));
                if (status != 0) throw new System.Exception("opcuda_memcpy_d2h (int buffer) failed, status " + status);
            }
        }

        // Validation 1: single-matrix kernel on matrix 0.
        opcuda_sgemv3(d, nz, ncol_i[0], A_i[0], Z, col0_v_ptr_i[0], col1_v_ptr_i[0]);
        unsafe
        {
            fixed (float* buf2p = &host_float_buf2[0])
            {
                status = opcuda_memcpy_d2h(device_float_buf_ptr, (IntPtr)buf2p, (uint)(fsz * sizeof(float)));
                if (status != 0) throw new System.Exception("opcuda_memcpy_d2h after sgemv3 failed, status " + status);
            }
        }
        verify_sgemv4_benchmark_result(host_float_buf, host_float_buf2, d, nz, 1, ncol_i, col0_i_c, col1_i_c, todo_v);

        // Validation 2: batched kernel on all matrices. The host buffer (which now
        // holds the host reference result for matrix 0) is uploaded again first.
        unsafe
        {
            fixed (float* bufp = &host_float_buf[0])
            {
                status = opcuda_memcpy_h2d(device_float_buf_ptr, (IntPtr)bufp, (uint)(fsz * sizeof(float)));
                if (status != 0) throw new System.Exception("opcuda_memcpy_h2d (float buffer) failed, status " + status);
            }
            fixed (uint* bufp = &host_int_buf[0])
            {
                status = opcuda_memcpy_h2d(device_int_buf_ptr, (IntPtr)bufp, (uint)(isz * sizeof(uint)));
                if (status != 0) throw new System.Exception("opcuda_memcpy_h2d (int buffer) failed, status " + status);
            }
        }
        opcuda_sgemv4(nblocks, device_int_buf_ptr);
        unsafe
        {
            fixed (float* buf2p = &host_float_buf2[0])
            {
                status = opcuda_memcpy_d2h(device_float_buf_ptr, (IntPtr)buf2p, (uint)(fsz * sizeof(float)));
                if (status != 0) throw new System.Exception("opcuda_memcpy_d2h after sgemv4 failed, status " + status);
            }
        }
        verify_sgemv4_benchmark_result(host_float_buf, host_float_buf2, d, nz, ni, ncol_i, col0_i_c, col1_i_c, todo_v);

        // Timed region.
        CStopWatch sw = new CStopWatch();
        sw.Reset();
        int niter = 100;
        for (int iter = 0; iter < niter; iter++)
        {
            opcuda_sgemv4(nblocks, device_int_buf_ptr);
        }
        opcuda_thread_synchronize();
        double time1 = sw.Peek();

        // 2 * d * d operations per column pair; accumulated in double so the count
        // cannot overflow int if the sizes are ever increased.
        double nflops = 0;
        for (int i = 0; i < ni; i++)
        {
            nflops += 2d * d * d * ncol_i[i];
        }
        nflops *= niter;
        gigaflops_per_second = nflops / (1000000000d * time1);
    }
    finally
    {
        // Fixed: neither device buffer was ever released by this benchmark.
        if (int_buf_allocated) opcuda_mem_free_device(device_int_buf_ptr);
        opcuda_mem_free_device(device_float_buf_ptr);
    }

    // Single checked shutdown; the former second, unchecked call was redundant.
    status = opcuda_shutdown();
    if (status != 0) throw new ExecutionEngineException("opcuda_shutdown failed, status " + status);

    if (benchmarks == null) benchmarks = new SBenchmarks();
    if (benchmarks.gpu_sgemv4_performance_dev == null) benchmarks.gpu_sgemv4_performance_dev = new double[ndev];
    benchmarks.gpu_sgemv4_performance_dev[dev] = gigaflops_per_second;
    log.Add("performance: " + String.Format("{0:0.0}", gigaflops_per_second) + " GF/sec");
}

/// <summary>
/// Compares a float buffer read back from the device with the host copy after
/// applying the host reference (opc_sgemv) for the first <paramref name="nmat"/> matrices.
/// The host buffer is modified in place by the reference computation.
/// </summary>
/// <param name="host_float_buf">Host buffer: Z first, then the matrices; receives the host reference result.</param>
/// <param name="host_float_buf2">Same-layout buffer read back from the device.</param>
/// <param name="d">Matrix dimension (rows per column of Z).</param>
/// <param name="nz">Half the number of columns of Z.</param>
/// <param name="nmat">Number of leading matrices the device kernel was applied to.</param>
/// <param name="ncol_i">Number of column pairs per matrix.</param>
/// <param name="col0_i_c">Per matrix, float offsets into Z passed as the first vector to opc_sgemv.</param>
/// <param name="col1_i_c">Per matrix, float offsets into Z passed as the second vector to opc_sgemv.</param>
/// <param name="todo_v">Flags of the Z columns that take part in any column pair.</param>
/// <exception cref="System.Exception">A comparison exceeds its tolerance.</exception>
private unsafe void verify_sgemv4_benchmark_result(float[] host_float_buf, float[] host_float_buf2, int d, int nz, int nmat, int[] ncol_i, int[][] col0_i_c, int[][] col1_i_c, bool[] todo_v)
{
    fixed (float* buf2p = &host_float_buf2[0])
    {
        fixed (float* bufp = &host_float_buf[0])
        {
            int ptr2 = 0;
            float errore, maxerror1, maxerror2;
            maxerror1 = 0;
            float* Zp = bufp;
            float* Z2p = buf2p;
            ptr2 += 2 * d * nz;

            // Columns of Z that belong to no pair must come back unchanged.
            maxerror2 = 0.0000001f;
            for (int j = 0; j < d; j++)
            {
                for (int v = 0; v < 2 * nz; v++)
                {
                    if (!todo_v[v])
                    {
                        errore = Math.Abs(Zp[j + d * v] - Z2p[j + d * v]);
                        if (errore > maxerror2) maxerror2 = errore;
                    }
                }
            }
            if (maxerror2 > Math.Pow(10, -6)) throw new System.Exception("unused columns of Z were modified on the device");

            for (int i = 0; i < nmat; i++)
            {
                float* Ap = bufp + ptr2;
                float* A2p = buf2p + ptr2;
                ptr2 += d * d;

                // The matrix itself must come back unchanged.
                maxerror2 = 0;
                for (int j = 0; j < d; j++)
                {
                    for (int k = 0; k < d; k++)
                    {
                        errore = Math.Abs(Ap[j + d * k] - A2p[j + d * k]);
                        if (errore > maxerror2) maxerror2 = errore;
                    }
                }
                if (maxerror2 > Math.Pow(10, -6)) throw new System.Exception("matrix " + i + " was modified on the device");

                // Host reference for every column pair of this matrix.
                for (int c = 0; c < ncol_i[i]; c++)
                {
                    opc_sgemv(d, d, 1, Ap, d, Zp + col0_i_c[i][c], 1, 0, Zp + col1_i_c[i][c], 1);
                }
            }

            // Relative comparison of host and device results.
            // NOTE(review): only the first nz of the 2 * nz columns are compared,
            // as in the original -- confirm that this is intended.
            for (int j = 0; j < d; j++)
            {
                for (int k = 0; k < nz; k++)
                {
                    errore = Math.Abs(Zp[j + d * k] - Z2p[j + d * k]) / (1 + Math.Abs(Zp[j + d * k]));
                    if (errore > maxerror1)
                    {
                        maxerror1 = errore;
                    }
                }
            }
            if (maxerror1 > Math.Pow(10, -3)) throw new System.Exception("device sgemv result differs from the host reference, max relative error " + maxerror1);
        }
    }
}
/// <summary>
/// Benchmarks the batched opcuda_sgemm4 kernel on one GPU.
/// Builds 40 random problems (A, B, C all 576 x 576, packed back to back in one
/// float buffer), packs one argument record per 64 x 16 block into a device int
/// buffer, validates one opcuda_sgemm4 launch against opc_sgemm on the host, then
/// times 10 launches. The rate (GF/sec) is stored in
/// benchmarks.gpu_sgemm4_performance_dev[dev].
/// </summary>
/// <param name="dev">Device index handed to opcuda_set_device; also the slot written in the per-device result array (allocated with ndev entries).</param>
/// <exception cref="ExecutionEngineException">opcuda_cublas_init or opcuda_shutdown returned a non-zero status.</exception>
/// <exception cref="System.Exception">A host/device copy failed or the device result differs from the host reference beyond tolerance.</exception>
public void run_benchmark_gpu_sgemm4(uint dev)
{
    log.Add("running sgemm4, device " + dev);
    int ni = 40;
    Random rg = new Random();
    int[] m_i = new int[ni];
    int[] n_i = new int[ni];
    int[] k_i = new int[ni];
    int d = 96 * 6;

    // All problems currently use the same square size.
    int fsz = 0;
    for (int i = 0; i < ni; i++)
    {
        m_i[i] = d;
        n_i[i] = d;
        k_i[i] = d;
    }
    for (int i = 0; i < ni; i++)
    {
        fsz += m_i[i] * k_i[i] + k_i[i] * n_i[i] + m_i[i] * n_i[i];
    }

    float[] host_float_buf = new float[fsz];
    for (int a = 0; a < host_float_buf.Length; a++)
        host_float_buf[a] = (float)(0.01 * rg.Next(-1000, 1000));

    // Fixed: the init status is now checked before the device is selected, as in
    // the other GPU benchmarks.
    int status = opcuda_cublas_init();
    if (status != 0) throw new ExecutionEngineException("opcuda_cublas_init failed, status " + status);
    opcuda_set_device(dev);

    uint device_float_buf_ptr = opcuda_mem_alloc((uint)(host_float_buf.Length * sizeof(float)));
    uint device_int_buf_ptr = 0;
    bool int_buf_allocated = false;
    double gigaflops_per_second;
    try
    {
        unsafe
        {
            fixed (float* bufp = &host_float_buf[0])
            {
                status = opcuda_memcpy_h2d(device_float_buf_ptr, (IntPtr)bufp, (uint)(fsz * sizeof(float)));
                if (status != 0) throw new System.Exception("opcuda_memcpy_h2d failed, status " + status);
            }
        }

        // One block per 64 x 16 tile of every result matrix.
        int nblocks = 0;
        for (int i = 0; i < ni; i++)
        {
            int blockDim_x, blockDim_y;
            blockDim_x = (m_i[i] + 63) / 64;
            blockDim_y = (n_i[i] + 15) / 16;
            nblocks += blockDim_x * blockDim_y;
        }

        int bid = 0;
        int[] i_bid = new int[nblocks];
        int[] host_blockIdx_x_bid = new int[nblocks];
        int[] host_blockIdx_y_bid = new int[nblocks];
        for (int i = 0; i < ni; i++)
        {
            int blockDim_x, blockDim_y;
            blockDim_x = (m_i[i] + 63) / 64;
            blockDim_y = (n_i[i] + 15) / 16;
            for (int bx = 0; bx < blockDim_x; bx++)
            {
                for (int by = 0; by < blockDim_y; by++)
                {
                    i_bid[bid] = i;
                    host_blockIdx_x_bid[bid] = bx;
                    host_blockIdx_y_bid[bid] = by;
                    bid += 1;
                }
            }
        }

        // Device addresses of A, B and C of every problem inside the float buffer.
        long ptr = (long)device_float_buf_ptr;
        uint[] A_i = new uint[ni];
        uint[] B_i = new uint[ni];
        uint[] C_i = new uint[ni];
        for (int i = 0; i < ni; i++)
        {
            A_i[i] = (uint)ptr;
            ptr += m_i[i] * k_i[i] * sizeof(float);
            B_i[i] = (uint)ptr;
            ptr += k_i[i] * n_i[i] * sizeof(float);
            C_i[i] = (uint)ptr;
            ptr += m_i[i] * n_i[i] * sizeof(float);
        }

        // Int buffer layout: nblocks record pointers followed by nblocks records
        // of nargs entries each (the size was previously written as a literal 1 + 9).
        int nargs = 9;
        int isz = nblocks * (1 + nargs);
        device_int_buf_ptr = opcuda_mem_alloc((uint)(isz * sizeof(int)));
        int_buf_allocated = true;

        uint[] host_int_buf = new uint[isz];
        for (bid = 0; bid < nblocks; bid++)
        {
            int offset = nblocks + nargs * bid;
            host_int_buf[bid] = (uint)(device_int_buf_ptr + offset * sizeof(uint)); // device address of this block's record
            int i = i_bid[bid];
            host_int_buf[offset + 0] = (uint)i;                        // problem index
            host_int_buf[offset + 1] = (uint)host_blockIdx_x_bid[bid]; // block index x
            host_int_buf[offset + 2] = (uint)host_blockIdx_y_bid[bid]; // block index y
            host_int_buf[offset + 3] = (uint)m_i[i];                   // m
            host_int_buf[offset + 4] = (uint)n_i[i];                   // n
            host_int_buf[offset + 5] = (uint)k_i[i];                   // k
            host_int_buf[offset + 6] = A_i[i];                         // device address of A
            host_int_buf[offset + 7] = B_i[i];                         // device address of B
            host_int_buf[offset + 8] = C_i[i];                         // device address of C
        }

        float[] host_float_buf2 = new float[fsz];
        unsafe
        {
            fixed (float* bufp = &host_float_buf[0])
            {
                status = opcuda_memcpy_h2d(device_float_buf_ptr, (IntPtr)bufp, (uint)(fsz * sizeof(float)));
                if (status != 0) throw new System.Exception("opcuda_memcpy_h2d (float buffer) failed, status " + status);
            }
            fixed (uint* bufp = &host_int_buf[0])
            {
                // Fixed: the byte count used sizeof(float) for a uint buffer (same
                // size, but the wrong element type).
                status = opcuda_memcpy_h2d(device_int_buf_ptr, (IntPtr)bufp, (uint)(isz * sizeof(uint)));
                if (status != 0) throw new System.Exception("opcuda_memcpy_h2d (int buffer) failed, status " + status);
            }
            fixed (float* bufp = &host_float_buf2[0])
            {
                status = opcuda_memcpy_d2h(device_float_buf_ptr, (IntPtr)bufp, (uint)(fsz * sizeof(float)));
                if (status != 0) throw new System.Exception("opcuda_memcpy_d2h (float buffer) failed, status " + status);
            }
        }

        // Validation: one launch, then compare against the host reference.
        opcuda_sgemm4(nblocks, device_int_buf_ptr);
        unsafe
        {
            fixed (float* buf2p = &host_float_buf2[0])
            {
                status = opcuda_memcpy_d2h(device_float_buf_ptr, (IntPtr)buf2p, (uint)(fsz * sizeof(float)));
                if (status != 0) throw new System.Exception("opcuda_memcpy_d2h after sgemm4 failed, status " + status);
                fixed (float* bufp = &host_float_buf[0])
                {
                    int ptr2 = 0;
                    float errore, maxerror1, maxerror2;
                    maxerror1 = 0;
                    for (int i = 0; i < ni; i++)
                    {
                        float* Ap = bufp + ptr2;
                        float* A2p = buf2p + ptr2;
                        ptr2 += m_i[i] * k_i[i];
                        float* Bp = bufp + ptr2;
                        float* B2p = buf2p + ptr2;
                        ptr2 += k_i[i] * n_i[i];
                        float* Cp = bufp + ptr2;
                        float* C2p = buf2p + ptr2;
                        ptr2 += m_i[i] * n_i[i];

                        // The inputs A and B must come back unchanged.
                        maxerror2 = 0;
                        for (int j = 0; j < m_i[i]; j++)
                        {
                            for (int k = 0; k < k_i[i]; k++)
                            {
                                errore = Math.Abs(Ap[j + m_i[i] * k] - A2p[j + m_i[i] * k]);
                                if (errore > maxerror2) maxerror2 = errore;
                            }
                        }
                        if (maxerror2 > Math.Pow(10, -6)) throw new System.Exception("matrix A of problem " + i + " was modified on the device");

                        maxerror2 = 0;
                        for (int j = 0; j < k_i[i]; j++)
                        {
                            for (int k = 0; k < n_i[i]; k++)
                            {
                                errore = Math.Abs(Bp[j + k_i[i] * k] - B2p[j + k_i[i] * k]);
                                if (errore > maxerror2) maxerror2 = errore;
                            }
                        }
                        if (maxerror2 > Math.Pow(10, -6)) throw new System.Exception("matrix B of problem " + i + " was modified on the device");

                        // Host reference, written into the host copy of C.
                        opc_sgemm(Ap, Bp, Cp, m_i[i], k_i[i], m_i[i], m_i[i], n_i[i], k_i[i]);

                        for (int j = 0; j < m_i[i]; j++)
                        {
                            for (int k = 0; k < n_i[i]; k++)
                            {
                                errore = Math.Abs(Cp[j + m_i[i] * k] - C2p[j + m_i[i] * k]) / (1 + Math.Abs(Cp[j + m_i[i] * k]));
                                if (errore > maxerror1)
                                {
                                    maxerror1 = errore;
                                }
                            }
                        }
                    }
                    if (maxerror1 > 2 * Math.Pow(10, -3)) throw new System.Exception("device sgemm result differs from the host reference, max relative error " + maxerror1);
                }
            }
        }

        // Timed region.
        CStopWatch sw = new CStopWatch();
        sw.Reset();
        int niter = 10;
        for (int iter = 0; iter < niter; iter++)
        {
            opcuda_sgemm4(nblocks, device_int_buf_ptr);
        }
        opcuda_thread_synchronize();
        double time1 = sw.Peek();

        // 2 * m * n * k operations per problem; accumulated in double so the count
        // cannot overflow int if the sizes are ever increased.
        double nflops = 0;
        for (int i = 0; i < ni; i++)
        {
            nflops += 2d * k_i[i] * m_i[i] * n_i[i];
        }
        nflops *= niter;
        gigaflops_per_second = nflops / (1000000000d * time1);
    }
    finally
    {
        // Buffers are now released on failure paths too (previously only on success).
        if (int_buf_allocated) opcuda_mem_free_device(device_int_buf_ptr);
        opcuda_mem_free_device(device_float_buf_ptr);
    }

    // Single checked shutdown; the former second, unchecked call was redundant.
    status = opcuda_shutdown();
    if (status != 0) throw new ExecutionEngineException("opcuda_shutdown failed, status " + status);

    if (benchmarks == null) benchmarks = new SBenchmarks();
    if (benchmarks.gpu_sgemm4_performance_dev == null) benchmarks.gpu_sgemm4_performance_dev = new double[ndev];
    benchmarks.gpu_sgemm4_performance_dev[dev] = gigaflops_per_second;
    log.Add("performance: " + String.Format("{0:0.0}", gigaflops_per_second) + " GF/sec");
}