Exemplo n.º 1
0
        /// <summary>
        /// Times 100 back-to-back <c>opcuda_sgemm</c> calls on square 576x576 operands
        /// stored in one device buffer and records the achieved rate (2*d^3 flops per call)
        /// in <c>benchmarks.gpu_sgemm_performance_dev[dev]</c>.
        /// </summary>
        /// <param name="dev">Device index passed to <c>opcuda_set_device</c>; also the slot
        /// written in the per-device result array.</param>
        /// <exception cref="ExecutionEngineException">
        /// <c>opcuda_cublas_init</c> or <c>opcuda_shutdown</c> returned a non-zero status.
        /// </exception>
        /// <exception cref="System.Exception">The host-to-device copy returned a non-zero status.</exception>
        public void run_benchmark_gpu_sgemm(uint dev)
        {
            log.Add("running sgemm, device " + dev);

            Random rg = new Random();
            int d = 96 * 6;
            int fsz = 3 * d * d;            // three d x d matrices packed back to back
            float[] host_float_buf = new float[fsz];

            // Random operands in [-10, 10) with a 0.01 step.
            for (int a = 0; a < host_float_buf.Length; a++)
                host_float_buf[a] = (float)(0.01 * rg.Next(-1000, 1000));

            int status = opcuda_cublas_init();
            if (status != 0) throw new ExecutionEngineException();

            int niter = 100;
            double time1;

            try
            {
                opcuda_set_device(dev);

                uint device_float_buf_ptr = opcuda_mem_alloc((uint)(host_float_buf.Length * sizeof(float)));
                try
                {
                    // The three operands are consecutive slices of the single device allocation.
                    uint aptr = device_float_buf_ptr;
                    uint bptr = (uint)(device_float_buf_ptr + d * d * sizeof(float));
                    uint cptr = (uint)(device_float_buf_ptr + 2 * d * d * sizeof(float));

                    unsafe
                    {
                        fixed (float* bufp = &host_float_buf[0])
                        {
                            status = opcuda_memcpy_h2d(device_float_buf_ptr, (IntPtr)bufp, (uint)(fsz * sizeof(float)));
                            if (status != 0) throw new System.Exception();
                        }
                    }

                    CStopWatch sw = new CStopWatch();
                    sw.Reset();

                    for (int iter = 0; iter < niter; iter++)
                    {
                        opcuda_sgemm(d, d, d, 1, aptr, d, bptr, d, 0, cptr, d);
                    }

                    // Read the clock only after the synchronize call so queued work is included.
                    opcuda_thread_synchronize();
                    time1 = sw.Peek();
                }
                finally
                {
                    // Release the device buffer even when the upload failed.
                    opcuda_mem_free_device(device_float_buf_ptr);
                }
            }
            finally
            {
                // Exactly one shutdown per successful init, on every exit path.
                status = opcuda_shutdown();
            }
            if (status != 0) throw new ExecutionEngineException();

            double nflops = 2d * niter * (double)d * (double)d * (double)d;
            double gigaflops_per_second = nflops / (1000000000d * time1);

            if (benchmarks == null) benchmarks = new SBenchmarks();
            if (benchmarks.gpu_sgemm_performance_dev == null)
                benchmarks.gpu_sgemm_performance_dev = new double[ndev];
            benchmarks.gpu_sgemm_performance_dev[dev] = gigaflops_per_second;
            log.Add("performance: " + String.Format("{0:0.0}", gigaflops_per_second) + " GF/sec");
        }
Exemplo n.º 2
0
        /// <summary>
        /// Times 100 calls to <c>opc_sgemm</c> on three 576x576 slices of one pinned host
        /// array and stores the rate (2*d^3 flops per call, in GF/sec) in
        /// <c>benchmarks.cpu_sgemm_performance</c>.
        /// </summary>
        public void run_benchmark_cpu_sgemm()
        {
            log.Add("running sgemm on the cpu");

            const int dim = 96 * 6;
            const int matrixLen = dim * dim;
            const int repetitions = 100;

            Random rng = new Random();

            // One array holds the three matrices consecutively.
            float[] matrices = new float[3 * matrixLen];
            int idx = 0;
            while (idx < matrices.Length)
            {
                matrices[idx] = (float)(0.01 * rng.Next(-1000, 1000));
                idx++;
            }

            CStopWatch timer = new CStopWatch();
            timer.Reset();

            unsafe
            {
                // Pin the array once and take the three slice pointers together.
                fixed (float* ap = &matrices[0], bp = &matrices[matrixLen], cp = &matrices[2 * matrixLen])
                {
                    int done = 0;
                    while (done < repetitions)
                    {
                        opc_sgemm(ap, bp, cp, dim, dim, dim, dim, dim, dim);
                        done++;
                    }
                }
            }

            double elapsed = timer.Peek();

            double flopCount = 2d * repetitions * (double)dim * (double)dim * (double)dim;
            double gflopsPerSec = flopCount / (1000000000d * elapsed);

            if (benchmarks == null)
            {
                benchmarks = new SBenchmarks();
            }
            benchmarks.cpu_sgemm_performance = gflopsPerSec;
        }
Exemplo n.º 3
0
        /// <summary>
        /// Benchmarks <c>opcuda_mt_benchmark</c> on one device in two timed phases:
        /// first with a device-to-host copy of the output after every batch, then
        /// without any copy. Results (million evaluations per second) are stored in
        /// <c>benchmarks.gpu_mt_with_copy_performance_dev[dev]</c> and
        /// <c>benchmarks.gpu_mt_no_copy_performance_dev[dev]</c>.
        /// </summary>
        /// <param name="dev">Device index passed to <c>opcuda_set_device</c>; also the slot
        /// written in the per-device result arrays.</param>
        /// <exception cref="ExecutionEngineException">
        /// <c>opcuda_cublas_init</c> or <c>opcuda_shutdown</c> returned a non-zero status.
        /// </exception>
        /// <exception cref="System.Exception">A device-to-host copy returned a non-zero status.</exception>
        public void run_benchmark_gpu_mt(uint dev)
        {
            // NOTE(review): the message says "sglv1f" although this is the MT benchmark;
            // the text is left unchanged in case log output is consumed elsewhere.
            log.Add("running sglv1f, device " + dev);

            int nscen_per_batch = 4096 * 25;
            int nbatches = 20;

            int status = opcuda_cublas_init();
            if (status != 0) throw new ExecutionEngineException();
            opcuda_set_device(dev);

            opcuda_mc_load_mt_gpu();
            Random rand = new Random();
            int nrng = opcuda_mc_nrng();

            // One random non-negative seed per generator, written into host memory.
            CArray host_seed_rg = new CArray(nrng, EType.int_t, EMemorySpace.host, null, "host_seed_rg");
            unsafe
            {
                int* seed_rg = (int*)host_seed_rg.hptr;
                for (int rg = 0; rg < nrng; rg++)
                {
                    seed_rg[rg] = (int)(rand.NextDouble() * int.MaxValue);
                }
            }

            CArray device_rgstatus = new CArray(opcuda_mc_status_sz(), EType.int_t, EMemorySpace.device, null, "mcbuf._device_rgstatus");
            unsafe
            {
                opcuda_mc_setseed(host_seed_rg.hptr, device_rgstatus.ptr);
            }

            CArray device_unif_s = new CArray(nscen_per_batch, EType.float_t, EMemorySpace.device, null, "device_unif_s");
            CArray host_unif_s = new CArray(nscen_per_batch, EType.float_t, EMemorySpace.host, null, "host_unif_s");

            // ---- Phase 1: generation plus copy back to the host after every batch ----
            CStopWatch sw = new CStopWatch();
            sw.Reset();
            unsafe
            {
                for (int b = 0; b < nbatches; b++)
                {
                    opcuda_mt_benchmark(device_rgstatus.ptr, device_unif_s.ptr, nscen_per_batch);

                    // The arrays hold float_t elements, so the byte count must use sizeof(float);
                    // sizeof(short) transferred only half of each batch.
                    status = opcuda_memcpy_d2h(device_unif_s.ptr, host_unif_s.hptr, (uint)(sizeof(float) * host_unif_s.length));
                    if (status != 0) throw new System.Exception();
                }
            }
            opcuda_thread_synchronize();
            double time = sw.Peek();

            double nevals = (double)nbatches * (double)nscen_per_batch;
            double milion_evals_per_second = nevals / (1000000 * time);

            // No shutdown here: the second phase below still uses the device arrays.
            if (benchmarks == null) benchmarks = new SBenchmarks();
            if (benchmarks.gpu_mt_with_copy_performance_dev == null)
                benchmarks.gpu_mt_with_copy_performance_dev = new double[ndev];
            benchmarks.gpu_mt_with_copy_performance_dev[dev] = milion_evals_per_second;
            log.Add("mc performance: " + String.Format("{0:0.0}", milion_evals_per_second) + " milion eval/sec");

            // ---- Phase 2: generation only, output stays on the device ----
            sw.Reset();
            unsafe
            {
                for (int b = 0; b < nbatches; b++)
                {
                    opcuda_mt_benchmark(device_rgstatus.ptr, device_unif_s.ptr, nscen_per_batch);
                }
            }
            opcuda_thread_synchronize();
            time = sw.Peek();

            nevals = (double)nbatches * (double)nscen_per_batch;
            milion_evals_per_second = nevals / (1000000 * time);

            // Single shutdown, after all device work has finished.
            status = opcuda_shutdown();
            if (status != 0) throw new ExecutionEngineException();

            if (benchmarks.gpu_mt_no_copy_performance_dev == null)
                benchmarks.gpu_mt_no_copy_performance_dev = new double[ndev];
            benchmarks.gpu_mt_no_copy_performance_dev[dev] = milion_evals_per_second;
            log.Add("mc performance: " + String.Format("{0:0.0}", milion_evals_per_second) + " milion eval/sec");
        }
Exemplo n.º 4
0
        /// <summary>
        /// CPU benchmark for a <c>CSVModel</c> named "DCSV1F". Builds a 64x8 grid with
        /// ten 30-day-spaced dates and forty weekly dates, then times two stages:
        /// <c>exe_mc_plan</c> (reported in GF/sec from <c>model.cpu_nflops</c>) and
        /// <c>host_d_mc_run1f</c> (reported in million evaluations per second).
        /// Both figures are stored in <c>benchmarks</c> and written to the log.
        /// </summary>
        public void run_benchmark_cpu_dcsv1f()
        {
            log.Add("running dcsv1f on the cpu");

            const int tenorCount = 10;
            const int observationCount = 40;
            const int scenPerBatch = 4096 * 25;
            const int batchCount = 100;
            const int gridX = 64;
            const int regimeCount = 8;
            const int startRegime = 4;
            const double spot = 100;

            TimeSpan baseStep = TimeSpan.FromDays(1);
            DateTime today = DateTime.Today;

            // Weekly dates: 7, 14, ..., 280 days from today.
            DateTime[] observationDates = new DateTime[observationCount];
            for (int k = 0; k < observationCount; k++)
            {
                observationDates[k] = today.AddDays(7 * (k + 1));
            }

            // Dates spaced 30 days apart: 30, 60, ..., 300 days from today.
            DateTime[] tenorDates = new DateTime[tenorCount];
            for (int i = 0; i < tenorCount; i++)
            {
                tenorDates[i] = today.AddDays(30 * (i + 1));
            }

            double[] pivots = new double[] { 1, 70, 90, 110, 140, 200, 300 };
            double[] spacings = new double[] { 5, 2.5, 1, 2.5, 5, 7.5, 10 };
            double[] regimeValues = new double[regimeCount];
            regimeValues[0] = 0.55;
            regimeValues[1] = 0.75;
            regimeValues[2] = 0.90;
            regimeValues[3] = 1.0;
            regimeValues[4] = 1.1;
            regimeValues[5] = 1.3;
            regimeValues[6] = 1.6;
            regimeValues[7] = 2.0;

            OPModel.Types.CDevice hostDevice =
                new OPModel.Types.CDevice(EFloatingPointPrecision.bit64, EFloatingPointUnit.host, 0);
            OPModel.Types.S2DGrid grid = new OPModel.Types.S2DGrid(
                hostDevice, gridX, regimeCount, today, tenorDates, baseStep, spot,
                pivots, spacings, regimeValues, startRegime);

            // Rows of the parameter matrix, in order: vol, lowbeta, highbeta, volvol,
            // volmrr, volmrl, jumpsz_minus, jumpsz_plus.
            double[] modelParams = new double[] { .25, 0.5, 0.5, 0.5, 0.5, 3, -3.0, 0.0 };

            // Only columns 0..2 of the 8x4 matrix are populated; column 3 stays zero.
            double[,] tauMatrix = new double[8, 4];
            for (int col = 0; col < 3; col++)
            {
                for (int row = 0; row < modelParams.Length; row++)
                {
                    tauMatrix[row, col] = modelParams[row];
                }
            }

            // Flat 5% rate; each factor compounds the previous one over the period's day count.
            double[] rates = new double[tenorCount];
            double[] discountFactors = new double[tenorCount];
            for (int i = 0; i < tenorCount; i++)
            {
                rates[i] = 0.05;
                if (i > 0)
                {
                    discountFactors[i] = discountFactors[i - 1]
                        * Math.Exp(-rates[i] * (tenorDates[i] - tenorDates[i - 1]).Days / 365.25);
                }
                else
                {
                    discountFactors[0] = Math.Exp(-rates[i] * (tenorDates[0] - grid.today).Days / 365.25);
                }
            }

            CStopWatch timer = new CStopWatch();
            CSVModel model = new CSVModel(grid, "DCSV1F", discountFactors);
            model.mkgen(tauMatrix, null);
            model.make_mc_plan(scenPerBatch, batchCount, observationDates);

            // Stage 1: plan execution.
            model.reset_flop_counter();
            double elapsed = timer.Peek();
            timer.Reset();
            model.exe_mc_plan();
            elapsed = timer.Peek();

            double flopCount = model.cpu_nflops;
            double gflopsPerSec = flopCount / (1000000000d * elapsed);
            if (benchmarks == null)
            {
                benchmarks = new SBenchmarks();
            }
            benchmarks.cpu_dcsv1f_blas_performance = gflopsPerSec;
            log.Add("blas performance: " + String.Format("{0:0.0}", gflopsPerSec) + " GF/sec");

            // Stage 2: Monte Carlo run with an empty evaluator.
            CMCEvaluator evaluator = new emptyEvaluator();
            double[] payoffs = new double[scenPerBatch * model.mcplan.nth];

            model.host_d_mc_init();
            timer.Reset();
            unsafe
            {
                model.host_d_mc_run1f(payoffs, evaluator);
            }
            elapsed = timer.Peek();

            double evalCount = (double)observationDates.Length * (double)batchCount * (double)scenPerBatch;
            double millionEvalsPerSec = evalCount / (1000000 * elapsed);
            benchmarks.cpu_dcsv1f_mc_performance = millionEvalsPerSec;
            log.Add("mc performance: " + String.Format("{0:0.0}", millionEvalsPerSec) + " milion eval/sec");
        }
Exemplo n.º 5
0
        /// <summary>
        /// Runs 2000 batches of <c>host_d_mc_mt_func</c> through a <c>CJobQueue</c> using
        /// one worker per logical processor, and stores the rate (million evaluations
        /// per second) in <c>benchmarks.cpu_mt_performance</c>.
        /// </summary>
        public void run_benchmark_cpu_mt()
        {
            log.Add("running the Mersenne Twister benchmark on the CPU");

            const int scenPerBatch = 4096 * 25;
            const int batchCount = 2000;

            int workerCount = Environment.ProcessorCount;
            OPModel.Types.CRangen.mtinit(workerCount);

            // Output buffer shared by every job: one batch-sized slice per worker.
            float[] uniforms = new float[scenPerBatch * workerCount];

            OPModel.Types.CJobQueue jobQueue = new OPModel.Types.CJobQueue();

            // One input descriptor per batch.
            object[] jobInputs = new object[batchCount];
            int batch = 0;
            while (batch < batchCount)
            {
                host_d_mc_mt_func_input job = new host_d_mc_mt_func_input();
                job.nth = workerCount;
                job.batch = batch;
                job.nscen_per_batch = scenPerBatch;
                job.host_unif_scen_th = uniforms;
                jobInputs[batch] = job;
                batch++;
            }

            CStopWatch timer = new CStopWatch();
            timer.Reset();
            jobQueue.Exec(host_d_mc_mt_func, null, jobInputs, workerCount);
            double elapsed = timer.Peek();

            double evalCount = (double)batchCount * (double)scenPerBatch;
            double millionEvalsPerSec = evalCount / (1000000 * elapsed);

            if (benchmarks == null)
            {
                benchmarks = new SBenchmarks();
            }
            benchmarks.cpu_mt_performance = millionEvalsPerSec;
            log.Add("mt performance: " + String.Format("{0:0.0}", millionEvalsPerSec) + " milion eval/sec");

            // Drop the local reference to the buffer and request a collection.
            uniforms = null;
            System.GC.Collect();
        }
Exemplo n.º 6
0
        /// <summary>
        /// CPU benchmark for a <c>CLVModel</c> named "DCLV1F". Builds a 128-point grid
        /// with ten 30-day-spaced dates and forty weekly dates, fills per-date drift and
        /// volatility tables, then times two stages: <c>exe_mc_plan</c> (GF/sec from
        /// <c>model.cpu_nflops</c>) and <c>host_d_mc_run1f</c> (million evaluations per
        /// second). Both figures are stored in <c>benchmarks</c> and logged.
        /// </summary>
        public void run_benchmark_cpu_dclv1f()
        {
            log.Add("running dclv1f on the cpu");

            const int tenorCount = 10;
            const int observationCount = 40;
            const int scenPerBatch = 4096 * 25;
            const int batchCount = 120;
            const int gridPoints = 128;
            const double spot = 100;

            TimeSpan baseStep = TimeSpan.FromDays(1);
            DateTime today = DateTime.Today;

            // Weekly dates: 7, 14, ..., 280 days from today.
            DateTime[] observationDates = new DateTime[observationCount];
            for (int k = 0; k < observationCount; k++)
            {
                observationDates[k] = today.AddDays(7 * (k + 1));
            }

            // Dates spaced 30 days apart: 30, 60, ..., 300 days from today.
            DateTime[] tenorDates = new DateTime[tenorCount];
            for (int i = 0; i < tenorCount; i++)
            {
                tenorDates[i] = today.AddDays(30 * (i + 1));
            }

            double[] pivots = new double[] { 1, 70, 90, 110, 140, 200, 300 };
            double[] spacings = new double[] { 5, 2.5, 1, 2.5, 5, 7.5, 10 };

            OPModel.Types.CDevice hostDevice =
                new OPModel.Types.CDevice(EFloatingPointPrecision.bit64, EFloatingPointUnit.host, 0);
            OPModel.Types.S1DGrid grid = new OPModel.Types.S1DGrid(
                hostDevice, gridPoints, today, tenorDates, baseStep, spot, pivots, spacings);

            double exponent;
            double[] rates = new double[tenorCount];
            double[] discountFactors = new double[tenorCount];
            double[][] driftTable = new double[tenorCount][];
            double[][] volTable = new double[tenorCount][];

            for (int i = 0; i < tenorCount; i++)
            {
                // Flat 5% rate; each factor compounds the previous one over the period's day count.
                rates[i] = 0.05;
                if (i > 0)
                {
                    discountFactors[i] = discountFactors[i - 1]
                        * Math.Exp(-rates[i] * (tenorDates[i] - tenorDates[i - 1]).Days / 365.25);
                }
                else
                {
                    discountFactors[0] = Math.Exp(-rates[i] * (tenorDates[0] - grid.today).Days / 365.25);
                }

                double baseVol = 0.25;
                exponent = 1;
                driftTable[i] = new double[grid.d];
                volTable[i] = new double[grid.d];

                // Drift is rate * x; vol is baseVol * x0 * (x / x0)^exponent with x0 = xval(y0).
                for (int y = 0; y < grid.d; y++)
                {
                    driftTable[i][y] = rates[i] * grid.host_d_xval(y);
                    volTable[i][y] = baseVol * grid.host_d_xval(grid.y0)
                        * Math.Pow(grid.host_d_xval(y) / grid.host_d_xval(grid.y0), exponent);
                }
            }

            CStopWatch timer = new CStopWatch();
            timer.Reset();
            CLVModel model = new CLVModel(grid, "DCLV1F");

            model.set_discount_curve(discountFactors);
            model.mkgen(driftTable, volTable);
            model.make_mc_plan(scenPerBatch, batchCount, observationDates);

            // Stage 1: plan execution.
            model.reset_flop_counter();
            double elapsed = timer.Peek();
            timer.Reset();
            model.exe_mc_plan();
            elapsed = timer.Peek();

            double flopCount = model.cpu_nflops;
            double gflopsPerSec = flopCount / (1000000000d * elapsed);

            if (benchmarks == null)
            {
                benchmarks = new SBenchmarks();
            }
            benchmarks.cpu_dclv1f_blas_performance = gflopsPerSec;
            log.Add("blas performance: " + String.Format("{0:0.0}", gflopsPerSec) + " GF/sec");

            // Stage 2: Monte Carlo run with an empty evaluator.
            CMCEvaluator evaluator = new emptyEvaluator();
            model.host_d_mc_init();
            double[] payoffs = new double[scenPerBatch * model.mcplan.nth];
            timer.Reset();
            unsafe
            {
                model.host_d_mc_run1f(payoffs, evaluator);
            }
            elapsed = timer.Peek();

            double evalCount = (double)observationDates.Length * (double)batchCount * (double)scenPerBatch;
            double millionEvalsPerSec = evalCount / (1000000 * elapsed);

            benchmarks.cpu_dclv1f_mc_performance = millionEvalsPerSec;
            log.Add("mc performance: " + String.Format("{0:0.0}", millionEvalsPerSec) + " milion eval/sec");

            // NOTE(review): this method never calls an opcuda init itself; confirm whether
            // this shutdown is required here or is a leftover from the GPU variants.
            opcuda_shutdown();
        }
Exemplo n.º 7
0
        /// <summary>
        /// Times 100 calls to <c>opcuda_benchmark_shared_peak</c> on four device buffers
        /// of <c>block_dim * nblocks</c> floats each and stores the resulting rate (GF/sec)
        /// in <c>benchmarks.shared_peak_performance_dev[dev]</c>.
        /// </summary>
        /// <param name="dev">Device index passed to <c>opcuda_set_device</c>; also the slot
        /// written in the per-device result array.</param>
        /// <exception cref="ExecutionEngineException"><c>opcuda_cublas_init</c> returned a non-zero status.</exception>
        /// <exception cref="System.Exception">A host/device copy returned a non-zero status.</exception>
        public void run_benchmark_shared_peak(uint dev)
        {
            log.Add("running shared_peak, device " + dev);

            int status = opcuda_cublas_init();
            if (status != 0) throw new ExecutionEngineException();

            int nblocks = 128;
            int block_dim = 64;
            int niter = 100;

            // Every device buffer has this size, and every transfer in both directions must
            // use it too. The original copied 128 * nblocks floats back for x1/y1, which is
            // twice the size of the device allocation.
            uint nbytes = (uint)(block_dim * nblocks * sizeof(float));

            // Host arrays keep their original (larger) length; only the first
            // block_dim * nblocks elements take part in the transfers.
            float[] x1 = new float[128 * nblocks];
            float[] y1 = new float[128 * nblocks];
            float[] x2 = new float[128 * nblocks];
            float[] y2 = new float[128 * nblocks];
            Random rg = new Random();

            // Fill the first block_dim entries with points on the unit circle.
            for (int i = 0; i < block_dim; i++)
            {
                double alpha = (float)rg.Next(-1000, 1000) / (float)1000;
                x1[i] = (float)Math.Cos(alpha);
                y1[i] = (float)Math.Sin(alpha);
                double beta = (float)rg.Next(-1000, 1000) / (float)1000;
                x2[i] = (float)Math.Cos(beta);
                y2[i] = (float)Math.Sin(beta);
            }

            double time0, time1;

            try
            {
                opcuda_set_device(dev);

                uint x1ptr = opcuda_mem_alloc(nbytes);
                uint y1ptr = opcuda_mem_alloc(nbytes);
                uint x2ptr = opcuda_mem_alloc(nbytes);
                uint y2ptr = opcuda_mem_alloc(nbytes);

                try
                {
                    unsafe
                    {
                        fixed (float* x1p = &x1[0], y1p = &y1[0], x2p = &x2[0], y2p = &y2[0])
                        {
                            status = opcuda_memcpy_h2d(x1ptr, (IntPtr)x1p, nbytes);
                            if (status != 0) throw new System.Exception();
                            status = opcuda_memcpy_h2d(y1ptr, (IntPtr)y1p, nbytes);
                            if (status != 0) throw new System.Exception();
                            status = opcuda_memcpy_h2d(x2ptr, (IntPtr)x2p, nbytes);
                            if (status != 0) throw new System.Exception();
                            status = opcuda_memcpy_h2d(y2ptr, (IntPtr)y2p, nbytes);
                            if (status != 0) throw new System.Exception();
                        }
                    }

                    // Make sure the uploads are finished before the clock starts.
                    opcuda_thread_synchronize();

                    CStopWatch sw = new CStopWatch();
                    sw.Reset();
                    time0 = sw.Peek();
                    for (int iter = 0; iter < niter; iter++)
                    {
                        opcuda_benchmark_shared_peak(x1ptr, y1ptr, x2ptr, y2ptr);
                    }

                    opcuda_thread_synchronize();
                    time1 = sw.Peek();

                    unsafe
                    {
                        fixed (float* x1p = &x1[0], y1p = &y1[0], x2p = &x2[0], y2p = &y2[0])
                        {
                            status = opcuda_memcpy_d2h(x1ptr, (IntPtr)x1p, nbytes);
                            if (status != 0) throw new System.Exception();
                            status = opcuda_memcpy_d2h(y1ptr, (IntPtr)y1p, nbytes);
                            if (status != 0) throw new System.Exception();
                            status = opcuda_memcpy_d2h(x2ptr, (IntPtr)x2p, nbytes);
                            if (status != 0) throw new System.Exception();
                            status = opcuda_memcpy_d2h(y2ptr, (IntPtr)y2p, nbytes);
                            if (status != 0) throw new System.Exception();
                        }
                    }
                }
                finally
                {
                    // The original never released these buffers.
                    opcuda_mem_free_device(x1ptr);
                    opcuda_mem_free_device(y1ptr);
                    opcuda_mem_free_device(x2ptr);
                    opcuda_mem_free_device(y2ptr);
                }
            }
            finally
            {
                // As in the original, the shutdown status is not inspected.
                opcuda_shutdown();
            }

            double nflops = (double)niter * 4 * 8 * 300d * nblocks * (double)block_dim;
            double gigaflops_per_second = nflops / (1000000000d * (time1 - time0));
            if (benchmarks == null) benchmarks = new SBenchmarks();
            if (benchmarks.shared_peak_performance_dev == null)
                benchmarks.shared_peak_performance_dev = new double[ndev];

            benchmarks.shared_peak_performance_dev[dev] = gigaflops_per_second;
        }
Exemplo n.º 8
0
        /// <summary>
        /// GPU benchmark for a <c>CLVModel</c> named "SGLV1F" in 32-bit precision. Builds
        /// a 128-point grid with ten 30-day-spaced dates and forty weekly dates, fills
        /// per-date drift and volatility tables, then times two stages:
        /// <c>exe_mc_plan</c> (GF/sec from <c>model.gpu_nflops</c>) and
        /// <c>device_mc_run1f</c> (million evaluations per second). Both figures are
        /// stored per device in <c>benchmarks</c> and logged.
        /// </summary>
        /// <param name="dev">Device index handed to <c>CDevice</c>; also the slot written
        /// in the per-device result arrays.</param>
        /// <exception cref="ExecutionEngineException"><c>opcuda_shutdown</c> returned a non-zero status.</exception>
        public void run_benchmark_gpu_sglv1f(uint dev)
        {
            log.Add("running sglv1f, device " + dev);

            const int tenorCount = 10;
            const int observationCount = 40;
            const int scenPerBatch = 4096 * 25;
            const int batchCount = 20;
            const int gridPoints = 128;
            const double spot = 100;

            TimeSpan baseStep = TimeSpan.FromDays(1);
            DateTime today = DateTime.Today;

            // Weekly dates: 7, 14, ..., 280 days from today.
            DateTime[] observationDates = new DateTime[observationCount];
            for (int k = 0; k < observationCount; k++)
            {
                observationDates[k] = today.AddDays(7 * (k + 1));
            }

            // Dates spaced 30 days apart: 30, 60, ..., 300 days from today.
            DateTime[] tenorDates = new DateTime[tenorCount];
            for (int i = 0; i < tenorCount; i++)
            {
                tenorDates[i] = today.AddDays(30 * (i + 1));
            }

            double[] pivots = new double[] { 1, 70, 90, 110, 140, 200, 300 };
            double[] spacings = new double[] { 5, 2.5, 1, 2.5, 5, 7.5, 10 };

            OPModel.Types.CDevice gpuDevice =
                new OPModel.Types.CDevice(EFloatingPointPrecision.bit32, EFloatingPointUnit.device, dev);
            OPModel.Types.S1DGrid grid = new OPModel.Types.S1DGrid(
                gpuDevice, gridPoints, today, tenorDates, baseStep, spot, pivots, spacings);

            double exponent;
            double[] rates = new double[tenorCount];
            double[] discountFactors = new double[tenorCount];
            double[][] driftTable = new double[tenorCount][];
            double[][] volTable = new double[tenorCount][];

            for (int i = 0; i < tenorCount; i++)
            {
                // Flat 5% rate; each factor compounds the previous one over the period's day count.
                rates[i] = 0.05;
                if (i > 0)
                {
                    discountFactors[i] = discountFactors[i - 1]
                        * Math.Exp(-rates[i] * (tenorDates[i] - tenorDates[i - 1]).Days / 365.25);
                }
                else
                {
                    discountFactors[0] = Math.Exp(-rates[i] * (tenorDates[0] - grid.today).Days / 365.25);
                }

                double baseVol = 0.25;
                exponent = 1;
                driftTable[i] = new double[grid.d];
                volTable[i] = new double[grid.d];

                // Drift is rate * x; vol is baseVol * x0 * (x / x0)^exponent with x0 = xval(y0).
                for (int y = 0; y < grid.d; y++)
                {
                    driftTable[i][y] = rates[i] * grid.host_d_xval(y);
                    volTable[i][y] = baseVol * grid.host_d_xval(grid.y0)
                        * Math.Pow(grid.host_d_xval(y) / grid.host_d_xval(grid.y0), exponent);
                }
            }

            CStopWatch timer = new CStopWatch();
            timer.Reset();

            CLVModel model = new CLVModel(grid, "SGLV1F");
            model.set_discount_curve(discountFactors);
            model.mkgen(driftTable, volTable);
            model.make_mc_plan(scenPerBatch, batchCount, observationDates);
            model.reset_flop_counter();

            // Stage 1: plan execution; the clock is read after the device synchronize.
            timer.Reset();
            model.exe_mc_plan();
            model.device_thread_synchronize();
            double elapsed = timer.Peek();

            double flopCount = model.gpu_nflops;
            double gflopsPerSec = flopCount / (1000000000d * elapsed);

            if (benchmarks == null)
            {
                benchmarks = new SBenchmarks();
            }
            if (benchmarks.gpu_sglv1f_blas_performance_dev == null)
            {
                benchmarks.gpu_sglv1f_blas_performance_dev = new double[ndev];
            }
            benchmarks.gpu_sglv1f_blas_performance_dev[dev] = gflopsPerSec;
            log.Add("blas performance: " + String.Format("{0:0.0}", gflopsPerSec) + " GF/sec");

            // Stage 2: Monte Carlo run with an empty evaluator.
            CMCEvaluator evaluator = new emptyEvaluator();
            double[] pdf = new double[grid.d];

            // A return value of 1 from device_mc_init is treated as failure: report it and stop.
            // NOTE(review): this early exit skips opcuda_shutdown — confirm that is intended.
            if (model.device_mc_init() == 1)
            {
                MessageBox.Show("device_mc_init() failed", "OPBench", MessageBoxButtons.OK);
                return;
            }

            timer.Reset();
            unsafe
            {
                model.device_mc_run1f(pdf, evaluator);
            }
            model.device_thread_synchronize();
            elapsed = timer.Peek();

            double evalCount = (double)observationDates.Length * (double)batchCount * (double)scenPerBatch;
            double millionEvalsPerSec = evalCount / (1000000 * elapsed);

            int status = opcuda_shutdown();
            if (status != 0)
            {
                throw new ExecutionEngineException();
            }

            if (benchmarks == null)
            {
                benchmarks = new SBenchmarks();
            }
            if (benchmarks.gpu_sglv1f_mc_performance_dev == null)
            {
                benchmarks.gpu_sglv1f_mc_performance_dev = new double[ndev];
            }
            benchmarks.gpu_sglv1f_mc_performance_dev[dev] = millionEvalsPerSec;
            log.Add("mc performance: " + String.Format("{0:0.0}", millionEvalsPerSec) + " milion eval/sec");

            // NOTE(review): second shutdown call, status ignored; kept as in the original —
            // confirm whether it is needed.
            opcuda_shutdown();
        }
Exemplo n.º 9
0
        public void run_benchmark_gpu_sgemv4(uint dev)
        {
            log.Add("running sgemv4, device " + dev);

            int ni = 20;
            Random rg = new Random();
            int d = 96 * 6;
            int nz = 1600;
            int[] ncol_i = new int[ni];
            int[][] col0_i_c = new int[ni][];
            int[][] col1_i_c = new int[ni][];
            int[] col0_c = new int[2 * nz];
            int[] col1_c = new int[2 * nz];

            bool[] todo_v = new bool[2 * ni * nz];

            for (int i = 0; i < ni; i++)
            {
                ncol_i[i] = 0;
                for (int iter = 0; iter < 60; iter++)
                {
                    if (rg.Next(2) == 1)
                    {
                        int v0 = 0, v1 = 0;
                        int x = rg.Next(2 * nz);
                        for (int k = 0; k <= x; k++)
                        {
                            v0 += 1;
                            if (v0 >= 2 * nz) v0 -= 2 * nz;
                            while (todo_v[v0])
                            {
                                v0 += 1;
                                if (v0 >= 2 * nz) v0 -= 2 * nz;
                            }
                        }
                        if (todo_v[v0]) throw new System.Exception();
                        todo_v[v0] = true;
                        x = rg.Next(2 * nz);
                        for (int k = 0; k <= x; k++)
                        {
                            v1 += 1;
                            if (v1 >= 2 * nz) v1 -= 2 * nz;
                            while (todo_v[v1])
                            {
                                v1 += 1;
                                if (v1 >= 2 * nz) v1 -= 2 * nz;
                            }
                        }
                        if (todo_v[v1]) throw new System.Exception();
                        todo_v[v1] = true;
                        col0_c[ncol_i[i]] = v0 * d;
                        col1_c[ncol_i[i]] = v1 * d;
                        ncol_i[i] += 1;
                    }
                }

                col0_i_c[i] = new int[16 * ((ncol_i[i] + 15) / 16)];
                col1_i_c[i] = new int[16 * ((ncol_i[i] + 15) / 16)];

                int c;
                for (c = 0; c < ncol_i[i]; c++)
                {
                    col0_i_c[i][c] = col0_c[c];
                    col1_i_c[i][c] = col1_c[c];
                }

                for (; c < col0_i_c[i].Length; c++)
                {
                    col0_i_c[i][c] = 2 * nz * d;
                    col1_i_c[i][c] = 2 * nz * d;
                }
            }

            int fsz = ni * d * d + 2 * d * nz + 1;

            float[] host_float_buf = new float[fsz];

            for (int a = 0; a < host_float_buf.Length; a++)
                host_float_buf[a] = (float)(0.01 * rg.Next(-1000, 1000));

            int status = opcuda_cublas_init();
            if (status != 0) throw new ExecutionEngineException();
            opcuda_set_device(dev);

            uint device_float_buf_ptr = opcuda_mem_alloc((uint)(host_float_buf.Length * sizeof(float)));

            unsafe
            {
                fixed (float* bufp = &host_float_buf[0])
                {
                    status = opcuda_memcpy_h2d(device_float_buf_ptr, (IntPtr)bufp, (uint)(fsz * sizeof(float)));
                    if (status != 0) throw new System.Exception();
                }
            }

            int nblocks = 0;
            for (int i = 0; i < ni; i++)
            {
                int blockDim_x, blockDim_y;
                blockDim_x = (d + 63) / 64;
                blockDim_y = (ncol_i[i] + 15) / 16;
                nblocks += blockDim_x * blockDim_y;
            }

            int bid = 0;
            int[] i_bid = new int[nblocks];
            int[] host_blockIdx_x_bid = new int[nblocks];
            int[] host_blockIdx_y_bid = new int[nblocks];

            for (int i = 0; i < ni; i++)
            {
                int blockDim_x, blockDim_y;
                blockDim_x = (d + 63) / 64;
                blockDim_y = (ncol_i[i] + 15) / 16;
                for (int bx = 0; bx < blockDim_x; bx++)
                {
                    for (int by = 0; by < blockDim_y; by++)
                    {
                        i_bid[bid] = i;
                        host_blockIdx_x_bid[bid] = bx;
                        host_blockIdx_y_bid[bid] = by;
                        bid += 1;
                    }
                }
            }

            long ptr = (long)device_float_buf_ptr;
            uint[] A_i = new uint[ni];
            uint Z;

            Z = (uint)ptr;
            ptr += 2 * nz * d * sizeof(float);

            for (int i = 0; i < ni; i++)
            {
                A_i[i] = (uint)ptr;
                ptr += d * d * sizeof(float);
            }

            int nargs = 10;
            int isz = (nargs + 1) * nblocks;
            for (int i = 0; i < ni; i++)
            {
                isz += 16 * 2 * ((ncol_i[i] + 15) / 16);
            }
            uint device_int_buf_ptr = opcuda_mem_alloc((uint)(isz * sizeof(int)));
            uint[] col0_v_ptr_i = new uint[ni];
            uint[] col1_v_ptr_i = new uint[ni];

            long colptr = device_int_buf_ptr + (nargs + 1) * nblocks * sizeof(uint);
            for (int i = 0; i < ni; i++)
            {
                col0_v_ptr_i[i] = (uint)colptr;
                colptr += 16 * ((ncol_i[i] + 15) / 16) * sizeof(uint);
                col1_v_ptr_i[i] = (uint)colptr;
                colptr += 16 * ((ncol_i[i] + 15) / 16) * sizeof(uint);
            }

            uint[] host_int_buf = new uint[isz];
            for (bid = 0; bid < nblocks; bid++)
            {
                int offset = nblocks + nargs * bid;
                host_int_buf[bid] = (uint)(device_int_buf_ptr + offset * sizeof(uint));
                int i = i_bid[bid];
                host_int_buf[offset + 0] = (uint)i;                             //const int i = c[0];
                host_int_buf[offset + 1] = (uint)host_blockIdx_x_bid[bid];      //const int blockIdx_x = c[1];
                host_int_buf[offset + 2] = (uint)host_blockIdx_y_bid[bid];      //const int blockIdx_y = c[2];
                host_int_buf[offset + 3] = (uint)d;                             //const int m = c[3];
                host_int_buf[offset + 4] = (uint)nz;                            //const int n = c[4];
                host_int_buf[offset + 5] = (uint)ncol_i[i];                     //int k = c[5];
                host_int_buf[offset + 6] = A_i[i];                               //float* A = (float*)(c[6]);
                host_int_buf[offset + 7] = Z;                                    //float* B = (float*)(c[7]);
                host_int_buf[offset + 8] = col0_v_ptr_i[i];                               //float* C = (float*)(c[8]);
                host_int_buf[offset + 9] = col1_v_ptr_i[i];                               //float* C = (float*)(c[8]);
            }

            int coffset = nblocks + nargs * nblocks;
            for (int i = 0; i < ni; i++)
            {
                int c = 0;
                int bufsz = 16 * ((ncol_i[i] + 15) / 16);

                for (c = 0; c < ncol_i[i]; c++)
                {
                    host_int_buf[coffset + c] = (uint)col0_i_c[i][c];
                }

                for (; c < bufsz; c++)
                {
                    host_int_buf[coffset + c] = (uint)(2 * nz * d);
                }

                coffset += bufsz;
                for (c = 0; c < ncol_i[i]; c++)
                {
                    host_int_buf[coffset + c] = (uint)col1_i_c[i][c];
                }

                for (; c < bufsz; c++)
                {
                    host_int_buf[coffset + c] = (uint)(2 * nz * d);
                }
                coffset += bufsz;
            }
            float[] host_float_buf2 = new float[fsz];
            uint[] host_int_buf2 = new uint[isz];

            unsafe
            {
                fixed (float* bufp = &host_float_buf[0])
                {
                    status = opcuda_memcpy_h2d(device_float_buf_ptr, (IntPtr)bufp, (uint)(fsz * sizeof(float)));
                    if (status != 0) throw new System.Exception();
                }
                fixed (uint* bufp = &host_int_buf[0])
                {
                    status = opcuda_memcpy_h2d(device_int_buf_ptr, (IntPtr)bufp, (uint)(isz * sizeof(uint)));
                    if (status != 0) throw new System.Exception();
                }
                fixed (float* bufp = &host_float_buf2[0])
                {
                    status = opcuda_memcpy_d2h(device_float_buf_ptr, (IntPtr)bufp, (uint)(fsz * sizeof(float)));
                    if (status != 0) throw new System.Exception();
                }
                fixed (uint* bufp = &host_int_buf2[0])
                {
                    status = opcuda_memcpy_d2h(device_int_buf_ptr, (IntPtr)bufp, (uint)(isz * sizeof(uint)));
                    if (status != 0) throw new System.Exception();
                }
            }

            opcuda_sgemv3(d, nz, ncol_i[0], A_i[0], Z, col0_v_ptr_i[0], col1_v_ptr_i[0]);

            unsafe
            {
                fixed (float* buf2p = &host_float_buf2[0])
                {
                    status = opcuda_memcpy_d2h(device_float_buf_ptr, (IntPtr)buf2p, (uint)(fsz * sizeof(float)));
                    if (status != 0) throw new System.Exception();

                    fixed (float* bufp = &host_float_buf[0])
                    {
                        int ptr2 = 0;
                        float errore, maxerror1, maxerror2;
                        maxerror1 = 0;
                        float* Zp = bufp;
                        float* Z2p = buf2p;
                        ptr2 += 2 * d * nz;
                        maxerror2 = 0.0000001f;
                        for (int j = 0; j < d; j++)
                        {
                            for (int v = 0; v < 2 * nz; v++)
                            {
                                if (!todo_v[v])
                                {
                                    errore = Math.Abs(Zp[j + d * v] - Z2p[j + d * v]);
                                    if (errore > maxerror2)
                                        maxerror2 = errore;
                                }
                            }
                        }
                        if (maxerror2 > Math.Pow(10, -6)) throw new System.Exception();
                        int i = 0;
                        float* Ap = bufp + ptr2;
                        float* A2p = buf2p + ptr2;
                        ptr2 += d * d;
                        maxerror2 = 0;
                        for (int j = 0; j < d; j++)
                        {
                            for (int k = 0; k < d; k++)
                            {
                                errore = Math.Abs(Ap[j + d * k] - A2p[j + d * k]);
                                if (errore > maxerror2) maxerror2 = errore;
                            }
                        }
                        if (maxerror2 > Math.Pow(10, -6)) throw new System.Exception();

                        for (int c = 0; c < ncol_i[i]; c++)
                        {
                            opc_sgemv(d, d, 1, Ap, d, Zp + col0_i_c[i][c], 1, 0, Zp + col1_i_c[i][c], 1);
                        }

                        for (int j = 0; j < d; j++)
                        {
                            for (int k = 0; k < nz; k++)
                            {
                                errore = Math.Abs(Zp[j + d * k] - Z2p[j + d * k]) / (1 + Math.Abs(Zp[j + d * k]));
                                if (errore > maxerror1)
                                {
                                    maxerror1 = errore;
                                }
                            }
                        }
                        if (maxerror1 > Math.Pow(10, -3)) throw new System.Exception();
                    }
                }
            }

            unsafe
            {
                fixed (float* bufp = &host_float_buf[0])
                {
                    status = opcuda_memcpy_h2d(device_float_buf_ptr, (IntPtr)bufp, (uint)(fsz * sizeof(float)));
                    if (status != 0) throw new System.Exception();
                }
                fixed (uint* bufp = &host_int_buf[0])
                {
                    status = opcuda_memcpy_h2d(device_int_buf_ptr, (IntPtr)bufp, (uint)(isz * sizeof(uint)));
                    if (status != 0) throw new System.Exception();
                }
            }

            opcuda_sgemv4(nblocks, device_int_buf_ptr);

            unsafe
            {
                fixed (float* buf2p = &host_float_buf2[0])
                {
                    status = opcuda_memcpy_d2h(device_float_buf_ptr, (IntPtr)buf2p, (uint)(fsz * sizeof(float)));
                    if (status != 0) throw new System.Exception();

                    fixed (float* bufp = &host_float_buf[0])
                    {
                        int ptr2 = 0;
                        float errore, maxerror1, maxerror2;
                        maxerror1 = 0;

                        float* Zp = bufp;
                        float* Z2p = buf2p;
                        ptr2 += 2 * d * nz;

                        maxerror2 = 0.0000001f;
                        for (int j = 0; j < d; j++)
                        {
                            for (int v = 0; v < 2 * nz; v++)
                            {
                                if (!todo_v[v])
                                {
                                    errore = Math.Abs(Zp[j + d * v] - Z2p[j + d * v]);
                                    if (errore > maxerror2)
                                        maxerror2 = errore;
                                }
                            }
                        }
                        if (maxerror2 > Math.Pow(10, -6)) throw new System.Exception();

                        for (int i = 0; i < ni; i++)
                        {
                            float* Ap = bufp + ptr2;
                            float* A2p = buf2p + ptr2;
                            ptr2 += d * d;

                            maxerror2 = 0;
                            for (int j = 0; j < d; j++)
                            {
                                for (int k = 0; k < d; k++)
                                {
                                    errore = Math.Abs(Ap[j + d * k] - A2p[j + d * k]);
                                    if (errore > maxerror2) maxerror2 = errore;
                                }
                            }
                            if (maxerror2 > Math.Pow(10, -6)) throw new System.Exception();

                            for (int c = 0; c < ncol_i[i]; c++)
                            {
                                opc_sgemv(d, d, 1, Ap, d, Zp + col0_i_c[i][c], 1, 0, Zp + col1_i_c[i][c], 1);
                            }

                        }

                        for (int j = 0; j < d; j++)
                        {
                            for (int k = 0; k < nz; k++)
                            {
                                errore = Math.Abs(Zp[j + d * k] - Z2p[j + d * k]) / (1 + Math.Abs(Zp[j + d * k]));
                                if (errore > maxerror1)
                                {
                                    maxerror1 = errore;
                                }
                            }
                        }

                        if (maxerror1 > Math.Pow(10, -3)) throw new System.Exception();
                    }
                }
            }

            CStopWatch sw = new CStopWatch();
            sw.Reset();

            int niter = 100;

            for (int iter = 0; iter < niter; iter++)
            {
                opcuda_sgemv4(nblocks, device_int_buf_ptr);
            }

            opcuda_thread_synchronize();
            double time1 = sw.Peek();

            double nflops = 0;
            for (int i = 0; i < ni; i++)
            {
                nflops += 2 * d * d * ncol_i[i];
            }
            nflops *= niter;
            double gigaflops_per_second = nflops / (1000000000d * time1);

            status = opcuda_shutdown();
            if (status != 0) throw new ExecutionEngineException();

            if (benchmarks == null) benchmarks = new SBenchmarks();
            if (benchmarks.gpu_sgemv4_performance_dev == null)
                benchmarks.gpu_sgemv4_performance_dev = new double[ndev];

            benchmarks.gpu_sgemv4_performance_dev[dev] = gigaflops_per_second;

            log.Add("performance: " + String.Format("{0:0.0}", gigaflops_per_second) + " GF/sec");

            opcuda_shutdown();
        }
Exemplo n.º 10
0
        /// <summary>
        /// Benchmarks <c>opcuda_sgemm4</c> on one device and records the result in GF/sec.
        /// </summary>
        /// <remarks>
        /// Packs <c>ni</c> (A, B, C) float matrix triples, each d x d, back to back into one
        /// device buffer, builds a per-block argument table in a second device buffer,
        /// calls <c>opcuda_sgemm4</c> once and compares the downloaded C matrices with the
        /// host result of <c>opc_sgemm</c>, then times <c>niter</c> further calls.
        /// The score is written to <c>benchmarks.gpu_sgemm4_performance_dev[dev]</c> and to <c>log</c>.
        /// Device buffers are released and <c>opcuda_shutdown</c> is called exactly once,
        /// whether or not the run succeeds.
        /// </remarks>
        /// <param name="dev">Device index passed to <c>opcuda_set_device</c>; also the slot written in the results array.</param>
        /// <exception cref="ExecutionEngineException"><c>opcuda_cublas_init</c> or <c>opcuda_shutdown</c> returned a nonzero status.</exception>
        /// <exception cref="System.Exception">A host/device copy returned a nonzero status, or a validation check failed.</exception>
        public void run_benchmark_gpu_sgemm4(uint dev)
        {
            log.Add("running sgemm4, device " + dev);

            int ni = 40;
            Random rg = new Random();
            int[] m_i = new int[ni];
            int[] n_i = new int[ni];
            int[] k_i = new int[ni];
            int d = 96 * 6;

            // Every problem is d x d x d; fsz is the total float count of all A, B and C matrices.
            int fsz = 0;
            for (int i = 0; i < ni; i++)
            {
                m_i[i] = d;
                n_i[i] = d;
                k_i[i] = d;
                fsz += m_i[i] * k_i[i] + k_i[i] * n_i[i] + m_i[i] * n_i[i];
            }

            // Random inputs in steps of 0.01 drawn from rg.Next(-1000, 1000).
            float[] host_float_buf = new float[fsz];
            for (int a = 0; a < host_float_buf.Length; a++)
                host_float_buf[a] = (float)(0.01 * rg.Next(-1000, 1000));

            // Each problem contributes ceil(m/64) * ceil(n/16) blocks.
            int nblocks = 0;
            for (int i = 0; i < ni; i++)
            {
                int tiles_x = (m_i[i] + 63) / 64;
                int tiles_y = (n_i[i] + 15) / 16;
                nblocks += tiles_x * tiles_y;
            }

            // For every flat block id: which problem it belongs to and its (x, y) position within it.
            int bid = 0;
            int[] i_bid = new int[nblocks];
            int[] host_blockIdx_x_bid = new int[nblocks];
            int[] host_blockIdx_y_bid = new int[nblocks];

            for (int i = 0; i < ni; i++)
            {
                int tiles_x = (m_i[i] + 63) / 64;
                int tiles_y = (n_i[i] + 15) / 16;
                for (int bx = 0; bx < tiles_x; bx++)
                {
                    for (int by = 0; by < tiles_y; by++)
                    {
                        i_bid[bid] = i;
                        host_blockIdx_x_bid[bid] = bx;
                        host_blockIdx_y_bid[bid] = by;
                        bid += 1;
                    }
                }
            }

            int status = opcuda_cublas_init();
            // Check the init status before touching the device, as the sibling benchmarks do.
            if (status != 0) throw new ExecutionEngineException();
            opcuda_set_device(dev);

            uint device_float_buf_ptr = opcuda_mem_alloc((uint)(host_float_buf.Length * sizeof(float)));
            uint device_int_buf_ptr = 0;
            bool int_buf_allocated = false;
            double gigaflops_per_second = 0;

            try
            {
                // Device addresses of A, B and C for each problem, laid out consecutively.
                long ptr = (long)device_float_buf_ptr;
                uint[] A_i = new uint[ni];
                uint[] B_i = new uint[ni];
                uint[] C_i = new uint[ni];

                for (int i = 0; i < ni; i++)
                {
                    A_i[i] = (uint)ptr;
                    ptr += m_i[i] * k_i[i] * sizeof(float);

                    B_i[i] = (uint)ptr;
                    ptr += k_i[i] * n_i[i] * sizeof(float);

                    C_i[i] = (uint)ptr;
                    ptr += m_i[i] * n_i[i] * sizeof(float);
                }

                // Integer buffer layout: nblocks device addresses (one per block, pointing at
                // that block's record) followed by nblocks records of nargs words each.
                int nargs = 9;
                int isz = nblocks * (1 + nargs);
                device_int_buf_ptr = opcuda_mem_alloc((uint)(isz * sizeof(uint)));
                int_buf_allocated = true;
                uint[] host_int_buf = new uint[isz];

                for (bid = 0; bid < nblocks; bid++)
                {
                    int offset = nblocks + nargs * bid;
                    host_int_buf[bid] = (uint)(device_int_buf_ptr + offset * sizeof(uint));
                    int i = i_bid[bid];
                    host_int_buf[offset + 0] = (uint)i;                        // kernel side: const int i = c[0];
                    host_int_buf[offset + 1] = (uint)host_blockIdx_x_bid[bid]; // kernel side: const int blockIdx_x = c[1];
                    host_int_buf[offset + 2] = (uint)host_blockIdx_y_bid[bid]; // kernel side: const int blockIdx_y = c[2];
                    host_int_buf[offset + 3] = (uint)m_i[i];                   // kernel side: const int m = c[3];
                    host_int_buf[offset + 4] = (uint)n_i[i];                   // kernel side: const int n = c[4];
                    host_int_buf[offset + 5] = (uint)k_i[i];                   // kernel side: int k = c[5];
                    host_int_buf[offset + 6] = A_i[i];                         // kernel side: float* A = (float*)(c[6]);
                    host_int_buf[offset + 7] = B_i[i];                         // kernel side: float* B = (float*)(c[7]);
                    host_int_buf[offset + 8] = C_i[i];                         // kernel side: float* C = (float*)(c[8]);
                }

                // Upload the matrices and the argument table.
                unsafe
                {
                    fixed (float* bufp = &host_float_buf[0])
                    {
                        status = opcuda_memcpy_h2d(device_float_buf_ptr, (IntPtr)bufp, (uint)(fsz * sizeof(float)));
                        if (status != 0) throw new System.Exception("sgemm4: float buffer upload failed");
                    }
                    fixed (uint* bufp = &host_int_buf[0])
                    {
                        status = opcuda_memcpy_h2d(device_int_buf_ptr, (IntPtr)bufp, (uint)(isz * sizeof(uint)));
                        if (status != 0) throw new System.Exception("sgemm4: argument table upload failed");
                    }
                }

                // One untimed call whose output is validated below.
                opcuda_sgemm4(nblocks, device_int_buf_ptr);

                float[] host_float_buf2 = new float[fsz];
                unsafe
                {
                    fixed (float* buf2p = &host_float_buf2[0])
                    {
                        status = opcuda_memcpy_d2h(device_float_buf_ptr, (IntPtr)buf2p, (uint)(fsz * sizeof(float)));
                        if (status != 0) throw new System.Exception("sgemm4: float buffer download failed");

                        fixed (float* bufp = &host_float_buf[0])
                        {
                            int ptr2 = 0;
                            float err, worst_rel_err, worst_abs_err;
                            worst_rel_err = 0;

                            for (int i = 0; i < ni; i++)
                            {
                                float* Ap = bufp + ptr2;
                                float* A2p = buf2p + ptr2;
                                ptr2 += m_i[i] * k_i[i];

                                float* Bp = bufp + ptr2;
                                float* B2p = buf2p + ptr2;
                                ptr2 += k_i[i] * n_i[i];

                                float* Cp = bufp + ptr2;
                                float* C2p = buf2p + ptr2;
                                ptr2 += m_i[i] * n_i[i];

                                // A must come back from the device unchanged.
                                worst_abs_err = 0;
                                for (int j = 0; j < m_i[i]; j++)
                                {
                                    for (int k = 0; k < k_i[i]; k++)
                                    {
                                        err = Math.Abs(Ap[j + m_i[i] * k] - A2p[j + m_i[i] * k]);
                                        if (err > worst_abs_err) worst_abs_err = err;
                                    }
                                }
                                if (worst_abs_err > Math.Pow(10, -6)) throw new System.Exception("sgemm4: input matrix A was modified on the device");

                                // B must come back from the device unchanged.
                                worst_abs_err = 0;
                                for (int j = 0; j < k_i[i]; j++)
                                {
                                    for (int k = 0; k < n_i[i]; k++)
                                    {
                                        err = Math.Abs(Bp[j + k_i[i] * k] - B2p[j + k_i[i] * k]);
                                        if (err > worst_abs_err) worst_abs_err = err;
                                    }
                                }
                                if (worst_abs_err > Math.Pow(10, -6)) throw new System.Exception("sgemm4: input matrix B was modified on the device");

                                // Host reference: opc_sgemm writes into Cp, i.e. the C region of host_float_buf.
                                opc_sgemm(Ap, Bp, Cp, m_i[i], k_i[i], m_i[i], m_i[i], n_i[i], k_i[i]);
                                for (int j = 0; j < m_i[i]; j++)
                                {
                                    for (int k = 0; k < n_i[i]; k++)
                                    {
                                        // Error relative to (1 + |reference|) so entries near zero do not blow up the ratio.
                                        err = Math.Abs(Cp[j + m_i[i] * k] - C2p[j + m_i[i] * k]) / (1 + Math.Abs(Cp[j + m_i[i] * k]));
                                        if (err > worst_rel_err)
                                        {
                                            worst_rel_err = err;
                                        }
                                    }
                                }
                            }
                            if (worst_rel_err > 2 * Math.Pow(10, -3)) throw new System.Exception("sgemm4: device result differs from host reference");
                        }
                    }
                }

                // Timed section: niter calls, then wait for the device before reading the clock.
                CStopWatch sw = new CStopWatch();
                sw.Reset();
                int niter = 10;

                for (int iter = 0; iter < niter; iter++)
                {
                    opcuda_sgemm4(nblocks, device_int_buf_ptr);
                }

                opcuda_thread_synchronize();
                double time1 = sw.Peek();

                // 2*m*n*k operations per problem; computed in double so larger sizes cannot wrap a 32-bit int.
                double nflops = 0;
                for (int i = 0; i < ni; i++)
                {
                    nflops += 2.0 * k_i[i] * m_i[i] * n_i[i];
                }
                nflops *= niter;
                gigaflops_per_second = nflops / (1000000000d * time1);
            }
            finally
            {
                // Runs on success and on failure, so a failed check no longer leaks device memory
                // or leaves the library initialised.
                if (int_buf_allocated) opcuda_mem_free_device(device_int_buf_ptr);
                opcuda_mem_free_device(device_float_buf_ptr);
                status = opcuda_shutdown();
            }
            // Only reached on the success path; a pending exception from the try block takes precedence.
            if (status != 0) throw new ExecutionEngineException();

            if (benchmarks == null) benchmarks = new SBenchmarks();
            if (benchmarks.gpu_sgemm4_performance_dev == null)
                benchmarks.gpu_sgemm4_performance_dev = new double[ndev];

            benchmarks.gpu_sgemm4_performance_dev[dev] = gigaflops_per_second;

            log.Add("performance: " + String.Format("{0:0.0}", gigaflops_per_second) + " GF/sec");
        }