Example 1
        /// <summary>
        /// Benchmarks the "Dot" dot-product kernel on the GPU, first synchronously
        /// and then asynchronously (pinned host memory + one stream per batch),
        /// then times three CPU baselines: a plain loop, a LINQ version, and
        /// async delegate invocations. Results and timings go to the console.
        /// Relies on members declared elsewhere in this class: N, blocksPerGrid,
        /// threadsPerBlock, sum_squares, DotProduct, DotProductLinq,
        /// DotProductDelegate.
        /// </summary>
        public static void Execute()
        {
            // Reuse a previously serialized Cudafy module when its checksums still
            // match; otherwise re-translate the kernels and cache the result.
            CudafyModule km = CudafyModule.TryDeserialize();
            if (km == null || !km.TryVerifyChecksums())
            {
                km = CudafyTranslator.Cudafy();
                km.Serialize();
            }

            GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target);
            gpu.LoadModule(km);

            // Dot-product result; recomputed by each strategy below so the final
            // console check can compare it against 2 * sum_squares(N - 1).
            float c = 0;

            int loops = 20;     // timing passes per strategy
            int batches = 12;   // work items (and distinct stream ids) per pass

            // allocate memory on the cpu side
            float[] a = new float[N];
            float[] b = new float[N];
            float[] partial_c = new float[blocksPerGrid];

            // allocate the memory on the GPU
            float[] dev_a = gpu.Allocate<float>(N);
            float[] dev_b = gpu.Allocate<float>(N);
            float[] dev_partial_c = gpu.Allocate<float>(blocksPerGrid);

            // NOTE(review): dev_test is allocated but never used anywhere below —
            // looks like leftover scaffolding; confirm before removing.
            float[] dev_test = gpu.Allocate<float>(blocksPerGrid * blocksPerGrid);

            // fill in the host memory with data
            for (int i = 0; i < N; i++)
            {
                a[i] = i;
                b[i] = i * 2;
            }

            // Synchronous Implementation: copy, launch, copy back, and reduce
            // once per batch, blocking at every step.
            Stopwatch sw = Stopwatch.StartNew();
            for (int l = 0; l < loops; l++)
            {
                for (int bat = 0; bat < batches; bat++)
                {
                    // copy the arrays 'a' and 'b' to the GPU
                    gpu.CopyToDevice(a, dev_a);
                    gpu.CopyToDevice(b, dev_b);

                    gpu.Launch(blocksPerGrid, threadsPerBlock, "Dot", dev_a, dev_b, dev_partial_c);

                    // copy the array 'c' back from the GPU to the CPU
                    gpu.CopyFromDevice(dev_partial_c, partial_c);
                    // finish up on the CPU side: sum the per-block partial results
                    c = 0;
                    for (int i = 0; i < blocksPerGrid; i++)
                    {
                        c += partial_c[i];
                    }
                }
            }
            long syncTime = sw.ElapsedMilliseconds;
            Console.WriteLine("Synchronous Time: {0}", syncTime);
            Console.WriteLine("Does GPU value {0} = {1}?\n", c, 2 * sum_squares((float)(N - 1)));

            // Asynchronous Pinned Memory Implementation: one pinned staging
            // buffer per batch so transfers can overlap kernel execution.
            IntPtr[] host_stages_a = new IntPtr[batches];
            IntPtr[] host_stages_b = new IntPtr[batches];
            IntPtr[] host_stages_c = new IntPtr[batches];
            for (int bat = 0; bat < batches; bat++)
            {
                host_stages_a[bat] = gpu.HostAllocate<float>(N);
                host_stages_b[bat] = gpu.HostAllocate<float>(N);
                host_stages_c[bat] = gpu.HostAllocate<float>(blocksPerGrid);
            }

            // Set GPU memory to zero
            gpu.Set(dev_a);
            gpu.Set(dev_b);
            gpu.Set(dev_partial_c);

            // NOTE(review): all streams share the same dev_a/dev_b/dev_partial_c
            // buffers, so concurrent batches overwrite each other's data. That is
            // acceptable for a throughput benchmark (inputs are identical) but
            // would be wrong for per-batch results — confirm intent.
            gpu.EnableSmartCopy();
            sw.Restart();
            for (int l = 0; l < loops; l++)
            {
                // Queue all the copying operations of the batch
                for (int bat = 0; bat < batches; bat++)
                {
                    // Finish processing the previous loop on CPU: wait for this
                    // stream's copy-back from the prior pass, then reduce it.
                    if (l > 0)
                    {
                        gpu.SynchronizeStream(bat + 1);
                        c = 0;
                        for (int i = 0; i < blocksPerGrid; i++)
                        {
                            c += partial_c[i];
                        }
                    }
                    gpu.CopyToDeviceAsync(a, 0, dev_a, 0, N, bat + 1, host_stages_a[bat]);
                }
                // All copies to the GPU are put into a queue, so the different stream id's are abstract only
                // We are guaranteed that all previous copies with same stream id will be completed first.
                for (int bat = 0; bat < batches; bat++)
                    gpu.CopyToDeviceAsync(b, 0, dev_b, 0, N, bat + 1, host_stages_b[bat]);
                // Launch the kernels. These have same stream id as the copies and will take place as soon as 
                // the copy to the GPU with same stream id is complete. Hence kernels may be running in parallel
                // with copies of higher stream id that are still running.
                for (int bat = 0; bat < batches; bat++)
                    gpu.LaunchAsync(blocksPerGrid, threadsPerBlock, bat + 1, "Dot", dev_a, dev_b, dev_partial_c);
                // Here we add to the copying from GPU queue. Copying will begin once the kernel with same stream
                // id is completed. If the GPU supports concurrent copying to and from at same time then the first 
                // copy from operations may be completed before all the copy to operations have.
                for (int bat = 0; bat < batches; bat++)
                    gpu.CopyFromDeviceAsync(dev_partial_c, 0, partial_c, 0, blocksPerGrid, bat + 1, host_stages_c[bat]);

            }
            // Finish processing the last loop on CPU
            for (int bat = 0; bat < batches; bat++)
            {
                gpu.SynchronizeStream(bat + 1);
                c = 0;
                for (int i = 0; i < blocksPerGrid; i++)
                {
                    c += partial_c[i];
                }
            }

            long asyncTime = sw.ElapsedMilliseconds;
            Console.WriteLine("Asynchronous Time: {0}", asyncTime);
            Console.WriteLine("Does GPU value {0} = {1}?\n", c, 2 * sum_squares((float)(N - 1)));
            gpu.DisableSmartCopy();

            // free memory on the gpu side
            gpu.FreeAll();

            // free memory on the cpu side (the pinned staging buffers)
            gpu.HostFreeAll();

            // let's try and do this on the CPU in a straightforward fashion
            sw.Restart();
            c = 0;
            for (int l = 0; l < loops; l++)
                for (int bat = 0; bat < batches; bat++)
                    c = DotProduct(a, b);
            long cpuTime = sw.ElapsedMilliseconds;
            Console.WriteLine("CPU Time: {0}", cpuTime);
            Console.WriteLine("Does CPU value {0} = {1}?\n", c, 2 * sum_squares((float)(N - 1)));

            // let's try and do this on the CPU with Linq
            sw.Restart();
            c = 0;
            for (int l = 0; l < loops; l++)
                for (int bat = 0; bat < batches; bat++)
                    c = DotProductLinq(a, b);
            long cpuLinqTime = sw.ElapsedMilliseconds;
            Console.WriteLine("CPU Linq Time: {0}", cpuLinqTime);
            Console.WriteLine("Does CPU value {0} = {1}?\n", c, 2 * sum_squares((float)(N - 1)));

            // let's try and do this on the CPU with multiple threads:
            // keep one in-flight async delegate per batch slot, harvesting the
            // previous invocation's result before starting the next.
            DotProductDelegate dlgt = new DotProductDelegate(DotProduct);
            IAsyncResult[] res = new IAsyncResult[batches];
            for (int bat = 0; bat < batches; bat++)
                res[bat] = null;
            sw.Restart();
            c = 0;
            for (int l = 0; l < loops; l++)
                for (int bat = 0; bat < batches; bat++)
                {
                    if (res[bat] != null)
                        c = dlgt.EndInvoke(res[bat]);
                    res[bat] = dlgt.BeginInvoke(a, b, null, null);
                }
            // Harvest the invocations still outstanding after the final pass.
            for (int bat = 0; bat < batches; bat++)
                if (res[bat] != null)
                    c = dlgt.EndInvoke(res[bat]);
            long cpuMultiTime = sw.ElapsedMilliseconds;
            Console.WriteLine("CPU Multi Time: {0}", cpuMultiTime);
            Console.WriteLine("Does CPU value {0} = {1}?\n", c, 2 * sum_squares((float)(N - 1)));
        }
Example 2
        /// <summary>
        /// Compares dot-product throughput across four strategies: a blocking
        /// GPU path, a streamed GPU path using pinned host staging buffers, a
        /// plain CPU loop, a LINQ CPU version, and async delegate invocations.
        /// Each strategy prints its elapsed time and a sanity check of the
        /// result against 2 * sum_squares(N - 1). Depends on members declared
        /// elsewhere in this class: N, blocksPerGrid, threadsPerBlock,
        /// sum_squares, DotProduct, DotProductLinq, DotProductDelegate.
        /// </summary>
        public static void Execute()
        {
            // Reuse the cached Cudafy module when its checksums are valid;
            // otherwise translate the kernels again and persist the result.
            CudafyModule module = CudafyModule.TryDeserialize();
            if (module == null || !module.TryVerifyChecksums())
            {
                module = CudafyTranslator.Cudafy();
                module.Serialize();
            }

            GPGPU device = CudafyHost.GetDevice(CudafyModes.Target);
            device.LoadModule(module);

            // Result slot shared by every strategy below.
            float result = 0;

            const int passCount  = 20;   // timing passes per strategy
            const int batchCount = 12;   // work items (stream ids) per pass

            // Host-side buffers.
            float[] hostA       = new float[N];
            float[] hostB       = new float[N];
            float[] partialSums = new float[blocksPerGrid];

            // Device-side buffers.
            float[] deviceA        = device.Allocate<float>(N);
            float[] deviceB        = device.Allocate<float>(N);
            float[] devicePartials = device.Allocate<float>(blocksPerGrid);

            // Allocated but never used below; kept for parity with the sample.
            float[] deviceScratch = device.Allocate<float>(blocksPerGrid * blocksPerGrid);

            // Seed the host inputs.
            for (int idx = 0; idx < N; idx++)
            {
                hostA[idx] = idx;
                hostB[idx] = idx * 2;
            }

            // --- Synchronous implementation: block on every transfer/launch ---
            Stopwatch stopwatch = Stopwatch.StartNew();
            for (int pass = 0; pass < passCount; pass++)
            {
                for (int batch = 0; batch < batchCount; batch++)
                {
                    // Push both inputs, run the kernel, pull the partial sums.
                    device.CopyToDevice(hostA, deviceA);
                    device.CopyToDevice(hostB, deviceB);

                    device.Launch(blocksPerGrid, threadsPerBlock, "Dot", deviceA, deviceB, devicePartials);

                    device.CopyFromDevice(devicePartials, partialSums);

                    // Reduce the per-block partial sums on the CPU.
                    result = 0;
                    foreach (float partial in partialSums)
                    {
                        result += partial;
                    }
                }
            }
            long syncTime = stopwatch.ElapsedMilliseconds;

            Console.WriteLine("Synchronous Time: {0}", syncTime);
            Console.WriteLine("Does GPU value {0} = {1}?\n", result, 2 * sum_squares((float)(N - 1)));

            // --- Asynchronous implementation with pinned staging memory ---
            IntPtr[] pinnedA = new IntPtr[batchCount];
            IntPtr[] pinnedB = new IntPtr[batchCount];
            IntPtr[] pinnedC = new IntPtr[batchCount];
            for (int batch = 0; batch < batchCount; batch++)
            {
                pinnedA[batch] = device.HostAllocate<float>(N);
                pinnedB[batch] = device.HostAllocate<float>(N);
                pinnedC[batch] = device.HostAllocate<float>(blocksPerGrid);
            }

            // Zero the device buffers before streaming into them.
            device.Set(deviceA);
            device.Set(deviceB);
            device.Set(devicePartials);

            device.EnableSmartCopy();
            stopwatch.Restart();
            for (int pass = 0; pass < passCount; pass++)
            {
                // Queue the first input on each stream.
                for (int batch = 0; batch < batchCount; batch++)
                {
                    // After the first pass, drain this stream's previous result
                    // before queuing more work on it.
                    if (pass > 0)
                    {
                        device.SynchronizeStream(batch + 1);
                        result = 0;
                        foreach (float partial in partialSums)
                        {
                            result += partial;
                        }
                    }
                    device.CopyToDeviceAsync(hostA, 0, deviceA, 0, N, batch + 1, pinnedA[batch]);
                }
                // Queue the second input; copies sharing a stream id are
                // guaranteed to complete in submission order.
                for (int batch = 0; batch < batchCount; batch++)
                {
                    device.CopyToDeviceAsync(hostB, 0, deviceB, 0, N, batch + 1, pinnedB[batch]);
                }
                // Queue the kernels; each starts once its stream's copies are
                // done, so kernels may overlap copies on higher stream ids.
                for (int batch = 0; batch < batchCount; batch++)
                {
                    device.LaunchAsync(blocksPerGrid, threadsPerBlock, batch + 1, "Dot", deviceA, deviceB, devicePartials);
                }
                // Queue the copies back to the host; each starts once the kernel
                // on the same stream id has finished. Devices that support
                // concurrent bidirectional copy may overlap these with uploads.
                for (int batch = 0; batch < batchCount; batch++)
                {
                    device.CopyFromDeviceAsync(devicePartials, 0, partialSums, 0, blocksPerGrid, batch + 1, pinnedC[batch]);
                }
            }
            // Drain the final pass on the CPU.
            for (int batch = 0; batch < batchCount; batch++)
            {
                device.SynchronizeStream(batch + 1);
                result = 0;
                foreach (float partial in partialSums)
                {
                    result += partial;
                }
            }

            long asyncTime = stopwatch.ElapsedMilliseconds;

            Console.WriteLine("Asynchronous Time: {0}", asyncTime);
            Console.WriteLine("Does GPU value {0} = {1}?\n", result, 2 * sum_squares((float)(N - 1)));
            device.DisableSmartCopy();

            // Release all device allocations.
            device.FreeAll();

            // Release the pinned host staging buffers.
            device.HostFreeAll();

            // --- Plain CPU baseline ---
            stopwatch.Restart();
            result = 0;
            for (int pass = 0; pass < passCount; pass++)
            {
                for (int batch = 0; batch < batchCount; batch++)
                {
                    result = DotProduct(hostA, hostB);
                }
            }
            long cpuTime = stopwatch.ElapsedMilliseconds;

            Console.WriteLine("CPU Time: {0}", cpuTime);
            Console.WriteLine("Does CPU value {0} = {1}?\n", result, 2 * sum_squares((float)(N - 1)));

            // --- CPU baseline via LINQ ---
            stopwatch.Restart();
            result = 0;
            for (int pass = 0; pass < passCount; pass++)
            {
                for (int batch = 0; batch < batchCount; batch++)
                {
                    result = DotProductLinq(hostA, hostB);
                }
            }
            long cpuLinqTime = stopwatch.ElapsedMilliseconds;

            Console.WriteLine("CPU Linq Time: {0}", cpuLinqTime);
            Console.WriteLine("Does CPU value {0} = {1}?\n", result, 2 * sum_squares((float)(N - 1)));

            // --- CPU via async delegate invocations: one in-flight call per
            // batch slot, harvesting the previous result before re-issuing ---
            DotProductDelegate worker = new DotProductDelegate(DotProduct);

            IAsyncResult[] pending = new IAsyncResult[batchCount];
            for (int batch = 0; batch < batchCount; batch++)
            {
                pending[batch] = null;
            }
            stopwatch.Restart();
            result = 0;
            for (int pass = 0; pass < passCount; pass++)
            {
                for (int batch = 0; batch < batchCount; batch++)
                {
                    if (pending[batch] != null)
                    {
                        result = worker.EndInvoke(pending[batch]);
                    }
                    pending[batch] = worker.BeginInvoke(hostA, hostB, null, null);
                }
            }
            // Harvest whatever is still outstanding after the last pass.
            for (int batch = 0; batch < batchCount; batch++)
            {
                if (pending[batch] != null)
                {
                    result = worker.EndInvoke(pending[batch]);
                }
            }
            long cpuMultiTime = stopwatch.ElapsedMilliseconds;

            Console.WriteLine("CPU Multi Time: {0}", cpuMultiTime);
            Console.WriteLine("Does CPU value {0} = {1}?\n", result, 2 * sum_squares((float)(N - 1)));
        }