Example #1
        /// <summary>
        /// Measures transfer time between pinned (page-locked) host memory and the
        /// device using asynchronous copies on the default stream.
        /// </summary>
        /// <param name="size">Number of ints per buffer.</param>
        /// <param name="up">true = host-to-device copies; false = device-to-host copies.</param>
        /// <returns>Elapsed time in milliseconds as reported by the GPU timer.</returns>
        private float cuda_host_alloc_copy_test(int size, bool up)
        {
            IntPtr a = _gpu.HostAllocate <int>(size);
            IntPtr b = _gpu.HostAllocate <int>(size);

            int[] dev_a  = _gpu.Allocate <int>(size);
            int[] host_a = new int[size];
            _gpu.StartTimer();

            for (int i = 0; i < 50; i++) // two copies per iteration = 100 copies total
            {
                if (up)
                {
                    a.Write(host_a);
                    _gpu.CopyToDeviceAsync(a, 0, dev_a, 0, size);
                    b.Write(host_a);
                    _gpu.CopyToDeviceAsync(b, 0, dev_a, 0, size);
                }
                else
                {
                    _gpu.CopyFromDeviceAsync(dev_a, 0, a, 0, size);
                    a.Read(host_a); // fixed: was b.Read(host_a) — buffer 'a' was never read back
                    _gpu.CopyFromDeviceAsync(dev_a, 0, b, 0, size);
                    b.Read(host_a);
                }
            }
            // Wait for all queued async copies before stopping the timer.
            _gpu.SynchronizeStream();

            float elapsedTime = _gpu.StopTimer();

            _gpu.FreeAll();
            _gpu.HostFree(a);
            _gpu.HostFree(b);
            GC.Collect();
            return(elapsedTime);
        }
Example #2
        /// <summary>
        /// Benchmarks the standard synchronous copy/launch/copy path against the
        /// smart-copy path (pinned staging buffers + per-batch streams), then
        /// verifies that the "DoubleAllValues" kernel doubled every element.
        /// </summary>
        public void Test_smartCopyToDevice()
        {
            if (_gpu is OpenCLDevice)
            {
                Console.WriteLine("Device not supporting smart copy, so skip.");
                return;
            }
            var mod = CudafyModule.TryDeserialize();

            if (mod == null || !mod.TryVerifyChecksums())
            {
                // No cached module (or stale checksums): re-translate and cache.
                mod = CudafyTranslator.Cudafy(CudafyModes.Architecture);
                mod.Serialize();
            }
            _gpu.LoadModule(mod);
            _gpuuintBufferIn  = _gpu.Allocate <uint>(N);
            _gpuuintBufferOut = _gpu.Allocate <uint>(N);
            int       batchSize = 8;
            int       loops     = 6;
            Stopwatch sw        = Stopwatch.StartNew();

            // Baseline: fully synchronous copies and launches.
            for (int x = 0; x < loops; x++)
            {
                for (int i = 0; i < batchSize; i++)
                {
                    _gpu.CopyToDevice(_uintBufferIn, 0, _gpuuintBufferIn, 0, N);
                    _gpu.Launch(N / 512, 512, "DoubleAllValues", _gpuuintBufferIn, _gpuuintBufferOut);
                    _gpu.CopyFromDevice(_gpuuintBufferOut, 0, _uintBufferOut, 0, N);
                }
            }
            long time = sw.ElapsedMilliseconds;

            Console.WriteLine(time);

            // Smart-copy path: one pinned staging buffer per stream, so copies
            // can be queued asynchronously.
            // NOTE(review): the staging buffers are never explicitly HostFree'd
            // here — presumably ClearOutputsAndGPU handles cleanup; confirm.
            IntPtr[] stagingPostIn  = new IntPtr[batchSize];
            IntPtr[] stagingPostOut = new IntPtr[batchSize];
            for (int i = 0; i < batchSize; i++)
            {
                stagingPostIn[i]  = _gpu.HostAllocate <uint>(N);
                stagingPostOut[i] = _gpu.HostAllocate <uint>(N);
            }
            _gpu.EnableSmartCopy();
            sw.Restart();
            for (int x = 0; x < loops; x++)
            {
                for (int i = 0; i < batchSize; i++)
                {
                    _gpu.CopyToDeviceAsync(_uintBufferIn, 0, _gpuuintBufferIn, 0, N, i + 1, stagingPostIn[i]);
                }
                for (int i = 0; i < batchSize; i++)
                {
                    _gpu.LaunchAsync(N / 256, 256, i + 1, "DoubleAllValues", _gpuuintBufferIn, _gpuuintBufferOut);
                }
                for (int i = 0; i < batchSize; i++)
                {
                    _gpu.CopyFromDeviceAsync(_gpuuintBufferOut, 0, _uintBufferOut, 0, N, i + 1, stagingPostOut[i]);
                }
                // Wait for every stream to drain. (This loop previously appeared
                // twice back-to-back, a leftover from a commented-out variant;
                // the redundant second pass has been removed.)
                for (int i = 0; i < batchSize; i++)
                {
                    _gpu.SynchronizeStream(i + 1);
                }
            }

            time = sw.ElapsedMilliseconds;
            Console.WriteLine(time);
            _gpu.DisableSmartCopy();

            // The kernel doubles every value, so doubling the input on the host
            // must reproduce the output buffer exactly.
            for (int i = 0; i < N; i++)
            {
                _uintBufferIn[i] *= 2;
            }
            Assert.IsTrue(Compare(_uintBufferIn, _uintBufferOut));

            ClearOutputsAndGPU();
        }
        /// <summary>
        /// Demonstrates overlapping host/device transfers with kernel execution
        /// using two CUDA streams: each loop iteration feeds one N-element chunk
        /// to stream 1 and the next chunk to stream 2, all through pinned memory.
        /// </summary>
        public static void Execute()
        {
            CudafyModule km = CudafyTranslator.Cudafy();

            GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId);

            gpu.LoadModule(km);

            int[] dev_a0, dev_b0, dev_c0; // chunk buffers for stream 1
            int[] dev_a1, dev_b1, dev_c1; // chunk buffers for stream 2

            // allocate the memory on the GPU
            dev_a0 = gpu.Allocate <int>(N);
            dev_b0 = gpu.Allocate <int>(N);
            dev_c0 = gpu.Allocate <int>(N);
            dev_a1 = gpu.Allocate <int>(N);
            dev_b1 = gpu.Allocate <int>(N);
            dev_c1 = gpu.Allocate <int>(N);

            // allocate host locked (pinned) memory, required for async streaming
            IntPtr host_aPtr = gpu.HostAllocate <int>(FULL_DATA_SIZE);
            IntPtr host_bPtr = gpu.HostAllocate <int>(FULL_DATA_SIZE);
            IntPtr host_cPtr = gpu.HostAllocate <int>(FULL_DATA_SIZE);

            Random rand = new Random();

            for (int i = 0; i < FULL_DATA_SIZE; i++)
            {
                host_aPtr.Set(i, rand.Next(1024 * 1024));  // There will be differences between the .NET code and the GPU
                host_bPtr.Set(i, rand.Next(1024 * 1024));  // So let's keep these to a minimum by having a max random values.
            }

            // start timer
            gpu.StartTimer();

            // now loop over full data, in bite-sized chunks
            for (int i = 0; i < FULL_DATA_SIZE; i += N * 2)
            {
                gpu.CopyToDeviceAsync(host_aPtr, i, dev_a0, 0, N, 1);
                gpu.CopyToDeviceAsync(host_bPtr, i, dev_b0, 0, N, 2);
                gpu.CopyToDeviceAsync(host_aPtr, i + N, dev_a1, 0, N, 1);
                gpu.CopyToDeviceAsync(host_bPtr, i + N, dev_b1, 0, N, 2);
                gpu.LaunchAsync(N / 256, 256, 1, "thekernel", dev_a0, dev_b0, dev_c0);
                gpu.LaunchAsync(N / 256, 256, 2, "thekernel", dev_a1, dev_b1, dev_c1);
                gpu.CopyFromDeviceAsync(dev_c0, 0, host_cPtr, i, N, 1);
                gpu.CopyFromDeviceAsync(dev_c1, 0, host_cPtr, i + N, N, 2);
            }
            gpu.SynchronizeStream(1);
            gpu.SynchronizeStream(2);

            float elapsed = gpu.StopTimer();

            // verify: copy the pinned buffers into ordinary arrays for CPU checking
            int[] host_a = new int[FULL_DATA_SIZE];
            int[] host_b = new int[FULL_DATA_SIZE];
            int[] host_c = new int[FULL_DATA_SIZE];

            GPGPU.CopyOnHost(host_aPtr, 0, host_a, 0, FULL_DATA_SIZE);
            GPGPU.CopyOnHost(host_bPtr, 0, host_b, 0, FULL_DATA_SIZE);
            GPGPU.CopyOnHost(host_cPtr, 0, host_c, 0, FULL_DATA_SIZE);
            Console.WriteLine("Elapsed: {0} ms", elapsed);

            int[] host_d = new int[FULL_DATA_SIZE];
            int   errors = 0;
            int   id     = 0;

            // NOTE(review): only the first N of FULL_DATA_SIZE elements are
            // verified (id stops at N); the bare block suggests an outer chunk
            // loop was removed — confirm whether that was intentional.
            {
                for (int j = 0; j < N; j++, id++)
                {
                    control(id, j, host_a, host_b, host_d);
                    if (host_c[id] > host_d[id] + 1) // There will be differences between the .NET code and the GPU
                    {
                        Console.WriteLine("Mismatch at {0}: {1} != {2}", id, host_c[id], host_d[id]);
                        errors++;
                        if (errors > 8)
                        {
                            break;
                        }
                    }
                }
            }

            // Release the six device buffers (previously leaked), then the
            // pinned host buffers and the two streams.
            gpu.FreeAll();
            gpu.HostFree(host_aPtr);
            gpu.HostFree(host_bPtr);
            gpu.HostFree(host_cPtr);
            gpu.DestroyStream(1);
            gpu.DestroyStream(2);
        }
        /// <summary>
        /// Dot-product benchmark: compares a synchronous GPU implementation, an
        /// asynchronous smart-copy GPU implementation, and three CPU variants
        /// (plain loop, LINQ, multi-threaded delegates), printing the timings
        /// and the computed value against the analytic result.
        /// </summary>
        public static void Execute()
        {
            CudafyModule km = CudafyModule.TryDeserialize();

            if (km == null || !km.TryVerifyChecksums())
            {
                // No cached module (or stale checksums): re-translate and cache.
                km = CudafyTranslator.Cudafy();
                km.Serialize();
            }

            GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target);

            gpu.LoadModule(km);

            float c = 0;

            int loops   = 20;
            int batches = 12;

            // allocate memory on the cpu side
            float[] a         = new float[N];
            float[] b         = new float[N];
            float[] partial_c = new float[blocksPerGrid];

            // allocate the memory on the GPU
            float[] dev_a         = gpu.Allocate <float>(N);
            float[] dev_b         = gpu.Allocate <float>(N);
            float[] dev_partial_c = gpu.Allocate <float>(blocksPerGrid);

            // NOTE(review): dev_test appears unused below — confirm it is not
            // required by the "Dot" kernel before removing.
            float[] dev_test = gpu.Allocate <float>(blocksPerGrid * blocksPerGrid);

            // fill in the host memory with data
            for (int i = 0; i < N; i++)
            {
                a[i] = i;
                b[i] = i * 2;
            }

            // Synchronous Implementation
            Stopwatch sw = Stopwatch.StartNew();

            for (int l = 0; l < loops; l++)
            {
                for (int bat = 0; bat < batches; bat++)
                {
                    // copy the arrays 'a' and 'b' to the GPU
                    gpu.CopyToDevice(a, dev_a);
                    gpu.CopyToDevice(b, dev_b);

                    gpu.Launch(blocksPerGrid, threadsPerBlock, "Dot", dev_a, dev_b, dev_partial_c);

                    // copy the array 'c' back from the GPU to the CPU
                    gpu.CopyFromDevice(dev_partial_c, partial_c);
                    // finish up on the CPU side: reduce the per-block partial sums
                    c = 0;
                    for (int i = 0; i < blocksPerGrid; i++)
                    {
                        c += partial_c[i];
                    }
                }
            }
            long syncTime = sw.ElapsedMilliseconds;

            Console.WriteLine("Synchronous Time: {0}", syncTime);
            Console.WriteLine("Does GPU value {0} = {1}?\n", c, 2 * sum_squares((float)(N - 1)));

            // Asynchronous Pinned Memory Implementation
            // (one pinned staging buffer per batch/stream)
            IntPtr[] host_stages_a = new IntPtr[batches];
            IntPtr[] host_stages_b = new IntPtr[batches];
            IntPtr[] host_stages_c = new IntPtr[batches];
            for (int bat = 0; bat < batches; bat++)
            {
                host_stages_a[bat] = gpu.HostAllocate <float>(N);
                host_stages_b[bat] = gpu.HostAllocate <float>(N);
                host_stages_c[bat] = gpu.HostAllocate <float>(blocksPerGrid);
            }

            // Set GPU memory to zero
            gpu.Set(dev_a);
            gpu.Set(dev_b);
            gpu.Set(dev_partial_c);

            // NOTE(review): every stream copies into/out of the SAME dev_a,
            // dev_b, dev_partial_c and partial_c buffers — this measures
            // pipelining throughput only; the per-batch results overwrite each
            // other. Confirm that is the intent of the benchmark.
            gpu.EnableSmartCopy();
            sw.Restart();
            for (int l = 0; l < loops; l++)
            {
                // Queue all the copying operations of the batch
                for (int bat = 0; bat < batches; bat++)
                {
                    // Finish processing the previous loop on CPU
                    if (l > 0)
                    {
                        gpu.SynchronizeStream(bat + 1);
                        c = 0;
                        for (int i = 0; i < blocksPerGrid; i++)
                        {
                            c += partial_c[i];
                        }
                    }
                    gpu.CopyToDeviceAsync(a, 0, dev_a, 0, N, bat + 1, host_stages_a[bat]);
                }
                // All copies to the GPU are put into a queue, so the different stream id's are abstract only
                // We are guaranteed that all previous copies with same stream id will be completed first.
                for (int bat = 0; bat < batches; bat++)
                {
                    gpu.CopyToDeviceAsync(b, 0, dev_b, 0, N, bat + 1, host_stages_b[bat]);
                }
                // Launch the kernels. These have same stream id as the copies and will take place as soon as
                // the copy to the GPU with same stream id is complete. Hence kernels may be running in parallel
                // with copies of higher stream id that are still running.
                for (int bat = 0; bat < batches; bat++)
                {
                    gpu.LaunchAsync(blocksPerGrid, threadsPerBlock, bat + 1, "Dot", dev_a, dev_b, dev_partial_c);
                }
                // Here we add to the copying from GPU queue. Copying will begin once the kernel with same stream
                // id is completed. If the GPU supports concurrent copying to and from at same time then the first
                // copy from operations may be completed before all the copy to operations have.
                for (int bat = 0; bat < batches; bat++)
                {
                    gpu.CopyFromDeviceAsync(dev_partial_c, 0, partial_c, 0, blocksPerGrid, bat + 1, host_stages_c[bat]);
                }
            }
            // Finish processing the last loop on CPU
            for (int bat = 0; bat < batches; bat++)
            {
                gpu.SynchronizeStream(bat + 1);
                c = 0;
                for (int i = 0; i < blocksPerGrid; i++)
                {
                    c += partial_c[i];
                }
            }

            long asyncTime = sw.ElapsedMilliseconds;

            Console.WriteLine("Asynchronous Time: {0}", asyncTime);
            Console.WriteLine("Does GPU value {0} = {1}?\n", c, 2 * sum_squares((float)(N - 1)));
            gpu.DisableSmartCopy();

            // free memory on the gpu side
            gpu.FreeAll();

            // free memory on the cpu side
            gpu.HostFreeAll();

            // let's try and do this on the CPU in a straightforward fashion
            sw.Restart();
            c = 0;
            for (int l = 0; l < loops; l++)
            {
                for (int bat = 0; bat < batches; bat++)
                {
                    c = DotProduct(a, b);
                }
            }
            long cpuTime = sw.ElapsedMilliseconds;

            Console.WriteLine("CPU Time: {0}", cpuTime);
            Console.WriteLine("Does CPU value {0} = {1}?\n", c, 2 * sum_squares((float)(N - 1)));

            // let's try and do this on the CPU with Linq
            sw.Restart();
            c = 0;
            for (int l = 0; l < loops; l++)
            {
                for (int bat = 0; bat < batches; bat++)
                {
                    c = DotProductLinq(a, b);
                }
            }
            long cpuLinqTime = sw.ElapsedMilliseconds;

            Console.WriteLine("CPU Linq Time: {0}", cpuLinqTime);
            Console.WriteLine("Does CPU value {0} = {1}?\n", c, 2 * sum_squares((float)(N - 1)));

            // let's try and do this on the CPU with multiple threads
            // NOTE(review): BeginInvoke/EndInvoke on delegates is only supported
            // on .NET Framework (throws PlatformNotSupportedException on
            // .NET Core/.NET 5+) — confirm the target runtime.
            DotProductDelegate dlgt = new DotProductDelegate(DotProduct);

            IAsyncResult[] res = new IAsyncResult[batches];
            for (int bat = 0; bat < batches; bat++)
            {
                res[bat] = null;
            }
            sw.Restart();
            c = 0;
            for (int l = 0; l < loops; l++)
            {
                for (int bat = 0; bat < batches; bat++)
                {
                    // Harvest the previous result for this slot before reusing it.
                    if (res[bat] != null)
                    {
                        c = dlgt.EndInvoke(res[bat]);
                    }
                    res[bat] = dlgt.BeginInvoke(a, b, null, null);
                }
            }
            // Drain any still-pending invocations from the final loop.
            for (int bat = 0; bat < batches; bat++)
            {
                if (res[bat] != null)
                {
                    c = dlgt.EndInvoke(res[bat]);
                }
            }
            long cpuMultiTime = sw.ElapsedMilliseconds;

            Console.WriteLine("CPU Multi Time: {0}", cpuMultiTime);
            Console.WriteLine("Does CPU value {0} = {1}?\n", c, 2 * sum_squares((float)(N - 1)));
        }