Ejemplo n.º 1
0
        /// <summary>
        /// Runs <c>ClassifyVoxel</c> once per voxel index on the default GPU and stores the
        /// per-voxel output in <c>voxelVerts</c>.
        /// </summary>
        /// <remarks>
        /// Converts <c>samplePoints</c> into <c>samplePts</c>, uploads the sample points and the
        /// constant parameters, launches the kernel over <c>[0, numVoxels)</c> and copies the
        /// result back to the host. Device buffers are released even when a copy or the kernel throws.
        /// </remarks>
        public void runClassifyVoxel()
        {
            var gpu = Gpu.Default;

            samplePts = ConvertPointsToFloat3(samplePoints);

            // Declared up front so the finally block can release whatever was allocated.
            int[] d_voxelVerts = null;
            float3[] d_samplePts = null;

            try
            {
                // Device-side output (one int per voxel) and input (sample points).
                d_voxelVerts = gpu.Allocate<int>(numVoxels);
                d_samplePts = gpu.Allocate<float3>(samplePts);

                // Upload the constant values read by the kernel.
                float3 baseP = new float3((float)basePoint.X, (float)basePoint.Y, (float)basePoint.Z);

                gpu.Copy(isoValue, constIsovalue);
                gpu.Copy(fusion, constFusion);
                gpu.Copy(baseP, constBasePoint);
                gpu.Copy(voxelSize, constVoxelSize);
                gpu.Copy(gridSize, constGridSize);
                gpu.Copy(Tables.VertsTable, verticesTable);

                gpu.For(0, numVoxels, i => ClassifyVoxel(i, d_samplePts, d_voxelVerts));
                voxelVerts = Gpu.CopyToHost(d_voxelVerts);
            }
            finally
            {
                if (d_samplePts != null)
                {
                    Gpu.Free(d_samplePts);
                }

                if (d_voxelVerts != null)
                {
                    Gpu.Free(d_voxelVerts);
                }
            }
        }
 /// <summary>
 /// Passes <c>fromArr</c>, <c>toArr</c>, <c>fromCacheArr</c> and <c>toCacheArr</c> to <c>Gpu.Free</c>.
 /// </summary>
 /// <remarks>
 /// NOTE(review): the fields are not reset afterwards, so calling this twice frees the same
 /// arrays again — confirm whether <c>Gpu.Free</c> tolerates that.
 /// </remarks>
 public void Dispose()
 {
     Gpu.Free(fromArr);
     Gpu.Free(toArr);
     Gpu.Free(fromCacheArr);
     Gpu.Free(toCacheArr);
 }
Ejemplo n.º 3
0
        /// <summary>
        /// Shades every pixel on the GPU, writes the clamped colour into <c>_pixelBufferImg</c>
        /// and then writes that buffer into <c>Bitmap</c> on the application dispatcher.
        /// </summary>
        /// <param name="color">Per-pixel colour buffer, three floats per pixel; modified in place and freed here.</param>
        /// <param name="normal">Buffer forwarded to <c>ColorPass</c>; freed here.</param>
        /// <param name="p3D">Buffer forwarded to <c>ColorPass</c>; freed here.</param>
        /// <param name="tex">Buffer forwarded to <c>ColorPass</c>; freed here.</param>
        private async void MergeBuffersGPU(float[] color, float[] normal, float[] p3D, float[] tex)
        {
            _pixelBufferImg.Clear((byte)0);

            var specular = Settings.Specular;

            Gpu.Default.For(0, _imgHeight * _imgWidth, i =>
            {
                // Each pixel owns three consecutive entries starting at k.
                var k = i * 3;

                ColorPass(_enviromentLight, color, normal, p3D, tex, specular, k);

                ArrayMath.Clamp(color, 0, 255, k, 3);

                _pixelBufferImg[k + 0] = (byte)(color[k + 0]);
                _pixelBufferImg[k + 1] = (byte)(color[k + 1]);
                _pixelBufferImg[k + 2] = (byte)(color[k + 2]);
            });

            Gpu.Free(color);
            Gpu.Free(normal);
            Gpu.Free(p3D);
            Gpu.Free(tex);

            // `await App.Current?.Dispatcher.InvokeAsync(...)` evaluates to `await null` when
            // App.Current is null, which throws NullReferenceException. In an async void method
            // that exception cannot be observed by the caller, so skip the UI update instead.
            var dispatcher = App.Current?.Dispatcher;
            if (dispatcher == null)
            {
                return;
            }

            await dispatcher.InvokeAsync(() =>
            {
                if (Bitmap == null)
                {
                    return;
                }
                WriteToBitmap(Bitmap, _pixelBufferImg);
            });
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Launches <c>ComputeDistancesKernel</c> for the query descriptor against
        /// <c>subDatasetGpu</c> and returns the contents of <c>distancesGpu</c> copied to the host.
        /// </summary>
        /// <param name="query">Item whose <c>Descriptor</c> is uploaded as the query vector.</param>
        /// <returns>Host copy of <c>distancesGpu</c> after the kernel has run.</returns>
        /// <remarks>
        /// The GPU work runs on a task under <c>gpuLock</c>, and this method blocks on its result,
        /// so failures surface as <see cref="AggregateException"/>. <c>distancesGpu</c> is declared
        /// outside this method and is not freed here.
        /// </remarks>
        private float[] ComputeDistancesDatasetGPU(Item query)
        {
            Task<float[]> task = Task<float[]>.Factory.StartNew(() =>
            {
                lock (gpuLock)
                {
                    float[] queryGpu = Gpu.Default.Allocate(query.Descriptor);
                    try
                    {
                        Gpu.Default.Launch<float[], float[][], float[], Func<float[], float[], float>>
                            (ComputeDistancesKernel, launchParam, queryGpu, subDatasetGpu, distancesGpu, Item.GetDistanceSQR);

                        return Gpu.CopyToHost(distancesGpu);
                    }
                    finally
                    {
                        // Free the query buffer only after the results have been copied back,
                        // and also when the launch or the copy throws.
                        Gpu.Free(queryGpu);
                    }
                }
            });

            return task.Result;
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Times ten runs of <c>TestSummGPU</c> over two 32,000,000-element arrays of ones and
        /// prints the elapsed milliseconds and the returned value for each run.
        /// </summary>
        /// <returns>Always 0.</returns>
        private static int TestGPU()
        {
            var length = 32_000_000;
            var gpu    = Gpu.Default;
            var a1     = Enumerable.Repeat(1, length).ToArray();
            var a2     = Enumerable.Repeat(1, length).ToArray();
            var r      = new int[length];

            int[] arg1   = null;
            int[] arg2   = null;
            int[] result = null;

            try
            {
                arg1   = gpu.Allocate<int>(a1);
                arg2   = gpu.Allocate<int>(a2);
                result = gpu.Allocate<int>(r);

                for (var i = 0; i < 10; i++)
                {
                    var sw = Stopwatch.StartNew();
                    var s = TestSummGPU(gpu, length, arg1, arg2, result);
                    sw.Stop();

                    // ElapsedMilliseconds is the total elapsed time; Elapsed.Milliseconds is only
                    // the 0-999 component and under-reports any run longer than one second.
                    Console.WriteLine($"GPU1: {sw.ElapsedMilliseconds} Summ: {s}");
                    Console.WriteLine();
                }
            }
            finally
            {
                if (arg1 != null)
                {
                    Gpu.Free(arg1);
                }

                if (arg2 != null)
                {
                    Gpu.Free(arg2);
                }

                if (result != null)
                {
                    Gpu.Free(result);
                }
            }

            return(0);
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Drops empty voxels and builds the bookkeeping arrays for the active ones.
        /// </summary>
        /// <remarks>
        /// Fills <c>index_voxelActive</c> (indices of voxels with <c>voxelVerts[i] &gt; 0</c>),
        /// <c>num_voxelActive</c>, <c>verts_voxelActive</c> (vertex count per active voxel),
        /// <c>verts_scanIdx</c> (start offset of each active voxel's vertices) and <c>sumVerts</c>
        /// (total vertex count).
        /// </remarks>
        /// <exception cref="Exception">Thrown when no voxel is active.</exception>
        public void runExtractActiveVoxels()
        {
            var gpu = Gpu.Default;

            // Collect the indices of all voxels that produce at least one vertex.
            var index_voxelActiveList = new List<int>();

            for (int i = 0; i < voxelVerts.Length; i++)
            {
                if (voxelVerts[i] > 0)
                {
                    index_voxelActiveList.Add(i);
                }
            }

            index_voxelActive = index_voxelActiveList.ToArray();
            num_voxelActive   = index_voxelActive.Length;
            verts_voxelActive = new int[num_voxelActive];
            sumVerts          = 0;

            // Fail before touching the GPU. Previously this case was only detected by catching
            // the index exception from verts_scanIdx[0], after device buffers had been allocated,
            // and those buffers were never freed on that path.
            if (num_voxelActive == 0)
            {
                verts_scanIdx = new int[0];
                throw new Exception("No eligible isosurface can be extracted, please change isovalue.");
            }

            Parallel.For(0, num_voxelActive, i =>
            {
                verts_voxelActive[i] = voxelVerts[index_voxelActive[i]];
            });

            // Prefix-sum the per-voxel vertex counts on the GPU.
            var op = new Func<int, int, int>((a, b) => { return(a + b); });

            Alea.Session session = new Alea.Session(gpu);
            int[] d_verts_voxelActive = null;
            int[] d_voxelVertsScan    = null;
            int[] result_Scan;

            try
            {
                d_verts_voxelActive = Gpu.Default.Allocate<int>(verts_voxelActive);
                d_voxelVertsScan    = Gpu.Default.Allocate<int>(num_voxelActive);

                GpuExtension.Scan<int>(session, d_voxelVertsScan, d_verts_voxelActive, 0, op, 0);

                result_Scan = Gpu.CopyToHost(d_voxelVertsScan);
            }
            finally
            {
                if (d_verts_voxelActive != null)
                {
                    Gpu.Free(d_verts_voxelActive);
                }

                if (d_voxelVertsScan != null)
                {
                    Gpu.Free(d_voxelVertsScan);
                }
            }

            // Shift the scan result right by one so entry i holds the sum of the counts before i;
            // entry 0 keeps its default value of 0.
            // NOTE(review): this assumes Scan returns inclusive prefix sums — confirm.
            verts_scanIdx = new int[num_voxelActive];

            for (int i = 1; i < num_voxelActive; i++)
            {
                verts_scanIdx[i] = result_Scan[i - 1];
            }

            // Total = offset of the last active voxel + its own vertex count.
            sumVerts = verts_scanIdx[num_voxelActive - 1] + verts_voxelActive[num_voxelActive - 1];
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Compares two equally sized dose arrays on the GPU and counts how many evaluated
        /// elements of <paramref name="target"/> fall outside the relative tolerance band
        /// around <paramref name="source"/>.
        /// </summary>
        /// <param name="source">Reference values.</param>
        /// <param name="target">Values to compare against the reference.</param>
        /// <param name="tolerance">Relative tolerance; the accepted band is <c>[(1 - tolerance) * source, (1 + tolerance) * source]</c>.</param>
        /// <param name="ThreshholdTol">Threshold; an element is evaluated only when both its source and target value exceed it.</param>
        /// <returns>A tuple of (number of failed elements, number of evaluated elements).</returns>
        /// <exception cref="ArgumentException">Thrown when the two arrays differ in length.</exception>
        public Tuple <int, int> CompareAbsoluteOpt(double[] source, double[] target, double tolerance, double ThreshholdTol)
        {
            System.Diagnostics.Debug.WriteLine("starting an absolute comparison on GPU");
            if (source.Length != target.Length)
            {
                throw new ArgumentException("The source and target lengths need to match");
            }

            int dimension = source.Length;

            // Nothing to evaluate: avoid zero-length device allocations.
            if (dimension == 0)
            {
                System.Diagnostics.Debug.WriteLine("finished an absolute comparison on GPU");
                return(new Tuple <int, int>(0, 0));
            }

            // NOTE(review): the original also computed source.Max() * ThreshholdTol
            // ("MinDoseEvaluated") but never used it; the threshold is applied as an absolute
            // value below. Confirm whether a threshold relative to the maximum was intended.
            double epsilon        = ThreshholdTol;
            double zero           = 0.0;
            double lowMultiplier  = (1 - tolerance);
            double highMultiplier = (1 + tolerance);
            int    failed;
            int    isCounted;
            Gpu    gpu = Gpu.Default;

            // TODO: should failure be -1?

            double[] sourceOnGPU     = gpu.Allocate(source);
            double[] targetOnGPU     = gpu.Allocate(target);
            double[] isCountedArray  = gpu.Allocate <double>(dimension);
            double[] sourceOnGPULow  = gpu.Allocate <double>(dimension);
            double[] sourceOnGPUHigh = gpu.Allocate <double>(dimension);
            double[] isGTtol         = gpu.Allocate <double>(dimension);

            try
            {
                // Zero every element whose source or target value is not above the threshold,
                // in both arrays, then flag the surviving elements as counted.
                gpu.For(0, dimension, i => sourceOnGPU[i]    = (sourceOnGPU[i] > epsilon) ? sourceOnGPU[i] : zero);
                gpu.For(0, dimension, i => targetOnGPU[i]    = (targetOnGPU[i] > epsilon) ? targetOnGPU[i] : zero);
                gpu.For(0, dimension, i => sourceOnGPU[i]    = (targetOnGPU[i] > epsilon) ? sourceOnGPU[i] : zero);
                gpu.For(0, dimension, i => targetOnGPU[i]    = (sourceOnGPU[i] > epsilon) ? targetOnGPU[i] : zero);
                gpu.For(0, dimension, i => isCountedArray[i] = (sourceOnGPU[i] > zero) ? 1.0 : zero);

                // Lower and upper bound of the accepted band for each element.
                gpu.For(0, dimension, i => sourceOnGPULow[i]  = lowMultiplier * sourceOnGPU[i]);
                gpu.For(0, dimension, i => sourceOnGPUHigh[i] = highMultiplier * sourceOnGPU[i]);

                // Store 1 where the target lies outside the band, 0 otherwise.
                gpu.For(0, dimension,
                        i => isGTtol[i] = (targetOnGPU[i] < sourceOnGPULow[i] || targetOnGPU[i] > sourceOnGPUHigh[i]) ? 1 : 0);

                isCounted = (int)gpu.Sum(isCountedArray);
                failed    = (int)gpu.Sum(isGTtol);
            }
            finally
            {
                // Release the device buffers even when a kernel or reduction throws.
                Gpu.Free(sourceOnGPU);
                Gpu.Free(targetOnGPU);
                Gpu.Free(sourceOnGPULow);
                Gpu.Free(sourceOnGPUHigh);
                Gpu.Free(isCountedArray);
                Gpu.Free(isGTtol);
            }

            System.Diagnostics.Debug.WriteLine("finished an absolute comparison on GPU");

            return(new Tuple <int, int>(failed, isCounted));
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Frees <c>m_DevInputSamples</c> through <c>Gpu.Free</c> and clears the field;
        /// does nothing when the field is already null.
        /// </summary>
        private void DisposeDevInputSamples()
        {
            if (m_DevInputSamples != null)
            {
                Gpu.Free(m_DevInputSamples);
                m_DevInputSamples = null;
            }
        }
            /// <summary>
            /// Frees <c>m_DevOverdBs</c> through <c>Gpu.Free</c>, clears the field and resets
            /// <c>m_SampleCount</c> to zero; does nothing when the field is already null.
            /// </summary>
            private void DisposeGpuResources()
            {
                if (m_DevOverdBs != null)
                {
                    Gpu.Free(m_DevOverdBs);
                    m_DevOverdBs  = null;
                    m_SampleCount = 0;
                }
            }
Ejemplo n.º 10
0
        /// <summary>
        /// Fetches the two input arrays from <c>GetData</c>, uploads them to the default GPU,
        /// launches <c>Kernel</c> with a (16, 256) launch configuration, copies the output
        /// back to a host array and releases the device buffers.
        /// </summary>
        public static void RunGpu()
        {
            var count      = GetData(out var x, out var y);
            var hostResult = new float[count];
            var device     = Gpu.Default;

            // One device buffer per input plus one for the kernel output.
            var devX      = device.Allocate <float>(count);
            var devY      = device.Allocate <float>(count);
            var devResult = device.Allocate <float>(count);

            Gpu.Copy(x, devX);
            Gpu.Copy(y, devY);

            var launchConfig = new LaunchParam(16, 256);

            device.Launch(Kernel, launchConfig, devResult, devX, devY);
            Gpu.Copy(devResult, hostResult);

            Gpu.Free(devX);
            Gpu.Free(devY);
            Gpu.Free(devResult);
        }
        /// <summary>
        /// Computes the pairwise cosine distance matrix of <paramref name="dataset"/> on the GPU.
        /// </summary>
        /// <param name="gpu">GPU used for allocation and kernel execution.</param>
        /// <param name="dataset">Row vectors; entry <c>[i, j]</c> of the result compares row i with row j.</param>
        /// <returns>
        /// A square matrix whose entry <c>[i, j]</c> is
        /// <c>max(0, 1 - dot(i, j) / sqrt(|i|^2 * |j|^2))</c>.
        /// </returns>
        private static double[,] CosineSimilarityGpu(Gpu gpu, double[][] dataset)
        {
            // Captured by value so the kernel does not reference the host array at all.
            int count = dataset.Length;
            int size  = count * count;

            var gpuDataset = gpu.Allocate(dataset);

            // Allocate directly on gpu.
            var gpuDistances = gpu.Allocate <double>(count, count);

            try
            {
                // The kernel reads the device copy (gpuDataset). Previously it captured the host
                // array `dataset`, leaving the explicitly uploaded copy unused.
                gpu.For(0, size, index =>
                {
                    int i               = index / count;
                    int j               = index % count;
                    double dotProduct   = 0;
                    double magnitudeOne = 0;
                    double magnitudeTwo = 0;
                    for (int k = 0; k < gpuDataset[i].Length; k++)
                    {
                        dotProduct   += (gpuDataset[i][k] * gpuDataset[j][k]);
                        magnitudeOne += (gpuDataset[i][k] * gpuDataset[i][k]);
                        magnitudeTwo += (gpuDataset[j][k] * gpuDataset[j][k]);
                    }
                    double distance    = Math.Max(0, 1 - (dotProduct / Math.Sqrt(magnitudeOne * magnitudeTwo)));
                    gpuDistances[i, j] = distance;
                });

                // Gpu -> Cpu.
                var result = new double[count, count];

                Gpu.Copy(gpuDistances, result);

                return(result);
            }
            finally
            {
                // Release gpu memory, also when the kernel or the copy throws.
                Gpu.Free(gpuDataset);
                Gpu.Free(gpuDistances);
            }
        }
 /// <summary>
 /// Passes <c>rneuronsArr</c>, <c>resultsArr</c> and <c>inputsArr</c> to <c>Gpu.Free</c>.
 /// Virtual, so derived classes can override it.
 /// </summary>
 protected virtual void Free()
 {
     Gpu.Free(rneuronsArr);
     Gpu.Free(resultsArr);
     Gpu.Free(inputsArr);
 }
Ejemplo n.º 13
0
 /// <summary>
 /// Passes <c>GPUValues</c> and <c>GPUIndices</c> to <c>Gpu.Free</c>.
 /// </summary>
 public void Free()
 {
     Gpu.Free(GPUValues);
     Gpu.Free(GPUIndices);
 }
Ejemplo n.º 14
0
 /// <summary>
 /// Passes <c>GPUArray</c> to <c>Gpu.Free</c>.
 /// </summary>
 public void Free()
 {
     Gpu.Free(GPUArray);
 }
        /// <summary>
        /// Runs <c>Kernel</c> over the given problems on the default GPU, splitting them into
        /// partitions that are each processed on their own stream, and converts the outputs
        /// into <c>ComputationResult</c> entries.
        /// </summary>
        /// <param name="problemsToSolve">Problems to process; the size of the first one is used for all.</param>
        /// <param name="streamCount">Requested number of streams; reduced to the number of non-empty partitions.</param>
        /// <param name="asyncAction">Optional callback invoked after all launches were issued and before the streams are synchronized.</param>
        /// <param name="warpCount">Warps per block; clamped to <c>MaxThreadsPerBlock / WarpSize</c>.</param>
        /// <returns>One result per problem, in partition order.</returns>
        /// <exception cref="Exception">
        /// Thrown when a synchronizable result reports a word length above <c>(n - 1) * (n - 1)</c>.
        /// </exception>
        public ComputationResult[] Compute(
            Problem[] problemsToSolve,
            int streamCount,
            Action asyncAction = null,
            int warpCount      = 2)
        // cannot be more warps since more memory should be allocated
        {
#if (benchmark)
            var totalTiming = new Stopwatch();
            totalTiming.Start();
            var benchmarkTiming = new Stopwatch();
#endif
            var gpu = Gpu.Default;
            // NOTE(review): First() throws on an empty problemsToSolve array.
            var n   = problemsToSolve.First().size;

            var power = 1 << n;
            var maximumPermissibleWordLength = (n - 1) * (n - 1);

            // in order to atomically add to a checking array (designed for queue consistency) one int is used by four threads, so in a reeeeaally pessimistic case 255 is the maximum number of threads (everyone go to the same vertex)
            var maximumWarps = gpu.Device.Attributes.MaxThreadsPerBlock / gpu.Device.Attributes.WarpSize;
            if (warpCount > maximumWarps)
            {
                warpCount = maximumWarps;
            }

            // Ceiling division: problems per stream.
            // NOTE(review): streamCount == 0 divides by zero here.
            var problemsPerStream   = (problemsToSolve.Count() + streamCount - 1) / streamCount;
            // Slice the problems into consecutive chunks and drop chunks that ended up empty.
            var problemsPartitioned = Enumerable.Range(0, streamCount)
                                      .Select(i => problemsToSolve.Skip(problemsPerStream * i)
                                              .Take(problemsPerStream)
                                              .ToArray())
                                      .Where(partition => partition.Length > 0)
                                      .ToArray();
            streamCount = problemsPartitioned.Length;
            var streams = Enumerable.Range(0, streamCount)
                          .Select(_ => gpu.CreateStream()).ToArray();

            // Per-partition device buffers: two flattened input matrices (n ints per problem) and two outputs.
            var gpuA = problemsPartitioned.Select(problems => gpu.Allocate <int>(problems.Length * n)).ToArray();
            var gpuB = problemsPartitioned.Select(problems => gpu.Allocate <int>(problems.Length * n)).ToArray();
            var shortestSynchronizingWordLength = problemsPartitioned.Select(problems => gpu.Allocate <int>(problems.Length)).ToArray();
            var isSynchronizable = problemsPartitioned.Select(problems => gpu.Allocate <bool>(problems.Length)).ToArray();
            gpu.Copy(n, problemSize);
            var queueUpperBound  = power / 2 + 1;
            // A single block of WarpSize * warpCount threads.
            // NOTE(review): the third argument is presumably the shared-memory size in bytes — confirm against LaunchParam.
            var launchParameters = new LaunchParam(
                new dim3(1, 1, 1),
                new dim3(gpu.Device.Attributes.WarpSize * warpCount, 1, 1),
                warpCount * (
                    sizeof(ushort) * queueUpperBound
                    + sizeof(ushort) * n * 2
                    + sizeof(byte) * power
                    )
                );
            // Host-side arrays that receive the kernel outputs, one pair per partition.
            var gpuResultsIsSynchronizable = problemsPartitioned
                                             .Select(problems => new bool[problems.Length])
                                             .ToArray();
            var gpuResultsShortestSynchronizingWordLength = problemsPartitioned
                                                            .Select(problems => new int[problems.Length])
                                                            .ToArray();


            // Pack, upload and launch each partition on its own stream.
            for (int stream = 0; stream < streamCount; stream++)
            {
                var problems = problemsPartitioned[stream];

                var matrixA = new int[problems.Length * n];
                var matrixB = new int[problems.Length * n];
                // Flatten the per-problem matrices: problem p occupies [p * n, p * n + n).
                Parallel.For(0, problems.Length, problem =>
                {
                    Array.ConstrainedCopy(problems[problem].stateTransitioningMatrixA, 0, matrixA, problem * n, n);
                    Array.ConstrainedCopy(problems[problem].stateTransitioningMatrixB, 0, matrixB, problem * n, n);
                });

                streams[stream].Copy(matrixA, gpuA[stream]);
                streams[stream].Copy(matrixB, gpuB[stream]);

                streams[stream].Launch(
                    Kernel,
                    launchParameters,
                    gpuA[stream],
                    gpuB[stream],
                    isSynchronizable[stream],
                    shortestSynchronizingWordLength[stream]
                    );
            }

            // Caller-supplied work runs here, after all launches were issued.
            asyncAction?.Invoke();

            // Wait for each stream and copy its outputs back to the host arrays.
            for (int stream = 0; stream < streamCount; stream++)
            {
#if (benchmark)
                benchmarkTiming.Start();
#endif
                streams[stream].Synchronize();
#if (benchmark)
                benchmarkTiming.Stop();
#endif
                streams[stream].Copy(isSynchronizable[stream], gpuResultsIsSynchronizable[stream]);
                streams[stream].Copy(shortestSynchronizingWordLength[stream], gpuResultsShortestSynchronizingWordLength[stream]);
            }

            gpu.Synchronize();

#if (benchmark)
#endif
            // Pair up the two output arrays of every partition and flatten them into one result array.
            var results = Enumerable.Range(0, streamCount).SelectMany(i => gpuResultsIsSynchronizable[i].Zip(gpuResultsShortestSynchronizingWordLength[i], (isSyncable, shortestWordLength)
                                                                                                             => new ComputationResult()
            {
                computationType  = ComputationType.GPU,
                size             = problemsToSolve.First().size,
                isSynchronizable = isSyncable,
                shortestSynchronizingWordLength = shortestWordLength,
                algorithmName = GetType().Name
            }
                                                                                                             ).ToArray()
                                                                      ).ToArray();

            // Release every device buffer allocated above.
            // NOTE(review): not reached if anything above throws — no try/finally.
            foreach (var array in gpuA.AsEnumerable <Array>()
                     .Concat(gpuB)
                     .Concat(shortestSynchronizingWordLength)
                     .Concat(isSynchronizable))
            {
                Gpu.Free(array);
            }

            foreach (var stream in streams)
            {
                stream.Dispose();
            }

            // Sanity check: a synchronizable result must not exceed (n - 1)^2.
            if (results.Any(result => result.isSynchronizable && result.shortestSynchronizingWordLength > maximumPermissibleWordLength))
            {
                throw new Exception("Cerny conjecture is false");
            }

#if (benchmark)
            results[0].benchmarkResult = new BenchmarkResult
            {
                benchmarkedTime = benchmarkTiming.Elapsed,
                totalTime       = totalTiming.Elapsed
            };
#endif
            return(results);
        }
Ejemplo n.º 16
0
        /// <summary>
        /// Renders a 10240 x 10240 escape-time fractal on the GPU, prints the kernel time in
        /// microseconds and saves the image as <c>b.png</c>.
        /// </summary>
        /// <remarks>
        /// The device buffer, the pinned handle and the bitmap are released even when saving fails.
        /// </remarks>
        private static void DoStuff()
        {
            const float xs     = -2.1F;
            const float ys     = -1.3F;
            const int   zoom   = 1;
            const int   width  = 10240;
            const int   height = 10240;

            const int blackpixel = (byte)0 | ((byte)0 << 8) | ((byte)0 << 16) | ((byte)255 << 24);

            const float dx = 2.6F / zoom / width;
            const float dy = 2.6F / zoom / height;

            var gpubits = gpu.Allocate <int>(width * height);

            try
            {
                Console.WriteLine("Started...");
                var watch = new Stopwatch();

                watch.Start();

                gpu.For(0, width * height, (index) => {
                    // Row-major layout: the row stride is the image width. The original divided
                    // by height, which only worked because the image is square.
                    var y  = index / width;
                    var x  = index - (y * width);
                    var cr = xs + (x * dx);
                    var ci = ys + (y * dy);

                    var zr = 0F;
                    var zi = 0F;

                    float zrSquare;
                    float ziSquare;

                    // Iterate z = z^2 + c until the squared magnitude exceeds 4 or 255 steps were done.
                    byte i = 0;
                    while (i < 255)
                    {
                        zrSquare = zr * zr;
                        ziSquare = zi * zi;
                        zi       = (2 * zi * zr) + ci;
                        zr       = zrSquare - ziSquare + cr;
                        i++;

                        if (zrSquare + ziSquare > 4)
                        {
                            break;
                        }
                    }

                    if (i == 255)
                    {
                        gpubits[index] = blackpixel;
                    }
                    else
                    {
                        // Pack the three colour bytes derived from the iteration count plus alpha 255.
                        gpubits[index] = (byte)(i * 25 % 256) | ((byte)(i * 3 % 256) << 8) | (((byte)(i % 256)) << 16) | ((byte)255 << 24);
                    }
                });

                watch.Stop();
                Console.WriteLine($"Elapsed microseconds: {((double)watch.ElapsedTicks) / Stopwatch.Frequency * 1000000}");

                var bits       = Gpu.CopyToHost(gpubits);
                var bitsHandle = GCHandle.Alloc(bits, GCHandleType.Pinned);

                try
                {
                    // The bitmap wraps the pinned pixel array, so it must be disposed before unpinning.
                    using (var bitmap = new Bitmap(width, height, width * 4, PixelFormat.Format32bppPArgb, bitsHandle.AddrOfPinnedObject()))
                    {
                        bitmap.Save("b.png", ImageFormat.Png);
                    }
                }
                finally
                {
                    bitsHandle.Free();
                }
            }
            finally
            {
                Gpu.Free(gpubits);
            }
        }
Ejemplo n.º 17
0
        /// <summary>
        /// Extracts the isosurface vertices of all active voxels on the GPU.
        /// </summary>
        /// <remarks>
        /// For every active voxel the kernel evaluates the field at its eight corners, looks up
        /// the crossed edges in <c>edgeTable</c>, interpolates a point on each crossed edge and
        /// writes the triangle corners listed in <c>triangleTable</c> into the result buffer at
        /// the offset given by <c>verts_scanIdx</c>. The result is also stored in <c>resultVerts</c>.
        /// <c>constVoxelSize</c>, <c>constGridSize</c> and <c>constFusion</c> are read by the kernel
        /// but not uploaded here; they must already hold their values (<c>runClassifyVoxel</c>
        /// copies them).
        /// </remarks>
        /// <returns>The extracted vertices converted to <c>Point3f</c>.</returns>
        public List <Point3f> runExtractIsoSurfaceGPU()
        {
            var gpu = Gpu.Default;

            // Declared up front so the finally block can release whatever was allocated.
            float3[] d_pts               = null;
            float3[] d_resultV           = null;
            float[]  d_cubeValues        = null;
            float3[] d_samplePts         = null;
            int[]    d_verts_scanIdx     = null;
            int[]    d_index_voxelActive = null;

            try
            {
                // output arguments
                float3[] pts = new float3[12 * num_voxelActive];
                d_pts        = Gpu.Default.Allocate <float3>(pts);
                d_resultV    = Gpu.Default.Allocate <float3>(sumVerts);
                d_cubeValues = Gpu.Default.Allocate <float>(8 * num_voxelActive);

                // input arguments
                d_samplePts         = Gpu.Default.Allocate <float3>(samplePts);
                d_verts_scanIdx     = Gpu.Default.Allocate <int>(verts_scanIdx);
                d_index_voxelActive = Gpu.Default.Allocate <int>(index_voxelActive);

                // const values
                gpu.Copy(Vertices, constVertices);
                gpu.Copy(EdgeDirection, constEdgeDirection);
                gpu.Copy(EdgeConnection, constEdgeConnection);
                gpu.Copy(Tables.EdgeTable, edgeTable);
                gpu.Copy(Tables.TriangleTable_GPU, triangleTable);

                float3 baseP = new float3((float)basePoint.X, (float)basePoint.Y, (float)basePoint.Z);

                gpu.Copy(baseP, constBasePoint);
                gpu.Copy(isoValue, constIsovalue);
                gpu.Copy(scale, constScale);

                gpu.For(0, num_voxelActive, i =>
                {
                    // compute the position in the grid
                    int3 gridPos = calcGridPos(d_index_voxelActive[i], constGridSize.Value);
                    float3 p     = new float3();

                    p.x = constBasePoint.Value.x + gridPos.x * constVoxelSize.Value.x;
                    p.y = constBasePoint.Value.y + gridPos.y * constVoxelSize.Value.y;
                    p.z = constBasePoint.Value.z + gridPos.z * constVoxelSize.Value.z;

                    // the eight corners of the voxel
                    float3 a0 = p;
                    float3 a1 = CreateFloat3(constVoxelSize.Value.x + p.x, 0 + p.y, 0 + p.z);
                    float3 a2 = CreateFloat3(constVoxelSize.Value.x + p.x, constVoxelSize.Value.y + p.y, 0 + p.z);
                    float3 a3 = CreateFloat3(0 + p.x, constVoxelSize.Value.y + p.y, 0 + p.z);
                    float3 a4 = CreateFloat3(0 + p.x, 0 + p.y, constVoxelSize.Value.z + p.z);
                    float3 a5 = CreateFloat3(constVoxelSize.Value.x + p.x, 0 + p.y, constVoxelSize.Value.z + p.z);
                    float3 a6 = CreateFloat3(constVoxelSize.Value.x + p.x, constVoxelSize.Value.y + p.y, constVoxelSize.Value.z + p.z);
                    float3 a7 = CreateFloat3(0 + p.x, constVoxelSize.Value.y + p.y, constVoxelSize.Value.z + p.z);

                    // squared voxel diagonal, scaled by the fusion factor
                    float distance = constVoxelSize.Value.x * constVoxelSize.Value.x +
                                     constVoxelSize.Value.y * constVoxelSize.Value.y + constVoxelSize.Value.z * constVoxelSize.Value.z;
                    float radius = distance * constFusion.Value;

                    //Compute cubeValues of 8 vertices
                    d_cubeValues[i * 8]     = ComputeValue(d_samplePts, a0, d_samplePts.Length, constFusion.Value, radius);
                    d_cubeValues[i * 8 + 1] = ComputeValue(d_samplePts, a1, d_samplePts.Length, constFusion.Value, radius);
                    d_cubeValues[i * 8 + 2] = ComputeValue(d_samplePts, a2, d_samplePts.Length, constFusion.Value, radius);
                    d_cubeValues[i * 8 + 3] = ComputeValue(d_samplePts, a3, d_samplePts.Length, constFusion.Value, radius);
                    d_cubeValues[i * 8 + 4] = ComputeValue(d_samplePts, a4, d_samplePts.Length, constFusion.Value, radius);
                    d_cubeValues[i * 8 + 5] = ComputeValue(d_samplePts, a5, d_samplePts.Length, constFusion.Value, radius);
                    d_cubeValues[i * 8 + 6] = ComputeValue(d_samplePts, a6, d_samplePts.Length, constFusion.Value, radius);
                    d_cubeValues[i * 8 + 7] = ComputeValue(d_samplePts, a7, d_samplePts.Length, constFusion.Value, radius);

                    //Check each vertex state: one bit per corner forms the table index
                    int flag = Compact(d_cubeValues[i * 8], constIsovalue.Value);
                    flag    += Compact(d_cubeValues[i * 8 + 1], constIsovalue.Value) * 2;
                    flag    += Compact(d_cubeValues[i * 8 + 2], constIsovalue.Value) * 4;
                    flag    += Compact(d_cubeValues[i * 8 + 3], constIsovalue.Value) * 8;
                    flag    += Compact(d_cubeValues[i * 8 + 4], constIsovalue.Value) * 16;
                    flag    += Compact(d_cubeValues[i * 8 + 5], constIsovalue.Value) * 32;
                    flag    += Compact(d_cubeValues[i * 8 + 6], constIsovalue.Value) * 64;
                    flag    += Compact(d_cubeValues[i * 8 + 7], constIsovalue.Value) * 128;

                    //find out which edge intersects the isosurface
                    int EdgeFlag = edgeTable[flag];

                    //check whether this voxel is crossed by the isosurface
                    for (int j = 0; j < 12; j++)
                    {
                        //check whether an edge have a point
                        if ((EdgeFlag & (1 << j)) != 0)
                        {
                            //compute t values from two end points on each edge
                            float Offset = GetOffset(d_cubeValues[i * 8 + constEdgeConnection[j * 2 + 0]], d_cubeValues[i * 8 + constEdgeConnection[j * 2 + 1]], constIsovalue.Value);
                            float3 pt    = new float3();
                            //get positions
                            pt.x = constBasePoint.Value.x + (gridPos.x + constVertices[constEdgeConnection[j * 2 + 0] * 3 + 0] + Offset * constEdgeDirection[j * 3 + 0]) * constScale.Value;
                            pt.y = constBasePoint.Value.y + (gridPos.y + constVertices[constEdgeConnection[j * 2 + 0] * 3 + 1] + Offset * constEdgeDirection[j * 3 + 1]) * constScale.Value;
                            pt.z = constBasePoint.Value.z + (gridPos.z + constVertices[constEdgeConnection[j * 2 + 0] * 3 + 2] + Offset * constEdgeDirection[j * 3 + 2]) * constScale.Value;
                            d_pts[12 * i + j] = pt;
                        }
                    }
                    int num = 0;
                    //Find out points from each triangle (a negative table entry ends the list)
                    for (int Triangle = 0; Triangle < 5; Triangle++)
                    {
                        if (triangleTable[flag * 16 + 3 * Triangle] < 0)
                        {
                            break;
                        }

                        for (int Corner = 0; Corner < 3; Corner++)
                        {
                            int Vertex = triangleTable[flag * 16 + 3 * Triangle + Corner];
                            float3 pd  = CreateFloat3(d_pts[12 * i + Vertex].x, d_pts[12 * i + Vertex].y, d_pts[12 * i + Vertex].z);
                            d_resultV[d_verts_scanIdx[i] + num] = pd;
                            num++;
                        }
                    }
                });
                resultVerts = Gpu.CopyToHost(d_resultV);
            }
            finally
            {
                // Release every device buffer, including d_cubeValues, which the original never
                // freed, and do so even when an upload or the kernel throws.
                if (d_resultV != null)
                {
                    Gpu.Free(d_resultV);
                }

                if (d_pts != null)
                {
                    Gpu.Free(d_pts);
                }

                if (d_cubeValues != null)
                {
                    Gpu.Free(d_cubeValues);
                }

                if (d_samplePts != null)
                {
                    Gpu.Free(d_samplePts);
                }

                if (d_verts_scanIdx != null)
                {
                    Gpu.Free(d_verts_scanIdx);
                }

                if (d_index_voxelActive != null)
                {
                    Gpu.Free(d_index_voxelActive);
                }
            }

            return(ConvertFloat3ToPoint3f(resultVerts));
        }
        /// <summary>
        /// Runs <c>Kernel</c> on the GPU for a slice of <paramref name="problemsToSolve"/> to fill
        /// the <c>isSynchronizable</c> flag of each result, then hands the same slice to
        /// <c>SlimCPUSkipper.Compute</c> and returns that call's result.
        /// </summary>
        /// <param name="problemsToSolve">Source problems.</param>
        /// <param name="problemsReadingIndex">Index of the first problem to process.</param>
        /// <param name="computationResults">Destination for results; a local array is created when null.</param>
        /// <param name="resultsWritingIndex">Index in <paramref name="computationResults"/> of the first result to write.</param>
        /// <param name="problemCount">Number of problems to process.</param>
        /// <param name="streamCount">Requested number of streams; capped at <paramref name="problemCount"/>.</param>
        /// <param name="asyncAction">Optional callback invoked after all launches were issued and before the streams are synchronized.</param>
        /// <returns>The value returned by <c>SlimCPUSkipper.Compute</c>.</returns>
        public int ComputeAction(
            Problem[] problemsToSolve,
            int problemsReadingIndex,
            ComputationResult[] computationResults,
            int resultsWritingIndex,
            int problemCount,
            int streamCount,
            Action asyncAction = null)
        // cannot be more warps since more memory should be allocated
        {
#if (benchmark)
            var totalTiming = new Stopwatch();
            totalTiming.Start();
            var benchmarkTiming = new Stopwatch();
#endif
            // NOTE(review): CernyConjectureFailingIndex, power and maximumPermissibleWordLength
            // are never read in this method.
            var CernyConjectureFailingIndex = -1;
            var gpu = Gpu.Default;
            var n   = problemsToSolve[problemsReadingIndex].size;

            var power = 1 << n;
            var maximumPermissibleWordLength = (n - 1) * (n - 1);

            // in order to atomically add to a checking array (designed for queue consistency) one int is used by four threads, so in a reeeeaally pessimistic case 255 is the maximum number of threads (everyone go to the same vertex)
            // -1 for the already discovered vertex
            // at least 2*n+1 threads i.e. 27
            // Net effect of the next statements: threads = min(MaxThreadsPerBlock, 64).
            var threads        = gpu.Device.Attributes.MaxThreadsPerBlock;
            var maximumThreads = Math.Min(
                gpu.Device.Attributes.MaxThreadsPerBlock,
                32 * 2
                );
            if (threads > maximumThreads)
            {
                threads = maximumThreads;
            }

            if (problemCount < streamCount)
            {
                streamCount = problemCount;
            }

            // Ceiling division: problems per stream.
            // NOTE(review): problemCount == 0 makes streamCount 0 above, so this divides by zero.
            var problemsPerStream = (problemCount + streamCount - 1) / streamCount;
            var streams           = Enumerable.Range(0, streamCount)
                                    .Select(_ => gpu.CreateStream()).ToArray();

            gpu.Copy(n, problemSize);

            // 1024 blocks of `threads` threads.
            // NOTE(review): the third argument is presumably the shared-memory size in bytes — confirm against LaunchParam.
            var launchParameters = new LaunchParam(
                new dim3(1 << 10, 1, 1),
                new dim3(threads, 1, 1),
                2 * n * sizeof(ushort)
                );
            // Host result arrays and device buffers, one entry per stream.
            var gpuResultsIsSynchronizable = new bool[streamCount][];
            var gpuAs            = new int[streamCount][];
            var gpuBs            = new int[streamCount][];
            var isSynchronizable = new bool[streamCount][];

            // Allocate, pack, upload and launch the slice that belongs to each stream.
            for (int stream = 0; stream < streamCount; stream++)
            {
                var offset             = stream * problemsPerStream;
                // NOTE(review): for trailing streams offset can exceed problemCount, which makes
                // this count negative — confirm callers never hit that combination.
                var localProblemsCount = Math.Min(problemsPerStream, problemCount - offset);

                gpuAs[stream]            = gpu.Allocate <int>(localProblemsCount * n);
                gpuBs[stream]            = gpu.Allocate <int>(localProblemsCount * n);
                isSynchronizable[stream] = gpu.Allocate <bool>(localProblemsCount);

                var matrixA = new int[localProblemsCount * n];
                var matrixB = new int[localProblemsCount * n];
                // Flatten the per-problem matrices: problem p occupies [p * n, p * n + n).
                Parallel.For(0, localProblemsCount, problem =>
                {
                    Array.ConstrainedCopy(
                        problemsToSolve[problemsReadingIndex + offset + problem].stateTransitioningMatrixA,
                        0,
                        matrixA,
                        problem * n,
                        n
                        );
                    Array.ConstrainedCopy(
                        problemsToSolve[problemsReadingIndex + offset + problem].stateTransitioningMatrixB,
                        0,
                        matrixB,
                        problem * n,
                        n
                        );
                });

                streams[stream].Copy(matrixA, gpuAs[stream]);
                streams[stream].Copy(matrixB, gpuBs[stream]);

                gpuResultsIsSynchronizable[stream] = new bool[localProblemsCount];

                streams[stream].Launch(
                    Kernel,
                    launchParameters,
                    gpuAs[stream],
                    gpuBs[stream],
                    isSynchronizable[stream]
                    );
            }

            // Caller-supplied work runs here, after all launches were issued.
            asyncAction?.Invoke();

            // Wait for each stream, copy its flags back and store them in the results array.
            var streamId = 0;
            foreach (var stream in streams)
            {
#if (benchmark)
                benchmarkTiming.Start();
#endif
                stream.Synchronize();
#if (benchmark)
                benchmarkTiming.Stop();
#endif
                stream.Copy(isSynchronizable[streamId], gpuResultsIsSynchronizable[streamId]);

                var offset             = streamId * problemsPerStream;
                var localProblemsCount = Math.Min(problemsPerStream, problemCount - offset);

                // The parameter is not passed by ref, so an array created here is not visible
                // to the caller; it is only used below and passed on to the CPU stage.
                // NOTE(review): if ComputationResult is a class, the new array's elements are
                // null and the assignment in the loop below would throw — confirm it is a struct.
                if (computationResults == null)
                {
                    resultsWritingIndex = 0;
                    computationResults  = new ComputationResult[resultsWritingIndex + problemCount];
                }

                for (int i = 0; i < localProblemsCount; i++)
                {
                    computationResults[resultsWritingIndex + offset + i].isSynchronizable = gpuResultsIsSynchronizable[streamId][i];
                }

                streamId++;
            }

#if (benchmark)
            gpu.Synchronize();
#endif


            // Release every device buffer allocated above.
            // NOTE(review): not reached if anything above throws — no try/finally.
            foreach (var arrays in new IEnumerable <Array>[] { gpuAs, gpuBs, isSynchronizable })
            {
                foreach (var array in arrays)
                {
                    Gpu.Free(array);
                }
            }

            foreach (var stream in streams)
            {
                stream.Dispose();
            }

            // Second stage: the CPU implementation processes the same slice and result array.
            var cpu    = new SlimCPUSkipper();
            var result = cpu.Compute(problemsToSolve, problemsReadingIndex, computationResults, resultsWritingIndex, problemCount, cpu.GetBestParallelism());

#if (benchmark)
            computationResults[resultsWritingIndex].benchmarkResult = new BenchmarkResult
            {
                benchmarkedTime = benchmarkTiming.Elapsed,
                totalTime       = totalTiming.Elapsed
            };
#endif
            return(result);
        }
        /// <summary>
        /// Runs <c>Kernel</c> on a consecutive batch of problems, split across several GPU streams,
        /// and reports whether any problem is synchronizable with a shortest synchronizing word
        /// longer than <c>(n - 1) * (n - 1)</c>.
        /// </summary>
        /// <param name="problemsToSolve">Array holding the problems. The size of the problem at
        /// <paramref name="problemsReadingIndex"/> is used for the whole batch.</param>
        /// <param name="problemsReadingIndex">Index of the first problem of the batch.</param>
        /// <param name="computationResults">Optional output array. When it is not null, one result per processed
        /// problem is stored starting at <paramref name="resultsWritingIndex"/>.</param>
        /// <param name="resultsWritingIndex">Index at which the first result is stored.</param>
        /// <param name="problemCount">Number of consecutive problems to process. When it is not positive,
        /// only <paramref name="asyncAction"/> is invoked and -1 is returned.</param>
        /// <param name="streamCount">Requested number of streams. It is reduced so that every stream
        /// receives at least one problem.</param>
        /// <param name="asyncAction">Optional callback invoked on the calling thread after all launches
        /// have been issued and before the streams are synchronized.</param>
        /// <returns>
        /// <c>resultsWritingIndex + k</c> for a problem <c>k</c> of the batch that exceeds the bound
        /// (the one from the lowest-numbered stream that found any), or -1 when there is none.
        /// Within a stream, results located after the first such problem are not written.
        /// </returns>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="streamCount"/> is not positive.</exception>
        /// <exception cref="Exception">The required thread count exceeds the permitted maximum.</exception>
        public int ComputeManyWithAction(
            Problem[] problemsToSolve,
            int problemsReadingIndex,
            ComputationResult[] computationResults,
            int resultsWritingIndex,
            int problemCount,
            int streamCount,
            Action asyncAction = null)
        {
#if (benchmark)
            var totalTiming = new Stopwatch();
            totalTiming.Start();
            var benchmarkTiming = new Stopwatch();
#endif
            if (problemCount <= 0)
            {
                // Nothing to compute; previously this path divided by zero.
                asyncAction?.Invoke();
                return -1;
            }
            if (streamCount <= 0)
            {
                throw new ArgumentOutOfRangeException(nameof(streamCount), streamCount, "At least one stream is required.");
            }

            var cernyConjectureFailingIndex = -1;
            var gpu = Gpu.Default;
            var n   = problemsToSolve[problemsReadingIndex].size;

            var       power   = 1 << n;
            const int bitSize = 6;
            var       maximumPermissibleWordLength = (n - 1) * (n - 1);

            // The block size is capped at (2^bitSize - 1) - 1 threads and must be at least n + 1.
            // Original author's rationale: several threads share one int of the atomically updated
            // checking array, and one slot is reserved for the already discovered vertex.
            // NOTE(review): the original note quoted 255 and 2*n+1, which do not match
            // bitSize = 6 and n + 1 used here - confirm which is intended.
            var maximumThreads = Math.Min(
                gpu.Device.Attributes.MaxThreadsPerBlock,
                ((1 << bitSize) - 1) - 1
                );
            var minimumThreads = n + 1;
            if (minimumThreads > maximumThreads)
            {
                throw new Exception("Impossible to satisfy");
            }
            var threads = maximumThreads;

            if (problemCount < streamCount)
            {
                streamCount = problemCount;
            }

            var problemsPerStream = (problemCount + streamCount - 1) / streamCount;
            // Rounding up the per-stream share can leave trailing streams without any problem
            // (e.g. 5 problems over 4 streams), which used to yield a negative allocation size.
            // Keep only the streams that actually receive work.
            streamCount = (problemCount + problemsPerStream - 1) / problemsPerStream;

            var streams = Enumerable.Range(0, streamCount)
                          .Select(_ => gpu.CreateStream()).ToArray();

            var isDiscoveredComplexOffset = (power * sizeof(int) + 8 * sizeof(int) / bitSize - 1) / (8 * sizeof(int) / bitSize);

            var launchParameters = new LaunchParam(
                new dim3(1 << 9, 1, 1),
                new dim3(threads, 1, 1),
                sizeof(int) * 3
                + isDiscoveredComplexOffset + (((isDiscoveredComplexOffset % sizeof(int)) & 1) == 1 ? 1 : 0)
                + (power / 2 + 1) * sizeof(ushort) * 2
                + (n + 1) * sizeof(uint)
                + sizeof(bool)
                );

            // Host-side result buffers, one per stream.
            var gpuResultsIsSynchronizable = new bool[streamCount][];
            var gpuResultsShortestSynchronizingWordLength = new int[streamCount][];
            // Buffers obtained from gpu.Allocate, one per stream.
            var gpuAs = new int[streamCount][];
            var gpuBs = new int[streamCount][];
            var shortestSynchronizingWordLength = new int[streamCount][];
            var isSynchronizable = new bool[streamCount][];

            try
            {
                gpu.Copy(n, problemSize);

                for (int streamIndex = 0; streamIndex < streamCount; streamIndex++)
                {
                    var offset             = streamIndex * problemsPerStream;
                    var localProblemsCount = Math.Min(problemsPerStream, problemCount - offset);

                    gpuAs[streamIndex] = gpu.Allocate<int>(localProblemsCount * n);
                    gpuBs[streamIndex] = gpu.Allocate<int>(localProblemsCount * n);
                    shortestSynchronizingWordLength[streamIndex] = gpu.Allocate<int>(localProblemsCount);
                    isSynchronizable[streamIndex] = gpu.Allocate<bool>(localProblemsCount);

                    // Flatten the transition tables of this stream's problems into two contiguous arrays.
                    var matrixA = new int[localProblemsCount * n];
                    var matrixB = new int[localProblemsCount * n];
                    Parallel.For(0, localProblemsCount, problem =>
                    {
                        var source = problemsToSolve[problemsReadingIndex + offset + problem];
                        Array.ConstrainedCopy(source.stateTransitioningMatrixA, 0, matrixA, problem * n, n);
                        Array.ConstrainedCopy(source.stateTransitioningMatrixB, 0, matrixB, problem * n, n);
                    });

                    streams[streamIndex].Copy(matrixA, gpuAs[streamIndex]);
                    streams[streamIndex].Copy(matrixB, gpuBs[streamIndex]);

                    gpuResultsIsSynchronizable[streamIndex] = new bool[localProblemsCount];
                    gpuResultsShortestSynchronizingWordLength[streamIndex] = new int[localProblemsCount];

                    streams[streamIndex].Launch(
                        Kernel,
                        launchParameters,
                        gpuAs[streamIndex],
                        gpuBs[streamIndex],
                        isSynchronizable[streamIndex],
                        shortestSynchronizingWordLength[streamIndex]
                        );
                }

                asyncAction?.Invoke();

                for (int streamIndex = 0; streamIndex < streamCount; streamIndex++)
                {
#if (benchmark)
                    benchmarkTiming.Start();
#endif
                    streams[streamIndex].Synchronize();
#if (benchmark)
                    benchmarkTiming.Stop();
#endif
                    // NOTE(review): outside benchmark builds nothing synchronizes after these copies;
                    // this relies on the copy to host memory being complete when Copy returns - confirm.
                    streams[streamIndex].Copy(isSynchronizable[streamIndex], gpuResultsIsSynchronizable[streamIndex]);
                    streams[streamIndex].Copy(shortestSynchronizingWordLength[streamIndex], gpuResultsShortestSynchronizingWordLength[streamIndex]);
                }

#if (benchmark)
                gpu.Synchronize();
#endif

                // Each worker records its finding in its own slot so that the workers never
                // write to a shared variable.
                var failingIndexPerStream = new int[streamCount];
                for (int streamIndex = 0; streamIndex < streamCount; streamIndex++)
                {
                    failingIndexPerStream[streamIndex] = -1;
                }

                var algorithmName = GetType().Name;
                Parallel.For(0, streamCount, streamIndex =>
                {
                    var offset             = streamIndex * problemsPerStream;
                    var localProblemsCount = Math.Min(problemsPerStream, problemCount - offset);

                    for (int i = 0; i < localProblemsCount; i++)
                    {
                        var synchronizable = gpuResultsIsSynchronizable[streamIndex][i];
                        var wordLength     = gpuResultsShortestSynchronizingWordLength[streamIndex][i];

                        if (computationResults != null)
                        {
                            computationResults[resultsWritingIndex + offset + i] = new ComputationResult()
                            {
                                computationType  = ComputationType.GPU,
                                size             = n,
                                isSynchronizable = synchronizable,
                                shortestSynchronizingWordLength = wordLength,
                                algorithmName = algorithmName
                            };
                        }

                        if (synchronizable && wordLength > maximumPermissibleWordLength)
                        {
                            failingIndexPerStream[streamIndex] = resultsWritingIndex + offset + i;
                            break;
                        }
                    }
                });

                foreach (var failingIndex in failingIndexPerStream)
                {
                    if (failingIndex != -1)
                    {
                        cernyConjectureFailingIndex = failingIndex;
                        break;
                    }
                }
            }
            finally
            {
                // Release the allocated buffers and the streams even when something above threw.
                foreach (var arrays in new IEnumerable<Array>[] { gpuAs, gpuBs, isSynchronizable, shortestSynchronizingWordLength })
                {
                    foreach (var array in arrays)
                    {
                        if (array != null)
                        {
                            Gpu.Free(array);
                        }
                    }
                }
                Gpu.FreeAllImplicitMemory();

                foreach (var stream in streams)
                {
                    stream.Dispose();
                }
            }

#if (benchmark)
            if (computationResults != null)
            {
                computationResults[resultsWritingIndex].benchmarkResult = new BenchmarkResult
                {
                    benchmarkedTime = benchmarkTiming.Elapsed,
                    totalTime       = totalTiming.Elapsed
                };
            }
#endif
            return(cernyConjectureFailingIndex);
        }
        /// <summary>
        /// Runs <c>Kernel</c> on all problems, split across several GPU streams, and then passes
        /// every problem the GPU did not report as synchronizable to <c>SlimCPU</c>.
        /// </summary>
        /// <param name="problemsToSolve">Problems to process. The size of the first one is used for all.</param>
        /// <param name="streamCount">Requested number of streams. Streams that would receive no problem are not created.</param>
        /// <param name="asyncAction">Optional callback invoked on the calling thread after all launches
        /// have been issued and before the streams are synchronized.</param>
        /// <param name="warpCount">Warps per block, capped at the device maximum. Original author's note:
        /// it cannot be more warps since more memory would have to be allocated.</param>
        /// <returns>One result per problem, in input order. An empty array for empty input.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="problemsToSolve"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="streamCount"/> is not positive.</exception>
        /// <exception cref="Exception">A CPU result is synchronizable with a shortest synchronizing word
        /// longer than <c>(n - 1) * (n - 1)</c>.</exception>
        public ComputationResult[] Compute(
            Problem[] problemsToSolve,
            int streamCount,
            Action asyncAction = null,
            int warpCount      = 16)
        {
#if (benchmark)
            var totalTiming = new Stopwatch();
            totalTiming.Start();
            var benchmarkTiming = new Stopwatch();
#endif
            if (problemsToSolve == null)
            {
                throw new ArgumentNullException(nameof(problemsToSolve));
            }
            if (problemsToSolve.Length == 0)
            {
                // Nothing to compute; previously this path threw from First().
                asyncAction?.Invoke();
                return new ComputationResult[0];
            }
            if (streamCount <= 0)
            {
                throw new ArgumentOutOfRangeException(nameof(streamCount), streamCount, "At least one stream is required.");
            }

            var gpu = Gpu.Default;
            var n   = problemsToSolve[0].size;

            var maximumPermissibleWordLength = (n - 1) * (n - 1);

            var maximumWarps = gpu.Device.Attributes.MaxThreadsPerBlock / gpu.Device.Attributes.WarpSize;
            if (warpCount > maximumWarps)
            {
                warpCount = maximumWarps;
            }

            // Split the input into consecutive chunks and drop the empty ones that
            // rounding up the chunk size can produce.
            var problemsPerStream   = (problemsToSolve.Length + streamCount - 1) / streamCount;
            var problemsPartitioned = Enumerable.Range(0, streamCount)
                                      .Select(i => problemsToSolve.Skip(problemsPerStream * i)
                                              .Take(problemsPerStream)
                                              .ToArray())
                                      .Where(partition => partition.Length > 0)
                                      .ToArray();
            streamCount = problemsPartitioned.Length;
            var streams = Enumerable.Range(0, streamCount)
                          .Select(_ => gpu.CreateStream()).ToArray();

            var launchParameters = new LaunchParam(
                new dim3(1, 1, 1),
                new dim3(gpu.Device.Attributes.WarpSize * warpCount, 1, 1),
                2 * n * sizeof(ushort)
                );
            var gpuResultsIsSynchronizable = problemsPartitioned
                                             .Select(problems => new bool[problems.Length])
                                             .ToArray();

            // Buffers obtained from gpu.Allocate, one per stream. They are filled inside the
            // try block so that a failure part-way through still releases what was allocated.
            var gpuA             = new int[streamCount][];
            var gpuB             = new int[streamCount][];
            var isSynchronizable = new bool[streamCount][];

            try
            {
                for (int streamIndex = 0; streamIndex < streamCount; streamIndex++)
                {
                    gpuA[streamIndex] = gpu.Allocate<int>(problemsPartitioned[streamIndex].Length * n);
                }
                for (int streamIndex = 0; streamIndex < streamCount; streamIndex++)
                {
                    gpuB[streamIndex] = gpu.Allocate<int>(problemsPartitioned[streamIndex].Length * n);
                }
                for (int streamIndex = 0; streamIndex < streamCount; streamIndex++)
                {
                    isSynchronizable[streamIndex] = gpu.Allocate<bool>(problemsPartitioned[streamIndex].Length);
                }
                gpu.Copy(n, problemSize);

                for (int streamIndex = 0; streamIndex < streamCount; streamIndex++)
                {
                    var problems = problemsPartitioned[streamIndex];

                    // Flatten the transition tables of this stream's problems into two contiguous arrays.
                    var matrixA = new int[problems.Length * n];
                    var matrixB = new int[problems.Length * n];
                    Parallel.For(0, problems.Length, problem =>
                    {
                        Array.ConstrainedCopy(problems[problem].stateTransitioningMatrixA, 0, matrixA, problem * n, n);
                        Array.ConstrainedCopy(problems[problem].stateTransitioningMatrixB, 0, matrixB, problem * n, n);
                    });

                    streams[streamIndex].Copy(matrixA, gpuA[streamIndex]);
                    streams[streamIndex].Copy(matrixB, gpuB[streamIndex]);

                    // TODO: change this entirely
                    // warning, this might not compute the length of a shortest synchronizing word but it will verify the Cerny conjecture
                    streams[streamIndex].Launch(
                        Kernel,
                        launchParameters,
                        gpuA[streamIndex],
                        gpuB[streamIndex],
                        isSynchronizable[streamIndex]
                        );
                }

                asyncAction?.Invoke();

                for (int streamIndex = 0; streamIndex < streamCount; streamIndex++)
                {
#if (benchmark)
                    benchmarkTiming.Start();
#endif
                    streams[streamIndex].Synchronize();
#if (benchmark)
                    benchmarkTiming.Stop();
#endif
                    streams[streamIndex].Copy(isSynchronizable[streamIndex], gpuResultsIsSynchronizable[streamIndex]);
                }

                gpu.Synchronize();
            }
            finally
            {
                // The results are in host arrays at this point, so the allocated buffers and the
                // streams are released here. This also covers every exception path, including the
                // conjecture check below, which used to throw before any cleanup ran.
                foreach (var array in gpuA.AsEnumerable<Array>()
                         .Concat(gpuB)
                         .Concat(isSynchronizable))
                {
                    if (array != null)
                    {
                        Gpu.Free(array);
                    }
                }

                foreach (var stream in streams)
                {
                    stream.Dispose();
                }
            }

            var results = new ComputationResult[problemsToSolve.Length];

            // Problems flagged by the GPU get their result here; the remaining ones are
            // collected, with their position in the output, for the CPU pass.
            var slimCPU           = new SlimCPU();
            var listOfCPUProblems = new List<Problem>();
            var cpuProblemIndex   = new List<int>();
            int generalIndex      = 0;
            for (int streamIndex = 0; streamIndex < streamCount; streamIndex++)
            {
                for (int index = 0; index < gpuResultsIsSynchronizable[streamIndex].Length; index++)
                {
                    if (gpuResultsIsSynchronizable[streamIndex][index])
                    {
                        results[generalIndex] = new ComputationResult
                        {
                            isSynchronizable = true,
                            computationType  = ComputationType.CPU_GPU_Combined,
                            size             = n,
                            algorithmName    = GetType().Name
                        };
                    }
                    else
                    {
                        listOfCPUProblems.Add(problemsPartitioned[streamIndex][index]);
                        cpuProblemIndex.Add(generalIndex);
                    }
                    generalIndex++;
                }
            }

            // NOTE(review): this is also called with an empty array when the GPU flagged every
            // problem - confirm that SlimCPU.Compute accepts empty input.
            var cpuResults = slimCPU.Compute(listOfCPUProblems.ToArray(), slimCPU.GetBestParallelism());

            for (int i = 0; i < listOfCPUProblems.Count; i++)
            {
                results[cpuProblemIndex[i]] = cpuResults[i];
            }

            if (cpuResults.Any(result => result.isSynchronizable && result.shortestSynchronizingWordLength > maximumPermissibleWordLength))
            {
                throw new Exception("Cerny conjecture is false");
            }

#if (benchmark)
            results[0].benchmarkResult = new BenchmarkResult
            {
                benchmarkedTime = benchmarkTiming.Elapsed,
                totalTime       = totalTiming.Elapsed
            };
#endif
            return(results);
        }
 /// <summary>
 /// Calls <c>Free()</c> and then releases <c>rneuronsCacheArr</c> through <c>Gpu.Free</c>.
 /// Declared virtual, so derived classes can override it.
 /// </summary>
 protected virtual void FreeCache()
 {
     Free();
     // presumably a GPU-allocated cache buffer owned by this instance - confirm against its allocation site
     Gpu.Free(rneuronsCacheArr);
 }