/// <summary>
/// Local side of a shared-memory ping-pong latency benchmark.
/// </summary>
/// <remarks>
/// Two 4-byte flags ("A" and "B") are allocated in the shared memory region. Each round
/// writes 1 to flag A, spins until flag B reads 2, and then resets flag B to 1. The
/// first batch of rounds is an untimed warm-up; the second batch is timed and the
/// average time per round is printed in microseconds. <c>RunRemote</c> in this file
/// services 100 rounds, which matches the two batches of 50 performed here.
/// </remarks>
public static unsafe void RunLocal()
        {
            // Number of rounds in each batch (warm-up and measured).
            const int Rounds = 50;

            var  mem  = new SharedMemory(_channel.GetSubChannel("mem"));
            int* ptr1 = (int*)mem.Allocate(_channel.GetSubChannel("A"), 4, 4).Pointer;
            int* ptr2 = (int*)mem.Allocate(_channel.GetSubChannel("B"), 4, 4).Pointer;

            Func<bool> func = () => Volatile.Read(ref *ptr2) == 2;

            // Wait until flag B has been set to 1 before starting.
            // NOTE(review): the meaning of the second FastSpinUntil argument (2000) and of
            // its return value is not visible here; a timeout would currently go unnoticed.
            FastSpinUntil(() => Volatile.Read(ref *ptr2) == 1, 2000);

            // Warm-up rounds (not timed).
            for (int i = 0; i < Rounds; ++i)
            {
                Volatile.Write(ref *ptr1, 1);
                FastSpinUntil(func, 2000);
                Volatile.Write(ref *ptr2, 1);
            }

            var clock = Stopwatch.StartNew();

            // Measured rounds.
            for (int i = 0; i < Rounds; ++i)
            {
                Volatile.Write(ref *ptr1, 1);
                FastSpinUntil(func, 2000);
                Volatile.Write(ref *ptr2, 1);
            }

            clock.Stop();

            // Stopwatch.ElapsedTicks counts Stopwatch.Frequency units, which are not
            // TimeSpan ticks. Going through Elapsed (a TimeSpan) gives a correct wall-clock
            // duration regardless of the timer frequency.
            double totalMicroseconds = clock.Elapsed.TotalMilliseconds * 1000.0;

            Console.WriteLine("Average round-trip delay: {0} us", totalMicroseconds / Rounds);
        }
Esempio n. 2
0
        /// <summary>
        /// Performs an inclusive scan over the values contributed by all participating
        /// threads and returns the shared-memory view that holds the scanned values.
        /// </summary>
        /// <typeparam name="T">The element type.</typeparam>
        /// <typeparam name="TScanOperation">The scan operation to apply.</typeparam>
        /// <typeparam name="TImpl">Provides thread index, dimension and barrier.</typeparam>
        /// <param name="value">The value contributed by the current thread.</param>
        /// <returns>The shared-memory view containing the scan result.</returns>
        private static ArrayView<T> InclusiveScanImplementation<T, TScanOperation, TImpl>(
            T value)
            where T : unmanaged
            where TScanOperation : struct, IScanReduceOperation<T>
            where TImpl : struct, IILFunctionImplementation
        {
            TImpl impl = default;

            // One shared-memory slot per possible thread.
            var scanBuffer = SharedMemory.Allocate<T>(impl.MaxNumThreads);

            Debug.Assert(
                impl.ThreadDimension <= impl.MaxNumThreads,
                "Invalid group/warp size");

            // Every thread publishes its own value, then all threads synchronize.
            scanBuffer[impl.ThreadIndex] = value;
            impl.Barrier();

            // The first thread folds the slots sequentially from left to right.
            if (impl.IsFirstThread)
            {
                TScanOperation op = default;
                int slot = 1;
                while (slot < impl.ThreadDimension)
                {
                    scanBuffer[slot] = op.Apply(scanBuffer[slot - 1], scanBuffer[slot]);
                    ++slot;
                }
            }

            // Make the folded values visible to every thread before returning.
            impl.Barrier();

            return scanBuffer;
        }
Esempio n. 3
0
        /// <summary>
        /// Allocates 200 blocks of 16 32-bit words each from a shared memory region,
        /// checks that every word initially reads zero, fills each block with a
        /// block-specific pattern, and finally verifies that all patterns are intact.
        /// </summary>
        public void RunTest()
        {
            const int firstId       = 100;
            const int wordsPerBlock = 16;
            const int blockCount    = 200;

            var memory = new SharedMemory(Channel.FromHash("Test"));
            var blocks = new List<IntPtr>();

            // Phase 1: allocate every block up front.
            for (int block = 0; block < blockCount; ++block)
            {
                blocks.Add(memory.Allocate(firstId + block, 4 * wordsPerBlock, 4).Pointer);
            }

            // Phase 2: each word must read zero before it is overwritten.
            for (int block = 0; block < blockCount; ++block)
            {
                IntPtr start = blocks[block];

                for (int word = 0; word < wordsPerBlock; ++word)
                {
                    int byteOffset = word * 4;

                    Assert.AreEqual(0, Marshal.ReadInt32(start, byteOffset));
                    Marshal.WriteInt32(start, byteOffset, block + word);
                }
            }

            // Phase 3: read everything back once all blocks have been written.
            for (int block = 0; block < blockCount; ++block)
            {
                IntPtr start = blocks[block];

                for (int word = 0; word < wordsPerBlock; ++word)
                {
                    Assert.AreEqual(block + word, Marshal.ReadInt32(start, word * 4));
                }
            }
        }
        /// <summary>
        /// Remote side of the shared-memory ping-pong: sets flag B to 1, then for 100
        /// rounds spins until flag A reads 1, clears flag A and sets flag B to 2.
        /// </summary>
        public unsafe static void RunRemote()
        {
            var  region = new SharedMemory(_channel.GetSubChannel("mem"));
            int* flagA  = (int*)region.Allocate(_channel.GetSubChannel("A"), 4, 4).Pointer;
            int* flagB  = (int*)region.Allocate(_channel.GetSubChannel("B"), 4, 4).Pointer;

            Func<bool> flagAIsSet = () => Volatile.Read(ref *flagA) == 1;

            // Initial signal on flag B.
            Volatile.Write(ref *flagB, 1);

            int round = 0;
            while (round < 100)
            {
                FastSpinUntil(flagAIsSet, 2000);

                // Clear flag A first, then answer on flag B.
                Volatile.Write(ref *flagA, 0);
                Volatile.Write(ref *flagB, 2);

                ++round;
            }
        }
Esempio n. 5
0
        /// <summary>
        /// Kernel that accumulates, over <c>c</c> coordinate components, the squared
        /// differences between points of an "I" tile and points of a "J" tile, and stores
        /// the sums into <paramref name="mSquaredDistances"/>.
        /// </summary>
        /// <param name="mSquaredDistances">
        /// Output view; written through an <c>IlReal2</c> reinterpretation, i.e. two
        /// results per store.
        /// </param>
        /// <param name="mCoordinates">
        /// Input coordinates, read as <c>mCoordinates[k * n + pointIndex]</c> for
        /// component <c>k</c>.
        /// </param>
        /// <param name="dimX">Tile width; also the multiplier applied to the grid indices.</param>
        /// <param name="c">Number of coordinate components iterated per point.</param>
        /// <param name="n">Upper bound for point indices (presumably the number of points; confirm with the caller).</param>
        private static void IlGpuKernelLocalMemory(
            ArrayView2D <Real> mSquaredDistances,
            ArrayView <Real> mCoordinates,
            SpecializedValue <int> dimX,
            SpecializedValue <int> c,
            int n)
        {
            // Same as KernelConstants, but use both local and shared memory to increase the effective shared memory.

            // "I" tile coordinates live in shared memory (c * dimX values); the "J"
            // coordinates of this thread are kept in a per-thread array of c pairs.
            var coordinatesI = SharedMemory.Allocate <Real>(c * dimX);
            var coordinatesJ = new IlReal2[c.Value];

            // Base point indices of the I and J tiles handled by this group.
            var bI       = Grid.IdxY * dimX;
            var bJ       = Grid.IdxX * dimX;
            // The thread index is split into a row selector (line) and a pair index (tid).
            // NOTE(review): assumes the group has dimX threads so that line is 0 or 1 - TODO confirm.
            var line     = Group.IdxX / (dimX / 2);
            var tid      = Group.IdxX % (dimX / 2);
            // Each active thread covers the two J points bJ + 2 * tid and bJ + 2 * tid + 1.
            var isActive = bJ + tid * 2 < n;

            for (int k = 0; k != c.Value; ++k)
            {
                // Cooperative load of component k of the I tile, one value per thread.
                if (bI + Group.IdxX < n)
                {
                    coordinatesI[k * dimX + Group.IdxX] = mCoordinates[k * n + bI + Group.IdxX];
                }

                if (isActive)
                {
                    // Load two adjacent J values at once through the IlReal2 view.
                    // NOTE(review): (k * n + bJ) / 2 is exact only if k * n + bJ is even - TODO confirm.
                    var mCoordinates2 = mCoordinates.Cast <IlReal2>();
                    coordinatesJ[k] = mCoordinates2[(k * n + bJ) / 2 + tid];
                }
            }

            // All shared-memory loads must be complete before any thread reads them.
            Group.Barrier();

            if (isActive)
            {
                // Threads with different "line" values start on different rows and step by 2.
                for (int i = line; i < dimX && bI + i < n; i += 2)
                {
                    var dist = default(IlReal2);

                    // Sum the squared differences over all components for both J points.
                    for (int k = 0; k != c.Value; ++k)
                    {
                        var coord1 = coordinatesI[k * dimX + i];
                        var coord2 = coordinatesJ[k];
                        var diff   = new IlReal2(coord1 - coord2.X, coord1 - coord2.Y);

                        dist += diff * diff;
                    }

                    // Store both results with a single IlReal2 write.
                    var dst = mSquaredDistances.Cast <IlReal2>();
                    dst[bJ / 2 + tid, bI + i] = dist;
                }
            }
        }
Esempio n. 6
0
        /// <summary>
        /// Reduces the given value across the whole group and returns the reduced
        /// result to every thread.
        /// </summary>
        /// <typeparam name="T">The element type.</typeparam>
        /// <typeparam name="TReduction">The reduction operation.</typeparam>
        /// <param name="value">The value contributed by the current thread.</param>
        /// <returns>The combined value of all shared-memory banks.</returns>
        public static T AllReduce<T, TReduction>(T value)
            where T : unmanaged
            where TReduction : IScanReduceOperation<T>
        {
            // The atomic updates in shared memory are spread over a fixed number of
            // banks. The final combination below is unrolled for exactly this count.
            const int BankCount = 4;
            var banks = SharedMemory.Allocate<T>(BankCount);

            var warp = Warp.ComputeWarpIdx(Group.IdxX);
            var lane = Warp.LaneIdx;

            TReduction reducer = default;

            // The first warp resets every bank to the identity element.
            if (warp == 0)
            {
                int bank = lane;
                while (bank < BankCount)
                {
                    banks[bank] = reducer.Identity;
                    bank += Warp.WarpSize;
                }
            }
            Group.Barrier();

            // Reduce within each warp, then let lane 0 merge the warp result into a bank.
            value = PTXWarpExtensions.Reduce<T, TReduction>(value);
            if (lane == 0)
            {
                reducer.AtomicApply(ref banks[warp % BankCount], value);
            }
            Group.Barrier();

            // Explicitly unrolled combination of the four banks (see BankCount).
            var combined = reducer.Apply(
                reducer.Apply(
                    reducer.Apply(banks[0], banks[1]),
                    banks[2]),
                banks[3]);
            Group.Barrier();

            return combined;
        }
Esempio n. 7
0
        private static T ComputeScan <T, TScanOperation, TScanImplementation>(
            T value,
            out ArrayView <T> sharedMemory)
            where T : unmanaged
            where TScanOperation : struct, IScanReduceOperation <T>
            where TScanImplementation : struct, IScanImplementation <T, TScanOperation>
        {
            const int SharedMemoryLength = 32;

            sharedMemory = SharedMemory.Allocate <T>(SharedMemoryLength);

            int warpIdx = Warp.WarpIdx;

            TScanOperation scanOperation = default;

            // Initialize
            if (Group.DimX / Warp.WarpSize < SharedMemoryLength)
            {
                if (warpIdx < 1)
                {
                    sharedMemory[Group.IdxX] = scanOperation.Identity;
                }
                Group.Barrier();
            }

            TScanImplementation scanImplementation = default;
            var scannedValue = scanImplementation.Scan(value);

            if (Warp.IsLastLane)
            {
                sharedMemory[warpIdx] = scanImplementation.ScanRightBoundary(
                    scannedValue,
                    value);
            }
            Group.Barrier();

            // Reduce results again in the first warp
            if (warpIdx < 1)
            {
                ref T sharedBoundary = ref sharedMemory[Group.IdxX];
                sharedBoundary = PTXWarpExtensions.InclusiveScan <T, TScanOperation>(
                    sharedBoundary);
            }
Esempio n. 8
0
        /// <summary>
        /// Reduces the given value across all threads that share a reduce segment and
        /// returns that segment's result to each of them.
        /// </summary>
        /// <typeparam name="T">The element type.</typeparam>
        /// <typeparam name="TReduction">The reduction operation.</typeparam>
        /// <typeparam name="TImpl">Provides segment layout, first-thread test and barrier.</typeparam>
        /// <param name="value">The value contributed by the current thread.</param>
        /// <returns>The reduced value stored in the current thread's segment.</returns>
        public static T AllReduce<T, TReduction, TImpl>(T value)
            where T : unmanaged
            where TReduction : IScanReduceOperation<T>
            where TImpl : struct, IILFunctionImplementation
        {
            TImpl      impl    = default;
            TReduction reducer = default;

            // One shared-memory slot per reduce segment.
            var segments = SharedMemory.Allocate<T>(impl.ReduceSegments);

            // The first thread seeds its segment with the identity element.
            if (impl.IsFirstThread)
            {
                segments[impl.ReduceSegmentIndex] = reducer.Identity;
            }
            impl.Barrier();

            // Every thread merges its value into its segment atomically.
            reducer.AtomicApply(ref segments[impl.ReduceSegmentIndex], value);
            impl.Barrier();

            return segments[impl.ReduceSegmentIndex];
        }
Esempio n. 9
0
        /// <summary>
        /// The actual unique kernel implementation.
        /// </summary>
        /// <remarks>
        /// Each group waits on the sequential group executor, flags in shared memory
        /// which elements of its tile differ from their predecessor, lets its first
        /// thread copy the flagged elements to the front of <paramref name="input"/>,
        /// stores the running write position in <c>output[0]</c>, and finally releases
        /// the executor.
        /// </remarks>
        /// <typeparam name="T">The element type.</typeparam>
        /// <typeparam name="TComparisonOperation">The comparison operation.</typeparam>
        /// <param name="input">The input view.</param>
        /// <param name="output">The output view to store the new length.</param>
        /// <param name="sequentialGroupExecutor">
        /// The sequential group executor to use.
        /// </param>
        /// <param name="tileSize">The tile size.</param>
        /// <param name="numIterationsPerGroup">
        /// The number of iterations per group.
        /// </param>
        internal static void UniqueKernel <T, TComparisonOperation>(
            ArrayView <T> input,
            ArrayView <long> output,
            SequentialGroupExecutor sequentialGroupExecutor,
            SpecializedValue <int> tileSize,
            Index1D numIterationsPerGroup)
            where T : unmanaged
            where TComparisonOperation : struct, IComparisonOperation <T>
        {
            TComparisonOperation comparison = default;
            var isFirstGrid = Grid.IdxX == 0;
            var tileInfo    = new TileInfo(input.IntLength, numIterationsPerGroup);

            // Sync groups and wait for the current one to become active
            sequentialGroupExecutor.Wait();

            // One flag per tile element: true means "keep this element".
            var temp     = SharedMemory.Allocate <bool>(tileSize);
            var startIdx = Grid.ComputeGlobalIndex(Grid.IdxX, 0);

            for (
                int i = tileInfo.StartIndex;
                i < tileInfo.MaxLength;
                i += Group.DimX)
            {
                if (Group.IsFirstThread && i == tileInfo.StartIndex && isFirstGrid)
                {
                    // Tile start of the first grid index has no predecessor: always kept.
                    temp[0] = true;
                }
                else
                {
                    var currIdx = i;
                    // At the start of a later tile, compare against the element at
                    // output[0] - 1 (output[0] is the write position stored below);
                    // otherwise compare against the direct predecessor.
                    var prevIdx = Group.IsFirstThread && i == tileInfo.StartIndex
                        ? output[0] - 1
                        : currIdx - 1;

                    // Keep the element only if it differs from its predecessor.
                    temp[currIdx - startIdx] =
                        comparison.Compare(input[currIdx], input[prevIdx]) != 0;
                }
            }
            // All flags must be written before the first thread reads them.
            Group.Barrier();

            if (Group.IsFirstThread)
            {
                // Continue writing where the previous groups stopped (0 for the first one).
                var offset    = isFirstGrid ? 0 : output[0];
                // Number of flags to inspect, clamped to the end of the tile.
                var maxLength =
                    XMath.Min(startIdx + temp.IntLength, tileInfo.MaxLength) - startIdx;

                // Sequentially copy the kept elements to the front of the input view.
                for (var i = 0; i < maxLength; i++)
                {
                    if (temp[i])
                    {
                        input[offset++] = input[startIdx + i];
                    }
                }
                // Publish the new write position for the following groups.
                output[0] = offset;
            }

            // Fence and barrier before handing over to the next group.
            MemoryFence.DeviceLevel();
            Group.Barrier();
            sequentialGroupExecutor.Release();
        }
        /// <summary>
        /// Performs the first radix-sort pass.
        /// </summary>
        /// <typeparam name="T">The element type.</typeparam>
        /// <typeparam name="TOperation">The radix-sort operation.</typeparam>
        /// <typeparam name="TSpecialization">The specialization type.</typeparam>
        /// <param name="view">The input view to use.</param>
        /// <param name="counter">The global counter view.</param>
        /// <param name="groupSize">The number of threads in the group.</param>
        /// <param name="numGroups">The number of virtually launched groups.</param>
        /// <param name="paddedLength">The padded length of the input view.</param>
        /// <param name="shift">The bit shift to use.</param>
        internal static void RadixSortKernel1 <T, TOperation, TSpecialization>(
            ArrayView <T> view,
            ArrayView <int> counter,
            SpecializedValue <int> groupSize,
            int numGroups,
            int paddedLength,
            int shift)
            where T : unmanaged
            where TOperation : struct, IRadixSortOperation <T>
            where TSpecialization : struct, IRadixSortSpecialization
        {
            TSpecialization specialization = default;
            var             scanMemory     = SharedMemory.Allocate <int>(
                groupSize * specialization.UnrollFactor);

            int gridIdx = Grid.IdxX;

            for (
                int i = Grid.GlobalIndex.X;
                i < paddedLength;
                i += GridExtensions.GridStrideLoopStride)
            {
                bool inRange = i < view.Length;

                // Read value from global memory
                TOperation operation = default;
                T          value     = operation.DefaultValue;
                if (inRange)
                {
                    value = view[i];
                }
                var bits = operation.ExtractRadixBits(
                    value,
                    shift,
                    specialization.UnrollFactor - 1);

                for (int j = 0; j < specialization.UnrollFactor; ++j)
                {
                    scanMemory[Group.IdxX + groupSize * j] = 0;
                }
                if (inRange)
                {
                    scanMemory[Group.IdxX + groupSize * bits] = 1;
                }
                Group.Barrier();

                for (int j = 0; j < specialization.UnrollFactor; ++j)
                {
                    var address = Group.IdxX + groupSize * j;
                    scanMemory[address] =
                        GroupExtensions.ExclusiveScan <int, AddInt32>(scanMemory[address]);
                }
                Group.Barrier();

                if (Group.IdxX == Group.DimX - 1)
                {
                    // Write counters to global memory
                    for (int j = 0; j < specialization.UnrollFactor; ++j)
                    {
                        ref var newOffset = ref scanMemory[Group.IdxX + groupSize * j];
                        newOffset += Utilities.Select(inRange & j == bits, 1, 0);
                        counter[j * numGroups + gridIdx] = newOffset;
                    }
                }
                Group.Barrier();

                var    gridSize = gridIdx * Group.DimX;
                Index1 pos      = gridSize + scanMemory[Group.IdxX + groupSize * bits] -
                                  Utilities.Select(inRange & Group.IdxX == Group.DimX - 1, 1, 0);
                for (int j = 1; j <= bits; ++j)
                {
                    pos += scanMemory[groupSize * j - 1] +
                           Utilities.Select(j - 1 == bits, 1, 0);
                }

                // Pre-sort the current value into the corresponding segment
                if (inRange)
                {
                    view[pos] = value;
                }
                Group.Barrier();

                gridIdx += Grid.DimX;
            }
Esempio n. 11
0
        /// <summary>
        /// Experimental horizon kernel. For each target sample in patch row
        /// <c>index.GridIdx</c>, the sample's 1440-entry horizon is loaded into shared
        /// memory, every azimuth bin is raised to the largest slope seen towards the
        /// patch points visited by this group, and the horizon is written back.
        /// </summary>
        /// <param name="index">
        /// The grid index selects the target line; the group index selects the sample
        /// column of the other point.
        /// </param>
        /// <param name="heights">
        /// Patch heights indexed as <c>line * 128 + sample</c>. Values are scaled by 0.5
        /// and divided by 1000 before being added to <c>MoonRadius</c> (the units are not
        /// visible in this method).
        /// </param>
        /// <param name="matrices">
        /// Twelve floats per target point, read as the x, y and z columns of an affine
        /// transform into the target's local frame.
        /// </param>
        /// <param name="horizon">1440 floats per target point; read and updated in place.</param>
        /// <param name="a_line">
        /// Offset added to the patch-local line (presumably the patch origin; confirm
        /// with the caller).
        /// </param>
        /// <param name="a_sample">
        /// Offset added to the patch-local sample (presumably the patch origin; confirm
        /// with the caller).
        /// </param>
        static void Test3Kernel(
            GroupedIndex index,
            ArrayView<short> heights,
            ArrayView<float> matrices,
            ArrayView<float> horizon,
            int a_line,
            int a_sample)
        {
            var       idx        = index.GroupIdx;
            const int patch_size = 128;

            ArrayView<float> horizon_shared = SharedMemory.Allocate<float>(1440);

            // The target line is fixed per group; the target sample is iterated below.
            var aline = index.GridIdx;

            for (var asample = 0; asample < patch_size; asample++)
            {
                // Copy horizon for a[line,sample] into shared memory. Every thread
                // copies the entries idx, idx + dim, idx + 2 * dim, ...
                {
                    var dim    = Group.Dimension.X;
                    var len    = horizon_shared.Length;
                    var passes = (len + (dim - 1)) / dim;
                    var offset = (aline * patch_size + asample) * len;
                    for (var pass = 0; pass < passes; pass++)
                    {
                        var ptr = pass * dim + idx;
                        if (ptr < len)
                        {
                            horizon_shared[ptr] = horizon[ptr + offset];
                        }
                        // Note warp divergence
                    }
                }

                Group.Barrier();

                // Copy the matrix into registers
                var pos = (aline * patch_size + asample) * 12;

                var row0x = matrices[pos++];
                var row1x = matrices[pos++];
                var row2x = matrices[pos++];
                var row3x = matrices[pos++];

                var row0y = matrices[pos++];
                var row1y = matrices[pos++];
                var row2y = matrices[pos++];
                var row3y = matrices[pos++];

                var row0z = matrices[pos++];
                var row1z = matrices[pos++];
                var row2z = matrices[pos++];
                var row3z = matrices[pos];

                for (var oline = 0; oline < patch_size; oline++)
                {
                    // The other point is (oline, idx): osample = idx.
                    // The previous revision indexed with the target line (aline) here,
                    // so the loop variable was unused and all iterations processed the
                    // same point.
                    var relz   = 0.5d * heights[oline * patch_size + idx];
                    var radius = MoonRadius + relz / 1000d;
                    var line   = a_line + oline;
                    var sample = a_sample + idx;

                    // Map (line, sample) to latitude / longitude.
                    var map_x     = (sample - S0) * Scale;
                    var map_y     = (L0 - line) * Scale;
                    var P         = Math.Sqrt(map_x * map_x + map_y * map_y);
                    var C         = 2d * Math.Atan2(P, 2 * MoonRadius);
                    var latitude  = Math.Asin(Math.Cos(C) * Math.Sin(LatP) + map_y * Math.Sin(C) * Math.Cos(LatP) / P);
                    var longitude = LonP + Math.Atan2(map_x, map_y * LonFactor);

                    // Calculate the other point in ME frame
                    var z_me = radius * Math.Sin(latitude);
                    var c    = radius * Math.Cos(latitude);
                    var x_me = c * Math.Cos(longitude);
                    var y_me = c * Math.Sin(longitude);

                    // Transform the point to the local frame
                    var x = x_me * row0x + y_me * row1x + z_me * row2x + row3x;
                    var y = x_me * row0y + y_me * row1y + z_me * row2y + row3y;
                    var z = x_me * row0z + y_me * row1z + z_me * row2z + row3z;

                    var azimuth = Math.Atan2(y, x) + Math.PI;  // [0,2 PI]
                    var alen    = Math.Sqrt(x * x + y * y);
                    var slope   = z / alen;
                    var slopef  = (float)slope;

                    // Map the azimuth onto one of the 1440 bins and keep the largest slope.
                    var horizon_index = (int)(0.5d + 1439 * (azimuth / (2d * Math.PI)));
                    Atomic.Max(ref horizon_shared[horizon_index], slopef);
                }

                // All atomic updates must be finished before the horizon is written back.
                Group.Barrier();

                // Write the updated horizon back, using the same per-thread entries as
                // the load above.
                {
                    var dim    = Group.Dimension.X;
                    var len    = horizon_shared.Length;
                    var passes = (len + (dim - 1)) / dim;
                    var offset = (aline * patch_size + asample) * len;
                    for (var pass = 0; pass < passes; pass++)
                    {
                        var ptr = pass * dim + idx;
                        if (ptr < len)
                        {
                            horizon[ptr + offset] = horizon_shared[ptr];
                        }
                        // Note warp divergence
                    }
                }
            }
        }
Esempio n. 12
0
        /// <summary>
        /// Updates the 1440-entry horizon of one target point with the slopes towards
        /// the points of a caster patch.
        /// </summary>
        /// <remarks>
        /// The grid index selects the target point (line = Y, sample = X). The group Y
        /// index selects the caster line, and every thread iterates over all caster
        /// samples of that line. Slopes are clamped to [-2, 2], divided by 4, scaled by
        /// 1,000,000 and truncated to an integer before being merged into the horizon
        /// with an atomic maximum.
        /// </remarks>
        /// <param name="index">The grouped 2D index of the current thread.</param>
        /// <param name="points">Caster points, three floats (x, y, z) per point.</param>
        /// <param name="matrices">
        /// Twelve floats per target point, read as the x, y and z columns of an affine
        /// transform into the target's local frame.
        /// </param>
        /// <param name="horizon">1440 integers per target point; read and updated in place.</param>
        /// <param name="test_array">
        /// Debug output; receives twelve intermediate values from the thread that
        /// handles the first caster point of the first target point.
        /// </param>
        /// <param name="observer_height_in_km">Height subtracted from the local z coordinate.</param>
        static void FarFieldKernel1(
            GroupedIndex2 index,
            ArrayView<float> points,
            ArrayView<float> matrices,
            ArrayView<int> horizon,
            ArrayView<float> test_array,
            float observer_height_in_km)
        {
            var target_line   = index.GridIdx.Y;
            var target_sample = index.GridIdx.X;
            var caster_line   = index.GroupIdx.Y;

            // The kernel distributes work over the group's Y dimension only, so the group
            // must be exactly one thread wide. The previous check (GroupIdx.X == 1) could
            // never hold for such a group because thread indices start at 0.
            Debug.Assert(Group.Dimension.X == 1);

            ArrayView<int> horizon_shared = SharedMemory.Allocate<int>(1440);

            // Copy horizon for a[target_line,target_sample] into shared memory. Every
            // thread copies the entries caster_line, caster_line + dim, ...
            {
                var dim    = Group.Dimension.Y;
                var len    = horizon_shared.Length;
                var passes = (len + (dim - 1)) / dim;
                var offset = (target_line * TerrainPatch.DefaultSize + target_sample) * len;
                for (var pass = 0; pass < passes; pass++)
                {
                    var ptr = pass * dim + caster_line;
                    if (ptr < len)  // divergence
                    {
                        horizon_shared[ptr] = horizon[ptr + offset];
                    }
                }
            }

            Group.Barrier();

            // Copy the matrix into registers
            var pos = (target_line * TerrainPatch.DefaultSize + target_sample) * 12;

            var row0x = matrices[pos++];
            var row1x = matrices[pos++];
            var row2x = matrices[pos++];
            var row3x = matrices[pos++];

            var row0y = matrices[pos++];
            var row1y = matrices[pos++];
            var row2y = matrices[pos++];
            var row3y = matrices[pos++];

            var row0z = matrices[pos++];
            var row1z = matrices[pos++];
            var row2z = matrices[pos++];
            var row3z = matrices[pos];

            for (var caster_sample = 0; caster_sample < TerrainPatch.DefaultSize; caster_sample++)
            {
                // Fetch the other point in local frame
                var points_offset = (caster_line * TerrainPatch.DefaultSize + caster_sample) * 3;
                var x_patch       = points[points_offset];
                var y_patch       = points[points_offset + 1];
                var z_patch       = points[points_offset + 2];

                // Transform the point to the local frame
                var x = x_patch * row0x + y_patch * row1x + z_patch * row2x + row3x;
                var y = x_patch * row0y + y_patch * row1y + z_patch * row2y + row3y;
                var z = x_patch * row0z + y_patch * row1z + z_patch * row2z + row3z;

                // Adjust for solar array height
                z -= observer_height_in_km;

                var azimuth = Atan2(y, x) + XMath.PI;  // [0,2 PI]
                var alen    = XMath.Sqrt(x * x + y * y);
                var slope   = z / alen;

                // Clamp to [-2, 2] and rescale to [-0.5, 0.5] before converting the
                // slope to a fixed-point integer.
                var slopem = slope > 2f ? 2f : slope;
                slopem = slopem < -2f ? -2f : slopem;
                slopem = slopem / 4f;

                var slopei = (int)(slopem * 1000000);

                // Map the azimuth onto one of the 1440 bins and keep the largest slope.
                var horizon_index = (int)(0.5f + 1439 * (azimuth / (2f * XMath.PI)));
                Atomic.Max(ref horizon_shared[horizon_index], slopei);

                // Debug dump of the intermediate values for the very first point pair.
                if (caster_sample == 0 && caster_line == 0 && target_line == 0 && target_sample == 0)
                {
                    test_array[0]  = x_patch;
                    test_array[1]  = y_patch;
                    test_array[2]  = z_patch;
                    test_array[3]  = x;
                    test_array[4]  = y;
                    test_array[5]  = z;
                    test_array[6]  = slope;
                    test_array[7]  = slopem;
                    test_array[8]  = slopei;
                    test_array[9]  = row3x;
                    test_array[10] = row3y;
                    test_array[11] = row3z;
                }
            }

            // All atomic updates must be finished before the horizon is written back.
            Group.Barrier();

            // Write the updated horizon back, using the same per-thread entries as the
            // load above.
            {
                var dim    = Group.Dimension.Y;
                var len    = horizon_shared.Length;
                var passes = (len + (dim - 1)) / dim;
                var offset = (target_line * TerrainPatch.DefaultSize + target_sample) * len;
                for (var pass = 0; pass < passes; pass++)
                {
                    var ptr = pass * dim + caster_line;
                    if (ptr < len)  // divergence
                    {
                        horizon[ptr + offset] = horizon_shared[ptr];
                    }
                }
            }
        }