/// <summary>
/// Keeps feeding <c>data</c> through the POPCNT instruction until cancellation is requested.
/// </summary>
/// <param name="cancellationToken">Polled before every batch; the loop stops once cancellation has been requested.</param>
/// <returns>
/// The number of completed batches (one batch is <c>LENGTH</c> PopCount calls),
/// or 0 when POPCNT is not supported on the current hardware.
/// </returns>
public override ulong Run(CancellationToken cancellationToken)
{
    // Nothing to exercise without hardware support.
    if (!Popcnt.IsSupported)
    {
        return 0uL;
    }

    ulong completedBatches = 0;
    for (; !cancellationToken.IsCancellationRequested; completedBatches++)
    {
        // Each result is fed back in as the next input, chaining the calls together.
        var step = 0;
        while (step < LENGTH)
        {
            data = Popcnt.PopCount(data);
            step++;
        }
    }

    return completedBatches;
}
/// <summary>
/// Returns the population count (number of bits set) of <paramref name="value"/>.
/// Uses the x86 POPCNT instruction when <c>Popcnt.IsSupported</c>, otherwise the Arm64 AdvSimd
/// path (per-byte PopCount followed by AddAcross), and otherwise defers to <c>SoftwareFallback</c>.
/// </summary>
/// <param name="value">The 32-bit value whose set bits are counted.</param>
/// <returns>The number of set bits in <paramref name="value"/>.</returns>
// NOTE(review): the closing brace of this method and the SoftwareFallback helper it calls are not
// present on the line below - confirm they follow in the full file.
public static int PopCount(uint value) { if (Popcnt.IsSupported) { return((int)Popcnt.PopCount(value)); } if (AdvSimd.Arm64.IsSupported) { // PopCount works on vector so convert input value to vector first. // Vector64.CreateScalar(uint) generates suboptimal code by storing and // loading the result to memory. // See https://github.com/dotnet/runtime/issues/35976 for details. // Hence use Vector64.Create(ulong) to create Vector64<ulong> and operate on that. Vector64 <ulong> input = Vector64.Create((ulong)value); Vector64 <byte> aggregated = AdvSimd.Arm64.AddAcross(AdvSimd.PopCount(input.AsByte())); return(AdvSimd.Extract(aggregated, 0)); } return(SoftwareFallback(value));
/// <summary>
/// Scores every flip combination for the given grid and collects the cheapest ones.
/// </summary>
/// <remarks>
/// The score of a combination is its own number of set bits plus the smaller of the two
/// counts obtained from <c>gridCode ^ flipActions[i]</c>: the set bits, or
/// <c>Size * Size</c> minus the set bits.
/// </remarks>
/// <param name="gridCode">Bit-encoded grid to solve.</param>
/// <returns>All combination codes that reach the minimum score, together with that score.</returns>
public (List<int> codes, int score) Solve(int gridCode)
{
    var bestScore = int.MaxValue;
    var bestCodes = new List<int>();

    for (int candidate = 0; candidate < 1 << (Size * Size); candidate++)
    {
        var blackCells = (int)Popcnt.PopCount((uint)(gridCode ^ flipActions[candidate]));
        var candidateScore = (int)Popcnt.PopCount((uint)candidate) + Math.Min(blackCells, Size * Size - blackCells);

        if (candidateScore > bestScore)
        {
            continue;
        }

        if (candidateScore < bestScore)
        {
            // Strictly better: earlier candidates are no longer optimal.
            bestCodes.Clear();
            bestScore = candidateScore;
        }

        bestCodes.Add(candidate);
    }

    return (bestCodes, bestScore);
}
/// <summary>
/// Adds a key/value pair to this node, dispatching on whether the hash slot selected by
/// <paramref name="hash"/> and <paramref name="shift"/> already holds a child node, an inline value, or nothing.
/// </summary>
/// <param name="key">Key to add.</param>
/// <param name="value">Value to associate with the key.</param>
/// <param name="hash">Hash of the key.</param>
/// <param name="shift">Number of hash bits already consumed by ancestor nodes.</param>
/// <returns>The dictionary node produced by the delegated <c>DuplicateWith</c>/<c>Add</c> helper.</returns>
internal override ImmutableDictionary<TKey, TValue> Add(TKey key, TValue value, uint hash, int shift)
{
    var bit = 1U << (int)((hash >> shift) & Mask);
    // All bitmap positions below the selected slot; their popcount is the compact array index.
    var lowerBits = bit - 1;

    var slotHasChildNode = (_bitmapNodes & bit) != 0;
    if (slotHasChildNode)
    {
        var nodeIndex = Popcnt.PopCount(_bitmapNodes & lowerBits);
        return _nodes.DuplicateWith(key, value, hash, nodeIndex, shift, _bitmapNodes, _bitmapValues, _values);
    }

    var slotHasValue = (_bitmapValues & bit) != 0;
    if (!slotHasValue)
    {
        // Slot is empty: the pair is added to the values array.
        var insertAt = (uint)Popcnt.PopCount(_bitmapValues & lowerBits);
        return _values.Add(key, value, _bitmapNodes, _nodes, _bitmapValues | bit, insertAt);
    }

    // Slot already holds a value: the slot moves from the values bitmap to the nodes bitmap.
    // TODO collisions and same value
    var index = Popcnt.PopCount(_bitmapNodes & lowerBits);
    var indexValues = Popcnt.PopCount(_bitmapValues & lowerBits);
    return _nodes.Add(key, value, hash, (uint)index, (uint)indexValues, shift, _bitmapNodes | bit, _bitmapValues ^ bit, _values);
}
/// <summary>
/// Counts the bits set in <paramref name="v"/>.
/// </summary>
/// <remarks>
/// Unless <c>NO_X86_INSTRINSICS</c> is defined, the 64-bit POPCNT instruction is preferred,
/// then two 32-bit POPCNT calls on the low and high halves. Otherwise (or when neither is
/// supported) a branch-free bit-twiddling fallback is used.
/// </remarks>
/// <param name="v">The 64-bit value to inspect.</param>
/// <returns>The number of set bits, in the range 0..64.</returns>
internal static int GetHammingDistanceCore(ulong v)
{
#if !NO_X86_INSTRINSICS
    if (Popcnt.X64.IsSupported)
    {
        return unchecked((int)Popcnt.X64.PopCount(v));
    }

    if (Popcnt.IsSupported)
    {
        var lowHalf = Popcnt.PopCount(unchecked((uint)v));
        var highHalf = Popcnt.PopCount(unchecked((uint)(v >> 32)));
        return unchecked((int)(lowHalf + highHalf));
    }
#endif

    unchecked
    {
        const ulong PairMask = 0x5555555555555555UL;
        const ulong NibbleMask = 0x3333333333333333UL;
        const ulong ByteMask = 0xF0F0F0F0F0F0F0FUL;
        const ulong ByteSum = 0x101010101010101UL;

        // Fold into 2-bit, then 4-bit, then 8-bit partial counts.
        var pairs = v - ((v >> 1) & PairMask);
        var nibbles = (pairs & NibbleMask) + ((pairs >> 2) & NibbleMask);
        var bytes = (nibbles + (nibbles >> 4)) & ByteMask;

        // The multiplication accumulates all byte counts into the top byte.
        return (int)((bytes * ByteSum) >> 56);
    }
}
/// <summary>
/// Adds a key/value pair below this bitmap node, returning a new node and leaving this one untouched.
/// </summary>
/// <remarks>
/// The child array is compact: the child for a bitmap bit lives at the index equal to the
/// number of set bits below it, i.e. <c>PopCount(_bitmap &amp; (bit - 1))</c>. The previous
/// expression, <c>PopCount((_bitmap &gt;&gt; (int)bit) &amp; Mask)</c>, shifted by the bit *value*
/// and therefore did not yield that rank; it could select the wrong child or index past the array.
/// NOTE(review): any lookup/remove code on this node type must compute the index the same way - confirm.
/// </remarks>
/// <param name="key">Key to add.</param>
/// <param name="value">Value to associate with the key.</param>
/// <param name="hash">Hash of the key.</param>
/// <param name="shift">Number of hash bits already consumed by ancestor nodes.</param>
/// <returns>A new <c>BitMapNode</c> containing the added entry.</returns>
internal override ImmutableDictionary<TKey, TValue> Add(TKey key, TValue value, uint hash, int shift)
{
    var bit = 1U << (int)((hash >> shift) & Mask);

    // Rank of the slot: how many occupied slots precede it in the bitmap.
    var index = (int)Popcnt.PopCount(_bitmap & (bit - 1));

    if ((_bitmap & bit) != 0)
    {
        // Slot occupied: copy the array and replace the child with the result of adding one level deeper.
        var newNodes = new ImmutableDictionary<TKey, TValue>[_nodes.Length];
        Array.Copy(_nodes, newNodes, _nodes.Length);
        newNodes[index] = _nodes[index].Add(key, value, hash, shift + Shift);
        return new BitMapNode<TKey, TValue>(_bitmap, newNodes);
    }
    else
    {
        // Slot free: open a gap at the rank position and insert a new leaf there.
        var newNodes = new ImmutableDictionary<TKey, TValue>[_nodes.Length + 1];
        Array.Copy(_nodes, newNodes, index);
        Array.Copy(_nodes, index, newNodes, index + 1, _nodes.Length - index);
        newNodes[index] = new KeyValueNode<TKey, TValue>(key, value, hash);
        return new BitMapNode<TKey, TValue>(_bitmap | bit, newNodes);
    }
}
/// <summary>
/// Returns the number of bits set in <paramref name="value"/>.
/// </summary>
/// <remarks>
/// Uses the POPCNT instruction when the hardware supports it and a branch-free
/// software routine otherwise.
/// </remarks>
/// <param name="value">The 32-bit value to inspect.</param>
/// <returns>The number of set bits, in the range 0..32.</returns>
public static int PopCount(uint value)
{
    return Popcnt.IsSupported
        ? (int)Popcnt.PopCount(value)
        : SoftwareFallback(value);

    int SoftwareFallback(uint bits)
    {
        const uint PairMask = 0x55555555u;
        const uint NibbleMask = 0x33333333u;
        const uint ByteMask = 0x0F0F0F0Fu;
        const uint ByteSum = 0x01010101u;

        // Fold into 2-bit, then 4-bit, then 8-bit partial counts.
        uint pairs = bits - ((bits >> 1) & PairMask);
        uint nibbles = (pairs & NibbleMask) + ((pairs >> 2) & NibbleMask);
        uint bytes = (nibbles + (nibbles >> 4)) & ByteMask;

        // The multiplication sums the four byte counts into the top byte.
        return (int)((bytes * ByteSum) >> 24);
    }
}
/// <summary>
/// Reads <c>n</c>, <c>k</c> and an array of <c>n</c> integers, then tries every subset of
/// positions with exactly <c>k</c> selected entries and writes the smallest total increase found.
/// </summary>
/// <remarks>
/// Within one subset the array is scanned left to right while tracking <c>last</c>.
/// A selected element that is not above <c>last</c> costs <c>last - a[i] + 1</c> and
/// bumps <c>last</c> by one; afterwards <c>last.ChangeMax(a[i])</c> is applied for every element
/// (presumably raising <c>last</c> to at least <c>a[i]</c> - verify against the helper).
/// </remarks>
/// <param name="io">Source of the input values and sink for the answer.</param>
public override void Solve(IOManager io)
{
    var n = io.ReadInt();
    var k = io.ReadInt();
    var a = io.ReadIntArray(n);

    long min = long.MaxValue;

    for (var flag = BitSet.Zero; flag < (1 << n); flag++)
    {
        // Only subsets with exactly k selected positions are candidates.
        if (Popcnt.PopCount(flag) != k)
        {
            continue;
        }

        int last = 0;
        long total = 0;

        for (int i = 0; i < a.Length; i++)
        {
            if (flag[i])
            {
                if (last >= a[i])
                {
                    total += last - a[i] + 1;
                    last++;
                }
            }

            last.ChangeMax(a[i]);
        }

        min.ChangeMin(total);
    }

    io.WriteLine(min);
}
/// <summary>
/// Test entry point: verifies that the IsSupported state of each x86 hardware intrinsic class
/// matches what the build flavor is expected to produce.
/// </summary>
/// <remarks>
/// The expected states (true / null / false, as explained by the comment inside the body) are chosen
/// by the compile-time define BASELINE_INTRINSICS, NON_VEX_INTRINSICS or VEX_INTRINSICS; any other
/// configuration hits the #error directive. The method first throws if Vector.IsHardwareAccelerated
/// or Vector&lt;byte&gt;.Count differ from the expectation, then calls Check once per ISA (and its
/// X64 nested class) with the expected state, a function pointer, the live IsSupported value and an
/// optional smoke-test lambda (null where none is supplied).
/// NOTE(review): Check is defined elsewhere; presumably it clears s_success on a mismatch - confirm.
/// </remarks>
/// <returns>100 when s_success is still true after all checks, otherwise 1.</returns>
static int Main() { s_success = true; // We expect the AOT compiler generated HW intrinsics with the following characteristics: // // * TRUE = IsSupported assumed to be true, no runtime check // * NULL = IsSupported is a runtime check, code should be behind the check or bad things happen // * FALSE = IsSupported assumed to be false, no runtime check, PlatformNotSupportedException if used // // The test is compiled with multiple defines to test this. #if BASELINE_INTRINSICS bool vectorsAccelerated = true; int byteVectorLength = 16; bool?Sse2AndBelow = true; bool?Sse3Group = null; bool?AesLzPcl = null; bool?Sse4142 = null; bool?PopCnt = null; bool?Avx12 = false; bool?FmaBmi12 = false; bool?Avxvnni = false; #elif NON_VEX_INTRINSICS bool vectorsAccelerated = true; int byteVectorLength = 16; bool?Sse2AndBelow = true; bool?Sse3Group = true; bool?AesLzPcl = null; bool?Sse4142 = true; bool?PopCnt = null; bool?Avx12 = false; bool?FmaBmi12 = false; bool?Avxvnni = false; #elif VEX_INTRINSICS bool vectorsAccelerated = true; int byteVectorLength = 32; bool?Sse2AndBelow = true; bool?Sse3Group = true; bool?AesLzPcl = null; bool?Sse4142 = true; bool?PopCnt = null; bool?Avx12 = true; bool?FmaBmi12 = null; bool?Avxvnni = null; #else #error Who dis? 
#endif if (vectorsAccelerated != Vector.IsHardwareAccelerated) { throw new Exception($"Vectors HW acceleration state unexpected - expected {vectorsAccelerated}, got {Vector.IsHardwareAccelerated}"); } if (byteVectorLength != Vector <byte> .Count) { throw new Exception($"Unexpected vector length - expected {byteVectorLength}, got {Vector<byte>.Count}"); } Check("Sse", Sse2AndBelow, &SseIsSupported, Sse.IsSupported, () => Sse.Subtract(Vector128 <float> .Zero, Vector128 <float> .Zero).Equals(Vector128 <float> .Zero)); Check("Sse.X64", Sse2AndBelow, &SseX64IsSupported, Sse.X64.IsSupported, () => Sse.X64.ConvertToInt64WithTruncation(Vector128 <float> .Zero) == 0); Check("Sse2", Sse2AndBelow, &Sse2IsSupported, Sse2.IsSupported, () => Sse2.Extract(Vector128 <ushort> .Zero, 0) == 0); Check("Sse2.X64", Sse2AndBelow, &Sse2X64IsSupported, Sse2.X64.IsSupported, () => Sse2.X64.ConvertToInt64(Vector128 <double> .Zero) == 0); Check("Sse3", Sse3Group, &Sse3IsSupported, Sse3.IsSupported, () => Sse3.MoveHighAndDuplicate(Vector128 <float> .Zero).Equals(Vector128 <float> .Zero)); Check("Sse3.X64", Sse3Group, &Sse3X64IsSupported, Sse3.X64.IsSupported, null); Check("Ssse3", Sse3Group, &Ssse3IsSupported, Ssse3.IsSupported, () => Ssse3.Abs(Vector128 <short> .Zero).Equals(Vector128 <ushort> .Zero)); Check("Ssse3.X64", Sse3Group, &Ssse3X64IsSupported, Ssse3.X64.IsSupported, null); Check("Sse41", Sse4142, &Sse41IsSupported, Sse41.IsSupported, () => Sse41.Max(Vector128 <int> .Zero, Vector128 <int> .Zero).Equals(Vector128 <int> .Zero)); Check("Sse41.X64", Sse4142, &Sse41X64IsSupported, Sse41.X64.IsSupported, () => Sse41.X64.Extract(Vector128 <long> .Zero, 0) == 0); Check("Sse42", Sse4142, &Sse42IsSupported, Sse42.IsSupported, () => Sse42.Crc32(0, 0) == 0); Check("Sse42.X64", Sse4142, &Sse42X64IsSupported, Sse42.X64.IsSupported, () => Sse42.X64.Crc32(0, 0) == 0); Check("Aes", AesLzPcl, &AesIsSupported, Aes.IsSupported, () => Aes.KeygenAssist(Vector128 <byte> .Zero, 
0).Equals(Vector128.Create((byte)99))); Check("Aes.X64", AesLzPcl, &AesX64IsSupported, Aes.X64.IsSupported, null); Check("Avx", Avx12, &AvxIsSupported, Avx.IsSupported, () => Avx.Add(Vector256 <double> .Zero, Vector256 <double> .Zero).Equals(Vector256 <double> .Zero)); Check("Avx.X64", Avx12, &AvxX64IsSupported, Avx.X64.IsSupported, null); Check("Avx2", Avx12, &Avx2IsSupported, Avx2.IsSupported, () => Avx2.Abs(Vector256 <int> .Zero).Equals(Vector256 <uint> .Zero)); Check("Avx2.X64", Avx12, &Avx2X64IsSupported, Avx2.X64.IsSupported, null); Check("Bmi1", FmaBmi12, &Bmi1IsSupported, Bmi1.IsSupported, () => Bmi1.AndNot(0, 0) == 0); Check("Bmi1.X64", FmaBmi12, &Bmi1X64IsSupported, Bmi1.X64.IsSupported, () => Bmi1.X64.AndNot(0, 0) == 0); Check("Bmi2", FmaBmi12, &Bmi2IsSupported, Bmi2.IsSupported, () => Bmi2.MultiplyNoFlags(0, 0) == 0); Check("Bmi2.X64", FmaBmi12, &Bmi2X64IsSupported, Bmi2.X64.IsSupported, () => Bmi2.X64.MultiplyNoFlags(0, 0) == 0); Check("Fma", FmaBmi12, &FmaIsSupported, Fma.IsSupported, () => Fma.MultiplyAdd(Vector128 <float> .Zero, Vector128 <float> .Zero, Vector128 <float> .Zero).Equals(Vector128 <float> .Zero)); Check("Fma.X64", FmaBmi12, &FmaX64IsSupported, Fma.X64.IsSupported, null); Check("Lzcnt", AesLzPcl, &LzcntIsSupported, Lzcnt.IsSupported, () => Lzcnt.LeadingZeroCount(0) == 32); Check("Lzcnt.X64", AesLzPcl, &LzcntX64IsSupported, Lzcnt.X64.IsSupported, () => Lzcnt.X64.LeadingZeroCount(0) == 64); Check("Pclmulqdq", AesLzPcl, &PclmulqdqIsSupported, Pclmulqdq.IsSupported, () => Pclmulqdq.CarrylessMultiply(Vector128 <long> .Zero, Vector128 <long> .Zero, 0).Equals(Vector128 <long> .Zero)); Check("Pclmulqdq.X64", AesLzPcl, &PclmulqdqX64IsSupported, Pclmulqdq.X64.IsSupported, null); Check("Popcnt", PopCnt, &PopcntIsSupported, Popcnt.IsSupported, () => Popcnt.PopCount(0) == 0); Check("Popcnt.X64", PopCnt, &PopcntX64IsSupported, Popcnt.X64.IsSupported, () => Popcnt.X64.PopCount(0) == 0); Check("AvxVnni", Avxvnni, &AvxVnniIsSupported, 
AvxVnni.IsSupported, () => AvxVnni.MultiplyWideningAndAdd(Vector128 <int> .Zero, Vector128 <byte> .Zero, Vector128 <sbyte> .Zero).Equals(Vector128 <int> .Zero)); Check("AvxVnni.X64", Avxvnni, &AvxVnniX64IsSupported, AvxVnni.X64.IsSupported, null); return(s_success ? 100 : 1); }
/// <summary>
/// Writes an sbyte array to <paramref name="writer"/>: nil for a null array, otherwise an array
/// header followed by every element.
/// </summary>
/// <remarks>
/// When <c>Popcnt.IsSupported</c> and at least two strides (2 * 16 elements) are available, the
/// leading elements are written one by one until the input pointer is 16-byte aligned, and the bulk
/// is then handled 16 elements (one Vector128) per iteration. A move-mask marks the elements that are
/// below <c>MessagePackRange.MinFixNegativeInt</c>. If none are marked, the 16 bytes are stored
/// unchanged. Otherwise each 8-element half is expanded through a shuffle-table entry, blended with
/// <c>MessagePackCode.Int8</c>, and written with a masked store of <c>8 + PopCount(halfMask)</c> bytes.
/// Whatever remains (and every input when the SIMD path is not taken) is written by the scalar loop
/// at the ProcessEach label.
/// NOTE(review): the layout of ShuffleAndMaskTable / StoreMaskTable is not visible here - confirm
/// against their definitions.
/// </remarks>
/// <param name="writer">Destination MessagePack writer.</param>
/// <param name="value">Array to serialize; may be null.</param>
/// <param name="options">Serializer options (not read by this method).</param>
public unsafe void Serialize(ref MessagePackWriter writer, sbyte[]?value, MessagePackSerializerOptions options) { if (value == null) { writer.WriteNil(); return; } var inputLength = value.Length; writer.WriteArrayHeader(inputLength); if (inputLength == 0) { return; } fixed(sbyte *pSource = &value[0]) { var inputEnd = pSource + inputLength; var inputIterator = pSource; if (Popcnt.IsSupported) { const int ShiftCount = 4; const int Stride = 1 << ShiftCount; // We enter the SIMD mode when there are more than the Stride after alignment adjustment. if (inputLength < Stride << 1) { goto ProcessEach; } { // Make InputIterator Aligned var offset = UnsafeMemoryAlignmentUtility.CalculateDifferenceAlign16(inputIterator); inputLength -= offset; var offsetEnd = inputIterator + offset; while (inputIterator != offsetEnd) { writer.Write(*inputIterator++); } } fixed(byte *tablePointer = &ShuffleAndMaskTable[0]) { fixed(byte *maskTablePointer = &SingleInstructionMultipleDataPrimitiveArrayFormatterHelper.StoreMaskTable[0]) { var vectorMinFixNegInt = Vector128.Create((sbyte)MessagePackRange.MinFixNegativeInt); var vectorMessagePackCodeInt8 = Vector128.Create(MessagePackCode.Int8); for (var vectorizedEnd = inputIterator + ((inputLength >> ShiftCount) << ShiftCount); inputIterator != vectorizedEnd; inputIterator += Stride) { var current = Sse2.LoadVector128(inputIterator); var index = unchecked ((uint)Sse2.MoveMask(Sse2.CompareGreaterThan(vectorMinFixNegInt, current))); if (index == 0) { // When all 16 input values are in the FixNum range. 
var span = writer.GetSpan(Stride); Sse2.Store((sbyte *)Unsafe.AsPointer(ref span[0]), current); writer.Advance(Stride); continue; } unchecked { var index0 = (byte)index; var index1 = (byte)(index >> 8); var count0 = (int)(Popcnt.PopCount(index0) + 8); var count1 = (int)(Popcnt.PopCount(index1) + 8); var countTotal = count0 + count1; var destination = writer.GetSpan(countTotal); fixed(byte *pDestination = &destination[0]) { var tempDestination = pDestination; var shuffle0 = Sse2.LoadVector128(tablePointer + (index0 << 4)); var shuffled0 = Ssse3.Shuffle(current.AsByte(), shuffle0); var answer0 = Sse41.BlendVariable(shuffled0, vectorMessagePackCodeInt8, shuffle0); Sse2.MaskMove(answer0, Sse2.LoadVector128(maskTablePointer + (count0 << 4)), tempDestination); tempDestination += count0; var shuffle1 = Sse2.LoadVector128(tablePointer + (index1 << 4)); var shift1 = Sse2.ShiftRightLogical128BitLane(current.AsByte(), 8); var shuffled1 = Ssse3.Shuffle(shift1, shuffle1); var answer1 = Sse41.BlendVariable(shuffled1, vectorMessagePackCodeInt8, shuffle1); Sse2.MaskMove(answer1, Sse2.LoadVector128(maskTablePointer + (count1 << 4)), tempDestination); } writer.Advance(countTotal); } } } } } ProcessEach: while (inputIterator != inputEnd) { writer.Write(*inputIterator++); } } }
/// <summary>
/// Computes how many UTF-16 code units the given well-formed UTF-8 data transcodes to.
/// </summary>
/// <remarks>
/// Only the SSE2 + POPCNT path is implemented. The all-ASCII prefix is measured with
/// GetIndexOfFirstNonAsciiByte_Sse2; if that covers the whole buffer the byte count is the answer.
/// Otherwise the count starts at the byte length and, per 16-byte vector, subtracts the number of
/// continuation bytes and adds the number of four-byte sequence headers (both obtained with
/// MoveMask + PopCount). The first read is an aligned read starting at or before the buffer start
/// with the leading (and, for short inputs, trailing) lanes masked out; the middle uses aligned
/// loads; the final partial vector is masked at its end.
/// NOTE(review): the first and last reads touch bytes outside the span within the same aligned
/// 16-byte block - presumably relied upon as safe; confirm.
/// </remarks>
/// <param name="utf8Data">UTF-8 bytes; must already be known to be well-formed (not validated here).</param>
/// <returns>The number of UTF-16 chars required for <paramref name="utf8Data"/>.</returns>
/// <exception cref="NotImplementedException">Thrown when SSE2 or POPCNT is not supported.</exception>
public static unsafe int GetUtf16CharCountFromKnownWellFormedUtf8(ReadOnlySpan <byte> utf8Data) { // Remember: the number of resulting UTF-16 chars will never be greater than the number // of UTF-8 bytes given well-formed input, so we can get away with casting the final // result to an 'int'. fixed(byte *pPinnedUtf8Data = &MemoryMarshal.GetReference(utf8Data)) { if (Sse2.IsSupported && Popcnt.IsSupported) { // Optimizations via SSE2 & POPCNT are available - use them. Debug.Assert(BitConverter.IsLittleEndian, "SSE2 only supported on little-endian platforms."); Debug.Assert(sizeof(nint) == IntPtr.Size, "nint defined incorrectly."); Debug.Assert(sizeof(nuint) == IntPtr.Size, "nuint defined incorrectly."); byte *pBuffer = pPinnedUtf8Data; nuint bufferLength = (uint)utf8Data.Length; // Optimization: Can we stay in the all-ASCII code paths? nuint utf16CharCount = GetIndexOfFirstNonAsciiByte_Sse2(pBuffer, bufferLength); if (utf16CharCount != bufferLength) { // Found at least one non-ASCII byte, so fall down the slower (but still vectorized) code paths. // Given well-formed UTF-8 input, we can compute the number of resulting UTF-16 code units // using the following formula: // // utf16CharCount = utf8ByteCount - numUtf8ContinuationBytes + numUtf8FourByteHeaders utf16CharCount = bufferLength; Vector128 <sbyte> vecAllC0 = Vector128.Create(unchecked ((sbyte)0xC0)); Vector128 <sbyte> vecAll80 = Vector128.Create(unchecked ((sbyte)0x80)); Vector128 <sbyte> vecAll6F = Vector128.Create(unchecked ((sbyte)0x6F)); { // Perform an aligned read of the first part of the buffer. // We'll mask out any data at the start of the buffer we don't care about. // // For example, if (pBuffer MOD 16) = 2: // [ AA BB CC DD ... ] <-- original vector // [ 00 00 CC DD ... 
] <-- after PANDN operation nint offset = -((nint)pBuffer & (sizeof(Vector128 <sbyte>) - 1)); Vector128 <sbyte> shouldBeMaskedOut = Sse2.CompareGreaterThan(Vector128.Create((byte)((int)offset + sizeof(Vector128 <sbyte>) - 1)).AsSByte(), VectorOfElementIndices); Vector128 <sbyte> thisVector = Sse2.AndNot(shouldBeMaskedOut, Unsafe.Read <Vector128 <sbyte> >(pBuffer + offset)); // If there's any data at the end of the buffer we don't care about, mask it out now. // If this happens the 'bufferLength' value will be a lie, but it'll cause all of the // branches later in the method to be skipped, so it's not a huge problem. if (bufferLength < (nuint)offset + (uint)sizeof(Vector128 <sbyte>)) { Vector128 <sbyte> shouldBeAllowed = Sse2.CompareLessThan(VectorOfElementIndices, Vector128.Create((byte)((int)bufferLength - (int)offset)).AsSByte()); thisVector = Sse2.And(shouldBeAllowed, thisVector); bufferLength = (nuint)offset + (uint)sizeof(Vector128 <sbyte>); } uint maskOfContinuationBytes = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(vecAllC0, thisVector)); uint countOfContinuationBytes = Popcnt.PopCount(maskOfContinuationBytes); utf16CharCount -= countOfContinuationBytes; uint maskOfFourByteHeaders = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(thisVector, vecAll80), vecAll6F)); uint countOfFourByteHeaders = Popcnt.PopCount(maskOfFourByteHeaders); utf16CharCount += countOfFourByteHeaders; bufferLength -= (nuint)offset; bufferLength -= (uint)sizeof(Vector128 <sbyte>); pBuffer += offset; pBuffer += (uint)sizeof(Vector128 <sbyte>); } // At this point, pBuffer is guaranteed aligned. 
Debug.Assert((nuint)pBuffer % (uint)sizeof(Vector128 <sbyte>) == 0, "pBuffer should have been aligned."); while (bufferLength >= (uint)sizeof(Vector128 <sbyte>)) { Vector128 <sbyte> thisVector = Sse2.LoadAlignedVector128((sbyte *)pBuffer); uint maskOfContinuationBytes = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(vecAllC0, thisVector)); uint countOfContinuationBytes = Popcnt.PopCount(maskOfContinuationBytes); utf16CharCount -= countOfContinuationBytes; uint maskOfFourByteHeaders = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(thisVector, vecAll80), vecAll6F)); uint countOfFourByteHeaders = Popcnt.PopCount(maskOfFourByteHeaders); utf16CharCount += countOfFourByteHeaders; pBuffer += sizeof(Vector128 <sbyte>); bufferLength -= (uint)sizeof(Vector128 <sbyte>); } if ((uint)bufferLength > 0) { // There's still more data to be read. // We need to mask out elements of the vector we don't care about. // These elements will occur at the end of the vector. // // For example, if 14 bytes remain in the input stream: // [ ... CC DD EE FF ] <-- original vector // [ ... CC DD 00 00 ] <-- after PANDN operation Vector128 <sbyte> shouldBeMaskedOut = Sse2.CompareGreaterThan(VectorOfElementIndices, Vector128.Create((byte)((int)bufferLength - 1)).AsSByte()); Vector128 <sbyte> thisVector = Sse2.AndNot(shouldBeMaskedOut, *(Vector128 <sbyte> *)pBuffer); uint maskOfContinuationBytes = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(vecAllC0, thisVector)); uint countOfContinuationBytes = Popcnt.PopCount(maskOfContinuationBytes); utf16CharCount -= countOfContinuationBytes; uint maskOfFourByteHeaders = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(thisVector, vecAll80), vecAll6F)); uint countOfFourByteHeaders = Popcnt.PopCount(maskOfFourByteHeaders); utf16CharCount += countOfFourByteHeaders; } } return((int)utf16CharCount); } else { // Cannot use SSE2 & POPCNT. Fall back to slower code paths. throw new NotImplementedException(); } } }
/// <summary>
/// Compares two memory regions of <paramref name="size"/> bytes and returns the difference between
/// the first pair of bytes that differ (byte from p1 minus byte from p2), or 0 when they are equal.
/// </summary>
/// <remarks>
/// The bulk is compared 8 bytes at a time; the remainder is handled as an optional 4-byte, 2-byte
/// and 1-byte step selected by the low bits of <paramref name="size"/>. When a chunk differs, the
/// Tail label isolates the lowest set bit of the xor, takes PopCount(bit - 1) to get the number of
/// trailing zero bits, and shifts right by 3 to turn that into the offset of the differing byte
/// within the chunk (this relies on little-endian layout, as the inline comment states).
/// The second pointer is addressed as <c>bpx + offset</c>, so only one pointer is advanced.
/// </remarks>
/// <param name="p1">Start of the first region.</param>
/// <param name="p2">Start of the second region.</param>
/// <param name="size">Number of bytes to compare.</param>
/// <returns>Zero if equal; otherwise the signed difference of the first differing bytes.</returns>
private static int CompareScalarAltPopCount(void *p1, void *p2, int size) { byte *bpx = (byte *)p1; byte *bpy = (byte *)p2; // PERF: This allows us to do pointer arithmetics and use relative addressing using the // hardware instructions without needed an extra register. long offset = bpy - bpx; if (size < 8) { goto ProcessSmall; } // PERF: Current version of the JIT (2.0.5) will use a 4 instruction magic division // instead of a simple shift because it is a power of 2 dividend. int l = size >> 3; // (Equivalent to size / 8) ulong xor; for (int i = 0; i < l; i++, bpx += 8) { // PERF: JIT will emit: ```{op} {reg}, qword ptr [rdx+rax]``` xor = *((ulong *)bpx) ^ *(ulong *)(bpx + offset); if (xor != 0) { goto Tail; } } ProcessSmall: if ((size & 4) != 0) { xor = *((uint *)bpx) ^ *((uint *)(bpx + offset)); if (xor != 0) { goto Tail; } bpx += 4; } if ((size & 2) != 0) { xor = (ulong)(*((ushort *)bpx) ^ *((ushort *)(bpx + offset))); if (xor != 0) { goto Tail; } bpx += 2; } if ((size & 1) != 0) { return(*bpx - *(bpx + offset)); } return(0); Tail: // PERF: This is a bit twiddling hack. Given that bitwise xoring 2 values flag the bits difference, // we can use that we know we are running on little endian hardware and the very first bit set // will correspond to the first byte which is different. bpx += Popcnt.PopCount((ulong)((long)xor & -(long)xor) - 1) >> 3; return(*bpx - *(bpx + offset)); }
/// <summary>
/// Counts the bits set in the 8-bit representation of <paramref name="src"/>.
/// </summary>
/// <remarks>
/// The value is reinterpreted as a byte before widening. A direct <c>(uint)src</c> conversion
/// sign-extends negative inputs to 32 bits, so e.g. -1 would report 32 set bits instead of 8.
/// </remarks>
/// <param name="src">The signed 8-bit value to inspect.</param>
/// <returns>The number of set bits, in the range 0..8.</returns>
public static uint pop(sbyte src) => Popcnt.PopCount((uint)(byte)src);
/// <summary>
/// Reads the villages from the input, precomputes per-village walking costs for every subset of
/// roads along each axis, then enumerates every assignment of villages to "no road", "x road" or
/// "y road" and yields the cheapest total cost for each number of constructed roads.
/// </summary>
/// <param name="inputStream">Reader providing the village count followed by one (x, y, p) triple per village.</param>
/// <returns>
/// One value per road count from 0 to the number of villages: the minimum total cost found
/// (entries start at <c>long.MaxValue</c>).
/// </returns>
public override IEnumerable<object> Solve(TextReader inputStream)
{
    var n = inputStream.ReadInt();
    var villages = new Village[n];
    for (int index = 0; index < villages.Length; index++)
    {
        var (x, y, p) = inputStream.ReadValue<int, int, int>();
        villages[index] = new Village(x, y, p);
    }

    // costs[roadSet, village]: cheapest walk for that village given the roads in roadSet
    // (the line at coordinate 0 is always considered).
    var xCosts = new long[1 << villages.Length, villages.Length];
    var yCosts = new long[1 << villages.Length, villages.Length];

    for (var flags = BitSet.Zero; flags < 1 << villages.Length; flags++)
    {
        for (int i = 0; i < villages.Length; i++)
        {
            var bestX = villages[i].WalkX(0);
            var bestY = villages[i].WalkY(0);

            for (int road = 0; road < villages.Length; road++)
            {
                if (!flags[road])
                {
                    continue;
                }

                bestX = Math.Min(bestX, villages[i].WalkX(villages[road].X));
                bestY = Math.Min(bestY, villages[i].WalkY(villages[road].Y));
            }

            xCosts[flags, i] += bestX;
            yCosts[flags, i] += bestY;
        }
    }

    var results = new long[villages.Length + 1];
    results.AsSpan().Fill(long.MaxValue);

    DFS(0, 0, 0);

    foreach (var best in results)
    {
        yield return best;
    }

    // Decides, village by village, whether to build nothing, an x road or a y road through it.
    void DFS(int xFlags, int yFlags, int depth)
    {
        if (depth != villages.Length)
        {
            DFS(xFlags, yFlags, depth + 1);
            DFS(xFlags | (1 << depth), yFlags, depth + 1);
            DFS(xFlags, yFlags | (1 << depth), depth + 1);
            return;
        }

        long cost = 0;
        for (int i = 0; i < villages.Length; i++)
        {
            cost += Math.Min(xCosts[xFlags, i], yCosts[yFlags, i]);
        }

        // Number of roads built in this assignment.
        var construction = Popcnt.PopCount((uint)xFlags) + Popcnt.PopCount((uint)yFlags);
        results[construction] = Math.Min(results[construction], cost);
    }
}
/// <summary>
/// Returns the number of bits set in <paramref name="Value"/>.
/// </summary>
/// <remarks>
/// Uses <c>BitOperations.PopCount</c>, which picks a hardware instruction when one is available
/// and falls back to software otherwise. Calling <c>Popcnt.PopCount</c> directly without checking
/// <c>Popcnt.IsSupported</c> throws <c>PlatformNotSupportedException</c> on hardware without POPCNT.
/// </remarks>
/// <param name="Value">The 32-bit value to inspect.</param>
/// <returns>The number of set bits, in the range 0..32.</returns>
public static int CountSetBits(uint Value) => System.Numerics.BitOperations.PopCount(Value);
/// <summary>
/// Counts the bits set in the 16-bit representation of <paramref name="src"/>.
/// </summary>
/// <remarks>
/// The value is reinterpreted as a ushort before widening. A direct <c>(uint)src</c> conversion
/// sign-extends negative inputs to 32 bits, so e.g. -1 would report 32 set bits instead of 16.
/// </remarks>
/// <param name="src">The signed 16-bit value to inspect.</param>
/// <returns>The number of set bits, in the range 0..16.</returns>
public static uint pop(short src) => Popcnt.PopCount((uint)(ushort)src);
/// <summary>
/// Test entry point for the Popcnt intrinsic.
/// </summary>
/// <remarks>
/// When POPCNT is unsupported or the process is not 64-bit, the 64-bit PopCount call is expected
/// to throw <c>PlatformNotSupportedException</c>, both directly and via reflection. When POPCNT is
/// supported, every entry of <c>longPopcntTable</c> (64-bit processes only) and <c>intPopcntTable</c>
/// is checked directly and via reflection against its expected result.
/// The failure messages pass four arguments (indices 0..3). The format strings previously referenced
/// {3} and {4}, so a mismatch raised <c>FormatException</c> instead of printing the diagnostic;
/// they now reference {2} (expected) and {3} (actual).
/// </remarks>
/// <param name="args">Command-line arguments (unused).</param>
/// <returns><c>Pass</c> when every check succeeds, otherwise <c>Fail</c>.</returns>
static int Main(string[] args)
{
    ulong sl = 0;
    long resl;
    int testResult = Pass;

    if (!Popcnt.IsSupported || !Environment.Is64BitProcess)
    {
        // Direct call must throw on unsupported configurations.
        try
        {
            resl = Popcnt.PopCount(sl);
            Console.WriteLine("Intrinsic Popcnt.PopCount is called on non-supported hardware");
            Console.WriteLine("Popcnt.IsSupported " + Popcnt.IsSupported);
            Console.WriteLine("Environment.Is64BitProcess " + Environment.Is64BitProcess);
            testResult = Fail;
        }
        catch (PlatformNotSupportedException)
        {
        }

        // The reflection path wraps the same exception in a TargetInvocationException.
        try
        {
            resl = Convert.ToInt64(typeof(Popcnt).GetMethod(nameof(Popcnt.PopCount), new Type[] { sl.GetType() }).Invoke(null, new object[] { sl }));
            Console.WriteLine("Intrinsic Popcnt.PopCount is called via reflection on non-supported hardware");
            Console.WriteLine("Popcnt.IsSupported " + Popcnt.IsSupported);
            Console.WriteLine("Environment.Is64BitProcess " + Environment.Is64BitProcess);
            testResult = Fail;
        }
        catch (TargetInvocationException e) when (e.InnerException is PlatformNotSupportedException)
        {
        }
    }

    if (Popcnt.IsSupported)
    {
        if (Environment.Is64BitProcess)
        {
            for (int i = 0; i < longPopcntTable.Length; i++)
            {
                sl = longPopcntTable[i].s;

                resl = Popcnt.PopCount(sl);
                if (resl != longPopcntTable[i].res)
                {
                    Console.WriteLine("{0}: Inputs: 0x{1,16:x} Expected: 0x{2,16:x} actual: 0x{3,16:x}", i, sl, longPopcntTable[i].res, resl);
                    testResult = Fail;
                }

                resl = Convert.ToInt64(typeof(Popcnt).GetMethod(nameof(Popcnt.PopCount), new Type[] { sl.GetType() }).Invoke(null, new object[] { sl }));
                if (resl != longPopcntTable[i].res)
                {
                    Console.WriteLine("{0}: Inputs: 0x{1,16:x} Expected: 0x{2,16:x} actual: 0x{3,16:x} - Reflection", i, sl, longPopcntTable[i].res, resl);
                    testResult = Fail;
                }
            }
        }

        uint si;
        int resi;
        for (int i = 0; i < intPopcntTable.Length; i++)
        {
            si = intPopcntTable[i].s;

            resi = Popcnt.PopCount(si);
            if (resi != intPopcntTable[i].res)
            {
                Console.WriteLine("{0}: Inputs: 0x{1,16:x} Expected: 0x{2,16:x} actual: 0x{3,16:x}", i, si, intPopcntTable[i].res, resi);
                testResult = Fail;
            }

            resi = Convert.ToInt32(typeof(Popcnt).GetMethod(nameof(Popcnt.PopCount), new Type[] { si.GetType() }).Invoke(null, new object[] { si }));
            if (resi != intPopcntTable[i].res)
            {
                Console.WriteLine("{0}: Inputs: 0x{1,16:x} Expected: 0x{2,16:x} actual: 0x{3,16:x} - Reflection", i, si, intPopcntTable[i].res, resi);
                testResult = Fail;
            }
        }
    }

    return testResult;
}
/// <summary>
/// Counts the bits set in <paramref name="src"/> using the POPCNT instruction.
/// </summary>
/// <param name="src">The 32-bit value to inspect.</param>
/// <returns>The number of set bits, in the range 0..32.</returns>
public static uint pop(uint src)
{
    uint setBits = Popcnt.PopCount(src);
    return setBits;
}
/// <summary>
/// Compares two memory regions of <paramref name="size"/> bytes and returns the difference between
/// the first pair of bytes that differ (byte from p1 minus byte from p2), or 0 when they are equal.
/// </summary>
/// <remarks>
/// Unlike a "bulk first" layout, this variant first consumes the part of the size that is not a
/// multiple of 8 (an optional 4-byte, 2-byte and 1-byte step selected by the low bits of
/// <paramref name="size"/>), then compares 16 bytes per loop iteration as two 8-byte words using
/// plain inequality, and finally handles one leftover 8-byte word if any remains. The xor is only
/// computed once a difference is suspected (XorTail). The Tail label returns 0 for a zero xor;
/// otherwise it isolates the lowest set bit, takes PopCount(bit - 1) to count the trailing zero bits
/// and shifts right by 3 to get the offset of the differing byte (relies on little-endian layout, as
/// the inline comment states). The second pointer is addressed as <c>bpx + offset</c>.
/// </remarks>
/// <param name="p1">Start of the first region.</param>
/// <param name="p2">Start of the second region.</param>
/// <param name="size">Number of bytes to compare.</param>
/// <returns>Zero if equal; otherwise the signed difference of the first differing bytes.</returns>
private static int CompareScalarCmpAltPopCount(void *p1, void *p2, int size) { byte *bpx = (byte *)p1; // PERF: This allows us to do pointer arithmetics and use relative addressing using the // hardware instructions without needed an extra register. long offset = (byte *)p2 - bpx; if ((size & 7) == 0) { goto ProcessAligned; } // We process first the "unaligned" size. ulong xor; if ((size & 4) != 0) { xor = *((uint *)bpx) ^ *((uint *)(bpx + offset)); if (xor != 0) { goto Tail; } bpx += 4; } if ((size & 2) != 0) { xor = (ulong)(*((ushort *)bpx) ^ *((ushort *)(bpx + offset))); if (xor != 0) { goto Tail; } bpx += 2; } if ((size & 1) != 0) { int value = *bpx - *(bpx + offset); if (value != 0) { return(value); } bpx += 1; } ProcessAligned: byte *end = (byte *)p1 + size; byte *loopEnd = end - 16; while (bpx <= loopEnd) { // PERF: JIT will emit: ```{op} {reg}, qword ptr [rdx+rax]``` if (*((ulong *)bpx) != *(ulong *)(bpx + offset)) { goto XorTail; } if (*((ulong *)(bpx + 8)) != *(ulong *)(bpx + 8 + offset)) { bpx += 8; goto XorTail; } bpx += 16; } if (bpx < end) { goto XorTail; } return(0); XorTail : xor = *((ulong *)bpx) ^ *(ulong *)(bpx + offset); Tail: // Fast-path for equals if (xor == 0) { return(0); } // PERF: This is a bit twiddling hack. Given that bitwise xoring 2 values flag the bits difference, // we can use that we know we are running on little endian hardware and the very first bit set // will correspond to the first byte which is different. bpx += Popcnt.PopCount((ulong)((long)xor & -(long)xor) - 1) >> 3; return(*bpx - *(bpx + offset)); }
/// <summary>
/// Counts the bits set in <c>MaxValue</c> using the POPCNT instruction.
/// </summary>
/// <returns>The number of set bits in <c>MaxValue</c>.</returns>
public uint PopCount()
{
    return Popcnt.PopCount(MaxValue);
}