// Queries the processor the calling thread is currently running on, stores a freshly
// packed value in the thread-local cache (t_currentProcessorIdCache) and returns the id.
//
// The cached word is built from the id shifted left by ProcessorIdCacheShift, OR-ed with
// s_processorIdRefreshRate in the low bits. NOTE(review): the low bits are presumably
// consumed as a countdown by the cached read path, which is not visible here - confirm.
private static int RefreshCurrentProcessorId()
{
    int processorId = Thread.GetCurrentProcessorNumber();

    // On Unix, GetCurrentProcessorNumber() is built on sched_getcpu, which is not available
    // on every platform; where it is missing the call returns -1. In that case we fall back
    // to the managed thread ID as a stand-in so that threads still get spread across buckets.
    if (processorId < 0)
    {
        processorId = Environment.CurrentManagedThreadId;
    }

    // The refresh rate must fit into the bits covered by the count-down mask.
    Debug.Assert(s_processorIdRefreshRate <= ProcessorIdCacheCountDownMask);

    // Clearing the sign bit (mask with int.MaxValue) keeps the stored execution id non-negative.
    int shiftedId = (processorId << ProcessorIdCacheShift) & int.MaxValue;
    t_currentProcessorIdCache = shiftedId | s_processorIdRefreshRate;

    return processorId;
}
// Compares the cost of Thread.GetCurrentProcessorNumber() against the cost of a thread-static
// (TLS) access and derives s_processorIdRefreshRate from the ratio.
//
// Returns true when the derived refresh rate is 5 or less (the API is "fast"), and false when
// the API takes nontrivial time compared to TLS access or is not supported at all.
// The measurement is repeated several times and only the best (smallest) timings are kept, so
// that a single sample disturbed by a GC or a context switch does not skew the outcome.
internal static bool ProcessorNumberSpeedCheck()
{
    // NOTE: The Stopwatch frequency is intentionally not validated.
    //       It frequently does not reflect the real timer refresh rate anyway.
    //       If the timer's resolution, precision or access time is not good enough for this
    //       measurement, the check simply ends up failing.

    // Warm up the code paths.
    UninlinedThreadStatic();

    // This call warms up the API and also probes whether it works at all
    // (a negative result means "not supported").
    if (Thread.GetCurrentProcessorNumber() < 0)
    {
        s_processorIdRefreshRate = ProcessorIdCacheCountDownMask;
        return false;
    }

    double bestProcIdCost = double.MaxValue;
    double bestTlsCost = double.MaxValue;

    // Smallest tick count we accept as a sample: just over one microsecond.
    long minTicks = Stopwatch.Frequency / 1000000 + 1;

    for (int round = 0; round < 10; round++)
    {
        long elapsed;
        int iterations = 8;

        // Time GetCurrentProcessorNumber: at least 16 iterations and at least 1 microsecond.
        // The iteration count doubles until the sample is long enough.
        do
        {
            iterations *= 2;

            long start = Stopwatch.GetTimestamp();
            for (int k = 0; k < iterations; k++)
            {
                Thread.GetCurrentProcessorNumber();
            }

            elapsed = Stopwatch.GetTimestamp() - start;
        }
        while (elapsed < minTicks);

        double procIdCost = (double)elapsed / iterations;
        if (procIdCost < bestProcIdCost)
        {
            bestProcIdCost = procIdCost;
        }

        // Time the TLS access: again at least 1 microsecond, starting from at least half of
        // the iteration count that was needed for the processor id measurement.
        // The assumption is that TLS cannot be more than 2x slower than the processor id call.
        iterations /= 4;
        do
        {
            iterations *= 2;

            long start = Stopwatch.GetTimestamp();
            for (int k = 0; k < iterations; k++)
            {
                UninlinedThreadStatic();
            }

            elapsed = Stopwatch.GetTimestamp() - start;
        }
        while (elapsed < minTicks);

        double tlsCost = (double)elapsed / iterations;
        if (tlsCost < bestTlsCost)
        {
            bestTlsCost = tlsCost;
        }
    }

    // How the cache refresh rate is chosen:
    //
    // There are two reasons why data structures use core affinity:
    //   1) Locality - do not run on one core while touching data that lives in another core's cache.
    //   2) Reduced sharing - do not let multiple threads work on the same piece of data.
    //
    // Large-footprint scenarios, such as striped caches, are sensitive to both aspects; there it
    // is desirable to reach large data from the "right" core.
    // When the state is small, as with a striped counter, sharing is what mostly matters:
    // occasionally pulling a counter over to a different core through a cache miss is no big deal.
    //
    // For scenarios that mainly care about sharing, a precise GetCurrentProcessorNumber result may
    // not be worth its cost unless the underlying implementation is very cheap. In those cases it
    // is better to amortize the cost over several accesses by caching the value in a ThreadStatic.
    //
    // Besides the data structure itself, the payoff depends on the usage pattern and on the level
    // of concurrency. For example, if an array pool user rents an array "just in case" without
    // really using it, and concurrency is low, a longer refresh interval helps because it lowers
    // the API cost. If the array is actually used, the higher precision of the API pays off and a
    // shorter refresh interval becomes more attractive.
    //
    // All in all the ideal refresh rate is unknown, and some form of dynamic feedback is unlikely
    // to be feasible. Experiments have shown, however, that a 5x amortization rate strikes a good
    // enough balance between the precision and the cost of the API.
    int derivedRate = (int)(bestProcIdCost * 5 / bestTlsCost);
    s_processorIdRefreshRate = Math.Min(derivedRate, MaxIdRefreshRate);

    // When GetCurrentProcessorNumber is particularly fast, as on platforms that support the RDPID
    // instruction, caching brings no improvement and it is preferable to bypass the cache entirely.
    // Such systems consistently derive a refresh rate at or below 2-3, whereas the next tier,
    // RDTSCP based implementations, ends up around ~10, so "5" is used as the threshold that
    // separates "fast" machines from the rest.
    return s_processorIdRefreshRate <= 5;
}