public static async Task Sampling(string[] path, IVarTuple extras, IFdbDatabase db, TextWriter log, CancellationToken ct) { double ratio = 0.1d; bool auto = true; if (extras.Count > 0) { double x = extras.Get <double>(0); if (x > 0 && x <= 1) { ratio = x; } auto = false; } var folder = await TryOpenCurrentDirectoryAsync(path, db, ct); KeyRange span; if (folder is FdbDirectorySubspace) { span = KeyRange.StartsWith((folder as FdbDirectorySubspace).Copy().GetPrefix()); log.WriteLine($"Reading list of shards for /{String.Join("/", path)} under {FdbKey.Dump(span.Begin)} ..."); } else { log.WriteLine("Reading list of shards for the whole cluster ..."); span = KeyRange.All; } // dump keyServers var ranges = await Fdb.System.GetChunksAsync(db, span, ct); log.WriteLine($"> Found {ranges.Count:N0} shard(s)"); // take a sample var samples = new List <KeyRange>(); if (ranges.Count <= 32) { // small enough to scan it all samples.AddRange(ranges); log.WriteLine($"Sampling all {samples.Count:N0} shards ..."); } else { // need to take a random subset var rnd = new Random(); int sz = Math.Max((int)Math.Ceiling(ratio * ranges.Count), 1); if (auto) { if (sz > 100) { sz = 100; //SAFETY } if (sz < 32) { sz = Math.Max(sz, Math.Min(32, ranges.Count)); } } var population = new List <KeyRange>(ranges); for (int i = 0; i < sz; i++) { int p = rnd.Next(population.Count); samples.Add(population[p]); population.RemoveAt(p); } log.WriteLine($"Sampling {samples.Count:N0} out of {ranges.Count:N0} shards ({(100.0 * samples.Count / ranges.Count):N1}%) ..."); } log.WriteLine(); const string FORMAT_STRING = "{0,9} ║{1,10}{6,6} {2,-29} ║{3,10}{7,7} {4,-37} ║{5,10}"; const string SCALE_KEY = "....--------========########M"; const string SCALE_VAL = "....--------========########@@@@@@@@M"; log.WriteLine(FORMAT_STRING, "Count", "Keys", SCALE_KEY, "Values", SCALE_VAL, "Total", "med.", "med."); var rangeOptions = new FdbRangeOptions { Mode = FdbStreamingMode.WantAll }; samples = samples.OrderBy(x => x.Begin).ToList(); long globalSize = 0; long globalCount = 0; int workers = 8; // Math.Max(4, Environment.ProcessorCount); var sw = Stopwatch.StartNew(); var tasks = new List <Task>(); int n = samples.Count; while (samples.Count > 0) { while (tasks.Count < workers && samples.Count > 0) { var range = samples[0]; samples.RemoveAt(0); tasks.Add(Task.Run(async() => { var kk = new RobustHistogram(RobustHistogram.TimeScale.Ticks); var vv = new RobustHistogram(RobustHistogram.TimeScale.Ticks); #region Method 1: get_range everything... using (var tr = db.BeginTransaction(ct)) { long keySize = 0; long valueSize = 0; long count = 0; int iter = 0; var beginSelector = KeySelector.FirstGreaterOrEqual(range.Begin); var endSelector = KeySelector.FirstGreaterOrEqual(range.End); while (true) { FdbRangeChunk data = default(FdbRangeChunk); FdbException error = null; try { data = await tr.Snapshot.GetRangeAsync( beginSelector, endSelector, rangeOptions, iter ).ConfigureAwait(false); } catch (FdbException e) { error = e; } if (error != null) { await tr.OnErrorAsync(error.Code).ConfigureAwait(false); continue; } if (data.Count == 0) { break; } count += data.Count; foreach (var kvp in data) { keySize += kvp.Key.Count; valueSize += kvp.Value.Count; kk.Add(TimeSpan.FromTicks(kvp.Key.Count)); vv.Add(TimeSpan.FromTicks(kvp.Value.Count)); } if (!data.HasMore) { break; } beginSelector = KeySelector.FirstGreaterThan(data.Last); ++iter; } long totalSize = keySize + valueSize; Interlocked.Add(ref globalSize, totalSize); Interlocked.Add(ref globalCount, count); lock (log) { log.WriteLine(FORMAT_STRING, count.ToString("N0"), FormatSize(keySize), kk.GetDistribution(begin: 1, end: 12000, fold: 2), FormatSize(valueSize), vv.GetDistribution(begin: 1, end: 120000, fold: 2), FormatSize(totalSize), FormatSize((int)Math.Ceiling(kk.Median)), FormatSize((int)Math.Ceiling(vv.Median))); } } #endregion #region Method 2: estimate the count using key selectors... //long counter = await Fdb.System.EstimateCountAsync(db, range, ct); //Console.WriteLine("COUNT = " + counter.ToString("N0")); #endregion }, ct)); } var done = await Task.WhenAny(tasks); tasks.Remove(done); } await Task.WhenAll(tasks); sw.Stop(); log.WriteLine(); if (n != ranges.Count) { log.WriteLine($"Sampled {FormatSize(globalSize)} ({globalSize:N0} bytes) and {globalCount:N0} keys in {sw.Elapsed.TotalSeconds:N1} sec"); log.WriteLine($"> Estimated total size is {FormatSize(globalSize * ranges.Count / n)}"); } else { log.WriteLine($"Found {FormatSize(globalSize)} ({globalSize:N0} bytes) and {globalCount:N0} keys in {sw.Elapsed.TotalSeconds:N1} sec"); // compare to the whole cluster ranges = await Fdb.System.GetChunksAsync(db, FdbKey.MinValue, FdbKey.MaxValue, ct); log.WriteLine($"> This directory contains ~{(100.0 * n / ranges.Count):N2}% of all data"); } log.WriteLine(); }
public async Task Run(IFdbDatabase db, TextWriter log, CancellationToken ct) { // estimate the number of machines... Console.WriteLine("# Detecting cluster topology..."); var servers = await db.QueryAsync(tr => tr .WithReadAccessToSystemKeys() .GetRange(FdbKeyRange.StartsWith(Fdb.System.ServerList)) .Select(kvp => new { Node = kvp.Value.Substring(8, 16).ToHexaString(), Machine = kvp.Value.Substring(24, 16).ToHexaString(), DataCenter = kvp.Value.Substring(40, 16).ToHexaString() }), ct ); var numNodes = servers.Select(s => s.Node).Distinct().Count(); var numMachines = servers.Select(s => s.Machine).Distinct().Count(); var numDCs = servers.Select(s => s.DataCenter).Distinct().Count(); Console.WriteLine("# > Found " + numNodes + " process(es) on " + numMachines + " machine(s) in " + numDCs + " datacenter(s)"); Console.WriteLine("# Reading list of shards..."); // dump keyServers var ranges = await Fdb.System.GetChunksAsync(db, FdbKey.MinValue, FdbKey.MaxValue, ct); Console.WriteLine("# > Found " + ranges.Count + " shards:"); // take a sample var rnd = new Random(1234); int sz = Math.Max((int)Math.Ceiling(this.Ratio * ranges.Count), 1); if (sz > 500) { sz = 500; //SAFETY } if (sz < 50) { sz = Math.Max(sz, Math.Min(50, ranges.Count)); } var samples = new List <FdbKeyRange>(); for (int i = 0; i < sz; i++) { int p = rnd.Next(ranges.Count); samples.Add(ranges[p]); ranges.RemoveAt(p); } Console.WriteLine("# Sampling " + sz + " out of " + ranges.Count + " shards (" + (100.0 * sz / ranges.Count).ToString("N1") + "%) ..."); Console.WriteLine("{0,9}{1,10}{2,10}{3,10} : K+V size distribution", "Count", "Keys", "Values", "Total"); var rangeOptions = new FdbRangeOptions { Mode = FdbStreamingMode.WantAll }; samples = samples.OrderBy(x => x.Begin).ToList(); long total = 0; int workers = Math.Min(numMachines, 8); var sw = Stopwatch.StartNew(); var tasks = new List <Task>(); while (samples.Count > 0) { while (tasks.Count < workers && samples.Count > 0) { var range = samples[0]; samples.RemoveAt(0); tasks.Add(Task.Run(async() => { var hh = new RobustHistogram(RobustHistogram.TimeScale.Ticks); #region Method 1: get_range everything... using (var tr = db.BeginTransaction(ct)) { long keySize = 0; long valueSize = 0; long count = 0; int iter = 0; var beginSelector = FdbKeySelector.FirstGreaterOrEqual(range.Begin); var endSelector = FdbKeySelector.FirstGreaterOrEqual(range.End); while (true) { FdbRangeChunk data = default(FdbRangeChunk); FdbException error = null; try { data = await tr.Snapshot.GetRangeAsync( beginSelector, endSelector, rangeOptions, iter ).ConfigureAwait(false); } catch (FdbException e) { error = e; } if (error != null) { await tr.OnErrorAsync(error.Code).ConfigureAwait(false); continue; } if (data.Count == 0) { break; } count += data.Count; foreach (var kvp in data.Chunk) { keySize += kvp.Key.Count; valueSize += kvp.Value.Count; hh.Add(TimeSpan.FromTicks(kvp.Key.Count + kvp.Value.Count)); } if (!data.HasMore) { break; } beginSelector = FdbKeySelector.FirstGreaterThan(data.Last.Key); ++iter; } long totalSize = keySize + valueSize; Interlocked.Add(ref total, totalSize); Console.WriteLine("{0,9}{1,10}{2,10}{3,10} : {4}", count.ToString("N0"), FormatSize(keySize), FormatSize(valueSize), FormatSize(totalSize), hh.GetDistribution(begin: 1, end: 10000, fold: 2)); } #endregion #region Method 2: estimate the count using key selectors... //long counter = await Fdb.System.EstimateCountAsync(db, range, ct); //Console.WriteLine("COUNT = " + counter.ToString("N0")); #endregion }, ct)); } var done = await Task.WhenAny(tasks); tasks.Remove(done); } await Task.WhenAll(tasks); sw.Stop(); Console.WriteLine("> Sampled " + FormatSize(total) + " (" + total.ToString("N0") + " bytes) in " + sw.Elapsed.TotalSeconds.ToString("N1") + " sec"); Console.WriteLine("> Estimated total size is " + FormatSize(total * ranges.Count / sz)); }