/// <summary>
/// Runs the complete benchmark pass for one word-list file: loads and normalizes the
/// words, builds and verifies a DAWG from them, round-trips the DAWG and a gzip copy of
/// the input through temporary files, builds a DawgSharp DAWG for comparison, and times
/// exact-match and prefix lookups against a hash set, the sorted array and the DAWG.
/// Progress is written to the console and the measurements are collected in the report.
/// </summary>
/// <param name="fileName">Path of the text file to read; every non-blank line is one word.</param>
/// <returns>A <c>FileReport</c> populated with the counts, sizes and timings measured here.</returns>
/// <exception cref="InvalidOperationException">
/// The DAWG prefix search and the linear prefix search returned different results
/// (as judged by <c>SameSets</c>).
/// </exception>
static FileReport TestWordsFile(string fileName)
{
    // Start every file from a settled heap so earlier runs do not skew the measurements.
    // (GC.WaitForFullGCComplete only reports on a registered GC notification, so the
    // usual collect / drain finalizers / collect sequence is used instead.)
    GC.Collect();
    GC.WaitForPendingFinalizers();
    GC.Collect();

    var report = new FileReport();
    report.FileName = Path.GetFileName(fileName);
    var sw = new Stopwatch();

    string[] words;
    Console.ForegroundColor = ConsoleColor.Yellow;
    try
    {
        Console.WriteLine(fileName);

        // Normalize: drop blank lines, lower-case, de-duplicate, and sort ordinally
        // (the binary-search benchmark below relies on the ordinal ordering).
        words = File.ReadAllLines(fileName)
            .Where(w => !string.IsNullOrWhiteSpace(w))
            .Select(w => w.ToLowerInvariant())
            .Distinct()
            .OrderBy(x => x, StringComparer.Ordinal)
            .ToArray();

        report.WordCount = words.Length;
        Console.WriteLine($"Words: {words.Length:n0}");
    }
    finally
    {
        // Restore the console colour even when the file cannot be read.
        Console.ResetColor();
    }

    var hash = new HashSet<string>(words);

    // ---- Build ------------------------------------------------------------------
    PrintPhase("Creating DAWG...");
    sw.Start();
    var dawg = Dawg.Create(words);
    sw.Stop();
    PrintOK(sw);
    report.BuildDawgTime = sw.Elapsed;
    report.NodeCount = dawg.NodeCount;
    Console.WriteLine($"Nodes {dawg.NodeCount:n0}");

    report.ArraySize = EstimateObjectSize(words);
    report.HashSize = EstimateObjectSize(hash);
    Console.WriteLine($"HashSize:{report.HashSize/1024.0:n2} kB ArraySize:{report.ArraySize/1024.0:n2} kB.");

    // ---- Verify -----------------------------------------------------------------
    sw.Restart();
    PrintPhase("Verifying... ");
    Dawg.Verify(words, dawg);
    sw.Stop();
    PrintOK(sw);

    // ---- Serialization round trips ----------------------------------------------
    PrintPhase("Serializing/Deserializing the DAWG... \n");
    // Path.GetTempFileName creates the files on disk, so they must be removed on
    // every exit path, including when one of the steps below throws.
    var dawgFile = Path.GetTempFileName();
    var gzipFile = Path.GetTempFileName();
    try
    {
        Console.Write(" * Write DAWG to disk... ");
        sw.Restart();
        Dawg.Write(dawg, dawgFile);
        sw.Stop();
        PrintOK(sw);
        report.DawgWriteTime = sw.Elapsed;

        Console.Write(" * Read DAWG from disk... ");
        sw.Restart();
        // From here on the benchmarks run against the deserialized instance.
        dawg = Dawg.Read(dawgFile);
        sw.Stop();
        PrintOK(sw);

        Console.Write(" * Write Gzip to disk... ");
        sw.Restart();
        report.GzipOriginalFileLength = CompressedFileSize(fileName, gzipFile);
        sw.Stop();
        report.GzipCompressTime = sw.Elapsed;
        PrintOK(sw);

        Console.Write(" * Read Gzip from disk... ");
        sw.Restart();
        DecompressFileSize(gzipFile);
        sw.Stop();
        report.GzipDecompressTime = sw.Elapsed;
        PrintOK(sw);

        report.OriginalFileLength = new FileInfo(fileName).Length;
        Console.Write($"Original File is {report.OriginalFileLength / 1024:n0} kB. ");
        report.DawgFileLength = new FileInfo(dawgFile).Length;
        Console.Write($"DAWG File is {report.DawgFileLength / 1024:n0} kB. ");
        Console.Write($"Gzip File is {report.GzipOriginalFileLength / 1024:n0} kB. \n");
    }
    finally
    {
        File.Delete(dawgFile);
        File.Delete(gzipFile);
    }

    // ---- DawgSharp comparison ---------------------------------------------------
    PrintPhase("DAWGSharp package comparison ...");
    sw.Restart();
    var dawgSharpBuilder = new DawgSharp.DawgBuilder<bool>(); // <bool> is the value type.
    foreach (string key in words)
    {
        dawgSharpBuilder.Insert(key, true);
    }
    var dawgSharp = dawgSharpBuilder.BuildDawg();
    sw.Stop();

    // Saving happens after the stopwatch is stopped: only the build is timed.
    using var ms = new MemoryStream();
    dawgSharp.SaveTo(ms);
    PrintOK(sw);
    Console.WriteLine($" * DAWGSharp NodeCount is {dawgSharp.GetNodeCount():n0}.");
    // Floating-point division so the two decimals requested by :n2 are meaningful.
    Console.WriteLine($" * DAWGSharp File is {ms.Length / 1024.0:n2} kB.");

    // ---- Membership of the source words -----------------------------------------
    sw.Restart();
    PrintPhase("Checking own words... ");
    TestOwnWordsExits(words, dawg);
    sw.Stop();
    PrintOK(sw);

    // ---- Exact-match lookups ----------------------------------------------------
    PrintPhase("Find 20 random word 100 times... \n");
    var random = new Random();
    var toFind = new HashSet<string>();
    // words holds distinct entries, so at most words.Length different samples exist;
    // capping the target keeps the loop finite (and skips it for an empty list).
    int sampleSize = Math.Min(20, words.Length);
    while (toFind.Count < sampleSize)
    {
        toFind.Add(words[random.Next(0, words.Length)]);
    }

    Console.Write(" * Find in word set...");
    sw.Restart();
    FindWords(toFind, hash, 100);
    sw.Stop();
    report.HashContains = sw.Elapsed;
    PrintOK(sw);

    Console.Write(" * Find in word list...");
    sw.Restart();
    FindWords(toFind, words, 100);
    sw.Stop();
    report.ArrayContains = sw.Elapsed;
    PrintOK(sw);

    Console.Write(" * Find in word list binary search...");
    sw.Restart();
    FindWordsBinarySearch(toFind, words, 100);
    sw.Stop();
    report.BinarySearchContains = sw.Elapsed;
    PrintOK(sw);

    Console.Write(" * Find in DAWG...");
    sw.Restart();
    FindWords(toFind, dawg, 100);
    sw.Stop(); // stop first so the reported value matches what PrintOK shows
    report.DawgSearch = sw.Elapsed;
    PrintOK(sw);

    // ---- Prefix lookups ---------------------------------------------------------
    // The 20 most frequent 4-character prefixes among words longer than 4 characters.
    var prefixes = words
        .Where(w => w.Length > 4)
        .Select(w => w.Substring(0, 4))
        .GroupBy(s => s).Select(g => (g.Key, g.Count()))
        .OrderByDescending(x => x.Item2)
        .Select(x => x.Key)
        .Take(20)
        .ToArray();

    PrintPhase("Finding 50 words that start with 20 prefixes 100 times...\n");
    Console.Write(" * DAWG... ");
    sw.Restart();
    var found = DawgPrefixSearch(dawg, prefixes, 100, 50);
    sw.Stop();
    report.DawgPrefixSearch = sw.Elapsed;
    Console.Write($"found {string.Join(", ", found.Select(x => x.Count).OrderByDescending(x => x))}");
    PrintOK(sw);

    Console.Write(" * Array... ");
    sw.Restart();
    // https://stackoverflow.com/questions/52395504/inconsistent-string-startswith-on-different-platforms
    var hfound = LinearPrefixSearch(words, prefixes, 100, 50);
    sw.Stop();
    report.ArrayPrefixSearch = sw.Elapsed;
    Console.Write($"found {string.Join(", ", hfound.Select(x => x.Count).OrderByDescending(x => x))}");
    PrintOK(sw);

    if (!SameSets(found, hfound))
    {
        throw new InvalidOperationException("Different prefixed words count.");
    }

    PrintPhase("Finding first 500 words that start with 20 prefixes 100 times...\n");
    Console.Write(" * DAWG... ");
    sw.Restart();
    found = DawgPrefixSearch(dawg, prefixes, 100, 500);
    sw.Stop();
    report.DawgLimitedPrefixSearch = sw.Elapsed;
    Console.Write($"found {string.Join(", ", found.Select(x => x.Count).OrderByDescending(x => x))}");
    PrintOK(sw);

    Console.Write(" * Array... ");
    sw.Restart();
    hfound = LinearPrefixSearch(words, prefixes, 100, 500);
    sw.Stop();
    report.ArrayLimitedPrefixSearch = sw.Elapsed;
    Console.Write($"found {string.Join(", ", hfound.Select(x => x.Count).OrderByDescending(x => x))}");
    PrintOK(sw);

    if (!SameSets(found, hfound))
    {
        throw new InvalidOperationException("Different prefixed words count.");
    }

    // ---- Random words -----------------------------------------------------------
    sw.Restart();
    PrintPhase("Checking random generated words in parallel... ");
    TestRandomGeneratedWords(words, hash, dawg);
    sw.Stop();
    PrintOK(sw);

    Console.WriteLine();
    return report;
}