示例#1
0
文件: Program.cs 项目: vtortola/dawg
        /// <summary>
        /// Runs the whole benchmark sequence against a single word-list file and
        /// prints the progress of every phase to the console.
        /// The phases are: build the DAWG, verify it, round-trip it through a temp
        /// file, gzip the original file for comparison, build the same dictionary
        /// with the DawgSharp package, and time exact-match and prefix look-ups
        /// against a hash set, the sorted array and the DAWG.
        /// </summary>
        /// <param name="fileName">
        /// Path of the text file to load. Blank lines are dropped; the remaining
        /// lines are lower-cased (invariant culture), de-duplicated and sorted ordinally.
        /// </param>
        /// <returns>A <see cref="FileReport"/> filled with the sizes and timings measured along the way.</returns>
        /// <exception cref="InvalidOperationException">
        /// The DAWG prefix search and the linear array prefix search returned different result sets.
        /// </exception>
        static FileReport TestWordsFile(string fileName)
        {
            // Presumably meant to settle the heap before measuring, so that garbage
            // from earlier work does not disturb the timed phases — TODO confirm.
            GC.Collect();
            GC.WaitForFullGCComplete();
            GC.WaitForPendingFinalizers();

            var report = new FileReport();

            report.FileName = Path.GetFileName(fileName);
            var sw = new Stopwatch();

            Console.ForegroundColor = ConsoleColor.Yellow;
            Console.WriteLine(fileName);
            var words = File.ReadAllLines(fileName)
                        .Where(w => !string.IsNullOrWhiteSpace(w))
                        .Select(w => w.ToLowerInvariant())
                        .Distinct()
                        .OrderBy(x => x, StringComparer.Ordinal)
                        .ToArray();

            report.WordCount = words.Length;
            Console.WriteLine($"Words: {words.Length:n0}");
            Console.ResetColor();

            var hash = new HashSet<string>(words);

            PrintPhase("Creating DAWG...");
            sw.Start();
            var dawg = Dawg.Create(words);

            sw.Stop();
            PrintOK(sw);
            report.BuildDawgTime = sw.Elapsed;
            report.NodeCount     = dawg.NodeCount;
            Console.WriteLine($"Nodes {dawg.NodeCount:n0}");

            report.ArraySize = EstimateObjectSize(words);
            report.HashSize  = EstimateObjectSize(hash);

            Console.WriteLine($"HashSize:{report.HashSize/1024.0:n2} kB  ArraySize:{report.ArraySize/1024.0:n2} kB.");

            sw.Restart();
            PrintPhase("Verifying... ");
            Dawg.Verify(words, dawg);
            sw.Stop();
            PrintOK(sw);

            PrintPhase("Serializing/Deserializing the DAWG... \n");
            var dawgFile = Path.GetTempFileName();
            var gzipFile = Path.GetTempFileName();

            try
            {
                Console.Write(" * Write DAWG to disk... ");
                sw.Restart();
                Dawg.Write(dawg, dawgFile);
                sw.Stop();
                PrintOK(sw);
                report.DawgWriteTime = sw.Elapsed;

                // From here on the benchmarks run against the deserialized instance.
                Console.Write(" * Read DAWG from disk... ");
                sw.Restart();
                dawg = Dawg.Read(dawgFile);
                sw.Stop();
                PrintOK(sw);

                Console.Write(" * Write Gzip to disk... ");
                sw.Restart();
                report.GzipOriginalFileLength = CompressedFileSize(fileName, gzipFile);
                sw.Stop();
                report.GzipCompressTime = sw.Elapsed;
                PrintOK(sw);

                Console.Write(" * Read Gzip from disk... ");
                sw.Restart();
                DecompressFileSize(gzipFile);
                sw.Stop();
                report.GzipDecompressTime = sw.Elapsed;
                PrintOK(sw);

                report.OriginalFileLength = new FileInfo(fileName).Length;
                Console.Write($"Original File is {report.OriginalFileLength / 1024:n0} kB. ");
                report.DawgFileLength = new FileInfo(dawgFile).Length;
                Console.Write($"DAWG File is {report.DawgFileLength / 1024:n0} kB. ");
                Console.Write($"Gzip File is {report.GzipOriginalFileLength / 1024:n0} kB. \n");
            }
            finally
            {
                // Remove the temp files even when one of the phases above throws.
                File.Delete(dawgFile);
                File.Delete(gzipFile);
            }

            PrintPhase("DAWGSharp package comparison ...");
            sw.Restart();
            var dawgSharpBuilder = new DawgSharp.DawgBuilder<bool>(); // <bool> is the value type.

            foreach (string key in words)
            {
                dawgSharpBuilder.Insert(key, true);
            }

            var dawgSharp = dawgSharpBuilder.BuildDawg();

            sw.Stop();
            // Serialization happens after the stopwatch is stopped, so it is not
            // part of the time reported for this phase.
            using var ms = new MemoryStream();
            dawgSharp.SaveTo(ms);
            PrintOK(sw);
            Console.WriteLine($" * DAWGSharp NodeCount is {dawgSharp.GetNodeCount():n0}.");
            // Divide by 1024.0: integer division would always print ".00" with the n2 format.
            Console.WriteLine($" * DAWGSharp File is {ms.Length/1024.0:n2} kB.");

            sw.Restart();
            PrintPhase("Checking own words... ");
            TestOwnWordsExits(words, dawg);
            sw.Stop();
            PrintOK(sw);

            PrintPhase("Find 20 random word 100 times... \n");
            var random = new Random();
            var toFind = new HashSet<string>();

            // The words are distinct, so a list with fewer than 20 entries can never
            // yield 20 different samples; cap the target to avoid spinning forever
            // (and to avoid indexing into an empty array).
            var sampleSize = Math.Min(20, words.Length);

            while (toFind.Count < sampleSize)
            {
                toFind.Add(words[random.Next(0, words.Length)]);
            }
            Console.Write(" * Find in word set...");
            sw.Restart();
            FindWords(toFind, hash, 100);
            sw.Stop();
            report.HashContains = sw.Elapsed;
            PrintOK(sw);
            Console.Write(" * Find in word list...");
            sw.Restart();
            FindWords(toFind, words, 100);
            sw.Stop();
            report.ArrayContains = sw.Elapsed;
            PrintOK(sw);
            Console.Write(" * Find in word list binary search...");
            sw.Restart();
            FindWordsBinarySearch(toFind, words, 100);
            sw.Stop();
            report.BinarySearchContains = sw.Elapsed;
            PrintOK(sw);
            Console.Write(" * Find in DAWG...");
            sw.Restart();
            FindWords(toFind, dawg, 100);
            // Stop before reading Elapsed, like every other phase, so the reported
            // value and the printed value are the same measurement.
            sw.Stop();
            report.DawgSearch = sw.Elapsed;
            PrintOK(sw);

            // The 20 most frequent 4-character prefixes among words longer than 4 characters.
            var prefixes = words
                           .Where(w => w.Length > 4)
                           .Select(w => w.Substring(0, 4))
                           .GroupBy(s => s).Select(g => (g.Key, g.Count()))
                           .OrderByDescending(x => x.Item2)
                           .Select(x => x.Key)
                           .Take(20)
                           .ToArray();

            PrintPhase($"Finding 50 words that start with 20 prefixes 100 times...\n");
            Console.Write($" * DAWG...                ");
            sw.Restart();
            var found = DawgPrefixSearch(dawg, prefixes, 100, 50);

            sw.Stop();
            report.DawgPrefixSearch = sw.Elapsed;
            Console.Write($"found {string.Join(", ", found.Select(x => x.Count).OrderByDescending(x => x))}");
            PrintOK(sw);

            Console.Write($" * Array...               ");
            sw.Restart();
            // https://stackoverflow.com/questions/52395504/inconsistent-string-startswith-on-different-platforms
            var hfound = LinearPrefixSearch(words, prefixes, 100, 50);

            sw.Stop();
            report.ArrayPrefixSearch = sw.Elapsed;
            Console.Write($"found {string.Join(", ", hfound.Select(x => x.Count).OrderByDescending(x => x))}");
            PrintOK(sw);
            if (!SameSets(found, hfound))
            {
                throw new InvalidOperationException("Different prefixed words count.");
            }

            PrintPhase($"Finding first 500 words that start with 20 prefixes 100 times...\n");
            Console.Write($" * DAWG...                ");
            sw.Restart();
            found = DawgPrefixSearch(dawg, prefixes, 100, 500);
            sw.Stop();
            report.DawgLimitedPrefixSearch = sw.Elapsed;
            Console.Write($"found {string.Join(", ", found.Select(x => x.Count).OrderByDescending(x => x))}");
            PrintOK(sw);

            Console.Write($" * Array...               ");
            sw.Restart();
            hfound = LinearPrefixSearch(words, prefixes, 100, 500);
            sw.Stop();
            report.ArrayLimitedPrefixSearch = sw.Elapsed;
            Console.Write($"found {string.Join(", ", hfound.Select(x => x.Count).OrderByDescending(x => x))}");
            PrintOK(sw);
            if (!SameSets(found, hfound))
            {
                throw new InvalidOperationException("Different prefixed words count.");
            }

            sw.Restart();
            PrintPhase("Checking random generated words in parallel... ");
            TestRandomGeneratedWords(words, hash, dawg);
            sw.Stop();
            PrintOK(sw);
            Console.WriteLine();
            return report;
        }