コード例 #1
0
ファイル: Program.cs プロジェクト: SWATOPLUS/InfoSearch
        /// <summary>
        /// Evaluates duplicate detection using an analyzer built on <c>MinDuplicateHasher</c>.
        /// </summary>
        /// <param name="groundTruth">Reference duplicate rates, forwarded unchanged to <c>Process</c>.</param>
        /// <param name="documents">Documents to analyze, forwarded unchanged to <c>Process</c>.</param>
        /// <param name="hashCount">First constructor argument of <c>MinDuplicateHasher</c>; also echoed in the progress line.</param>
        /// <returns>Whatever <c>Process</c> returns for the constructed analyzer.</returns>
        private static SequenceStats ProcessMinHash(
            DuplicateRates groundTruth,
            Dictionary<string, string> documents,
            int hashCount)
        {
            // Progress line: configuration label, hash count and the current local time.
            Console.WriteLine($"Min: {hashCount} {DateTime.Now}");

            // NOTE(review): the meaning of the second argument (5) is not visible from here — confirm against MinDuplicateHasher.
            var hasher = new MinDuplicateHasher(hashCount, 5);
            var analyzer = new HashDuplicateAnalyzer<ulong>(hasher);

            return Process(groundTruth, documents, analyzer);
        }
コード例 #2
0
ファイル: Program.cs プロジェクト: SWATOPLUS/InfoSearch
        /// <summary>
        /// Evaluates duplicate detection using an analyzer built on <c>SimDuplicateHasher</c>.
        /// </summary>
        /// <param name="groundTruth">Reference duplicate rates, forwarded unchanged to <c>Process</c>.</param>
        /// <param name="documents">Documents to analyze, forwarded unchanged to <c>Process</c>.</param>
        /// <param name="bytesCount">First constructor argument of <c>SimDuplicateHasher</c>; logged multiplied by 8.</param>
        /// <returns>Whatever <c>Process</c> returns for the constructed analyzer.</returns>
        private static SequenceStats ProcessSimHash(
            DuplicateRates groundTruth,
            Dictionary<string, string> documents,
            int bytesCount)
        {
            // The progress line reports bytesCount * 8 rather than the raw byte count.
            Console.WriteLine($"Sim: {bytesCount * 8} {DateTime.Now}");

            // NOTE(review): the meaning of the second argument (5) is not visible from here — confirm against SimDuplicateHasher.
            var hasher = new SimDuplicateHasher(bytesCount, 5);
            var analyzer = new HashDuplicateAnalyzer<byte>(hasher);

            return Process(groundTruth, documents, analyzer);
        }
コード例 #3
0
        /// <summary>
        /// Entry point: loads the ground truth and the documents, normalizes the document values,
        /// hashes them, determines which document names are duplicates, removes those keys and
        /// saves the remaining documents as JSON to <c>OutputFile</c>.
        /// </summary>
        private static void Main()
        {
            var truthLines  = File.ReadLines(GroundTruthFile);
            var groundTruth = DuplicateRates.FromLines(truthLines);

            // Only NormalizeSpaces is applied. Other normalizers that were previously tried and are
            // currently disabled: NormalizeContent, NormalizeSpaces, NormalizeWords (in that order).
            var rawDocuments = DirectoryTools.ReadStringDictionary(InputFile);
            var documents    = rawDocuments.MapValues(SourceCodeNormalizer.NormalizeSpaces);

            var hashes     = HashTools.HashValues(documents);
            var duplicates = FindDuplicates(hashes, groundTruth);

            // Drop every document reported as a duplicate before persisting.
            var deduplicated = documents.RemoveKeys(duplicates);
            DirectoryTools.SaveAsJson(deduplicated, OutputFile);
        }
コード例 #4
0
        /// <summary>
        /// Walks every hash group containing at least two names and compares each later name with
        /// the first name of the group using <paramref name="duplicateRates"/>. Names whose
        /// similarity to the first name is within 0.001 of 1 are collected as duplicates; the
        /// others are counted as mistakes. Per-group and total figures are written to the console.
        /// </summary>
        /// <param name="hashes">Groups of names keyed by hash value.</param>
        /// <param name="duplicateRates">Source of pairwise similarity values.</param>
        /// <returns>The names collected as duplicates (the first name of a group is never included).</returns>
        private static IReadOnlyCollection<string> FindDuplicates(
            Dictionary<ulong, string[]> hashes,
            DuplicateRates duplicateRates)
        {
            var confirmed     = new List<string>();
            var mistakesTotal = 0;
            var groupsSeen    = 0;

            // Groups with a single name cannot contain a duplicate and are not counted.
            foreach (var group in hashes.Values.Where(g => g.Length >= 2))
            {
                var representative = group[0];
                var mistakes       = 0;

                for (var i = 1; i < group.Length; i++)
                {
                    var candidate = group[i];
                    var distance  = Math.Abs(duplicateRates.GetSimilarity(representative, candidate) - 1);

                    if (distance < 0.001)
                    {
                        confirmed.Add(candidate);
                        continue;
                    }

                    // Same hash but the similarity is not (almost) 1: counted as a mistake.
                    mistakes++;
                }

                mistakesTotal += mistakes;
                groupsSeen++;

                Console.WriteLine($"Group {groupsSeen}, size: {group.Length}, mistakes: {mistakes}");
            }

            Console.WriteLine($"Total errors: {mistakesTotal}");
            Console.WriteLine($"Total groups: {groupsSeen}");
            Console.WriteLine($"Total duplicates: {confirmed.Count}");

            return confirmed;
        }
コード例 #5
0
ファイル: Program.cs プロジェクト: SWATOPLUS/InfoSearch
        /// <summary>
        /// Feeds every document into <paramref name="analyzer"/>, asks it for all duplicates and
        /// scores the result against <paramref name="groundTruth"/> via
        /// <c>StatisticsExtensions.GetNGDcg10Stats</c>. The elapsed time is written to the console.
        /// </summary>
        /// <param name="groundTruth">Reference duplicate rates; its <c>Data</c> is passed to the scoring call.</param>
        /// <param name="documents">Name-to-text pairs; each pair is passed to <c>analyzer.AddText</c>.</param>
        /// <param name="analyzer">Analyzer that receives the texts and produces the duplicates.</param>
        /// <returns>The statistics produced by <c>GetNGDcg10Stats</c>.</returns>
        private static SequenceStats Process(
            DuplicateRates groundTruth,
            Dictionary<string, string> documents,
            IDuplicateAnalyzer analyzer)
        {
            // The timer covers adding the texts, computing the duplicates and computing the stats.
            var sw = Stopwatch.StartNew();

            foreach (var (name, text) in documents)
            {
                analyzer.AddText(name, text);
            }

            var duplicates = analyzer.GetAllDuplicates();

            var stats = StatisticsExtensions.GetNGDcg10Stats(groundTruth.Data, duplicates.Data);

            sw.Stop();

            // A separating space is required; without it the output reads "Done in00:00:...".
            Console.WriteLine($"Done in {sw.Elapsed}");

            return stats;
        }
コード例 #6
0
ファイル: Program.cs プロジェクト: SWATOPLUS/InfoSearch
        /// <summary>
        /// Entry point: loads the documents and the ground truth, runs the min-hash and sim-hash
        /// evaluations for a fixed set of sizes, and saves all results as one JSON object to
        /// <c>OutputFile</c>.
        /// </summary>
        private static void Main()
        {
            var documents   = DirectoryTools.ReadStringDictionary(InputFile);
            var truthLines  = File.ReadLines(GroundTruthFile);
            var groundTruth = DuplicateRates.FromLines(truthLines, documents.Keys);

            // Sim-hash sizes are written in bits and converted to the byte count ProcessSimHash takes.
            const int bitsPerByte = 8;

            SequenceStats Min(int hashCount) => ProcessMinHash(groundTruth, documents, hashCount);
            SequenceStats Sim(int bits) => ProcessSimHash(groundTruth, documents, bits / bitsPerByte);

            // Initializers run top to bottom, so the evaluations execute in the listed order.
            var result = new
            {
                Min1   = Min(1),
                Min5   = Min(5),
                Min10  = Min(10),
                Min30  = Min(30),
                Min50  = Min(50),
                Min100 = Min(100),
                Sim8   = Sim(8),
                Sim16  = Sim(16),
                Sim64  = Sim(64),
                Sim128 = Sim(128),
                Sim256 = Sim(256),
            };

            DirectoryTools.SaveAsJson(result, OutputFile);
        }