/// <summary>
/// Benchmarks MinHash-based duplicate detection with the given number of hash functions
/// and returns the resulting quality statistics.
/// </summary>
/// <param name="groundTruth">Known pairwise duplicate rates used as the reference.</param>
/// <param name="documents">Document name → content map to analyze.</param>
/// <param name="hashCount">Number of min-hash functions to use.</param>
private static SequenceStats ProcessMinHash(
    DuplicateRates groundTruth, Dictionary<string, string> documents, int hashCount)
{
    Console.WriteLine("Min: " + hashCount + " " + DateTime.Now);
    var hasher = new MinDuplicateHasher(hashCount, 5);
    return Process(groundTruth, documents, new HashDuplicateAnalyzer<ulong>(hasher));
}
/// <summary>
/// Benchmarks SimHash-based duplicate detection with the given signature width
/// and returns the resulting quality statistics.
/// </summary>
/// <param name="groundTruth">Known pairwise duplicate rates used as the reference.</param>
/// <param name="documents">Document name → content map to analyze.</param>
/// <param name="bytesCount">Signature size in bytes (logged as bits).</param>
private static SequenceStats ProcessSimHash(
    DuplicateRates groundTruth, Dictionary<string, string> documents, int bytesCount)
{
    Console.WriteLine("Sim: " + bytesCount * 8 + " " + DateTime.Now);
    var hasher = new SimDuplicateHasher(bytesCount, 5);
    return Process(groundTruth, documents, new HashDuplicateAnalyzer<byte>(hasher));
}
/// <summary>
/// Deduplication pipeline entry point: loads ground-truth duplicate rates and the input
/// documents, normalizes whitespace, hashes the documents, removes exact-hash duplicates
/// confirmed by the ground truth, and writes the surviving documents as JSON.
/// </summary>
private static void Main()
{
    var groundTruth = DuplicateRates.FromLines(File.ReadLines(GroundTruthFile));

    // NOTE(review): only whitespace normalization is active; the other normalizers
    // (content/words) were tried and left here for reference:
    //   .MapValues(SourceCodeNormalizer.NormalizeContent)
    //   .MapValues(SourceCodeNormalizer.NormalizeSpaces)
    //   .MapValues(SourceCodeNormalizer.NormalizeWords)
    var documents = DirectoryTools
        .ReadStringDictionary(InputFile)
        .MapValues(SourceCodeNormalizer.NormalizeSpaces);

    var hashes = HashTools.HashValues(documents);
    var duplicates = FindDuplicates(hashes, groundTruth);

    DirectoryTools.SaveAsJson(documents.RemoveKeys(duplicates), OutputFile);
}
/// <summary>
/// Walks groups of documents that share the same hash and collects the names that the
/// ground truth confirms as exact duplicates (similarity == 1 within a small tolerance)
/// of the group's first member. Hash collisions that the ground truth rejects are
/// counted as mistakes and logged per group.
/// </summary>
/// <param name="hashes">Map from document hash to the names sharing that hash.</param>
/// <param name="duplicateRates">Ground-truth pairwise similarity source.</param>
/// <returns>Names of documents confirmed as duplicates (group representatives are kept).</returns>
private static IReadOnlyCollection<string> FindDuplicates(
    Dictionary<ulong, string[]> hashes, DuplicateRates duplicateRates)
{
    var duplicates = new List<string>();
    var totalErrors = 0;
    var groupCount = 0;

    foreach (var (_, names) in hashes)
    {
        // Singleton groups have nothing to compare.
        if (names.Length < 2)
        {
            continue;
        }

        var parent = names[0];
        var errors = 0;

        for (var i = 1; i < names.Length; i++)
        {
            var name = names[i];
            // Tolerant equality check against 1.0 — similarity is floating point.
            if (Math.Abs(duplicateRates.GetSimilarity(parent, name) - 1) < 0.001)
            {
                duplicates.Add(name);
            }
            else
            {
                errors++;
                totalErrors++;
            }
        }

        groupCount++;
        Console.WriteLine($"Group {groupCount}, size: {names.Length}, mistakes: {errors}");
    }

    Console.WriteLine($"Total errors: {totalErrors}");
    Console.WriteLine($"Total groups: {groupCount}");
    Console.WriteLine($"Total duplicates: {duplicates.Count}");
    return duplicates;
}
/// <summary>
/// Feeds every document into the analyzer, computes its duplicate candidates, and scores
/// them against the ground truth (NDCG@10), logging the elapsed wall-clock time.
/// </summary>
/// <param name="groundTruth">Known pairwise duplicate rates used as the reference.</param>
/// <param name="documents">Document name → content map to analyze.</param>
/// <param name="analyzer">Duplicate-detection strategy under evaluation.</param>
/// <returns>Quality statistics comparing analyzer output to the ground truth.</returns>
private static SequenceStats Process(
    DuplicateRates groundTruth, Dictionary<string, string> documents, IDuplicateAnalyzer analyzer)
{
    // Idiomatic one-step construction + start of the stopwatch.
    var sw = Stopwatch.StartNew();
    foreach (var (name, text) in documents)
    {
        analyzer.AddText(name, text);
    }

    var duplicates = analyzer.GetAllDuplicates();
    var stats = StatisticsExtensions.GetNGDcg10Stats(groundTruth.Data, duplicates.Data);
    // Fixed log message: the original "Done in" lacked a trailing space,
    // producing output like "Done in00:00:01.234".
    Console.WriteLine("Done in " + sw.Elapsed);
    return stats;
}
/// <summary>
/// Benchmark entry point: loads the documents and ground truth, runs MinHash at several
/// hash counts and SimHash at several signature widths, and writes all resulting quality
/// statistics to a single JSON report.
/// </summary>
private static void Main()
{
    var documents = DirectoryTools.ReadStringDictionary(InputFile);
    var groundTruth = DuplicateRates.FromLines(File.ReadLines(GroundTruthFile), documents.Keys);

    // SimHash sizes are written as "bits / 8" so the bit width stays visible.
    var result = new
    {
        Min1 = ProcessMinHash(groundTruth, documents, 1),
        Min5 = ProcessMinHash(groundTruth, documents, 5),
        Min10 = ProcessMinHash(groundTruth, documents, 10),
        Min30 = ProcessMinHash(groundTruth, documents, 30),
        Min50 = ProcessMinHash(groundTruth, documents, 50),
        Min100 = ProcessMinHash(groundTruth, documents, 100),
        Sim8 = ProcessSimHash(groundTruth, documents, 8 / 8),
        Sim16 = ProcessSimHash(groundTruth, documents, 16 / 8),
        Sim64 = ProcessSimHash(groundTruth, documents, 64 / 8),
        Sim128 = ProcessSimHash(groundTruth, documents, 128 / 8),
        Sim256 = ProcessSimHash(groundTruth, documents, 256 / 8),
    };

    DirectoryTools.SaveAsJson(result, OutputFile);
}