/// <summary>
/// Reads the string dictionary stored in <c>InputFile</c>, joins all of its values
/// into one text, and saves two statistics about that text to <c>OutputFile</c> as JSON:
/// <c>chars</c> (distinct characters not contained in <c>TextTools.SourceCodeChars</c>)
/// and <c>words</c> (white-space-delimited tokens with their occurrence counts,
/// most frequent first).
/// </summary>
private static void Main()
{
    var documents = DirectoryTools.ReadStringDictionary(InputFile);
    var allText = string.Join(' ', documents.Values);

    // Characters that occur in the text but are not listed in TextTools.SourceCodeChars.
    var chars = allText
        .Distinct()
        .Except(TextTools.SourceCodeChars)
        .ToArray();

    // An empty separator array makes Split use white space as the delimiter.
    // RemoveEmptyEntries drops the empty tokens that consecutive white-space
    // characters (indentation, "\r\n", blank lines) would otherwise produce,
    // so the empty string is no longer counted as a word.
    // NOTE(review): Dictionary enumeration order is documented as undefined; the
    // "most frequent first" order in the output relies on insertion order being
    // kept by the serializer/runtime, exactly as the previous implementation did.
    var words = allText
        .Split(new char[0], System.StringSplitOptions.RemoveEmptyEntries)
        .GroupBy(word => word)
        .Select(group => new { Word = group.Key, Count = group.Count() })
        .OrderByDescending(entry => entry.Count)
        .ToDictionary(entry => entry.Word, entry => entry.Count);

    var result = new
    {
        chars,
        words,
    };

    DirectoryTools.SaveAsJson(result, OutputFile);
}
/// <summary>
/// Loads the documents from <c>InputFile</c>, normalizes their values with
/// <c>SourceCodeNormalizer.NormalizeSpaces</c>, removes every key reported by
/// <c>FindDuplicates</c>, and saves the remaining documents to <c>OutputFile</c> as JSON.
/// </summary>
private static void Main()
{
    // The ground truth is built from the lines of GroundTruthFile before the documents are read.
    var knownRates = DuplicateRates.FromLines(File.ReadLines(GroundTruthFile));

    // Only NormalizeSpaces is applied at this step. Alternatives previously tried
    // here: SourceCodeNormalizer.NormalizeContent and SourceCodeNormalizer.NormalizeWords.
    var rawDocuments = DirectoryTools.ReadStringDictionary(InputFile);
    var normalizedDocuments = rawDocuments.MapValues(SourceCodeNormalizer.NormalizeSpaces);

    // Hashes are computed over the normalized values and matched against the ground truth.
    var documentHashes = HashTools.HashValues(normalizedDocuments);
    var duplicateKeys = FindDuplicates(documentHashes, knownRates);

    var deduplicated = normalizedDocuments.RemoveKeys(duplicateKeys);
    DirectoryTools.SaveAsJson(deduplicated, OutputFile);
}
/// <summary>
/// Runs <c>ProcessMinHash</c> and <c>ProcessSimHash</c> over the documents from
/// <c>InputFile</c> with a fixed series of size arguments and saves all results,
/// keyed by configuration name, to <c>OutputFile</c> as JSON.
/// </summary>
private static void Main()
{
    // SimHash sizes are written as a bit width divided by 8
    // (presumably a size in bytes; confirm against ProcessSimHash).
    const int BitsPerByte = 8;

    var corpus = DirectoryTools.ReadStringDictionary(InputFile);
    var expectedRates = DuplicateRates.FromLines(File.ReadLines(GroundTruthFile), corpus.Keys);

    // MinHash runs, executed in ascending order of the size argument.
    var min1 = ProcessMinHash(expectedRates, corpus, 1);
    var min5 = ProcessMinHash(expectedRates, corpus, 5);
    var min10 = ProcessMinHash(expectedRates, corpus, 10);
    var min30 = ProcessMinHash(expectedRates, corpus, 30);
    var min50 = ProcessMinHash(expectedRates, corpus, 50);
    var min100 = ProcessMinHash(expectedRates, corpus, 100);

    // SimHash runs, executed after all MinHash runs.
    var sim8 = ProcessSimHash(expectedRates, corpus, 8 / BitsPerByte);
    var sim16 = ProcessSimHash(expectedRates, corpus, 16 / BitsPerByte);
    var sim64 = ProcessSimHash(expectedRates, corpus, 64 / BitsPerByte);
    var sim128 = ProcessSimHash(expectedRates, corpus, 128 / BitsPerByte);
    var sim256 = ProcessSimHash(expectedRates, corpus, 256 / BitsPerByte);

    var report = new
    {
        Min1 = min1,
        Min5 = min5,
        Min10 = min10,
        Min30 = min30,
        Min50 = min50,
        Min100 = min100,
        Sim8 = sim8,
        Sim16 = sim16,
        Sim64 = sim64,
        Sim128 = sim128,
        Sim256 = sim256,
    };

    DirectoryTools.SaveAsJson(report, OutputFile);
}
/// <summary>
/// Passes the result of <c>DirectoryTools.LoadDirectory(InputDirectory)</c> to
/// <c>DirectoryTools.SaveAsJson</c>, writing it to <c>OutputFile</c>.
/// </summary>
private static void Main() =>
    DirectoryTools.SaveAsJson(DirectoryTools.LoadDirectory(InputDirectory), OutputFile);