Esempio n. 1
0
        /// <summary>
        /// Gathers character and word statistics over every document read from
        /// <c>InputFile</c> and writes them as JSON to <c>OutputFile</c>.
        /// </summary>
        private static void Main()
        {
            // All document bodies concatenated into one string, separated by single spaces.
            var corpus = string.Join(' ', DirectoryTools.ReadStringDictionary(InputFile).Values);

            // Distinct characters of the corpus that are not listed in TextTools.SourceCodeChars.
            var nonSourceChars = corpus.Distinct().Except(TextTools.SourceCodeChars).ToArray();

            // Occurrence count of every whitespace-separated token, sorted by descending
            // count before being copied into the resulting dictionary.
            // NOTE(review): Split() keeps empty entries, so consecutive whitespace yields
            // a "" token that is counted like any other word - confirm this is intended.
            var wordCounts = corpus.Split()
                .GroupBy(token => token)
                .Select(group => new { Word = group.Key, Count = group.Count() })
                .OrderByDescending(entry => entry.Count)
                .ToDictionary(entry => entry.Word, entry => entry.Count);

            // Property names are spelled out so the serialized keys stay "chars" and "words".
            DirectoryTools.SaveAsJson(new { chars = nonSourceChars, words = wordCounts }, OutputFile);
        }
Esempio n. 2
0
        /// <summary>
        /// Reads the documents from <c>InputFile</c>, normalizes their spacing, drops the
        /// keys reported by <c>FindDuplicates</c> and saves the remaining documents as
        /// JSON to <c>OutputFile</c>.
        /// </summary>
        private static void Main()
        {
            var expectedRates = DuplicateRates.FromLines(File.ReadLines(GroundTruthFile));

            // Only space normalization is applied here; the other SourceCodeNormalizer
            // steps (NormalizeContent, NormalizeWords) are not used in this run.
            var normalized = DirectoryTools
                .ReadStringDictionary(InputFile)
                .MapValues(SourceCodeNormalizer.NormalizeSpaces);

            // Duplicates are determined from the hashes of the normalized documents
            // together with the ground-truth rates.
            var keysToDrop = FindDuplicates(HashTools.HashValues(normalized), expectedRates);

            DirectoryTools.SaveAsJson(normalized.RemoveKeys(keysToDrop), OutputFile);
        }
Esempio n. 3
0
        /// <summary>
        /// Runs <c>ProcessMinHash</c> and <c>ProcessSimHash</c> over the documents from
        /// <c>InputFile</c> with a fixed set of size arguments and saves all results,
        /// keyed by configuration name, as JSON to <c>OutputFile</c>.
        /// </summary>
        private static void Main()
        {
            // Divisor applied to every Sim* size below.
            // NOTE(review): presumably converts a hash width in bits into bytes - confirm
            // against ProcessSimHash.
            const int BitsPerByte = 8;

            var documents = DirectoryTools.ReadStringDictionary(InputFile);

            // The ground truth is restricted to the keys of the loaded documents.
            var groundTruth = DuplicateRates.FromLines(File.ReadLines(GroundTruthFile), documents.Keys);

            // Property names and their order define the layout of the saved JSON.
            DirectoryTools.SaveAsJson(
                new
                {
                    Min1 = ProcessMinHash(groundTruth, documents, 1),
                    Min5 = ProcessMinHash(groundTruth, documents, 5),
                    Min10 = ProcessMinHash(groundTruth, documents, 10),
                    Min30 = ProcessMinHash(groundTruth, documents, 30),
                    Min50 = ProcessMinHash(groundTruth, documents, 50),
                    Min100 = ProcessMinHash(groundTruth, documents, 100),
                    Sim8 = ProcessSimHash(groundTruth, documents, 8 / BitsPerByte),
                    Sim16 = ProcessSimHash(groundTruth, documents, 16 / BitsPerByte),
                    Sim64 = ProcessSimHash(groundTruth, documents, 64 / BitsPerByte),
                    Sim128 = ProcessSimHash(groundTruth, documents, 128 / BitsPerByte),
                    Sim256 = ProcessSimHash(groundTruth, documents, 256 / BitsPerByte),
                },
                OutputFile);
        }