Example #1
0
        /// <summary>
        /// Command-line entry point. Enumerates records via <c>EnumerateCsvRecords</c> and, in parallel,
        /// hands each one to a <c>RepoDataExtractor</c> that writes through a shared
        /// <c>ChunkedJsonGzWriter</c> (created with <c>useJsonlFormat: true</c>).
        /// </summary>
        /// <param name="args">
        /// Exactly three values: the base URL, the number of chunks to read (an integer),
        /// and the output file prefix.
        /// </param>
        static void Main(string[] args)
        {
            const string usage = "Usage: <baseURL> <numChunksToRead> <ouputFilePrefix>";

            if (args.Length != 3)
            {
                Console.WriteLine(usage);
                return;
            }

            var baseUrl = args[0];

            // Report a malformed count as a usage error instead of crashing with a FormatException.
            if (!int.TryParse(args[1], out var numChunksToRead))
            {
                Console.WriteLine($"<numChunksToRead> must be an integer, got '{args[1]}'.");
                Console.WriteLine(usage);
                return;
            }

            var outputFilePrefix = args[2];

            // GetDirectoryName returns null or "" when the prefix has no directory component
            // (e.g. "out"); CreateDirectory rejects both, so only create a directory when one is named.
            var outputDirectory = Path.GetDirectoryName(outputFilePrefix);
            if (!string.IsNullOrEmpty(outputDirectory))
            {
                Directory.CreateDirectory(outputDirectory);
            }

            using (var writer = new ChunkedJsonGzWriter(
                       outputFilenameTemplate: outputFilePrefix,
                       useJsonlFormat: true))
            {
                var extractor = new RepoDataExtractor();

                // NOTE(review): writer and extractor are shared by all parallel workers;
                // this assumes both are safe for concurrent use — confirm.
                void ExtractRecord(CodeRecord rec)
                {
                    // Split() with no arguments splits on whitespace; the first two tokens are
                    // passed on as the repository path and the relative file path.
                    var parts = rec.RepositoryPath.Split();
                    if (parts.Length < 2)
                    {
                        // An unhandled exception here would abort the whole Parallel.ForEach,
                        // so skip the single malformed record instead.
                        Console.WriteLine($"Skipping record with malformed repository path: '{rec.RepositoryPath}'");
                        return;
                    }

                    extractor.ExtractDataFrom(parts[0], parts[1], rec.Content, writer);
                }

                Parallel.ForEach(
                    source: EnumerateCsvRecords(baseUrl, numChunksToRead),
                    body: ExtractRecord
                    );
            }
        }
Example #2
0
        /// <summary>
        /// Walks every method declaration with a body in <paramref name="syntaxTree"/> and writes one
        /// element per method that passes the name, summary-length and line-count filters.
        /// </summary>
        /// <param name="syntaxTree">Parsed syntax tree to extract methods from.</param>
        /// <param name="writer">Writer that receives one element per accepted method.</param>
        /// <param name="repoPath">Repository path forwarded unchanged to <c>WriteMethodData</c>.</param>
        private void ExtractDataFrom(SyntaxTree syntaxTree, ChunkedJsonGzWriter writer, string repoPath)
        {
            // A throw-away single-tree compilation is enough to obtain a semantic model.
            var compilation   = CSharpCompilation.Create("tmpCompilation", syntaxTrees: new[] { syntaxTree });
            var compiledTree  = compilation.SyntaxTrees.First();
            var semanticModel = compilation.GetSemanticModel(compiledTree);

            var allDeclaredMethods = MethodUtils.GetAllMethodDeclarations(compiledTree.GetRoot());

            foreach (var methodDeclarationNode in allDeclaredMethods.Where(m => m.Body != null))
            {
                try
                {
                    if (!(semanticModel.GetDeclaredSymbol(methodDeclarationNode) is IMethodSymbol methodSymbol))
                    {
                        continue;
                    }

                    // Skip methods whose name contains a blacklisted word. The words are lower-cased
                    // with the invariant culture so that matching does not depend on the machine's
                    // locale (e.g. under tr-TR "I" would otherwise lower-case to a dotless "ı").
                    var nameWords = SplitCamelCaseRegex.Split(methodDeclarationNode.Identifier.Text);
                    if (nameWords.Any(word => BlackListedFunctionNames.Contains(word.ToLowerInvariant())))
                    {
                        continue;
                    }

                    // Only the summary is used here; the other two documentation parts are discarded.
                    var (summary, _, _) = MethodUtils.GetDocumentationComment(methodSymbol);

                    // Replace <seealso cref="!:Fully.Qualified.Name#method()" /> tags with their cref content
                    // and other similar replacements
                    var cleanedSummary = Regex.Replace(summary, "</?[^\\n>]+/?>", new MatchEvaluator(ReplaceXml));

                    // If the summary has an empty line, remove everything beneath it.
                    var paragraphs = Regex.Split(cleanedSummary, @"\n\s*\n")
                                          .Select(p => p.Trim())
                                          .Where(p => p.Length > 0)
                                          .ToArray();
                    if (paragraphs.Length > 1)
                    {
                        cleanedSummary = paragraphs[0];
                    }

                    if (string.IsNullOrWhiteSpace(cleanedSummary) || cleanedSummary.Length < MIN_SUMMARY_CHAR_LENGTH)
                    {
                        // Empty or too short summary
                        continue;
                    }

                    // Number of source lines spanned by the method body, inclusive of both ends.
                    var lineSpan = compiledTree.GetMappedLineSpan(methodDeclarationNode.Body.Span);
                    var bodyLineCount = lineSpan.EndLinePosition.Line - lineSpan.StartLinePosition.Line + 1;
                    if (bodyLineCount <= MIN_NUM_LINES)
                    {
                        continue; // Method seems to be too short.
                    }

                    // Both the raw and the cleaned summary are handed to WriteMethodData.
                    writer.WriteElement(jw => WriteMethodData(methodDeclarationNode, summary, cleanedSummary, jw, repoPath));
                }
                catch (Exception e)
                {
                    // Best effort: a failure on one method is logged and the loop moves on to the next.
                    Console.WriteLine($"Failed to extract data: {e.Message}");
                }
            }
        }
Example #3
0
        /// <summary>
        /// Parses <paramref name="content"/> as C# script-kind source and runs the syntax-tree
        /// based extraction on the resulting tree.
        /// </summary>
        /// <param name="repoPath">Repository path forwarded to the syntax-tree overload.</param>
        /// <param name="relativePath">File path recorded on the parsed syntax tree.</param>
        /// <param name="content">Source text to parse.</param>
        /// <param name="writer">Writer forwarded to the syntax-tree overload.</param>
        public void ExtractDataFrom(string repoPath, string relativePath, string content, ChunkedJsonGzWriter writer)
        {
            // Parse with SourceCodeKind.Script rather than the default kind.
            var scriptOptions = CSharpParseOptions.Default.WithKind(SourceCodeKind.Script);

            var parsedTree = CSharpSyntaxTree.ParseText(
                text: content,
                path: relativePath,
                options: scriptOptions);

            ExtractDataFrom(parsedTree, writer, repoPath);
        }