/// <summary>
/// Command-line entry point. Enumerates CSV records from the given base URL and runs
/// <c>RepoDataExtractor.ExtractDataFrom</c> on each of them in parallel, writing the
/// results through a single shared <c>ChunkedJsonGzWriter</c> with JSONL format enabled.
/// </summary>
/// <param name="args">
/// Exactly three values: the base URL, the number of chunks to read (an integer) and
/// the output file prefix. Anything else prints the usage text and returns.
/// </param>
static void Main(string[] args)
{
    const string usage = "Usage: <baseURL> <numChunksToRead> <ouputFilePrefix>";

    if (args.Length != 3)
    {
        Console.WriteLine(usage);
        return;
    }

    var baseUrl = args[0];
    var outputFilePrefix = args[2];

    // Reject a non-numeric chunk count with a readable message instead of an
    // unhandled FormatException.
    if (!int.TryParse(args[1], out var numChunksToRead))
    {
        Console.WriteLine($"numChunksToRead must be an integer, got '{args[1]}'.");
        Console.WriteLine(usage);
        return;
    }

    // GetDirectoryName yields "" when the prefix has no directory component and null
    // for a root path; Directory.CreateDirectory throws on both, so only create the
    // directory when there actually is one to create.
    var outputDirectory = Path.GetDirectoryName(outputFilePrefix);
    if (!string.IsNullOrEmpty(outputDirectory))
    {
        Directory.CreateDirectory(outputDirectory);
    }

    using (var writer = new ChunkedJsonGzWriter(
        outputFilenameTemplate: outputFilePrefix,
        useJsonlFormat: true))
    {
        var extractor = new RepoDataExtractor();

        void ExtractRecord(CodeRecord rec)
        {
            // Split() without arguments splits on whitespace.
            // NOTE(review): assumes RepositoryPath always holds at least two
            // whitespace-separated fields (repo path, relative path) - confirm
            // against the CSV producer; a shorter value throws here.
            var parts = rec.RepositoryPath.Split();
            extractor.ExtractDataFrom(parts[0], parts[1], rec.Content, writer);
        }

        // NOTE(review): the same writer instance is used from all worker threads;
        // presumably ChunkedJsonGzWriter is thread-safe - confirm.
        Parallel.ForEach(
            source: EnumerateCsvRecords(baseUrl, numChunksToRead),
            body: ExtractRecord);
    }
}
/// <summary>
/// Visits every method declaration with a body in <paramref name="syntaxTree"/> and
/// writes one element to <paramref name="writer"/> for each method that survives the
/// filters described in the remarks.
/// </summary>
/// <remarks>
/// A method is skipped when its declared symbol is not an <c>IMethodSymbol</c>, when any
/// piece of its identifier produced by <c>SplitCamelCaseRegex</c> is contained in
/// <c>BlackListedFunctionNames</c>, when its cleaned summary is blank or shorter than
/// <c>MIN_SUMMARY_CHAR_LENGTH</c>, or when its body spans at most <c>MIN_NUM_LINES</c>
/// lines. An exception raised while handling one method is reported on the console and
/// does not stop the remaining methods from being processed.
/// </remarks>
/// <param name="syntaxTree">Parsed source file to scan.</param>
/// <param name="writer">Destination for the extracted method data.</param>
/// <param name="repoPath">Value forwarded unchanged to <c>WriteMethodData</c>.</param>
private void ExtractDataFrom(SyntaxTree syntaxTree, ChunkedJsonGzWriter writer, string repoPath)
{
    // A throw-away single-tree compilation is enough to obtain a semantic model.
    var compilation = CSharpCompilation.Create("tmpCompilation", syntaxTrees: new[] { syntaxTree });
    var compiledTree = compilation.SyntaxTrees.First();
    var semanticModel = compilation.GetSemanticModel(compiledTree);

    var allDeclaredMethods = MethodUtils.GetAllMethodDeclarations(compiledTree.GetRoot());
    foreach (var methodDeclarationNode in allDeclaredMethods.Where(m => m.Body != null))
    {
        try
        {
            if (!(semanticModel.GetDeclaredSymbol(methodDeclarationNode) is IMethodSymbol methodSymbol))
            {
                continue;
            }

            // Invariant lower-casing keeps the blacklist lookup independent of the
            // current culture (e.g. the Turkish dotted/dotless 'i' mapping).
            var nameParts = SplitCamelCaseRegex.Split(methodDeclarationNode.Identifier.Text);
            if (nameParts.Any(s => BlackListedFunctionNames.Contains(s.ToLowerInvariant())))
            {
                continue;
            }

            // Only the summary is used here; the other two components are discarded.
            var (summary, _, _) = MethodUtils.GetDocumentationComment(methodSymbol);

            // A blank summary can never pass the length filter below, and a null one
            // would make Regex.Replace throw, so bail out before doing any regex work.
            if (string.IsNullOrWhiteSpace(summary))
            {
                continue;
            }

            // Replace <seealso cref="!:Fully.Qualified.Name#method()" /> tags with their cref content
            // and other similar replacements
            var cleanedSummary = Regex.Replace(summary, "</?[^\\n>]+/?>", new MatchEvaluator(ReplaceXml));

            // If the summary has an empty line, remove everything beneath it. This only
            // applies when at least two non-empty paragraphs remain; otherwise the text
            // is kept exactly as it came out of the tag replacement.
            var paragraphs = Regex.Split(cleanedSummary, @"\n\s*\n")
                .Select(p => p.Trim())
                .Where(p => p.Length > 0)
                .ToArray();
            if (paragraphs.Length > 1)
            {
                cleanedSummary = paragraphs[0];
            }

            if (string.IsNullOrWhiteSpace(cleanedSummary) || cleanedSummary.Length < MIN_SUMMARY_CHAR_LENGTH)
            {
                // Empty or too short summary
                continue;
            }

            var lineSpan = compiledTree.GetMappedLineSpan(methodDeclarationNode.Body.Span);
            var bodyLineCount = lineSpan.EndLinePosition.Line - lineSpan.StartLinePosition.Line + 1;
            if (bodyLineCount <= MIN_NUM_LINES)
            {
                continue; // Method seems to be too short.
            }

            // Both the raw and the cleaned summary are handed to WriteMethodData.
            writer.WriteElement(jw => WriteMethodData(methodDeclarationNode, summary, cleanedSummary, jw, repoPath));
        }
        catch (Exception e)
        {
            // Best effort: report the failure and carry on with the next method.
            Console.WriteLine($"Failed to extract data: {e.Message}");
        }
    }
}
/// <summary>
/// Parses <paramref name="content"/> as C# script source and hands the resulting syntax
/// tree to the <c>SyntaxTree</c>-based <c>ExtractDataFrom</c> overload.
/// </summary>
/// <param name="repoPath">Passed through unchanged to the tree-based overload.</param>
/// <param name="relativePath">Path attached to the parsed syntax tree.</param>
/// <param name="content">Source text to parse.</param>
/// <param name="writer">Passed through unchanged to the tree-based overload.</param>
public void ExtractDataFrom(string repoPath, string relativePath, string content, ChunkedJsonGzWriter writer)
{
    // The text is parsed with SourceCodeKind.Script rather than the default kind.
    var scriptOptions = CSharpParseOptions.Default.WithKind(SourceCodeKind.Script);

    var parsedTree = CSharpSyntaxTree.ParseText(
        text: content,
        path: relativePath,
        options: scriptOptions);

    ExtractDataFrom(parsedTree, writer, repoPath);
}