/// <summary>
/// Converts every revision record read from <paramref name="revisionDataFilePath"/> into
/// JSON-serialized change entries and writes them, one per line, to
/// <paramref name="outputFilePath"/> (UTF-8, overwriting any existing file).
/// </summary>
/// <remarks>
/// Revisions are processed through PLINQ; the resulting lines are written and counted by
/// the calling thread. Progress is reported on the console every 10 revisions, followed by
/// the total elapsed time. A failure to write an individual line does not abort the run:
/// it is reported on standard error and the remaining lines are still written.
/// </remarks>
/// <param name="revisionDataFilePath">Path handed to <c>ReadRevisionData</c>.</param>
/// <param name="outputFilePath">Path of the output file to create.</param>
/// <param name="grammarPath">Path used to construct the <c>JsonSyntaxTreeHelper</c>.</param>
public static void DumpRevisionDataForNeuralTraining(string revisionDataFilePath, string outputFilePath, string grammarPath)
        {
            int entryProcessed = 0;
            int failedWrites   = 0;

            // NOTE(review): this single helper instance is captured by the PLINQ lambda below and is
            // therefore used by several worker threads at once - confirm it is safe for concurrent use.
            var syntaxHelper = new JsonSyntaxTreeHelper(grammarPath);

            using (var fs = File.Open(outputFilePath, FileMode.Create))
                using (var sw = new StreamWriter(fs, Encoding.UTF8))
                {
                    Stopwatch stopwatch = Stopwatch.StartNew();

                    // Each revision is fully materialized (ToArray) on a worker thread so that the
                    // expensive parsing/serialization happens in parallel; only the writes are serial.
                    var serializedRevisions = ReadRevisionData(revisionDataFilePath)
                                              .AsParallel()
                                              .Select(x => ProcessSingleRevision(x, syntaxHelper)
                                                           .Select(t => changeEntryDatumToJsonString(t))
                                                           .ToArray());

                    foreach (var changeStrs in serializedRevisions)
                    {
                        entryProcessed++;
                        foreach (var changeStr in changeStrs)
                        {
                            try
                            {
                                sw.WriteLine(changeStr);
                            }
                            catch (Exception e)
                            {
                                // Best effort: keep going, but never lose a line silently.
                                failedWrites++;
                                Console.Error.WriteLine();
                                Console.Error.WriteLine($"Failed to write a change entry of revision #{entryProcessed}: {e.Message}");
                            }
                        }

                        if (entryProcessed % 10 == 0)
                        {
                            Console.Write($"\rEntry processed: {entryProcessed}");
                        }
                    }

                    stopwatch.Stop();
                    Console.WriteLine();
                    Console.WriteLine($"Time elapsed: {stopwatch.Elapsed}");

                    if (failedWrites > 0)
                    {
                        Console.Error.WriteLine($"Change entries that could not be written: {failedWrites}");
                    }
                }
        }
        /// <summary>
        /// Extracts the change entries contained in a single revision record.
        /// </summary>
        /// <remarks>
        /// Both file versions are parsed and canonicalized, the changes between the two canonical
        /// trees are computed, and every change that passes the filters below is turned into one
        /// anonymous result object. A change is kept only when:
        /// the previous and updated chunks span the same number of lines;
        /// every node found on those lines has a kind contained in <c>allowedSytaxKinds</c>;
        /// both chunks have at least one non-blank token and pass <c>IsValidCodeChunkTokens</c>;
        /// and the two token sequences differ.
        /// The method is an iterator: nothing is executed until the result is enumerated.
        /// </remarks>
        /// <param name="jsonLine">
        /// JSON object text; the members <c>prev_file</c>, <c>updated_file</c>, <c>id</c> and
        /// <c>message</c> are read from it.
        /// </param>
        /// <param name="jsonSyntaxTreeHelper">Helper used to obtain the JSON form of each code chunk AST.</param>
        /// <returns>
        /// One object per accepted change, with the members <c>Id</c>, <c>PrevCodeChunk</c>,
        /// <c>UpdatedCodeChunk</c>, <c>PrevCodeChunkTokens</c>, <c>UpdatedCodeChunkTokens</c>,
        /// <c>PrevCodeAST</c>, <c>UpdatedCodeAST</c>, <c>PrecedingContext</c>,
        /// <c>SucceedingContext</c> and <c>CommitMessage</c>.
        /// </returns>
        public static IEnumerable <object> ProcessSingleRevision(string jsonLine, JsonSyntaxTreeHelper jsonSyntaxTreeHelper)
        {
            var entry = JObject.Parse(jsonLine);

            var previousFile = entry["prev_file"].ToString();
            var updatedFile  = entry["updated_file"].ToString();

            var previousFileAst = CSharpSyntaxTree.ParseText(previousFile);
            var updatedFileAst  = CSharpSyntaxTree.ParseText(updatedFile);

            // The variable-name map produced for the previous file is fed into the canonicalization
            // of the updated file; the map returned by that second call is not needed here.
            (SyntaxNode canonicalPrevFileAst, Dictionary <string, string> prevFileVariableNameMap) = Canonicalization.CanonicalizeSyntaxNode(previousFileAst.GetRoot(), extractAllVariablesFirst: true);
            (SyntaxNode canonicalUpdatedFileAst, _) = Canonicalization.CanonicalizeSyntaxNode(updatedFileAst.GetRoot(), prevFileVariableNameMap);

            var prevCodeFile    = canonicalPrevFileAst.GetText();
            var updatedCodeFile = canonicalUpdatedFileAst.GetText();

            var prevTokenIndex    = new TokenIndex(canonicalPrevFileAst.DescendantTokens().ToList());
            var updatedTokenIndex = new TokenIndex(canonicalUpdatedFileAst.DescendantTokens().ToList());

            var changesInRevision = GetChangesBetweenAsts(canonicalPrevFileAst.SyntaxTree, canonicalUpdatedFileAst.SyntaxTree);

            // Only incremented for emitted changes, so ids are consecutive within a revision.
            var changeId = 0;

            foreach (var change in changesInRevision)
            {
                var prevCodeChunkLineSpan    = canonicalPrevFileAst.SyntaxTree.GetLineSpan(change.BeforeSpan.ChangeSpan);
                var updatedCodeChunkLineSpan = canonicalUpdatedFileAst.SyntaxTree.GetLineSpan(change.AfterSpan.ChangeSpan);

                var prevStartLine    = prevCodeChunkLineSpan.StartLinePosition.Line;
                var prevEndLine      = prevCodeChunkLineSpan.EndLinePosition.Line;
                var updatedStartLine = updatedCodeChunkLineSpan.StartLinePosition.Line;
                var updatedEndLine   = updatedCodeChunkLineSpan.EndLinePosition.Line;

                // only consider changes of equal number of lines
                if (prevEndLine - prevStartLine != updatedEndLine - updatedStartLine)
                {
                    continue;
                }

                // TODO: remove trivial change

                // only consider SyntaxKind in allowedSytaxKinds
                var prevCodeChunkNodes = GetNodesByLineSpan(canonicalPrevFileAst, prevCodeFile, prevStartLine, prevEndLine);
                if (prevCodeChunkNodes.Any(node => !allowedSytaxKinds.Contains(node.Kind())))
                {
                    continue;
                }

                var updatedCodeChunkNodes = GetNodesByLineSpan(canonicalUpdatedFileAst, updatedCodeFile, updatedStartLine, updatedEndLine);
                if (updatedCodeChunkNodes.Any(node => !allowedSytaxKinds.Contains(node.Kind())))
                {
                    continue;
                }

                // Widen each chunk to whole lines: from the start of its first line to the end of its last line.
                var prevCodeChunkSpanStart    = prevCodeFile.Lines[prevStartLine].Span.Start;
                var prevCodeChunkSpanEnd      = prevCodeFile.Lines[prevEndLine].Span.End;
                var updatedCodeChunkSpanStart = updatedCodeFile.Lines[updatedStartLine].Span.Start;
                var updatedCodeChunkSpanEnd   = updatedCodeFile.Lines[updatedEndLine].Span.End;

                // IsNullOrWhiteSpace already rejects null and empty strings.
                var previousCodeChunkTokens = prevTokenIndex
                                              .GetTokensInSpan(prevCodeChunkSpanStart, prevCodeChunkSpanEnd)
                                              .Select(token => token.ValueText)
                                              .Where(token => !string.IsNullOrWhiteSpace(token))
                                              .ToArray();

                var updatedCodeChunkTokens = updatedTokenIndex
                                             .GetTokensInSpan(updatedCodeChunkSpanStart, updatedCodeChunkSpanEnd)
                                             .Select(token => token.ValueText)
                                             .Where(token => !string.IsNullOrWhiteSpace(token))
                                             .ToArray();

                // Skip empty, invalid or textually unchanged chunks.
                if (previousCodeChunkTokens.Length == 0 || updatedCodeChunkTokens.Length == 0 ||
                    !IsValidCodeChunkTokens(previousCodeChunkTokens) || !IsValidCodeChunkTokens(updatedCodeChunkTokens) ||
                    previousCodeChunkTokens.SequenceEqual(updatedCodeChunkTokens))
                {
                    continue;
                }

                var changeSha = entry["id"] + "_" + changeId;

                // Wrap the statements of each chunk in a synthetic block so they form a single tree.
                var prevCodeChunkBlockStmt    = SyntaxFactory.Block(prevCodeChunkNodes.Select(node => (StatementSyntax)node));
                var updatedCodeChunkBlockStmt = SyntaxFactory.Block(updatedCodeChunkNodes.Select(node => (StatementSyntax)node));

                IDictionary <string, string> zeroIndexedVariableNameMap;
                (prevCodeChunkBlockStmt, updatedCodeChunkBlockStmt, zeroIndexedVariableNameMap) =
                    zeroIndexVariableNames(prevCodeChunkBlockStmt, updatedCodeChunkBlockStmt);

                // Skip(1)/SkipLast(1) drop the first and last token of the synthetic block (its braces).
                var prevCodeChunkBlockStmtTokens      = prevCodeChunkBlockStmt.DescendantTokens().Skip(1).SkipLast(1).ToArray();
                var prevCodeChunkBlockStmtTokensIndex = new TokenIndex(prevCodeChunkBlockStmtTokens).InitInvertedIndex();

                var updatedCodeChunkBlockStmtTokens      = updatedCodeChunkBlockStmt.DescendantTokens().Skip(1).SkipLast(1).ToArray();
                var updatedCodeChunkBlockStmtTokensIndex = new TokenIndex(updatedCodeChunkBlockStmtTokens).InitInvertedIndex();

                var prevCodeBlockJObject    = jsonSyntaxTreeHelper.GetJObjectForSyntaxNode(prevCodeChunkBlockStmt, prevCodeChunkBlockStmtTokensIndex);
                var updatedCodeBlockJObject = jsonSyntaxTreeHelper.GetJObjectForSyntaxNode(updatedCodeChunkBlockStmt, updatedCodeChunkBlockStmtTokensIndex);

                // Both context spans come from change.BeforeSpan, i.e. they are positions in the
                // previous file, so both must be resolved against the previous file's token index.
                // (Resolving the succeeding span against the updated file's tokens selects the wrong
                // tokens as soon as the two files differ in length before that position.)
                var precedingContextTokens  = prevTokenIndex.GetTokensInSpan(change.BeforeSpan.SpanOfPrecedingContext);
                var succeedingContextTokens = prevTokenIndex.GetTokensInSpan(change.BeforeSpan.SpanOfSucceedingContext);

                // Apply the chunk's variable renaming to the context tokens as well.
                precedingContextTokens  = zeroIndexVariableNames(precedingContextTokens, zeroIndexedVariableNameMap);
                succeedingContextTokens = zeroIndexVariableNames(succeedingContextTokens, zeroIndexedVariableNameMap);

                var prevCodeChunkBlockStmtTextTokens =
                    prevCodeChunkBlockStmtTokens.Select(token => token.ValueText).ToArray();
                var updatedCodeChunkBlockStmtTextTokens =
                    updatedCodeChunkBlockStmtTokens.Select(token => token.ValueText).ToArray();

                var prevCodeTextChunk = Utils.ExtractCodeTextFromBraces(prevCodeChunkBlockStmt.GetText().ToString());
                prevCodeTextChunk = Utils.RemoveLeadingWhiteSpace(prevCodeTextChunk, naive: true);

                var updatedCodeTextChunk = Utils.ExtractCodeTextFromBraces(updatedCodeChunkBlockStmt.GetText().ToString());
                updatedCodeTextChunk = Utils.RemoveLeadingWhiteSpace(updatedCodeTextChunk, naive: true);

                var precedingContextTextTokens  = precedingContextTokens.Select(token => token.ValueText).ToArray();
                var succeedingContextTextTokens = succeedingContextTokens.Select(token => token.ValueText).ToArray();

                var result = new
                {
                    Id                     = changeSha,
                    PrevCodeChunk          = prevCodeTextChunk,
                    UpdatedCodeChunk       = updatedCodeTextChunk,
                    PrevCodeChunkTokens    = prevCodeChunkBlockStmtTextTokens,
                    UpdatedCodeChunkTokens = updatedCodeChunkBlockStmtTextTokens,
                    PrevCodeAST            = prevCodeBlockJObject,
                    UpdatedCodeAST         = updatedCodeBlockJObject,
                    PrecedingContext       = precedingContextTextTokens,
                    SucceedingContext      = succeedingContextTextTokens,
                    CommitMessage          = entry["message"]
                };

                changeId += 1;

                yield return(result);
            }
        }