public static void wipeCharPositionInfoAndWhitespaceTokens(CodeBuffTokenStream tokens)
{
	tokens.Fill();
	CommonToken dummy = new CommonToken(TokenConstants.InvalidType, "");
	dummy.Channel = TokenConstants.HiddenChannel;
	Token firstRealToken = tokens.getNextRealToken(-1);
	Regex ws = new Regex("^\\s+$");
	for (int i = 0; i < tokens.Size; i++)
	{
		if (i == firstRealToken.TokenIndex)
		{
			continue; // don't whack first token
		}
		CommonToken t = (CommonToken)tokens.Get(i);
		if (ws.IsMatch(t.Text))
		{
			tokens.GetTokens()[i] = dummy; // whack whitespace token so we can't use it during prediction
		}
		else
		{
			t.Line = 0;
			t.Column = -1;
		}
	}
}
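// Usage sketch (added for illustration, not part of the original source): after the
// wipe, every whitespace token position holds the shared hidden-channel dummy, and
// every real token except the first has lost its line/column info. Assumes
// Tool.tokenize() as defined below and a lexer that puts whitespace on the hidden channel.
public static void demoWipe(string doc, Type lexerClass)
{
	CodeBuffTokenStream tokens = Tool.tokenize(doc, lexerClass);
	wipeCharPositionInfoAndWhitespaceTokens(tokens);
	Token first = tokens.getNextRealToken(-1);
	for (int i = 0; i < tokens.Size; i++)
	{
		if (i == first.TokenIndex)
		{
			continue; // first token keeps its position
		}
		Token t = tokens.Get(i);
		bool wiped = t.Channel == TokenConstants.HiddenChannel || (t.Line == 0 && t.Column == -1);
		Console.WriteLine($"token {i}: wiped={wiped}"); // expect true for every token but the first
	}
}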
public static void Main(string[] args)
{
	ANTLRv4Lexer lexer = new ANTLRv4Lexer(new ANTLRFileStream("grammars/org/antlr/codebuff/ANTLRv4Lexer.g4"));
	CommonTokenStream tokens = new CodeBuffTokenStream(lexer);
	ANTLRv4Parser parser = new ANTLRv4Parser(tokens);
	ANTLRv4Parser.GrammarSpecContext tree = parser.grammarSpec();
	Console.WriteLine(tree.ToStringTree(parser));
}
/// <summary>
/// Format the document. Does not affect/alter doc.
/// </summary>
public virtual string format(InputDocument doc, bool collectAnalysis)
{
	if (testDoc != null)
	{
		throw new System.ArgumentException("can't call format > once");
	}
	// for debugging we need a map from original token with actual line:col to tree node. used by token analysis
	originalDoc = doc;
	originalTokenToNodeMap = Trainer.indexTree(doc.tree);
	originalTokens = doc.tokens;

	this.testDoc = InputDocument.dup(doc); // make copy of doc, getting new tokens, tree
	output = new StringBuilder();
	this.realTokens = Trainer.getRealTokens(testDoc.tokens);

	// squeeze out ws and kill any line/col info so we can't use ground truth by mistake
	wipeCharPositionInfoAndWhitespaceTokens(testDoc.tokens); // all except for first token

	wsClassifier = new kNNClassifier(corpus, wsFeatures, corpus.injectWhitespace);
	hposClassifier = new kNNClassifier(corpus, hposFeatures, corpus.hpos);

	analysis = new ArrayList<TokenPositionAnalysis>(testDoc.tokens.Size);
	for (int i = 0; i < testDoc.tokens.Size; ++i)
	{
		analysis.Add(null);
	}

	// make an index on the duplicated doc tree with tokens missing line:col info
	if (tokenToNodeMap == null)
	{
		tokenToNodeMap = Trainer.indexTree(testDoc.tree);
	}

	IToken firstToken = testDoc.tokens.getNextRealToken(-1);

	string prefix = originalTokens.GetText(Interval.Of(0, firstToken.TokenIndex)); // gets any comments in front + first real token
	charPosInLine = firstToken.Column + firstToken.Text.Length + 1; // start where first token left off
	line = Tool.count(prefix, '\n') + 1;
	output.Append(prefix);

	// first identify oversize lists with separators
	IdentifyOversizeLists splitter = new IdentifyOversizeLists(corpus, testDoc.tokens, tokenToNodeMap);
	ParseTreeWalker.Default.Walk(splitter, testDoc.tree);
	tokenToListInfo = splitter.tokenToListInfo;

	realTokens = Trainer.getRealTokens(testDoc.tokens);
	for (int i = Trainer.ANALYSIS_START_TOKEN_INDEX; i < realTokens.Count; i++)
	{
		// can't process first token
		int tokenIndexInStream = realTokens[i].TokenIndex;
		processToken(i, tokenIndexInStream, collectAnalysis);
	}
	releaseMemory();
	return output.ToString();
}
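// Driver sketch (hypothetical; only format() itself appears above, so the enclosing
// Formatter class name and constructor signature are assumptions): format() is
// one-shot per instance, hence a fresh formatter per document.
public static string formatOne(Corpus corpus, InputDocument doc)
{
	Formatter formatter = new Formatter(corpus);     // assumed constructor taking a trained corpus
	string formatted = formatter.format(doc, false); // false: skip collecting per-token analysis
	// a second format() call on the same instance throws ArgumentException ("can't call format > once")
	return formatted;
}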
public static CodeBuffTokenStream tokenize(string doc, Type lexerClass)
{
	ANTLRInputStream input = new ANTLRInputStream(doc);
	Lexer lexer = getLexer(lexerClass, input);
	CodeBuffTokenStream tokens = new CodeBuffTokenStream(lexer);
	tokens.Fill();
	return tokens;
}
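// Example (sketch): tokenize a snippet and walk its real (non-hidden) tokens using
// getNextRealToken(), the same helper used elsewhere above; it is assumed here to
// return the next on-channel token after the given token index.
public static void demoTokenize()
{
	CodeBuffTokenStream tokens = tokenize("grammar T; s : 'a' ;", typeof(ANTLRv4Lexer));
	for (Token t = tokens.getNextRealToken(-1);
	     t != null && t.Type != TokenConstants.EOF;
	     t = tokens.getNextRealToken(t.TokenIndex))
	{
		Console.WriteLine($"type {t.Type}: '{t.Text}'");
	}
}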
/// <summary>
/// Free anything we can to reduce memory footprint after a format().
/// Keep analysis and testDoc as they are used for results.
/// </summary>
public virtual void releaseMemory()
{
	corpus = null;
	realTokens = null;
	originalTokens = null;
	tokenToNodeMap = null;
	originalTokenToNodeMap = null;
	tokenToListInfo = null;
	wsClassifier = null;
	hposClassifier = null;
}
/// <summary>
/// Compute a document difference metric 0-1.0 between two documents that
/// are identical other than (likely) the whitespace and comments.
///
/// 1.0 means the docs are maximally different and 0 means docs are identical.
///
/// The Levenshtein distance between the docs counts only
/// whitespace diffs as the non-WS content is identical.
/// Levenshtein distance is bounded by 0..max(len(doc1),len(doc2)) so
/// we normalize the distance by dividing by max WS count.
///
/// TODO: can we simplify this to a simple walk with two
/// cursors through the original vs formatted counting
/// mismatched whitespace? The real text acts like anchors.
/// </summary>
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: public static double docDiff(String original, String formatted, Class lexerClass) throws Exception
public static double docDiff(string original, string formatted, Type lexerClass)
{
	// Grammar must strip all but real tokens and whitespace (and put that on hidden channel)
	CodeBuffTokenStream original_tokens = Tool.tokenize(original, lexerClass);
	// String s = original_tokens.getText();
	CodeBuffTokenStream formatted_tokens = Tool.tokenize(formatted, lexerClass);
	// String t = formatted_tokens.getText();

	// walk token streams and examine whitespace in between tokens
	int i = -1;
	int ws_distance = 0;
	int original_ws = 0;
	int formatted_ws = 0;
	while (true)
	{
		Token ot = original_tokens.LT(i); // TODO: FIX THIS! can't use LT()
		if (ot == null || ot.Type == TokenConstants.EOF)
		{
			break;
		}
		IList<Token> ows = original_tokens.GetHiddenTokensToLeft(ot.TokenIndex);
		original_ws += tokenText(ows).Length;

		Token ft = formatted_tokens.LT(i); // TODO: FIX THIS! can't use LT()
		if (ft == null || ft.Type == TokenConstants.EOF)
		{
			break;
		}
		IList<Token> fws = formatted_tokens.GetHiddenTokensToLeft(ft.TokenIndex);
		formatted_ws += tokenText(fws).Length;

		ws_distance += whitespaceEditDistance(tokenText(ows), tokenText(fws));
		i++;
	}
	// it's probably ok to ignore ws diffs after last real token

	int max_ws = Math.Max(original_ws, formatted_ws);
	double normalized_ws_distance = ((float)ws_distance) / max_ws;
	return normalized_ws_distance;
}
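// Worked example (sketch, assuming the token walk visits each real token once despite
// the LT() TODO above): compare "a  = 1" against "a = 1". The inter-token whitespace
// strings are ("", "  ", " ") vs ("", " ", " "), so original_ws = 3, formatted_ws = 2,
// and whitespaceEditDistance contributes 0 + 1 + 0 = 1. The result is 1/3, about 0.33.
public static void demoDocDiff(Type lexerClass)
{
	double d = docDiff("a  = 1", "a = 1", lexerClass);
	Console.WriteLine($"normalized whitespace distance = {d:F2}"); // expect ~0.33
}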
// reuse object so the maps above fill from multiple files during training
public virtual void setTokens(CodeBuffTokenStream tokens, ParserRuleContext root, IDictionary<Token, TerminalNode> tokenToNodeMap)
{
	this.tokens = tokens;
	this.tokenToNodeMap = tokenToNodeMap;
}
/// <summary>
/// Return a map from the various tokens related to this list to their list-membership role.
/// </summary>
public static IDictionary<Token, org.antlr.codebuff.misc.Pair<bool, int>> getInfoAboutListTokens<T1>(ParserRuleContext ctx, CodeBuffTokenStream tokens, IDictionary<Token, TerminalNode> tokenToNodeMap, IList<T1> siblings, bool isOversizeList) where T1 : Antlr4.Runtime.ParserRuleContext
{
	IDictionary<Token, org.antlr.codebuff.misc.Pair<bool, int>> tokenToListInfo = new Dictionary<Token, org.antlr.codebuff.misc.Pair<bool, int>>();
	ParserRuleContext first = siblings[0] as ParserRuleContext;
	ParserRuleContext last = siblings[siblings.Count - 1] as ParserRuleContext;

	Token prefixToken = tokens.getPreviousRealToken(first.Start.TokenIndex); // e.g., '(' in an arg list or ':' in grammar def
	Token suffixToken = tokens.getNextRealToken(last.Stop.TokenIndex);       // e.g., LT(1) is last token of list; LT(2) is ')' in an arg list or ';' in grammar def

	TerminalNode prefixNode = tokenToNodeMap[prefixToken];
	TerminalNode suffixNode = tokenToNodeMap[suffixToken];
	bool hasSurroundingTokens = prefixNode != null && prefixNode.Parent == suffixNode.Parent;
	if (hasSurroundingTokens)
	{
		tokenToListInfo[prefixToken] = new org.antlr.codebuff.misc.Pair<bool, int>(isOversizeList, Trainer.LIST_PREFIX);
		tokenToListInfo[suffixToken] = new org.antlr.codebuff.misc.Pair<bool, int>(isOversizeList, Trainer.LIST_SUFFIX);
	}

	IList<Tree> separators = getSeparators(ctx, siblings);
	Tree firstSep = separators[0];
	tokenToListInfo[(Token)firstSep.Payload] = new org.antlr.codebuff.misc.Pair<bool, int>(isOversizeList, Trainer.LIST_FIRST_SEPARATOR);
	foreach (Tree s in separators.Where((e, i) => i > 0))
	{
		tokenToListInfo[(Token)s.Payload] = new org.antlr.codebuff.misc.Pair<bool, int>(isOversizeList, Trainer.LIST_SEPARATOR);
	}

	// handle sibling members
	tokenToListInfo[first.Start] = new org.antlr.codebuff.misc.Pair<bool, int>(isOversizeList, Trainer.LIST_FIRST_ELEMENT);
	foreach (T1 ss in siblings.Where((e, i) => i > 0))
	{
		var s = ss as ParserRuleContext;
		tokenToListInfo[s.Start] = new org.antlr.codebuff.misc.Pair<bool, int>(isOversizeList, Trainer.LIST_MEMBER);
	}

	return tokenToListInfo;
}
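// Illustration (sketch): for an arg list like f(a, b, c) with siblings = [a, b, c],
// the returned map classifies tokens roughly as
//   '(' -> LIST_PREFIX, ')' -> LIST_SUFFIX       (only when both map to the same parent node),
//   first ',' -> LIST_FIRST_SEPARATOR, second ',' -> LIST_SEPARATOR,
//   a.Start -> LIST_FIRST_ELEMENT, b.Start and c.Start -> LIST_MEMBER.
// The dump below assumes Pair exposes its two components as fields a and b.
public static void dumpListInfo(IDictionary<Token, org.antlr.codebuff.misc.Pair<bool, int>> info)
{
	foreach (var e in info)
	{
		Console.WriteLine($"'{e.Key.Text}' -> (oversize={e.Value.a}, role={e.Value.b})");
	}
}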
public IdentifyOversizeLists(Corpus corpus, CodeBuffTokenStream tokens, IDictionary<Token, TerminalNode> tokenToNodeMap)
{
	this.corpus = corpus;
	this.tokens = tokens;
	this.tokenToNodeMap = tokenToNodeMap;
}