/// <summary>
/// Look into the originalTokens stream to get the comments to the left of the
/// current token. Emit all whitespace and comments except for whitespace at the
/// end, as that is injected per newline prediction.
///
/// The original input stream is consulted for comment purposes only. With all
/// whitespace removed, this text could not be emitted properly otherwise. This
/// is the only place that examines the original token stream during formatting.
/// </summary>
/// <param name="tokenIndexInStream">Index of the token whose hidden tokens to the left are examined.</param>
/// <param name="injectNL_WS">The predicted newline/whitespace injection category.</param>
/// <returns>
/// <paramref name="injectNL_WS"/> unchanged, unless the last non-whitespace hidden token
/// is a single-line comment and the low byte of the prediction is not
/// Trainer.CAT_INJECT_NL; in that case Trainer.nlcat(1) is returned so that a
/// newline follows the comment.
/// </returns>
public virtual int emitCommentsToTheLeft(int tokenIndexInStream, int injectNL_WS)
{
    IList<Token> hiddenTokensToLeft = originalTokens.GetHiddenTokensToLeft(tokenIndexInStream);
    if (hiddenTokensToLeft == null)
    {
        return(injectNL_WS);
    }

    // if at least one is not whitespace, assume it's a comment and print all hidden stuff including whitespace
    if (!Trainer.hasCommentToken(hiddenTokensToLeft))
    {
        return(injectNL_WS);
    }

    // Build the patterns once instead of once per loop iteration.
    Regex allWhitespace = new Regex("^\\s+$");
    // KED SUSPECT THIS MAY BE WRONG FOR WINDOWS (\n\r).
    Regex allNewlines = new Regex("^\\n+$");

    // avoid whitespace at end of sequence as we'll inject that:
    // scan backwards for the last token that is not pure whitespace
    int last = -1;
    for (int i = hiddenTokensToLeft.Count - 1; i >= 0; i--)
    {
        if (!allWhitespace.IsMatch(hiddenTokensToLeft[i].Text))
        {
            last = i;
            break;
        }
    }
    if (last < 0)
    {
        // Nothing but whitespace was found, so there is nothing to emit here.
        // Indexing with -1 below would throw.
        return(injectNL_WS);
    }

    Token commentToken = hiddenTokensToLeft[last];
    foreach (Token hidden in hiddenTokensToLeft.Take(last + 1))
    {
        string hiddenText = hidden.Text;
        output.Append(hiddenText);
        if (allNewlines.IsMatch(hiddenText))
        {
            line += Tool.count(hiddenText, '\n');
            charPosInLine = 0;
        }
        else
        {
            // if a comment or plain ' ', must count char position
            charPosInLine += hiddenText.Length;
        }
    }

    // failsafe. make sure single-line comments have \n on the end.
    // If not predicted, must override and inject one
    if (commentToken.Type == corpus.language.singleLineCommentType &&
        (injectNL_WS & 0xFF) != Trainer.CAT_INJECT_NL)
    {
        return(Trainer.nlcat(1)); // force formatter to predict newline then trigger alignment
    }

    return(injectNL_WS); // send same thing back out unless we trigger failsafe
}
/// <summary>
/// Compute a document difference metric 0-1.0 between two documents that
/// are identical other than (likely) the whitespace and comments.
///
/// 1.0 means the docs are maximally different and 0 means docs are identical.
///
/// The Levenshtein distance between the docs counts only
/// whitespace diffs as the non-WS content is identical.
/// Levenshtein distance is bounded by 0..max(len(doc1),len(doc2)) so
/// we normalize the distance by dividing by max WS count.
///
/// TODO: can we simplify this to a simple walk with two
/// cursors through the original vs formatted counting
/// mismatched whitespace? real text are like anchors.
/// </summary>
/// <param name="original">Text of the original document.</param>
/// <param name="formatted">Text of the formatted document.</param>
/// <param name="lexerClass">Lexer type handed to Tool.tokenize for both documents.</param>
/// <returns>
/// The accumulated whitespace edit distance divided by the larger of the two
/// accumulated whitespace lengths, or 0.0 when no whitespace was seen at all.
/// </returns>
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: public static double docDiff(String original, String formatted, Class lexerClass) throws Exception
public static double docDiff(string original, string formatted, Type lexerClass)
{
    // Grammar must strip all but real tokens and whitespace (and put that on hidden channel)
    CodeBuffTokenStream original_tokens = Tool.tokenize(original, lexerClass);
    CodeBuffTokenStream formatted_tokens = Tool.tokenize(formatted, lexerClass);

    // walk token streams and examine whitespace in between tokens
    int i = -1;
    int ws_distance = 0;
    int original_ws = 0;
    int formatted_ws = 0;
    while (true)
    {
        Token ot = original_tokens.LT(i); // TODO: FIX THIS! can't use LT()
        if (ot == null || ot.Type == TokenConstants.EOF)
        {
            break;
        }
        // Text of the hidden tokens preceding the original token; computed once and reused below.
        string owsText = tokenText(original_tokens.GetHiddenTokensToLeft(ot.TokenIndex));
        original_ws += owsText.Length;

        Token ft = formatted_tokens.LT(i); // TODO: FIX THIS! can't use LT()
        if (ft == null || ft.Type == TokenConstants.EOF)
        {
            break;
        }
        string fwsText = tokenText(formatted_tokens.GetHiddenTokensToLeft(ft.TokenIndex));
        formatted_ws += fwsText.Length;

        ws_distance += whitespaceEditDistance(owsText, fwsText);
        i++;
    }
    // it's probably ok to ignore ws diffs after last real token

    int max_ws = Math.Max(original_ws, formatted_ws);
    if (max_ws == 0)
    {
        // No whitespace was counted on either side, so there is nothing to
        // differ on. Dividing 0 by 0 in floating point would yield NaN,
        // which is outside the documented 0-1.0 range.
        return(0.0);
    }

    double normalized_ws_distance = ((float)ws_distance) / max_ws;
    return(normalized_ws_distance);
}
/// <summary>
/// Records statistics about a separator-joined list of sibling nodes:
/// whether a newline appears in the first hidden token before the list,
/// before the separator, after the separator, and after the last element;
/// the value of Trainer.getSiblingsLength for the list; and, for split lists,
/// the form returned by Trainer.listform. Finally merges the per-token list
/// info into tokenToListInfo without replacing entries that already exist.
/// </summary>
/// <param name="ctx">Parent context of the sibling list.</param>
/// <param name="siblings">The sibling nodes; the first and last are inspected as ParserRuleContext.</param>
/// <param name="separator">The separator token between siblings.</param>
public override void visitNonSingletonWithSeparator<T1>(ParserRuleContext ctx, IList<T1> siblings, IToken separator)
{
    ParserRuleContext firstSibling = siblings[0] as Antlr4.Runtime.ParserRuleContext;
    ParserRuleContext lastSibling = siblings[siblings.Count - 1] as Antlr4.Runtime.ParserRuleContext;

    IList<Token> hiddenBeforeList = tokens.GetHiddenTokensToLeft(firstSibling.Start.TokenIndex);
    IList<Token> hiddenBeforeSep = tokens.GetHiddenTokensToLeft(separator.TokenIndex);
    IList<Token> hiddenAfterSep = tokens.GetHiddenTokensToRight(separator.TokenIndex);
    IList<Token> hiddenAfterList = tokens.GetHiddenTokensToRight(lastSibling.Stop.TokenIndex);

    Token leadingHidden = null;
    if (hiddenBeforeList != null)
    {
        leadingHidden = hiddenBeforeList[0];
    }
    Token trailingHidden = null;
    if (hiddenAfterList != null)
    {
        trailingHidden = hiddenAfterList[0];
    }

    // '\n' (before list, before sep, after sep, after last element)
    // KED. naked new lines is not platform independent!
    // STOP using naked new lines!
    int[] ws = new int[4];
    ws[0] = (leadingHidden != null && Tool.count(leadingHidden.Text, '\n') > 0) ? '\n' : 0;
    ws[1] = (hiddenBeforeSep != null && Tool.count(hiddenBeforeSep[0].Text, '\n') > 0) ? '\n' : 0;
    ws[2] = (hiddenAfterSep != null && Tool.count(hiddenAfterSep[0].Text, '\n') > 0) ? '\n' : 0;
    ws[3] = (trailingHidden != null && Tool.count(trailingHidden.Text, '\n') > 0) ? '\n' : 0;

    // A list counts as split when a newline sits on either side of the separator.
    bool isSplitList = !(ws[1] != '\n' && ws[2] != '\n');

    // now track length of parent:alt,child:alt list or split-list
    ParentSiblingListKey key = new ParentSiblingListKey(ctx, firstSibling, separator.Type);
    IDictionary<ParentSiblingListKey, IList<int> > lengthTable;
    if (isSplitList)
    {
        lengthTable = splitListInfo;
    }
    else
    {
        lengthTable = listInfo;
    }

    IList<int> lengths;
    if (!lengthTable.TryGetValue(key, out lengths) || lengths == null)
    {
        lengths = new List<int>();
        lengthTable[key] = lengths;
    }
    lengths.Add(Trainer.getSiblingsLength(siblings));

    // track the form split lists take for debugging
    if (isSplitList)
    {
        int form = Trainer.listform(ws);
        IList<int> recordedForms;
        if (!splitListForm.TryGetValue(key, out recordedForms) || recordedForms == null)
        {
            recordedForms = new List<int>();
            splitListForm[key] = recordedForms;
        }
        recordedForms.Add(form); // track where we put newlines for this list
    }

    IDictionary<Token, org.antlr.codebuff.misc.Pair<bool, int> > tokenInfo =
        getInfoAboutListTokens(ctx, tokens, tokenToNodeMap, siblings, isSplitList);

    // copy sibling list info for associated tokens into overall list
    // but don't overwrite existing so that most general (largest construct)
    // list information is used/retained (i.e., not overwritten).
    foreach (KeyValuePair<Token, org.antlr.codebuff.misc.Pair<bool, int> > entry in tokenInfo)
    {
        if (tokenToListInfo.ContainsKey(entry.Key))
        {
            continue;
        }
        tokenToListInfo[entry.Key] = entry.Value;
    }
}