/// <summary>
/// Decide whether a sibling list should be treated as "oversize", based on the
/// statistics the corpus holds for the parent / first-sibling / separator-type combo.
/// <list type="bullet">
/// <item>Combo seen only as a split list: returns true.</item>
/// <item>Combo seen only as a regular list, or never seen: returns false.</item>
/// <item>Combo seen both ways: returns true when the siblings' length is closer to
/// the split-list median than to the regular-list median, after each squared
/// distance is weighted by (1 - observed frequency of that form).</item>
/// </list>
/// </summary>
/// <param name="ctx">Parent context of the sibling list; used to build the lookup key.</param>
/// <param name="siblings">The sibling list; its first element is used to build the lookup key.</param>
/// <param name="separator">Separator token; only its <c>Type</c> is used.</param>
/// <returns>True if the list is considered oversize, false otherwise.</returns>
public virtual bool isOversizeList<T1>(ParserRuleContext ctx, IList<T1> siblings, Token separator) where T1 : Antlr4.Runtime.ParserRuleContext
{
    ParserRuleContext first = siblings[0] as ParserRuleContext;
    ParentSiblingListKey pair = new ParentSiblingListKey(ctx, first, separator.Type);

    SiblingListStats stats = null;
    corpus.rootAndChildListStats.TryGetValue(pair, out stats);
    SiblingListStats splitStats = null;
    corpus.rootAndSplitChildListStats.TryGetValue(pair, out splitStats);

    if (splitStats == null)
    {
        // Never seen a split version of this combo: do nothing.
        // (There used to be an oversize failsafe here.)
        return false;
    }
    if (stats == null)
    {
        // Only ever seen as a split list.
        return true;
    }

    // Seen both ways. The length is only needed on this path.
    // NOTE(review): assumes Trainer.getSiblingsLength has no side effects - confirm.
    int len = Trainer.getSiblingsLength(siblings);

    // Squared distance from each median. An earlier version divided by the standard
    // deviation (a one-dimensional Mahalanobis-style distance); that divisor was taken
    // out because the distributions are usually very spread out and overlapping.
    double distToSplit = Math.Abs(splitStats.median - len);
    double distToSplitSquared = Math.Pow(distToSplit, 2);
    double distToRegular = Math.Abs(stats.median - len);
    double distToRegularSquared = Math.Pow(distToRegular, 2);

    // Consider a priori probabilities as well: the more often a form was observed,
    // the smaller its adjusted distance becomes.
    float n = splitStats.numSamples + stats.numSamples;
    float probSplit = splitStats.numSamples / n;
    float probRegular = stats.numSamples / n;
    double adjDistToSplit = distToSplitSquared * (1 - probSplit);
    double adjDistToRegular = distToRegularSquared * (1 - probRegular);

    return adjDistToSplit < adjDistToRegular;
}
/// <summary>
/// Collect statistics for a sibling list whose elements are separated by
/// <paramref name="separator"/>.
/// <para>
/// The list counts as a "split list" when the first hidden token immediately before
/// or immediately after the separator contains a newline. The siblings' length is
/// appended to <c>splitListInfo</c> or <c>listInfo</c> accordingly, the newline form
/// of split lists is appended to <c>splitListForm</c>, and per-token list info is
/// merged into <c>tokenToListInfo</c> without overwriting existing entries.
/// </para>
/// </summary>
/// <param name="ctx">Parent context of the sibling list.</param>
/// <param name="siblings">The sibling list; its first and last elements locate the surrounding hidden tokens.</param>
/// <param name="separator">The separator token whose neighbouring hidden tokens are inspected.</param>
public override void visitNonSingletonWithSeparator<T1>(ParserRuleContext ctx, IList<T1> siblings, IToken separator)
{
    ParserRuleContext first = siblings[0] as Antlr4.Runtime.ParserRuleContext;
    ParserRuleContext last = siblings[siblings.Count - 1] as Antlr4.Runtime.ParserRuleContext;

    IList<Token> hiddenToLeft = tokens.GetHiddenTokensToLeft(first.Start.TokenIndex);
    IList<Token> hiddenToLeftOfSep = tokens.GetHiddenTokensToLeft(separator.TokenIndex);
    IList<Token> hiddenToRightOfSep = tokens.GetHiddenTokensToRight(separator.TokenIndex);
    IList<Token> hiddenToRight = tokens.GetHiddenTokensToRight(last.Stop.TokenIndex);

    // ws[i] is '\n' when a newline was found at position i, otherwise 0:
    //   0 = before the list, 1 = before the separator,
    //   2 = after the separator, 3 = after the last element.
    // NOTE(review): an earlier comment here warned that a bare '\n' is not platform
    // independent. The value is kept as-is because Trainer.listform(ws) consumes it.
    int[] ws = new int[4];
    if (firstHiddenTokenHasNewline(hiddenToLeft))
    {
        ws[0] = '\n';
    }
    if (firstHiddenTokenHasNewline(hiddenToLeftOfSep))
    {
        ws[1] = '\n';
    }
    if (firstHiddenTokenHasNewline(hiddenToRightOfSep))
    {
        ws[2] = '\n';
    }
    if (firstHiddenTokenHasNewline(hiddenToRight))
    {
        ws[3] = '\n';
    }

    bool isSplitList = ws[1] == '\n' || ws[2] == '\n';

    // Now track length of parent:alt,child:alt list or split-list.
    ParentSiblingListKey pair = new ParentSiblingListKey(ctx, first, separator.Type);
    IDictionary<ParentSiblingListKey, IList<int>> info = isSplitList ? splitListInfo : listInfo;
    IList<int> lens;
    if (!info.TryGetValue(pair, out lens) || lens == null)
    {
        lens = new List<int>();
        info[pair] = lens;
    }
    lens.Add(Trainer.getSiblingsLength(siblings));

    // Track the form split lists take, for debugging.
    if (isSplitList)
    {
        int form = Trainer.listform(ws);
        IList<int> forms;
        if (!splitListForm.TryGetValue(pair, out forms) || forms == null)
        {
            forms = new List<int>();
            splitListForm[pair] = forms;
        }
        forms.Add(form); // where we put newlines for this list
    }

    IDictionary<Token, org.antlr.codebuff.misc.Pair<bool, int>> tokenInfo =
        getInfoAboutListTokens(ctx, tokens, tokenToNodeMap, siblings, isSplitList);

    // Copy sibling list info for the associated tokens into the overall map, but do
    // not overwrite existing entries, so that the most general (largest construct)
    // list information is retained.
    foreach (KeyValuePair<Token, org.antlr.codebuff.misc.Pair<bool, int>> entry in tokenInfo)
    {
        if (!tokenToListInfo.ContainsKey(entry.Key))
        {
            tokenToListInfo[entry.Key] = entry.Value;
        }
    }
}

// True when the list is non-null and non-empty and its first token's text contains a newline.
private static bool firstHiddenTokenHasNewline(IList<Token> hidden)
{
    return hidden != null
        && hidden.Count > 0
        && hidden[0] != null
        && Tool.count(hidden[0].Text, '\n') > 0;
}