/// <summary>
        /// Decides whether a sibling list should be treated as an oversize (split) list.
        /// <para>
        /// Returns true if the parent/first-sibling/separator combination has only been
        /// recorded as a split list. If it has been recorded both as a regular list and as
        /// a split list, returns true when the total sibling length is closer to the split
        /// median than to the regular median, after weighting each squared distance by
        /// (1 - prior probability) of its class. Returns false in all other cases, including
        /// when the combination has never been recorded as a split list.
        /// </para>
        /// </summary>
        /// <param name="ctx">Parent context of the sibling list.</param>
        /// <param name="siblings">The sibling contexts; element 0 is read, so the list must not be empty.</param>
        /// <param name="separator">Separator token; only its <c>Type</c> is used to build the lookup key.</param>
        /// <returns>True if the list is judged to be oversize; otherwise false.</returns>
        public virtual bool isOversizeList <T1>(ParserRuleContext ctx, IList <T1> siblings, Token separator) where T1 : Antlr4.Runtime.ParserRuleContext
        {
            ParserRuleContext    first = siblings[0] as ParserRuleContext;
            ParentSiblingListKey pair  = new ParentSiblingListKey(ctx, first, separator.Type);

            SiblingListStats stats;
            SiblingListStats splitStats;

            corpus.rootAndChildListStats.TryGetValue(pair, out stats);
            corpus.rootAndSplitChildListStats.TryGetValue(pair, out splitStats);

            if (splitStats == null)
            {
                // Never seen a split version of this combination: do nothing
                // (there used to be an oversize failsafe here).
                return(false);
            }
            if (stats == null)
            {
                // Only ever seen as a split list.
                return(true);
            }

            // Seen both ways: compare the squared distance from the list length to each median.
            // The variance is deliberately not used as a divisor; the two distributions
            // are usually very spread out and overlapping.
            // NOTE(review): assumes Trainer.getSiblingsLength has no side effects, since it is
            // now only called when both kinds of stats exist - confirm.
            int len = Trainer.getSiblingsLength(siblings);

            double distToSplitSquared   = Math.Pow(Math.Abs(splitStats.median - len), 2);
            double distToRegularSquared = Math.Pow(Math.Abs(stats.median - len), 2);

            // Consider a priori probabilities as well: the more often a class was seen,
            // the smaller its adjusted distance becomes.
            float  n                = splitStats.numSamples + stats.numSamples;
            float  probSplit        = splitStats.numSamples / n;
            float  probRegular      = stats.numSamples / n;
            double adjDistToSplit   = distToSplitSquared * (1 - probSplit);
            double adjDistToRegular = distToRegularSquared * (1 - probRegular);

            return(adjDistToSplit < adjDistToRegular);
        }
        // Example #2
        /// <summary>
        /// Records statistics for a list of siblings joined by <paramref name="separator"/>:
        /// whether the list is split across lines, its total length (stored in
        /// <c>splitListInfo</c> or <c>listInfo</c>), the newline form of split lists
        /// (stored in <c>splitListForm</c>), and per-token list info (merged into
        /// <c>tokenToListInfo</c> without overwriting existing entries).
        /// </summary>
        /// <param name="ctx">Parent context of the sibling list.</param>
        /// <param name="siblings">The sibling contexts; the first and last elements are read, so the list must not be empty.</param>
        /// <param name="separator">The separator token between siblings.</param>
        public override void visitNonSingletonWithSeparator <T1>(ParserRuleContext ctx, IList <T1> siblings, IToken separator)
        {
            ParserRuleContext first = siblings[0] as Antlr4.Runtime.ParserRuleContext;
            ParserRuleContext last  = siblings[siblings.Count - 1] as Antlr4.Runtime.ParserRuleContext;

            IList <Token> hiddenToLeft       = tokens.GetHiddenTokensToLeft(first.Start.TokenIndex);
            IList <Token> hiddenToLeftOfSep  = tokens.GetHiddenTokensToLeft(separator.TokenIndex);
            IList <Token> hiddenToRightOfSep = tokens.GetHiddenTokensToRight(separator.TokenIndex);
            IList <Token> hiddenToRight      = tokens.GetHiddenTokensToRight(last.Stop.TokenIndex);

            // Slots hold '\n' when a newline was found:
            // (before list, before sep, after sep, after last element).
            // NOTE (from the C# port): only a literal '\n' is looked for; this line-ending
            // handling is not platform independent.
            IList <Token>[] hiddenBySlot = { hiddenToLeft, hiddenToLeftOfSep, hiddenToRightOfSep, hiddenToRight };
            int[]           ws           = new int[4];
            for (int slot = 0; slot < ws.Length; slot++)
            {
                if (firstHiddenTokenHasNewline(hiddenBySlot[slot]))
                {
                    ws[slot] = '\n';
                }
            }
            bool isSplitList = ws[1] == '\n' || ws[2] == '\n';

            // now track length of parent:alt,child:alt list or split-list
            ParentSiblingListKey pair = new ParentSiblingListKey(ctx, first, separator.Type);
            IDictionary <ParentSiblingListKey, IList <int> > info = isSplitList ? splitListInfo : listInfo;
            IList <int> lens;

            if (!info.TryGetValue(pair, out lens) || lens == null)
            {
                lens       = new List <int>();
                info[pair] = lens;
            }
            lens.Add(Trainer.getSiblingsLength(siblings));

            // track the form split lists take for debugging
            if (isSplitList)
            {
                IList <int> forms;
                if (!splitListForm.TryGetValue(pair, out forms) || forms == null)
                {
                    forms = new List <int>();
                    splitListForm[pair] = forms;
                }
                forms.Add(Trainer.listform(ws));                 // track where we put newlines for this list
            }

            IDictionary <Token, org.antlr.codebuff.misc.Pair <bool, int> > tokenInfo = getInfoAboutListTokens(ctx, tokens, tokenToNodeMap, siblings, isSplitList);

            // copy sibling list info for associated tokens into overall list
            // but don't overwrite existing so that most general (largest construct)
            // list information is used/retained (i.e., not overwritten).
            foreach (KeyValuePair <Token, org.antlr.codebuff.misc.Pair <bool, int> > entry in tokenInfo)
            {
                if (!tokenToListInfo.ContainsKey(entry.Key))
                {
                    tokenToListInfo[entry.Key] = entry.Value;
                }
            }
        }

        /// <summary>
        /// Returns true when <paramref name="hidden"/> has a non-null first token whose text
        /// contains at least one '\n'. A null or empty list yields false instead of throwing.
        /// </summary>
        private static bool firstHiddenTokenHasNewline(IList <Token> hidden)
        {
            if (hidden == null || hidden.Count == 0 || hidden[0] == null)
            {
                return(false);
            }
            return(Tool.count(hidden[0].Text, '\n') > 0);
        }