/// <summary>
        /// Decide whether a sibling list should be treated as an oversize (split) list,
        /// based on the corpus statistics for its parent-sibling-separator combo.
        /// <list type="bullet">
        /// <item>Combo never seen as a split list (or never seen at all): returns false.</item>
        /// <item>Combo seen only as a split list: returns true.</item>
        /// <item>Combo seen both ways: returns true when the squared distance from the
        /// siblings' length to the split median, scaled by (1 - prior probability of split),
        /// is strictly smaller than the same quantity computed for the regular list.</item>
        /// </list>
        /// </summary>
        /// <param name="ctx">Parent context; part of the corpus lookup key.</param>
        /// <param name="siblings">Sibling list; its first element is part of the lookup key,
        /// so the list must not be empty.</param>
        /// <param name="separator">Separator token; only its <c>Type</c> is used for the key.</param>
        /// <returns>True if the list is classified as oversize, false otherwise.</returns>
        public virtual bool isOversizeList <T1>(ParserRuleContext ctx, IList <T1> siblings, Token separator) where T1 : Antlr4.Runtime.ParserRuleContext
        {
            // T1 is constrained to ParserRuleContext, so no cast is needed here.
            ParserRuleContext    first = siblings[0];
            ParentSiblingListKey pair  = new ParentSiblingListKey(ctx, first, separator.Type);

            SiblingListStats stats;
            SiblingListStats splitStats;

            corpus.rootAndChildListStats.TryGetValue(pair, out stats);
            corpus.rootAndSplitChildListStats.TryGetValue(pair, out splitStats);

            if (splitStats == null)
            {
                // Never seen a split version of this combo: do nothing
                // (there used to be an oversize failsafe here).
                return(false);
            }
            if (stats == null)
            {
                // Only ever seen as a split list.
                return(true);
            }

            // Seen both ways: compare squared distances from the observed length to each median.
            // The distances are intentionally NOT divided by the standard deviation; the two
            // distributions are usually very spread out and overlapping.
            int len = Trainer.getSiblingsLength(siblings);

            double distToSplitSquared   = Math.Pow(Math.Abs(splitStats.median - len), 2);
            double distToRegularSquared = Math.Pow(Math.Abs(stats.median - len), 2);

            // Consider a priori probabilities as well: the more often a form was seen,
            // the smaller its adjusted distance becomes.
            float  n                = splitStats.numSamples + stats.numSamples;
            float  probSplit        = splitStats.numSamples / n;
            float  probRegular      = stats.numSamples / n;
            double adjDistToSplit   = distToSplitSquared * (1 - probSplit);
            double adjDistToRegular = distToRegularSquared * (1 - probRegular);

            return(adjDistToSplit < adjDistToRegular);
        }
        // Example #2
        // 0
        /// <summary>
        /// Summarize, per key, a list of observed lengths into a <c>SiblingListStats</c>.
        /// For each key the lengths are sorted ascending and the stats are built from the
        /// sample count, the minimum, the element at index <c>count / 2</c> of the sorted
        /// list (used as the median; the upper middle element for even counts), the variance
        /// reported by <c>BuffUtils.variance</c>, and the maximum.
        /// </summary>
        /// <param name="map">Observed lengths keyed by parent-sibling-separator combo.</param>
        /// <returns>
        /// A new dictionary with one entry per key that has at least one sample. Keys whose
        /// list is null or empty are skipped, since no min/median/max can be derived for them.
        /// </returns>
        public virtual IDictionary <ParentSiblingListKey, SiblingListStats> getListStats(IDictionary <ParentSiblingListKey, IList <int> > map)
        {
            IDictionary <ParentSiblingListKey, SiblingListStats> listSizes = new Dictionary <ParentSiblingListKey, SiblingListStats>();

            // Enumerate key/value pairs directly instead of looking each key up a second time.
            foreach (KeyValuePair <ParentSiblingListKey, IList <int> > entry in map)
            {
                IList <int> lens = entry.Value;
                if (lens == null || lens.Count == 0)
                {
                    // No samples for this key: indexing [0] below would throw.
                    continue;
                }

                // Sort a copy; the caller's list is left untouched.
                IList <int> sorted = lens.OrderBy(i => i).ToList();

                int    n        = sorted.Count;
                int    min      = sorted[0];
                int    median   = sorted[n / 2];
                int    max      = sorted[n - 1];
                double variance = BuffUtils.variance(sorted);

                listSizes[entry.Key] = new SiblingListStats(n, min, median, variance, max);
            }
            return(listSizes);
        }