/// <summary>
/// Return true if we've only seen parent-sibling-separator combo as a split list.
/// Return true if we've seen that combo as both list and split list AND
/// len of all siblings is closer to split median than to regular nonsplit median.
/// </summary>
/// <param name="ctx">Parent rule context of the sibling list.</param>
/// <param name="siblings">The list elements; the first sibling identifies the list kind.</param>
/// <param name="separator">Separator token between siblings; its type is part of the stats key.</param>
/// <returns>True if the list should be treated as oversize (split across lines).</returns>
public virtual bool isOversizeList <T1>(ParserRuleContext ctx, IList <T1> siblings, Token separator) where T1 : Antlr4.Runtime.ParserRuleContext
{
    ParserRuleContext first = siblings[0] as ParserRuleContext;
    ParentSiblingListKey pair = new ParentSiblingListKey(ctx, first, separator.Type);

    SiblingListStats stats = null;
    corpus.rootAndChildListStats.TryGetValue(pair, out stats);
    SiblingListStats splitStats = null;
    corpus.rootAndSplitChildListStats.TryGetValue(pair, out splitStats);

    // Seen only as a split list -> always oversize.
    bool oversize = stats == null && splitStats != null;
    // note: if we've seen it only as a regular (non-split) list, do nothing;
    // I used to have an oversize failsafe here.

    int len = Trainer.getSiblingsLength(siblings);
    if (stats != null && splitStats != null)
    {
        // Compare squared distance to the split median vs. the regular median,
        // like a one-dimensional Mahalanobis distance. The stddev divisor was
        // deliberately dropped: the distributions are usually very spread out
        // and overlapping, so the raw squared distance works better.
        double distToSplitSquared = Math.Pow(Math.Abs(splitStats.median - len), 2);
        double distToRegularSquared = Math.Pow(Math.Abs(stats.median - len), 2);

        // Consider a priori probabilities as well: weight each distance by the
        // probability of the OTHER class, so a high probability of "split"
        // shrinks the adjusted distance to the split median.
        float n = splitStats.numSamples + stats.numSamples;
        float probSplit = splitStats.numSamples / n;
        float probRegular = stats.numSamples / n;
        double adjDistToSplit = distToSplitSquared * (1 - probSplit);
        double adjDistToRegular = distToRegularSquared * (1 - probRegular);
        if (adjDistToSplit < adjDistToRegular)
        {
            oversize = true;
        }
    }
    return oversize;
}
/// <summary>
/// Compute per-(parent, sibling, separator) statistics — sample count, min,
/// median, variance, and max — from the raw list lengths collected for each key.
/// </summary>
/// <param name="map">Map from list key to all observed sibling-list lengths.</param>
/// <returns>Map from the same keys to summary statistics over those lengths.</returns>
public virtual IDictionary <ParentSiblingListKey, SiblingListStats> getListStats(IDictionary <ParentSiblingListKey, IList <int> > map)
{
    IDictionary<ParentSiblingListKey, SiblingListStats> listSizes = new Dictionary<ParentSiblingListKey, SiblingListStats>();
    // Enumerate entries directly rather than Keys + indexer (avoids a double lookup per key).
    foreach (KeyValuePair<ParentSiblingListKey, IList<int>> entry in map)
    {
        // Sort once so min/median/max can be read by index.
        // NOTE(review): assumes each length list is non-empty — an empty list
        // would throw here, same as the original code; confirm against callers.
        List<int> lens = entry.Value.OrderBy(i => i).ToList();
        int n = lens.Count;
        int min = lens[0];
        int median = lens[n / 2];
        int max = lens[n - 1];
        double variance = BuffUtils.variance(lens);
        listSizes[entry.Key] = new SiblingListStats(n, min, median, variance, max);
    }
    return listSizes;
}