// Given min, max, construct histgram for discrete or continous domains
//
// Fills hist.buckets_ with evenly spaced lower-bound values covering [min, max],
// sets nbuckets_ and an estimated per-bucket depth_ derived from nrows.
// min/max are dynamic: arithmetic (+, -, *, /) is resolved by the runtime binder
// per actual type (int, long, decimal, DateTime/TimeSpan, ...), so the same code
// serves all supported domains. T is the bucket element type used for the final cast.
static void constructDomain <T>(bool isDiscrete, Historgram hist, dynamic min, dynamic max, ulong nrows)
{
    int nbuckets = NBuckets_;
    dynamic width;
    if (isDiscrete)
    {
        // given [0, 2], we have 3 integers between thus 3 buckets are enough
        var nvals = max - min + 1;
        if (nvals < NBuckets_)
        {
            // fewer distinct values than buckets: one bucket per value
            nbuckets = nvals;
        }
        // decimal width keeps exact spacing even when nvals is not a multiple of nbuckets
        width = ((decimal)nvals) / nbuckets;
        Debug.Assert((decimal)width >= 1);
    }
    else
    {
        // use default number of buckets for continous fields
        // divide by (nbuckets - 1) so the last bucket's lower bound lands on max
        var diff = max - min;
        width = diff / (nbuckets - 1);
    }

    // in case nrows is small, say (min, max, nrows) = (1000, 20000, 3), depth_ will be
    // small and this shall not affect histgram's correctness.
    //
    Debug.Assert(nbuckets > 0 && nbuckets <= NBuckets_);
    for (int i = 0; i < nbuckets; i++)
    {
        // bucket i's lower bound = min + i * width, narrowed back to T
        hist.buckets_[i] = (T)(min + (width * i));
    }
    hist.nbuckets_ = nbuckets;
    // average rows per bucket; may be fractional for small nrows
    hist.depth_ = ((double)nrows) / hist.nbuckets_;
}
// Compute per-column statistics from the sampled rows, reading the value at
// column @index of each row: number of distinct values, either an exact MCV
// (most-common-value) list or an equal-depth histogram, and the null fraction.
public void ComputeStats(int index, List <Row> samples)
{
    int nNulls = 0;
    List <Value> values = new List <Value>();
    foreach (var r in samples)
    {
        Value val = r[index];
        if (val is null)
        {
            nNulls++;
            // fix: nulls shall be excluded from distinct/MCV/histogram computation,
            // otherwise they pollute Distinct() and the histogram sort below
            continue;
        }
        values.Add(val);
    }

    n_distinct_ = values.Distinct().Count();
    if (n_distinct_ <= MCVList.NValues_)
    {
        // few enough distinct values: record each value's exact frequency
        mcv_ = new MCVList();
        var groups = from value in values
                     group value by value into newGroup
                     orderby newGroup.Key
                     select newGroup;
        int i = 0;
        foreach (var g in groups)
        {
            mcv_.values_[i] = g.Key;
            mcv_.freqs_[i] = (1.0 * g.Count()) / values.Count();
            i++;
        }
        mcv_.nvalues_ = i;
        mcv_.validateThis();
    }
    else
    {
        // now sort the values and create equal-depth historgram
        values.Sort();
        int nbuckets = Math.Min(Historgram.NBuckets_, values.Count);
        int depth = values.Count / nbuckets;
        Debug.Assert(depth >= 1);

        hist_ = new Historgram();
        for (int i = 0; i < nbuckets; i++)
        {
            // bucket boundary is the last (largest) value in the bucket's slice
            hist_.buckets_[i] = values[(i + 1) * depth - 1];
            hist_.distincts_[i] = values.GetRange(i * depth, depth).Distinct().Count();
            Debug.Assert(hist_.distincts_[i] > 0);
        }
        hist_.depth_ = depth;
        hist_.nbuckets_ = nbuckets;
    }

    // finalize the stats
    n_rows_ = samples.Count;
    Debug.Assert(nNulls <= samples.Count);
    if (samples.Count != 0)
    {
        // fix: cast to double — integer division truncated nullfrac_ to 0
        // for any column that is not entirely null
        nullfrac_ = (double)nNulls / samples.Count;
    }
}
// Build an equi-width histogram purely from a column's (min, max, nrows),
// dispatching on the runtime type of min. Integral types are treated as
// discrete domains, the rest as continuous. Returns null for unsupported types.
public static Historgram ConstructFromMinMax(dynamic min, dynamic max, ulong nrows)
{
    var result = new Historgram();

    if (min is int)
    {
        Debug.Assert(max is int);
        constructDomain <int>(true, result, min, max, nrows);
    }
    else if (min is long)
    {
        Debug.Assert(max is long);
        constructDomain <long>(true, result, min, max, nrows);
    }
    else if (min is DateTime)
    {
        Debug.Assert(max is DateTime);
        // Notes: Planck disagrees DateTime not discrete
        constructDomain <DateTime>(false, result, min, max, nrows);
    }
    else if (min is float)
    {
        Debug.Assert(max is float);
        constructDomain <float>(false, result, min, max, nrows);
    }
    else if (min is double)
    {
        Debug.Assert(max is double);
        constructDomain <double>(false, result, min, max, nrows);
    }
    else if (min is decimal)
    {
        Debug.Assert(max is decimal);
        constructDomain <decimal>(false, result, min, max, nrows);
    }
    else
    {
        // this data type is not supported
        return null;
    }

    // sanity checks: at least one bucket filled, and no spill past nbuckets_
    Debug.Assert(result.nbuckets_ >= 1);
    if (result.nbuckets_ < NBuckets_)
        Debug.Assert(result.buckets_[result.nbuckets_] is null);
    return result;
}
// Compute per-column statistics from the sampled rows, reading the value at
// column @index of each row. Unlike a pure MCV-or-histogram split, this version
// builds an MCV list for the top frequent values whenever the column is not a
// unique key, then builds an equal-depth histogram over the remaining values.
public void ComputeStats(int index, List <Row> samples)
{
    int nNulls = 0;
    List <Value> values = new List <Value>();
    foreach (var r in samples)
    {
        Value val = r[index];
        if (val is null)
        {
            nNulls++;
            // nulls are excluded from distinct/MCV/histogram computation
            continue;
        }
        values.Add(val);
    }

    n_distinct_ = (ulong)values.Distinct().Count();

    // initialize mcv whenever the attr is not unique key
    if (n_distinct_ < (ulong)values.Count)
    {
        mcv_ = new MCVList();
        var groups = from value in values
                     group value by value into newGroup
                     select newGroup;
        Dictionary <Value, int> sortgroup = new Dictionary <Value, int>();
        foreach (var g in groups)
        {
            sortgroup.Add(g.Key, g.Count());
        }

        // use top 100 values to calculate frequency, ensure that pairs are sorted in a fixed order.
        var sorted = from pair in sortgroup
                     orderby pair.Value descending, pair.Key descending
                     select pair;
        mcv_.nvalues_ = (int)Math.Min(n_distinct_, MCVList.NValues_);
        int i = 0;
        double freq = 0.0;
        foreach (var g in sorted)
        {
            mcv_.values_[i] = g.Key;
            mcv_.freqs_[i] = (1.0 * g.Value) / values.Count;
            freq += mcv_.freqs_[i];
            i++;
            if (i >= mcv_.nvalues_)
                break;
        }

        if (n_distinct_ > (ulong)mcv_.nvalues_)
        {
            // some values did not make the MCV list: spread the leftover
            // frequency evenly across them
            Debug.Assert(freq > 0 && freq < 1 + StatConst.epsilon_);
            mcv_.otherfreq_ = (1.0 - freq) / (n_distinct_ - (ulong)mcv_.nvalues_);
            mcv_.totalfreq_ = freq;
        }
        else
        {
            mcv_.otherfreq_ = 0;
            mcv_.totalfreq_ = StatConst.one_;
        }
        mcv_.validateThis();

        // remove all values present in mcv
        values.RemoveAll(x => mcv_.values_.Contains(x));
    }

    // initialize histogram unless all values in mcv
    if (values.Count > 0)
    {
        // now sort the values and create equal-depth historgram
        values.Sort();
        int nbuckets = Math.Min(Historgram.NBuckets_, values.Count);
        double depth = ((double)values.Count) / nbuckets;
        Debug.Assert(depth >= 1);

        hist_ = new Historgram();
        for (int i = 0; i < nbuckets; i++)
        {
            // bucket boundary taken from the start of each depth-sized slice,
            // clamped to the last element
            hist_.buckets_[i] = values[Math.Min((int)(i * depth), values.Count - 1)];
        }
        hist_.depth_ = depth;
        hist_.nbuckets_ = nbuckets;
    }

    // finalize the stats
    n_rows_ = (ulong)samples.Count;
    Debug.Assert(nNulls <= samples.Count);
    if (samples.Count != 0)
    {
        // fix: cast to double — integer division truncated nullfrac_ to 0
        // for any column that is not entirely null
        nullfrac_ = (double)nNulls / samples.Count;
    }
}