Пример #1
0
        // Given min, max, construct histogram bucket boundaries for discrete or continuous domains.
        //
        // T          - type every computed boundary is cast to before it is stored in hist.buckets_
        // isDiscrete - true: [min, max] is treated as a range of integers (max - min + 1 values);
        //              false: the range is split evenly across the default number of buckets
        // hist       - histogram to fill in; buckets_, nbuckets_ and depth_ are written
        // min, max   - inclusive domain bounds, expected to share the same runtime type
        // nrows      - total row count, used only to derive the per-bucket depth
        //
        static void constructDomain <T>(bool isDiscrete, Historgram hist, dynamic min, dynamic max, ulong nrows)
        {
            int     nbuckets = NBuckets_;
            dynamic width;

            if (isDiscrete)
            {
                // given [0, 2], we have 3 integers between thus 3 buckets are enough
                var nvals = max - min + 1;
                if (nvals < NBuckets_)
                {
                    // nvals carries the runtime type of min/max. When that type is long, a
                    // dynamic assignment to an int has no implicit conversion and throws
                    // RuntimeBinderException, so narrow explicitly; the value is known to be
                    // below NBuckets_ at this point.
                    nbuckets = (int)nvals;
                }
                width = ((decimal)nvals) / nbuckets;
                Debug.Assert((decimal)width >= 1);
            }
            else
            {
                // use default number of buckets for continuous fields; dividing by
                // (nbuckets - 1) makes the last boundary land on max
                var diff = max - min;
                width = diff / (nbuckets - 1);
            }

            // in case nrows is small, say (min, max, nrows) = (1000, 20000, 3), depth_ will be
            // small and this shall not affect the histogram's correctness.
            //
            Debug.Assert(nbuckets > 0 && nbuckets <= NBuckets_);
            for (int i = 0; i < nbuckets; i++)
            {
                hist.buckets_[i] = (T)(min + (width * i));
            }
            hist.nbuckets_ = nbuckets;
            hist.depth_    = ((double)nrows) / hist.nbuckets_;
        }
Пример #2
0
        // Compute statistics for the column at position `index` from the sampled rows:
        // distinct count, then either a most-common-value list (few distinct values) or an
        // equal-depth histogram (many distinct values), and finally row count and null fraction.
        //
        // NOTE(review): null values are counted in nNulls but are still added to `values`, so
        // they take part in the distinct count, the MCV list and the histogram - confirm that
        // this is intended.
        //
        public void ComputeStats(int index, List <Row> samples)
        {
            int          nNulls = 0;
            List <Value> values = new List <Value>();

            foreach (var r in samples)
            {
                Value val = r[index];
                if (val is null)
                {
                    nNulls++;
                }

                values.Add(val);
            }

            n_distinct_ = values.Distinct().Count();
            if (n_distinct_ <= MCVList.NValues_)
            {
                // few enough distinct values: record every value with its frequency
                mcv_ = new MCVList();
                var groups = from value in values group value by value into newGroup orderby newGroup.Key select newGroup;
                int i      = 0;
                foreach (var g in groups)
                {
                    mcv_.values_[i] = g.Key;
                    mcv_.freqs_[i]  = (1.0 * g.Count()) / values.Count;
                    i++;
                }
                mcv_.nvalues_ = i;
                mcv_.validateThis();
            }
            else
            {
                // now sort the values and create equal-depth histogram
                values.Sort();
                int nbuckets = Math.Min(Historgram.NBuckets_, values.Count);
                int depth    = values.Count / nbuckets;
                Debug.Assert(depth >= 1);

                hist_ = new Historgram();
                for (int i = 0; i < nbuckets; i++)
                {
                    // bucket boundary is the last value of the i-th run of `depth` values
                    hist_.buckets_[i]   = values[(i + 1) * depth - 1];
                    hist_.distincts_[i] = values.GetRange(i * depth, depth).Distinct().Count();
                    Debug.Assert(hist_.distincts_[i] > 0);
                }
                hist_.depth_    = depth;
                hist_.nbuckets_ = nbuckets;
            }

            // finalize the stats
            n_rows_ = samples.Count;
            Debug.Assert(nNulls <= samples.Count);
            if (samples.Count != 0)
            {
                // divide in floating point: int / int would truncate the fraction to 0
                // for every column that is not entirely null
                nullfrac_ = (double)nNulls / samples.Count;
            }
        }
Пример #3
0
        // Build a histogram from only the (min, max) bounds and the row count. The runtime
        // type of min selects the element type and whether the domain is treated as discrete.
        // Returns null when the type of min is not one of the supported types.
        //
        public static Historgram ConstructFromMinMax(dynamic min, dynamic max, ulong nrows)
        {
            var result = new Historgram();

            if (min is int)
            {
                Debug.Assert(max is int);
                constructDomain <int>(true, result, min, max, nrows);
            }
            else if (min is long)
            {
                Debug.Assert(max is long);
                constructDomain <long>(true, result, min, max, nrows);
            }
            else if (min is DateTime)
            {
                Debug.Assert(max is DateTime);
                // Notes: Planck disagrees DateTime not discrete
                constructDomain <DateTime>(false, result, min, max, nrows);
            }
            else if (min is float)
            {
                Debug.Assert(max is float);
                constructDomain <float>(false, result, min, max, nrows);
            }
            else if (min is double)
            {
                Debug.Assert(max is double);
                constructDomain <double>(false, result, min, max, nrows);
            }
            else if (min is decimal)
            {
                Debug.Assert(max is decimal);
                constructDomain <decimal>(false, result, min, max, nrows);
            }
            else
            {
                // this data type is not supported
                return null;
            }

            // sanity checks: at least one bucket, and the slot right after the last
            // used bucket (if there is one) must still be empty
            Debug.Assert(result.nbuckets_ >= 1);
            Debug.Assert(result.nbuckets_ >= NBuckets_ || result.buckets_[result.nbuckets_] is null);
            return result;
        }
Пример #4
0
        // Compute statistics for the column at position `index` from the sampled rows.
        // Null values are counted and then skipped. If any non-null value repeats, the most
        // frequent values (at most MCVList.NValues_) go into the MCV list and are removed from
        // the working set; whatever remains is summarized by an equal-depth histogram.
        // Row count and null fraction are recorded last.
        //
        public void ComputeStats(int index, List <Row> samples)
        {
            int          nNulls = 0;
            List <Value> values = new List <Value>();

            foreach (var r in samples)
            {
                Value val = r[index];
                if (val is null)
                {
                    nNulls++;
                    continue;
                }

                values.Add(val);
            }

            n_distinct_ = (ulong)values.Distinct().Count();
            // initialize mcv whenever the attr is not unique key
            if (n_distinct_ < (ulong)values.Count)
            {
                mcv_ = new MCVList();
                var groups = from value in values group value by value into newGroup select newGroup;

                // value -> number of occurrences
                Dictionary <Value, int> sortgroup = new Dictionary <Value, int>();
                foreach (var g in groups)
                {
                    sortgroup.Add(g.Key, g.Count());
                }

                // use top 100 values to calculate frequency, ensure that pairs are sorted in a fixed order.
                var sorted = from pair in sortgroup orderby pair.Value descending, pair.Key descending select pair;
                mcv_.nvalues_ = (int)Math.Min(n_distinct_, MCVList.NValues_);

                int    i    = 0;
                double freq = 0.0;
                foreach (var g in sorted)
                {
                    mcv_.values_[i] = g.Key;
                    mcv_.freqs_[i]  = (1.0 * g.Value) / values.Count;
                    freq           += mcv_.freqs_[i];
                    i++;
                    if (i >= mcv_.nvalues_)
                    {
                        break;
                    }
                }
                if (n_distinct_ > (ulong)mcv_.nvalues_)
                {
                    // spread the frequency not covered by the mcv evenly over the
                    // remaining distinct values
                    Debug.Assert(freq > 0 && freq < 1 + StatConst.epsilon_);
                    mcv_.otherfreq_ = (1.0 - freq) / (n_distinct_ - (ulong)mcv_.nvalues_);
                    mcv_.totalfreq_ = freq;
                }
                else
                {
                    mcv_.otherfreq_ = 0;
                    mcv_.totalfreq_ = StatConst.one_;
                }

                mcv_.validateThis();

                // remove all values present in mcv
                values.RemoveAll(x => mcv_.values_.Contains(x));
            }
            // initialize histogram unless all values in mcv
            if (values.Count > 0)
            {
                // now sort the values and create equal-depth histogram
                values.Sort();
                int    nbuckets = Math.Min(Historgram.NBuckets_, values.Count);
                double depth    = ((double)values.Count) / nbuckets;
                Debug.Assert(depth >= 1);

                hist_ = new Historgram();
                for (int i = 0; i < nbuckets; i++)
                {
                    // boundary is the first value of the i-th run; the index is clamped
                    // so it can never pass the end of the list
                    hist_.buckets_[i] = values[Math.Min((int)(i * depth), values.Count - 1)];
                }
                hist_.depth_    = depth;
                hist_.nbuckets_ = nbuckets;
            }

            // finalize the stats
            n_rows_ = (ulong)samples.Count;
            Debug.Assert(nNulls <= samples.Count);
            if (samples.Count != 0)
            {
                // divide in floating point: int / int would truncate the fraction to 0
                // for every column that is not entirely null
                nullfrac_ = (double)nNulls / samples.Count;
            }
        }