Ejemplo n.º 1
0
        /// <summary>
        /// Computes statistics for the column at <paramref name="index"/> from the sampled rows
        /// and stores them in this object's fields: the distinct count, either an MCV list (when
        /// the distinct count fits in <c>MCVList.NValues_</c>) or an equal-depth histogram, the
        /// row count, and the null fraction.
        /// </summary>
        /// <param name="index">Position of the column inside each sampled row.</param>
        /// <param name="samples">Sampled rows to analyze.</param>
        public void ComputeStats(int index, List <Row> samples)
        {
            int nNulls = 0;
            List <Value> values = new List <Value>(samples.Count);

            foreach (var r in samples)
            {
                Value val = r[index];
                if (val is null)
                {
                    nNulls++;
                }

                // NOTE(review): null values are still appended here, so they take part in the
                // distinct count and in the MCV list / histogram below - confirm this is intended.
                values.Add(val);
            }

            // Read the list size once through the property instead of the LINQ Count() extension.
            int nValues = values.Count;

            n_distinct_ = values.Distinct().Count();
            if (n_distinct_ <= MCVList.NValues_)
            {
                // Few enough distinct values: record every value with its frequency, ordered by value.
                mcv_ = new MCVList();
                int i = 0;
                foreach (var g in values.GroupBy(v => v).OrderBy(grp => grp.Key))
                {
                    mcv_.values_[i] = g.Key;
                    mcv_.freqs_[i]  = (1.0 * g.Count()) / nValues;
                    i++;
                }
                mcv_.nvalues_ = i;
                mcv_.validateThis();
            }
            else
            {
                // now sort the values and create equal-depth histogram
                values.Sort();
                int nbuckets = Math.Min(Historgram.NBuckets_, nValues);
                int depth    = nValues / nbuckets;
                Debug.Assert(depth >= 1);

                // NOTE(review): depth is truncated, so the last (nValues % nbuckets) sorted values
                // fall outside every bucket - confirm this is acceptable.
                hist_ = new Historgram();
                for (int i = 0; i < nbuckets; i++)
                {
                    // The bucket boundary is the last value of the i-th slice of `depth` values.
                    hist_.buckets_[i]   = values[(i + 1) * depth - 1];
                    hist_.distincts_[i] = values.GetRange(i * depth, depth).Distinct().Count();
                    Debug.Assert(hist_.distincts_[i] > 0);
                }
                hist_.depth_    = depth;
                hist_.nbuckets_ = nbuckets;
            }

            // finalize the stats
            n_rows_ = samples.Count;
            Debug.Assert(nNulls <= samples.Count);
            if (samples.Count != 0)
            {
                // Floating-point division: int / int would truncate the fraction to 0
                // unless every sample is null.
                nullfrac_ = (double)nNulls / samples.Count;
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Computes statistics for the column at <paramref name="index"/> from the sampled rows
        /// and stores them in this object's fields. Null values are counted and excluded from the
        /// distribution. When the non-null values are not all unique, an MCV list of the most
        /// frequent values is built; any values not captured by the MCV list are summarized in an
        /// equal-depth histogram. Finally the row count and null fraction are recorded.
        /// </summary>
        /// <param name="index">Position of the column inside each sampled row.</param>
        /// <param name="samples">Sampled rows to analyze.</param>
        public void ComputeStats(int index, List <Row> samples)
        {
            int nNulls = 0;
            List <Value> values = new List <Value>(samples.Count);

            foreach (var r in samples)
            {
                Value val = r[index];
                if (val is null)
                {
                    nNulls++;
                    continue;
                }

                values.Add(val);
            }

            // Number of non-null values; captured before MCV values are removed from the list.
            int nValues = values.Count;

            n_distinct_ = (ulong)values.Distinct().Count();
            // initialize mcv whenever the attr is not unique key
            if (n_distinct_ < (ulong)nValues)
            {
                mcv_ = new MCVList();

                // Rank values by occurrence count, breaking ties by the value itself so the
                // resulting order is deterministic.
                var ranked = values.GroupBy(v => v)
                                   .Select(grp => new KeyValuePair <Value, int>(grp.Key, grp.Count()))
                                   .OrderByDescending(pair => pair.Value)
                                   .ThenByDescending(pair => pair.Key);

                // keep at most MCVList.NValues_ of the most frequent values
                mcv_.nvalues_ = (int)Math.Min(n_distinct_, MCVList.NValues_);

                int    i    = 0;
                double freq = 0.0;
                foreach (var entry in ranked)
                {
                    mcv_.values_[i] = entry.Key;
                    mcv_.freqs_[i]  = (1.0 * entry.Value) / nValues;
                    freq           += mcv_.freqs_[i];
                    i++;
                    if (i >= mcv_.nvalues_)
                    {
                        break;
                    }
                }

                if (n_distinct_ > (ulong)mcv_.nvalues_)
                {
                    // Some distinct values did not fit: spread the remaining frequency evenly
                    // across them.
                    Debug.Assert(freq > 0 && freq < 1 + StatConst.epsilon_);
                    mcv_.otherfreq_ = (1.0 - freq) / (n_distinct_ - (ulong)mcv_.nvalues_);
                    mcv_.totalfreq_ = freq;
                }
                else
                {
                    mcv_.otherfreq_ = 0;
                    mcv_.totalfreq_ = StatConst.one_;
                }

                mcv_.validateThis();

                // remove all values present in mcv; a hash set gives constant-time membership
                // checks instead of scanning the MCV collection for every value
                var mcvValues = new HashSet <Value>(mcv_.values_);
                values.RemoveAll(x => mcvValues.Contains(x));
            }

            // initialize histogram unless all values in mcv
            if (values.Count > 0)
            {
                // now sort the values and create equal-depth histogram
                values.Sort();
                int    nbuckets = Math.Min(Historgram.NBuckets_, values.Count);
                double depth    = ((double)values.Count) / nbuckets;
                Debug.Assert(depth >= 1);

                hist_ = new Historgram();
                for (int i = 0; i < nbuckets; i++)
                {
                    // Fractional depth spreads the boundaries over the whole sorted list; the
                    // index is clamped to stay inside it.
                    hist_.buckets_[i] = values[Math.Min((int)(i * depth), values.Count - 1)];
                }
                hist_.depth_    = depth;
                hist_.nbuckets_ = nbuckets;
            }

            // finalize the stats
            n_rows_ = (ulong)samples.Count;
            Debug.Assert(nNulls <= samples.Count);
            if (samples.Count != 0)
            {
                // Floating-point division: int / int would truncate the fraction to 0
                // unless every sample is null.
                nullfrac_ = (double)nNulls / samples.Count;
            }
        }