Esempio n. 1
0
        /// <summary>
        /// Finds the column of <paramref name="x"/> not listed in <paramref name="used"/>
        /// that reports the highest gain against <paramref name="y"/>.
        /// </summary>
        /// <param name="x">Matrix whose columns are the candidate features.</param>
        /// <param name="y">Vector the gain of each column is measured against.</param>
        /// <param name="used">Indices of columns that must not be considered.</param>
        /// <returns>
        /// The winning column index, its gain and the impurity measure that produced it.
        /// If no column beats the initial gain of -1, the tuple holds -1, -1 and null.
        /// </returns>
        private Tuple <int, double, Impurity> GetBestSplit(Matrix x, Vector y, List <int> used)
        {
            var      winningColumn  = -1;
            double   winningGain    = -1;
            Impurity winningMeasure = null;

            for (var column = 0; column < x.Cols; column++)
            {
                // only columns that are not in the used list are candidates
                if (!used.Contains(column))
                {
                    // impurity measure for this column, created from the configured ImpurityType
                    var candidate = (Impurity)Ject.Create(this.ImpurityType);

                    // values of the candidate column
                    var values = x.Col(column);

                    // descriptor entry looked up by column index
                    var descriptor = this.Descriptor.At(column);

                    // discrete columns use the full relative gain,
                    // everything else is segmented using the configured width
                    double candidateGain = descriptor.Discrete
                        ? candidate.RelativeGain(y, values)
                        : candidate.SegmentedRelativeGain(y, values, this.Width);

                    // strictly better than what we have so far?
                    if (candidateGain > winningGain)
                    {
                        winningGain    = candidateGain;
                        winningColumn  = column;
                        winningMeasure = candidate;
                    }
                }
            }

            return new Tuple <int, double, Impurity>(winningColumn, winningGain, winningMeasure);
        }
Esempio n. 2
0
        /// <summary>
        /// Evaluates every column of <paramref name="x"/> that is absent from
        /// <paramref name="used"/> and returns the one with the largest gain.
        /// </summary>
        /// <param name="x">Matrix holding one candidate feature per column.</param>
        /// <param name="y">Vector each column's gain is computed against.</param>
        /// <param name="used">Column indices to skip.</param>
        /// <returns>
        /// A tuple of (column index, gain, impurity measure) for the best column.
        /// When no column exceeds the starting gain of -1 the result is (-1, -1, null).
        /// </returns>
        private Tuple <int, double, Impurity> GetBestSplit(Matrix x, Vector y, List <int> used)
        {
            double   topGain    = -1;
            var      topIndex   = -1;
            Impurity topMeasure = null;

            for (var index = 0; index < x.Cols; index++)
            {
                // columns in the used list are not candidates
                if (!used.Contains(index))
                {
                    var impurity = (Impurity)Ject.Create(ImpurityType);

                    // column values and the descriptor entry at the same index
                    var column = x.Col(index);
                    var info   = Descriptor.At(index);

                    double gain;
                    if (info.Discrete)
                    {
                        // discrete: full relative gain
                        gain = impurity.RelativeGain(y, column);
                    }
                    else
                    {
                        // otherwise: relative gain segmented by Width
                        gain = impurity.SegmentedRelativeGain(y, column, Width);
                    }

                    // keep the candidate only when it is strictly better
                    if (gain > topGain)
                    {
                        topGain    = gain;
                        topIndex   = index;
                        topMeasure = impurity;
                    }
                }
            }

            return new Tuple <int, double, Impurity>(topIndex, topGain, topMeasure);
        }
Esempio n. 3
0
        /// <summary>
        /// Recursively builds a tree node for the rows of <paramref name="x"/> and the
        /// matching labels in <paramref name="y"/>.
        /// </summary>
        /// <param name="x">Matrix with one candidate feature per column.</param>
        /// <param name="y">Labels; sliced with the same row indices as <paramref name="x"/>.</param>
        /// <param name="depth">Remaining depth budget; a negative value produces a leaf.</param>
        /// <param name="used">
        /// Indices of features that may no longer be split on. The chosen feature is appended
        /// to this list and the same list instance is handed to every child call.
        /// </param>
        /// <returns>
        /// A leaf labelled with <c>y.Mode()</c> when the depth budget is exhausted, when
        /// <paramref name="y"/> holds a single distinct value, or when no unused feature has a
        /// gain above -1; otherwise an internal node with one child per split value (enum/bool
        /// feature) or one child per range (any other feature).
        /// </returns>
        private Node BuildTree(Matrix x, Vector y, int depth, List <int> used)
        {
            // reached depth limit or all labels are the same
            if (depth < 0 || y.Distinct().Count() == 1)
            {
                return new Node { IsLeaf = true, Label = y.Mode() };
            }

            double bestGain    = -1;
            int    bestFeature = -1;

            double[] splitValues = new double[] { };

            // measure belonging to the winning feature; kept separately so that code after
            // the loop never reads the measure of whichever column happened to come last
            Impurity bestMeasure = null;

            for (int i = 0; i < x.Cols; i++)
            {
                // a used feature can never be selected, so skip it before
                // building and evaluating its impurity measure
                if (used.Contains(i))
                {
                    continue;
                }

                var feature = x[i, VectorType.Column];
                var fd      = Description.Features[i];

                // is feature discrete? ie enum or bool?
                var discrete = fd.Type.IsEnum || fd.Type == typeof(bool);

                // non-discrete features additionally get the configured Width
                Impurity measure = null;

                switch (Type)
                {
                case ImpurityType.Error:
                    if (!discrete)
                    {
                        measure = Error.Of(y)
                                  .Given(feature)
                                  .WithWidth(Width);
                    }
                    else
                    {
                        measure = Error.Of(y)
                                  .Given(feature);
                    }
                    break;

                case ImpurityType.Entropy:
                    if (!discrete)
                    {
                        measure = Entropy.Of(y)
                                  .Given(feature)
                                  .WithWidth(Width);
                    }
                    else
                    {
                        measure = Entropy.Of(y)
                                  .Given(feature);
                    }
                    break;

                case ImpurityType.Gini:
                    if (!discrete)
                    {
                        measure = Gini.Of(y)
                                  .Given(feature)
                                  .WithWidth(Width);
                    }
                    else
                    {
                        measure = Gini.Of(y)
                                  .Given(feature);
                    }
                    break;
                }

                double gain = measure.RelativeGain();

                if (gain > bestGain)
                {
                    bestGain    = gain;
                    bestFeature = i;
                    bestMeasure = measure;
                    splitValues = measure.SplitValues;
                }
            }

            // no unused feature produced a gain above -1:
            // fall back to a leaf carrying the most common label
            if (bestFeature == -1)
            {
                return new Node { IsLeaf = true, Label = y.Mode() };
            }

            // NOTE(review): the list is mutated in place and shared with every child call,
            // so a feature chosen in one branch is also unavailable to sibling branches
            // and to the caller afterwards - confirm this is intended.
            used.Add(bestFeature);

            Node n = new Node();
            n.Gain = bestGain;

            var bestFD = Description.Features[bestFeature];

            // multiway split - constant fan-out width (non-continuous)
            if (bestFD.Type.IsEnum || bestFD.Type == typeof(bool))
            {
                n.Children = new Node[splitValues.Length];

                // one child per split value: rows whose feature equals that value
                for (int i = 0; i < n.Children.Length; i++)
                {
                    var slice = x.Indices(v => v[bestFeature] == splitValues[i], VectorType.Row);
                    n.Children[i] = BuildTree(x.Slice(slice, VectorType.Row), y.Slice(slice), depth - 1, used);
                }
                n.Segmented = false;
            }
            // continuous split with built in ranges
            else
            {
                // each child covers the half-open range [splitValues[i], splitValues[i + 1]),
                // so the child count comes from the width of the winning measure
                n.Children = new Node[bestMeasure.Width];
                for (int i = 0; i < n.Children.Length; i++)
                {
                    var slice = x.Indices(
                        v => v[bestFeature] >= splitValues[i] && v[bestFeature] < splitValues[i + 1],
                        VectorType.Row);

                    n.Children[i] = BuildTree(x.Slice(slice, VectorType.Row), y.Slice(slice), depth - 1, used);
                }
                n.Segmented = true;
            }

            n.IsLeaf  = false;
            n.Feature = bestFeature;
            n.Values  = splitValues;
            return n;
        }