/// <summary>
/// Scans every column of <paramref name="x"/> that is not listed in
/// <paramref name="used"/> and picks the one whose impurity measure
/// reports the largest relative gain with respect to <paramref name="y"/>.
/// </summary>
/// <param name="x">Matrix whose columns are the candidate features.</param>
/// <param name="y">Vector the gain of each column is measured against.</param>
/// <param name="used">Column indices that must be skipped.</param>
/// <returns>
/// A tuple of (column index, gain, impurity measure) for the winning column.
/// When no column yields a gain greater than -1 the tuple is (-1, -1, null).
/// </returns>
private Tuple<int, double, Impurity> GetBestSplit(Matrix x, Vector y, List<int> used)
{
    int winner = -1;
    double topGain = -1;
    Impurity topMeasure = null;

    for (var col = 0; col < x.Cols; col++)
    {
        // only columns that have not been consumed yet are candidates
        if (!used.Contains(col))
        {
            // each candidate gets its own impurity instance so the winning
            // one can be handed back to the caller
            var candidate = (Impurity)Ject.Create(ImpurityType);

            // values of the column under test
            var values = x.Col(col);

            // property describing this column; looked up by column index
            // (the original note says this matters for multi-valued columns)
            var descriptor = Descriptor.At(col);

            // discrete properties use the full relative gain,
            // everything else is segmented by the configured width
            double score = descriptor.Discrete
                ? candidate.RelativeGain(y, values)
                : candidate.SegmentedRelativeGain(y, values, Width);

            // strictly greater: on ties the earliest column wins
            if (score > topGain)
            {
                topGain = score;
                winner = col;
                topMeasure = candidate;
            }
        }
    }

    return Tuple.Create(winner, topGain, topMeasure);
}
/// <summary>
/// Determines which unused column of <paramref name="x"/> gives the highest
/// relative gain against <paramref name="y"/>.
/// </summary>
/// <param name="x">Matrix holding one candidate feature per column.</param>
/// <param name="y">Vector the gain is computed against.</param>
/// <param name="used">Indices of columns that are excluded from the search.</param>
/// <returns>
/// Tuple containing the index of the best column, its gain and the impurity
/// measure that produced it; (-1, -1, null) if no column beat the initial
/// gain of -1.
/// </returns>
private Tuple<int, double, Impurity> GetBestSplit(Matrix x, Vector y, List<int> used)
{
    double highestGain = -1;
    var chosenFeature = -1;
    Impurity chosenMeasure = null;

    for (var index = 0; index < x.Cols; index++)
    {
        // skip columns that were already used
        if (used.Contains(index))
        {
            continue;
        }

        var impurity = (Impurity)Ject.Create(ImpurityType);

        // column vector for this feature
        var column = x.Col(index);

        // property at this column index
        // (the original note says this matters for multi-valued columns)
        var property = Descriptor.At(index);

        double gain;
        if (property.Discrete)
        {
            // discrete: full relative gain
            gain = impurity.RelativeGain(y, column);
        }
        else
        {
            // otherwise: gain over segments of the configured width
            gain = impurity.SegmentedRelativeGain(y, column, Width);
        }

        // keep the candidate only when it strictly improves on the best so far
        if (gain > highestGain)
        {
            highestGain = gain;
            chosenFeature = index;
            chosenMeasure = impurity;
        }
    }

    return new Tuple<int, double, Impurity>(chosenFeature, highestGain, chosenMeasure);
}
/// <summary>
/// Recursively builds a decision (sub)tree: picks the unused feature column
/// with the highest relative gain, partitions the rows by that feature's
/// split values and recurses into each partition.
/// </summary>
/// <param name="x">Examples; features are addressed by column, examples by row.</param>
/// <param name="y">Labels, sliced with the same row indices as <paramref name="x"/>.</param>
/// <param name="depth">Remaining depth budget; a negative value forces a leaf.</param>
/// <param name="used">
/// Indices of features that may no longer be selected. The chosen feature is
/// appended to this list, and the same list instance is passed to every
/// recursive call.
/// </param>
/// <returns>
/// A leaf labelled with <c>y.Mode()</c> when the depth budget is exhausted,
/// when <paramref name="y"/> has exactly one distinct value, or when no
/// unused feature has a gain greater than -1. Otherwise an internal node
/// with <c>Gain</c>, <c>Feature</c>, <c>Values</c>, <c>Segmented</c> and
/// <c>Children</c> populated.
/// </returns>
/// <exception cref="System.InvalidOperationException">
/// <c>Type</c> is not one of the impurity types handled here.
/// </exception>
private Node BuildTree(Matrix x, Vector y, int depth, List<int> used)
{
    // reached depth limit or all labels are the same
    if (depth < 0 || y.Distinct().Count() == 1)
    {
        return new Node { IsLeaf = true, Label = y.Mode() };
    }

    double bestGain = -1;
    int bestFeature = -1;
    double[] splitValues = new double[] { };

    // The measure belonging to the winning feature. It must be tracked
    // together with bestFeature: the per-iteration measure is replaced on
    // every pass, so after the loop it describes the last column evaluated,
    // not the best one.
    Impurity bestMeasure = null;

    for (int i = 0; i < x.Cols; i++)
    {
        // already used? then there is no point computing its impurity
        if (used.Contains(i))
        {
            continue;
        }

        var feature = x[i, VectorType.Column];
        var fd = Description.Features[i];

        // is feature discrete? ie enum or bool?
        var discrete = fd.Type.IsEnum || fd.Type == typeof(bool);

        // non-discrete features additionally get the configured width
        Impurity measure;
        switch (Type)
        {
            case ImpurityType.Error:
                if (!discrete)
                {
                    measure = Error.Of(y)
                              .Given(feature)
                              .WithWidth(Width);
                }
                else
                {
                    measure = Error.Of(y)
                              .Given(feature);
                }

                break;

            case ImpurityType.Entropy:
                if (!discrete)
                {
                    measure = Entropy.Of(y)
                              .Given(feature)
                              .WithWidth(Width);
                }
                else
                {
                    measure = Entropy.Of(y)
                              .Given(feature);
                }

                break;

            case ImpurityType.Gini:
                if (!discrete)
                {
                    measure = Gini.Of(y)
                              .Given(feature)
                              .WithWidth(Width);
                }
                else
                {
                    measure = Gini.Of(y)
                              .Given(feature);
                }

                break;

            default:
                // fail with a clear message instead of a null dereference below
                throw new System.InvalidOperationException(
                    "Unsupported impurity type: " + Type);
        }

        double gain = measure.RelativeGain();

        // strictly greater: on ties the earliest feature wins
        if (gain > bestGain)
        {
            bestGain = gain;
            bestFeature = i;
            bestMeasure = measure;
            splitValues = measure.SplitValues;
        }
    }

    // nothing left to split on (or no feature produced a usable gain):
    // fall back to a leaf carrying the most common label
    if (bestFeature == -1)
    {
        return new Node { IsLeaf = true, Label = y.Mode() };
    }

    // NOTE(review): `used` is one shared list, so a feature picked while
    // building one child is also unavailable to its sibling subtrees, and
    // the caller's list is mutated. Confirm this is intended.
    used.Add(bestFeature);

    Node n = new Node();
    n.Gain = bestGain;

    var bestFD = Description.Features[bestFeature];

    // multiway split - constant fan-out width (non-continuous)
    if (bestFD.Type.IsEnum || bestFD.Type == typeof(bool))
    {
        // one child per distinct split value
        n.Children = new Node[splitValues.Length];
        for (int i = 0; i < n.Children.Length; i++)
        {
            var slice = x.Indices(v => v[bestFeature] == splitValues[i], VectorType.Row);
            n.Children[i] = BuildTree(x.Slice(slice, VectorType.Row), y.Slice(slice), depth - 1, used);
        }

        n.Segmented = false;
    }

    // continuous split with built in ranges
    else
    {
        // Each child covers the half-open range
        // [splitValues[i], splitValues[i + 1]), so the child count comes
        // from the width of the measure that produced these split values.
        // NOTE(review): rows equal to the final boundary match no child;
        // confirm how SplitValues defines its last boundary.
        n.Children = new Node[bestMeasure.Width];
        for (int i = 0; i < n.Children.Length; i++)
        {
            var slice = x.Indices(
                v => v[bestFeature] >= splitValues[i] && v[bestFeature] < splitValues[i + 1],
                VectorType.Row);

            n.Children[i] = BuildTree(x.Slice(slice, VectorType.Row), y.Slice(slice), depth - 1, used);
        }

        n.Segmented = true;
    }

    n.IsLeaf = false;
    n.Feature = bestFeature;
    n.Values = splitValues;
    return n;
}