/// <summary>Creates one <see cref="Measure"/> for every column of <paramref name="x"/>.</summary>
/// <remarks>
/// A discrete column yields one statistic per distinct value (in ascending order), labelled
/// with the converted value. Any other column yields one statistic per segment produced by
/// <c>Segment(Width)</c>, labelled with the column name.
/// </remarks>
/// <param name="x">The Matrix to process.</param>
/// <returns>An array holding one measure per column of <paramref name="x"/>.</returns>
private Measure[] GetBaseConditionals(Matrix x)
{
    var measures = new Measure[x.Cols];

    for (int col = 0; col < measures.Length; col++)
    {
        Property property = Descriptor.At(col);
        var measure = new Measure
        {
            Discrete = property.Discrete,
            Label = Descriptor.ColumnAt(col),
        };

        var column = x[col, VectorType.Col];

        if (measure.Discrete)
        {
            // one entry per distinct value, named after its converted representation
            measure.Probabilities = column
                .Distinct()
                .OrderBy(value => value)
                .Select(value => Statistic.Make(property.Convert(value).ToString(), value, 1))
                .ToArray();
        }
        else
        {
            // one entry per segment, all sharing the column label
            measure.Probabilities = column
                .Segment(Width)
                .Select(range => Statistic.Make(measure.Label, range, 1))
                .ToArray();
        }

        measures[col] = measure;
    }

    return measures;
}
/// <summary>Finds the unused column with the highest relative gain.</summary>
/// <param name="x">The Matrix to process.</param>
/// <param name="y">The Vector to process.</param>
/// <param name="used">Column indices that must be skipped.</param>
/// <returns>
/// A tuple of (column index, gain, impurity measure). The index is -1 and the measure is
/// null when no column produced a gain greater than -1.
/// </returns>
private Tuple<int, double, Impurity> GetBestSplit(Matrix x, Vector y, List<int> used)
{
    int winningColumn = -1;
    double winningGain = -1;
    Impurity winningMeasure = null;

    for (int col = 0; col < x.Cols; col++)
    {
        // only columns that have not been used yet are candidates
        if (!used.Contains(col))
        {
            // each candidate gets its own impurity instance
            Impurity candidate = (Impurity)Ject.Create(ImpurityType);

            // column values to evaluate
            var values = x.Col(col);

            // property describing the column at this index
            // (the lookup matters because of multivalued columns)
            var property = Descriptor.At(col);

            // discrete: full relative gain; otherwise: gain over segments of the given width
            double candidateGain = property.Discrete
                ? candidate.RelativeGain(y, values)
                : candidate.SegmentedRelativeGain(y, values, Width);

            if (candidateGain > winningGain)
            {
                winningGain = candidateGain;
                winningColumn = col;
                winningMeasure = candidate;
            }
        }
    }

    return new Tuple<int, double, Impurity>(winningColumn, winningGain, winningMeasure);
}
/// <summary>
/// Walks the tree starting at <paramref name="node"/>, following at each step the outgoing
/// edge whose value or range matches <paramref name="v"/>, until a leaf is reached.
/// </summary>
/// <exception cref="InvalidOperationException">
/// Thrown when a non-leaf node has a column of -1, or when no outgoing edge matches the
/// value and Hint is still <c>double.Epsilon</c>.
/// </exception>
/// <param name="v">The Vector to process.</param>
/// <param name="node">The node to start from.</param>
/// <returns>
/// The value of the leaf that was reached, or Hint when no edge matches and Hint differs
/// from <c>double.Epsilon</c>.
/// </returns>
private double WalkNode(Vector v, Node node)
{
    if (node.IsLeaf)
    {
        return node.Value;
    }

    // Get the index of the feature for this node.
    var col = node.Column;
    if (col == -1)
    {
        throw new InvalidOperationException("Invalid Feature encountered during node walk!");
    }

    var edges = Tree.GetOutEdges(node).ToArray();
    foreach (Edge edge in edges)
    {
        var value = v[col];

        // discrete edges match on equality with Min, all others on the half-open range [Min, Max)
        bool matches = edge.Discrete
            ? value == edge.Min
            : value >= edge.Min && value < edge.Max;

        if (matches)
        {
            return WalkNode(v, (Node)Tree.GetVertex(edge.ChildId));
        }
    }

    // double.Epsilon acts as the "no hint configured" marker
    if (Hint != double.Epsilon)
    {
        return Hint;
    }

    // The column index placeholder was previously written as a literal "[2]", so the
    // third argument never appeared in the message.
    throw new InvalidOperationException(
        string.Format(
            "Unable to match split value {0} for feature {1}[{2}]\nConsider setting a Hint in order to avoid this error.",
            v[col],
            Descriptor.At(col),
            col));
}
/// <summary>Selects the not-yet-used column whose relative gain is largest.</summary>
/// <param name="x">The Matrix to process.</param>
/// <param name="y">The Vector to process.</param>
/// <param name="used">Column indices to leave out of the search.</param>
/// <returns>
/// A tuple of (column index, gain, impurity measure); the index stays -1 and the measure
/// stays null when no column yields a gain greater than -1.
/// </returns>
private Tuple<int, double, Impurity> GetBestSplit(Matrix x, Vector y, List<int> used)
{
    int topColumn = -1;
    double topGain = -1;
    Impurity topMeasure = null;

    for (int index = 0; index < x.Cols; index++)
    {
        // skip columns that were already split on
        if (used.Contains(index))
        {
            continue;
        }

        var impurity = (Impurity)Ject.Create(ImpurityType);

        // values of the column under consideration
        var column = x.Col(index);

        // property for this index (relevant because of multivalued columns)
        var descriptorProperty = Descriptor.At(index);

        double currentGain;
        if (descriptorProperty.Discrete)
        {
            // discrete: full relative gain
            currentGain = impurity.RelativeGain(y, column);
        }
        else
        {
            // continuous: gain computed over segments of the configured width
            currentGain = impurity.SegmentedRelativeGain(y, column, Width);
        }

        // keep the candidate only when it strictly improves on the best so far
        if (currentGain > topGain)
        {
            topGain = currentGain;
            topColumn = index;
            topMeasure = impurity;
        }
    }

    return new Tuple<int, double, Impurity>(topColumn, topGain, topMeasure);
}
/// <summary>Recursively builds the (sub)tree for the supplied examples and registers it in <paramref name="tree"/>.</summary>
/// <param name="x">The Matrix to process.</param>
/// <param name="y">The Vector to process.</param>
/// <param name="depth">The remaining depth; a negative value produces a leaf holding the mode of <paramref name="y"/>.</param>
/// <param name="used">Columns that were already split on; the column chosen here is appended to it.</param>
/// <param name="tree">The tree that receives the vertices and edges created by this call.</param>
/// <returns>The node created for this level. When it is not a leaf returned early, it has already been added to <paramref name="tree"/>.</returns>
private Node BuildTree(Matrix x, Vector y, int depth, List<int> used, Tree tree)
{
    if (depth < 0)
    {
        return BuildLeafNode(y.Mode());
    }

    var split = GetBestSplit(x, y, used);
    var col = split.Item1;
    var gain = split.Item2;
    var measure = split.Item3;

    // no column was selected: fall back to a leaf carrying the most common label
    if (col == -1)
    {
        return BuildLeafNode(y.Mode());
    }

    // NOTE(review): the same list instance is handed to every recursive call below, so a
    // column chosen in one branch is also excluded from its sibling branches - confirm
    // that this is intended.
    used.Add(col);

    Node node = new Node
    {
        Column = col,
        Gain = gain,
        IsLeaf = false,
        Name = Descriptor.ColumnAt(col)
    };

    // populate edges
    List<Edge> edges = new List<Edge>(measure.Segments.Length);
    for (int i = 0; i < measure.Segments.Length; i++)
    {
        // working set
        var segment = measure.Segments[i];
        var edge = new Edge()
        {
            ParentId = node.Id,
            Discrete = measure.Discrete,
            Min = segment.Min,
            Max = segment.Max
        };

        IEnumerable<int> slice;
        if (edge.Discrete)
        {
            // get discrete label
            edge.Label = Descriptor.At(col).Convert(segment.Min).ToString();

            // do value check for matrix slicing
            slice = x.Indices(v => v[col] == segment.Min);
        }
        else
        {
            // get range label
            edge.Label = string.Format("{0} <= x < {1}", segment.Min, segment.Max);

            // do range check for matrix slicing
            slice = x.Indices(v => v[col] >= segment.Min && v[col] < segment.Max);
        }

        // The indices are consumed several times below (emptiness check, y slice, x slice).
        // Materialize them once so the matrix is not rescanned on each use should Indices
        // be lazily evaluated.
        slice = slice.ToArray();

        // no matching rows: this edge would lead to a dead end, so it is not built
        if (!slice.Any())
        {
            continue;
        }

        Vector ySlice = y.Slice(slice);

        // a single distinct answer becomes a leaf; otherwise keep building the tree
        Node child = ySlice.Distinct().Count() == 1
            ? BuildLeafNode(ySlice[0])
            : BuildTree(x.Slice(slice), ySlice, depth - 1, used, tree);

        tree.AddVertex(child);
        edge.ChildId = child.Id;
        edges.Add(edge);
    }

    // A split needs at least two outgoing edges; otherwise the parent is converted into a
    // terminal node carrying the mode.
    // NOTE(review): a child added to the tree above stays in it as a vertex even when this
    // node is collapsed into a leaf and its edge is discarded - confirm that this is intended.
    bool hasSplit = edges.Count > 1;
    if (!hasSplit)
    {
        node.IsLeaf = true;
        node.Value = y.Mode();
    }

    tree.AddVertex(node);

    if (hasSplit)
    {
        foreach (var e in edges)
        {
            tree.AddEdge(e);
        }
    }

    return node;
}