/// <summary>
/// Builds the tree by choosing the best dividing criteria for the current set,
/// then recursing into the two resulting subsets.
/// </summary>
/// <param name="rows">
/// Rows to split. The last element of every row is excluded from the candidate
/// split columns (presumably it holds the outcome being predicted).
/// </param>
/// <param name="mode">
/// Metric used to score a set of rows. The same metric is applied at every level
/// of the tree; values not handled by the switch fall back to entropy.
/// </param>
/// <returns>
/// An empty <c>DecisionNode</c> when <paramref name="rows"/> has no rows; a branch
/// node (column, value, true/false children) when some split yields a positive gain;
/// otherwise a node carrying <c>UniqueCounts(rows)</c> as its results.
/// </returns>
public static DecisionNode BuildTree(Set rows, MeasuringMetric mode = MeasuringMetric.Entropy)
{
    Fun scoreCalc = Entropy;
    switch (mode)
    {
        case MeasuringMetric.Entropy:
            scoreCalc = Entropy;
            break;
        case MeasuringMetric.GiniImpurity:
            scoreCalc = GiniImpurity;
            break;
        case MeasuringMetric.Variance:
            scoreCalc = Variance;
            break;
    }

    if (rows.Count == 0)
    {
        return new DecisionNode();
    }

    double currentScore = scoreCalc(rows);

    // Set up some variables to track the best criteria
    double bestGain = 0;
    Pair bestCriteria = new Pair();
    DividedSet bestSets = new DividedSet();

    // The last column is never considered as a split candidate.
    int columnCount = rows[0].Length - 1;
    for (int col = 0; col < columnCount; ++col)
    {
        // Generate the list of different values in this column. The list keeps
        // first-seen order (ties between equal gains go to the earliest value),
        // while the set makes the membership check constant-time.
        List<string> columnValues = new List<string>();
        HashSet<string> seenValues = new HashSet<string>();
        foreach (var row in rows)
        {
            string cell = row[col];
            if (seenValues.Add(cell))
            {
                columnValues.Add(cell);
            }
        }

        // Now try dividing the rows up for each value in this column
        foreach (var value in columnValues)
        {
            DividedSet divSet = DivideSet(rows, col, value);

            // Information gain: parent score minus the size-weighted scores of both halves
            double p = (double)divSet.Key.Count / rows.Count;
            double gain = currentScore - p * scoreCalc(divSet.Key) - (1 - p) * scoreCalc(divSet.Value);

            // Only accept splits that actually separate the rows into two non-empty sets
            if (gain > bestGain && divSet.Key.Count > 0 && divSet.Value.Count > 0)
            {
                bestGain = gain;
                bestCriteria = new Pair(col, value);
                bestSets = divSet;
            }
        }
    }

    // Create the subbranches
    if (bestGain > 0)
    {
        // Pass the metric down explicitly; otherwise every subtree would silently
        // fall back to the default metric regardless of what the caller requested.
        DecisionNode trueBranch = BuildTree(bestSets.Key, mode);
        DecisionNode falseBranch = BuildTree(bestSets.Value, mode);
        return new DecisionNode(column: bestCriteria.Key, value: bestCriteria.Value, nextTrueNode: trueBranch, nextFalseNode: falseBranch);
    }

    // No split improves the score: stop here and record the counts for these rows
    return new DecisionNode(results: UniqueCounts(rows));
}
/// <summary>
/// Builds the tree by choosing the best dividing criteria for the current set,
/// then recursing into the two resulting subsets.
/// </summary>
/// <param name="rows">
/// Rows to split. The last element of every row is excluded from the candidate
/// split columns (presumably it holds the outcome being predicted).
/// </param>
/// <param name="mode">
/// Metric used to score a set of rows. The same metric is applied at every level
/// of the tree; values not handled by the switch fall back to entropy.
/// </param>
/// <returns>
/// An empty <c>DecisionNode</c> when <paramref name="rows"/> has no rows; a branch
/// node (column, value, true/false children) when some split yields a positive gain;
/// otherwise a node carrying <c>UniqueCounts(rows)</c> as its results.
/// </returns>
public static DecisionNode BuildTree(Set rows, MeasuringMetric mode = MeasuringMetric.Entropy)
{
    Fun scoreCalc = Entropy;
    switch (mode)
    {
        case MeasuringMetric.Entropy:
            scoreCalc = Entropy;
            break;
        case MeasuringMetric.GiniImpurity:
            scoreCalc = GiniImpurity;
            break;
        case MeasuringMetric.Variance:
            scoreCalc = Variance;
            break;
    }

    if (rows.Count == 0)
    {
        return new DecisionNode();
    }

    double currentScore = scoreCalc(rows);

    // Set up some variables to track the best criteria
    double bestGain = 0;
    Pair bestCriteria = new Pair();
    DividedSet bestSets = new DividedSet();

    // The last column is never considered as a split candidate.
    int columnCount = rows[0].Length - 1;
    for (int col = 0; col < columnCount; ++col)
    {
        // Generate the list of different values in this column. The list keeps
        // first-seen order (ties between equal gains go to the earliest value),
        // while the set makes the membership check constant-time.
        List<string> columnValues = new List<string>();
        HashSet<string> seenValues = new HashSet<string>();
        foreach (var row in rows)
        {
            string cell = row[col];
            if (seenValues.Add(cell))
            {
                columnValues.Add(cell);
            }
        }

        // Now try dividing the rows up for each value in this column
        foreach (var value in columnValues)
        {
            DividedSet divSet = DivideSet(rows, col, value);

            // Information gain: parent score minus the size-weighted scores of both halves
            double p = (double)divSet.Key.Count / rows.Count;
            double gain = currentScore - p * scoreCalc(divSet.Key) - (1 - p) * scoreCalc(divSet.Value);

            // Only accept splits that actually separate the rows into two non-empty sets
            if (gain > bestGain && divSet.Key.Count > 0 && divSet.Value.Count > 0)
            {
                bestGain = gain;
                bestCriteria = new Pair(col, value);
                bestSets = divSet;
            }
        }
    }

    // Create the subbranches
    if (bestGain > 0)
    {
        // Pass the metric down explicitly; otherwise every subtree would silently
        // fall back to the default metric regardless of what the caller requested.
        DecisionNode trueBranch = BuildTree(bestSets.Key, mode);
        DecisionNode falseBranch = BuildTree(bestSets.Value, mode);
        return new DecisionNode(column: bestCriteria.Key, value: bestCriteria.Value, nextTrueNode: trueBranch, nextFalseNode: falseBranch);
    }

    // No split improves the score: stop here and record the counts for these rows
    return new DecisionNode(results: UniqueCounts(rows));
}