/// <summary>
/// Builds a throw-away tree from the rows returned by <c>SpoofData(100, 3)</c>.
/// The resulting tree is not returned or stored; this is a manual testing aid.
/// </summary>
private static void BuildTmpTree()
{
    List<DataRow> spoofedRows = SpoofData(100, 3);
    BooleanTreeNode treeRoot = new BooleanTreeNode("Root", spoofedRows);

    // Nothing to learn from an empty data set.
    if (spoofedRows.Count <= 0)
    {
        return;
    }

    // The attribute names of the first row seed the set of candidate split attributes.
    HashSet<string> candidateAttributes = new HashSet<string>(spoofedRows[0].Attributes);
    treeRoot.BuildTree(candidateAttributes);
}
/// <summary>
/// Program entry point: reads a training and a test data file, learns a tree
/// from the training rows, prints the tree, and reports accuracy on both sets.
/// </summary>
/// <param name="args">
/// Exactly two arguments select the training file and the test file (in that
/// order); any other argument count falls back to the packaged data files.
/// </param>
static void Main(string[] args)
{
    // testing entropy function
    //Console.WriteLine("Entropy: {0}", MathHelpers.Entropy(new double[] { 1.0/4.0, 3.0/4.0 }));

    //// testing IG function
    //double pE = 1;
    //double rE = MathHelpers.Entropy(new double[] { 4.0 / 7.0, 3.0 / 7.0 });
    //double LE = MathHelpers.Entropy(new double[] { 2.0 / 3.0, 1.0 / 3.0 });
    //double lD = 3.0 / 10.0;
    //double rD = 7.0 / 10.0;
    //Console.WriteLine("IG: {0}", MathHelpers.InformationGained(pE, new double[] { LE, rE }, new double[] { lD, rD }));

    bool pathsSupplied = args.Length == 2;
    if (!pathsSupplied)
    {
        Console.WriteLine("Invalid input... Reading packaged test data...");
    }

    string trainPath = pathsSupplied ? args[0] : @"data\train-win.dat";
    string testPath = pathsSupplied ? args[1] : @"data\test-win.dat";

    List<DataRow> trainData = DataRow.ReadFile(trainPath);
    List<DataRow> testData = DataRow.ReadFile(testPath);

    // The tree works on its own shallow copy of the full training list.
    List<DataRow> prunedData = trainData.GetRange(0, trainData.Count);
    BooleanTreeNode root = new BooleanTreeNode("Root", prunedData);

    if (trainData.Count <= 0)
    {
        Console.WriteLine("Uh oh...");
    }
    else
    {
        HashSet<string> attributeNames = new HashSet<string>(trainData[0].Attributes);

        // The reader counts the class column as an attribute; it must not be split on.
        attributeNames.Remove("class");

        root.BuildTree(attributeNames);
        root.PrintTree();

        PrintCorrectness(prunedData, root);
        PrintCorrectness(testData, root, "test");
    }

    //BuildTmpTree();

    Console.WriteLine("Press any key to exit...");
    Console.ReadKey();
}
/// <summary>
/// Computes the information gained by splitting <paramref name="parent"/> into
/// <paramref name="left"/> and <paramref name="right"/>: the parent's entropy
/// minus the children's entropies, each weighted by the child's share of the
/// parent's rows.
/// </summary>
/// <param name="parent">Node being split; its subset size is the weighting denominator.</param>
/// <param name="left">First child of the split.</param>
/// <param name="right">Second child of the split.</param>
/// <returns>Parent entropy minus the weighted sum of the child entropies.</returns>
public static double InformationGained(BooleanTreeNode parent, BooleanTreeNode left, BooleanTreeNode right)
{
    int parentSize = parent.Subset.Count;

    double leftEntropy = left.GetEntropy();
    double rightEntropy = right.GetEntropy();

    // Fraction of the parent's rows that ended up in each child.
    double leftWeight = (double)left.Subset.Count / (double)parentSize;
    double rightWeight = (double)right.Subset.Count / (double)parentSize;

    double weightedChildEntropy = 0.0;
    weightedChildEntropy += leftEntropy * leftWeight;
    weightedChildEntropy += rightEntropy * rightWeight;

    double parentEntropy = parent.GetEntropy();
    return parentEntropy - weightedChildEntropy;
}
/// <summary>
/// Recursively grows the tree below this node: picks the remaining attribute
/// with the highest information gain, splits this node's subset on it into a
/// 0-branch and a 1-branch, and builds each branch in turn. A node becomes a
/// leaf (its Classification is set) when its subset is pure, when no
/// attributes remain, or when no attribute produces a usable split.
/// </summary>
/// <param name="remainingAttributes">
/// Attribute names still available for splitting at this node. The set is
/// only read; it is not modified by this call.
/// </param>
public void BuildTree(HashSet<string> remainingAttributes)
{
    /*
     * LearnTree(X, Y)
     *  - Input:
     *      * Set X of R training vectors, each containing the values (x1,..,xM)
     *        of M attributes (X1,..,XM)
     *      * A vector Y of R elements, where yj = class of the jth datapoint
     *  - If all the datapoints in X have the same class value y:
     *      * Return a leaf node that predicts y as output
     *  - If all the datapoints in X have the same attribute value (x1,..,xM):
     *      * Return a leaf node that predicts the majority of the class values
     *        in Y as output
     *  - Try all the possible attributes Xj and choose the one, j*, for which
     *    IG(Y|Xj) is maximum
     *  - For every possible value v of Xj*:
     *      * Xv, Yv = set of datapoints for which xj* = v and corresponding classes
     *      * Child <= LearnTree(Xv, Yv)
     */

    int pureClass = int.MinValue;
    if (IsSubsetPure(Subset, out pureClass))
    {
        // Every row agrees on the class: this node is a leaf predicting it.
        Classification = pureClass;
        return;
    }

    if (remainingAttributes.Count == 0)
    {
        // Nothing left to split on: predict the majority class of the subset.
        Classification = MajorClass(Subset);
        return;
    }

    string maxAttr = null;
    // Seed with negative infinity so any comparable gain wins the first comparison.
    double maxIG = double.NegativeInfinity;
    Dictionary<int, List<DataRow>> splitByAttr = null;

    // Find the attribute whose split yields the most information gain.
    foreach (string attr in remainingAttributes)
    {
        // In a binary tree node there are only two buckets when splitting by
        // an attribute: value 0 and value 1.
        var tmp = DataRow.GetDistByAttr(Subset, attr);

        // Candidate children are labelled with the attribute under evaluation.
        BooleanTreeNode right = new BooleanTreeNode(attr, tmp[0]); // == 0
        BooleanTreeNode left = new BooleanTreeNode(attr, tmp[1]);  // == 1

        double ig = MathHelpers.InformationGained(this, left, right);
        if (Program.Debug)
        {
            Console.WriteLine("{0} Attribute has an information gain of {1}", attr, ig);
        }

        // A NaN gain never compares greater, so such attributes are skipped.
        if (ig > maxIG)
        {
            maxAttr = attr;
            maxIG = ig;
            splitByAttr = tmp;
        }
    }

    if (maxAttr == null || splitByAttr == null)
    {
        // No attribute produced a comparable gain; fall back to a majority
        // leaf instead of dereferencing a null split.
        Classification = MajorClass(Subset);
        return;
    }

    if (Program.Debug)
    {
        Console.WriteLine("Picking attribute {0} for an information gained of {1}", maxAttr, maxIG);
    }

    // The winning split is re-wrapped in fresh nodes that carry their match value.
    string splitAttr = maxAttr;
    BooleanTreeNode child1 = new BooleanTreeNode(splitAttr, splitByAttr[0]) { MatchValue = 0 };
    BooleanTreeNode child2 = new BooleanTreeNode(splitAttr, splitByAttr[1]) { MatchValue = 1 };

    // Each child is registered with the predicate that routes a row to it.
    AddChild(child1, (row) => { return row.RetrieveValueAsInt(splitAttr) == child1.MatchValue; }); // == 0
    AddChild(child2, (row) => { return row.RetrieveValueAsInt(splitAttr) == child2.MatchValue; }); // == 1

    // Work on a copy so the caller's attribute set is left untouched.
    HashSet<string> childAttributes = new HashSet<string>(remainingAttributes);
    childAttributes.Remove(splitAttr);

    child1.BuildTree(new HashSet<string>(childAttributes));
    child2.BuildTree(new HashSet<string>(childAttributes));
}
/// <summary>
/// Registers <paramref name="node"/> as a child of this node together with the
/// predicate associated with it.
/// </summary>
/// <param name="node">Child node to add.</param>
/// <param name="resultFunc">Predicate over a data row stored alongside the child.</param>
public void AddChild(BooleanTreeNode node, Func<DataRow, bool> resultFunc)
{
    this._children.Add(node, resultFunc);
}
/// <summary>
/// Classifies every row in <paramref name="data"/> with <paramref name="tree"/>,
/// compares the result with the row's own classification, and writes the
/// percentage of matches to the console.
/// </summary>
/// <param name="data">Rows to evaluate.</param>
/// <param name="tree">Tree used to classify each row.</param>
/// <param name="set">Label for the data set, used only in the printed message.</param>
private static void PrintCorrectness(List<DataRow> data, BooleanTreeNode tree, string set = "training")
{
    int correct = 0;
    foreach (DataRow row in data)
    {
        int realClassification = row.RetrieveClassification();
        int treeClassification = tree.Classify(row);
        if (realClassification == treeClassification)
        {
            correct++;
        }
    }

    // An empty set would otherwise divide 0 by 0 and print "NaN%"; report 0.0% instead.
    double accuracy = data.Count == 0
        ? 0.0
        : ((double)correct / (double)data.Count) * 100;

    Console.WriteLine("Accuracy on {0} set ({1} instances): {2:0.0}%", set, data.Count, accuracy);
}