示例#1
0
文件: Program.cs 项目: mlohstroh/ml
        /// <summary>
        /// Builds a throwaway tree from spoofed rows, using the attribute list of the
        /// first generated row as the set of attributes available for splitting.
        /// The resulting tree is not returned or printed.
        /// </summary>
        private static void BuildTmpTree()
        {
            List<DataRow> spoofedRows = SpoofData(100, 3);
            var treeRoot = new BooleanTreeNode("Root", spoofedRows);

            // Without at least one row there is no attribute list to read.
            if (spoofedRows.Count == 0)
            {
                return;
            }

            var splitCandidates = new HashSet<string>(spoofedRows[0].Attributes);
            treeRoot.BuildTree(splitCandidates);
        }
示例#2
0
文件: Program.cs 项目: mlohstroh/ml
        /// <summary>
        /// Entry point: reads a training file and a test file, builds a tree from the
        /// training rows, prints it, and reports accuracy on both data sets.
        /// </summary>
        /// <param name="args">
        /// Exactly two values are honoured: the training file path followed by the test
        /// file path. With any other argument count the packaged data files are used.
        /// </param>
        static void Main(string[] args)
        {
            bool pathsSupplied = args.Length == 2;
            if (!pathsSupplied)
            {
                Console.WriteLine("Invalid input... Reading packaged test data...");
            }

            string trainPath = pathsSupplied ? args[0] : @"data\train-win.dat";
            string testPath = pathsSupplied ? args[1] : @"data\test-win.dat";

            List<DataRow> trainData = DataRow.ReadFile(trainPath);
            List<DataRow> testData = DataRow.ReadFile(testPath);

            // The tree is built from a copy spanning the whole training list.
            List<DataRow> prunedData = trainData.GetRange(0, trainData.Count);
            BooleanTreeNode root = new BooleanTreeNode("Root", prunedData);

            if (trainData.Count == 0)
            {
                Console.WriteLine("Uh oh...");
            }
            else
            {
                // The reader reports the class column as an attribute, so drop it
                // before handing the attribute names to the tree builder.
                var splitCandidates = new HashSet<string>(trainData[0].Attributes);
                splitCandidates.Remove("class");

                root.BuildTree(splitCandidates);
                root.PrintTree();

                PrintCorrectness(prunedData, root);
                PrintCorrectness(testData, root, "test");
            }

            Console.WriteLine("Press any key to exit...");
            Console.ReadKey();
        }
示例#3
0
        /// <summary>
        /// Computes the parent's entropy minus the size-weighted entropies of the two
        /// child nodes.
        /// </summary>
        /// <param name="parent">Node whose subset size is used as the weighting denominator.</param>
        /// <param name="left">First child node.</param>
        /// <param name="right">Second child node.</param>
        /// <returns>
        /// <c>parent.GetEntropy()</c> minus the sum of each child's entropy multiplied by
        /// its share of the parent's rows. When the parent subset is empty the weights
        /// involve a division by zero in floating point, so the result is not a finite number.
        /// </returns>
        public static double InformationGained(BooleanTreeNode parent, BooleanTreeNode left, BooleanTreeNode right)
        {
            double parentSize = parent.Subset.Count;

            double leftEntropy = left.GetEntropy();
            double rightEntropy = right.GetEntropy();

            // Each child's weight is its fraction of the parent's rows.
            double leftWeight = left.Subset.Count / parentSize;
            double rightWeight = right.Subset.Count / parentSize;

            double weightedChildEntropy = 0.0;
            weightedChildEntropy += leftEntropy * leftWeight;
            weightedChildEntropy += rightEntropy * rightWeight;

            return parent.GetEntropy() - weightedChildEntropy;
        }
示例#4
0
        /// <summary>
        /// Recursively grows the tree below this node.
        /// <para>
        /// Outline of the procedure:
        /// 1. If every row in <c>Subset</c> has the same class, this node becomes a leaf
        ///    predicting that class.
        /// 2. If no attributes remain, this node becomes a leaf predicting the value
        ///    returned by <c>MajorClass(Subset)</c>.
        /// 3. Otherwise the attribute with the highest information gain is chosen, one
        ///    child is created for value 0 and one for value 1, and each child is built
        ///    recursively with the chosen attribute removed.
        /// </para>
        /// </summary>
        /// <param name="remainingAttributes">
        /// Attribute names still available for splitting. The chosen attribute is removed
        /// from this set; each child receives its own copy of what is left.
        /// </param>
        public void BuildTree(HashSet<string> remainingAttributes)
        {
            // Base case: a pure subset needs no further splitting.
            int pureClass;
            if (IsSubsetPure(Subset, out pureClass))
            {
                Classification = pureClass;
                return;
            }

            // Base case: nothing left to split on, so fall back to the majority class.
            if (remainingAttributes.Count == 0)
            {
                Classification = MajorClass(Subset);
                return;
            }

            string maxAttr = null;
            double maxIG = double.NegativeInfinity;
            Dictionary<int, List<DataRow>> splitByAttr = null;

            // Evaluate every remaining attribute and keep the one with the largest gain.
            foreach (string attr in remainingAttributes)
            {
                // In a binary tree node a split yields two groups, keyed 0 and 1.
                var candidateSplit = DataRow.GetDistByAttr(Subset, attr);

                // Throwaway nodes used only to measure this candidate split; they are
                // labelled with the attribute being evaluated.
                BooleanTreeNode zeroBranch = new BooleanTreeNode(attr, candidateSplit[0]);
                BooleanTreeNode oneBranch = new BooleanTreeNode(attr, candidateSplit[1]);

                double ig = MathHelpers.InformationGained(this, oneBranch, zeroBranch);

                if (Program.Debug)
                {
                    Console.WriteLine("{0} Attribute has an information gain of {1}", attr, ig);
                }

                // A NaN gain never satisfies this comparison, so such candidates are skipped.
                if (ig > maxIG)
                {
                    maxAttr = attr;
                    maxIG = ig;
                    splitByAttr = candidateSplit;
                }
            }

            if (maxAttr == null)
            {
                // No candidate produced a comparable gain (for example every gain was
                // NaN). Splitting is impossible, so treat this node as a majority leaf
                // instead of dereferencing a null split below.
                Classification = MajorClass(Subset);
                return;
            }

            if (Program.Debug)
            {
                Console.WriteLine("Picking attribute {0} for an information gained of {1}", maxAttr, maxIG);
            }

            BooleanTreeNode child1 = new BooleanTreeNode(maxAttr, splitByAttr[0])
            {
                MatchValue = 0
            };
            BooleanTreeNode child2 = new BooleanTreeNode(maxAttr, splitByAttr[1])
            {
                MatchValue = 1
            };

            // Each child is registered with a predicate that tests whether a row's value
            // for the chosen attribute equals that child's MatchValue.
            AddChild(child1, row => row.RetrieveValueAsInt(maxAttr) == child1.MatchValue);
            AddChild(child2, row => row.RetrieveValueAsInt(maxAttr) == child2.MatchValue);

            remainingAttributes.Remove(maxAttr);

            // Separate copies keep one subtree's removals from affecting the other.
            child1.BuildTree(new HashSet<string>(remainingAttributes));
            child2.BuildTree(new HashSet<string>(remainingAttributes));
        }
示例#5
0
 /// <summary>
 /// Registers <paramref name="node"/> in this node's child collection together with
 /// the predicate supplied for it.
 /// </summary>
 /// <param name="node">Child node to register.</param>
 /// <param name="resultFunc">
 /// Predicate over a data row stored alongside the child; presumably consulted when
 /// routing rows to children (confirm against the classification code).
 /// </param>
 public void AddChild(BooleanTreeNode node, Func<DataRow, bool> resultFunc)
 {
     this._children.Add(node, resultFunc);
 }
示例#6
0
文件: Program.cs 项目: mlohstroh/ml
        /// <summary>
        /// Classifies every row with the given tree, compares the result with the row's
        /// own classification, and writes the percentage of matches to the console.
        /// </summary>
        /// <param name="data">Rows to evaluate.</param>
        /// <param name="tree">Tree used to classify each row.</param>
        /// <param name="set">Label for the data set shown in the output line.</param>
        private static void PrintCorrectness(List<DataRow> data, BooleanTreeNode tree, string set = "training")
        {
            int total = data.Count;
            int hits = 0;

            foreach (DataRow row in data)
            {
                int expected = row.RetrieveClassification();
                int predicted = tree.Classify(row);

                if (expected == predicted)
                {
                    hits++;
                }
            }

            // With zero rows this is 0/0 in floating point, i.e. NaN.
            double accuracy = (hits / (double)total) * 100;

            Console.WriteLine("Accuracy on {0} set ({1} instances):   {2:0.0}%", set, total, accuracy);
        }