Пример #1
0
        /// <summary>
        /// Trains <c>parameters.NumTrees</c> trees concurrently on <paramref name="data"/> and
        /// records the trained trees, the feature count and the mean per-tree accuracy.
        /// </summary>
        /// <param name="data">Learning set handed to every tree; its first point determines <c>NumFeatures</c>.</param>
        /// <param name="parameters">Hyperparameters; <c>Seed</c> and <c>NumTrees</c> are read here, the rest is forwarded to each tree.</param>
        public RandomForest(LearningSet data, Hyperparameters parameters)
        {
            Random seedSource = new Random(parameters.Seed);
            List<Task<Tree>> pending = new List<Task<Tree>>();

            for (int treeIndex = 0; treeIndex < parameters.NumTrees; treeIndex++)
            {
                // Seeds are drawn one after another on this thread, so every tree
                // receives its own generator and the shared one is never used concurrently.
                int treeSeed = seedSource.Next();
                Random treeRandom = new Random(treeSeed);
                Task<Tree> training = Task.Run(() => new Tree(data, treeRandom, parameters));
                pending.Add(training);
            }

            NumFeatures = data.First().Features.Length;

            // Block until every tree has finished training, then collect the results in submission order.
            Task<Tree>[] running = pending.ToArray();
            Task.WaitAll(running);
            Trees = Array.ConvertAll(running, finished => finished.Result);
            Accuracy = Trees.Select(tree => tree.Accuracy).Average();
        }
Пример #2
0
        /// <summary>
        /// Trains a single tree: each point of <paramref name="data"/> is randomly assigned either to the
        /// training bag or to the out-of-bag set, the root node is built from the bag, and the tree's
        /// accuracy is measured on the out-of-bag points.
        /// </summary>
        /// <param name="data">Points to split into bag and out-of-bag sets.</param>
        /// <param name="random">Random source used for the split and then passed on to the root node.</param>
        /// <param name="parameters">Hyperparameters; <c>OutOfBag</c> is the per-point probability of being held out.</param>
        public Tree(LearningSet data, Random random, Hyperparameters parameters)
        {
            List<DataPoint> bag = new List<DataPoint>();
            List<DataPoint> outOfBag = new List<DataPoint>();

            foreach (DataPoint point in data)
            {
                // One draw per point: values below the OutOfBag threshold are held out for evaluation.
                if (random.NextDouble() < parameters.OutOfBag)
                {
                    outOfBag.Add(point);
                }
                else
                {
                    bag.Add(point);
                }
            }
            Root = new Node(bag, random, parameters);

            // Enumerable.Average throws on an empty sequence. When nothing was held out
            // (e.g. OutOfBag is 0, or a small data set by chance) there is no accuracy
            // estimate, so report NaN instead of aborting the whole training run.
            Accuracy = outOfBag.Count == 0
                ? double.NaN
                : outOfBag.Average(d => Root.Classify(d) == d.Classification ? 1.0 : 0.0);
        }
Пример #3
0
        /// <summary>
        /// Command-line entry point. Parses hyperparameter options, then either trains a forest from a
        /// <c>.csv</c> file (optionally saving it to a <c>.bin</c>/<c>.xml</c> file) or loads a previously
        /// saved forest. If numeric arguments were given they are classified as one feature vector;
        /// otherwise the forest's accuracy is printed.
        /// </summary>
        /// <param name="args">
        /// Options <c>--num-trees</c>, <c>--max-features</c>, <c>--min-features</c>, <c>--max-depth</c>,
        /// <c>--seed</c>, <c>--oob</c> (each followed by a value), at most one <c>.csv</c> path, at most one
        /// <c>.bin</c>/<c>.xml</c> path, and any number of numeric feature values.
        /// </param>
        public static void Main(string[] args)
        {
            Hyperparameters parameters = new Hyperparameters {
                NumTrees    = 10,
                MaxFeatures = -1,
                MinFeatures = 1,
                MaxDepth    = 10,
                Seed        = (int)(DateTime.Now.Ticks % int.MaxValue),
                OutOfBag    = 0.3
            };
            string trainingFile = null;
            string serializedFile = null;
            List<double> testData = new List<double>();

            // NOTE(review): InvalidArgs() is presumably terminating (prints usage and exits) — confirm;
            // the code below continues as the original did if it ever returns.
            for (int i = 0; i < args.Length; ++i)
            {
                switch (args[i])
                {
                    case "--num-trees":
                        if (!int.TryParse(ReadOptionValue(args, ref i), out parameters.NumTrees))
                        {
                            InvalidArgs();
                        }
                        break;

                    case "--max-features":
                        if (!int.TryParse(ReadOptionValue(args, ref i), out parameters.MaxFeatures))
                        {
                            InvalidArgs();
                        }
                        break;

                    case "--min-features":
                        if (!int.TryParse(ReadOptionValue(args, ref i), out parameters.MinFeatures))
                        {
                            InvalidArgs();
                        }
                        break;

                    case "--max-depth":
                        if (!int.TryParse(ReadOptionValue(args, ref i), out parameters.MaxDepth))
                        {
                            InvalidArgs();
                        }
                        break;

                    case "--seed":
                        if (!int.TryParse(ReadOptionValue(args, ref i), out parameters.Seed))
                        {
                            InvalidArgs();
                        }
                        break;

                    case "--oob":
                        if (!double.TryParse(ReadOptionValue(args, ref i), out parameters.OutOfBag))
                        {
                            InvalidArgs();
                        }
                        break;

                    default:
                        double val;
                        if (args[i].EndsWith(".csv"))
                        {
                            // Only one training file is accepted.
                            if (trainingFile == null)
                            {
                                trainingFile = args[i];
                            }
                            else
                            {
                                InvalidArgs();
                            }
                        }
                        else if (args[i].EndsWith(".bin") || args[i].EndsWith(".xml"))
                        {
                            // Only one serialized-model file is accepted.
                            if (serializedFile == null)
                            {
                                serializedFile = args[i];
                            }
                            else
                            {
                                InvalidArgs();
                            }
                        }
                        else if (double.TryParse(args[i], out val))
                        {
                            // Bare numbers accumulate into the feature vector to classify.
                            testData.Add(val);
                        }
                        else
                        {
                            InvalidArgs();
                        }
                        break;
                }
            }

            RandomForest forest = null;

            if (trainingFile == null)
            {
                // No training data: the model must come from an existing serialized file.
                if (serializedFile == null || !File.Exists(serializedFile))
                {
                    Console.WriteLine("No model source");
                    InvalidArgs();
                }
                else
                {
                    forest = LoadForest(serializedFile);
                }
            }
            else
            {
                LearningSet learningSet = new LearningSet(trainingFile);
                if (parameters.MaxFeatures == -1)
                {
                    // -1 means "not specified": default to the square root of the feature count.
                    parameters.MaxFeatures = (int)Math.Sqrt(learningSet.First().Features.Length);
                }
                forest = new RandomForest(learningSet, parameters);
                if (serializedFile != null)
                {
                    SaveForest(forest, serializedFile);
                }
            }

            if (testData.Count > 0)
            {
                if (testData.Count == forest.NumFeatures)
                {
                    Console.WriteLine(forest.Classify(new DataPoint {
                        Features = testData.ToArray()
                    }));
                }
                else
                {
                    Console.WriteLine("Invalid number of features");
                    InvalidArgs();
                }
            }
            else
            {
                Console.WriteLine("Accuracy: {0}%", forest.Accuracy * 100.0);
            }
        }

        /// <summary>
        /// Returns the argument following position <paramref name="i"/> and advances <paramref name="i"/>
        /// onto it; calls <c>InvalidArgs()</c> first when no such argument exists.
        /// </summary>
        private static string ReadOptionValue(string[] args, ref int i)
        {
            if (i + 1 >= args.Length)
            {
                InvalidArgs();
            }
            return args[++i];
        }

        /// <summary>
        /// Deserializes a forest from <paramref name="path"/>: XML when the path ends in ".xml", binary otherwise.
        /// </summary>
        private static RandomForest LoadForest(string path)
        {
            if (path.EndsWith(".xml"))
            {
                XmlSerializer serializer = new XmlSerializerFactory().CreateSerializer(typeof(RandomForest));
                using (Stream stream = new FileStream(path, FileMode.Open, FileAccess.Read))
                {
                    return (RandomForest)serializer.Deserialize(stream);
                }
            }

            // SECURITY: BinaryFormatter.Deserialize is unsafe on untrusted input (it can execute
            // arbitrary code) and the type is obsolete in current .NET. Only load .bin files
            // produced by this program; consider migrating the binary format.
            BinaryFormatter formatter = new BinaryFormatter();
            using (Stream stream = new FileStream(path, FileMode.Open, FileAccess.Read))
            {
                return (RandomForest)formatter.Deserialize(stream);
            }
        }

        /// <summary>
        /// Serializes <paramref name="forest"/> to <paramref name="path"/>, overwriting any existing file:
        /// XML when the path ends in ".xml", binary otherwise.
        /// </summary>
        private static void SaveForest(RandomForest forest, string path)
        {
            if (path.EndsWith(".xml"))
            {
                // The serializer is created before the file is opened so a serializer failure
                // does not truncate an existing file.
                XmlSerializer serializer = new XmlSerializerFactory().CreateSerializer(typeof(RandomForest));
                using (Stream stream = new FileStream(path, FileMode.Create, FileAccess.Write))
                {
                    serializer.Serialize(stream, forest);
                }
            }
            else
            {
                BinaryFormatter formatter = new BinaryFormatter();
                using (Stream stream = new FileStream(path, FileMode.Create, FileAccess.Write))
                {
                    formatter.Serialize(stream, forest);
                }
            }
        }
Пример #4
0
        /// <summary>
        /// Builds a node: samples a random subset of feature indices, searches every
        /// (feature, data-point value) pair for the split with the lowest Gini score, and then either
        /// records leaf classes or builds the two child nodes in parallel.
        /// </summary>
        /// <param name="dataPoints">Training points reaching this node; must contain at least one point.</param>
        /// <param name="random">
        /// Random source used on the calling thread to pick candidate features and to seed a private
        /// generator for each child node.
        /// </param>
        /// <param name="parameters">
        /// <c>MaxFeatures</c> is the number of distinct features to sample (capped at the number available),
        /// <c>MaxDepth</c> is the depth at which splitting stops, and a side is split further only when it
        /// holds more than <c>MinFeatures</c> points.
        /// </param>
        /// <param name="depth">Depth of this node; children are built with <c>depth + 1</c>.</param>
        public Node(IEnumerable<DataPoint> dataPoints, Random random, Hyperparameters parameters, int depth = 1)
        {
            // Materialize once: the sequence is enumerated many times below.
            DataPoint[] points = dataPoints.ToArray();

            // Cap the request at the number of features that exist; without the cap the
            // sampling loop can never collect enough distinct indices and spins forever.
            int featureCount = points.First().Features.Length;
            int wanted = Math.Min(parameters.MaxFeatures, featureCount);

            List<int> features = new List<int>();
            while (features.Count < wanted)
            {
                int feature = random.Next(featureCount);
                if (!features.Contains(feature))
                {
                    features.Add(feature);
                }
            }

            double[] classes = points.GroupBy(d => d.Classification).Select(g => g.Key).ToArray();
            double bestGini = double.MaxValue;

            DataPoint[] bestLess = new DataPoint[0];
            DataPoint[] bestGreater = new DataPoint[0];
            foreach (int feature in features)
            {
                foreach (DataPoint candidate in points)
                {
                    // Each point's own feature value is tried as the threshold.
                    double threshold = candidate.Features[feature];
                    DataPoint[] less = points.Where(p => p.Features[feature] < threshold).ToArray();
                    DataPoint[] greater = points.Except(less).ToArray();

                    double gini = 0;
                    foreach (double classification in classes)
                    {
                        foreach (DataPoint[] side in new[] { less, greater })
                        {
                            // An empty side gives 0/0 = NaN; NaN never compares below bestGini,
                            // so splits that leave one side empty are never selected.
                            double amount = (double)side.Count(d => d.Classification == classification) / (double)side.Length;
                            gini += amount * (1.0 - amount);
                        }
                    }
                    if (gini < bestGini)
                    {
                        Feature = feature;
                        Threshold = threshold;
                        bestLess = less;
                        bestGreater = greater;
                        bestGini = gini;
                    }
                }
            }

            if (bestLess.Length == 0 || bestGreater.Length == 0)
            {
                // No usable split was found: both sides predict the majority class of this node.
                LessClass = GreaterClass = MajorityClass(points);
            }
            else if (depth >= parameters.MaxDepth)
            {
                // Depth limit reached: each side becomes a leaf with its own majority class.
                LessClass = MajorityClass(bestLess);
                GreaterClass = MajorityClass(bestGreater);
            }
            else
            {
                // System.Random is not thread-safe, so the two concurrent child builds must not
                // share this node's generator. Derive one private generator per child here, on
                // the current thread, which also keeps results reproducible for a given seed.
                Random lessRandom = new Random(random.Next());
                Random greaterRandom = new Random(random.Next());

                Task less = Task.Run(() => {
                    if (bestLess.Length > parameters.MinFeatures)
                    {
                        Less = new Node(bestLess, lessRandom, parameters, depth + 1);
                    }
                    else
                    {
                        LessClass = MajorityClass(bestLess);
                    }
                });
                Task greater = Task.Run(() => {
                    if (bestGreater.Length > parameters.MinFeatures)
                    {
                        Greater = new Node(bestGreater, greaterRandom, parameters, depth + 1);
                    }
                    else
                    {
                        GreaterClass = MajorityClass(bestGreater);
                    }
                });
                Task.WaitAll(less, greater);
            }
        }

        /// <summary>
        /// Returns the classification that occurs most often in <paramref name="points"/>;
        /// throws <see cref="InvalidOperationException"/> when the sequence is empty.
        /// </summary>
        private static double MajorityClass(IEnumerable<DataPoint> points)
        {
            return points.GroupBy(d => d.Classification).OrderByDescending(g => g.Count()).First().Key;
        }