public RandomForest(LearningSet data, Hyperparameters parameters)
{
    // Build each tree on its own task; every tree gets its own Random, seeded from a
    // single generator that is itself seeded from parameters.Seed.
    List<Task<Tree>> tasks = new List<Task<Tree>>();
    Random rand = new Random(parameters.Seed);
    for (int i = 0; i < parameters.NumTrees; ++i)
    {
        Random treeRandom = new Random(rand.Next());
        tasks.Add(Task.Run(() => new Tree(data, treeRandom, parameters)));
    }

    NumFeatures = data.First().Features.Length;
    Task.WaitAll(tasks.ToArray());
    Trees = tasks.Select(t => t.Result).ToArray();

    // The forest's accuracy estimate is the mean of the per-tree out-of-bag accuracies.
    Accuracy = Trees.Average(t => t.Accuracy);
}
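// Main (below) calls forest.Classify, which is not shown above. A minimal sketch, assuming
// the forest predicts by majority vote over its trees and that Tree exposes its Root node;
// the author's actual aggregation rule may differ:
public double Classify(DataPoint dataPoint)
{
    // Each tree votes with its root's prediction; the most common class wins.
    return Trees
        .Select(t => t.Root.Classify(dataPoint))
        .GroupBy(c => c)
        .OrderByDescending(g => g.Count())
        .First()
        .Key;
}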
public Tree(LearningSet data, Random random, Hyperparameters parameters)
{
    // Split the data into an in-bag training set and a held-out (out-of-bag) set;
    // each point is held out with probability parameters.OutOfBag.
    List<DataPoint> bag = new List<DataPoint>();
    List<DataPoint> outOfBag = new List<DataPoint>();
    foreach (DataPoint point in data)
    {
        if (random.NextDouble() < parameters.OutOfBag)
        {
            outOfBag.Add(point);
        }
        else
        {
            bag.Add(point);
        }
    }

    Root = new Node(bag, random, parameters);

    // Estimate this tree's accuracy on the points it never trained on.
    Accuracy = outOfBag.Average(d => Root.Classify(d) == d.Classification ? 1.0 : 0.0);
}
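// The code above and Main below assume roughly these data shapes: DataPoint carries a
// double[] of features plus a double class label, and LearningSet is an enumerable of
// points loaded from a CSV file. A minimal sketch inferred from that usage; the author's
// actual definitions and CSV format are assumptions here:
public class DataPoint
{
    public double[] Features;
    public double Classification;
}

public class LearningSet : List<DataPoint>
{
    public LearningSet() { }

    // Assumes each CSV row is "feature1,feature2,...,classification" with no header row.
    public LearningSet(string path)
    {
        foreach (string line in File.ReadLines(path))
        {
            double[] values = line.Split(',').Select(double.Parse).ToArray();
            Add(new DataPoint
            {
                Features = values.Take(values.Length - 1).ToArray(),
                Classification = values[values.Length - 1]
            });
        }
    }
}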
public static void Main(string[] args)
{
    // Defaults; MaxFeatures = -1 means "use sqrt(feature count) once the data is loaded".
    Hyperparameters parameters = new Hyperparameters
    {
        NumTrees = 10,
        MaxFeatures = -1,
        MinFeatures = 1,
        MaxDepth = 10,
        Seed = (int)(DateTime.Now.Ticks % int.MaxValue),
        OutOfBag = 0.3
    };

    string trainingFile = null;
    string serializedFile = null;
    List<double> testData = new List<double>();

    for (int i = 0; i < args.Length; ++i)
    {
        switch (args[i])
        {
            case "--num-trees":
                if (i + 1 >= args.Length) { InvalidArgs(); }
                if (!int.TryParse(args[++i], out parameters.NumTrees)) { InvalidArgs(); }
                break;
            case "--max-features":
                if (i + 1 >= args.Length) { InvalidArgs(); }
                if (!int.TryParse(args[++i], out parameters.MaxFeatures)) { InvalidArgs(); }
                break;
            case "--min-features":
                if (i + 1 >= args.Length) { InvalidArgs(); }
                if (!int.TryParse(args[++i], out parameters.MinFeatures)) { InvalidArgs(); }
                break;
            case "--max-depth":
                if (i + 1 >= args.Length) { InvalidArgs(); }
                if (!int.TryParse(args[++i], out parameters.MaxDepth)) { InvalidArgs(); }
                break;
            case "--seed":
                if (i + 1 >= args.Length) { InvalidArgs(); }
                if (!int.TryParse(args[++i], out parameters.Seed)) { InvalidArgs(); }
                break;
            case "--oob":
                if (i + 1 >= args.Length) { InvalidArgs(); }
                if (!double.TryParse(args[++i], out parameters.OutOfBag)) { InvalidArgs(); }
                break;
            default:
                // Positional arguments: a .csv training file, a .bin/.xml model file,
                // or a numeric feature value of the point to classify.
                double val;
                if (args[i].EndsWith(".csv"))
                {
                    if (trainingFile == null) { trainingFile = args[i]; } else { InvalidArgs(); }
                }
                else if (args[i].EndsWith(".bin") || args[i].EndsWith(".xml"))
                {
                    if (serializedFile == null) { serializedFile = args[i]; } else { InvalidArgs(); }
                }
                else if (double.TryParse(args[i], out val))
                {
                    testData.Add(val);
                }
                else
                {
                    InvalidArgs();
                }
                break;
        }
    }

    RandomForest forest = null;
    if (trainingFile == null)
    {
        // No training data: the model must come from an existing serialized file.
        if (serializedFile == null || !File.Exists(serializedFile))
        {
            Console.WriteLine("No model source");
            InvalidArgs();
        }
        else
        {
            if (serializedFile.EndsWith(".xml"))
            {
                XmlSerializer serializer = new XmlSerializerFactory().CreateSerializer(typeof(RandomForest));
                using (Stream stream = new FileStream(serializedFile, FileMode.Open, FileAccess.Read))
                {
                    forest = (RandomForest)serializer.Deserialize(stream);
                }
            }
            else
            {
                BinaryFormatter serializer = new BinaryFormatter();
                using (Stream stream = new FileStream(serializedFile, FileMode.Open, FileAccess.Read))
                {
                    forest = (RandomForest)serializer.Deserialize(stream);
                }
            }
        }
    }
    else
    {
        // Train a new forest and, if a model file was also given, serialize it for later runs.
        LearningSet learningSet = new LearningSet(trainingFile);
        if (parameters.MaxFeatures == -1)
        {
            parameters.MaxFeatures = (int)Math.Sqrt(learningSet.First().Features.Length);
        }
        forest = new RandomForest(learningSet, parameters);
        if (serializedFile != null)
        {
            if (serializedFile.EndsWith(".xml"))
            {
                XmlSerializer serializer = new XmlSerializerFactory().CreateSerializer(typeof(RandomForest));
                using (Stream stream = new FileStream(serializedFile, FileMode.Create, FileAccess.Write))
                {
                    serializer.Serialize(stream, forest);
                }
            }
            else
            {
                BinaryFormatter serializer = new BinaryFormatter();
                using (Stream stream = new FileStream(serializedFile, FileMode.Create, FileAccess.Write))
                {
                    serializer.Serialize(stream, forest);
                }
            }
        }
    }

    if (testData.Count > 0)
    {
        // Classify a single point supplied as numeric command-line arguments.
        if (testData.Count == forest.NumFeatures)
        {
            Console.WriteLine(forest.Classify(new DataPoint { Features = testData.ToArray() }));
        }
        else
        {
            Console.WriteLine("Invalid number of features");
            InvalidArgs();
        }
    }
    else
    {
        Console.WriteLine("Accuracy: {0}%", forest.Accuracy * 100.0);
    }
}
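// Main passes Hyperparameters members directly as "out" arguments to TryParse, which only
// compiles if they are public fields rather than properties. A minimal sketch of the type
// under that assumption, followed by illustrative invocations (executable and file names
// are hypothetical):
//
//   RandomForest.exe data.csv model.xml --num-trees 50 --max-depth 8 --oob 0.3   (train and save)
//   RandomForest.exe model.xml 5.1 3.5 1.4 0.2                                   (load and classify)
//
public class Hyperparameters
{
    public int NumTrees;
    public int MaxFeatures;   // -1 = default to sqrt(feature count)
    public int MinFeatures;   // smallest partition that is still split further
    public int MaxDepth;
    public int Seed;
    public double OutOfBag;   // probability that a point is held out of a tree's bag
}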
public Node(IEnumerable<DataPoint> dataPoints, Random random, Hyperparameters parameters, int depth = 1)
{
    // Pick MaxFeatures distinct feature indices at random to consider at this node.
    List<int> features = new List<int>();
    while (features.Count < parameters.MaxFeatures)
    {
        int feature = random.Next(dataPoints.First().Features.Length);
        if (!features.Contains(feature))
        {
            features.Add(feature);
        }
    }

    double[] classes = dataPoints.GroupBy(d => d.Classification).Select(g => g.Key).ToArray();

    // Try every (feature, threshold) pair, using each point's value as a candidate threshold,
    // and keep the split whose two partitions have the lowest summed Gini impurity.
    double bestGini = double.MaxValue;
    DataPoint[] bestLess = new DataPoint[0];
    DataPoint[] bestGreater = new DataPoint[0];
    foreach (int feature in features)
    {
        foreach (DataPoint dataPoint in dataPoints)
        {
            DataPoint[] less = dataPoints.Where(t => t.Features[feature] < dataPoint.Features[feature]).ToArray();
            DataPoint[] greater = dataPoints.Except(less).ToArray();
            double gini = 0;
            foreach (double classification in classes)
            {
                foreach (DataPoint[] group in new[] { less, greater })
                {
                    double amount = ((double)group.Where(d => d.Classification == classification).Count()) / (double)group.Length;
                    gini += amount * (1.0 - amount);
                }
            }
            if (gini < bestGini)
            {
                Feature = feature;
                Threshold = dataPoint.Features[feature];
                bestLess = less;
                bestGreater = greater;
                bestGini = gini;
            }
        }
    }

    if (bestLess.Length == 0 || bestGreater.Length == 0)
    {
        // No usable split: this node is a leaf predicting the majority class.
        LessClass = GreaterClass = dataPoints.GroupBy(d => d.Classification).OrderByDescending(g => g.Count()).First().Key;
    }
    else if (depth >= parameters.MaxDepth)
    {
        // Depth limit reached: both sides become leaves with their own majority class.
        LessClass = bestLess.GroupBy(d => d.Classification).OrderByDescending(g => g.Count()).First().Key;
        GreaterClass = bestGreater.GroupBy(d => d.Classification).OrderByDescending(g => g.Count()).First().Key;
    }
    else
    {
        // Grow both subtrees in parallel; partitions at or below MinFeatures become leaves.
        // System.Random is not thread-safe, so the two concurrent tasks must not share this
        // node's instance; give each child its own generator seeded from the parent's.
        Random lessRandom = new Random(random.Next());
        Random greaterRandom = new Random(random.Next());
        Task less = Task.Run(() =>
        {
            if (bestLess.Length > parameters.MinFeatures)
            {
                Less = new Node(bestLess, lessRandom, parameters, depth + 1);
            }
            else
            {
                LessClass = bestLess.GroupBy(d => d.Classification).OrderByDescending(g => g.Count()).First().Key;
            }
        });
        Task greater = Task.Run(() =>
        {
            if (bestGreater.Length > parameters.MinFeatures)
            {
                Greater = new Node(bestGreater, greaterRandom, parameters, depth + 1);
            }
            else
            {
                GreaterClass = bestGreater.GroupBy(d => d.Classification).OrderByDescending(g => g.Count()).First().Key;
            }
        });
        Task.WaitAll(less, greater);
    }
}
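// Root.Classify is called from the Tree constructor but not shown above. A minimal sketch,
// assuming a point descends to the "less" side when its split feature is strictly below
// Threshold (matching the "<" used to build bestLess) and that a side with no child node
// returns its stored leaf class; the author's actual traversal may differ:
public double Classify(DataPoint dataPoint)
{
    if (dataPoint.Features[Feature] < Threshold)
    {
        return Less != null ? Less.Classify(dataPoint) : LessClass;
    }
    return Greater != null ? Greater.Classify(dataPoint) : GreaterClass;
}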