/// <summary>
/// Chooses the axis whose split yields the highest information gain,
/// i.e. the base entropy of <paramref name="set"/> minus the weighted
/// entropy of the branches that the split would create.
/// </summary>
/// <param name="set">The instance set to evaluate. Assumed to expose at
/// least one unique feature — TODO confirm callers guarantee this,
/// otherwise <c>First()</c> throws.</param>
/// <returns>The key of the best axis to split on; falls back to the
/// first axis when no split improves on a zero information gain.</returns>
public static string SelectBestAxis(DecisionTreeSet set)
{
    var baseEntropy = Entropy(set);
    var bestInfoGain = 0.0;

    // Group candidate features by axis so each axis is evaluated once.
    var uniqueFeaturesByAxis = set.UniqueFeatures().GroupBy(i => i.Axis).ToList();

    // Default to the first axis so we always return something valid.
    string bestAxisSplit = uniqueFeaturesByAxis.First().Key;

    foreach (var axis in uniqueFeaturesByAxis)
    {
        // The total entropy of splitting by this axis is the
        // instance-weighted sum of the entropy of each resulting branch.
        var newEntropy = EntropyForSplitBranches(set, axis.ToList());
        var infoGain = baseEntropy - newEntropy;

        if (infoGain > bestInfoGain)
        {
            bestInfoGain = infoGain;
            bestAxisSplit = axis.Key;
        }
    }

    return bestAxisSplit;
}
/// <summary>
/// Computes the weighted entropy that would result from splitting
/// <paramref name="set"/> on each of the given axis values: the sum over
/// branches of (branch size / total size) * Entropy(branch).
/// </summary>
/// <param name="set">The set being split.</param>
/// <param name="allPossibleAxisValues">Every distinct feature value on the
/// candidate axis; one branch is created per value.</param>
/// <returns>The weighted entropy of the proposed split.</returns>
private static double EntropyForSplitBranches(DecisionTreeSet set, IEnumerable<Feature> allPossibleAxisValues)
{
    // Cast to double (not float): the rest of the expression is double,
    // so a float cast would needlessly truncate the probability.
    return (from possibleValue in allPossibleAxisValues
            select set.Split(possibleValue)
            into subset
            let prob = (double) subset.NumberOfInstances / set.NumberOfInstances
            select prob * Entropy(subset)).Sum();
}
/// <summary>
/// Computes the Shannon entropy (base 2) of the output-class distribution
/// of the instances in <paramref name="set"/>.
/// </summary>
/// <param name="set">The set whose instances are measured. An empty set
/// yields 0.0 (the loop body never runs).</param>
/// <returns>The entropy in bits: -sum(p * log2(p)) over output classes.</returns>
public static double Entropy(DecisionTreeSet set)
{
    var total = set.Instances.Count();

    // Group instances by their output class; entropy is computed over
    // the distribution of those classes.
    var outputs = set.Instances.Select(i => i.Output).GroupBy(f => f.Value).ToList();

    var entropy = 0.0;
    foreach (var target in outputs)
    {
        // Cast to double (not float): Math.Log operates on doubles, so a
        // float cast would needlessly truncate the probability first.
        var probability = (double) target.Count() / total;
        entropy -= probability * Math.Log(probability, 2);
    }
    return entropy;
}
/// <summary>
/// Reads a training-data file and builds a DecisionTreeSet containing one
/// parsed Instance per line of the file.
/// </summary>
/// <param name="file">Path of the file to read.</param>
/// <returns>A populated DecisionTreeSet.</returns>
public DecisionTreeSet Parse(string file)
{
    var set = new DecisionTreeSet
    {
        Instances = new List<Instance>()
    };

    // File.ReadLines streams the file one line at a time, equivalent to
    // a manual StreamReader loop but without explicit disposal.
    foreach (var line in File.ReadLines(file))
    {
        set.Instances.Add(ParseLine(line));
    }

    return set;
}