/// <summary>
/// Selects the best attribute to split on, i.e. the one with the largest information gain.
/// Discrete (string-typed) attributes are scored over their existing value domain; continuous
/// attributes are scored by searching for the best binary split threshold, and the chosen
/// threshold pair ("&lt;x" / "=&gt;x") is written back into <c>attribute.Values</c>.
/// </summary>
/// <param name="attributeList">Array of candidate attributes.</param>
/// <param name="categoricalAttribute">The categorical (class/target) attribute.</param>
/// <param name="data">Array of training rows; each row is indexed by attribute index.</param>
/// <returns>The attribute with the highest information gain, or the first attribute when no gain could be computed.</returns>
private static shared.Attribute GetAttributeLargestGain(shared.Attribute[] attributeList, shared.Attribute categoricalAttribute, string[][] data)
{
    // Stores each attribute together with its computed information gain.
    List<Tuple<shared.Attribute, double>> listInformationGain = new List<Tuple<shared.Attribute, double>>();

    foreach (shared.Attribute attribute in attributeList)
    {
        if (attribute.Type == typeof(string))
        {
            // Discrete attribute: sum the proportion-weighted entropy of each domain value.
            List<double> entropyList = new List<double>();
            foreach (string value in attribute.Values)
            {
                double proportion;
                double entropy = GetEntropy(attribute, categoricalAttribute, value, data, out proportion);
                entropyList.Add(entropy * proportion);
            }

            // Information gain is 1.0 minus the weighted entropy sum.
            listInformationGain.Add(new Tuple<shared.Attribute, double>(attribute, 1D - entropyList.Sum()));
        }
        else
        {
            // Continuous attribute: reset any previous split and search for the best threshold.
            attribute.Values = new List<string>();

            // Distinct values from the data, in ascending order, parsed as doubles.
            double[] orderedValues = data
                .Select(m => double.Parse(m[attribute.Index]))
                .OrderBy(v => v)
                .Distinct()
                .ToArray();

            List<double> continousEntropyList = new List<double>();
            List<Tuple<string, double>> continousGainList = new List<Tuple<string, double>>();

            // Evaluate every distinct value as a candidate split threshold.
            foreach (double item in orderedValues)
            {
                // First partition: values strictly below the candidate threshold.
                foreach (double line in orderedValues.Where(m => m < item))
                {
                    double proportion;
                    double entropy = GetEntropy(attribute, categoricalAttribute, line.ToString(), data, out proportion);
                    continousEntropyList.Add(entropy * proportion);
                }

                if (continousEntropyList.Sum() > 0)
                    continousGainList.Add(new Tuple<string, double>("<" + item, 1D - continousEntropyList.Sum()));

                continousEntropyList.Clear();

                // Second partition: values at or above the candidate threshold.
                foreach (double line in orderedValues.Where(m => m >= item))
                {
                    double proportion;
                    double entropy = GetEntropy(attribute, categoricalAttribute, line.ToString(), data, out proportion);
                    continousEntropyList.Add(entropy * proportion);
                }

                if (continousEntropyList.Sum() > 0)
                    continousGainList.Add(new Tuple<string, double>("=>" + item, 1D - continousEntropyList.Sum()));

                // BUG FIX: clear before moving to the next candidate threshold; previously the
                // "=>" partition's entropies leaked into the next iteration's "<" partition sum,
                // corrupting every gain computed after the first candidate.
                continousEntropyList.Clear();
            }

            if (continousGainList.Count > 0)
            {
                // Keep the split with the maximum gain and register both sides of it
                // ("<x" and "=>x") as the attribute's value domain.
                var maxGainContinuous = continousGainList.OrderByDescending(m => m.Item2).First();
                attribute.Values.Add(maxGainContinuous.Item1);

                if (maxGainContinuous.Item1.IndexOf("=>") > -1)
                    attribute.Values.Add(maxGainContinuous.Item1.Replace("=>", "<"));
                else
                    attribute.Values.Add(maxGainContinuous.Item1.Replace("<", "=>"));

                listInformationGain.Add(new Tuple<shared.Attribute, double>(attribute, maxGainContinuous.Item2));
            }
        }
    }

    // Return the attribute with the maximum information gain; fall back to the
    // first attribute when nothing could be scored.
    if (listInformationGain.Count > 0)
        return listInformationGain.OrderByDescending(m => m.Item2).First().Item1;

    return attributeList.First();
}