Пример #1
0
        /// <summary>
        /// Selects the attribute with the largest gain score, where the score is computed as
        /// 1.0 minus the sum of the proportion-weighted entropies of the attribute's values.
        /// </summary>
        /// <remarks>
        /// Attributes whose <c>Type</c> is <c>string</c> are scored over their declared <c>Values</c>.
        /// Any other attribute is treated as numeric: every distinct value found in
        /// <paramref name="data"/> is tried as a split threshold, and both the "&lt;threshold" and
        /// "=&gt;threshold" partitions are scored independently. As a side effect, the numeric
        /// attribute's <c>Values</c> list is replaced: it is emptied and, when at least one partition
        /// has a positive entropy sum, filled with the two labels of the best-scoring split.
        /// Numbers are parsed and formatted with the current culture.
        /// </remarks>
        /// <param name="attributeList">Array of candidate attributes</param>
        /// <param name="categoricalAttribute">The categorical (target) attribute passed through to GetEntropy</param>
        /// <param name="data">Array of training data rows</param>
        /// <returns>
        /// The attribute with the highest score, or the first element of
        /// <paramref name="attributeList"/> when no attribute produced a score.
        /// </returns>
        private static shared.Attribute GetAttributeLargestGain(shared.Attribute[] attributeList, 
                                                                  shared.Attribute categoricalAttribute,
                                                                  string[][] data)
        {
            // stores each scored attribute together with its gain score
            List<Tuple<shared.Attribute, double>> listInformationGain = new List<Tuple<shared.Attribute, double>>();

            foreach (shared.Attribute attribute in attributeList)
            {
                // attribute with a declared domain: score over its known values
                if (attribute.Type == typeof(string))
                {
                    double entropySum = 0D;

                    foreach (string value in attribute.Values)
                    {
                        entropySum += GetWeightedEntropy(attribute, categoricalAttribute, value, data);
                    }

                    listInformationGain.Add(Tuple.Create(attribute, 1D - entropySum));
                    continue;
                }

                // numeric attribute: its values are rebuilt from the best split found below
                attribute.Values = new List<string>();

                // distinct values in ascending order; each cell is parsed once, and the sort is applied
                // after Distinct so the ordering does not depend on Distinct preserving input order
                double[] orderedValues = data.Select(row => double.Parse(row[attribute.Index]))
                                             .Distinct()
                                             .OrderBy(number => number)
                                             .ToArray();

                List<Tuple<string, double>> continuousGainList = new List<Tuple<string, double>>();

                foreach (double threshold in orderedValues)
                {
                    // Each partition gets its own sum. The previous implementation reused one list and
                    // did not clear it after the upper partition, so the lower partition of the next
                    // threshold was polluted with stale entropies.
                    double lowerEntropySum = orderedValues
                        .Where(number => number < threshold)
                        .Sum(number => GetWeightedEntropy(attribute, categoricalAttribute, number.ToString(), data));

                    if (lowerEntropySum > 0)
                    {
                        continuousGainList.Add(Tuple.Create("<" + threshold, 1D - lowerEntropySum));
                    }

                    double upperEntropySum = orderedValues
                        .Where(number => number >= threshold)
                        .Sum(number => GetWeightedEntropy(attribute, categoricalAttribute, number.ToString(), data));

                    if (upperEntropySum > 0)
                    {
                        continuousGainList.Add(Tuple.Create("=>" + threshold, 1D - upperEntropySum));
                    }
                }

                // no partition had a positive entropy sum: the attribute gets no score
                if (continuousGainList.Count == 0)
                {
                    continue;
                }

                // OrderByDescending is stable, so ties resolve to the earliest candidate
                Tuple<string, double> bestSplit = continuousGainList.OrderByDescending(m => m.Item2).First();
                string bestLabel = bestSplit.Item1;

                // register the winning partition followed by its complementary partition
                attribute.Values.Add(bestLabel);

                if (bestLabel.StartsWith("=>", StringComparison.Ordinal))
                {
                    attribute.Values.Add(bestLabel.Replace("=>", "<"));
                }
                else
                {
                    attribute.Values.Add(bestLabel.Replace("<", "=>"));
                }

                listInformationGain.Add(Tuple.Create(attribute, bestSplit.Item2));
            }

            // gets the max gain element, falling back to the first attribute when nothing was scored
            if (listInformationGain.Count > 0)
            {
                return listInformationGain.OrderByDescending(m => m.Item2).First().Item1;
            }

            return attributeList.First();
        }

        /// <summary>
        /// Gets the entropy of a single attribute value multiplied by the proportion that
        /// GetEntropy reports for that value.
        /// </summary>
        /// <param name="attribute">Attribute being evaluated</param>
        /// <param name="categoricalAttribute">The categorical attribute passed through to GetEntropy</param>
        /// <param name="value">Attribute value, in the textual form expected by GetEntropy</param>
        /// <param name="data">Array of training data rows</param>
        /// <returns>Entropy multiplied by its proportion</returns>
        private static double GetWeightedEntropy(shared.Attribute attribute,
                                                 shared.Attribute categoricalAttribute,
                                                 string value,
                                                 string[][] data)
        {
            double proportion;
            double entropy = GetEntropy(attribute, categoricalAttribute, value, data, out proportion);

            return entropy * proportion;
        }