Example #1
0
        /// <summary>
        /// Determines the class label for a data example by walking the model
        /// tree from its root until a leaf node is reached.
        /// </summary>
        /// <param name="data">data example to classify</param>
        /// <returns>class label stored in the leaf node that was reached</returns>
        /// <exception cref="ArgumentException">
        /// The example has no value for an attribute tested by the tree, or its
        /// value for that attribute has no matching branch.
        /// </exception>
        /// <exception cref="InvalidOperationException">
        /// The model is null or a branch leads to a null node.
        /// </exception>
        public string Classify(Instance data)
        {
            StringBuilder trace = new StringBuilder();

            Node current = this.model;
            while (null != current)
            {
                if (null == current.Children)
                {
                    // we have reached a leaf node - label contains the
                    // class name
                    trace.Append("\t");
                    trace.Append(current.Label);
                    Logger.Log(LogLevel.Trace, trace.ToString());

                    return current.Label;
                }

                // internal node - label contains the name of the attribute to
                // test; a single lookup both checks for and fetches the value
                string attributeValue;
                if (!data.Data.TryGetValue(current.Label, out attributeValue))
                {
                    throw new ArgumentException("Data sample is incompatible with the model!");
                }

                Node next;
                if (!current.Children.TryGetValue(attributeValue, out next))
                {
                    throw new ArgumentException("Data sample is incompatible with the model!");
                }

                // record trace
                trace.Append("(");
                trace.Append(current.Label);
                trace.Append("|");
                trace.Append(attributeValue);
                trace.Append(")");

                // pick the branch
                current = next;
            }

            // only reachable when the model itself or one of its branches is null;
            // InvalidOperationException derives from Exception so existing
            // catch (Exception) handlers keep working
            throw new InvalidOperationException("Invalid model");
        }
Example #2
0
        /// <summary>
        /// Adds a data instance to the container
        /// </summary>
        /// <param name="values">
        /// list of feature values for the instance; the value at each index is
        /// checked against, and stored under, the attribute at the same index
        /// of the attribute list
        /// </param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="values"/> is null, or it does not contain exactly one
        /// value per attribute.
        /// </exception>
        /// <exception cref="FormatException">
        /// A value is neither the unknown-value marker nor one of the values
        /// declared for its attribute.
        /// </exception>
        public void AddInstance(List<string> values)
        {
            if (null == values)
            {
                throw new ArgumentNullException("values");
            }

            if (this.attributeList.Count != values.Count)
            {
                // ArgumentNullException is kept for the count mismatch so that
                // callers already catching it keep working; the message now
                // states what is actually wrong
                throw new ArgumentNullException(
                    "values",
                    string.Format("Expected [{0}] values but received [{1}].",
                        this.attributeList.Count, values.Count));
            }

            // create the instance object
            Instance item = new Instance()
            {
                Data = new Dictionary<string,string>()
            };

            for (int idx = 0; idx < values.Count; idx++)
            {
                var attribute = this.attributeList[idx];
                string value = values[idx];

                // the unknown-value marker is accepted for every attribute
                if (!UnknownValue.Equals(value)
                    && !attribute.Values.Contains(value))
                {
                    throw new FormatException(
                        string.Format("Invalid data file. attribute = [{0}], invalid value=[{1}]",
                            attribute.Name, value));
                }

                item.Data.Add(attribute.Name, value);
            }

            // add the data instance to the list only after every value has been
            // validated, so a rejected instance leaves the container unchanged
            this.instanceList.Add(item);
        }
Example #3
0
        /// <summary>
        /// Recursively builds the (sub-)tree for a set of training examples.
        /// </summary>
        /// <param name="instanceList">training examples that reached this node</param>
        /// <param name="attributeList">attributes that are still available to split on</param>
        /// <param name="classAttribute">attribute that holds the class label</param>
        /// <returns>
        /// a leaf node (Children is null) labelled with a class name, or an
        /// internal node labelled with the chosen attribute name that has one
        /// child per value declared for that attribute
        /// </returns>
        private Node Learn(Instance[] instanceList, Arff.Attribute[] attributeList, Arff.Attribute classAttribute)
        {
            // compute the class distribution of all the examples
            var classDistribution = GetClassDistribution(instanceList, classAttribute);

            string[] classesWithExamples = classDistribution.Where(a => a.Value > 0).Select(a => a.Key).ToArray();
            if (1 == classesWithExamples.Length)
            {
                // all examples belong to the same class so we have reached a leaf node
                Logger.Log(LogLevel.Progress, ".");
                return new Node()
                {
                    Label = classesWithExamples[0],
                    Children = null
                };
            }

            // OrderBy is a stable sort, so among classes with an equal count the
            // one enumerated last by the distribution wins
            string mostCommonClass = classDistribution.OrderBy(a => a.Value).Last().Key;

            // only look for a split when there are attributes left to split on
            Arff.Attribute decisionAttribute = null;
            if (null != attributeList
                && 0 != attributeList.Length)
            {
                decisionAttribute = GetDecisionAttribute(instanceList, attributeList, classDistribution, classAttribute);
            }

            if (null == decisionAttribute)
            {
                // no more attributes to split on, or none of them passes the
                // split-termination condition
                Logger.Log(LogLevel.Progress, ".");
                return new Node()
                {
                    Label = mostCommonClass,
                    Children = null
                };
            }

            // recursively build the tree
            var root = new Node()
            {
                Label = decisionAttribute.Name,
                Children = new Dictionary<string, Node>()
            };

            // pre-process the instances
            var instanceGroups = instanceList
                .GroupBy(a => a.Data[decisionAttribute.Name])
                .ToDictionary(g => g.Key, v => v.ToArray());

            // the attributes left for the sub-trees are the same for every
            // branch, so build the array once instead of once per branch
            Arff.Attribute[] remainingAttributes = attributeList
                .Where(a => !a.Name.Equals(decisionAttribute.Name))
                .ToArray();

            // build the sub-trees
            foreach (string value in decisionAttribute.Values)
            {
                Node childNode;
                Instance[] branchInstances;

                if (instanceGroups.TryGetValue(value, out branchInstances)
                    && 0 != branchInstances.Length)
                {
                    // build the subtree recursively
                    childNode = this.Learn(branchInstances, remainingAttributes, classAttribute);
                }
                else
                {
                    // if there are no examples for the node value assign the
                    // label of most instances to the value branch
                    childNode = new Node()
                    {
                        Label = mostCommonClass,
                        Children = null
                    };
                }

                root.Children.Add(value, childNode);
            }

            return root;
        }
Example #4
0
        /// <summary>
        /// Picks the attribute to split on: among the attributes that pass the
        /// chi-square split test, the one with the highest gain (or gain ratio
        /// when useGainRatio is set).
        /// </summary>
        /// <param name="instanceList">training examples at the current node</param>
        /// <param name="attributeList">candidate attributes</param>
        /// <param name="classDistribution">number of examples per class label</param>
        /// <param name="classAttribute">attribute that holds the class label</param>
        /// <returns>the selected attribute, or null when no candidate qualifies</returns>
        private Arff.Attribute GetDecisionAttribute(Instance[] instanceList, Arff.Attribute[] attributeList, Dictionary<string, int> classDistribution, Arff.Attribute classAttribute)
        {
            Arff.Attribute decisionAttribute = null;
            double decisionAttributeGain = double.MinValue;

            // compute entropy
            double entropy = ComputeEntropy(instanceList.Length, classDistribution);

            // compute entropy of each attribute
            foreach (Arff.Attribute attribute in attributeList)
            {
                // group the instances by their values for the attribute being evaluated
                var groupedInstanceList = instanceList.GroupBy(i => i.Data[attribute.Name]);
                if (!this.handleUnknownAsValue)
                {
                    groupedInstanceList = groupedInstanceList.Where(g => !g.Key.Equals(Instances.UnknownValue));
                }

                // GroupBy/Where are deferred: without materializing, each helper
                // below that enumerates the groups would re-run the grouping
                groupedInstanceList = groupedInstanceList.ToList();

                // compute the chi-squared statistic for the data
                double dataChiSquared = ComputeAttributeChiSquared(groupedInstanceList, instanceList.Length, classDistribution, classAttribute);
                double criticalChiSquared = ChiSquare.CriticalChiSquareValue(1.0f - this.splitStoppingConfidenceLevel, attribute.Values.Length - 1);
                Logger.Log(LogLevel.Info, "Chi-Square test for [{0}]: data=[{1}] critical=[{2}]", attribute.Name, dataChiSquared, criticalChiSquared);

                if (dataChiSquared < criticalChiSquared)
                {
                    // attribute did not pass chi-square split test
                    continue;
                }

                // compute the attribute entropy
                double attributeEntropy = ComputeAttributeEntropy(groupedInstanceList, instanceList.Length, classAttribute);

                // compute the gain
                double attributeGain = entropy - attributeEntropy;

                if (this.useGainRatio)
                {
                    // compute the gain ratio
                    double splitInfo = this.ComputeAttributeSplitInfo(groupedInstanceList, instanceList.Length);

                    if (0.0 == splitInfo)
                    {
                        // the ratio is undefined for a zero split info; dividing
                        // would turn any floating-point residue in the gain into
                        // +Infinity and make this attribute win the selection
                        // NOTE(review): presumably zero means the attribute does
                        // not partition the examples - confirm against
                        // ComputeAttributeSplitInfo
                        continue;
                    }

                    attributeGain = attributeGain / splitInfo;
                }

                // check the attribute
                if (decisionAttributeGain < attributeGain)
                {
                    // found a better attribute
                    decisionAttribute = attribute;
                    decisionAttributeGain = attributeGain;
                }
            }

            if (null != decisionAttribute)
            {
                Logger.Log(LogLevel.Info, "Selected Attribute - name=[{0}] gain=[{1}].", decisionAttribute.Name, decisionAttributeGain);
            }
            else
            {
                Logger.Log(LogLevel.Info, "No relevant attribute found.");
            }

            return decisionAttribute;
        }
Example #5
0
        /// <summary>
        /// Counts how many of the given examples carry each class label.
        /// </summary>
        /// <param name="instanceList">examples to count</param>
        /// <param name="classAttribute">class attribute; each of its values becomes a key of the result</param>
        /// <returns>
        /// number of examples per class value; values without any example are
        /// present with a count of zero
        /// </returns>
        private static Dictionary<string, int> GetClassDistribution(Instance[] instanceList, Arff.Attribute classAttribute)
        {
            Dictionary<string, int> tally = new Dictionary<string, int>();

            // seed every declared class value so that classes without examples
            // still show up in the result
            for (int valueIndex = 0; valueIndex < classAttribute.Values.Length; valueIndex++)
            {
                tally.Add(classAttribute.Values[valueIndex], 0);
            }

            // count each example under the class value it carries
            for (int instanceIndex = 0; instanceIndex < instanceList.Length; instanceIndex++)
            {
                string observedClass = instanceList[instanceIndex].Data[classAttribute.Name];
                tally[observedClass] = tally[observedClass] + 1;
            }

            return tally;
        }