/// <summary>
/// Determines the class label for a data example by walking the decision
/// tree from the root (<c>this.model</c>) until a leaf node is reached.
/// </summary>
/// <param name="data">data example to classify</param>
/// <returns>class label stored in the leaf node that was reached</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="data"/> is null.
/// </exception>
/// <exception cref="ArgumentException">
/// The example has no value for an attribute tested by the tree, or its
/// value for that attribute has no matching branch.
/// </exception>
/// <exception cref="InvalidOperationException">
/// The walk hit a null node (including a null root) before reaching a leaf.
/// </exception>
public string Classify(Instance data)
{
    if (null == data)
    {
        throw new ArgumentNullException("data");
    }

    StringBuilder trace = new StringBuilder();
    Node current = this.model;

    while (null != current)
    {
        if (null == current.Children)
        {
            // we have reached a leaf node - label contains the class name
            trace.Append("\t");
            trace.Append(current.Label);
            Logger.Log(LogLevel.Trace, trace.ToString());
            return current.Label;
        }

        // for an inner node the label is the name of the attribute to test;
        // TryGetValue avoids a second dictionary lookup for the same key
        string attributeValue;
        if (!data.Data.TryGetValue(current.Label, out attributeValue))
        {
            throw new ArgumentException("Data sample is incompatible with the model!");
        }

        Node next;
        if (!current.Children.TryGetValue(attributeValue, out next))
        {
            throw new ArgumentException("Data sample is incompatible with the model!");
        }

        // record trace
        trace.Append("(");
        trace.Append(current.Label);
        trace.Append("|");
        trace.Append(attributeValue);
        trace.Append(")");

        // pick the branch
        current = next;
    }

    // only reachable when the root or a stored child node is null
    throw new InvalidOperationException("Invalid model");
}
/// <summary>
/// Adds a data instance to the container.
/// </summary>
/// <param name="values">
/// list of feature values for the instance; must hold exactly one value per
/// entry of the attribute list, in the same order
/// </param>
/// <exception cref="ArgumentNullException">
/// <paramref name="values"/> is null, or its count differs from the number
/// of attributes.
/// </exception>
/// <exception cref="FormatException">
/// A value is neither <c>UnknownValue</c> nor one of the values declared by
/// the attribute at the same position.
/// </exception>
public void AddInstance(List<string> values)
{
    if (null == values)
    {
        throw new ArgumentNullException("values");
    }

    if (this.attributeList.Count != values.Count)
    {
        // NOTE(review): ArgumentException would describe a count mismatch
        // better; the type is kept so existing catch blocks keep matching.
        throw new ArgumentNullException(
            "values",
            string.Format(
                "Expected [{0}] values but received [{1}].",
                this.attributeList.Count,
                values.Count));
    }

    // create the instance object
    Instance item = new Instance() { Data = new Dictionary<string, string>() };

    for (int idx = 0; idx < values.Count; idx++)
    {
        var attribute = this.attributeList[idx];
        string value = values[idx];

        // the unknown marker is accepted for every attribute; anything else
        // has to be one of the values declared by the attribute
        if (!UnknownValue.Equals(value) && !attribute.Values.Contains(value))
        {
            throw new FormatException(
                string.Format(
                    "Invalid data file. attribute = [{0}], invalid value=[{1}]",
                    attribute.Name,
                    value));
        }

        item.Data.Add(attribute.Name, value);
    }

    // add the data instance to the list
    this.instanceList.Add(item);
}
/// <summary>
/// Recursively builds a decision (sub)tree for the given examples.
/// </summary>
/// <remarks>
/// A leaf is returned when all examples share one class, when no attributes
/// are left, or when <c>GetDecisionAttribute</c> returns null. Otherwise the
/// node is labelled with the chosen attribute and gets one child per value
/// declared by that attribute.
/// </remarks>
/// <param name="instanceList">training examples that reached this node</param>
/// <param name="attributeList">attributes still available for splitting</param>
/// <param name="classAttribute">attribute that holds the class label</param>
/// <returns>root node of the learned (sub)tree</returns>
private Node Learn(Instance[] instanceList, Arff.Attribute[] attributeList, Arff.Attribute classAttribute)
{
    // compute the class distribution of all the examples
    var classDistribution = GetClassDistribution(instanceList, classAttribute);
    string[] classesWithExamples = classDistribution
        .Where(a => a.Value > 0)
        .Select(a => a.Key)
        .ToArray();

    if (1 == classesWithExamples.Length)
    {
        // all examples belong to the same class so we have reached a leaf node
        Logger.Log(LogLevel.Progress, ".");
        return new Node() { Label = classesWithExamples[0], Children = null };
    }

    // OrderBy is stable, so among equally frequent classes the one that
    // comes last in the distribution's enumeration order is picked
    string mostCommonClass = classDistribution.OrderBy(a => a.Value).Last().Key;

    if (null == attributeList || 0 == attributeList.Length)
    {
        // no more attributes to split on
        Logger.Log(LogLevel.Progress, ".");
        return new Node() { Label = mostCommonClass, Children = null };
    }

    var decisionAttribute = GetDecisionAttribute(instanceList, attributeList, classDistribution, classAttribute);
    if (null == decisionAttribute)
    {
        // can't find an attribute to split on that passes the split-termination condition
        Logger.Log(LogLevel.Progress, ".");
        return new Node() { Label = mostCommonClass, Children = null };
    }

    // recursively build the tree
    var root = new Node()
    {
        Label = decisionAttribute.Name,
        Children = new Dictionary<string, Node>()
    };

    // pre-process the instances: bucket them by their value for the chosen
    // attribute (buckets whose key is not listed in decisionAttribute.Values
    // are not visited by the loop below)
    var instanceGroups = instanceList
        .GroupBy(a => a.Data[decisionAttribute.Name])
        .ToDictionary(g => g.Key, v => v.ToArray());

    // every branch continues with the same reduced attribute set, so it is
    // computed once instead of once per attribute value
    Arff.Attribute[] remainingAttributes = attributeList
        .Where(a => !a.Name.Equals(decisionAttribute.Name))
        .ToArray();

    // build the sub-trees
    foreach (string value in decisionAttribute.Values)
    {
        Instance[] branchInstances;
        if (!instanceGroups.TryGetValue(value, out branchInstances) || 0 == branchInstances.Length)
        {
            // if there are no examples for the value, assign the label of
            // most instances to the value branch
            root.Children.Add(value, new Node() { Label = mostCommonClass, Children = null });
            continue;
        }

        // build the subtree recursively
        Node childNode = this.Learn(branchInstances, remainingAttributes, classAttribute);
        root.Children.Add(value, childNode);
    }

    return root;
}
/// <summary>
/// Picks the attribute to split on: among the attributes whose chi-squared
/// statistic reaches the critical value, the one with the highest
/// information gain (or gain ratio when <c>useGainRatio</c> is set).
/// </summary>
/// <param name="instanceList">examples at the current node</param>
/// <param name="attributeList">candidate attributes</param>
/// <param name="classDistribution">class counts of <paramref name="instanceList"/></param>
/// <param name="classAttribute">attribute that holds the class label</param>
/// <returns>the selected attribute, or null when no candidate qualifies</returns>
private Arff.Attribute GetDecisionAttribute(Instance[] instanceList, Arff.Attribute[] attributeList, Dictionary<string, int> classDistribution, Arff.Attribute classAttribute)
{
    Arff.Attribute decisionAttribute = null;
    double decisionAttributeGain = double.MinValue;

    // compute entropy
    double entropy = ComputeEntropy(instanceList.Length, classDistribution);

    // compute entropy of each attribute
    foreach (Arff.Attribute attribute in attributeList)
    {
        // group the instances by their values for the attribute being evaluated
        var groupedInstanceList = instanceList.GroupBy(i => i.Data[attribute.Name]);
        if (!this.handleUnknownAsValue)
        {
            groupedInstanceList = groupedInstanceList.Where(g => !g.Key.Equals(Instances.UnknownValue));
        }

        // the query above is deferred and is consumed by up to three helpers
        // below; materialize it once so the grouping is not redone each time
        groupedInstanceList = groupedInstanceList.ToList();

        // compute the chi-squared statistic for the data
        double dataChiSquared = ComputeAttributeChiSquared(groupedInstanceList, instanceList.Length, classDistribution, classAttribute);
        double criticalChiSquared = ChiSquare.CriticalChiSquareValue(1.0f - this.splitStoppingConfidenceLevel, attribute.Values.Length - 1);
        Logger.Log(LogLevel.Info, "Chi-Square test for [{0}]: data=[{1}] critical=[{2}]", attribute.Name, dataChiSquared, criticalChiSquared);
        if (dataChiSquared < criticalChiSquared)
        {
            // attribute did not pass chi-square split test
            continue;
        }

        // compute the attribute entropy
        double attributeEntropy = ComputeAttributeEntropy(groupedInstanceList, instanceList.Length, classAttribute);

        // compute the gain
        double attributeGain = entropy - attributeEntropy;
        if (this.useGainRatio)
        {
            // compute the gain ratio
            double splitInfo = this.ComputeAttributeSplitInfo(groupedInstanceList, instanceList.Length);
            if (0.0 == splitInfo)
            {
                // dividing by zero yields NaN or an infinite ratio; a positive
                // infinite ratio would win the comparison below, so skip
                continue;
            }

            attributeGain = attributeGain / splitInfo;
        }

        // check the attribute
        if (decisionAttributeGain < attributeGain)
        {
            // found a better attribute
            decisionAttribute = attribute;
            decisionAttributeGain = attributeGain;
        }
    }

    if (null != decisionAttribute)
    {
        Logger.Log(LogLevel.Info, "Selected Attribute - name=[{0}] gain=[{1}].", decisionAttribute.Name, decisionAttributeGain);
    }
    else
    {
        Logger.Log(LogLevel.Info, "No relevant attribute found.");
    }

    return decisionAttribute;
}
/// <summary>
/// Counts how many of the given examples carry each class value.
/// </summary>
/// <param name="instanceList">examples to count</param>
/// <param name="classAttribute">attribute that holds the class label</param>
/// <returns>
/// map from every value declared by <paramref name="classAttribute"/> to the
/// number of examples with that value (zero when there are none)
/// </returns>
private static Dictionary<string, int> GetClassDistribution(Instance[] instanceList, Arff.Attribute classAttribute)
{
    var counts = new Dictionary<string, int>();

    // seed every declared class with zero so classes without examples
    // still show up in the result
    foreach (string declaredClass in classAttribute.Values)
    {
        counts.Add(declaredClass, 0);
    }

    // tally the class value of each example
    string className = classAttribute.Name;
    for (int i = 0; i < instanceList.Length; i++)
    {
        string observedClass = instanceList[i].Data[className];
        counts[observedClass] = counts[observedClass] + 1;
    }

    return counts;
}