/// <summary>
/// Program entry point. Parses a training file, learns a tree from it with
/// <c>DecisionTree.ID3</c>, prints the tree, then classifies the examples of a
/// testing file with that tree and prints the comparison report.
/// </summary>
/// <param name="args">
/// Command-line arguments: <c>args[0]</c> is the training file name and
/// <c>args[1]</c> is the testing file name.
/// </param>
static void Main(string[] args)
{
    // Both file names are mandatory; otherwise show the usage text and stop.
    bool hasBothFileNames = args.Length >= 2;
    if (!hasBothFileNames)
    {
        PrintHelp();
        return;
    }

    var fileParser = new Parser();
    var treeBuilder = new DecisionTree();

    string trainingPath = args[0];
    string testingPath = args[1];

    // Part One: learn the tree from the training file and print it.
    ID3Data trainingSet = fileParser.ParseID3InformationFile(trainingPath);
    Node root = treeBuilder.ID3(trainingSet.TestData, trainingSet.GetKeyAttributes(), trainingSet);

    Console.WriteLine("Learned Tree:\n-----------");
    string renderedTree = root.ToString();
    Console.WriteLine(renderedTree);

    // Part Two: classify the testing file with the learned tree and print the comparison.
    ID3Data testingSet = fileParser.ParseID3InformationFile(testingPath);
    ArrayList predicted = root.GetTreeClassifications(testingSet);

    Console.WriteLine("Classifications:\n-------------");
    Console.WriteLine(testingSet.CompareClassifications(predicted));

    // Keep the console window open until the user presses a key.
    Console.WriteLine("\n\nPress any key to exit...");
    Console.ReadKey();
}
/// <summary>
/// Translate the file data to ID3 data.
/// </summary>
/// <remarks>
/// File format (lines starting with '//' are not read):
/// <list type="number">
/// <item><description>two class-label lines;</description></item>
/// <item><description>the number of features (nF), followed by nF feature lines of the form
/// <c>name value value ...</c>;</description></item>
/// <item><description>the number of examples (nE), followed by nE example lines of the form
/// <c>name category value value ...</c>.</description></item>
/// </list>
/// Fields on feature and example lines may be separated by any run of whitespace.
/// Every field after the name (features) or after the category (examples) is kept,
/// so the parser is not limited to two values per feature or four values per example.
/// </remarks>
/// <param name="filename">Filename to read from</param>
/// <returns>ID3Data object</returns>
/// <exception cref="System.FormatException">
/// A count line is not an integer, or a feature/example line has too few fields.
/// </exception>
public ID3Data ParseID3InformationFile(string filename)
{
    string rawFileContents = GetFileAsString(filename);
    ArrayList cleanFileContents = RemoveCommentsAndEmptyLines(rawFileContents);
    int i = 0;

    ID3Data id3Data = new ID3Data();

    // Lines 0 and 1 are the class labels (the format currently assumes exactly two).
    id3Data.Categories = new ArrayList()
    {
        cleanFileContents[i++],
        cleanFileContents[i++]
    };

    // Counts are machine-readable data, so parse them independently of the current culture.
    int attributeCount = Int32.Parse(
        cleanFileContents[i++].ToString(),
        System.Globalization.CultureInfo.InvariantCulture);

    for (int f = 0; f < attributeCount; f++, i++)
    {
        string rawLine = cleanFileContents[i].ToString();

        // Trim first so leading/trailing whitespace cannot produce empty fields.
        string[] line = System.Text.RegularExpressions.Regex.Split(rawLine.Trim(), @"\s+");
        if (line.Length < 2)
        {
            throw new FormatException(
                "Feature line must contain a name and at least one value: '" + rawLine + "'");
        }

        ArrayList values = new ArrayList();
        for (int v = 1; v < line.Length; v++)
        {
            values.Add(line[v]);
        }

        id3Data.Attributes.Add(line[0], values);
    }

    int dataItemCount = Int32.Parse(
        cleanFileContents[i++].ToString(),
        System.Globalization.CultureInfo.InvariantCulture);

    for (int e = 0; e < dataItemCount; e++, i++)
    {
        string rawLine = cleanFileContents[i].ToString();
        string[] line = System.Text.RegularExpressions.Regex.Split(rawLine.Trim(), @"\s+");
        if (line.Length < 2)
        {
            throw new FormatException(
                "Example line must contain a name and a category: '" + rawLine + "'");
        }

        // Fields 0 and 1 are the example's name and category; the rest are its attribute values.
        ArrayList exampleValues = new ArrayList();
        for (int v = 2; v < line.Length; v++)
        {
            exampleValues.Add(line[v]);
        }

        id3Data.TestData.Add(new Data(line[0], line[1], exampleValues));
    }

    return id3Data;
}
/// <summary>
/// Classifies every example of the given data object by traversing this tree.
/// </summary>
/// <param name="testingData">Data object whose <c>TestData</c> examples are classified</param>
/// <returns>
/// ArrayList of <c>Tuple&lt;string, string&gt;</c> pairs (example name, classification),
/// in the same order as the examples were enumerated.
/// </returns>
public ArrayList GetTreeClassifications(ID3Data testingData)
{
    ArrayList classifications = new ArrayList();

    foreach (Data sample in testingData.TestData)
    {
        // Traverse first, then pair the result with the example's name.
        string predictedLabel = Traverse(sample);
        Tuple<string, string> entry = new Tuple<string, string>(sample.Name, predictedLabel);
        classifications.Add(entry);
    }

    return classifications;
}
/// <summary>
/// Computes the base-2 Shannon entropy of the counts returned by
/// <c>SummarizeExamplesAttribute</c> for the given examples and attribute.
/// </summary>
/// <param name="examples">Examples to measure; their count is the denominator of each proportion</param>
/// <param name="targetAttribute">Attribute whose summarized counts form the distribution</param>
/// <param name="data">Data object passed through to <c>SummarizeExamplesAttribute</c></param>
/// <returns>
/// The sum over all summarized counts of <c>-p * log2(p)</c>, where
/// <c>p = count / examples.Count</c>; 0 when the summary is empty.
/// </returns>
double Entropy(ArrayList examples, string targetAttribute, ID3Data data)
{
    double result = 0;
    Dictionary<string, int> counts = SummarizeExamplesAttribute(examples, targetAttribute, data);

    // Divide in double precision; casting to float first would discard about half the significant digits.
    double total = examples.Count;

    foreach (KeyValuePair<string, int> kvp in counts)
    {
        // A zero count contributes nothing (lim p->0 of p*log p is 0) and would otherwise yield NaN.
        if (kvp.Value <= 0)
        {
            continue;
        }

        double proportion = kvp.Value / total;
        result -= proportion * Math.Log(proportion, 2);
    }

    return result;
}
/// <summary>
/// Gets the best attribute for the examples, i.e. the attribute with the highest
/// value returned by <c>InformationGain</c>.
/// </summary>
/// <param name="examples">Examples list (as data objects)</param>
/// <param name="attributes">Attributes in the current example list</param>
/// <param name="data">Data object</param>
/// <returns>
/// The attribute with the highest gain (the first one wins ties). Whenever
/// <paramref name="attributes"/> is non-empty the result is one of its members,
/// even if every gain is NaN; an empty string is returned only for an empty list.
/// </returns>
string GetBestAttribute(ArrayList examples, ArrayList attributes, ID3Data data)
{
    string output = "";
    double best = double.MinValue;
    bool hasCandidate = false;

    foreach (string value in attributes)
    {
        double temp = InformationGain(examples, value, Entropy(examples, value, data), data);

        // Any comparison with NaN is false, so a plain "temp > best" would never pick
        // an attribute when the gains are NaN (e.g. from a 0/0 division). Always accept
        // the first candidate and let a real number displace a NaN.
        bool isBetter = !hasCandidate
            || (double.IsNaN(best) && !double.IsNaN(temp))
            || temp > best;

        if (isBetter)
        {
            output = value;
            best = temp;
            hasCandidate = true;
        }
    }

    return output;
}
/// <summary>
/// ID3 Algorithm: recursively builds a decision tree for the given examples.
/// </summary>
/// <remarks>
/// Outline:
/// <list type="bullet">
/// <item><description>With no attributes left, return a leaf labelled with the most common
/// category of the examples.</description></item>
/// <item><description>Otherwise pick the best attribute, label this node with it, and add one
/// branch per value of that attribute, built from the examples in that value's subset.</description></item>
/// <item><description>A value with no matching examples gets a leaf labelled with the most
/// common category of the current examples.</description></item>
/// </list>
/// The <paramref name="attributes"/> list is never modified; each level works on its own copy
/// so that sibling branches (and the caller) still see the full list.
/// </remarks>
/// <param name="examples">Example set to be used</param>
/// <param name="attributes">Attributes still available for splitting</param>
/// <param name="data">ID3 Data object (from the parser)</param>
/// <returns>Root node of the tree</returns>
public Node ID3(ArrayList examples, ArrayList attributes, ID3Data data)
{
    // No attributes left to split on: fall back to the majority category.
    if (attributes.Count == 0)
    {
        string mostCommon = GetMostCommonCategory(examples);
        return new Node(Label: mostCommon, Decision: null);
    }

    string bestAttribute = GetBestAttribute(examples, attributes, data); // Best attribute for the current examples
    Node tree = new Node(Label: bestAttribute, Decision: null);          // This node's decision attribute

    // Copy before removing: assigning the reference would alias the caller's list and
    // strip attributes from every sibling branch as the recursion proceeds.
    ArrayList remainingAttributes = new ArrayList(attributes);
    remainingAttributes.Remove(bestAttribute);

    foreach (string value in data.Attributes[bestAttribute])
    {
        ArrayList subset = SubSet(examples, value); // Examples selected for this value

        // If any summarized count covers every example, the whole call returns a single
        // leaf labelled with that key.
        // NOTE(review): this returns from inside the loop and discards branches already
        // added to 'tree'; presumably it is the "all examples share one category" stop
        // condition -- confirm against SummarizeExamplesValue.
        Dictionary<string, int> dictionary = SummarizeExamplesValue(examples, value, data);
        foreach (KeyValuePair<string, int> kvp in dictionary)
        {
            if (kvp.Value == examples.Count)
            {
                return new Node(Label: kvp.Key, Decision: null);
            }
        }

        Node subtree;
        if (subset.Count == 0)
        {
            // Nothing to learn from for this value: use the parent's majority category.
            subtree = new Node(Label: GetMostCommonCategory(examples), Decision: null);
        }
        else
        {
            subtree = ID3(subset, remainingAttributes, data); // Subtree for this value
        }

        subtree.Decision = value;
        tree.AddBranch(subtree); // Adds a child to the tree (a branch)
    }

    return tree;
}
/// <summary>
/// Counts, for each value of the target attribute, how many of the given examples
/// contain that value in their attribute list.
/// </summary>
/// <param name="examples">Examples to check</param>
/// <param name="targetAttribute">Attribute whose values are looked up in <c>data.Attributes</c></param>
/// <param name="data">Data object</param>
/// <returns>
/// Dictionary mapping each value that occurs in at least one example to its number
/// of occurrences; values that never occur have no entry.
/// </returns>
Dictionary <string, int> SummarizeExamplesAttribute(ArrayList examples, string targetAttribute, ID3Data data)
{
    Dictionary <string, int> tally = new Dictionary <string, int>();

    foreach (string candidate in data.Attributes[targetAttribute])
    {
        // Count the examples carrying this value.
        int matches = 0;
        foreach (Data sample in examples)
        {
            if (sample.Attributes.Contains(candidate))
            {
                matches++;
            }
        }

        // Only values seen at least once get an entry.
        if (matches == 0)
        {
            continue;
        }

        // Merge with any earlier entry in case the same value is listed twice.
        int previous;
        tally.TryGetValue(candidate, out previous);
        tally[candidate] = previous + matches;
    }

    return tally;
}
/// <summary>
/// Information gain of an attribute: the supplied entropy minus the size-weighted
/// entropy of each subset produced by <c>SubSet</c> for the attribute's values.
/// </summary>
/// <param name="examples">Examples being split</param>
/// <param name="attribute">Attribute whose values (from <c>data.Attributes</c>) define the subsets</param>
/// <param name="entropyOfSet">Entropy of <paramref name="examples"/> before the split</param>
/// <param name="data">Data object</param>
/// <returns>
/// <c>entropyOfSet - sum(|subset| / |examples| * Entropy(subset))</c>;
/// <paramref name="entropyOfSet"/> unchanged when <paramref name="examples"/> is empty.
/// </returns>
double InformationGain(ArrayList examples, string attribute, double entropyOfSet, ID3Data data)
{
    // An empty example set has nothing to split; dividing by its size would give 0/0 = NaN.
    if (examples.Count == 0)
    {
        return entropyOfSet;
    }

    double gain = entropyOfSet; // The current gain

    // Keep the arithmetic in double precision; casting to float would truncate the weights.
    double total = examples.Count;

    // NOTE(review): the subset's entropy is measured on the same attribute used to build
    // the subset -- confirm this is the intended target.
    foreach (string value in data.Attributes[attribute]) // For each value the attribute can take
    {
        ArrayList subset = SubSet(examples, value);
        double weight = subset.Count / total;
        gain -= weight * Entropy(subset, attribute, data);
    }

    return gain;
}