/// <summary>
/// Parses the data rows of the input into tuples, discretizes every continuous attribute into a
/// two-valued attribute (" &lt;= split" / " &gt; split"), and builds a decision tree from the result.
/// </summary>
/// <param name="inputLines">All input lines. Data rows start at index <c>attributeArray.Count + 2</c>;
/// each row is space-separated and its last token is passed to the tuple as its first constructor argument
/// (presumably the class label).</param>
/// <param name="attributeArray">Attribute descriptors. The <c>Values</c> of each continuous attribute are
/// replaced in place by the two discretized labels.</param>
/// <param name="attributeNames">Attribute names handed to every tuple that is created.</param>
/// <param name="continuousAttributeNames">Names of the continuous attributes, in the same order in which
/// their columns appear in a data row.</param>
/// <param name="continuousAttributeValues">Output dictionary: receives, per continuous attribute name,
/// the sorted list of values read from the data rows.</param>
/// <param name="continousIndexes">Column indexes (within a data row) that hold continuous values.</param>
/// <param name="userLabels">Class-label attribute forwarded to the information computations and the tree.</param>
/// <returns>The decision tree built from the parsed tuples.</returns>
public static Tree BuildTree(string[] inputLines, List<Attribute> attributeArray, string[] attributeNames, List<string> continuousAttributeNames, Dictionary<string, List<double>> continuousAttributeValues, List<int> continousIndexes, Attribute userLabels)
{
    List<Tuple> tuples = new List<Tuple>();

    // One value list per continuous attribute, filled while the rows are parsed.
    List<double>[] continuousVals = new List<double>[continuousAttributeNames.Count];
    for (int i = 0; i < continuousVals.Length; ++i)
    {
        continuousVals[i] = new List<double>();
    }

    // The first (attributeArray.Count + 2) lines are skipped; everything after is a data row.
    int firstDataLine = attributeArray.Count + 2;
    for (int lineIndex = firstDataLine; lineIndex < inputLines.Length; ++lineIndex)
    {
        string[] tupleInfo = inputLines[lineIndex].Split(' ', StringSplitOptions.RemoveEmptyEntries);
        if (tupleInfo.Length == 0)
        {
            // Blank line (e.g. trailing newline): nothing to parse, and a negative array size would throw.
            continue;
        }

        string[] tupleAttributes = new string[tupleInfo.Length - 1];
        Array.Copy(tupleInfo, 0, tupleAttributes, 0, tupleAttributes.Length);
        tuples.Add(new Tuple(tupleInfo[tupleInfo.Length - 1], attributeNames, tupleAttributes));

        // The n-th continuous column encountered in the row feeds the n-th continuous attribute.
        int continuousCount = 0;
        for (int j = 0; j < tupleAttributes.Length; ++j)
        {
            if (continousIndexes.Contains(j))
            {
                continuousVals[continuousCount].Add(Convert.ToDouble(tupleAttributes[j]));
                ++continuousCount;
            }
        }
    }

    for (int i = 0; i < continuousAttributeNames.Count; i++)
    {
        continuousAttributeValues.Add(continuousAttributeNames[i], continuousVals[i]);
        continuousAttributeValues[continuousAttributeNames[i]].Sort();
    }

    foreach (string contName in continuousAttributeNames)
    {
        List<double> sortedValues = continuousAttributeValues[contName];
        if (sortedValues.Count == 0)
        {
            // No data rows were read for this attribute, so there is nothing to discretize.
            continue;
        }

        // Parse each tuple's value once instead of once per candidate split.
        double[] tupleValues = new double[tuples.Count];
        for (int i = 0; i < tuples.Count; ++i)
        {
            tupleValues[i] = Convert.ToDouble(tuples[i].AttributeValues[contName]);
        }

        double split = FindBestSplit(tuples, tupleValues, sortedValues, userLabels);

        // A split of exactly 0 is printed without the extra decimal place ("0" rather than "0.0").
        string splitText = split != 0 ? split.ToString("F1") : split.ToString();
        string lowLabel = " <= " + splitText;
        string highLabel = " > " + splitText;

        for (int i = 0; i < tuples.Count; ++i)
        {
            tuples[i].AttributeValues[contName] = tupleValues[i] <= split ? lowLabel : highLabel;
        }

        // As before, an attribute name missing from attributeArray yields index -1 and the
        // list indexer below throws.
        int contAttributeIndex = attributeArray.FindIndex(a => a.AttributeName == contName);
        attributeArray[contAttributeIndex].Values = new string[] { lowLabel, highLabel };
    }

    //build the tree
    Tree decisionTree = new Tree(string.Empty, string.Empty);
    decisionTree.TreeRecursion(tuples, decisionTree.Root, attributeArray, userLabels, continuousAttributeNames);
    return decisionTree;
}

// Returns the candidate split (midpoint between adjacent sorted values) whose two-way partition of the
// tuples has the lowest expected information; with a single value, that value itself is the split.
private static double FindBestSplit(List<Tuple> tuples, double[] tupleValues, List<double> sortedValues, Attribute userLabels)
{
    double bestSplit = sortedValues.Count > 1
        ? (sortedValues[0] + sortedValues[1]) / 2
        : sortedValues[0];
    double minExpectInfo = double.MaxValue;

    for (int i = 0; i < sortedValues.Count - 1; ++i)
    {
        // Midpoint between neighbouring values (not between the minimum and value i + 1).
        double candidate = (sortedValues[i] + sortedValues[i + 1]) / 2;

        List<Tuple>[] halves = { new List<Tuple>(), new List<Tuple>() };
        for (int t = 0; t < tuples.Count; ++t)
        {
            halves[tupleValues[t] <= candidate ? 0 : 1].Add(tuples[t]);
        }

        double expectedInfo = Equations.ExpectedInfoWithPartition(tuples, halves, userLabels);
        if (expectedInfo < minExpectInfo)
        {
            minExpectInfo = expectedInfo;
            bestSplit = candidate;
        }
    }

    return bestSplit;
}
//Member Functions

/// <summary>
/// Recursively builds the subtree below <paramref name="parentNode"/>: picks the attribute whose
/// partition of the tuples has the lowest expected information, adds one child per attribute value,
/// and recurses into each child with that attribute removed from the candidate list.
/// </summary>
/// <param name="tupleArray">Tuples that reached this node.</param>
/// <param name="parentNode">Node being filled in; its <c>NextAttribute</c> receives either the chosen
/// attribute name or, for a leaf, a class label.</param>
/// <param name="attributeArray">Attributes still available for splitting. Not modified; a pruned copy
/// is passed to the recursive calls.</param>
/// <param name="userLabels">Class-label attribute; its <c>Values</c> enumerate the possible classes.</param>
/// <param name="continuousAttributeNames">Names of discretized continuous attributes. Their child labels
/// are used as-is, while other attributes get an "=" prefix.</param>
/// <param name="prevNode">Not read by this method; recursive calls pass the current node here.</param>
/// <param name="childIndex">Position of <paramref name="parentNode"/> among its parent's children, or -1
/// for the initial call. A non-negative value lets an empty branch request its own removal through
/// <c>pruned_status</c>.</param>
public void TreeRecursion(List<Tuple> tupleArray, TreeNode parentNode, List<Attribute> attributeArray, Attribute userLabels, List<string> continuousAttributeNames, TreeNode prevNode = null, int childIndex = -1)
{
    // No tuples left to branch off of.
    if (tupleArray.Count == 0)
    {
        if (childIndex != -1)
        {
            // Signal the caller's loop to remove this child.
            pruned_status = true;
        }
        else
        {
            parentNode.NextAttribute = "No data left";
        }
        return;
    }

    // Attributes exhausted: label the leaf with the most frequent class
    // (ties go to the label that comes first in userLabels.Values).
    if (attributeArray.Count == 0)
    {
        Dictionary<string, int> classCount = new Dictionary<string, int>();
        foreach (string tupleClass in userLabels.Values)
        {
            classCount.Add(tupleClass, 0);
        }
        foreach (Tuple tuple in tupleArray)
        {
            ++classCount[tuple.Class];
        }

        int maxFrequency = int.MinValue;
        string modeClass = string.Empty;
        foreach (string tupleClass in userLabels.Values)
        {
            if (classCount[tupleClass] > maxFrequency)
            {
                maxFrequency = classCount[tupleClass];
                modeClass = tupleClass;
            }
        }
        parentNode.NextAttribute = modeClass;
        return;
    }

    // All the tuples are of the same class: this node is a leaf for that class.
    if (Equations.SubListClassesAreSame(tupleArray))
    {
        parentNode.NextAttribute = tupleArray[0].Class;
        return;
    }

    // Choose the attribute with the smallest expected information, keeping its partition so it
    // does not have to be rebuilt afterwards.
    int bestAttributeIndex = -1;
    List<Tuple>[] bestPartition = null;
    double minExpectInfo = double.MaxValue;
    for (int i = 0; i < attributeArray.Count; ++i)
    {
        List<Tuple>[] partition = PartitionByAttribute(tupleArray, attributeArray[i]);
        double expectedInfo = Equations.ExpectedInfoWithPartition(tupleArray, partition, userLabels);
        if (expectedInfo < minExpectInfo)
        {
            minExpectInfo = expectedInfo;
            bestAttributeIndex = i;
            bestPartition = partition;
        }
    }

    Attribute chosen = attributeArray[bestAttributeIndex];
    string bestAttribute = chosen.AttributeName;
    parentNode.NextAttribute = bestAttribute;

    List<Attribute> prunedAttributeArray = new List<Attribute>(attributeArray);
    prunedAttributeArray.RemoveAt(bestAttributeIndex);

    bool isContinuous = continuousAttributeNames.Contains(bestAttribute);

    // childSlot is the position the next child will occupy in parentNode.Children; it only advances
    // when a child is kept. After a pruned child is removed the slot is free again, so it must NOT be
    // reset to 0 - doing so would make the next recursion overwrite an earlier, already-built sibling.
    int childSlot = 0;
    for (int valueIndex = 0; valueIndex < bestPartition.Length; ++valueIndex)
    {
        string childLabel = isContinuous
            ? chosen.Values[valueIndex] + ":"
            : "=" + chosen.Values[valueIndex] + ":";
        parentNode.AddChild(childLabel, string.Empty);

        TreeRecursion(bestPartition[valueIndex], parentNode.Children[childSlot], prunedAttributeArray, userLabels, continuousAttributeNames, parentNode, childSlot);

        if (pruned_status)
        {
            parentNode.Children.RemoveAt(childSlot);
            pruned_status = false;
            continue;
        }

        ++childSlot;
    }
}

// Groups the tuples by their value for the given attribute; the result has one list per entry of
// attribute.Values, in that order (lists for values no tuple has are empty).
private static List<Tuple>[] PartitionByAttribute(List<Tuple> tuples, Attribute attribute)
{
    Dictionary<string, List<Tuple>> byValue = new Dictionary<string, List<Tuple>>();
    foreach (string value in attribute.Values)
    {
        byValue.Add(value, new List<Tuple>());
    }
    foreach (Tuple t in tuples)
    {
        byValue[t.AttributeValues[attribute.AttributeName]].Add(t);
    }

    List<Tuple>[] partition = new List<Tuple>[byValue.Count];
    for (int i = 0; i < partition.Length; ++i)
    {
        partition[i] = byValue[attribute.Values[i]];
    }
    return partition;
}