Ejemplo n.º 1
0
        /// <summary>
        /// Parses the data rows of the input into tuples, replaces every continuous attribute
        /// with a two-valued (at-or-below / above a threshold) attribute, and builds a decision
        /// tree from the result.
        /// </summary>
        /// <remarks>
        /// For each continuous attribute the threshold is chosen among the midpoints of adjacent
        /// sorted values as the one with the smallest value returned by
        /// <c>Equations.ExpectedInfoWithPartition</c>. Both the tuples and the matching entry of
        /// <paramref name="attributeArray"/> are rewritten in place to use the two threshold labels.
        /// </remarks>
        /// <param name="inputLines">Input lines. Rows starting at index <c>attributeArray.Count + 2</c>
        /// are parsed as space-separated tuples whose last token is handed to the tuple as its first
        /// constructor argument (presumably the class label). Blank rows are skipped.</param>
        /// <param name="attributeArray">Attributes to split on. The <c>Values</c> of every continuous
        /// attribute are replaced by the two threshold labels.</param>
        /// <param name="attributeNames">Attribute names passed to every tuple that is created.</param>
        /// <param name="continuousAttributeNames">Names of the continuous attributes. Assumed to be in
        /// the same order as the ascending column indexes in <paramref name="continousIndexes"/>.</param>
        /// <param name="continuousAttributeValues">Receives, per continuous attribute name, the sorted
        /// list of values read from the data rows. Must not already contain those names.</param>
        /// <param name="continousIndexes">Column indexes (within a data row) that hold continuous values.</param>
        /// <param name="userLabels">Class-label attribute forwarded to the information measure and to the tree builder.</param>
        /// <returns>The decision tree built from the parsed tuples.</returns>
        public static Tree BuildTree(string[] inputLines, List<Attribute> attributeArray, string[] attributeNames, List<string> continuousAttributeNames,
                                     Dictionary<string, List<double>> continuousAttributeValues, List<int> continousIndexes, Attribute userLabels)
        {
            List<Tuple> tuples = new List<Tuple>();

            // One value list per continuous attribute, in the order of continuousAttributeNames.
            List<double>[] continuousVals = new List<double>[continuousAttributeNames.Count];
            for (int i = 0; i < continuousVals.Length; ++i)
            {
                continuousVals[i] = new List<double>();
            }

            // Data rows start after the attributeArray.Count + 2 leading lines.
            int firstDataLine = attributeArray.Count + 2;
            for (int line = firstDataLine; line < inputLines.Length; ++line)
            {
                string[] tupleInfo = inputLines[line].Split(' ', StringSplitOptions.RemoveEmptyEntries);
                if (tupleInfo.Length == 0)
                {
                    // A blank row carries no tuple; without this guard the array allocation below would throw.
                    continue;
                }

                // Everything but the last token is an attribute value; the last token goes to the tuple separately.
                string[] tupleAttributes = new string[tupleInfo.Length - 1];
                Array.Copy(tupleInfo, 0, tupleAttributes, 0, tupleAttributes.Length);
                tuples.Add(new Tuple(tupleInfo[tupleInfo.Length - 1], attributeNames, tupleAttributes));

                // The n-th continuous column encountered feeds the n-th continuous attribute.
                int continuousCount = 0;
                for (int j = 0; j < tupleAttributes.Length; ++j)
                {
                    if (continousIndexes.Contains(j))
                    {
                        continuousVals[continuousCount].Add(Convert.ToDouble(tupleAttributes[j]));
                        ++continuousCount;
                    }
                }
            }

            for (int i = 0; i < continuousAttributeNames.Count; i++)
            {
                continuousAttributeValues.Add(continuousAttributeNames[i], continuousVals[i]);
                continuousVals[i].Sort(); // same list instance that was just stored in the dictionary
            }

            foreach (string contName in continuousAttributeNames)
            {
                List<double> sortedValues = continuousAttributeValues[contName];

                // Parse each tuple's value once instead of once per candidate threshold.
                double[] tupleValues = new double[tuples.Count];
                for (int t = 0; t < tuples.Count; ++t)
                {
                    tupleValues[t] = Convert.ToDouble(tuples[t].AttributeValues[contName]);
                }

                // Default threshold; also covers inputs with fewer than two values,
                // where there is no adjacent pair to take a midpoint of.
                double split = sortedValues.Count > 1 ? (sortedValues[0] + sortedValues[1]) / 2
                             : sortedValues.Count == 1 ? sortedValues[0]
                             : 0.0;
                double minExpectInfo = Double.MaxValue;

                for (int i = 0; i < sortedValues.Count - 1; ++i)
                {
                    // Candidate threshold: midpoint between two ADJACENT sorted values.
                    double splitPoint = (sortedValues[i] + sortedValues[i + 1]) / 2;
                    List<Tuple>[] subList = { new List<Tuple>(), new List<Tuple>() };

                    for (int t = 0; t < tuples.Count; ++t)
                    {
                        (tupleValues[t] <= splitPoint ? subList[0] : subList[1]).Add(tuples[t]);
                    }

                    double expectedInfo = Equations.ExpectedInfoWithPartition(tuples, subList, userLabels);

                    if (expectedInfo < minExpectInfo)
                    {
                        minExpectInfo = expectedInfo;
                        split         = splitPoint;
                    }
                }

                // A threshold of exactly zero is printed without the extra decimal place ("0", not "0.0").
                string splitText = split != 0 ? split.ToString("F1") : split.ToString();
                string lowLabel  = " <= " + splitText;
                string highLabel = " > " + splitText;

                // Replace each tuple's numeric value by the label of the side it falls on.
                for (int t = 0; t < tuples.Count; ++t)
                {
                    tuples[t].AttributeValues[contName] = tupleValues[t] <= split ? lowLabel : highLabel;
                }

                // The attribute itself now has exactly these two values.
                int contAttributeIndex = attributeArray.FindIndex(a => a.AttributeName == contName);
                attributeArray[contAttributeIndex].Values = new string[] { lowLabel, highLabel };
            }

            //build the tree
            Tree decisionTree = new Tree(string.Empty, string.Empty);

            decisionTree.TreeRecursion(tuples, decisionTree.Root, attributeArray, userLabels, continuousAttributeNames);

            return decisionTree;
        }
Ejemplo n.º 2
0
        //Member Functions

        /// <summary>
        /// Recursively builds the subtree below <paramref name="parentNode"/> by splitting
        /// <paramref name="tupleArray"/> on the attribute with the smallest value returned by
        /// <c>Equations.ExpectedInfoWithPartition</c>.
        /// </summary>
        /// <remarks>
        /// Terminal cases: no tuples (the child is flagged for pruning via <c>pruned_status</c>, or the
        /// root is labelled "No data left"); no attributes left (the node is labelled with the most
        /// frequent class); all tuples share one class (the node is labelled with that class).
        /// </remarks>
        /// <param name="tupleArray">Tuples that reached this node.</param>
        /// <param name="parentNode">Node to label and, when splitting, to attach children to.</param>
        /// <param name="attributeArray">Attributes still available for splitting. Not modified; a copy
        /// without the chosen attribute is passed down.</param>
        /// <param name="userLabels">Class-label attribute; its <c>Values</c> enumerate the possible classes.</param>
        /// <param name="continuousAttributeNames">Names of attributes whose branch labels are used as-is
        /// (without the leading "=").</param>
        /// <param name="prevNode">Node whose child <paramref name="parentNode"/> is. Not read by this method.</param>
        /// <param name="childIndex">Position of <paramref name="parentNode"/> among its parent's children,
        /// or -1 for the initial (root) call.</param>
        public void TreeRecursion(List<Tuple> tupleArray, TreeNode parentNode, List<Attribute> attributeArray, Attribute userLabels, List<string> continuousAttributeNames, TreeNode prevNode = null, int childIndex = -1)
        {
            if (tupleArray.Count == 0) //no tuples left to branch off of
            {
                if (childIndex != -1)
                {
                    // Tell the caller's loop to remove this (empty) child.
                    pruned_status = true;
                }
                else
                {
                    parentNode.NextAttribute = "No data left";
                }
                return;
            }

            if (attributeArray.Count == 0)
            {
                // Nothing left to split on: label the node with the most frequent class.
                Dictionary<string, int> classCount = new Dictionary<string, int>();

                foreach (string tupleClass in userLabels.Values)
                {
                    classCount.Add(tupleClass, 0);
                }

                foreach (Tuple tuple in tupleArray)
                {
                    ++classCount[tuple.Class];
                }

                int    maxFrequency = int.MinValue;
                string modeClass    = string.Empty;

                foreach (string tupleClass in userLabels.Values)
                {
                    if (classCount[tupleClass] > maxFrequency)
                    {
                        maxFrequency = classCount[tupleClass];
                        modeClass    = tupleClass;
                    }
                }

                parentNode.NextAttribute = modeClass;
                return;
            }

            if (Equations.SubListClassesAreSame(tupleArray)) //all the tuples are of the same class
            {
                parentNode.NextAttribute = tupleArray[0].Class;
                return;
            }

            // Pick the attribute with the smallest expected information, remembering its
            // partition so it does not have to be rebuilt afterwards.
            int            bestAttributeIndex = -1;
            List<Tuple>[]  subListArray       = null;
            double         minExpectInfo      = double.MaxValue;

            for (int a = 0; a < attributeArray.Count; ++a)
            {
                Attribute attribute = attributeArray[a];
                Dictionary<string, List<Tuple>> attrSubLists = new Dictionary<string, List<Tuple>>();

                foreach (string value in attribute.Values)
                {
                    attrSubLists.Add(value, new List<Tuple>());
                }
                foreach (Tuple t in tupleArray)
                {
                    attrSubLists[t.AttributeValues[attribute.AttributeName]].Add(t);
                }

                // Order the sub-lists like attribute.Values so index i matches Values[i].
                List<Tuple>[] attrSubListArray = new List<Tuple>[attrSubLists.Count];
                for (int i = 0; i < attrSubListArray.Length; ++i)
                {
                    attrSubListArray[i] = attrSubLists[attribute.Values[i]];
                }

                double expectedInfo = Equations.ExpectedInfoWithPartition(tupleArray, attrSubListArray, userLabels);
                bool   improves     = expectedInfo < minExpectInfo;

                // The first attribute is kept as a fallback even if its measure never compares
                // below the initial maximum (e.g. NaN), so a split attribute always exists.
                if (improves || bestAttributeIndex < 0)
                {
                    bestAttributeIndex = a;
                    subListArray       = attrSubListArray;
                    if (improves)
                    {
                        minExpectInfo = expectedInfo;
                    }
                }
            }

            Attribute best = attributeArray[bestAttributeIndex];
            parentNode.NextAttribute = best.AttributeName;

            List<Attribute> prunedAttributeArray = new List<Attribute>(attributeArray);
            prunedAttributeArray.RemoveAt(bestAttributeIndex);

            bool isContinuous = continuousAttributeNames.Contains(best.AttributeName);

            // index is the position the next child will occupy in parentNode.Children
            // (assumes AddChild appends - the indexing below relies on it).
            int index = 0;
            for (int iteration = 0; iteration < subListArray.Length; ++iteration)
            {
                string branchLabel = isContinuous
                    ? best.Values[iteration] + ":"
                    : "=" + best.Values[iteration] + ":";
                parentNode.AddChild(branchLabel, string.Empty);

                TreeRecursion(subListArray[iteration], parentNode.Children[index], prunedAttributeArray, userLabels, continuousAttributeNames, parentNode, index);

                if (pruned_status)
                {
                    // The child had no tuples: drop it and keep index where it is, because the
                    // next child will be appended into the slot that was just freed.
                    parentNode.Children.RemoveAt(index);
                    pruned_status = false;
                }
                else
                {
                    ++index;
                }
            }
        }