Exemple #1
0
        /// <summary>
        /// Computes the information gain of the column at <paramref name="colIndex"/>:
        /// the entropy of the whole dataset minus the size-weighted sum of the
        /// two-outcome entropies of every distinct value found in that column.
        /// </summary>
        /// <param name="data">Table whose rows are evaluated.</param>
        /// <param name="colIndex">Index of the attribute column to evaluate.</param>
        /// <param name="entropyOfDataset">Entropy of the complete table.</param>
        /// <returns>The information gain for the column.</returns>
        private static double GetGainForAllAttributes(TrainingData data, int colIndex, double entropyOfDataset)
        {
            var rowCount        = data.Rows.Length;
            var valueCounts     = GetAmountOfEdgesAndTotalPositivResults(data, colIndex);
            var weightedEntropy = 0.0;

            foreach (var counts in valueCounts)
            {
                // counts[0, 0]: rows holding this value, counts[0, 1]: those counted as positive
                var occurrences   = counts[0, 0];
                var positiveShare = counts[0, 1] / (double)occurrences;
                var negativeShare = (occurrences - counts[0, 1]) / (double)occurrences;

                // a share of exactly 0 would feed 0 into Math.Log; such a subset contributes no entropy
                var partEntropy = (positiveShare == 0 || negativeShare == 0)
                                  ? 0.0
                                  : -positiveShare * Math.Log(positiveShare, 2) - negativeShare * Math.Log(negativeShare, 2);

                weightedEntropy += occurrences / (double)rowCount * partEntropy;
            }

            return entropyOfDataset - weightedEntropy;
        }
Exemple #2
0
        /// <summary>
        /// Counts, for every distinct value in the given column, how many rows hold that value
        /// and how many of those rows are "positive". A row counts as positive when its last
        /// column equals the last column of the first row of the table.
        /// </summary>
        /// <param name="data">Table to analyse.</param>
        /// <param name="indexOfColumnToCheck">Index of the column whose values are counted.</param>
        /// <returns>
        /// One 1x2 array per distinct value, in order of first occurrence:
        /// [0, 0] is the number of rows with the value, [0, 1] the number of positive rows among them.
        /// The list is empty when the table has no rows.
        /// </returns>
        private static List <int[, ]> GetAmountOfEdgesAndTotalPositivResults(TrainingData data, int indexOfColumnToCheck)
        {
            var foundValues = new List <int[, ]>();

            // without rows there is neither a value to count nor a reference label to read
            if (data.Rows.Length == 0)
            {
                return(foundValues);
            }

            // the reference label is the same for every comparison, so read it only once
            var resultColumn  = data.Columns.Length - 1;
            var positiveLabel = data.Rows[0][resultColumn];

            // one pass over the rows; GroupBy yields the groups in order of first key occurrence
            foreach (var rowsWithValue in data.Rows.GroupBy(row => row[indexOfColumnToCheck]))
            {
                var amount         = 0;
                var positiveAmount = 0;

                foreach (var row in rowsWithValue)
                {
                    amount++;

                    if (row[resultColumn].ToString().Equals(positiveLabel))
                    {
                        positiveAmount++;
                    }
                }

                foundValues.Add(new int[, ] { { amount, positiveAmount } });
            }

            return(foundValues);
        }
Exemple #3
0
        /// <summary>
        /// Builds the node for the attribute column with the highest information gain.
        /// Every column except the last one is treated as an attribute; on equal gain the
        /// column with the lower index wins.
        /// </summary>
        /// <param name="data">Table to pick the splitting attribute from.</param>
        /// <param name="edge">Edge label passed on to the created node.</param>
        /// <returns>A node named after the chosen column, carrying its index and attribute data.</returns>
        /// <exception cref="ArgumentOutOfRangeException">
        /// The table has no attribute column, i.e. fewer than two columns.
        /// </exception>
        private static TreeNode GetRootNode(TrainingData data, string edge)
        {
            var attributeCount = data.Columns.Length - 1;

            // Without this guard the selection below would end with index -1 and fail on the
            // list indexer with the same exception type but no hint about the actual cause.
            if (attributeCount < 1)
            {
                throw new ArgumentOutOfRangeException("data", "Training data needs at least one attribute column in addition to the result column.");
            }

            // Collect the name and the distinct values of every attribute column
            var attributes = new List <NodeAttribute>();

            for (var i = 0; i < attributeCount; i++)
            {
                var differentAttributenames = data.Rows.Select(x => x[i]).Distinct().ToList();
                attributes.Add(new NodeAttribute(data.Columns[i].ToString(), differentAttributenames));
            }

            // Entropy of the whole table, shared by all gain calculations
            var tableEntropy = CalculateEntropy(data);

            // The guard above guarantees that index 0 exists
            var highestInformationGainIndex = 0;
            var highestInformationGain      = double.MinValue;

            for (var i = 0; i < attributes.Count; i++)
            {
                attributes[i].InformationGain = GetGainForAllAttributes(data, i, tableEntropy);

                // strict comparison keeps the first column on ties
                if (attributes[i].InformationGain > highestInformationGain)
                {
                    highestInformationGain      = attributes[i].InformationGain;
                    highestInformationGainIndex = i;
                }
            }

            var best = attributes[highestInformationGainIndex];

            return(new TreeNode(best.Name, highestInformationGainIndex, best, edge));
        }
Exemple #4
0
        /// <summary>
        /// Decides whether all rows whose value in the root's column equals
        /// <paramref name="attributeToCheck"/> share the same value in the last column.
        /// If they do, a leaf node with that value is appended to the root's child nodes.
        /// </summary>
        /// <param name="root">Node whose column is inspected and that receives the leaf.</param>
        /// <param name="data">Table the root was built from.</param>
        /// <param name="attributeToCheck">Attribute value (edge) to inspect.</param>
        /// <returns>True when a leaf was added, false when the matching rows disagree.</returns>
        /// <exception cref="ArgumentOutOfRangeException">No row matches <paramref name="attributeToCheck"/>.</exception>
        private static bool CheckIfIsLeaf(TreeNode root, TrainingData data, string attributeToCheck)
        {
            // last-column values of every row that carries the attribute value in question
            var outcomes = data.Rows
                           .Where(row => row[root.TableIndex].ToString().Equals(attributeToCheck))
                           .Select(row => row[data.Columns.Length - 1].ToString())
                           .ToList();

            // any value differing from the first one means the subset is not pure
            if (outcomes.Skip(1).Any(outcome => outcome != outcomes[0]))
            {
                return(false);
            }

            // pure subset: attach the leaf with the shared value and the edge leading to it
            root.ChildNodes.Add(new TreeNode(true, outcomes[0], attributeToCheck));

            return(true);
        }
Exemple #5
0
        /// <summary>
        /// Recursively builds a decision tree for the given table. The attribute with the highest
        /// information gain becomes the node; every distinct value of that attribute becomes an
        /// edge that ends either in a leaf or in a subtree learned from the matching rows.
        /// </summary>
        /// <param name="data">Table to learn from; the last column holds the result.</param>
        /// <param name="edgeName">Label of the edge that leads to the returned node.</param>
        /// <returns>The node of the (sub)tree built for <paramref name="data"/>.</returns>
        public static TreeNode Learn(TrainingData data, string edgeName)
        {
            var root = GetRootNode(data, edgeName);

            foreach (var item in root.NodeAttribute.DifferentAttributeNames)
            {
                // CheckIfIsLeaf attaches the leaf itself when all matching rows agree
                if (CheckIfIsLeaf(root, data, item))
                {
                    continue;
                }

                var matchingRows = data.Rows.Where(x => x[root.TableIndex].Equals(item)).ToArray();

                // NOTE(review): this drops every column up to and including the chosen one, so
                // unused attributes left of it are lost as well. Kept unchanged because the
                // consumers of TreeNode.TableIndex are not visible here - confirm before changing.
                var remainingColumns = data.Columns.Skip(root.TableIndex + 1).ToArray();

                // Only the result column is left: there is nothing to split on, and recursing
                // would fail in GetRootNode. Fall back to a leaf with the most frequent result
                // among the matching rows (first one encountered wins on ties).
                if (remainingColumns.Length < 2)
                {
                    var resultColumn = data.Columns.Length - 1;
                    var majority     = matchingRows
                                       .GroupBy(x => x[resultColumn].ToString())
                                       .OrderByDescending(g => g.Count())
                                       .First()
                                       .Key;

                    root.ChildNodes.Add(new TreeNode(true, majority, item));
                    continue;
                }

                var reducedTable = new TrainingData
                {
                    Columns = remainingColumns,
                    Rows    = matchingRows
                              .Select(x => x.Skip(root.TableIndex + 1).ToArray())
                              .ToArray()
                };

                root.ChildNodes.Add(Learn(reducedTable, item));
            }

            return(root);
        }
Exemple #6
0
        /// <summary>
        /// Computes the entropy of the table with respect to its last column: the sum of
        /// -p * log2(p) over the relative frequency p of every distinct value in that column.
        /// </summary>
        /// <param name="data">Table whose last column is evaluated.</param>
        /// <returns>The entropy of the last column; 0 when there are no distinct values.</returns>
        private static double CalculateEntropy(TrainingData data)
        {
            var rowCount = data.Rows.Length;
            var entropy  = 0.0;

            foreach (var counts in GetAmountOfEdgesAndTotalPositivResults(data, data.Columns.Length - 1))
            {
                // counts[0, 0] is the number of rows carrying this value
                var share = counts[0, 0] / (double)rowCount;
                entropy += -share * Math.Log(share, 2);
            }

            return(entropy);
        }
        /// <summary>
        /// Reads a semicolon-separated text file into a <c>TrainingData</c> table. The first line
        /// that contains at least one value supplies the column names, every following such line
        /// one row. Empty fields are dropped; blank lines and lines consisting only of separators
        /// are skipped.
        /// </summary>
        /// <param name="filePath">Path of the file to read.</param>
        /// <returns>
        /// The parsed table. Columns and Rows are empty arrays when the file contains no values.
        /// </returns>
        public TrainingData Read(string filePath)
        {
            var separator = new[] { ';' };

            // Lines without any value would become zero-length rows and break column indexing later on
            var records = File.ReadAllLines(filePath)
                          .Where(line => !String.IsNullOrWhiteSpace(line))
                          .Select(line => line.Split(separator, StringSplitOptions.RemoveEmptyEntries))
                          .Where(fields => fields.Length > 0)
                          .ToArray();

            // An empty file previously requested an array of size -1
            if (records.Length == 0)
            {
                return(new TrainingData
                {
                    Columns = new string[0],
                    Rows    = new string[0][]
                });
            }

            return(new TrainingData
            {
                Columns = records[0],
                Rows    = records.Skip(1).ToArray()
            });
        }