// Exemplo n.º 1
        /// <summary>
        /// Finds the best univariate split point by minimizing Gini impurity,
        /// evaluating each candidate feature column in parallel.
        /// </summary>
        /// <param name="data">Column-major training data (data[col][row]); the last feature is the target attribute.</param>
        /// <returns>The attribute index and split value with the lowest Gini impurity.</returns>
        private SplittedAttributeData splitDataOnUnivariateCriterion(
            double[][] data)
        {
            int[] features = GetFeaturesForSplitCritertion();

            // Per-column result slots so the parallel loop writes without contention.
            double[] minGini        = new double[features.Length - 1];
            double[] splitCriterion = new double[features.Length - 1];
            int[]    attrIdx        = new int[features.Length - 1];

            // The last feature is the target value, so it is excluded from the search.
            // NOTE(review): 'col' indexes both the result arrays and 'data' directly;
            // if GetFeaturesForSplitCritertion() can return a non-identity mapping,
            // the data access should be data[features[col]] — confirm against callers.
            Parallel.For(0, features.Length - 1, new ParallelOptions {
                MaxDegreeOfParallelism = _maxParallelThreads
            }, col =>
            {
                minGini[col] = double.PositiveInfinity;
                int rowCount = data[0].Length;
                for (int row = 0; row < rowCount; row++)
                {
                    double gini = GetGiniImpurity(data, row, col);

                    // Track the lowest-impurity split seen so far for this column.
                    if (gini < minGini[col])
                    {
                        attrIdx[col]        = col;
                        splitCriterion[col] = data[col][row];
                        minGini[col]        = gini;
                    }

                    if (minGini[col] == 0) // Perfect split found; no better value exists.
                    {
                        break;
                    }
                }
            });

            // Reduce: pick the column with the overall lowest Gini impurity.
            int minIdx = 0;
            for (int col = 1; col < minGini.Length; col++)
            {
                if (minGini[col] < minGini[minIdx])
                {
                    minIdx = col;
                }
            }

            return new SplittedAttributeData()
            {
                AttributeIndex         = attrIdx[minIdx],
                SplittingCriteriaValue = splitCriterion[minIdx]
            };
        }
// Exemplo n.º 2
        /// <summary>
        /// Finds the attribute whose split maximizes information gain: the entropy of
        /// the target column minus the weighted entropy of each value partition.
        /// </summary>
        /// <param name="data">Column-major training data; _indexTargetAttribute marks the target column.</param>
        /// <returns>The best attribute index, its value frequencies, and its information gain.</returns>
        private SplittedAttributeData splitDataOnUnivariateCriterion(
            double[][] data)
        {
            SplittedAttributeData ed = new SplittedAttributeData();
            ed.SplittingCriteriaValue = double.NegativeInfinity;

            // Entropy of the full target column; constant across candidate attributes.
            double entropyS = getEntropyOfTargetAttribute(data);

            for (int idxCol = 0; idxCol < data.Length; idxCol++)
            {
                // Skip absent columns and the target attribute itself.
                if (data[idxCol] == null || idxCol == _indexTargetAttribute)
                {
                    continue;
                }

                Dictionary <double, long> freqs = InformationGain.Frequency(data[idxCol]);

                // Weighted entropy of the target over each distinct value of this attribute.
                double entropySum = 0;
                foreach (double key in freqs.Keys)
                {
                    if (key != _missingValue)
                    {
                        double[] filteredTargetData =
                            getFilteredTargetValues(data, idxCol, key);

                        double entropySv = InformationGain.EntropyShannon(filteredTargetData);
                        entropySum += ((double)filteredTargetData.Length /
                                       (double)data[_indexTargetAttribute].Length)
                                      * entropySv;
                    }
                }

                double infoGain = entropyS - entropySum;
                // Keep the attribute with the highest information gain so far.
                if (infoGain > ed.SplittingCriteriaValue)
                {
                    ed.Freqs                  = freqs;
                    ed.AttributeIndex         = idxCol;
                    ed.SplittingCriteriaValue = infoGain;
                }
            }

            return ed;
        }
// Exemplo n.º 3
        //Make private member so that stack is not full
        protected virtual DecisionTreeNode buildChildNodes(ConcurrentBag <long> trainingDataRowIndices,
                                                           double value,
                                                           DecisionTreeNode dtnParent)
        {
            DecisionTreeNode dtn = new DecisionTreeNode(dtnParent);

            //Get all rows in Training Data
            FilteredData fd = getFilteredDataForNode(dtn, value,
                                                     trainingDataRowIndices);

            //Check if all target examples are same or not
            //Their Entropy will be 0

            if (fd.NumberOfRows <= _minimumNumberPerNode || isTargetDataSame(fd.FilteredDataValues)) //Attributes is empty
            {
                setAsTargetAttributeNode(fd.FilteredDataValues, dtn);
                return(dtn);//No more children if attributeIndex is target Attributes
            }

            //Set data for current node
            SplittedAttributeData ed =
                splitDataOnUnivariateCriterion(fd.FilteredDataValues);

            //Check for positive and negative examples
            dtn.setAttributeValues(ed.AttributeIndex,
                                   _attributeHeaders[ed.AttributeIndex]);

            ConcurrentBag <long> newTrainingDataRowIndices =
                fd.TrainingDataRowIndices;

            fd = null;  //Free Memory -> Clean up data, no longer needed
            //Key has values
            foreach (double key in ed.Freqs.Keys)
            {
                if (key != 999) //Dont add for missing values
                {
                    dtn.addChild(key, buildChildNodes(newTrainingDataRowIndices,
                                                      key,
                                                      dtn)); //Key has value
                }
            }
            return(dtn);
        }
// Exemplo n.º 4
        //Make private member so that stack is not full
        //Make private member so that stack is not full
        /// <summary>
        /// Recursively grows a binary subtree: each node splits its rows on the value
        /// chosen by the univariate criterion, producing a left (less-than) child and
        /// a right child.
        /// </summary>
        /// <param name="trainingDataRowIndices">Row indices of the training data reaching the parent.</param>
        /// <param name="value">Threshold from the parent's split used to filter rows into this node.</param>
        /// <param name="dtnParent">Parent node; null for the root.</param>
        /// <param name="isLessThan">True to keep rows below the threshold, false for the rest.</param>
        /// <returns>The fully built decision tree node.</returns>
        protected DecisionTreeNode BuildChildNodes(ConcurrentBag <long> trainingDataRowIndices,
                                                   double value,
                                                   DecisionTreeNode dtnParent,
                                                   bool isLessThan)
        {
            DecisionTreeNode node = new DecisionTreeNode(dtnParent);

            // Rows of the parent's data that fall on this side of the parent's split.
            FilteredData filtered = GetFilteredDataForNode(node, value,
                                                           trainingDataRowIndices,
                                                           isLessThan);

            // Stopping criterion: too few rows, uniform target, a split that did not
            // shrink the data (non-root only), or a subclass-supplied condition.
            bool stop = filtered.NumberOfRows <= _minimumNumberPerNode ||
                        isTargetDataSame(filtered.FilteredDataValues) ||
                        (dtnParent != null && filtered.TrainingDataRowIndices.Count == trainingDataRowIndices.Count) ||
                        GetAdditionalStoppingCondition(node);

            if (stop)
            {
                if (filtered.NumberOfRows == 0) // Special case: fall back to the unfiltered rows.
                {
                    filtered = convertRowIndicesToFilteredData(trainingDataRowIndices);
                }
                setAsTargetAttributeNode(filtered.FilteredDataValues, node);
                return(node); // Leaf: nothing grows below a stopping point.
            }

            // Pick the attribute and threshold that best split this node's data.
            SplittedAttributeData split =
                splitDataOnUnivariateCriterion(filtered.FilteredDataValues);

            node.setAttributeValues(split.AttributeIndex,
                                    _attributeHeaders[split.AttributeIndex]);

            // Remember the threshold the children will be compared against.
            node.Value = split.SplittingCriteriaValue;

            ConcurrentBag <long> childRowIndices = filtered.TrainingDataRowIndices;
            filtered = null; // Release the filtered copy before recursing to limit peak memory.

            // Skip children entirely when the chosen threshold is the missing-value sentinel.
            if (split.SplittingCriteriaValue != _missingValue)
            {
                // Child key 0 holds rows below the threshold, key 1 the remainder;
                // a node only ever has these two children, so the keys cannot clash.
                node.addChild(0, BuildChildNodes(childRowIndices,
                                                 split.SplittingCriteriaValue,
                                                 node, true));
                node.addChild(1, BuildChildNodes(childRowIndices,
                                                 split.SplittingCriteriaValue,
                                                 node, false));
            }

            return(node);
        }