Example #1
        //Uses the original training data (_trainingData)

        /// <summary>
        /// Returns the filtered data for a node.
        ///
        /// Returns the rows whose parent attribute matches the parent's value.
        /// Also sets the attribute column of all parents to null.
        /// </summary>
        /// <param name="dtn">Node for which the data is filtered</param>
        /// <param name="nodeValue">Value of the parent's splitting attribute to filter on</param>
        /// <param name="trainingDataRowIndices">Row indices available to this node</param>
        /// <returns>Filtered data, surviving row indices and row count</returns>
        protected virtual FilteredData getFilteredDataForNode(
            DecisionTreeNode dtn,
            double nodeValue,
            ConcurrentBag <long> trainingDataRowIndices)
        {
            if (dtn.Parent == null) //Root node gets the entire training data
            {
                return(new FilteredData(_trainingData,
                                        trainingDataRowIndices,
                                        trainingDataRowIndices.Count));
            }

            Dictionary <int, double> allParentValues =
                new Dictionary <int, double>();
            DecisionTreeNode parent = dtn.Parent;

            while (parent != null) //Record a value for each parent
            {
                //The value to filter on is stored in the current node;
                //use the indexer so a repeated attribute index does not throw
                allParentValues[parent.AttributeIndex] = nodeValue;
                parent = parent.Parent;
            }
            //Only the immediate parent's index/value pair is actually needed,
            //since trainingDataRowIndices has already been filtered by the ancestors

            ConcurrentBag <long> bagTrainingDataRowIndices =
                new ConcurrentBag <long>();


            // Find out which rows match the parent's value
            Parallel.ForEach(trainingDataRowIndices, new ParallelOptions {
                MaxDegreeOfParallelism = _maxParallelThreads
            }, rowIdx =>
            {
                if (_trainingData[dtn.Parent.AttributeIndex][rowIdx] ==
                    nodeValue)
                {
                    bagTrainingDataRowIndices.Add(rowIdx);
                }
            });  //rowIdx

            //Now that the matching rows are known, allocate the filtered data;
            //the row count is fixed here, so plain arrays can be used
            long[]     arrayTrainingDataRowIndices = bagTrainingDataRowIndices.ToArray();
            double[][] filteredData = new double[_trainingData.Length][];
            //Parallel.For(0,_trainingData.Length, new ParallelOptions { MaxDegreeOfParallelism = _maxParallelThreads }, colIdx =>
            for (int colIdx = 0; colIdx < _trainingData.Length; colIdx++)
            {
                filteredData[colIdx] = new double[arrayTrainingDataRowIndices.Length];
                long rowIdx;
                //If the outer loop is run in parallel, the row order has to be preserved
                for (int idx = 0; idx < arrayTrainingDataRowIndices.Length; idx++)
                {
                    rowIdx = arrayTrainingDataRowIndices[idx];
                    filteredData[colIdx][idx] =
                        _trainingData[colIdx][rowIdx];
                }
            }//);

            return(new FilteredData(filteredData,
                                    bagTrainingDataRowIndices,
                                    filteredData[0].Length));
        }
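
The FilteredData container returned above is not part of this listing. A minimal sketch of what these examples appear to assume is shown below; the member names FilteredDataValues, TrainingDataRowIndices and NumberOfRows are taken from Example #2, while the constructor shape is inferred from the call sites and may differ from the real class (requires System.Collections.Concurrent).

        //Sketch only: inferred from how FilteredData is constructed and read in
        //these examples; the real class may differ.
        public class FilteredData
        {
            public double[][] FilteredDataValues { get; private set; }
            public ConcurrentBag<long> TrainingDataRowIndices { get; private set; }
            public long NumberOfRows { get; private set; }

            public FilteredData(double[][] filteredDataValues,
                                ConcurrentBag<long> trainingDataRowIndices,
                                long numberOfRows)
            {
                FilteredDataValues     = filteredDataValues;
                TrainingDataRowIndices = trainingDataRowIndices;
                NumberOfRows           = numberOfRows;
            }
        }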
Example #2
        //Consider making this a private member so that the recursion does not fill up the stack
        protected DecisionTreeNode BuildChildNodes(ConcurrentBag <long> trainingDataRowIndices,
                                                   double value,
                                                   DecisionTreeNode dtnParent,
                                                   bool isLessThan)
        {
            DecisionTreeNode dtn = new DecisionTreeNode(dtnParent);

            //Get the training data rows that reach this node
            FilteredData fd = GetFilteredDataForNode(dtn, value,
                                                     trainingDataRowIndices,
                                                     isLessThan);

            //Stopping criterion:
            //stop if the node has too few rows, all target values are the same,
            //no split happened, or an additional stopping condition holds
            if (fd.NumberOfRows <= _minimumNumberPerNode ||
                isTargetDataSame(fd.FilteredDataValues) ||
                (dtnParent != null && fd.TrainingDataRowIndices.Count == trainingDataRowIndices.Count) ||     //implies no split happened
                GetAdditionalStoppingCondition(dtn))
            {
                if (fd.NumberOfRows == 0)     //Special case: use the original data for this node
                {
                    fd = convertRowIndicesToFilteredData(trainingDataRowIndices);
                }
                setAsTargetAttributeNode(fd.FilteredDataValues, dtn);
                return(dtn);   //No more children once a stopping condition is reached
            }

            //Choose the splitting attribute and value for the current node
            SplittedAttributeData ed =
                splitDataOnUnivariateCriterion(fd.FilteredDataValues);

            //Record the attribute this node splits on
            dtn.setAttributeValues(ed.AttributeIndex,
                                   _attributeHeaders[ed.AttributeIndex]);

            //Store the value of column ed.AttributeIndex on which the split was done
            dtn.Value = ed.SplittingCriteriaValue;

            ConcurrentBag <long> newTrainingDataRowIndices =
                fd.TrainingDataRowIndices;

            fd = null;  //Free memory; the filtered data is no longer needed

            //Add left and right child nodes
            if (ed.SplittingCriteriaValue != _missingValue) //Don't add children for missing values
            {
                //0 is for left, 1 is for right.
                //There won't be any conflict since each node has only 2 children
                dtn.addChild(0, BuildChildNodes(newTrainingDataRowIndices,
                                                ed.SplittingCriteriaValue,
                                                dtn, true)); //Left: less than the split value

                dtn.addChild(1, BuildChildNodes(newTrainingDataRowIndices,
                                                ed.SplittingCriteriaValue,
                                                dtn, false)); //Right: greater than or equal
            }

            return(dtn);
        }
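
BuildChildNodes is recursive and needs a first call for the root node, which these examples do not show. A minimal sketch of such an entry point follows; BuildTree is a hypothetical name, and it assumes the column-major layout of _trainingData used above.

        //Sketch only: BuildTree is not part of the original examples.
        //_trainingData is column-major, so the row count is the length of any column.
        public DecisionTreeNode BuildTree()
        {
            ConcurrentBag<long> allRows = new ConcurrentBag<long>();
            for (long rowIdx = 0; rowIdx < _trainingData[0].Length; rowIdx++)
            {
                allRows.Add(rowIdx);
            }

            //With dtnParent == null the root sees the full training data, so the
            //split value and isLessThan arguments are effectively ignored here
            return BuildChildNodes(allRows, 0.0, null, true);
        }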
Example #3
 //Any extra stopping condition, primarily added for boosting
 //which needs decision stumps
 protected virtual bool GetAdditionalStoppingCondition(DecisionTreeNode dtn)
 {
     return(false); //Keep this false here, otherwise the stopping condition is reached automatically
 }
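
For boosting, a subclass can override this hook to stop growth at depth 1 and produce decision stumps. A minimal sketch, assuming the enclosing tree class is called DecisionTree (that name is not shown in these examples; only DecisionTreeNode.Parent is taken from the code above):

 //Sketch only: DecisionTree is an assumed name for the class containing the methods above
 public class DecisionStump : DecisionTree
 {
     protected override bool GetAdditionalStoppingCondition(DecisionTreeNode dtn)
     {
         //Stop as soon as the node has a parent, i.e. at depth 1 -> a stump
         return dtn.Parent != null;
     }
 }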
Example #4
        /// <summary>
        /// Returns the filtered data for a node.
        ///
        /// Returns the rows whose parent attribute is less than (or greater than
        /// or equal to) the parent's splitting value, depending on isLessThan.
        /// </summary>
        /// <param name="dtn">Node for which the data is filtered</param>
        /// <param name="nodeValue">Splitting value of the parent's attribute</param>
        /// <param name="trainingDataRowIndices">Row indices available to this node</param>
        /// <param name="isLessThan">True for the left (less-than) branch, false for the right branch</param>
        /// <returns>Filtered data, surviving row indices and row count</returns>
        protected FilteredData GetFilteredDataForNode(
            DecisionTreeNode dtn,
            double nodeValue,
            ConcurrentBag <long> trainingDataRowIndices,
            bool isLessThan)
        {
            if (dtn.Parent == null) //Root node gets the entire training data
            {
                //If trainingDataRowIndices are not the same as the original rows,
                //then filter; required for bagged trees
                if (_trainingRowsPassed)
                {
                    return(convertRowIndicesToFilteredData(trainingDataRowIndices));
                }
                else
                {
                    return(new FilteredData(_trainingData,
                                            trainingDataRowIndices,
                                            trainingDataRowIndices.Count));
                }
            }

            //Allocate filteredValues; the number of rows is not known up front,
            //hence a List<double> is used per column
            List <double>[] filteredValues = new List <double> [_trainingData.Length];
            Parallel.For(0, _trainingData.Length, new ParallelOptions {
                MaxDegreeOfParallelism = _maxParallelThreads
            }, colIdx => {
                filteredValues[colIdx] = new List <double>();
            });

            DecisionTreeNode     parent = dtn.Parent;
            ConcurrentBag <long> newTrainingDataRowIndices =
                new ConcurrentBag <long>();

            //Select, in parallel, the rows that fall on this side of the parent's split
            //foreach (long rowIdx in trainingDataRowIndices) //Can optimize here to view rows passed down
            Parallel.ForEach(trainingDataRowIndices, new ParallelOptions {
                MaxDegreeOfParallelism = _maxParallelThreads
            }, rowIdx =>
            {
                double cellValue = _trainingData[dtn.Parent.AttributeIndex][rowIdx];
                if ((isLessThan && cellValue < nodeValue) ||
                    (!isLessThan && cellValue >= nodeValue))
                {
                    newTrainingDataRowIndices.Add(rowIdx);
                    //List<double>.Add is not thread safe, so the copy is delegated
                    //to CopyFilteredData, which needs to synchronize access
                    CopyFilteredData(filteredValues, rowIdx);
                }
            }); //rowIdx

            //Convert each List to an array
            //Even copy the columns already used by parents, as their entropy will be 0
            double[][] filteredData = new double[_trainingData.Length][];
            long       numberOfRows = 0;

            //for(int colIdx=0; colIdx < _trainingData.Length; colIdx++)
            Parallel.For(0, _trainingData.Length, new ParallelOptions {
                MaxDegreeOfParallelism = _maxParallelThreads
            }, colIdx =>
            {
                //No need to set parent to NULL in CART
                filteredData[colIdx] =
                    filteredValues[colIdx].ToArray();
                //Do not move this out of the Parallel.For, since filteredData[colIdx] can be null
                //numberOfRows is same for all selected columns
                numberOfRows = filteredData[colIdx].Length;
            });
            return(new FilteredData(filteredData,
                                    newTrainingDataRowIndices,
                                    numberOfRows));
        }
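
CopyFilteredData is called from inside the Parallel.ForEach above but is not shown in these examples. Since List<double>.Add is not thread safe, one way it could be written is to serialize the per-row copies with a lock; the sketch below assumes a lock object named _filteredValuesLock, which is not part of the original code.

        //Sketch only: the real CopyFilteredData is not shown in these examples;
        //this version takes a lock so that one row is appended to every column
        //atomically, keeping the columns aligned.
        private readonly object _filteredValuesLock = new object();

        protected void CopyFilteredData(List<double>[] filteredValues, long rowIdx)
        {
            lock (_filteredValuesLock)
            {
                for (int colIdx = 0; colIdx < _trainingData.Length; colIdx++)
                {
                    filteredValues[colIdx].Add(_trainingData[colIdx][rowIdx]);
                }
            }
        }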
Example #5
        //private long _numberOfRows; //Used by CART to determine if the node has ended

        public DecisionTreeNode(DecisionTreeNode parent)
        {
            Initialize();
            _value  = Double.PositiveInfinity;
            _parent = parent;
        }
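
The rest of DecisionTreeNode is not included in this listing. The sketch below is inferred from how the node is used in Examples #1-#4 (Parent, AttributeIndex, Value, setAttributeValues, addChild); the field layout, the Initialize body and the string type of the attribute header are assumptions (requires System.Collections.Generic).

        //Sketch only: members inferred from usage in the examples above;
        //the real class may store things differently.
        private double _value;
        private DecisionTreeNode _parent;
        private Dictionary<int, DecisionTreeNode> _children; //0 = left, 1 = right

        public DecisionTreeNode Parent { get { return _parent; } }
        public int AttributeIndex { get; private set; }
        public string AttributeName { get; private set; }
        public double Value
        {
            get { return _value; }
            set { _value = value; }
        }

        private void Initialize()
        {
            _children = new Dictionary<int, DecisionTreeNode>();
            AttributeIndex = -1; //No splitting attribute chosen yet
        }

        public void setAttributeValues(int attributeIndex, string attributeName)
        {
            AttributeIndex = attributeIndex;
            AttributeName  = attributeName;
        }

        public void addChild(int key, DecisionTreeNode child)
        {
            _children[key] = child;
        }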