/// <summary>
/// Returns the filtered training data for a node: the rows whose value in the
/// parent's split attribute exactly equals <paramref name="nodeValue"/>.
/// The root node (no parent) receives the entire training data unfiltered.
/// </summary>
/// <param name="dtn">Node whose parent's attribute column is used for filtering.</param>
/// <param name="nodeValue">Value the parent-attribute column must equal
/// (presumably a categorical code, so exact double comparison is intended —
/// TODO confirm).</param>
/// <param name="trainingDataRowIndices">Candidate row indices to filter.</param>
/// <returns>Filtered columns, the surviving row indices, and the row count.</returns>
protected virtual FilteredData getFilteredDataForNode(DecisionTreeNode dtn,
                                                      double nodeValue,
                                                      ConcurrentBag<long> trainingDataRowIndices)
{
    // Root node: no parent to filter on, so hand back the whole training set.
    if (dtn.Parent == null)
    {
        return (new FilteredData(_trainingData, trainingDataRowIndices, trainingDataRowIndices.Count));
    }

    // NOTE: an unused Dictionary of ancestor attribute values was removed here.
    // It was never read, and Dictionary.Add would have thrown ArgumentException
    // whenever the same attribute index appeared twice along the ancestor chain.

    // Collect, in parallel, the rows whose parent-attribute cell matches nodeValue.
    ConcurrentBag<long> bagTrainingDataRowIndices = new ConcurrentBag<long>();
    Parallel.ForEach(trainingDataRowIndices,
                     new ParallelOptions { MaxDegreeOfParallelism = _maxParallelThreads },
                     rowIdx =>
    {
        if (_trainingData[dtn.Parent.AttributeIndex][rowIdx] == nodeValue)
        {
            bagTrainingDataRowIndices.Add(rowIdx);
        }
    }); //rowIdx

    // Row count is now known, so allocate fixed-size arrays for the result.
    // Snapshot the bag into an array to get a stable iteration order.
    long[] arrayTrainingDataRowIndices = bagTrainingDataRowIndices.ToArray();
    double[][] filteredData = new double[_trainingData.Length][];
    for (int colIdx = 0; colIdx < _trainingData.Length; colIdx++)
    {
        filteredData[colIdx] = new double[arrayTrainingDataRowIndices.Length];
        // Sequential copy: order must be preserved so every column lines up
        // with the same row at the same index.
        for (int idx = 0; idx < arrayTrainingDataRowIndices.Length; idx++)
        {
            long rowIdx = arrayTrainingDataRowIndices[idx];
            filteredData[colIdx][idx] = _trainingData[colIdx][rowIdx];
        }
    }

    // BUG FIX: previously returned filteredData[0].Length, which throws
    // IndexOutOfRangeException when there are zero columns. The index-array
    // length is the same row count and is always safe to read.
    return (new FilteredData(filteredData, bagTrainingDataRowIndices, arrayTrainingDataRowIndices.Length));
}
//Make private member so that stack is not full protected DecisionTreeNode BuildChildNodes(ConcurrentBag <long> trainingDataRowIndices, double value, DecisionTreeNode dtnParent, bool isLessThan) { DecisionTreeNode dtn = new DecisionTreeNode(dtnParent); //Get all rows in Training Data FilteredData fd = GetFilteredDataForNode(dtn, value, trainingDataRowIndices, isLessThan); //Stopping Criterion //Check if minimum number of nodes are there //OR all target values are same if (fd.NumberOfRows <= _minimumNumberPerNode || isTargetDataSame(fd.FilteredDataValues) || //Attributes is empty (dtnParent != null && fd.TrainingDataRowIndices.Count == trainingDataRowIndices.Count) || //implies no split happened) GetAdditionalStoppingCondition(dtn)) { if (fd.NumberOfRows == 0) //Special case, use original data as node { fd = convertRowIndicesToFilteredData(trainingDataRowIndices); } setAsTargetAttributeNode(fd.FilteredDataValues, dtn); return(dtn); //No more children if min attributes reached } //Set data for current node SplittedAttributeData ed = splitDataOnUnivariateCriterion(fd.FilteredDataValues); //Check for positive and negative examples dtn.setAttributeValues(ed.AttributeIndex, _attributeHeaders[ed.AttributeIndex]); //Store value in column ed.AttributeIndex based on which split was done dtn.Value = ed.SplittingCriteriaValue; ConcurrentBag <long> newTrainingDataRowIndices = fd.TrainingDataRowIndices; fd = null; //Free Memory -> Clean up data, no longer needed //Key has values //Add left node if (ed.SplittingCriteriaValue != _missingValue) //Dont add for missing values { //0 if for left, 1 is for right. //There wont be any conflict since each node will have only 2 children //DecisionTreeNode dtnChild = dtn.addChild(0, BuildChildNodes(newTrainingDataRowIndices, ed.SplittingCriteriaValue, dtn, true)); //Key has value //dtnChild = dtn.addChild(1, BuildChildNodes(newTrainingDataRowIndices, ed.SplittingCriteriaValue, dtn, false)); //Key has value } return(dtn); }
//Any extra stopping condition, primarily added for boosting //which needs decision stumps protected virtual bool GetAdditionalStoppingCondition(DecisionTreeNode dtn) { return(false);//Make sure value is false else stopping condition is reach automatically }
/// <summary>
/// Returns the filtered data for a node on one side of its parent's split:
/// rows where the parent-attribute value is &lt; nodeValue (left branch) or
/// &gt;= nodeValue (right branch).
/// </summary>
/// <param name="dtn">Node whose parent's attribute column drives the filter.</param>
/// <param name="nodeValue">Parent's splitting threshold.</param>
/// <param name="trainingDataRowIndices">Candidate row indices to filter.</param>
/// <param name="isLessThan">True selects the left (&lt;) branch, false the right (&gt;=) branch.</param>
/// <returns>Filtered columns, the surviving row indices, and the row count.</returns>
protected FilteredData GetFilteredDataForNode(DecisionTreeNode dtn,
                                              double nodeValue,
                                              ConcurrentBag<long> trainingDataRowIndices,
                                              bool isLessThan)
{
    if (dtn.Parent == null) //Root node gets the entire training data
    {
        // Bagged trees pass a row subsample rather than the full set; in that
        // case the rows must be materialized instead of aliasing _trainingData.
        if (_trainingRowsPassed)
        {
            return (convertRowIndicesToFilteredData(trainingDataRowIndices));
        }
        else
        {
            return (new FilteredData(_trainingData, trainingDataRowIndices, trainingDataRowIndices.Count));
        }
    }

    // NOTE: an unused Dictionary of ancestor attribute values was removed here
    // (declared but never read).

    // One growable list per column, since the surviving row count is unknown.
    List<double>[] filteredValues = new List<double>[_trainingData.Length];
    Parallel.For(0, _trainingData.Length,
                 new ParallelOptions { MaxDegreeOfParallelism = _maxParallelThreads },
                 colIdx =>
    {
        filteredValues[colIdx] = new List<double>();
    });

    ConcurrentBag<long> newTrainingDataRowIndices = new ConcurrentBag<long>();
    Parallel.ForEach(trainingDataRowIndices,
                     new ParallelOptions { MaxDegreeOfParallelism = _maxParallelThreads },
                     rowIdx =>
    {
        double cell = _trainingData[dtn.Parent.AttributeIndex][rowIdx];
        if ((isLessThan && cell < nodeValue) || (!isLessThan && cell >= nodeValue))
        {
            newTrainingDataRowIndices.Add(rowIdx);
            // NOTE(review): List<double>.Add is not thread safe, so
            // CopyFilteredData must synchronize the per-column appends
            // internally — confirm, otherwise this is a data race.
            CopyFilteredData(filteredValues, rowIdx);
        }
    }); //rowIdx

    // Convert each column's list to an array. Parent columns are copied too;
    // their entropy will simply be 0 downstream.
    double[][] filteredData = new double[_trainingData.Length][];
    Parallel.For(0, _trainingData.Length,
                 new ParallelOptions { MaxDegreeOfParallelism = _maxParallelThreads },
                 colIdx =>
    {
        //No need to set parent columns to NULL in CART
        filteredData[colIdx] = filteredValues[colIdx].ToArray();
    });

    // BUG FIX: numberOfRows was previously assigned inside the Parallel.For
    // above — an unsynchronized write to a captured long from multiple threads
    // (torn 64-bit writes are possible on 32-bit runtimes). Every column has
    // the same row count, so read it once here after the loop completes.
    long numberOfRows = _trainingData.Length > 0 ? filteredData[0].Length : 0;
    return (new FilteredData(filteredData, newTrainingDataRowIndices, numberOfRows));
}
//private long _numberOfRows; //Used by CART to determine if node had ended public DecisionTreeNode(DecisionTreeNode parent) { Initialize(); _value = Double.PositiveInfinity; _parent = parent; }