splitDataOnUnivariateCriterion(double[][] data)
{
    // Finds the (attribute index, attribute value) pair with the lowest Gini
    // impurity, searching candidate columns in parallel.
    // data is column-major: data[col][row]. The last column is assumed to hold
    // the target value and is excluded from the split search.
    int[] features = GetFeaturesForSplitCritertion(); // [sic] spelling is the project-wide member name

    // Per-column result slots so the parallel loop needs no locking:
    // each worker writes only to its own index.
    // NOTE(review): only features.Length is read here; the feature indices
    // themselves are never used (col indexes data directly, and attrIdx[col]
    // is always col). Confirm GetFeaturesForSplitCritertion() returns the
    // full, contiguous column range — otherwise features[col] should be used.
    double[] minGini = new double[features.Length - 1];
    double[] splitCriterion = new double[features.Length - 1];
    int[] attrIdx = new int[features.Length - 1];

    Parallel.For(0, features.Length - 1,
        new ParallelOptions { MaxDegreeOfParallelism = _maxParallelThreads },
        col =>
        {
            minGini[col] = double.PositiveInfinity;
            int rowCount = data[0].Length; // hoisted; was re-evaluated via LINQ Count() every iteration
            for (int row = 0; row < rowCount; row++)
            {
                double gini = GetGiniImpurity(data, row, col);
                if (gini < minGini[col])
                {
                    attrIdx[col] = col;
                    splitCriterion[col] = data[col][row];
                    minGini[col] = gini;
                }
                if (minGini[col] == 0) // Perfect split point found, no need to scan further rows
                {
                    break;
                }
            }
        });

    // Reduction: pick the column whose best split had the lowest impurity.
    int minIdx = 0;
    for (int col = 1; col < minGini.Length; col++)
    {
        if (minGini[col] < minGini[minIdx])
        {
            minIdx = col;
        }
    }

    SplittedAttributeData ed = new SplittedAttributeData()
    {
        AttributeIndex = attrIdx[minIdx],
        SplittingCriteriaValue = splitCriterion[minIdx]
    };
    return ed;
}
splitDataOnUnivariateCriterion(double[][] data)
{
    // ID3-style split selection: returns the attribute column with the highest
    // information gain with respect to the target attribute.
    // data is column-major: data[idxCol] is one attribute's values; a null
    // column means that attribute's data is absent and is skipped.
    SplittedAttributeData ed = new SplittedAttributeData();
    ed.SplittingCriteriaValue = double.NegativeInfinity; // sentinel so any real gain wins

    double entropyS = getEntropyOfTargetAttribute(data);

    for (int idxCol = 0; idxCol < data.Length; idxCol++)
    {
        // Skip absent columns and never split on the target attribute itself.
        if (data[idxCol] == null || idxCol == _indexTargetAttribute)
        {
            continue;
        }

        // Distinct value -> occurrence count for this attribute column.
        Dictionary<double, long> freqs = InformationGain.Frequency(data[idxCol]);

        // Weighted entropy of the target after partitioning on each distinct
        // value of this attribute (missing-value marker excluded).
        double entropySum = 0;
        foreach (double key in freqs.Keys)
        {
            if (key != _missingValue)
            {
                double[] filteredTargetData = getFilteredTargetValues(data, idxCol, key);
                double entropySv = InformationGain.EntropyShannon(filteredTargetData);
                entropySum += ((double)filteredTargetData.Length
                               / (double)data[_indexTargetAttribute].Length) * entropySv;
            }
        }

        double infoGain = entropyS - entropySum;
        if (infoGain > ed.SplittingCriteriaValue) // keep the best attribute seen so far
        {
            ed.Freqs = freqs;
            ed.AttributeIndex = idxCol;
            ed.SplittingCriteriaValue = infoGain;
        }
    }

    return ed;
}
//Make private member so that stack is not full
// Recursively grows the decision tree: filters the training rows for this
// node's attribute value, stops when the subset is small enough or pure,
// otherwise splits again and recurses once per surviving attribute value.
protected virtual DecisionTreeNode buildChildNodes(ConcurrentBag<long> trainingDataRowIndices, double value, DecisionTreeNode dtnParent)
{
    // Marker for a missing attribute value; such values never become children.
    // NOTE(review): sibling methods use the _missingValue field for the same
    // purpose — confirm they agree and replace this literal with _missingValue.
    const double MissingValueMarker = 999;

    DecisionTreeNode dtn = new DecisionTreeNode(dtnParent);

    // Get all rows in Training Data that reach this node.
    FilteredData fd = getFilteredDataForNode(dtn, value, trainingDataRowIndices);

    // Stopping criterion: too few rows, or all target values identical
    // (their entropy will be 0) — make this a leaf carrying the target value.
    if (fd.NumberOfRows <= _minimumNumberPerNode || isTargetDataSame(fd.FilteredDataValues))
    {
        setAsTargetAttributeNode(fd.FilteredDataValues, dtn);
        return dtn; // No more children once this is a target-attribute (leaf) node
    }

    // Choose the best attribute to split the remaining data on.
    SplittedAttributeData ed = splitDataOnUnivariateCriterion(fd.FilteredDataValues);
    dtn.setAttributeValues(ed.AttributeIndex, _attributeHeaders[ed.AttributeIndex]);

    ConcurrentBag<long> newTrainingDataRowIndices = fd.TrainingDataRowIndices;
    fd = null; // Free memory — the filtered matrix is no longer needed below

    // One child per distinct value of the chosen attribute.
    foreach (double key in ed.Freqs.Keys)
    {
        if (key != MissingValueMarker) // Don't add children for missing values
        {
            dtn.addChild(key, buildChildNodes(newTrainingDataRowIndices, key, dtn));
        }
    }
    return dtn;
}
//Make private member so that stack is not full
// Binary-split variant of the tree builder: each node splits its rows into a
// "less than" (left, key 0) and "greater or equal" (right, key 1) child on a
// single attribute value chosen by the univariate split criterion.
protected DecisionTreeNode BuildChildNodes(ConcurrentBag<long> trainingDataRowIndices, double value, DecisionTreeNode dtnParent, bool isLessThan)
{
    DecisionTreeNode dtn = new DecisionTreeNode(dtnParent);

    // Get all rows in Training Data that satisfy this branch's comparison.
    FilteredData fd = GetFilteredDataForNode(dtn, value, trainingDataRowIndices, isLessThan);

    // Stopping criterion — make this node a leaf when:
    //  - too few rows remain, or
    //  - all target values are identical, or
    //  - filtering removed no rows (no split happened), or
    //  - a subclass-supplied condition fires.
    if (fd.NumberOfRows <= _minimumNumberPerNode ||
        isTargetDataSame(fd.FilteredDataValues) ||
        (dtnParent != null && fd.TrainingDataRowIndices.Count == trainingDataRowIndices.Count) ||
        GetAdditionalStoppingCondition(dtn))
    {
        if (fd.NumberOfRows == 0) // Special case: nothing survived the filter, label from the incoming rows
        {
            fd = convertRowIndicesToFilteredData(trainingDataRowIndices);
        }
        setAsTargetAttributeNode(fd.FilteredDataValues, dtn);
        return dtn; // Leaf — no children
    }

    // Choose the attribute and value on which to split this node's data.
    SplittedAttributeData ed = splitDataOnUnivariateCriterion(fd.FilteredDataValues);
    dtn.setAttributeValues(ed.AttributeIndex, _attributeHeaders[ed.AttributeIndex]);
    // Store the value in column ed.AttributeIndex on which the split was done.
    dtn.Value = ed.SplittingCriteriaValue;

    ConcurrentBag<long> newTrainingDataRowIndices = fd.TrainingDataRowIndices;
    fd = null; // Free memory — the filtered matrix is no longer needed below

    // FIX(review): these two calls were commented out, leaving the guard with an
    // empty body so the tree never grew past the root; re-enabled per the
    // original comments. Keys 0 (left/less-than) and 1 (right) cannot collide
    // because a binary node only ever has these two children.
    if (ed.SplittingCriteriaValue != _missingValue) // Don't split on the missing-value marker
    {
        dtn.addChild(0, BuildChildNodes(newTrainingDataRowIndices, ed.SplittingCriteriaValue, dtn, true));
        dtn.addChild(1, BuildChildNodes(newTrainingDataRowIndices, ed.SplittingCriteriaValue, dtn, false));
    }
    return dtn;
}