/// <summary>
/// Decides whether splitting on <paramref name="attributeToSplitOn"/> is justified by a
/// chi-squared statistic computed from the attribute's per-value true/false counts.
/// </summary>
/// <param name="attributeToSplitOn">
/// Candidate attribute; its <c>PossibleValueCounts</c> supplies, per value, how many
/// samples appear with a true output and how many with a false output.
/// </param>
/// <returns>
/// <c>true</c> when the value returned by <c>ChiSquaredUtils.CalculateChiSquareCDT</c> is at
/// least <c>ChiTestLimit</c>; <c>false</c> otherwise, including when there are no samples.
/// </returns>
private bool ShouldSplitOnAttributeAccordingToChiSquared(DataSetAttributeWithCounts attributeToSplitOn)
{
    int positiveValuesGlobal = attributeToSplitOn.PossibleValueCounts.Values.Sum(s => s.AppearWhenTrueCount);
    int negativeValuesGlobal = attributeToSplitOn.PossibleValueCounts.Values.Sum(s => s.AppearWhenFalseCount);
    int totalGlobal = positiveValuesGlobal + negativeValuesGlobal;

    // Without any samples the weight factor below would be 0/0 (NaN); there is also
    // no evidence that could justify a split.
    if (totalGlobal == 0)
    {
        return false;
    }

    double chiTestValue = 0;
    foreach (var counts in attributeToSplitOn.PossibleValueCounts.Values)
    {
        int positiveValuesLocal = counts.AppearWhenTrueCount;
        int negativeValuesLocal = counts.AppearWhenFalseCount;

        // Share of all samples that carry this attribute value.
        double weightFactor = (double)(positiveValuesLocal + negativeValuesLocal) / totalGlobal;

        // Counts expected if the output were independent of this attribute.
        double pExpected = positiveValuesGlobal * weightFactor;
        double nExpected = negativeValuesGlobal * weightFactor;

        double localChiTestValue = 0;

        // An expected count of zero means the observed count is zero too (the value has
        // no samples, or the class is absent). The term would be 0/0 = NaN and would
        // poison the whole statistic, so it is skipped and contributes nothing.
        if (pExpected > 0)
        {
            double diffP = pExpected - positiveValuesLocal;
            localChiTestValue += diffP * diffP / pExpected;
        }

        if (nExpected > 0)
        {
            double diffN = nExpected - negativeValuesLocal;
            localChiTestValue += diffN * diffN / nExpected;
        }

        chiTestValue += localChiTestValue;
    }

    // Degrees of freedom: (number of possible values - 1), as in the original call.
    // NOTE(review): presumably CalculateChiSquareCDT is the chi-squared cumulative
    // distribution; confirm against ChiSquaredUtils.
    double chiSquareCumulative = ChiSquaredUtils.CalculateChiSquareCDT(
        attributeToSplitOn.PossibleValueCounts.Count - 1,
        chiTestValue);

    return chiSquareCumulative >= ChiTestLimit;
}
/// <summary>
/// Builds this tree level from the given samples: either records a fixed answer in
/// <c>_localValue</c>, or picks an attribute to split on and recursively builds one
/// sub-tree per observed value of that attribute.
/// </summary>
/// <param name="attributes">Attributes still available for splitting at this level.</param>
/// <param name="values">Samples that reached this level.</param>
public void D3(List<DataSetAttribute> attributes, List<DataSetValue> values)
{
    // Tally the samples of each output class.
    int trueCount = values.Count(v => v.Output);
    int falseCount = values.Count - trueCount;

    // Exactly one class present: the node is pure and answers with that class.
    bool hasTrue = trueCount > 0;
    bool hasFalse = falseCount > 0;
    if (hasTrue != hasFalse)
    {
        _localValue = hasTrue;
        return;
    }

    // Fallback answer for nodes that cannot or should not be split (ties resolve to false).
    bool majorityVote = trueCount > falseCount;

    // Nothing left to split on.
    if (attributes.Count == 0)
    {
        _localValue = majorityVote;
        return;
    }

    // Choose the attribute with the minimum entropy among the candidates.
    DataSetAttributeWithCounts bestAttribute =
        FindAttributeWithMinEntropy(CalculateEForAllAttributes(attributes, values));
    _attributeToSplitOn = bestAttribute;

    // The chi-squared check decides whether the split is worthwhile.
    if (!ShouldSplitOnAttributeAccordingToChiSquared(bestAttribute))
    {
        _localValue = majorityVote;
        return;
    }

    // Sub-trees may use every attribute except the one consumed here.
    List<DataSetAttribute> remainingAttributes = attributes.FindAll(a => a.Name != bestAttribute.Name);

    // Partition the samples by their value for the chosen attribute, creating a
    // sub-tree the first time each value is seen.
    _dictionaryOfSubTrees = new Dictionary<string, DecisionTreeLevel>(bestAttribute.PossibleValues.Count);
    var partitions = new Dictionary<string, List<DataSetValue>>();
    foreach (DataSetValue sample in values)
    {
        string key = sample.Values[bestAttribute.ValueIndex];

        List<DataSetValue> bucket;
        if (!partitions.TryGetValue(key, out bucket))
        {
            _dictionaryOfSubTrees.Add(key, new DecisionTreeLevel(ChiTestLimit));
            bucket = new List<DataSetValue>();
            partitions.Add(key, bucket);
        }

        bucket.Add(sample);
    }

    // Recurse into each sub-tree with its own share of the samples.
    foreach (var partition in partitions)
    {
        _dictionaryOfSubTrees[partition.Key].D3(remainingAttributes, partition.Value);
    }
}