/// <summary>
/// Decides whether splitting on <paramref name="attributeToSplitOn"/> is statistically
/// justified, using a chi-squared statistic built from the per-value true/false counts.
/// </summary>
/// <param name="attributeToSplitOn">
/// Candidate attribute; its <c>PossibleValueCounts</c> supplies, for every possible value,
/// how often it appears with a true output and with a false output.
/// </param>
/// <returns>
/// <c>true</c> when the result of <c>ChiSquaredUtils.CalculateChiSquareCDT</c> for the
/// accumulated statistic is greater than or equal to <c>ChiTestLimit</c>; otherwise <c>false</c>.
/// </returns>
private bool ShouldSplitOnAttributeAccordingToChiSquared(DataSetAttributeWithCounts attributeToSplitOn)
{
    int positiveValuesGlobal = attributeToSplitOn.PossibleValueCounts.Values.Sum(counts => counts.AppearWhenTrueCount);
    int negativeValuesGlobal = attributeToSplitOn.PossibleValueCounts.Values.Sum(counts => counts.AppearWhenFalseCount);
    int totalGlobal = positiveValuesGlobal + negativeValuesGlobal;

    double chiTestValue = 0;
    foreach (var possibleValueCounts in attributeToSplitOn.PossibleValueCounts)
    {
        int positiveValuesLocal = possibleValueCounts.Value.AppearWhenTrueCount;
        int negativeValuesLocal = possibleValueCounts.Value.AppearWhenFalseCount;
        int totalLocal = positiveValuesLocal + negativeValuesLocal;

        // A value with no observations contributes nothing to the statistic. Skipping it
        // also avoids the 0/0 (NaN) that would otherwise poison the running sum, including
        // the case where the whole data set is empty (totalGlobal == 0).
        if (totalLocal == 0)
        {
            continue;
        }

        // Share of all observations that carry this value.
        double weightFactor = (double)totalLocal / totalGlobal;

        // Counts we would expect if the value were independent of the output.
        double pExpected = positiveValuesGlobal * weightFactor;
        double nExpected = negativeValuesGlobal * weightFactor;

        double diffP = pExpected - positiveValuesLocal;
        double diffN = nExpected - negativeValuesLocal;

        // An expected count of zero means that output class never occurs at all, so the
        // observed count is zero too and the term is 0 by definition; dividing would give NaN.
        double localChiTestValue = 0;
        if (pExpected > 0)
        {
            localChiTestValue += diffP * diffP / pExpected;
        }

        if (nExpected > 0)
        {
            localChiTestValue += diffN * diffN / nExpected;
        }

        chiTestValue += localChiTestValue;
    }

    // NOTE(review): the degrees of freedom count every possible value, including values
    // with no observations in this subset - confirm that this is intended.
    double chiSquareCumulative = ChiSquaredUtils.CalculateChiSquareCDT(attributeToSplitOn.PossibleValueCounts.Count - 1, chiTestValue);
    return chiSquareCumulative >= ChiTestLimit;
}
/// <summary>
/// Builds this level of the decision tree from <c>_values</c> and <c>_attributes</c>,
/// recursing into one sub-tree per observed value of the chosen attribute.
/// </summary>
/// <remarks>
/// The level ends as a leaf (<c>_localValue</c> is assigned and no sub-trees are created) when
/// every row has the same output, when no attributes remain, or when the chi-squared check
/// rejects the split. In the last two cases the leaf takes the majority output; a tie
/// resolves to <c>false</c>.
/// </remarks>
public void D3()
{
    // Tally the outputs in a single pass to see whether a split is needed at all.
    int totalTrueValues = 0;
    int totalFalseValues = 0;
    foreach (var row in _values)
    {
        if (row.Output)
        {
            totalTrueValues++;
        }
        else
        {
            totalFalseValues++;
        }
    }

    // Pure node: every row agrees, nothing left to decide.
    if (totalFalseValues == 0 && totalTrueValues > 0)
    {
        _localValue = true;
        return;
    }

    if (totalTrueValues == 0 && totalFalseValues > 0)
    {
        _localValue = false;
        return;
    }

    // Fallback answer whenever we cannot or should not split; a tie yields false.
    bool mostPrevalentValue = totalTrueValues > totalFalseValues;

    // Can we split on attributes?
    if (_attributes.Count == 0)
    {
        _localValue = mostPrevalentValue;
        return;
    }

    // Choose the attribute to split on (presumably the one with the lowest entropy,
    // going by the helper's name).
    List<DataSetAttributeWithCounts> attributesWithCounts = CalculateEForAllAttributes(_attributes, _values);
    DataSetAttributeWithCounts attributeWithMinEntropy = FindAttributeWithMinEntropy(attributesWithCounts);

    // Recorded before the chi-squared check, so it stays set even when the split is rejected.
    _attributeToSplitOn = attributeWithMinEntropy;

    // Is it worth splitting on this attribute?
    if (!ShouldSplitOnAttributeAccordingToChiSquared(attributeWithMinEntropy))
    {
        _localValue = mostPrevalentValue;
        return;
    }

    // Sub-trees may use every attribute except the one consumed here.
    List<DataSetAttribute> newAttributes = _attributes.Where(a => a.Name != attributeWithMinEntropy.Name).ToList();

    // Partition the rows by their value for the chosen attribute, creating one sub-tree per
    // distinct value encountered.
    _dictionaryOfSubTrees = new Dictionary<string, DecisionTreeLevel>(attributeWithMinEntropy.PossibleValues.Count);
    var dictionaryOfValues = new Dictionary<string, List<DataSetValue>>();
    foreach (var dataSetValue in _values)
    {
        string value = dataSetValue.Values[attributeWithMinEntropy.ValueIndex];

        List<DataSetValue> localValues;
        if (!dictionaryOfValues.TryGetValue(value, out localValues))
        {
            localValues = new List<DataSetValue>();
            dictionaryOfValues[value] = localValues;

            // Rows are appended to localValues after construction, so this presumably relies
            // on DecisionTreeLevel keeping a reference to the list rather than copying it -
            // TODO confirm against the constructor.
            _dictionaryOfSubTrees[value] = new DecisionTreeLevel(ChiTestLimit, newAttributes, localValues);
        }

        localValues.Add(dataSetValue);
    }

    // Recursively run D3 on every sub-tree.
    foreach (DecisionTreeLevel subTree in _dictionaryOfSubTrees.Values)
    {
        subTree.D3();
    }
}