/// <summary>
        /// Runs a chi-squared test on the class counts of the candidate attribute and
        /// reports whether the resulting cumulative value reaches <c>ChiTestLimit</c>.
        /// </summary>
        /// <param name="attributeToSplitOn">
        /// Candidate attribute together with, for each of its values, the number of
        /// times the value appeared with a true output and with a false output.
        /// </param>
        /// <returns>
        /// <c>true</c> when the cumulative chi-squared value is at least <c>ChiTestLimit</c>;
        /// <c>false</c> otherwise, including when there are no observations at all.
        /// </returns>
        private bool ShouldSplitOnAttributeAccordingToChiSquared(DataSetAttributeWithCounts attributeToSplitOn)
        {
            int positiveValuesGlobal = attributeToSplitOn.PossibleValueCounts.Values.Sum(s => s.AppearWhenTrueCount);
            int negativeValuesGlobal = attributeToSplitOn.PossibleValueCounts.Values.Sum(s => s.AppearWhenFalseCount);
            int totalValuesGlobal    = positiveValuesGlobal + negativeValuesGlobal;

            // Without any observation the weight factor below would be 0/0 (NaN);
            // there is no evidence that a split is useful, so refuse it explicitly.
            if (totalValuesGlobal == 0)
            {
                return false;
            }

            double chiTestValue = 0;

            foreach (var possibleValueCounts in attributeToSplitOn.PossibleValueCounts)
            {
                int positiveValuesLocal = possibleValueCounts.Value.AppearWhenTrueCount;
                int negativeValuesLocal = possibleValueCounts.Value.AppearWhenFalseCount;

                // Share of all observations that carry this attribute value.
                double weightFactor = 1.0 * (positiveValuesLocal + negativeValuesLocal) / totalValuesGlobal;
                double pExpected    = positiveValuesGlobal * weightFactor;
                double nExpected    = negativeValuesGlobal * weightFactor;

                double diffP = pExpected - positiveValuesLocal;
                double diffN = nExpected - negativeValuesLocal;

                // An expected count of zero means the matching actual count is zero as well
                // (the value or the class is absent), so the term contributes nothing.
                // Dividing anyway would yield 0/0 = NaN and poison the whole statistic.
                double localChiTestValue = 0;

                if (pExpected > 0)
                {
                    localChiTestValue += diffP * diffP / pExpected;
                }

                if (nExpected > 0)
                {
                    localChiTestValue += diffN * diffN / nExpected;
                }

                chiTestValue += localChiTestValue;
            }

            // Degrees of freedom: number of attribute values minus one.
            double chiSquareCumulative = ChiSquaredUtils.CalculateChiSquareCDT(attributeToSplitOn.PossibleValueCounts.Count - 1, chiTestValue);

            return chiSquareCumulative >= ChiTestLimit;
        }
        /// <summary>
        /// Trains this tree level from the given examples. The level either becomes a
        /// leaf (its <c>_localValue</c> is set) or it splits on the attribute with the
        /// minimum entropy, creating one sub-tree per encountered attribute value and
        /// training each sub-tree recursively.
        /// </summary>
        /// <param name="attributes">Attributes that may still be used for splitting.</param>
        /// <param name="values">Examples to train this level on.</param>
        public void D3(List <DataSetAttribute> attributes, List <DataSetValue> values)
        {
            int  trueCount  = values.Count(v => v.Output);
            int  falseCount = values.Count - trueCount;
            bool hasTrue    = trueCount > 0;
            bool hasFalse   = falseCount > 0;

            // Exactly one output class is present: nothing to split, the answer is that class.
            if (hasTrue != hasFalse)
            {
                _localValue = hasTrue;
                return;
            }

            // Fallback answer whenever splitting is impossible or not worthwhile (ties yield false).
            bool majorityOutput = trueCount > falseCount;

            // No attribute left to split on.
            if (attributes.Count == 0)
            {
                _localValue = majorityOutput;
                return;
            }

            // Pick the attribute whose split gives the minimum entropy.
            DataSetAttributeWithCounts bestAttribute =
                FindAttributeWithMinEntropy(CalculateEForAllAttributes(attributes, values));

            // Recorded before the chi-squared check, so it is set even when the split is rejected below.
            _attributeToSplitOn = bestAttribute;

            // Reject splits that the chi-squared test does not consider significant.
            if (!ShouldSplitOnAttributeAccordingToChiSquared(bestAttribute))
            {
                _localValue = majorityOutput;
                return;
            }

            // Sub-trees may use every attribute except the one consumed here.
            var splitName = bestAttribute.Name;
            List <DataSetAttribute> remainingAttributes = attributes.Where(a => a.Name != splitName).ToList();

            // Partition the examples by their value for the chosen attribute,
            // creating a sub-tree the first time each value is encountered.
            _dictionaryOfSubTrees = new Dictionary <string, DecisionTreeLevel>(bestAttribute.PossibleValues.Count);
            var partitions = new Dictionary <string, List <DataSetValue> >();

            foreach (DataSetValue example in values)
            {
                string key = example.Values[bestAttribute.ValueIndex];
                List <DataSetValue> bucket;

                if (!partitions.TryGetValue(key, out bucket))
                {
                    _dictionaryOfSubTrees.Add(key, new DecisionTreeLevel(ChiTestLimit));
                    bucket = new List <DataSetValue>();
                    partitions.Add(key, bucket);
                }

                bucket.Add(example);
            }

            // Train every sub-tree on its own partition.
            foreach (KeyValuePair <string, List <DataSetValue> > partition in partitions)
            {
                _dictionaryOfSubTrees[partition.Key].D3(remainingAttributes, partition.Value);
            }
        }