/// <summary>
/// Creates a subset descriptor and computes the base-2 entropy of the output labels
/// of the samples selected by <paramref name="mask"/>.
/// </summary>
/// <param name="mask">Bit mask over sample indices; sample <c>i</c> is counted when <c>mask[i]</c> is true.</param>
/// <param name="size">Number of samples in the subset as supplied by the caller; used as the denominator of the label probabilities.</param>
/// <param name="output">Output label of each sample, indexed by sample.</param>
/// <param name="outputFactorsCount">Number of output labels; only labels in <c>[0, outputFactorsCount)</c> are counted.</param>
/// <remarks>
/// If one label accounts for exactly <c>Size</c> samples, <c>Entropy</c> is set to 0 and
/// <c>EntropyZeroFactor</c> to that label; otherwise <c>EntropyZeroFactor</c> stays -1.
/// Note that when <c>Size</c> is 0 (and at least one label exists) label 0 matches this
/// condition, so an empty subset is reported as pure with <c>EntropyZeroFactor</c> 0.
/// </remarks>
public SubsetMask(BitArrayX mask, int size, int[] output, int outputFactorsCount)
{
    Mask = mask;
    Size = size;
    Entropy = 0d;
    EntropyZeroFactor = -1;

    // Nothing to count; also keeps the "no factors" case from touching mask/output at all.
    if (outputFactorsCount <= 0)
    {
        return;
    }

    // Tally every label in one pass instead of re-scanning the samples once per label.
    var counts = new int[outputFactorsCount];
    for (var i = 0; i < output.Length; i++)
    {
        if (!mask[i])
        {
            continue;
        }

        var label = output[i];

        // Labels outside the declared range never matched any factor before; keep ignoring them.
        if (label >= 0 && label < outputFactorsCount)
        {
            counts[label]++;
        }
    }

    for (var factor = 0; factor < outputFactorsCount; factor++)
    {
        var count = counts[factor];

        if (count == Size)
        {
            // The whole subset carries a single label: pure subset, zero entropy.
            Entropy = 0d;
            EntropyZeroFactor = factor;
            return;
        }

        if (count > 0)
        {
            var px = (double)count / Size;
            Entropy -= px * Math.Log(px, 2);
        }
    }
}
/// <summary>
/// Builds the decision tree. A root node is created over <c>input.Length</c> samples
/// (its mask is constructed with an initial value of <c>true</c>) with an empty set of
/// used factors, then nodes are split one at a time using an explicit stack.
/// </summary>
/// <remarks>
/// After a node is split, its <c>Present</c> and <c>Absent</c> children are queued
/// unless the node was marked as a leaf.
/// </remarks>
public void Learn()
{
    var everySample = new BitArrayX(input.Length, true);

    var rootNode = new DecisionNode();
    rootNode.SubsetMask = new SubsetMask(everySample, input.Length, output, outputFactorsCount);
    rootNode.UsedFactors = new BitArrayX(inputFactorsCount);
    Root = rootNode;

    var pending = new Stack<DecisionNode>();
    pending.Push(Root);

    while (pending.Count != 0)
    {
        var current = pending.Pop();
        SplitNode(current);

        if (!current.IsLeaf)
        {
            // Present is pushed first, so the Absent branch is processed first.
            pending.Push(current.Present);
            pending.Push(current.Absent);
        }
    }
}
/// <summary>
/// Checks, for every length from 0 to 199, that a <c>BitArrayX</c> constructed with an
/// initial value of <c>true</c> equals one whose bits were each set to <c>true</c> by index.
/// </summary>
public void CanCreateAllTrues()
{
    const int lengthLimit = 200;

    for (var bitCount = 0; bitCount < lengthLimit; bitCount++)
    {
        // Reference instance: every bit set explicitly through the indexer.
        var setBitByBit = new BitArrayX(bitCount);
        for (var index = 0; index < bitCount; index++)
        {
            setBitByBit[index] = true;
        }

        // Instance under test: created through the (length, initialValue) constructor.
        var setByConstructor = new BitArrayX(bitCount, true);

        Assert.IsTrue(BitArrayX.Equals(setBitByBit, setByConstructor));
    }
}
/// <summary>
/// Turns <paramref name="node"/> into either a leaf or an internal node with
/// <c>Present</c> and <c>Absent</c> children.
/// </summary>
/// <param name="node">Node to process; its <c>SubsetMask</c> and <c>UsedFactors</c> are read.</param>
/// <remarks>
/// <list type="bullet">
/// <item>If the node's subset has a non-negative <c>EntropyZeroFactor</c>, the node becomes a leaf with that factor.</item>
/// <item>Otherwise every input factor not yet marked in <c>UsedFactors</c> is tried and the one with the highest information gain is kept (the first one wins on ties).</item>
/// <item>If no factor is left to try, the node becomes a leaf labelled with the most frequent output among its samples.</item>
/// </list>
/// </remarks>
private void SplitNode(DecisionNode node)
{
    // A subset with a single output label needs no further splitting.
    if (node.SubsetMask.EntropyZeroFactor >= 0)
    {
        node.IsLeaf = true;
        node.Factor = node.SubsetMask.EntropyZeroFactor;
        return;
    }

    var nodeMask = node.SubsetMask.Mask;
    var nodeSize = node.SubsetMask.Size;
    var nodeEntropy = node.SubsetMask.Entropy;

    var chosenFactor = -1;
    var chosenGain = double.MinValue;
    var chosenPresent = default(SubsetMask);
    var chosenAbsent = default(SubsetMask);

    for (var candidate = 0; candidate < inputFactorsCount; candidate++)
    {
        if (node.UsedFactors[candidate])
        {
            continue;
        }

        // Partition the node's samples by the candidate factor
        // (factorPresentMasks[candidate] presumably marks samples having it — confirm at its declaration).
        var presentBits = new BitArrayX(nodeMask).And(factorPresentMasks[candidate]);
        var absentBits = new BitArrayX(nodeMask).AndNot(presentBits);

        var presentCount = presentBits.CountBitSet();
        var absentCount = nodeSize - presentCount;

        var presentSubset = new SubsetMask(presentBits, presentCount, output, outputFactorsCount);
        var absentSubset = new SubsetMask(absentBits, absentCount, output, outputFactorsCount);

        // Parent entropy minus the size-weighted entropies of both partitions.
        var presentShare = presentSubset.Entropy * presentCount / nodeSize;
        var absentShare = absentSubset.Entropy * absentCount / nodeSize;
        var gain = nodeEntropy - presentShare - absentShare;

        // Strict '>' on purpose: ties keep the earlier factor and a NaN gain is never selected.
        if (gain > chosenGain)
        {
            chosenGain = gain;
            chosenFactor = candidate;
            chosenPresent = presentSubset;
            chosenAbsent = absentSubset;
        }
    }

    if (chosenFactor == -1)
    {
        // Nothing was selected: label the leaf with the most frequent output in the subset.
        var tally = new int[outputFactorsCount];
        for (var sample = 0; sample < output.Length; sample++)
        {
            if (mask(nodeMask, sample))
            {
                tally[output[sample]]++;
            }
        }

        node.IsLeaf = true;
        node.Factor = tally.IndexOfMax();
        return;
    }

    // Both children share one used-factor set: the parent's set plus the factor chosen here.
    var usedByChildren = new BitArrayX(node.UsedFactors);
    usedByChildren[chosenFactor] = true;

    node.Factor = chosenFactor;

    node.Present = new DecisionNode
    {
        SubsetMask = chosenPresent,
        Factor = chosenFactor,
        Parent = node,
        UsedFactors = usedByChildren,
    };

    node.Absent = new DecisionNode
    {
        SubsetMask = chosenAbsent,
        Factor = chosenFactor,
        Parent = node,
        UsedFactors = usedByChildren,
    };

    // Reads one bit of the node mask; kept local so the tally loop reads as a single condition.
    bool mask(BitArrayX bits, int index)
    {
        return bits[index];
    }
}