/// <summary>
/// Reconstruct a decision tree from an array of nodes given in breadth-first order.
/// </summary>
/// <param name="attrSet">The attribute set of the tree.</param>
/// <param name="goalAttr">The goal attribute of the tree.</param>
/// <param name="nodes">The nodes of the tree, anchor first, in breadth-first order.</param>
public TestDecisionTree(AttributeSet attrSet, SymbolicAttribute goalAttr, Node[] nodes)
    : base(attrSet, goalAttr)
{
    int cur = 0;
    Node curNode;
    Node[] nodeArray = nodes.ToArray();

    if (!(nodeArray[0] is AnchorNode))
    {
        throw new ArgumentException("The first node must be an anchor node.");
    }

    this._anchor = new AnchorNode(this);

    curNode = nodeArray[++cur];
    this._anchor.Replace(curNode);
    curNode.Father = this._anchor;

    Queue<Node> queue = new Queue<Node>();
    queue.Enqueue(curNode);

    do
    {
        curNode = queue.Dequeue();
        int nbSons = curNode.NumOfSons();
        List<Node> sons = new List<Node>();

        for (int i = 0; i < nbSons; i++)
        {
            Node son = nodeArray[++cur];
            sons.Add(son);
            // Enqueue the son so its own sons are processed later.
            queue.Enqueue(son);
        }

        if (curNode is TestNode)
        {
            TestNode tn = new TestNode(curNode.Weight, (curNode as TestNode).Test, sons);
            curNode.Replace(tn);
            curNode = tn;
        }
        else if (curNode is LeafNode)
        {
            // Leaf nodes are kept as they are.
            curNode.Replace((LeafNode)curNode);
        }

        foreach (Node n in sons)
        {
            n.Father = curNode;
        }
    } while (queue.Count > 0);
}
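// Layout sketch (illustrative, not part of the original documentation): the constructor
// above reads sons breadth-first, so the node array is expected to start with the anchor
// followed by the remaining nodes level by level, e.g. for a root test node with two leaf sons:
//   Node[] nodes = { anchor, rootTestNode, leftLeaf, rightLeaf };
// An array serialized in any other order will wire the tree incorrectly.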
/// <summary>
/// Create a symbolic test matching the given attribute against a set of values.
/// </summary>
/// <param name="attr">The symbolic attribute to test.</param>
/// <param name="values">The values accepted by the test.</param>
public SymbolicTest(SymbolicAttribute attr, IEnumerable<KnownSymbolicValue> values)
    : base(attr)
{
    if (values == null)
    {
        throw new ArgumentNullException(nameof(values));
    }

    _values = values.ToArray();
}
/// <summary>
/// Create an empty decision tree object.
/// </summary>
/// <param name="attrSet"></param>
/// <param name="goalAttr"></param>
public DecisionTree(AttributeSet attrSet, SymbolicAttribute goalAttr)
{
    if (attrSet == null || goalAttr == null)
    {
        throw new ArgumentNullException();
    }

    _anchor = new AnchorNode(this);
    _attributeSet = attrSet;
    _goalAttribute = goalAttr;
}
/// <summary>
/// Finds the best splitting test for the given attribute, dispatching on its concrete type.
/// </summary>
/// <param name="testAttribute">The attribute to test, either symbolic or numerical.</param>
/// <param name="goalAttribute">The goal attribute to predict.</param>
/// <returns>The best test on the attribute together with its score.</returns>
public TestScore BestSplitTest(Attribute testAttribute, SymbolicAttribute goalAttribute)
{
    if (testAttribute is SymbolicAttribute)
    {
        return BestSplitTest((SymbolicAttribute)testAttribute, goalAttribute);
    }
    else if (testAttribute is NumericalAttribute)
    {
        return BestSplitTest((NumericalAttribute)testAttribute, goalAttribute);
    }
    else
    {
        throw new ArgumentException("Unknown attribute type.");
    }
}
/// <summary>
/// Create a decision tree builder for the given learning set, test attributes and goal attribute.
/// </summary>
/// <param name="learningItemSet">The set of items to learn from.</param>
/// <param name="testAttributeSet">The attributes that may be tested by the tree.</param>
/// <param name="goalAttribute">The attribute the tree must predict.</param>
public SimpleDecisionTreeBuilder(ItemSet learningItemSet, AttributeSet testAttributeSet, SymbolicAttribute goalAttribute)
{
    if (learningItemSet == null || learningItemSet.NumOfItems() == 0)
    {
        throw new ArgumentNullException();
    }

    this._learningSet = learningItemSet;
    this._testAttributeSet = testAttributeSet;
    this._goalAttribute = goalAttribute;

    LearningDecisionTree tree = new LearningDecisionTree(learningItemSet.AttrSet, goalAttribute, learningItemSet);
    this._tree = tree;
}
/// <summary>
/// Returns the distribution of goal values. This distribution is represented by an array;
/// its i-th element is proportional to the weight of the i-th goal value.
/// The sum of the elements of this array is equal to 1.
/// </summary>
/// <returns>An array describing the goal value distribution associated to this node.</returns>
public override double[] GetGoalValueDistribution()
{
    WeightedItemSet itemSet;
    DecisionTree dt = base.Tree();

    if (dt == null || _learningSet == null)
    {
        return null;
    }

    if (!(_learningSet is WeightedItemSet))
    {
        itemSet = new WeightedItemSet(_learningSet);
    }
    else
    {
        itemSet = (WeightedItemSet)_learningSet;
    }

    SymbolicAttribute goalAttr = dt.GoalAttribute;
    if (goalAttr == null)
    {
        return null;
    }

    // Accumulate the weight of each goal value over the items of the learning set.
    double[] frequencies = new double[goalAttr.NumOfValues];
    for (int i = 0; i < itemSet.NumOfItems(); i++)
    {
        int id = ((KnownSymbolicValue)itemSet.Items[i].ValueOf(itemSet.AttrSet.IndexOf(goalAttr))).IntValue;
        frequencies[id] += itemSet.GetWeight(i);
    }

    // Normalize so the distribution sums to 1.
    for (int i = 0; i < frequencies.Length; i++)
    {
        frequencies[i] /= itemSet.Size();
    }

    return frequencies;
}
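// Worked example (illustrative): for a learning set of three items with unit weights and
// goal values {0, 0, 1}, the frequencies accumulate to {2, 1} and the returned distribution
// is approximately {0.67, 0.33}, indexed by goal value and summing to 1.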
/// <summary>
/// Computes the entropy of the set regarding a given symbolic attribute.
/// </summary>
/// <param name="attr">The attribute against which to compute the entropy.</param>
/// <returns>The entropy of this item set with respect to <paramref name="attr"/>.</returns>
public virtual double CalEntropy(SymbolicAttribute attr)
{
    if (!_attributeSet.Contains(attr))
    {
        throw new ArgumentException("Unknown attribute");
    }

    // Recompute only if the cached entropy is invalid or was computed for another attribute.
    if (this._entropy < 0.0d || !_entropyAttribute.Equals(attr))
    {
        double[] frequencies = new double[attr.NumOfValues];
        for (int i = 0; i < _items.Count; i++)
        {
            KnownSymbolicValue sv = (KnownSymbolicValue)_items[i].ValueOf(_attributeSet.IndexOf(attr));
            frequencies[sv.IntValue]++;
        }

        this._entropy = Entropy.CalEntropy(frequencies);
        _entropyAttribute = attr;
    }

    return _entropy;
}
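// Sketch of the underlying computation (Entropy.CalEntropy is assumed to normalize the
// frequency counts to probabilities p_i and return the Shannon entropy -sum(p_i * log2(p_i))):
//   double h = Entropy.CalEntropy(new double[] { 8.0, 2.0 });  // ~0.72 bits for an 8/2 split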
/// <summary>
/// Finds the best splitting test on the given attribute, taking into account items
/// whose value for that attribute is unknown.
/// </summary>
/// <param name="testAttr">The attribute to test.</param>
/// <param name="goalAttr">The goal attribute to predict.</param>
/// <returns>The best test on the attribute and its score.</returns>
protected TestScore BestSplitTest(Attribute testAttr, SymbolicAttribute goalAttr)
{
    // Collect the items whose value for the test attribute is known.
    ItemSet knownItems = new ItemSet(_attributeSet);
    int nbKnown = 0;
    foreach (Item it in _items)
    {
        if (!it.ValueOf(_attributeSet, testAttr).IsUnknown())
        {
            knownItems.Add(it);
            nbKnown++;
        }
    }

    if (nbKnown == 0)
    {
        // No information can be gained from this test; return a dummy test with a zero score.
        Test test;
        if (testAttr is SymbolicAttribute)
        {
            // Symbolic test
            test = new SymbolicTest((SymbolicAttribute)testAttr, new KnownSymbolicValue[] { new KnownSymbolicValue(0) });
        }
        else
        {
            // Numerical test
            test = new NumericalTest((NumericalAttribute)testAttr, 0.0d);
        }

        return new TestScore(test, 0.0d);
    }
    else
    {
        TestScore knownTestScore = knownItems.BestSplitTest(testAttr, goalAttr);
        return new TestScore(knownTestScore.Test, knownTestScore.Score * (double)nbKnown / Items.Count);
    }
}
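// Worked example (illustrative): if the best test on the known subset has a gain of 0.9
// and only 40 of 100 items have a known value for testAttr, the returned score is
// 0.9 * 40 / 100 = 0.36, so a test on a mostly-unknown attribute is penalized accordingly.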
/// <summary>
/// Computes the entropy of the set regarding a given symbolic attribute.
/// The frequency of each value of this attribute is counted according to the item weights.
/// The value of this attribute must be known for all the items of this set.
/// </summary>
/// <param name="attr">The attribute against which to compute the entropy.</param>
/// <returns>The weighted entropy of this item set.</returns>
public override double CalEntropy(SymbolicAttribute attr)
{
    if (!_attributeSet.Contains(attr))
    {
        throw new ArgumentException("Unknown attribute");
    }

    if (_entropy < 0.0d || !_entropyAttribute.Equals(attr))
    {
        double[] freqs = new double[attr.NumOfValues];
        for (int i = 0; i < Items.Count; i++)
        {
            KnownSymbolicValue sv = Items[i].ValueOf(_attributeSet, attr) as KnownSymbolicValue;
            freqs[sv.IntValue] += _weights[i];
        }

        _entropy = Biotracker.Signature.DT.Entropy.CalEntropy(freqs);
        _entropyAttribute = attr;
    }

    return this._entropy;
}
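// Worked example (illustrative, assuming the same normalization as the unweighted case):
// with weights {2.0, 1.0, 1.0} and goal values {0, 1, 1}, the weighted frequencies are
// {2.0, 2.0}, giving an entropy of 1 bit, exactly as if the first item appeared twice in
// an unweighted set.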
/// <summary>
/// Finds the best splitting test involving a numerical attribute.
/// </summary>
/// <param name="testAttr">The numerical attribute to test.</param>
/// <param name="goalAttr">The goal attribute to predict.</param>
/// <returns>The best threshold test on the attribute and its score.</returns>
private TestScore BestSplitTest(NumericalAttribute testAttr, SymbolicAttribute goalAttr)
{
    int testIndex = _attributeSet.IndexOf(testAttr);
    int goalNbVal = goalAttr.NumOfValues;
    int goalIndex = _attributeSet.IndexOf(goalAttr);

    // freqLower (freqHigher) counts the number of items lower (higher) than the threshold
    // for each goal value. In the beginning, freqLower is zeroed because the threshold is
    // chosen smaller than every test value.
    double[] freqLower = new double[goalNbVal];
    double[] freqHigher = new double[goalNbVal];
    for (int gvi = 0; gvi < goalNbVal; gvi++)
    {
        SymbolicTest valTest = new SymbolicTest(goalAttr, new KnownSymbolicValue[] { new KnownSymbolicValue(gvi) });
        freqHigher[gvi] = Split(valTest).ElementAt(1).Size();
    }

    // These two variables hold the sum of the elements of the corresponding array.
    double freqLowerSum = 0.0d;
    double freqHigherSum = (double)_items.Count;

    // Pair each item's test value with its goal value and sort by test value.
    List<TestGoalValue> tgv = new List<TestGoalValue>();
    for (int i = 0; i < _items.Count; i++)
    {
        double testVal = ((KnownNumericalValue)this._items[i].ValueOf(testIndex)).Value;
        int goalVal = ((KnownSymbolicValue)this._items[i].ValueOf(goalIndex)).IntValue;
        tgv.Add(new TestGoalValue(testVal, goalVal));
    }
    tgv.Sort();

    int goalValue, goalValueNew = tgv[0].GoalValue;
    double testValue, testValueNew = tgv[0].TestValue;
    double bestScore = 0.0d;
    double bestThreshold = testValueNew;

    for (int i = 1; i < _items.Count; i++)
    {
        testValue = testValueNew;
        goalValue = goalValueNew;
        testValueNew = tgv[i].TestValue;
        goalValueNew = tgv[i].GoalValue;

        // Move the previous item from the "higher" side to the "lower" side.
        freqLower[goalValue]++;
        freqLowerSum++;
        freqHigher[goalValue]--;
        freqHigherSum--;

        // Only evaluate a threshold where the test value actually changes.
        if (testValue != testValueNew)
        {
            double score = CalEntropy(goalAttr)
                - (freqLowerSum / _items.Count) * Entropy.CalEntropy(freqLower)
                - (freqHigherSum / _items.Count) * Entropy.CalEntropy(freqHigher);
            if (score > bestScore)
            {
                bestScore = score;
                bestThreshold = (testValue + testValueNew) / 2.0d;
            }
        }
    }

    return new TestScore(new NumericalTest(testAttr, bestThreshold), bestScore);
}
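// The score maximized above is the information gain of a candidate threshold t:
//   gain(t) = H(goal) - (nLower / n) * H(goal | test < t) - (nHigher / n) * H(goal | test >= t)
// where H is the entropy returned by Entropy.CalEntropy. Candidate thresholds are placed
// halfway between two consecutive distinct test values, so only boundaries where the test
// value changes are evaluated.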
/// <summary>
/// Finds the best splitting test involving a symbolic attribute.
/// </summary>
/// <param name="testAttr">The symbolic attribute to test.</param>
/// <param name="goalAttr">The goal attribute to predict.</param>
/// <returns>The best test on a subset of the attribute's values and its score.</returns>
protected TestScore BestSplitTest(SymbolicAttribute testAttr, SymbolicAttribute goalAttr)
{
    int testNbVal = testAttr.NumOfValues;
    int testIndex = _attributeSet.IndexOf(testAttr);
    int goalNbVal = goalAttr.NumOfValues;
    int goalIndex = _attributeSet.IndexOf(goalAttr);

    // freqMatch[tvi][gvi] is the number of items that have the value tvi for their 'test'
    // attribute and the value gvi for their 'goal' attribute.
    // freqMatchSum[tvi] is the sum of the freqMatch[tvi][gvi] elements (over all gvi).
    double[][] freqMatch = new double[testNbVal][];
    for (int i = 0; i < testNbVal; i++)
    {
        freqMatch[i] = new double[goalNbVal];
    }
    double[] freqMatchSum = new double[testNbVal];

    // Identically for the items that do not have tvi as a test attribute value.
    double[][] freqNoMatch = new double[testNbVal][];
    for (int i = 0; i < testNbVal; i++)
    {
        freqNoMatch[i] = new double[goalNbVal];
    }
    double[] freqNoMatchSum = new double[testNbVal];

    for (int i = 0; i < _items.Count; i++)
    {
        int testVal = ((KnownSymbolicValue)_items[i].ValueOf(testIndex)).IntValue;
        int goalVal = ((KnownSymbolicValue)_items[i].ValueOf(goalIndex)).IntValue;
        for (int tvi = 0; tvi < testNbVal; tvi++)
        {
            if (testVal == tvi)
            {
                freqMatch[tvi][goalVal]++;
                freqMatchSum[tvi]++;
            }
            else
            {
                freqNoMatch[tvi][goalVal]++;
                freqNoMatchSum[tvi]++;
            }
        }
    }

    // Find the single test attribute value with the highest information gain.
    double bestScore = -1.0d;
    int bestValue = -1;
    for (int tvi = 0; tvi < testNbVal; tvi++)
    {
        double score = CalEntropy(goalAttr)
            - ((freqMatchSum[tvi] / _items.Count) * Entropy.CalEntropy(freqMatch[tvi]))
            - ((freqNoMatchSum[tvi] / _items.Count) * Entropy.CalEntropy(freqNoMatch[tvi]));
        if (score > bestScore)
        {
            bestScore = score;
            bestValue = tvi;
        }
    }

    // Group the attribute values one by one.
    List<int> remainTestValueIndexes = new List<int>();
    for (int i = 0; i < testNbVal; i++)
    {
        remainTestValueIndexes.Add(i);
    }

    double[] remainingFreqMatch = new double[goalNbVal];
    double[] remainingFreqNoMatch = new double[goalNbVal];
    for (int gvi = 0; gvi < goalNbVal; gvi++)
    {
        remainingFreqNoMatch[gvi] = freqMatch[0][gvi] + freqNoMatch[0][gvi];
    }
    double remainingFreqMatchSum = 0.0d;
    double remainingFreqNoMatchSum = (double)_items.Count;

    List<int> orderedValueIndex = new List<int>();
    List<double> orderedScores = new List<double>();
    orderedValueIndex.Add(bestValue);
    orderedScores.Add(bestScore);

    // Remove values until only one is left.
    while (remainTestValueIndexes.Count >= 2)
    {
        // Update the remaining frequency arrays according to the last test attribute value removed.
        remainTestValueIndexes.Remove(bestValue);
        for (int gvi = 0; gvi < goalNbVal; gvi++)
        {
            remainingFreqMatch[gvi] += freqMatch[bestValue][gvi];
            remainingFreqNoMatch[gvi] -= freqMatch[bestValue][gvi];
        }
        remainingFreqMatchSum += freqMatchSum[bestValue];
        remainingFreqNoMatchSum -= freqMatchSum[bestValue];

        bestScore = -1.0d;

        // Find the next best test attribute value.
        for (int i = 0; i < remainTestValueIndexes.Count; i++)
        {
            int tvi = remainTestValueIndexes[i];
            double[] thisFreqMatch = new double[goalNbVal];
            double[] thisFreqNoMatch = new double[goalNbVal];
            double thisFreqMatchSum = 0.0d;
            double thisFreqNoMatchSum = 0.0d;

            for (int gvi = 0; gvi < goalNbVal; gvi++)
            {
                thisFreqMatch[gvi] = freqMatch[tvi][gvi] + remainingFreqMatch[gvi];
                thisFreqNoMatch[gvi] = remainingFreqNoMatch[gvi] - freqMatch[tvi][gvi];
            }
            thisFreqMatchSum = freqMatchSum[tvi] + remainingFreqMatchSum;
            thisFreqNoMatchSum = remainingFreqNoMatchSum - freqMatchSum[tvi];

            double score = CalEntropy(goalAttr)
                - ((thisFreqMatchSum / _items.Count) * Entropy.CalEntropy(thisFreqMatch))
                - ((thisFreqNoMatchSum / _items.Count) * Entropy.CalEntropy(thisFreqNoMatch));
            if (score > bestScore)
            {
                bestScore = score;
                bestValue = tvi;
            }
        }

        // Record the score obtained after adding this value to the group.
        orderedScores.Add(bestScore);
        orderedValueIndex.Add(bestValue);
    }

    // Keep the prefix of the ordered values that yields the highest score.
    bestScore = -1.0d;
    int bestIndex = 0;
    for (int i = 0; i < orderedScores.Count; i++)
    {
        double score = orderedScores[i];
        if (score > bestScore)
        {
            bestScore = score;
            bestIndex = i;
        }
    }

    KnownSymbolicValue[] testValueIndexes = new KnownSymbolicValue[bestIndex + 1];
    for (int i = 0; i <= bestIndex; i++)
    {
        int val = orderedValueIndex[i];
        testValueIndexes[i] = new KnownSymbolicValue(val);
    }

    return new TestScore(new SymbolicTest(testAttr, testValueIndexes), bestScore);
}
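// Summary of the grouping strategy above: test attribute values are ranked greedily, at
// each step adding the value whose inclusion in the 'match' group yields the highest
// information gain; the score recorded after each step is then scanned and the
// best-scoring prefix of the ranking becomes the value set of the returned SymbolicTest.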
/// <summary>
/// Finds, for each candidate attribute, the test performing the best split for finding
/// the value of a 'goal' attribute.
/// </summary>
/// <param name="candidateAttributes">The set of attributes that can be tested.</param>
/// <param name="goalAttribute">The attribute to guess using the tests.</param>
/// <returns>The best test and score for each candidate attribute.</returns>
public IEnumerable<TestScore> BestSplitTests(AttributeSet candidateAttributes, SymbolicAttribute goalAttribute)
{
    if (candidateAttributes == null || goalAttribute == null || candidateAttributes.Size() == 0)
    {
        throw new ArgumentNullException();
    }

    List<TestScore> bestScores = new List<TestScore>();
    List<Attribute> attributes = candidateAttributes.GetAttributes().ToList();
    foreach (Attribute attr in attributes)
    {
        bestScores.Add(BestSplitTest(attr, goalAttribute));
    }

    return bestScores;
}
/// <summary>
/// Finds the test on one attribute performing the best split (bringing the most information)
/// for finding the value of a 'goal' attribute.
/// </summary>
/// <param name="candidateAttributes">The set of attributes defining which attributes can be tested.</param>
/// <param name="goalAttribute">The attribute to guess using the test.</param>
/// <returns>The highest-scoring test over all candidate attributes.</returns>
public TestScore BestSplitTest(AttributeSet candidateAttributes, SymbolicAttribute goalAttribute)
{
    return BestSplitTests(candidateAttributes, goalAttribute).Max<TestScore>();
}
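// Usage sketch (hypothetical variable names; Enumerable.Max without a selector assumes
// TestScore is comparable, e.g. via IComparable):
//   TestScore best = itemSet.BestSplitTest(candidateAttributes, goalAttribute);
//   // best.Test is the winning test, best.Score its information gain.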
/// <summary>
/// Create an empty learning decision tree.
/// </summary>
/// <param name="attrSet">The attribute set of the tree.</param>
/// <param name="goalAttr">The goal attribute of the tree.</param>
/// <param name="learningSet">The learning set used to grow the tree.</param>
public LearningDecisionTree(AttributeSet attrSet, SymbolicAttribute goalAttr, ItemSet learningSet)
    : base(attrSet, goalAttr)
{
    Root().Replace(new LearningOpenNode(0, learningSet));
}