//////////////////////////////////////////////////////
// Mutator:
//    CDTNode::Train
//
// Purpose:
//    Calculates the information gain for the remaining
//    attributes, the entropy of the remaining examples,
//    and creates splits/sub-nodes
//////////////////////////////////////////////////////
public void Train()
{
    // Note:
    // I need a better approach to situations where no more attributes
    // are left to split on, but multiple classes exist in the examples.

    // If we don't have any examples to train on, then
    // this node HAS to be a leaf. We will keep the classification
    // given to us, which was the most common class from the parent node
    if (m_rwExampleIDs.Count() == 0)
    {
        m_bIsLeaf = true;
        return;
    }

    m_bIsLeaf = false;
    m_fEntropy = GetEntropy();

    //assert( m_fEntropy >= 0.0f );
    //assert( m_fEntropy <= ( (float)m_pDecisionTree.GetNumClasses() - 1.0f ) );

    // If we don't have any entropy, then go ahead and treat this like a leaf,
    // since we don't have any reason to calculate any more
    if (m_fEntropy == 0.0f)
    {
        m_bIsLeaf = true;
        m_iClass = m_pDecisionTree.GetExample(m_rwExampleIDs[0]).GetClassIdentifier();
        return;
    }

    // Calculate the gains for the attributes
    int iAttributeID = 0;
    int i = 0;
    int iBestClassCount = 0;                                 // The number of times the most common class appears
    int iBestAttributeID = m_rwRemainingAttributeIDs[0];     // Treat the first attribute initially as the best
    float fBestGain = GetInformationGain(iBestAttributeID);  // Save the information gain from the first attribute

    m_rwInformationGain = new List<float>();
    for (i = 0; i < m_pDecisionTree.GetNumAttributes(); ++i)
    {
        m_rwInformationGain.Add(0.0f);
    }

    // Put the gain we just calculated into the proper place
    m_rwInformationGain[iBestAttributeID] = fBestGain;

    // Find the information gain for each remaining attribute and store it,
    // while keeping track of the best gain and the attribute
    // that it goes with.
    int iRemainingAttributeCount = m_rwRemainingAttributeIDs.Count();
    for (i = 1; i < iRemainingAttributeCount; ++i)
    {
        iAttributeID = m_rwRemainingAttributeIDs[i];
        m_rwInformationGain[iAttributeID] = GetInformationGain(iAttributeID);

        // If we find a better gain, store it
        // and remember which attribute it came from
        if (m_rwInformationGain[iAttributeID] > fBestGain)
        {
            fBestGain = m_rwInformationGain[iAttributeID];
            iBestAttributeID = iAttributeID;
        }
    }

    // Set the class counts to 0
    m_rwClassCount = new List<int>();
    int iTreeClassCount = m_pDecisionTree.GetNumClasses();
    for (i = 0; i < iTreeClassCount; ++i)
    {
        m_rwClassCount.Add(0);
    }

    // Store the attribute this node splits on
    m_iAttributeID = iBestAttributeID;

    // The list of attributes to give to the new children
    List<int> rwNewAttributeList = m_rwRemainingAttributeIDs;

    // If we don't have any attributes left to split on,
    // then this node has to be a leaf, so we need to
    // find the most common class for the examples
    // and make that the node's classification
    m_bIsLeaf = !m_rwRemainingAttributeIDs.Any();

    // If we are not a leaf, then generate the children nodes
    // and give them a list of remaining attributes they can split on
    if (!m_bIsLeaf)
    {
        // The children may split on anything except the attribute used here
        rwNewAttributeList = (from attributeId in rwNewAttributeList
                              where attributeId != m_iAttributeID
                              select attributeId).ToList();

        // Create one sub-node per value of the chosen attribute
        CAttribute attributeToSplitOn = m_pDecisionTree.GetAttribute(m_iAttributeID);
        int iAttributeValueCount = attributeToSplitOn.GetNumValues();
        for (i = 0; i < iAttributeValueCount; ++i)
        {
            CDTNode pNewNode = new CDTNode(m_pDecisionTree, rwNewAttributeList);
            m_rwChildren.Add(pNewNode);
        }
    }

    // Now that we know the best attribute to split/branch on,
    // send the examples to their appropriate child nodes
    // while finding the most common classification
    // for this node's examples
    int iExampleCount = m_rwExampleIDs.Count();
    for (i = 0; i < iExampleCount; ++i)
    {
        int iExampleID = m_rwExampleIDs[i];
        CExample curExample = m_pDecisionTree.GetExample(iExampleID);
        int iExampleClassID = curExample.GetClassIdentifier();

        // Keep a tally of the occurrence of each classification
        // and which class is the most common
        ++m_rwClassCount[iExampleClassID];
        if (iBestClassCount < m_rwClassCount[iExampleClassID])
        {
            iBestClassCount = m_rwClassCount[iExampleClassID];
            m_iClass = iExampleClassID;
        }

        // Match the example with the subtree that matches the attribute's value
        if (!m_bIsLeaf)
        {
            int iValueID = curExample.GetValueIdentifier(m_iAttributeID);

            // Add the example into the correct subtree; this level's
            // list is cleared once all examples have been handed down
            m_rwChildren[iValueID].GetExampleIdentifierList().Add(iExampleID);
        }
    }

    // If we are a leaf, then we don't have any children
    // nodes to calculate, so just return
    if (m_bIsLeaf)
    {
        return;
    }

    // No more examples should be associated with this node.
    m_rwExampleIDs.Clear();

    // Calculate all the subtrees for this node. Each child starts with
    // this node's most common class in case it receives no examples.
    int iNumValues = m_pDecisionTree.GetAttribute(m_iAttributeID).GetNumValues();
    for (i = 0; i < iNumValues; ++i)
    {
        m_rwChildren[i].m_iClass = m_iClass;
        m_rwChildren[i].Train();
    }
}
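//////////////////////////////////////////////////////
// Note:
//    GetEntropy() and GetInformationGain() are called by Train() but their
//    bodies are not shown in this file. The sketch below is a minimal,
//    hypothetical implementation of the standard ID3 formulas they
//    presumably compute:
//        Entropy(S)  = -sum_c p(c) * log2(p(c))
//        Gain(S, A)  = Entropy(S) - sum_v (|S_v| / |S|) * Entropy(S_v)
//    The helper names EntropyOf/GainOf are illustrative only, and the
//    sketch assumes m_rwExampleIDs is a List<int> of example identifiers.
//////////////////////////////////////////////////////
private float EntropyOf(List<int> rwSubsetIDs)
{
    if (rwSubsetIDs.Count == 0)
    {
        return 0.0f;
    }

    // Tally how often each class appears in the subset
    var rwCounts = new int[m_pDecisionTree.GetNumClasses()];
    foreach (int iExampleID in rwSubsetIDs)
    {
        ++rwCounts[m_pDecisionTree.GetExample(iExampleID).GetClassIdentifier()];
    }

    // -sum_c p(c) * log2(p(c)), skipping classes that never appear
    float fEntropy = 0.0f;
    foreach (int iCount in rwCounts)
    {
        if (iCount == 0)
        {
            continue;
        }
        float fProportion = (float)iCount / rwSubsetIDs.Count;
        fEntropy -= fProportion * (float)Math.Log(fProportion, 2.0);
    }
    return fEntropy;
}

private float GainOf(int iAttributeID)
{
    // Partition this node's examples by the value they take for the attribute
    int iValueCount = m_pDecisionTree.GetAttribute(iAttributeID).GetNumValues();
    var rwPartitions = new List<List<int>>();
    for (int i = 0; i < iValueCount; ++i)
    {
        rwPartitions.Add(new List<int>());
    }
    foreach (int iExampleID in m_rwExampleIDs)
    {
        CExample curExample = m_pDecisionTree.GetExample(iExampleID);
        rwPartitions[curExample.GetValueIdentifier(iAttributeID)].Add(iExampleID);
    }

    // Gain(S, A) = Entropy(S) - sum_v (|S_v| / |S|) * Entropy(S_v)
    float fGain = EntropyOf(m_rwExampleIDs);
    foreach (List<int> rwSubset in rwPartitions)
    {
        fGain -= ((float)rwSubset.Count / m_rwExampleIDs.Count()) * EntropyOf(rwSubset);
    }
    return fGain;
}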
//////////////////////////////////////////////////////
// Procedure:
//    Stream
//
// Purpose:
//    Writes a human-readable description of this node
//    to the given stream, indented to reflect the given
//    depth of the node.
//////////////////////////////////////////////////////
public void Stream(StreamWriter inOutputStream, int iInDepth = 0)
{
    int i = 0;
    int j = 0;
    bool bIsBuiltFromExamples = m_pDecisionTree.GetNumExamples() > 0;

    PrintTabs(inOutputStream, iInDepth);

    if (bIsBuiltFromExamples)
    {
        inOutputStream.WriteLine($"Data set had an entropy of {m_fEntropy}");
    }

    if (bIsBuiltFromExamples && m_rwRemainingAttributeIDs.Any() && m_rwInformationGain.Any())
    {
        PrintTabs(inOutputStream, iInDepth);
        inOutputStream.Write($"Node had {m_rwRemainingAttributeIDs.Count()} attributes to choose from with gains of");
        for (i = 0; i < m_rwRemainingAttributeIDs.Count(); i++)
        {
            int attributeID = m_rwRemainingAttributeIDs[i];
            inOutputStream.Write($" {m_pDecisionTree.GetAttribute(attributeID).GetName()}:{m_rwInformationGain[attributeID]}");
        }
        inOutputStream.WriteLine();
    }

    PrintTabs(inOutputStream, iInDepth);
    if (!m_bIsLeaf)
    {
        inOutputStream.WriteLine();
        PrintTabs(inOutputStream, iInDepth);
        inOutputStream.WriteLine($"Split on attribute {m_pDecisionTree.GetAttribute(m_iAttributeID).GetName()}");
        for (i = 0; i < m_pDecisionTree.GetAttribute(m_iAttributeID).GetNumValues(); ++i)
        {
            PrintTabs(inOutputStream, iInDepth);
            inOutputStream.WriteLine($"Value {m_pDecisionTree.GetAttribute(m_iAttributeID).GetValue(i)}");
            m_rwChildren[i].Stream(inOutputStream, iInDepth + 1);
        }
    }
    else
    {
        inOutputStream.WriteLine($"Class = {m_pDecisionTree.GetClass(m_iClass)}");
    }

    if (m_rwExampleIDs.Any())
    {
        PrintTabs(inOutputStream, iInDepth);
        inOutputStream.WriteLine("Examples:");
    }

    for (j = 0; j < m_rwExampleIDs.Count(); j++)
    {
        CExample curExample = m_pDecisionTree.GetExample(m_rwExampleIDs[j]);

        PrintTabs(inOutputStream, iInDepth);
        inOutputStream.Write($"Class: {m_pDecisionTree.GetClass(curExample.GetClassIdentifier())}");
        for (i = 0; i < m_pDecisionTree.GetNumAttributes(); i++)
        {
            inOutputStream.Write($" {m_pDecisionTree.GetAttribute(i).GetName()}:{m_pDecisionTree.GetAttribute(i).GetValue(curExample.GetValueIdentifier(i))}");
        }
        inOutputStream.WriteLine();
    }
}
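//////////////////////////////////////////////////////
// Note:
//    PrintTabs is called throughout Stream() but is not shown in this
//    file. A minimal sketch of what it presumably does, assuming one
//    tab character per level of depth; the body below is illustrative,
//    not the original implementation.
//////////////////////////////////////////////////////
private static void PrintTabs(StreamWriter inOutputStream, int iInDepth)
{
    // Indent by one tab per level so nested nodes read as a tree
    for (int i = 0; i < iInDepth; ++i)
    {
        inOutputStream.Write('\t');
    }
}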
private static bool TestComplex(string szInFilename)
{
    bool bOutIsTestSuccessful = false;
    int iSuccessCount = 0;
    int iFailedCount = 0;

    Console.WriteLine("Starting self test.");

    var testTree = new CDecisionTree();
    bOutIsTestSuccessful = testTree.LoadTrainingData(szInFilename);

    // Console.WriteLine("Examples:");
    //
    // for (int iIndex = 0; iIndex < testTree.GetNumExamples(); ++iIndex)
    // {
    //     Console.WriteLine(testTree.GetExample(iIndex));
    // }
    //
    // Console.WriteLine();

    if (bOutIsTestSuccessful)
    {
        Console.WriteLine($"Starting self test using example file `{szInFilename}`");
        Console.WriteLine("\nTraining");
        testTree.Train();

        Console.WriteLine("Resulting tree:");
        Console.WriteLine(testTree);

        // Classify every training example and check it against its known class
        int iExampleCount = testTree.GetNumExamples();
        for (int iExampleIndex = 0; iExampleIndex < iExampleCount; ++iExampleIndex)
        {
            CExample example = testTree.GetExample(iExampleIndex);
            string szCorrectAnswer = testTree.GetClass(example.GetClassIdentifier());
            string szCalculatedAnswer = testTree.Classify(example);
            bool bIsExampleAMatchWithClassification = szCorrectAnswer == szCalculatedAnswer;

            iSuccessCount += bIsExampleAMatchWithClassification ? 1 : 0;
            iFailedCount += bIsExampleAMatchWithClassification ? 0 : 1;

            Console.WriteLine("----");
            Console.WriteLine($"Example has outcome of '{szCorrectAnswer}'");
            Console.WriteLine($"Classified example '{iExampleIndex}' as class `{szCalculatedAnswer}`");
            Console.WriteLine($"SELF TEST {(bIsExampleAMatchWithClassification ? "SUCCEEDED" : "FAILED")}!");
        }

        Console.WriteLine($"\nDone. {iSuccessCount} examples matched their classification, {iFailedCount} did not.");

        string szOutputFileName = $"{szInFilename}.dts";
        Console.WriteLine($"Saving to {szOutputFileName}");
        testTree.SavePrebuiltTree(szOutputFileName);

        Console.WriteLine("Finished saving. Now attempting load!");
        bool bIsLoadSuccess = testTree.LoadPrebuiltTree(szOutputFileName);
        Console.WriteLine($"Load = {bIsLoadSuccess}");
        Console.WriteLine("Loaded tree");
        Console.WriteLine(testTree);
    }
    else
    {
        Console.Error.WriteLine($"ERROR - Unable to open test file '{szInFilename}'");
    }

    return bOutIsTestSuccessful;
}
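//////////////////////////////////////////////////////
// Note:
//    TestComplex is private and static, so the containing program needs
//    an entry point to drive it. The sketch below is a hypothetical
//    driver, not part of the original source; "training.txt" is an
//    assumed default file name.
//////////////////////////////////////////////////////
public static void Main(string[] args)
{
    // Use the first command-line argument as the training file if given
    string szFilename = args.Length > 0 ? args[0] : "training.txt";

    bool bSuccess = TestComplex(szFilename);
    Environment.Exit(bSuccess ? 0 : 1);
}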