/// <summary>
		/// Prunes this tree using C4.5's pruning procedure. All sons are pruned
		/// first; afterwards the node is turned into a leaf, replaced by its
		/// largest branch, or left unchanged, depending on how the estimated
		/// errors of those three alternatives compare.
		/// </summary>
		/// <exception cref="Exception">nothing is caught here, so any exception
		/// raised by the helpers that are called propagates to the caller
		/// </exception>
		public virtual void  prune()
		{
			// Slack added to the competing estimate in every comparison below.
			const double slack = 0.1;
			
			// Nothing to prune below a leaf.
			if (m_isLeaf)
			{
				return;
			}
			
			// Work bottom-up: every son is pruned before this node is judged.
			for (int sonIndex = 0; sonIndex < m_sons.Length; sonIndex++)
			{
				son(sonIndex).prune();
			}
			
			// Estimated errors if the largest branch took this node's place.
			// Without subtree raising the estimate is the largest double value,
			// presumably so that this alternative is never preferred.
			int largestBranchIndex = localModel().distribution().maxBag();
			double largestBranchErrors = m_subtreeRaising
				? son(largestBranchIndex).getEstimatedErrorsForBranch((Instances) m_train)
				: System.Double.MaxValue;
			
			// Estimated errors if this node were a leaf.
			double leafErrors = getEstimatedErrorsForDistribution(localModel().distribution());
			
			// Estimated errors of the subtree as it currently stands.
			double subtreeErrors = EstimatedErrors;
			
			// Alternative 1: a leaf is at least as good as both other options.
			if (Utils.smOrEq(leafErrors, subtreeErrors + slack) && Utils.smOrEq(leafErrors, largestBranchErrors + slack))
			{
				// Drop the sons and store a NoSplit model built from the
				// node's current distribution.
				m_sons = null;
				m_isLeaf = true;
				m_localModel = new NoSplit(localModel().distribution());
				return;
			}
			
			// Alternative 2: the largest branch is at least as good as the
			// whole subtree, so it takes over this node.
			if (Utils.smOrEq(largestBranchErrors, subtreeErrors + slack))
			{
				C45PruneableClassifierTree raisedBranch = son(largestBranchIndex);
				m_sons = raisedBranch.m_sons;
				m_localModel = raisedBranch.localModel();
				m_isLeaf = raisedBranch.m_isLeaf;
				
				// Recompute the distributions from this node's training data,
				// then prune the raised structure again.
				newDistribution(m_train);
				prune();
			}
		}
		// Example no. 2
		// 0
		/// <summary>
		/// Selects a C4.5-type split (a BinC45Split) for the given dataset.
		/// A NoSplit model is returned when there are not enough instances to
		/// split, when all instances belong to one class, or when no useful
		/// split is found.
		/// </summary>
		/// <param name="data">the instances to select a split for</param>
		/// <returns>the selected split model or a NoSplit model; null if an
		/// exception was caught (it is written to the console)</returns>
		public override ClassifierSplitModel selectModel(Instances data)
		{
			try
			{
				// Stop early if all instances belong to one class or if there
				// are not enough instances to split.
				Distribution classDistribution = new Distribution(data);
				NoSplit noSplit = new NoSplit(classDistribution);
				if (Utils.sm(classDistribution.total(), 2 * m_minNoObj) || Utils.eq(classDistribution.total(), classDistribution.perClass(classDistribution.maxClass())))
				{
					return noSplit;
				}
				
				// Find out whether all attributes are nominal and have a lot of
				// values. The flag is cleared by the first attribute that is
				// numeric or has fewer values than 30% of the instance count of
				// m_allData, which also ends the scan.
				// MoveNext()/Current stand in for Java's
				// hasMoreElements()/nextElement() of the original code.
				bool allNominalWithManyValues = true;
				System.Collections.IEnumerator attributes = data.enumerateAttributes();
				while (allNominalWithManyValues && attributes.MoveNext())
				{
					weka.core.Attribute candidate = (weka.core.Attribute) attributes.Current;
					if (candidate.Numeric || Utils.sm((double) candidate.numValues(), 0.3 * (double) m_allData.numInstances()))
					{
						allNominalWithManyValues = false;
					}
				}
				
				BinC45Split[] splits = new BinC45Split[data.numAttributes()];
				double totalWeight = data.sumOfWeights();
				double infoGainSum = 0;
				int usefulSplits = 0;
				
				// Build one candidate split per attribute, skipping the class.
				for (int att = 0; att < data.numAttributes(); att++)
				{
					if (att == data.classIndex())
					{
						splits[att] = null;
						continue;
					}
					
					BinC45Split split = new BinC45Split(att, m_minNoObj, totalWeight);
					splits[att] = split;
					split.buildClassifier(data);
					
					// Only useful splits count towards the average info gain;
					// enumerated attributes with a lot of values are left out
					// unless every attribute is of that kind.
					if (split.checkModel()
						&& (data.attribute(att).Numeric
							|| allNominalWithManyValues
							|| Utils.sm((double) data.attribute(att).numValues(), 0.3 * (double) m_allData.numInstances())))
					{
						infoGainSum += split.infoGain();
						usefulSplits++;
					}
				}
				
				// Give up if no useful split was found.
				if (usefulSplits == 0)
				{
					return noSplit;
				}
				double averageInfoGain = infoGainSum / (double) usefulSplits;
				
				// Pick the split with the highest gain ratio among those whose
				// info gain reaches the average.
				BinC45Split best = null;
				double bestGainRatio = 0;
				for (int att = 0; att < data.numAttributes(); att++)
				{
					if (att == data.classIndex() || !splits[att].checkModel())
					{
						continue;
					}
					
					// 1E-3 is used here to get a closer approximation to the
					// original implementation.
					if (splits[att].infoGain() >= (averageInfoGain - 1e-3) && Utils.gr(splits[att].gainRatio(), bestGainRatio))
					{
						best = splits[att];
						bestGainRatio = splits[att].gainRatio();
					}
				}
				
				// No candidate improved on a gain ratio of zero.
				if (Utils.eq(bestGainRatio, 0))
				{
					return noSplit;
				}
				
				// Add the instances with unknown values for the chosen attribute
				// so that the complete distribution is stored with the model.
				best.distribution().addInstWithUnknown(data, best.attIndex());
				
				// Set the split point analogue to C45 if attribute numeric.
				best.SplitPoint = m_allData;
				return best;
			}
			catch (System.Exception e)
			{
				// Failures are reported on the console and signalled by null.
				System.Console.WriteLine(e.StackTrace + " " + e.Message);
				return null;
			}
		}
		/// <summary>
		/// Collapses this tree to a single node if doing so does not increase
		/// the training error; otherwise the same check is applied to every
		/// son.
		/// </summary>
		public void  collapse()
		{
			// A leaf is already as small as it can get.
			if (m_isLeaf)
			{
				return;
			}
			
			// Training errors of the subtree versus the number of incorrect
			// instances in this node's own distribution.
			double subtreeErrors = TrainingErrors;
			double nodeErrors = localModel().distribution().numIncorrect();
			
			if (subtreeErrors >= nodeErrors - 1e-3)
			{
				// The subtree is no better than the node alone (within 1e-3):
				// drop the sons and store a NoSplit model built from the
				// node's current distribution.
				m_sons = null;
				m_isLeaf = true;
				m_localModel = new NoSplit(localModel().distribution());
			}
			else
			{
				// Keep the split and try to collapse each son instead.
				for (int sonIndex = 0; sonIndex < m_sons.Length; sonIndex++)
				{
					son(sonIndex).collapse();
				}
			}
		}