/**
 * <summary> Training algorithm for the quadratic discriminant analysis classifier
 * (Introduction to Machine Learning, Alpaydin, 2015). For each class C_i it learns the
 * quadratic discriminant g_i(x) = x^T W_i x + w_i^T x + w0_i from the class-conditional
 * covariance and mean, and stores the terms per class label in a {@link QdaModel}.</summary>
 *
 * <param name="trainSet"> Training data given to the algorithm.</param>
 * <param name="parameters">-</param>
 */
public override void Train(InstanceList.InstanceList trainSet, Parameter.Parameter parameters)
{
    var constantTerms = new Dictionary<string, double>();
    var linearTerms = new Dictionary<string, Vector>();
    var quadraticTerms = new Dictionary<string, Matrix>();
    var classLists = trainSet.DivideIntoClasses();
    var priorDistribution = trainSet.ClassDistribution();
    for (var classIndex = 0; classIndex < classLists.Size(); classIndex++)
    {
        var classList = classLists.Get(classIndex);
        var label = ((InstanceListOfSameClass)classList).GetClassLabel();
        var mean = new Vector(classList.ContinuousAttributeAverage());
        var covariance = classList.Covariance(mean);
        // Determinant must be read before Inverse(), which mutates the matrix in place.
        var determinant = covariance.Determinant();
        covariance.Inverse();
        // Quadratic term: W_i = -0.5 * Sigma_i^{-1} (cloned so the inverse stays intact for w_i).
        var quadratic = (Matrix)covariance.Clone();
        quadratic.MultiplyWithConstant(-0.5);
        quadraticTerms[label] = quadratic;
        // Linear term: w_i = Sigma_i^{-1} * mu_i.
        var linear = covariance.MultiplyWithVectorFromLeft(mean);
        linearTerms[label] = linear;
        // Constant term: w0_i = -0.5 * (mu_i^T Sigma_i^{-1} mu_i + log|Sigma_i|) + log P(C_i).
        constantTerms[label] = -0.5 * (linear.DotProduct(mean) + System.Math.Log(determinant)) +
                               System.Math.Log(priorDistribution.GetProbability(label));
    }
    model = new QdaModel(priorDistribution, quadraticTerms, linearTerms, constantTerms);
}
/**
 * <summary> Training algorithm for the linear discriminant analysis classifier
 * (Introduction to Machine Learning, Alpaydin, 2015). All classes share a single pooled
 * covariance matrix; each class gets a linear discriminant g_i(x) = w_i^T x + w0_i
 * stored per class label in an {@link LdaModel}.</summary>
 *
 * <param name="trainSet"> Training data given to the algorithm.</param>
 * <param name="parameters">-</param>
 */
public override void Train(InstanceList.InstanceList trainSet, Parameter.Parameter parameters)
{
    var intercepts = new Dictionary<string, double>();
    var weights = new Dictionary<string, Vector>();
    var priorDistribution = trainSet.ClassDistribution();
    var classLists = trainSet.DivideIntoClasses();
    var dimension = trainSet.Get(0).ContinuousAttributeSize();
    var pooledCovariance = new Matrix(dimension, dimension);
    // Accumulate the within-class scatter: sum over classes of (n_i - 1) * Sigma_i.
    for (var classIndex = 0; classIndex < classLists.Size(); classIndex++)
    {
        var mean = new Vector(classLists.Get(classIndex).ContinuousAttributeAverage());
        var scatter = classLists.Get(classIndex).Covariance(mean);
        scatter.MultiplyWithConstant(classLists.Get(classIndex).Size() - 1);
        pooledCovariance.Add(scatter);
    }
    // Unbiased pooled estimate: divide by (N - K), then invert in place.
    pooledCovariance.DivideByConstant(trainSet.Size() - classLists.Size());
    pooledCovariance.Inverse();
    for (var classIndex = 0; classIndex < classLists.Size(); classIndex++)
    {
        var label = ((InstanceListOfSameClass)classLists.Get(classIndex)).GetClassLabel();
        var mean = new Vector(classLists.Get(classIndex).ContinuousAttributeAverage());
        // w_i = Sigma^{-1} * mu_i; w0_i = -0.5 * mu_i^T Sigma^{-1} mu_i + log P(C_i).
        var weight = pooledCovariance.MultiplyWithVectorFromRight(mean);
        weights[label] = weight;
        intercepts[label] = -0.5 * weight.DotProduct(mean) +
                            System.Math.Log(priorDistribution.GetProbability(label));
    }
    model = new LdaModel(priorDistribution, weights, intercepts);
}
/**
 * <summary> The predict method takes an {@link Instance} as input and performs prediction on the
 * DecisionNodes and returns the prediction for that instance.
 * For a {@link CompositeInstance} the fallback answer is the most frequent class of this node's data
 * restricted to the instance's possible labels; a satisfied child is asked first, and its answer is
 * used when non-null. For a plain instance, a leaf returns its class label, otherwise the first
 * satisfied child decides; if no child condition matches, this node's label is returned.</summary>
 *
 * <param name="instance">Instance to make prediction.</param>
 * <returns>The prediction for given instance.</returns>
 */
public string Predict(Instance.Instance instance)
{
    if (instance is CompositeInstance compositeInstance)
    {
        // Restrict this node's class distribution to the labels the composite instance allows.
        var fallback = _data.ClassDistribution().GetMaxItem(compositeInstance.GetPossibleClassLabels());
        if (_leaf)
        {
            return fallback;
        }
        foreach (var child in _children)
        {
            if (!child._condition.Satisfy(compositeInstance))
            {
                continue;
            }
            // Only the first satisfying child is consulted; null from it falls back here.
            return child.Predict(compositeInstance) ?? fallback;
        }
        return fallback;
    }
    if (_leaf)
    {
        return _classLabel;
    }
    foreach (var child in _children)
    {
        if (child._condition.Satisfy(instance))
        {
            return child.Predict(instance);
        }
    }
    return _classLabel;
}
/**
 * <summary> Training algorithm for K-Means classifier. K-Means finds the mean of each class
 * for training and stores one mean instance per class in a {@link KMeansModel}.</summary>
 *
 * <param name="trainSet"> Training data given to the algorithm.</param>
 * <param name="parameters">distanceMetric: distance metric used to calculate the distance between two instances.</param>
 */
public override void Train(InstanceList.InstanceList trainSet, Parameter.Parameter parameters)
{
    var priorDistribution = trainSet.ClassDistribution();
    var classMeans = new InstanceList.InstanceList();
    var classLists = trainSet.DivideIntoClasses();
    // One representative (the attribute-wise average instance) per class.
    for (var classIndex = 0; classIndex < classLists.Size(); classIndex++)
    {
        classMeans.Add(classLists.Get(classIndex).Average());
    }
    model = new KMeansModel(priorDistribution, classMeans, ((KMeansParameter)parameters).GetDistanceMetric());
}
/**
 * <summary> Training algorithm for Naive Bayes algorithm. It basically calls TrainContinuousVersion
 * for continuous data sets, TrainDiscreteVersion for discrete data sets. The data set type is decided
 * by inspecting the first attribute of the first instance of the first class.</summary>
 * <param name="trainSet">Training data given to the algorithm</param>
 * <param name="parameters">-</param>
 */
public override void Train(InstanceList.InstanceList trainSet, Parameter.Parameter parameters)
{
    var priorDistribution = trainSet.ClassDistribution();
    var classLists = trainSet.DivideIntoClasses();
    var sampleAttribute = classLists.Get(0).Get(0).GetAttribute(0);
    if (sampleAttribute is DiscreteAttribute)
    {
        TrainDiscreteVersion(priorDistribution, classLists);
    }
    else
    {
        TrainContinuousVersion(priorDistribution, classLists);
    }
}
/**
 * <summary> The DecisionNode constructor takes {@link InstanceList} data as input. It sets the node's
 * class label to the most frequent label in the data, then searches every candidate attribute for the
 * split that minimizes weighted entropy. The node stays a leaf when the data is pure (one class), when
 * it is a stump that already has a condition, or when no split beats the entropy of the unsplit data.
 * <p/>
 * For a {@link DiscreteIndexedAttribute}, each index value is tried as a one-vs-rest split by
 * temporarily removing that value's class distribution from the total and measuring the two-part
 * weighted entropy; the distribution is added back afterwards.
 * <p/>
 * For a {@link DiscreteAttribute}, the entropy of splitting on all values is computed directly by
 * EntropyForDiscreteAttribute.
 * <p/>
 * For a {@link ContinuousAttribute}, the data is sorted on that attribute and a left/right
 * distribution pair is swept across it; at every change of attribute value the midpoint is tried as a
 * threshold and the weighted entropy of the two sides is evaluated.
 * <p/>
 * NOTE(review): the original summary said the attribute indices are "shuffled randomly", but no
 * shuffle appears in this code — when parameter.GetAttributeSubsetSize() is smaller than the attribute
 * count, the FIRST `size` indices are used in order. Presumably a seeded shuffle of indexList was
 * intended for random-forest attribute subsetting — verify against the reference implementation.</summary>
 *
 * <param name="data"> {@link InstanceList} input.</param>
 * <param name="condition">{@link DecisionCondition} to check.</param>
 * <param name="parameter">RandomForestParameter like seed, ensembleSize, attributeSubsetSize.</param>
 * <param name="isStump"> Refers to decision trees with only 1 splitting rule.</param>
 */
public DecisionNode(InstanceList.InstanceList data, DecisionCondition condition, RandomForestParameter parameter, bool isStump)
{
    int bestAttribute = -1, size;
    double bestSplitValue = 0;
    this._condition = condition;
    this._data = data;
    // Default prediction for this node: majority class of its data.
    _classLabel = Classifier.Classifier.GetMaximum(data.GetClassLabels());
    _leaf = true;
    var classLabels = data.GetDistinctClassLabels();
    // Pure node: nothing to split.
    if (classLabels.Count == 1)
    {
        return;
    }
    // A stump splits only once; a node that already has a condition stays a leaf.
    if (isStump && condition != null)
    {
        return;
    }
    var indexList = new List<int>();
    for (var i = 0; i < data.Get(0).AttributeSize(); i++)
    {
        indexList.Add(i);
    }
    // Random-forest mode may restrict the search to a subset of attributes.
    if (parameter != null && parameter.GetAttributeSubsetSize() < data.Get(0).AttributeSize())
    {
        size = parameter.GetAttributeSubsetSize();
    }
    else
    {
        size = data.Get(0).AttributeSize();
    }
    var classDistribution = data.ClassDistribution();
    // Baseline: entropy without any split; a split must beat this to be accepted.
    var bestEntropy = data.ClassDistribution().Entropy();
    for (var j = 0; j < size; j++)
    {
        var index = indexList[j];
        double entropy;
        if (data.Get(0).GetAttribute(index) is DiscreteIndexedAttribute)
        {
            // Try each index value as a one-vs-rest split.
            for (var k = 0; k < ((DiscreteIndexedAttribute)data.Get(0).GetAttribute(index)).GetMaxIndex(); k++)
            {
                var distribution = data.DiscreteIndexedAttributeClassDistribution(index, k);
                if (distribution.GetSum() > 0)
                {
                    // Temporarily carve this value's counts out of the total, score, then restore.
                    classDistribution.RemoveDistribution(distribution);
                    entropy = (classDistribution.Entropy() * classDistribution.GetSum() + distribution.Entropy() * distribution.GetSum()) / data.Size();
                    if (entropy < bestEntropy)
                    {
                        bestEntropy = entropy;
                        bestAttribute = index;
                        // For indexed attributes the "split value" is the chosen index k.
                        bestSplitValue = k;
                    }
                    classDistribution.AddDistribution(distribution);
                }
            }
        }
        else
        {
            if (data.Get(0).GetAttribute(index) is DiscreteAttribute)
            {
                entropy = EntropyForDiscreteAttribute(index);
                if (entropy < bestEntropy)
                {
                    bestEntropy = entropy;
                    bestAttribute = index;
                }
            }
            else
            {
                if (data.Get(0).GetAttribute(index) is ContinuousAttribute)
                {
                    // Sorts the data in place on this attribute so thresholds can be swept in order.
                    data.Sort(index);
                    var previousValue = double.MinValue;
                    // Sweep: everything starts on the left; instances migrate right one at a time.
                    var leftDistribution = data.ClassDistribution();
                    var rightDistribution = new DiscreteDistribution();
                    for (var k = 0; k < data.Size(); k++)
                    {
                        var instance = data.Get(k);
                        if (k == 0)
                        {
                            previousValue = ((ContinuousAttribute)instance.GetAttribute(index)).GetValue();
                        }
                        else
                        {
                            // Only evaluate a threshold where the attribute value actually changes.
                            if (((ContinuousAttribute)instance.GetAttribute(index)).GetValue() != previousValue)
                            {
                                // Candidate threshold: midpoint between consecutive distinct values.
                                var splitValue = (previousValue + ((ContinuousAttribute)instance.GetAttribute(index)).GetValue()) / 2;
                                previousValue = ((ContinuousAttribute)instance.GetAttribute(index)).GetValue();
                                entropy = (leftDistribution.GetSum() / data.Size()) * leftDistribution.Entropy() + (rightDistribution.GetSum() / data.Size()) * rightDistribution.Entropy();
                                if (entropy < bestEntropy)
                                {
                                    bestEntropy = entropy;
                                    bestSplitValue = splitValue;
                                    bestAttribute = index;
                                }
                            }
                        }
                        // Move the current instance from the left side to the right side.
                        leftDistribution.RemoveItem(instance.GetClassLabel());
                        rightDistribution.AddItem(instance.GetClassLabel());
                    }
                }
            }
        }
    }
    // A split was found that improves on the unsplit entropy: grow children.
    if (bestAttribute != -1)
    {
        _leaf = false;
        if (data.Get(0).GetAttribute(bestAttribute) is DiscreteIndexedAttribute)
        {
            CreateChildrenForDiscreteIndexed(bestAttribute, (int)bestSplitValue, parameter, isStump);
        }
        else
        {
            if (data.Get(0).GetAttribute(bestAttribute) is DiscreteAttribute)
            {
                CreateChildrenForDiscrete(bestAttribute, parameter, isStump);
            }
            else
            {
                if (data.Get(0).GetAttribute(bestAttribute) is ContinuousAttribute)
                {
                    CreateChildrenForContinuous(bestAttribute, bestSplitValue, parameter, isStump);
                }
            }
        }
    }
}
/**
 * <summary> Returns the size of the class label distribution of {@link InstanceList}.</summary>
 *
 * <returns>Size of the class label distribution of {@link InstanceList}.</returns>
 */
public int ClassCount() => _instances.ClassDistribution().Count;
/**
 * <summary> Constructor which sets the distribution using the given {@link InstanceList}.
 * A dummy model predicts from the training class distribution alone, ignoring attributes.</summary>
 *
 * <param name="trainSet">{@link InstanceList} which is used to get the class distribution.</param>
 */
public DummyModel(InstanceList.InstanceList trainSet)
{
    _distribution = trainSet.ClassDistribution();
}
/**
 * <summary> Training algorithm for random classifier. It simply records the set of class labels
 * seen in the training data; prediction picks among them using the given seed.</summary>
 *
 * <param name="trainSet"> Training data given to the algorithm.</param>
 * <param name="parameters">-</param>
 */
public override void Train(InstanceList.InstanceList trainSet, Parameter.Parameter parameters)
{
    var classLabels = new List<String>(trainSet.ClassDistribution().Keys);
    model = new RandomModel(classLabels, parameters.GetSeed());
}