public void Train(ILabeledExampleCollection<LblT, SparseVector<double>> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count == 0 ? new ArgumentValueException("dataset") : null);
    // sum up the examples of each label to initialize the per-label centroids
    Dictionary<LblT, CentroidData> centroids = new Dictionary<LblT, CentroidData>(mLblCmp);
    foreach (LabeledExample<LblT, SparseVector<double>> labeledExample in dataset)
    {
        if (!centroids.ContainsKey(labeledExample.Label))
        {
            CentroidData centroidData = new CentroidData();
            centroidData.AddToSum(labeledExample.Example);
            centroids.Add(labeledExample.Label, centroidData);
        }
        else
        {
            CentroidData centroidData = centroids[labeledExample.Label];
            centroidData.AddToSum(labeledExample.Example);
        }
    }
    foreach (CentroidData cenData in centroids.Values)
    {
        cenData.UpdateCentroidLen();
    }
    double learnRate = 1;
    double[][] dotProd = null;
    SparseMatrix<double> dsMtx = null;
    if (mIterations > 0)
    {
        // transpose the dataset once so that centroid-to-example similarities
        // can be computed in batch inside the iteration loop
        dotProd = new double[centroids.Count][];
        dsMtx = ModelUtils.GetTransposedMatrix(ModelUtils.ConvertToUnlabeledDataset(dataset));
    }
    for (int iter = 1; iter <= mIterations; iter++)
    {
        mLogger.Info("Train", "Iteration {0} / {1} ...", iter, mIterations);
        // compute dot products
        mLogger.Info("Train", "Computing dot products ...");
        int j = 0;
        foreach (KeyValuePair<LblT, CentroidData> labeledCentroid in centroids)
        {
            mLogger.ProgressNormal(Logger.Level.Info, /*sender=*/this, "Train", "Centroid {0} / {1} ...", j + 1, centroids.Count);
            SparseVector<double> cenVec = labeledCentroid.Value.GetSparseVector();
            dotProd[j] = ModelUtils.GetDotProductSimilarity(dsMtx, dataset.Count, cenVec);
            j++;
        }
        // classify training examples
        mLogger.Info("Train", "Classifying training examples ...");
        int errCount = 0;
        for (int instIdx = 0; instIdx < dataset.Count; instIdx++)
        {
            mLogger.ProgressFast(Logger.Level.Info, /*sender=*/this, "Train", "Example {0} / {1} ...", instIdx + 1, dataset.Count);
            double maxSim = double.MinValue;
            CentroidData assignedCentroid = null;
            CentroidData actualCentroid = null;
            LabeledExample<LblT, SparseVector<double>> labeledExample = dataset[instIdx];
            SparseVector<double> vec = labeledExample.Example;
            int cenIdx = 0;
            foreach (KeyValuePair<LblT, CentroidData> labeledCentroid in centroids)
            {
                double sim = dotProd[cenIdx][instIdx];
                if (sim > maxSim) { maxSim = sim; assignedCentroid = labeledCentroid.Value; }
                if (labeledCentroid.Key.Equals(labeledExample.Label)) { actualCentroid = labeledCentroid.Value; }
                cenIdx++;
            }
            if (assignedCentroid != actualCentroid)
            {
                // perceptron-style correction: pull the true centroid towards the
                // misclassified example and push the wrongly assigned one away
                assignedCentroid.AddToDiff(-learnRate, vec);
                actualCentroid.AddToDiff(learnRate, vec);
                errCount++;
            }
        }
        mLogger.Info("Train", "Training set error rate: {0:0.00}%", (double)errCount / (double)dataset.Count * 100.0);
        // update centroids
        int k = 0;
        foreach (CentroidData centroidData in centroids.Values)
        {
            mLogger.ProgressNormal(Logger.Level.Info, /*sender=*/this, "Train", "Centroid {0} / {1} ...", ++k, centroids.Count);
            centroidData.Update(mPositiveValuesOnly);
            centroidData.UpdateCentroidLen();
        }
        learnRate *= mDamping; // damp the learning rate after each pass
    }
    // store the trained centroids as a transposed matrix for fast classification
    mCentroidMtxTr = new SparseMatrix<double>();
    mLabels = new ArrayList<LblT>();
    int rowIdx = 0;
    foreach (KeyValuePair<LblT, CentroidData> labeledCentroid in centroids)
    {
        mCentroidMtxTr[rowIdx++] = labeledCentroid.Value.GetSparseVector();
        mLabels.Add(labeledCentroid.Key);
    }
    mCentroidMtxTr = mCentroidMtxTr.GetTransposedCopy();
}
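// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the original source): a standalone, dense
// variant of the training scheme above, assuming double[] vectors and string
// labels. It applies the perceptron-style corrections immediately rather than
// batching them per iteration, and omits the length normalization
// (UpdateCentroidLen) for brevity. Requires System.Collections.Generic.
// All names are hypothetical.
static Dictionary<string, double[]> TrainCentroidsSketch(
    IList<(string Label, double[] Vec)> data, int dim, int iterations, double damping)
{
    // initialize each centroid as the sum of its class examples
    var centroids = new Dictionary<string, double[]>();
    foreach (var (label, vec) in data)
    {
        if (!centroids.TryGetValue(label, out double[] cen))
        {
            centroids[label] = cen = new double[dim];
        }
        for (int i = 0; i < dim; i++) { cen[i] += vec[i]; }
    }
    double learnRate = 1;
    for (int iter = 1; iter <= iterations; iter++)
    {
        foreach (var (label, vec) in data)
        {
            // assign the example to the most similar centroid (dot product)
            string assigned = null;
            double maxSim = double.MinValue;
            foreach (var kvp in centroids)
            {
                double sim = 0;
                for (int i = 0; i < dim; i++) { sim += kvp.Value[i] * vec[i]; }
                if (sim > maxSim) { maxSim = sim; assigned = kvp.Key; }
            }
            if (assigned != label)
            {
                // pull the true centroid towards the example, push the wrong one away
                for (int i = 0; i < dim; i++)
                {
                    centroids[label][i] += learnRate * vec[i];
                    centroids[assigned][i] -= learnRate * vec[i];
                }
            }
        }
        learnRate *= damping; // damp the updates after each pass
    }
    return centroids;
}
// ---------------------------------------------------------------------------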
public void Train(IExampleCollection<LblT, SparseVector<double>.ReadOnly> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count == 0 ? new ArgumentValueException("dataset") : null);
    // sum up the examples of each label to initialize the per-label centroids
    m_centroids = new Dictionary<LblT, CentroidData>();
    foreach (LabeledExample<LblT, SparseVector<double>.ReadOnly> labeled_example in dataset)
    {
        if (!m_centroids.ContainsKey(labeled_example.Label))
        {
            CentroidData centroid_data = new CentroidData();
            centroid_data.AddToSum(labeled_example.Example);
            m_centroids.Add(labeled_example.Label, centroid_data);
        }
        else
        {
            CentroidData centroid_data = m_centroids[labeled_example.Label];
            centroid_data.AddToSum(labeled_example.Example);
        }
    }
    foreach (CentroidData vec_data in m_centroids.Values)
    {
        vec_data.UpdateCentroidLen();
    }
    double learn_rate = 1;
    for (int iter = 1; iter <= m_iterations; iter++)
    {
        Utils.VerboseLine("Iteration {0} / {1} ...", iter, m_iterations);
        // classify training documents
        int i = 0;
        int num_miscfy = 0;
        foreach (LabeledExample<LblT, SparseVector<double>.ReadOnly> labeled_example in dataset)
        {
            Utils.Verbose("\rExample {0} / {1} ...", ++i, dataset.Count);
            double max_sim = double.MinValue;
            CentroidData assigned_centroid = null;
            CentroidData actual_centroid = null;
            SparseVector<double>.ReadOnly vec = labeled_example.Example;
            // similarities are computed one example at a time here, in contrast
            // to the batched dot products used by the overload above
            foreach (KeyValuePair<LblT, CentroidData> labeled_centroid in m_centroids)
            {
                double sim = labeled_centroid.Value.GetSimilarity(vec);
                if (sim > max_sim) { max_sim = sim; assigned_centroid = labeled_centroid.Value; }
                if (labeled_centroid.Key.Equals(labeled_example.Label)) { actual_centroid = labeled_centroid.Value; }
            }
            if (assigned_centroid != actual_centroid)
            {
                assigned_centroid.AddToDiff(-learn_rate, vec);
                actual_centroid.AddToDiff(learn_rate, vec);
                num_miscfy++;
            }
        }
        Utils.VerboseLine("");
        Utils.VerboseLine("Training set error rate: {0:0.00}%", (double)num_miscfy / (double)dataset.Count * 100.0);
        // update centroids
        i = 0;
        foreach (CentroidData centroid_data in m_centroids.Values)
        {
            Utils.Verbose("\rCentroid {0} / {1} ...", ++i, m_centroids.Count);
            centroid_data.UpdateCentroid(m_positive_values_only);
            centroid_data.UpdateCentroidLen();
        }
        Utils.VerboseLine("");
        learn_rate *= m_damping; // damp the learning rate after each pass
    }
}
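// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the original source) of the batched
// dot-product trick that distinguishes the first Train overload from the one
// above: the dataset is transposed into feature-major postings once, so one
// pass over a centroid's nonzero entries yields its dot product with every
// example. Sparse vectors are assumed to be (index, value) pairs; all names
// are hypothetical. Requires System.Collections.Generic.
static double[] DotProductWithAllSketch(
    List<(int DocIdx, double Value)>[] transposed, // transposed[feature] -> postings
    int docCount,
    IEnumerable<(int Feature, double Value)> centroid)
{
    double[] scores = new double[docCount];
    foreach (var (feature, cenValue) in centroid)
    {
        // every example that has this feature contributes one product term
        foreach (var (docIdx, docValue) in transposed[feature])
        {
            scores[docIdx] += cenValue * docValue; // accumulate dot products
        }
    }
    return scores;
}
// ---------------------------------------------------------------------------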