private ClusteringResult CreateSingleCluster(IUnlabeledExampleCollection<SparseVector<double>> dataset)
{
    ClusteringResult clustering = new ClusteringResult();
    Cluster root = new Cluster();
    // put every dataset item into a single root cluster
    for (int i = 0; i < dataset.Count; i++)
    {
        root.Items.Add(i);
    }
    clustering.AddRoot(root);
    // compute the centroid of the root cluster
    CentroidData centroid = new CentroidData();
    centroid.Items.AddRange(root.Items);
    centroid.Update(dataset);
    centroid.UpdateCentroidLen();
    mCentroids = new ArrayList<CentroidData>();
    mCentroids.Add(centroid);
    return clustering;
}
private void GetMostSimilarClusters(out int idx1, out int idx2)
{
    double maxSim = 0;
    idx1 = 0;
    idx2 = 1;
    // exhaustive pairwise search for the two most similar centroids
    for (int i1 = 0; i1 < mCentroids.Count; i1++)
    {
        for (int i2 = i1 + 1; i2 < mCentroids.Count; i2++)
        {
            CentroidData c1 = mCentroids[i1];
            CentroidData c2 = mCentroids[i2];
            double sim = c1.GetDotProduct(c2.GetSparseVector());
            if (sim > maxSim)
            {
                maxSim = sim;
                idx1 = i1;
                idx2 = i2;
            }
        }
    }
}
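// Note: since the centroids are length-normalized via UpdateCentroidLen, the dot
// product above amounts to cosine similarity. The snippet below is a minimal,
// self-contained illustration of that equivalence (plain C#, not the LATINO API;
// all names here are local to the example).
using System;
using System.Collections.Generic;

static class CosineDemo
{
    // dot product of two sparse vectors stored as index -> value maps
    static double Dot(Dictionary<int, double> a, Dictionary<int, double> b)
    {
        double sum = 0;
        foreach (KeyValuePair<int, double> kv in a)
        {
            double v;
            if (b.TryGetValue(kv.Key, out v)) { sum += kv.Value * v; }
        }
        return sum;
    }

    static void Main()
    {
        var c1 = new Dictionary<int, double> { { 0, 1.0 }, { 3, 2.0 } };
        var c2 = new Dictionary<int, double> { { 0, 2.0 }, { 5, 1.0 } };
        // cosine(a, b) = dot(a, b) / (|a| * |b|); for pre-normalized vectors it is just dot(a, b)
        double cos = Dot(c1, c2) / (Math.Sqrt(Dot(c1, c1)) * Math.Sqrt(Dot(c2, c2)));
        Console.WriteLine("cosine similarity = {0:0.000}", cos); // 0.400 for these vectors
    }
}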
public ClusteringResult Cluster(int numOutdated, IUnlabeledExampleCollection<SparseVector<double>> batch)
{
    Utils.ThrowException(batch == null ? new ArgumentNullException("batch") : null);
    Utils.ThrowException(numOutdated < 0 ? new ArgumentOutOfRangeException("numOutdated") : null);
    if (mDataset == null)
    {
        // initialize
        mLogger.Trace("Cluster", "Initializing ...");
        Utils.ThrowException(numOutdated > 0 ? new ArgumentOutOfRangeException("numOutdated") : null);
        //Utils.ThrowException(batch.Count == 0 ? new ArgumentValueException("batch") : null);
        if (batch.Count == 0) { return new ClusteringResult(); }
        kMeans(batch, Math.Min(mK, batch.Count));
        mDataset = new UnlabeledDataset<SparseVector<double>>(batch);
        foreach (CentroidData centroid in mCentroids)
        {
            centroid.Tag = mTopicId++;
        }
        //OutputState();
    }
    else
    {
        // update clusters
        Utils.ThrowException(numOutdated > mDataset.Count ? new ArgumentOutOfRangeException("numOutdated") : null);
        if (numOutdated == 0 && batch.Count == 0) { return GetClusteringResult(); }
        mLogger.Trace("Cluster", "Updating clusters ...");
        // assign new instances
        double dummy;
        Assign(mCentroids, ModelUtils.GetTransposedMatrix(batch), batch.Count, /*offs=*/mDataset.Count, out dummy);
        mDataset.AddRange(batch);
        // remove outdated instances
        foreach (CentroidData centroid in mCentroids)
        {
            // keep only the items that are not outdated
            foreach (int item in centroid.CurrentItems)
            {
                if (item >= numOutdated) { centroid.Items.Add(item); }
            }
            centroid.Update(mDataset);
            centroid.UpdateCentroidLen();
        }
        mDataset.RemoveRange(0, numOutdated);
        ArrayList<CentroidData> centroidsNew = new ArrayList<CentroidData>(mCentroids.Count);
        foreach (CentroidData centroid in mCentroids)
        {
            if (centroid.CurrentItems.Count > 0)
            {
                centroidsNew.Add(centroid);
                // shift the item indices to account for the removed instances
                Set<int> tmp = new Set<int>();
                foreach (int idx in centroid.CurrentItems) { tmp.Add(idx - numOutdated); }
                centroid.CurrentItems.Inner.SetItems(tmp);
            }
        }
        if (centroidsNew.Count == 0) // reset
        {
            mCentroids = null;
            mDataset = null;
            return new ClusteringResult();
        }
        mCentroids = centroidsNew;
        // execute main loop
        kMeansMainLoop(mDataset, mCentroids);
        //OutputState();
    }
    // adjust k
    double minQual; // *** not used at the moment
    int minQualIdx;
    double qual = GetClustQual(out minQual, out minQualIdx);
    if (qual < mQualThresh)
    {
        while (qual < mQualThresh) // split the cluster at minQualIdx
        {
            mLogger.Trace("Cluster", "Increasing k to {0} ...", mCentroids.Count + 1);
            mCentroids.Add(mCentroids[minQualIdx].Clone());
            mCentroids.Last.Tag = mTopicId++;
            kMeansMainLoop(mDataset, mCentroids);
            if (mCentroids.Last.CurrentItems.Count > mCentroids[minQualIdx].CurrentItems.Count)
            {
                // swap topic identifiers
                object tmp = mCentroids.Last.Tag;
                mCentroids.Last.Tag = mCentroids[minQualIdx].Tag;
                mCentroids[minQualIdx].Tag = tmp;
            }
            qual = GetClustQual(out minQual, out minQualIdx);
            //OutputState();
        }
    }
    else if (numOutdated > 0)
    {
        while (qual > mQualThresh && mCentroids.Count > 1) // join clusters
        {
            mLogger.Trace("Cluster", "Decreasing k to {0} ...", mCentroids.Count - 1);
            ArrayList<CentroidData> centroidsCopy = mCentroids.DeepClone();
            if (mCentroids.Count == 2) // create a single cluster
            {
                object topicId = mCentroids[0].CurrentItems.Count > mCentroids[1].CurrentItems.Count
                    ? mCentroids[0].Tag : mCentroids[1].Tag;
                mCentroids = new ArrayList<CentroidData>();
                mCentroids.Add(new CentroidData());
                for (int i = 0; i < mDataset.Count; i++) { mCentroids.Last.Items.Add(i); }
                mCentroids.Last.Tag = topicId;
                mCentroids.Last.Update(mDataset);
                mCentroids.Last.UpdateCentroidLen();
            }
            else
            {
                // merge the two most similar clusters; the larger one keeps its topic identifier
                int idx1, idx2;
                GetMostSimilarClusters(out idx1, out idx2);
                CentroidData c1 = mCentroids[idx1];
                CentroidData c2 = mCentroids[idx2];
                object topicId = c1.CurrentItems.Count > c2.CurrentItems.Count ? c1.Tag : c2.Tag;
                mCentroids.RemoveAt(idx2);
                c1.Items.AddRange(c1.CurrentItems);
                c1.Items.AddRange(c2.CurrentItems);
                c1.Tag = topicId;
                c1.Update(mDataset);
                c1.UpdateCentroidLen();
                kMeansMainLoop(mDataset, mCentroids);
            }
            qual = GetClustQual();
            if (qual >= mQualThresh)
            {
                mLogger.Trace("Cluster", "Accepted solution at k = {0}.", mCentroids.Count);
            }
            else
            {
                mCentroids = centroidsCopy; // reject the merge and restore the previous state
            }
            //OutputState();
        }
    }
    OutputState();
    return GetClusteringResult();
}
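// A minimal sliding-window driver for the method above (a sketch, not library code):
// `StreamClusterer` stands in for whatever class hosts Cluster, and `LoadBatches` is a
// hypothetical data source; the sketch assumes each batch is no larger than the window.
StreamClusterer clusterer = new StreamClusterer();
const int windowSize = 1000;
int inWindow = 0;
foreach (UnlabeledDataset<SparseVector<double>> batch in LoadBatches())
{
    // everything that no longer fits into the window becomes outdated
    int numOutdated = Math.Max(0, inWindow + batch.Count - windowSize);
    ClusteringResult result = clusterer.Cluster(numOutdated, batch);
    inWindow += batch.Count - numOutdated;
    // inspect `result` here, e.g. cluster sizes and topic tags
}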
public void Train(ILabeledExampleCollection<LblT, SparseVector<double>> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count == 0 ? new ArgumentValueException("dataset") : null);
    // initialize centroids: each label's centroid starts as the sum of its examples
    Dictionary<LblT, CentroidData> centroids = new Dictionary<LblT, CentroidData>(mLblCmp);
    foreach (LabeledExample<LblT, SparseVector<double>> labeledExample in dataset)
    {
        if (!centroids.ContainsKey(labeledExample.Label))
        {
            CentroidData centroidData = new CentroidData();
            centroidData.AddToSum(labeledExample.Example);
            centroids.Add(labeledExample.Label, centroidData);
        }
        else
        {
            CentroidData centroidData = centroids[labeledExample.Label];
            centroidData.AddToSum(labeledExample.Example);
        }
    }
    foreach (CentroidData cenData in centroids.Values)
    {
        cenData.UpdateCentroidLen();
    }
    double learnRate = 1;
    double[][] dotProd = null;
    SparseMatrix<double> dsMtx = null;
    if (mIterations > 0)
    {
        dotProd = new double[centroids.Count][];
        dsMtx = ModelUtils.GetTransposedMatrix(ModelUtils.ConvertToUnlabeledDataset(dataset));
    }
    for (int iter = 1; iter <= mIterations; iter++)
    {
        mLogger.Info("Train", "Iteration {0} / {1} ...", iter, mIterations);
        // compute dot products between the examples and the current centroids
        mLogger.Info("Train", "Computing dot products ...");
        int j = 0;
        foreach (KeyValuePair<LblT, CentroidData> labeledCentroid in centroids)
        {
            mLogger.ProgressNormal(Logger.Level.Info, /*sender=*/this, "Train", "Centroid {0} / {1} ...", j + 1, centroids.Count);
            SparseVector<double> cenVec = labeledCentroid.Value.GetSparseVector();
            dotProd[j] = ModelUtils.GetDotProductSimilarity(dsMtx, dataset.Count, cenVec);
            j++;
        }
        // classify the training examples
        mLogger.Info("Train", "Classifying training examples ...");
        int errCount = 0;
        for (int instIdx = 0; instIdx < dataset.Count; instIdx++)
        {
            mLogger.ProgressFast(Logger.Level.Info, /*sender=*/this, "Train", "Example {0} / {1} ...", instIdx + 1, dataset.Count);
            double maxSim = double.MinValue;
            CentroidData assignedCentroid = null;
            CentroidData actualCentroid = null;
            LabeledExample<LblT, SparseVector<double>> labeledExample = dataset[instIdx];
            SparseVector<double> vec = labeledExample.Example;
            int cenIdx = 0;
            foreach (KeyValuePair<LblT, CentroidData> labeledCentroid in centroids)
            {
                double sim = dotProd[cenIdx][instIdx];
                if (sim > maxSim)
                {
                    maxSim = sim;
                    assignedCentroid = labeledCentroid.Value;
                }
                if (labeledCentroid.Key.Equals(labeledExample.Label))
                {
                    actualCentroid = labeledCentroid.Value;
                }
                cenIdx++;
            }
            if (assignedCentroid != actualCentroid)
            {
                // perceptron-style update: move the wrongly assigned centroid away from
                // the example and the correct centroid toward it
                assignedCentroid.AddToDiff(-learnRate, vec);
                actualCentroid.AddToDiff(learnRate, vec);
                errCount++;
            }
        }
        mLogger.Info("Train", "Training set error rate: {0:0.00}%", (double)errCount / (double)dataset.Count * 100.0);
        // update the centroids
        int k = 0;
        foreach (CentroidData centroidData in centroids.Values)
        {
            mLogger.ProgressNormal(Logger.Level.Info, /*sender=*/this, "Train", "Centroid {0} / {1} ...", ++k, centroids.Count);
            centroidData.Update(mPositiveValuesOnly);
            centroidData.UpdateCentroidLen();
        }
        learnRate *= mDamping;
    }
    // store the centroids as a transposed matrix for fast classification
    mCentroidMtxTr = new SparseMatrix<double>();
    mLabels = new ArrayList<LblT>();
    int rowIdx = 0;
    foreach (KeyValuePair<LblT, CentroidData> labeledCentroid in centroids)
    {
        mCentroidMtxTr[rowIdx++] = labeledCentroid.Value.GetSparseVector();
        mLabels.Add(labeledCentroid.Key);
    }
    mCentroidMtxTr = mCentroidMtxTr.GetTransposedCopy();
}
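// Classification is not shown in this excerpt; a plausible sketch follows. Because
// mCentroidMtxTr stores the centroids transposed, a single GetDotProductSimilarity
// call scores an example against every centroid at once, mirroring the call pattern
// used during training. `PredictLabel` is a hypothetical name, not the library's API.
public LblT PredictLabel(SparseVector<double> example)
{
    double[] sims = ModelUtils.GetDotProductSimilarity(mCentroidMtxTr, mLabels.Count, example);
    int best = 0;
    for (int i = 1; i < sims.Length; i++)
    {
        if (sims[i] > sims[best]) { best = i; }
    }
    return mLabels[best]; // label of the most similar centroid
}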
public void Train(IExampleCollection<LblT, SparseVector<double>.ReadOnly> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count == 0 ? new ArgumentValueException("dataset") : null);
    // initialize centroids: each label's centroid starts as the sum of its examples
    m_centroids = new Dictionary<LblT, CentroidData>();
    foreach (LabeledExample<LblT, SparseVector<double>.ReadOnly> labeled_example in dataset)
    {
        if (!m_centroids.ContainsKey(labeled_example.Label))
        {
            CentroidData centroid_data = new CentroidData();
            centroid_data.AddToSum(labeled_example.Example);
            m_centroids.Add(labeled_example.Label, centroid_data);
        }
        else
        {
            CentroidData centroid_data = m_centroids[labeled_example.Label];
            centroid_data.AddToSum(labeled_example.Example);
        }
    }
    foreach (CentroidData vec_data in m_centroids.Values)
    {
        vec_data.UpdateCentroidLen();
    }
    double learn_rate = 1;
    for (int iter = 1; iter <= m_iterations; iter++)
    {
        Utils.VerboseLine("Iteration {0} / {1} ...", iter, m_iterations);
        // classify the training documents
        int i = 0;
        int num_miscfy = 0;
        foreach (LabeledExample<LblT, SparseVector<double>.ReadOnly> labeled_example in dataset)
        {
            Utils.Verbose("\rExample {0} / {1} ...", ++i, dataset.Count);
            double max_sim = double.MinValue;
            CentroidData assigned_centroid = null;
            CentroidData actual_centroid = null;
            SparseVector<double>.ReadOnly vec = labeled_example.Example;
            foreach (KeyValuePair<LblT, CentroidData> labeled_centroid in m_centroids)
            {
                double sim = labeled_centroid.Value.GetSimilarity(vec);
                if (sim > max_sim)
                {
                    max_sim = sim;
                    assigned_centroid = labeled_centroid.Value;
                }
                if (labeled_centroid.Key.Equals(labeled_example.Label))
                {
                    actual_centroid = labeled_centroid.Value;
                }
            }
            if (assigned_centroid != actual_centroid)
            {
                // perceptron-style update on misclassification
                assigned_centroid.AddToDiff(-learn_rate, vec);
                actual_centroid.AddToDiff(learn_rate, vec);
                num_miscfy++;
            }
        }
        Utils.VerboseLine("");
        Utils.VerboseLine("Training set error rate: {0:0.00}%", (double)num_miscfy / (double)dataset.Count * 100.0);
        // update the centroids
        i = 0;
        foreach (CentroidData centroid_data in m_centroids.Values)
        {
            Utils.Verbose("\rCentroid {0} / {1} ...", ++i, m_centroids.Count);
            centroid_data.UpdateCentroid(m_positive_values_only);
            centroid_data.UpdateCentroidLen();
        }
        Utils.VerboseLine("");
        learn_rate *= m_damping;
    }
}
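// For this older revision, classification could look like the sketch below: return the
// label whose centroid is most similar to the input. GetSimilarity is the same call the
// training loop uses; `PredictLabel` itself is a hypothetical name.
public LblT PredictLabel(SparseVector<double>.ReadOnly example)
{
    double max_sim = double.MinValue;
    LblT best_label = default(LblT);
    foreach (KeyValuePair<LblT, CentroidData> labeled_centroid in m_centroids)
    {
        double sim = labeled_centroid.Value.GetSimilarity(example);
        if (sim > max_sim)
        {
            max_sim = sim;
            best_label = labeled_centroid.Key;
        }
    }
    return best_label;
}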