/// <summary>
/// Clusters the dataset with the configured k-means clusterer and then repeatedly processes
/// the most similar pair of root clusters until only one root is left.
/// </summary>
/// <param name="dataset">Examples to cluster; must contain at least <c>NumLeaves</c> items.</param>
/// <returns>The clustering result whose roots were reduced to a single root.</returns>
/// <exception cref="ArgumentNullException"><paramref name="dataset"/> is null.</exception>
/// <exception cref="ArgumentValueException"><paramref name="dataset"/> has fewer than <c>NumLeaves</c> items.</exception>
public ClusteringResult Cluster(IUnlabeledExampleCollection<SparseVector<double>> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count < NumLeaves ? new ArgumentValueException("dataset") : null);

    // The flat k-means clusters are the starting roots.
    ClusteringResult result = mKMeansClustering.Cluster(dataset);

    // One L2-normalized centroid per starting root.
    UnlabeledDataset<SparseVector<double>> rootCentroids = new UnlabeledDataset<SparseVector<double>>();
    foreach (Cluster root in result.Roots)
    {
        SparseVector<double> rootCentroid = ModelUtils.ComputeCentroid(root.Items, dataset, CentroidType.NrmL2);
        rootCentroids.Add(rootCentroid);
        // NOTE(review): the value returned by Trim is only stored in this local, after the
        // centroid has already been added above - confirm whether Trim modifies its argument
        // in place; otherwise the trimming has no effect on rootCentroids.
        rootCentroid = Trim(rootCentroid, 1000, 0.8);
        root.ClusterInfo = 1; // cluster level
    }

    SparseMatrix<double> similarities = ModelUtils.GetDotProductSimilarity(rootCentroids, /*thresh=*/ 0, /*fullMatrix=*/ false);
    SparseMatrix<double> centroidsTr = ModelUtils.GetTransposedMatrix(rootCentroids);

    // Each round hands the most similar pair to Update; the loop relies on Update
    // reducing the number of roots (presumably by merging the pair - TODO confirm).
    for (int round = 1; result.Roots.Count > 1; round++)
    {
        Console.WriteLine("Iteration {0} ...", round);
        int first, second;
        FindMaxSim(similarities, out first, out second);
        Update(similarities, centroidsTr, result.Roots.Count, first, second, result.Roots.Inner, dataset, /*damping=*/ 0.9);
        Console.WriteLine(similarities.ToString("E0.00"));
        Console.WriteLine();
    }
    return result;
}
/// <summary>
/// Copies the examples found at the given indices of <paramref name="dataset"/> into a new
/// dataset, in the order in which the indices are enumerated.
/// </summary>
/// <param name="items">Indices into <paramref name="dataset"/>.</param>
/// <param name="dataset">Collection the examples are taken from.</param>
/// <returns>A new dataset holding the selected examples.</returns>
private UnlabeledDataset<SparseVector<double>> GetDatasetSubset(IEnumerable<int> items, IUnlabeledExampleCollection<SparseVector<double>> dataset)
{
    UnlabeledDataset<SparseVector<double>> subset = new UnlabeledDataset<SparseVector<double>>();
    foreach (int exampleIdx in items)
    {
        SparseVector<double> example = dataset[exampleIdx];
        subset.Add(example);
    }
    return subset;
}
/// <summary>
/// Builds an unlabeled dataset from the examples of a labeled dataset; the labels are not copied.
/// </summary>
/// <typeparam name="LblT">Label type of the source dataset.</typeparam>
/// <typeparam name="ExT">Example type of the source and resulting dataset.</typeparam>
/// <param name="dataset">Labeled dataset whose examples are copied, in enumeration order.</param>
/// <returns>A new unlabeled dataset containing one entry per labeled example.</returns>
/// <exception cref="ArgumentNullException"><paramref name="dataset"/> is null.</exception>
public static IUnlabeledExampleCollection<ExT> ConvertToUnlabeledDataset<LblT, ExT>(ILabeledExampleCollection<LblT, ExT> dataset)
{
    // Fail with a descriptive argument error instead of a NullReferenceException from the loop below.
    if (dataset == null)
    {
        throw new ArgumentNullException("dataset");
    }
    UnlabeledDataset<ExT> unlabeledDataset = new UnlabeledDataset<ExT>();
    foreach (LabeledExample<LblT, ExT> labeledExample in dataset)
    {
        unlabeledDataset.Add(labeledExample.Example);
    }
    return unlabeledDataset;
}
/// <summary>
/// Strips the labels from a labeled dataset and returns the examples as an unlabeled dataset.
/// </summary>
/// <typeparam name="LblT">Label type of the source dataset.</typeparam>
/// <typeparam name="ExT">Example type of the source and resulting dataset.</typeparam>
/// <param name="dataset">Labeled dataset whose examples are copied, in enumeration order.</param>
/// <returns>A new dataset containing one entry per labeled example.</returns>
/// <exception cref="ArgumentNullException"><paramref name="dataset"/> is null.</exception>
public static UnlabeledDataset<ExT> ConvertToUnlabeledDataset<LblT, ExT>(ILabeledExampleCollection<LblT, ExT> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    UnlabeledDataset<ExT> examplesOnly = new UnlabeledDataset<ExT>();
    foreach (LabeledExample<LblT, ExT> pair in dataset)
    {
        ExT example = pair.Example;
        examplesOnly.Add(example);
    }
    return examplesOnly;
}
/// <summary>
/// Restores the state of this instance from a binary serializer.
/// </summary>
/// <param name="reader">Serializer positioned at the start of this object's data.</param>
/// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
public void Load(BinarySerializer reader)
{
    if (reader == null)
    {
        Utils.ThrowException(new ArgumentNullException("reader"));
    }

    // The reads below throw serialization-related exceptions.
    // The fields are consumed strictly in this order; do not reorder the statements
    // (presumably this mirrors the order used when the object was written - TODO confirm).
    mRnd = (Random)reader.ReadDotNetObject();
    mEps = reader.ReadDouble();
    mTrials = reader.ReadInt();
    mK = reader.ReadInt();
    mCentroids = reader.ReadObject<ArrayList<CentroidData>>();
    mDataset = reader.ReadObject<UnlabeledDataset<SparseVector<double>>>();
    mQualThresh = reader.ReadDouble();
    mTopicId = reader.ReadLong();
}
/// <summary>
/// Converts every example of this dataset to <paramref name="newExType"/> and returns the
/// converted examples as a new dataset.
/// </summary>
/// <param name="newExType">
/// Target example type. Supported: <c>SparseVector&lt;double&gt;</c>,
/// <c>SparseVector&lt;double&gt;.ReadOnly</c>, <c>BinaryVector</c>, <c>BinaryVector.ReadOnly</c>.
/// </param>
/// <param name="move">
/// When true, each source item is overwritten with <c>default(ExT)</c> right after it has been
/// converted and the source item list is cleared once all items are converted.
/// </param>
/// <returns>A new dataset holding the converted examples.</returns>
/// <exception cref="ArgumentNullException"><paramref name="newExType"/> is null.</exception>
/// <exception cref="ArgumentValueException">
/// <paramref name="move"/> is true while <c>ExT</c> is a value type, or an example cannot be converted.
/// </exception>
/// <exception cref="ArgumentNotSupportedException"><paramref name="newExType"/> is not one of the supported types.</exception>
public IUnlabeledDataset ConvertDataset(Type newExType, bool move)
{
    Utils.ThrowException(newExType == null ? new ArgumentNullException("newExType") : null);
    Utils.ThrowException(move && typeof(ExT).IsValueType ? new ArgumentValueException("newExType") : null);

    ArrayList<object> converted = new ArrayList<object>(mItems.Count);
    for (int idx = 0; idx < mItems.Count; idx++)
    {
        object convertedExample = ModelUtils.ConvertExample(mItems[idx], newExType); // throws ArgumentValueException
        converted.Add(convertedExample);
        if (move)
        {
            // ExT is a reference type here (checked above), so default(ExT) is null.
            mItems[idx] = default(ExT);
        }
    }
    if (move)
    {
        mItems.Clear();
    }

    // NOTE(review): an unsupported newExType is rejected only at this point, i.e. after the
    // source items may already have been cleared when move is true - confirm this is intended.
    if (newExType == typeof(SparseVector<double>))
    {
        return new UnlabeledDataset<SparseVector<double>>(converted);
    }
    if (newExType == typeof(SparseVector<double>.ReadOnly))
    {
        return new UnlabeledDataset<SparseVector<double>.ReadOnly>(converted);
    }
    if (newExType == typeof(BinaryVector))
    {
        return new UnlabeledDataset<BinaryVector>(converted);
    }
    if (newExType == typeof(BinaryVector.ReadOnly))
    {
        return new UnlabeledDataset<BinaryVector.ReadOnly>(converted);
    }
    throw new ArgumentNotSupportedException("newExType");
}
/// <summary>
/// Builds a cluster hierarchy top-down: starting from a root that contains every example,
/// each cluster whose quality is below <c>mMinQuality</c> is split in two with the configured
/// k-means clusterer, and the two parts are processed in turn.
/// </summary>
/// <param name="dataset">Examples to cluster; at least two, none of them empty.</param>
/// <returns>
/// A clustering result with a single root. Every cluster's <c>Items</c> are indices into
/// <paramref name="dataset"/>, and its <c>ClusterInfo</c> is a (centroid, quality) pair.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="dataset"/> is null.</exception>
/// <exception cref="ArgumentValueException">
/// <paramref name="dataset"/> has fewer than two examples or contains an empty example.
/// </exception>
public ClusteringResult Cluster(IUnlabeledExampleCollection<SparseVector<double>> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count < 2 ? new ArgumentValueException("dataset") : null);
    ClusteringResult clusteringResult = new ClusteringResult();
    Queue<Cluster> queue = new Queue<Cluster>();
    // create root
    Cluster root = new Cluster();
    for (int i = 0; i < dataset.Count; i++)
    {
        Utils.ThrowException(dataset[i].Count == 0 ? new ArgumentValueException("dataset") : null);
        root.Items.Add(i);
    }
    clusteringResult.AddRoot(root);
    // add root to queue
    queue.Enqueue(root);
    while (queue.Count > 0)
    {
        // get next cluster
        Cluster cluster = queue.Dequeue();
        // Snapshot of the cluster's items: position j of localDataset holds the example
        // whose index in the full dataset is globalIdx[j].
        List<int> globalIdx = new List<int>(cluster.Items);
        // compute cluster quality
        UnlabeledDataset<SparseVector<double>> localDataset = GetDatasetSubset(globalIdx, dataset);
        SparseVector<double> centroid;
        double quality = GetClusterQuality(localDataset, out centroid);
        cluster.ClusterInfo = new Pair<SparseVector<double>, double>(centroid, quality);
        if (quality < mMinQuality)
        {
            // split cluster, add children to queue
            ClusteringResult localResult = mKMeansClustering.Cluster(localDataset);
            for (int i = 0; i < 2; i++)
            {
                // The clusterer reports items as positions within localDataset. Translate them
                // back to indices into the full dataset so that deeper levels (and callers)
                // select the right examples.
                Cluster child = new Cluster();
                foreach (int localIdx in localResult.Roots[i].Items)
                {
                    child.Items.Add(globalIdx[localIdx]);
                }
                cluster.AddChild(child);
                child.Parent = cluster;
                queue.Enqueue(child);
            }
        }
    }
    return clusteringResult;
}
/// <summary>
/// Incrementally clusters a stream of examples. The first call (no stored dataset) runs
/// k-means on <paramref name="batch"/>; later calls assign the new batch to the existing
/// centroids, drop the first <paramref name="numOutdated"/> stored examples and re-run the
/// k-means main loop. Afterwards the number of clusters is adjusted against
/// <c>mQualThresh</c>: clusters are split while the quality is below the threshold, or, if
/// examples were removed, joined while the quality stays above it.
/// </summary>
/// <param name="numOutdated">
/// Number of stored examples, counted from the front of the stored dataset, to remove.
/// Must be 0 on the initializing call and must not exceed the number of stored examples.
/// </param>
/// <param name="batch">New examples to add; may be empty.</param>
/// <returns>
/// The current clustering; an empty result when there is nothing to cluster (empty first
/// batch, or every cluster became empty after the removal).
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="batch"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="numOutdated"/> is out of the range described above.</exception>
public ClusteringResult Cluster(int numOutdated, IUnlabeledExampleCollection<SparseVector<double>> batch)
{
    Utils.ThrowException(batch == null ? new ArgumentNullException("batch") : null);
    Utils.ThrowException(numOutdated < 0 ? new ArgumentOutOfRangeException("numOutdated") : null);
    if (mDataset == null)
    {
        // initialize
        mLogger.Trace("Cluster", "Initializing ...");
        // nothing is stored yet, so nothing can be outdated
        Utils.ThrowException(numOutdated > 0 ? new ArgumentOutOfRangeException("numOutdated") : null);
        // an empty first batch is tolerated and yields an empty result
        if (batch.Count == 0)
        {
            return new ClusteringResult();
        }
        kMeans(batch, Math.Min(mK, batch.Count));
        mDataset = new UnlabeledDataset<SparseVector<double>>(batch);
        // give every initial cluster its own topic identifier
        foreach (CentroidData fresh in mCentroids)
        {
            fresh.Tag = mTopicId++;
        }
    }
    else
    {
        // update clusters
        Utils.ThrowException(numOutdated > mDataset.Count ? new ArgumentOutOfRangeException("numOutdated") : null);
        if (numOutdated == 0 && batch.Count == 0)
        {
            return GetClusteringResult();
        }
        mLogger.Trace("Cluster", "Updating clusters ...");
        // assign new instances; their indices start right after the stored ones
        double unused;
        Assign(mCentroids, ModelUtils.GetTransposedMatrix(batch), batch.Count, /*offs=*/ mDataset.Count, out unused);
        mDataset.AddRange(batch);
        // remove outdated instances: only items at index numOutdated or above are kept
        foreach (CentroidData current in mCentroids)
        {
            foreach (int itemIdx in current.CurrentItems)
            {
                if (itemIdx >= numOutdated)
                {
                    current.Items.Add(itemIdx);
                }
            }
            current.Update(mDataset);
            current.UpdateCentroidLen();
        }
        mDataset.RemoveRange(0, numOutdated);
        // keep the non-empty clusters and shift their item indices to match the shortened dataset
        ArrayList<CentroidData> survivors = new ArrayList<CentroidData>(mCentroids.Count);
        foreach (CentroidData candidate in mCentroids)
        {
            if (candidate.CurrentItems.Count > 0)
            {
                survivors.Add(candidate);
                Set<int> shifted = new Set<int>();
                foreach (int oldIdx in candidate.CurrentItems)
                {
                    shifted.Add(oldIdx - numOutdated);
                }
                candidate.CurrentItems.Inner.SetItems(shifted);
            }
        }
        if (survivors.Count == 0) // reset
        {
            mCentroids = null;
            mDataset = null;
            return new ClusteringResult();
        }
        mCentroids = survivors;
        // execute main loop
        kMeansMainLoop(mDataset, mCentroids);
    }
    // adjust k
    double worstQual; // *** not used at the moment
    int worstIdx;
    double clustQual = GetClustQual(out worstQual, out worstIdx);
    if (clustQual < mQualThresh)
    {
        while (clustQual < mQualThresh) // split cluster at worstIdx
        {
            mLogger.Trace("Cluster", "Increasing k to {0} ...", mCentroids.Count + 1);
            mCentroids.Add(mCentroids[worstIdx].Clone());
            mCentroids.Last.Tag = mTopicId++;
            kMeansMainLoop(mDataset, mCentroids);
            if (mCentroids.Last.CurrentItems.Count > mCentroids[worstIdx].CurrentItems.Count)
            {
                // swap topic identifiers so that the larger part keeps the original one
                object lastTag = mCentroids.Last.Tag;
                mCentroids.Last.Tag = mCentroids[worstIdx].Tag;
                mCentroids[worstIdx].Tag = lastTag;
            }
            clustQual = GetClustQual(out worstQual, out worstIdx);
        }
    }
    else if (numOutdated > 0)
    {
        while (clustQual > mQualThresh && mCentroids.Count > 1) // join clusters
        {
            mLogger.Trace("Cluster", "Decreasing k to {0} ...", mCentroids.Count - 1);
            // keep a copy so that a rejected join can be undone
            ArrayList<CentroidData> backup = mCentroids.DeepClone();
            if (mCentroids.Count == 2) // create single cluster
            {
                // the identifier of the larger cluster survives
                object keptTag = mCentroids[0].CurrentItems.Count > mCentroids[1].CurrentItems.Count ? mCentroids[0].Tag : mCentroids[1].Tag;
                mCentroids = new ArrayList<CentroidData>();
                mCentroids.Add(new CentroidData());
                for (int pos = 0; pos < mDataset.Count; pos++)
                {
                    mCentroids.Last.Items.Add(pos);
                }
                mCentroids.Last.Tag = keptTag;
                mCentroids.Last.Update(mDataset);
                mCentroids.Last.UpdateCentroidLen();
            }
            else
            {
                int keepIdx, dropIdx;
                GetMostSimilarClusters(out keepIdx, out dropIdx);
                CentroidData kept = mCentroids[keepIdx];
                CentroidData dropped = mCentroids[dropIdx];
                // the identifier of the larger cluster survives
                object mergedTag = kept.CurrentItems.Count > dropped.CurrentItems.Count ? kept.Tag : dropped.Tag;
                mCentroids.RemoveAt(dropIdx);
                kept.Items.AddRange(kept.CurrentItems);
                kept.Items.AddRange(dropped.CurrentItems);
                kept.Tag = mergedTag;
                kept.Update(mDataset);
                kept.UpdateCentroidLen();
                kMeansMainLoop(mDataset, mCentroids);
            }
            clustQual = GetClustQual();
            if (clustQual >= mQualThresh)
            {
                mLogger.Trace("Cluster", "Accepted solution at k = {0}.", mCentroids.Count);
            }
            else
            {
                // the join made the quality drop below the threshold: restore the previous
                // clusters (the loop condition then ends the joining)
                mCentroids = backup;
            }
        }
    }
    OutputState();
    return GetClusteringResult();
}