/// <summary>
/// Clusters the dataset with <c>mKMeansClustering</c> and then repeatedly calls
/// <c>FindMaxSim</c> and <c>Update</c> on the centroid similarity matrix until only
/// one root cluster is left. Progress and the similarity matrix are written to the console.
/// </summary>
/// <param name="dataset">Examples to cluster; must contain at least <c>NumLeaves</c> items.</param>
/// <returns>The clustering result produced by the k-means step, as modified by <c>Update</c>.</returns>
/// <exception cref="ArgumentNullException">Raised via <c>Utils.ThrowException</c> when <paramref name="dataset"/> is null.</exception>
/// <exception cref="ArgumentValueException">Raised via <c>Utils.ThrowException</c> when the dataset has fewer than <c>NumLeaves</c> items.</exception>
public ClusteringResult Cluster(IUnlabeledExampleCollection<SparseVector<double>> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count < NumLeaves ? new ArgumentValueException("dataset") : null);

    // flat partition that provides the initial root clusters
    ClusteringResult result = mKMeansClustering.Cluster(dataset);

    // one L2-normalized centroid per initial cluster
    UnlabeledDataset<SparseVector<double>> leafCentroids = new UnlabeledDataset<SparseVector<double>>();
    foreach (Cluster leaf in result.Roots)
    {
        SparseVector<double> leafCentroid = ModelUtils.ComputeCentroid(leaf.Items, dataset, CentroidType.NrmL2);
        leafCentroids.Add(leafCentroid);
        // NOTE(review): the value returned by Trim is stored only after the centroid has
        // already been added above and is never read again. Unless Trim modifies its
        // argument in place, trimming has no effect on the matrices built below -- confirm intent.
        leafCentroid = Trim(leafCentroid, 1000, 0.8);
        leaf.ClusterInfo = 1; // cluster level
    }

    SparseMatrix<double> simMtx = ModelUtils.GetDotProductSimilarity(leafCentroids, /*thresh=*/0, /*fullMatrix=*/false);
    SparseMatrix<double> clustMtxTr = ModelUtils.GetTransposedMatrix(leafCentroids);

    // presumably each Update call merges the pair reported by FindMaxSim and thereby
    // shrinks result.Roots; the loop relies on that to terminate -- verify against Update
    for (int iteration = 1; result.Roots.Count > 1; iteration++)
    {
        Console.WriteLine("Iteration {0} ...", iteration);
        int first, second;
        FindMaxSim(simMtx, out first, out second);
        Update(simMtx, clustMtxTr, result.Roots.Count, first, second, result.Roots.Inner, dataset, /*damping=*/0.9);
        Console.WriteLine(simMtx.ToString("E0.00"));
        Console.WriteLine();
    }

    return result;
}
/// <summary>
/// Converts the current contents of <c>mCentroids</c> into a flat clustering:
/// one root cluster per centroid, holding a copy of the centroid's
/// <c>CurrentItems</c> and carrying the centroid's <c>Tag</c> as <c>ClusterInfo</c>.
/// </summary>
/// <returns>A newly created clustering result with one root per centroid.</returns>
protected ClusteringResult GetClusteringResult()
{
    ClusteringResult result = new ClusteringResult();
    foreach (CentroidData centroidData in mCentroids)
    {
        result.AddRoot(new Cluster());
        // work on the root that was just appended
        Cluster newestRoot = result.Roots.Last;
        newestRoot.Items.AddRange(centroidData.CurrentItems);
        newestRoot.ClusterInfo = centroidData.Tag;
    }
    return result;
}
/// <summary>
/// Puts every example of the dataset (indices 0 .. Count-1) into a single root
/// cluster and replaces <c>mCentroids</c> with a one-element list holding the
/// centroid built from those items.
/// </summary>
/// <param name="dataset">Examples that all go into the single cluster.</param>
/// <returns>A clustering result whose only root contains all example indices.</returns>
private ClusteringResult CreateSingleCluster(IUnlabeledExampleCollection<SparseVector<double>> dataset)
{
    // the one and only cluster
    Cluster everything = new Cluster();
    int exampleIdx = 0;
    while (exampleIdx < dataset.Count)
    {
        everything.Items.Add(exampleIdx);
        exampleIdx++;
    }

    ClusteringResult result = new ClusteringResult();
    result.AddRoot(everything);

    // matching centroid over the same items
    CentroidData singleCentroid = new CentroidData();
    singleCentroid.Items.AddRange(everything.Items);
    singleCentroid.Update(dataset);
    singleCentroid.UpdateCentroidLen();

    ArrayList<CentroidData> centroidList = new ArrayList<CentroidData>();
    centroidList.Add(singleCentroid);
    mCentroids = centroidList;

    return result;
}
/// <summary>
/// Builds a cluster tree by repeated bisection. Starting from a root that holds
/// all examples, every cluster whose quality (as computed by <c>GetClusterQuality</c>)
/// is below <c>mMinQuality</c> is split with <c>mKMeansClustering</c> and the first two
/// resulting clusters become its children. Each visited cluster gets its centroid
/// and quality stored in <c>ClusterInfo</c> as a pair.
/// </summary>
/// <param name="dataset">Examples to cluster; at least two, none of them empty.</param>
/// <returns>A clustering result with a single root; all <c>Items</c> in the tree are indices into <paramref name="dataset"/>.</returns>
/// <exception cref="ArgumentNullException">Raised via <c>Utils.ThrowException</c> when <paramref name="dataset"/> is null.</exception>
/// <exception cref="ArgumentValueException">Raised via <c>Utils.ThrowException</c> when the dataset has fewer than two items or contains an empty vector.</exception>
public ClusteringResult Cluster(IUnlabeledExampleCollection<SparseVector<double>> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count < 2 ? new ArgumentValueException("dataset") : null);
    ClusteringResult clusteringResult = new ClusteringResult();
    Queue<Cluster> queue = new Queue<Cluster>();
    // create root
    Cluster root = new Cluster();
    for (int i = 0; i < dataset.Count; i++)
    {
        Utils.ThrowException(dataset[i].Count == 0 ? new ArgumentValueException("dataset") : null);
        root.Items.Add(i);
    }
    clusteringResult.AddRoot(root);
    // add root to queue
    queue.Enqueue(root);
    while (queue.Count > 0)
    {
        // get next cluster
        Cluster cluster = queue.Dequeue();
        // compute cluster quality
        UnlabeledDataset<SparseVector<double>> localDataset = GetDatasetSubset(cluster.Items, dataset);
        SparseVector<double> centroid;
        double quality = GetClusterQuality(localDataset, out centroid);
        cluster.ClusterInfo = new Pair<SparseVector<double>, double>(centroid, quality);
        if (quality < mMinQuality)
        {
            // split cluster, add children to queue
            ClusteringResult localResult = mKMeansClustering.Cluster(localDataset);
            // The k-means step only sees localDataset, so the items of its clusters are
            // positions within localDataset. They must be translated back to indices into
            // the full dataset; otherwise the children would select the wrong examples
            // when they are dequeued and passed to GetDatasetSubset together with dataset.
            // NOTE(review): assumes GetDatasetSubset keeps the examples in the enumeration
            // order of cluster.Items -- confirm.
            List<int> globalIndices = new List<int>();
            foreach (int globalIdx in cluster.Items)
            {
                globalIndices.Add(globalIdx);
            }
            for (int i = 0; i < 2; i++)
            {
                Cluster child = localResult.Roots[i];
                // copy first: the item list is rewritten below
                List<int> localIndices = new List<int>();
                foreach (int localIdx in child.Items)
                {
                    localIndices.Add(localIdx);
                }
                child.Items.Clear();
                foreach (int localIdx in localIndices)
                {
                    child.Items.Add(globalIndices[localIdx]);
                }
                cluster.AddChild(child);
                child.Parent = cluster;
                queue.Enqueue(child);
            }
        }
    }
    return clusteringResult;
}
/// <summary>
/// Runs k-means <c>m_trials</c> times and returns the clustering with the highest
/// quality (average similarity of each example to its assigned centroid).
/// In every trial three random seed sets are drawn and the one with the lowest
/// average pairwise similarity is used as the initial centroids. A trial stops
/// as soon as the quality gain over the previous iteration is at most <c>m_eps</c>.
/// </summary>
/// <param name="dataset">Labeled examples to cluster; must contain at least <c>m_k</c> items.</param>
/// <returns>The best clustering found, or null when <c>m_trials</c> is less than one.</returns>
/// <exception cref="ArgumentNullException">Raised via <c>Utils.ThrowException</c> when <paramref name="dataset"/> is null.</exception>
/// <exception cref="ArgumentValueException">Raised via <c>Utils.ThrowException</c> when the dataset has fewer than <c>m_k</c> items.</exception>
public ClusteringResult Cluster(IExampleCollection<LblT, SparseVector<double>.ReadOnly> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count < m_k ? new ArgumentValueException("dataset") : null);
    ClusteringResult clustering = null;
    ClusteringResult best_clustering = null;
    double global_best_clust_qual = 0;
    for (int trial = 1; trial <= m_trials; trial++)
    {
        Utils.VerboseLine("*** CLUSTERING TRIAL {0} OF {1} ***", trial, m_trials);
        ArrayList<SparseVector<double>.ReadOnly> centroids = null;
        // first seed set drawn; used only if no seed set passes the comparison below
        ArrayList<SparseVector<double>.ReadOnly> fallback_seeds = null;
        clustering = new ClusteringResult();
        for (int i = 0; i < m_k; i++)
        {
            clustering.Roots.Add(new Cluster());
        }
        // select seed items
        double min_sim = double.MaxValue;
        ArrayList<int> tmp = new ArrayList<int>(dataset.Count);
        for (int i = 0; i < dataset.Count; i++)
        {
            tmp.Add(i);
        }
        for (int k = 0; k < 3; k++)
        {
            ArrayList<SparseVector<double>.ReadOnly> seeds = new ArrayList<SparseVector<double>.ReadOnly>(m_k);
            tmp.Shuffle(m_rnd);
            for (int i = 0; i < m_k; i++)
            {
                seeds.Add(ModelUtils.ComputeCentroid(new SparseVector<double>.ReadOnly[] { dataset[tmp[i]].Example }, m_centroid_type));
            }
            if (fallback_seeds == null)
            {
                fallback_seeds = seeds;
            }
            // assess quality of seed items
            double sim_avg = 0;
            foreach (SparseVector<double>.ReadOnly seed_1 in seeds)
            {
                foreach (SparseVector<double>.ReadOnly seed_2 in seeds)
                {
                    if (seed_1 != seed_2)
                    {
                        sim_avg += m_similarity.GetSimilarity(seed_1, seed_2);
                    }
                }
            }
            // for m_k == 1 this is 0/0 = NaN, which never compares less than min_sim
            sim_avg /= (double)(m_k * m_k - m_k);
            if (sim_avg < min_sim)
            {
                min_sim = sim_avg;
                centroids = seeds;
            }
        }
        // If no round produced a comparable average (single cluster, NaN similarities),
        // centroids would still be null and the main loop would fail with a
        // NullReferenceException; use the first seed set in that case.
        if (centroids == null)
        {
            centroids = fallback_seeds;
        }
        // main loop
        int iter = 0;
        double best_clust_qual = 0;
        double clust_qual;
        while (true)
        {
            iter++;
            clust_qual = 0;
            // assign items to clusters
            foreach (Cluster cluster in clustering.Roots)
            {
                cluster.Items.Clear();
            }
            for (int i = 0; i < dataset.Count; i++)
            {
                SparseVector<double>.ReadOnly example = dataset[i].Example;
                double max_sim = double.MinValue;
                // indices of all centroids that share the highest similarity
                ArrayList<int> candidates = new ArrayList<int>();
                for (int j = 0; j < m_k; j++)
                {
                    SparseVector<double>.ReadOnly centroid = centroids[j];
                    double sim = m_similarity.GetSimilarity(example, centroid);
                    if (sim > max_sim)
                    {
                        max_sim = sim;
                        candidates.Clear();
                        candidates.Add(j);
                    }
                    else if (sim == max_sim)
                    {
                        candidates.Add(j);
                    }
                }
                // ties are broken at random
                if (candidates.Count > 1)
                {
                    candidates.Shuffle(m_rnd);
                }
                if (candidates.Count > 0) // *** is this always true?
                {
                    clustering.Roots[candidates[0]].Items.Add(new Pair<double, int>(1, i));
                    clust_qual += max_sim;
                }
            }
            clust_qual /= (double)dataset.Count;
            Utils.VerboseLine("*** Iteration {0} ***", iter);
            Utils.VerboseLine("Quality: {0:0.0000}", clust_qual);
            // check if done
            if (iter > 1 && clust_qual - best_clust_qual <= m_eps)
            {
                break;
            }
            best_clust_qual = clust_qual;
            // compute new centroids
            for (int i = 0; i < m_k; i++)
            {
                centroids[i] = clustering.Roots[i].ComputeCentroid(dataset, m_centroid_type);
            }
        }
        // keep the best trial; the first trial is always accepted
        if (trial == 1 || clust_qual > global_best_clust_qual)
        {
            global_best_clust_qual = clust_qual;
            best_clustering = clustering;
        }
    }
    return best_clustering;
}
/// <summary>
/// Runs k-means <c>mTrials</c> times on dot-product similarity and returns the
/// clustering of the trial with the highest quality (average similarity of each
/// example to its assigned centroid). In every trial three random seed sets are
/// drawn and the one with the lowest average pairwise similarity initializes the
/// centroids. A trial stops as soon as the quality gain over the previous
/// iteration is at most <c>mEps</c>.
/// </summary>
/// <param name="dataset">Examples to cluster; must contain at least <c>mK</c> items.</param>
/// <returns>The best clustering found, or null when <c>mTrials</c> is less than one.</returns>
/// <exception cref="ArgumentNullException">Raised via <c>Utils.ThrowException</c> when <paramref name="dataset"/> is null.</exception>
/// <exception cref="ArgumentValueException">Raised via <c>Utils.ThrowException</c> when the dataset has fewer than <c>mK</c> items.</exception>
public ClusteringResult Cluster(IUnlabeledExampleCollection<SparseVector<double>> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count < mK ? new ArgumentValueException("dataset") : null);
    ClusteringResult clustering = null;
    double globalBestClustQual = 0;
    // The transposed dataset does not depend on the trial, so it is built once.
    // NOTE(review): assumes GetDotProductSimilarity does not modify the matrix -- confirm.
    SparseMatrix<double> dsMtx = ModelUtils.GetTransposedMatrix(dataset);
    for (int trial = 1; trial <= mTrials; trial++)
    {
        mLogger.Info("Cluster", "Clustering trial {0} of {1} ...", trial, mTrials);
        ArrayList<CentroidData> centroids = new ArrayList<CentroidData>(mK);
        ArrayList<int> bestSeeds = null;
        // first seed set drawn; used only if no seed set passes the comparison below
        ArrayList<int> fallbackSeeds = null;
        for (int i = 0; i < mK; i++)
        {
            centroids.Add(new CentroidData());
        }
        // select seed items
        double minSim = double.MaxValue;
        ArrayList<int> tmp = new ArrayList<int>(dataset.Count);
        for (int i = 0; i < dataset.Count; i++)
        {
            tmp.Add(i);
        }
        for (int k = 0; k < 3; k++)
        {
            ArrayList<SparseVector<double>> seeds = new ArrayList<SparseVector<double>>(mK);
            // dataset indices of this round's seeds (tmp is reshuffled every round)
            ArrayList<int> seedIndices = new ArrayList<int>(mK);
            tmp.Shuffle(mRnd);
            for (int i = 0; i < mK; i++)
            {
                seedIndices.Add(tmp[i]);
                seeds.Add(dataset[tmp[i]]);
            }
            if (fallbackSeeds == null)
            {
                fallbackSeeds = seedIndices;
            }
            // assess quality of seed items
            double simAvg = 0;
            foreach (SparseVector<double> seed1 in seeds)
            {
                foreach (SparseVector<double> seed2 in seeds)
                {
                    if (seed1 != seed2)
                    {
                        simAvg += DotProductSimilarity.Instance.GetSimilarity(seed1, seed2);
                    }
                }
            }
            // for mK == 1 this is 0/0 = NaN, which never compares less than minSim
            simAvg /= (double)(mK * mK - mK);
            if (simAvg < minSim)
            {
                minSim = simAvg;
                bestSeeds = seedIndices;
            }
        }
        // If no round produced a comparable average (single cluster, NaN similarities),
        // bestSeeds would still be null and the initialization below would fail with a
        // NullReferenceException; use the first seed set in that case.
        if (bestSeeds == null)
        {
            bestSeeds = fallbackSeeds;
        }
        for (int i = 0; i < mK; i++)
        {
            centroids[i].Items.Add(bestSeeds[i]);
            centroids[i].Update(dataset);
            centroids[i].UpdateCentroidLen();
        }
        // dotProd[c][n] = similarity between centroid c and example n
        double[][] dotProd = new double[mK][];
        // main loop
        int iter = 0;
        double bestClustQual = 0;
        double clustQual;
        while (true)
        {
            iter++;
            mLogger.Info("Cluster", "Iteration {0} ...", iter);
            clustQual = 0;
            // assign items to clusters
            int j = 0;
            foreach (CentroidData cen in centroids)
            {
                SparseVector<double> cenVec = cen.GetSparseVector();
                dotProd[j] = ModelUtils.GetDotProductSimilarity(dsMtx, dataset.Count, cenVec);
                j++;
            }
            for (int instIdx = 0; instIdx < dataset.Count; instIdx++)
            {
                double maxSim = double.MinValue;
                // indices of all centroids that share the highest similarity
                ArrayList<int> candidates = new ArrayList<int>();
                for (int cenIdx = 0; cenIdx < mK; cenIdx++)
                {
                    double sim = dotProd[cenIdx][instIdx];
                    if (sim > maxSim)
                    {
                        maxSim = sim;
                        candidates.Clear();
                        candidates.Add(cenIdx);
                    }
                    else if (sim == maxSim)
                    {
                        candidates.Add(cenIdx);
                    }
                }
                // ties are broken at random
                if (candidates.Count > 1)
                {
                    candidates.Shuffle(mRnd);
                }
                if (candidates.Count > 0) // *** is this always true?
                {
                    centroids[candidates[0]].Items.Add(instIdx);
                    clustQual += maxSim;
                }
            }
            clustQual /= (double)dataset.Count;
            mLogger.Info("Cluster", "Quality: {0:0.0000}", clustQual);
            // check if done
            if (iter > 1 && clustQual - bestClustQual <= mEps)
            {
                break;
            }
            bestClustQual = clustQual;
            // compute new centroids
            for (int i = 0; i < mK; i++)
            {
                centroids[i].Update(dataset);
                centroids[i].UpdateCentroidLen();
            }
        }
        // keep the best trial; the first trial is always accepted
        if (trial == 1 || clustQual > globalBestClustQual)
        {
            globalBestClustQual = clustQual;
            // save the result
            clustering = new ClusteringResult();
            for (int i = 0; i < mK; i++)
            {
                clustering.AddRoot(new Cluster());
                clustering.Roots.Last.Items.AddRange(centroids[i].Items);
            }
        }
    }
    return clustering;
}
/// <summary>
/// Runs k-means <c>mTrials</c> times and returns the clustering with the highest
/// quality (average similarity of each example to its assigned centroid).
/// In every trial three random seed sets are drawn and the one with the lowest
/// average pairwise similarity is used as the initial centroids. A trial stops
/// as soon as the quality gain over the previous iteration is at most <c>mEps</c>.
/// </summary>
/// <param name="dataset">Examples to cluster; must contain at least <c>mK</c> items.</param>
/// <returns>The best clustering found, or null when <c>mTrials</c> is less than one.</returns>
/// <exception cref="ArgumentNullException">Raised via <c>Utils.ThrowException</c> when <paramref name="dataset"/> is null.</exception>
/// <exception cref="ArgumentValueException">Raised via <c>Utils.ThrowException</c> when the dataset has fewer than <c>mK</c> items.</exception>
public ClusteringResult Cluster(IUnlabeledExampleCollection<SparseVector<double>> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count < mK ? new ArgumentValueException("dataset") : null);
    ClusteringResult clustering = null;
    ClusteringResult bestClustering = null;
    double globalBestClustQual = 0;
    for (int trial = 1; trial <= mTrials; trial++)
    {
        mLogger.Trace("Cluster", "Clustering trial {0} of {1} ...", trial, mTrials);
        ArrayList<SparseVector<double>> centroids = null;
        // first seed set drawn; used only if no seed set passes the comparison below
        ArrayList<SparseVector<double>> fallbackSeeds = null;
        clustering = new ClusteringResult();
        for (int i = 0; i < mK; i++)
        {
            clustering.AddRoot(new Cluster());
        }
        // select seed items
        double minSim = double.MaxValue;
        ArrayList<int> tmp = new ArrayList<int>(dataset.Count);
        for (int i = 0; i < dataset.Count; i++)
        {
            tmp.Add(i);
        }
        for (int k = 0; k < 3; k++)
        {
            ArrayList<SparseVector<double>> seeds = new ArrayList<SparseVector<double>>(mK);
            tmp.Shuffle(mRnd);
            for (int i = 0; i < mK; i++)
            {
                seeds.Add(ModelUtils.ComputeCentroid(new SparseVector<double>[] { dataset[tmp[i]] }, mCentroidType));
            }
            if (fallbackSeeds == null)
            {
                fallbackSeeds = seeds;
            }
            // assess quality of seed items
            double simAvg = 0;
            foreach (SparseVector<double> seed1 in seeds)
            {
                foreach (SparseVector<double> seed2 in seeds)
                {
                    if (seed1 != seed2)
                    {
                        simAvg += mSimilarity.GetSimilarity(seed1, seed2);
                    }
                }
            }
            // for mK == 1 this is 0/0 = NaN, which never compares less than minSim
            simAvg /= (double)(mK * mK - mK);
            if (simAvg < minSim)
            {
                minSim = simAvg;
                centroids = seeds;
            }
        }
        // If no round produced a comparable average (single cluster, NaN similarities),
        // centroids would still be null and the main loop would fail with a
        // NullReferenceException; use the first seed set in that case.
        if (centroids == null)
        {
            centroids = fallbackSeeds;
        }
        // main loop
        int iter = 0;
        double bestClustQual = 0;
        double clustQual;
        while (true)
        {
            iter++;
            mLogger.Trace("Cluster", "Iteration {0} ...", iter);
            clustQual = 0;
            // assign items to clusters
            foreach (Cluster cluster in clustering.Roots)
            {
                cluster.Items.Clear();
            }
            for (int i = 0; i < dataset.Count; i++)
            {
                SparseVector<double> example = dataset[i];
                double maxSim = double.MinValue;
                // indices of all centroids that share the highest similarity
                ArrayList<int> candidates = new ArrayList<int>();
                for (int j = 0; j < mK; j++)
                {
                    SparseVector<double> centroid = centroids[j];
                    double sim = mSimilarity.GetSimilarity(example, centroid);
                    if (sim > maxSim)
                    {
                        maxSim = sim;
                        candidates.Clear();
                        candidates.Add(j);
                    }
                    else if (sim == maxSim)
                    {
                        candidates.Add(j);
                    }
                }
                // ties are broken at random
                if (candidates.Count > 1)
                {
                    candidates.Shuffle(mRnd);
                }
                // candidates is non-empty as long as at least one similarity is a finite number
                clustering.Roots[candidates[0]].Items.Add(i);
                clustQual += maxSim;
            }
            clustQual /= (double)dataset.Count;
            mLogger.Trace("Cluster", "Quality: {0:0.0000}", clustQual);
            // check if done
            if (iter > 1 && clustQual - bestClustQual <= mEps)
            {
                break;
            }
            bestClustQual = clustQual;
            // compute new centroids
            for (int i = 0; i < mK; i++)
            {
                centroids[i] = clustering.Roots[i].ComputeCentroid(dataset, mCentroidType);
            }
        }
        // keep the best trial; the first trial is always accepted
        if (trial == 1 || clustQual > globalBestClustQual)
        {
            globalBestClustQual = clustQual;
            bestClustering = clustering;
        }
    }
    return bestClustering;
}