/// <summary>
/// Clusters the dataset hierarchically: first partitions it into NumLeaves leaf
/// clusters via k-means, then agglomeratively merges the two most similar
/// clusters (see FindMaxSim / Update) until a single root remains.
/// </summary>
/// <param name="dataset">The unlabeled examples to cluster. Must be non-null and contain at least NumLeaves items.</param>
/// <returns>The clustering result; cluster hierarchy is reachable through Roots/children.</returns>
/// <exception cref="ArgumentNullException">If dataset is null.</exception>
/// <exception cref="ArgumentValueException">If dataset.Count &lt; NumLeaves.</exception>
public ClusteringResult Cluster(IUnlabeledExampleCollection <SparseVector <double> > dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count < NumLeaves ? new ArgumentValueException("dataset") : null);
    // create the leaf clusters via k-means
    ClusteringResult clusters = mKMeansClustering.Cluster(dataset);
    UnlabeledDataset <SparseVector <double> > centroids = new UnlabeledDataset <SparseVector <double> >();
    foreach (Cluster cluster in clusters.Roots)
    {
        SparseVector <double> centroid = ModelUtils.ComputeCentroid(cluster.Items, dataset, CentroidType.NrmL2);
        // BUG FIX: trim *before* storing the centroid. The original code called
        // Trim after centroids.Add(centroid) and discarded the result (dead
        // assignment), so the untrimmed vector went into the similarity and
        // transposed-centroid matrices. Update(...) trims merged centroids
        // before use, so leaf centroids are made consistent with that here.
        centroid = Trim(centroid, 1000, 0.8);
        centroids.Add(centroid);
        cluster.ClusterInfo = 1; // cluster level (leaves start at level 1; Update assigns max(child levels) + 1)
    }
    // pairwise centroid similarities (half-matrix) and transposed centroid matrix,
    // both maintained incrementally by Update(...) during the merge loop
    SparseMatrix <double> simMtx = ModelUtils.GetDotProductSimilarity(centroids, /*thresh=*/ 0, /*fullMatrix=*/ false);
    SparseMatrix <double> clustMtxTr = ModelUtils.GetTransposedMatrix(centroids);
    int iter = 1;
    while (clusters.Roots.Count > 1)
    {
        Console.WriteLine("Iteration {0} ...", iter++);
        int idx1, idx2;
        // find and merge the two most similar clusters
        FindMaxSim(simMtx, out idx1, out idx2);
        Update(simMtx, clustMtxTr, clusters.Roots.Count, idx1, idx2, clusters.Roots.Inner, dataset, /*damping=*/ 0.9);
        Console.WriteLine(simMtx.ToString("E0.00"));
        Console.WriteLine();
    }
    return clusters;
}
// Merges clusters at idx1 and idx2 (requires idx1 < idx2) into a new parent
// cluster, appended at the end of 'clusters', and incrementally updates both
// the half similarity matrix 'simMtx' and the transposed centroid matrix
// 'clustMtxTr' (rows = features, cols = clusters) to reflect the merge.
// 'damping' down-weights similarities involving high-level (large) clusters:
// each new similarity is multiplied by damping^((level(parent)+level(other))/2).
private void Update(SparseMatrix <double> simMtx, SparseMatrix <double> clustMtxTr, int numClusters, int idx1, int idx2, ArrayList <Cluster> clusters, IUnlabeledExampleCollection <SparseVector <double> > dataset, double damping)
{
    Debug.Assert(idx1 < idx2);
    // create new parent
    Cluster c1 = clusters[idx1];
    Cluster c2 = clusters[idx2];
    Cluster parent = new Cluster();
    parent.Items.AddRange(c1.Items);
    parent.Items.AddRange(c2.Items);
    // ClusterInfo holds the cluster's merge level: leaves are 1 (set by Cluster),
    // a parent is one above its deepest child
    parent.ClusterInfo = Math.Max((int)c1.ClusterInfo, (int)c2.ClusterInfo) + 1;
    c1.Parent = parent;
    c2.Parent = parent;
    parent.AddChild(c1);
    parent.AddChild(c2);
    SparseVector <double> centroid = ModelUtils.ComputeCentroid(parent.Items, dataset, CentroidType.NrmL2);
    // reduce the centroid to its dominant components (presumably at most 1000
    // components / 0.8 weight cutoff — TODO confirm Trim's parameter semantics)
    centroid = Trim(centroid, 1000, 0.8);
    // remove merged clusters (higher index first so the lower index stays valid)
    clusters.RemoveAt(idx2);
    clusters.RemoveAt(idx1);
    // add new parent (at the end, i.e. new index numClusters - 2)
    clusters.Add(parent);
    // remove rows at idx1 and idx2 (higher index first, as above)
    simMtx.PurgeRowAt(idx2);
    simMtx.PurgeRowAt(idx1);
    // remove cols at idx1 and idx2
    simMtx.PurgeColAt(idx2);
    simMtx.PurgeColAt(idx1);
    clustMtxTr.PurgeColAt(idx2);
    clustMtxTr.PurgeColAt(idx1);
    // update matrices: numClusters is now the new parent's column index
    numClusters -= 2;
    // write the parent centroid into the transposed matrix, column 'numClusters';
    // components are appended via InnerIdx/InnerDat, which keeps rows sorted
    // because numClusters is the largest column index so far
    foreach (IdxDat <double> item in centroid)
    {
        if (clustMtxTr[item.Idx] == null)
        {
            clustMtxTr[item.Idx] = new SparseVector <double>(new IdxDat <double>[] { new IdxDat <double>(numClusters, item.Dat) });
        }
        else
        {
            clustMtxTr[item.Idx].InnerIdx.Add(numClusters);
            clustMtxTr[item.Idx].InnerDat.Add(item.Dat);
        }
    }
    // similarities of the new parent to every cluster (including itself at the
    // last index, since there are numClusters + 1 columns now)
    double[] simVec = ModelUtils.GetDotProductSimilarity(clustMtxTr, numClusters + 1, centroid);
    for (int i = 0; i < simVec.Length; i++)
    {
        // damp by the mean merge level of the two clusters involved
        simVec[i] *= Math.Pow(damping, (double)((int)parent.ClusterInfo + (int)clusters[i].ClusterInfo) / 2.0);
    }
    // append the damped similarities as a new column at index numClusters;
    // only a column is needed because simMtx is a half-matrix (fullMatrix=false)
    SparseMatrix <double> col = new SparseMatrix <double>();
    col[0] = new SparseVector <double>(simVec);
    simMtx.AppendCols(col.GetTransposedCopy(), numClusters);
}
/// <summary>
/// Computes the quality of a cluster as the average dot-product similarity
/// between its items and their (L2-normalized) centroid.
/// </summary>
/// <param name="dataset">The items belonging to the cluster.</param>
/// <param name="centroid">Outputs the computed cluster centroid.</param>
/// <returns>The mean item-to-centroid similarity.</returns>
private double GetClusterQuality(IUnlabeledExampleCollection <SparseVector <double> > dataset, out SparseVector <double> centroid)
{
    // compute centroid
    centroid = ModelUtils.ComputeCentroid(dataset, CentroidType.NrmL2);
    // compute intra-cluster similarities
    double[] simData = ModelUtils.GetDotProductSimilarity(dataset, centroid);
    // compute cluster quality as the mean similarity
    double sum = 0;
    foreach (double sim in simData)
    {
        sum += sim;
    }
    return sum / (double)simData.Length;
}
/// <summary>
/// Trains the centroid classifier: groups the training examples by label and
/// computes one centroid per distinct label (L2-normalized if m_normalize is
/// set, plain average otherwise).
/// </summary>
/// <param name="dataset">The labeled training examples. Must be non-null and non-empty.</param>
/// <exception cref="ArgumentNullException">If dataset is null.</exception>
/// <exception cref="ArgumentValueException">If dataset is empty.</exception>
public void Train(IExampleCollection <LblT, SparseVector <double> .ReadOnly> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count == 0 ? new ArgumentValueException("dataset") : null);
    m_centroids = new ArrayList <Pair <LblT, SparseVector <double> .ReadOnly> >();
    // group examples by label (labels compared with m_lbl_cmp)
    Dictionary <LblT, ArrayList <SparseVector <double> .ReadOnly> > tmp = new Dictionary <LblT, ArrayList <SparseVector <double> .ReadOnly> >(m_lbl_cmp);
    foreach (LabeledExample <LblT, SparseVector <double> .ReadOnly> labeled_example in dataset)
    {
        // single TryGetValue lookup instead of the original ContainsKey +
        // Add/indexer pair (which looked the key up twice per example)
        ArrayList <SparseVector <double> .ReadOnly> examples;
        if (!tmp.TryGetValue(labeled_example.Label, out examples))
        {
            examples = new ArrayList <SparseVector <double> .ReadOnly>();
            tmp.Add(labeled_example.Label, examples);
        }
        examples.Add(labeled_example.Example);
    }
    // compute one centroid per label
    foreach (KeyValuePair <LblT, ArrayList <SparseVector <double> .ReadOnly> > centroid_data in tmp)
    {
        SparseVector <double> centroid = ModelUtils.ComputeCentroid(centroid_data.Value, m_normalize ? CentroidType.NrmL2 : CentroidType.Avg);
        m_centroids.Add(new Pair <LblT, SparseVector <double> .ReadOnly>(centroid_data.Key, centroid));
    }
}
// K-means clustering over a labeled dataset (labels themselves are not used by
// the algorithm; only the examples are clustered). Runs m_trials independent
// restarts and keeps the clustering with the highest average item-to-centroid
// similarity. Throws ArgumentNullException if dataset is null and
// ArgumentValueException if dataset.Count < m_k.
public ClusteringResult Cluster(IExampleCollection <LblT, SparseVector <double> .ReadOnly> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count < m_k ? new ArgumentValueException("dataset") : null);
    ClusteringResult clustering = null;
    ClusteringResult best_clustering = null;
    double global_best_clust_qual = 0;
    for (int trial = 1; trial <= m_trials; trial++)
    {
        Utils.VerboseLine("*** CLUSTERING TRIAL {0} OF {1} ***", trial, m_trials);
        ArrayList <SparseVector <double> .ReadOnly> centroids = null;
        clustering = new ClusteringResult();
        for (int i = 0; i < m_k; i++) { clustering.Roots.Add(new Cluster()); }
        // select seed items: try 3 random samples of m_k items and keep the
        // sample whose seeds are least similar to each other (best spread)
        double min_sim = double.MaxValue;
        ArrayList <int> tmp = new ArrayList <int>(dataset.Count);
        for (int i = 0; i < dataset.Count; i++) { tmp.Add(i); }
        for (int k = 0; k < 3; k++)
        {
            ArrayList <SparseVector <double> .ReadOnly> seeds = new ArrayList <SparseVector <double> .ReadOnly>(m_k);
            tmp.Shuffle(m_rnd);
            // each seed is a single example run through ComputeCentroid (this
            // applies the configured centroid normalization to it)
            for (int i = 0; i < m_k; i++)
            {
                seeds.Add(ModelUtils.ComputeCentroid(new SparseVector <double> .ReadOnly[] { dataset[tmp[i]].Example }, m_centroid_type));
            }
            // assess quality of seed items: average pairwise similarity
            // (lower = better spread)
            double sim_avg = 0;
            foreach (SparseVector <double> .ReadOnly seed_1 in seeds)
            {
                foreach (SparseVector <double> .ReadOnly seed_2 in seeds)
                {
                    if (seed_1 != seed_2) { sim_avg += m_similarity.GetSimilarity(seed_1, seed_2); }
                }
            }
            sim_avg /= (double)(m_k * m_k - m_k); // m_k*(m_k-1) ordered pairs
            //Console.WriteLine(sim_avg);
            if (sim_avg < min_sim) { min_sim = sim_avg; centroids = seeds; }
        }
        // main loop: alternate assignment and centroid recomputation until the
        // quality gain drops to m_eps or below
        int iter = 0;
        double best_clust_qual = 0;
        double clust_qual;
        while (true)
        {
            iter++;
            clust_qual = 0;
            // assign items to clusters
            foreach (Cluster cluster in clustering.Roots) { cluster.Items.Clear(); }
            for (int i = 0; i < dataset.Count; i++)
            {
                SparseVector <double> .ReadOnly example = dataset[i].Example;
                double max_sim = double.MinValue;
                ArrayList <int> candidates = new ArrayList <int>();
                // collect all centroids tied for highest similarity
                for (int j = 0; j < m_k; j++)
                {
                    SparseVector <double> .ReadOnly centroid = centroids[j];
                    double sim = m_similarity.GetSimilarity(example, centroid);
                    if (sim > max_sim) { max_sim = sim; candidates.Clear(); candidates.Add(j); }
                    else if (sim == max_sim) { candidates.Add(j); }
                }
                // break ties randomly
                if (candidates.Count > 1) { candidates.Shuffle(m_rnd); }
                if (candidates.Count > 0) // *** is this always true? (it is unless GetSimilarity returns < double.MinValue, which cannot happen)
                {
                    clustering.Roots[candidates[0]].Items.Add(new Pair <double, int>(1, i));
                    clust_qual += max_sim;
                }
            }
            clust_qual /= (double)dataset.Count;
            Utils.VerboseLine("*** Iteration {0} ***", iter);
            Utils.VerboseLine("Quality: {0:0.0000}", clust_qual);
            // check if done (NOTE(review): the trial comparison below uses the
            // final clust_qual, which may be slightly below best_clust_qual)
            if (iter > 1 && clust_qual - best_clust_qual <= m_eps) { break; }
            best_clust_qual = clust_qual;
            // compute new centroids
            for (int i = 0; i < m_k; i++) { centroids[i] = clustering.Roots[i].ComputeCentroid(dataset, m_centroid_type); }
        }
        // keep the best trial
        if (trial == 1 || clust_qual > global_best_clust_qual) { global_best_clust_qual = clust_qual; best_clustering = clustering; }
    }
    return(best_clustering);
}
/// <summary>
/// Computes the centroid of this cluster's items over the given dataset.
/// </summary>
/// <param name="dataset">The dataset the cluster's item indices refer to.</param>
/// <param name="type">The centroid type (e.g. averaged or L2-normalized).</param>
/// <returns>The computed centroid vector.</returns>
public SparseVector <double> ComputeCentroid(IUnlabeledExampleCollection <SparseVector <double> > dataset, CentroidType type)
{
    // delegate to the shared helper over this cluster's items
    return ModelUtils.ComputeCentroid(mItems, dataset, type); // throws ArgumentValueException
}
// K-means clustering over an unlabeled dataset. Runs mTrials independent
// restarts and keeps the clustering with the highest average item-to-centroid
// similarity. Throws ArgumentNullException if dataset is null and
// ArgumentValueException if dataset.Count < mK.
public ClusteringResult Cluster(IUnlabeledExampleCollection <SparseVector <double> > dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count < mK ? new ArgumentValueException("dataset") : null);
    ClusteringResult clustering = null;
    ClusteringResult bestClustering = null;
    double globalBestClustQual = 0;
    for (int trial = 1; trial <= mTrials; trial++)
    {
        mLogger.Trace("Cluster", "Clustering trial {0} of {1} ...", trial, mTrials);
        ArrayList <SparseVector <double> > centroids = null;
        clustering = new ClusteringResult();
        for (int i = 0; i < mK; i++) { clustering.AddRoot(new Cluster()); }
        // select seed items: try 3 random samples of mK items and keep the
        // sample whose seeds are least similar to each other (best spread)
        double minSim = double.MaxValue;
        ArrayList <int> tmp = new ArrayList <int>(dataset.Count);
        for (int i = 0; i < dataset.Count; i++) { tmp.Add(i); }
        for (int k = 0; k < 3; k++)
        {
            ArrayList <SparseVector <double> > seeds = new ArrayList <SparseVector <double> >(mK);
            tmp.Shuffle(mRnd);
            // each seed is a single example run through ComputeCentroid (this
            // applies the configured centroid normalization to it)
            for (int i = 0; i < mK; i++)
            {
                seeds.Add(ModelUtils.ComputeCentroid(new SparseVector <double>[] { dataset[tmp[i]] }, mCentroidType));
            }
            // assess quality of seed items: average pairwise similarity
            // (lower = better spread)
            double simAvg = 0;
            foreach (SparseVector <double> seed1 in seeds)
            {
                foreach (SparseVector <double> seed2 in seeds)
                {
                    if (seed1 != seed2) { simAvg += mSimilarity.GetSimilarity(seed1, seed2); }
                }
            }
            simAvg /= (double)(mK * mK - mK); // mK*(mK-1) ordered pairs
            if (simAvg < minSim) { minSim = simAvg; centroids = seeds; }
        }
        // main loop: alternate assignment and centroid recomputation until the
        // quality gain drops to mEps or below
        int iter = 0;
        double bestClustQual = 0;
        double clustQual;
        while (true)
        {
            iter++;
            mLogger.Trace("Cluster", "Iteration {0} ...", iter);
            clustQual = 0;
            // assign items to clusters
            foreach (Cluster cluster in clustering.Roots) { cluster.Items.Clear(); }
            for (int i = 0; i < dataset.Count; i++)
            {
                SparseVector <double> example = dataset[i];
                double maxSim = double.MinValue;
                ArrayList <int> candidates = new ArrayList <int>();
                // collect all centroids tied for highest similarity
                for (int j = 0; j < mK; j++)
                {
                    SparseVector <double> centroid = centroids[j];
                    double sim = mSimilarity.GetSimilarity(example, centroid);
                    if (sim > maxSim) { maxSim = sim; candidates.Clear(); candidates.Add(j); }
                    else if (sim == maxSim) { candidates.Add(j); }
                }
                // break ties randomly; candidates is never empty because the
                // first comparison against double.MinValue always adds an index
                if (candidates.Count > 1) { candidates.Shuffle(mRnd); }
                clustering.Roots[candidates[0]].Items.Add(i);
                clustQual += maxSim;
            }
            clustQual /= (double)dataset.Count;
            mLogger.Trace("Cluster", "Quality: {0:0.0000}", clustQual);
            // check if done (NOTE(review): the trial comparison below uses the
            // final clustQual, which may be slightly below bestClustQual)
            if (iter > 1 && clustQual - bestClustQual <= mEps) { break; }
            bestClustQual = clustQual;
            // compute new centroids
            for (int i = 0; i < mK; i++) { centroids[i] = clustering.Roots[i].ComputeCentroid(dataset, mCentroidType); }
        }
        // keep the best trial
        if (trial == 1 || clustQual > globalBestClustQual) { globalBestClustQual = clustQual; bestClustering = clustering; }
    }
    return(bestClustering);
}