/// <summary>
/// Converts the current centroids (mCentroids) into a flat clustering result.
/// </summary>
/// <returns>
/// A ClusteringResult with one root cluster per centroid; each root receives the
/// centroid's CurrentItems and carries the centroid's Tag as its ClusterInfo.
/// </returns>
protected ClusteringResult GetClusteringResult()
{
    ClusteringResult result = new ClusteringResult();
    foreach (CentroidData centroidData in mCentroids)
    {
        // register an empty root first, then fill the instance that was just appended
        result.AddRoot(new Cluster());
        Cluster appended = result.Roots.Last;
        appended.Items.AddRange(centroidData.CurrentItems);
        appended.ClusterInfo = centroidData.Tag;
    }
    return result;
}
/// <summary>
/// Puts every example of the dataset into one cluster and resets mCentroids to a
/// single centroid built over the same items.
/// </summary>
/// <param name="dataset">The examples; only their indices 0 .. Count-1 are stored.</param>
/// <returns>A ClusteringResult whose only root contains all example indices.</returns>
private ClusteringResult CreateSingleCluster(IUnlabeledExampleCollection<SparseVector<double>> dataset)
{
    // the one and only cluster: all indices in ascending order
    Cluster everything = new Cluster();
    int index = 0;
    while (index < dataset.Count)
    {
        everything.Items.Add(index);
        index++;
    }
    ClusteringResult result = new ClusteringResult();
    result.AddRoot(everything);
    // matching centroid over the same items
    CentroidData soleCentroid = new CentroidData();
    soleCentroid.Items.AddRange(everything.Items);
    soleCentroid.Update(dataset);
    soleCentroid.UpdateCentroidLen();
    // replace the centroid list only after the centroid has been fully prepared
    ArrayList<CentroidData> centroidList = new ArrayList<CentroidData>();
    centroidList.Add(soleCentroid);
    mCentroids = centroidList;
    return result;
}
/// <summary>
/// Builds a cluster hierarchy top-down: starting from a root that holds every example,
/// each cluster whose quality (as reported by GetClusterQuality) is below mMinQuality is
/// split in two with mKMeansClustering, and the two halves are processed in turn.
/// </summary>
/// <param name="dataset">
/// The examples to cluster. Rejected (via Utils.ThrowException) when null, when it has
/// fewer than two examples, or when any example has Count == 0.
/// </param>
/// <returns>
/// A ClusteringResult with a single root. Every cluster's ClusterInfo is a
/// Pair(centroid, quality); every cluster's Items are indices into <paramref name="dataset"/>.
/// </returns>
public ClusteringResult Cluster(IUnlabeledExampleCollection<SparseVector<double>> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count < 2 ? new ArgumentValueException("dataset") : null);
    ClusteringResult clusteringResult = new ClusteringResult();
    Queue<Cluster> queue = new Queue<Cluster>();
    // create root
    Cluster root = new Cluster();
    for (int i = 0; i < dataset.Count; i++)
    {
        Utils.ThrowException(dataset[i].Count == 0 ? new ArgumentValueException("dataset") : null);
        root.Items.Add(i);
    }
    clusteringResult.AddRoot(root);
    // add root to queue
    queue.Enqueue(root);
    while (queue.Count > 0)
    {
        // get next cluster
        Cluster cluster = queue.Dequeue();
        // compute cluster quality
        UnlabeledDataset<SparseVector<double>> localDataset = GetDatasetSubset(cluster.Items, dataset);
        SparseVector<double> centroid;
        double quality = GetClusterQuality(localDataset, out centroid);
        cluster.ClusterInfo = new Pair<SparseVector<double>, double>(centroid, quality);
        if (quality < mMinQuality)
        {
            // split cluster, add children to queue
            ClusteringResult localResult = mKMeansClustering.Cluster(localDataset);
            // The split was computed on localDataset, so the children's items are positions
            // within localDataset, not within the full dataset. Translate them back through
            // the parent's item list before the children are attached and re-queued;
            // otherwise the next GetDatasetSubset call would pick the wrong examples.
            // NOTE(review): assumes GetDatasetSubset emits examples in the enumeration order
            // of cluster.Items -- confirm against its implementation.
            List<int> parentItems = new List<int>();
            foreach (int item in cluster.Items)
            {
                parentItems.Add(item);
            }
            for (int i = 0; i < 2; i++)
            {
                Cluster child = localResult.Roots[i];
                List<int> globalItems = new List<int>();
                foreach (int localIdx in child.Items)
                {
                    globalItems.Add(parentItems[localIdx]);
                }
                child.Items.Clear();
                child.Items.AddRange(globalItems);
                cluster.AddChild(child);
                child.Parent = cluster;
                queue.Enqueue(child);
            }
        }
    }
    return clusteringResult;
}
/// <summary>
/// Partitions the dataset into mK clusters using dot-product similarity. The procedure is
/// repeated mTrials times and the partition with the highest average item-to-centroid
/// similarity is returned.
/// </summary>
/// <param name="dataset">
/// The examples to cluster. Rejected (via Utils.ThrowException) when null or when it
/// contains fewer than mK examples.
/// </param>
/// <returns>
/// A ClusteringResult with mK roots holding example indices, or null if no trial was run.
/// </returns>
public ClusteringResult Cluster(IUnlabeledExampleCollection<SparseVector<double>> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count < mK ? new ArgumentValueException("dataset") : null);
    // number of random seed sets drawn per trial; the least mutually similar one is kept
    const int seedAttempts = 3;
    ClusteringResult clustering = null;
    double globalBestClustQual = 0;
    // The transposed matrix depends only on the dataset, so it is built once here
    // rather than once per trial.
    SparseMatrix<double> dsMtx = ModelUtils.GetTransposedMatrix(dataset);
    for (int trial = 1; trial <= mTrials; trial++)
    {
        mLogger.Info("Cluster", "Clustering trial {0} of {1} ...", trial, mTrials);
        ArrayList<CentroidData> centroids = new ArrayList<CentroidData>(mK);
        ArrayList<int> bestSeeds = null;
        for (int i = 0; i < mK; i++)
        {
            centroids.Add(new CentroidData());
        }
        // select seed items
        double minSim = double.MaxValue;
        ArrayList<int> tmp = new ArrayList<int>(dataset.Count);
        for (int i = 0; i < dataset.Count; i++)
        {
            tmp.Add(i);
        }
        for (int k = 0; k < seedAttempts; k++)
        {
            ArrayList<SparseVector<double>> seeds = new ArrayList<SparseVector<double>>(mK);
            tmp.Shuffle(mRnd);
            for (int i = 0; i < mK; i++)
            {
                seeds.Add(dataset[tmp[i]]);
            }
            // assess quality of seed items: average similarity over all ordered pairs
            double simAvg = 0;
            foreach (SparseVector<double> seed1 in seeds)
            {
                foreach (SparseVector<double> seed2 in seeds)
                {
                    if (seed1 != seed2)
                    {
                        simAvg += DotProductSimilarity.Instance.GetSimilarity(seed1, seed2);
                    }
                }
            }
            simAvg /= (double)(mK * mK - mK);
            // NOTE(review): bestSeeds stays null if simAvg never compares below
            // double.MaxValue (e.g. it is NaN); the seeding loop below would then fail.
            if (simAvg < minSim)
            {
                minSim = simAvg;
                bestSeeds = new ArrayList<int>(mK);
                for (int i = 0; i < mK; i++)
                {
                    bestSeeds.Add(tmp[i]);
                }
            }
        }
        // each centroid starts from exactly one seed item
        for (int i = 0; i < mK; i++)
        {
            centroids[i].Items.Add(bestSeeds[i]);
            centroids[i].Update(dataset);
            centroids[i].UpdateCentroidLen();
        }
        double[][] dotProd = new double[mK][];
        // main loop
        int iter = 0;
        double bestClustQual = 0;
        double clustQual;
        while (true)
        {
            iter++;
            mLogger.Info("Cluster", "Iteration {0} ...", iter);
            clustQual = 0;
            // assign items to clusters: one row of similarities per centroid
            int j = 0;
            foreach (CentroidData cen in centroids)
            {
                SparseVector<double> cenVec = cen.GetSparseVector();
                dotProd[j] = ModelUtils.GetDotProductSimilarity(dsMtx, dataset.Count, cenVec);
                j++;
            }
            for (int instIdx = 0; instIdx < dataset.Count; instIdx++)
            {
                double maxSim = double.MinValue;
                ArrayList<int> candidates = new ArrayList<int>();
                for (int cenIdx = 0; cenIdx < mK; cenIdx++)
                {
                    double sim = dotProd[cenIdx][instIdx];
                    if (sim > maxSim)
                    {
                        maxSim = sim;
                        candidates.Clear();
                        candidates.Add(cenIdx);
                    }
                    else if (sim == maxSim)
                    {
                        candidates.Add(cenIdx);
                    }
                }
                // break ties between equally similar centroids at random
                if (candidates.Count > 1)
                {
                    candidates.Shuffle(mRnd);
                }
                // candidates is empty only if every similarity is NaN or negative infinity
                // (neither compares greater than or equal to double.MinValue); such an item
                // is left unassigned and contributes nothing to the quality sum.
                if (candidates.Count > 0)
                {
                    centroids[candidates[0]].Items.Add(instIdx);
                    clustQual += maxSim;
                }
            }
            clustQual /= (double)dataset.Count;
            mLogger.Info("Cluster", "Quality: {0:0.0000}", clustQual);
            // check if done: stop once the quality gain over the previous iteration is at most mEps
            if (iter > 1 && clustQual - bestClustQual <= mEps)
            {
                break;
            }
            bestClustQual = clustQual;
            // compute new centroids
            for (int i = 0; i < mK; i++)
            {
                centroids[i].Update(dataset);
                centroids[i].UpdateCentroidLen();
            }
        }
        if (trial == 1 || clustQual > globalBestClustQual)
        {
            globalBestClustQual = clustQual;
            // save the result
            clustering = new ClusteringResult();
            for (int i = 0; i < mK; i++)
            {
                clustering.AddRoot(new Cluster());
                clustering.Roots.Last.Items.AddRange(centroids[i].Items);
            }
        }
    }
    return clustering;
}
/// <summary>
/// Partitions the dataset into mK clusters using the configured similarity measure
/// (mSimilarity) and centroid type (mCentroidType). The procedure is run mTrials times
/// and the partition with the highest average item-to-centroid similarity is returned.
/// </summary>
/// <param name="dataset">
/// The examples to cluster. Rejected (via Utils.ThrowException) when null or when it
/// contains fewer than mK examples.
/// </param>
/// <returns>
/// The best ClusteringResult found (mK roots holding example indices), or null if no
/// trial was run.
/// </returns>
public ClusteringResult Cluster(IUnlabeledExampleCollection<SparseVector<double>> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count < mK ? new ArgumentValueException("dataset") : null);
    ClusteringResult winner = null;
    double winnerQuality = 0;
    for (int trial = 1; trial <= mTrials; trial++)
    {
        mLogger.Trace("Cluster", "Clustering trial {0} of {1} ...", trial, mTrials);
        // fresh, empty partition for this trial
        ClusteringResult current = new ClusteringResult();
        for (int c = 0; c < mK; c++)
        {
            current.AddRoot(new Cluster());
        }
        // seed selection: draw three random seed sets, keep the least mutually similar one
        ArrayList<SparseVector<double>> centers = null;
        double lowestAvgSim = double.MaxValue;
        ArrayList<int> order = new ArrayList<int>(dataset.Count);
        for (int n = 0; n < dataset.Count; n++)
        {
            order.Add(n);
        }
        for (int attempt = 0; attempt < 3; attempt++)
        {
            order.Shuffle(mRnd);
            ArrayList<SparseVector<double>> picked = new ArrayList<SparseVector<double>>(mK);
            for (int s = 0; s < mK; s++)
            {
                SparseVector<double>[] single = new SparseVector<double>[] { dataset[order[s]] };
                picked.Add(ModelUtils.ComputeCentroid(single, mCentroidType));
            }
            // average similarity over all ordered pairs of distinct seeds
            double avgSim = 0;
            for (int a = 0; a < picked.Count; a++)
            {
                for (int b = 0; b < picked.Count; b++)
                {
                    if (picked[a] != picked[b])
                    {
                        avgSim += mSimilarity.GetSimilarity(picked[a], picked[b]);
                    }
                }
            }
            avgSim = avgSim / (double)(mK * mK - mK);
            if (avgSim < lowestAvgSim)
            {
                lowestAvgSim = avgSim;
                centers = picked;
            }
        }
        // assign / re-estimate until the quality gain drops to mEps or below
        int pass = 0;
        double previousQuality = 0;
        double quality;
        bool converged;
        do
        {
            pass++;
            mLogger.Trace("Cluster", "Iteration {0} ...", pass);
            quality = 0;
            // start every pass from empty clusters
            foreach (Cluster root in current.Roots)
            {
                root.Items.Clear();
            }
            for (int itemIdx = 0; itemIdx < dataset.Count; itemIdx++)
            {
                SparseVector<double> example = dataset[itemIdx];
                double topSim = double.MinValue;
                ArrayList<int> ties = new ArrayList<int>();
                for (int c = 0; c < mK; c++)
                {
                    double sim = mSimilarity.GetSimilarity(example, centers[c]);
                    if (sim > topSim)
                    {
                        topSim = sim;
                        ties.Clear();
                        ties.Add(c);
                    }
                    else if (sim == topSim)
                    {
                        ties.Add(c);
                    }
                }
                // several equally similar centroids: pick one at random
                if (ties.Count > 1)
                {
                    ties.Shuffle(mRnd);
                }
                current.Roots[ties[0]].Items.Add(itemIdx);
                quality += topSim;
            }
            quality = quality / (double)dataset.Count;
            mLogger.Trace("Cluster", "Quality: {0:0.0000}", quality);
            converged = pass > 1 && quality - previousQuality <= mEps;
            if (!converged)
            {
                previousQuality = quality;
                // re-estimate the centroids from the new assignment
                for (int c = 0; c < mK; c++)
                {
                    centers[c] = current.Roots[c].ComputeCentroid(dataset, mCentroidType);
                }
            }
        }
        while (!converged);
        // the first trial always wins; later ones only when strictly better
        if (trial == 1 || quality > winnerQuality)
        {
            winnerQuality = quality;
            winner = current;
        }
    }
    return winner;
}