// Computes pairwise dot-product similarities between all examples in the dataset.
// Similarities at or below thresh are omitted from the result. When fullMatrix is
// false, only the upper (right) triangular part of the matrix is produced.
public static SparseMatrix<double> GetDotProductSimilarity(IUnlabeledExampleCollection<SparseVector<double>> dataset, double thresh, bool fullMatrix)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(thresh < 0 ? new ArgumentOutOfRangeException("thresh") : null);
    SparseMatrix<double> trMtx = GetTransposedMatrix(dataset);
    double[] simBuf = new double[dataset.Count];
    SparseMatrix<double> simMtx = new SparseMatrix<double>();
    int row = 0;
    foreach (SparseVector<double> example in dataset)
    {
        // for the triangular form, skip columns left of the diagonal
        int startIdx = fullMatrix ? 0 : row;
        GetDotProductSimilarity(example, simBuf, trMtx, startIdx);
        for (int col = 0; col < simBuf.Length; col++)
        {
            double sim = simBuf[col];
            simBuf[col] = 0; // reset the shared buffer for the next row
            if (sim <= thresh) { continue; }
            if (simMtx.ContainsRowAt(row))
            {
                simMtx[row].InnerIdx.Add(col);
                simMtx[row].InnerDat.Add(sim);
            }
            else
            {
                simMtx[row] = new SparseVector<double>(new IdxDat<double>[] { new IdxDat<double>(col, sim) });
            }
        }
        row++;
    }
    return simMtx;
}
// *** Dataset utilities ***

// Builds the transpose of the dataset viewed as a sparse matrix: row i of the
// result collects, for every example j, the value of component i of example j.
public static SparseMatrix<double> GetTransposedMatrix(IUnlabeledExampleCollection<SparseVector<double>> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    SparseMatrix<double> trMtx = new SparseMatrix<double>();
    int colIdx = 0;
    foreach (SparseVector<double> example in dataset)
    {
        foreach (IdxDat<double> comp in example)
        {
            if (trMtx.ContainsRowAt(comp.Idx))
            {
                trMtx[comp.Idx].InnerIdx.Add(colIdx);
                trMtx[comp.Idx].InnerDat.Add(comp.Dat);
            }
            else
            {
                trMtx[comp.Idx] = new SparseVector<double>(new IdxDat<double>[] { new IdxDat<double>(colIdx, comp.Dat) });
            }
        }
        colIdx++;
    }
    return trMtx;
}
// Agglomerative clustering on top of k-means: first creates NumLeaves leaf
// clusters, then repeatedly merges the two most similar clusters until a single
// root remains. Each cluster's ClusterInfo holds its level in the hierarchy.
public ClusteringResult Cluster(IUnlabeledExampleCollection <SparseVector <double> > dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count < NumLeaves ? new ArgumentValueException("dataset") : null);
    // create the leaf clusters via k-means
    ClusteringResult clusters = mKMeansClustering.Cluster(dataset);
    UnlabeledDataset <SparseVector <double> > centroids = new UnlabeledDataset <SparseVector <double> >();
    foreach (Cluster cluster in clusters.Roots)
    {
        SparseVector <double> centroid = ModelUtils.ComputeCentroid(cluster.Items, dataset, CentroidType.NrmL2);
        centroids.Add(centroid);
        // NOTE(review): the trimmed vector is assigned to the local 'centroid'
        // after the un-trimmed centroid was already added to 'centroids' and is
        // not read afterwards — confirm this is intentional
        centroid = Trim(centroid, 1000, 0.8);
        cluster.ClusterInfo = 1; // cluster level
    }
    // pairwise centroid similarities (upper triangular) and transposed centroid matrix
    SparseMatrix <double> simMtx = ModelUtils.GetDotProductSimilarity(centroids, /*thresh=*/ 0, /*fullMatrix=*/ false);
    SparseMatrix <double> clustMtxTr = ModelUtils.GetTransposedMatrix(centroids);
    int iter = 1;
    // each Update call removes two clusters and adds one parent, so the count shrinks
    while (clusters.Roots.Count > 1)
    {
        Console.WriteLine("Iteration {0} ...", iter++);
        int idx1, idx2;
        FindMaxSim(simMtx, out idx1, out idx2);
        Update(simMtx, clustMtxTr, clusters.Roots.Count, idx1, idx2, clusters.Roots.Inner, dataset, /*damping=*/ 0.9);
        Console.WriteLine(simMtx.ToString("E0.00"));
        Console.WriteLine();
    }
    return(clusters);
}
// Recomputes each centroid from its assigned items and refreshes its cached length.
internal void Update(IUnlabeledExampleCollection<SparseVector<double>> dataset, ArrayList<CentroidData> centroids)
{
    foreach (CentroidData cen in centroids)
    {
        cen.Update(dataset);
        cen.UpdateCentroidLen();
    }
}
// Flattens the cluster hierarchy rooted at mRoots into a labeled dataset where
// every example is labeled with the cluster that contains it.
public LabeledDataset<Cluster, ExT> GetClassificationDataset<ExT>(IUnlabeledExampleCollection<ExT> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    LabeledDataset<Cluster, ExT> result = new LabeledDataset<Cluster, ExT>();
    FillClassificationDataset(mRoots, dataset, result); // throws ArgumentValueException
    return result;
}
// Merges the clusters at idx1 and idx2 (idx1 < idx2) into a new parent cluster
// and keeps the similarity matrix and the transposed centroid matrix in sync:
// the merged cluster becomes the last column/row. Similarities involving the
// new cluster are damped exponentially by the average cluster level.
private void Update(SparseMatrix <double> simMtx, SparseMatrix <double> clustMtxTr, int numClusters, int idx1, int idx2, ArrayList <Cluster> clusters, IUnlabeledExampleCollection <SparseVector <double> > dataset, double damping)
{
    Debug.Assert(idx1 < idx2);
    // create new parent
    Cluster c1 = clusters[idx1];
    Cluster c2 = clusters[idx2];
    Cluster parent = new Cluster();
    parent.Items.AddRange(c1.Items);
    parent.Items.AddRange(c2.Items);
    // parent level = deeper child's level + 1
    parent.ClusterInfo = Math.Max((int)c1.ClusterInfo, (int)c2.ClusterInfo) + 1;
    c1.Parent = parent;
    c2.Parent = parent;
    parent.AddChild(c1);
    parent.AddChild(c2);
    // centroid of the merged cluster, trimmed to its strongest components
    SparseVector <double> centroid = ModelUtils.ComputeCentroid(parent.Items, dataset, CentroidType.NrmL2);
    centroid = Trim(centroid, 1000, 0.8);
    // remove clusters (idx2 first so idx1 remains valid)
    clusters.RemoveAt(idx2);
    clusters.RemoveAt(idx1);
    // add new parent
    clusters.Add(parent);
    // remove rows at idx1 and idx2
    simMtx.PurgeRowAt(idx2);
    simMtx.PurgeRowAt(idx1);
    // remove cols at idx1 and idx2
    simMtx.PurgeColAt(idx2);
    simMtx.PurgeColAt(idx1);
    clustMtxTr.PurgeColAt(idx2);
    clustMtxTr.PurgeColAt(idx1);
    // update matrices: append the merged centroid as column 'numClusters'
    numClusters -= 2;
    foreach (IdxDat <double> item in centroid)
    {
        if (clustMtxTr[item.Idx] == null)
        {
            clustMtxTr[item.Idx] = new SparseVector <double>(new IdxDat <double>[] { new IdxDat <double>(numClusters, item.Dat) });
        }
        else
        {
            clustMtxTr[item.Idx].InnerIdx.Add(numClusters);
            clustMtxTr[item.Idx].InnerDat.Add(item.Dat);
        }
    }
    // similarities of the merged centroid to all remaining clusters (incl. itself)
    double[] simVec = ModelUtils.GetDotProductSimilarity(clustMtxTr, numClusters + 1, centroid);
    for (int i = 0; i < simVec.Length; i++)
    {
        // damp by the average level of the two clusters being compared
        simVec[i] *= Math.Pow(damping, (double)((int)parent.ClusterInfo + (int)clusters[i].ClusterInfo) / 2.0);
    }
    SparseMatrix <double> col = new SparseMatrix <double>();
    col[0] = new SparseVector <double>(simVec);
    simMtx.AppendCols(col.GetTransposedCopy(), numClusters);
}
// Computes the dot product of vec with every example in the dataset.
// Returns a dense array aligned with the dataset's item order.
public static double[] GetDotProductSimilarity(IUnlabeledExampleCollection<SparseVector<double>> dataset, SparseVector<double>.ReadOnly vec)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(vec == null ? new ArgumentNullException("vec") : null);
    double[] sims = new double[dataset.Count];
    GetDotProductSimilarity(vec, sims, GetTransposedMatrix(dataset), /*startIdx=*/ 0);
    return sims;
}
// Recursively adds every cluster's items (resolved against the dataset) to the
// classification dataset, labeled with the owning cluster.
private void FillClassificationDataset<ExT>(IEnumerable<Cluster> clusters, IUnlabeledExampleCollection<ExT> dataset, LabeledDataset<Cluster, ExT> classificationDataset)
{
    foreach (Cluster cluster in clusters)
    {
        foreach (int itemIdx in cluster.Items)
        {
            // item indices must address examples inside the dataset
            Utils.ThrowException(itemIdx < 0 || itemIdx >= dataset.Count ? new ArgumentValueException("clusters") : null);
            classificationDataset.Add(cluster, dataset[itemIdx]);
        }
        FillClassificationDataset(cluster.Children, dataset, classificationDataset);
    }
}
// Trains the model by computing centroids for every cluster reachable from the
// hierarchy's roots; the dataset reference is held only for the duration of training.
public void Train(IUnlabeledExampleCollection<SparseVector<double>> dataset, ClusteringResult hierarchy)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count == 0 ? new ArgumentValueException("dataset") : null);
    Utils.ThrowException(hierarchy == null ? new ArgumentNullException("hierarchy") : null);
    Utils.ThrowException(hierarchy.Roots.Count == 0 ? new ArgumentValueException("hierarchy") : null);
    mModel = new Dictionary<Cluster, ClusterInfo>();
    mDataset = dataset;
    foreach (Cluster root in hierarchy.Roots)
    {
        ComputeCentroid(root);
    }
    mDataset = null; // release the reference once training is done
}
// Measures cluster quality as the average dot-product similarity between the
// cluster's members and its L2-normalized centroid; outputs the centroid as well.
private double GetClusterQuality(IUnlabeledExampleCollection<SparseVector<double>> dataset, out SparseVector<double> centroid)
{
    // compute centroid
    centroid = ModelUtils.ComputeCentroid(dataset, CentroidType.NrmL2);
    // compute intra-cluster similarities and average them
    double[] sims = ModelUtils.GetDotProductSimilarity(dataset, centroid);
    double sum = 0;
    foreach (double sim in sims)
    {
        sum += sim;
    }
    return sum / (double)sims.Length;
}
// Computes the dot product of vec with every example in the dataset and returns
// a sparse vector containing only similarities strictly above thresh.
public static SparseVector<double> GetDotProductSimilarity(IUnlabeledExampleCollection<SparseVector<double>> dataset, SparseVector<double>.ReadOnly vec, double thresh)
{
    Utils.ThrowException(thresh < 0 ? new ArgumentOutOfRangeException("thresh") : null);
    double[] sims = GetDotProductSimilarity(dataset, vec); // throws ArgumentNullException
    SparseVector<double> result = new SparseVector<double>();
    for (int idx = 0; idx < sims.Length; idx++)
    {
        if (sims[idx] > thresh)
        {
            result.InnerIdx.Add(idx);
            result.InnerDat.Add(sims[idx]);
        }
    }
    return result;
}
// Border case for k = 1: puts the whole dataset into a single root cluster and
// initializes mCentroids with that cluster's centroid.
private ClusteringResult CreateSingleCluster(IUnlabeledExampleCollection<SparseVector<double>> dataset)
{
    ClusteringResult clustering = new ClusteringResult();
    Cluster root = new Cluster();
    for (int itemIdx = 0; itemIdx < dataset.Count; itemIdx++)
    {
        root.Items.Add(itemIdx);
    }
    clustering.AddRoot(root);
    CentroidData centroidData = new CentroidData();
    centroidData.Items.AddRange(root.Items);
    centroidData.Update(dataset);
    centroidData.UpdateCentroidLen();
    mCentroids = new ArrayList<CentroidData>();
    mCentroids.Add(centroidData);
    return clustering;
}
// Incrementally updates the centroid: adds vectors newly assigned to the cluster,
// subtracts vectors that are no longer assigned, then commits the new item set
// and clears the staging set for the next round.
public void Update(IUnlabeledExampleCollection<SparseVector<double>> dataset)
{
    Set<int> added = Set<int>.Difference(mItems, mCurrentItems);
    Set<int> removed = Set<int>.Difference(mCurrentItems, mItems);
    foreach (int idx in added)
    {
        AddToSum(dataset[idx]);
    }
    foreach (int idx in removed)
    {
        AddToDiff(-1, dataset[idx]);
    }
    mCurrentItems = mItems;
    mItems = new Set<int>();
    Update(/*positiveValuesOnly=*/ false);
}
// Bisecting k-means: starts with all items in one root cluster and recursively
// splits (into 2) any cluster whose quality falls below mMinQuality. Each
// cluster's ClusterInfo is set to the pair (centroid, quality).
public ClusteringResult Cluster(IUnlabeledExampleCollection <SparseVector <double> > dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count < 2 ? new ArgumentValueException("dataset") : null);
    ClusteringResult clusteringResult = new ClusteringResult();
    Queue <Cluster> queue = new Queue <Cluster>();
    // create root
    Cluster root = new Cluster();
    for (int i = 0; i < dataset.Count; i++)
    {
        // reject sparse vectors with no components
        Utils.ThrowException(dataset[i].Count == 0 ? new ArgumentValueException("dataset") : null);
        root.Items.Add(i);
    }
    clusteringResult.AddRoot(root);
    // add root to queue
    queue.Enqueue(root);
    while (queue.Count > 0)
    {
        // get next cluster
        Cluster cluster = queue.Dequeue();
        // compute cluster quality
        UnlabeledDataset <SparseVector <double> > localDataset = GetDatasetSubset(cluster.Items, dataset);
        SparseVector <double> centroid;
        double quality = GetClusterQuality(localDataset, out centroid);
        cluster.ClusterInfo = new Pair <SparseVector <double>, double>(centroid, quality);
        if (quality < mMinQuality)
        {
            // split cluster, add children to queue
            ClusteringResult localResult = mKMeansClustering.Cluster(localDataset);
            for (int i = 0; i < 2; i++)
            {
                cluster.AddChild(localResult.Roots[i]);
                localResult.Roots[i].Parent = cluster;
                queue.Enqueue(localResult.Roots[i]);
            }
        }
    }
    return(clusteringResult);
}
// Runs the k-means main loop: repeatedly assigns items to centroids and then
// recomputes the centroids, until the quality improvement over the previous
// iteration drops to mEps or below (at least two iterations are performed).
// Outputs the quality of the final assignment in clustQual.
// Fix: removed the local 'double[][] dotProd' array, which was allocated but
// never read or written anywhere in the method.
internal void kMeansMainLoop(IUnlabeledExampleCollection<SparseVector<double>> dataset, ArrayList<CentroidData> centroids, out double clustQual)
{
    SparseMatrix<double> dataMtx = ModelUtils.GetTransposedMatrix(dataset);
    int iter = 0;
    double bestClustQual = 0;
    while (true)
    {
        iter++;
        mLogger.Trace("Cluster", "Iteration {0} ...", iter);
        // assign items to clusters
        Assign(centroids, dataMtx, dataset.Count, /*offs=*/ 0, out clustQual);
        mLogger.Trace("Cluster", "Quality: {0:0.0000}", clustQual);
        // update centroids
        Update(dataset, centroids);
        // check if done: stop once the gain over the previous pass is <= mEps
        if (iter > 1 && clustQual - bestClustQual <= mEps) { break; }
        bestClustQual = clustQual;
    }
}
// Incremental (online) clustering step. On the first call, initializes the
// clusters from the batch via k-means. On later calls, assigns the new batch to
// the existing centroids, retires the numOutdated oldest instances, and then
// adapts k: splits the lowest-quality cluster while overall quality is below
// mQualThresh, or (when instances were retired) joins clusters while quality
// stays above the threshold.
public ClusteringResult Cluster(int numOutdated, IUnlabeledExampleCollection<SparseVector<double>> batch)
{
    Utils.ThrowException(batch == null ? new ArgumentNullException("batch") : null);
    Utils.ThrowException(numOutdated < 0 ? new ArgumentOutOfRangeException("numOutdated") : null);
    if (mDataset == null)
    {
        // initialize
        mLogger.Info("Cluster", "Initializing ...");
        Utils.ThrowException(numOutdated > 0 ? new ArgumentOutOfRangeException("numOutdated") : null);
        //Utils.ThrowException(batch.Count == 0 ? new ArgumentValueException("batch") : null);
        if (batch.Count == 0) { return new ClusteringResult(); }
        kMeans(batch, Math.Min(mK, batch.Count));
        mDataset = new UnlabeledDataset<SparseVector<double>>(batch);
        foreach (CentroidData centroid in mCentroids)
        {
            centroid.Tag = mTopicId++; // assign a fresh topic identifier
        }
        //OutputState();
    }
    else
    {
        // update clusters
        Utils.ThrowException(numOutdated > mDataset.Count ? new ArgumentOutOfRangeException("numOutdated") : null);
        if (numOutdated == 0 && batch.Count == 0) { return GetClusteringResult(); }
        mLogger.Info("Cluster", "Updating clusters ...");
        // assign new instances
        double dummy;
        Assign(mCentroids, ModelUtils.GetTransposedMatrix(batch), batch.Count, /*offs=*/mDataset.Count, out dummy);
        mDataset.AddRange(batch);
        // remove outdated instances: keep only items at index >= numOutdated
        foreach (CentroidData centroid in mCentroids)
        {
            foreach (int item in centroid.CurrentItems)
            {
                if (item >= numOutdated) { centroid.Items.Add(item); }
            }
            centroid.Update(mDataset);
            centroid.UpdateCentroidLen();
        }
        mDataset.RemoveRange(0, numOutdated);
        // drop empty centroids and shift remaining item indices down by numOutdated
        ArrayList<CentroidData> centroidsNew = new ArrayList<CentroidData>(mCentroids.Count);
        foreach (CentroidData centroid in mCentroids)
        {
            if (centroid.CurrentItems.Count > 0)
            {
                centroidsNew.Add(centroid);
                Set<int> tmp = new Set<int>();
                foreach (int idx in centroid.CurrentItems) { tmp.Add(idx - numOutdated); }
                centroid.CurrentItems.Inner.SetItems(tmp);
            }
        }
        if (centroidsNew.Count == 0) // reset
        {
            mCentroids = null;
            mDataset = null;
            return new ClusteringResult();
        }
        mCentroids = centroidsNew;
        // execute main loop
        kMeansMainLoop(mDataset, mCentroids);
        //OutputState();
    }
    // adjust k
    double minQual; // *** not used at the moment
    int minQualIdx;
    double qual = GetClustQual(out minQual, out minQualIdx);
    if (qual < mQualThresh)
    {
        while (qual < mQualThresh) // split cluster at minQualIdx
        {
            mLogger.Info("Cluster", "Increasing k to {0} ...", mCentroids.Count + 1);
            mCentroids.Add(mCentroids[minQualIdx].Clone());
            mCentroids.Last.Tag = mTopicId++;
            kMeansMainLoop(mDataset, mCentroids);
            if (mCentroids.Last.CurrentItems.Count > mCentroids[minQualIdx].CurrentItems.Count)
            {
                // swap topic identifiers so the larger cluster keeps the old id
                object tmp = mCentroids.Last.Tag;
                mCentroids.Last.Tag = mCentroids[minQualIdx].Tag;
                mCentroids[minQualIdx].Tag = tmp;
            }
            qual = GetClustQual(out minQual, out minQualIdx);
            //OutputState();
        }
    }
    else if (numOutdated > 0)
    {
        while (qual > mQualThresh && mCentroids.Count > 1) // join clusters
        {
            mLogger.Info("Cluster", "Decreasing k to {0} ...", mCentroids.Count - 1);
            ArrayList<CentroidData> centroidsCopy = mCentroids.DeepClone(); // kept for rollback
            if (mCentroids.Count == 2) // create single cluster
            {
                object topicId = mCentroids[0].CurrentItems.Count > mCentroids[1].CurrentItems.Count ? mCentroids[0].Tag : mCentroids[1].Tag;
                mCentroids = new ArrayList<CentroidData>();
                mCentroids.Add(new CentroidData());
                for (int i = 0; i < mDataset.Count; i++) { mCentroids.Last.Items.Add(i); }
                mCentroids.Last.Tag = topicId;
                mCentroids.Last.Update(mDataset);
                mCentroids.Last.UpdateCentroidLen();
            }
            else
            {
                int idx1, idx2;
                GetMostSimilarClusters(out idx1, out idx2);
                CentroidData c1 = mCentroids[idx1];
                CentroidData c2 = mCentroids[idx2];
                // the larger cluster's topic id survives the merge
                object topicId = c1.CurrentItems.Count > c2.CurrentItems.Count ? c1.Tag : c2.Tag;
                mCentroids.RemoveAt(idx2);
                c1.Items.AddRange(c1.CurrentItems);
                c1.Items.AddRange(c2.CurrentItems);
                c1.Tag = topicId;
                c1.Update(mDataset);
                c1.UpdateCentroidLen();
                kMeansMainLoop(mDataset, mCentroids);
            }
            qual = GetClustQual();
            if (qual >= mQualThresh)
            {
                mLogger.Info("Cluster", "Accepted solution at k = {0}.", mCentroids.Count);
            }
            else
            {
                mCentroids = centroidsCopy; // roll back the join
            }
            //OutputState();
        }
    }
    OutputState();
    return GetClusteringResult();
}
// k-means over sparse vectors using a pluggable similarity measure (mSimilarity)
// and configurable centroid type (mCentroidType). Runs mTrials independent
// trials with different random seed items and returns the best clustering found.
public ClusteringResult Cluster(IUnlabeledExampleCollection <SparseVector <double> > dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count < mK ? new ArgumentValueException("dataset") : null);
    ClusteringResult clustering = null;
    ClusteringResult bestClustering = null;
    double globalBestClustQual = 0;
    for (int trial = 1; trial <= mTrials; trial++)
    {
        mLogger.Trace("Cluster", "Clustering trial {0} of {1} ...", trial, mTrials);
        ArrayList <SparseVector <double> > centroids = null;
        clustering = new ClusteringResult();
        for (int i = 0; i < mK; i++) { clustering.AddRoot(new Cluster()); }
        // select seed items: of 3 random draws, keep the draw with the lowest
        // average pairwise similarity (i.e. the most spread-out seeds)
        double minSim = double.MaxValue;
        ArrayList <int> tmp = new ArrayList <int>(dataset.Count);
        for (int i = 0; i < dataset.Count; i++) { tmp.Add(i); }
        for (int k = 0; k < 3; k++)
        {
            ArrayList <SparseVector <double> > seeds = new ArrayList <SparseVector <double> >(mK);
            tmp.Shuffle(mRnd);
            for (int i = 0; i < mK; i++)
            {
                seeds.Add(ModelUtils.ComputeCentroid(new SparseVector <double>[] { dataset[tmp[i]] }, mCentroidType));
            }
            // assess quality of seed items
            double simAvg = 0;
            foreach (SparseVector <double> seed1 in seeds)
            {
                foreach (SparseVector <double> seed2 in seeds)
                {
                    if (seed1 != seed2) { simAvg += mSimilarity.GetSimilarity(seed1, seed2); }
                }
            }
            simAvg /= (double)(mK * mK - mK);
            if (simAvg < minSim)
            {
                minSim = simAvg;
                centroids = seeds;
            }
        }
        // main loop
        int iter = 0;
        double bestClustQual = 0;
        double clustQual;
        while (true)
        {
            iter++;
            mLogger.Trace("Cluster", "Iteration {0} ...", iter);
            clustQual = 0;
            // assign items to clusters
            foreach (Cluster cluster in clustering.Roots) { cluster.Items.Clear(); }
            for (int i = 0; i < dataset.Count; i++)
            {
                SparseVector <double> example = dataset[i];
                double maxSim = double.MinValue;
                ArrayList <int> candidates = new ArrayList <int>();
                for (int j = 0; j < mK; j++)
                {
                    SparseVector <double> centroid = centroids[j];
                    double sim = mSimilarity.GetSimilarity(example, centroid);
                    if (sim > maxSim)
                    {
                        maxSim = sim;
                        candidates.Clear();
                        candidates.Add(j);
                    }
                    else if (sim == maxSim)
                    {
                        candidates.Add(j);
                    }
                }
                // ties between equally similar centroids are broken at random
                if (candidates.Count > 1) { candidates.Shuffle(mRnd); }
                clustering.Roots[candidates[0]].Items.Add(i);
                clustQual += maxSim;
            }
            clustQual /= (double)dataset.Count;
            mLogger.Trace("Cluster", "Quality: {0:0.0000}", clustQual);
            // check if done
            if (iter > 1 && clustQual - bestClustQual <= mEps) { break; }
            bestClustQual = clustQual;
            // compute new centroids
            for (int i = 0; i < mK; i++)
            {
                centroids[i] = clustering.Roots[i].ComputeCentroid(dataset, mCentroidType);
            }
        }
        if (trial == 1 || clustQual > globalBestClustQual)
        {
            globalBestClustQual = clustQual;
            bestClustering = clustering;
        }
    }
    return(bestClustering);
}
// Computes the centroid of the dataset vectors at the given indices. Component
// values are summed across the selected vectors and then post-processed by
// 'type': Sum keeps the raw sums, Avg divides by the number of vectors, NrmL2
// scales the sum vector to unit Euclidean length (or yields an empty vector if
// the sum has zero length). An empty index list yields an empty vector.
public static SparseVector <double> ComputeCentroid(IEnumerable <int> vecIdxList, IUnlabeledExampleCollection <SparseVector <double> > dataset, CentroidType type)
{
    Utils.ThrowException(vecIdxList == null ? new ArgumentNullException("vecIdxList") : null);
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    // accumulate per-component sums over the selected vectors
    Dictionary <int, double> tmp = new Dictionary <int, double>();
    int vecCount = 0;
    foreach (int vecIdx in vecIdxList)
    {
        Utils.ThrowException((vecIdx < 0 || vecIdx >= dataset.Count) ? new ArgumentValueException("vecIdxList") : null);
        SparseVector <double> vec = dataset[vecIdx];
        foreach (IdxDat <double> item in vec)
        {
            if (tmp.ContainsKey(item.Idx))
            {
                tmp[item.Idx] += item.Dat;
            }
            else
            {
                tmp.Add(item.Idx, item.Dat);
            }
        }
        vecCount++;
    }
    //Utils.ThrowException(vecCount == 0 ? new ArgumentValueException("vecIdxList") : null);
    if (vecCount == 0) { return(new SparseVector <double>()); } // empty selection -> empty centroid
    SparseVector <double> centroid = new SparseVector <double>();
    switch (type)
    {
        case CentroidType.Sum:
            foreach (KeyValuePair <int, double> item in tmp)
            {
                centroid.InnerIdx.Add(item.Key);
                centroid.InnerDat.Add(item.Value);
            }
            break;
        case CentroidType.Avg:
            foreach (KeyValuePair <int, double> item in tmp)
            {
                centroid.InnerIdx.Add(item.Key);
                centroid.InnerDat.Add(item.Value / (double)vecCount);
            }
            break;
        case CentroidType.NrmL2:
            double vecLen = 0;
            foreach (KeyValuePair <int, double> item in tmp)
            {
                vecLen += item.Value * item.Value;
            }
            //Utils.ThrowException(vecLen == 0 ? new InvalidOperationException() : null);
            vecLen = Math.Sqrt(vecLen);
            if (vecLen > 0) // a zero-length sum produces an empty centroid
            {
                foreach (KeyValuePair <int, double> item in tmp)
                {
                    centroid.InnerIdx.Add(item.Key);
                    centroid.InnerDat.Add(item.Value / vecLen);
                }
            }
            break;
    }
    // dictionary iteration order is unspecified; restore index order
    centroid.Sort();
    return(centroid);
}
// Convenience overload: computes this object's centroid over its item indices.
// Throws ArgumentNullException, ArgumentValueException, InvalidOperationException.
public SparseVector<double> ComputeCentroid(IUnlabeledExampleCollection<SparseVector<double>> dataset, CentroidType type)
{
    return ModelUtils.ComputeCentroid(mItems, dataset, type);
}
// Overload that discards the resulting cluster quality.
private void kMeansMainLoop(IUnlabeledExampleCollection <SparseVector <double> > dataset, ArrayList <CentroidData> centroids)
{
    double clustQual;
    kMeansMainLoop(dataset, centroids, out clustQual);
}
// Core k-means: runs mTrials random restarts. Each trial seeds k centroids with
// the most spread-out of three random item draws, then iterates the main loop.
// The centroids of the best-quality trial are stored in mCentroids.
protected ClusteringResult kMeans(IUnlabeledExampleCollection <SparseVector <double> > dataset, int k)
{
    if (k == 1) { return(CreateSingleCluster(dataset)); } // border case
    double globalBestClustQual = 0;
    for (int trial = 1; trial <= mTrials; trial++)
    {
        mLogger.Trace("Cluster", "Clustering trial {0} of {1} ...", trial, mTrials);
        ArrayList <CentroidData> centroids = new ArrayList <CentroidData>(k);
        ArrayList <int> bestSeeds = null;
        for (int i = 0; i < k; i++) { centroids.Add(new CentroidData()); }
        // select seed items: of 3 random draws, keep the draw with the lowest
        // average pairwise dot-product similarity (most spread-out seeds)
        double minSim = double.MaxValue;
        ArrayList <int> tmp = new ArrayList <int>(dataset.Count);
        for (int i = 0; i < dataset.Count; i++) { tmp.Add(i); }
        for (int i = 0; i < 3; i++)
        {
            ArrayList <SparseVector <double> > seeds = new ArrayList <SparseVector <double> >(k);
            tmp.Shuffle(mRnd);
            for (int j = 0; j < k; j++) { seeds.Add(dataset[tmp[j]]); }
            // assess quality of seed items
            double simAvg = 0;
            foreach (SparseVector <double> seed1 in seeds)
            {
                foreach (SparseVector <double> seed2 in seeds)
                {
                    if (seed1 != seed2) { simAvg += DotProductSimilarity.Instance.GetSimilarity(seed1, seed2); }
                }
            }
            simAvg /= (double)(k * k - k);
            if (simAvg < minSim)
            {
                minSim = simAvg;
                bestSeeds = new ArrayList <int>(k);
                for (int j = 0; j < k; j++) { bestSeeds.Add(tmp[j]); }
            }
        }
        // initialize the centroids from the selected seeds
        for (int i = 0; i < k; i++)
        {
            centroids[i].Items.Add(bestSeeds[i]);
            centroids[i].Update(dataset);
            centroids[i].UpdateCentroidLen();
        }
        // execute main loop
        double clustQual;
        kMeansMainLoop(dataset, centroids, out clustQual);
        if (trial == 1 || clustQual > globalBestClustQual)
        {
            globalBestClustQual = clustQual;
            mCentroids = centroids;
        }
    }
    return(GetClusteringResult());
}
// Clusters a fresh batch with no outdated instances.
// Throws ArgumentNullException, ArgumentValueException.
public override ClusteringResult Cluster(IUnlabeledExampleCollection<SparseVector<double>> batch)
{
    return Cluster(/*numOutdated=*/0, batch);
}
// Copies the examples at the given indices into a new dataset.
private UnlabeledDataset <SparseVector <double> > GetDatasetSubset(IEnumerable <int> items, IUnlabeledExampleCollection <SparseVector <double> > dataset)
{
    UnlabeledDataset <SparseVector <double> > subset = new UnlabeledDataset <SparseVector <double> >();
    foreach (int itemIdx in items)
    {
        subset.Add(dataset[itemIdx]);
    }
    return subset;
}
// Convenience overload: computes this object's centroid over its item indices.
// Throws ArgumentValueException.
public SparseVector <double> ComputeCentroid(IUnlabeledExampleCollection <SparseVector <double> > dataset, CentroidType type)
{
    return ModelUtils.ComputeCentroid(mItems, dataset, type);
}
// Default similarity matrix: zero threshold, upper-triangular form.
// Throws ArgumentNullException.
public static SparseMatrix <double> GetDotProductSimilarity(IUnlabeledExampleCollection <SparseVector <double> > dataset)
{
    return GetDotProductSimilarity(dataset, /*thresh=*/ 0, /*fullMatrix=*/ false);
}
// Incremental (online) clustering step. On the first call, initializes the
// clusters from the batch via k-means. On later calls, assigns the new batch to
// the existing centroids, retires the numOutdated oldest instances, and then
// adapts k: splits the lowest-quality cluster while overall quality is below
// mQualThresh, or (when instances were retired) joins clusters while quality
// stays above the threshold.
public ClusteringResult Cluster(int numOutdated, IUnlabeledExampleCollection <SparseVector <double> > batch)
{
    Utils.ThrowException(batch == null ? new ArgumentNullException("batch") : null);
    Utils.ThrowException(numOutdated < 0 ? new ArgumentOutOfRangeException("numOutdated") : null);
    if (mDataset == null)
    {
        // initialize
        mLogger.Trace("Cluster", "Initializing ...");
        Utils.ThrowException(numOutdated > 0 ? new ArgumentOutOfRangeException("numOutdated") : null);
        //Utils.ThrowException(batch.Count == 0 ? new ArgumentValueException("batch") : null);
        if (batch.Count == 0) { return(new ClusteringResult()); }
        kMeans(batch, Math.Min(mK, batch.Count));
        mDataset = new UnlabeledDataset <SparseVector <double> >(batch);
        foreach (CentroidData centroid in mCentroids)
        {
            centroid.Tag = mTopicId++; // assign a fresh topic identifier
        }
        //OutputState();
    }
    else
    {
        // update clusters
        Utils.ThrowException(numOutdated > mDataset.Count ? new ArgumentOutOfRangeException("numOutdated") : null);
        if (numOutdated == 0 && batch.Count == 0) { return(GetClusteringResult()); }
        mLogger.Trace("Cluster", "Updating clusters ...");
        // assign new instances
        double dummy;
        Assign(mCentroids, ModelUtils.GetTransposedMatrix(batch), batch.Count, /*offs=*/ mDataset.Count, out dummy);
        mDataset.AddRange(batch);
        // remove outdated instances: keep only items at index >= numOutdated
        foreach (CentroidData centroid in mCentroids)
        {
            foreach (int item in centroid.CurrentItems)
            {
                if (item >= numOutdated) { centroid.Items.Add(item); }
            }
            centroid.Update(mDataset);
            centroid.UpdateCentroidLen();
        }
        mDataset.RemoveRange(0, numOutdated);
        // drop empty centroids and shift remaining item indices down by numOutdated
        ArrayList <CentroidData> centroidsNew = new ArrayList <CentroidData>(mCentroids.Count);
        foreach (CentroidData centroid in mCentroids)
        {
            if (centroid.CurrentItems.Count > 0)
            {
                centroidsNew.Add(centroid);
                Set <int> tmp = new Set <int>();
                foreach (int idx in centroid.CurrentItems) { tmp.Add(idx - numOutdated); }
                centroid.CurrentItems.Inner.SetItems(tmp);
            }
        }
        if (centroidsNew.Count == 0) // reset
        {
            mCentroids = null;
            mDataset = null;
            return(new ClusteringResult());
        }
        mCentroids = centroidsNew;
        // execute main loop
        kMeansMainLoop(mDataset, mCentroids);
        //OutputState();
    }
    // adjust k
    double minQual; // *** not used at the moment
    int minQualIdx;
    double qual = GetClustQual(out minQual, out minQualIdx);
    if (qual < mQualThresh)
    {
        while (qual < mQualThresh) // split cluster at minQualIdx
        {
            mLogger.Trace("Cluster", "Increasing k to {0} ...", mCentroids.Count + 1);
            mCentroids.Add(mCentroids[minQualIdx].Clone());
            mCentroids.Last.Tag = mTopicId++;
            kMeansMainLoop(mDataset, mCentroids);
            if (mCentroids.Last.CurrentItems.Count > mCentroids[minQualIdx].CurrentItems.Count)
            {
                // swap topic identifiers so the larger cluster keeps the old id
                object tmp = mCentroids.Last.Tag;
                mCentroids.Last.Tag = mCentroids[minQualIdx].Tag;
                mCentroids[minQualIdx].Tag = tmp;
            }
            qual = GetClustQual(out minQual, out minQualIdx);
            //OutputState();
        }
    }
    else if (numOutdated > 0)
    {
        while (qual > mQualThresh && mCentroids.Count > 1) // join clusters
        {
            mLogger.Trace("Cluster", "Decreasing k to {0} ...", mCentroids.Count - 1);
            ArrayList <CentroidData> centroidsCopy = mCentroids.DeepClone(); // kept for rollback
            if (mCentroids.Count == 2) // create single cluster
            {
                object topicId = mCentroids[0].CurrentItems.Count > mCentroids[1].CurrentItems.Count ? mCentroids[0].Tag : mCentroids[1].Tag;
                mCentroids = new ArrayList <CentroidData>();
                mCentroids.Add(new CentroidData());
                for (int i = 0; i < mDataset.Count; i++) { mCentroids.Last.Items.Add(i); }
                mCentroids.Last.Tag = topicId;
                mCentroids.Last.Update(mDataset);
                mCentroids.Last.UpdateCentroidLen();
            }
            else
            {
                int idx1, idx2;
                GetMostSimilarClusters(out idx1, out idx2);
                CentroidData c1 = mCentroids[idx1];
                CentroidData c2 = mCentroids[idx2];
                // the larger cluster's topic id survives the merge
                object topicId = c1.CurrentItems.Count > c2.CurrentItems.Count ? c1.Tag : c2.Tag;
                mCentroids.RemoveAt(idx2);
                c1.Items.AddRange(c1.CurrentItems);
                c1.Items.AddRange(c2.CurrentItems);
                c1.Tag = topicId;
                c1.Update(mDataset);
                c1.UpdateCentroidLen();
                kMeansMainLoop(mDataset, mCentroids);
            }
            qual = GetClustQual();
            if (qual >= mQualThresh)
            {
                mLogger.Trace("Cluster", "Accepted solution at k = {0}.", mCentroids.Count);
            }
            else
            {
                mCentroids = centroidsCopy; // roll back the join
            }
            //OutputState();
        }
    }
    OutputState();
    return(GetClusteringResult());
}
// k-means over sparse vectors that additionally tracks a medoid (the item most
// similar to its centroid) for each cluster. Runs mTrials random restarts and
// keeps the centroids/medoids of the best-quality trial.
// NOTE(review): dotProd cells are only overwritten when the new similarity is
// positive, so values from a previous iteration can survive at positions whose
// current similarity is 0 — verify this is intentional.
public ClusteringResult Cluster(IUnlabeledExampleCollection <SparseVector <double> > dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count < mK ? new ArgumentValueException("dataset") : null);
    mDataset = new UnlabeledDataset <SparseVector <double> >(dataset);
    ClusteringResult clustering = null;
    double globalBestClustQual = 0;
    for (int trial = 1; trial <= mTrials; trial++)
    {
        mLogger.Info("Cluster", "Clustering trial {0} of {1} ...", trial, mTrials);
        ArrayList <CentroidData> centroids = new ArrayList <CentroidData>(mK);
        ArrayList <int> bestSeeds = null;
        for (int i = 0; i < mK; i++) { centroids.Add(new CentroidData()); }
        // select seed items: of 3 random draws, keep the draw with the lowest
        // average pairwise similarity (most spread-out seeds)
        double minSim = double.MaxValue;
        ArrayList <int> tmp = new ArrayList <int>(mDataset.Count);
        for (int i = 0; i < mDataset.Count; i++) { tmp.Add(i); }
        for (int k = 0; k < 3; k++)
        {
            ArrayList <SparseVector <double> > seeds = new ArrayList <SparseVector <double> >(mK);
            tmp.Shuffle(mRnd);
            for (int i = 0; i < mK; i++) { seeds.Add(mDataset[tmp[i]]); }
            // assess quality of seed items
            double simAvg = 0;
            foreach (SparseVector <double> seed1 in seeds)
            {
                foreach (SparseVector <double> seed2 in seeds)
                {
                    if (seed1 != seed2) { simAvg += DotProductSimilarity.Instance.GetSimilarity(seed1, seed2); }
                }
            }
            simAvg /= (double)(mK * mK - mK);
            //Console.WriteLine(simAvg);
            if (simAvg < minSim)
            {
                minSim = simAvg;
                bestSeeds = new ArrayList <int>(mK);
                for (int i = 0; i < mK; i++) { bestSeeds.Add(tmp[i]); }
            }
        }
        // initialize centroids and medoid trackers from the selected seeds
        ArrayList <KeyDat <double, int> > medoids = new ArrayList <KeyDat <double, int> >(mK);
        for (int i = 0; i < mK; i++)
        {
            centroids[i].Items.Add(bestSeeds[i]);
            centroids[i].Update(mDataset);
            centroids[i].UpdateCentroidLen();
            medoids.Add(new KeyDat <double, int>(-1, bestSeeds[i]));
        }
        double[,] dotProd = new double[mDataset.Count, mK];
        SparseMatrix <double> dsMat = ModelUtils.GetTransposedMatrix(mDataset);
        // main loop
        int iter = 0;
        double bestClustQual = 0;
        double clustQual;
        while (true)
        {
            iter++;
            mLogger.Info("Cluster", "Iteration {0} ...", iter);
            clustQual = 0;
            // assign items to clusters
            //StopWatch stopWatch = new StopWatch();
            int j = 0;
            foreach (CentroidData cen in centroids)
            {
                SparseVector <double> cenVec = cen.GetSparseVector();
                double[] dotProdSimVec = ModelUtils.GetDotProductSimilarity(dsMat, mDataset.Count, cenVec);
                for (int i = 0; i < dotProdSimVec.Length; i++)
                {
                    if (dotProdSimVec[i] > 0) { dotProd[i, j] = dotProdSimVec[i]; }
                }
                j++;
            }
            for (int dsInstIdx = 0; dsInstIdx < mDataset.Count; dsInstIdx++)
            {
                double maxSim = double.MinValue;
                ArrayList <int> candidates = new ArrayList <int>();
                for (int cenIdx = 0; cenIdx < mK; cenIdx++)
                {
                    double sim = dotProd[dsInstIdx, cenIdx];
                    if (sim > maxSim)
                    {
                        maxSim = sim;
                        candidates.Clear();
                        candidates.Add(cenIdx);
                    }
                    else if (sim == maxSim)
                    {
                        candidates.Add(cenIdx);
                    }
                }
                // ties between equally similar centroids are broken at random
                if (candidates.Count > 1) { candidates.Shuffle(mRnd); }
                if (candidates.Count > 0) // *** is this always true?
                {
                    centroids[candidates[0]].Items.Add(dsInstIdx);
                    clustQual += maxSim;
                    // track the item most similar to its centroid as the medoid
                    if (medoids[candidates[0]].Key < maxSim)
                    {
                        medoids[candidates[0]] = new KeyDat <double, int>(maxSim, dsInstIdx);
                    }
                }
            }
            //Console.WriteLine(stopWatch.TotalMilliseconds);
            clustQual /= (double)mDataset.Count;
            mLogger.Info("Cluster", "Quality: {0:0.0000}", clustQual);
            // compute new centroids
            for (int i = 0; i < mK; i++)
            {
                centroids[i].Update(mDataset);
                centroids[i].UpdateCentroidLen();
            }
            // check if done
            if (iter > 1 && clustQual - bestClustQual <= mEps) { break; }
            bestClustQual = clustQual;
            // reset medoid similarities for the next iteration
            for (int i = 0; i < medoids.Count; i++)
            {
                medoids[i] = new KeyDat <double, int>(-1, medoids[i].Dat);
            }
        }
        if (trial == 1 || clustQual > globalBestClustQual)
        {
            globalBestClustQual = clustQual;
            mCentroids = centroids;
            mMedoids = medoids;
            // save the result
            clustering = new ClusteringResult();
            for (int i = 0; i < mK; i++)
            {
                clustering.AddRoot(new Cluster());
                clustering.Roots.Last.Items.AddRange(centroids[i].Items);
            }
        }
    }
    return(clustering);
}
// Clusters a fresh batch with no outdated instances.
// Throws ArgumentNullException, ArgumentValueException.
public override ClusteringResult Cluster(IUnlabeledExampleCollection <SparseVector <double> > batch)
{
    return Cluster(/*numOutdated=*/ 0, batch);
}
// Runs k-means with the configured number of clusters (mK).
public virtual ClusteringResult Cluster(IUnlabeledExampleCollection <SparseVector <double> > dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count < mK ? new ArgumentValueException("dataset") : null);
    return kMeans(dataset, mK);
}
// Non-generic IClustering entry point; forwards to the typed overload after
// verifying the runtime element type. Throws ArgumentValueException.
ClusteringResult IClustering.Cluster(IUnlabeledExampleCollection dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    IUnlabeledExampleCollection <SparseVector <double> > typedDataset = dataset as IUnlabeledExampleCollection <SparseVector <double> >;
    Utils.ThrowException(typedDataset == null ? new ArgumentTypeException("dataset") : null);
    return Cluster(typedDataset);
}
// Creates a semantic space layout over the given (non-null) dataset.
public SemanticSpaceLayout(IUnlabeledExampleCollection <SparseVector <double> > dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    mDataset = dataset;
}
// Non-generic IHierarchicalModel entry point; forwards to the typed overload
// after verifying the runtime element type.
// Throws ArgumentNullException, ArgumentValueException.
void IHierarchicalModel.Train(IUnlabeledExampleCollection dataset, ClusteringResult hierarchy)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(!(dataset is IUnlabeledExampleCollection <SparseVector <double> >) ? new ArgumentTypeException("dataset") : null);
    Train((IUnlabeledExampleCollection <SparseVector <double> >)dataset, hierarchy);
}
// Overload that ignores the resulting cluster quality.
private void kMeansMainLoop(IUnlabeledExampleCollection<SparseVector<double>> dataset, ArrayList<CentroidData> centroids)
{
    double ignored;
    kMeansMainLoop(dataset, centroids, out ignored);
}