/// <summary>
/// Clusters <paramref name="dataset"/> with <c>mKMeansClustering</c> and then, for as long as more than
/// one root is left, merges the pair of roots reported by <c>FindMaxSim</c> by calling <c>Update</c>.
/// </summary>
/// <param name="dataset">Examples to cluster. Must not be null and must hold at least <c>NumLeaves</c> items.</param>
/// <returns>The <c>ClusteringResult</c> obtained from the k-means step, after its roots have been merged.</returns>
/// <remarks>
/// Validation goes through <c>Utils.ThrowException</c>: an <c>ArgumentNullException</c> is passed for a null
/// dataset and an <c>ArgumentValueException</c> when the dataset has fewer than <c>NumLeaves</c> items.
/// The iteration number and the similarity matrix are written to the console on every iteration.
/// </remarks>
public ClusteringResult Cluster(IUnlabeledExampleCollection<SparseVector<double>> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count < NumLeaves ? new ArgumentValueException("dataset") : null);
    // initial (leaf-level) clusters
    ClusteringResult clusters = mKMeansClustering.Cluster(dataset);
    UnlabeledDataset<SparseVector<double>> centroids = new UnlabeledDataset<SparseVector<double>>();
    foreach (Cluster cluster in clusters.Roots)
    {
        SparseVector<double> centroid = ModelUtils.ComputeCentroid(cluster.Items, dataset, CentroidType.NrmL2);
        // Trim BEFORE adding, so that the vector stored in the centroid set is the trimmed one. Trimming
        // after Add only reassigned the local variable and had no effect on what was stored (assuming Trim
        // returns the trimmed vector rather than mutating its argument -- TODO confirm). Update() applies
        // the same Trim(…, 1000, 0.8) to merged centroids before using them.
        centroid = Trim(centroid, 1000, 0.8);
        centroids.Add(centroid);
        cluster.ClusterInfo = 1; // cluster level
    }
    // pairwise centroid similarities and the transposed centroid matrix, both maintained by Update()
    SparseMatrix<double> simMtx = ModelUtils.GetDotProductSimilarity(centroids, /*thresh=*/0, /*fullMatrix=*/false);
    SparseMatrix<double> clustMtxTr = ModelUtils.GetTransposedMatrix(centroids);
    int iter = 1;
    while (clusters.Roots.Count > 1)
    {
        Console.WriteLine("Iteration {0} ...", iter++);
        int idx1, idx2;
        FindMaxSim(simMtx, out idx1, out idx2);
        // Update() replaces the two roots at idx1/idx2 with a new parent, so Roots.Count drops by one
        Update(simMtx, clustMtxTr, clusters.Roots.Count, idx1, idx2, clusters.Roots.Inner, dataset, /*damping=*/0.9);
        Console.WriteLine(simMtx.ToString("E0.00"));
        Console.WriteLine();
    }
    return clusters;
}
/// <summary>
/// Merges the clusters at positions <paramref name="idx1"/> and <paramref name="idx2"/> into a new parent
/// cluster and brings the cluster list, the similarity matrix and the transposed centroid matrix in line
/// with the merge.
/// </summary>
/// <param name="simMtx">Cluster similarity matrix; rows/columns idx1 and idx2 are purged and a column for the parent is appended.</param>
/// <param name="clustMtxTr">Transposed centroid matrix; columns idx1 and idx2 are purged and the parent centroid is added under the new index.</param>
/// <param name="numClusters">Number of clusters before the merge.</param>
/// <param name="idx1">Position of the first cluster to merge; asserted (debug builds) to be smaller than <paramref name="idx2"/>.</param>
/// <param name="idx2">Position of the second cluster to merge.</param>
/// <param name="clusters">Current cluster list; the two merged clusters are removed and the parent is appended.</param>
/// <param name="dataset">Dataset used to compute the parent's centroid.</param>
/// <param name="damping">Base of the damping factor applied to the parent's similarities.</param>
private void Update(SparseMatrix<double> simMtx, SparseMatrix<double> clustMtxTr, int numClusters, int idx1, int idx2, ArrayList<Cluster> clusters, IUnlabeledExampleCollection<SparseVector<double>> dataset, double damping)
{
    Debug.Assert(idx1 < idx2);

    // build the parent out of the two children
    Cluster first = clusters[idx1];
    Cluster second = clusters[idx2];
    Cluster merged = new Cluster();
    merged.Items.AddRange(first.Items);
    merged.Items.AddRange(second.Items);
    int firstLevel = (int)first.ClusterInfo;
    int secondLevel = (int)second.ClusterInfo;
    merged.ClusterInfo = Math.Max(firstLevel, secondLevel) + 1; // one level above the higher child
    first.Parent = merged;
    second.Parent = merged;
    merged.AddChild(first);
    merged.AddChild(second);
    SparseVector<double> mergedCentroid = Trim(ModelUtils.ComputeCentroid(merged.Items, dataset, CentroidType.NrmL2), 1000, 0.8);

    // swap the children for the parent; the higher position goes first so that idx1 is still valid
    clusters.RemoveAt(idx2);
    clusters.RemoveAt(idx1);
    clusters.Add(merged);

    // drop the children's rows and columns (again higher index first)
    simMtx.PurgeRowAt(idx2);
    simMtx.PurgeRowAt(idx1);
    simMtx.PurgeColAt(idx2);
    simMtx.PurgeColAt(idx1);
    clustMtxTr.PurgeColAt(idx2);
    clustMtxTr.PurgeColAt(idx1);

    // two clusters are gone, so the parent takes position (numClusters - 2)
    int parentIdx = numClusters - 2;
    foreach (IdxDat<double> component in mergedCentroid)
    {
        SparseVector<double> featureRow = clustMtxTr[component.Idx];
        if (featureRow != null)
        {
            featureRow.InnerIdx.Add(parentIdx);
            featureRow.InnerDat.Add(component.Dat);
        }
        else
        {
            clustMtxTr[component.Idx] = new SparseVector<double>(new IdxDat<double>[] { new IdxDat<double>(parentIdx, component.Dat) });
        }
    }

    // similarities of the parent to every current cluster, damped by the average of the two cluster levels
    double[] parentSims = ModelUtils.GetDotProductSimilarity(clustMtxTr, parentIdx + 1, mergedCentroid);
    int mergedLevel = (int)merged.ClusterInfo;
    for (int pos = 0; pos < parentSims.Length; pos++)
    {
        double exponent = (double)(mergedLevel + (int)clusters[pos].ClusterInfo) / 2.0;
        parentSims[pos] = parentSims[pos] * Math.Pow(damping, exponent);
    }

    // append the similarity vector as a new column of the similarity matrix
    SparseMatrix<double> asRow = new SparseMatrix<double>();
    asRow[0] = new SparseVector<double>(parentSims);
    simMtx.AppendCols(asRow.GetTransposedCopy(), parentIdx);
}
/// <summary>
/// Scores <paramref name="example"/> against every stored training example and lets the first
/// <c>min(mK, count)</c> entries of the sorted similarity list vote for their labels.
/// </summary>
/// <param name="example">Vector to classify; must not be null.</param>
/// <returns>
/// A prediction holding one (score, label) pair per label that received a vote, sorted with
/// <c>DescSort</c>. With <c>mSoftVoting</c> a vote is worth the neighbor's similarity, otherwise 1.
/// </returns>
/// <remarks>
/// <c>Utils.ThrowException</c> is handed an <c>InvalidOperationException</c> when <c>mDatasetMtx</c> is null
/// and an <c>ArgumentNullException</c> when <paramref name="example"/> is null.
/// </remarks>
public Prediction<LblT> Predict(SparseVector<double> example)
{
    Utils.ThrowException(mDatasetMtx == null ? new InvalidOperationException() : null);
    Utils.ThrowException(example == null ? new ArgumentNullException("example") : null);

    // pair every training label with the similarity of its example to the query
    ArrayList<KeyDat<double, LblT>> neighbors = new ArrayList<KeyDat<double, LblT>>(mLabels.Count);
    double[] similarities = ModelUtils.GetDotProductSimilarity(mDatasetMtx, mLabels.Count, example);
    for (int idx = 0; idx < mLabels.Count; idx++)
    {
        neighbors.Add(new KeyDat<double, LblT>(similarities[idx], mLabels[idx]));
    }
    neighbors.Sort(DescSort<KeyDat<double, LblT>>.Instance);

    // let the leading neighbors vote
    Dictionary<LblT, double> votes = new Dictionary<LblT, double>(mLblCmp);
    int voterCount = Math.Min(mK, neighbors.Count);
    bool softVoting = mSoftVoting;
    for (int idx = 0; idx < voterCount; idx++)
    {
        KeyDat<double, LblT> neighbor = neighbors[idx];
        // "soft" voting weighs a vote by similarity; normal voting counts each neighbor once
        double weight = softVoting ? neighbor.Key : 1.0;
        double soFar;
        if (votes.TryGetValue(neighbor.Dat, out soFar))
        {
            votes[neighbor.Dat] = soFar + weight;
        }
        else
        {
            votes.Add(neighbor.Dat, weight);
        }
    }

    // turn the tally into a sorted prediction
    Prediction<LblT> prediction = new Prediction<LblT>();
    foreach (KeyValuePair<LblT, double> tally in votes)
    {
        prediction.Inner.Add(new KeyDat<double, LblT>(tally.Value, tally.Key));
    }
    prediction.Inner.Sort(DescSort<KeyDat<double, LblT>>.Instance);
    return prediction;
}
/// <summary>
/// Scores <paramref name="example"/> against the centroid matrix and returns one (similarity, label)
/// pair per entry of the similarity vector, sorted with <c>DescSort</c>.
/// </summary>
/// <param name="example">Vector to classify; must not be null.</param>
/// <returns>The sorted prediction.</returns>
/// <remarks>
/// <c>Utils.ThrowException</c> is handed an <c>InvalidOperationException</c> when <c>mCentroidMtxTr</c> is
/// null and an <c>ArgumentNullException</c> when <paramref name="example"/> is null.
/// </remarks>
public Prediction<LblT> Predict(SparseVector<double> example)
{
    Utils.ThrowException(mCentroidMtxTr == null ? new InvalidOperationException() : null);
    Utils.ThrowException(example == null ? new ArgumentNullException("example") : null);

    Prediction<LblT> prediction = new Prediction<LblT>();
    double[] similarities = ModelUtils.GetDotProductSimilarity(mCentroidMtxTr, mLabels.Count, example);
    int labelIdx = 0;
    foreach (double similarity in similarities)
    {
        // the similarity at position labelIdx is paired with the label at the same position
        prediction.Inner.Add(new KeyDat<double, LblT>(similarity, mLabels[labelIdx]));
        labelIdx++;
    }
    prediction.Inner.Sort(DescSort<KeyDat<double, LblT>>.Instance);
    return prediction;
}
/// <summary>
/// Computes the centroid of <paramref name="dataset"/> and returns the mean of the dot-product
/// similarities between that centroid and the dataset.
/// </summary>
/// <param name="dataset">Examples forming the cluster.</param>
/// <param name="centroid">Receives the centroid computed with <c>CentroidType.NrmL2</c>.</param>
/// <returns>Sum of the similarity values divided by their count.</returns>
private double GetClusterQuality(IUnlabeledExampleCollection<SparseVector<double>> dataset, out SparseVector<double> centroid)
{
    centroid = ModelUtils.ComputeCentroid(dataset, CentroidType.NrmL2);
    // similarity of the centroid to the dataset (intra-cluster similarities)
    double[] similarities = ModelUtils.GetDotProductSimilarity(dataset, centroid);
    double total = 0;
    foreach (double similarity in similarities)
    {
        total += similarity;
    }
    return total / (double)similarities.Length;
}
/// <summary>
/// Assigns each of the <paramref name="instCount"/> instances to the centroid with the highest
/// dot-product similarity and reports the mean of those highest similarities.
/// </summary>
/// <param name="centroids">Centroids to assign to; the chosen centroid's <c>Items</c> receives the instance index.</param>
/// <param name="dataMtx">Matrix passed to <c>ModelUtils.GetDotProductSimilarity</c> together with each centroid vector.</param>
/// <param name="instCount">Number of instances to assign.</param>
/// <param name="offs">Offset added to the local instance index before it is stored in <c>Items</c>.</param>
/// <param name="clustQual">Receives the sum of the winning similarities divided by <paramref name="instCount"/>.</param>
internal void Assign(ArrayList<CentroidData> centroids, SparseMatrix<double> dataMtx, int instCount, int offs, out double clustQual)
{
    int centroidCount = centroids.Count;
    double[][] simByCentroid = new double[centroidCount][];
    clustQual = 0;

    // one similarity vector (indexed by instance) per centroid
    int slot = 0;
    foreach (CentroidData centroid in centroids)
    {
        simByCentroid[slot] = ModelUtils.GetDotProductSimilarity(dataMtx, instCount, centroid.GetSparseVector());
        slot++;
    }

    for (int inst = 0; inst < instCount; inst++)
    {
        double bestSim = double.MinValue;
        ArrayList<int> winners = new ArrayList<int>();
        for (int cen = 0; cen < centroidCount; cen++)
        {
            double sim = simByCentroid[cen][inst];
            if (sim > bestSim)
            {
                // strictly better: forget the previous winners
                bestSim = sim;
                winners.Clear();
            }
            else if (sim != bestSim)
            {
                // worse (or not comparable): not a candidate
                continue;
            }
            winners.Add(cen);
        }
        // ties are broken at random
        if (winners.Count > 1)
        {
            winners.Shuffle(mRnd);
        }
        centroids[winners[0]].Items.Add(inst + offs);
        clustQual += bestSim;
    }
    clustQual /= (double)instCount;
}
/// <summary>
/// Trains the classifier: builds one centroid per label from the summed example vectors and then runs
/// <c>mIterations</c> correction passes in which every misclassified training example is subtracted from
/// the centroid it was assigned to and added to the centroid of its own label.
/// </summary>
/// <param name="dataset">Labeled training examples; must not be null or empty.</param>
/// <remarks>
/// <para>Validation goes through <c>Utils.ThrowException</c>: an <c>ArgumentNullException</c> is passed for
/// a null dataset and an <c>ArgumentValueException</c> for an empty one.</para>
/// <para>The correction weight starts at 1 and is multiplied by <c>mDamping</c> after every iteration.</para>
/// <para>On completion <c>mLabels</c> holds the labels and <c>mCentroidMtxTr</c> the transposed matrix of
/// the centroid vectors, in matching order. Both fields are assigned only after the model has been
/// built completely.</para>
/// </remarks>
public void Train(ILabeledExampleCollection<LblT, SparseVector<double>> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count == 0 ? new ArgumentValueException("dataset") : null);

    // sum up the examples of each label
    Dictionary<LblT, CentroidData> centroids = new Dictionary<LblT, CentroidData>(mLblCmp);
    foreach (LabeledExample<LblT, SparseVector<double>> labeledExample in dataset)
    {
        CentroidData centroidData;
        // single lookup instead of ContainsKey followed by the indexer
        if (!centroids.TryGetValue(labeledExample.Label, out centroidData))
        {
            centroidData = new CentroidData();
            centroids.Add(labeledExample.Label, centroidData);
        }
        centroidData.AddToSum(labeledExample.Example);
    }
    foreach (CentroidData cenData in centroids.Values)
    {
        cenData.UpdateCentroidLen();
    }

    double learnRate = 1;
    double[][] dotProd = null;
    SparseMatrix<double> dsMtx = null;
    if (mIterations > 0)
    {
        // only needed by the correction passes
        dotProd = new double[centroids.Count][];
        dsMtx = ModelUtils.GetTransposedMatrix(ModelUtils.ConvertToUnlabeledDataset(dataset));
    }

    for (int iter = 1; iter <= mIterations; iter++)
    {
        mLogger.Info("Train", "Iteration {0} / {1} ...", iter, mIterations);

        // compute dot products
        mLogger.Info("Train", "Computing dot products ...");
        int j = 0;
        foreach (KeyValuePair<LblT, CentroidData> labeledCentroid in centroids)
        {
            mLogger.ProgressNormal(Logger.Level.Info, /*sender=*/this, "Train", "Centroid {0} / {1} ...", j + 1, centroids.Count);
            SparseVector<double> cenVec = labeledCentroid.Value.GetSparseVector();
            dotProd[j] = ModelUtils.GetDotProductSimilarity(dsMtx, dataset.Count, cenVec);
            j++;
        }

        // classify training examples
        mLogger.Info("Train", "Classifying training examples ...");
        int errCount = 0;
        for (int instIdx = 0; instIdx < dataset.Count; instIdx++)
        {
            mLogger.ProgressFast(Logger.Level.Info, /*sender=*/this, "Train", "Example {0} / {1} ...", instIdx + 1, dataset.Count);
            LabeledExample<LblT, SparseVector<double>> labeledExample = dataset[instIdx];
            SparseVector<double> vec = labeledExample.Example;

            // The centroid of the example's own label is looked up in the dictionary so that the label
            // is matched with the same comparer (mLblCmp) that was used to build the centroids. Matching
            // with Key.Equals() could fail to find it under a custom comparer (or throw on a null key),
            // which left the centroid null and crashed in AddToDiff below.
            CentroidData actualCentroid = centroids[labeledExample.Label];

            // the centroids are enumerated in the same order in which dotProd was filled above
            double maxSim = double.MinValue;
            CentroidData assignedCentroid = null;
            int cenIdx = 0;
            foreach (KeyValuePair<LblT, CentroidData> labeledCentroid in centroids)
            {
                double sim = dotProd[cenIdx][instIdx];
                if (sim > maxSim)
                {
                    maxSim = sim;
                    assignedCentroid = labeledCentroid.Value;
                }
                cenIdx++;
            }

            if (assignedCentroid != actualCentroid)
            {
                // misclassified: push the wrong centroid away from the example, pull the right one closer
                assignedCentroid.AddToDiff(-learnRate, vec);
                actualCentroid.AddToDiff(learnRate, vec);
                errCount++;
            }
        }
        mLogger.Info("Train", "Training set error rate: {0:0.00}%", (double)errCount / (double)dataset.Count * 100.0);

        // update centroids
        int k = 0;
        foreach (CentroidData centroidData in centroids.Values)
        {
            mLogger.ProgressNormal(Logger.Level.Info, /*sender=*/this, "Train", "Centroid {0} / {1} ...", ++k, centroids.Count);
            centroidData.Update(mPositiveValuesOnly);
            centroidData.UpdateCentroidLen();
        }
        learnRate *= mDamping;
    }

    // build the model in locals and publish it at the end, so that a failure while building does not
    // leave the fields half-initialized
    SparseMatrix<double> centroidMtx = new SparseMatrix<double>();
    ArrayList<LblT> labels = new ArrayList<LblT>();
    int rowIdx = 0;
    foreach (KeyValuePair<LblT, CentroidData> labeledCentroid in centroids)
    {
        centroidMtx[rowIdx++] = labeledCentroid.Value.GetSparseVector();
        labels.Add(labeledCentroid.Key);
    }
    mLabels = labels;
    mCentroidMtxTr = centroidMtx.GetTransposedCopy();
}
/// <summary>
/// Computes the kernel matrix over the support vectors of the model identified by <c>mModelId</c>.
/// </summary>
/// <param name="rmvFeatIdx">
/// When non-negative, this index is passed to <c>RemoveColAt</c> on the support-vector matrix before
/// the kernel is computed; a negative value leaves the matrix untouched.
/// </param>
/// <returns>
/// A jagged matrix with one row per support vector. It holds the dot products between support vectors,
/// transformed according to <c>mKernelType</c>: polynomial <c>(s*k + c)^d</c>, radial basis function
/// <c>exp(-gamma * (k_rr + k_cc - 2*k_rc))</c>, sigmoid <c>tanh(s*k + c)</c>; any other kernel type
/// leaves the dot products unchanged.
/// </returns>
private double[][] GetKernel(int rmvFeatIdx)
{
    int svCount = SvmLightLib.GetSupportVectorCount(mModelId);
    double[][] gram = new double[svCount][];

    // collect the support vectors as matrix rows
    SparseMatrix<double> svMtx = new SparseMatrix<double>();
    for (int sv = 0; sv < svCount; sv++)
    {
        svMtx[sv] = GetSupportVector(sv);
    }
    if (rmvFeatIdx >= 0)
    {
        svMtx.RemoveColAt(rmvFeatIdx);
    }

    // linear kernel: dot products of every support vector with all support vectors
    SparseMatrix<double> svMtxTr = svMtx.GetTransposedCopy();
    for (int sv = 0; sv < svCount; sv++)
    {
        gram[sv] = ModelUtils.GetDotProductSimilarity(svMtxTr, svCount, svMtx[sv]);
    }

    // non-linear kernels are derived from the linear one in place
    int size = gram.Length;
    SvmLightKernelType kernelType = mKernelType;
    if (kernelType == SvmLightKernelType.Polynomial)
    {
        for (int r = 0; r < size; r++)
        {
            double[] rowVals = gram[r];
            for (int c = 0; c < size; c++)
            {
                rowVals[c] = Math.Pow(mKernelParamS * rowVals[c] + mKernelParamC, mKernelParamD);
            }
        }
    }
    else if (kernelType == SvmLightKernelType.RadialBasisFunction)
    {
        // the diagonal is overwritten while the matrix is transformed, so keep a copy of it
        double[] selfProd = new double[size];
        for (int d = 0; d < size; d++)
        {
            selfProd[d] = gram[d][d];
        }
        for (int r = 0; r < size; r++)
        {
            double[] rowVals = gram[r];
            for (int c = 0; c < size; c++)
            {
                rowVals[c] = Math.Exp(-mKernelParamGamma * (selfProd[r] + selfProd[c] - 2.0 * rowVals[c]));
            }
        }
    }
    else if (kernelType == SvmLightKernelType.Sigmoid)
    {
        for (int r = 0; r < size; r++)
        {
            double[] rowVals = gram[r];
            for (int c = 0; c < size; c++)
            {
                rowVals[c] = Math.Tanh(mKernelParamS * rowVals[c] + mKernelParamC);
            }
        }
    }
    return gram;
}
/// <summary>
/// Runs <c>mTrials</c> independent k-means clusterings of <paramref name="dataset"/> into <c>mK</c>
/// clusters and returns the result of the trial that reached the highest quality.
/// </summary>
/// <param name="dataset">Examples to cluster; must not be null and must hold at least <c>mK</c> items.</param>
/// <returns>
/// A clustering with one root per centroid, each holding the items assigned to it in the final
/// iteration of the best trial; <c>null</c> if no trial was run (<c>mTrials</c> below 1).
/// </returns>
/// <remarks>
/// <para>Validation goes through <c>Utils.ThrowException</c>: an <c>ArgumentNullException</c> is passed for
/// a null dataset and an <c>ArgumentValueException</c> when the dataset has fewer than <c>mK</c> items.</para>
/// <para>Each trial draws three random seed sets and keeps the one with the lowest average pairwise
/// similarity. It then alternates assignment and centroid update until the quality (mean similarity of
/// the items to their centroid) improves by no more than <c>mEps</c>.</para>
/// </remarks>
public ClusteringResult Cluster(IUnlabeledExampleCollection<SparseVector<double>> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count < mK ? new ArgumentValueException("dataset") : null);
    const int seedTrials = 3; // number of random seed sets compared per trial
    ClusteringResult clustering = null;
    double globalBestClustQual = 0;
    for (int trial = 1; trial <= mTrials; trial++)
    {
        mLogger.Info("Cluster", "Clustering trial {0} of {1} ...", trial, mTrials);
        ArrayList<CentroidData> centroids = new ArrayList<CentroidData>(mK);
        ArrayList<int> bestSeeds = null;
        for (int i = 0; i < mK; i++)
        {
            centroids.Add(new CentroidData());
        }

        // select seed items
        double minSim = double.MaxValue;
        ArrayList<int> tmp = new ArrayList<int>(dataset.Count);
        for (int i = 0; i < dataset.Count; i++)
        {
            tmp.Add(i);
        }
        for (int k = 0; k < seedTrials; k++)
        {
            ArrayList<SparseVector<double>> seeds = new ArrayList<SparseVector<double>>(mK);
            tmp.Shuffle(mRnd);
            for (int i = 0; i < mK; i++)
            {
                seeds.Add(dataset[tmp[i]]);
            }
            // assess quality of seed items: average similarity over all ordered pairs of distinct seeds.
            // Pairs are told apart by position, so that two seeds which happen to be the same object
            // still count as a pair.
            double simAvg = 0;
            for (int a = 0; a < mK; a++)
            {
                for (int b = 0; b < mK; b++)
                {
                    if (a != b)
                    {
                        simAvg += DotProductSimilarity.Instance.GetSimilarity(seeds[a], seeds[b]);
                    }
                }
            }
            // with a single seed there are no pairs; leave the average at 0 instead of computing 0/0
            int pairCount = mK * mK - mK;
            if (pairCount > 0)
            {
                simAvg /= (double)pairCount;
            }
            // The first seed set is always accepted as a fallback, so bestSeeds cannot stay null when
            // the average is not comparable (NaN) or not below double.MaxValue.
            if (bestSeeds == null || simAvg < minSim)
            {
                if (simAvg < minSim)
                {
                    // only a comparable average may become the new bar for later seed sets
                    minSim = simAvg;
                }
                bestSeeds = new ArrayList<int>(mK);
                for (int i = 0; i < mK; i++)
                {
                    bestSeeds.Add(tmp[i]);
                }
            }
        }

        // initialize every centroid from its seed item
        for (int i = 0; i < mK; i++)
        {
            centroids[i].Items.Add(bestSeeds[i]);
            centroids[i].Update(dataset);
            centroids[i].UpdateCentroidLen();
        }
        double[][] dotProd = new double[mK][];
        SparseMatrix<double> dsMtx = ModelUtils.GetTransposedMatrix(dataset);

        // main loop
        int iter = 0;
        double bestClustQual = 0;
        double clustQual;
        while (true)
        {
            iter++;
            mLogger.Info("Cluster", "Iteration {0} ...", iter);
            clustQual = 0;

            // assign items to clusters
            int j = 0;
            foreach (CentroidData cen in centroids)
            {
                SparseVector<double> cenVec = cen.GetSparseVector();
                dotProd[j] = ModelUtils.GetDotProductSimilarity(dsMtx, dataset.Count, cenVec);
                j++;
            }
            for (int instIdx = 0; instIdx < dataset.Count; instIdx++)
            {
                double maxSim = double.MinValue;
                ArrayList<int> candidates = new ArrayList<int>();
                for (int cenIdx = 0; cenIdx < mK; cenIdx++)
                {
                    double sim = dotProd[cenIdx][instIdx];
                    if (sim > maxSim)
                    {
                        maxSim = sim;
                        candidates.Clear();
                        candidates.Add(cenIdx);
                    }
                    else if (sim == maxSim)
                    {
                        candidates.Add(cenIdx);
                    }
                }
                // ties are broken at random
                if (candidates.Count > 1)
                {
                    candidates.Shuffle(mRnd);
                }
                // empty only if no similarity exceeded double.MinValue (e.g. all values are NaN)
                if (candidates.Count > 0)
                {
                    centroids[candidates[0]].Items.Add(instIdx);
                    clustQual += maxSim;
                }
            }
            clustQual /= (double)dataset.Count;
            mLogger.Info("Cluster", "Quality: {0:0.0000}", clustQual);

            // check if done
            if (iter > 1 && clustQual - bestClustQual <= mEps)
            {
                break;
            }
            bestClustQual = clustQual;

            // compute new centroids
            for (int i = 0; i < mK; i++)
            {
                centroids[i].Update(dataset);
                centroids[i].UpdateCentroidLen();
            }
        }

        if (trial == 1 || clustQual > globalBestClustQual)
        {
            globalBestClustQual = clustQual;
            // save the result
            clustering = new ClusteringResult();
            for (int i = 0; i < mK; i++)
            {
                clustering.AddRoot(new Cluster());
                clustering.Roots.Last.Items.AddRange(centroids[i].Items);
            }
        }
    }
    return clustering;
}