/// <summary>
/// Asserts that <paramref name="distance"/> yields <paramref name="expected"/> (within
/// ErrorTollerance) for the same pair of inputs passed three ways: as plain strings,
/// wrapped in NormalizedString, and wrapped in Token.
/// </summary>
/// <param name="distance">Similarity implementation under test.</param>
/// <param name="first">First input text.</param>
/// <param name="second">Second input text.</param>
/// <param name="expected">Similarity value each call is expected to produce.</param>
public static void SimilarityInterfaceTest(ISimilarity distance, string first, string second, double expected)
{
    // 1) inputs passed as plain strings
    var plainScore = distance.GetSimilarity(first, second);
    Assert.AreEqual(expected, plainScore, ErrorTollerance);

    // 2) inputs wrapped in NormalizedString
    var normalizedFirst = new NormalizedString(first);
    var normalizedSecond = new NormalizedString(second);
    var normalizedScore = distance.GetSimilarity(normalizedFirst, normalizedSecond);
    Assert.AreEqual(expected, normalizedScore, ErrorTollerance);

    // 3) inputs wrapped in Token
    var tokenFirst = new Token(first);
    var tokenSecond = new Token(second);
    var tokenScore = distance.GetSimilarity(tokenFirst, tokenSecond);
    Assert.AreEqual(expected, tokenScore, ErrorTollerance);
}
/// <summary>
/// Ranks labels for <paramref name="example"/> by letting the stored examples most similar
/// to it vote.
/// </summary>
/// <param name="example">Example to label. A null value is reported through Utils.ThrowException.</param>
/// <returns>
/// A prediction whose inner list holds one (score, label) entry per label found among the
/// voters, sorted with DescSort. With mSoftVoting the score is the sum of the voters'
/// similarities; otherwise it is the number of voters carrying that label.
/// </returns>
public Prediction<LblT> Predict(ExT example)
{
    Utils.ThrowException(mExamples == null ? new InvalidOperationException() : null);
    Utils.ThrowException(example == null ? new ArgumentNullException("example") : null);

    // Score every stored example against the query, then order the scores with DescSort.
    ArrayList<KeyDat<double, LabeledExample<LblT, ExT>>> scored
        = new ArrayList<KeyDat<double, LabeledExample<LblT, ExT>>>(mExamples.Count);
    foreach (LabeledExample<LblT, ExT> stored in mExamples)
    {
        double similarity = mSimilarity.GetSimilarity(example, stored.Example);
        scored.Add(new KeyDat<double, LabeledExample<LblT, ExT>>(similarity, stored));
    }
    scored.Sort(DescSort<KeyDat<double, LabeledExample<LblT, ExT>>>.Instance);

    // Only the first min(mK, available) entries take part in the vote.
    Dictionary<LblT, double> votes = new Dictionary<LblT, double>(mLblCmp);
    int voterCount = Math.Min(mK, scored.Count);
    bool weighted = mSoftVoting;
    for (int rank = 0; rank < voterCount; rank++)
    {
        KeyDat<double, LabeledExample<LblT, ExT>> voter = scored[rank];
        // "soft" voting contributes the similarity itself, normal voting contributes 1.
        double weight = weighted ? voter.Key : 1.0;
        double tally;
        if (votes.TryGetValue(voter.Dat.Label, out tally))
        {
            votes[voter.Dat.Label] = tally + weight;
        }
        else
        {
            votes.Add(voter.Dat.Label, weight);
        }
    }

    // Turn the tallies into a prediction.
    Prediction<LblT> prediction = new Prediction<LblT>();
    foreach (KeyValuePair<LblT, double> entry in votes)
    {
        prediction.Inner.Add(new KeyDat<double, LblT>(entry.Value, entry.Key));
    }
    prediction.Inner.Sort(DescSort<KeyDat<double, LblT>>.Instance);
    return prediction;
}
/// <summary>
/// Scores <paramref name="example"/> against every stored centroid and returns the labels
/// ordered by that score.
/// </summary>
/// <param name="example">Vector to classify. A null value is reported through Utils.ThrowException.</param>
/// <returns>
/// A result whose Items list contains one (similarity, label) entry per centroid,
/// sorted with a DescSort comparer.
/// </returns>
public ClassifierResult<LblT> Classify(SparseVector<double>.ReadOnly example)
{
    Utils.ThrowException(m_centroids == null ? new InvalidOperationException() : null);
    Utils.ThrowException(example == null ? new ArgumentNullException("example") : null);

    ClassifierResult<LblT> ranking = new ClassifierResult<LblT>();
    foreach (Pair<LblT, SparseVector<double>.ReadOnly> entry in m_centroids)
    {
        // The centroid is passed as the first argument, the query as the second.
        double score = m_similarity.GetSimilarity(entry.Second, example);
        KeyDat<double, LblT> scoredLabel = new KeyDat<double, LblT>(score, entry.First);
        ranking.Items.Add(scoredLabel);
    }

    DescSort<KeyDat<double, LblT>> comparer = new DescSort<KeyDat<double, LblT>>();
    ranking.Items.Sort(comparer);
    return ranking;
}
/// <summary>
/// Scores <paramref name="example"/> against every stored centroid and returns the labels
/// ordered by that score.
/// </summary>
/// <param name="example">Vector to label. A null value is reported through Utils.ThrowException.</param>
/// <returns>
/// A prediction whose inner list contains one (similarity, label) entry per centroid,
/// sorted with DescSort.
/// </returns>
public Prediction<LblT> Predict(SparseVector<double> example)
{
    Utils.ThrowException(mCentroids == null ? new InvalidOperationException() : null);
    Utils.ThrowException(example == null ? new ArgumentNullException("example") : null);

    Prediction<LblT> prediction = new Prediction<LblT>();
    foreach (Pair<LblT, SparseVector<double>> entry in mCentroids)
    {
        // The centroid is passed as the first argument, the query as the second.
        double score = mSimilarity.GetSimilarity(entry.Second, example);
        KeyDat<double, LblT> scoredLabel = new KeyDat<double, LblT>(score, entry.First);
        prediction.Inner.Add(scoredLabel);
    }

    prediction.Inner.Sort(DescSort<KeyDat<double, LblT>>.Instance);
    return prediction;
}
/// <summary>
/// Partitions <paramref name="dataset"/> into m_k clusters by repeatedly assigning every item
/// to its most similar centroid and recomputing the centroids, over m_trials independent
/// trials, and returns the clustering of the best trial.
/// </summary>
/// <param name="dataset">
/// Labeled examples to cluster. A null dataset, or one with fewer than m_k items, is reported
/// through Utils.ThrowException.
/// </param>
/// <returns>
/// The clustering of the trial with the highest final average item-to-centroid similarity
/// (the first trial wins ties), or null when m_trials is less than 1.
/// </returns>
public ClusteringResult Cluster(IExampleCollection<LblT, SparseVector<double>.ReadOnly> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count < m_k ? new ArgumentValueException("dataset") : null);

    ClusteringResult best_clustering = null;
    double global_best_clust_qual = 0;
    for (int trial = 1; trial <= m_trials; trial++)
    {
        Utils.VerboseLine("*** CLUSTERING TRIAL {0} OF {1} ***", trial, m_trials);
        ClusteringResult clustering = new ClusteringResult();
        for (int i = 0; i < m_k; i++)
        {
            clustering.Roots.Add(new Cluster());
        }

        // select seed items: three random draws of m_k items, keeping the draw whose seeds
        // are least similar to each other on average
        ArrayList<SparseVector<double>.ReadOnly> centroids = null;
        double min_sim = double.MaxValue;
        ArrayList<int> item_order = new ArrayList<int>(dataset.Count);
        for (int i = 0; i < dataset.Count; i++)
        {
            item_order.Add(i);
        }
        // number of ordered seed pairs (seed_1, seed_2) with seed_1 != seed_2
        int seed_pairs = m_k * m_k - m_k;
        for (int attempt = 0; attempt < 3; attempt++)
        {
            ArrayList<SparseVector<double>.ReadOnly> seeds = new ArrayList<SparseVector<double>.ReadOnly>(m_k);
            item_order.Shuffle(m_rnd);
            for (int i = 0; i < m_k; i++)
            {
                SparseVector<double>.ReadOnly[] single_item = new SparseVector<double>.ReadOnly[] { dataset[item_order[i]].Example };
                seeds.Add(ModelUtils.ComputeCentroid(single_item, m_centroid_type));
            }
            // assess quality of seed items
            double sim_avg = 0;
            foreach (SparseVector<double>.ReadOnly seed_1 in seeds)
            {
                foreach (SparseVector<double>.ReadOnly seed_2 in seeds)
                {
                    if (seed_1 != seed_2)
                    {
                        sim_avg += m_similarity.GetSimilarity(seed_1, seed_2);
                    }
                }
            }
            // With fewer than two seeds there are no pairs. Dividing would give 0/0 = NaN,
            // "NaN < min_sim" would be false in every attempt and centroids would stay null.
            // Leaving the average at 0 lets the first draw be accepted instead.
            if (seed_pairs != 0)
            {
                sim_avg /= (double)seed_pairs;
            }
            if (sim_avg < min_sim)
            {
                min_sim = sim_avg;
                centroids = seeds;
            }
        }

        // main loop
        int iter = 0;
        double best_clust_qual = 0;
        double clust_qual;
        while (true)
        {
            iter++;
            clust_qual = 0;
            // assign items to clusters
            foreach (Cluster cluster in clustering.Roots)
            {
                cluster.Items.Clear();
            }
            for (int i = 0; i < dataset.Count; i++)
            {
                SparseVector<double>.ReadOnly example = dataset[i].Example;
                double max_sim = double.MinValue;
                // indices of all centroids that share the highest similarity to this item
                ArrayList<int> candidates = new ArrayList<int>();
                for (int j = 0; j < m_k; j++)
                {
                    double sim = m_similarity.GetSimilarity(example, centroids[j]);
                    if (sim > max_sim)
                    {
                        max_sim = sim;
                        candidates.Clear();
                        candidates.Add(j);
                    }
                    else if (sim == max_sim)
                    {
                        candidates.Add(j);
                    }
                }
                // break ties at random
                if (candidates.Count > 1)
                {
                    candidates.Shuffle(m_rnd);
                }
                // candidates is empty only when there are no centroids or every similarity is
                // NaN or negative infinity; such an item is left unassigned
                if (candidates.Count > 0)
                {
                    clustering.Roots[candidates[0]].Items.Add(new Pair<double, int>(1, i));
                    clust_qual += max_sim;
                }
            }
            clust_qual /= (double)dataset.Count;
            Utils.VerboseLine("*** Iteration {0} ***", iter);
            Utils.VerboseLine("Quality: {0:0.0000}", clust_qual);
            // check if done: stop once the quality gain over the previous iteration is at most m_eps
            if (iter > 1 && clust_qual - best_clust_qual <= m_eps)
            {
                break;
            }
            best_clust_qual = clust_qual;
            // compute new centroids
            for (int i = 0; i < m_k; i++)
            {
                centroids[i] = clustering.Roots[i].ComputeCentroid(dataset, m_centroid_type);
            }
        }

        // keep the best trial seen so far
        if (trial == 1 || clust_qual > global_best_clust_qual)
        {
            global_best_clust_qual = clust_qual;
            best_clustering = clustering;
        }
    }
    return best_clustering;
}
/// <summary>
/// Partitions <paramref name="dataset"/> into mK clusters by repeatedly assigning every item
/// to its most similar centroid and recomputing the centroids, over mTrials independent
/// trials, and returns the clustering of the best trial.
/// </summary>
/// <param name="dataset">
/// Vectors to cluster. A null dataset, or one with fewer than mK items, is reported through
/// Utils.ThrowException.
/// </param>
/// <returns>
/// The clustering of the trial with the highest final average item-to-centroid similarity
/// (the first trial wins ties), or null when mTrials is less than 1.
/// </returns>
public ClusteringResult Cluster(IUnlabeledExampleCollection<SparseVector<double>> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count < mK ? new ArgumentValueException("dataset") : null);

    ClusteringResult bestClustering = null;
    double globalBestClustQual = 0;
    for (int trial = 1; trial <= mTrials; trial++)
    {
        mLogger.Trace("Cluster", "Clustering trial {0} of {1} ...", trial, mTrials);
        ClusteringResult clustering = new ClusteringResult();
        for (int i = 0; i < mK; i++)
        {
            clustering.AddRoot(new Cluster());
        }

        // select seed items: three random draws of mK items, keeping the draw whose seeds
        // are least similar to each other on average
        ArrayList<SparseVector<double>> centroids = null;
        double minSim = double.MaxValue;
        ArrayList<int> itemOrder = new ArrayList<int>(dataset.Count);
        for (int i = 0; i < dataset.Count; i++)
        {
            itemOrder.Add(i);
        }
        // number of ordered seed pairs (seed1, seed2) with seed1 != seed2
        int seedPairs = mK * mK - mK;
        for (int attempt = 0; attempt < 3; attempt++)
        {
            ArrayList<SparseVector<double>> seeds = new ArrayList<SparseVector<double>>(mK);
            itemOrder.Shuffle(mRnd);
            for (int i = 0; i < mK; i++)
            {
                SparseVector<double>[] singleItem = new SparseVector<double>[] { dataset[itemOrder[i]] };
                seeds.Add(ModelUtils.ComputeCentroid(singleItem, mCentroidType));
            }
            // assess quality of seed items
            double simAvg = 0;
            foreach (SparseVector<double> seed1 in seeds)
            {
                foreach (SparseVector<double> seed2 in seeds)
                {
                    if (seed1 != seed2)
                    {
                        simAvg += mSimilarity.GetSimilarity(seed1, seed2);
                    }
                }
            }
            // With fewer than two seeds there are no pairs. Dividing would give 0/0 = NaN,
            // "NaN < minSim" would be false in every attempt and centroids would stay null.
            // Leaving the average at 0 lets the first draw be accepted instead.
            if (seedPairs != 0)
            {
                simAvg /= (double)seedPairs;
            }
            if (simAvg < minSim)
            {
                minSim = simAvg;
                centroids = seeds;
            }
        }

        // main loop
        int iter = 0;
        double bestClustQual = 0;
        double clustQual;
        while (true)
        {
            iter++;
            mLogger.Trace("Cluster", "Iteration {0} ...", iter);
            clustQual = 0;
            // assign items to clusters
            foreach (Cluster cluster in clustering.Roots)
            {
                cluster.Items.Clear();
            }
            for (int i = 0; i < dataset.Count; i++)
            {
                SparseVector<double> example = dataset[i];
                double maxSim = double.MinValue;
                // indices of all centroids that share the highest similarity to this item
                ArrayList<int> candidates = new ArrayList<int>();
                for (int j = 0; j < mK; j++)
                {
                    double sim = mSimilarity.GetSimilarity(example, centroids[j]);
                    if (sim > maxSim)
                    {
                        maxSim = sim;
                        candidates.Clear();
                        candidates.Add(j);
                    }
                    else if (sim == maxSim)
                    {
                        candidates.Add(j);
                    }
                }
                // break ties at random
                if (candidates.Count > 1)
                {
                    candidates.Shuffle(mRnd);
                }
                // NOTE(review): candidates is empty when there are no centroids or every
                // similarity is NaN or negative infinity; candidates[0] is then read from an
                // empty list - confirm mSimilarity can never return such values.
                clustering.Roots[candidates[0]].Items.Add(i);
                clustQual += maxSim;
            }
            clustQual /= (double)dataset.Count;
            mLogger.Trace("Cluster", "Quality: {0:0.0000}", clustQual);
            // check if done: stop once the quality gain over the previous iteration is at most mEps
            if (iter > 1 && clustQual - bestClustQual <= mEps)
            {
                break;
            }
            bestClustQual = clustQual;
            // compute new centroids
            for (int i = 0; i < mK; i++)
            {
                centroids[i] = clustering.Roots[i].ComputeCentroid(dataset, mCentroidType);
            }
        }

        // keep the best trial seen so far
        if (trial == 1 || clustQual > globalBestClustQual)
        {
            globalBestClustQual = clustQual;
            bestClustering = clustering;
        }
    }
    return bestClustering;
}