Exemple #1
0
        // Asserts that distance.GetSimilarity returns `expected` (within ErrorTollerance)
        // when the two inputs are passed as plain strings, wrapped in NormalizedString,
        // and wrapped in Token, in that order.
        public static void SimilarityInterfaceTest(ISimilarity distance, string first, string second, double expected)
        {
            // 1) plain strings
            var plainScore = distance.GetSimilarity(first, second);
            Assert.AreEqual(expected, plainScore, ErrorTollerance);

            // 2) NormalizedString wrappers
            var normalizedFirst  = new NormalizedString(first);
            var normalizedSecond = new NormalizedString(second);
            var normalizedScore  = distance.GetSimilarity(normalizedFirst, normalizedSecond);
            Assert.AreEqual(expected, normalizedScore, ErrorTollerance);

            // 3) Token wrappers
            var tokenFirst  = new Token(first);
            var tokenSecond = new Token(second);
            var tokenScore  = distance.GetSimilarity(tokenFirst, tokenSecond);
            Assert.AreEqual(expected, tokenScore, ErrorTollerance);
        }
Exemple #2
0
        // Scores `example` against every stored labeled example, keeps the first
        // min(mK, count) entries after sorting with DescSort (presumably highest
        // similarity first - confirm against DescSort), and lets them vote per label.
        // With mSoftVoting each neighbour contributes its similarity; otherwise it
        // contributes 1. The per-label totals are returned sorted with DescSort.
        // The two guards pass an InvalidOperationException (mExamples == null) or an
        // ArgumentNullException (example == null) to Utils.ThrowException.
        public Prediction <LblT> Predict(ExT example)
        {
            Utils.ThrowException(mExamples == null ? new InvalidOperationException() : null);
            Utils.ThrowException(example == null ? new ArgumentNullException("example") : null);

            // similarity of the query to every stored example
            ArrayList <KeyDat <double, LabeledExample <LblT, ExT> > > neighbours
                = new ArrayList <KeyDat <double, LabeledExample <LblT, ExT> > >(mExamples.Count);
            foreach (LabeledExample <LblT, ExT> stored in mExamples)
            {
                double score = mSimilarity.GetSimilarity(example, stored.Example);
                neighbours.Add(new KeyDat <double, LabeledExample <LblT, ExT> >(score, stored));
            }
            neighbours.Sort(DescSort <KeyDat <double, LabeledExample <LblT, ExT> > > .Instance);

            // accumulate votes of the leading neighbours
            Dictionary <LblT, double> votes = new Dictionary <LblT, double>(mLblCmp);
            int voterCount = Math.Min(mK, neighbours.Count);
            for (int rank = 0; rank < voterCount; rank++)
            {
                KeyDat <double, LabeledExample <LblT, ExT> > neighbour = neighbours[rank];
                LblT   label  = neighbour.Dat.Label;
                // "soft" voting weighs by similarity, normal voting counts each neighbour once
                double weight = mSoftVoting ? neighbour.Key : 1.0;
                double runningTotal;
                if (votes.TryGetValue(label, out runningTotal))
                {
                    votes[label] = runningTotal + weight;
                }
                else
                {
                    votes.Add(label, weight);
                }
            }

            // convert the vote table into a sorted prediction
            Prediction <LblT> prediction = new Prediction <LblT>();
            foreach (KeyValuePair <LblT, double> vote in votes)
            {
                prediction.Inner.Add(new KeyDat <double, LblT>(vote.Value, vote.Key));
            }
            prediction.Inner.Sort(DescSort <KeyDat <double, LblT> > .Instance);
            return prediction;
        }
Exemple #3
0
        // Computes the similarity between `example` and each labeled centroid and
        // returns the (similarity, label) pairs sorted with a DescSort comparer.
        // The two guards pass an InvalidOperationException (m_centroids == null) or an
        // ArgumentNullException (example == null) to Utils.ThrowException.
        public ClassifierResult <LblT> Classify(SparseVector <double> .ReadOnly example)
        {
            Utils.ThrowException(m_centroids == null ? new InvalidOperationException() : null);
            Utils.ThrowException(example == null ? new ArgumentNullException("example") : null);

            ClassifierResult <LblT> scored = new ClassifierResult <LblT>();
            foreach (Pair <LblT, SparseVector <double> .ReadOnly> entry in m_centroids)
            {
                // centroid goes first, the query second, as GetSimilarity's arguments
                double score = m_similarity.GetSimilarity(entry.Second, example);
                KeyDat <double, LblT> scoredLabel = new KeyDat <double, LblT>(score, entry.First);
                scored.Items.Add(scoredLabel);
            }
            DescSort <KeyDat <double, LblT> > comparer = new DescSort <KeyDat <double, LblT> >();
            scored.Items.Sort(comparer);
            return scored;
        }
        // Computes the similarity between `example` and each labeled centroid and
        // returns the (similarity, label) pairs sorted with DescSort.Instance.
        // The two guards pass an InvalidOperationException (mCentroids == null) or an
        // ArgumentNullException (example == null) to Utils.ThrowException.
        public Prediction <LblT> Predict(SparseVector <double> example)
        {
            Utils.ThrowException(mCentroids == null ? new InvalidOperationException() : null);
            Utils.ThrowException(example == null ? new ArgumentNullException("example") : null);

            Prediction <LblT> prediction = new Prediction <LblT>();
            foreach (Pair <LblT, SparseVector <double> > entry in mCentroids)
            {
                // centroid goes first, the query second, as GetSimilarity's arguments
                double score = mSimilarity.GetSimilarity(entry.Second, example);
                KeyDat <double, LblT> scoredLabel = new KeyDat <double, LblT>(score, entry.First);
                prediction.Inner.Add(scoredLabel);
            }
            prediction.Inner.Sort(DescSort <KeyDat <double, LblT> > .Instance);
            return prediction;
        }
Exemple #5
0
        /// <summary>
        /// Partitions <paramref name="dataset"/> into m_k clusters. The whole procedure is
        /// repeated m_trials times and the clustering whose final quality (average similarity
        /// between each item and the centroid it was assigned to) is highest is returned.
        /// </summary>
        /// <param name="dataset">Examples to cluster; must contain at least m_k items.</param>
        /// <returns>
        /// The best clustering over all trials, or null when m_trials is less than 1
        /// (the trial loop then never runs).
        /// </returns>
        /// <remarks>
        /// An ArgumentNullException (dataset == null) or an ArgumentValueException
        /// (dataset.Count &lt; m_k) is handed to Utils.ThrowException before any work is done.
        /// </remarks>
        public ClusteringResult Cluster(IExampleCollection <LblT, SparseVector <double> .ReadOnly> dataset)
        {
            Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
            Utils.ThrowException(dataset.Count < m_k ? new ArgumentValueException("dataset") : null);
            ClusteringResult clustering             = null;
            ClusteringResult best_clustering        = null;
            double           global_best_clust_qual = 0;

            for (int trial = 1; trial <= m_trials; trial++)
            {
                Utils.VerboseLine("*** CLUSTERING TRIAL {0} OF {1} ***", trial, m_trials);
                ArrayList <SparseVector <double> .ReadOnly> centroids = null;
                clustering = new ClusteringResult();
                for (int i = 0; i < m_k; i++)
                {
                    clustering.Roots.Add(new Cluster());
                }
                // select seed items: try 3 random draws of m_k items and keep the draw
                // whose seeds are least similar to each other on average
                double          min_sim = double.MaxValue;
                ArrayList <int> tmp     = new ArrayList <int>(dataset.Count);
                for (int i = 0; i < dataset.Count; i++)
                {
                    tmp.Add(i);
                }
                for (int k = 0; k < 3; k++)
                {
                    ArrayList <SparseVector <double> .ReadOnly> seeds = new ArrayList <SparseVector <double> .ReadOnly>(m_k);
                    tmp.Shuffle(m_rnd);
                    for (int i = 0; i < m_k; i++)
                    {
                        seeds.Add(ModelUtils.ComputeCentroid(new SparseVector <double> .ReadOnly[] { dataset[tmp[i]].Example }, m_centroid_type));
                    }
                    // assess quality of seed items (average over all ordered pairs of distinct seeds)
                    double sim_avg = 0;
                    foreach (SparseVector <double> .ReadOnly seed_1 in seeds)
                    {
                        foreach (SparseVector <double> .ReadOnly seed_2 in seeds)
                        {
                            if (seed_1 != seed_2)
                            {
                                sim_avg += m_similarity.GetSimilarity(seed_1, seed_2);
                            }
                        }
                    }
                    sim_avg /= (double)(m_k * m_k - m_k);
                    // Always accept the first draw. sim_avg is NaN when m_k == 1 (0/0) or when
                    // the similarity measure yields NaN, and "NaN < min_sim" is false; without
                    // the null check centroids would stay null and the main loop would crash.
                    if (centroids == null || sim_avg < min_sim)
                    {
                        min_sim   = sim_avg;
                        centroids = seeds;
                    }
                }
                // main loop
                int    iter            = 0;
                double best_clust_qual = 0;
                double clust_qual;
                while (true)
                {
                    iter++;
                    clust_qual = 0;
                    // assign items to clusters
                    foreach (Cluster cluster in clustering.Roots)
                    {
                        cluster.Items.Clear();
                    }
                    for (int i = 0; i < dataset.Count; i++)
                    {
                        SparseVector <double> .ReadOnly example = dataset[i].Example;
                        double          max_sim    = double.MinValue;
                        // indices of all centroids tied for the highest similarity to this item
                        ArrayList <int> candidates = new ArrayList <int>();
                        for (int j = 0; j < m_k; j++)
                        {
                            SparseVector <double> .ReadOnly centroid = centroids[j];
                            double sim = m_similarity.GetSimilarity(example, centroid);
                            if (sim > max_sim)
                            {
                                max_sim = sim;
                                candidates.Clear();
                                candidates.Add(j);
                            }
                            else if (sim == max_sim)
                            {
                                candidates.Add(j);
                            }
                        }
                        // break ties randomly
                        if (candidates.Count > 1)
                        {
                            candidates.Shuffle(m_rnd);
                        }
                        // candidates is empty only when every similarity is NaN or negative
                        // infinity; such an item is left unassigned and adds nothing to the quality
                        if (candidates.Count > 0)
                        {
                            clustering.Roots[candidates[0]].Items.Add(new Pair <double, int>(1, i));
                            clust_qual += max_sim;
                        }
                    }
                    clust_qual /= (double)dataset.Count;
                    Utils.VerboseLine("*** Iteration {0} ***", iter);
                    Utils.VerboseLine("Quality: {0:0.0000}", clust_qual);
                    // check if done: stop once the quality gain over the previous iteration is at most m_eps
                    if (iter > 1 && clust_qual - best_clust_qual <= m_eps)
                    {
                        break;
                    }
                    best_clust_qual = clust_qual;
                    // compute new centroids
                    for (int i = 0; i < m_k; i++)
                    {
                        centroids[i] = clustering.Roots[i].ComputeCentroid(dataset, m_centroid_type);
                    }
                }
                // keep this trial if it is the first one or beats the best quality so far
                if (trial == 1 || clust_qual > global_best_clust_qual)
                {
                    global_best_clust_qual = clust_qual;
                    best_clustering        = clustering;
                }
            }
            return(best_clustering);
        }
Exemple #6
0
        /// <summary>
        /// Partitions <paramref name="dataset"/> into mK clusters. The whole procedure is
        /// repeated mTrials times and the clustering whose final quality (average similarity
        /// between each item and the centroid it was assigned to) is highest is returned.
        /// </summary>
        /// <param name="dataset">Vectors to cluster; must contain at least mK items.</param>
        /// <returns>
        /// The best clustering over all trials, or null when mTrials is less than 1
        /// (the trial loop then never runs).
        /// </returns>
        /// <remarks>
        /// An ArgumentNullException (dataset == null) or an ArgumentValueException
        /// (dataset.Count &lt; mK) is handed to Utils.ThrowException before any work is done.
        /// </remarks>
        public ClusteringResult Cluster(IUnlabeledExampleCollection <SparseVector <double> > dataset)
        {
            Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
            Utils.ThrowException(dataset.Count < mK ? new ArgumentValueException("dataset") : null);
            ClusteringResult clustering          = null;
            ClusteringResult bestClustering      = null;
            double           globalBestClustQual = 0;

            for (int trial = 1; trial <= mTrials; trial++)
            {
                mLogger.Trace("Cluster", "Clustering trial {0} of {1} ...", trial, mTrials);
                ArrayList <SparseVector <double> > centroids = null;
                clustering = new ClusteringResult();
                for (int i = 0; i < mK; i++)
                {
                    clustering.AddRoot(new Cluster());
                }
                // select seed items: try 3 random draws of mK items and keep the draw
                // whose seeds are least similar to each other on average
                double          minSim = double.MaxValue;
                ArrayList <int> tmp    = new ArrayList <int>(dataset.Count);
                for (int i = 0; i < dataset.Count; i++)
                {
                    tmp.Add(i);
                }
                for (int k = 0; k < 3; k++)
                {
                    ArrayList <SparseVector <double> > seeds = new ArrayList <SparseVector <double> >(mK);
                    tmp.Shuffle(mRnd);
                    for (int i = 0; i < mK; i++)
                    {
                        seeds.Add(ModelUtils.ComputeCentroid(new SparseVector <double>[] { dataset[tmp[i]] }, mCentroidType));
                    }
                    // assess quality of seed items (average over all ordered pairs of distinct seeds)
                    double simAvg = 0;
                    foreach (SparseVector <double> seed1 in seeds)
                    {
                        foreach (SparseVector <double> seed2 in seeds)
                        {
                            if (seed1 != seed2)
                            {
                                simAvg += mSimilarity.GetSimilarity(seed1, seed2);
                            }
                        }
                    }
                    simAvg /= (double)(mK * mK - mK);
                    // Always accept the first draw. simAvg is NaN when mK == 1 (0/0) or when
                    // the similarity measure yields NaN, and "NaN < minSim" is false; without
                    // the null check centroids would stay null and the main loop would crash.
                    if (centroids == null || simAvg < minSim)
                    {
                        minSim    = simAvg;
                        centroids = seeds;
                    }
                }
                // main loop
                int    iter          = 0;
                double bestClustQual = 0;
                double clustQual;
                while (true)
                {
                    iter++;
                    mLogger.Trace("Cluster", "Iteration {0} ...", iter);
                    clustQual = 0;
                    // assign items to clusters
                    foreach (Cluster cluster in clustering.Roots)
                    {
                        cluster.Items.Clear();
                    }
                    for (int i = 0; i < dataset.Count; i++)
                    {
                        SparseVector <double> example = dataset[i];
                        double          maxSim        = double.MinValue;
                        // indices of all centroids tied for the highest similarity to this item
                        ArrayList <int> candidates    = new ArrayList <int>();
                        for (int j = 0; j < mK; j++)
                        {
                            SparseVector <double> centroid = centroids[j];
                            double sim = mSimilarity.GetSimilarity(example, centroid);
                            if (sim > maxSim)
                            {
                                maxSim = sim;
                                candidates.Clear();
                                candidates.Add(j);
                            }
                            else if (sim == maxSim)
                            {
                                candidates.Add(j);
                            }
                        }
                        // break ties randomly
                        if (candidates.Count > 1)
                        {
                            candidates.Shuffle(mRnd);
                        }
                        // NOTE(review): candidates is empty if every similarity is NaN or negative
                        // infinity; candidates[0] is not guarded against that case here.
                        clustering.Roots[candidates[0]].Items.Add(i);
                        clustQual += maxSim;
                    }
                    clustQual /= (double)dataset.Count;
                    mLogger.Trace("Cluster", "Quality: {0:0.0000}", clustQual);
                    // check if done: stop once the quality gain over the previous iteration is at most mEps
                    if (iter > 1 && clustQual - bestClustQual <= mEps)
                    {
                        break;
                    }
                    bestClustQual = clustQual;
                    // compute new centroids
                    for (int i = 0; i < mK; i++)
                    {
                        centroids[i] = clustering.Roots[i].ComputeCentroid(dataset, mCentroidType);
                    }
                }
                // keep this trial if it is the first one or beats the best quality so far
                if (trial == 1 || clustQual > globalBestClustQual)
                {
                    globalBestClustQual = clustQual;
                    bestClustering      = clustering;
                }
            }
            return(bestClustering);
        }