Ejemplo n.º 1
0
        // Converts the current centroids into a flat clustering: every entry of mCentroids
        // yields one root cluster that holds the centroid's CurrentItems and carries the
        // centroid's Tag as its ClusterInfo.
        protected ClusteringResult GetClusteringResult()
        {
            ClusteringResult result = new ClusteringResult();

            foreach (CentroidData source in mCentroids)
            {
                // populate the cluster first, then attach it as a root
                Cluster target = new Cluster();
                target.Items.AddRange(source.CurrentItems);
                target.ClusterInfo = source.Tag;
                result.AddRoot(target);
            }
            return result;
        }
Ejemplo n.º 2
0
        // Puts every example of the dataset (indices 0 .. dataset.Count - 1) into one root
        // cluster, and resets mCentroids to a single centroid built from those same items.
        // Returns the clustering that contains only that root.
        private ClusteringResult CreateSingleCluster(IUnlabeledExampleCollection <SparseVector <double> > dataset)
        {
            ClusteringResult result = new ClusteringResult();
            Cluster everything = new Cluster();

            // collect the index of every example
            int idx = 0;
            while (idx < dataset.Count)
            {
                everything.Items.Add(idx);
                idx++;
            }
            result.AddRoot(everything);

            // the only centroid covers exactly the items of the root cluster
            CentroidData single = new CentroidData();
            single.Items.AddRange(everything.Items);
            single.Update(dataset);
            single.UpdateCentroidLen();

            mCentroids = new ArrayList <CentroidData>();
            mCentroids.Add(single);
            return result;
        }
Ejemplo n.º 3
0
        // Builds a cluster hierarchy top-down. The root cluster contains all examples; clusters
        // are processed in FIFO order. For each cluster the quality and centroid are computed and
        // stored in ClusterInfo as a (centroid, quality) pair. A cluster whose quality is below
        // mMinQuality is split with mKMeansClustering and the first two resulting roots become
        // its children, which are then processed in turn.
        //
        // Throws (via Utils.ThrowException) ArgumentNullException when dataset is null, and
        // ArgumentValueException when it has fewer than 2 examples or contains an empty vector.
        public ClusteringResult Cluster(IUnlabeledExampleCollection <SparseVector <double> > dataset)
        {
            Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
            Utils.ThrowException(dataset.Count < 2 ? new ArgumentValueException("dataset") : null);
            ClusteringResult hierarchy = new ClusteringResult();
            Queue <Cluster>  pending   = new Queue <Cluster>();

            // the top-level cluster references every example; empty vectors are rejected
            Cluster top = new Cluster();
            int exampleIdx = 0;
            while (exampleIdx < dataset.Count)
            {
                Utils.ThrowException(dataset[exampleIdx].Count == 0 ? new ArgumentValueException("dataset") : null);
                top.Items.Add(exampleIdx);
                exampleIdx++;
            }
            hierarchy.AddRoot(top);
            pending.Enqueue(top);

            while (pending.Count > 0)
            {
                Cluster current = pending.Dequeue();

                // evaluate the cluster on the subset of the dataset it covers
                UnlabeledDataset <SparseVector <double> > members = GetDatasetSubset(current.Items, dataset);
                SparseVector <double> center;
                double score = GetClusterQuality(members, out center);
                current.ClusterInfo = new Pair <SparseVector <double>, double>(center, score);

                // written as a negated "<" so that a NaN score is treated as "do not split"
                if (!(score < mMinQuality))
                {
                    continue;
                }

                // quality too low: split the cluster and schedule both halves
                ClusteringResult split = mKMeansClustering.Cluster(members);
                for (int childIdx = 0; childIdx < 2; childIdx++)
                {
                    Cluster child = split.Roots[childIdx];
                    current.AddChild(child);
                    child.Parent = current;
                    pending.Enqueue(child);
                }
            }
            return hierarchy;
        }
Ejemplo n.º 4
0
        // Partitions the dataset into mK clusters. The whole procedure is repeated mTrials times
        // and the clustering of the trial with the highest final quality is returned (null when
        // mTrials is smaller than 1, because no trial runs).
        //
        // Each trial:
        //   1. draws 3 random seed sets of mK items and keeps the set with the lowest average
        //      pairwise dot-product similarity;
        //   2. repeatedly assigns every item to its most similar centroid (ties are broken at
        //      random) and recomputes the centroids, until the average similarity of the items
        //      to their centroids improves by no more than mEps.
        //
        // Throws (via Utils.ThrowException) ArgumentNullException when dataset is null and
        // ArgumentValueException when it has fewer than mK examples.
        public ClusteringResult Cluster(IUnlabeledExampleCollection <SparseVector <double> > dataset)
        {
            Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
            Utils.ThrowException(dataset.Count < mK ? new ArgumentValueException("dataset") : null);
            ClusteringResult clustering          = null;
            double           globalBestClustQual = 0;
            // The transposed dataset matrix depends only on the dataset and is already shared by
            // all iterations of a trial, so it is built once (on first use) and reused by every
            // trial instead of being rebuilt per trial.
            SparseMatrix <double> dsMtx = null;

            for (int trial = 1; trial <= mTrials; trial++)
            {
                mLogger.Info("Cluster", "Clustering trial {0} of {1} ...", trial, mTrials);
                ArrayList <CentroidData> centroids = new ArrayList <CentroidData>(mK);
                ArrayList <int>          bestSeeds = null;
                for (int i = 0; i < mK; i++)
                {
                    centroids.Add(new CentroidData());
                }
                // select seed items
                double          minSim = double.MaxValue;
                ArrayList <int> tmp    = new ArrayList <int>(dataset.Count);
                for (int i = 0; i < dataset.Count; i++)
                {
                    tmp.Add(i);
                }
                for (int k = 0; k < 3; k++)
                {
                    ArrayList <SparseVector <double> > seeds = new ArrayList <SparseVector <double> >(mK);
                    tmp.Shuffle(mRnd);
                    for (int i = 0; i < mK; i++)
                    {
                        seeds.Add(dataset[tmp[i]]);
                    }
                    // assess quality of seed items (average similarity over all ordered pairs)
                    double simAvg = 0;
                    foreach (SparseVector <double> seed1 in seeds)
                    {
                        foreach (SparseVector <double> seed2 in seeds)
                        {
                            if (seed1 != seed2)
                            {
                                simAvg += DotProductSimilarity.Instance.GetSimilarity(seed1, seed2);
                            }
                        }
                    }
                    simAvg /= (double)(mK * mK - mK);
                    // simAvg is NaN when mK == 1 (0 / 0) or when a similarity is NaN, and NaN never
                    // compares as smaller. Always accept the first draw and replace a NaN minimum,
                    // so that bestSeeds is guaranteed to be set before it is used below.
                    if (bestSeeds == null || double.IsNaN(minSim) || simAvg < minSim)
                    {
                        minSim    = simAvg;
                        bestSeeds = new ArrayList <int>(mK);
                        for (int i = 0; i < mK; i++)
                        {
                            bestSeeds.Add(tmp[i]);
                        }
                    }
                }
                // initialize every centroid from its single seed item
                for (int i = 0; i < mK; i++)
                {
                    centroids[i].Items.Add(bestSeeds[i]);
                    centroids[i].Update(dataset);
                    centroids[i].UpdateCentroidLen();
                }
                double[][] dotProd = new double[mK][];
                if (dsMtx == null)
                {
                    dsMtx = ModelUtils.GetTransposedMatrix(dataset);
                }
                // main loop
                int    iter          = 0;
                double bestClustQual = 0;
                double clustQual;
                while (true)
                {
                    iter++;
                    mLogger.Info("Cluster", "Iteration {0} ...", iter);
                    clustQual = 0;
                    // compute the similarity of every item to every centroid
                    int j = 0;
                    foreach (CentroidData cen in centroids)
                    {
                        SparseVector <double> cenVec = cen.GetSparseVector();
                        dotProd[j] = ModelUtils.GetDotProductSimilarity(dsMtx, dataset.Count, cenVec);
                        j++;
                    }
                    // assign items to clusters
                    for (int instIdx = 0; instIdx < dataset.Count; instIdx++)
                    {
                        double          maxSim     = double.MinValue;
                        ArrayList <int> candidates = new ArrayList <int>();
                        for (int cenIdx = 0; cenIdx < mK; cenIdx++)
                        {
                            double sim = dotProd[cenIdx][instIdx];
                            if (sim > maxSim)
                            {
                                maxSim = sim;
                                candidates.Clear();
                                candidates.Add(cenIdx);
                            }
                            else if (sim == maxSim)
                            {
                                candidates.Add(cenIdx);
                            }
                        }
                        // several equally similar centroids: pick one at random
                        if (candidates.Count > 1)
                        {
                            candidates.Shuffle(mRnd);
                        }
                        // candidates is empty when no similarity compared greater than or equal
                        // to the running maximum (e.g. all NaN); such an item is left unassigned
                        if (candidates.Count > 0)
                        {
                            centroids[candidates[0]].Items.Add(instIdx);
                            clustQual += maxSim;
                        }
                    }
                    clustQual /= (double)dataset.Count;
                    mLogger.Info("Cluster", "Quality: {0:0.0000}", clustQual);
                    // check if done: stop once the quality no longer improves by more than mEps
                    if (iter > 1 && clustQual - bestClustQual <= mEps)
                    {
                        break;
                    }
                    bestClustQual = clustQual;
                    // compute new centroids
                    for (int i = 0; i < mK; i++)
                    {
                        centroids[i].Update(dataset);
                        centroids[i].UpdateCentroidLen();
                    }
                }
                if (trial == 1 || clustQual > globalBestClustQual)
                {
                    globalBestClustQual = clustQual;
                    // save the result: one root cluster per centroid, holding the items assigned
                    // to it in the final iteration
                    clustering = new ClusteringResult();
                    for (int i = 0; i < mK; i++)
                    {
                        clustering.AddRoot(new Cluster());
                        clustering.Roots.Last.Items.AddRange(centroids[i].Items);
                    }
                }
            }
            return(clustering);
        }
Ejemplo n.º 5
0
        // Partitions the dataset into mK clusters. The whole procedure is repeated mTrials times
        // and the clustering of the trial with the highest final quality is returned (null when
        // mTrials is smaller than 1, because no trial runs).
        //
        // Each trial:
        //   1. draws 3 random seed sets of mK items, turns every seed into a centroid with
        //      ModelUtils.ComputeCentroid, and keeps the set with the lowest average pairwise
        //      similarity (as measured by mSimilarity);
        //   2. repeatedly assigns every item to its most similar centroid (ties are broken at
        //      random) and recomputes the centroids, until the average similarity of the items
        //      to their centroids improves by no more than mEps.
        //
        // Throws (via Utils.ThrowException) ArgumentNullException when dataset is null and
        // ArgumentValueException when it has fewer than mK examples.
        public ClusteringResult Cluster(IUnlabeledExampleCollection <SparseVector <double> > dataset)
        {
            Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
            Utils.ThrowException(dataset.Count < mK ? new ArgumentValueException("dataset") : null);
            ClusteringResult clustering          = null;
            ClusteringResult bestClustering      = null;
            double           globalBestClustQual = 0;

            for (int trial = 1; trial <= mTrials; trial++)
            {
                mLogger.Trace("Cluster", "Clustering trial {0} of {1} ...", trial, mTrials);
                ArrayList <SparseVector <double> > centroids = null;
                clustering = new ClusteringResult();
                for (int i = 0; i < mK; i++)
                {
                    clustering.AddRoot(new Cluster());
                }
                // select seed items
                double          minSim = double.MaxValue;
                ArrayList <int> tmp    = new ArrayList <int>(dataset.Count);
                for (int i = 0; i < dataset.Count; i++)
                {
                    tmp.Add(i);
                }
                for (int k = 0; k < 3; k++)
                {
                    ArrayList <SparseVector <double> > seeds = new ArrayList <SparseVector <double> >(mK);
                    tmp.Shuffle(mRnd);
                    for (int i = 0; i < mK; i++)
                    {
                        seeds.Add(ModelUtils.ComputeCentroid(new SparseVector <double>[] { dataset[tmp[i]] }, mCentroidType));
                    }
                    // assess quality of seed items (average similarity over all ordered pairs)
                    double simAvg = 0;
                    foreach (SparseVector <double> seed1 in seeds)
                    {
                        foreach (SparseVector <double> seed2 in seeds)
                        {
                            if (seed1 != seed2)
                            {
                                simAvg += mSimilarity.GetSimilarity(seed1, seed2);
                            }
                        }
                    }
                    simAvg /= (double)(mK * mK - mK);
                    // simAvg is NaN when mK == 1 (0 / 0) or when a similarity is NaN, and NaN never
                    // compares as smaller. Always accept the first draw and replace a NaN minimum,
                    // so that centroids is guaranteed to be set before the main loop reads it.
                    if (centroids == null || double.IsNaN(minSim) || simAvg < minSim)
                    {
                        minSim    = simAvg;
                        centroids = seeds;
                    }
                }
                // main loop
                int    iter          = 0;
                double bestClustQual = 0;
                double clustQual;
                while (true)
                {
                    iter++;
                    mLogger.Trace("Cluster", "Iteration {0} ...", iter);
                    clustQual = 0;
                    // assign items to clusters (starting from empty clusters)
                    foreach (Cluster cluster in clustering.Roots)
                    {
                        cluster.Items.Clear();
                    }
                    for (int i = 0; i < dataset.Count; i++)
                    {
                        SparseVector <double> example = dataset[i];
                        double          maxSim        = double.MinValue;
                        ArrayList <int> candidates    = new ArrayList <int>();
                        for (int j = 0; j < mK; j++)
                        {
                            SparseVector <double> centroid = centroids[j];
                            double sim = mSimilarity.GetSimilarity(example, centroid);
                            if (sim > maxSim)
                            {
                                maxSim = sim;
                                candidates.Clear();
                                candidates.Add(j);
                            }
                            else if (sim == maxSim)
                            {
                                candidates.Add(j);
                            }
                        }
                        // several equally similar centroids: pick one at random
                        if (candidates.Count > 1)
                        {
                            candidates.Shuffle(mRnd);
                        }
                        // candidates is empty when no similarity compared greater than or equal
                        // to the running maximum (e.g. all NaN); indexing it would throw, so such
                        // an item is left unassigned and does not contribute to the quality
                        if (candidates.Count > 0)
                        {
                            clustering.Roots[candidates[0]].Items.Add(i);
                            clustQual += maxSim;
                        }
                    }
                    clustQual /= (double)dataset.Count;
                    mLogger.Trace("Cluster", "Quality: {0:0.0000}", clustQual);
                    // check if done: stop once the quality no longer improves by more than mEps
                    if (iter > 1 && clustQual - bestClustQual <= mEps)
                    {
                        break;
                    }
                    bestClustQual = clustQual;
                    // compute new centroids
                    for (int i = 0; i < mK; i++)
                    {
                        centroids[i] = clustering.Roots[i].ComputeCentroid(dataset, mCentroidType);
                    }
                }
                // keep the clustering of the best trial seen so far
                if (trial == 1 || clustQual > globalBestClustQual)
                {
                    globalBestClustQual = clustQual;
                    bestClustering      = clustering;
                }
            }
            return(bestClustering);
        }