public ClusteringResult Cluster(IUnlabeledExampleCollection <SparseVector <double> > dataset)
        {
            Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
            Utils.ThrowException(dataset.Count < NumLeaves ? new ArgumentValueException("dataset") : null);
            ClusteringResult clusters = mKMeansClustering.Cluster(dataset);
            UnlabeledDataset <SparseVector <double> > centroids = new UnlabeledDataset <SparseVector <double> >();

            foreach (Cluster cluster in clusters.Roots)
            {
                SparseVector <double> centroid = ModelUtils.ComputeCentroid(cluster.Items, dataset, CentroidType.NrmL2);
                centroids.Add(centroid);
                centroid            = Trim(centroid, 1000, 0.8);
                cluster.ClusterInfo = 1; // cluster level
            }
            SparseMatrix <double> simMtx     = ModelUtils.GetDotProductSimilarity(centroids, /*thresh=*/ 0, /*fullMatrix=*/ false);
            SparseMatrix <double> clustMtxTr = ModelUtils.GetTransposedMatrix(centroids);
            int iter = 1;

            while (clusters.Roots.Count > 1)
            {
                Console.WriteLine("Iteration {0} ...", iter++);
                int idx1, idx2;
                FindMaxSim(simMtx, out idx1, out idx2);
                Update(simMtx, clustMtxTr, clusters.Roots.Count, idx1, idx2, clusters.Roots.Inner, dataset, /*damping=*/ 0.9);
                Console.WriteLine(simMtx.ToString("E0.00"));
                Console.WriteLine();
            }
            return(clusters);
        }
Beispiel #2
0
 public void Train(ILabeledExampleCollection <LblT, SparseVector <double> > dataset)
 {
     Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
     Utils.ThrowException(dataset.Count == 0 ? new ArgumentValueException("dataset") : null);
     mDatasetMtx = ModelUtils.GetTransposedMatrix(ModelUtils.ConvertToUnlabeledDataset(dataset));
     mLabels     = new ArrayList <LblT>();
     foreach (LabeledExample <LblT, SparseVector <double> > labeledExample in dataset)
     {
         mLabels.Add(labeledExample.Label);
     }
 }
Beispiel #3
0
        internal void kMeansMainLoop(IUnlabeledExampleCollection <SparseVector <double> > dataset, ArrayList <CentroidData> centroids, out double clustQual)
        {
            double[][]            dotProd = new double[centroids.Count][];
            SparseMatrix <double> dataMtx = ModelUtils.GetTransposedMatrix(dataset);
            int    iter          = 0;
            double bestClustQual = 0;

            while (true)
            {
                iter++;
                mLogger.Trace("Cluster", "Iteration {0} ...", iter);
                // assign items to clusters
                Assign(centroids, dataMtx, dataset.Count, /*offs=*/ 0, out clustQual);
                mLogger.Trace("Cluster", "Quality: {0:0.0000}", clustQual);
                // update centroids
                Update(dataset, centroids);
                // check if done
                if (iter > 1 && clustQual - bestClustQual <= mEps)
                {
                    break;
                }
                bestClustQual = clustQual;
            }
        }
Beispiel #4
0
        public ClusteringResult Cluster(int numOutdated, IUnlabeledExampleCollection <SparseVector <double> > batch)
        {
            Utils.ThrowException(batch == null ? new ArgumentNullException("batch") : null);
            Utils.ThrowException(numOutdated < 0 ? new ArgumentOutOfRangeException("numOutdated") : null);
            if (mDataset == null)
            {
                // initialize
                mLogger.Trace("Cluster", "Initializing ...");
                Utils.ThrowException(numOutdated > 0 ? new ArgumentOutOfRangeException("numOutdated") : null);
                //Utils.ThrowException(batch.Count == 0 ? new ArgumentValueException("batch") : null);
                if (batch.Count == 0)
                {
                    return(new ClusteringResult());
                }
                kMeans(batch, Math.Min(mK, batch.Count));
                mDataset = new UnlabeledDataset <SparseVector <double> >(batch);
                foreach (CentroidData centroid in mCentroids)
                {
                    centroid.Tag = mTopicId++;
                }
                //OutputState();
            }
            else
            {
                // update clusters
                Utils.ThrowException(numOutdated > mDataset.Count ? new ArgumentOutOfRangeException("numOutdated") : null);
                if (numOutdated == 0 && batch.Count == 0)
                {
                    return(GetClusteringResult());
                }
                mLogger.Trace("Cluster", "Updating clusters ...");
                // assign new instances
                double dummy;
                Assign(mCentroids, ModelUtils.GetTransposedMatrix(batch), batch.Count, /*offs=*/ mDataset.Count, out dummy);
                mDataset.AddRange(batch);
                // remove outdated instances
                foreach (CentroidData centroid in mCentroids)
                {
                    foreach (int item in centroid.CurrentItems)
                    {
                        if (item >= numOutdated)
                        {
                            centroid.Items.Add(item);
                        }
                    }
                    centroid.Update(mDataset);
                    centroid.UpdateCentroidLen();
                }
                mDataset.RemoveRange(0, numOutdated);
                ArrayList <CentroidData> centroidsNew = new ArrayList <CentroidData>(mCentroids.Count);
                foreach (CentroidData centroid in mCentroids)
                {
                    if (centroid.CurrentItems.Count > 0)
                    {
                        centroidsNew.Add(centroid);
                        Set <int> tmp = new Set <int>();
                        foreach (int idx in centroid.CurrentItems)
                        {
                            tmp.Add(idx - numOutdated);
                        }
                        centroid.CurrentItems.Inner.SetItems(tmp);
                    }
                }
                if (centroidsNew.Count == 0) // reset
                {
                    mCentroids = null;
                    mDataset   = null;
                    return(new ClusteringResult());
                }
                mCentroids = centroidsNew;
                // execute main loop
                kMeansMainLoop(mDataset, mCentroids);
                //OutputState();
            }
            // adjust k
            double minQual; // *** not used at the moment
            int    minQualIdx;
            double qual = GetClustQual(out minQual, out minQualIdx);

            if (qual < mQualThresh)
            {
                while (qual < mQualThresh) // split cluster at minQualIdx
                {
                    mLogger.Trace("Cluster", "Increasing k to {0} ...", mCentroids.Count + 1);
                    mCentroids.Add(mCentroids[minQualIdx].Clone());
                    mCentroids.Last.Tag = mTopicId++;
                    kMeansMainLoop(mDataset, mCentroids);
                    if (mCentroids.Last.CurrentItems.Count > mCentroids[minQualIdx].CurrentItems.Count)
                    {
                        // swap topic identifiers
                        object tmp = mCentroids.Last.Tag;
                        mCentroids.Last.Tag        = mCentroids[minQualIdx].Tag;
                        mCentroids[minQualIdx].Tag = tmp;
                    }
                    qual = GetClustQual(out minQual, out minQualIdx);
                    //OutputState();
                }
            }
            else if (numOutdated > 0)
            {
                while (qual > mQualThresh && mCentroids.Count > 1) // join clusters
                {
                    mLogger.Trace("Cluster", "Decreasing k to {0} ...", mCentroids.Count - 1);
                    ArrayList <CentroidData> centroidsCopy = mCentroids.DeepClone();
                    if (mCentroids.Count == 2) // create single cluster
                    {
                        object topicId = mCentroids[0].CurrentItems.Count > mCentroids[1].CurrentItems.Count ? mCentroids[0].Tag : mCentroids[1].Tag;
                        mCentroids = new ArrayList <CentroidData>();
                        mCentroids.Add(new CentroidData());
                        for (int i = 0; i < mDataset.Count; i++)
                        {
                            mCentroids.Last.Items.Add(i);
                        }
                        mCentroids.Last.Tag = topicId;
                        mCentroids.Last.Update(mDataset);
                        mCentroids.Last.UpdateCentroidLen();
                    }
                    else
                    {
                        int idx1, idx2;
                        GetMostSimilarClusters(out idx1, out idx2);
                        CentroidData c1      = mCentroids[idx1];
                        CentroidData c2      = mCentroids[idx2];
                        object       topicId = c1.CurrentItems.Count > c2.CurrentItems.Count ? c1.Tag : c2.Tag;
                        mCentroids.RemoveAt(idx2);
                        c1.Items.AddRange(c1.CurrentItems);
                        c1.Items.AddRange(c2.CurrentItems);
                        c1.Tag = topicId;
                        c1.Update(mDataset);
                        c1.UpdateCentroidLen();
                        kMeansMainLoop(mDataset, mCentroids);
                    }
                    qual = GetClustQual();
                    if (qual >= mQualThresh)
                    {
                        mLogger.Trace("Cluster", "Accepted solution at k = {0}.", mCentroids.Count);
                    }
                    else
                    {
                        mCentroids = centroidsCopy;
                    }
                    //OutputState();
                }
            }
            OutputState();
            return(GetClusteringResult());
        }
Beispiel #5
0
        public void Train(ILabeledExampleCollection <LblT, SparseVector <double> > dataset)
        {
            Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
            Utils.ThrowException(dataset.Count == 0 ? new ArgumentValueException("dataset") : null);
            Dictionary <LblT, CentroidData> centroids = new Dictionary <LblT, CentroidData>(mLblCmp);

            foreach (LabeledExample <LblT, SparseVector <double> > labeledExample in dataset)
            {
                if (!centroids.ContainsKey(labeledExample.Label))
                {
                    CentroidData centroidData = new CentroidData();
                    centroidData.AddToSum(labeledExample.Example);
                    centroids.Add(labeledExample.Label, centroidData);
                }
                else
                {
                    CentroidData centroidData = centroids[labeledExample.Label];
                    centroidData.AddToSum(labeledExample.Example);
                }
            }
            foreach (CentroidData cenData in centroids.Values)
            {
                cenData.UpdateCentroidLen();
            }
            double learnRate = 1;

            double[][]            dotProd = null;
            SparseMatrix <double> dsMtx   = null;

            if (mIterations > 0)
            {
                dotProd = new double[centroids.Count][];
                dsMtx   = ModelUtils.GetTransposedMatrix(ModelUtils.ConvertToUnlabeledDataset(dataset));
            }
            for (int iter = 1; iter <= mIterations; iter++)
            {
                mLogger.Info("Train", "Iteration {0} / {1} ...", iter, mIterations);
                // compute dot products
                mLogger.Info("Train", "Computing dot products ...");
                int j = 0;
                foreach (KeyValuePair <LblT, CentroidData> labeledCentroid in centroids)
                {
                    mLogger.ProgressNormal(Logger.Level.Info, /*sender=*/ this, "Train", "Centroid {0} / {1} ...", j + 1, centroids.Count);
                    SparseVector <double> cenVec = labeledCentroid.Value.GetSparseVector();
                    dotProd[j] = ModelUtils.GetDotProductSimilarity(dsMtx, dataset.Count, cenVec);
                    j++;
                }
                // classify training examples
                mLogger.Info("Train", "Classifying training examples ...");
                int errCount = 0;
                for (int instIdx = 0; instIdx < dataset.Count; instIdx++)
                {
                    mLogger.ProgressFast(Logger.Level.Info, /*sender=*/ this, "Train", "Example {0} / {1} ...", instIdx + 1, dataset.Count);
                    double       maxSim           = double.MinValue;
                    CentroidData assignedCentroid = null;
                    CentroidData actualCentroid   = null;
                    LabeledExample <LblT, SparseVector <double> > labeledExample = dataset[instIdx];
                    SparseVector <double> vec = labeledExample.Example;
                    int cenIdx = 0;
                    foreach (KeyValuePair <LblT, CentroidData> labeledCentroid in centroids)
                    {
                        double sim = dotProd[cenIdx][instIdx];
                        if (sim > maxSim)
                        {
                            maxSim = sim; assignedCentroid = labeledCentroid.Value;
                        }
                        if (labeledCentroid.Key.Equals(labeledExample.Label))
                        {
                            actualCentroid = labeledCentroid.Value;
                        }
                        cenIdx++;
                    }
                    if (assignedCentroid != actualCentroid)
                    {
                        assignedCentroid.AddToDiff(-learnRate, vec);
                        actualCentroid.AddToDiff(learnRate, vec);
                        errCount++;
                    }
                }
                mLogger.Info("Train", "Training set error rate: {0:0.00}%", (double)errCount / (double)dataset.Count * 100.0);
                // update centroids
                int k = 0;
                foreach (CentroidData centroidData in centroids.Values)
                {
                    mLogger.ProgressNormal(Logger.Level.Info, /*sender=*/ this, "Train", "Centroid {0} / {1} ...", ++k, centroids.Count);
                    centroidData.Update(mPositiveValuesOnly);
                    centroidData.UpdateCentroidLen();
                }
                learnRate *= mDamping;
            }
            mCentroidMtxTr = new SparseMatrix <double>();
            mLabels        = new ArrayList <LblT>();
            int rowIdx = 0;

            foreach (KeyValuePair <LblT, CentroidData> labeledCentroid in centroids)
            {
                mCentroidMtxTr[rowIdx++] = labeledCentroid.Value.GetSparseVector();
                mLabels.Add(labeledCentroid.Key);
            }
            mCentroidMtxTr = mCentroidMtxTr.GetTransposedCopy();
        }
Beispiel #6
0
        public ClusteringResult Cluster(IUnlabeledExampleCollection <SparseVector <double> > dataset)
        {
            Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
            Utils.ThrowException(dataset.Count < mK ? new ArgumentValueException("dataset") : null);
            ClusteringResult clustering          = null;
            double           globalBestClustQual = 0;

            for (int trial = 1; trial <= mTrials; trial++)
            {
                mLogger.Info("Cluster", "Clustering trial {0} of {1} ...", trial, mTrials);
                ArrayList <CentroidData> centroids = new ArrayList <CentroidData>(mK);
                ArrayList <int>          bestSeeds = null;
                for (int i = 0; i < mK; i++)
                {
                    centroids.Add(new CentroidData());
                }
                // select seed items
                double          minSim = double.MaxValue;
                ArrayList <int> tmp    = new ArrayList <int>(dataset.Count);
                for (int i = 0; i < dataset.Count; i++)
                {
                    tmp.Add(i);
                }
                for (int k = 0; k < 3; k++)
                {
                    ArrayList <SparseVector <double> > seeds = new ArrayList <SparseVector <double> >(mK);
                    tmp.Shuffle(mRnd);
                    for (int i = 0; i < mK; i++)
                    {
                        seeds.Add(dataset[tmp[i]]);
                    }
                    // assess quality of seed items
                    double simAvg = 0;
                    foreach (SparseVector <double> seed1 in seeds)
                    {
                        foreach (SparseVector <double> seed2 in seeds)
                        {
                            if (seed1 != seed2)
                            {
                                simAvg += DotProductSimilarity.Instance.GetSimilarity(seed1, seed2);
                            }
                        }
                    }
                    simAvg /= (double)(mK * mK - mK);
                    //Console.WriteLine(simAvg);
                    if (simAvg < minSim)
                    {
                        minSim    = simAvg;
                        bestSeeds = new ArrayList <int>(mK);
                        for (int i = 0; i < mK; i++)
                        {
                            bestSeeds.Add(tmp[i]);
                        }
                    }
                }
                for (int i = 0; i < mK; i++)
                {
                    centroids[i].Items.Add(bestSeeds[i]);
                    centroids[i].Update(dataset);
                    centroids[i].UpdateCentroidLen();
                }
                double[][]            dotProd = new double[mK][];
                SparseMatrix <double> dsMtx   = ModelUtils.GetTransposedMatrix(dataset);
                // main loop
                int    iter          = 0;
                double bestClustQual = 0;
                double clustQual;
                while (true)
                {
                    iter++;
                    mLogger.Info("Cluster", "Iteration {0} ...", iter);
                    clustQual = 0;
                    // assign items to clusters
                    int j = 0;
                    foreach (CentroidData cen in centroids)
                    {
                        SparseVector <double> cenVec = cen.GetSparseVector();
                        dotProd[j] = ModelUtils.GetDotProductSimilarity(dsMtx, dataset.Count, cenVec);
                        j++;
                    }
                    for (int instIdx = 0; instIdx < dataset.Count; instIdx++)
                    {
                        double          maxSim     = double.MinValue;
                        ArrayList <int> candidates = new ArrayList <int>();
                        for (int cenIdx = 0; cenIdx < mK; cenIdx++)
                        {
                            double sim = dotProd[cenIdx][instIdx];
                            if (sim > maxSim)
                            {
                                maxSim = sim;
                                candidates.Clear();
                                candidates.Add(cenIdx);
                            }
                            else if (sim == maxSim)
                            {
                                candidates.Add(cenIdx);
                            }
                        }
                        if (candidates.Count > 1)
                        {
                            candidates.Shuffle(mRnd);
                        }
                        if (candidates.Count > 0) // *** is this always true?
                        {
                            centroids[candidates[0]].Items.Add(instIdx);
                            clustQual += maxSim;
                        }
                    }
                    clustQual /= (double)dataset.Count;
                    mLogger.Info("Cluster", "Quality: {0:0.0000}", clustQual);
                    // check if done
                    if (iter > 1 && clustQual - bestClustQual <= mEps)
                    {
                        break;
                    }
                    bestClustQual = clustQual;
                    // compute new centroids
                    for (int i = 0; i < mK; i++)
                    {
                        centroids[i].Update(dataset);
                        centroids[i].UpdateCentroidLen();
                    }
                }
                if (trial == 1 || clustQual > globalBestClustQual)
                {
                    globalBestClustQual = clustQual;
                    // save the result
                    clustering = new ClusteringResult();
                    for (int i = 0; i < mK; i++)
                    {
                        clustering.AddRoot(new Cluster());
                        clustering.Roots.Last.Items.AddRange(centroids[i].Items);
                    }
                }
            }
            return(clustering);
        }