/// <summary>
/// Clusters <paramref name="dataset"/> with the k-means clusterer and then keeps calling
/// FindMaxSim/Update on the centroid similarity matrix until at most one root cluster is left.
/// </summary>
/// <param name="dataset">Examples to cluster; must be non-null and contain at least NumLeaves items.</param>
/// <returns>The clustering result produced by k-means, after the loop has reduced its roots to one.</returns>
public ClusteringResult Cluster(IUnlabeledExampleCollection <SparseVector <double> > dataset)
        {
            Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
            Utils.ThrowException(dataset.Count < NumLeaves ? new ArgumentValueException("dataset") : null);
            ClusteringResult result = mKMeansClustering.Cluster(dataset);
            UnlabeledDataset <SparseVector <double> > leafCentroids = new UnlabeledDataset <SparseVector <double> >();

            // one L2-normalized centroid per k-means cluster
            foreach (Cluster leaf in result.Roots)
            {
                SparseVector <double> leafCentroid = ModelUtils.ComputeCentroid(leaf.Items, dataset, CentroidType.NrmL2);
                leafCentroids.Add(leafCentroid);
                // NOTE(review): the value returned by Trim is stored in a local that is never read again, and
                // the centroid was already added above -- unless Trim modifies its argument in place this
                // call has no effect on leafCentroids. Confirm the intended order.
                leafCentroid     = Trim(leafCentroid, 1000, 0.8);
                leaf.ClusterInfo = 1; // cluster level
            }
            SparseMatrix <double> similarities        = ModelUtils.GetDotProductSimilarity(leafCentroids, /*thresh=*/ 0, /*fullMatrix=*/ false);
            SparseMatrix <double> transposedCentroids = ModelUtils.GetTransposedMatrix(leafCentroids);

            // Update is expected to shrink result.Roots (its body is not visible here); the loop stops at one root
            for (int iteration = 1; result.Roots.Count > 1; iteration++)
            {
                Console.WriteLine("Iteration {0} ...", iteration);
                int first, second;
                FindMaxSim(similarities, out first, out second);
                Update(similarities, transposedCentroids, result.Roots.Count, first, second, result.Roots.Inner, dataset, /*damping=*/ 0.9);
                Console.WriteLine(similarities.ToString("E0.00"));
                Console.WriteLine();
            }
            return result;
        }
示例#2
0
        /// <summary>
        /// Builds a new dataset holding the examples of <paramref name="dataset"/> found at the given indices,
        /// in the order in which the indices are enumerated.
        /// </summary>
        /// <param name="items">Indices into <paramref name="dataset"/>.</param>
        /// <param name="dataset">Collection the examples are taken from.</param>
        /// <returns>A new dataset with one entry per index in <paramref name="items"/>.</returns>
        private UnlabeledDataset <SparseVector <double> > GetDatasetSubset(IEnumerable <int> items, IUnlabeledExampleCollection <SparseVector <double> > dataset)
        {
            UnlabeledDataset <SparseVector <double> > subset = new UnlabeledDataset <SparseVector <double> >();

            foreach (int index in items)
            {
                SparseVector <double> example = dataset[index];
                subset.Add(example);
            }
            return subset;
        }
示例#3
0
        /// <summary>
        /// Copies the examples of a labeled dataset, without their labels, into a new unlabeled dataset.
        /// </summary>
        /// <typeparam name="LblT">Label type of the source dataset.</typeparam>
        /// <typeparam name="ExT">Example type shared by the source and the result.</typeparam>
        /// <param name="dataset">Labeled examples to copy; must not be null.</param>
        /// <returns>A new dataset containing each example of <paramref name="dataset"/> in enumeration order.</returns>
        /// <exception cref="System.ArgumentNullException"><paramref name="dataset"/> is null.</exception>
        public static IUnlabeledExampleCollection <ExT> ConvertToUnlabeledDataset <LblT, ExT>(ILabeledExampleCollection <LblT, ExT> dataset)
        {
            // fail fast with a descriptive exception instead of a NullReferenceException from the loop below
            if (dataset == null)
            {
                throw new System.ArgumentNullException("dataset");
            }
            UnlabeledDataset <ExT> unlabeledDataset = new UnlabeledDataset <ExT>();

            foreach (LabeledExample <LblT, ExT> labeledExample in dataset)
            {
                unlabeledDataset.Add(labeledExample.Example);
            }
            return unlabeledDataset;
        }
示例#4
0
        /// <summary>
        /// Creates an unlabeled dataset from the examples of a labeled one; the labels are dropped.
        /// </summary>
        /// <typeparam name="LblT">Label type of the source dataset.</typeparam>
        /// <typeparam name="ExT">Example type shared by the source and the result.</typeparam>
        /// <param name="dataset">Labeled examples to copy; a null value is reported with an ArgumentNullException.</param>
        /// <returns>A new dataset containing each example of <paramref name="dataset"/> in enumeration order.</returns>
        public static UnlabeledDataset <ExT> ConvertToUnlabeledDataset <LblT, ExT>(ILabeledExampleCollection <LblT, ExT> dataset)
        {
            Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
            UnlabeledDataset <ExT> result = new UnlabeledDataset <ExT>();

            foreach (LabeledExample <LblT, ExT> item in dataset)
            {
                ExT example = item.Example;
                result.Add(example);
            }
            return result;
        }
示例#5
0
 /// <summary>
 /// Restores this object's fields by reading them from <paramref name="reader"/> in a fixed order.
 /// </summary>
 /// <param name="reader">Serializer to read from; a null value is reported with an ArgumentNullException.</param>
 public void Load(BinarySerializer reader)
 {
     Utils.ThrowException(reader == null ? new ArgumentNullException("reader") : null);
     // the following statements throw serialization-related exceptions
     // NOTE(review): the read order presumably has to mirror the matching save routine (not visible
     // here) -- do not reorder these statements without checking it.
     mRnd        = (Random)reader.ReadDotNetObject();
     mEps        = reader.ReadDouble();
     mTrials     = reader.ReadInt();
     mK          = reader.ReadInt();
     mCentroids  = reader.ReadObject <ArrayList <CentroidData> >();
     mDataset    = reader.ReadObject <UnlabeledDataset <SparseVector <double> > >();
     mQualThresh = reader.ReadDouble();
     mTopicId    = reader.ReadLong();
 }
示例#6
0
        /// <summary>
        /// Converts every example of this dataset to <paramref name="newExType"/> and returns the converted
        /// examples wrapped in a new dataset of that example type.
        /// </summary>
        /// <param name="newExType">
        /// Target example type. Supported: SparseVector&lt;double&gt;, SparseVector&lt;double&gt;.ReadOnly,
        /// BinaryVector and BinaryVector.ReadOnly.
        /// </param>
        /// <param name="move">
        /// When true, each source item is reset to its default value right after it is converted and the
        /// source list is cleared afterwards. Not allowed when ExT is a value type.
        /// </param>
        /// <returns>The new dataset holding the converted examples.</returns>
        public IUnlabeledDataset ConvertDataset(Type newExType, bool move)
        {
            Utils.ThrowException(newExType == null ? new ArgumentNullException("newExType") : null);
            Utils.ThrowException(move && typeof(ExT).IsValueType ? new ArgumentValueException("newExType") : null);
            ArrayList <object> converted = new ArrayList <object>(mItems.Count);

            // NOTE(review): with move == true, items converted before a failing ConvertExample call have
            // already been reset, so an exception part-way through leaves this dataset partially emptied.
            for (int idx = 0; idx < mItems.Count; idx++)
            {
                object example = ModelUtils.ConvertExample(mItems[idx], newExType); // throws ArgumentValueException
                converted.Add(example);
                if (move)
                {
                    // ExT is a reference type here (checked above), so default(ExT) is null
                    mItems[idx] = default(ExT);
                }
            }
            if (move)
            {
                mItems.Clear();
            }

            // wrap the converted examples in a dataset of the requested example type
            if (newExType == typeof(SparseVector <double>))
            {
                return new UnlabeledDataset <SparseVector <double> >(converted);
            }
            if (newExType == typeof(SparseVector <double> .ReadOnly))
            {
                return new UnlabeledDataset <SparseVector <double> .ReadOnly>(converted);
            }
            if (newExType == typeof(BinaryVector))
            {
                return new UnlabeledDataset <BinaryVector>(converted);
            }
            if (newExType == typeof(BinaryVector.ReadOnly))
            {
                return new UnlabeledDataset <BinaryVector.ReadOnly>(converted);
            }
            throw new ArgumentNotSupportedException("newExType");
        }
示例#7
0
        /// <summary>
        /// Builds a cluster hierarchy top-down. Starting from a root that holds every example, each cluster
        /// whose quality (GetClusterQuality) is below mMinQuality is split in two with mKMeansClustering and
        /// the two children are queued for the same treatment (breadth-first).
        /// </summary>
        /// <param name="dataset">Examples to cluster; must be non-null, hold at least two examples, and contain no empty vector.</param>
        /// <returns>
        /// A clustering result with a single root. The Items of every cluster are indices into
        /// <paramref name="dataset"/>; ClusterInfo of every cluster is a Pair of (centroid, quality).
        /// </returns>
        public ClusteringResult Cluster(IUnlabeledExampleCollection <SparseVector <double> > dataset)
        {
            Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
            Utils.ThrowException(dataset.Count < 2 ? new ArgumentValueException("dataset") : null);
            ClusteringResult clusteringResult = new ClusteringResult();
            Queue <Cluster>  queue            = new Queue <Cluster>();
            // create root
            Cluster root = new Cluster();

            for (int i = 0; i < dataset.Count; i++)
            {
                Utils.ThrowException(dataset[i].Count == 0 ? new ArgumentValueException("dataset") : null);
                root.Items.Add(i);
            }
            clusteringResult.AddRoot(root);
            // add root to queue
            queue.Enqueue(root);
            while (queue.Count > 0)
            {
                // get next cluster
                Cluster cluster = queue.Dequeue();
                // remember which dataset index each position of the local dataset corresponds to
                List <int> globalIndices = new List <int>();
                foreach (int item in cluster.Items)
                {
                    globalIndices.Add(item);
                }
                // compute cluster quality
                UnlabeledDataset <SparseVector <double> > localDataset = GetDatasetSubset(cluster.Items, dataset);
                SparseVector <double> centroid;
                double quality = GetClusterQuality(localDataset, out centroid);
                cluster.ClusterInfo = new Pair <SparseVector <double>, double>(centroid, quality);
                if (quality < mMinQuality)
                {
                    // split cluster, add children to queue
                    ClusteringResult localResult = mKMeansClustering.Cluster(localDataset);
                    for (int i = 0; i < 2; i++)
                    {
                        // k-means only saw localDataset, so its item indices are positions within
                        // localDataset. Translate them back to indices into the full dataset; otherwise
                        // the next level would pick the wrong examples when it calls GetDatasetSubset.
                        Cluster child = new Cluster();
                        foreach (int localIdx in localResult.Roots[i].Items)
                        {
                            child.Items.Add(globalIndices[localIdx]);
                        }
                        cluster.AddChild(child);
                        child.Parent = cluster;
                        queue.Enqueue(child);
                    }
                }
            }
            return clusteringResult;
        }
示例#8
0
        /// <summary>
        /// Incremental k-means step. On the first call (mDataset is null) the batch is clustered from scratch;
        /// on later calls the batch is appended to the stored dataset, the first <paramref name="numOutdated"/>
        /// stored examples are removed, and the k-means main loop is re-run. Afterwards the number of
        /// centroids is adjusted by comparing the clustering quality with mQualThresh.
        /// </summary>
        /// <param name="numOutdated">
        /// Number of examples to drop from the front of the stored dataset. Must not be negative, must be 0
        /// on the initializing call, and must not exceed the stored dataset size otherwise.
        /// </param>
        /// <param name="batch">New examples to add; must not be null (may be empty).</param>
        /// <returns>
        /// The current clustering. An empty result is returned when the initializing batch is empty or when
        /// no centroid has any item left after the update.
        /// </returns>
        public ClusteringResult Cluster(int numOutdated, IUnlabeledExampleCollection <SparseVector <double> > batch)
        {
            Utils.ThrowException(batch == null ? new ArgumentNullException("batch") : null);
            Utils.ThrowException(numOutdated < 0 ? new ArgumentOutOfRangeException("numOutdated") : null);
            if (mDataset == null)
            {
                // initialize
                mLogger.Trace("Cluster", "Initializing ...");
                // nothing is stored yet, so nothing can be outdated
                Utils.ThrowException(numOutdated > 0 ? new ArgumentOutOfRangeException("numOutdated") : null);
                //Utils.ThrowException(batch.Count == 0 ? new ArgumentValueException("batch") : null);
                if (batch.Count == 0)
                {
                    return(new ClusteringResult());
                }
                // never ask for more clusters than there are examples
                kMeans(batch, Math.Min(mK, batch.Count));
                mDataset = new UnlabeledDataset <SparseVector <double> >(batch);
                // give every initial centroid its own topic identifier
                foreach (CentroidData centroid in mCentroids)
                {
                    centroid.Tag = mTopicId++;
                }
                //OutputState();
            }
            else
            {
                // update clusters
                Utils.ThrowException(numOutdated > mDataset.Count ? new ArgumentOutOfRangeException("numOutdated") : null);
                // nothing to add and nothing to remove: the stored clustering is still valid
                if (numOutdated == 0 && batch.Count == 0)
                {
                    return(GetClusteringResult());
                }
                mLogger.Trace("Cluster", "Updating clusters ...");
                // assign new instances
                // (offs is the current dataset size -- presumably so the new items are numbered after the stored ones)
                double dummy;
                Assign(mCentroids, ModelUtils.GetTransposedMatrix(batch), batch.Count, /*offs=*/ mDataset.Count, out dummy);
                mDataset.AddRange(batch);
                // remove outdated instances
                // (only items with index >= numOutdated are carried over into Items before the centroid is updated)
                foreach (CentroidData centroid in mCentroids)
                {
                    foreach (int item in centroid.CurrentItems)
                    {
                        if (item >= numOutdated)
                        {
                            centroid.Items.Add(item);
                        }
                    }
                    centroid.Update(mDataset);
                    centroid.UpdateCentroidLen();
                }
                mDataset.RemoveRange(0, numOutdated);
                // keep only centroids that still own items and shift their indices down by numOutdated
                // to match the dataset after the removal above
                ArrayList <CentroidData> centroidsNew = new ArrayList <CentroidData>(mCentroids.Count);
                foreach (CentroidData centroid in mCentroids)
                {
                    if (centroid.CurrentItems.Count > 0)
                    {
                        centroidsNew.Add(centroid);
                        Set <int> tmp = new Set <int>();
                        foreach (int idx in centroid.CurrentItems)
                        {
                            tmp.Add(idx - numOutdated);
                        }
                        centroid.CurrentItems.Inner.SetItems(tmp);
                    }
                }
                if (centroidsNew.Count == 0) // reset
                {
                    // back to the uninitialized state; the next call takes the "initialize" branch
                    mCentroids = null;
                    mDataset   = null;
                    return(new ClusteringResult());
                }
                mCentroids = centroidsNew;
                // execute main loop
                kMeansMainLoop(mDataset, mCentroids);
                //OutputState();
            }
            // adjust k
            double minQual; // *** not used at the moment
            int    minQualIdx;
            double qual = GetClustQual(out minQual, out minQualIdx);

            if (qual < mQualThresh)
            {
                // NOTE(review): this loop has no explicit upper bound on the number of centroids; it ends
                // only when the quality reaches mQualThresh.
                while (qual < mQualThresh) // split cluster at minQualIdx
                {
                    mLogger.Trace("Cluster", "Increasing k to {0} ...", mCentroids.Count + 1);
                    // the clone becomes a new centroid with a fresh topic identifier
                    mCentroids.Add(mCentroids[minQualIdx].Clone());
                    mCentroids.Last.Tag = mTopicId++;
                    kMeansMainLoop(mDataset, mCentroids);
                    // if the clone ended up with more items than the centroid it was cloned from, the larger
                    // cluster takes over the existing topic identifier
                    if (mCentroids.Last.CurrentItems.Count > mCentroids[minQualIdx].CurrentItems.Count)
                    {
                        // swap topic identifiers
                        object tmp = mCentroids.Last.Tag;
                        mCentroids.Last.Tag        = mCentroids[minQualIdx].Tag;
                        mCentroids[minQualIdx].Tag = tmp;
                    }
                    qual = GetClustQual(out minQual, out minQualIdx);
                    //OutputState();
                }
            }
            else if (numOutdated > 0)
            {
                // joining is attempted only when examples were removed in this call
                while (qual > mQualThresh && mCentroids.Count > 1) // join clusters
                {
                    mLogger.Trace("Cluster", "Decreasing k to {0} ...", mCentroids.Count - 1);
                    // backup used to roll back if the joined solution is rejected below
                    ArrayList <CentroidData> centroidsCopy = mCentroids.DeepClone();
                    if (mCentroids.Count == 2) // create single cluster
                    {
                        // the single cluster inherits the topic identifier of the larger of the two
                        object topicId = mCentroids[0].CurrentItems.Count > mCentroids[1].CurrentItems.Count ? mCentroids[0].Tag : mCentroids[1].Tag;
                        mCentroids = new ArrayList <CentroidData>();
                        mCentroids.Add(new CentroidData());
                        for (int i = 0; i < mDataset.Count; i++)
                        {
                            mCentroids.Last.Items.Add(i);
                        }
                        mCentroids.Last.Tag = topicId;
                        mCentroids.Last.Update(mDataset);
                        mCentroids.Last.UpdateCentroidLen();
                    }
                    else
                    {
                        // merge the second cluster of the selected pair into the first; the merged cluster
                        // keeps the topic identifier of the larger one
                        int idx1, idx2;
                        GetMostSimilarClusters(out idx1, out idx2);
                        CentroidData c1      = mCentroids[idx1];
                        CentroidData c2      = mCentroids[idx2];
                        object       topicId = c1.CurrentItems.Count > c2.CurrentItems.Count ? c1.Tag : c2.Tag;
                        mCentroids.RemoveAt(idx2);
                        c1.Items.AddRange(c1.CurrentItems);
                        c1.Items.AddRange(c2.CurrentItems);
                        c1.Tag = topicId;
                        c1.Update(mDataset);
                        c1.UpdateCentroidLen();
                        kMeansMainLoop(mDataset, mCentroids);
                    }
                    qual = GetClustQual();
                    if (qual >= mQualThresh)
                    {
                        mLogger.Trace("Cluster", "Accepted solution at k = {0}.", mCentroids.Count);
                    }
                    else
                    {
                        // rejected: restore the backup; qual is now below mQualThresh, so the loop ends
                        mCentroids = centroidsCopy;
                    }
                    //OutputState();
                }
            }
            OutputState();
            return(GetClusteringResult());
        }