示例#1
0
        /// <summary>
        /// Builds a sparse matrix of pairwise dot-product similarities between the vectors of
        /// <paramref name="dataset"/>, keeping only the values strictly greater than <paramref name="thresh"/>.
        /// </summary>
        /// <param name="dataset">Vectors to compare; must not be null.</param>
        /// <param name="thresh">Non-negative cut-off; values that are not greater than it are dropped.</param>
        /// <param name="fullMatrix">If false, only the upper (right) triangular part is computed.</param>
        /// <returns>Sparse matrix in which row i holds the retained similarities of the i-th vector.</returns>
        public static SparseMatrix <double> GetDotProductSimilarity(IUnlabeledExampleCollection <SparseVector <double> > dataset, double thresh, bool fullMatrix)
        {
            Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
            Utils.ThrowException(thresh < 0 ? new ArgumentOutOfRangeException("thresh") : null);
            SparseMatrix <double> transposed = GetTransposedMatrix(dataset);

            double[] rowBuffer = new double[dataset.Count];
            SparseMatrix <double> result = new SparseMatrix <double>();
            int row = 0;

            foreach (SparseVector <double> vec in dataset)
            {
                // for the triangular variant the computation starts at the diagonal element of the current row
                int startIdx = fullMatrix ? 0 : row;
                GetDotProductSimilarity(vec, rowBuffer, transposed, startIdx);
                for (int col = 0; col < rowBuffer.Length; col++)
                {
                    double value = rowBuffer[col];
                    rowBuffer[col] = 0; // clear the slot so that the buffer can be reused for the next row
                    if (value > thresh)
                    {
                        if (result.ContainsRowAt(row))
                        {
                            // row already exists: append the new entry at its end
                            result[row].InnerIdx.Add(col);
                            result[row].InnerDat.Add(value);
                        }
                        else
                        {
                            // first retained entry of this row: create the row
                            result[row] = new SparseVector <double>(new IdxDat <double>[] { new IdxDat <double>(col, value) });
                        }
                    }
                }
                row++;
            }
            return(result);
        }
示例#2
0
        // *** Dataset utilities ***

        /// <summary>
        /// Transposes <paramref name="dataset"/>: the value stored at index c of the r-th vector ends up
        /// in row c of the result under index r.
        /// </summary>
        /// <param name="dataset">Vectors to transpose; must not be null.</param>
        /// <returns>The transposed sparse matrix (empty if the dataset yields no entries).</returns>
        public static SparseMatrix <double> GetTransposedMatrix(IUnlabeledExampleCollection <SparseVector <double> > dataset)
        {
            Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
            SparseMatrix <double> transposed = new SparseMatrix <double>();
            int sourceRow = 0;

            foreach (SparseVector <double> vec in dataset)
            {
                foreach (IdxDat <double> entry in vec)
                {
                    if (transposed.ContainsRowAt(entry.Idx))
                    {
                        // target row exists: append (source rows are visited in increasing order)
                        transposed[entry.Idx].InnerIdx.Add(sourceRow);
                        transposed[entry.Idx].InnerDat.Add(entry.Dat);
                    }
                    else
                    {
                        // first entry for this target row
                        transposed[entry.Idx] = new SparseVector <double>(new IdxDat <double>[] { new IdxDat <double>(sourceRow, entry.Dat) });
                    }
                }
                sourceRow++;
            }
            return(transposed);
        }
        /// <summary>
        /// Runs the k-means clusterer on <paramref name="dataset"/> and then keeps merging the resulting
        /// root clusters (via FindMaxSim and Update) until a single root remains.
        /// </summary>
        /// <param name="dataset">Examples to cluster; must not be null and must contain at least NumLeaves items.</param>
        /// <returns>The clustering result whose roots were merged into a hierarchy.</returns>
        public ClusteringResult Cluster(IUnlabeledExampleCollection <SparseVector <double> > dataset)
        {
            Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
            Utils.ThrowException(dataset.Count < NumLeaves ? new ArgumentValueException("dataset") : null);
            ClusteringResult result = mKMeansClustering.Cluster(dataset);
            UnlabeledDataset <SparseVector <double> > leafCentroids = new UnlabeledDataset <SparseVector <double> >();

            foreach (Cluster leaf in result.Roots)
            {
                SparseVector <double> leafCentroid = ModelUtils.ComputeCentroid(leaf.Items, dataset, CentroidType.NrmL2);
                leafCentroids.Add(leafCentroid);
                // NOTE(review): the value returned by Trim is assigned only after the centroid has been added to
                // leafCentroids; unless Trim modifies its argument in place, the trimmed vector is unused - confirm intent
                leafCentroid     = Trim(leafCentroid, 1000, 0.8);
                leaf.ClusterInfo = 1; // cluster level
            }
            // similarities between the leaf centroids (upper triangle only) and the transposed centroid matrix
            SparseMatrix <double> simMtx     = ModelUtils.GetDotProductSimilarity(leafCentroids, /*thresh=*/ 0, /*fullMatrix=*/ false);
            SparseMatrix <double> clustMtxTr = ModelUtils.GetTransposedMatrix(leafCentroids);
            int iteration = 1;

            while (result.Roots.Count > 1)
            {
                Console.WriteLine("Iteration {0} ...", iteration);
                iteration++;
                // pick the pair of roots to merge, then merge them and refresh both matrices
                int first, second;
                FindMaxSim(simMtx, out first, out second);
                Update(simMtx, clustMtxTr, result.Roots.Count, first, second, result.Roots.Inner, dataset, /*damping=*/ 0.9);
                Console.WriteLine(simMtx.ToString("E0.00"));
                Console.WriteLine();
            }
            return(result);
        }
示例#4
0
 // Brings every centroid in the list up to date with respect to the given dataset and
 // then refreshes its cached length (Update followed by UpdateCentroidLen, in list order).
 internal void Update(IUnlabeledExampleCollection <SparseVector <double> > dataset, ArrayList <CentroidData> centroids)
 {
     for (int i = 0; i < centroids.Count; i++)
     {
         CentroidData current = centroids[i];
         current.Update(dataset);
         current.UpdateCentroidLen();
     }
 }
示例#5
0
        /// <summary>
        /// Creates a labeled dataset in which the examples of <paramref name="dataset"/> are labeled with the
        /// clusters that contain them; the cluster hierarchy is walked starting at the roots.
        /// </summary>
        /// <param name="dataset">Examples referenced by the cluster items; must not be null.</param>
        /// <returns>A new labeled dataset filled by FillClassificationDataset.</returns>
        public LabeledDataset <Cluster, ExT> GetClassificationDataset <ExT>(IUnlabeledExampleCollection <ExT> dataset)
        {
            Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
            LabeledDataset <Cluster, ExT> labeled = new LabeledDataset <Cluster, ExT>();

            // throws ArgumentValueException
            FillClassificationDataset(mRoots, dataset, labeled);
            return(labeled);
        }
        // Merges the clusters at positions idx1 and idx2 (idx1 < idx2) into a new parent cluster and
        // updates the cluster list, the similarity matrix (simMtx) and the transposed centroid matrix
        // (clustMtxTr) accordingly.
        //   numClusters - number of clusters before the merge
        //   damping     - base of the factor applied to the similarities of the new parent (see below)
        private void Update(SparseMatrix <double> simMtx, SparseMatrix <double> clustMtxTr, int numClusters, int idx1, int idx2, ArrayList <Cluster> clusters,
                            IUnlabeledExampleCollection <SparseVector <double> > dataset, double damping)
        {
            // the removal order below relies on idx1 < idx2 (checked in debug builds only)
            Debug.Assert(idx1 < idx2);
            // create new parent
            Cluster c1     = clusters[idx1];
            Cluster c2     = clusters[idx2];
            Cluster parent = new Cluster();

            parent.Items.AddRange(c1.Items);
            parent.Items.AddRange(c2.Items);
            // the parent is placed one level above the higher of its two children
            parent.ClusterInfo = Math.Max((int)c1.ClusterInfo, (int)c2.ClusterInfo) + 1;
            c1.Parent          = parent;
            c2.Parent          = parent;
            parent.AddChild(c1);
            parent.AddChild(c2);
            // centroid of the merged cluster (CentroidType.NrmL2), computed from the original dataset
            SparseVector <double> centroid = ModelUtils.ComputeCentroid(parent.Items, dataset, CentroidType.NrmL2);

            centroid = Trim(centroid, 1000, 0.8);
            // remove clusters
            // (the higher index goes first so that idx1 still refers to the same element)
            clusters.RemoveAt(idx2);
            clusters.RemoveAt(idx1);
            // add new parent
            // (appended at the end of the list, i.e. at position numClusters - 2)
            clusters.Add(parent);
            // remove rows at idx1 and idx2
            simMtx.PurgeRowAt(idx2);
            simMtx.PurgeRowAt(idx1);
            // remove cols at idx1 and idx2
            simMtx.PurgeColAt(idx2);
            simMtx.PurgeColAt(idx1);
            clustMtxTr.PurgeColAt(idx2);
            clustMtxTr.PurgeColAt(idx1);
            // update matrices
            // from here on numClusters is the index of the new parent (two clusters were removed)
            numClusters -= 2;
            // append the parent centroid as column numClusters of the transposed centroid matrix
            foreach (IdxDat <double> item in centroid)
            {
                if (clustMtxTr[item.Idx] == null)
                {
                    clustMtxTr[item.Idx] = new SparseVector <double>(new IdxDat <double>[] { new IdxDat <double>(numClusters, item.Dat) });
                }
                else
                {
                    clustMtxTr[item.Idx].InnerIdx.Add(numClusters);
                    clustMtxTr[item.Idx].InnerDat.Add(item.Dat);
                }
            }
            // similarities between the parent centroid and the clusters (numClusters + 1 of them, parent included)
            // NOTE(review): this GetDotProductSimilarity overload is not visible here - presumably it returns one value per cluster; confirm
            double[] simVec = ModelUtils.GetDotProductSimilarity(clustMtxTr, numClusters + 1, centroid);
            // damp each similarity by damping^((level of parent + level of the other cluster) / 2)
            for (int i = 0; i < simVec.Length; i++)
            {
                simVec[i] *= Math.Pow(damping, (double)((int)parent.ClusterInfo + (int)clusters[i].ClusterInfo) / 2.0);
            }
            // wrap the vector into a one-row matrix, transpose it into a column and append it to simMtx
            SparseMatrix <double> col = new SparseMatrix <double>();

            col[0] = new SparseVector <double>(simVec);
            simMtx.AppendCols(col.GetTransposedCopy(), numClusters);
        }
示例#7
0
        /// <summary>
        /// Computes the dot-product similarity between <paramref name="vec"/> and each vector of
        /// <paramref name="dataset"/>.
        /// </summary>
        /// <param name="dataset">Vectors to compare against; must not be null.</param>
        /// <param name="vec">Query vector; must not be null.</param>
        /// <returns>Array with one slot per dataset item (dataset.Count elements).</returns>
        public static double[] GetDotProductSimilarity(IUnlabeledExampleCollection <SparseVector <double> > dataset, SparseVector <double> .ReadOnly vec)
        {
            Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
            Utils.ThrowException(vec == null ? new ArgumentNullException("vec") : null);
            // the computation works on the transposed dataset
            SparseMatrix <double> transposed = GetTransposedMatrix(dataset);
            double[] similarities = new double[dataset.Count];

            GetDotProductSimilarity(vec, similarities, transposed, /*startIdx=*/ 0);
            return(similarities);
        }
示例#8
0
 // Adds every item of the given clusters to classificationDataset, labeled with the cluster that
 // holds it, and then descends recursively into the children of each cluster.
 // Throws ArgumentValueException if an item index does not fall inside the dataset.
 private void FillClassificationDataset <ExT>(IEnumerable <Cluster> clusters, IUnlabeledExampleCollection <ExT> dataset, LabeledDataset <Cluster, ExT> classificationDataset)
 {
     foreach (Cluster node in clusters)
     {
         foreach (int exampleIdx in node.Items)
         {
             bool outOfRange = exampleIdx < 0 || exampleIdx >= dataset.Count;
             Utils.ThrowException(outOfRange ? new ArgumentValueException("clusters") : null);
             classificationDataset.Add(node, dataset[exampleIdx]);
         }
         // depth-first: the children are processed right after the items of their parent
         FillClassificationDataset(node.Children, dataset, classificationDataset);
     }
 }
示例#9
0
 /// <summary>
 /// Rebuilds the model (a cluster-to-ClusterInfo dictionary) by calling ComputeCentroid on every
 /// root of <paramref name="hierarchy"/>.
 /// </summary>
 /// <param name="dataset">Examples referenced by the hierarchy; must not be null or empty.</param>
 /// <param name="hierarchy">Clustering result to train on; must not be null and must have at least one root.</param>
 /// <remarks>
 /// The dataset is held in mDataset only for the duration of the call. The reference is cleared in a
 /// finally block so that the instance does not keep the dataset if ComputeCentroid throws.
 /// </remarks>
 public void Train(IUnlabeledExampleCollection <SparseVector <double> > dataset, ClusteringResult hierarchy)
 {
     Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
     Utils.ThrowException(dataset.Count == 0 ? new ArgumentValueException("dataset") : null);
     Utils.ThrowException(hierarchy == null ? new ArgumentNullException("hierarchy") : null);
     Utils.ThrowException(hierarchy.Roots.Count == 0 ? new ArgumentValueException("hierarchy") : null);
     mModel   = new Dictionary <Cluster, ClusterInfo>();
     mDataset = dataset;
     try
     {
         foreach (Cluster root in hierarchy.Roots)
         {
             ComputeCentroid(root);
         }
     }
     finally
     {
         // always drop the temporary reference, even when training fails half-way
         mDataset = null;
     }
 }
示例#10
0
        // Computes the quality of a cluster as the mean dot-product similarity between its
        // NrmL2 centroid and its members. The centroid is handed back through the out parameter.
        private double GetClusterQuality(IUnlabeledExampleCollection <SparseVector <double> > dataset, out SparseVector <double> centroid)
        {
            centroid = ModelUtils.ComputeCentroid(dataset, CentroidType.NrmL2);
            // similarity of every member to the centroid
            double[] similarities = ModelUtils.GetDotProductSimilarity(dataset, centroid);
            double total = 0;

            foreach (double similarity in similarities)
            {
                total += similarity;
            }
            // average over all members
            return(total / (double)similarities.Length);
        }
示例#11
0
        /// <summary>
        /// Computes the dot-product similarity between <paramref name="vec"/> and each vector of
        /// <paramref name="dataset"/> and returns the values strictly greater than <paramref name="thresh"/>
        /// as a sparse vector indexed by dataset position.
        /// </summary>
        /// <param name="dataset">Vectors to compare against; must not be null.</param>
        /// <param name="vec">Query vector; must not be null.</param>
        /// <param name="thresh">Non-negative cut-off.</param>
        /// <returns>Sparse vector holding the retained similarities in increasing index order.</returns>
        public static SparseVector <double> GetDotProductSimilarity(IUnlabeledExampleCollection <SparseVector <double> > dataset, SparseVector <double> .ReadOnly vec, double thresh)
        {
            Utils.ThrowException(thresh < 0 ? new ArgumentOutOfRangeException("thresh") : null);
            // throws ArgumentNullException
            double[] dense = GetDotProductSimilarity(dataset, vec);
            SparseVector <double> filtered = new SparseVector <double>();
            int pos = 0;

            foreach (double similarity in dense)
            {
                if (similarity > thresh)
                {
                    filtered.InnerIdx.Add(pos);
                    filtered.InnerDat.Add(similarity);
                }
                pos++;
            }
            return(filtered);
        }
示例#12
0
        // Creates a clustering that consists of a single root cluster holding every item of the
        // dataset, and sets mCentroids to a one-element list with the matching centroid.
        private ClusteringResult CreateSingleCluster(IUnlabeledExampleCollection <SparseVector <double> > dataset)
        {
            ClusteringResult result     = new ClusteringResult();
            Cluster          allItems   = new Cluster();
            int              itemCount  = dataset.Count;

            for (int idx = 0; idx < itemCount; idx++)
            {
                allItems.Items.Add(idx);
            }
            result.AddRoot(allItems);
            // centroid over the same items as the root cluster
            CentroidData single = new CentroidData();

            single.Items.AddRange(allItems.Items);
            single.Update(dataset);
            single.UpdateCentroidLen();
            ArrayList <CentroidData> centroidList = new ArrayList <CentroidData>();

            centroidList.Add(single);
            mCentroids = centroidList;
            return(result);
        }
示例#13
0
        /// <summary>
        /// Synchronizes the centroid with the pending item set: the difference between mItems and
        /// mCurrentItems is passed to AddToSum, the opposite difference to AddToDiff with multiplier -1.
        /// Afterwards mItems becomes the current item set and a fresh empty pending set is created.
        /// </summary>
        /// <param name="dataset">Collection from which the item vectors are fetched by index.</param>
        public void Update(IUnlabeledExampleCollection <SparseVector <double> > dataset)
        {
            Set <int> toAdd    = Set <int> .Difference(mItems, mCurrentItems);
            Set <int> toRemove = Set <int> .Difference(mCurrentItems, mItems);

            foreach (int idx in toAdd)
            {
                AddToSum(dataset[idx]);
            }
            foreach (int idx in toRemove)
            {
                AddToDiff(-1, dataset[idx]);
            }
            // the pending set becomes the current one
            mCurrentItems = mItems;
            mItems        = new Set <int>();
            Update(/*positiveValuesOnly=*/ false);
        }
示例#14
0
        /// <summary>
        /// Clusters <paramref name="dataset"/> top-down: starting with one root that holds all items, every
        /// cluster whose quality is below mMinQuality is split in two by the k-means clusterer.
        /// </summary>
        /// <param name="dataset">Examples to cluster; must not be null, must hold at least two items, none of them empty.</param>
        /// <returns>Clustering result with a single root; ClusterInfo of each processed cluster is a (centroid, quality) pair.</returns>
        public ClusteringResult Cluster(IUnlabeledExampleCollection <SparseVector <double> > dataset)
        {
            Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
            Utils.ThrowException(dataset.Count < 2 ? new ArgumentValueException("dataset") : null);
            // create root
            Cluster root = new Cluster();

            for (int idx = 0; idx < dataset.Count; idx++)
            {
                Utils.ThrowException(dataset[idx].Count == 0 ? new ArgumentValueException("dataset") : null);
                root.Items.Add(idx);
            }
            ClusteringResult result = new ClusteringResult();

            result.AddRoot(root);
            // clusters that still need to be evaluated, processed in FIFO order
            Queue <Cluster> pending = new Queue <Cluster>();

            pending.Enqueue(root);
            while (pending.Count > 0)
            {
                Cluster current = pending.Dequeue();
                // NOTE(review): the child clusters come from clustering the subset, so their Items presumably index the
                // subset rather than the full dataset, yet they are resolved against the full dataset here - confirm
                UnlabeledDataset <SparseVector <double> > subset = GetDatasetSubset(current.Items, dataset);
                SparseVector <double> centroid;
                double quality = GetClusterQuality(subset, out centroid);
                current.ClusterInfo = new Pair <SparseVector <double>, double>(centroid, quality);
                if (!(quality < mMinQuality))
                {
                    continue; // good enough, keep as a leaf
                }
                // split cluster, add children to queue
                ClusteringResult split = mKMeansClustering.Cluster(subset);
                for (int i = 0; i < 2; i++)
                {
                    current.AddChild(split.Roots[i]);
                    split.Roots[i].Parent = current;
                    pending.Enqueue(split.Roots[i]);
                }
            }
            return(result);
        }
示例#15
0
        // Runs the k-means iterations on the given centroids: items are assigned to clusters (Assign),
        // the centroids are updated (Update), and the loop stops once - after the first iteration - the
        // quality gain over the previous iteration is at most mEps.
        //   dataset   - examples being clustered
        //   centroids - centroids to refine; updated in place
        //   clustQual - quality reported by Assign in the last iteration
        internal void kMeansMainLoop(IUnlabeledExampleCollection <SparseVector <double> > dataset, ArrayList <CentroidData> centroids, out double clustQual)
        {
            // the transposed dataset is computed once and reused by every iteration
            SparseMatrix <double> dataMtx = ModelUtils.GetTransposedMatrix(dataset);
            int    iter          = 0;
            double bestClustQual = 0; // quality of the previous iteration

            while (true)
            {
                iter++;
                mLogger.Trace("Cluster", "Iteration {0} ...", iter);
                // assign items to clusters
                Assign(centroids, dataMtx, dataset.Count, /*offs=*/ 0, out clustQual);
                mLogger.Trace("Cluster", "Quality: {0:0.0000}", clustQual);
                // update centroids
                Update(dataset, centroids);
                // check if done
                if (iter > 1 && clustQual - bestClustQual <= mEps)
                {
                    break;
                }
                bestClustQual = clustQual;
            }
        }
 /// <summary>
 /// Incremental clustering step: drops the first <paramref name="numOutdated"/> instances of the stored
 /// dataset, adds the instances of <paramref name="batch"/>, re-runs the k-means main loop and finally
 /// adjusts the number of clusters according to the quality threshold (mQualThresh).
 /// </summary>
 /// <param name="numOutdated">Number of oldest stored instances to remove; must be 0 on the initializing call
 /// and must not exceed the number of stored instances afterwards.</param>
 /// <param name="batch">New instances; must not be null (may be empty).</param>
 /// <returns>The current clustering; an empty ClusteringResult if there is nothing to cluster.</returns>
 public ClusteringResult Cluster(int numOutdated, IUnlabeledExampleCollection<SparseVector<double>> batch)
 {
     Utils.ThrowException(batch == null ? new ArgumentNullException("batch") : null);
     Utils.ThrowException(numOutdated < 0 ? new ArgumentOutOfRangeException("numOutdated") : null);
     if (mDataset == null)
     {
         // initialize
         mLogger.Info("Cluster", "Initializing ...");
         // nothing is stored yet, so nothing can be outdated
         Utils.ThrowException(numOutdated > 0 ? new ArgumentOutOfRangeException("numOutdated") : null);
         //Utils.ThrowException(batch.Count == 0 ? new ArgumentValueException("batch") : null);
         if (batch.Count == 0) { return new ClusteringResult(); }
         // k is capped by the number of available instances
         kMeans(batch, Math.Min(mK, batch.Count));
         mDataset = new UnlabeledDataset<SparseVector<double>>(batch);
         // give every initial centroid a fresh topic identifier
         foreach (CentroidData centroid in mCentroids) { centroid.Tag = mTopicId++; }
         //OutputState();
     }
     else
     {
         // update clusters
         Utils.ThrowException(numOutdated > mDataset.Count ? new ArgumentOutOfRangeException("numOutdated") : null);
         // nothing to remove and nothing to add: the clustering stays as it is
         if (numOutdated == 0 && batch.Count == 0) { return GetClusteringResult(); }
         mLogger.Info("Cluster", "Updating clusters ...");
         // assign new instances
         // (offs = mDataset.Count: the batch is appended right after the stored instances)
         double dummy;
         Assign(mCentroids, ModelUtils.GetTransposedMatrix(batch), batch.Count, /*offs=*/mDataset.Count, out dummy);
         mDataset.AddRange(batch);
         // remove outdated instances
         foreach (CentroidData centroid in mCentroids)
         {
             // keep only the current items that are not among the first numOutdated instances
             foreach (int item in centroid.CurrentItems)
             {
                 if (item >= numOutdated) { centroid.Items.Add(item); }
             }
             centroid.Update(mDataset);
             centroid.UpdateCentroidLen();
         }
         mDataset.RemoveRange(0, numOutdated);
         // keep the non-empty centroids and shift their item indices to match the shortened dataset
         ArrayList<CentroidData> centroidsNew = new ArrayList<CentroidData>(mCentroids.Count);
         foreach (CentroidData centroid in mCentroids)
         {
             if (centroid.CurrentItems.Count > 0)
             {
                 centroidsNew.Add(centroid);
                 Set<int> tmp = new Set<int>();
                 foreach (int idx in centroid.CurrentItems) { tmp.Add(idx - numOutdated); }
                 centroid.CurrentItems.Inner.SetItems(tmp);
             }
         }
         if (centroidsNew.Count == 0) // reset
         {
             // no centroid has any item left: go back to the uninitialized state
             mCentroids = null;
             mDataset = null;
             return new ClusteringResult();
         }
         mCentroids = centroidsNew;
         // execute main loop
         kMeansMainLoop(mDataset, mCentroids);
         //OutputState();
     }
     // adjust k
     double minQual; // *** not used at the moment
     int minQualIdx;
     double qual = GetClustQual(out minQual, out minQualIdx);
     if (qual < mQualThresh)
     {
         // NOTE(review): this loop has no explicit upper bound on k; it ends only when the quality reaches the threshold
         while (qual < mQualThresh) // split cluster at minQualIdx
         {
             mLogger.Info("Cluster", "Increasing k to {0} ...", mCentroids.Count + 1);
             // the clone of the centroid at minQualIdx becomes an additional centroid with a new topic identifier
             mCentroids.Add(mCentroids[minQualIdx].Clone());
             mCentroids.Last.Tag = mTopicId++;
             kMeansMainLoop(mDataset, mCentroids);
             // the centroid with more items keeps the original topic identifier
             if (mCentroids.Last.CurrentItems.Count > mCentroids[minQualIdx].CurrentItems.Count)
             {
                 // swap topic identifiers
                 object tmp = mCentroids.Last.Tag;
                 mCentroids.Last.Tag = mCentroids[minQualIdx].Tag;
                 mCentroids[minQualIdx].Tag = tmp;
             }
             qual = GetClustQual(out minQual, out minQualIdx);
             //OutputState();
         }
     }
     else if (numOutdated > 0)
     {
         // joining is attempted only when instances were removed in this call
         while (qual > mQualThresh && mCentroids.Count > 1) // join clusters
         {
             mLogger.Info("Cluster", "Decreasing k to {0} ...", mCentroids.Count - 1);
             // backup that is restored if the joined solution is rejected below
             ArrayList<CentroidData> centroidsCopy = mCentroids.DeepClone();
             if (mCentroids.Count == 2) // create single cluster
             {
                 // the single cluster gets the topic identifier of the larger of the two
                 object topicId = mCentroids[0].CurrentItems.Count > mCentroids[1].CurrentItems.Count ? mCentroids[0].Tag : mCentroids[1].Tag;
                 mCentroids = new ArrayList<CentroidData>();
                 mCentroids.Add(new CentroidData());
                 for (int i = 0; i < mDataset.Count; i++) { mCentroids.Last.Items.Add(i); }
                 mCentroids.Last.Tag = topicId;
                 mCentroids.Last.Update(mDataset);
                 mCentroids.Last.UpdateCentroidLen();
             }
             else
             {
                 int idx1, idx2;
                 GetMostSimilarClusters(out idx1, out idx2);
                 CentroidData c1 = mCentroids[idx1];
                 CentroidData c2 = mCentroids[idx2];
                 // the joined cluster gets the topic identifier of the larger of the two
                 object topicId = c1.CurrentItems.Count > c2.CurrentItems.Count ? c1.Tag : c2.Tag;
                 // c2 is removed from the list; its items are handed over to c1
                 mCentroids.RemoveAt(idx2);
                 c1.Items.AddRange(c1.CurrentItems);
                 c1.Items.AddRange(c2.CurrentItems);
                 c1.Tag = topicId;
                 c1.Update(mDataset);
                 c1.UpdateCentroidLen();
                 kMeansMainLoop(mDataset, mCentroids);
             }
             qual = GetClustQual();
             if (qual >= mQualThresh)
             {
                 mLogger.Info("Cluster", "Accepted solution at k = {0}.", mCentroids.Count);
             }
             else
             {
                 // rejected: restore the backup (qual stays below the threshold, which ends the loop)
                 mCentroids = centroidsCopy;
             }
             //OutputState();
         }
     }
     OutputState();
     return GetClusteringResult();
 }
示例#17
0
        /// <summary>
        /// Clusters <paramref name="dataset"/> into mK clusters with k-means. The procedure is repeated
        /// mTrials times with different random seeds and the clustering with the highest quality
        /// (average similarity between the items and their assigned centroids) is returned.
        /// </summary>
        /// <param name="dataset">Examples to cluster; must not be null and must contain at least mK items.</param>
        /// <returns>The best clustering found (null if no trial is executed, i.e. mTrials is less than 1).</returns>
        public ClusteringResult Cluster(IUnlabeledExampleCollection <SparseVector <double> > dataset)
        {
            Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
            Utils.ThrowException(dataset.Count < mK ? new ArgumentValueException("dataset") : null);
            ClusteringResult clustering          = null;
            ClusteringResult bestClustering      = null;
            double           globalBestClustQual = 0;

            for (int trial = 1; trial <= mTrials; trial++)
            {
                mLogger.Trace("Cluster", "Clustering trial {0} of {1} ...", trial, mTrials);
                ArrayList <SparseVector <double> > centroids = null;
                clustering = new ClusteringResult();
                for (int i = 0; i < mK; i++)
                {
                    clustering.AddRoot(new Cluster());
                }
                // select seed items
                double          minSim = double.MaxValue;
                ArrayList <int> tmp    = new ArrayList <int>(dataset.Count);
                for (int i = 0; i < dataset.Count; i++)
                {
                    tmp.Add(i);
                }
                // three random seed sets are tried; the one with the lowest average pairwise similarity wins
                for (int k = 0; k < 3; k++)
                {
                    ArrayList <SparseVector <double> > seeds = new ArrayList <SparseVector <double> >(mK);
                    tmp.Shuffle(mRnd);
                    for (int i = 0; i < mK; i++)
                    {
                        seeds.Add(ModelUtils.ComputeCentroid(new SparseVector <double>[] { dataset[tmp[i]] }, mCentroidType));
                    }
                    // assess quality of seed items
                    double simAvg = 0;
                    foreach (SparseVector <double> seed1 in seeds)
                    {
                        foreach (SparseVector <double> seed2 in seeds)
                        {
                            if (seed1 != seed2)
                            {
                                simAvg += mSimilarity.GetSimilarity(seed1, seed2);
                            }
                        }
                    }
                    simAvg /= (double)(mK * mK - mK);
                    // The first seed set is always accepted. Without this, a NaN average (e.g. 0/0 when mK == 1,
                    // or a NaN similarity) never compares as smaller than minSim, centroids stays null and the
                    // main loop below fails with a NullReferenceException.
                    if (centroids == null || simAvg < minSim)
                    {
                        minSim    = simAvg;
                        centroids = seeds;
                    }
                }
                // main loop
                int    iter          = 0;
                double bestClustQual = 0; // quality of the previous iteration
                double clustQual;
                while (true)
                {
                    iter++;
                    mLogger.Trace("Cluster", "Iteration {0} ...", iter);
                    clustQual = 0;
                    // assign items to clusters
                    foreach (Cluster cluster in clustering.Roots)
                    {
                        cluster.Items.Clear();
                    }
                    for (int i = 0; i < dataset.Count; i++)
                    {
                        SparseVector <double> example = dataset[i];
                        double          maxSim        = double.MinValue;
                        ArrayList <int> candidates    = new ArrayList <int>();
                        // collect the centroids that share the highest similarity to the example
                        for (int j = 0; j < mK; j++)
                        {
                            SparseVector <double> centroid = centroids[j];
                            double sim = mSimilarity.GetSimilarity(example, centroid);
                            if (sim > maxSim)
                            {
                                maxSim = sim;
                                candidates.Clear();
                                candidates.Add(j);
                            }
                            else if (sim == maxSim)
                            {
                                candidates.Add(j);
                            }
                        }
                        // ties are broken randomly
                        if (candidates.Count > 1)
                        {
                            candidates.Shuffle(mRnd);
                        }
                        clustering.Roots[candidates[0]].Items.Add(i);
                        clustQual += maxSim;
                    }
                    clustQual /= (double)dataset.Count;
                    mLogger.Trace("Cluster", "Quality: {0:0.0000}", clustQual);
                    // check if done
                    if (iter > 1 && clustQual - bestClustQual <= mEps)
                    {
                        break;
                    }
                    bestClustQual = clustQual;
                    // compute new centroids
                    for (int i = 0; i < mK; i++)
                    {
                        centroids[i] = clustering.Roots[i].ComputeCentroid(dataset, mCentroidType);
                    }
                }
                // keep the result of the best trial
                if (trial == 1 || clustQual > globalBestClustQual)
                {
                    globalBestClustQual = clustQual;
                    bestClustering      = clustering;
                }
            }
            return(bestClustering);
        }
示例#18
0
        /// <summary>
        /// Computes the centroid of the dataset vectors selected by <paramref name="vecIdxList"/>.
        /// </summary>
        /// <param name="vecIdxList">Indices of the vectors to include; must not be null, each index must be valid for the dataset.</param>
        /// <param name="dataset">Collection that holds the vectors; must not be null.</param>
        /// <param name="type">Sum: component-wise sum; Avg: sum divided by the number of vectors;
        /// NrmL2: sum divided by its Euclidean length (left empty if that length is zero).</param>
        /// <returns>The centroid, sorted; an empty vector if no index is given.</returns>
        public static SparseVector <double> ComputeCentroid(IEnumerable <int> vecIdxList, IUnlabeledExampleCollection <SparseVector <double> > dataset, CentroidType type)
        {
            Utils.ThrowException(vecIdxList == null ? new ArgumentNullException("vecIdxList") : null);
            Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
            // component index -> sum of the component over the selected vectors
            Dictionary <int, double> sums = new Dictionary <int, double>();
            int count = 0;

            foreach (int vecIdx in vecIdxList)
            {
                Utils.ThrowException((vecIdx < 0 || vecIdx >= dataset.Count) ? new ArgumentValueException("vecIdxList") : null);
                foreach (IdxDat <double> entry in dataset[vecIdx])
                {
                    double soFar;
                    if (sums.TryGetValue(entry.Idx, out soFar))
                    {
                        sums[entry.Idx] = soFar + entry.Dat;
                    }
                    else
                    {
                        sums.Add(entry.Idx, entry.Dat);
                    }
                }
                count++;
            }
            SparseVector <double> centroid = new SparseVector <double>();

            if (count == 0)
            {
                return(centroid); // nothing selected
            }
            if (type == CentroidType.Sum)
            {
                foreach (KeyValuePair <int, double> pair in sums)
                {
                    centroid.InnerIdx.Add(pair.Key);
                    centroid.InnerDat.Add(pair.Value);
                }
            }
            else if (type == CentroidType.Avg)
            {
                foreach (KeyValuePair <int, double> pair in sums)
                {
                    centroid.InnerIdx.Add(pair.Key);
                    centroid.InnerDat.Add(pair.Value / (double)count);
                }
            }
            else if (type == CentroidType.NrmL2)
            {
                double sqLen = 0;
                foreach (KeyValuePair <int, double> pair in sums)
                {
                    sqLen += pair.Value * pair.Value;
                }
                double len = Math.Sqrt(sqLen);
                // a zero-length sum cannot be normalized; the centroid is left empty in that case
                if (len > 0)
                {
                    foreach (KeyValuePair <int, double> pair in sums)
                    {
                        centroid.InnerIdx.Add(pair.Key);
                        centroid.InnerDat.Add(pair.Value / len);
                    }
                }
            }
            // the dictionary does not yield the components in index order
            centroid.Sort();
            return(centroid);
        }
示例#19
0
 /// <summary>
 /// Computes the centroid of this object's items (mItems) by delegating to ModelUtils.ComputeCentroid.
 /// </summary>
 /// <param name="dataset">Collection that the item indices refer to.</param>
 /// <param name="type">Kind of centroid to compute.</param>
 /// <returns>The centroid produced by ModelUtils.ComputeCentroid.</returns>
 public SparseVector<double> ComputeCentroid(IUnlabeledExampleCollection<SparseVector<double>> dataset, CentroidType type)
 {
     // throws ArgumentNullException, ArgumentValueException, InvalidOperationException
     SparseVector<double> result = ModelUtils.ComputeCentroid(mItems, dataset, type);
     return result;
 }
示例#20
0
        // Convenience overload for callers that do not need the resulting clustering quality:
        // runs the main loop and discards the reported value.
        private void kMeansMainLoop(IUnlabeledExampleCollection <SparseVector <double> > dataset, ArrayList <CentroidData> centroids)
        {
            double ignoredQuality;

            kMeansMainLoop(dataset, centroids, out ignoredQuality);
        }
示例#21
0
        /// <summary>
        /// Runs k-means with <paramref name="k"/> clusters mTrials times, each time starting from randomly
        /// selected seed items, and stores the centroids of the best trial in mCentroids.
        /// </summary>
        /// <param name="dataset">Examples to cluster.</param>
        /// <param name="k">Number of clusters; k == 1 is handled by CreateSingleCluster.</param>
        /// <returns>The clustering result derived from mCentroids (GetClusteringResult).</returns>
        protected ClusteringResult kMeans(IUnlabeledExampleCollection <SparseVector <double> > dataset, int k)
        {
            // border case
            if (k == 1)
            {
                return(CreateSingleCluster(dataset));
            }
            double bestQualityOverall = 0;

            for (int trial = 1; trial <= mTrials; trial++)
            {
                mLogger.Trace("Cluster", "Clustering trial {0} of {1} ...", trial, mTrials);
                ArrayList <CentroidData> trialCentroids = new ArrayList <CentroidData>(k);
                for (int c = 0; c < k; c++)
                {
                    trialCentroids.Add(new CentroidData());
                }
                // item indices that get shuffled to draw random seeds
                ArrayList <int> order = new ArrayList <int>(dataset.Count);
                for (int n = 0; n < dataset.Count; n++)
                {
                    order.Add(n);
                }
                // three seed sets are drawn; the one with the lowest average pairwise similarity is kept
                ArrayList <int> chosenSeeds  = null;
                double          lowestAvgSim = double.MaxValue;
                for (int attempt = 0; attempt < 3; attempt++)
                {
                    ArrayList <SparseVector <double> > candidates = new ArrayList <SparseVector <double> >(k);
                    order.Shuffle(mRnd);
                    for (int j = 0; j < k; j++)
                    {
                        candidates.Add(dataset[order[j]]);
                    }
                    // assess quality of seed items
                    double simSum = 0;
                    foreach (SparseVector <double> first in candidates)
                    {
                        foreach (SparseVector <double> second in candidates)
                        {
                            if (first != second)
                            {
                                simSum += DotProductSimilarity.Instance.GetSimilarity(first, second);
                            }
                        }
                    }
                    double avgSim = simSum / (double)(k * k - k);
                    if (avgSim < lowestAvgSim)
                    {
                        lowestAvgSim = avgSim;
                        chosenSeeds  = new ArrayList <int>(k);
                        for (int j = 0; j < k; j++)
                        {
                            chosenSeeds.Add(order[j]);
                        }
                    }
                }
                // every centroid starts out with exactly one seed item
                for (int c = 0; c < k; c++)
                {
                    CentroidData seeded = trialCentroids[c];
                    seeded.Items.Add(chosenSeeds[c]);
                    seeded.Update(dataset);
                    seeded.UpdateCentroidLen();
                }
                // execute main loop
                double trialQuality;
                kMeansMainLoop(dataset, trialCentroids, out trialQuality);
                // the first trial is always taken; later ones only if they are better
                if (trial == 1 || trialQuality > bestQualityOverall)
                {
                    bestQualityOverall = trialQuality;
                    mCentroids         = trialCentroids;
                }
            }
            return(GetClusteringResult());
        }
 // Batch-only entry point: forwards to the two-argument overload with no examples marked as outdated.
 public override ClusteringResult Cluster(IUnlabeledExampleCollection<SparseVector<double>> batch)
 {
     const int numOutdated = 0; // nothing is discarded when only a batch is supplied
     ClusteringResult result = Cluster(numOutdated, batch); // throws ArgumentNullException, ArgumentValueException
     return result;
 }
示例#23
0
        // Builds a new dataset holding the examples of 'dataset' found at the given indices,
        // in the order in which 'items' enumerates them.
        private UnlabeledDataset <SparseVector <double> > GetDatasetSubset(IEnumerable <int> items, IUnlabeledExampleCollection <SparseVector <double> > dataset)
        {
            UnlabeledDataset <SparseVector <double> > subset = new UnlabeledDataset <SparseVector <double> >();
            foreach (int exampleIdx in items)
            {
                SparseVector <double> example = dataset[exampleIdx];
                subset.Add(example);
            }
            return(subset);
        }
示例#24
0
 // Computes the centroid of this object's items (mItems) over the given dataset by delegating
 // to ModelUtils.ComputeCentroid with the requested centroid type.
 public SparseVector <double> ComputeCentroid(IUnlabeledExampleCollection <SparseVector <double> > dataset, CentroidType type)
 {
     SparseVector <double> centroid = ModelUtils.ComputeCentroid(mItems, dataset, type); // throws ArgumentValueException
     return(centroid);
 }
示例#25
0
 // Convenience overload: dot-product similarities for the dataset with a threshold of 0 and
 // fullMatrix switched off (the three-argument overload then fills only the upper triangle).
 public static SparseMatrix <double> GetDotProductSimilarity(IUnlabeledExampleCollection <SparseVector <double> > dataset)
 {
     const bool fullMatrix = false;
     SparseMatrix <double> simMtx = GetDotProductSimilarity(dataset, /*thresh=*/ 0, fullMatrix); // throws ArgumentNullException
     return(simMtx);
 }
示例#26
0
        // Incremental clustering step. On the first call (mDataset == null) the batch is clustered from
        // scratch with kMeans; on later calls the batch is appended to the retained dataset, the first
        // numOutdated retained examples are dropped, and the existing centroids are updated. Afterwards
        // the number of clusters is adjusted against mQualThresh: clusters are split while the quality is
        // below the threshold, or (only when examples were removed) merged while it stays above it.
        //
        // numOutdated: number of examples to drop from the front of the retained dataset; must be 0 on
        //              the first call and must not exceed the retained dataset size on later calls.
        // batch:       new examples to add; may be empty.
        // Returns:     an empty ClusteringResult when there is nothing to cluster (empty first batch, or
        //              every centroid lost all its items), otherwise the result of GetClusteringResult().
        // Exceptions (passed to Utils.ThrowException): ArgumentNullException for a null batch,
        //              ArgumentOutOfRangeException for an invalid numOutdated.
        public ClusteringResult Cluster(int numOutdated, IUnlabeledExampleCollection <SparseVector <double> > batch)
        {
            Utils.ThrowException(batch == null ? new ArgumentNullException("batch") : null);
            Utils.ThrowException(numOutdated < 0 ? new ArgumentOutOfRangeException("numOutdated") : null);
            if (mDataset == null)
            {
                // initialize
                mLogger.Trace("Cluster", "Initializing ...");
                // nothing has been retained yet, so nothing can be outdated
                Utils.ThrowException(numOutdated > 0 ? new ArgumentOutOfRangeException("numOutdated") : null);
                //Utils.ThrowException(batch.Count == 0 ? new ArgumentValueException("batch") : null);
                if (batch.Count == 0)
                {
                    // empty first batch: return an empty result and stay uninitialized (mDataset remains null)
                    return(new ClusteringResult());
                }
                // never ask for more clusters than there are examples
                kMeans(batch, Math.Min(mK, batch.Count));
                mDataset = new UnlabeledDataset <SparseVector <double> >(batch);
                // give every initial centroid its own topic identifier
                foreach (CentroidData centroid in mCentroids)
                {
                    centroid.Tag = mTopicId++;
                }
                //OutputState();
            }
            else
            {
                // update clusters
                Utils.ThrowException(numOutdated > mDataset.Count ? new ArgumentOutOfRangeException("numOutdated") : null);
                if (numOutdated == 0 && batch.Count == 0)
                {
                    // nothing to add and nothing to remove: report the current clustering unchanged
                    return(GetClusteringResult());
                }
                mLogger.Trace("Cluster", "Updating clusters ...");
                // assign new instances
                // NOTE(review): offs = mDataset.Count presumably shifts batch-local indices to the positions
                // the batch occupies once appended to mDataset below -- confirm in Assign
                double dummy;
                Assign(mCentroids, ModelUtils.GetTransposedMatrix(batch), batch.Count, /*offs=*/ mDataset.Count, out dummy);
                mDataset.AddRange(batch);
                // remove outdated instances
                foreach (CentroidData centroid in mCentroids)
                {
                    // carry over the currently assigned items that are not outdated (index >= numOutdated)
                    foreach (int item in centroid.CurrentItems)
                    {
                        if (item >= numOutdated)
                        {
                            centroid.Items.Add(item);
                        }
                    }
                    // NOTE(review): Update presumably rebuilds the centroid from Items and refreshes
                    // CurrentItems -- confirm in CentroidData
                    centroid.Update(mDataset);
                    centroid.UpdateCentroidLen();
                }
                mDataset.RemoveRange(0, numOutdated);
                // keep only non-empty centroids and shift their item indices down by numOutdated so they
                // match the dataset after the removal above
                ArrayList <CentroidData> centroidsNew = new ArrayList <CentroidData>(mCentroids.Count);
                foreach (CentroidData centroid in mCentroids)
                {
                    if (centroid.CurrentItems.Count > 0)
                    {
                        centroidsNew.Add(centroid);
                        Set <int> tmp = new Set <int>();
                        foreach (int idx in centroid.CurrentItems)
                        {
                            tmp.Add(idx - numOutdated);
                        }
                        centroid.CurrentItems.Inner.SetItems(tmp);
                    }
                }
                if (centroidsNew.Count == 0) // reset
                {
                    // no centroid has any item left: drop all state so the next call re-initializes
                    mCentroids = null;
                    mDataset   = null;
                    return(new ClusteringResult());
                }
                mCentroids = centroidsNew;
                // execute main loop
                kMeansMainLoop(mDataset, mCentroids);
                //OutputState();
            }
            // adjust k
            double minQual; // *** not used at the moment
            int    minQualIdx;
            double qual = GetClustQual(out minQual, out minQualIdx);

            if (qual < mQualThresh)
            {
                while (qual < mQualThresh) // split cluster at minQualIdx
                {
                    mLogger.Trace("Cluster", "Increasing k to {0} ...", mCentroids.Count + 1);
                    // duplicate the centroid at minQualIdx, give the copy a fresh topic identifier and re-run
                    // the main loop over the enlarged centroid list
                    mCentroids.Add(mCentroids[minQualIdx].Clone());
                    mCentroids.Last.Tag = mTopicId++;
                    kMeansMainLoop(mDataset, mCentroids);
                    if (mCentroids.Last.CurrentItems.Count > mCentroids[minQualIdx].CurrentItems.Count)
                    {
                        // swap topic identifiers
                        // (so that the part with more items keeps the original identifier)
                        object tmp = mCentroids.Last.Tag;
                        mCentroids.Last.Tag        = mCentroids[minQualIdx].Tag;
                        mCentroids[minQualIdx].Tag = tmp;
                    }
                    qual = GetClustQual(out minQual, out minQualIdx);
                    //OutputState();
                }
            }
            else if (numOutdated > 0)
            {
                // merging is attempted only when examples were removed in this call
                while (qual > mQualThresh && mCentroids.Count > 1) // join clusters
                {
                    mLogger.Trace("Cluster", "Decreasing k to {0} ...", mCentroids.Count - 1);
                    // snapshot used to roll back if the merged solution falls below the threshold
                    ArrayList <CentroidData> centroidsCopy = mCentroids.DeepClone();
                    if (mCentroids.Count == 2) // create single cluster
                    {
                        // the single cluster takes the topic identifier of the cluster with more items
                        // (the second cluster's identifier on a tie)
                        object topicId = mCentroids[0].CurrentItems.Count > mCentroids[1].CurrentItems.Count ? mCentroids[0].Tag : mCentroids[1].Tag;
                        mCentroids = new ArrayList <CentroidData>();
                        mCentroids.Add(new CentroidData());
                        for (int i = 0; i < mDataset.Count; i++)
                        {
                            mCentroids.Last.Items.Add(i);
                        }
                        mCentroids.Last.Tag = topicId;
                        mCentroids.Last.Update(mDataset);
                        mCentroids.Last.UpdateCentroidLen();
                    }
                    else
                    {
                        // merge the pair reported by GetMostSimilarClusters: c1 absorbs the items of c2 and
                        // c2 is removed from the centroid list
                        int idx1, idx2;
                        GetMostSimilarClusters(out idx1, out idx2);
                        CentroidData c1      = mCentroids[idx1];
                        CentroidData c2      = mCentroids[idx2];
                        object       topicId = c1.CurrentItems.Count > c2.CurrentItems.Count ? c1.Tag : c2.Tag;
                        mCentroids.RemoveAt(idx2);
                        c1.Items.AddRange(c1.CurrentItems);
                        c1.Items.AddRange(c2.CurrentItems);
                        c1.Tag = topicId;
                        c1.Update(mDataset);
                        c1.UpdateCentroidLen();
                        kMeansMainLoop(mDataset, mCentroids);
                    }
                    qual = GetClustQual();
                    if (qual >= mQualThresh)
                    {
                        mLogger.Trace("Cluster", "Accepted solution at k = {0}.", mCentroids.Count);
                    }
                    else
                    {
                        // rejected: restore the snapshot; qual keeps the rejected value (< mQualThresh),
                        // which makes the loop condition fail and ends the merging
                        mCentroids = centroidsCopy;
                    }
                    //OutputState();
                }
            }
            OutputState();
            return(GetClusteringResult());
        }
示例#27
0
        // Clusters the dataset into mK clusters. Runs mTrials independent trials and keeps the centroids,
        // medoids and clustering of the trial with the highest quality, where quality is the average
        // similarity between each example and the centroid it was assigned to.
        //
        // Each trial picks mK seed examples (the least mutually similar of a few random draws), then
        // alternates between assigning every example to its most similar centroid (ties broken at random)
        // and recomputing the centroids, until the quality improves by no more than mEps.
        //
        // dataset: examples to cluster; must contain at least mK examples.
        // Returns: the best clustering found, or null if mTrials is less than 1 (no trial is executed).
        // Exceptions (passed to Utils.ThrowException): ArgumentNullException for a null dataset,
        //          ArgumentValueException when the dataset holds fewer than mK examples.
        public ClusteringResult Cluster(IUnlabeledExampleCollection <SparseVector <double> > dataset)
        {
            Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
            Utils.ThrowException(dataset.Count < mK ? new ArgumentValueException("dataset") : null);
            mDataset = new UnlabeledDataset <SparseVector <double> >(dataset);
            ClusteringResult clustering          = null;
            double           globalBestClustQual = 0;
            const int        seedAttempts        = 3; // number of random seed sets compared per trial

            for (int trial = 1; trial <= mTrials; trial++)
            {
                mLogger.Info("Cluster", "Clustering trial {0} of {1} ...", trial, mTrials);
                ArrayList <CentroidData> centroids = new ArrayList <CentroidData>(mK);
                ArrayList <int>          bestSeeds = null;
                for (int i = 0; i < mK; i++)
                {
                    centroids.Add(new CentroidData());
                }
                // select seed items
                double          minSim = double.MaxValue;
                ArrayList <int> tmp    = new ArrayList <int>(mDataset.Count);
                for (int i = 0; i < mDataset.Count; i++)
                {
                    tmp.Add(i);
                }
                for (int attempt = 0; attempt < seedAttempts; attempt++)
                {
                    ArrayList <SparseVector <double> > seeds = new ArrayList <SparseVector <double> >(mK);
                    tmp.Shuffle(mRnd);
                    for (int i = 0; i < mK; i++)
                    {
                        seeds.Add(mDataset[tmp[i]]);
                    }
                    // assess quality of seed items (average pairwise similarity; lower is better)
                    double simAvg = 0;
                    foreach (SparseVector <double> seed1 in seeds)
                    {
                        foreach (SparseVector <double> seed2 in seeds)
                        {
                            if (seed1 != seed2)
                            {
                                simAvg += DotProductSimilarity.Instance.GetSimilarity(seed1, seed2);
                            }
                        }
                    }
                    simAvg /= (double)(mK * mK - mK);
                    // Always accept the first draw: if simAvg is NaN (e.g. 0/0 when mK is 1) the comparison
                    // below is never true and bestSeeds would otherwise stay null and be dereferenced later.
                    if (bestSeeds == null || simAvg < minSim)
                    {
                        minSim    = simAvg;
                        bestSeeds = new ArrayList <int>(mK);
                        for (int i = 0; i < mK; i++)
                        {
                            bestSeeds.Add(tmp[i]);
                        }
                    }
                }
                // initialize each centroid (and its medoid candidate) from its seed item
                ArrayList <KeyDat <double, int> > medoids = new ArrayList <KeyDat <double, int> >(mK);
                for (int i = 0; i < mK; i++)
                {
                    centroids[i].Items.Add(bestSeeds[i]);
                    centroids[i].Update(mDataset);
                    centroids[i].UpdateCentroidLen();
                    medoids.Add(new KeyDat <double, int>(-1, bestSeeds[i]));
                }
                double[,] dotProd = new double[mDataset.Count, mK];
                SparseMatrix <double> dsMat = ModelUtils.GetTransposedMatrix(mDataset);
                // main loop
                int    iter          = 0;
                double bestClustQual = 0; // quality of the previous iteration
                double clustQual;
                while (true)
                {
                    iter++;
                    mLogger.Info("Cluster", "Iteration {0} ...", iter);
                    clustQual = 0;
                    // compute similarities between all items and all centroids
                    int j = 0;
                    foreach (CentroidData cen in centroids)
                    {
                        SparseVector <double> cenVec = cen.GetSparseVector();
                        double[] dotProdSimVec       = ModelUtils.GetDotProductSimilarity(dsMat, mDataset.Count, cenVec);
                        for (int i = 0; i < dotProdSimVec.Length; i++)
                        {
                            // Overwrite every cell on every iteration. The matrix is reused across
                            // iterations, so writing only positive values would leave a stale similarity
                            // from an earlier iteration in place whenever the current one is not positive.
                            double sim = dotProdSimVec[i];
                            dotProd[i, j] = sim > 0 ? sim : 0;
                        }
                        j++;
                    }
                    // assign items to clusters
                    for (int dsInstIdx = 0; dsInstIdx < mDataset.Count; dsInstIdx++)
                    {
                        double          maxSim     = double.MinValue;
                        ArrayList <int> candidates = new ArrayList <int>();
                        for (int cenIdx = 0; cenIdx < mK; cenIdx++)
                        {
                            double sim = dotProd[dsInstIdx, cenIdx];
                            if (sim > maxSim)
                            {
                                maxSim = sim;
                                candidates.Clear();
                                candidates.Add(cenIdx);
                            }
                            else if (sim == maxSim)
                            {
                                candidates.Add(cenIdx);
                            }
                        }
                        // break ties between equally similar centroids at random
                        if (candidates.Count > 1)
                        {
                            candidates.Shuffle(mRnd);
                        }
                        if (candidates.Count > 0) // *** is this always true?
                        {
                            centroids[candidates[0]].Items.Add(dsInstIdx);
                            clustQual += maxSim;
                            // remember the item most similar to the centroid as the cluster medoid
                            if (medoids[candidates[0]].Key < maxSim)
                            {
                                medoids[candidates[0]] = new KeyDat <double, int>(maxSim, dsInstIdx);
                            }
                        }
                    }
                    clustQual /= (double)mDataset.Count;
                    mLogger.Info("Cluster", "Quality: {0:0.0000}", clustQual);
                    // compute new centroids
                    for (int i = 0; i < mK; i++)
                    {
                        centroids[i].Update(mDataset);
                        centroids[i].UpdateCentroidLen();
                    }
                    // check if done
                    if (iter > 1 && clustQual - bestClustQual <= mEps)
                    {
                        break;
                    }
                    bestClustQual = clustQual;
                    // reset the medoid scores (keeping the items) before the next assignment pass
                    for (int i = 0; i < medoids.Count; i++)
                    {
                        medoids[i] = new KeyDat <double, int>(-1, medoids[i].Dat);
                    }
                }
                if (trial == 1 || clustQual > globalBestClustQual)
                {
                    globalBestClustQual = clustQual;
                    mCentroids          = centroids;
                    mMedoids            = medoids;
                    // save the result
                    clustering = new ClusteringResult();
                    for (int i = 0; i < mK; i++)
                    {
                        clustering.AddRoot(new Cluster());
                        clustering.Roots.Last.Items.AddRange(centroids[i].Items);
                    }
                }
            }
            return(clustering);
        }
示例#28
0
 // Clusters the batch via the two-argument overload, declaring no previously seen examples as outdated.
 public override ClusteringResult Cluster(IUnlabeledExampleCollection <SparseVector <double> > batch)
 {
     const int numOutdated = 0;
     ClusteringResult result = Cluster(numOutdated, batch); // throws ArgumentNullException, ArgumentValueException
     return(result);
 }
示例#29
0
 // Validates the dataset (non-null, at least mK examples) and clusters it into mK clusters with kMeans.
 public virtual ClusteringResult Cluster(IUnlabeledExampleCollection <SparseVector <double> > dataset)
 {
     ArgumentNullException missingDataset = (dataset != null) ? null : new ArgumentNullException("dataset");
     Utils.ThrowException(missingDataset);
     // evaluated only after the null guard above, exactly as in a chained check
     ArgumentValueException tooFewExamples = (dataset.Count < mK) ? new ArgumentValueException("dataset") : null;
     Utils.ThrowException(tooFewExamples);
     ClusteringResult result = kMeans(dataset, mK);
     return(result);
 }
示例#30
0
 // Untyped IClustering entry point: checks that the dataset is a collection of sparse double vectors
 // and forwards it to the strongly typed Cluster overload.
 ClusteringResult IClustering.Cluster(IUnlabeledExampleCollection dataset)
 {
     ArgumentNullException missingDataset = (dataset != null) ? null : new ArgumentNullException("dataset");
     Utils.ThrowException(missingDataset);
     bool isSparseVectorDataset = (dataset is IUnlabeledExampleCollection <SparseVector <double> >);
     ArgumentTypeException wrongType = isSparseVectorDataset ? null : new ArgumentTypeException("dataset");
     Utils.ThrowException(wrongType);
     IUnlabeledExampleCollection <SparseVector <double> > typedDataset = (IUnlabeledExampleCollection <SparseVector <double> >)dataset;
     return(Cluster(typedDataset)); // throws ArgumentValueException
 }
示例#31
0
 // Creates a layout over the given dataset; the dataset reference is stored as-is (not copied).
 public SemanticSpaceLayout(IUnlabeledExampleCollection <SparseVector <double> > dataset)
 {
     ArgumentNullException missingDataset = (dataset != null) ? null : new ArgumentNullException("dataset");
     Utils.ThrowException(missingDataset);
     mDataset = dataset;
 }
示例#32
0
 // Untyped IHierarchicalModel entry point: checks that the dataset is a collection of sparse double
 // vectors and forwards it, together with the hierarchy, to the strongly typed Train overload.
 void IHierarchicalModel.Train(IUnlabeledExampleCollection dataset, ClusteringResult hierarchy)
 {
     ArgumentNullException missingDataset = (dataset != null) ? null : new ArgumentNullException("dataset");
     Utils.ThrowException(missingDataset);
     bool isSparseVectorDataset = (dataset is IUnlabeledExampleCollection <SparseVector <double> >);
     ArgumentTypeException wrongType = isSparseVectorDataset ? null : new ArgumentTypeException("dataset");
     Utils.ThrowException(wrongType);
     IUnlabeledExampleCollection <SparseVector <double> > typedDataset = (IUnlabeledExampleCollection <SparseVector <double> >)dataset;
     Train(typedDataset, hierarchy); // throws ArgumentNullException, ArgumentValueException
 }
 // Runs the k-means main loop for callers that do not need the value reported through the
 // three-argument overload's out parameter.
 private void kMeansMainLoop(IUnlabeledExampleCollection<SparseVector<double>> dataset, ArrayList<CentroidData> centroids)
 {
     double ignoredOutValue; // required by the overload's signature; discarded
     kMeansMainLoop(dataset, centroids, out ignoredOutValue);
 }