Example #1
        public ClusteringResult Cluster(IUnlabeledExampleCollection <SparseVector <double> > dataset)
        {
            Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
            Utils.ThrowException(dataset.Count < mK ? new ArgumentValueException("dataset") : null);
            mDataset = new UnlabeledDataset <SparseVector <double> >(dataset);
            ClusteringResult clustering          = null;
            double           globalBestClustQual = 0;

            for (int trial = 1; trial <= mTrials; trial++)
            {
                mLogger.Info("Cluster", "Clustering trial {0} of {1} ...", trial, mTrials);
                ArrayList <CentroidData> centroids = new ArrayList <CentroidData>(mK);
                ArrayList <int>          bestSeeds = null;
                for (int i = 0; i < mK; i++)
                {
                    centroids.Add(new CentroidData());
                }
                // select seed items
                double          minSim = double.MaxValue;
                ArrayList <int> tmp    = new ArrayList <int>(mDataset.Count);
                for (int i = 0; i < mDataset.Count; i++)
                {
                    tmp.Add(i);
                }
                for (int k = 0; k < 3; k++)
                {
                    ArrayList <SparseVector <double> > seeds = new ArrayList <SparseVector <double> >(mK);
                    tmp.Shuffle(mRnd);
                    for (int i = 0; i < mK; i++)
                    {
                        seeds.Add(mDataset[tmp[i]]);
                    }
                    // assess quality of seed items
                    double simAvg = 0;
                    foreach (SparseVector <double> seed1 in seeds)
                    {
                        foreach (SparseVector <double> seed2 in seeds)
                        {
                            if (seed1 != seed2)
                            {
                                simAvg += DotProductSimilarity.Instance.GetSimilarity(seed1, seed2);
                            }
                        }
                    }
                    simAvg /= (double)(mK * mK - mK);
                    //Console.WriteLine(simAvg);
                    if (simAvg < minSim)
                    {
                        minSim    = simAvg;
                        bestSeeds = new ArrayList <int>(mK);
                        for (int i = 0; i < mK; i++)
                        {
                            bestSeeds.Add(tmp[i]);
                        }
                    }
                }
                ArrayList <KeyDat <double, int> > medoids = new ArrayList <KeyDat <double, int> >(mK);
                for (int i = 0; i < mK; i++)
                {
                    centroids[i].Items.Add(bestSeeds[i]);
                    centroids[i].Update(mDataset);
                    centroids[i].UpdateCentroidLen();
                    medoids.Add(new KeyDat <double, int>(-1, bestSeeds[i]));
                }
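                // dataset-by-centroid similarity cache: dotProd[i, j] holds the dot
                // product of instance i with centroid j, reused across iterations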
                double[,] dotProd = new double[mDataset.Count, mK];
                SparseMatrix <double> dsMat = ModelUtils.GetTransposedMatrix(mDataset);
                // main loop
                int    iter          = 0;
                double bestClustQual = 0;
                double clustQual;
                while (true)
                {
                    iter++;
                    mLogger.Info("Cluster", "Iteration {0} ...", iter);
                    clustQual = 0;
                    // assign items to clusters
                    //StopWatch stopWatch = new StopWatch();
                    int j = 0;
                    foreach (CentroidData cen in centroids)
                    {
                        SparseVector <double> cenVec = cen.GetSparseVector();
                        double[] dotProdSimVec       = ModelUtils.GetDotProductSimilarity(dsMat, mDataset.Count, cenVec);
                        for (int i = 0; i < dotProdSimVec.Length; i++)
                        {
                            // assign unconditionally: writing only positive values would
                            // leave stale similarities from the previous iteration in the cache
                            dotProd[i, j] = dotProdSimVec[i];
                        }
                        j++;
                    }
                    for (int dsInstIdx = 0; dsInstIdx < mDataset.Count; dsInstIdx++)
                    {
                        double          maxSim     = double.MinValue;
                        ArrayList <int> candidates = new ArrayList <int>();
                        for (int cenIdx = 0; cenIdx < mK; cenIdx++)
                        {
                            double sim = dotProd[dsInstIdx, cenIdx];
                            if (sim > maxSim)
                            {
                                maxSim = sim;
                                candidates.Clear();
                                candidates.Add(cenIdx);
                            }
                            else if (sim == maxSim)
                            {
                                candidates.Add(cenIdx);
                            }
                        }
                        if (candidates.Count > 1)
                        {
                            candidates.Shuffle(mRnd);
                        }
                        if (candidates.Count > 0) // always true: maxSim starts at double.MinValue, so the first centroid is always a candidate
                        {
                            centroids[candidates[0]].Items.Add(dsInstIdx);
                            clustQual += maxSim;
                            if (medoids[candidates[0]].Key < maxSim)
                            {
                                medoids[candidates[0]] = new KeyDat <double, int>(maxSim, dsInstIdx);
                            }
                        }
                    }
                    //Console.WriteLine(stopWatch.TotalMilliseconds);
                    clustQual /= (double)mDataset.Count;
                    mLogger.Info("Cluster", "Quality: {0:0.0000}", clustQual);
                    // compute new centroids
                    for (int i = 0; i < mK; i++)
                    {
                        centroids[i].Update(mDataset);
                        centroids[i].UpdateCentroidLen();
                    }
                    // check if done
                    if (iter > 1 && clustQual - bestClustQual <= mEps)
                    {
                        break;
                    }
                    bestClustQual = clustQual;
                    for (int i = 0; i < medoids.Count; i++)
                    {
                        medoids[i] = new KeyDat <double, int>(-1, medoids[i].Dat);
                    }
                }
                if (trial == 1 || clustQual > globalBestClustQual)
                {
                    globalBestClustQual = clustQual;
                    mCentroids          = centroids;
                    mMedoids            = medoids;
                    // save the result
                    clustering = new ClusteringResult();
                    for (int i = 0; i < mK; i++)
                    {
                        clustering.AddRoot(new Cluster());
                        clustering.Roots.Last.Items.AddRange(centroids[i].Items);
                    }
                }
            }
            return clustering;
        }
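The seed-selection pass above draws three random candidate seed sets and keeps the one whose members are least similar to one another on average, i.e. the most spread-out seeds. A minimal standalone sketch of the same heuristic follows; PickSeeds, similarity, and numCandidateSets are illustrative names, not library API:

        // Hypothetical sketch (not library code): pick the most mutually
        // dissimilar of several random candidate seed sets. Assumes k >= 2.
        // Requires: using System; using System.Linq;
        static int[] PickSeeds(int itemCount, int k, int numCandidateSets,
                               Random rnd, Func<int, int, double> similarity)
        {
            int[] bestSeeds = null;
            double minAvgSim = double.MaxValue;
            for (int c = 0; c < numCandidateSets; c++)
            {
                // sample k distinct item indices
                int[] seeds = Enumerable.Range(0, itemCount)
                                        .OrderBy(_ => rnd.Next())
                                        .Take(k)
                                        .ToArray();
                // average pairwise similarity over the k * (k - 1) ordered pairs
                double simSum = 0;
                for (int i = 0; i < k; i++)
                    for (int j = 0; j < k; j++)
                        if (i != j) { simSum += similarity(seeds[i], seeds[j]); }
                double avgSim = simSum / (k * k - k);
                if (avgSim < minAvgSim) { minAvgSim = avgSim; bestSeeds = seeds; }
            }
            return bestSeeds;
        }

Minimizing average pairwise similarity pushes the initial centroids apart, which reduces the chance that two seeds land in the same natural cluster.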
Example #2
        public override void Run(object[] args)
        {
            // Get the stop words and stemmer for English.

            IStemmer stemmer;

            Set <string> .ReadOnly stopWords;
            TextMiningUtils.GetLanguageTools(Language.English,
                                             out stopWords, out stemmer);

            // Create a tokenizer.

            UnicodeTokenizer tokenizer = new UnicodeTokenizer();

            tokenizer.MinTokenLen = 2;                      // Each token must be at least 2 characters long.
            tokenizer.Filter = TokenizerFilter.AlphaStrict; // Tokens can consist of alphabetic characters only.

            // Load a document corpus from a file. Each line in the file
            // represents one document.

            string[] docs = File.ReadAllLines(@"Data\YahooFinance.txt");

            // Create a bag-of-words space.

            BowSpace bowSpc = new BowSpace();

            bowSpc.Tokenizer         = tokenizer;                                              // Assign the tokenizer.
            bowSpc.StopWords         = stopWords;                                              // Assign the stop words.
            bowSpc.Stemmer           = stemmer;                                                // Assign the stemmer.
            bowSpc.MinWordFreq       = 3;                                                      // A term must appear at least 3 times in the corpus for it to be part of the vocabulary.
            bowSpc.MaxNGramLen       = 3;                                                      // Terms consisting of at most 3 consecutive words will be considered.
            bowSpc.WordWeightType    = WordWeightType.TfIdf;                                   // Set the weighting scheme for the bag-of-words vectors to TF-IDF.
            bowSpc.NormalizeVectors  = true;                                                   // The TF-IDF vectors will be normalized.
            bowSpc.CutLowWeightsPerc = 0.2;                                                    // The terms with the lowest weights, summing up to 20% of the overall weight sum, will be removed from each TF-IDF vector.

            ArrayList <SparseVector <double> >        sparseVectors = bowSpc.Initialize(docs); // Initialize the Bow space.
            UnlabeledDataset <SparseVector <double> > ud            = new UnlabeledDataset <SparseVector <double> >(sparseVectors);

            // Compute 100 clusters of documents.

            KMeansClusteringFast kMeans = new KMeansClusteringFast(100); // Set k to 100.

            kMeans.Trials = 3;                                           // Perform 3 repetitions. Take the best result.
            kMeans.Eps    = 0.001;                                       // Stop iterating when the partition quality increases by less than 0.001.

            ClusteringResult cr = kMeans.Cluster(ud);                    // Execute.

            // Extract the top 5 terms with the highest TF-IDF weights
            // from each of the clusters' centroids and output the
            // number of documents (companies) in each cluster.

            foreach (Cluster cl in cr.Roots)
            {
                SparseVector <double> .ReadOnly centroid = cl.ComputeCentroid(ud, CentroidType.NrmL2);
                Console.Write(bowSpc.GetKeywordsStr(centroid, 5));
                Output.WriteLine(" ({0} companies)", cl.Items.Count);
            }

            // Output the documents that are contained in the first
            // cluster.

            foreach (int docIdx in cr.Roots[0].Items)
            {
                Output.WriteLine(docs[docIdx]);
            }
        }
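For orientation, the WordWeightType.TfIdf setting used above weights a term highly when it is frequent within a document but rare across the corpus. A back-of-the-envelope sketch of the textbook formula follows; BowSpace's internal variant (logarithm base, smoothing, order of normalization) may differ:

        // Hypothetical sketch of plain TF-IDF; BowSpace's internal variant may differ.
        // Requires: using System;
        static double TfIdf(int termFreqInDoc, int docsContainingTerm, int totalDocs)
        {
            double tf  = termFreqInDoc;                                    // term frequency in the document
            double idf = Math.Log((double)totalDocs / docsContainingTerm); // inverse document frequency
            return tf * idf;
        }

With NormalizeVectors set to true, each resulting vector is additionally scaled to unit length, so the dot products computed during clustering behave like cosine similarities.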
Example #3
        public Vector2D[] ComputeLayout(LayoutSettings settings)
        {
            UnlabeledDataset <SparseVector <double> > dataset = new UnlabeledDataset <SparseVector <double> >(mDataset);

            // clustering
            mLogger.Info("ComputeLayout", "Clustering ...");
            KMeansFast kMeans = new KMeansFast(mKClust);

            kMeans.Eps    = mKMeansEps;
            kMeans.Random = mRandom;
            kMeans.Trials = 1;
            ClusteringResult clustering = kMeans.Cluster(mDataset); // throws ArgumentValueException
            // determine reference instances
            UnlabeledDataset <SparseVector <double> > dsRefInst = new UnlabeledDataset <SparseVector <double> >();

            foreach (Cluster cluster in clustering.Roots)
            {
                SparseVector <double> centroid
                    = cluster.Items.Count > 0 ? cluster.ComputeCentroid(mDataset, CentroidType.NrmL2) : new SparseVector <double>();
                dsRefInst.Add(centroid); // dataset of reference instances
                dataset.Add(centroid);   // add centroids to the main dataset
            }
            // position reference instances
            mLogger.Info("ComputeLayout", "Positioning reference instances ...");
            SparseMatrix <double>    simMtx = ModelUtils.GetDotProductSimilarity(dsRefInst, mSimThresh, /*fullMatrix=*/ false);
            StressMajorizationLayout sm     = new StressMajorizationLayout(dsRefInst.Count, new DistFunc(simMtx));

            sm.Random = mRandom;
            Vector2D[] centrPos = sm.ComputeLayout();
            // k-NN
            mLogger.Info("ComputeLayout", "Computing similarities ...");
            simMtx = ModelUtils.GetDotProductSimilarity(dataset, mSimThresh, /*fullMatrix=*/ true);
            mLogger.Info("ComputeLayout", "Constructing system of linear equations ...");
            LabeledDataset <double, SparseVector <double> > lsqrDs = new LabeledDataset <double, SparseVector <double> >();

            foreach (IdxDat <SparseVector <double> > simMtxRow in simMtx)
            {
                if (simMtxRow.Dat.Count <= 1)
                {
                    mLogger.Warn("ComputeLayout", "Instance #{0} has no neighborhood.", simMtxRow.Idx);
                }
                ArrayList <KeyDat <double, int> > knn = new ArrayList <KeyDat <double, int> >(simMtxRow.Dat.Count);
                foreach (IdxDat <double> item in simMtxRow.Dat)
                {
                    if (item.Idx != simMtxRow.Idx)
                    {
                        knn.Add(new KeyDat <double, int>(item.Dat, item.Idx));
                    }
                }
                knn.Sort(DescSort <KeyDat <double, int> > .Instance);
                int count = Math.Min(knn.Count, mKNN);
                SparseVector <double> eq = new SparseVector <double>();
                double wgt = 1.0 / (double)count;
                for (int i = 0; i < count; i++)
                {
                    eq.InnerIdx.Add(knn[i].Dat);
                    eq.InnerDat.Add(-wgt);
                }
                eq.InnerIdx.Sort(); // *** sort only indices
                eq[simMtxRow.Idx] = 1;
                lsqrDs.Add(0, eq);
            }
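            // the last mKClust dataset rows are the appended centroids; the equations
            // added below pin them to the stress-majorization positions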
            Vector2D[] layout = new Vector2D[dataset.Count - mKClust];
            for (int i = dataset.Count - mKClust, j = 0; i < dataset.Count; i++, j++)
            {
                SparseVector <double> eq = new SparseVector <double>(new IdxDat <double>[] { new IdxDat <double>(i, 1) });
                lsqrDs.Add(centrPos[j].X, eq);
            }
            LSqrModel lsqr = new LSqrModel();

            lsqr.Train(lsqrDs);
            for (int i = 0; i < layout.Length; i++)
            {
                layout[i].X = lsqr.Solution[i];
            }
            for (int i = lsqrDs.Count - mKClust, j = 0; i < lsqrDs.Count; i++, j++)
            {
                lsqrDs[i].Label = centrPos[j].Y;
            }
            lsqr.Train(lsqrDs);
            for (int i = 0; i < layout.Length; i++)
            {
                layout[i].Y = lsqr.Solution[i];
            }
            return settings == null ? layout : settings.AdjustLayout(layout);
        }
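The linear system assembled in ComputeLayout places every document at the weighted center of its nearest neighbors while pinning the appended centroids to the positions found by stress majorization. In LaTeX notation, each neighborhood row encodes

    x_i - \frac{1}{k} \sum_{j \in \mathrm{kNN}(i)} x_j = 0,

where $k$ is the neighborhood size (mKNN above), and each of the last mKClust rows pins a centroid coordinate, $x_c = p_c$, to its stress-majorization position $p_c$. The same sparse system is solved with LSQR twice, once for the X coordinates and once for the Y coordinates.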
 public IncrementalSemanticSpaceLayout(IUnlabeledExampleCollection <SparseVector <double> > dataset)
 {
     Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
     mDataset = new UnlabeledDataset <SparseVector <double> >(dataset);
 }
        // TODO: exceptions
        public Vector2D[] Update(int numDequeue, IEnumerable <SparseVector <double> > newInst, bool test, LayoutSettings settings, ref PtInfo[] ptInfo, int _count)
        {
            // clustering
            mLogger.Info("Update", "Clustering ...");
            /*prof*/ StopWatch sw = new StopWatch();

            mKMeans.Eps = mKMeansEps;
            int iter = 0;

            mKMeans.Update(numDequeue, newInst, ref iter);
            /*prof*/ sw.Save("cl.txt", _count, iter.ToString());
            // determine reference instances
            /*prof*/ sw.Reset();
            UnlabeledDataset <SparseVector <double> > dsRefInst = new UnlabeledDataset <SparseVector <double> >();
            UnlabeledDataset <SparseVector <double> > dsNewInst = new UnlabeledDataset <SparseVector <double> >(newInst);

            foreach (SparseVector <double> centroid in mKMeans.GetCentroids())
            {
                dsRefInst.Add(centroid); // dataset of reference instances
                dsNewInst.Add(centroid); // dataset of new instances
            }
            // position reference instances
            mLogger.Info("Update", "Positioning reference instances ...");
            SparseMatrix <double>    simMtx = ModelUtils.GetDotProductSimilarity(dsRefInst, mSimThresh, /*fullMatrix=*/ false);
            StressMajorizationLayout sm     = new StressMajorizationLayout(dsRefInst.Count, new DistFunc(simMtx));

            sm.Random   = mRandom;
            sm.MaxSteps = int.MaxValue;
            sm.MinDiff  = 1E-3;
            mRefPos     = sm.ComputeLayout(/*settings=*/ null, mRefPos /*make this a property!!!*/);
            /*prof*/ sw.Save("sm.txt", _count);
            // k-NN
            /*prof*/ sw.Reset();
            DateTime t = DateTime.Now;

            mLogger.Info("Update", "Computing similarities ...");
            // update list of neighborhoods
            mPatches.RemoveRange(mDataset.Count - mKClust, mKClust);
            mPatches.RemoveRange(0, numDequeue);
            // remove instances from [dataset and] neighborhoods
            foreach (Patch patch in mPatches)
            {
                if (patch.Min != null && (patch.Min.Idx < numDequeue || patch.Max.Idx >= mDataset.Count - mKClust))
                {
                    int oldCount = patch.List.Count;
                    ArrayList <KeyDat <double, Patch> > tmp = new ArrayList <KeyDat <double, Patch> >();
                    foreach (KeyDat <double, Patch> item in patch.List)
                    {
                        if (item.Dat.Idx >= numDequeue && item.Dat.Idx < mDataset.Count - mKClust)
                        {
                            tmp.Add(item);
                        }
                        //else
                        //{
                        //    Console.WriteLine("Remove {0}", item.Dat.Idx - numDequeue);
                        //}
                    }
                    patch.List = tmp;
                    patch.ProcessList();
                    patch.NeedUpdate = patch.List.Count < mKNn && oldCount >= mKNn;
                }
            }
            // update dataset
            mDataset.RemoveRange(mDataset.Count - mKClust, mKClust);
            mDataset.RemoveRange(0, numDequeue);
            // add new instances to dataset
            int preAddCount = mDataset.Count;

            mDataset.AddRange(dsNewInst);
            // precompute transposed matrices
            SparseMatrix <double> trNewInst = ModelUtils.GetTransposedMatrix(dsNewInst);
            SparseMatrix <double> trDataset = ModelUtils.GetTransposedMatrix(mDataset);

            // add new instances to neighborhoods
            for (int i = 0; i < dsNewInst.Count; i++)
            {
                mPatches.Add(new Patch(-1));
                mPatches.Last.NeedUpdate = true;
            }
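            // re-index all patches so that Patch.Idx again matches the instance's
            // position in the updated dataset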
            for (int i = 0; i < mPatches.Count; i++)
            {
                mPatches[i].Idx = i;
            }
            for (int i = 0; i < mPatches.Count; i++)
            {
                Patch patch = mPatches[i];
                SparseVector <double> vec = mDataset[i];
                if (vec != null)
                {
                    if (patch.NeedUpdate) // full update required
                    {
                        //if (i == 1347) { Console.WriteLine("full update"); }
                        SparseVector <double>             simVec = ModelUtils.GetDotProductSimilarity(trDataset, mDataset.Count, vec, mSimThresh);
                        ArrayList <KeyDat <double, int> > tmp    = new ArrayList <KeyDat <double, int> >();
                        foreach (IdxDat <double> item in simVec)
                        {
                            if (item.Idx != i)
                            {
                                tmp.Add(new KeyDat <double, int>(item.Dat, item.Idx));
                            }
                        }
                        tmp.Sort(new Comparer2());
                        int count = Math.Min(tmp.Count, mKNnExt);
                        patch.List.Clear();
                        for (int j = 0; j < count; j++)
                        {
                            patch.List.Add(new KeyDat <double, Patch>(tmp[j].Key, mPatches[tmp[j].Dat]));
                        }
                        patch.ProcessList();
                        patch.NeedUpdate = false;
                    }
                    else // only new instances need to be considered
                    {
                        //if (i == 1347) { Console.WriteLine("partial update"); }
                        SparseVector <double> simVec = ModelUtils.GetDotProductSimilarity(trNewInst, dsNewInst.Count, vec, mSimThresh);
                        // check if further processing is needed
                        bool needMerge = false;
                        if (test)
                        {
                            foreach (IdxDat <double> item in simVec)
                            {
                                if (item.Dat >= patch.MinSim)
                                {
                                    needMerge = true;
                                    //Console.WriteLine("{0} {1}", item.Dat, patch.MinSim);
                                    break;
                                }
                            }
                        }
                        else
                        {
                            foreach (IdxDat <double> item in simVec)
                            {
                                if (item.Dat > patch.MinSim)
                                {
                                    needMerge = true;
                                    //Console.WriteLine("{0} {1}", item.Dat, patch.MinSim);
                                    break;
                                }
                            }
                        }
                        if (needMerge || patch.List.Count < mKNn)
                        {
                            //if (i == 1347) { Console.WriteLine("merge"); }
                            int oldCount = patch.List.Count;
                            ArrayList <KeyDat <double, Patch> > tmp = new ArrayList <KeyDat <double, Patch> >();
                            foreach (IdxDat <double> item in simVec)
                            {
                                tmp.Add(new KeyDat <double, Patch>(item.Dat, mPatches[item.Idx + preAddCount]));
                            }
                            // merge the two lists
                            // TODO: speed this up
                            patch.List.AddRange(tmp);
                            patch.List.Sort(new Comparer());
                            // trim list to size
                            if (oldCount >= mKNn)
                            {
                                patch.List.RemoveRange(oldCount, patch.List.Count - oldCount);
                            }
                            patch.ProcessList();
                        }
                    }
                }
            }
            /*prof*/ sw.Save("knn.txt", _count);
            // *** Test ***
            sw.Reset();
            ModelUtils.GetDotProductSimilarity(mDataset, mSimThresh, /*fullMatrix=*/ true);
            sw.Save("selfSim.txt", _count, mDataset.Count.ToString());
            if (test)
            {
                simMtx = ModelUtils.GetDotProductSimilarity(mDataset, mSimThresh, /*fullMatrix=*/ true);
                ArrayList <Patch> patches = new ArrayList <Patch>();
                for (int i = 0; i < mDataset.Count; i++)
                {
                    patches.Add(new Patch(i));
                }
                foreach (IdxDat <SparseVector <double> > simMtxRow in simMtx)
                {
                    if (simMtxRow.Dat.Count <= 1)
                    {
                        mLogger.Warn("Update", "Instance #{0} has no neighborhood.", simMtxRow.Idx);
                    }
                    ArrayList <KeyDat <double, int> > knn = new ArrayList <KeyDat <double, int> >(simMtxRow.Dat.Count);
                    foreach (IdxDat <double> item in simMtxRow.Dat)
                    {
                        if (item.Idx != simMtxRow.Idx)
                        {
                            knn.Add(new KeyDat <double, int>(item.Dat, item.Idx));
                        }
                    }
                    knn.Sort(new Comparer2());
                    int count = Math.Min(knn.Count, mKNnExt);
                    for (int i = 0; i < count; i++)
                    {
                        patches[simMtxRow.Idx].List.Add(new KeyDat <double, Patch>(knn[i].Key, patches[knn[i].Dat]));
                    }
                    patches[simMtxRow.Idx].ProcessList();
                }
                // compare
                if (patches.Count != mPatches.Count)
                {
                    throw new Exception("Count mismatch.");
                }
                for (int i = 0; i < mPatches.Count; i++)
                {
                    if (patches[i].List.Count < mKNn && patches[i].List.Count != mPatches[i].List.Count)
                    {
                        Console.WriteLine(mPatches[i].List.Count);
                        Console.WriteLine(patches[i].List.Count);
                        Output(mPatches[i].List);
                        Output(patches[i].List);
                        Console.WriteLine(i);
                        throw new Exception("List count mismatch.");
                    }
                    int count = Math.Min(mPatches[i].List.Count, mKNn);
                    for (int j = 0; j < count; j++)
                    {
                        //Console.WriteLine("{4} {0}-{1} {2}-{3}", mPatches[i].List[j].Key, mPatches[i].List[j].Dat.Idx, patches[i].List[j].Key, patches[i].List[j].Dat.Idx, i);
                        if (mPatches[i].List[j].Key != patches[i].List[j].Key || mPatches[i].List[j].Dat.Idx != patches[i].List[j].Dat.Idx)
                        {
                            Console.WriteLine("i:{4} fast:{0}-{1} slow:{2}-{3}", mPatches[i].List[j].Key, mPatches[i].List[j].Dat.Idx, patches[i].List[j].Key, patches[i].List[j].Dat.Idx, i);
                            int idxFast = mPatches[i].List[j].Dat.Idx;
                            int idxSlow = patches[i].List[j].Dat.Idx;
                            Console.WriteLine("slow @ fast idx: {0}", GetKey(patches[i].List, idxFast));
                            Console.WriteLine("fast @ slow idx: {0}", GetKey(mPatches[i].List, idxSlow));
                            throw new Exception("Patch item mismatch.");
                        }
                    }
                }
            }
            // *** End of test ***
            //Console.WriteLine("Number of patches: {0}", mPatches.Count);
            //int waka = 0;
            //foreach (Patch patch in mPatches)
            //{
            //    waka += patch.List.Count;
            //}
            //Console.WriteLine("Avg list size: {0}", (double)waka / (double)mPatches.Count);
            Console.WriteLine((DateTime.Now - t).TotalMilliseconds);
            /*prof*/ sw.Reset();
            mLogger.Info("Update", "Constructing system of linear equations ...");
            LabeledDataset <double, SparseVector <double> > lsqrDs = new LabeledDataset <double, SparseVector <double> >();

            Vector2D[] layout = new Vector2D[mDataset.Count - mKClust];
            foreach (Patch patch in mPatches)
            {
                int count = Math.Min(patch.List.Count, mKNn);
                SparseVector <double> eq = new SparseVector <double>();
                double wgt = 1.0 / (double)count;
                for (int i = 0; i < count; i++)
                {
                    eq.InnerIdx.Add(patch.List[i].Dat.Idx);
                    eq.InnerDat.Add(-wgt);
                }
                eq.InnerIdx.Sort(); // *** sort only indices
                eq[patch.Idx] = 1;
                lsqrDs.Add(0, eq);
            }
            for (int i = mDataset.Count - mKClust, j = 0; i < mDataset.Count; i++, j++)
            {
                SparseVector <double> eq = new SparseVector <double>(new IdxDat <double>[] { new IdxDat <double>(i, 1) });
                lsqrDs.Add(mRefPos[j].X, eq);
            }
            LSqrModel lsqr = new LSqrModel();

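            // warm-start LSQR: shift the previous X solution past the dequeued documents,
            // insert zeros for the new instances, and keep the centroid entries at the tail
            // (the Y solution below is shifted the same way)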
            mSolX.RemoveRange(0, numDequeue);
            double[] aux = new double[mKClust];
            mSolX.CopyTo(mSolX.Count - mKClust, aux, 0, mKClust);
            mSolX.RemoveRange(mSolX.Count - mKClust, mKClust);
            foreach (SparseVector <double> newVec in newInst)
            {
                mSolX.Add(0);
            }
            mSolX.AddRange(aux);
            lsqr.InitialSolution = mSolX.ToArray();
            lsqr.Train(lsqrDs);
            mSolX = lsqr.Solution.GetWritableCopy();
            //for (int i = 0; i < lsqr.InitialSolution.Length; i++)
            //{
            //    Console.WriteLine("{0}\t{1}", lsqr.InitialSolution[i], lsqr.Solution[i]);
            //}
            for (int i = 0; i < layout.Length; i++)
            {
                layout[i].X = lsqr.Solution[i];
            }
            for (int i = lsqrDs.Count - mKClust, j = 0; i < lsqrDs.Count; i++, j++)
            {
                lsqrDs[i].Label = mRefPos[j].Y;
            }
            mSolY.RemoveRange(0, numDequeue);
            aux = new double[mKClust];
            mSolY.CopyTo(mSolY.Count - mKClust, aux, 0, mKClust);
            mSolY.RemoveRange(mSolY.Count - mKClust, mKClust);
            foreach (SparseVector <double> newVec in newInst)
            {
                mSolY.Add(0);
            }
            mSolY.AddRange(aux);
            lsqr.InitialSolution = mSolY.ToArray();
            lsqr.Train(lsqrDs);
            mSolY = lsqr.Solution.GetWritableCopy();
            for (int i = 0; i < layout.Length; i++)
            {
                layout[i].Y = lsqr.Solution[i];
            }
            /*prof*/ sw.Save("lsqr.txt", _count);
            // -----------------------------------------------------------------
            // make ptInfo
            // -----------------------------------------------------------------
            ptInfo = new PtInfo[layout.Length];
            int ii = 0;

            foreach (Vector2D pt in layout)
            {
                ptInfo[ii]     = new PtInfo();
                ptInfo[ii].X   = pt.X;
                ptInfo[ii].Y   = pt.Y;
                ptInfo[ii].Vec = mDataset[ii];
                ii++;
            }
            // -----------------------------------------------------------------
            return settings == null ? layout : settings.AdjustLayout(layout);
        }
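A hypothetical driver for this incremental API might slide the document window one batch at a time; initialDataset and batchStream below are illustrative placeholders, and the _count argument merely tags the profiling output files:

        // Hypothetical usage sketch; not from the library.
        IncrementalSemanticSpaceLayout ssl = new IncrementalSemanticSpaceLayout(initialDataset);
        Vector2D[] pts = ssl.ComputeLayout(/*settings=*/ null); // initial layout
        PtInfo[] ptInfo = null;
        int step = 0;
        foreach (ArrayList <SparseVector <double> > batch in batchStream)
        {
            // dequeue as many old documents as we enqueue new ones
            pts = ssl.Update(/*numDequeue=*/ batch.Count, batch, /*test=*/ false,
                             /*settings=*/ null, ref ptInfo, /*_count=*/ step++);
        }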
        public Vector2D[] ComputeLayout(LayoutSettings settings)
        {
            // clustering
            mLogger.Info("ComputeLayout", "Clustering ...");
            mKMeans        = new IncrementalKMeans(mKClust);
            mKMeans.Eps    = mKMeansEps;
            mKMeans.Random = mRandom;
            mKMeans.Trials = 3;
            ClusteringResult clustering = mKMeans.Cluster(mDataset); // throws ArgumentValueException
            // determine reference instances
            UnlabeledDataset <SparseVector <double> > dsRefInst = new UnlabeledDataset <SparseVector <double> >();

            foreach (SparseVector <double> centroid in mKMeans.GetCentroids())
            {
                dsRefInst.Add(centroid); // dataset of reference instances
                mDataset.Add(centroid);  // add centroids to the main dataset
            }
            // position reference instances
            mLogger.Info("ComputeLayout", "Positioning reference instances ...");
            SparseMatrix <double>    simMtx = ModelUtils.GetDotProductSimilarity(dsRefInst, mSimThresh, /*fullMatrix=*/ false);
            StressMajorizationLayout sm     = new StressMajorizationLayout(dsRefInst.Count, new DistFunc(simMtx));

            sm.Random   = mRandom;
            sm.MaxSteps = int.MaxValue;
            sm.MinDiff  = 0.00001;
            mRefPos     = sm.ComputeLayout();
            // k-NN
            mLogger.Info("ComputeLayout", "Computing similarities ...");
            simMtx = ModelUtils.GetDotProductSimilarity(mDataset, mSimThresh, /*fullMatrix=*/ true);
            mLogger.Info("ComputeLayout", "Constructing system of linear equations ...");
            LabeledDataset <double, SparseVector <double> > lsqrDs = new LabeledDataset <double, SparseVector <double> >();

            mPatches = new ArrayList <Patch>(mDataset.Count);
            for (int i = 0; i < mDataset.Count; i++)
            {
                mPatches.Add(new Patch(i));
            }
            foreach (IdxDat <SparseVector <double> > simMtxRow in simMtx)
            {
                if (simMtxRow.Dat.Count <= 1)
                {
                    mLogger.Warn("ComputeLayout", "Instance #{0} has no neighborhood.", simMtxRow.Idx);
                }
                ArrayList <KeyDat <double, int> > knn = new ArrayList <KeyDat <double, int> >(simMtxRow.Dat.Count);
                foreach (IdxDat <double> item in simMtxRow.Dat)
                {
                    if (item.Idx != simMtxRow.Idx)
                    {
                        knn.Add(new KeyDat <double, int>(item.Dat, item.Idx));
                    }
                }
                knn.Sort(DescSort <KeyDat <double, int> > .Instance);
                int count = Math.Min(knn.Count, mKNnExt);
                for (int i = 0; i < count; i++)
                {
                    mPatches[simMtxRow.Idx].List.Add(new KeyDat <double, Patch>(knn[i].Key, mPatches[knn[i].Dat]));
                }
                mPatches[simMtxRow.Idx].ProcessList();
                count = Math.Min(knn.Count, mKNn);
                SparseVector <double> eq = new SparseVector <double>();
                double wgt = 1.0 / (double)count;
                for (int i = 0; i < count; i++)
                {
                    eq.InnerIdx.Add(knn[i].Dat);
                    eq.InnerDat.Add(-wgt);
                }
                eq.InnerIdx.Sort(); // *** sort only indices
                eq[simMtxRow.Idx] = 1;
                lsqrDs.Add(0, eq);
            }
            Vector2D[] layout = new Vector2D[mDataset.Count - mKClust];
            for (int i = mDataset.Count - mKClust, j = 0; i < mDataset.Count; i++, j++)
            {
                SparseVector <double> eq = new SparseVector <double>(new IdxDat <double>[] { new IdxDat <double>(i, 1) });
                lsqrDs.Add(mRefPos[j].X, eq);
            }
            LSqrModel lsqr = new LSqrModel();

            lsqr.Train(lsqrDs);
            mSolX = lsqr.Solution.GetWritableCopy();
            for (int i = 0; i < layout.Length; i++)
            {
                layout[i].X = lsqr.Solution[i];
            }
            for (int i = lsqrDs.Count - mKClust, j = 0; i < lsqrDs.Count; i++, j++)
            {
                lsqrDs[i].Label = mRefPos[j].Y;
            }
            lsqr.Train(lsqrDs);
            mSolY = lsqr.Solution.GetWritableCopy();
            for (int i = 0; i < layout.Length; i++)
            {
                layout[i].Y = lsqr.Solution[i];
            }
            return settings == null ? layout : settings.AdjustLayout(layout);
        }
 public ClusteringResult Cluster(int numOutdated, IUnlabeledExampleCollection<SparseVector<double>> batch)
 {
     Utils.ThrowException(batch == null ? new ArgumentNullException("batch") : null);
     Utils.ThrowException(numOutdated < 0 ? new ArgumentOutOfRangeException("numOutdated") : null);
     if (mDataset == null)
     {
         // initialize
         mLogger.Info("Cluster", "Initializing ...");
         Utils.ThrowException(numOutdated > 0 ? new ArgumentOutOfRangeException("numOutdated") : null);
         //Utils.ThrowException(batch.Count == 0 ? new ArgumentValueException("batch") : null);
         if (batch.Count == 0) { return new ClusteringResult(); }
         kMeans(batch, Math.Min(mK, batch.Count));
         mDataset = new UnlabeledDataset<SparseVector<double>>(batch);
         foreach (CentroidData centroid in mCentroids) { centroid.Tag = mTopicId++; }
         //OutputState();
     }
     else
     {
         // update clusters
         Utils.ThrowException(numOutdated > mDataset.Count ? new ArgumentOutOfRangeException("numOutdated") : null);
         if (numOutdated == 0 && batch.Count == 0) { return GetClusteringResult(); }
         mLogger.Info("Cluster", "Updating clusters ...");
         // assign new instances
         double dummy;
         Assign(mCentroids, ModelUtils.GetTransposedMatrix(batch), batch.Count, /*offs=*/mDataset.Count, out dummy);
         mDataset.AddRange(batch);
         // remove outdated instances
         foreach (CentroidData centroid in mCentroids)
         {
             foreach (int item in centroid.CurrentItems)
             {
                 if (item >= numOutdated) { centroid.Items.Add(item); }
             }
             centroid.Update(mDataset);
             centroid.UpdateCentroidLen();
         }
         mDataset.RemoveRange(0, numOutdated);
         ArrayList<CentroidData> centroidsNew = new ArrayList<CentroidData>(mCentroids.Count);
         foreach (CentroidData centroid in mCentroids)
         {
             if (centroid.CurrentItems.Count > 0)
             {
                 centroidsNew.Add(centroid);
                 Set<int> tmp = new Set<int>();
                 foreach (int idx in centroid.CurrentItems) { tmp.Add(idx - numOutdated); }
                 centroid.CurrentItems.Inner.SetItems(tmp);
             }
         }
         if (centroidsNew.Count == 0) // reset
         {
             mCentroids = null;
             mDataset = null;
             return new ClusteringResult();
         }
         mCentroids = centroidsNew;
         // execute main loop
         kMeansMainLoop(mDataset, mCentroids);
         //OutputState();
     }
     // adjust k
     double minQual; // *** not used at the moment
     int minQualIdx;
     double qual = GetClustQual(out minQual, out minQualIdx);
     if (qual < mQualThresh)
     {
         while (qual < mQualThresh) // split cluster at minQualIdx
         {
             mLogger.Info("Cluster", "Increasing k to {0} ...", mCentroids.Count + 1);
             mCentroids.Add(mCentroids[minQualIdx].Clone());
             mCentroids.Last.Tag = mTopicId++;
             kMeansMainLoop(mDataset, mCentroids);
             if (mCentroids.Last.CurrentItems.Count > mCentroids[minQualIdx].CurrentItems.Count)
             {
                 // swap topic identifiers
                 object tmp = mCentroids.Last.Tag;
                 mCentroids.Last.Tag = mCentroids[minQualIdx].Tag;
                 mCentroids[minQualIdx].Tag = tmp;
             }
             qual = GetClustQual(out minQual, out minQualIdx);
             //OutputState();
         }
     }
     else if (numOutdated > 0)
     {
         while (qual > mQualThresh && mCentroids.Count > 1) // join clusters
         {
             mLogger.Info("Cluster", "Decreasing k to {0} ...", mCentroids.Count - 1);
             ArrayList<CentroidData> centroidsCopy = mCentroids.DeepClone();
             if (mCentroids.Count == 2) // create single cluster
             {
                 object topicId = mCentroids[0].CurrentItems.Count > mCentroids[1].CurrentItems.Count ? mCentroids[0].Tag : mCentroids[1].Tag;
                 mCentroids = new ArrayList<CentroidData>();
                 mCentroids.Add(new CentroidData());
                 for (int i = 0; i < mDataset.Count; i++) { mCentroids.Last.Items.Add(i); }
                 mCentroids.Last.Tag = topicId;
                 mCentroids.Last.Update(mDataset);
                 mCentroids.Last.UpdateCentroidLen();
             }
             else
             {
                 int idx1, idx2;
                 GetMostSimilarClusters(out idx1, out idx2);
                 CentroidData c1 = mCentroids[idx1];
                 CentroidData c2 = mCentroids[idx2];
                 object topicId = c1.CurrentItems.Count > c2.CurrentItems.Count ? c1.Tag : c2.Tag;
                 mCentroids.RemoveAt(idx2);
                 c1.Items.AddRange(c1.CurrentItems);
                 c1.Items.AddRange(c2.CurrentItems);
                 c1.Tag = topicId;
                 c1.Update(mDataset);
                 c1.UpdateCentroidLen();
                 kMeansMainLoop(mDataset, mCentroids);
             }
             qual = GetClustQual();
             if (qual >= mQualThresh)
             {
                 mLogger.Info("Cluster", "Accepted solution at k = {0}.", mCentroids.Count);
             }
             else
             {
                 mCentroids = centroidsCopy;
             }
             //OutputState();
         }
     }
     OutputState();
     return GetClusteringResult();
 }
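As a usage sketch, a streaming loop over this method could look as follows; batchStream is a placeholder, retiring exactly one batch per step is only one possible sliding-window policy, and the enclosing class is assumed to be the IncrementalKMeans constructed in the example above:

     // Hypothetical usage sketch; not from the library.
     IncrementalKMeans clusterer = new IncrementalKMeans(/*k=*/ 10);
     bool first = true;
     foreach (UnlabeledDataset <SparseVector <double> > batch in batchStream)
     {
         // the first call must pass numOutdated = 0; afterwards retire one batch per step
         ClusteringResult result = clusterer.Cluster(first ? 0 : batch.Count, batch);
         first = false;
         Console.WriteLine("Current number of clusters: {0}", result.Roots.Count);
     }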
 public void Load(BinarySerializer reader)
 {
     Utils.ThrowException(reader == null ? new ArgumentNullException("reader") : null);
     // the following statements throw serialization-related exceptions
     mRnd = (Random)reader.ReadDotNetObject();
     mEps = reader.ReadDouble();
     mTrials = reader.ReadInt();
     mK = reader.ReadInt();
     mCentroids = reader.ReadObject<ArrayList<CentroidData>>();
     mDataset = reader.ReadObject<UnlabeledDataset<SparseVector<double>>>();
     mQualThresh = reader.ReadDouble();
     mTopicId = reader.ReadLong();
 }
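The matching Save method is not shown in this listing; assuming BinarySerializer exposes Write* methods symmetric to the Read* calls used by Load (an assumption, not verified against the library), it would presumably mirror Load field for field:

 // Presumed counterpart to Load; the Write* signatures are assumed
 // symmetric to the Read* calls above and are not confirmed here.
 public void Save(BinarySerializer writer)
 {
     Utils.ThrowException(writer == null ? new ArgumentNullException("writer") : null);
     // the following statements throw serialization-related exceptions
     writer.WriteDotNetObject(mRnd);
     writer.WriteDouble(mEps);
     writer.WriteInt(mTrials);
     writer.WriteInt(mK);
     writer.WriteObject(mCentroids);
     writer.WriteObject(mDataset);
     writer.WriteDouble(mQualThresh);
     writer.WriteLong(mTopicId);
 }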