Example #1
0
        /// <summary>
        ///     Writes a summary of a k-means clustering result: one line per cluster
        ///     (member count, sum of member ids, first/last id) and, optionally, a full
        ///     member listing per cluster.
        /// </summary>
        /// <param name="writer">The stream to write the report to.</param>
        /// <param name="result">The clustering result to summarize.</param>
        /// <param name="printMembers">Whether to also list every member of every cluster.</param>
        static void PrintKMeans(StreamWriter writer, ClusteringResult result, bool printMembers)
        {
            writer.WriteLine("KMeans clusters");

            for (int i = 0; i < result.Clusters.Count; i++)
            {
                var cluster = result.Clusters[i];
                var sumOfId = cluster.Members.Sum(e => e.Member.Id);
                var firstId = "None";
                var lastId  = "None";

                if (cluster.Members.Count > 0)
                {
                    firstId = cluster.Members[0].Member.Id.ToString();
                    lastId  = cluster.Members.Last().Member.Id.ToString();
                }
                writer.WriteLine("Cluster {0}: member(s) {4,4} || #{1,5}, first {2,4}, last {3,4}", i + 1, sumOfId, firstId, lastId, cluster.Members.Count);
            }
            writer.Write("\r\n\r\n");

            if (printMembers)
            {
                for (int valCluster = 0; valCluster < result.Clusters.Count; valCluster++)
                {
                    // Use 1-based cluster numbering, consistent with the summary above
                    // (which prints "Cluster {i + 1}"); previously this was 0-based.
                    writer.WriteLine("Cluster {0} members:", valCluster + 1);
                    for (int valMember = 0; valMember < result.Clusters[valCluster].Members.Count; valMember++)
                    {
                        writer.WriteLine("Member {0,4}, id {1,3}", valMember, result.Clusters[valCluster].Members[valMember].Member.Id);
                    }
                    writer.Write("\r\n\r\n");
                }
            }
        }
Example #2
0
        /// <summary>
        ///     Runs the clustering over the current chart data and refreshes the scatter plot.
        ///     Shows a message box and aborts if the clustering computation fails.
        /// </summary>
        private void RunClusteringAndGraph()
        {
            // Nothing to do without a data source.
            if (chartDataSource == null)
            {
                return;
            }

            distanceMetric = DistanceMetric(currentDistanceMatrix);

            try
            {
                clusterResult = ClusterCalculate();
            }
            catch (InvalidOperationException)
            {
                MessageBox.Show("Please try again.");
                return;
            }

            /* Executing scatterplot */
            // Tag every data point with the name of the cluster it was assigned to.
            foreach (var dataPoint in chartDataSource)
            {
                var cluster = clusterResult.FindCluster(dataPoint.Origin);
                if (cluster != null)
                {
                    dataPoint.Group = string.Format("Cluster {0}", cluster.Id);
                }
            }

            // Order by group so clusters are rendered together, then rebuild the plot.
            chartDataSource = chartDataSource.OrderBy(item => item.Group).ToList();
            scatterPlotControl1.BuildScatterPlot(chartDataSource);
        }
Example #3
0
        /// <summary>
        ///     Evaluates an internal criterion over every cluster-set of a clustering,
        ///     saves all evaluations to a CSV file, and prints the best-scoring
        ///     cluster-set (by maximum evaluation value) to the console.
        /// </summary>
        private static void GetBestPartition(
            ClusteringResult <DataPoint> clustering,
            IInternalEvaluationCriterion <DataPoint> criterion, string criterionName)
        {
            // Evaluate the criterion over all cluster-sets in the clustering.
            var evals = clustering.EvaluateClustering(criterion);

            // Persist the per-cluster-set evaluations for later inspection.
            SaveToCsv(evals, Path.GetFullPath(Path.Combine(RESULTS_PATH, $"{criterionName}.csv")), criterionName);

            // Track the cluster-set with the highest evaluation value.
            var maxEval = new ClusterSetEvaluation <DataPoint>(null, double.MinValue);
            foreach (var eval in evals)
            {
                maxEval = eval.EvaluationValue > maxEval.EvaluationValue ? eval : maxEval;
            }

            // Report the best cluster-set, if any.
            Console.WriteLine("======================================");
            Console.WriteLine($"Max {criterionName}: {maxEval.EvaluationValue:0.00}");
            if (maxEval.ClusterSet == null)
            {
                return;
            }

            Console.WriteLine(
                $"Clusters at distance: {maxEval.ClusterSet.Dissimilarity:0.00} ({maxEval.ClusterSet.Count})");
            foreach (var cluster in maxEval.ClusterSet)
            {
                Console.WriteLine($" - {cluster}");
            }
        }
Example #4
0
 /// <summary>
 ///     Saves the given <see cref="ClusteringResult{TInstance}" /> to a d3.js dendrogram file.
 /// </summary>
 /// <typeparam name="TInstance">The type of instance considered.</typeparam>
 /// <param name="clustering">The clustering result to be saved to a dendrogram file.</param>
 /// <param name="filePath">The path to the file in which to save the clustering dendrogram.</param>
 /// <param name="printNames">Whether to include clusters' string representation in their nodes.</param>
 /// <param name="formatting">The Json file formatting.</param>
 public static void SaveD3DendrogramFile <TInstance>(
     this ClusteringResult <TInstance> clustering, string filePath,
     bool printNames = true, Formatting formatting = Formatting.None)
     where TInstance : IComparable <TInstance>
 {
     using (var fs = File.Create(filePath))
         using (var sw = new StreamWriter(fs, Encoding.UTF8))
         {
             // Dispose the JsonTextWriter so any buffered JSON is flushed to the
             // stream before the underlying writers are closed (previously the
             // writer was never closed or flushed explicitly).
             using (var writer = new JsonTextWriter(sw)
             {
                 Formatting = formatting
             })
             {
                 WriteJson(clustering.SingleCluster, writer, printNames);
             }
         }
 }
Example #5
0
        /// <summary>
        ///     Serializes the given clustering's dendrogram into an in-memory JSON string.
        /// </summary>
        /// <typeparam name="TInstance">The type of instance considered.</typeparam>
        /// <param name="clustering">The clustering result to serialize.</param>
        /// <param name="printNames">Whether to include clusters' string representation in their nodes.</param>
        /// <param name="formatting">The Json formatting.</param>
        /// <returns>The dendrogram JSON text.</returns>
        public static string GetDendrogramJson <TInstance>(
            this ClusteringResult <TInstance> clustering,
            bool printNames       = true,
            Formatting formatting = Formatting.None)
            where TInstance : IComparable <TInstance>
        {
            // Write the dendrogram JSON into an in-memory buffer.
            var buffer     = new StringBuilder();
            var jsonWriter = new JsonTextWriter(new StringWriter(buffer))
            {
                Formatting = formatting
            };

            WriteJson(clustering.SingleCluster, jsonWriter, printNames);
            return(buffer.ToString());
        }
Example #6
0
        /// <summary>
        ///     Background-worker entry point: runs agglomerative clustering over the
        ///     loaded data points. <paramref name="e" />.Argument selects the linkage
        ///     criterion (1..5); anything else falls back to average linkage.
        /// </summary>
        private void ClusteringWorkerDoWork(object sender, DoWorkEventArgs e)
        {
            // Nothing to cluster without data points.
            if (this._dataPoints == null || this._dataPoints.Count == 0)
            {
                return;
            }

            // Map the selection index to a linkage criterion.
            ILinkageCriterion <DataPoint> linkage;
            switch (e.Argument)
            {
            case 1:
                linkage = new CompleteLinkage <DataPoint>(this._dissimilarityMetric);
                break;

            case 2:
                linkage = new SingleLinkage <DataPoint>(this._dissimilarityMetric);
                break;

            case 3:
                linkage = new MinimumEnergyLinkage <DataPoint>(this._dissimilarityMetric);
                break;

            case 4:
                linkage = new CentroidLinkage <DataPoint>(this._dissimilarityMetric, DataPoint.GetMedoid);
                break;

            case 5:
                linkage = new WardsMinimumVarianceLinkage <DataPoint>(
                    this._dissimilarityMetric, DataPoint.GetMedoid);
                break;

            default:
                linkage = new AverageLinkage <DataPoint>(this._dissimilarityMetric);
                break;
            }

            // Cluster the data-points with the chosen linkage.
            this._clusteringResult =
                new AgglomerativeClusteringAlgorithm <DataPoint>(linkage).GetClustering(this._dataPoints);
        }
Example #7
0
        /// <summary>
        ///     Loads the data-set selected in the open-file dialog: parses the CSV into
        ///     data points, rebuilds the chart series, resets the clustering state, and
        ///     rescales the chart's X axis and the num-clusters track-bar.
        /// </summary>
        private void LoadDataSet()
        {
            // parse the selected file into data points
            var parser = new CsvParser();
            this._dataPoints = parser.Load(Path.GetFullPath(this.openFileDialog.FileName));

            // start from empty chart series
            this._chartDataPoints.Clear();
            this.ChartPoints.Clear();

            // add each point to the chart while tracking the observed X range
            var xMax = double.MinValue;
            var xMin = double.MaxValue;
            foreach (var point in this._dataPoints)
            {
                var chartPoint = new ChartDataPoint(point.Value[0], point.Value[1])
                {
                    Label = point.ID
                };
                this.ChartPoints.Add(chartPoint);
                this._chartDataPoints.Add(point, chartPoint);
                xMax = Math.Max(xMax, point.Value[0]);
                xMin = Math.Min(xMin, point.Value[0]);
            }

            // reset the clustering state for the new data-set
            this._numClusters         = int.MinValue;
            this._clusteringResult    = null;
            this._dissimilarityMetric = new CachedDissimilarityMetric <DataPoint>(new DataPoint(), this._dataPoints);

            // scale the track-bar step sizes to the number of data points
            this.numClustersTrackBar.SmallChange = (uint)(this.numClustersTrackBar.Maximum / this._dataPoints.Count);
            this.numClustersTrackBar.LargeChange = this.numClustersTrackBar.SmallChange * 5;

            // fit the X axis to the data range
            this.datasetChart.ChartAreas[0].AxisX.Maximum = Math.Ceiling(xMax);
            this.datasetChart.ChartAreas[0].AxisX.Minimum = Math.Floor(xMin);
        }
Example #8
0
 // Trains the hierarchical model over the given dataset and cluster hierarchy:
 // computes a centroid for every root cluster (ComputeCentroid presumably recurses
 // into child clusters and populates mModel — confirm against its definition).
 // Throws ArgumentNullException / ArgumentValueException on invalid arguments.
 public void Train(IUnlabeledExampleCollection <SparseVector <double> > dataset, ClusteringResult hierarchy)
 {
     // Guard order matters: null checks precede the corresponding Count checks.
     Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
     Utils.ThrowException(dataset.Count == 0 ? new ArgumentValueException("dataset") : null);
     Utils.ThrowException(hierarchy == null ? new ArgumentNullException("hierarchy") : null);
     Utils.ThrowException(hierarchy.Roots.Count == 0 ? new ArgumentValueException("hierarchy") : null);
     mModel   = new Dictionary <Cluster, ClusterInfo>();
     mDataset = dataset;
     foreach (Cluster root in hierarchy.Roots)
     {
         ComputeCentroid(root);
     }
     // Release the dataset reference once training is done.
     mDataset = null;
 }
Example #9
0
 // Non-generic adapter: validates the dataset's runtime element type, then
 // delegates to the strongly typed Train overload.
 void IHierarchicalModel.Train(IUnlabeledExampleCollection dataset, ClusteringResult hierarchy)
 {
     Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
     Utils.ThrowException(!(dataset is IUnlabeledExampleCollection <SparseVector <double> >) ? new ArgumentTypeException("dataset") : null);
     Train((IUnlabeledExampleCollection <SparseVector <double> >)dataset, hierarchy); // throws ArgumentNullException, ArgumentValueException
 }
Example #10
0
        /// <summary>
        ///     Demo: clusters Yahoo Finance news documents into 100 groups with k-means
        ///     over a TF-IDF bag-of-words space, prints the top centroid keywords of
        ///     each cluster, then lists the documents of the first cluster.
        /// </summary>
        static void Main(string[] args)
        {
            // Language tools for English: stop words and a stemmer.
            IStemmer stemmer;
            Set <string> .ReadOnly stopWords;
            TextMiningUtils.GetLanguageTools(Language.English,
                                             out stopWords, out stemmer);

            // Tokenizer: strictly alphabetic tokens, at least 2 characters long.
            UnicodeTokenizer tokenizer = new UnicodeTokenizer
            {
                MinTokenLen = 2,
                Filter      = TokenizerFilter.AlphaStrict
            };

            // Load the corpus: each line of the file is one document.
            string[] docs
                = File.ReadAllLines("..\\..\\Data\\YahooFinance.txt");

            // Bag-of-words space: normalized TF-IDF vectors; a term must appear at
            // least 3 times, n-grams up to 3 words are considered, and the lightest
            // terms (summing to 20% of the weight) are cut from each vector.
            BowSpace bowSpc = new BowSpace
            {
                Tokenizer         = tokenizer,
                StopWords         = stopWords,
                Stemmer           = stemmer,
                MinWordFreq       = 3,
                MaxNGramLen       = 3,
                WordWeightType    = WordWeightType.TfIdf,
                NormalizeVectors  = true,
                CutLowWeightsPerc = 0.2
            };

            bowSpc.Initialize(docs);

            // k-means with k = 100: best of 3 trials, stop iterating once the
            // partition quality improves by less than 0.001.
            KMeansFast kMeans = new KMeansFast(100)
            {
                Trials = 3,
                Eps    = 0.001
            };

            ClusteringResult cr = kMeans.Cluster(bowSpc);

            // For each cluster: top 5 TF-IDF terms of its centroid plus its size.
            foreach (Cluster cl in cr.Roots)
            {
                SparseVector <double> .ReadOnly centroid
                    = cl.ComputeCentroid(bowSpc, CentroidType.NrmL2);
                Console.Write(bowSpc.GetKeywordsStr(centroid, 5));
                Console.WriteLine(" ({0} companies)", cl.Items.Count);
            }

            // Print the documents contained in the first cluster.
            foreach (int docIdx in cr.Roots[0].Items)
            {
                Console.WriteLine(docs[docIdx]);
            }
        }
Example #11
0
        // Computes a 2D layout for mDataset: k-means cluster the data, position the
        // cluster centroids with stress majorization, then place every point by
        // solving two least-squares systems (one per coordinate) over a k-NN
        // similarity graph. Returns one Vector2D per original dataset instance.
        public Vector2D[] ComputeLayout(LayoutSettings settings)
        {
            UnlabeledDataset <SparseVector <double> > dataset = new UnlabeledDataset <SparseVector <double> >(mDataset);

            // clustering
            mLogger.Info("ComputeLayout", "Clustering ...");
            KMeansFast kMeans = new KMeansFast(mKClust);

            kMeans.Eps    = mKMeansEps;
            kMeans.Random = mRandom;
            kMeans.Trials = 1;
            ClusteringResult clustering = kMeans.Cluster(mDataset); // throws ArgumentValueException
            // determine reference instances (the cluster centroids)
            UnlabeledDataset <SparseVector <double> > dsRefInst = new UnlabeledDataset <SparseVector <double> >();

            foreach (Cluster cluster in clustering.Roots)
            {
                // an empty cluster contributes an empty (zero) centroid vector
                SparseVector <double> centroid
                    = cluster.Items.Count > 0 ? cluster.ComputeCentroid(mDataset, CentroidType.NrmL2) : new SparseVector <double>();
                dsRefInst.Add(centroid); // dataset of reference instances
                dataset.Add(centroid);   // add centroids to the main dataset
            }
            // position reference instances via stress majorization
            mLogger.Info("ComputeLayout", "Positioning reference instances ...");
            SparseMatrix <double>    simMtx = ModelUtils.GetDotProductSimilarity(dsRefInst, mSimThresh, /*fullMatrix=*/ false);
            StressMajorizationLayout sm     = new StressMajorizationLayout(dsRefInst.Count, new DistFunc(simMtx));

            sm.Random = mRandom;
            Vector2D[] centrPos = sm.ComputeLayout();
            // k-NN
            mLogger.Info("ComputeLayout", "Computing similarities ...");
            simMtx = ModelUtils.GetDotProductSimilarity(dataset, mSimThresh, /*fullMatrix=*/ true);
            mLogger.Info("ComputeLayout", "Constructing system of linear equations ...");
            LabeledDataset <double, SparseVector <double> > lsqrDs = new LabeledDataset <double, SparseVector <double> >();

            // one equation per instance: x_i - (1/count) * sum(x_neighbors) = 0,
            // where the neighbors are the (at most mKNN) most similar instances
            foreach (IdxDat <SparseVector <double> > simMtxRow in simMtx)
            {
                if (simMtxRow.Dat.Count <= 1)
                {
                    mLogger.Warn("ComputeLayout", "Instance #{0} has no neighborhood.", simMtxRow.Idx);
                }
                // collect neighbors (excluding the instance itself)
                ArrayList <KeyDat <double, int> > knn = new ArrayList <KeyDat <double, int> >(simMtxRow.Dat.Count);
                foreach (IdxDat <double> item in simMtxRow.Dat)
                {
                    if (item.Idx != simMtxRow.Idx)
                    {
                        knn.Add(new KeyDat <double, int>(item.Dat, item.Idx));
                    }
                }
                knn.Sort(DescSort <KeyDat <double, int> > .Instance);
                int count = Math.Min(knn.Count, mKNN);
                SparseVector <double> eq = new SparseVector <double>();
                double wgt = 1.0 / (double)count;
                for (int i = 0; i < count; i++)
                {
                    eq.InnerIdx.Add(knn[i].Dat);
                    eq.InnerDat.Add(-wgt);
                }
                eq.InnerIdx.Sort(); // *** sort only indices
                eq[simMtxRow.Idx] = 1;
                lsqrDs.Add(0, eq);
            }
            // anchor equations pinning each centroid to its stress-majorization X position
            Vector2D[] layout = new Vector2D[dataset.Count - mKClust];
            for (int i = dataset.Count - mKClust, j = 0; i < dataset.Count; i++, j++)
            {
                SparseVector <double> eq = new SparseVector <double>(new IdxDat <double>[] { new IdxDat <double>(i, 1) });
                lsqrDs.Add(centrPos[j].X, eq);
            }
            LSqrModel lsqr = new LSqrModel();

            // solve for the X coordinates
            lsqr.Train(lsqrDs);
            for (int i = 0; i < layout.Length; i++)
            {
                layout[i].X = lsqr.Solution[i];
            }
            // re-label the anchor equations with the Y positions and solve again
            for (int i = lsqrDs.Count - mKClust, j = 0; i < lsqrDs.Count; i++, j++)
            {
                lsqrDs[i].Label = centrPos[j].Y;
            }
            lsqr.Train(lsqrDs);
            for (int i = 0; i < layout.Length; i++)
            {
                layout[i].Y = lsqr.Solution[i];
            }
            return(settings == null ? layout : settings.AdjustLayout(layout));
        }
Example #12
0
 internal void AddMathSet(IMathSet set)
 {
     // Attaching a new math set invalidates any previously computed result.
     this.m_cResult = null;
     this.m_mathSet = set;
 }
 public void AddMathSet(IMathSet set)
 {
     // Attaching a new math set invalidates any previously computed result.
     this.m_cResult = null;
     this.m_mathSet = set;
 }
Example #14
0
        //private double GetQual()
        //{
        //    double clustQual = 0;
        //    foreach (Centroid centroid in mCentroids)
        //    {
        //        foreach (int itemIdx in centroid.CurrentItems)
        //        {
        //            clustQual += centroid.GetDotProduct(mDataset[itemIdx]);
        //        }
        //    }
        //    clustQual /= (double)mDataset.Count;
        //    return clustQual;
        //}

        // TODO: exceptions
        // Incrementally updates the k-means clustering after a sliding-window step:
        // the first dequeueN instances are dropped from mDataset, the instances in
        // addList are appended, the existing centroids are remapped to the shifted
        // item indices, and the k-means loop re-runs until the quality gain drops to
        // mEps or below. On return, iter holds the number of iterations performed.
        // NOTE(review): addList is enumerated twice (AddRange below and the
        // assignment loop) — callers should pass a materialized collection; confirm.
        public ClusteringResult Update(int dequeueN, IEnumerable <SparseVector <double> > addList, ref int iter)
        {
            StopWatch stopWatch = new StopWatch();

            // update centroid data (1): keep only the items that survive the dequeue
            foreach (CentroidData centroid in mCentroids)
            {
                foreach (int item in centroid.CurrentItems)
                {
                    if (item >= dequeueN)
                    {
                        centroid.Items.Add(item);
                    }
                }
                centroid.Update(mDataset);
                centroid.UpdateCentroidLen();
            }
            //Console.WriteLine(">>> {0} >>> update centroid data (1)", stopWatch.TotalMilliseconds);
            stopWatch.Reset();
            // update dataset: drop the oldest dequeueN instances, append the new ones
            mDataset.RemoveRange(0, dequeueN);
            int ofs = mDataset.Count; // index where the newly added instances start

            mDataset.AddRange(addList);
            //Console.WriteLine(">>> {0} >>> update dataset", stopWatch.TotalMilliseconds);
            stopWatch.Reset();
            // update centroid data (2): shift item indices to account for the removal
            foreach (CentroidData centroid in mCentroids)
            {
                Set <int> itemsOfs = new Set <int>();
                foreach (int item in centroid.CurrentItems)
                {
                    itemsOfs.Add(item - dequeueN);
                }
                centroid.CurrentItems.Inner.SetItems(itemsOfs);
                centroid.Items.SetItems(itemsOfs);
            }
            //Console.WriteLine(">>> {0} >>> update centroid data (2)", stopWatch.TotalMilliseconds);
            stopWatch.Reset();
            // assign each new instance to its most similar centroid
            double bestClustQual = 0;

            {
                mLogger.Info("Update", "Initializing ...");
                int i = 0;
                foreach (SparseVector <double> example in addList)
                {
                    // find the centroid(s) with the highest dot-product similarity
                    double          maxSim     = double.MinValue;
                    ArrayList <int> candidates = new ArrayList <int>();
                    for (int j = 0; j < mK; j++)
                    {
                        double sim = mCentroids[j].GetDotProduct(example);
                        if (sim > maxSim)
                        {
                            maxSim = sim;
                            candidates.Clear();
                            candidates.Add(j);
                        }
                        else if (sim == maxSim)
                        {
                            candidates.Add(j);
                        }
                    }
                    // ties are broken at random
                    if (candidates.Count > 1)
                    {
                        candidates.Shuffle(mRnd);
                    }
                    if (candidates.Count > 0) // *** is this always true?
                    {
                        mCentroids[candidates[0]].Items.Add(ofs + i);
                    }
                    i++;
                }
                // update centroids
                foreach (CentroidData centroid in mCentroids)
                {
                    centroid.Update(mDataset);
                    centroid.UpdateCentroidLen();
                }
                //Console.WriteLine(GetQual());
                // initial quality: average similarity of items to their centroid
                foreach (CentroidData centroid in mCentroids)
                {
                    foreach (int itemIdx in centroid.CurrentItems)
                    {
                        bestClustQual += centroid.GetDotProduct(mDataset[itemIdx]);
                    }
                }
                bestClustQual /= (double)mDataset.Count;
                mLogger.Info("Update", "Quality: {0:0.0000}", bestClustQual);
            }
            //Console.WriteLine(">>> {0} >>> assign new instances", stopWatch.TotalMilliseconds);
            stopWatch.Reset();
            // main k-means loop
            iter = 0;
            while (true)
            {
                iter++;
                mLogger.Info("Update", "Iteration {0} ...", iter);
                // assign items to clusters
                for (int i = 0; i < mDataset.Count; i++)
                {
                    SparseVector <double> example = mDataset[i];
                    double          maxSim        = double.MinValue;
                    ArrayList <int> candidates    = new ArrayList <int>();
                    for (int j = 0; j < mK; j++)
                    {
                        double sim = mCentroids[j].GetDotProduct(example);
                        if (sim > maxSim)
                        {
                            maxSim = sim;
                            candidates.Clear();
                            candidates.Add(j);
                        }
                        else if (sim == maxSim)
                        {
                            candidates.Add(j);
                        }
                    }
                    // ties are broken at random
                    if (candidates.Count > 1)
                    {
                        candidates.Shuffle(mRnd);
                    }
                    if (candidates.Count > 0) // *** is this always true?
                    {
                        mCentroids[candidates[0]].Items.Add(i);
                    }
                }
                //
                // *** OPTIMIZE THIS with GetDotProductSimilarity (see this.Cluster) !!! ***
                //
                //Console.WriteLine(">>> {0} >>> loop: assign items to clusters", stopWatch.TotalMilliseconds);
                stopWatch.Reset();
                double clustQual = 0;
                // update centroids
                foreach (CentroidData centroid in mCentroids)
                {
                    centroid.Update(mDataset);
                    centroid.UpdateCentroidLen();
                }
                //Console.WriteLine(GetQual());
                // quality: average similarity of items to their centroid
                foreach (CentroidData centroid in mCentroids)
                {
                    foreach (int itemIdx in centroid.CurrentItems)
                    {
                        clustQual += centroid.GetDotProduct(mDataset[itemIdx]);
                    }
                }
                clustQual /= (double)mDataset.Count;
                //Console.WriteLine(">>> {0} >>> loop: update centroids", stopWatch.TotalMilliseconds);
                stopWatch.Reset();
                mLogger.Info("Update", "Quality: {0:0.0000} Diff: {1:0.0000}", clustQual, clustQual - bestClustQual);
                // check if done: stop once the quality improvement is at most mEps
                if (clustQual - bestClustQual <= mEps)
                {
                    break;
                }
                bestClustQual = clustQual;
            }
            // save the result: one flat root cluster per centroid
            ClusteringResult clustering = new ClusteringResult();

            for (int i = 0; i < mK; i++)
            {
                clustering.AddRoot(new Cluster());
                clustering.Roots.Last.Items.AddRange(mCentroids[i].Items);
            }
            return(clustering);
        }
 // Stores an externally computed clustering result on this instance.
 public void AddClusteringResult(ClusteringResult set)
 {
     this.m_cResult = set;
 }
Example #16
0
        // Runs k-means (mK clusters) over the dataset, repeating the whole procedure
        // mTrials times and keeping the trial with the best final average dot-product
        // quality. Also records the winning centroids and medoids in mCentroids /
        // mMedoids. Throws ArgumentNullException / ArgumentValueException on invalid
        // input. Returns a flat ClusteringResult with one root cluster per centroid.
        public ClusteringResult Cluster(IUnlabeledExampleCollection <SparseVector <double> > dataset)
        {
            Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
            Utils.ThrowException(dataset.Count < mK ? new ArgumentValueException("dataset") : null);
            mDataset = new UnlabeledDataset <SparseVector <double> >(dataset);
            ClusteringResult clustering          = null;
            double           globalBestClustQual = 0;

            for (int trial = 1; trial <= mTrials; trial++)
            {
                mLogger.Info("Cluster", "Clustering trial {0} of {1} ...", trial, mTrials);
                ArrayList <CentroidData> centroids = new ArrayList <CentroidData>(mK);
                ArrayList <int>          bestSeeds = null;
                for (int i = 0; i < mK; i++)
                {
                    centroids.Add(new CentroidData());
                }
                // select seed items: draw 3 random samples of mK items and keep the
                // most mutually dissimilar one (lowest average pairwise similarity)
                double          minSim = double.MaxValue;
                ArrayList <int> tmp    = new ArrayList <int>(mDataset.Count);
                for (int i = 0; i < mDataset.Count; i++)
                {
                    tmp.Add(i);
                }
                for (int k = 0; k < 3; k++)
                {
                    ArrayList <SparseVector <double> > seeds = new ArrayList <SparseVector <double> >(mK);
                    tmp.Shuffle(mRnd);
                    for (int i = 0; i < mK; i++)
                    {
                        seeds.Add(mDataset[tmp[i]]);
                    }
                    // assess quality of seed items (average pairwise similarity)
                    double simAvg = 0;
                    foreach (SparseVector <double> seed1 in seeds)
                    {
                        foreach (SparseVector <double> seed2 in seeds)
                        {
                            if (seed1 != seed2)
                            {
                                simAvg += DotProductSimilarity.Instance.GetSimilarity(seed1, seed2);
                            }
                        }
                    }
                    simAvg /= (double)(mK * mK - mK);
                    //Console.WriteLine(simAvg);
                    if (simAvg < minSim)
                    {
                        minSim    = simAvg;
                        bestSeeds = new ArrayList <int>(mK);
                        for (int i = 0; i < mK; i++)
                        {
                            bestSeeds.Add(tmp[i]);
                        }
                    }
                }
                // initialize each centroid with one seed item; medoids track the most
                // central item of each cluster (key = similarity, dat = item index)
                ArrayList <KeyDat <double, int> > medoids = new ArrayList <KeyDat <double, int> >(mK);
                for (int i = 0; i < mK; i++)
                {
                    centroids[i].Items.Add(bestSeeds[i]);
                    centroids[i].Update(mDataset);
                    centroids[i].UpdateCentroidLen();
                    medoids.Add(new KeyDat <double, int>(-1, bestSeeds[i]));
                }
                double[,] dotProd = new double[mDataset.Count, mK];
                SparseMatrix <double> dsMat = ModelUtils.GetTransposedMatrix(mDataset);
                // main loop
                int    iter          = 0;
                double bestClustQual = 0;
                double clustQual;
                while (true)
                {
                    iter++;
                    mLogger.Info("Cluster", "Iteration {0} ...", iter);
                    clustQual = 0;
                    // assign items to clusters
                    //StopWatch stopWatch = new StopWatch();
                    // precompute item-to-centroid similarities, one centroid at a time.
                    // NOTE(review): dotProd is only overwritten where the new similarity
                    // is > 0, so values from a previous iteration can linger in the
                    // other cells — confirm this is intended.
                    int j = 0;
                    foreach (CentroidData cen in centroids)
                    {
                        SparseVector <double> cenVec = cen.GetSparseVector();
                        double[] dotProdSimVec       = ModelUtils.GetDotProductSimilarity(dsMat, mDataset.Count, cenVec);
                        for (int i = 0; i < dotProdSimVec.Length; i++)
                        {
                            if (dotProdSimVec[i] > 0)
                            {
                                dotProd[i, j] = dotProdSimVec[i];
                            }
                        }
                        j++;
                    }
                    for (int dsInstIdx = 0; dsInstIdx < mDataset.Count; dsInstIdx++)
                    {
                        double          maxSim     = double.MinValue;
                        ArrayList <int> candidates = new ArrayList <int>();
                        for (int cenIdx = 0; cenIdx < mK; cenIdx++)
                        {
                            double sim = dotProd[dsInstIdx, cenIdx];
                            if (sim > maxSim)
                            {
                                maxSim = sim;
                                candidates.Clear();
                                candidates.Add(cenIdx);
                            }
                            else if (sim == maxSim)
                            {
                                candidates.Add(cenIdx);
                            }
                        }
                        // ties are broken at random
                        if (candidates.Count > 1)
                        {
                            candidates.Shuffle(mRnd);
                        }
                        if (candidates.Count > 0) // *** is this always true?
                        {
                            centroids[candidates[0]].Items.Add(dsInstIdx);
                            clustQual += maxSim;
                            if (medoids[candidates[0]].Key < maxSim)
                            {
                                medoids[candidates[0]] = new KeyDat <double, int>(maxSim, dsInstIdx);
                            }
                        }
                    }
                    //Console.WriteLine(stopWatch.TotalMilliseconds);
                    clustQual /= (double)mDataset.Count;
                    mLogger.Info("Cluster", "Quality: {0:0.0000}", clustQual);
                    // compute new centroids
                    for (int i = 0; i < mK; i++)
                    {
                        centroids[i].Update(mDataset);
                        centroids[i].UpdateCentroidLen();
                    }
                    // check if done: stop once the quality gain is at most mEps
                    if (iter > 1 && clustQual - bestClustQual <= mEps)
                    {
                        break;
                    }
                    bestClustQual = clustQual;
                    // reset medoid similarities for the next assignment pass
                    for (int i = 0; i < medoids.Count; i++)
                    {
                        medoids[i] = new KeyDat <double, int>(-1, medoids[i].Dat);
                    }
                }
                // keep the best trial by final quality
                if (trial == 1 || clustQual > globalBestClustQual)
                {
                    globalBestClustQual = clustQual;
                    mCentroids          = centroids;
                    mMedoids            = medoids;
                    // save the result
                    clustering = new ClusteringResult();
                    for (int i = 0; i < mK; i++)
                    {
                        clustering.AddRoot(new Cluster());
                        clustering.Roots.Last.Items.AddRange(centroids[i].Items);
                    }
                }
            }
            return(clustering);
        }
        /// <summary>
        /// Computes a 2-D layout for mDataset in three stages: (1) k-means clustering
        /// selects mKClust centroids as "reference instances"; (2) stress majorization
        /// positions those references in the plane; (3) two sparse least-squares solves
        /// (one for X, one for Y) place every remaining instance as a weighted average
        /// of its k nearest neighbors, anchored to the reference positions.
        /// </summary>
        /// <param name="settings">Optional layout post-processing; when null the raw layout is returned.</param>
        /// <returns>One 2-D point per original dataset instance (the appended centroids are excluded).</returns>
        public Vector2D[] ComputeLayout(LayoutSettings settings)
        {
            // clustering: incremental k-means, best of 3 trials
            mLogger.Info("ComputeLayout", "Clustering ...");
            mKMeans        = new IncrementalKMeans(mKClust);
            mKMeans.Eps    = mKMeansEps;
            mKMeans.Random = mRandom;
            mKMeans.Trials = 3;
            // NOTE(review): the returned ClusteringResult is never read below; the call is
            // kept for its side effect of computing the centroids retrieved via GetCentroids().
            ClusteringResult clustering = mKMeans.Cluster(mDataset); // throws ArgumentValueException
            // determine reference instances
            UnlabeledDataset <SparseVector <double> > dsRefInst = new UnlabeledDataset <SparseVector <double> >();

            foreach (SparseVector <double> centroid in mKMeans.GetCentroids())
            {
                dsRefInst.Add(centroid); // dataset of reference instances
                mDataset.Add(centroid);  // add centroids to the main dataset (they occupy the last mKClust slots)
            }
            // position reference instances via stress majorization on their pairwise similarities
            mLogger.Info("ComputeLayout", "Positioning reference instances ...");
            SparseMatrix <double>    simMtx = ModelUtils.GetDotProductSimilarity(dsRefInst, mSimThresh, /*fullMatrix=*/ false);
            StressMajorizationLayout sm     = new StressMajorizationLayout(dsRefInst.Count, new DistFunc(simMtx));

            sm.Random   = mRandom;
            sm.MaxSteps = int.MaxValue;
            sm.MinDiff  = 0.00001;
            mRefPos     = sm.ComputeLayout(); // one 2-D position per reference instance
            // k-NN: similarities over the full dataset (instances + appended centroids)
            mLogger.Info("ComputeLayout", "Computing similarities ...");
            simMtx = ModelUtils.GetDotProductSimilarity(mDataset, mSimThresh, /*fullMatrix=*/ true);
            mLogger.Info("ComputeLayout", "Constructing system of linear equations ...");
            LabeledDataset <double, SparseVector <double> > lsqrDs = new LabeledDataset <double, SparseVector <double> >();

            // one Patch per dataset instance; patches record each instance's strongest neighbors
            mPatches = new ArrayList <Patch>(mDataset.Count);
            for (int i = 0; i < mDataset.Count; i++)
            {
                mPatches.Add(new Patch(i));
            }
            foreach (IdxDat <SparseVector <double> > simMtxRow in simMtx)
            {
                // a row containing only the diagonal entry means the instance has no neighbor above mSimThresh
                if (simMtxRow.Dat.Count <= 1)
                {
                    mLogger.Warn("ComputeLayout", "Instance #{0} has no neighborhood.", simMtxRow.Idx);
                }
                // collect (similarity, neighborIdx) pairs, excluding the instance itself
                ArrayList <KeyDat <double, int> > knn = new ArrayList <KeyDat <double, int> >(simMtxRow.Dat.Count);
                foreach (IdxDat <double> item in simMtxRow.Dat)
                {
                    if (item.Idx != simMtxRow.Idx)
                    {
                        knn.Add(new KeyDat <double, int>(item.Dat, item.Idx));
                    }
                }
                knn.Sort(DescSort <KeyDat <double, int> > .Instance); // most similar first
                // extended neighborhood (mKNnExt) feeds the patches
                int count = Math.Min(knn.Count, mKNnExt);
                for (int i = 0; i < count; i++)
                {
                    mPatches[simMtxRow.Idx].List.Add(new KeyDat <double, Patch>(knn[i].Key, mPatches[knn[i].Dat]));
                }
                mPatches[simMtxRow.Idx].ProcessList();
                // tighter neighborhood (mKNn) builds one least-squares equation:
                // x_i - (1/count) * sum(x_neighbors) = 0, i.e. each point sits at its neighbors' mean
                count = Math.Min(knn.Count, mKNn);
                SparseVector <double> eq = new SparseVector <double>();
                double wgt = 1.0 / (double)count; // if count == 0 the loop is skipped, so the Infinity is never stored
                for (int i = 0; i < count; i++)
                {
                    eq.InnerIdx.Add(knn[i].Dat);
                    eq.InnerDat.Add(-wgt);
                }
                eq.InnerIdx.Sort(); // *** sort only indices
                eq[simMtxRow.Idx] = 1; // diagonal coefficient
                lsqrDs.Add(0, eq);     // homogeneous rows; anchors are added below
            }
            // anchor rows: pin each reference instance (the last mKClust dataset slots) to its X coordinate
            Vector2D[] layout = new Vector2D[mDataset.Count - mKClust];
            for (int i = mDataset.Count - mKClust, j = 0; i < mDataset.Count; i++, j++)
            {
                SparseVector <double> eq = new SparseVector <double>(new IdxDat <double>[] { new IdxDat <double>(i, 1) });
                lsqrDs.Add(mRefPos[j].X, eq);
            }
            LSqrModel lsqr = new LSqrModel();

            // first solve: X coordinates
            lsqr.Train(lsqrDs);
            mSolX = lsqr.Solution.GetWritableCopy();
            for (int i = 0; i < layout.Length; i++)
            {
                layout[i].X = lsqr.Solution[i];
            }
            // second solve: relabel the anchor rows with Y coordinates and re-train
            for (int i = lsqrDs.Count - mKClust, j = 0; i < lsqrDs.Count; i++, j++)
            {
                lsqrDs[i].Label = mRefPos[j].Y;
            }
            lsqr.Train(lsqrDs);
            mSolY = lsqr.Solution.GetWritableCopy();
            for (int i = 0; i < layout.Length; i++)
            {
                layout[i].Y = lsqr.Solution[i];
            }
            return(settings == null ? layout : settings.AdjustLayout(layout));
        }
Example #18
0
 /// <summary>Stores the supplied clustering result on this instance.</summary>
 /// <param name="set">Result to keep; replaces any previously stored result.</param>
 internal void AddClusteringResult(ClusteringResult set) => m_cResult = set;
Example #19
0
        /// <summary>
        /// Demo driver: spawns CLUSTER_COUNT gaussian point blobs, shows them colored
        /// and uncolored, clusters them with k-means (final result + per-iteration
        /// history), then with agglomerative centroid-linkage clustering, and finally
        /// writes a D3 dendrogram JSON next to the executable.
        /// </summary>
        public static void Main(string[] args)
        {
            // Generate the blobs and plot each one in its own color, with its center marked.
            var sourcePlot   = new Plot();
            var pointSpawner = new Spawner(STD_DEV);
            var everyPoint   = new List <PointF>();

            for (int blob = 0; blob < CLUSTER_COUNT; ++blob)
            {
                pointSpawner.ResetCenter(MIN_CENTER_DISTANCE, MAX_CENTER_DISTANCE);

                PointF[] spawned = pointSpawner.Spawn(POINT_COUNT);
                everyPoint.AddRange(spawned);

                Color blobColor = sourcePlot.GetNextColor();

                sourcePlot.AddScatterPoints(spawned, blobColor, label: $"Points {blob + 1}");
                sourcePlot.AddPoint(pointSpawner.Center.X, pointSpawner.Center.Y, blobColor, 25);
            }

            sourcePlot.Legend();

            var sourceForm = new PlotForm(sourcePlot, "source_data");

            sourceForm.ShowDialog();

            // Same data again, but without cluster coloring.
            var grayPlot = new Plot();

            grayPlot.AddScatterPoints(everyPoint.ToArray(), label: "Gray points");
            grayPlot.Legend();

            var grayForm = new PlotForm(grayPlot, "gray_data");

            grayForm.ShowDialog();

            // K-means: show the final assignment, then one plot per iteration.
            var clusterizer = new KMeansClusterizer();
            var history     = clusterizer.Clusterize(everyPoint, CLUSTER_COUNT);

            var finalForm = new PlotForm(CreateClusterizingPlot(history.Last()), "crusterized");

            finalForm.ShowDialog();

            var historyForm = new PlotForm(history.Select(c => CreateClusterizingPlot(c)).ToList(), "history_");

            historyForm.ShowDialog();

            // Agglomerative clustering with centroid linkage over the same points.
            var linkage = new CentroidLinkage <DataPoint>(
                new DissimilarityMetric(),
                cluster => new DataPoint(
                    cluster.Average(p => p.X),
                    cluster.Average(p => p.Y)
                    )
                );
            var algorithm = new AgglomerativeClusteringAlgorithm <DataPoint>(linkage);

            HashSet <DataPoint>          dataPoints       = everyPoint.Select(p => new DataPoint(p)).ToHashSet();
            ClusteringResult <DataPoint> clusteringResult = algorithm.GetClustering(dataPoints);
            // Cut the dendrogram three merges from the top.
            ClusterSet <DataPoint> cut = clusteringResult[clusteringResult.Count - 3];

            var aglomeraPlot = new Plot();

            foreach (Cluster <DataPoint> member in cut)
            {
                Color memberColor = aglomeraPlot.GetNextColor();

                aglomeraPlot.AddScatterPoints(
                    member.Select(p => (double)p.X).ToArray(),
                    member.Select(p => (double)p.Y).ToArray(),
                    memberColor
                    );

                // Mark the cluster centroid with a large dot.
                aglomeraPlot.AddPoint(
                    member.Select(p => p.X).Average(),
                    member.Select(p => p.Y).Average(),
                    memberColor, 25
                    );
            }

            var aglomeraForm = new PlotForm(aglomeraPlot, "aglomera");

            aglomeraForm.ShowDialog();

            clusteringResult.SaveD3DendrogramFile(Environment.CurrentDirectory + "/dendro.json");

            Console.ReadLine();
        }