Example #1
0
        /**
         * Orders clusters from best to worst so they can be sorted. A cluster is
         * better when its documents are, on average, more similar to its centroid;
         * the better cluster gets a negative result so that it is put first.
         *
         * Clusters holding fewer than two documents are ranked after every other
         * cluster and are considered equal to one another. Returning 0 for two
         * such clusters (and for a cluster compared with itself) keeps the
         * ordering consistent, which sort routines rely on.
         *
         * other:    the cluster this one is compared against
         * analyzer: provides Similarity(centroid, documentIndex)
         *
         * Returns -1 if this cluster sorts first, 1 if other sorts first,
         * and 0 if they rank the same.
         */
        public int CompareTo(Cluster other, DocumentAnalyzer analyzer)
        {
            // TODO: Find better method, this one favors smaller clusters

            // Put clusters with at most one article at the end. An empty cluster
            // is included here because its average below would be 0/0 = NaN.
            bool thisIsTrivial  = this.Documents.Count <= 1;
            bool otherIsTrivial = other.Documents.Count <= 1;

            if (thisIsTrivial && otherIsTrivial)
            {
                return(0);
            }
            if (thisIsTrivial)
            {
                return(1);
            }
            if (otherIsTrivial)
            {
                return(-1);
            }

            // Average similarity of each cluster's documents to its own centroid
            double simA = 0;
            double simB = 0;

            foreach (int i in this.Documents)
            {
                simA += analyzer.Similarity(this.Centroid, i);
            }
            foreach (int i in other.Documents)
            {
                simB += analyzer.Similarity(other.Centroid, i);
            }
            simA /= this.Documents.Count;
            simB /= other.Documents.Count;

            // Higher average similarity means less deviation, so it sorts first
            if (simA > simB)
            {
                return(-1);
            }
            if (simA < simB)
            {
                return(1);
            }
            return(0);
        }
Example #2
0
        /**
         * Groups the articles into k clusters using a k-means style loop and
         * returns them as an array of size k. Clusters are sorted by least
         * deviation between their articles, and the articles within each cluster
         * are sorted by greatest similarity to that cluster's centroid.
         *
         * Steps:
         *   1. Seed each cluster with the vector of a randomly chosen document,
         *      redrawing a limited number of times if the seed is too similar to
         *      an earlier seed.
         *   2. Repeatedly assign every article to the most similar centroid and
         *      recompute each centroid as the mean of its documents, until no
         *      centroid changes or the iteration limit is reached.
         *   3. Sort the clusters and their documents, then fill each cluster's
         *      Articles from the document indexes.
         *
         * A cluster that ends a round with no documents keeps its previous
         * centroid rather than being averaged over zero documents.
         *
         * k: the number of clusters to produce
         */
        public Cluster[] cluster(int k)
        {
            // Possible improvement: limit min/max cluster size

            // Maximum random draws per cluster while looking for a distinct seed
            const int    MaxSeedAttempts   = 10;
            // A seed at or above this similarity to an earlier seed is redrawn
            const double SeedSimilarityCap = 0.1;
            // Maximum number of assign/update rounds
            const int    MaxIterations     = 20;

            var clusters = new Cluster[k];
            var r        = new Random();

            // Initialize k clusters by choosing random documents as starting points
            for (int i = 0; i < k; i++)
            {
                clusters[i] = new Cluster();

                bool tooSimilar = true;
                for (int attempt = 0; tooSimilar && attempt < MaxSeedAttempts; attempt++)
                {
                    // Choose random document to initialize cluster to
                    int index = r.Next(articles.Count);
                    clusters[i].Centroid = analyzer.GetDocumentTfIdf(index);

                    // Redraw if this seed is too similar to an already chosen one.
                    // After the last attempt the current seed is kept regardless.
                    tooSimilar = false;
                    for (int j = 0; j < i; j++)
                    {
                        if (analyzer.Similarity(clusters[j].Centroid, clusters[i].Centroid) >= SeedSimilarityCap)
                        {
                            tooSimilar = true;
                            break;
                        }
                    }
                }
            }

            bool stillChanging = true;

            for (int iteration = 0; stillChanging && iteration < MaxIterations; iteration++)
            {
                foreach (Cluster c in clusters)
                {
                    c.Documents.Clear();
                }

                // Assign each article the cluster it's closest to. Articles are
                // added in ascending index order, so an unchanged membership
                // yields the same summation order in the centroid update below.
                for (int i = 0; i < articles.Count; i++)
                {
                    double greatestSim  = 0;
                    int    clusterIndex = 0;
                    for (int j = 0; j < clusters.Length; j++)
                    {
                        double currentSim = analyzer.Similarity(clusters[j].Centroid, i);
                        if (currentSim > greatestSim)
                        {
                            greatestSim  = currentSim;
                            clusterIndex = j;
                        }
                    }
                    clusters[clusterIndex].Documents.Add(i);
                }

                // Update cluster centroids. An entry left null means "keep the
                // current centroid".
                var newCentroids = new double[clusters.Length][];
                for (int i = 0; i < clusters.Length; i++)
                {
                    var currDocs = clusters[i].Documents;
                    if (currDocs.Count == 0)
                    {
                        // Averaging over zero documents would turn every term
                        // into 0/0 = NaN, so leave this centroid untouched.
                        continue;
                    }

                    var newCentroid = new double[analyzer.NumberOfTerms()];
                    foreach (int doc in currDocs)
                    {
                        var values = analyzer.GetDocumentTfIdf(doc);
                        for (int l = 0; l < newCentroid.Length; l++)
                        {
                            newCentroid[l] += values[l];
                        }
                    }
                    for (int l = 0; l < newCentroid.Length; l++)
                    {
                        newCentroid[l] /= currDocs.Count;
                    }
                    newCentroids[i] = newCentroid;
                }

                // Finish if clusters remain unchanged
                stillChanging = false;
                for (int i = 0; i < clusters.Length; i++)
                {
                    if (newCentroids[i] != null && !newCentroids[i].SequenceEqual(clusters[i].Centroid))
                    {
                        stillChanging        = true;
                        clusters[i].Centroid = newCentroids[i];
                    }
                }
            }

            // Sort by best clusters and best articles within each cluster
            Array.Sort(clusters, (Cluster x, Cluster y) => x.CompareTo(y, analyzer));
            foreach (Cluster c in clusters)
            {
                c.Documents.Sort((int x, int y) => analyzer.Compare(x, y, c.Centroid));
            }

            // Map from indexes to the actual NewsArticles and add to clusters
            foreach (Cluster c in clusters)
            {
                foreach (int index in c.Documents)
                {
                    c.Articles.Add(articles[index]);
                }
            }

            return(clusters);
        }