/// <summary>
/// Compares two clusters by cohesion: the average similarity of each
/// cluster's documents to its own centroid. The cluster with less
/// deviation (higher average similarity) is considered better and gets a
/// negative result so it sorts first. Single-document clusters are pushed
/// to the end of the order.
/// </summary>
/// <param name="other">The cluster to compare this one against.</param>
/// <param name="analyzer">Analyzer used to compute centroid/document similarity.</param>
/// <returns>Negative if this cluster is better, positive if worse, zero if equal.</returns>
public int CompareTo(Cluster other, DocumentAnalyzer analyzer)
{
    // TODO: Find a better metric; averaging similarity favors smaller clusters.

    // Push single-document clusters to the ends. Returning 0 when BOTH are
    // singletons is required for antisymmetry: the original returned 1 for
    // both a.CompareTo(b) and b.CompareTo(a), which violates the comparer
    // contract and can make Array.Sort throw
    // "IComparer.Compare() method returns inconsistent results".
    bool thisIsSingleton = Documents.Count == 1;
    bool otherIsSingleton = other.Documents.Count == 1;
    if (thisIsSingleton || otherIsSingleton)
    {
        if (thisIsSingleton && otherIsSingleton)
        {
            return 0;
        }
        return thisIsSingleton ? 1 : -1;
    }

    double simA = 0;
    foreach (int i in this.Documents)
    {
        simA += analyzer.Similarity(this.Centroid, i);
    }

    double simB = 0;
    foreach (int i in other.Documents)
    {
        simB += analyzer.Similarity(other.Centroid, i);
    }

    simA /= this.Documents.Count;
    simB /= other.Documents.Count;

    // Higher average similarity sorts first, so compare B against A.
    return simB.CompareTo(simA);
}
/// <summary>
/// Runs k-means over the analyzer's TF-IDF document vectors and returns an
/// array of k clusters, sorted by least deviation between articles, with
/// the articles within each cluster sorted by greatest similarity to that
/// cluster's centroid.
/// </summary>
/// <param name="k">Number of clusters to produce.</param>
/// <returns>Array of k clusters, each populated with its sorted articles.</returns>
public Cluster[] cluster(int k)
{
    // Possible improvement: limit min/max cluster size
    var clusters = new Cluster[k];
    var r = new Random();

    InitializeCentroids(clusters, r);

    // Iterate assign/update until centroids stabilize, capped at 20 rounds
    // so a non-converging run still terminates.
    int iterations = 0;
    bool stillChanging = true;
    while (stillChanging && iterations < 20)
    {
        iterations++;
        AssignDocuments(clusters);
        stillChanging = UpdateCentroids(clusters);
    }

    // Sort by best clusters and best articles within each cluster.
    Array.Sort(clusters, (Cluster x, Cluster y) => x.CompareTo(y, analyzer));
    foreach (Cluster c in clusters)
    {
        c.Documents.Sort((int x, int y) => analyzer.Compare(x, y, c.Centroid));
    }

    // Map from indexes to the actual NewsArticles and add to clusters.
    foreach (Cluster c in clusters)
    {
        foreach (int index in c.Documents)
        {
            c.Articles.Add(articles[index]);
        }
    }

    return clusters;
}

// Seeds each cluster's centroid with a randomly chosen document's TF-IDF
// vector, retrying up to 10 times when the pick is too similar (>= 0.1) to
// an already-chosen centroid. After 10 failed attempts the last pick is kept.
private void InitializeCentroids(Cluster[] clusters, Random r)
{
    for (int i = 0; i < clusters.Length; i++)
    {
        clusters[i] = new Cluster();
        for (int attempt = 0; attempt < 10; attempt++)
        {
            // r.Next(n) is the idiomatic form of (int)(r.NextDouble() * n).
            int index = r.Next(articles.Count);
            clusters[i].Centroid = analyzer.GetDocumentTfIdf(index);

            bool tooSimilar = false;
            for (int j = 0; j < i; j++)
            {
                if (analyzer.Similarity(clusters[j].Centroid, clusters[i].Centroid) >= 0.1)
                {
                    tooSimilar = true;
                    break;
                }
            }
            if (!tooSimilar)
            {
                break;
            }
        }
    }
}

// Assigns every article index to the cluster whose centroid it is most
// similar to (ties and all-zero similarities fall through to cluster 0).
private void AssignDocuments(Cluster[] clusters)
{
    foreach (Cluster c in clusters)
    {
        c.Documents.Clear();
    }

    for (int i = 0; i < articles.Count; i++)
    {
        double greatestSim = 0;
        int clusterIndex = 0;
        for (int j = 0; j < clusters.Length; j++)
        {
            double currentSim = analyzer.Similarity(clusters[j].Centroid, i);
            if (currentSim > greatestSim)
            {
                greatestSim = currentSim;
                clusterIndex = j;
            }
        }
        clusters[clusterIndex].Documents.Add(i);
    }
}

// Recomputes each centroid as the mean TF-IDF vector of its documents and
// returns true if any centroid changed. An empty cluster keeps its previous
// centroid: the original code divided by a zero document count here, filling
// the centroid with NaN and silently poisoning every later Similarity call.
private bool UpdateCentroids(Cluster[] clusters)
{
    bool changed = false;
    foreach (Cluster c in clusters)
    {
        if (c.Documents.Count == 0)
        {
            continue;
        }

        var newCentroid = new double[analyzer.NumberOfTerms()];
        foreach (int doc in c.Documents)
        {
            var values = analyzer.GetDocumentTfIdf(doc);
            for (int l = 0; l < newCentroid.Length; l++)
            {
                newCentroid[l] += values[l];
            }
        }
        for (int j = 0; j < newCentroid.Length; j++)
        {
            newCentroid[j] /= c.Documents.Count;
        }

        if (!newCentroid.SequenceEqual(c.Centroid))
        {
            changed = true;
            c.Centroid = newCentroid;
        }
    }
    return changed;
}