/// <summary>
/// Clusters every document id reported by the index metadata into <paramref name="k"/> groups
/// and summarizes each group by its highest-weighted terms and the URIs of its documents.
/// </summary>
/// <param name="k">Cluster count, passed unchanged to <c>KMeansClusterFinder.Cluster</c>.</param>
/// <returns>
/// One <see cref="ClusterResult"/> per cluster returned by the cluster finder, in the same order.
/// Both lists inside each result are fully materialized before the underlying files are closed.
/// </returns>
public IList<ClusterResult> Cluster(int k)
{
    // Number of terms reported for each cluster.
    const int TopTermCount = 6;

    using (FileStream indexFileStream = File.Open(indexFilePath, FileMode.Open))
    using (FileStream metadataFileStream = File.Open(metadataFilePath, FileMode.Open))
    {
        IndexMetadata indexMetadata = new IndexMetadata(metadataFileStream);
        TermIndex index = new TermIndex(indexFileStream);

        // NOTE(review): queryEngine is never read in this method. It is kept because the
        // constructor's side effects (if any) are not visible here - confirm and remove.
        QueryEngine queryEngine = new QueryEngine(index, indexMetadata);

        KMeansClusterFinder clusterFinder = new KMeansClusterFinder(indexMetadata, index);
        IList<long> allDocIds = indexMetadata.GetDocumentIds();
        long[][] clusters = clusterFinder.Cluster(allDocIds, k);

        IList<ClusterResult> clusterResults = new List<ClusterResult>();
        foreach (long[] cluster in clusters)
        {
            // Fetch the cluster's documents once and keep them in memory: they are consumed
            // three times below (term sum, centroid, URI list), and the previous code issued
            // a second GetDocuments call and re-enumerated the result.
            List<DocumentInfo> clusterDocuments = indexMetadata.GetDocuments(cluster).ToList();

            // Get the term frequencies in the collection
            TermVector sum = new TermVector();
            foreach (TermVector vector in clusterDocuments.Select(d => d.TermVector))
            {
                sum += vector;
            }

            // Candidate terms are the centroid's non-zero dimensions; they are ranked by the
            // cluster-wide summed dimension length multiplied by the term's IDF.
            List<string> topTerms = TermVector.GetCentroid(clusterDocuments.Select(docInfo => docInfo.TermVector))
                .GetNonZeroDimensions()
                .OrderByDescending(term => sum.GetDimensionLength(term) * this.GetIdf(index, indexMetadata, term))
                .Take(TopTermCount)
                .ToList();

            clusterResults.Add(new ClusterResult(
                topTerms,
                clusterDocuments.Select(docInfo => docInfo.Uri).ToList()));
        }

        return clusterResults;
    }
}
/// <summary>
/// Computes the dot product of this vector with <paramref name="other"/>.
/// Only the terms listed in this vector's <c>Terms</c> are visited; for each one the dimension
/// lengths from both vectors are multiplied and added to the running total.
/// </summary>
/// <param name="other">The vector to multiply against.</param>
/// <returns>The accumulated sum of the per-term products.</returns>
public int DotProduct(TermVector other)
{
    int total = 0;

    foreach (string dimension in this.Terms)
    {
        int mine = this.GetDimensionLength(dimension);
        int theirs = other.GetDimensionLength(dimension);
        total += mine * theirs;
    }

    return total;
}
/// <summary>
/// Parses the cluster count from <paramref name="args"/>, clusters every document id reported by
/// the metadata, and writes one numbered line per cluster to the console listing its top terms
/// (or "Empty cluster!" when there are none).
/// </summary>
/// <param name="args">Text form of the cluster count; parsed with <c>int.Parse</c>.</param>
private void Cluster(string args)
{
    // Number of terms printed for each cluster.
    const int TopTermCount = 6;

    int k = int.Parse(args);

    Console.WriteLine("Clustering...");
    KMeansClusterFinder clusterFinder = new KMeansClusterFinder(this.metadata, this.index);
    IList<long> allDocIds = this.metadata.GetDocumentIds();
    long[][] clusters = clusterFinder.Cluster(allDocIds, k);

    Console.WriteLine("Done! Here are top terms for each cluster:");
    int clusterNumber = 1;
    foreach (long[] cluster in clusters)
    {
        // Fetch the documents once; the previous code called GetDocuments twice per cluster.
        List<DocumentInfo> clusterDocuments = this.metadata.GetDocuments(cluster).ToList();

        TermVector sum = new TermVector();
        foreach (TermVector vector in clusterDocuments.Select(d => d.TermVector))
        {
            sum += vector;
        }

        // Materialize the ranking. As a deferred query it was evaluated twice (once by Count()
        // and once by the print loop), repeating the centroid, the sort and every GetIdf call.
        List<string> topTerms = TermVector.GetCentroid(clusterDocuments.Select(docInfo => docInfo.TermVector))
            .GetNonZeroDimensions()
            .OrderByDescending(term => sum.GetDimensionLength(term) * this.GetIdf(term))
            .Take(TopTermCount)
            .ToList();

        Console.Write(clusterNumber + ": ");
        if (topTerms.Count == 0)
        {
            Console.Write("Empty cluster!");
        }
        else
        {
            foreach (string term in topTerms)
            {
                Console.Write(term + " ");
            }
        }

        // Both branches end the cluster's line the same way.
        Console.WriteLine();
        clusterNumber++;
    }
}