コード例 #1
0
ファイル: Spimi.cs プロジェクト: tristanstcyr/SPIMI
    /// <summary>
    /// Groups every document known to the index metadata into <paramref name="k"/>
    /// clusters and describes each cluster by its highest-ranked terms.
    /// </summary>
    /// <param name="k">Number of clusters requested from the k-means cluster finder.</param>
    /// <returns>
    /// One <see cref="ClusterResult"/> per cluster returned by the cluster finder, holding
    /// up to six top terms and the URIs of the documents in that cluster.
    /// </returns>
    /// <remarks>
    /// The index and metadata files are opened for the duration of the call and closed
    /// before returning, so everything handed back to the caller is fully materialized.
    /// </remarks>
    public IList<ClusterResult> Cluster(int k)
    {
        // Number of terms used to describe each cluster.
        const int TopTermCount = 6;

        using (FileStream indexFileStream = File.Open(indexFilePath, FileMode.Open))
        using (FileStream metadataFileStream = File.Open(metadataFilePath, FileMode.Open))
        {
            IndexMetadata indexMetadata = new IndexMetadata(metadataFileStream);
            TermIndex index = new TermIndex(indexFileStream);

            KMeansClusterFinder clusterFinder = new KMeansClusterFinder(indexMetadata, index);
            IList<long> allDocIds = indexMetadata.GetDocumentIds();
            long[][] clusters = clusterFinder.Cluster(allDocIds, k);

            IList<ClusterResult> clusterResults = new List<ClusterResult>(clusters.Length);

            foreach (long[] cluster in clusters)
            {
                // Fetch the cluster's documents once and materialize them: the sequence is
                // consumed three times below (term sum, centroid, URIs).
                List<DocumentInfo> clusterDocuments = indexMetadata.GetDocuments(cluster).ToList();

                // Accumulate the term frequencies over the whole cluster.
                TermVector sum = new TermVector();
                foreach (DocumentInfo document in clusterDocuments)
                {
                    sum += document.TermVector;
                }

                // Rank the centroid's non-zero terms by cluster-wide frequency weighted
                // with the value GetIdf reports for the term; materialized so the ranking
                // runs exactly once while the streams are still open.
                List<string> topTerms =
                    TermVector.GetCentroid(clusterDocuments.Select(docInfo => docInfo.TermVector))
                        .GetNonZeroDimensions()
                        .OrderByDescending(term => sum.GetDimensionLength(term) * this.GetIdf(index, indexMetadata, term))
                        .Take(TopTermCount)
                        .ToList();

                clusterResults.Add(new ClusterResult(
                    topTerms,
                    clusterDocuments.Select(docInfo => docInfo.Uri).ToList()));
            }

            return clusterResults;
        }
    }
コード例 #2
0
ファイル: TermVector.cs プロジェクト: tristanstcyr/SPIMI
 /// <summary>
 /// Computes the dot product of this vector with <paramref name="other"/>.
 /// </summary>
 /// <param name="other">The vector to multiply with.</param>
 /// <returns>
 /// The sum, over every term in this vector's <c>Terms</c>, of this vector's dimension
 /// length for the term multiplied by <paramref name="other"/>'s dimension length for it.
 /// </returns>
 /// <remarks>
 /// Only the terms of this vector are visited; terms that appear solely in
 /// <paramref name="other"/> are never looked at.
 /// </remarks>
 public int DotProduct(TermVector other)
 {
     int total = 0;

     foreach (string dimension in this.Terms)
     {
         int ownLength = this.GetDimensionLength(dimension);
         int otherLength = other.GetDimensionLength(dimension);
         total += ownLength * otherLength;
     }

     return total;
 }
コード例 #3
0
ファイル: SpimiCli.cs プロジェクト: tristanstcyr/SPIMI
            /// <summary>
            /// Handles the cluster command: runs k-means over every indexed document and
            /// prints the top terms of each resulting cluster to the console.
            /// </summary>
            /// <param name="args">
            /// Text of the command argument; parsed with <see cref="int.Parse(string)"/>
            /// as the number of clusters, so a non-numeric value throws.
            /// </param>
            private void Cluster(string args)
            {
                // Number of terms printed for each cluster.
                const int TopTermCount = 6;

                int k = int.Parse(args);
                Console.WriteLine("Clustering...");
                KMeansClusterFinder clusterFinder = new KMeansClusterFinder(this.metadata, this.index);
                IList<long> allDocIds = this.metadata.GetDocumentIds();
                long[][] clusters = clusterFinder.Cluster(allDocIds, k);
                Console.WriteLine("Done! Here are top terms for each cluster:");

                // 1-based label printed in front of each cluster's terms.
                int i = 1;
                foreach (long[] cluster in clusters)
                {
                    // Fetch the cluster's documents once; they feed both the term sum
                    // and the centroid computation.
                    List<DocumentInfo> clusterDocuments = this.metadata.GetDocuments(cluster).ToList();

                    TermVector sum = new TermVector();
                    foreach (DocumentInfo document in clusterDocuments)
                    {
                        sum += document.TermVector;
                    }

                    // Materialize the ranking so the sort (and its per-term GetIdf calls)
                    // runs once instead of once for the emptiness check and again for printing.
                    List<string> topTerms =
                        TermVector.GetCentroid(clusterDocuments.Select(docInfo => docInfo.TermVector))
                            .GetNonZeroDimensions()
                            .OrderByDescending(term => sum.GetDimensionLength(term) * this.GetIdf(term))
                            .Take(TopTermCount)
                            .ToList();

                    Console.Write(i + ": ");
                    if (topTerms.Count == 0)
                    {
                        Console.Write("Empty cluster!");
                    }
                    else
                    {
                        foreach (string term in topTerms)
                        {
                            Console.Write(term + "  ");
                        }
                    }
                    Console.WriteLine();
                    i++;
                }
            }