Ejemplo n.º 1
0
        /// <summary>
        /// Parses and tokenizes the given file. New terms are added to the 
        /// dictionary, and recurring terms are added to those terms' postings lists.
        /// </summary>
        /// <param name="uri">The path of the file to index.</param>
        /// <param name="file">The already opened file stream of the file in question.</param>
        public void Index(string uri, Stream file)
        {
            // One file may contain several documents; each one gets the next free document id.
            foreach (Document currentDocument in parser.ExtractDocuments(file))
            {
                long documentId = nextDocumentId++;
                int tokenCount = 0;
                TermVector documentVector = new TermVector();

                foreach (string token in lexer.Tokenize(currentDocument.Body))
                {
                    // Record the token both in the per-document vector and in the in-memory block.
                    documentVector.AddTerm(token);
                    termIndexBlockWriter.AddPosting(token, documentId);

                    // Once the block holds the maximum number of postings, flush it.
                    if (termIndexBlockWriter.Postings == maxPostingCountPerBlock)
                    {
                        FlushBlockWriter();
                    }

                    tokenCount++;
                    collectionLengthInTokens++;
                }

                // Store the per-document metadata alongside the term vector built above.
                DocumentInfo info = new DocumentInfo(
                    uri,
                    currentDocument.Title,
                    tokenCount,
                    currentDocument.SpecialIdentifier,
                    documentVector);
                metadataWriter.AddDocumentInfo(documentId, info);
            }
        }
Ejemplo n.º 2
0
 /// <summary>
 /// Creates a document record, copying each argument into the member it corresponds to.
 /// </summary>
 /// <param name="uri">Value stored in <c>Uri</c>.</param>
 /// <param name="title">Value stored in <c>Title</c>.</param>
 /// <param name="length">Value stored in <c>Length</c> (presumably the number of tokens in the document; confirm against callers).</param>
 /// <param name="identifier">Value stored in <c>SpecialIdentifier</c>.</param>
 /// <param name="termVector">Value stored in <c>TermVector</c>.</param>
 public DocumentInfo(string uri, string title, int length, string identifier, TermVector termVector)
 {
     Uri = uri;
     Title = title;
     Length = length;
     SpecialIdentifier = identifier;
     TermVector = termVector;
 }
Ejemplo n.º 3
0
        /// <summary>
        /// Computes one centroid per cluster from the term vectors of the documents in that cluster.
        /// </summary>
        /// <param name="clusters">The document ids of each cluster.</param>
        /// <returns>
        /// An array with the same length as <paramref name="clusters"/>; element <c>i</c> is the
        /// centroid returned by <c>TermVector.GetCentroid</c> for cluster <c>i</c>.
        /// </returns>
        TermVector[] CalculateCentroids(List<long>[] clusters)
        {
            TermVector[] centroids = new TermVector[clusters.Length];

            for (int i = 0; i < clusters.Length; i++)
            {
                // Look up the cluster's documents, then reduce their term vectors to a single centroid.
                var memberDocuments = this.metadata.GetDocuments(clusters[i]);
                var memberVectors = this.GetTermVectors(memberDocuments);
                centroids[i] = TermVector.GetCentroid(memberVectors);
            }

            return centroids;
        }
Ejemplo n.º 4
0
    /// <summary>
    /// Opens the index and metadata files, clusters every document id in the metadata into
    /// <paramref name="k"/> clusters, and describes each cluster by its top terms and document URIs.
    /// </summary>
    /// <param name="k">The cluster count passed to <c>KMeansClusterFinder.Cluster</c>.</param>
    /// <returns>
    /// One <c>ClusterResult</c> per cluster, built from at most six terms (ranked by the summed
    /// term weight in the cluster multiplied by the term's IDF) and the URIs of the cluster's documents.
    /// </returns>
    public IList<ClusterResult> Cluster(int k)
    {
        using (FileStream indexFileStream = File.Open(indexFilePath, FileMode.Open))
        using (FileStream metadataFileStream = File.Open(metadataFilePath, FileMode.Open))
        {
            IndexMetadata indexMetadata = new IndexMetadata(metadataFileStream);
            TermIndex index = new TermIndex(indexFileStream);

            // NOTE(review): queryEngine is never used below. It is kept because the constructor's
            // side effects are not visible from here; remove it once confirmed to be side-effect free.
            QueryEngine queryEngine = new QueryEngine(index, indexMetadata);

            KMeansClusterFinder clusterFinder = new KMeansClusterFinder(indexMetadata, index);
            IList<long> allDocIds = indexMetadata.GetDocumentIds();
            long[][] clusters = clusterFinder.Cluster(allDocIds, k);

            IList<ClusterResult> clusterResults = new List<ClusterResult>();

            foreach (long[] cluster in clusters)
            {
                // Fetch the cluster's documents exactly once. They are needed three times below
                // (term totals, centroid, URI list), and re-enumerating a lazy sequence would
                // repeat the metadata lookup each time.
                List<DocumentInfo> clusterDocuments = indexMetadata.GetDocuments(cluster).ToList();

                // Total weight of every term across the cluster's documents.
                TermVector sum = new TermVector();
                foreach (DocumentInfo document in clusterDocuments)
                {
                    sum += document.TermVector;
                }

                // Candidate terms are the centroid's non-zero dimensions; rank them by
                // (summed weight in cluster) * IDF and keep the best six.
                List<string> topTerms =
                    TermVector.GetCentroid(clusterDocuments.Select(docInfo => docInfo.TermVector))
                    .GetNonZeroDimensions()
                    .OrderByDescending(term => sum.GetDimensionLength(term) * this.GetIdf(index, indexMetadata, term))
                    .Take(6)
                    .ToList();

                clusterResults.Add(new ClusterResult(topTerms,
                    clusterDocuments.Select(docInfo => docInfo.Uri).ToList()));
            }

            return clusterResults;
        }
    }
Ejemplo n.º 5
0
 /// <summary>
 /// Computes the dot product of this vector and <paramref name="other"/>.
 /// </summary>
 /// <param name="other">The vector to multiply with.</param>
 /// <returns>
 /// The sum, over every term in this vector's <c>Terms</c>, of this vector's dimension length
 /// for the term times <paramref name="other"/>'s dimension length for the same term.
 /// </returns>
 public int DotProduct(TermVector other)
 {
     int total = 0;

     foreach (string dimension in this.Terms)
     {
         int ownWeight = this.GetDimensionLength(dimension);
         int otherWeight = other.GetDimensionLength(dimension);
         total += ownWeight * otherWeight;
     }

     return total;
 }
Ejemplo n.º 6
0
 /// <summary>
 /// Computes the cosine similarity between this vector and <paramref name="centroid"/>:
 /// their dot product divided by the product of their Euclidean lengths.
 /// </summary>
 /// <param name="centroid">The vector to compare against.</param>
 /// <returns>
 /// The cosine similarity, or <c>0</c> when the product of the two Euclidean lengths is zero
 /// (the ratio is undefined in that case).
 /// </returns>
 public double CosineSimilarity(TermVector centroid)
 {
     double lengthProduct = this.EuclideanLength() * centroid.EuclideanLength();

     // Dividing by a zero length would yield NaN, and NaN compares false against everything,
     // which silently breaks any "find the best match" loop. Report "no similarity" instead.
     if (lengthProduct == 0)
     {
         return 0.0;
     }

     return this.DotProduct(centroid) / lengthProduct;
 }
Ejemplo n.º 7
0
            /// <summary>
            /// Clusters all documents into the requested number of clusters and prints up to
            /// six top terms for each cluster to the console.
            /// </summary>
            /// <param name="args">The number of clusters, as text accepted by <c>int.Parse</c>.</param>
            private void Cluster(string args)
            {
                int k = int.Parse(args);
                Console.WriteLine("Clustering...");
                KMeansClusterFinder clusterFinder = new KMeansClusterFinder(this.metadata, this.index);
                IList<long> allDocIds = this.metadata.GetDocumentIds();
                long[][] clusters = clusterFinder.Cluster(allDocIds, k);
                int i = 1;
                Console.WriteLine("Done! Here are top terms for each cluster:");

                foreach (long[] cluster in clusters)
                {
                    // Fetch the cluster's documents once; they feed both the term totals and the centroid.
                    List<DocumentInfo> clusterDocuments = this.metadata.GetDocuments(cluster).ToList();

                    // Total weight of every term across the cluster's documents.
                    TermVector sum = new TermVector();
                    foreach (DocumentInfo document in clusterDocuments)
                    {
                        sum += document.TermVector;
                    }

                    // Materialize the ranking: it is inspected twice below (emptiness check and
                    // printing), and a deferred query would recompute the centroid and every IDF
                    // lookup on each enumeration.
                    List<string> topTerms =
                        TermVector.GetCentroid(clusterDocuments.Select(docInfo => docInfo.TermVector))
                        .GetNonZeroDimensions()
                        .OrderByDescending(term => sum.GetDimensionLength(term) * this.GetIdf(term))
                        .Take(6)
                        .ToList();

                    Console.Write(i + ": ");
                    if (topTerms.Count == 0)
                    {
                        Console.Write("Empty cluster!");
                        Console.WriteLine();
                    }
                    else
                    {
                        foreach (string term in topTerms)
                        {
                            Console.Write(term + "  ");
                        }
                        Console.WriteLine();
                    }
                    i++;
                }
            }
Ejemplo n.º 8
0
        /// <summary>
        /// Assigns every document to the cluster whose centroid has the smallest Euclidean
        /// distance to the document's term vector.
        /// </summary>
        /// <param name="documentsToCluster">The ids of the documents to assign.</param>
        /// <param name="centroids">The current centroid of each cluster.</param>
        /// <param name="clusters">
        /// Output array, parallel to <paramref name="centroids"/>. Every slot is replaced with a
        /// new list that receives the ids assigned to that cluster.
        /// </param>
        private void Cluster(IEnumerable<long> documentsToCluster, TermVector[] centroids, List<long>[] clusters)
        {
            Contract.Requires(centroids.Length == clusters.Length);

            // Reset every cluster to an empty list before assigning.
            for (int slot = 0; slot < centroids.Length; slot++)
            {
                clusters[slot] = new List<long>();
            }

            foreach (long documentId in documentsToCluster)
            {
                TermVector documentVector = metadata[documentId].TermVector;

                // Linear scan for the closest centroid. The strict '<' means the earliest
                // centroid wins ties, and index 0 is used if no distance beats the initial value.
                int nearestIndex = 0;
                double nearestDistance = double.MaxValue;

                for (int candidate = 0; candidate < centroids.Length; candidate++)
                {
                    double distance = (centroids[candidate] - documentVector).EuclideanLength();
                    if (distance < nearestDistance)
                    {
                        nearestDistance = distance;
                        nearestIndex = candidate;
                    }
                }

                clusters[nearestIndex].Add(documentId);
            }
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Picks <paramref name="k"/> distinct documents at random and returns their term
        /// vectors as the initial centroids.
        /// </summary>
        /// <param name="documentsToCluster">The ids of the documents to choose seeds from.</param>
        /// <param name="k">The number of centroids to produce.</param>
        /// <returns>An array of <paramref name="k"/> term vectors, each from a different document id.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="documentsToCluster"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="k"/> is larger than the number of distinct document ids, so that many
        /// different seeds cannot be chosen.
        /// </exception>
        private TermVector[] GetRandomCentroidsFromDocuments(IList<long> documentsToCluster, int k)
        {
            if (documentsToCluster == null)
            {
                throw new ArgumentNullException("documentsToCluster");
            }

            TermVector[] centroids = new TermVector[k];

            // Work on a de-duplicated copy so the same document can never seed two clusters
            // and the caller's list is left untouched.
            List<long> candidates = new List<long>(new HashSet<long>(documentsToCluster));
            if (k > candidates.Count)
            {
                throw new ArgumentOutOfRangeException(
                    "k",
                    k,
                    "Cannot pick more seed documents than there are distinct documents to cluster.");
            }

            Random random = new Random();

            // Partial Fisher-Yates shuffle: at step i, swap a random element of the not-yet-chosen
            // tail [i, Count) into position i. Every document (including the last one) is
            // eligible, no document is chosen twice, and the loop always terminates.
            for (int seedIndex = 0; seedIndex < k; seedIndex++)
            {
                // Random.Next's upper bound is exclusive, so pass Count rather than Count - 1.
                int pick = random.Next(seedIndex, candidates.Count);

                long seedDocId = candidates[pick];
                candidates[pick] = candidates[seedIndex];
                candidates[seedIndex] = seedDocId;

                centroids[seedIndex] = metadata[seedDocId].TermVector;
            }

            return centroids;
        }