/// <summary>
/// Parses and tokenizes the input file. New terms are added to the
/// dictionary, and recurrent terms are added to those terms' postings lists.
/// </summary>
/// <param name="uri">The path of the file to index.</param>
/// <param name="file">The already opened file stream of the file in question.</param>
public void Index(string uri, Stream file)
{
    // Each file holds many documents: we need to parse them out first.
    foreach (Document document in parser.ExtractDocuments(file))
    {
        // Extract the terms from the document and add the document to each
        // term's respective postings list.
        long docId = nextDocumentId++;
        int termsInDoc = 0;
        TermVector vector = new TermVector();
        foreach (string term in lexer.Tokenize(document.Body))
        {
            vector.AddTerm(term);
            termIndexBlockWriter.AddPosting(term, docId);
            if (termIndexBlockWriter.Postings == maxPostingCountPerBlock)
            {
                // The in-memory block is full: write it to disk.
                this.FlushBlockWriter();
            }

            termsInDoc++;
            collectionLengthInTokens++;
        }

        this.metadataWriter.AddDocumentInfo(
            docId,
            new DocumentInfo(uri, document.Title, termsInDoc, document.SpecialIdentifier, vector));
    }
}
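For context, here is a minimal sketch of how Index might be driven from a crawl loop. The Indexer class name and the corpusDirectory variable are assumptions for illustration, not part of the code above.

// Usage sketch: index every file in a directory. "Indexer" and
// "corpusDirectory" are assumed names, not taken from the source.
Indexer indexer = new Indexer();
foreach (string path in Directory.EnumerateFiles(corpusDirectory))
{
    using (FileStream stream = File.OpenRead(path))
    {
        indexer.Index(path, stream);
    }
}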
public DocumentInfo(string uri, string title, int length, string identifier, TermVector termVector)
{
    this.Uri = uri;
    this.Length = length;
    this.SpecialIdentifier = identifier;
    this.Title = title;
    this.TermVector = termVector;
}
TermVector[] CalculateCentroids(List<long>[] clusters)
{
    TermVector[] centroids = new TermVector[clusters.Length];
    int clusterIndex = 0;
    foreach (List<long> cluster in clusters)
    {
        // The new centroid is the mean of the term vectors of every
        // document currently assigned to this cluster.
        centroids[clusterIndex] = TermVector.GetCentroid(
            this.GetTermVectors(this.metadata.GetDocuments(cluster)));
        clusterIndex++;
    }

    return centroids;
}
public IList<ClusterResult> Cluster(int k)
{
    using (FileStream indexFileStream = File.Open(indexFilePath, FileMode.Open))
    using (FileStream metadataFileStream = File.Open(metadataFilePath, FileMode.Open))
    {
        IndexMetadata indexMetadata = new IndexMetadata(metadataFileStream);
        TermIndex index = new TermIndex(indexFileStream);
        QueryEngine queryEngine = new QueryEngine(index, indexMetadata);
        KMeansClusterFinder clusterFinder = new KMeansClusterFinder(indexMetadata, index);

        IList<long> allDocIds = indexMetadata.GetDocumentIds();
        long[][] clusters = clusterFinder.Cluster(allDocIds, k);

        IList<ClusterResult> clusterResults = new List<ClusterResult>();
        foreach (long[] cluster in clusters)
        {
            // Sum the term frequencies across the cluster's documents.
            IEnumerable<DocumentInfo> clusterDocuments = indexMetadata.GetDocuments(cluster);
            TermVector sum = new TermVector();
            foreach (TermVector vector in clusterDocuments.Select(d => d.TermVector))
            {
                sum += vector;
            }

            // Rank the centroid's terms by summed tf * idf and keep the top six.
            IEnumerable<string> topTerms = TermVector.GetCentroid(indexMetadata.GetDocuments(cluster)
                    .Select(docInfo => docInfo.TermVector))
                .GetNonZeroDimensions()
                .OrderByDescending(term => sum.GetDimensionLength(term) * this.GetIdf(index, indexMetadata, term))
                .Take(6);

            clusterResults.Add(new ClusterResult(
                topTerms.ToList(),
                clusterDocuments.Select(docInfo => docInfo.Uri).ToList()));
        }

        return clusterResults;
    }
}
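GetIdf is called above but not shown in this section. A standard definition is idf(t) = log(N / df(t)), where N is the collection size and df(t) is the number of documents containing the term. The sketch below follows that definition; GetDocumentFrequency and DocumentCount are assumed accessor names, not confirmed API.

// A sketch of the idf weight used above, assuming hypothetical accessors
// for a term's document frequency and the collection's document count.
private double GetIdf(TermIndex index, IndexMetadata indexMetadata, string term)
{
    // df >= 1 here, since the term came from a document in the collection.
    int documentFrequency = index.GetDocumentFrequency(term);
    return Math.Log((double)indexMetadata.DocumentCount / documentFrequency);
}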
public int DotProduct(TermVector other)
{
    int sum = 0;
    foreach (string term in this.Terms)
    {
        sum += this.GetDimensionLength(term) * other.GetDimensionLength(term);
    }

    return sum;
}
public double CosineSimilarity(TermVector centroid)
{
    // Note: if either vector is empty, the denominator is zero and the
    // result is NaN.
    return this.DotProduct(centroid) / (this.EuclideanLength() * centroid.EuclideanLength());
}
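EuclideanLength is used by CosineSimilarity and by the cluster assignment step but is not shown in this section. A plausible implementation, given the DotProduct above, is the square root of the vector's dot product with itself; this is a sketch under that assumption, not necessarily the actual code.

public double EuclideanLength()
{
    // sqrt of the sum of squared term frequencies, i.e. sqrt(v · v).
    return Math.Sqrt(this.DotProduct(this));
}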
private void Cluster(string args)
{
    int k = int.Parse(args);
    Console.WriteLine("Clustering...");

    KMeansClusterFinder clusterFinder = new KMeansClusterFinder(this.metadata, this.index);
    IList<long> allDocIds = this.metadata.GetDocumentIds();
    long[][] clusters = clusterFinder.Cluster(allDocIds, k);

    int i = 1;
    Console.WriteLine("Done! Here are the top terms for each cluster:");
    foreach (long[] cluster in clusters)
    {
        // Sum the term frequencies across the cluster's documents.
        IEnumerable<DocumentInfo> clusterDocuments = this.metadata.GetDocuments(cluster);
        TermVector sum = new TermVector();
        foreach (TermVector vector in clusterDocuments.Select(d => d.TermVector))
        {
            sum += vector;
        }

        // Rank the centroid's terms by summed tf * idf and keep the top six.
        // Materializing with ToList avoids re-running the query below.
        List<string> topTerms = TermVector.GetCentroid(this.metadata.GetDocuments(cluster)
                .Select(docInfo => docInfo.TermVector))
            .GetNonZeroDimensions()
            .OrderByDescending(term => sum.GetDimensionLength(term) * this.GetIdf(term))
            .Take(6)
            .ToList();

        Console.Write(i + ": ");
        if (topTerms.Count == 0)
        {
            Console.WriteLine("Empty cluster!");
        }
        else
        {
            Console.WriteLine(string.Join(" ", topTerms));
        }

        i++;
    }
}
private void Cluster(IEnumerable<long> documentsToCluster, TermVector[] centroids, List<long>[] clusters)
{
    Contract.Requires(centroids.Length == clusters.Length);

    // Init clusters
    for (int clusterIndex = 0; clusterIndex < centroids.Length; clusterIndex++)
    {
        clusters[clusterIndex] = new List<long>();
    }

    // Assign each document to the cluster with the nearest centroid.
    foreach (long documentId in documentsToCluster)
    {
        TermVector termVector = metadata[documentId].TermVector;
        int clusterIndex = 0;
        double minDistance = double.MaxValue;
        int minDistanceIndex = 0;
        foreach (TermVector centroid in centroids)
        {
            // Euclidean distance is used here. Cosine similarity is an
            // alternative, but it measures similarity rather than distance,
            // so the comparison below would have to be inverted:
            // double distance = termVector.CosineSimilarity(centroid);
            double distance = (centroid - termVector).EuclideanLength();
            if (distance < minDistance)
            {
                minDistance = distance;
                minDistanceIndex = clusterIndex;
            }

            clusterIndex++;
        }

        clusters[minDistanceIndex].Add(documentId);
    }
}
private TermVector[] GetRandomCentroidsFromDocuments(IList<long> documentsToCluster, int k)
{
    HashSet<long> seeds = new HashSet<long>();
    TermVector[] centroids = new TermVector[k];
    Random random = new Random();

    // Seed the k centroids with distinct randomly chosen documents.
    for (int seedIndex = 0; seedIndex < k; seedIndex++)
    {
        long seedDocId;
        do
        {
            // Random.Next's upper bound is exclusive, so pass Count rather
            // than Count - 1, or the last document could never be picked.
            int randomIndex = random.Next(0, documentsToCluster.Count);
            seedDocId = documentsToCluster[randomIndex];
        } while (seeds.Contains(seedDocId));

        centroids[seedIndex] = metadata[seedDocId].TermVector;

        // Track the chosen document id (not the loop index) so the same
        // document is never used to seed two clusters.
        seeds.Add(seedDocId);
    }

    return centroids;
}
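The public Cluster method that ties these helpers together is not shown in this section. Below is a sketch of the usual k-means loop built from the three helpers above: seed random centroids, alternate assignment and centroid updates, and stop once the centroids no longer move. The iteration cap and the movement-based stopping test are assumptions, not taken from the source.

// A sketch of the k-means driver, assuming the helpers shown above.
public long[][] Cluster(IList<long> documentsToCluster, int k)
{
    // Seed the centroids with k distinct random documents.
    TermVector[] centroids = this.GetRandomCentroidsFromDocuments(documentsToCluster, k);
    List<long>[] clusters = new List<long>[k];

    const int maxIterations = 20; // assumed cap, not from the source
    for (int iteration = 0; iteration < maxIterations; iteration++)
    {
        // Assignment step: place every document in its nearest cluster.
        this.Cluster(documentsToCluster, centroids, clusters);

        // Update step: recompute each centroid as the mean of its members.
        TermVector[] newCentroids = this.CalculateCentroids(clusters);

        // Converged once no centroid has moved.
        double movement = 0;
        for (int i = 0; i < k; i++)
        {
            movement += (newCentroids[i] - centroids[i]).EuclideanLength();
        }

        centroids = newCentroids;
        if (movement == 0)
        {
            break;
        }
    }

    return clusters.Select(cluster => cluster.ToArray()).ToArray();
}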