/// <summary>
/// CLI entry point (v1): indexes every ".sgm" file in the given folder into a new
/// index file, then enters an interactive query loop that prints matching doc ids.
/// </summary>
/// <param name="args">args[0] = corpus folder path, args[1] = destination index file path.</param>
public static void Main(string[] args)
{
    if (args.Length != 2)
    {
        Console.WriteLine("usage: Spimi <folderpath> <DestinationIndexFilePath>");
        return;
    }

    string directory = args[0];
    string indexFilePath = args[1];

    SpimiIndexer indexer = new SpimiIndexer(new BasicLexer());
    DirectoryInfo dir = new DirectoryInfo(directory);
    foreach (FileInfo file in dir.GetFiles().Where(f => f.Extension.Equals(".sgm")))
    {
        // FIX: the stream returned by file.Open() was never disposed, leaking one
        // OS file handle per corpus document; wrap it in a using block.
        using (FileStream docStream = file.Open(FileMode.Open))
        {
            indexer.Index(file.FullName, docStream);
        }
    }

    // CreateNew (unchanged) deliberately fails if the index file already exists.
    using (FileStream indexFileStream = File.Open(indexFilePath, FileMode.CreateNew))
    {
        indexer.CreateIndex(indexFileStream);
        FileIndex index = FileIndex.Open(indexFileStream);
        QueryEngine queryEngine = new QueryEngine(index);

        while (true)
        {
            Console.Write("> ");
            string query = Console.ReadLine();

            // FIX: Console.ReadLine() returns null on end-of-input; the original
            // called query.ToLower() unconditionally (NullReferenceException).
            // Treat EOF as a clean exit from the REPL.
            if (query == null)
            {
                return;
            }

            foreach (string docId in queryEngine.Query(query.ToLower()))
            {
                Console.WriteLine(docId);
            }
        }
    }
}
/// <summary>
/// CLI entry point (v2): crawls the given folder for web documents, builds the
/// SPIMI index + metadata files, then hands control to the interactive QueryCli.
/// </summary>
/// <param name="args">
/// args[0] = corpus folder path, args[1] = destination index file path,
/// args[2] = destination metadata file path.
/// </param>
public static void Main(string[] args)
{
    if (args.Length != 3)
    {
        // FIX: usage string was missing the closing '>' on the last argument.
        Console.WriteLine("usage: Spimi <folderpath> <DestinationIndexFilePath> <metadatafilepath>");
        Console.ReadLine();
        return;
    }

    string directory = args[0];
    string indexFilePath = args[1];
    string metadataFilePath = args[2];

    Console.WriteLine("Welcome to Spimi!");

    DirectoryInfo directoryInfo = new DirectoryInfo(directory);
    if (!directoryInfo.Exists)
    {
        Console.WriteLine("Directory could not be found");
        return;
    }

    using (FileStream indexFileStream = File.Open(indexFilePath, FileMode.Create))
    {
        using (FileStream metadataFileStream = File.Open(metadataFilePath, FileMode.Create))
        {
            // 1- Index the corpus
            Console.WriteLine("Parsing corpus and creating index blocks...");
            SpimiIndexer indexer = new SpimiIndexer(
                new BasicLexer(), new HtmlParser(), indexFileStream, metadataFileStream);

            WebCrawler crawler = new WebCrawler(directoryInfo);
            foreach (WebDocument doc in crawler.GetDocuments())
            {
                Stream stream = doc.Open();
                indexer.Index(doc.Uri, stream);
                stream.Close();
            }

            // 2- Build the final index
            Console.WriteLine("Merging blocks into one index...");
            indexer.WriteOut();

            IndexMetadata indexMetadata = new IndexMetadata(metadataFileStream);
            TermIndex index = new TermIndex(indexFileStream);
            // FIX: removed an unused local QueryEngine — the QueryCli below is
            // what actually serves queries.

            // 3- Query the index
            Console.WriteLine("Done! Please use one of the following commands: \n/query <term1> <term2>\n/cluster <k>\n");
            QueryCli cli = new QueryCli(indexMetadata, index);
            cli.Run();
        }
    }
}
/// <summary>
/// Verifies that a two-term TF-IDF query returns every posting that contains
/// either term: each document id from the union of the "foo" and "bar" posting
/// lists must appear in the engine's result list.
/// </summary>
public void testQuery()
{
    QueryEngine engine = new QueryEngine(index, metadata);

    IList<long> foundPostings = engine.Query("foo bar", RankingMode.TFIDF);

    // Iterate the union directly — no need to materialize an intermediate list.
    foreach (Posting expected in postingsWithFoo.Union(postingsWithBar))
    {
        Assert.IsTrue(foundPostings.Contains(expected.DocumentId));
    }
}
/// <summary>
/// Runs k-means over every indexed document and returns, per cluster, its top
/// descriptive terms (ranked by summed term frequency weighted by IDF) together
/// with the URIs of the cluster's documents.
/// </summary>
/// <param name="k">Number of clusters to produce.</param>
/// <returns>One ClusterResult per cluster.</returns>
public IList<ClusterResult> Cluster(int k)
{
    using (FileStream indexFileStream = File.Open(indexFilePath, FileMode.Open))
    {
        using (FileStream metadataFileStream = File.Open(metadataFilePath, FileMode.Open))
        {
            IndexMetadata indexMetadata = new IndexMetadata(metadataFileStream);
            TermIndex index = new TermIndex(indexFileStream);
            QueryEngine queryEngine = new QueryEngine(index, indexMetadata);

            KMeansClusterFinder clusterFinder = new KMeansClusterFinder(indexMetadata, index);
            IList<long> allDocIds = indexMetadata.GetDocumentIds();
            long[][] clusters = clusterFinder.Cluster(allDocIds, k);

            IList<ClusterResult> clusterResults = new List<ClusterResult>();
            foreach (long[] cluster in clusters)
            {
                // FIX: materialize once. The original kept a deferred IEnumerable and
                // enumerated it three times (sum, URI projection) and additionally
                // called indexMetadata.GetDocuments(cluster) a second time for the
                // centroid — repeating the metadata lookups for every cluster.
                IList<DocumentInfo> clusterDocuments = indexMetadata.GetDocuments(cluster).ToList();

                // Sum the term frequencies across the cluster's documents.
                TermVector sum = new TermVector();
                foreach (TermVector vector in clusterDocuments.Select(d => d.TermVector))
                {
                    sum += vector;
                }

                // Rank the centroid's dimensions by (summed tf) * idf and keep the
                // six strongest terms as the cluster's label.
                IEnumerable<string> topTerms =
                    TermVector.GetCentroid(clusterDocuments.Select(docInfo => docInfo.TermVector))
                        .GetNonZeroDimensions()
                        .OrderByDescending(term => sum.GetDimensionLength(term) * this.GetIdf(index, indexMetadata, term))
                        .Take(6);

                clusterResults.Add(new ClusterResult(
                    topTerms.ToList(),
                    clusterDocuments.Select(docInfo => docInfo.Uri).ToList()));
            }
            return clusterResults;
        }
    }
}
/// <summary>
/// Opens the index and metadata files, runs the (lower-cased) query through the
/// QueryEngine, and maps up to the top 500 document ids to QueryResult records
/// (title, URI, score). Documents missing from the metadata are reported on the
/// console and skipped.
/// </summary>
/// <param name="query">Raw query string; lower-cased before searching.</param>
/// <param name="rankingMode">Ranking strategy passed through to the engine.</param>
/// <returns>Ranked query results, at most 500.</returns>
public IList<QueryResult> Query(string query, RankingMode rankingMode)
{
    using (FileStream indexFileStream = File.Open(indexFilePath, FileMode.Open))
    {
        using (FileStream metadataFileStream = File.Open(metadataFilePath, FileMode.Open))
        {
            IndexMetadata indexMetadata = new IndexMetadata(metadataFileStream);
            TermIndex index = new TermIndex(indexFileStream);
            QueryEngine queryEngine = new QueryEngine(index, indexMetadata);

            IList<long> results = queryEngine.Query(query.ToLower(), rankingMode);

            IList<QueryResult> queryResults = new List<QueryResult>();
            // FIX: removed unused local `int i = 1;` — it was never incremented or read.
            // NOTE(review): this header is printed here but the per-row output appears
            // to happen in the caller — confirm before moving/removing it.
            Console.WriteLine("rank\tscore\ttitle");
            foreach (long docId in results.Take(500))
            {
                DocumentInfo docInfo;
                if (indexMetadata.TryGetDocumentInfo(docId, out docInfo))
                {
                    QueryResult res = new QueryResult()
                    {
                        Title = docInfo.Title,
                        Uri = docInfo.Uri,
                        Score = queryEngine.Scores[docId]
                    };
                    queryResults.Add(res);
                }
                else
                {
                    Console.WriteLine("Found document id in posting list that wasn't indexed in metadata: " + docId);
                }
            }
            return queryResults;
        }
    }
}
/// <summary>
/// Builds a query CLI over an already-loaded index: stores the metadata and term
/// index, and constructs the QueryEngine that will serve interactive queries.
/// </summary>
/// <param name="metadata">Per-document metadata store.</param>
/// <param name="index">Term-to-postings index.</param>
public QueryCli(IndexMetadata metadata, TermIndex index)
{
    this.index = index;
    this.metadata = metadata;
    this.queryEngine = new QueryEngine(index, metadata);
}