// Round-trip check: index two sample documents, serialize the index to a
// stream, re-open it, and verify the posting list of a term ("sit") that
// occurs in both documents lists them in insertion order.
public void TestLargeIndex()
{
    SpimiIndexer indexer = new SpimiIndexer(new BasicLexer());
    MemoryStream output = new MemoryStream();

    // Feed both fixtures through the indexer, then write the merged index.
    indexer.Index("TestData1", GetStream(TestData1));
    indexer.Index("TestData2", GetStream(TestData2));
    indexer.CreateIndex(output);

    // Read the serialized index back and inspect the postings for "sit".
    FileIndex fileIndex = FileIndex.Open(output);
    PostingList postings = fileIndex.GetPostingList("sit");

    Assert.AreEqual("sit", postings.Term);
    Assert.AreEqual("TestData1", postings.Postings[0]);
    Assert.AreEqual("TestData2", postings.Postings[1]);
}
/// <summary>
/// Entry point: indexes every .sgm file in the given folder, writes the
/// index to the destination path, then serves an interactive query loop.
/// </summary>
/// <param name="args">args[0] = corpus folder, args[1] = destination index file.</param>
public static void Main(string[] args)
{
    if (args.Length != 2)
    {
        Console.WriteLine("usage: Spimi <folderpath> <DestinationIndexFilePath>");
        return;
    }

    string directory = args[0];
    string indexFilePath = args[1];

    SpimiIndexer indexer = new SpimiIndexer(new BasicLexer());
    DirectoryInfo dir = new DirectoryInfo(directory);
    foreach (FileInfo file in dir.GetFiles().Where(f => f.Extension.Equals(".sgm")))
    {
        // Dispose each source stream once it has been indexed; the original
        // opened every file and never closed it, leaking one handle per file.
        using (FileStream fileStream = file.Open(FileMode.Open))
        {
            indexer.Index(file.FullName, fileStream);
        }
    }

    using (FileStream indexFileStream = File.Open(indexFilePath, FileMode.CreateNew))
    {
        indexer.CreateIndex(indexFileStream);
        FileIndex index = FileIndex.Open(indexFileStream);
        QueryEngine queryEngine = new QueryEngine(index);

        while (true)
        {
            Console.Write("> ");
            string query = Console.ReadLine();

            // Console.ReadLine() returns null at end of input; the original
            // then threw NullReferenceException on query.ToLower() and had
            // no way to leave the loop. Treat EOF as "quit".
            if (query == null)
            {
                break;
            }

            foreach (string docId in queryEngine.Query(query.ToLower()))
            {
                Console.WriteLine(docId);
            }
        }
    }
}
/// <summary>
/// Entry point: crawls the given folder for web documents, builds the index
/// and its metadata files, then hands control to the interactive query CLI.
/// </summary>
/// <param name="args">
/// args[0] = corpus folder, args[1] = destination index file,
/// args[2] = destination metadata file.
/// </param>
public static void Main(string[] args)
{
    if (args.Length != 3)
    {
        // Fixed: the original usage string was missing the closing '>' on
        // the last placeholder ("<metadatafilepath").
        Console.WriteLine("usage: Spimi <folderpath> <DestinationIndexFilePath> <metadatafilepath>");
        Console.ReadLine();
        return;
    }

    string directory = args[0];
    string indexFilePath = args[1];
    string metadataFilePath = args[2];

    Console.WriteLine("Welcome to Spimi!");

    DirectoryInfo directoryInfo = new DirectoryInfo(directory);
    if (!directoryInfo.Exists)
    {
        Console.WriteLine("Directory could not be found");
        return;
    }

    using (FileStream indexFileStream = File.Open(indexFilePath, FileMode.Create))
    {
        using (FileStream metadataFileStream = File.Open(metadataFilePath, FileMode.Create))
        {
            // 1- Index the corpus: parse each crawled document and emit index blocks.
            Console.WriteLine("Parsing corpus and creating index blocks...");
            SpimiIndexer indexer = new SpimiIndexer(
                new BasicLexer(), new HtmlParser(), indexFileStream, metadataFileStream);

            WebCrawler crawler = new WebCrawler(directoryInfo);
            foreach (WebDocument doc in crawler.GetDocuments())
            {
                Stream stream = doc.Open();
                indexer.Index(doc.Uri, stream);
                stream.Close();
            }

            // 2- Build the final index by merging the intermediate blocks.
            Console.WriteLine("Merging blocks into one index...");
            indexer.WriteOut();

            IndexMetadata indexMetadata = new IndexMetadata(metadataFileStream);
            TermIndex index = new TermIndex(indexFileStream);
            QueryEngine queryEngine = new QueryEngine(index, indexMetadata);

            // 3- Query the index interactively.
            Console.WriteLine("Done! Please use one of the following commands: \n/query <term1> <term2>\n/cluster <k>\n");
            QueryCli cli = new QueryCli(indexMetadata, index);
            cli.Run();
        }
    }
}
/// <summary>
/// Indexes the corpus stored under <c>directory + site</c>, rewriting the
/// index and metadata files from scratch, and reports timing statistics.
/// </summary>
/// <param name="site">Site sub-path appended to the configured directory.</param>
/// <returns>
/// An <see cref="IndexingStats"/> with the collection size (in documents) and
/// elapsed indexing time in milliseconds; default stats if the folder is missing.
/// </returns>
public IndexingStats Index(string site)
{
    IndexingStats result = new IndexingStats();

    // NOTE(review): raw concatenation requires 'directory' to end with a path
    // separator — presumably it does; confirm before switching to Path.Combine.
    DirectoryInfo directoryInfo = new DirectoryInfo(directory + site);
    if (!directoryInfo.Exists)
    {
        return result;
    }

    // Fixed: timing used DateTime.Now deltas, which are wrong across DST or
    // clock adjustments; Stopwatch measures monotonic elapsed time.
    System.Diagnostics.Stopwatch timer = System.Diagnostics.Stopwatch.StartNew();

    using (FileStream indexFileStream = File.Open(indexFilePath, FileMode.Create))
    {
        using (FileStream metadataFileStream = File.Open(metadataFilePath, FileMode.Create))
        {
            // Index the corpus: parse every crawled document into index blocks.
            SpimiIndexer indexer = new SpimiIndexer(
                new BasicLexer(), new HtmlParser(), indexFileStream, metadataFileStream);

            WebCrawler crawler = new WebCrawler(directoryInfo);
            foreach (WebDocument doc in crawler.GetDocuments())
            {
                Stream stream = doc.Open();
                indexer.Index(doc.Uri, stream);
                stream.Close();
            }

            // Merge the intermediate blocks into the final index.
            indexer.WriteOut();

            IndexMetadata indexMetadata = new IndexMetadata(metadataFileStream);
            result.CollectionSize = indexMetadata.CollectionLengthInDocuments;
        }
    }

    timer.Stop();
    result.IndexingTime = timer.Elapsed.TotalMilliseconds;
    return result;
}