public static void Main(string[] args)
{
    if (args.Length != 3)
    {
        Console.WriteLine("usage: Spimi <folderPath> <destinationIndexFilePath> <metadataFilePath>");
        Console.ReadLine();
        return;
    }

    string directory = args[0];
    string indexFilePath = args[1];
    string metadataFilePath = args[2];

    Console.WriteLine("Welcome to Spimi!");

    DirectoryInfo directoryInfo = new DirectoryInfo(directory);
    if (!directoryInfo.Exists)
    {
        Console.WriteLine("Directory could not be found");
        return;
    }

    using (FileStream indexFileStream = File.Open(indexFilePath, FileMode.Create))
    using (FileStream metadataFileStream = File.Open(metadataFilePath, FileMode.Create))
    {
        // 1- Parse the corpus and write the intermediate index blocks
        Console.WriteLine("Parsing corpus and creating index blocks...");
        SpimiIndexer indexer = new SpimiIndexer(
            new BasicLexer(),
            new HtmlParser(),
            indexFileStream,
            metadataFileStream);

        WebCrawler crawler = new WebCrawler(directoryInfo);
        foreach (WebDocument doc in crawler.GetDocuments())
        {
            // Dispose each document stream as soon as it has been indexed
            using (Stream stream = doc.Open())
            {
                indexer.Index(doc.Uri, stream);
            }
        }

        // 2- Merge the blocks into the final index
        Console.WriteLine("Merging blocks into one index...");
        indexer.WriteOut();

        IndexMetadata indexMetadata = new IndexMetadata(metadataFileStream);
        TermIndex index = new TermIndex(indexFileStream);
        QueryEngine queryEngine = new QueryEngine(index, indexMetadata);

        // 3- Query the index from the command line
        Console.WriteLine("Done! Please use one of the following commands:\n/query <term1> <term2>\n/cluster <k>\n");
        QueryCli cli = new QueryCli(indexMetadata, index);
        cli.Run();
    }
}
public IndexingStats Index(string site)
{
    IndexingStats result = new IndexingStats();

    // Path.Combine avoids missing-separator bugs from plain string concatenation
    DirectoryInfo directoryInfo = new DirectoryInfo(Path.Combine(directory, site));
    if (!directoryInfo.Exists)
    {
        return result;
    }

    DateTime start = DateTime.Now;
    using (FileStream indexFileStream = File.Open(indexFilePath, FileMode.Create))
    using (FileStream metadataFileStream = File.Open(metadataFilePath, FileMode.Create))
    {
        // Index the corpus and write the intermediate blocks
        SpimiIndexer indexer = new SpimiIndexer(
            new BasicLexer(),
            new HtmlParser(),
            indexFileStream,
            metadataFileStream);

        WebCrawler crawler = new WebCrawler(directoryInfo);
        foreach (WebDocument doc in crawler.GetDocuments())
        {
            using (Stream stream = doc.Open())
            {
                indexer.Index(doc.Uri, stream);
            }
        }

        // Merge the blocks into the final index
        indexer.WriteOut();

        IndexMetadata indexMetadata = new IndexMetadata(metadataFileStream);
        result.CollectionSize = indexMetadata.CollectionLengthInDocuments;
    }
    DateTime end = DateTime.Now;
    result.IndexingTime = (end - start).TotalMilliseconds;

    return result;
}
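Index returns an IndexingStats object carrying the collection size and the elapsed indexing time. The type itself is not shown above; a minimal sketch, assuming only the two members actually referenced here (CollectionSize and IndexingTime) and hypothetical property types, might look like this:

// Minimal sketch of the IndexingStats type assumed by Index above.
// Property types are assumptions; the real type may carry more statistics.
public class IndexingStats
{
    // Number of documents in the indexed collection.
    public long CollectionSize { get; set; }

    // Wall-clock indexing time, in milliseconds.
    public double IndexingTime { get; set; }
}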