Example #1
0
        public static void Main(string[] args)
        {
            if (args.Length != 3)
            {
                Console.WriteLine("usage: Spimi <folderpath> <DestinationIndexFilePath> <metadatafilepath");
                Console.ReadLine();
                return;
            }
            string directory = args[0];
            string indexFilePath = args[1];
            string metadataFilePath = args[2];

            Console.WriteLine("Welcome to Spimi!");

            DirectoryInfo directoryInfo = new DirectoryInfo(directory);
            if (!directoryInfo.Exists)
            {
                Console.WriteLine("Directory could not be found");
                return;
            }

            using (FileStream indexFileStream = File.Open(indexFilePath, FileMode.Create))
            {
                using (FileStream metadataFileStream = File.Open(metadataFilePath, FileMode.Create))
                {
                    // Index the corpus
                    Console.WriteLine("Parsing corpus and creating index blocks...");
                    SpimiIndexer indexer = new SpimiIndexer(
                        new BasicLexer(),
                        new HtmlParser(),
                        indexFileStream,
                        metadataFileStream);

                    WebCrawler crawler = new WebCrawler(directoryInfo);
                    foreach (WebDocument doc in crawler.GetDocuments())
                    {
                        Stream stream = doc.Open();
                        indexer.Index(doc.Uri, stream);
                        stream.Close();
                    }

                    // 2- Build the final index
                    Console.WriteLine("Merging blocks into one index...");
                    indexer.WriteOut();

                    IndexMetadata indexMetadata = new IndexMetadata(metadataFileStream);
                    TermIndex index = new TermIndex(indexFileStream);
                    QueryEngine queryEngine = new QueryEngine(index, indexMetadata);

                    // 3- Query the index
                    Console.WriteLine("Done! Please use one of the following commands: \n/query <term1> <term2>\n/cluster <k>\n");

                    QueryCli cli = new QueryCli(indexMetadata, index);
                    cli.Run();
                }
            }
        }
Example #2
0
    public IndexingStats Index(string site)
    {
        IndexingStats result = new IndexingStats();

        DirectoryInfo directoryInfo = new DirectoryInfo(directory + site);
        if (!directoryInfo.Exists)
        {
            return result;
        }

        DateTime start = DateTime.Now;
        using (FileStream indexFileStream = File.Open(indexFilePath, FileMode.Create))
        {
            using (FileStream metadataFileStream = File.Open(metadataFilePath, FileMode.Create))
            {
                // Index the corpus
                SpimiIndexer indexer = new SpimiIndexer(
                    new BasicLexer(),
                    new HtmlParser(),
                    indexFileStream,
                    metadataFileStream);

                WebCrawler crawler = new WebCrawler(directoryInfo);
                foreach (WebDocument doc in crawler.GetDocuments())
                {
                    Stream stream = doc.Open();
                    indexer.Index(doc.Uri, stream);
                    stream.Close();
                }

                indexer.WriteOut();
                IndexMetadata indexMetadata = new IndexMetadata(metadataFileStream);
                result.CollectionSize = indexMetadata.CollectionLengthInDocuments;
            }
        }
        DateTime end = DateTime.Now;
        result.IndexingTime = (end - start).TotalMilliseconds;
        return result;
    }