Example #1
0
        public static void Main(string[] args)
        {
            if (args.Length != 2)
            {
                Console.WriteLine("usage: Spimi <folderpath> <DestinationIndexFilePath>");
                return;
            }
            string directory = args[0];
            string indexFilePath = args[1];
            SpimiIndexer indexer = new SpimiIndexer(new BasicLexer());

            DirectoryInfo dir = new DirectoryInfo(directory);
            foreach(FileInfo file in dir.GetFiles().Where(f => f.Extension.Equals(".sgm")))
                indexer.Index(file.FullName, file.Open(FileMode.Open));

            using (FileStream indexFileStream = File.Open(indexFilePath, FileMode.CreateNew))
            {
                indexer.CreateIndex(indexFileStream);
                FileIndex index = FileIndex.Open(indexFileStream);
                QueryEngine queryEngine = new QueryEngine(index);
                while (true)
                {
                    Console.Write("> ");
                    string query = Console.ReadLine();
                    foreach (string docId in queryEngine.Query(query.ToLower()))
                    {
                        Console.WriteLine(docId);
                    }
                }
            }
        }
Example #2
0
        public static void Main(string[] args)
        {
            if (args.Length != 3)
            {
                Console.WriteLine("usage: Spimi <folderpath> <DestinationIndexFilePath> <metadatafilepath");
                Console.ReadLine();
                return;
            }
            string directory = args[0];
            string indexFilePath = args[1];
            string metadataFilePath = args[2];

            Console.WriteLine("Welcome to Spimi!");

            DirectoryInfo directoryInfo = new DirectoryInfo(directory);
            if (!directoryInfo.Exists)
            {
                Console.WriteLine("Directory could not be found");
                return;
            }

            using (FileStream indexFileStream = File.Open(indexFilePath, FileMode.Create))
            {
                using (FileStream metadataFileStream = File.Open(metadataFilePath, FileMode.Create))
                {
                    // Index the corpus
                    Console.WriteLine("Parsing corpus and creating index blocks...");
                    SpimiIndexer indexer = new SpimiIndexer(
                        new BasicLexer(),
                        new HtmlParser(),
                        indexFileStream,
                        metadataFileStream);

                    WebCrawler crawler = new WebCrawler(directoryInfo);
                    foreach (WebDocument doc in crawler.GetDocuments())
                    {
                        Stream stream = doc.Open();
                        indexer.Index(doc.Uri, stream);
                        stream.Close();
                    }

                    // 2- Build the final index
                    Console.WriteLine("Merging blocks into one index...");
                    indexer.WriteOut();

                    IndexMetadata indexMetadata = new IndexMetadata(metadataFileStream);
                    TermIndex index = new TermIndex(indexFileStream);
                    QueryEngine queryEngine = new QueryEngine(index, indexMetadata);

                    // 3- Query the index
                    Console.WriteLine("Done! Please use one of the following commands: \n/query <term1> <term2>\n/cluster <k>\n");

                    QueryCli cli = new QueryCli(indexMetadata, index);
                    cli.Run();
                }
            }
        }
Example #3
0
        public void TestLargeIndex()
        {
            SpimiIndexer spimi = new SpimiIndexer(new BasicLexer());

            MemoryStream indexStream = new MemoryStream();
            spimi.Index("TestData1", GetStream(TestData1));
            spimi.Index("TestData2", GetStream(TestData2));
            spimi.CreateIndex(indexStream);
            FileIndex index = FileIndex.Open(indexStream);
            PostingList list = index.GetPostingList("sit");
            Assert.AreEqual("sit", list.Term);
            Assert.AreEqual("TestData1", list.Postings[0]);
            Assert.AreEqual("TestData2", list.Postings[1]);
        }
Example #4
0
    public IndexingStats Index(string site)
    {
        IndexingStats result = new IndexingStats();

        DirectoryInfo directoryInfo = new DirectoryInfo(directory + site);
        if (!directoryInfo.Exists)
        {
            return result;
        }

        DateTime start = DateTime.Now;
        using (FileStream indexFileStream = File.Open(indexFilePath, FileMode.Create))
        {
            using (FileStream metadataFileStream = File.Open(metadataFilePath, FileMode.Create))
            {
                // Index the corpus
                SpimiIndexer indexer = new SpimiIndexer(
                    new BasicLexer(),
                    new HtmlParser(),
                    indexFileStream,
                    metadataFileStream);

                WebCrawler crawler = new WebCrawler(directoryInfo);
                foreach (WebDocument doc in crawler.GetDocuments())
                {
                    Stream stream = doc.Open();
                    indexer.Index(doc.Uri, stream);
                    stream.Close();
                }

                indexer.WriteOut();
                IndexMetadata indexMetadata = new IndexMetadata(metadataFileStream);
                result.CollectionSize = indexMetadata.CollectionLengthInDocuments;
            }
        }
        DateTime end = DateTime.Now;
        result.IndexingTime = (end - start).TotalMilliseconds;
        return result;
    }