Пример #1
0
        public static void Main(string[] args)
        {
            if (args.Length != 2)
            {
                Console.WriteLine("usage: Spimi <folderpath> <DestinationIndexFilePath>");
                return;
            }
            string directory = args[0];
            string indexFilePath = args[1];
            SpimiIndexer indexer = new SpimiIndexer(new BasicLexer());

            DirectoryInfo dir = new DirectoryInfo(directory);
            foreach(FileInfo file in dir.GetFiles().Where(f => f.Extension.Equals(".sgm")))
                indexer.Index(file.FullName, file.Open(FileMode.Open));

            using (FileStream indexFileStream = File.Open(indexFilePath, FileMode.CreateNew))
            {
                indexer.CreateIndex(indexFileStream);
                FileIndex index = FileIndex.Open(indexFileStream);
                QueryEngine queryEngine = new QueryEngine(index);
                while (true)
                {
                    Console.Write("> ");
                    string query = Console.ReadLine();
                    foreach (string docId in queryEngine.Query(query.ToLower()))
                    {
                        Console.WriteLine(docId);
                    }
                }
            }
        }
Пример #2
0
        public static void Main(string[] args)
        {
            if (args.Length != 3)
            {
                Console.WriteLine("usage: Spimi <folderpath> <DestinationIndexFilePath> <metadatafilepath");
                Console.ReadLine();
                return;
            }
            string directory = args[0];
            string indexFilePath = args[1];
            string metadataFilePath = args[2];

            Console.WriteLine("Welcome to Spimi!");

            DirectoryInfo directoryInfo = new DirectoryInfo(directory);
            if (!directoryInfo.Exists)
            {
                Console.WriteLine("Directory could not be found");
                return;
            }

            using (FileStream indexFileStream = File.Open(indexFilePath, FileMode.Create))
            {
                using (FileStream metadataFileStream = File.Open(metadataFilePath, FileMode.Create))
                {
                    // Index the corpus
                    Console.WriteLine("Parsing corpus and creating index blocks...");
                    SpimiIndexer indexer = new SpimiIndexer(
                        new BasicLexer(),
                        new HtmlParser(),
                        indexFileStream,
                        metadataFileStream);

                    WebCrawler crawler = new WebCrawler(directoryInfo);
                    foreach (WebDocument doc in crawler.GetDocuments())
                    {
                        Stream stream = doc.Open();
                        indexer.Index(doc.Uri, stream);
                        stream.Close();
                    }

                    // 2- Build the final index
                    Console.WriteLine("Merging blocks into one index...");
                    indexer.WriteOut();

                    IndexMetadata indexMetadata = new IndexMetadata(metadataFileStream);
                    TermIndex index = new TermIndex(indexFileStream);
                    QueryEngine queryEngine = new QueryEngine(index, indexMetadata);

                    // 3- Query the index
                    Console.WriteLine("Done! Please use one of the following commands: \n/query <term1> <term2>\n/cluster <k>\n");

                    QueryCli cli = new QueryCli(indexMetadata, index);
                    cli.Run();
                }
            }
        }
Пример #3
0
 public void testQuery()
 {
     QueryEngine engine = new QueryEngine(index, metadata);
     IList<long> foundPostings = engine.Query("foo bar", RankingMode.TFIDF);
     IList<Posting> expectedPostings = postingsWithFoo.Union(postingsWithBar).ToList();
     foreach (Posting posting in expectedPostings)
     {
         Assert.IsTrue(foundPostings.Contains(posting.DocumentId));
     }
 }
Пример #4
0
    public IList<ClusterResult> Cluster(int k)
    {
        using (FileStream indexFileStream = File.Open(indexFilePath, FileMode.Open))
        {
            using (FileStream metadataFileStream = File.Open(metadataFilePath, FileMode.Open))
            {
                IndexMetadata indexMetadata = new IndexMetadata(metadataFileStream);
                TermIndex index = new TermIndex(indexFileStream);
                QueryEngine queryEngine = new QueryEngine(index, indexMetadata);

                KMeansClusterFinder clusterFinder = new KMeansClusterFinder(indexMetadata, index);
                IList<long> allDocIds = indexMetadata.GetDocumentIds();
                long[][] clusters = clusterFinder.Cluster(allDocIds, k);

                IList<ClusterResult> clusterResults = new List<ClusterResult>();

                foreach (long[] cluster in clusters)
                {
                    // Get the term frequencies in the collection
                    IEnumerable<DocumentInfo> clusterDocuments = indexMetadata.GetDocuments(cluster);
                    TermVector sum = new TermVector();
                    foreach (TermVector vector in clusterDocuments.Select(d => d.TermVector))
                    {
                        sum += vector;
                    }

                    IEnumerable<string> topTerms =
                        TermVector.GetCentroid(indexMetadata.GetDocuments(cluster)
                            .Select(docInfo => docInfo.TermVector))
                        .GetNonZeroDimensions()
                        .OrderByDescending(term => sum.GetDimensionLength(term) * this.GetIdf(index, indexMetadata, term))
                        .Take(6);

                    clusterResults.Add(new ClusterResult(topTerms.ToList(),
                        clusterDocuments.Select(docInfo => docInfo.Uri).ToList()));
                }

                return clusterResults;
            }
        }
    }
Пример #5
0
    public IList<QueryResult> Query(string query, RankingMode rankingMode)
    {
        using (FileStream indexFileStream = File.Open(indexFilePath, FileMode.Open))
        {
            using (FileStream metadataFileStream = File.Open(metadataFilePath, FileMode.Open))
            {
                IndexMetadata indexMetadata = new IndexMetadata(metadataFileStream);
                TermIndex index = new TermIndex(indexFileStream);
                QueryEngine queryEngine = new QueryEngine(index, indexMetadata);

                IList<long> results = queryEngine.Query(query.ToLower(), rankingMode);
                IList<QueryResult> queryResults = new List<QueryResult>();

                int i = 1;
                Console.WriteLine("rank\tscore\ttitle");
                foreach (long docId in results.Take(500))
                {
                    DocumentInfo docInfo;
                    if (indexMetadata.TryGetDocumentInfo(docId, out docInfo))
                    {
                        QueryResult res = new QueryResult()
                        {
                            Title = docInfo.Title,
                            Uri = docInfo.Uri,
                            Score = queryEngine.Scores[docId]
                        };
                        queryResults.Add(res);
                    }
                    else
                    {
                        Console.WriteLine("Found document id in posting list that wasn't indexed in metadata: " + docId);
                    }
                }

                return queryResults;
            }
        }
    }
Пример #6
0
 public QueryCli(IndexMetadata metadata, TermIndex index)
 {
     this.metadata = metadata;
     this.index = index;
     this.queryEngine = new QueryEngine(index, metadata);
 }