Ejemplo n.º 1
0
        /// <summary>
        /// Main entry point for index building: runs <c>ParseSourceDoc</c>, segments every
        /// paragraph stored in the source-text collection into tokens, then writes the word
        /// list and the inverted index to MongoDB.
        /// </summary>
        /// <returns>
        /// <c>true</c> when every step completes; <c>false</c> when any step throws (the
        /// exception is written to the console and not propagated).
        /// </returns>
        public bool MakeIndex()
        {
            try
            {
                // 1. Parse the source documents.
                ParseSourceDoc();

                // 2. Build the index.
                ChineseSegmentor segmentor = new ChineseSegmentor();
                var server        = MongoDbLib.GetServerConnection(_indexer.MongoDbServer);
                var database      = server.GetDatabase(_indexer.DbName);
                var tblSourceText = database.GetCollection<SourceText>(Constants.TblSourceText);

                // Segment each paragraph and process every token, walking the paragraphs
                // in (DocId, ParaId) order.
                var sources = from s in tblSourceText.AsQueryable<SourceText>()
                              orderby s.DocId, s.ParaId
                              select s;

                // Word -> inverted index entry holding every occurrence of that word.
                Dictionary<String, InvertedIndex> fullIndexes = new Dictionary<String, InvertedIndex>();
                foreach (var aSourceText in sources)
                {
                    List<Pair<String, Int32>> tokens = segmentor.SegWords(aSourceText.Para);
                    foreach (var aToken in tokens)
                    {
                        // Single lookup: create and register the entry only the first time
                        // a word is seen.
                        InvertedIndex aIndex;
                        if (!fullIndexes.TryGetValue(aToken.First, out aIndex))
                        {
                            aIndex      = new InvertedIndex();
                            aIndex.Word = aToken.First;
                            fullIndexes.Add(aToken.First, aIndex);
                        }

                        aIndex.Indexes.Add(new IndexElement()
                        {
                            DocId  = aSourceText.DocId,
                            ParaId = aSourceText.ParaId,
                            Offset = aToken.Second
                        });
                    }
                }

                // Store the word list. Word ids are assigned sequentially in the
                // dictionary's enumeration order and stamped onto each index entry so the
                // two collections can be joined on WordId.
                var wordListCollection   = database.GetCollection(Constants.TblWordList);
                List<BsonDocument> batch = new List<BsonDocument>(fullIndexes.Count);
                int wordId = 0;
                foreach (var entry in fullIndexes)
                {
                    entry.Value.WordId = wordId;

                    batch.Add(new BsonDocument()
                    {
                        { "Word", entry.Key },
                        { "WordId", wordId }
                    });
                    wordId++;
                }

                wordListCollection.InsertBatch(batch);

                // Store the full-text (inverted) index.
                var tblFullText = database.GetCollection(Constants.TblFullText);
                tblFullText.InsertBatch<InvertedIndex>(fullIndexes.Values.ToList());

                return(true);
            }
            catch (Exception e)
            {
                // Log type, message and stack trace; the stack trace alone does not say
                // what actually went wrong.
                Console.WriteLine(e);
                return(false);
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Searches the full-text index for <paramref name="keyword"/> and returns one page
        /// of matching documents, ranked by the number of hits per document.
        /// </summary>
        /// <param name="repo">Not used by this method.</param>
        /// <param name="keyword">
        /// Search text. It is segmented into tokens, and each token is looked up in the
        /// word list and the inverted index.
        /// </param>
        /// <param name="startPos">Number of ranked documents to skip before the page starts.</param>
        /// <param name="fetchSize">Maximum number of documents to return in the page.</param>
        /// <returns>
        /// A <c>SearchResult</c> with the total match count, the elapsed search time in
        /// seconds and the requested page of results. When none of the keyword tokens
        /// exists in the word list, the result carries only the search time. Returns
        /// <c>null</c> when the inverted index does not contain exactly one entry per word
        /// id found, when a keyword token has no index entry, or when any exception occurs.
        /// </returns>
        public SearchResult Search(Repository repo, String keyword, int startPos, int fetchSize)
        {
            try
            {
                SearchResult result = new SearchResult();

                // Timer for the reported search time.
                Stopwatch sw = Stopwatch.StartNew();

                // MongoDB initialisation.
                var server      = MongoDbLib.GetServerConnection(MongoDbServer);
                var database    = server.GetDatabase(DbName);
                var tblWordList = database.GetCollection<WordItem>(Constants.TblWordList);
                var tblFullText = database.GetCollection<InvertedIndex>(Constants.TblFullText);

                // Segment the search keyword into tokens.
                List<Pair<String, Int32>> keywordTokens = _segmentor.SegWords(keyword);

                // Fetch the word ids of the tokens from the word list.
                var buf   = (from t in keywordTokens select t.First).ToList();
                var query = from w in tblWordList.AsQueryable<WordItem>()
                            where w.Word.In(buf)
                            select new { w.WordId };
                List<Int32> wordIdList = new List<Int32>();
                foreach (var aWord in query)
                {
                    wordIdList.Add(aWord.WordId);
                }

                // No word id found means the search has zero results.
                if (wordIdList.Count == 0)
                {
                    sw.Stop();
                    result.SearchTime = sw.ElapsedMilliseconds / 1000.0;
                    return(result);
                }

                // Fetch the matching inverted-index entries. Materialise them once: the
                // query is deferred, so counting it and then filtering it per token would
                // otherwise hit the database again for every token.
                List<InvertedIndex> indexes = (from i in tblFullText.AsQueryable<InvertedIndex>()
                                               where i.WordId.In(wordIdList)
                                               select i).ToList();

                if (indexes.Count != wordIdList.Count)
                {
                    return(null);
                }

                // Map every keyword token back to its occurrence list.
                List<List<IndexElement>> checkedIndex = new List<List<IndexElement>>(keywordTokens.Count);
                foreach (var aToken in keywordTokens)
                {
                    InvertedIndex tokenIndex = indexes.FirstOrDefault(t => t.Word == aToken.First);
                    if (tokenIndex == null)
                    {
                        // A token that is not indexed used to surface as an exception from
                        // First() that the catch below turned into null; return the same
                        // value explicitly instead of using an exception as control flow.
                        return(null);
                    }

                    checkedIndex.Add(tokenIndex.Indexes);
                }

                // Check every occurrence of the first token; a document's score is the
                // number of occurrences accepted by CheckDocumentIsHitted (or simply the
                // occurrence count for a single-token keyword).
                var firstTokenIndex = checkedIndex[0];
                Dictionary<Int32, Int32> hittedDocs = new Dictionary<Int32, Int32>();
                foreach (var currentIndex in firstTokenIndex)
                {
                    if (keywordTokens.Count == 1 || CheckDocumentIsHitted(keywordTokens, 1, checkedIndex, currentIndex))
                    {
                        // TryGetValue leaves hits at 0 for a document seen for the first time.
                        int hits;
                        hittedDocs.TryGetValue(currentIndex.DocId, out hits);
                        hittedDocs[currentIndex.DocId] = hits + 1;
                    }
                }

                // Sort documents by score (descending) and take the requested page of doc ids.
                var sortedDocIds = (from entry in hittedDocs orderby entry.Value descending select entry.Key).Skip(startPos).Take(fetchSize).ToList();

                // Store the totals; the timer stops before the document texts are loaded.
                result.Matches = hittedDocs.Count;
                sw.Stop();
                result.SearchTime = sw.ElapsedMilliseconds / 1000.0;

                // Take(fetchSize) already caps the page, so only the list length bounds the loop.
                for (int i = 0; i < sortedDocIds.Count; i++)
                {
                    String rawText = this.GetDocumentById(sortedDocIds[i]);
                    result.Results.Add(new ResultItem()
                    {
                        Rank     = startPos + 1 + i,
                        Score    = hittedDocs[sortedDocIds[i]],
                        // Mark every literal occurrence of the keyword in the document text.
                        HitField = rawText.Replace(keyword, "<<" + keyword + ">>")
                    });
                }
                return(result);
            }
            catch (Exception e)
            {
                // Report the failure instead of swallowing it silently; callers still get
                // null, as before.
                Console.WriteLine(e);
                return(null);
            }
        }