Beispiel #1
0
        /// <summary>
        /// Creates a searcher configured with the default server address and
        /// database name from <c>Constants</c>, and a new <c>ChineseSegmentor</c>.
        /// The constructor is private, so instances can only be created from
        /// inside this type.
        /// </summary>
        private Searcher()
        {
            // Connection settings start out at the project-wide defaults.
            MongoDbServer = Constants.DefaultServerAddress;
            DbName = Constants.DefaultDbName;

            // Word segmenter kept on the instance.
            _segmentor = new ChineseSegmentor();
        }
Beispiel #2
0
        /// <summary>
        /// Searches the full-text index for documents that match
        /// <paramref name="keyword"/>.
        /// </summary>
        /// <param name="keyword">
        /// Text to search for. It is split into tokens with <c>ChineseSegmentor</c>.
        /// </param>
        /// <returns>
        /// A dictionary mapping document id to the number of matching
        /// occurrences, built from the entries ordered by that count
        /// (descending). Returns <c>null</c> when the keyword produces no
        /// tokens, or when any token cannot be resolved to exactly one
        /// inverted-index record.
        /// </returns>
        /// <remarks>
        /// <see cref="Dictionary{TKey,TValue}"/> does not document an
        /// enumeration order, so callers that need ranked output should sort
        /// the returned entries by value themselves.
        /// </remarks>
        public Dictionary<Int32, Int32> Search(String keyword)
        {
            var server = MongoDbLib.GetServerConnection(ServerAddress);
            var database = server.GetDatabase(DbName);
            var tblWordList = database.GetCollection<WordItem>(Constants.TblWordList);
            var tblFullText = database.GetCollection<InvertedIndex>(Constants.TblFullText);

            // Segment the search keyword into tokens.
            ChineseSegmentor segmentor = new ChineseSegmentor();
            List<Pair<String, Int32>> keywordTokens = segmentor.SegWords(keyword);

            // Nothing to search for: use the same "no result" value as an
            // unresolvable keyword instead of failing later on checkedIndex[0].
            if (keywordTokens == null || keywordTokens.Count == 0)
            {
                return null;
            }

            // Look up the word ids of the tokens in the word list.
            var buf = (from t in keywordTokens select t.First).ToList();
            var query = from w in tblWordList.AsQueryable<WordItem>()
                        where w.Word.In(buf)
                        select new { w.WordId };
            List<Int32> wordIdList = new List<Int32>();
            foreach (var aWord in query)
            {
                wordIdList.Add(aWord.WordId);
            }

            // Fetch the matching inverted-index records once; the in-memory list
            // is reused below instead of re-running the query per token.
            List<InvertedIndex> indexes = (from i in tblFullText.AsQueryable<InvertedIndex>()
                                           where i.WordId.In(wordIdList)
                                           select i).ToList();

            if (indexes.Count != wordIdList.Count)
            {
                return null;
            }

            // Map every keyword token back to its index. A token without a
            // record (for example a word missing from the word list) means the
            // keyword cannot match any document.
            List<List<IndexElement>> checkedIndex = new List<List<IndexElement>>();
            foreach (var aToken in keywordTokens)
            {
                String word = aToken.First;
                InvertedIndex match = indexes.FirstOrDefault(t => t.Word == word);
                if (match == null)
                {
                    return null;
                }

                checkedIndex.Add(match.Indexes);
            }

            // Walk the occurrences of the first token and count, per document,
            // those for which the remaining tokens are confirmed by
            // CheckDocumentIsHitted.
            var firstTokenIndex = checkedIndex[0];
            Dictionary<Int32, Int32> hittedDocs = new Dictionary<Int32, Int32>();
            foreach (var currentIndex in firstTokenIndex)
            {
                if (keywordTokens.Count == 1 || CheckDocumentIsHitted(keywordTokens, 1, checkedIndex, currentIndex))
                {
                    // TryGetValue leaves hits at 0 when the document is new.
                    Int32 hits;
                    hittedDocs.TryGetValue(currentIndex.DocId, out hits);
                    hittedDocs[currentIndex.DocId] = hits + 1;
                }
            }

            // Order the documents by hit count, highest first.
            var sortedDict = (from entry in hittedDocs orderby entry.Value descending select entry).ToDictionary(pair => pair.Key, pair => pair.Value);

            return sortedDict;
        }
Beispiel #3
0
        /// <summary>
        /// Builds the word list and the inverted full-text index from the
        /// source-text collection and stores both in the database.
        /// </summary>
        /// <returns>
        /// <c>true</c> when every step completed; <c>false</c> when any step
        /// threw, in which case the exception is written to the console.
        /// </returns>
        public bool MakeIndex()
        {
            try
            {
                // 1. Run Fetch() first (original note: extract the text).
                Fetch();

                // 2. Build the index.
                ChineseSegmentor segmentor = new ChineseSegmentor();
                var server = MongoDbLib.GetServerConnection(ServerAddress);
                var database = server.GetDatabase(DbName);
                var tblSourceText = database.GetCollection<SourceDocument>(Constants.TblSourceText);

                // Segment every paragraph, in document/paragraph order, and
                // record each token occurrence under its word.
                var sourcees = from s in tblSourceText.AsQueryable<SourceDocument>()
                               orderby s.DocId, s.ParaId
                               select s;
                Dictionary<String, InvertedIndex> fullIndexes = new Dictionary<String, InvertedIndex>();
                foreach (var aSourceText in sourcees)
                {
                    List<Pair<String, Int32>> result = segmentor.SegWords(aSourceText.Para);
                    foreach (var aToken in result)
                    {
                        // Single lookup; a new entry is created on first sight
                        // of a word.
                        InvertedIndex entry;
                        if (!fullIndexes.TryGetValue(aToken.First, out entry))
                        {
                            entry = new InvertedIndex();
                            entry.Word = aToken.First;
                            fullIndexes[aToken.First] = entry;
                        }

                        entry.Indexes.Add(new IndexElement()
                        {
                            DocId = aSourceText.DocId,
                            ParaId = aSourceText.ParaId,
                            Offset = aToken.Second
                        });
                    }
                }

                // Store the word list: each word gets a sequential id, which
                // is also written onto its index entry.
                var wordListCollection = database.GetCollection(Constants.TblWordList);
                List<BsonDocument> batch = new List<BsonDocument>(fullIndexes.Count);
                Int32 wordId = 0;
                foreach (var pair in fullIndexes)
                {
                    pair.Value.WordId = wordId;

                    batch.Add(new BsonDocument()
                        {
                            { "Word", pair.Key },
                            { "WordId", wordId }
                        });

                    wordId++;
                }

                wordListCollection.InsertBatch(batch);

                // Store the full-text index.
                var tblFullText = database.GetCollection(Constants.TblFullText);
                tblFullText.InsertBatch<InvertedIndex>(fullIndexes.Values.ToList());

                return true;
            }
            catch (Exception e)
            {
                // Print the whole exception (type, message and stack trace),
                // not just the stack trace, so the failure can be diagnosed.
                Console.WriteLine(e);
                return false;
            }
        }