/// <summary>
/// Initializes a searcher with the default server address and database name
/// taken from <c>Constants</c>, and creates the segmentor stored in <c>_segmentor</c>.
/// </summary>
/// <remarks>
/// NOTE(review): this constructor assigns <c>MongoDbServer</c>, while the methods in this
/// class read <c>ServerAddress</c> - confirm that the two members are linked.
/// </remarks>
private Searcher()
{
    MongoDbServer = Constants.DefaultServerAddress;
    DbName = Constants.DefaultDbName;
    _segmentor = new ChineseSegmentor();
}
/// <summary>
/// Looks up the documents in the full-text index that match <paramref name="keyword"/>.
/// </summary>
/// <param name="keyword">Search text; it is split into tokens with <c>ChineseSegmentor.SegWords</c>.</param>
/// <returns>
/// A dictionary mapping a document id to its hit count, built from the entries ordered by
/// hit count descending. Returns <c>null</c> when the keyword produces no tokens, or when
/// any token cannot be resolved to a stored inverted index.
/// </returns>
/// <remarks>
/// <c>Dictionary&lt;TKey, TValue&gt;</c> does not document a guaranteed enumeration order, so callers
/// that need a strict ranking should re-sort the result by value.
/// </remarks>
public Dictionary<Int32, Int32> Search(String keyword)
{
    var server = MongoDbLib.GetServerConnection(ServerAddress);
    var database = server.GetDatabase(DbName);
    var tblWordList = database.GetCollection<WordItem>(Constants.TblWordList);
    var tblFullText = database.GetCollection<InvertedIndex>(Constants.TblFullText);

    // Segment the search keyword into tokens.
    ChineseSegmentor segmentor = new ChineseSegmentor();
    List<Pair<String, Int32>> keywordTokens = segmentor.SegWords(keyword);
    if (keywordTokens == null || keywordTokens.Count == 0)
    {
        // Nothing to search for; without this guard checkedIndex[0] below would throw.
        return null;
    }

    // Fetch the word-list entries that correspond to the tokens.
    var buf = (from t in keywordTokens select t.First).ToList();
    var query = from w in tblWordList.AsQueryable<WordItem>()
                where w.Word.In(buf)
                select new { w.WordId };
    List<Int32> wordIdList = new List<Int32>();
    foreach (var aWord in query)
    {
        wordIdList.Add(aWord.WordId);
    }

    // Fetch the matching records from the full-text index. The query is materialized once
    // so that the count check and the per-token lookups do not each hit the database.
    var indexes = from i in tblFullText.AsQueryable<InvertedIndex>()
                  where i.WordId.In(wordIdList)
                  select i;
    List<InvertedIndex> fetchedIndexes = indexes.ToList();
    if (fetchedIndexes.Count != wordIdList.Count)
    {
        return null;
    }

    // Index the fetched records by word; the first record wins if a word is stored twice.
    Dictionary<String, InvertedIndex> indexByWord = new Dictionary<String, InvertedIndex>();
    foreach (var fetched in fetchedIndexes)
    {
        if (fetched.Word != null && !indexByWord.ContainsKey(fetched.Word))
        {
            indexByWord[fetched.Word] = fetched;
        }
    }

    // Map every keyword token back to its inverted index.
    List<List<IndexElement>> checkedIndex = new List<List<IndexElement>>();
    foreach (var aToken in keywordTokens)
    {
        InvertedIndex tokenIndex;
        if (aToken.First == null || !indexByWord.TryGetValue(aToken.First, out tokenIndex))
        {
            // A token that is absent from the index means the keyword cannot be resolved.
            return null;
        }
        checkedIndex.Add(tokenIndex.Indexes);
    }

    // Check, for every occurrence of the first token, whether the document is a hit.
    var firstTokenIndex = checkedIndex[0];
    Dictionary<Int32, Int32> hittedDocs = new Dictionary<Int32, Int32>();
    foreach (var currentIndex in firstTokenIndex)
    {
        if (keywordTokens.Count == 1 || CheckDocumentIsHitted(keywordTokens, 1, checkedIndex, currentIndex))
        {
            Int32 hitCount;
            hittedDocs.TryGetValue(currentIndex.DocId, out hitCount);
            hittedDocs[currentIndex.DocId] = hitCount + 1;
        }
    }

    // Order the documents by hit count, highest first.
    var sortedDict = (from entry in hittedDocs
                      orderby entry.Value descending
                      select entry).ToDictionary(pair => pair.Key, pair => pair.Value);
    return sortedDict;
}
/// <summary>
/// Builds the word list and the full-text inverted index from the stored source text
/// and writes both to the database.
/// </summary>
/// <returns>
/// <c>true</c> when every step completed; <c>false</c> when any exception was thrown
/// (the exception is written to the console).
/// </returns>
/// <remarks>
/// NOTE(review): the target collections are not cleared before the inserts, so running
/// this more than once presumably stores duplicate entries - confirm the intended usage.
/// </remarks>
public bool MakeIndex()
{
    try
    {
        // 1. Parse the source text (delegated to Fetch()).
        Fetch();

        // 2. Build the index.
        ChineseSegmentor segmentor = new ChineseSegmentor();
        var server = MongoDbLib.GetServerConnection(ServerAddress);
        var database = server.GetDatabase(DbName);
        var tblSourceText = database.GetCollection<SourceDocument>(Constants.TblSourceText);

        // Segment every paragraph and process each token.
        var sourcees = from s in tblSourceText.AsQueryable<SourceDocument>()
                       orderby s.DocId, s.ParaId
                       select s;
        Dictionary<String, InvertedIndex> fullIndexes = new Dictionary<String, InvertedIndex>();
        foreach (var aSourceText in sourcees)
        {
            List<Pair<String, Int32>> result = segmentor.SegWords(aSourceText.Para);
            foreach (var aToken in result)
            {
                InvertedIndex aIndex;
                if (!fullIndexes.TryGetValue(aToken.First, out aIndex))
                {
                    // First occurrence of this word: start a new inverted index for it.
                    aIndex = new InvertedIndex();
                    aIndex.Word = aToken.First;
                    fullIndexes[aToken.First] = aIndex;
                }

                aIndex.Indexes.Add(new IndexElement()
                {
                    DocId = aSourceText.DocId,
                    ParaId = aSourceText.ParaId,
                    Offset = aToken.Second
                });
            }
        }

        // Store the word list; word ids are assigned in dictionary enumeration order.
        var wordListCollection = database.GetCollection(Constants.TblWordList);
        List<BsonDocument> batch = new List<BsonDocument>(fullIndexes.Count);
        int wordId = 0;
        foreach (var entry in fullIndexes)
        {
            entry.Value.WordId = wordId;
            batch.Add(new BsonDocument() { { "Word", entry.Key }, { "WordId", wordId } });
            wordId++;
        }
        wordListCollection.InsertBatch(batch);

        // Store the full-text index.
        var tblFullText = database.GetCollection(Constants.TblFullText);
        tblFullText.InsertBatch<InvertedIndex>(fullIndexes.Values.ToList());
        return true;
    }
    catch (Exception e)
    {
        // Print the whole exception (type, message and stack trace), not only the stack trace.
        Console.WriteLine(e);
        return false;
    }
}