/// <summary>
/// Parses the buffered source documents and stores them in MongoDB,
/// one <c>SourceText</c> record per paragraph.
/// </summary>
/// <remarks>
/// Documents are numbered from 0 in the order they appear in the buffer;
/// paragraphs are numbered from 0 within each document. Paragraphs are
/// obtained by concatenating every full-text field (each followed by a
/// line feed) and splitting on line breaks, dropping empty entries.
/// </remarks>
/// <returns>
/// <c>true</c> when every paragraph was inserted; <c>false</c> if any
/// exception was raised (paragraphs inserted before the failure are not
/// rolled back).
/// </returns>
private bool ParseSourceDoc()
{
    try
    {
        int docId = 0;

        // Obtain the MongoDB collection that receives the paragraphs.
        var server = MongoDbLib.GetServerConnection(_indexer.MongoDbServer);
        var database = server.GetDatabase(_indexer.DbName);
        var collection = database.GetCollection<SourceText>(Constants.TblSourceText);

        // Both Windows and Unix line endings delimit paragraphs.
        String[] lineBreaks = new String[] { "\r\n", "\n" };

        // Process each source document.
        StringBuilder sb = new StringBuilder();
        foreach (var aDoc in _bufferedSrcDocs)
        {
            sb.Clear();

            // Gather every full-text field into one buffer.
            foreach (var aField in aDoc.FullText)
            {
                sb.Append(aField.Value + "\n");
            }

            // Split the full text into paragraphs.
            String[] paras = sb.ToString().Split(lineBreaks, StringSplitOptions.RemoveEmptyEntries);

            // Store each paragraph in MongoDB.
            for (int i = 0; i < paras.Length; i++)
            {
                collection.Insert(new SourceText() { DocId = docId, ParaId = i, Para = paras[i] });
            }

            docId++;
        }

        return true;
    }
    catch (Exception e)
    {
        // Report the failure the same way MakeIndex does instead of
        // silently discarding it; the caller still receives false.
        Console.WriteLine(e.StackTrace);
        return false;
    }
}
/// <summary>
/// Deletes the index store by dropping the full-text, source-text and
/// word-list collections.
/// </summary>
/// <param name="name">
/// Repository name. NOTE(review): this value is not read by the method;
/// the same three collections are dropped regardless of the name, and the
/// repository registration is left untouched - confirm this is intended.
/// </param>
/// <returns>
/// <c>true</c> when all three drops completed; <c>false</c> if any
/// exception was raised.
/// </returns>
public bool DeleteRepository(String name)
{
    bool succeeded;
    try
    {
        var db = MongoDbLib.GetServerConnection(MongoDbServer).GetDatabase(DbName);

        // Drop order matches the original: full text, source text, word list.
        String[] doomed = new String[]
        {
            Constants.TblFullText,
            Constants.TblSourceText,
            Constants.TblWordList
        };
        foreach (String collectionName in doomed)
        {
            db.DropCollection(collectionName);
        }

        succeeded = true;
    }
    catch
    {
        succeeded = false;
    }

    return succeeded;
}
/// <summary>
/// Creates the indexer with the default server address and database name,
/// then registers a <c>Repository</c> for every entry stored in the
/// repository collection.
/// </summary>
private Indexer()
{
    MongoDbServer = Constants.DefaultServerAddress;
    DbName = Constants.DefaultDbName;
    Repositories = new Dictionary<string, Repository>();

    // Load the list of known repositories from MongoDB.
    var storedRepos = MongoDbLib.GetServerConnection(MongoDbServer)
        .GetDatabase(DbName)
        .GetCollection<RepositoryInfo>(Constants.TblRepository)
        .AsQueryable<RepositoryInfo>();

    foreach (var info in storedRepos)
    {
        Repositories.Add(info.Name, new Repository(this, info.Name));
    }
}
/// <summary>
/// Creates a new index repository, or returns the existing one when a
/// repository with the same name is already registered.
/// </summary>
/// <param name="name">Name of the repository to create or look up.</param>
/// <returns>The existing or newly created repository.</returns>
/// <remarks>
/// A new repository is first added to the in-memory table and then
/// persisted to MongoDB. If anything fails along the way the in-memory
/// entry is removed again and the original exception propagates to the
/// caller with its stack trace intact.
/// </remarks>
public Repository CreateRepository(String name)
{
    Repository existing;
    if (this.Repositories.TryGetValue(name, out existing))
    {
        return existing;
    }

    try
    {
        // Register the repository in memory.
        Repository repo = new Repository(this, name);
        this.Repositories.Add(name, repo);

        // Persist the repository information to MongoDB.
        var server = MongoDbLib.GetServerConnection(MongoDbServer);
        var database = server.GetDatabase(DbName);
        var tblRepos = database.GetCollection<RepositoryInfo>(Constants.TblRepository);
        tblRepos.Insert(new RepositoryInfo() { Name = name });

        return repo;
    }
    catch (Exception)
    {
        // Roll back the in-memory registration. Remove is a no-op when the
        // key was never added.
        this.Repositories.Remove(name);

        // "throw;" (not "throw e;") keeps the original stack trace.
        throw;
    }
}
/// <summary>
/// Retrieves a document's text by its document id.
/// </summary>
/// <param name="docId">Id of the document to fetch.</param>
/// <returns>
/// The document's paragraphs ordered by paragraph id and joined with
/// CR LF; an empty string when any exception is raised while reading.
/// </returns>
private String GetDocumentById(int docId)
{
    try
    {
        var server = MongoDbLib.GetServerConnection(MongoDbServer);
        var database = server.GetDatabase(DbName);

        // The source-text collection holds SourceText records, so the
        // handle is typed accordingly (it was previously typed as WordItem
        // while being queried as SourceText).
        var tblSourceText = database.GetCollection<SourceText>(Constants.TblSourceText);

        var paragraphs = from s in tblSourceText.AsQueryable<SourceText>()
                         where s.DocId == docId
                         orderby s.ParaId
                         select s.Para;

        return String.Join("\r\n", paragraphs);
    }
    catch (Exception)
    {
        return "";
    }
}
/// <summary>
/// Builds the index: parses the buffered source documents, segments every
/// stored paragraph into tokens, and writes the word list and the
/// inverted full-text index to MongoDB.
/// </summary>
/// <returns>
/// <c>true</c> when parsing and both batch inserts completed;
/// <c>false</c> when parsing failed or any exception was raised (the
/// stack trace is written to the console in the latter case).
/// </returns>
public bool MakeIndex()
{
    try
    {
        // 1. Parse the source documents. Stop here when parsing fails
        //    rather than indexing a partially stored corpus and reporting
        //    success.
        if (!ParseSourceDoc())
        {
            return false;
        }

        // 2. Build the index.
        ChineseSegmentor segmentor = new ChineseSegmentor();
        var server = MongoDbLib.GetServerConnection(_indexer.MongoDbServer);
        var database = server.GetDatabase(_indexer.DbName);
        var tblSourceText = database.GetCollection<SourceText>(Constants.TblSourceText);

        // Segment each paragraph and record every token occurrence.
        var sources = from s in tblSourceText.AsQueryable<SourceText>()
                      orderby s.DocId, s.ParaId
                      select s;

        Dictionary<String, InvertedIndex> fullIndexes = new Dictionary<String, InvertedIndex>();
        foreach (var aSourceText in sources)
        {
            List<Pair<String, Int32>> tokens = segmentor.SegWords(aSourceText.Para);
            foreach (var aToken in tokens)
            {
                InvertedIndex aIndex;
                if (!fullIndexes.TryGetValue(aToken.First, out aIndex))
                {
                    // First occurrence of this word: start a new posting list.
                    aIndex = new InvertedIndex();
                    aIndex.Word = aToken.First;
                    fullIndexes.Add(aToken.First, aIndex);
                }

                aIndex.Indexes.Add(new IndexElement()
                {
                    DocId = aSourceText.DocId,
                    ParaId = aSourceText.ParaId,
                    Offset = aToken.Second
                });
            }
        }

        // Store the word list. Word ids are assigned sequentially from 0
        // in dictionary enumeration order and copied onto each index entry.
        var wordListCollection = database.GetCollection(Constants.TblWordList);
        List<BsonDocument> batch = new List<BsonDocument>(fullIndexes.Count);
        int wordId = 0;
        foreach (var entry in fullIndexes)
        {
            entry.Value.WordId = wordId;
            batch.Add(new BsonDocument() { { "Word", entry.Key }, { "WordId", wordId } });
            wordId++;
        }
        wordListCollection.InsertBatch(batch);

        // Store the full-text index.
        var tblFullText = database.GetCollection(Constants.TblFullText);
        tblFullText.InsertBatch<InvertedIndex>(fullIndexes.Values.ToList());

        return true;
    }
    catch (Exception e)
    {
        Console.WriteLine(e.StackTrace);
        return false;
    }
}
/// <summary>
/// Searches the index for documents matching a keyword and returns one
/// page of ranked results.
/// </summary>
/// <param name="repo">
/// Repository to search. NOTE(review): this argument is not read by the
/// method body - confirm whether per-repository search is intended.
/// </param>
/// <param name="keyword">Search text; it is segmented into tokens before lookup.</param>
/// <param name="startPos">Number of ranked documents to skip.</param>
/// <param name="fetchSize">Maximum number of documents to return.</param>
/// <returns>
/// A <c>SearchResult</c> carrying the total match count, the elapsed
/// search time in seconds and the requested page of hits; an empty result
/// when none of the keyword tokens is in the word list; <c>null</c> when
/// the index is inconsistent, a keyword token has no index entry, or any
/// exception is raised.
/// </returns>
public SearchResult Search(Repository repo, String keyword, int startPos, int fetchSize)
{
    try
    {
        SearchResult result = new SearchResult();

        // Timer.
        Stopwatch sw = new Stopwatch();
        sw.Start();

        // MongoDB initialisation.
        var server = MongoDbLib.GetServerConnection(MongoDbServer);
        var database = server.GetDatabase(DbName);
        var tblWordList = database.GetCollection<WordItem>(Constants.TblWordList);
        var tblFullText = database.GetCollection<InvertedIndex>(Constants.TblFullText);

        // Segment the search keyword.
        List<Pair<String, Int32>> keywordTokens = _segmentor.SegWords(keyword);

        // Look up the word ids of the tokens in the word list.
        var buf = (from t in keywordTokens select t.First).ToList();
        var query = from w in tblWordList.AsQueryable<WordItem>()
                    where w.Word.In(buf)
                    select new { w.WordId };
        List<Int32> wordIdList = new List<Int32>();
        foreach (var aWord in query)
        {
            wordIdList.Add(aWord.WordId);
        }

        // No word ids means no search results.
        if (wordIdList.Count == 0)
        {
            sw.Stop();
            result.SearchTime = sw.ElapsedMilliseconds / 1000.0;
            return result;
        }

        // Fetch the matching index records once. Materialising here avoids
        // re-running the query for the count check and for every token.
        List<InvertedIndex> indexes = (from i in tblFullText.AsQueryable<InvertedIndex>()
                                       where i.WordId.In(wordIdList)
                                       select i).ToList();
        if (indexes.Count != wordIdList.Count)
        {
            return null;
        }

        // Map every keyword token to its posting list.
        List<List<IndexElement>> checkedIndex = new List<List<IndexElement>>();
        foreach (var aToken in keywordTokens)
        {
            InvertedIndex match = indexes.FirstOrDefault(t => t.Word == aToken.First);
            if (match == null)
            {
                // A token without an index entry previously surfaced as an
                // exception that the catch-all turned into null; return the
                // same result explicitly.
                return null;
            }
            checkedIndex.Add(match.Indexes);
        }

        // Count, per document, the occurrences of the first token that
        // qualify as hits (always for a single token, otherwise as decided
        // by CheckDocumentIsHitted). The count is the document's score.
        var firstTokenIndex = checkedIndex[0];
        Dictionary<Int32, Int32> hittedDocs = new Dictionary<Int32, Int32>();
        foreach (var currentIndex in firstTokenIndex)
        {
            if (keywordTokens.Count == 1 || CheckDocumentIsHitted(keywordTokens, 1, checkedIndex, currentIndex))
            {
                int hits;
                hittedDocs.TryGetValue(currentIndex.DocId, out hits); // hits stays 0 when absent
                hittedDocs[currentIndex.DocId] = hits + 1;
            }
        }

        // Sort documents by score and take the requested window of doc ids.
        var sortedDocIds = (from entry in hittedDocs
                            orderby entry.Value descending
                            select entry.Key).Skip(startPos).Take(fetchSize).ToList();

        // Store the results. The timer stops before the document text is
        // fetched, so SearchTime excludes that retrieval.
        result.Matches = hittedDocs.Count;
        sw.Stop();
        result.SearchTime = sw.ElapsedMilliseconds / 1000.0;

        // Take(fetchSize) already caps the list, so its Count is the only bound needed.
        for (int i = 0; i < sortedDocIds.Count; i++)
        {
            String rawText = this.GetDocumentById(sortedDocIds[i]);
            result.Results.Add(new ResultItem()
            {
                Rank = startPos + 1 + i,
                Score = hittedDocs[sortedDocIds[i]],
                HitField = rawText.Replace(keyword, "<<" + keyword + ">>")
            });
        }

        return result;
    }
    catch (Exception)
    {
        return null;
    }
}