Пример #1
0
        /// <summary>
        /// Splits every buffered source document into paragraphs and stores
        /// the paragraphs in the MongoDB source-text collection.
        /// </summary>
        /// <remarks>
        /// The values of a document's full-text fields are concatenated, one per
        /// line, and the result is split on line breaks with empty lines dropped.
        /// Each remaining line is inserted as one <c>SourceText</c> record whose
        /// <c>DocId</c> is the zero-based position of the document in the buffer
        /// and whose <c>ParaId</c> is the zero-based position of the paragraph
        /// within that document. Document ids restart at 0 on every call.
        /// </remarks>
        /// <returns>
        /// <c>true</c> when all paragraphs were inserted; <c>false</c> when any
        /// exception was thrown. Records inserted before the failure are not
        /// removed.
        /// </returns>
        private bool ParseSourceDoc()
        {
            try
            {
                // Obtain the MongoDB collection that receives the paragraphs.
                var server     = MongoDbLib.GetServerConnection(_indexer.MongoDbServer);
                var database   = server.GetDatabase(_indexer.DbName);
                var collection = database.GetCollection<SourceText>(Constants.TblSourceText);

                // One builder is reused for all documents to avoid re-allocating it.
                StringBuilder sb = new StringBuilder();
                int docId = 0;

                foreach (var aDoc in _bufferedSrcDocs)
                {
                    sb.Clear();

                    // Gather the full-text fields, one field value per line.
                    foreach (var aField in aDoc.FullText)
                    {
                        sb.Append(aField.Value).Append('\n');
                    }

                    // Split the combined text into paragraphs, ignoring empty lines.
                    String[] paras = sb.ToString().Split(
                        new String[] { "\r\n", "\n" },
                        StringSplitOptions.RemoveEmptyEntries);

                    // Store every paragraph together with its document/paragraph ids.
                    for (int i = 0; i < paras.Length; i++)
                    {
                        var doc = new SourceText();
                        doc.DocId  = docId;
                        doc.ParaId = i;
                        doc.Para   = paras[i];
                        collection.Insert(doc);
                    }

                    docId++;
                }

                return true;
            }
            catch (Exception e)
            {
                // Keep the best-effort bool contract, but do not lose the cause.
                Console.WriteLine(e);
                return false;
            }
        }
Пример #2
0
        /// <summary>
        /// Deletes the index data by dropping the full-text, source-text and
        /// word-list collections of the configured database.
        /// </summary>
        /// <param name="name">
        /// Repository name. The current implementation does not read it: the
        /// same three collections are dropped whatever value is passed.
        /// </param>
        /// <returns>
        /// <c>true</c> when no exception was thrown while dropping the
        /// collections; <c>false</c> otherwise.
        /// </returns>
        public bool DeleteRepository(String name)
        {
            bool succeeded;

            try
            {
                var db = MongoDbLib.GetServerConnection(MongoDbServer).GetDatabase(DbName);

                // Dropped in this fixed order; the first failure aborts the rest.
                var tables = new String[]
                {
                    Constants.TblFullText,
                    Constants.TblSourceText,
                    Constants.TblWordList
                };
                foreach (var table in tables)
                {
                    db.DropCollection(table);
                }

                succeeded = true;
            }
            catch
            {
                // Any failure is reported only through the return value.
                succeeded = false;
            }

            return succeeded;
        }
Пример #3
0
        /// <summary>
        /// Initializes the indexer with the default server address and database
        /// name, then registers one <c>Repository</c> for every record found in
        /// the repository collection.
        /// </summary>
        /// <remarks>
        /// The constructor queries MongoDB. Repositories are keyed by name, so a
        /// duplicate name in the collection makes <c>Dictionary.Add</c> throw.
        /// </remarks>
        private Indexer()
        {
            MongoDbServer = Constants.DefaultServerAddress;
            DbName        = Constants.DefaultDbName;
            Repositories  = new Dictionary<string, Repository>();

            // Load the list of stored repository records.
            var storedRepos = MongoDbLib.GetServerConnection(MongoDbServer)
                                        .GetDatabase(DbName)
                                        .GetCollection<RepositoryInfo>(Constants.TblRepository)
                                        .AsQueryable<RepositoryInfo>()
                                        .Select(info => info);

            foreach (var info in storedRepos)
            {
                Repositories.Add(info.Name, new Repository(this, info.Name));
            }
        }
Пример #4
0
        /// <summary>
        /// Returns the repository with the given name, creating and persisting
        /// it first when it is not registered yet.
        /// </summary>
        /// <param name="name">Name that identifies the repository.</param>
        /// <returns>
        /// The already registered repository, or the newly created one after
        /// its <c>RepositoryInfo</c> record has been inserted into MongoDB.
        /// </returns>
        /// <remarks>
        /// If creating or persisting the repository fails, the in-memory
        /// registration is rolled back and the original exception is rethrown
        /// with its stack trace intact.
        /// </remarks>
        public Repository CreateRepository(String name)
        {
            // Single lookup instead of ContainsKey followed by the indexer.
            Repository existing;
            if (this.Repositories.TryGetValue(name, out existing))
            {
                return existing;
            }

            try
            {
                // Register the new repository in memory.
                Repository repo = new Repository(this, name);
                this.Repositories.Add(name, repo);

                // Persist the repository record in MongoDB.
                var server   = MongoDbLib.GetServerConnection(MongoDbServer);
                var database = server.GetDatabase(DbName);
                var tblRepos = database.GetCollection<RepositoryInfo>(Constants.TblRepository);
                tblRepos.Insert(new RepositoryInfo()
                {
                    Name = name
                });

                return repo;
            }
            catch
            {
                // Roll back; Remove is a no-op when the key was never added.
                this.Repositories.Remove(name);

                // "throw;" keeps the original stack trace ("throw e;" resets it).
                throw;
            }
        }
Пример #5
0
        /// <summary>
        /// Rebuilds the text of a document from its stored paragraphs.
        /// </summary>
        /// <param name="docId">Id of the document whose paragraphs are fetched.</param>
        /// <returns>
        /// The paragraphs of the document ordered by paragraph id and joined
        /// with "\r\n"; an empty string when the document has no paragraphs or
        /// when any exception occurs.
        /// </returns>
        private String GetDocumentById(int docId)
        {
            try
            {
                var server        = MongoDbLib.GetServerConnection(MongoDbServer);
                var database      = server.GetDatabase(DbName);

                // The source-text collection holds SourceText records, so it is
                // opened with that element type (not WordItem).
                var tblSourceText = database.GetCollection<SourceText>(Constants.TblSourceText);

                var sourceTexts = from s in tblSourceText.AsQueryable<SourceText>()
                                  where s.DocId == docId
                                  orderby s.ParaId
                                  select s.Para;

                return String.Join("\r\n", sourceTexts);
            }
            catch (Exception)
            {
                // Best effort: callers receive an empty text instead of an error.
                return "";
            }
        }
Пример #6
0
        /// <summary>
        /// Builds the full-text index: parses the buffered source documents,
        /// segments every stored paragraph into tokens, and writes the word
        /// list and the inverted index to MongoDB.
        /// </summary>
        /// <returns>
        /// <c>true</c> when the index was written; <c>false</c> when parsing the
        /// source documents failed or any exception was thrown.
        /// </returns>
        public bool MakeIndex()
        {
            try
            {
                // 1. Parse the source documents; without them there is nothing
                //    reliable to index, so a failure aborts the run.
                if (!ParseSourceDoc())
                {
                    return false;
                }

                // 2. Build the index.
                ChineseSegmentor segmentor = new ChineseSegmentor();
                var server        = MongoDbLib.GetServerConnection(_indexer.MongoDbServer);
                var database      = server.GetDatabase(_indexer.DbName);
                var tblSourceText = database.GetCollection<SourceText>(Constants.TblSourceText);

                // Segment every paragraph and collect one posting per token occurrence.
                var sources = from s in tblSourceText.AsQueryable<SourceText>()
                              orderby s.DocId, s.ParaId
                              select s;

                Dictionary<String, InvertedIndex> fullIndexes = new Dictionary<String, InvertedIndex>();
                foreach (var aSourceText in sources)
                {
                    List<Pair<String, Int32>> tokens = segmentor.SegWords(aSourceText.Para);
                    foreach (var aToken in tokens)
                    {
                        // Fetch the entry for this word, creating it on first sight.
                        InvertedIndex aIndex;
                        if (!fullIndexes.TryGetValue(aToken.First, out aIndex))
                        {
                            aIndex      = new InvertedIndex();
                            aIndex.Word = aToken.First;
                            fullIndexes.Add(aToken.First, aIndex);
                        }

                        aIndex.Indexes.Add(new IndexElement()
                        {
                            DocId  = aSourceText.DocId,
                            ParaId = aSourceText.ParaId,
                            Offset = aToken.Second
                        });
                    }
                }

                // Assign sequential word ids (dictionary enumeration order) and
                // store the word list.
                var wordListCollection   = database.GetCollection(Constants.TblWordList);
                List<BsonDocument> batch = new List<BsonDocument>(fullIndexes.Count);
                int wordId = 0;
                foreach (var entry in fullIndexes)
                {
                    entry.Value.WordId = wordId;

                    batch.Add(new BsonDocument()
                    {
                        { "Word", entry.Key },
                        { "WordId", wordId }
                    });

                    wordId++;
                }

                wordListCollection.InsertBatch(batch);

                // Store the inverted index itself.
                var tblFullText = database.GetCollection(Constants.TblFullText);
                tblFullText.InsertBatch<InvertedIndex>(fullIndexes.Values.ToList());

                return true;
            }
            catch (Exception e)
            {
                // Print type, message and stack trace rather than the trace alone.
                Console.WriteLine(e);
                return false;
            }
        }
Пример #7
0
        /// <summary>
        /// Searches the index for documents that match the keyword and returns
        /// one page of the ranked results.
        /// </summary>
        /// <param name="repo">Repository to search. Not read by the current implementation.</param>
        /// <param name="keyword">
        /// Text to search for; it is segmented into tokens, and it is also the
        /// literal string that gets wrapped in "&lt;&lt;" and "&gt;&gt;" in each hit text.
        /// </param>
        /// <param name="startPos">Number of ranked documents to skip.</param>
        /// <param name="fetchSize">Maximum number of documents to return.</param>
        /// <returns>
        /// A <c>SearchResult</c> with the total match count, the elapsed search
        /// time in seconds (document texts are loaded after the timer stops) and
        /// the requested page of hits. The result has no hits when none of, or
        /// not all of, the keyword tokens are present in the index. <c>null</c>
        /// is returned when the word list and the full-text index disagree or
        /// when any exception occurs.
        /// </returns>
        public SearchResult Search(Repository repo, String keyword, int startPos, int fetchSize)
        {
            try
            {
                SearchResult result = new SearchResult();

                // Timer for the search phase.
                Stopwatch sw = Stopwatch.StartNew();

                // MongoDB set-up.
                var server      = MongoDbLib.GetServerConnection(MongoDbServer);
                var database    = server.GetDatabase(DbName);
                var tblWordList = database.GetCollection<WordItem>(Constants.TblWordList);
                var tblFullText = database.GetCollection<InvertedIndex>(Constants.TblFullText);

                // Segment the keyword into tokens.
                List<Pair<String, Int32>> keywordTokens = _segmentor.SegWords(keyword);

                // Look up the word ids of the tokens in the word list.
                var buf   = (from t in keywordTokens select t.First).ToList();
                var query = from w in tblWordList.AsQueryable<WordItem>()
                            where w.Word.In(buf)
                            select new { w.WordId };
                List<Int32> wordIdList = new List<Int32>();
                foreach (var aWord in query)
                {
                    wordIdList.Add(aWord.WordId);
                }

                // No known word at all means there are no matches.
                if (wordIdList.Count == 0)
                {
                    sw.Stop();
                    result.SearchTime = sw.ElapsedMilliseconds / 1000.0;
                    return result;
                }

                // Fetch the matching index records once; the query is deferred,
                // so using it unmaterialized would hit MongoDB for every use.
                List<InvertedIndex> indexes = (from i in tblFullText.AsQueryable<InvertedIndex>()
                                               where i.WordId.In(wordIdList)
                                               select i).ToList();

                // Word list and full-text index are out of sync.
                if (indexes.Count != wordIdList.Count)
                {
                    return null;
                }

                // Posting list per word; the first record wins if a word occurs twice.
                Dictionary<String, List<IndexElement>> postingsByWord = new Dictionary<String, List<IndexElement>>();
                foreach (var anIndex in indexes)
                {
                    if (anIndex.Word != null && !postingsByWord.ContainsKey(anIndex.Word))
                    {
                        postingsByWord.Add(anIndex.Word, anIndex.Indexes);
                    }
                }

                // Map every keyword token to its posting list.
                List<List<IndexElement>> checkedIndex = new List<List<IndexElement>>(keywordTokens.Count);
                foreach (var aToken in keywordTokens)
                {
                    List<IndexElement> postings;
                    if (aToken.First == null || !postingsByWord.TryGetValue(aToken.First, out postings))
                    {
                        // A token that is not indexed cannot be matched by any
                        // document, so the search ends with zero hits.
                        sw.Stop();
                        result.SearchTime = sw.ElapsedMilliseconds / 1000.0;
                        return result;
                    }

                    checkedIndex.Add(postings);
                }

                // Count, per document, the occurrences of the first token that
                // are accepted by the multi-token check.
                var firstTokenIndex = checkedIndex[0];
                Dictionary<Int32, Int32> hittedDocs = new Dictionary<Int32, Int32>();
                foreach (var currentIndex in firstTokenIndex)
                {
                    if (keywordTokens.Count == 1 || CheckDocumentIsHitted(keywordTokens, 1, checkedIndex, currentIndex))
                    {
                        int hits;
                        hittedDocs.TryGetValue(currentIndex.DocId, out hits); // hits stays 0 when absent
                        hittedDocs[currentIndex.DocId] = hits + 1;
                    }
                }

                // Rank by hit count (descending) and cut out the requested page.
                var sortedDocIds = (from entry in hittedDocs orderby entry.Value descending select entry.Key).Skip(startPos).Take(fetchSize).ToList();

                // Store totals; the timer stops before the document texts are loaded.
                result.Matches = hittedDocs.Count;
                sw.Stop();
                result.SearchTime = sw.ElapsedMilliseconds / 1000.0;

                // Take(fetchSize) already limits the page size.
                for (int i = 0; i < sortedDocIds.Count; i++)
                {
                    String rawText = this.GetDocumentById(sortedDocIds[i]);
                    result.Results.Add(new ResultItem()
                    {
                        Rank     = startPos + 1 + i,
                        Score    = hittedDocs[sortedDocIds[i]],
                        HitField = rawText.Replace(keyword, "<<" + keyword + ">>")
                    });
                }
                return result;
            }
            catch (Exception)
            {
                // Failures are reported to the caller as a null result.
                return null;
            }
        }