Example #1
0
File: Word.cs Project: Ztnull/NLP
        private string[] _ArrSentence     = { };// The full text is first split into sentences; GetWord segments it sentence by sentence
        #endregion

        #region 内部方法

        #region 将DataTable类型的词典转换为JSON
        /// <summary>
        /// Converts a dictionary held in a <see cref="DataTable"/> into a JSON string.
        /// </summary>
        /// <param name="dtDict">
        /// Dictionary table; every row is read through its "DictName", "DictAttr"
        /// and "DictNote" columns.
        /// </param>
        /// <returns>
        /// The rows serialized by <c>_Jss</c> as a list of <c>_DictModel</c>, or an
        /// empty string when <paramref name="dtDict"/> is null or contains no rows.
        /// </returns>
        private string _GetJsonFromDataTable(DataTable dtDict)
        {
            // Explicit guard: a missing or empty table yields an empty result.
            // (Previously a catch-all handler was used as the null check.)
            if (dtDict == null || dtDict.Rows.Count < 1)
            {
                return "";
            }

            List<_DictModel> ListDict = new List<_DictModel>(dtDict.Rows.Count);

            foreach (DataRow dr in dtDict.Rows)
            {
                // Copy the three dictionary columns of the row into a model object.
                _DictModel DictModel = new _DictModel();
                DictModel.DictName = dr["DictName"].ToString();
                DictModel.DictAttr = dr["DictAttr"].ToString();
                DictModel.DictNote = dr["DictNote"].ToString();
                ListDict.Add(DictModel);
            }

            return _Jss.Serialize(ListDict);
        }
Example #2
0
File: Word.cs Project: Ztnull/NLP
        /// <summary>
        /// Segments the source text into words by looking candidate substrings up in the
        /// external dictionary, then returns the distinct words found, ordered by
        /// occurrence count (descending) and then by word (ascending).
        /// </summary>
        /// <param name="ReturnJSON">
        /// true to return the result as JSON; false to return the words as a
        /// comma-separated list ("word,word,...").
        /// </param>
        /// <returns>
        /// The JSON or comma-separated result; an empty string when no dictionary word
        /// was found; or an error message when the external dictionary could not be loaded.
        /// </returns>
        public string GetWord(bool ReturnJSON)
        {
            // Load the external dictionary into this.Dict; nothing can be matched without it.
            this.Dict = this._ReadDict();
            if (this.Dict == null)
            {
                return "外部词典读取出错!请检查外部词典路径。";
            }

            #region Initialise the table structures used for the segmentation result
            // _dtTemp collects every dictionary hit (duplicates included).
            DataTable _dtTemp = new DataTable();
            _dtTemp.Columns.Add("DictName");
            _dtTemp.Columns.Add("DictAttr");
            _dtTemp.Columns.Add("DictNote");

            // dtResult holds one row per distinct word plus its occurrence count.
            // The count column must be numeric: a default (string) column would make
            // "DictCount DESC" sort lexicographically, e.g. "9" before "10".
            DataTable dtResult = _dtTemp.Clone();
            dtResult.Columns.Add("DictCount", typeof(int));
            #endregion

            // Split the source text into sentences; segmentation works sentence by sentence.
            _ArrSentence = this._GetSentence();

            // WordSize placeholder characters are prepended to each sentence so that the
            // leading characters, which are fewer than WordSize, can still be matched by
            // the backward scan below. Built once because it does not depend on the sentence.
            string _Placeholder = new string('■', Math.Max(0, this.WordSize));

            for (int i = 0; i < _ArrSentence.Length; i++)
            {
                // Only non-empty sentences can be segmented.
                if (string.IsNullOrEmpty(_ArrSentence[i]))
                {
                    continue;
                }

                _ArrSentence[i] = _Placeholder + _ArrSentence[i];
                string sentence = _ArrSentence[i];

                // Slide a WordSize-wide window from the end of the sentence to its start.
                for (int j = sentence.Length - this.WordSize; j >= 0; j--)
                {
                    int start = j;

                    // Try every suffix of the window, from WordSize characters down to 2.
                    for (int k = this.WordSize; k >= 2; k--)
                    {
                        string candidate = sentence.Substring(start++, k);

                        // Single quotes must be doubled inside a filter-expression string
                        // literal, otherwise text containing ' makes Select throw.
                        DataRow[] matches = this.Dict.Select("DictName='" + candidate.Replace("'", "''") + "'");
                        if (matches != null && matches.Length > 0)
                        {
                            _dtTemp.Rows.Add(matches[0].ItemArray);
                        }
                    }
                }
            }

            // Number of words found, duplicates included.
            _WordCountHaveRepeate = _dtTemp.Rows.Count;

            #region Remove duplicate words, record their counts, sort by count descending
            for (int i = 0; i < _dtTemp.Rows.Count; i++)
            {
                // Query _dtTemp for all rows carrying the same word as row i. Every extra
                // hit is a duplicate: it is counted and then removed from _dtTemp, so the
                // rows before and at index i are always distinct.
                string name = _dtTemp.Rows[i]["DictName"].ToString();
                DataRow[] sameWord = _dtTemp.Select("DictName='" + name.Replace("'", "''") + "'");
                int occurrences = sameWord.Length;

                for (int j = 1; j < occurrences; j++)
                {
                    _dtTemp.Rows.Remove(sameWord[j]);
                }

                DataRow resultRow = dtResult.Rows.Add(sameWord[0].ItemArray);
                resultRow["DictCount"] = occurrences;
            }

            // Order the result by occurrence count, then by word.
            DataView dvTemp = dtResult.DefaultView;
            dvTemp.Sort = "DictCount DESC , DictName ASC";
            dtResult    = dvTemp.ToTable();
            #endregion

            // Number of words found, duplicates excluded.
            _WordCountWithoutRepeate = dtResult.Rows.Count;

            // Convert the result table into model objects (for JSON output) and collect
            // the plain word names (for the comma-separated output).
            List<_DictModel> _ListDict = new List<_DictModel>(dtResult.Rows.Count);
            List<string>     wordNames = new List<string>(dtResult.Rows.Count);
            foreach (DataRow drTemp in dtResult.Rows)
            {
                _DictModel _modelDict = new _DictModel();
                _modelDict.DictName  = drTemp["DictName"].ToString();
                _modelDict.DictAttr  = drTemp["DictAttr"].ToString();
                _modelDict.DictNote  = drTemp["DictNote"].ToString();
                _modelDict.DictCount = Convert.ToInt32(drTemp["DictCount"]);
                _ListDict.Add(_modelDict);
                wordNames.Add(_modelDict.DictName);
            }

            if (ReturnJSON)
            {
                // An empty result is reported as an empty string rather than "[]".
                return dtResult.Rows.Count > 0 ? _Jss.Serialize(_ListDict) : "";
            }

            // TrimEnd keeps the historical behaviour of stripping all trailing commas.
            return string.Join(",", wordNames.ToArray()).TrimEnd(',');
        }