private string[] _ArrSentence = { }; // The full text is first split into sentences
#endregion

#region Internal methods

#region Convert a DataTable dictionary to JSON
/// <summary>
/// Converts a dictionary held in a <see cref="DataTable"/> into a JSON string.
/// </summary>
/// <param name="dtDict">
/// Dictionary table. The <c>DictName</c>, <c>DictAttr</c> and <c>DictNote</c>
/// columns of every row are read.
/// </param>
/// <returns>
/// The result of serializing the list of dictionary entries with <c>_Jss</c>,
/// or an empty string when <paramref name="dtDict"/> is null or has no rows.
/// </returns>
private string _GetJsonFromDataTable(DataTable dtDict)
{
    // Explicit guard: a null table yields "" without relying on a caught
    // NullReferenceException for control flow.
    if (dtDict == null || dtDict.Rows.Count < 1)
    {
        return "";
    }

    List<_DictModel> entries = new List<_DictModel>(dtDict.Rows.Count);
    foreach (DataRow row in dtDict.Rows)
    {
        _DictModel entry = new _DictModel();
        entry.DictName = row["DictName"].ToString();
        entry.DictAttr = row["DictAttr"].ToString();
        entry.DictNote = row["DictNote"].ToString();
        entries.Add(entry);
    }

    return _Jss.Serialize(entries);
}
/// <summary>
/// Segments the source text against the external dictionary and returns the
/// distinct matched words, ordered by occurrence count (descending) and then
/// by word (ascending).
/// </summary>
/// <param name="ReturnJSON">
/// <c>true</c> to return the matches as JSON; <c>false</c> to return a
/// comma-separated list of the matched words ("word,word,...").
/// </param>
/// <returns>
/// The JSON or comma-separated result (an empty string when nothing matched),
/// or a fixed error message when the external dictionary could not be loaded.
/// </returns>
public string GetWord(bool ReturnJSON)
{
    // Working table: one row per dictionary hit, duplicates included.
    DataTable matches = new DataTable();
    matches.Columns.Add("DictName");
    matches.Columns.Add("DictAttr");
    matches.Columns.Add("DictNote");

    // Result table: one row per distinct word plus its occurrence count.
    // DictCount must be numeric: an untyped column defaults to string and
    // "DictCount DESC" would then sort lexicographically ("9" before "10").
    DataTable dtResult = matches.Clone();
    dtResult.Columns.Add("DictCount", typeof(int));

    // Load the external dictionary into this.Dict.
    this.Dict = this._ReadDict();
    if (this.Dict == null)
    {
        return "外部词典读取出错!请检查外部词典路径。";
    }

    // Split the source text into sentences; matching is done per sentence.
    _ArrSentence = this._GetSentence();

    // WordSize placeholder characters are prepended to each sentence so the
    // backward scan can still form candidates from the leading characters
    // that are fewer than WordSize. Loop-invariant, so built once.
    string placeholder = new string('■', Math.Max(0, this.WordSize));

    for (int i = 0; i < _ArrSentence.Length; i++)
    {
        // Only non-empty sentences can be segmented.
        if (string.IsNullOrEmpty(_ArrSentence[i]))
        {
            continue;
        }

        // The padded sentence is written back to the field, as before.
        _ArrSentence[i] = placeholder + _ArrSentence[i];
        string sentence = _ArrSentence[i];

        // Slide a WordSize-wide window from the end of the sentence to the start.
        for (int j = sentence.Length - this.WordSize; j >= 0; j--)
        {
            int start = j;

            // Try every candidate of length WordSize..2 that ends at the
            // right edge of the current window.
            for (int k = this.WordSize; k >= 2; k--)
            {
                string candidate = sentence.Substring(start++, k);

                // Double single quotes so a candidate containing ' cannot
                // break the filter expression.
                DataRow[] hits = this.Dict.Select(
                    "DictName='" + candidate.Replace("'", "''") + "'");
                if (hits.Length > 0)
                {
                    matches.Rows.Add(hits[0].ItemArray);
                }
            }
        }
    }

    // Number of matched words, duplicates included.
    _WordCountHaveRepeate = matches.Rows.Count;

    // Collapse duplicates: for each remaining row, find every row with the
    // same DictName, remove all but the first, and record how many there were.
    // Rows.Count is re-read each iteration because rows are removed as we go.
    for (int i = 0; i < matches.Rows.Count; i++)
    {
        string name = matches.Rows[i]["DictName"].ToString();
        DataRow[] sameWord = matches.Select(
            "DictName='" + name.Replace("'", "''") + "'");
        int occurrences = sameWord.Length;

        for (int j = 1; j < occurrences; j++)
        {
            matches.Rows.Remove(sameWord[j]);
        }

        DataRow resultRow = dtResult.Rows.Add(sameWord[0].ItemArray);
        resultRow["DictCount"] = occurrences;
    }

    // Order by occurrence count, then by word.
    DataView sortedView = dtResult.DefaultView;
    sortedView.Sort = "DictCount DESC , DictName ASC";
    dtResult = sortedView.ToTable();

    // Number of matched words, duplicates excluded.
    _WordCountWithoutRepeate = dtResult.Rows.Count;

    // Convert the result table into the model list (for JSON output) and the
    // plain list of words (for the comma-separated output).
    List<_DictModel> models = new List<_DictModel>(dtResult.Rows.Count);
    List<string> names = new List<string>(dtResult.Rows.Count);
    foreach (DataRow row in dtResult.Rows)
    {
        _DictModel model = new _DictModel();
        model.DictName = row["DictName"].ToString();
        model.DictAttr = row["DictAttr"].ToString();
        model.DictNote = row["DictNote"].ToString();
        model.DictCount = Convert.ToInt32(row["DictCount"]);
        models.Add(model);
        names.Add(model.DictName);
    }

    if (ReturnJSON)
    {
        // An empty result is reported as "" rather than as an empty JSON array.
        return models.Count > 0 ? _Jss.Serialize(models) : "";
    }

    // TrimEnd keeps the established output shape: no trailing commas.
    return string.Join(",", names.ToArray()).TrimEnd(',');
}