/// <summary>
/// Segments text supplied as encoded bytes into a list of terms.
/// </summary>
/// <param name="bytes">The encoded bytes of the text to segment.</param>
/// <param name="enc">
/// The encoding of <paramref name="bytes"/>. It is mapped to a code type via
/// <c>getCodeType</c> for the segmenter call and is also used to decode each term.
/// </param>
/// <returns>
/// One <see cref="ResultTerm"/> for every entry of the result buffer whose length is
/// non-zero, in buffer order.
/// </returns>
/// <exception cref="System.ArgumentNullException"><paramref name="bytes"/> is null.</exception>
public List<ResultTerm> Segment(byte[] bytes, Encoding enc)
{
    if (bytes == null)
    {
        throw new System.ArgumentNullException("bytes");
    }

    // Number of leading sPos bytes inspected when looking for the tag terminator.
    const int PosTagWidth = 8;

    // The result buffer gets one slot per input byte.
    result_t[] entries = new result_t[bytes.Length];

    // NOTE(review): the meaning of the trailing 'true' flag is defined by the
    // ICTCLAS_ParagraphProcessAW_B declaration, which is not visible here - confirm.
    int wordCount = ICTCLAS_ParagraphProcessAW_B(bytes, entries, getCodeType(enc), true);

    // The returned count is used only as the initial capacity of the list.
    List<ResultTerm> terms = new List<ResultTerm>(wordCount);

    for (int i = 0; i < entries.Length; i++)
    {
        result_t entry = entries[i];

        // Slots with a zero length carry no term and are skipped.
        if (entry.length == 0)
        {
            continue;
        }

        // The tag ends at the first zero byte. When none of the inspected bytes is
        // zero the whole field is the tag, so start from the full width rather than 0
        // (which would silently yield an empty tag string).
        int tagLength = PosTagWidth;
        for (int k = 0; k < PosTagWidth; k++)
        {
            if (entry.sPos[k] == 0)
            {
                tagLength = k;
                break;
            }
        }

        ResultTerm term = new ResultTerm();
        // start/length index into the original byte array, so decode from it directly.
        term.Word = enc.GetString(bytes, entry.start, entry.length);
        term.POS = entry.POS_id;
        term.POSStr = Encoding.ASCII.GetString(entry.sPos, 0, tagLength);
        terms.Add(term);
    }

    return terms;
}
/// <summary>
/// Segments a string into a list of terms.
/// </summary>
/// <param name="str">The text to segment. It is converted to UTF-8 before segmentation.</param>
/// <returns>
/// One <see cref="ResultTerm"/> for every entry of the result buffer whose length is
/// non-zero, in buffer order.
/// </returns>
/// <exception cref="System.ArgumentNullException"><paramref name="str"/> is null.</exception>
public List<ResultTerm> Segment(string str)
{
    if (str == null)
    {
        throw new System.ArgumentNullException("str");
    }

    // Number of leading sPos bytes inspected when looking for the tag terminator.
    const int PosTagWidth = 8;

    // The result buffer gets one slot per UTF-16 code unit of the input string.
    result_t[] entries = new result_t[str.Length];
    byte[] utf8 = Encoding.UTF8.GetBytes(str);

    // NOTE(review): the meaning of the trailing 'true' flag is defined by the
    // ICTCLAS_ParagraphProcessAW_B declaration, which is not visible here - confirm.
    int wordCount = ICTCLAS_ParagraphProcessAW_B(utf8, entries, eCodeType.CODE_TYPE_UTF8, true);

    // The returned count is used only as the initial capacity of the list.
    List<ResultTerm> terms = new List<ResultTerm>(wordCount);

    for (int i = 0; i < entries.Length; i++)
    {
        result_t entry = entries[i];

        // Slots with a zero length carry no term and are skipped.
        if (entry.length == 0)
        {
            continue;
        }

        // The tag ends at the first zero byte. When none of the inspected bytes is
        // zero the whole field is the tag, so start from the full width rather than 0
        // (which would silently yield an empty tag string).
        int tagLength = PosTagWidth;
        for (int k = 0; k < PosTagWidth; k++)
        {
            if (entry.sPos[k] == 0)
            {
                tagLength = k;
                break;
            }
        }

        ResultTerm term = new ResultTerm();
        // start/length index into the UTF-8 byte array, not into the string.
        term.Word = Encoding.UTF8.GetString(utf8, entry.start, entry.length);
        term.POS = entry.POS_id;
        term.POSStr = Encoding.ASCII.GetString(entry.sPos, 0, tagLength);
        terms.Add(term);
    }

    return terms;
}
/// <summary>
/// Segments text supplied as encoded bytes into a list of terms.
/// </summary>
/// <param name="bytes">The encoded bytes of the text to segment.</param>
/// <param name="enc">
/// The encoding of <paramref name="bytes"/>. It is mapped to a code type via
/// <c>getCodeType</c> for the segmenter call and is also used to decode each term.
/// </param>
/// <returns>
/// One <see cref="ResultTerm"/> for every entry of the result buffer whose length is
/// non-zero, in buffer order.
/// </returns>
/// <exception cref="System.ArgumentNullException"><paramref name="bytes"/> is null.</exception>
public List<ResultTerm> Segment(byte[] bytes, Encoding enc)
{
    if (bytes == null)
    {
        throw new System.ArgumentNullException("bytes");
    }

    // Number of leading sPos bytes inspected when looking for the tag terminator.
    const int PosTagWidth = 8;

    // The result buffer gets one slot per input byte.
    result_t[] slots = new result_t[bytes.Length];

    // NOTE(review): the meaning of the trailing 'true' flag is defined by the
    // ICTCLAS_ParagraphProcessAW_B declaration, which is not visible here - confirm.
    int reportedCount = ICTCLAS_ParagraphProcessAW_B(bytes, slots, getCodeType(enc), true);

    // The returned count is used only as the initial capacity of the list.
    List<ResultTerm> output = new List<ResultTerm>(reportedCount);

    for (int index = 0; index < slots.Length; index++)
    {
        result_t slot = slots[index];

        // Slots with a zero length carry no term and are skipped.
        if (slot.length == 0)
        {
            continue;
        }

        // The tag ends at the first zero byte. When none of the inspected bytes is
        // zero the whole field is the tag, so start from the full width rather than 0
        // (which would silently yield an empty tag string).
        int tagBytes = PosTagWidth;
        for (int pos = 0; pos < PosTagWidth; pos++)
        {
            if (slot.sPos[pos] == 0)
            {
                tagBytes = pos;
                break;
            }
        }

        ResultTerm item = new ResultTerm();
        // start/length index into the original byte array, so decode from it directly.
        item.Word = enc.GetString(bytes, slot.start, slot.length);
        item.POS = slot.POS_id;
        item.POSStr = Encoding.ASCII.GetString(slot.sPos, 0, tagBytes);
        output.Add(item);
    }

    return output;
}
/// <summary>
/// Segments a string into a list of terms.
/// </summary>
/// <param name="str">The text to segment. It is converted to UTF-8 before segmentation.</param>
/// <returns>
/// One <see cref="ResultTerm"/> for every entry of the result buffer whose length is
/// non-zero, in buffer order.
/// </returns>
/// <exception cref="System.ArgumentNullException"><paramref name="str"/> is null.</exception>
public List<ResultTerm> Segment(string str)
{
    if (str == null)
    {
        throw new System.ArgumentNullException("str");
    }

    // Number of leading sPos bytes inspected when looking for the tag terminator.
    const int PosTagWidth = 8;

    // The result buffer gets one slot per UTF-16 code unit of the input string.
    result_t[] slots = new result_t[str.Length];
    byte[] encoded = Encoding.UTF8.GetBytes(str);

    // NOTE(review): the meaning of the trailing 'true' flag is defined by the
    // ICTCLAS_ParagraphProcessAW_B declaration, which is not visible here - confirm.
    int reportedCount = ICTCLAS_ParagraphProcessAW_B(encoded, slots, eCodeType.CODE_TYPE_UTF8, true);

    // The returned count is used only as the initial capacity of the list.
    List<ResultTerm> output = new List<ResultTerm>(reportedCount);

    for (int index = 0; index < slots.Length; index++)
    {
        result_t slot = slots[index];

        // Slots with a zero length carry no term and are skipped.
        if (slot.length == 0)
        {
            continue;
        }

        // The tag ends at the first zero byte. When none of the inspected bytes is
        // zero the whole field is the tag, so start from the full width rather than 0
        // (which would silently yield an empty tag string).
        int tagBytes = PosTagWidth;
        for (int pos = 0; pos < PosTagWidth; pos++)
        {
            if (slot.sPos[pos] == 0)
            {
                tagBytes = pos;
                break;
            }
        }

        ResultTerm item = new ResultTerm();
        // start/length index into the UTF-8 byte array, not into the string.
        item.Word = Encoding.UTF8.GetString(encoded, slot.start, slot.length);
        item.POS = slot.POS_id;
        item.POSStr = Encoding.ASCII.GetString(slot.sPos, 0, tagBytes);
        output.Add(item);
    }

    return output;
}