/**
 * <summary>
 * Fast atom segmentation: splits the range [start, end) of charArray into runs of
 * consecutive characters for which CharType.get returns the same type, and emits
 * one AtomNode per run. Intended to replace the slower segmentation method.
 * </summary>
 * <remarks>
 * When a '.' directly follows a CT_NUM run, the dot and every CT_NUM character after
 * it are merged into that numeric atom. The dot is absorbed even when the character
 * after it is not numeric.
 * </remarks>
 * <param name="charArray">Characters to segment.</param>
 * <param name="start">Index of the first character to process (inclusive).</param>
 * <param name="end">Index at which processing stops (exclusive).</param>
 * <returns>The atoms in order of appearance; an empty list when start is not less than end.</returns>
 */
protected static List<AtomNode> quickAtomSegment(char[] charArray, int start, int end)
{
    List<AtomNode> atomNodeList = new List<AtomNode>();

    // An empty (or inverted) range has nothing to classify. Returning here avoids
    // reading charArray[start], which is out of bounds when start == charArray.Length.
    if (start >= end)
    {
        return atomNodeList;
    }

    int offsetAtom = start;
    int preType = CharType.get(charArray[offsetAtom]);
    int curType;
    while (++offsetAtom < end)
    {
        curType = CharType.get(charArray[offsetAtom]);
        if (curType != preType)
        {
            // Floating-point recognition: keep scanning past the dot while digits follow.
            if (charArray[offsetAtom] == '.' && preType == CharType.CT_NUM)
            {
                while (++offsetAtom < end)
                {
                    curType = CharType.get(charArray[offsetAtom]);
                    if (curType != CharType.CT_NUM)
                    {
                        break;
                    }
                }
            }

            // Flush the run that just ended; the next run begins at offsetAtom.
            atomNodeList.Add(new AtomNode(new String(charArray, start, offsetAtom - start), preType));
            start = offsetAtom;
        }
        preType = curType;
    }

    // Flush the trailing run. If the floating-point scan above ran to the end of the
    // range, it already flushed the last atom and offsetAtom is now end + 1, so this
    // branch is skipped.
    if (offsetAtom == end)
    {
        atomNodeList.Add(new AtomNode(new String(charArray, start, offsetAtom - start), preType));
    }

    return atomNodeList;
}
/**
 * <summary>
 * Atom segmentation over the slice [start, end) of a string.
 * Deprecated: prefer the overload that works on a character array.
 * </summary>
 * <remarks>
 * Each character is first assigned a type via CharType.get, with two adjustments:
 * a '.' followed by a CT_NUM character is treated as CT_NUM, and CT_LETTER is
 * remapped to CT_SINGLE. Characters typed CT_CHINESE, CT_INDEX, CT_DELIMITER or
 * CT_OTHER each become their own atom; consecutive CT_SINGLE or CT_NUM characters
 * are merged into one atom.
 * </remarks>
 * <param name="sSentence">The sentence to segment.</param>
 * <param name="start">Index of the first character to process (inclusive).</param>
 * <param name="end">Index at which processing stops (exclusive).</param>
 * <returns>The atoms in order of appearance.</returns>
 * <exception cref="ArgumentException">Thrown when end is less than start.</exception>
 */
private static List<AtomNode> AtomSegment(String sSentence, int start, int end)
{
    if (end < start)
    {
        throw new ArgumentException("start=" + start + " > end=" + end);
    }

    List<AtomNode> atoms = new List<AtomNode>();
    int pCur = 0, nCurType, nNextType;
    StringBuilder sb = new StringBuilder();
    char c;

    //==============================================================================================
    // by zhenyulu:
    //
    // TODO: use a series of regular expressions to pre-extract complete components of the
    //       sentence (percentages, dates, e-mail addresses, URLs, etc.)
    //==============================================================================================

    // String.Substring takes (startIndex, length), not (startIndex, endIndex),
    // so the slice length has to be computed explicitly.
    char[] charArray = sSentence.Substring(start, end - start).ToCharArray();
    int[] charTypeArray = new int[charArray.Length];

    // Build the per-character type array.
    for (int i = 0; i < charArray.Length; ++i)
    {
        c = charArray[i];
        charTypeArray[i] = CharType.get(c);

        if (c == '.' && i < (charArray.Length - 1) && CharType.get(charArray[i + 1]) == Predefine.CT_NUM)
        {
            // A dot followed by a numeric character belongs to the number.
            charTypeArray[i] = Predefine.CT_NUM;
        }
        else if (c == '.' && i < (charArray.Length - 1) && charArray[i + 1] >= '0' && charArray[i + 1] <= '9')
        {
            charTypeArray[i] = Predefine.CT_SINGLE;
        }
        else if (charTypeArray[i] == Predefine.CT_LETTER)
        {
            charTypeArray[i] = Predefine.CT_SINGLE;
        }
    }

    // Cut atoms according to the type array.
    while (pCur < charArray.Length)
    {
        nCurType = charTypeArray[pCur];

        if (nCurType == Predefine.CT_CHINESE || nCurType == Predefine.CT_INDEX ||
            nCurType == Predefine.CT_DELIMITER || nCurType == Predefine.CT_OTHER)
        {
            // These types always form a one-character atom.
            atoms.Add(new AtomNode(charArray[pCur].ToString(), nCurType));
            pCur++;
        }
        // For CT_SINGLE / CT_NUM (which includes a '.' followed by a digit), keep consuming
        // characters for as long as the type stays the same.
        else if (pCur < charArray.Length - 1 && ((nCurType == Predefine.CT_SINGLE) || nCurType == Predefine.CT_NUM))
        {
            sb.Remove(0, sb.Length);
            sb.Append(charArray[pCur]);

            bool reachEnd = true;
            while (pCur < charArray.Length - 1)
            {
                nNextType = charTypeArray[++pCur];

                if (nNextType == nCurType)
                {
                    sb.Append(charArray[pCur]);
                }
                else
                {
                    // pCur now rests on the first character of the next atom.
                    reachEnd = false;
                    break;
                }
            }
            atoms.Add(new AtomNode(sb.ToString(), nCurType));

            // When the run extended to the last character, pCur still points at that
            // (already consumed) character; step past it to terminate the outer loop.
            if (reachEnd)
            {
                pCur++;
            }
        }
        // All remaining cases: emit the single character as its own atom.
        else
        {
            atoms.Add(new AtomNode(charArray[pCur], nCurType));
            pCur++;
        }
    }

    return atoms;
}
/**
 * <summary>
 * Atom segmentation over the range [start, end) of a character array.
 * </summary>
 * <remarks>
 * Each character in the range is first assigned a type via CharType.get, with two
 * adjustments: a '.' followed (inside the range) by a CT_NUM character is treated as
 * CT_NUM, and CT_LETTER is remapped to CT_SINGLE. Characters typed CT_CHINESE,
 * CT_INDEX, CT_DELIMITER or CT_OTHER each become their own atom; consecutive
 * CT_SINGLE or CT_NUM characters are merged into one atom.
 * </remarks>
 * <param name="charArray">Characters to segment.</param>
 * <param name="start">Index of the first character to process (inclusive).</param>
 * <param name="end">Index at which processing stops (exclusive).</param>
 * <returns>A list of atom nodes covering every character from start up to end.</returns>
 */
protected static List<AtomNode> atomSegment(char[] charArray, int start, int end)
{
    List<AtomNode> atoms = new List<AtomNode>();
    int pCur = start, nCurType, nNextType;
    StringBuilder sb = new StringBuilder();
    char c;
    int[] charTypeArray = new int[end - start];

    // Build the per-character type array; charTypeArray[i] describes charArray[i + start].
    for (int i = 0; i < charTypeArray.Length; ++i)
    {
        int pos = i + start;
        c = charArray[pos];
        charTypeArray[i] = CharType.get(c);

        // The lookahead is bounded by end (not charArray.Length) so that characters
        // outside the requested range never influence how the range is segmented.
        bool hasNextInRange = pos < end - 1;

        if (c == '.' && hasNextInRange && CharType.get(charArray[pos + 1]) == CharType.CT_NUM)
        {
            // A dot followed by a numeric character belongs to the number.
            charTypeArray[i] = CharType.CT_NUM;
        }
        else if (c == '.' && hasNextInRange && charArray[pos + 1] >= '0' && charArray[pos + 1] <= '9')
        {
            charTypeArray[i] = CharType.CT_SINGLE;
        }
        else if (charTypeArray[i] == CharType.CT_LETTER)
        {
            charTypeArray[i] = CharType.CT_SINGLE;
        }
    }

    // Cut atoms according to the type array.
    while (pCur < end)
    {
        nCurType = charTypeArray[pCur - start];

        if (nCurType == CharType.CT_CHINESE || nCurType == CharType.CT_INDEX ||
            nCurType == CharType.CT_DELIMITER || nCurType == CharType.CT_OTHER)
        {
            // These types always form a one-character atom.
            atoms.Add(new AtomNode(charArray[pCur].ToString(), nCurType));
            pCur++;
        }
        // For CT_SINGLE / CT_NUM (which includes a '.' followed by a digit), keep consuming
        // characters for as long as the type stays the same.
        else if (pCur < end - 1 && ((nCurType == CharType.CT_SINGLE) || nCurType == CharType.CT_NUM))
        {
            sb.Remove(0, sb.Length);
            sb.Append(charArray[pCur]);

            bool reachEnd = true;
            while (pCur < end - 1)
            {
                nNextType = charTypeArray[++pCur - start];

                if (nNextType == nCurType)
                {
                    sb.Append(charArray[pCur]);
                }
                else
                {
                    // pCur now rests on the first character of the next atom.
                    reachEnd = false;
                    break;
                }
            }
            atoms.Add(new AtomNode(sb.ToString(), nCurType));

            // When the run extended to the last character of the range, pCur still points
            // at that (already consumed) character; step past it to end the outer loop.
            if (reachEnd)
            {
                pCur++;
            }
        }
        // All remaining cases: emit the single character as its own atom.
        else
        {
            atoms.Add(new AtomNode(charArray[pCur], nCurType));
            pCur++;
        }
    }

    return atoms;
}