/// <summary>
/// Performs atom segmentation on <paramref name="sSentence"/>: the sentence is cut into
/// single-character atoms, except that consecutive characters typed CT_SINGLE (or CT_NUM)
/// are merged into one atom.
/// </summary>
/// <param name="sSentence">
/// The sentence to split. It may start with Predefine.SENTENCE_BEGIN and/or end with
/// Predefine.SENTENCE_END; each marker that is present becomes its own atom.
/// </param>
/// <returns>The atoms in sentence order (begin marker first, end marker last, when present).</returns>
/// <exception cref="System.ArgumentNullException"><paramref name="sSentence"/> is null.</exception>
public static List<AtomNode> AtomSegment(string sSentence)
{
    if (sSentence == null)
    {
        throw new System.ArgumentNullException("sSentence");
    }

    List<AtomNode> atomSegment = new List<AtomNode>();
    AtomNode tmpEnd = null;
    int startIndex = 0;
    int length = sSentence.Length;

    // Sentence-begin marker. The comparison is ordinal because the offsets below are computed
    // from the marker's raw Length; a culture-sensitive match may skip ignorable characters
    // and would then slice the sentence at the wrong position.
    if (sSentence.StartsWith(Predefine.SENTENCE_BEGIN, System.StringComparison.Ordinal))
    {
        atomSegment.Add(new AtomNode(Predefine.SENTENCE_BEGIN, Predefine.CT_SENTENCE_BEGIN));
        startIndex = Predefine.SENTENCE_BEGIN.Length;
        length -= startIndex;
    }

    // Sentence-end marker. Only looked for when enough characters remain after the begin
    // marker, so overlapping markers can never drive length below zero.
    if (length >= Predefine.SENTENCE_END.Length
        && sSentence.EndsWith(Predefine.SENTENCE_END, System.StringComparison.Ordinal))
    {
        tmpEnd = new AtomNode(Predefine.SENTENCE_END, Predefine.CT_SENTENCE_END);
        length -= Predefine.SENTENCE_END.Length;
    }

    //==============================================================================================
    // by zhenyulu:
    //
    // TODO: use a series of regular expressions to pre-extract complete components of the
    //       sentence (percentages, dates, e-mail addresses, URLs, etc.)
    //==============================================================================================
    char[] charArray = sSentence.ToCharArray(startIndex, length);
    int[] charTypeArray = new int[charArray.Length];

    // Build the character-type array that parallels charArray.
    for (int i = 0; i < charArray.Length; i++)
    {
        char c = charArray[i];
        charTypeArray[i] = Utility.charType(c);

        bool dotWithSuccessor = c == '.' && i < charArray.Length - 1;
        if (dotWithSuccessor && Utility.charType(charArray[i + 1]) == Predefine.CT_NUM)
        {
            // A '.' directly followed by a CT_NUM character joins the numeric run.
            charTypeArray[i] = Predefine.CT_NUM;
        }
        else if (dotWithSuccessor && charArray[i + 1] >= '0' && charArray[i + 1] <= '9')
        {
            // A '.' directly followed by an ASCII digit that charType did not report as CT_NUM.
            charTypeArray[i] = Predefine.CT_SINGLE;
        }
        else if (charTypeArray[i] == Predefine.CT_LETTER)
        {
            // Letters are folded into CT_SINGLE so they merge with other CT_SINGLE characters.
            charTypeArray[i] = Predefine.CT_SINGLE;
        }
    }

    // Cut the sentence into atoms according to the character types.
    int pCur = 0;
    while (pCur < charArray.Length)
    {
        int nCurType = charTypeArray[pCur];

        if (nCurType == Predefine.CT_CHINESE || nCurType == Predefine.CT_INDEX
            || nCurType == Predefine.CT_DELIMITER || nCurType == Predefine.CT_OTHER)
        {
            // These types always form one-character atoms; whitespace characters are dropped.
            if (!char.IsWhiteSpace(charArray[pCur]))
            {
                atomSegment.Add(new AtomNode(charArray[pCur].ToString(), nCurType));
            }

            pCur++;
        }
        else if (nCurType == Predefine.CT_SINGLE || nCurType == Predefine.CT_NUM)
        {
            // Characters, digits, or a '.' followed by a digit: keep consuming while the type
            // stays the same, then emit the whole run as a single atom.
            int runStart = pCur;
            do
            {
                pCur++;
            }
            while (pCur < charArray.Length && charTypeArray[pCur] == nCurType);

            atomSegment.Add(new AtomNode(new string(charArray, runStart, pCur - runStart), nCurType));
        }
        else
        {
            // Every other type: one atom per character.
            atomSegment.Add(new AtomNode(charArray[pCur].ToString(), nCurType));
            pCur++;
        }
    }

    // Append the sentence-end marker last.
    if (tmpEnd != null)
    {
        atomSegment.Add(tmpEnd);
    }

    return atomSegment;
}
/// <summary>
/// Performs atom segmentation on <paramref name="sSentence"/>: the sentence is cut into
/// single-character atoms, except that consecutive characters typed CT_SINGLE (or CT_NUM)
/// are merged into one atom.
/// </summary>
/// <param name="sSentence">
/// The sentence to split. It may start with Predefine.SENTENCE_BEGIN and/or end with
/// Predefine.SENTENCE_END; each marker that is present becomes its own atom.
/// </param>
/// <returns>The atoms in sentence order (begin marker first, end marker last, when present).</returns>
/// <exception cref="System.ArgumentNullException"><paramref name="sSentence"/> is null.</exception>
public static List<AtomNode> AtomSegment(string sSentence)
{
    if (sSentence == null)
    {
        throw new System.ArgumentNullException("sSentence");
    }

    List<AtomNode> atomSegment = new List<AtomNode>();
    AtomNode tmpEnd = null;
    int startIndex = 0;
    int length = sSentence.Length;

    // Sentence-begin marker. The comparison is ordinal because the offsets below are computed
    // from the marker's raw Length; a culture-sensitive match may skip ignorable characters
    // and would then slice the sentence at the wrong position.
    if (sSentence.StartsWith(Predefine.SENTENCE_BEGIN, System.StringComparison.Ordinal))
    {
        atomSegment.Add(new AtomNode(Predefine.SENTENCE_BEGIN, Predefine.CT_SENTENCE_BEGIN));
        startIndex = Predefine.SENTENCE_BEGIN.Length;
        length -= startIndex;
    }

    // Sentence-end marker. Only looked for when enough characters remain after the begin
    // marker, so overlapping markers can never drive length below zero.
    if (length >= Predefine.SENTENCE_END.Length
        && sSentence.EndsWith(Predefine.SENTENCE_END, System.StringComparison.Ordinal))
    {
        tmpEnd = new AtomNode(Predefine.SENTENCE_END, Predefine.CT_SENTENCE_END);
        length -= Predefine.SENTENCE_END.Length;
    }

    //==============================================================================================
    // by zhenyulu:
    //
    // TODO: use a series of regular expressions to pre-extract complete components of the
    //       sentence (percentages, dates, e-mail addresses, URLs, etc.)
    //==============================================================================================
    char[] charArray = sSentence.ToCharArray(startIndex, length);
    int[] charTypeArray = new int[charArray.Length];

    // Build the character-type array that parallels charArray.
    for (int i = 0; i < charArray.Length; i++)
    {
        char c = charArray[i];
        charTypeArray[i] = Utility.charType(c);

        bool dotWithSuccessor = c == '.' && i < charArray.Length - 1;
        if (dotWithSuccessor && Utility.charType(charArray[i + 1]) == Predefine.CT_NUM)
        {
            // A '.' directly followed by a CT_NUM character joins the numeric run.
            charTypeArray[i] = Predefine.CT_NUM;
        }
        else if (dotWithSuccessor && charArray[i + 1] >= '0' && charArray[i + 1] <= '9')
        {
            // A '.' directly followed by an ASCII digit that charType did not report as CT_NUM.
            charTypeArray[i] = Predefine.CT_SINGLE;
        }
        else if (charTypeArray[i] == Predefine.CT_LETTER)
        {
            // Letters are folded into CT_SINGLE so they merge with other CT_SINGLE characters.
            charTypeArray[i] = Predefine.CT_SINGLE;
        }
    }

    // Cut the sentence into atoms according to the character types.
    int pCur = 0;
    while (pCur < charArray.Length)
    {
        int nCurType = charTypeArray[pCur];

        if (nCurType == Predefine.CT_CHINESE || nCurType == Predefine.CT_INDEX
            || nCurType == Predefine.CT_DELIMITER || nCurType == Predefine.CT_OTHER)
        {
            // These types always form one-character atoms; whitespace characters are dropped.
            if (!char.IsWhiteSpace(charArray[pCur]))
            {
                atomSegment.Add(new AtomNode(charArray[pCur].ToString(), nCurType));
            }

            pCur++;
        }
        else if (nCurType == Predefine.CT_SINGLE || nCurType == Predefine.CT_NUM)
        {
            // Characters, digits, or a '.' followed by a digit: keep consuming while the type
            // stays the same, then emit the whole run as a single atom.
            int runStart = pCur;
            do
            {
                pCur++;
            }
            while (pCur < charArray.Length && charTypeArray[pCur] == nCurType);

            atomSegment.Add(new AtomNode(new string(charArray, runStart, pCur - runStart), nCurType));
        }
        else
        {
            // Every other type: one atom per character.
            atomSegment.Add(new AtomNode(charArray[pCur].ToString(), nCurType));
            pCur++;
        }
    }

    // Append the sentence-end marker last.
    if (tmpEnd != null)
    {
        atomSegment.Add(tmpEnd);
    }

    return atomSegment;
}
/// <summary>
/// Performs atom segmentation on <paramref name="sSentence"/>: the sentence is cut into
/// single-character atoms, except that consecutive characters typed CT_SINGLE (or CT_NUM)
/// are merged into one atom.
/// </summary>
/// <param name="sSentence">
/// The sentence to split. It may start with Predefine.SENTENCE_BEGIN and/or end with
/// Predefine.SENTENCE_END; each marker that is present becomes its own atom.
/// </param>
/// <returns>The atoms in sentence order (begin marker first, end marker last, when present).</returns>
/// <exception cref="System.ArgumentNullException"><paramref name="sSentence"/> is null.</exception>
public static List<AtomNode> AtomSegment(string sSentence)
{
    if (sSentence == null)
    {
        throw new System.ArgumentNullException("sSentence");
    }

    List<AtomNode> atomSegment = new List<AtomNode>();
    AtomNode tmpEnd = null;
    int startIndex = 0;
    int length = sSentence.Length;

    // Sentence-begin marker. The comparison is ordinal because the offsets below are computed
    // from the marker's raw Length; a culture-sensitive match may skip ignorable characters
    // and would then slice the sentence at the wrong position.
    if (sSentence.StartsWith(Predefine.SENTENCE_BEGIN, System.StringComparison.Ordinal))
    {
        atomSegment.Add(new AtomNode(Predefine.SENTENCE_BEGIN, Predefine.CT_SENTENCE_BEGIN));
        startIndex = Predefine.SENTENCE_BEGIN.Length;
        length -= startIndex;
    }

    // Sentence-end marker. Only looked for when enough characters remain after the begin
    // marker, so overlapping markers can never drive length below zero.
    if (length >= Predefine.SENTENCE_END.Length
        && sSentence.EndsWith(Predefine.SENTENCE_END, System.StringComparison.Ordinal))
    {
        tmpEnd = new AtomNode(Predefine.SENTENCE_END, Predefine.CT_SENTENCE_END);
        length -= Predefine.SENTENCE_END.Length;
    }

    //==============================================================================================
    // by zhenyulu:
    //
    // TODO: use a series of regular expressions to pre-extract complete components of the
    //       sentence (percentages, dates, e-mail addresses, URLs, etc.)
    //==============================================================================================
    char[] charArray = sSentence.ToCharArray(startIndex, length);
    int[] charTypeArray = new int[charArray.Length];

    // Build the character-type array that parallels charArray.
    for (int i = 0; i < charArray.Length; i++)
    {
        char c = charArray[i];
        charTypeArray[i] = Utility.charType(c);

        bool dotWithSuccessor = c == '.' && i < charArray.Length - 1;
        if (dotWithSuccessor && Utility.charType(charArray[i + 1]) == Predefine.CT_NUM)
        {
            // A '.' directly followed by a CT_NUM character joins the numeric run.
            charTypeArray[i] = Predefine.CT_NUM;
        }
        else if (dotWithSuccessor && charArray[i + 1] >= '0' && charArray[i + 1] <= '9')
        {
            // A '.' directly followed by an ASCII digit that charType did not report as CT_NUM.
            charTypeArray[i] = Predefine.CT_SINGLE;
        }
        else if (charTypeArray[i] == Predefine.CT_LETTER)
        {
            // Letters are folded into CT_SINGLE so they merge with other CT_SINGLE characters.
            charTypeArray[i] = Predefine.CT_SINGLE;
        }
    }

    // Cut the sentence into atoms according to the character types.
    int pCur = 0;
    while (pCur < charArray.Length)
    {
        int nCurType = charTypeArray[pCur];

        if (nCurType == Predefine.CT_CHINESE || nCurType == Predefine.CT_INDEX
            || nCurType == Predefine.CT_DELIMITER || nCurType == Predefine.CT_OTHER)
        {
            // These types always form one-character atoms; whitespace characters are dropped.
            if (!char.IsWhiteSpace(charArray[pCur]))
            {
                atomSegment.Add(new AtomNode(charArray[pCur].ToString(), nCurType));
            }

            pCur++;
        }
        else if (nCurType == Predefine.CT_SINGLE || nCurType == Predefine.CT_NUM)
        {
            // Characters, digits, or a '.' followed by a digit: keep consuming while the type
            // stays the same, then emit the whole run as a single atom.
            int runStart = pCur;
            do
            {
                pCur++;
            }
            while (pCur < charArray.Length && charTypeArray[pCur] == nCurType);

            atomSegment.Add(new AtomNode(new string(charArray, runStart, pCur - runStart), nCurType));
        }
        else
        {
            // Every other type: one atom per character.
            atomSegment.Add(new AtomNode(charArray[pCur].ToString(), nCurType));
            pCur++;
        }
    }

    // Append the sentence-end marker last.
    if (tmpEnd != null)
    {
        atomSegment.Add(tmpEnd);
    }

    return atomSegment;
}