Exemplo n.º 1
0
        //====================================================================
        // Splits sSentence into a list of atoms (AtomNode).
        //
        // sSentence: the text to split; must not be null (it is dereferenced
        //            immediately). It may start with Predefine.SENTENCE_BEGIN
        //            and/or end with Predefine.SENTENCE_END.
        //
        // Returns:   the atoms in sentence order. When present, the begin
        //            marker is the first atom and the end marker the last.
        //            Between them:
        //              * CT_CHINESE / CT_INDEX / CT_DELIMITER / CT_OTHER
        //                characters yield one atom each, except characters
        //                that string.Trim() removes, which are dropped;
        //              * consecutive characters of the same CT_SINGLE or
        //                CT_NUM type are merged into one atom;
        //              * any other character yields one atom.
        //====================================================================
        public static List<AtomNode> AtomSegment(string sSentence)
        {
            List<AtomNode> atomSegment = new List<AtomNode>();
            AtomNode tmpEnd = null;
            int startIndex = 0, length = sSentence.Length;

            // Sentence-begin marker: emit it as its own atom and skip past it.
            // The comparison is ordinal because the marker length is used
            // below as an exact character offset; a culture-sensitive match
            // does not guarantee that the matched prefix has that length.
            if (sSentence.StartsWith(Predefine.SENTENCE_BEGIN, System.StringComparison.Ordinal))
            {
                atomSegment.Add(new AtomNode(Predefine.SENTENCE_BEGIN, Predefine.CT_SENTENCE_BEGIN));
                startIndex = Predefine.SENTENCE_BEGIN.Length;
                length -= startIndex;
            }

            // Sentence-end marker: remember it so it can be appended last, and
            // exclude it from the characters scanned below (ordinal, as above).
            if (sSentence.EndsWith(Predefine.SENTENCE_END, System.StringComparison.Ordinal))
            {
                tmpEnd = new AtomNode(Predefine.SENTENCE_END, Predefine.CT_SENTENCE_END);
                length -= Predefine.SENTENCE_END.Length;
            }

            //==============================================================================================
            // by zhenyulu:
            //
            // TODO: use a set of regular expressions to pre-extract complete components of the
            //       sentence (percentages, dates, e-mail addresses, URLs, etc.)
            //==============================================================================================

            char[] charArray = sSentence.ToCharArray(startIndex, length);
            int[] charTypeArray = new int[charArray.Length];

            // Build the character-type array, one entry per character.
            for (int i = 0; i < charArray.Length; i++)
            {
                char current = charArray[i];
                charTypeArray[i] = Utility.charType(current);

                // A '.' that is not the last character may take the type of
                // what follows it, so that it joins that run in the next pass.
                bool dotWithFollower = current == '.' && i < charArray.Length - 1;

                if (dotWithFollower && Utility.charType(charArray[i + 1]) == Predefine.CT_NUM)
                {
                    charTypeArray[i] = Predefine.CT_NUM;
                }
                else if (dotWithFollower && charArray[i + 1] >= '0' && charArray[i + 1] <= '9')
                {
                    charTypeArray[i] = Predefine.CT_SINGLE;
                }
                else if (charTypeArray[i] == Predefine.CT_LETTER)
                {
                    // Letters are handled as CT_SINGLE from here on.
                    charTypeArray[i] = Predefine.CT_SINGLE;
                }
            }

            // Cut the characters into atoms according to the type array.
            int pCur = 0;
            while (pCur < charArray.Length)
            {
                int nCurType = charTypeArray[pCur];

                if (nCurType == Predefine.CT_CHINESE || nCurType == Predefine.CT_INDEX ||
                    nCurType == Predefine.CT_DELIMITER || nCurType == Predefine.CT_OTHER)
                {
                    // One atom per character; characters that Trim() removes
                    // (white space) are skipped.
                    string atom = charArray[pCur].ToString();
                    if (atom.Trim().Length != 0)
                    {
                        atomSegment.Add(new AtomNode(atom, nCurType));
                    }
                    pCur++;
                }
                else if (nCurType == Predefine.CT_SINGLE || nCurType == Predefine.CT_NUM)
                {
                    // Letters, digits, or a '.' followed by a digit: keep
                    // taking characters while the type stays the same. A run
                    // at the very end of the input may be one character long.
                    int runStart = pCur;
                    do
                    {
                        pCur++;
                    }
                    while (pCur < charArray.Length && charTypeArray[pCur] == nCurType);

                    atomSegment.Add(new AtomNode(new string(charArray, runStart, pCur - runStart), nCurType));
                }
                else
                {
                    // Every other type: one atom per character.
                    atomSegment.Add(new AtomNode(charArray[pCur].ToString(), nCurType));
                    pCur++;
                }
            }

            // Append the end marker, if the sentence had one.
            if (tmpEnd != null)
            {
                atomSegment.Add(tmpEnd);
            }

            return atomSegment;
        }
Exemplo n.º 2
0
        //====================================================================
        // Splits sSentence into a list of atoms (AtomNode).
        //
        // sSentence: the text to split; must not be null (it is dereferenced
        //            immediately). It may start with Predefine.SENTENCE_BEGIN
        //            and/or end with Predefine.SENTENCE_END.
        //
        // Returns:   the atoms in sentence order. When present, the begin
        //            marker is the first atom and the end marker the last.
        //            Between them:
        //              * CT_CHINESE / CT_INDEX / CT_DELIMITER / CT_OTHER
        //                characters yield one atom each, except characters
        //                that string.Trim() removes, which are dropped;
        //              * consecutive characters of the same CT_SINGLE or
        //                CT_NUM type are merged into one atom;
        //              * any other character yields one atom.
        //====================================================================
        public static List<AtomNode> AtomSegment(string sSentence)
        {
            List<AtomNode> atomSegment = new List<AtomNode>();
            AtomNode tmpEnd = null;
            int startIndex = 0, length = sSentence.Length;

            // Sentence-begin marker: emit it as its own atom and skip past it.
            // The comparison is ordinal because the marker length is used
            // below as an exact character offset; a culture-sensitive match
            // does not guarantee that the matched prefix has that length.
            if (sSentence.StartsWith(Predefine.SENTENCE_BEGIN, System.StringComparison.Ordinal))
            {
                atomSegment.Add(new AtomNode(Predefine.SENTENCE_BEGIN, Predefine.CT_SENTENCE_BEGIN));
                startIndex = Predefine.SENTENCE_BEGIN.Length;
                length -= startIndex;
            }

            // Sentence-end marker: remember it so it can be appended last, and
            // exclude it from the characters scanned below (ordinal, as above).
            if (sSentence.EndsWith(Predefine.SENTENCE_END, System.StringComparison.Ordinal))
            {
                tmpEnd = new AtomNode(Predefine.SENTENCE_END, Predefine.CT_SENTENCE_END);
                length -= Predefine.SENTENCE_END.Length;
            }

            //==============================================================================================
            // by zhenyulu:
            //
            // TODO: use a set of regular expressions to pre-extract complete components of the
            //       sentence (percentages, dates, e-mail addresses, URLs, etc.)
            //==============================================================================================

            char[] charArray = sSentence.ToCharArray(startIndex, length);
            int[] charTypeArray = new int[charArray.Length];

            // Build the character-type array, one entry per character.
            for (int i = 0; i < charArray.Length; i++)
            {
                char current = charArray[i];
                charTypeArray[i] = Utility.charType(current);

                // A '.' that is not the last character may take the type of
                // what follows it, so that it joins that run in the next pass.
                bool dotWithFollower = current == '.' && i < charArray.Length - 1;

                if (dotWithFollower && Utility.charType(charArray[i + 1]) == Predefine.CT_NUM)
                {
                    charTypeArray[i] = Predefine.CT_NUM;
                }
                else if (dotWithFollower && charArray[i + 1] >= '0' && charArray[i + 1] <= '9')
                {
                    charTypeArray[i] = Predefine.CT_SINGLE;
                }
                else if (charTypeArray[i] == Predefine.CT_LETTER)
                {
                    // Letters are handled as CT_SINGLE from here on.
                    charTypeArray[i] = Predefine.CT_SINGLE;
                }
            }

            // Cut the characters into atoms according to the type array.
            int pCur = 0;
            while (pCur < charArray.Length)
            {
                int nCurType = charTypeArray[pCur];

                if (nCurType == Predefine.CT_CHINESE || nCurType == Predefine.CT_INDEX ||
                    nCurType == Predefine.CT_DELIMITER || nCurType == Predefine.CT_OTHER)
                {
                    // One atom per character; characters that Trim() removes
                    // (white space) are skipped.
                    string atom = charArray[pCur].ToString();
                    if (atom.Trim().Length != 0)
                    {
                        atomSegment.Add(new AtomNode(atom, nCurType));
                    }
                    pCur++;
                }
                else if (nCurType == Predefine.CT_SINGLE || nCurType == Predefine.CT_NUM)
                {
                    // Letters, digits, or a '.' followed by a digit: keep
                    // taking characters while the type stays the same. A run
                    // at the very end of the input may be one character long.
                    int runStart = pCur;
                    do
                    {
                        pCur++;
                    }
                    while (pCur < charArray.Length && charTypeArray[pCur] == nCurType);

                    atomSegment.Add(new AtomNode(new string(charArray, runStart, pCur - runStart), nCurType));
                }
                else
                {
                    // Every other type: one atom per character.
                    atomSegment.Add(new AtomNode(charArray[pCur].ToString(), nCurType));
                    pCur++;
                }
            }

            // Append the end marker, if the sentence had one.
            if (tmpEnd != null)
            {
                atomSegment.Add(tmpEnd);
            }

            return atomSegment;
        }
Exemplo n.º 3
0
      //====================================================================
      // Splits sSentence into a list of atoms (AtomNode).
      //
      // sSentence: the text to split; must not be null (it is dereferenced
      //            immediately). It may start with Predefine.SENTENCE_BEGIN
      //            and/or end with Predefine.SENTENCE_END.
      //
      // Returns:   the atoms in sentence order. When present, the begin
      //            marker is the first atom and the end marker the last.
      //            Between them:
      //              * CT_CHINESE / CT_INDEX / CT_DELIMITER / CT_OTHER
      //                characters yield one atom each, except characters
      //                that string.Trim() removes, which are dropped;
      //              * consecutive characters of the same CT_SINGLE or
      //                CT_NUM type are merged into one atom;
      //              * any other character yields one atom.
      //====================================================================
      public static List<AtomNode> AtomSegment(string sSentence)
      {
         List<AtomNode> atomSegment = new List<AtomNode>();
         AtomNode tmpEnd = null;
         int startIndex = 0, length = sSentence.Length;

         // Sentence-begin marker: emit it as its own atom and skip past it.
         // The comparison is ordinal because the marker length is used
         // below as an exact character offset; a culture-sensitive match
         // does not guarantee that the matched prefix has that length.
         if (sSentence.StartsWith(Predefine.SENTENCE_BEGIN, System.StringComparison.Ordinal))
         {
            atomSegment.Add(new AtomNode(Predefine.SENTENCE_BEGIN, Predefine.CT_SENTENCE_BEGIN));
            startIndex = Predefine.SENTENCE_BEGIN.Length;
            length -= startIndex;
         }

         // Sentence-end marker: remember it so it can be appended last, and
         // exclude it from the characters scanned below (ordinal, as above).
         if (sSentence.EndsWith(Predefine.SENTENCE_END, System.StringComparison.Ordinal))
         {
            tmpEnd = new AtomNode(Predefine.SENTENCE_END, Predefine.CT_SENTENCE_END);
            length -= Predefine.SENTENCE_END.Length;
         }

         //==============================================================================================
         // by zhenyulu:
         //
         // TODO: use a set of regular expressions to pre-extract complete components of the
         //       sentence (percentages, dates, e-mail addresses, URLs, etc.)
         //==============================================================================================

         char[] charArray = sSentence.ToCharArray(startIndex, length);
         int[] charTypeArray = new int[charArray.Length];

         // Build the character-type array, one entry per character.
         for (int i = 0; i < charArray.Length; i++)
         {
            char current = charArray[i];
            charTypeArray[i] = Utility.charType(current);

            // A '.' that is not the last character may take the type of
            // what follows it, so that it joins that run in the next pass.
            bool dotWithFollower = current == '.' && i < charArray.Length - 1;

            if (dotWithFollower && Utility.charType(charArray[i + 1]) == Predefine.CT_NUM)
            {
               charTypeArray[i] = Predefine.CT_NUM;
            }
            else if (dotWithFollower && charArray[i + 1] >= '0' && charArray[i + 1] <= '9')
            {
               charTypeArray[i] = Predefine.CT_SINGLE;
            }
            else if (charTypeArray[i] == Predefine.CT_LETTER)
            {
               // Letters are handled as CT_SINGLE from here on.
               charTypeArray[i] = Predefine.CT_SINGLE;
            }
         }

         // Cut the characters into atoms according to the type array.
         int pCur = 0;
         while (pCur < charArray.Length)
         {
            int nCurType = charTypeArray[pCur];

            if (nCurType == Predefine.CT_CHINESE || nCurType == Predefine.CT_INDEX ||
               nCurType == Predefine.CT_DELIMITER || nCurType == Predefine.CT_OTHER)
            {
               // One atom per character; characters that Trim() removes
               // (white space) are skipped.
               string atom = charArray[pCur].ToString();
               if (atom.Trim().Length != 0)
               {
                  atomSegment.Add(new AtomNode(atom, nCurType));
               }
               pCur++;
            }
            else if (nCurType == Predefine.CT_SINGLE || nCurType == Predefine.CT_NUM)
            {
               // Letters, digits, or a '.' followed by a digit: keep
               // taking characters while the type stays the same. A run
               // at the very end of the input may be one character long.
               int runStart = pCur;
               do
               {
                  pCur++;
               }
               while (pCur < charArray.Length && charTypeArray[pCur] == nCurType);

               atomSegment.Add(new AtomNode(new string(charArray, runStart, pCur - runStart), nCurType));
            }
            else
            {
               // Every other type: one atom per character.
               atomSegment.Add(new AtomNode(charArray[pCur].ToString(), nCurType));
               pCur++;
            }
         }

         // Append the end marker, if the sentence had one.
         if (tmpEnd != null)
         {
            atomSegment.Add(tmpEnd);
         }

         return atomSegment;
      }