Exemplo n.º 1
0
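        /**
         * Fast atom segmentation; intended to replace the original, slower method.
         *
         * @param charArray the characters to segment
         * @param start     index of the first character to segment (inclusive)
         * @param end       index at which segmentation stops (exclusive)
         * @return the list of atom nodes found in the range
         */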
        protected static List <AtomNode> quickAtomSegment(char[] charArray, int start, int end)
        {
            // Splits charArray[start, end) into atoms, where an atom is a maximal run of
            // consecutive characters for which CharType.get returns the same type.
            // A '.' that directly follows a CT_NUM run is absorbed into that run together
            // with the CT_NUM characters after it, so a decimal number stays one atom.
            List <AtomNode> atomNodeList = new List <AtomNode>();

            // An empty range contains no atoms. Without this guard the priming read
            // below indexes charArray[start], which throws when start == charArray.Length.
            if (start >= end)
            {
                return(atomNodeList);
            }

            int offsetAtom = start;
            int preType    = CharType.get(charArray[offsetAtom]);
            int curType;

            while (++offsetAtom < end)
            {
                curType = CharType.get(charArray[offsetAtom]);
                if (curType != preType)
                {
                    // Floating-point recognition: skip over the '.' and every CT_NUM
                    // character after it, so the atom emitted below covers them too.
                    if (charArray[offsetAtom] == '.' && preType == CharType.CT_NUM)
                    {
                        while (++offsetAtom < end)
                        {
                            curType = CharType.get(charArray[offsetAtom]);
                            if (curType != CharType.CT_NUM)
                            {
                                break;
                            }
                        }
                    }
                    // Emit the finished run [start, offsetAtom) and open the next one.
                    atomNodeList.Add(new AtomNode(new String(charArray, start, offsetAtom - start), preType));
                    start = offsetAtom;
                }
                preType = curType;
            }

            // Emit the trailing run. When the decimal scan above consumed everything up
            // to end, that atom was already emitted and the outer increment has pushed
            // offsetAtom to end + 1, so this test prevents a duplicate, empty atom.
            if (offsetAtom == end)
            {
                atomNodeList.Add(new AtomNode(new String(charArray, start, offsetAtom - start), preType));
            }

            return(atomNodeList);
        }
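        /**
         * Atom segmentation.
         *
         * @param sSentence the sentence to segment
         * @param start     index of the first character to segment
         * @param end       index at which segmentation stops
         * @return the list of atom nodes
         * @deprecated the character-array version should be used instead
         */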
        private static List <AtomNode> AtomSegment(String sSentence, int start, int end)
        {
            // Atom segmentation of the characters of sSentence in [start, end).
            // Pass 1 assigns a character type to every character; pass 2 cuts the
            // characters into atoms according to those types.
            // Throws ArgumentException when end < start.
            if (end < start)
            {
                // The message states the violated condition: start lies beyond end.
                throw new ArgumentException("start=" + start + " > end=" + end);
            }
            List <AtomNode> atomSegment = new List <AtomNode>();
            int             pCur = 0, nCurType, nNextType;
            StringBuilder   sb = new StringBuilder();
            char            c;


            //==============================================================================================
            // by zhenyulu:
            //
            // TODO: use a set of regular expressions to pre-extract the complete components of the
            //       sentence (percentages, dates, e-mail addresses, URLs, etc.)
            //==============================================================================================

            // String.Substring takes (startIndex, length), not (startIndex, endIndex),
            // so the length of the requested range has to be computed explicitly.
            char[] charArray     = sSentence.Substring(start, end - start).ToCharArray();
            int[]  charTypeArray = new int[charArray.Length];

            // Pass 1: build the character-type array, one entry per character.
            for (int i = 0; i < charArray.Length; ++i)
            {
                c = charArray[i];
                charTypeArray[i] = CharType.get(c);

                // A '.' followed by a CT_NUM character is typed as CT_NUM so that it
                // joins the surrounding number in pass 2.
                if (c == '.' && i < (charArray.Length - 1) && CharType.get(charArray[i + 1]) == Predefine.CT_NUM)
                {
                    charTypeArray[i] = Predefine.CT_NUM;
                }
                // A '.' followed by an ASCII digit that CharType did not report as CT_NUM.
                else if (c == '.' && i < (charArray.Length - 1) && charArray[i + 1] >= '0' && charArray[i + 1] <= '9')
                {
                    charTypeArray[i] = Predefine.CT_SINGLE;
                }
                // Letters are folded into CT_SINGLE so that pass 2 groups them as runs.
                else if (charTypeArray[i] == Predefine.CT_LETTER)
                {
                    charTypeArray[i] = Predefine.CT_SINGLE;
                }
            }

            // Pass 2: cut the characters into atoms according to the type array.
            while (pCur < charArray.Length)
            {
                nCurType = charTypeArray[pCur];

                // These types always form an atom of exactly one character.
                if (nCurType == Predefine.CT_CHINESE || nCurType == Predefine.CT_INDEX ||
                    nCurType == Predefine.CT_DELIMITER || nCurType == Predefine.CT_OTHER)
                {
                    atomSegment.Add(new AtomNode(charArray[pCur].ToString(), nCurType));
                    pCur++;
                }
                // For CT_SINGLE or CT_NUM (which includes a '.' retyped in pass 1),
                // keep taking characters for as long as the type stays the same.
                else if (pCur < charArray.Length - 1 && ((nCurType == Predefine.CT_SINGLE) || nCurType == Predefine.CT_NUM))
                {
                    sb.Length = 0;
                    sb.Append(charArray[pCur]);

                    // Stays true when the run extends to the last character; in that
                    // case pCur still points at a consumed character after the loop.
                    bool reachEnd = true;
                    while (pCur < charArray.Length - 1)
                    {
                        nNextType = charTypeArray[++pCur];

                        if (nNextType == nCurType)
                        {
                            sb.Append(charArray[pCur]);
                        }
                        else
                        {
                            // pCur now points at the first character of the next atom.
                            reachEnd = false;
                            break;
                        }
                    }
                    atomSegment.Add(new AtomNode(sb.ToString(), nCurType));
                    if (reachEnd)
                    {
                        pCur++;
                    }
                }
                // Every other case: a single-character atom.
                else
                {
                    atomSegment.Add(new AtomNode(charArray[pCur], nCurType));
                    pCur++;
                }
            }

            return(atomSegment);
        }
Exemplo n.º 3
0
        /**
         * Atom segmentation.
         *
         * @param charArray the characters to segment
         * @param start     start index (inclusive)
         * @param end       end index (exclusive)
         * @return a list of the atom nodes formed by all characters from start to end
         */
        protected static List <AtomNode> atomSegment(char[] charArray, int start, int end)
        {
            // Cuts charArray[start, end) into atom nodes in two passes: first every
            // character in the range gets a type, then the range is walked once and
            // split into single-character atoms and same-type runs.
            List<AtomNode> nodes = new List<AtomNode>();
            int[]          types = new int[end - start];

            // Pass 1: one type per character of the range.
            for (int k = 0; k < types.Length; ++k)
            {
                int  pos  = start + k;
                char ch   = charArray[pos];
                int  type = CharType.get(ch);

                // The look-ahead is bounded by the array length rather than by end, so
                // a '.' on the last position of the range still inspects the character
                // that follows it in the array.
                bool dotWithFollower = ch == '.' && pos < charArray.Length - 1;

                if (dotWithFollower && CharType.get(charArray[pos + 1]) == CharType.CT_NUM)
                {
                    // A '.' in front of a CT_NUM character counts as part of the number.
                    type = CharType.CT_NUM;
                }
                else if (dotWithFollower && charArray[pos + 1] >= '0' && charArray[pos + 1] <= '9')
                {
                    // A '.' in front of an ASCII digit that was not reported as CT_NUM.
                    type = CharType.CT_SINGLE;
                }
                else if (type == CharType.CT_LETTER)
                {
                    // Letters are grouped the same way as CT_SINGLE characters.
                    type = CharType.CT_SINGLE;
                }

                types[k] = type;
            }

            // Pass 2: walk the range and emit one atom per step.
            int cursor = start;
            while (cursor < end)
            {
                int type = types[cursor - start];

                // These types always stand alone as one-character atoms.
                bool standsAlone = type == CharType.CT_CHINESE || type == CharType.CT_INDEX ||
                                   type == CharType.CT_DELIMITER || type == CharType.CT_OTHER;
                if (standsAlone)
                {
                    nodes.Add(new AtomNode(charArray[cursor].ToString(), type));
                    cursor++;
                    continue;
                }

                // Only CT_SINGLE and CT_NUM characters that are not the last character
                // of the range can start a run; anything else is a one-character atom.
                bool startsRun = cursor < end - 1 &&
                                 (type == CharType.CT_SINGLE || type == CharType.CT_NUM);
                if (!startsRun)
                {
                    nodes.Add(new AtomNode(charArray[cursor], type));
                    cursor++;
                    continue;
                }

                // Extend the run for as long as the following characters share the type.
                int runEnd = cursor + 1;
                while (runEnd < end && types[runEnd - start] == type)
                {
                    runEnd++;
                }
                nodes.Add(new AtomNode(new String(charArray, cursor, runEnd - cursor), type));
                cursor = runEnd;
            }

            return nodes;
        }