Exemple #1
0
        /// <summary>
        /// Joins neighbouring numeric words into a single word whenever the joined
        /// text is itself still recognised as a number.
        /// </summary>
        /// <param name="linkedArray">
        /// Word list to rewrite in place. For every join the left node absorbs the right
        /// node's text and <c>col</c>, the right node is unlinked and Count is decremented.
        /// </param>
        private static void MergeContinueNumIntoOne(ref WordLinkedArray linkedArray)
        {
            // A pair of neighbours is needed before anything can be joined.
            if (linkedArray.Count < 2)
            {
                return;
            }

            // NOTE(review): only Count and the next links are maintained here; if
            // WordLinkedArray also tracks a tail pointer it is left untouched - confirm.
            WordNode current   = linkedArray.first;
            WordNode following = current.next;

            while (following != null)
            {
                string left  = current.theWord.sWord;
                string right = following.theWord.sWord;

                bool bothNumeric =
                    (Utility.IsAllNum(left) || Utility.IsAllChineseNum(left)) &&
                    (Utility.IsAllNum(right) || Utility.IsAllChineseNum(right));

                if (bothNumeric)
                {
                    string joined = left + right;

                    // Two numbers do not always concatenate into a valid number,
                    // so the joined text is validated as well.
                    if (Utility.IsAllNum(joined) || Utility.IsAllChineseNum(joined))
                    {
                        current.theWord.sWord = joined;
                        current.col           = following.col;
                        current.next          = following.next;
                        linkedArray.Count--;

                        // Stay on the grown node so a further neighbour can be absorbed too.
                        following = current.next;
                        continue;
                    }
                }

                current   = following;
                following = following.next;
            }
        }
Exemple #2
0
        //====================================================================
        // Generate words according to the segmentation route
        //====================================================================
        /// <summary>
        /// Post-processes the segmented word list (number merging, dash tagging, hyphen
        /// splitting, date/time detection) and flattens it into a result array.
        /// </summary>
        /// <param name="uniPath">Not read by this method.</param>
        /// <param name="linkedArray">Word list for the chosen route; it is modified by the clean-up passes.</param>
        /// <param name="m_graphOptimum">Graph that receives one element per resulting word, at that word's row/col.</param>
        /// <returns>One WordResult per remaining node, or null when the list is empty.</returns>
        private static WordResult[] GenerateWord(int[] uniPath, WordLinkedArray linkedArray, RowFirstDynamicArray <ChainContent> m_graphOptimum)
        {
            if (linkedArray.Count == 0)
            {
                return null;
            }

            // Pass 1: merge separate consecutive numbers into one number.
            MergeContinueNumIntoOne(ref linkedArray);

            // Pass 2: give dash delimiters such as "--" the punctuation POS.
            ChangeDelimiterPOS(ref linkedArray);

            // Pass 3: when the previous word is a number and the current word starts with
            // a hyphen followed by more characters, split the hyphen off.
            // Example: "3 / -4 / 月" becomes "3 / - / 4 / 月".
            SplitMiddleSlashFromDigitalWords(ref linkedArray);

            // Pass 4: date and time handling.
            //   1. number + one of 月/日/时/分/秒/月份 is merged and tagged as time;
            //   2. a number usable as a year + 年 is merged and tagged as time, otherwise it stays a number;
            //   3. a number whose last character is 点 is tagged as time;
            //   4. a number not ending in one of the separator characters is tagged as number;
            //   5. a longer number ending in such a separator (for example "1.") loses that last character.
            CheckDateElements(ref linkedArray);

            // Emit the result.
            WordResult[] words = new WordResult[linkedArray.Count];
            int          index = 0;

            for (WordNode node = linkedArray.first; node != null; node = node.next)
            {
                WordResult word = new WordResult();
                word.sWord  = node.theWord.sWord;
                word.nPOS   = node.theWord.nPOS;
                word.dValue = node.theWord.dValue;
                words[index++] = word;

                m_graphOptimum.SetElement(node.row, node.col, new ChainContent(word.dValue, word.nPOS, node.sWordInSegGraph));
            }

            return words;
        }
Exemple #3
0
        /// <summary>
        /// Lists the word of every node from <c>first</c> onwards, each followed by ", ".
        /// </summary>
        /// <returns>The concatenated words; an empty string when there are no nodes.</returns>
        public override string ToString()
        {
            StringBuilder text = new StringBuilder();

            for (WordNode node = first; node != null; node = node.next)
            {
                text.AppendFormat("{0}, ", node.theWord.sWord);
            }

            return text.ToString();
        }
Exemple #4
0
        /// <summary>
        /// Adds <paramref name="node"/> at the tail of the list and increments Count.
        /// </summary>
        /// <param name="node">Node to append; it becomes the new <c>last</c>.</param>
        public void AppendNode(WordNode node)
        {
            bool isEmpty = (first == null && last == null);

            if (!isEmpty)
            {
                // Hook the node behind the current tail.
                last.next = node;
            }
            else
            {
                // First node ever: it is the head as well as the tail.
                first = node;
            }

            last = node;
            Count++;
        }
      /// <summary>
      /// Appends <paramref name="node"/> to the end of the list and bumps Count by one.
      /// </summary>
      /// <param name="node">Node that becomes the new <c>last</c>.</param>
      public void AppendNode(WordNode node)
      {
         if (first != null || last != null)
         {
            // Non-empty list: link the node after the current tail.
            last.next = node;
         }
         else
         {
            // Empty list: the node is also the head.
            first = node;
         }

         last = node;
         Count++;
      }
Exemple #6
0
        /// <summary>
        /// Tags every dash delimiter word with POS 30464 ('w' * 256) and resets its value to 0.
        /// </summary>
        /// <param name="linkedArray">Word list whose nodes are updated in place.</param>
        private static void ChangeDelimiterPOS(ref WordLinkedArray linkedArray)
        {
            for (WordNode node = linkedArray.first; node != null; node = node.next)
            {
                string word = node.theWord.sWord;
                bool isDash = word == "--" || word == "—" || word == "-";

                if (!isDash)
                {
                    continue;
                }

                node.theWord.nPOS   = 30464; // 'w' * 256
                node.theWord.dValue = 0;
            }
        }
Exemple #7
0
        //====================================================================
        //如果前一个词是数字,当前词以“-”或“-”开始,并且不止这一个字符,
        //那么将此“-”符号从当前词中分离出来。
        //例如 “3 / -4 / 月”需要拆分成“3 / - / 4 / 月”
        //====================================================================
        /// <summary>
        /// Splits a leading hyphen off a numeric word when the word before it is tagged
        /// as a number or a time, e.g. "3 / -4 / 月" becomes "3 / - / 4 / 月".
        /// </summary>
        /// <param name="linkedArray">
        /// Word list to rewrite in place. For every split the current node keeps only the
        /// hyphen, a new node holding the remaining text is linked right after it, and
        /// Count is incremented.
        /// </param>
        private static void SplitMiddleSlashFromDigitalWords(ref WordLinkedArray linkedArray)
        {
            if (linkedArray.Count < 2)
            {
                return;
            }

            WordNode pPre = linkedArray.first;
            WordNode pCur = pPre.next;

            while (pCur != null)
            {
                string word   = pCur.theWord.sWord;
                int    prePOS = Math.Abs(pPre.theWord.nPOS);

                // 27904 = 'm' * 256 (number), 29696 = 't' * 256 (time).
                // The length test runs before word[0] is read, so an empty word
                // can no longer raise IndexOutOfRangeException.
                // NOTE(review): both characters of "--" are the same ASCII hyphen, so the
                // second one is redundant; it may have been a full-width hyphen that was
                // lost in transcoding - confirm against the upstream source.
                if ((prePOS == 27904 || prePOS == 29696) &&
                    (Utility.IsAllNum(word) || Utility.IsAllChineseNum(word)) &&
                    word.Length > 1 && "--".IndexOf(word[0]) >= 0)
                {
                    // New node: everything after the hyphen, tagged as a number.
                    WordNode newNode = new WordNode();
                    newNode.row             = pCur.row + 1;
                    newNode.col             = pCur.col;
                    newNode.sWordInSegGraph = word.Substring(1);

                    WordResult theWord = new WordResult();
                    theWord.sWord   = newNode.sWordInSegGraph;
                    theWord.nPOS    = 27904;               // 'm' * 256
                    theWord.dValue  = pCur.theWord.dValue; // read before it is reset below
                    newNode.theWord = theWord;

                    // Current node: shrinks to the hyphen alone, tagged as punctuation.
                    pCur.col            = pCur.row + 1;
                    pCur.theWord.sWord  = word.Substring(0, 1);
                    pCur.theWord.nPOS   = 30464;           // 'w' * 256
                    pCur.theWord.dValue = 0;

                    newNode.next = pCur.next;
                    pCur.next    = newNode;

                    linkedArray.Count++;
                }

                pCur = pCur.next;
                pPre = pPre.next;
            }
        }
Exemple #8
0
        //====================================================================
        // 将BiPath转换为LinkedArray
        // 例如“他说的确实在理”
        // BiPath:(0, 1, 2, 3, 6, 9, 11, 12)
        //    0    1   2   3   4     5   6     7   8     9   10    11  12
        // 始##始  他  说  的  的确  确  确实  实  实在  在  在理  理  末##末
        //====================================================================
        /// <summary>
        /// Builds a word list from the segment-graph entries selected by <paramref name="biPath"/>.
        /// </summary>
        /// <param name="biPath">Indexes into the flattened item list of <paramref name="segGraph"/>, in path order.</param>
        /// <param name="segGraph">Segment graph; flattened once through ToListItems().</param>
        /// <param name="atomSegment">Atoms of the sentence, used to rebuild the text behind placeholder entries.</param>
        /// <returns>A list with one node per path entry, in the same order.</returns>
        private static WordLinkedArray BiPath2LinkedArray(int[] biPath, RowFirstDynamicArray <ChainContent> segGraph, List <AtomNode> atomSegment)
        {
            List <ChainItem <ChainContent> > graphItems = segGraph.ToListItems();
            StringBuilder   buffer = new StringBuilder();
            WordLinkedArray path   = new WordLinkedArray();

            foreach (int itemIndex in biPath)
            {
                ChainItem <ChainContent> item = graphItems[itemIndex];

                WordNode node = new WordNode();
                node.row             = item.row;
                node.col             = item.col;
                node.sWordInSegGraph = item.Content.sWord;
                node.theWord         = new WordResult();

                switch (node.sWordInSegGraph)
                {
                    case "未##人":
                    case "未##地":
                    case "未##数":
                    case "未##时":
                    case "未##串":
                        // Placeholder entry: the real text is the concatenation of the
                        // atoms in the half-open range [row, col).
                        buffer.Length = 0;
                        for (int atom = node.row; atom < node.col; atom++)
                        {
                            buffer.Append(atomSegment[atom].sWord);
                        }

                        node.theWord.sWord = buffer.ToString();
                        break;

                    default:
                        node.theWord.sWord = item.Content.sWord;
                        break;
                }

                node.theWord.nPOS   = item.Content.nPOS;
                node.theWord.dValue = item.Content.eWeight;

                path.AppendNode(node);
            }

            return path;
        }
Exemple #9
0
        //====================================================================
        // If the previous word is a number and the current word starts with a
        // full-width hyphen (U+FF0D) or an ASCII "-" and is longer than that one
        // character, split the leading hyphen off into a word of its own.
        // Example: "3 / -4 / 月" must be split into "3 / - / 4 / 月".
        //====================================================================
        /// <summary>
        /// Splits a leading hyphen off a numeric word when the word before it is tagged
        /// as a number or a time, e.g. "3 / -4" becomes "3 / - / 4".
        /// </summary>
        /// <param name="linkedArray">
        /// Word list to rewrite in place. For every split the current node keeps only the
        /// hyphen, a new node holding the remaining text is linked right after it, and
        /// Count is incremented.
        /// </param>
        private static void SplitMiddleSlashFromDigitalWords(ref WordLinkedArray linkedArray)
        {
            if (linkedArray.Count < 2)
            {
                return;
            }

            WordNode pPre = linkedArray.first;
            WordNode pCur = pPre.next;

            while (pCur != null)
            {
                string word   = pCur.theWord.sWord;
                int    prePOS = Math.Abs(pPre.theWord.nPOS);

                // 27904 = 'm' * 256; 30464 below is 'w' * 256.
                // The hyphen set is an ASCII '-' plus the full-width hyphen U+FF0D. It is
                // written with an escape because the previous literal had been reduced to
                // U+FFFD replacement characters by a wrong file encoding and could never match.
                // NOTE(review): U+FF0D is inferred from the byte pattern of the damaged
                // text - confirm against the upstream source.
                // The length test runs before word[0] is read, so an empty word can no
                // longer raise IndexOutOfRangeException.
                if ((prePOS == 27904 || prePOS == 29696) &&
                    (Utility.IsAllNum(word) || Utility.IsAllChineseNum(word)) &&
                    word.Length > 1 && "-\uFF0D".IndexOf(word[0]) >= 0)
                {
                    // New node: everything after the hyphen.
                    WordNode newNode = new WordNode();
                    newNode.row             = pCur.row + 1;
                    newNode.col             = pCur.col;
                    newNode.sWordInSegGraph = word.Substring(1);

                    WordResult theWord = new WordResult();
                    theWord.sWord   = newNode.sWordInSegGraph;
                    theWord.nPOS    = 27904;               // 'm' * 256
                    theWord.dValue  = pCur.theWord.dValue; // read before it is reset below
                    newNode.theWord = theWord;

                    // Current node: shrinks to the hyphen alone.
                    pCur.col            = pCur.row + 1;
                    pCur.theWord.sWord  = word.Substring(0, 1);
                    pCur.theWord.nPOS   = 30464;           // 'w' * 256
                    pCur.theWord.dValue = 0;

                    newNode.next = pCur.next;
                    pCur.next    = newNode;

                    linkedArray.Count++;
                }

                pCur = pCur.next;
                pPre = pPre.next;
            }
        }
Exemple #10
0
        //====================================================================
        // Converts a BiPath into a LinkedArray.
        // Example sentence: “他说的确实在理”
        // BiPath: (0, 1, 2, 3, 6, 9, 11, 12)
        //    0    1   2   3   4     5   6     7   8     9   10    11  12
        // 始##始  他  说  的  的确  确  确实  实  实在  在  在理  理  末##末
        //====================================================================
        /// <summary>
        /// Builds a word list from the segment-graph entries selected by <paramref name="biPath"/>.
        /// </summary>
        /// <param name="biPath">Indexes into the flattened item list of <paramref name="segGraph"/>, in path order.</param>
        /// <param name="segGraph">Segment graph; flattened once through ToListItems().</param>
        /// <param name="atomSegment">Atoms of the sentence, used to rebuild the text behind placeholder entries.</param>
        /// <returns>A list with one node per path entry, in the same order.</returns>
        private static WordLinkedArray BiPath2LinkedArray(int[] biPath, RowFirstDynamicArray<ChainContent> segGraph, List<AtomNode> atomSegment)
        {
            List<ChainItem<ChainContent>> list = segGraph.ToListItems();
            StringBuilder sb = new StringBuilder();

            WordLinkedArray result = new WordLinkedArray();

            for (int i = 0; i < biPath.Length; i++)
            {
                ChainItem<ChainContent> item = list[biPath[i]];

                WordNode node = new WordNode();
                node.row             = item.row;
                node.col             = item.col;
                node.sWordInSegGraph = item.Content.sWord;

                node.theWord = new WordResult();

                // The five placeholder markers had been turned into mojibake by a wrong
                // file encoding (three of them were no longer distinguishable), so this
                // branch could not match real graph entries. They are restored here.
                // NOTE(review): keep this file saved as UTF-8 so the literals survive.
                if (node.sWordInSegGraph == "未##人" || node.sWordInSegGraph == "未##地" ||
                    node.sWordInSegGraph == "未##数" || node.sWordInSegGraph == "未##时" ||
                    node.sWordInSegGraph == "未##串")
                {
                    // Placeholder entry: the real text is the concatenation of the
                    // atoms in the half-open range [row, col).
                    sb.Length = 0;
                    for (int j = node.row; j < node.col; j++)
                    {
                        sb.Append(atomSegment[j].sWord);
                    }

                    node.theWord.sWord = sb.ToString();
                }
                else
                {
                    node.theWord.sWord = item.Content.sWord;
                }

                node.theWord.nPOS   = item.Content.nPOS;
                node.theWord.dValue = item.Content.eWeight;

                result.AppendNode(node);
            }

            return result;
        }
Exemple #11
0
        //====================================================================
        //1、如果当前词是数字,下一个词是“月、日、时、分、秒、月份”中的一个,则合并且当前词词性是时间
        //2、如果当前词是可以作为年份的数字,下一个词是“年”,则合并,词性为时间,否则为数字。
        //3、如果最后一个汉字是"点" ,则认为当前数字是时间
        //4、如果当前串最后一个汉字不是"∶·./"和半角的'.''/',那么是数
        //5、当前串最后一个汉字是"∶·./"和半角的'.''/',且长度大于1,那么去掉最后一个字符。例如"1."
        //====================================================================
        /// <summary>
        /// Tags numeric words as time or number, merging a number with a following
        /// date/time unit word where one is present.
        /// </summary>
        /// <remarks>
        /// For every numeric word that has a successor:
        /// 1. if the next word is one of 月/日/时/分/秒/月份, both are merged and tagged as time;
        /// 2. if the next word is 年, both are merged and tagged as time when IsYearTime accepts
        ///    the number, otherwise the number is tagged as a number;
        /// 3. otherwise, a word ending in 点 is tagged as time;
        /// 4. otherwise, a word not ending in one of the separator characters is tagged as a number;
        /// 5. otherwise, a word longer than one character loses its trailing separator
        ///    (for example "1.") and is tagged as a number.
        /// The last node of the list is never examined as the current word.
        /// </remarks>
        /// <param name="linkedArray">Word list to rewrite in place; Count is decremented for every merge.</param>
        private static void CheckDateElements(ref WordLinkedArray linkedArray)
        {
            if (linkedArray.Count < 2)
            {
                return;
            }

            // NOTE(review): only Count and the next links are maintained here; if
            // WordLinkedArray also tracks a tail pointer it is left untouched - confirm.
            WordNode pCur  = linkedArray.first;
            WordNode pNext = pCur.next;

            while (pNext != null)
            {
                string curWord = pCur.theWord.sWord;

                if (Utility.IsAllNum(curWord) || Utility.IsAllChineseNum(curWord))
                {
                    string nextWord = pNext.theWord.sWord;

                    // Character-based IndexOf is ordinal. The string overload is
                    // culture-sensitive and reports a match at index 0 for words made
                    // of ignorable characters.
                    bool nextIsTimeUnit =
                        (nextWord.Length == 1 && "月日时分秒".IndexOf(nextWord[0]) != -1) ||
                        nextWord == "月份";

                    if (nextIsTimeUnit || (nextWord == "年" && IsYearTime(curWord)))
                    {
                        // Rules 1 and 2: absorb the unit word, e.g. "2001" + "年".
                        pCur.theWord.sWord  += nextWord;
                        pCur.col             = pNext.col;
                        pCur.sWordInSegGraph = "未##时";
                        pCur.theWord.nPOS    = -29696; // 't' * 256: time
                        pCur.next            = pNext.next;
                        linkedArray.Count--;
                    }
                    else if (nextWord == "年")
                    {
                        // Rule 2, not a plausible year: plain number.
                        pCur.sWordInSegGraph = "未##数";
                        pCur.theWord.nPOS    = -27904; // 'm' * 256: number
                    }
                    else if (curWord.Length > 0)
                    {
                        // The Length guard keeps an empty word from indexing out of range.
                        char lastChar = curWord[curWord.Length - 1];

                        if (lastChar == '点')
                        {
                            // Rule 3.
                            pCur.sWordInSegGraph = "未##时";
                            pCur.theWord.nPOS    = -29696; // 't' * 256: time
                        }
                        else if ("∶·././".IndexOf(lastChar) == -1)
                        {
                            // Rule 4.
                            pCur.sWordInSegGraph = "未##数";
                            pCur.theWord.nPOS    = -27904; // 'm' * 256: number
                        }
                        else if (curWord.Length > 1)
                        {
                            // Rule 5: drop the trailing separator.
                            pCur.theWord.sWord   = curWord.Substring(0, curWord.Length - 1);
                            pCur.sWordInSegGraph = "未##数";
                            pCur.theWord.nPOS    = -27904; // 'm' * 256: number
                        }
                    }
                }

                // Re-derive pNext from pCur. After a merge that consumed the last node,
                // pCur.next is null and the former "pNext = pNext.next" dereferenced null.
                pCur  = pCur.next;
                pNext = (pCur != null) ? pCur.next : null;
            }
        }