Пример #1
0
        //====================================================================
        // Generate Word according the segmentation route
        //====================================================================
        /// <summary>
        /// Post-processes the segmentation route held in <paramref name="linkedArray"/> and converts it
        /// into an array of words, recording each word in <paramref name="m_graphOptimum"/> on the way.
        /// </summary>
        /// <param name="uniPath">Not read by this method.</param>
        /// <param name="linkedArray">Linked list of words; passed by reference to each clean-up pass.</param>
        /// <param name="m_graphOptimum">Graph that receives one element per word at (row, col) of its node.</param>
        /// <returns>
        /// <c>null</c> when the list is empty on entry; otherwise an array of
        /// <c>linkedArray.Count</c> entries filled in list order.
        /// </returns>
        private static WordResult[] GenerateWord(int[] uniPath, WordLinkedArray linkedArray, RowFirstDynamicArray <ChainContent> m_graphOptimum)
        {
            if (linkedArray.Count == 0)
            {
                return null;
            }

            // Pass 1: merge all separate consecutive numbers into one number.
            MergeContinueNumIntoOne(ref linkedArray);

            // Pass 2: handle the delimiter "--".
            ChangeDelimiterPOS(ref linkedArray);

            // Pass 3: if the previous word is a number and the current word starts with a dash and is
            // longer than one character, split the dash off. Example: "3 / -4 / 月" -> "3 / - / 4 / 月".
            SplitMiddleSlashFromDigitalWords(ref linkedArray);

            // Pass 4: date/time handling.
            //   1. A number followed by one of "月 日 时 分 秒 月份" is merged and tagged as time.
            //   2. A number usable as a year followed by "年" is merged and tagged as time, otherwise number.
            //   3. If the last character is "点", the number is treated as time.
            //   4. If the last character is not one of "∶·./" (or half-width '.' '/'), it is a number.
            //   5. If the last character is one of those and the length is > 1, drop it. Example: "1."
            CheckDateElements(ref linkedArray);

            // Emit the result: one WordResult per node, mirrored into the graph.
            WordResult[] words = new WordResult[linkedArray.Count];
            int slot = 0;

            for (WordNode node = linkedArray.first; node != null; node = node.next, slot++)
            {
                WordResult word = new WordResult();
                word.sWord  = node.theWord.sWord;
                word.nPOS   = node.theWord.nPOS;
                word.dValue = node.theWord.dValue;
                words[slot] = word;

                ChainContent content = new ChainContent(word.dValue, word.nPOS, node.sWordInSegGraph);
                m_graphOptimum.SetElement(node.row, node.col, content);
            }

            return words;
        }
Пример #2
0
        //====================================================================
        //如果前一个词是数字,当前词以“-”或“-”开始,并且不止这一个字符,
        //那么将此“-”符号从当前词中分离出来。
        //例如 “3 / -4 / 月”需要拆分成“3 / - / 4 / 月”
        //====================================================================
        /// <summary>
        /// Walks the word list and, wherever a number/time word is followed by a numeric word that begins
        /// with a dash and is longer than one character, splits that dash into its own punctuation node.
        /// Example: "3 / -4 / 月" becomes "3 / - / 4 / 月".
        /// </summary>
        /// <param name="linkedArray">
        /// Word list to modify in place; its <c>Count</c> is incremented for every node inserted.
        /// </param>
        private static void SplitMiddleSlashFromDigitalWords(ref WordLinkedArray linkedArray)
        {
            if (linkedArray.Count < 2)
            {
                return;
            }

            // Leading characters accepted as a dash: ASCII '-' and full-width '-' (U+FF0D).
            // The full-width form is written as an escape so it cannot be lost when the source file
            // is re-encoded. NOTE(review): U+FF0D is inferred from the "- or -" description of this
            // method; confirm against the upstream source.
            const string leadingDashes = "-\uFF0D";

            WordNode pCur = linkedArray.first.next;
            WordNode pPre = linkedArray.first;

            while (pCur != null)
            {
                string sWord = pCur.theWord.sWord;
                int nPrePOS = Math.Abs(pPre.theWord.nPOS);

                // 27904 = 'm' * 256, 29696 = 't' * 256.
                // The length is checked before sWord[0] is read so an empty word cannot throw.
                if ((nPrePOS == 27904 || nPrePOS == 29696) &&
                    sWord.Length > 1 &&
                    leadingDashes.IndexOf(sWord[0]) >= 0 &&
                    (Utility.IsAllNum(sWord) || Utility.IsAllChineseNum(sWord)))
                {
                    // New node carries everything after the dash and is tagged 'm' * 256.
                    WordNode newNode = new WordNode();
                    newNode.row             = pCur.row + 1;
                    newNode.col             = pCur.col; // must be read before pCur.col is overwritten below
                    newNode.sWordInSegGraph = sWord.Substring(1);
                    WordResult theWord = new WordResult();
                    theWord.sWord   = newNode.sWordInSegGraph;
                    theWord.nPOS    = 27904;
                    theWord.dValue  = pCur.theWord.dValue;
                    newNode.theWord = theWord;

                    // Current node shrinks to the dash alone and is re-tagged as 'w' * 256.
                    pCur.col            = pCur.row + 1;
                    pCur.theWord.sWord  = sWord.Substring(0, 1);
                    pCur.theWord.nPOS   = 30464; // 'w' * 256
                    pCur.theWord.dValue = 0;

                    // Link the new node right after the current one.
                    newNode.next = pCur.next;
                    pCur.next    = newNode;

                    linkedArray.Count++;
                }

                pCur = pCur.next;
                pPre = pPre.next;
            }
        }
      //Unknown word recognition
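      //pWordSegResult: the word segmentation result
      //graphOptimum: the optimized segmentation graph
      //atomSegment: the atoms of the sentence, used to map word positions to graph indices
      //dictCore: the core dictionary handed to the role tagger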
      /// <summary>
      /// Runs role tagging over the segmentation result and, for every unknown word the tagger reports,
      /// writes an element into <paramref name="graphOptimum"/> when the weight currently stored for
      /// that span is greater than the word's value in <c>m_roleTag.m_dWordsPossibility</c>.
      /// </summary>
      /// <param name="pWordSegResult">Word segmentation result handed to the role tagger.</param>
      /// <param name="graphOptimum">Optimized segmentation graph that is read and updated.</param>
      /// <param name="atomSegment">Atoms of the sentence; their <c>sWord</c> lengths are summed to locate each word.</param>
      /// <param name="dictCore">Core dictionary handed to the role tagger.</param>
      /// <returns>Always <c>true</c>.</returns>
      public bool Recognition(WordResult[] pWordSegResult, RowFirstDynamicArray<ChainContent> graphOptimum,
         List<AtomNode> atomSegment, WordDictionary dictCore)
      {
         ChainItem<ChainContent> item;
         int nStartPos = 0, j = 0, nAtomStart, nAtomEnd;
         double dValue;

         // Tag the segmentation with unknown-recognition roles using the core dictionary and m_dict.
         m_roleTag.POSTagging(pWordSegResult, dictCore, m_dict);

         // nStartPos and j are deliberately kept across iterations: the atom list is scanned once.
         for (int i = 0; i < m_roleTag.m_nUnknownWordsCount; i++)
         {
            // Skip atoms until the accumulated length reaches m_nUnknownWords[i, 0]
            // (presumably the start offset of the word - confirm against the tagger).
            while (j < atomSegment.Count && nStartPos < m_roleTag.m_nUnknownWords[i, 0])
            {
               nStartPos += atomSegment[j++].sWord.Length;
            }
            nAtomStart = j;

            // Continue until the accumulated length reaches m_nUnknownWords[i, 1].
            while (j < atomSegment.Count && nStartPos < m_roleTag.m_nUnknownWords[i, 1])
            {
               nStartPos += atomSegment[j++].sWord.Length;
            }
            nAtomEnd = j;

            if (nAtomStart >= nAtomEnd)
            {
               continue; // The word covers no atom; nothing to record.
            }

            item = graphOptimum.GetElement(nAtomStart, nAtomEnd);
            if (item != null)
            {
               dValue = item.Content.eWeight;
            }
            else
            {
               // No element stored for this span: treat its weight as infinite.
               dValue = Predefine.INFINITE_VALUE;
            }

            if (dValue > m_roleTag.m_dWordsPossibility[i])
            {
               // Replace the element with the smaller value.
               graphOptimum.SetElement(nAtomStart, nAtomEnd, new ChainContent(m_roleTag.m_dWordsPossibility[i], m_nPOS, m_sUnknownFlags));
            }
         }
         return true;
      }
Пример #4
0
        /// <summary>
        /// Copies words from <paramref name="pWordItems"/>, starting at <paramref name="nIndex"/>, into the
        /// tagging buffers (m_sWords, m_nWordPosition, m_nTags, m_dFrequency) together with their candidate
        /// tags and weights. Loading stops after the first word that has exactly one candidate tag which
        /// is not <c>Predefine.CT_SENTENCE_BEGIN</c>, or when the input or the buffer is exhausted.
        /// </summary>
        /// <param name="pWordItems">Segmented words to read.</param>
        /// <param name="nIndex">Index in <paramref name="pWordItems"/> of the first word to read.</param>
        /// <param name="dictCore">Core dictionary, queried for the POS set and frequencies of each word.</param>
        /// <param name="dictUnknown">Unknown-word dictionary; only consulted when m_tagType is not TT_NORMAL.</param>
        /// <returns>
        /// -1 when the end of <paramref name="pWordItems"/> was reached; otherwise the index of the last
        /// word read plus one. m_nCurLength is set to the number of buffer slots used.
        /// </returns>
        private int GetFrom(WordResult[] pWordItems, int nIndex, WordDictionary dictCore, WordDictionary dictUnknown)
        {
            WordInfo info;
            int nFreq, j, nPOSCount;
            int nWordsIndex = nIndex;
            bool bSplit = false; // True while the tail of a split word is still waiting to be stored
            int i;
            string sCurWord; // Current word

            for (i = 1; i < Predefine.MAX_WORDS_PER_SENTENCE && nWordsIndex < pWordItems.Length; i++)
            {
                if (m_tagType == TAG_TYPE.TT_NORMAL || !dictUnknown.IsExist(pWordItems[nWordsIndex].sWord, 44))
                {
                    m_sWords[i] = pWordItems[nWordsIndex].sWord; // Store current word
                }
                else if (!bSplit)
                {
                    // Word is split for transliteration recognition: first store its leading character...
                    m_sWords[i] = pWordItems[nWordsIndex].sWord.Substring(0, 1);
                    bSplit = true;
                }
                else
                {
                    // ...then, on the next iteration, the remainder.
                    m_sWords[i] = pWordItems[nWordsIndex].sWord.Substring(1);
                    bSplit = false;
                }

                // Record the position of current word and move the start position to its end
                m_nWordPosition[i + 1] = m_nWordPosition[i] + m_sWords[i].Length;
                m_nStartPos = m_nWordPosition[i + 1];

                if (m_tagType != TAG_TYPE.TT_NORMAL)
                {
                    // Get the POSs from the unknown recognition dictionary
                    sCurWord = m_sWords[i];
                    if (m_tagType == TAG_TYPE.TT_TRANS_PERSON && i > 0 && m_sWords[i - 1] != null &&
                       Utility.charType(m_sWords[i - 1].ToCharArray()[0]) == Predefine.CT_CHINESE)
                    {
                        // NOTE(review): as written both assignments are no-ops (sCurWord already equals
                        // m_sWords[i]); presumably one side of each pair was a full-width character
                        // before the file was re-encoded - confirm against the upstream source.
                        if (m_sWords[i] == ".")
                            sCurWord = ".";
                        else if (m_sWords[i] == "-")
                            sCurWord = "-";
                    }

                    info = dictUnknown.GetWordInfo(sCurWord);
                    if (info != null)
                    {
                        nPOSCount = info.Count + 1;
                        for (j = 0; j < info.Count; j++)
                        {
                            // Get the POS set of sCurWord in the unknown dictionary
                            m_nTags[i, j] = info.POSs[j];
                            m_dFrequency[i, j] = -Math.Log((double)(1 + info.Frequencies[j])) +
                               Math.Log((double)(m_context.GetFrequency(0, info.POSs[j]) + nPOSCount));
                        }
                    }
                    else
                    {
                        nPOSCount = 1;
                        j = 0;
                    }

                    // Get the POS set of sCurWord in the core dictionary.
                    // We ignore the POS in the core dictionary and recognize them as other (0).
                    // We add their frequency to get the possibility as POS 0.
                    if (string.Compare(m_sWords[i], "始##始") == 0)
                    {
                        // Sentence-begin marker
                        m_nTags[i, j] = 100;
                        m_dFrequency[i, j] = 0;
                        j++;
                    }
                    else if (string.Compare(m_sWords[i], "末##末") == 0)
                    {
                        // Sentence-end marker
                        m_nTags[i, j] = 101;
                        m_dFrequency[i, j] = 0;
                        j++;
                    }
                    else
                    {
                        info = dictCore.GetWordInfo(m_sWords[i]);
                        nFreq = 0;
                        if (info != null)
                        {
                            for (int k = 0; k < info.Count; k++)
                            {
                                nFreq += info.Frequencies[k];
                            }
                            if (info.Count > 0)
                            {
                                m_nTags[i, j] = 0;
                                m_dFrequency[i, j] = -Math.Log((double)(1 + nFreq)) + Math.Log((double)(m_context.GetFrequency(0, 0) + nPOSCount));
                                j++;
                            }
                        }
                    }
                }
                else
                {
                    // Normal POS tagging
                    j = 0;
                    if (pWordItems[nWordsIndex].nPOS > 0)
                    {
                        // The word has only one POS value; its POS and frequency are recorded in the item.
                        m_nTags[i, j] = pWordItems[nWordsIndex].nPOS;
                        m_dFrequency[i, j] = -Math.Log(pWordItems[nWordsIndex].dValue) + Math.Log((double)(m_context.GetFrequency(0, m_nTags[i, j]) + 1));
                        if (m_dFrequency[i, j] < 0)
                        {
                            // Values below 0 are not permitted
                            m_dFrequency[i, j] = 0;
                        }
                        j++;
                    }
                    else
                    {
                        // The word may have multiple POSs; retrieve them from the core dictionary.
                        if (pWordItems[nWordsIndex].nPOS < 0)
                        {
                            // A negative POS is stored negated as the first candidate, with the item's value.
                            m_nTags[i, j] = -pWordItems[nWordsIndex].nPOS;
                            m_dFrequency[i, j++] = pWordItems[nWordsIndex].dValue;
                        }

                        info = dictCore.GetWordInfo(m_sWords[i]);
                        if (info != null)
                        {
                            nPOSCount = info.Count;
                            // j is not reset here: when a candidate was stored above, filling resumes at slot 1.
                            for (; j < info.Count; j++)
                            {
                                m_nTags[i, j] = info.POSs[j];
                                m_dFrequency[i, j] = -Math.Log(1 + info.Frequencies[j]) + Math.Log(m_context.GetFrequency(0, m_nTags[i, j]) + nPOSCount);
                            }
                        }
                    }
                }

                if (j == 0)
                {
                    // We do not know the POS, so we have to guess it from lexical knowledge
                    GuessPOS(i, out j);
                }

                m_nTags[i, j] = -1; // Set the ending POS

                // Inspect the word's only candidate (slot 0). Slot j holds the -1 terminator written
                // just above, so testing it would make the sentence-begin check always succeed.
                if (j == 1 && m_nTags[i, 0] != Predefine.CT_SENTENCE_BEGIN)
                {
                    // No ambiguity, so we can break from the loop
                    i++;
                    m_sWords[i] = null;
                    break;
                }

                if (!bSplit)
                {
                    nWordsIndex++;
                }
            }

            bool bReachedEnd = nWordsIndex == pWordItems.Length;

            if (m_nTags[i - 1, 1] != -1)
            {
                // Set end for words like "张/华/平"
                if (m_tagType != TAG_TYPE.TT_NORMAL)
                    m_nTags[i, 0] = 101;
                else
                    m_nTags[i, 0] = 1;

                m_dFrequency[i, 0] = 0;
                m_sWords[i] = null; // Set virtual ending
                m_nTags[i++, 1] = -1;
            }

            m_nCurLength = i; // The current word count

            // -1 signals that the end was reached; otherwise report the next start position
            return bReachedEnd ? -1 : nWordsIndex + 1;
        }
Пример #5
0
 //POS tagging with Hidden Markov Model
 /// <summary>
 /// Tags <paramref name="pWordItems"/> in chunks: each pass loads a run of words with GetFrom,
 /// selects the best tags with GetBestPOS and then post-processes according to m_tagType.
 /// </summary>
 /// <param name="pWordItems">Words to tag; in TT_NORMAL mode their nPOS and dValue are updated in place.</param>
 /// <param name="dictCore">Core dictionary.</param>
 /// <param name="dictUnknown">Unknown-word recognition dictionary.</param>
 /// <returns>Always <c>true</c>.</returns>
 public bool POSTagging(WordResult[] pWordItems, WordDictionary dictCore, WordDictionary dictUnknown)
 {
     int i = 0;
     Reset(false);

     // GetFrom returns -1 once the input is exhausted, which ends the loop via the i > -1 test.
     while (i > -1 && i < pWordItems.Length && pWordItems[i].sWord != null)
     {
         int nStartPos = i; // Start position of this chunk
         i = GetFrom(pWordItems, nStartPos, dictCore, dictUnknown);
         GetBestPOS();

         switch (m_tagType)
         {
             case TAG_TYPE.TT_NORMAL:
                 // Normal POS tagging. The bound is tested before m_nBestTag[j] is read so the
                 // array is never indexed at or beyond m_nCurLength.
                 for (int j = 1; j < m_nCurLength && m_nBestTag[j] != -1; j++)
                 {
                     // Buffer slot j corresponds to input word nStartPos + j - 1 (slots start at 1).
                     int nItem = j + nStartPos - 1;

                     // Store the best POS tagging
                     pWordItems[nItem].nPOS = m_nBestTag[j];

                     if (pWordItems[nItem].dValue > 0 && dictCore.IsExist(pWordItems[nItem].sWord, -1))
                     {
                         // The word exists: replace its value with its frequency under the chosen POS
                         pWordItems[nItem].dValue = dictCore.GetFrequency(pWordItems[nItem].sWord, m_nBestTag[j]);
                     }
                 }
                 break;
             case TAG_TYPE.TT_PERSON:
                 // Person recognition
                 PersonRecognize(dictUnknown);
                 break;
             case TAG_TYPE.TT_PLACE:
             // Place name recognition shares the handler below
             case TAG_TYPE.TT_TRANS_PERSON:
                 // Transliteration person
                 PlaceRecognize(dictCore, dictUnknown);
                 break;
             default:
                 break;
         }
         Reset();
     }
     return true;
 }
Пример #6
0
        //====================================================================
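        //If the previous word is a number and the current word starts with a half-width or
        //full-width dash and is longer than that one character, split the dash off from the current word.
        //Example: "3 / -4 / 月" must be split into "3 / - / 4 / 月"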
        //====================================================================
        /// <summary>
        /// Walks the word list and, wherever a number/time word is followed by a numeric word that begins
        /// with a dash and is longer than one character, splits that dash into its own punctuation node.
        /// </summary>
        /// <param name="linkedArray">
        /// Word list to modify in place; its <c>Count</c> is incremented for every node inserted.
        /// </param>
        private static void SplitMiddleSlashFromDigitalWords(ref WordLinkedArray linkedArray)
        {
            if (linkedArray.Count < 2)
            {
                return;
            }

            // Leading characters accepted as a dash: ASCII '-' and full-width '-' (U+FF0D).
            // The previous literal held undecodable replacement characters after the ASCII dash, so
            // only '-' could ever match real text. The escape keeps the literal encoding-independent.
            // NOTE(review): U+FF0D is the presumed original character - confirm against upstream.
            const string leadingDashes = "-\uFF0D";

            WordNode pCur = linkedArray.first.next;
            WordNode pPre = linkedArray.first;

            while (pCur != null)
            {
                string sWord = pCur.theWord.sWord;
                int nPrePOS = Math.Abs(pPre.theWord.nPOS);

                // 27904 = 'm' * 256, 29696 = 't' * 256.
                // The length is checked before sWord[0] is read so an empty word cannot throw.
                if ((nPrePOS == 27904 || nPrePOS == 29696) &&
                    sWord.Length > 1 &&
                    leadingDashes.IndexOf(sWord[0]) >= 0 &&
                    (Utility.IsAllNum(sWord) || Utility.IsAllChineseNum(sWord)))
                {
                    // New node carries everything after the dash and is tagged 'm' * 256.
                    WordNode newNode = new WordNode();
                    newNode.row = pCur.row + 1;
                    newNode.col = pCur.col; // must be read before pCur.col is overwritten below
                    newNode.sWordInSegGraph = sWord.Substring(1);
                    WordResult theWord = new WordResult();
                    theWord.sWord = newNode.sWordInSegGraph;
                    theWord.nPOS = 27904;
                    theWord.dValue = pCur.theWord.dValue;
                    newNode.theWord = theWord;

                    // Current node shrinks to the dash alone and is re-tagged as 'w' * 256.
                    pCur.col = pCur.row + 1;
                    pCur.theWord.sWord = sWord.Substring(0, 1);
                    pCur.theWord.nPOS = 30464; // 'w' * 256
                    pCur.theWord.dValue = 0;

                    // Link the new node right after the current one.
                    newNode.next = pCur.next;
                    pCur.next = newNode;

                    linkedArray.Count++;
                }

                pCur = pCur.next;
                pPre = pPre.next;
            }
        }
Пример #7
0
        //====================================================================
        // Generate Word according the segmentation route
        //====================================================================
        /// <summary>
        /// Post-processes the segmentation route held in <paramref name="linkedArray"/> and converts it
        /// into an array of words, recording each word in <paramref name="m_graphOptimum"/> on the way.
        /// </summary>
        /// <param name="uniPath">Not read by this method.</param>
        /// <param name="linkedArray">Linked list of words; passed by reference to each clean-up pass.</param>
        /// <param name="m_graphOptimum">Graph that receives one element per word at (row, col) of its node.</param>
        /// <returns>
        /// <c>null</c> when the list is empty on entry; otherwise an array of
        /// <c>linkedArray.Count</c> entries filled in list order.
        /// </returns>
        private static WordResult[] GenerateWord(int[] uniPath, WordLinkedArray linkedArray, RowFirstDynamicArray<ChainContent> m_graphOptimum)
        {
            if (linkedArray.Count == 0)
            {
                return null;
            }

            // Merge all separate consecutive numbers into one number.
            MergeContinueNumIntoOne(ref linkedArray);

            // Adjust the POS of delimiter words.
            ChangeDelimiterPOS(ref linkedArray);

            // Split a leading dash off a numeric word that follows a number.
            SplitMiddleSlashFromDigitalWords(ref linkedArray);

            // Merge/tag date and time elements.
            CheckDateElements(ref linkedArray);

            // Emit the result: one WordResult per node, mirrored into the graph.
            WordResult[] words = new WordResult[linkedArray.Count];
            int slot = 0;

            for (WordNode node = linkedArray.first; node != null; node = node.next, slot++)
            {
                WordResult word = new WordResult();
                word.sWord = node.theWord.sWord;
                word.nPOS = node.theWord.nPOS;
                word.dValue = node.theWord.dValue;
                word.sLocation = node.row; // The node's row is kept as the word's location
                words[slot] = word;

                ChainContent content = new ChainContent(word.dValue, word.nPOS, node.sWordInSegGraph);
                m_graphOptimum.SetElement(node.row, node.col, content);
            }

            return words;
        }
Пример #8
0
      //====================================================================
      // Generate Word according the segmentation route
      //====================================================================
      /// <summary>
      /// Post-processes the segmentation route held in <paramref name="linkedArray"/> and converts it
      /// into an array of words, recording each word in <paramref name="m_graphOptimum"/> on the way.
      /// </summary>
      /// <param name="uniPath">Not read by this method.</param>
      /// <param name="linkedArray">Linked list of words; passed by reference to each clean-up pass.</param>
      /// <param name="m_graphOptimum">Graph that receives one element per word at (row, col) of its node.</param>
      /// <returns>
      /// <c>null</c> when the list is empty on entry; otherwise an array of
      /// <c>linkedArray.Count</c> entries filled in list order.
      /// </returns>
      private static WordResult[] GenerateWord(int[] uniPath, WordLinkedArray linkedArray, RowFirstDynamicArray<ChainContent> m_graphOptimum)
      {
         if (linkedArray.Count == 0)
         {
            return null;
         }

         // Pass 1: merge all separate consecutive numbers into one number.
         MergeContinueNumIntoOne(ref linkedArray);

         // Pass 2: handle the delimiter "--".
         ChangeDelimiterPOS(ref linkedArray);

         // Pass 3: if the previous word is a number and the current word starts with a dash and is
         // longer than one character, split the dash off. Example: "3 / -4 / 月" -> "3 / - / 4 / 月".
         SplitMiddleSlashFromDigitalWords(ref linkedArray);

         // Pass 4: date/time handling.
         //   1. A number followed by one of "月 日 时 分 秒 月份" is merged and tagged as time.
         //   2. A number usable as a year followed by "年" is merged and tagged as time, otherwise number.
         //   3. If the last character is "点", the number is treated as time.
         //   4. If the last character is not one of "∶·./" (or half-width '.' '/'), it is a number.
         //   5. If the last character is one of those and the length is > 1, drop it. Example: "1."
         CheckDateElements(ref linkedArray);

         // Emit the result: one WordResult per node, mirrored into the graph.
         WordResult[] words = new WordResult[linkedArray.Count];
         int slot = 0;

         for (WordNode node = linkedArray.first; node != null; node = node.next, slot++)
         {
            WordResult word = new WordResult();
            word.sWord = node.theWord.sWord;
            word.nPOS = node.theWord.nPOS;
            word.dValue = node.theWord.dValue;
            words[slot] = word;

            ChainContent content = new ChainContent(word.dValue, word.nPOS, node.sWordInSegGraph);
            m_graphOptimum.SetElement(node.row, node.col, content);
         }

         return words;
      }
 /// <summary>
 /// Concatenates the <c>sWord</c> of every entry of <paramref name="wr"/> except the first and the last.
 /// </summary>
 /// <param name="wr">Word results; the first and last entries are skipped (presumably sentence markers).</param>
 /// <returns>The joined words, or an empty string when there are fewer than three entries.</returns>
 private static string PrintResultStringOnly(WordResult[] wr)
 {
     // A StringBuilder avoids re-copying the accumulated text on every iteration.
     System.Text.StringBuilder sb = new System.Text.StringBuilder();
     for (int j = 1; j < wr.Length - 1; j++)
     {
         sb.Append(wr[j].sWord); // A null word appends nothing, as "{0}" formatting did
     }
     return sb.ToString();
 }
 /// <summary>
 /// Renders every entry of <paramref name="wr"/> except the first and the last as "word/POS",
 /// concatenated without separators. The POS text comes from <c>Utility.GetPOSString</c> at level one.
 /// </summary>
 /// <param name="wr">Word results; the first and last entries are skipped (presumably sentence markers).</param>
 /// <returns>The rendered text, or an empty string when there are fewer than three entries.</returns>
 private static string PrintResult(WordResult[] wr)
 {
     // A StringBuilder avoids re-copying the accumulated text on every iteration.
     System.Text.StringBuilder sb = new System.Text.StringBuilder();
     for (int j = 1; j < wr.Length - 1; j++)
     {
         sb.AppendFormat(@"{0}/{1}", wr[j].sWord, Utility.GetPOSString(wr[j].nPOS, nPosLevel.LevelOne));
     }
     return sb.ToString();
 }