Beispiel #1
0
        /// <summary>
        /// POS tagging with Hidden Markov Model. Processes <paramref name="pWordItems"/> one segment at a
        /// time: GetFrom loads a segment, GetBestPOS is run on it, and the outcome is handled according
        /// to m_tagType.
        /// </summary>
        /// <param name="pWordItems">Word items to tag; scanning stops at the first item whose sWord is null.</param>
        /// <param name="dictCore">Core dictionary.</param>
        /// <param name="dictUnknown">Unknown-word recognition dictionary.</param>
        /// <returns>Always true.</returns>
        public bool POSTagging(WordResult[] pWordItems, WordDictionary dictCore, WordDictionary dictUnknown)
        {
            int i = 0;

            Reset(false);
            // GetFrom returns -1 once the input is exhausted, which ends the loop.
            while (i > -1 && i < pWordItems.Length && pWordItems[i].sWord != null)
            {
                int nStartPos = i; //Start position of the current segment
                i = GetFrom(pWordItems, nStartPos, dictCore, dictUnknown);
                GetBestPOS();
                switch (m_tagType)
                {
                case TAG_TYPE.TT_NORMAL:
                    //Normal POS tagging: write the chosen tags back into the items.
                    //The bound is tested before m_nBestTag[j] is read so the array is never
                    //indexed at or beyond m_nCurLength.
                    for (int j = 1; j < m_nCurLength && m_nBestTag[j] != -1; j++)
                    {
                        //Buffer slots are 1-based, the items of this segment start at nStartPos.
                        int nItem = j + nStartPos - 1;

                        //Store the best POS tagging
                        pWordItems[nItem].nPOS = m_nBestTag[j];
                        //Let 。be 0
                        if (pWordItems[nItem].dValue > 0 && dictCore.IsExist(pWordItems[nItem].sWord, -1))
                        {
                            //The word exists: replace its value with the frequency for the chosen POS
                            pWordItems[nItem].dValue = dictCore.GetFrequency(pWordItems[nItem].sWord, m_nBestTag[j]);
                        }
                    }
                    break;

                case TAG_TYPE.TT_PERSON:
                    //Person recognition
                    PersonRecognize(dictUnknown);
                    break;

                case TAG_TYPE.TT_PLACE:
                //Place name recognition
                case TAG_TYPE.TT_TRANS_PERSON:
                    //Transliterated person names go through the same recognizer as place names
                    PlaceRecognize(dictCore, dictUnknown);
                    break;

                default:
                    break;
                }
                Reset();
            }
            return true;
        }
Beispiel #2
0
        /// <summary>
        /// Loads words from <paramref name="pWordItems"/>, beginning at <paramref name="nIndex"/>, into the
        /// tagging buffers: m_sWords (word text, slots start at 1), m_nWordPosition (running character
        /// offsets), m_nTags (candidate tags per word, terminated by -1) and m_dFrequency (one cost per
        /// candidate). Loading stops when a word ends up with exactly one candidate (see the note at
        /// that check), when the input is exhausted, or when Predefine.MAX_WORDS_PER_SENTENCE slots
        /// have been used. Also updates m_nStartPos and m_nCurLength.
        /// </summary>
        /// <param name="pWordItems">Source words; in TT_NORMAL mode their nPOS and dValue seed the candidate list.</param>
        /// <param name="nIndex">Index into <paramref name="pWordItems"/> of the first word to load.</param>
        /// <param name="dictCore">Core dictionary, queried through GetWordInfo.</param>
        /// <param name="dictUnknown">Unknown-word dictionary; only consulted when m_tagType is not TT_NORMAL.</param>
        /// <returns>
        /// -1 when the word index reached pWordItems.Length; otherwise the word index at which loading
        /// stopped, plus one.
        /// </returns>
        private int GetFrom(WordResult[] pWordItems, int nIndex, WordDictionary dictCore, WordDictionary dictUnknown)
        {
            int nWordsIndex = nIndex;
            bool bSplit = false; //Need to split in Transliteration recognition
            int i;

            for (i = 1; i < Predefine.MAX_WORDS_PER_SENTENCE && nWordsIndex < pWordItems.Length; i++)
            {
                int j;
                int nPOSCount;

                if (m_tagType == TAG_TYPE.TT_NORMAL || !dictUnknown.IsExist(pWordItems[nWordsIndex].sWord, 44))
                {
                    m_sWords[i] = pWordItems[nWordsIndex].sWord; //store current word
                }
                else if (!bSplit)
                {
                    //First pass over a word found under handle 44: take only its first character.
                    //bSplit keeps nWordsIndex from advancing so the rest is taken next iteration.
                    m_sWords[i] = pWordItems[nWordsIndex].sWord.Substring(0, 1);
                    bSplit = true;
                }
                else
                {
                    //Second pass over the same word: take everything after the first character.
                    m_sWords[i] = pWordItems[nWordsIndex].sWord.Substring(1);
                    bSplit = false;
                }

                //Record the position of current word and move the start position to its end
                m_nWordPosition[i + 1] = m_nWordPosition[i] + m_sWords[i].Length;
                m_nStartPos = m_nWordPosition[i + 1];

                if (m_tagType != TAG_TYPE.TT_NORMAL)
                {
                    //Get the POSs from the unknown recognition dictionary
                    string sCurWord = m_sWords[i];
                    if (m_tagType == TAG_TYPE.TT_TRANS_PERSON && i > 0 && m_sWords[i - 1] != null &&
                        Utility.charType(m_sWords[i - 1][0]) == Predefine.CT_CHINESE)
                    {
                        // NOTE(review): as written both assignments are no-ops (sCurWord already equals
                        // m_sWords[i]); possibly the compared literals were meant to be full-width
                        // punctuation - confirm against the upstream source.
                        if (m_sWords[i] == ".")
                        {
                            sCurWord = ".";
                        }
                        else if (m_sWords[i] == "-")
                        {
                            sCurWord = "-";
                        }
                    }

                    WordInfo unknownInfo = dictUnknown.GetWordInfo(sCurWord);
                    if (unknownInfo != null)
                    {
                        nPOSCount = unknownInfo.Count + 1;
                        for (j = 0; j < unknownInfo.Count; j++)
                        {
                            //Candidate from the unknown dictionary:
                            //cost = -ln(1 + frequency) + ln(context frequency + nPOSCount)
                            m_nTags[i, j] = unknownInfo.POSs[j];
                            m_dFrequency[i, j] = -Math.Log((double)(1 + unknownInfo.Frequencies[j])) +
                                                 Math.Log((double)(m_context.GetFrequency(0, unknownInfo.POSs[j]) + nPOSCount));
                        }
                    }
                    else
                    {
                        nPOSCount = 1;
                        j = 0;
                    }

                    //The two marker tokens get fixed tags (100 / 101) at zero cost. They are matched
                    //ordinally so the result does not depend on the current culture.
                    if (string.Equals(m_sWords[i], "始##始", StringComparison.Ordinal))
                    {
                        m_nTags[i, j] = 100;
                        m_dFrequency[i, j] = 0;
                        j++;
                    }
                    else if (string.Equals(m_sWords[i], "末##末", StringComparison.Ordinal))
                    {
                        m_nTags[i, j] = 101;
                        m_dFrequency[i, j] = 0;
                        j++;
                    }
                    else
                    {
                        //Core-dictionary POSs are ignored individually: their frequencies are summed
                        //and contribute a single candidate with tag 0 ("other").
                        WordInfo coreInfo = dictCore.GetWordInfo(m_sWords[i]);
                        if (coreInfo != null)
                        {
                            int nFreq = 0;
                            for (int k = 0; k < coreInfo.Count; k++)
                            {
                                nFreq += coreInfo.Frequencies[k];
                            }
                            if (coreInfo.Count > 0)
                            {
                                m_nTags[i, j] = 0;
                                m_dFrequency[i, j] = -Math.Log((double)(1 + nFreq)) + Math.Log((double)(m_context.GetFrequency(0, 0) + nPOSCount));
                                j++;
                            }
                        }
                    }
                }
                else
                {
                    //For normal POS tagging
                    j = 0;
                    if (pWordItems[nWordsIndex].nPOS > 0)
                    {
                        //The item already carries a single POS and its value: use them directly.
                        m_nTags[i, j] = pWordItems[nWordsIndex].nPOS;
                        m_dFrequency[i, j] = -Math.Log(pWordItems[nWordsIndex].dValue) + Math.Log((double)(m_context.GetFrequency(0, m_nTags[i, j]) + 1));
                        if (m_dFrequency[i, j] < 0)
                        {
                            //Not permit the value less than 0
                            m_dFrequency[i, j] = 0;
                        }
                        j++;
                    }
                    else
                    {
                        if (pWordItems[nWordsIndex].nPOS < 0)
                        {
                            //A negative nPOS stores a POS with its sign flipped; dValue is taken as its cost.
                            m_nTags[i, j] = -pWordItems[nWordsIndex].nPOS;
                            m_dFrequency[i, j++] = pWordItems[nWordsIndex].dValue;
                        }

                        //Remaining candidates come from the core dictionary.
                        //NOTE(review): j is shared between the slot index and the dictionary index, so
                        //after a negative-nPOS entry the dictionary's entry 0 is skipped - confirm intended.
                        WordInfo coreInfo = dictCore.GetWordInfo(m_sWords[i]);
                        if (coreInfo != null)
                        {
                            nPOSCount = coreInfo.Count;
                            for (; j < coreInfo.Count; j++)
                            {
                                m_nTags[i, j] = coreInfo.POSs[j];
                                m_dFrequency[i, j] = -Math.Log(1 + coreInfo.Frequencies[j]) + Math.Log(m_context.GetFrequency(0, m_nTags[i, j]) + nPOSCount);
                            }
                        }
                    }
                }

                if (j == 0)
                {
                    //We do not know the POS, so we have to guess them according to lexical knowledge
                    GuessPOS(i, out j); //Guess the POS of current word
                }
                m_nTags[i, j] = -1; //Set the ending POS

                //NOTE(review): m_nTags[i, j] was set to -1 on the line above, so the second operand only
                //depends on whether Predefine.CT_SENTENCE_BEGIN equals -1; a different column may have
                //been intended. Left unchanged.
                if (j == 1 && m_nTags[i, j] != Predefine.CT_SENTENCE_BEGIN)
                {
                    //No ambiguity, so we can break from the loop
                    i++;
                    m_sWords[i] = null;
                    break;
                }
                if (!bSplit)
                {
                    nWordsIndex++;
                }
            }

            //nWordsIndex does not change below, so the "reached the end" decision can be made here.
            bool bReachedEnd = nWordsIndex == pWordItems.Length;

            if (m_nTags[i - 1, 1] != -1)
            {
                //The last loaded word is still ambiguous: append a virtual ending slot
                //(set end for words like "张/华/平").
                m_nTags[i, 0] = (m_tagType != TAG_TYPE.TT_NORMAL) ? 101 : 1;
                m_dFrequency[i, 0] = 0;
                m_sWords[i] = null; //Set virtual ending
                m_nTags[i, 1] = -1;
                i++;
            }
            m_nCurLength = i; //The current word count

            return bReachedEnd ? -1 : nWordsIndex + 1;
        }
Beispiel #3
0
 /// <summary>
 /// POS tagging with Hidden Markov Model. Repeatedly loads a segment of
 /// <paramref name="pWordItems"/> with GetFrom, runs GetBestPOS, and then acts on the
 /// result according to m_tagType.
 /// </summary>
 /// <param name="pWordItems">Word items to tag; scanning stops at the first item whose sWord is null.</param>
 /// <param name="dictCore">Core dictionary.</param>
 /// <param name="dictUnknown">Unknown-word recognition dictionary.</param>
 /// <returns>Always true.</returns>
 public bool POSTagging(WordResult[] pWordItems, WordDictionary dictCore, WordDictionary dictUnknown)
 {
     int nNext = 0;

     Reset(false);
     // GetFrom hands back -1 when no input is left, which terminates the loop.
     while (nNext > -1 && nNext < pWordItems.Length && pWordItems[nNext].sWord != null)
     {
         int nStartPos = nNext; //Start position of the current segment
         nNext = GetFrom(pWordItems, nStartPos, dictCore, dictUnknown);
         GetBestPOS();
         switch (m_tagType)
         {
             case TAG_TYPE.TT_NORMAL:
                 //Normal POS tagging: copy the chosen tags back into the items.
                 //j is checked against m_nCurLength first so m_nBestTag is never read
                 //at or past that length.
                 for (int j = 1; j < m_nCurLength && m_nBestTag[j] != -1; j++)
                 {
                     //Buffer slot j (1-based) maps to this item of the segment.
                     int nItem = j + nStartPos - 1;

                     //Store the best POS tagging
                     pWordItems[nItem].nPOS = m_nBestTag[j];
                     //Let 。be 0
                     if (pWordItems[nItem].dValue > 0 && dictCore.IsExist(pWordItems[nItem].sWord, -1))
                     {
                         //The word exists: its value becomes the frequency for the chosen POS
                         pWordItems[nItem].dValue = dictCore.GetFrequency(pWordItems[nItem].sWord, m_nBestTag[j]);
                     }
                 }
                 break;
             case TAG_TYPE.TT_PERSON:
                 //Person recognition
                 PersonRecognize(dictUnknown);
                 break;
             case TAG_TYPE.TT_PLACE:
             //Place name recognition
             case TAG_TYPE.TT_TRANS_PERSON:
                 //Transliterated person names use the same recognizer as place names
                 PlaceRecognize(dictCore, dictUnknown);
                 break;
             default:
                 break;
         }
         Reset();
     }
     return true;
 }
Beispiel #4
0
        /// <summary>
        /// Fills the tagging buffers with words taken from <paramref name="pWordItems"/>, starting at
        /// <paramref name="nIndex"/>. For every loaded word it stores the text in m_sWords (slots start
        /// at 1), the running character offset in m_nWordPosition, and a -1-terminated list of
        /// candidate tags with their costs in m_nTags / m_dFrequency. The loop ends when a word has
        /// exactly one candidate (see the note at that check), when the input runs out, or when
        /// Predefine.MAX_WORDS_PER_SENTENCE slots are used. m_nStartPos and m_nCurLength are updated.
        /// </summary>
        /// <param name="pWordItems">Source words; in TT_NORMAL mode their nPOS and dValue seed the candidate list.</param>
        /// <param name="nIndex">Index into <paramref name="pWordItems"/> of the first word to load.</param>
        /// <param name="dictCore">Core dictionary, queried through GetWordInfo.</param>
        /// <param name="dictUnknown">Unknown-word dictionary; only consulted when m_tagType is not TT_NORMAL.</param>
        /// <returns>
        /// -1 when the word index reached pWordItems.Length; otherwise the word index at which loading
        /// stopped, plus one.
        /// </returns>
        private int GetFrom(WordResult[] pWordItems, int nIndex, WordDictionary dictCore, WordDictionary dictUnknown)
        {
            bool bSplit      = false; //Need to split in Transliteration recognition
            int  nWordsIndex = nIndex;
            int  i;

            for (i = 1; i < Predefine.MAX_WORDS_PER_SENTENCE && nWordsIndex < pWordItems.Length; i++)
            {
                int j;
                int nPOSCount;

                if (m_tagType == TAG_TYPE.TT_NORMAL || !dictUnknown.IsExist(pWordItems[nWordsIndex].sWord, 44))
                {
                    m_sWords[i] = pWordItems[nWordsIndex].sWord; //store current word
                }
                else if (!bSplit)
                {
                    //Word found under handle 44, first pass: keep only its first character.
                    //While bSplit is set nWordsIndex is not advanced, so the remainder follows next.
                    m_sWords[i] = pWordItems[nWordsIndex].sWord.Substring(0, 1);
                    bSplit      = true;
                }
                else
                {
                    //Second pass over the same word: keep everything after the first character.
                    m_sWords[i] = pWordItems[nWordsIndex].sWord.Substring(1);
                    bSplit      = false;
                }

                //Record the position of current word and move the start position to its end
                m_nWordPosition[i + 1] = m_nWordPosition[i] + m_sWords[i].Length;
                m_nStartPos            = m_nWordPosition[i + 1];

                if (m_tagType != TAG_TYPE.TT_NORMAL)
                {
                    //Get the POSs from the unknown recognition dictionary
                    string sCurWord = m_sWords[i];
                    if (m_tagType == TAG_TYPE.TT_TRANS_PERSON && i > 0 && m_sWords[i - 1] != null &&
                        Utility.charType(m_sWords[i - 1][0]) == Predefine.CT_CHINESE)
                    {
                        // NOTE(review): both assignments below leave sCurWord unchanged (it already
                        // equals m_sWords[i]); the compared literals may originally have been
                        // full-width punctuation - confirm against the upstream source.
                        if (m_sWords[i] == ".")
                        {
                            sCurWord = ".";
                        }
                        else if (m_sWords[i] == "-")
                        {
                            sCurWord = "-";
                        }
                    }

                    WordInfo unknownInfo = dictUnknown.GetWordInfo(sCurWord);
                    if (unknownInfo != null)
                    {
                        nPOSCount = unknownInfo.Count + 1;
                        for (j = 0; j < unknownInfo.Count; j++)
                        {
                            //Candidate from the unknown dictionary:
                            //cost = -ln(1 + frequency) + ln(context frequency + nPOSCount)
                            m_nTags[i, j]      = unknownInfo.POSs[j];
                            m_dFrequency[i, j] = -Math.Log((double)(1 + unknownInfo.Frequencies[j])) +
                                                 Math.Log((double)(m_context.GetFrequency(0, unknownInfo.POSs[j]) + nPOSCount));
                        }
                    }
                    else
                    {
                        nPOSCount = 1;
                        j         = 0;
                    }

                    //The two marker tokens receive fixed tags (100 / 101) at zero cost. Ordinal
                    //matching keeps the result independent of the current culture.
                    if (string.Equals(m_sWords[i], "始##始", StringComparison.Ordinal))
                    {
                        m_nTags[i, j]      = 100;
                        m_dFrequency[i, j] = 0;
                        j++;
                    }
                    else if (string.Equals(m_sWords[i], "末##末", StringComparison.Ordinal))
                    {
                        m_nTags[i, j]      = 101;
                        m_dFrequency[i, j] = 0;
                        j++;
                    }
                    else
                    {
                        //The individual core-dictionary POSs are ignored: their frequencies are
                        //summed and added as one candidate with tag 0 ("other").
                        WordInfo coreInfo = dictCore.GetWordInfo(m_sWords[i]);
                        if (coreInfo != null)
                        {
                            int nFreq = 0;
                            for (int k = 0; k < coreInfo.Count; k++)
                            {
                                nFreq += coreInfo.Frequencies[k];
                            }
                            if (coreInfo.Count > 0)
                            {
                                m_nTags[i, j]      = 0;
                                m_dFrequency[i, j] = -Math.Log((double)(1 + nFreq)) + Math.Log((double)(m_context.GetFrequency(0, 0) + nPOSCount));
                                j++;
                            }
                        }
                    }
                }
                else
                {
                    //For normal POS tagging
                    j = 0;
                    if (pWordItems[nWordsIndex].nPOS > 0)
                    {
                        //The item already carries one POS and its value: use them as the only candidate.
                        m_nTags[i, j]      = pWordItems[nWordsIndex].nPOS;
                        m_dFrequency[i, j] = -Math.Log(pWordItems[nWordsIndex].dValue) + Math.Log((double)(m_context.GetFrequency(0, m_nTags[i, j]) + 1));
                        if (m_dFrequency[i, j] < 0)
                        {
                            //Not permit the value less than 0
                            m_dFrequency[i, j] = 0;
                        }
                        j++;
                    }
                    else
                    {
                        if (pWordItems[nWordsIndex].nPOS < 0)
                        {
                            //A negative nPOS holds a POS with flipped sign; dValue is used as its cost.
                            m_nTags[i, j]        = -pWordItems[nWordsIndex].nPOS;
                            m_dFrequency[i, j++] = pWordItems[nWordsIndex].dValue;
                        }

                        //The remaining candidates are read from the core dictionary.
                        //NOTE(review): j indexes both the slot and the dictionary entry, so after a
                        //negative-nPOS entry the dictionary's entry 0 is skipped - confirm intended.
                        WordInfo coreInfo = dictCore.GetWordInfo(m_sWords[i]);
                        if (coreInfo != null)
                        {
                            nPOSCount = coreInfo.Count;
                            for (; j < coreInfo.Count; j++)
                            {
                                m_nTags[i, j]      = coreInfo.POSs[j];
                                m_dFrequency[i, j] = -Math.Log(1 + coreInfo.Frequencies[j]) + Math.Log(m_context.GetFrequency(0, m_nTags[i, j]) + nPOSCount);
                            }
                        }
                    }
                }

                if (j == 0)
                {
                    //We do not know the POS, so we have to guess them according to lexical knowledge
                    GuessPOS(i, out j); //Guess the POS of current word
                }
                m_nTags[i, j] = -1;     //Set the ending POS

                //NOTE(review): m_nTags[i, j] has just been set to -1, so the second operand depends only
                //on whether Predefine.CT_SENTENCE_BEGIN equals -1; another column may have been
                //intended. Left unchanged.
                if (j == 1 && m_nTags[i, j] != Predefine.CT_SENTENCE_BEGIN)
                {
                    //No ambiguity, so we can break from the loop
                    i++;
                    m_sWords[i] = null;
                    break;
                }
                if (!bSplit)
                {
                    nWordsIndex++;
                }
            }

            //nWordsIndex is not modified below, so the end-of-input decision is fixed here.
            bool bReachedEnd = nWordsIndex == pWordItems.Length;

            if (m_nTags[i - 1, 1] != -1)
            {
                //The last loaded word is still ambiguous: append a virtual ending slot
                //(set end for words like "张/华/平").
                m_nTags[i, 0]      = (m_tagType != TAG_TYPE.TT_NORMAL) ? 101 : 1;
                m_dFrequency[i, 0] = 0;
                m_sWords[i]        = null; //Set virtual ending
                m_nTags[i, 1]      = -1;
                i++;
            }
            m_nCurLength = i; //The current word count

            if (bReachedEnd)
            {
                return -1; //Reaching ending
            }
            return nWordsIndex + 1; //Next start position
        }