/// <summary>
/// POS tagging with a Hidden Markov Model. Walks <paramref name="pWordItems"/> one
/// segment at a time: each segment is loaded with <c>GetFrom</c>, decoded with
/// <c>GetBestPOS</c>, and then handled according to <c>m_tagType</c>.
/// </summary>
/// <param name="pWordItems">
/// Word items to process. Scanning stops when the next segment would start past the
/// end of the array or at an item whose <c>sWord</c> is null.
/// </param>
/// <param name="dictCore">Core dictionary.</param>
/// <param name="dictUnknown">Unknown-word recognition dictionary.</param>
/// <returns>Always <c>true</c>.</returns>
public bool POSTagging(WordResult[] pWordItems, WordDictionary dictCore, WordDictionary dictUnknown)
{
    int i = 0;
    Reset(false);

    // GetFrom returns -1 once the end of the items has been reached.
    while (i > -1 && i < pWordItems.Length && pWordItems[i].sWord != null)
    {
        int nStartPos = i; // start position of the current segment
        i = GetFrom(pWordItems, nStartPos, dictCore, dictUnknown);
        GetBestPOS();

        switch (m_tagType)
        {
            case TAG_TYPE.TT_NORMAL: // normal POS tagging
                // Test the bound before indexing m_nBestTag so the array is never
                // read at or beyond m_nCurLength.
                for (int j = 1; j < m_nCurLength && m_nBestTag[j] != -1; j++)
                {
                    // Work-array slot j corresponds to item (nStartPos + j - 1).
                    int nItem = j + nStartPos - 1;

                    // Store the best POS tag.
                    pWordItems[nItem].nPOS = m_nBestTag[j];

                    // Only items with a positive dValue that exist in the core
                    // dictionary get their value replaced by the frequency of the
                    // chosen POS.
                    if (pWordItems[nItem].dValue > 0 && dictCore.IsExist(pWordItems[nItem].sWord, -1))
                    {
                        pWordItems[nItem].dValue = dictCore.GetFrequency(pWordItems[nItem].sWord, m_nBestTag[j]);
                    }
                }
                break;

            case TAG_TYPE.TT_PERSON: // person name recognition
                PersonRecognize(dictUnknown);
                break;

            case TAG_TYPE.TT_PLACE:        // place name recognition
            case TAG_TYPE.TT_TRANS_PERSON: // transliterated person name
                PlaceRecognize(dictCore, dictUnknown);
                break;

            default:
                break;
        }

        Reset();
    }

    return true;
}
/// <summary>
/// Loads one segment of words, starting at <paramref name="nIndex"/>, into the work
/// arrays (<c>m_sWords</c>, <c>m_nWordPosition</c>, <c>m_nTags</c>, <c>m_dFrequency</c>)
/// together with the candidate POS tags and their costs, and stores the segment
/// length in <c>m_nCurLength</c>.
/// </summary>
/// <param name="pWordItems">Word items to read from.</param>
/// <param name="nIndex">Index in <paramref name="pWordItems"/> of the first word of the segment.</param>
/// <param name="dictCore">Core dictionary.</param>
/// <param name="dictUnknown">Unknown-word recognition dictionary.</param>
/// <returns>
/// The index at which the next segment starts, or -1 when the end of
/// <paramref name="pWordItems"/> has been reached.
/// </returns>
private int GetFrom(WordResult[] pWordItems, int nIndex, WordDictionary dictCore, WordDictionary dictUnknown)
{
    WordInfo info;
    int j, nPOSCount;
    int nWordsIndex = nIndex;
    bool bSplit = false; // true while the second half of a split word is still pending
    int i;               // work-array slot; slots start at 1

    for (i = 1; i < Predefine.MAX_WORDS_PER_SENTENCE && nWordsIndex < pWordItems.Length; i++)
    {
        string sItemWord = pWordItems[nWordsIndex].sWord;

        if (m_tagType == TAG_TYPE.TT_NORMAL || !dictUnknown.IsExist(sItemWord, 44))
        {
            m_sWords[i] = sItemWord; // store the current word unchanged
        }
        else if (!bSplit)
        {
            // Words found in the unknown dictionary under handle 44 are split in
            // two slots: first character now ...
            m_sWords[i] = sItemWord.Substring(0, 1);
            bSplit = true;
        }
        else
        {
            // ... and the remainder on the following iteration.
            m_sWords[i] = sItemWord.Substring(1);
            bSplit = false;
        }

        // Record the position of the current word and move the start to its end.
        m_nWordPosition[i + 1] = m_nWordPosition[i] + m_sWords[i].Length;
        m_nStartPos = m_nWordPosition[i + 1];

        if (m_tagType != TAG_TYPE.TT_NORMAL)
        {
            // Get the POS set from the unknown-word recognition dictionary.
            string sCurWord = m_sWords[i];
            if (m_tagType == TAG_TYPE.TT_TRANS_PERSON
                && m_sWords[i - 1] != null
                && Utility.charType(m_sWords[i - 1][0]) == Predefine.CT_CHINESE)
            {
                // NOTE(review): both assignments store the value sCurWord already
                // holds, so they have no effect as written. Possibly a different
                // (e.g. full-width) punctuation form was intended - confirm.
                if (m_sWords[i] == ".")
                {
                    sCurWord = ".";
                }
                else if (m_sWords[i] == "-")
                {
                    sCurWord = "-";
                }
            }

            info = dictUnknown.GetWordInfo(sCurWord);
            if (info != null)
            {
                nPOSCount = info.Count + 1;
                for (j = 0; j < info.Count; j++)
                {
                    // Candidate tag and its cost (negative log form).
                    m_nTags[i, j] = info.POSs[j];
                    m_dFrequency[i, j] = -Math.Log((double)(1 + info.Frequencies[j]))
                        + Math.Log((double)(m_context.GetFrequency(0, info.POSs[j]) + nPOSCount));
                }
            }
            else
            {
                nPOSCount = 1;
                j = 0;
            }

            // Core-dictionary POS values are ignored here: a word known to the core
            // dictionary is added as tag 0 ("other"), with its frequencies summed.
            // The sentence begin/end tokens are exact sentinels, so they are matched
            // ordinally rather than with culture-sensitive rules.
            if (string.Equals(m_sWords[i], "始##始", StringComparison.Ordinal))
            {
                m_nTags[i, j] = 100;
                m_dFrequency[i, j] = 0;
                j++;
            }
            else if (string.Equals(m_sWords[i], "末##末", StringComparison.Ordinal))
            {
                m_nTags[i, j] = 101;
                m_dFrequency[i, j] = 0;
                j++;
            }
            else
            {
                info = dictCore.GetWordInfo(m_sWords[i]);
                if (info != null)
                {
                    int nFreq = 0;
                    for (int k = 0; k < info.Count; k++)
                    {
                        nFreq += info.Frequencies[k];
                    }

                    if (info.Count > 0)
                    {
                        m_nTags[i, j] = 0;
                        m_dFrequency[i, j] = -Math.Log((double)(1 + nFreq))
                            + Math.Log((double)(m_context.GetFrequency(0, 0) + nPOSCount));
                        j++;
                    }
                }
            }
        }
        else
        {
            // Normal POS tagging.
            j = 0;
            if (pWordItems[nWordsIndex].nPOS > 0)
            {
                // The item already carries a single POS and its value.
                m_nTags[i, j] = pWordItems[nWordsIndex].nPOS;
                m_dFrequency[i, j] = -Math.Log(pWordItems[nWordsIndex].dValue)
                    + Math.Log((double)(m_context.GetFrequency(0, m_nTags[i, j]) + 1));
                if (m_dFrequency[i, j] < 0)
                {
                    // Values below 0 are not permitted.
                    m_dFrequency[i, j] = 0;
                }
                j++;
            }
            else
            {
                if (pWordItems[nWordsIndex].nPOS < 0)
                {
                    // A negative nPOS stores a POS with its sign flipped; dValue is
                    // used as the cost as-is.
                    m_nTags[i, j] = -pWordItems[nWordsIndex].nPOS;
                    m_dFrequency[i, j++] = pWordItems[nWordsIndex].dValue;
                }

                // Retrieve the remaining candidates from the core dictionary. The
                // loop resumes at the current j, so when a slot was filled above the
                // dictionary entry at index 0 is not copied.
                info = dictCore.GetWordInfo(m_sWords[i]);
                if (info != null)
                {
                    nPOSCount = info.Count;
                    for (; j < info.Count; j++)
                    {
                        m_nTags[i, j] = info.POSs[j];
                        m_dFrequency[i, j] = -Math.Log(1 + info.Frequencies[j])
                            + Math.Log(m_context.GetFrequency(0, m_nTags[i, j]) + nPOSCount);
                    }
                }
            }
        }

        if (j == 0)
        {
            // No candidate POS found: guess from lexical knowledge.
            GuessPOS(i, out j);
        }

        m_nTags[i, j] = -1; // terminate this word's tag list

        // NOTE(review): m_nTags[i, j] was set to -1 on the line above, so the second
        // test only fails if CT_SENTENCE_BEGIN is -1. Index 0 may have been intended;
        // left unchanged because fixing it would alter segmentation - confirm.
        if (j == 1 && m_nTags[i, j] != Predefine.CT_SENTENCE_BEGIN)
        {
            // No ambiguity: close the segment here.
            i++;
            m_sWords[i] = null;
            break;
        }

        if (!bSplit)
        {
            nWordsIndex++;
        }
    }

    bool bReachedEnd = nWordsIndex == pWordItems.Length;

    if (m_nTags[i - 1, 1] != -1)
    {
        // The last loaded word is still ambiguous (e.g. "张/华/平"): append a
        // virtual ending slot.
        if (m_tagType != TAG_TYPE.TT_NORMAL)
        {
            m_nTags[i, 0] = 101;
        }
        else
        {
            m_nTags[i, 0] = 1;
        }
        m_dFrequency[i, 0] = 0;
        m_sWords[i] = null;
        m_nTags[i++, 1] = -1;
    }

    m_nCurLength = i; // current slot count

    return bReachedEnd ? -1 : nWordsIndex + 1;
}
/// <summary>
/// POS tagging with a Hidden Markov Model. Walks <paramref name="pWordItems"/> one
/// segment at a time: each segment is loaded with <c>GetFrom</c>, decoded with
/// <c>GetBestPOS</c>, and then handled according to <c>m_tagType</c>.
/// </summary>
/// <param name="pWordItems">
/// Word items to process. Scanning stops when the next segment would start past the
/// end of the array or at an item whose <c>sWord</c> is null.
/// </param>
/// <param name="dictCore">Core dictionary.</param>
/// <param name="dictUnknown">Unknown-word recognition dictionary.</param>
/// <returns>Always <c>true</c>.</returns>
public bool POSTagging(WordResult[] pWordItems, WordDictionary dictCore, WordDictionary dictUnknown)
{
    int i = 0;
    Reset(false);

    // GetFrom returns -1 once the end of the items has been reached.
    while (i > -1 && i < pWordItems.Length && pWordItems[i].sWord != null)
    {
        int nStartPos = i; // start position of the current segment
        i = GetFrom(pWordItems, nStartPos, dictCore, dictUnknown);
        GetBestPOS();

        switch (m_tagType)
        {
            case TAG_TYPE.TT_NORMAL: // normal POS tagging
                // Test the bound before indexing m_nBestTag so the array is never
                // read at or beyond m_nCurLength.
                for (int j = 1; j < m_nCurLength && m_nBestTag[j] != -1; j++)
                {
                    // Work-array slot j corresponds to item (nStartPos + j - 1).
                    int nItem = j + nStartPos - 1;

                    // Store the best POS tag.
                    pWordItems[nItem].nPOS = m_nBestTag[j];

                    // Only items with a positive dValue that exist in the core
                    // dictionary get their value replaced by the frequency of the
                    // chosen POS.
                    if (pWordItems[nItem].dValue > 0 && dictCore.IsExist(pWordItems[nItem].sWord, -1))
                    {
                        pWordItems[nItem].dValue = dictCore.GetFrequency(pWordItems[nItem].sWord, m_nBestTag[j]);
                    }
                }
                break;

            case TAG_TYPE.TT_PERSON: // person name recognition
                PersonRecognize(dictUnknown);
                break;

            case TAG_TYPE.TT_PLACE:        // place name recognition
            case TAG_TYPE.TT_TRANS_PERSON: // transliterated person name
                PlaceRecognize(dictCore, dictUnknown);
                break;

            default:
                break;
        }

        Reset();
    }

    return true;
}
/// <summary>
/// Loads one segment of words, starting at <paramref name="nIndex"/>, into the work
/// arrays (<c>m_sWords</c>, <c>m_nWordPosition</c>, <c>m_nTags</c>, <c>m_dFrequency</c>)
/// together with the candidate POS tags and their costs, and stores the segment
/// length in <c>m_nCurLength</c>.
/// </summary>
/// <param name="pWordItems">Word items to read from.</param>
/// <param name="nIndex">Index in <paramref name="pWordItems"/> of the first word of the segment.</param>
/// <param name="dictCore">Core dictionary.</param>
/// <param name="dictUnknown">Unknown-word recognition dictionary.</param>
/// <returns>
/// The index at which the next segment starts, or -1 when the end of
/// <paramref name="pWordItems"/> has been reached.
/// </returns>
private int GetFrom(WordResult[] pWordItems, int nIndex, WordDictionary dictCore, WordDictionary dictUnknown)
{
    WordInfo info;
    int j, nPOSCount;
    int nWordsIndex = nIndex;
    bool bSplit = false; // true while the second half of a split word is still pending
    int i;               // work-array slot; slots start at 1

    for (i = 1; i < Predefine.MAX_WORDS_PER_SENTENCE && nWordsIndex < pWordItems.Length; i++)
    {
        string sItemWord = pWordItems[nWordsIndex].sWord;

        if (m_tagType == TAG_TYPE.TT_NORMAL || !dictUnknown.IsExist(sItemWord, 44))
        {
            m_sWords[i] = sItemWord; // store the current word unchanged
        }
        else if (!bSplit)
        {
            // Words found in the unknown dictionary under handle 44 are split in
            // two slots: first character now ...
            m_sWords[i] = sItemWord.Substring(0, 1);
            bSplit = true;
        }
        else
        {
            // ... and the remainder on the following iteration.
            m_sWords[i] = sItemWord.Substring(1);
            bSplit = false;
        }

        // Record the position of the current word and move the start to its end.
        m_nWordPosition[i + 1] = m_nWordPosition[i] + m_sWords[i].Length;
        m_nStartPos = m_nWordPosition[i + 1];

        if (m_tagType != TAG_TYPE.TT_NORMAL)
        {
            // Get the POS set from the unknown-word recognition dictionary.
            string sCurWord = m_sWords[i];
            if (m_tagType == TAG_TYPE.TT_TRANS_PERSON
                && m_sWords[i - 1] != null
                && Utility.charType(m_sWords[i - 1][0]) == Predefine.CT_CHINESE)
            {
                // NOTE(review): both assignments store the value sCurWord already
                // holds, so they have no effect as written. Possibly a different
                // (e.g. full-width) punctuation form was intended - confirm.
                if (m_sWords[i] == ".")
                {
                    sCurWord = ".";
                }
                else if (m_sWords[i] == "-")
                {
                    sCurWord = "-";
                }
            }

            info = dictUnknown.GetWordInfo(sCurWord);
            if (info != null)
            {
                nPOSCount = info.Count + 1;
                for (j = 0; j < info.Count; j++)
                {
                    // Candidate tag and its cost (negative log form).
                    m_nTags[i, j] = info.POSs[j];
                    m_dFrequency[i, j] = -Math.Log((double)(1 + info.Frequencies[j]))
                        + Math.Log((double)(m_context.GetFrequency(0, info.POSs[j]) + nPOSCount));
                }
            }
            else
            {
                nPOSCount = 1;
                j = 0;
            }

            // Core-dictionary POS values are ignored here: a word known to the core
            // dictionary is added as tag 0 ("other"), with its frequencies summed.
            // The sentence begin/end tokens are exact sentinels, so they are matched
            // ordinally rather than with culture-sensitive rules.
            if (string.Equals(m_sWords[i], "始##始", StringComparison.Ordinal))
            {
                m_nTags[i, j] = 100;
                m_dFrequency[i, j] = 0;
                j++;
            }
            else if (string.Equals(m_sWords[i], "末##末", StringComparison.Ordinal))
            {
                m_nTags[i, j] = 101;
                m_dFrequency[i, j] = 0;
                j++;
            }
            else
            {
                info = dictCore.GetWordInfo(m_sWords[i]);
                if (info != null)
                {
                    int nFreq = 0;
                    for (int k = 0; k < info.Count; k++)
                    {
                        nFreq += info.Frequencies[k];
                    }

                    if (info.Count > 0)
                    {
                        m_nTags[i, j] = 0;
                        m_dFrequency[i, j] = -Math.Log((double)(1 + nFreq))
                            + Math.Log((double)(m_context.GetFrequency(0, 0) + nPOSCount));
                        j++;
                    }
                }
            }
        }
        else
        {
            // Normal POS tagging.
            j = 0;
            if (pWordItems[nWordsIndex].nPOS > 0)
            {
                // The item already carries a single POS and its value.
                m_nTags[i, j] = pWordItems[nWordsIndex].nPOS;
                m_dFrequency[i, j] = -Math.Log(pWordItems[nWordsIndex].dValue)
                    + Math.Log((double)(m_context.GetFrequency(0, m_nTags[i, j]) + 1));
                if (m_dFrequency[i, j] < 0)
                {
                    // Values below 0 are not permitted.
                    m_dFrequency[i, j] = 0;
                }
                j++;
            }
            else
            {
                if (pWordItems[nWordsIndex].nPOS < 0)
                {
                    // A negative nPOS stores a POS with its sign flipped; dValue is
                    // used as the cost as-is.
                    m_nTags[i, j] = -pWordItems[nWordsIndex].nPOS;
                    m_dFrequency[i, j++] = pWordItems[nWordsIndex].dValue;
                }

                // Retrieve the remaining candidates from the core dictionary. The
                // loop resumes at the current j, so when a slot was filled above the
                // dictionary entry at index 0 is not copied.
                info = dictCore.GetWordInfo(m_sWords[i]);
                if (info != null)
                {
                    nPOSCount = info.Count;
                    for (; j < info.Count; j++)
                    {
                        m_nTags[i, j] = info.POSs[j];
                        m_dFrequency[i, j] = -Math.Log(1 + info.Frequencies[j])
                            + Math.Log(m_context.GetFrequency(0, m_nTags[i, j]) + nPOSCount);
                    }
                }
            }
        }

        if (j == 0)
        {
            // No candidate POS found: guess from lexical knowledge.
            GuessPOS(i, out j);
        }

        m_nTags[i, j] = -1; // terminate this word's tag list

        // NOTE(review): m_nTags[i, j] was set to -1 on the line above, so the second
        // test only fails if CT_SENTENCE_BEGIN is -1. Index 0 may have been intended;
        // left unchanged because fixing it would alter segmentation - confirm.
        if (j == 1 && m_nTags[i, j] != Predefine.CT_SENTENCE_BEGIN)
        {
            // No ambiguity: close the segment here.
            i++;
            m_sWords[i] = null;
            break;
        }

        if (!bSplit)
        {
            nWordsIndex++;
        }
    }

    bool bReachedEnd = nWordsIndex == pWordItems.Length;

    if (m_nTags[i - 1, 1] != -1)
    {
        // The last loaded word is still ambiguous (e.g. "张/华/平"): append a
        // virtual ending slot.
        if (m_tagType != TAG_TYPE.TT_NORMAL)
        {
            m_nTags[i, 0] = 101;
        }
        else
        {
            m_nTags[i, 0] = 1;
        }
        m_dFrequency[i, 0] = 0;
        m_sWords[i] = null;
        m_nTags[i++, 1] = -1;
    }

    m_nCurLength = i; // current slot count

    return bReachedEnd ? -1 : nWordsIndex + 1;
}