/// <summary>
/// Judges whether a two-character string is more likely a two-character given
/// name than a first character followed by a single-character name.
/// </summary>
/// <remarks>
/// Two log-scores are accumulated from role-tagged frequencies and compared:
/// one for the role sequence 2 -> 3 and one for the role sequence 1 -> 4.
/// NOTE(review): per the pattern legend in PersonRecognize, roles 1/2/3/4 look
/// like surname / first given char / second given char / single-char given
/// name - confirm against the role dictionary.
/// </remarks>
/// <param name="sName">Candidate name. Anything that is not exactly two
/// characters long (including null) is rejected.</param>
/// <returns>
/// true only when the 2 -> 3 score is strictly greater than the 1 -> 4 score;
/// otherwise false.
/// </returns>
public bool IsGivenName(string sName)
{
    // A null name used to throw at sName.Length; treat it like any other
    // string that is not exactly two characters long.
    if (sName == null || sName.Length != 2)
    {
        return false;
    }

    // Index the string directly instead of allocating a char[] twice.
    string sFirst = sName[0].ToString();
    string sSecond = sName[1].ToString();

    double dGivenNamePossibility = 0;
    double dSingleNamePossibility = 0;

    // P(Wi|Ti) for the first character as role 2 and the second as role 3.
    dGivenNamePossibility += Math.Log((double)m_dict.GetFrequency(sFirst, 2) + 1.0)
                             - Math.Log(m_roleTag.m_context.GetFrequency(0, 2) + 1.0);
    dGivenNamePossibility += Math.Log((double)m_dict.GetFrequency(sSecond, 3) + 1.0)
                             - Math.Log(m_roleTag.m_context.GetFrequency(0, 3) + 1.0);
    // Transition term from role 2 to role 3.
    dGivenNamePossibility += Math.Log(m_roleTag.m_context.GetContextPossibility(0, 2, 3) + 1.0)
                             - Math.Log(m_roleTag.m_context.GetFrequency(0, 2) + 1.0);

    // P(Wi|Ti) for the first character as role 1 and the second as role 4.
    dSingleNamePossibility += Math.Log((double)m_dict.GetFrequency(sFirst, 1) + 1.0)
                              - Math.Log(m_roleTag.m_context.GetFrequency(0, 1) + 1.0);
    dSingleNamePossibility += Math.Log((double)m_dict.GetFrequency(sSecond, 4) + 1.0)
                              - Math.Log(m_roleTag.m_context.GetFrequency(0, 4) + 1.0);
    // Transition term from role 1 to role 4.
    dSingleNamePossibility += Math.Log(m_roleTag.m_context.GetContextPossibility(0, 1, 4) + 1.0)
                              - Math.Log(m_roleTag.m_context.GetFrequency(0, 1) + 1.0);

    // Ties go to the 1 -> 4 reading: being a single given name is at least as
    // likely as being a two-character given name (original example: "Zhang Zhen").
    // A frequency-ratio test (role-1 frequency / role-2 frequency >= 10) was
    // once considered here and is disabled.
    return dSingleNamePossibility < dGivenNamePossibility;
}
/// <summary>
/// Runs Hidden-Markov-Model tagging over the word items, one segment at a time,
/// and dispatches on <c>m_tagType</c>: normal POS tagging writes the best tags
/// back into the items, the other modes call the name recognizers.
/// </summary>
/// <param name="pWordItems">Word items to tag; processing stops at the first
/// item whose <c>sWord</c> is null or at the end of the array.</param>
/// <param name="dictCore">Core dictionary.</param>
/// <param name="dictUnknown">Unknown-word recognition dictionary.</param>
/// <returns>Always true.</returns>
public bool POSTagging(WordResult[] pWordItems, WordDictionary dictCore, WordDictionary dictUnknown)
{
    int i = 0;

    Reset(false);
    while (i > -1 && i < pWordItems.Length && pWordItems[i].sWord != null)
    {
        int nStartPos = i; // First item of the segment handled in this pass
        i = GetFrom(pWordItems, nStartPos, dictCore, dictUnknown);
        GetBestPOS();

        switch (m_tagType)
        {
            case TAG_TYPE.TT_NORMAL: // Normal POS tagging
                // Check the length bound BEFORE indexing m_nBestTag; the
                // previous order read m_nBestTag[j] first and could index past
                // the tag buffer.
                for (int j = 1; j < m_nCurLength && m_nBestTag[j] != -1; j++)
                {
                    // Tag slot j belongs to item (j + nStartPos - 1).
                    int nItem = j + nStartPos - 1;

                    // Store the best POS tag.
                    pWordItems[nItem].nPOS = m_nBestTag[j];

                    // Items whose dValue is not positive are left untouched
                    // (original note: keeps the full stop at 0). Otherwise, if
                    // the word exists in the core dictionary, its value becomes
                    // its frequency under the chosen tag.
                    if (pWordItems[nItem].dValue > 0 && dictCore.IsExist(pWordItems[nItem].sWord, -1))
                    {
                        pWordItems[nItem].dValue = dictCore.GetFrequency(pWordItems[nItem].sWord, m_nBestTag[j]);
                    }
                }
                break;

            case TAG_TYPE.TT_PERSON: // Person recognition
                PersonRecognize(dictUnknown);
                break;

            case TAG_TYPE.TT_PLACE:        // Place name recognition
            case TAG_TYPE.TT_TRANS_PERSON: // Transliterated person names share the same recognizer
                PlaceRecognize(dictCore, dictUnknown);
                break;

            default:
                break;
        }

        Reset();
    }

    return true;
}
/// <summary>
/// Sums, over the tagged words in [nStartPos, nStartPos + nLength), the value
/// log(tag frequency + 1) - log(frequency of the word under that tag + 1).
/// </summary>
/// <param name="nStartPos">Index of the first word/tag to include.</param>
/// <param name="nLength">Number of consecutive words/tags to include.</param>
/// <param name="dict">Dictionary queried for the word-under-tag frequency.</param>
/// <returns>The accumulated score; 0 when nLength is not positive.</returns>
private double ComputePossibility(int nStartPos, int nLength, WordDictionary dict)
{
    double total = 0;

    for (int pos = nStartPos; pos < nStartPos + nLength; pos++)
    {
        // Frequency of this word carrying its best tag.
        int wordTagFreq = dict.GetFrequency(m_sWords[pos], m_nBestTag[pos]);

        // Add-one smoothing on both counts before taking logs.
        double tagTerm = Math.Log((double)(m_context.GetFrequency(0, m_nBestTag[pos]) + 1));
        double wordTerm = Math.Log((double)(wordTagFreq + 1));

        total += tagTerm - wordTerm;

        // A tag-to-tag context term (GetContextPossibility between consecutive
        // tags) was part of an earlier version and is intentionally not applied.
    }

    return total;
}
/// <summary>
/// Runs Hidden-Markov-Model tagging over the word items, one segment at a time,
/// and dispatches on <c>m_tagType</c>: normal POS tagging writes the best tags
/// back into the items, the other modes call the name recognizers.
/// </summary>
/// <param name="pWordItems">Word items to tag; processing stops at the first
/// item whose <c>sWord</c> is null or at the end of the array.</param>
/// <param name="dictCore">Core dictionary.</param>
/// <param name="dictUnknown">Unknown-word recognition dictionary.</param>
/// <returns>Always true.</returns>
public bool POSTagging(WordResult[] pWordItems, WordDictionary dictCore, WordDictionary dictUnknown)
{
    int i = 0;

    Reset(false);
    while (i > -1 && i < pWordItems.Length && pWordItems[i].sWord != null)
    {
        int nStartPos = i; // First item of the segment handled in this pass
        i = GetFrom(pWordItems, nStartPos, dictCore, dictUnknown);
        GetBestPOS();

        switch (m_tagType)
        {
            case TAG_TYPE.TT_NORMAL: // Normal POS tagging
                // Check the length bound BEFORE indexing m_nBestTag; the
                // previous order read m_nBestTag[j] first and could index past
                // the tag buffer.
                for (int j = 1; j < m_nCurLength && m_nBestTag[j] != -1; j++)
                {
                    // Tag slot j belongs to item (j + nStartPos - 1).
                    int nItem = j + nStartPos - 1;

                    // Store the best POS tag.
                    pWordItems[nItem].nPOS = m_nBestTag[j];

                    // Items whose dValue is not positive are left untouched
                    // (original note: keeps the full stop at 0). Otherwise, if
                    // the word exists in the core dictionary, its value becomes
                    // its frequency under the chosen tag.
                    if (pWordItems[nItem].dValue > 0 && dictCore.IsExist(pWordItems[nItem].sWord, -1))
                    {
                        pWordItems[nItem].dValue = dictCore.GetFrequency(pWordItems[nItem].sWord, m_nBestTag[j]);
                    }
                }
                break;

            case TAG_TYPE.TT_PERSON: // Person recognition
                PersonRecognize(dictUnknown);
                break;

            case TAG_TYPE.TT_PLACE:        // Place name recognition
            case TAG_TYPE.TT_TRANS_PERSON: // Transliterated person names share the same recognizer
                PlaceRecognize(dictCore, dictUnknown);
                break;

            default:
                break;
        }

        Reset();
    }

    return true;
}
/// <summary>
/// Sums, over the tagged words in [nStartPos, nStartPos + nLength), the value
/// log(tag frequency + 1) - log(frequency of the word under that tag + 1).
/// </summary>
/// <param name="nStartPos">Index of the first word/tag to include.</param>
/// <param name="nLength">Number of consecutive words/tags to include.</param>
/// <param name="dict">Dictionary queried for the word-under-tag frequency.</param>
/// <returns>The accumulated score; 0 when nLength is not positive.</returns>
private double ComputePossibility(int nStartPos, int nLength, WordDictionary dict)
{
    double total = 0;

    for (int pos = nStartPos; pos < nStartPos + nLength; pos++)
    {
        // Frequency of this word carrying its best tag.
        int wordTagFreq = dict.GetFrequency(m_sWords[pos], m_nBestTag[pos]);

        // Add-one smoothing on both counts before taking logs.
        double tagTerm = Math.Log((double)(m_context.GetFrequency(0, m_nBestTag[pos]) + 1));
        double wordTerm = Math.Log((double)(wordTagFreq + 1));

        total += tagTerm - wordTerm;

        // A tag-to-tag context term (GetContextPossibility between consecutive
        // tags) was part of an earlier version and is intentionally not applied.
    }

    return total;
}
/// <summary>
/// Scans the best role-tag sequence for person-name patterns and records every
/// match as an unknown-word candidate (start/end position plus a score).
/// </summary>
/// <remarks>
/// Each tag t in <c>m_nBestTag</c> (from index 1 up to the first negative
/// entry) is encoded as the character 'A' + t, so the patterns below are plain
/// strings over that alphabet. Pattern legend (training count, factor):
/// <code>
/// BBCD   343  0.003606  surname + surname + given-1 + given-2
/// BBC      2  0.000021
/// BBE    125  0.001314  surname + surname + single-char given name
/// BBZ     30  0.000315  surname + surname + two-char given name that is a word
/// BCD  62460  0.656624  surname + given-1 + given-2
/// BEE      0  0.000000  surname + single + single (e.g. "Han Leilei")
/// BE   13899  0.146116  surname + single-char given name
/// BG     869  0.009136  surname + suffix
/// BXD      4  0.000042  surname + (surname and given-1 forming a word) + given-2
/// BZ    3707  0.038971  surname + two-char given name that is a word
/// CD    8596  0.090367  given-1 + given-2
/// EE      26  0.000273  single + single
/// FB     871  0.009157  prefix + surname
/// Y     3265  0.034324  surname and single-char given name forming a word
/// XD     926  0.009735  (surname and given-1 forming a word) + given-2
/// </code>
/// Patterns are tried in array order; the first one that matches at the current
/// position and survives the exclusion rules wins. Older exclusion rules
/// (identical characters required for EE/BEE, bare surname must be followed by a
/// suffix, all-foreign-character names, low-frequency roles) are disabled.
/// </remarks>
/// <param name="personDict">Person-role dictionary used for frequency lookups
/// and scoring.</param>
/// <returns>Always true.</returns>
public bool PersonRecognize(WordDictionary personDict)
{
    string[] sPatterns = { "BBCD", "BBC", "BBE", "BBZ", "BCD", "BEE", "BE", "BG", "BXD", "BZ", "CDCD", "CD", "EE", "FB", "Y", "XD", "" };
    double[] dFactor = { 0.003606, 0.000021, 0.001314, 0.000315, 0.656624, 0.000021, 0.146116, 0.009136, 0.000042, 0.038971, 0, 0.090367, 0.000273, 0.009157, 0.034324, 0.009735, 0 };
    // The trailing 0 is the sentinel that terminates the pattern scan.
    int[] nPatternLen = { 4, 3, 3, 3, 3, 3, 2, 2, 3, 2, 4, 2, 2, 2, 1, 2, 0 };

    // Encode the tag sequence as a string; index 0 is a 'z' placeholder so that
    // string index == tag index.
    StringBuilder sb = new StringBuilder();
    sb.Append('z');
    int i;
    for (i = 1; m_nBestTag[i] > -1; i++)
    {
        sb.Append(Convert.ToChar(m_nBestTag[i] + Convert.ToInt32('A')));
    }
    string sPOS = sb.ToString();

    int j = 1; // Current position in the tag sequence
    while (j < i)
    {
        bool bMatched = false;
        for (int k = 0; !bMatched && nPatternLen[k] > 0; k++)
        {
            int nLen = nPatternLen[k];

            // Role tags are opaque codes, so compare them ordinally; a
            // culture-sensitive compare may treat some code points as ignorable.
            // A candidate directly preceded or followed by the "·" separator is
            // rejected.
            if (string.CompareOrdinal(sPatterns[k], 0, sPOS, j, nLen) != 0
                || string.Equals(m_sWords[j - 1], "·", StringComparison.Ordinal)
                || string.Equals(m_sWords[j + nLen], "·", StringComparison.Ordinal))
            {
                continue;
            }

            // Rule 1 for exclusion: prefix + surname followed by a given-name
            // or suffix role means the (prefix + surname) rule does not apply.
            // The bound check covers "FB" matching on the last two tags, where
            // sPOS[j + 2] does not exist.
            if (sPatterns[k] == "FB"
                && j + 2 < sPOS.Length
                && (sPOS[j + 2] == 'E' || sPOS[j + 2] == 'C' || sPOS[j + 2] == 'G'))
            {
                continue;
            }

            // Assemble the candidate name from the matched words.
            string sPersonName = null;
            int nLittleFreqCount = 0; // Only consumed by a disabled low-frequency rule
            for (int nPos = j; nPos < j + nLen; nPos++)
            {
                if (m_nBestTag[nPos] < 4 && personDict.GetFrequency(m_sWords[nPos], m_nBestTag[nPos]) < Predefine.LITTLE_FREQUENCY)
                {
                    nLittleFreqCount++;
                }
                sPersonName += m_sWords[nPos];
            }

            if (sPatterns[k] == "CDCD")
            {
                // CDCD (given-1 + given-2 twice) is itself an exclusion pattern,
                // typically a transliterated name. If the candidate contains
                // foreign-name characters the span is skipped; otherwise the
                // exclusion does not apply and the remaining patterns are still
                // tried at the same position.
                if (Utility.GetForeignCharCount(sPersonName) > 0)
                {
                    j += nLen - 1;
                }
                continue;
            }

            // Record the candidate: start position, end position and score.
            m_nUnknownWords[m_nUnknownWordsCount, 0] = m_nWordPosition[j];
            m_nUnknownWords[m_nUnknownWordsCount, 1] = m_nWordPosition[j + nLen];
            // -log(factor) is the pattern's cost; the role scores are added to it.
            m_dWordsPossibility[m_nUnknownWordsCount] = -Math.Log(dFactor[k]) + ComputePossibility(j, nLen, personDict);
            m_nUnknownWordsCount += 1;

            j += nLen;
            bMatched = true;
        }

        if (!bMatched)
        {
            // Nothing matched here; move on by one tag.
            j += 1;
        }
    }

    return true;
}
//====================================================================
// Builds the bigram graph that links every word to each word that can
// directly follow it.
//====================================================================
/// <summary>
/// For every word in <paramref name="aWord"/> and every word that starts where
/// it ends, computes a smoothed bigram cost and stores it in a column-first
/// array indexed by the two words' positions in the position map.
/// </summary>
/// <param name="aWord">Word lattice; an element's <c>row</c>/<c>col</c> are its
/// start and end, so successors of a word are the elements whose row equals its col.</param>
/// <param name="smoothPara">Smoothing weight a (0 &lt; a &lt; 1) between the
/// unigram and bigram terms.</param>
/// <param name="biDict">Dictionary holding frequencies of linked word pairs.</param>
/// <param name="coreDict">Core dictionary, used for the frequency of unknown words.</param>
/// <returns>The populated bigram graph.</returns>
public static ColumnFirstDynamicArray<ChainContent> BiGraphGenerate(
    RowFirstDynamicArray<ChainContent> aWord, double smoothPara,
    WordDictionary biDict, WordDictionary coreDict)
{
    ColumnFirstDynamicArray<ChainContent> biWordNet = new ColumnFirstDynamicArray<ChainContent>();
    StringBuilder pairKey = new StringBuilder();

    // Position map of all possible words; each word is looked up by row * MAX_SENTENCE_LEN + col.
    int[] posMap = PreparePositionMap(aWord);

    // Floor value used by the smoothing formula.
    double floor = 1.0 / Predefine.MAX_FREQUENCE;

    for (ChainItem<ChainContent> cur = aWord.GetHead(); cur != null; cur = cur.next)
    {
        // Known words (nPOS >= 0) carry their own weight; unknown words take
        // their frequency from the core dictionary under tag 2.
        double curFreq = cur.Content.nPOS >= 0
            ? cur.Content.eWeight
            : coreDict.GetFrequency(cur.Content.sWord, 2);

        // Successors are the elements whose row equals this word's col.
        for (ChainItem<ChainContent> next = aWord.GetFirstElementOfRow(cur.col);
             next != null && next.row == cur.col;
             next = next.next)
        {
            pairKey.Length = 0;
            pairKey.Append(cur.Content.sWord);
            pairKey.Append(Predefine.WORD_SEGMENTER);
            pairKey.Append(next.Content.sWord);
            string sTwoWords = pairKey.ToString();

            // Frequency of the two linked words.
            int pairFreq = biDict.GetFrequency(sTwoWords, 3);

            // -log{ a*P(Ci-1) + (1-a)*P(Ci|Ci-1) }, with 0 < a < 1.
            double unigramTerm = smoothPara * (1.0 + curFreq) / (Predefine.MAX_FREQUENCE + 80000.0);
            double bigramTerm = (1.0 - smoothPara) * ((1.0 - floor) * pairFreq / (1.0 + curFreq) + floor);
            double cost = -Math.Log(unigramTerm + bigramTerm);

            // Unknown words: P(Wi|Ci); known words: 1.
            // NOTE(review): this adds the (negative) nPOS code itself to the
            // cost - confirm that this, not the word's weight, is intended.
            if (cur.Content.nPOS < 0)
            {
                cost += cur.Content.nPOS;
            }

            // Indexes of both words in the position map table.
            int curIndex = Utility.BinarySearch(cur.row * Predefine.MAX_SENTENCE_LEN + cur.col, posMap);
            int nextIndex = Utility.BinarySearch(next.row * Predefine.MAX_SENTENCE_LEN + next.col, posMap);

            biWordNet.SetElement(curIndex, nextIndex, new ChainContent(cost, cur.Content.nPOS, sTwoWords));
        }
    }

    return biWordNet;
}
//====================================================================
// Builds the bigram graph that links every word to each word that can
// directly follow it.
//====================================================================
/// <summary>
/// For every word in <paramref name="aWord"/> and every word that starts where
/// it ends, computes a smoothed bigram cost and stores it in a column-first
/// array indexed by the two words' positions in the position map.
/// </summary>
/// <param name="aWord">Word lattice; an element's <c>row</c>/<c>col</c> are its
/// start and end, so successors of a word are the elements whose row equals its col.</param>
/// <param name="smoothPara">Smoothing weight a (0 &lt; a &lt; 1) between the
/// unigram and bigram terms.</param>
/// <param name="biDict">Dictionary holding frequencies of linked word pairs.</param>
/// <param name="coreDict">Core dictionary, used for the frequency of unknown words.</param>
/// <returns>The populated bigram graph.</returns>
public static ColumnFirstDynamicArray<ChainContent> BiGraphGenerate(
    RowFirstDynamicArray<ChainContent> aWord, double smoothPara,
    WordDictionary biDict, WordDictionary coreDict)
{
    ColumnFirstDynamicArray<ChainContent> biWordNet = new ColumnFirstDynamicArray<ChainContent>();
    StringBuilder pairKey = new StringBuilder();

    // Position map of all possible words; each word is looked up by row * MAX_SENTENCE_LEN + col.
    int[] posMap = PreparePositionMap(aWord);

    // Floor value used by the smoothing formula.
    double floor = 1.0 / Predefine.MAX_FREQUENCE;

    for (ChainItem<ChainContent> cur = aWord.GetHead(); cur != null; cur = cur.next)
    {
        // Known words (nPOS >= 0) carry their own weight; unknown words take
        // their frequency from the core dictionary under tag 2.
        double curFreq = cur.Content.nPOS >= 0
            ? cur.Content.eWeight
            : coreDict.GetFrequency(cur.Content.sWord, 2);

        // Successors are the elements whose row equals this word's col
        // (original note: a rather special correspondence).
        for (ChainItem<ChainContent> next = aWord.GetFirstElementOfRow(cur.col);
             next != null && next.row == cur.col;
             next = next.next)
        {
            pairKey.Length = 0;
            pairKey.Append(cur.Content.sWord);
            pairKey.Append(Predefine.WORD_SEGMENTER);
            pairKey.Append(next.Content.sWord);
            string sTwoWords = pairKey.ToString();

            // Frequency of the two linked words.
            int pairFreq = biDict.GetFrequency(sTwoWords, 3);

            // -log{ a*P(Ci-1) + (1-a)*P(Ci|Ci-1) }, with 0 < a < 1.
            double unigramTerm = smoothPara * (1.0 + curFreq) / (Predefine.MAX_FREQUENCE + 80000.0);
            double bigramTerm = (1.0 - smoothPara) * ((1.0 - floor) * pairFreq / (1.0 + curFreq) + floor);
            double cost = -Math.Log(unigramTerm + bigramTerm);

            // Unknown words: P(Wi|Ci); known words: 1.
            // NOTE(review): this adds the (negative) nPOS code itself to the
            // cost - confirm that this, not the word's weight, is intended.
            if (cur.Content.nPOS < 0)
            {
                cost += cur.Content.nPOS;
            }

            // Indexes of both words in the position map table.
            int curIndex = Utility.BinarySearch(cur.row * Predefine.MAX_SENTENCE_LEN + cur.col, posMap);
            int nextIndex = Utility.BinarySearch(next.row * Predefine.MAX_SENTENCE_LEN + next.col, posMap);

            biWordNet.SetElement(curIndex, nextIndex, new ChainContent(cost, cur.Content.nPOS, sTwoWords));
        }
    }

    return biWordNet;
}
/// <summary>
/// Scans the best role-tag sequence for person-name patterns and records every
/// match as an unknown-word candidate (start/end position plus a score).
/// </summary>
/// <remarks>
/// Each tag t in <c>m_nBestTag</c> (from index 1 up to the first negative
/// entry) is encoded as the character 'A' + t, so the patterns below are plain
/// strings over that alphabet. Pattern legend (training count, factor):
/// <code>
/// BBCD   343  0.003606  surname + surname + given-1 + given-2
/// BBC      2  0.000021
/// BBE    125  0.001314  surname + surname + single-char given name
/// BBZ     30  0.000315  surname + surname + two-char given name that is a word
/// BCD  62460  0.656624  surname + given-1 + given-2
/// BEE      0  0.000000  surname + single + single (e.g. "Han Leilei")
/// BE   13899  0.146116  surname + single-char given name
/// BG     869  0.009136  surname + suffix
/// BXD      4  0.000042  surname + (surname and given-1 forming a word) + given-2
/// BZ    3707  0.038971  surname + two-char given name that is a word
/// CD    8596  0.090367  given-1 + given-2
/// EE      26  0.000273  single + single
/// FB     871  0.009157  prefix + surname
/// Y     3265  0.034324  surname and single-char given name forming a word
/// XD     926  0.009735  (surname and given-1 forming a word) + given-2
/// </code>
/// Patterns are tried in array order; the first one that matches at the current
/// position and survives the exclusion rules wins. Older exclusion rules
/// (identical characters required for EE/BEE, bare surname must be followed by a
/// suffix, all-foreign-character names, low-frequency roles) are disabled.
/// </remarks>
/// <param name="personDict">Person-role dictionary used for frequency lookups
/// and scoring.</param>
/// <returns>Always true.</returns>
public bool PersonRecognize(WordDictionary personDict)
{
    string[] sPatterns = { "BBCD", "BBC", "BBE", "BBZ", "BCD", "BEE", "BE", "BG", "BXD", "BZ", "CDCD", "CD", "EE", "FB", "Y", "XD", "" };
    double[] dFactor = { 0.003606, 0.000021, 0.001314, 0.000315, 0.656624, 0.000021, 0.146116, 0.009136, 0.000042, 0.038971, 0, 0.090367, 0.000273, 0.009157, 0.034324, 0.009735, 0 };
    // The trailing 0 is the sentinel that terminates the pattern scan.
    int[] nPatternLen = { 4, 3, 3, 3, 3, 3, 2, 2, 3, 2, 4, 2, 2, 2, 1, 2, 0 };

    // Encode the tag sequence as a string; index 0 is a 'z' placeholder so that
    // string index == tag index.
    StringBuilder sb = new StringBuilder();
    sb.Append('z');
    int i;
    for (i = 1; m_nBestTag[i] > -1; i++)
    {
        sb.Append(Convert.ToChar(m_nBestTag[i] + Convert.ToInt32('A')));
    }
    string sPOS = sb.ToString();

    int j = 1; // Current position in the tag sequence
    while (j < i)
    {
        bool bMatched = false;
        for (int k = 0; !bMatched && nPatternLen[k] > 0; k++)
        {
            int nLen = nPatternLen[k];

            // Role tags are opaque codes, so compare them ordinally; a
            // culture-sensitive compare may treat some code points as ignorable.
            // A candidate directly preceded or followed by the "·" separator is
            // rejected.
            if (string.CompareOrdinal(sPatterns[k], 0, sPOS, j, nLen) != 0
                || string.Equals(m_sWords[j - 1], "·", StringComparison.Ordinal)
                || string.Equals(m_sWords[j + nLen], "·", StringComparison.Ordinal))
            {
                continue;
            }

            // Rule 1 for exclusion: prefix + surname followed by a given-name
            // or suffix role means the (prefix + surname) rule does not apply.
            // The bound check covers "FB" matching on the last two tags, where
            // sPOS[j + 2] does not exist.
            if (sPatterns[k] == "FB"
                && j + 2 < sPOS.Length
                && (sPOS[j + 2] == 'E' || sPOS[j + 2] == 'C' || sPOS[j + 2] == 'G'))
            {
                continue;
            }

            // Assemble the candidate name from the matched words.
            string sPersonName = null;
            int nLittleFreqCount = 0; // Only consumed by a disabled low-frequency rule
            for (int nPos = j; nPos < j + nLen; nPos++)
            {
                if (m_nBestTag[nPos] < 4 && personDict.GetFrequency(m_sWords[nPos], m_nBestTag[nPos]) < Predefine.LITTLE_FREQUENCY)
                {
                    nLittleFreqCount++;
                }
                sPersonName += m_sWords[nPos];
            }

            if (sPatterns[k] == "CDCD")
            {
                // CDCD (given-1 + given-2 twice) is itself an exclusion pattern,
                // typically a transliterated name. If the candidate contains
                // foreign-name characters the span is skipped; otherwise the
                // exclusion does not apply and the remaining patterns are still
                // tried at the same position.
                if (Utility.GetForeignCharCount(sPersonName) > 0)
                {
                    j += nLen - 1;
                }
                continue;
            }

            // Record the candidate: start position, end position and score.
            m_nUnknownWords[m_nUnknownWordsCount, 0] = m_nWordPosition[j];
            m_nUnknownWords[m_nUnknownWordsCount, 1] = m_nWordPosition[j + nLen];
            // -log(factor) is the pattern's cost; the role scores are added to it.
            m_dWordsPossibility[m_nUnknownWordsCount] = -Math.Log(dFactor[k]) + ComputePossibility(j, nLen, personDict);
            m_nUnknownWordsCount += 1;

            j += nLen;
            bMatched = true;
        }

        if (!bMatched)
        {
            // Nothing matched here; move on by one tag.
            j += 1;
        }
    }

    return true;
}