public static void TestBiGraphGenerate()
{
    // Load the core dictionary; abort if it cannot be read.
    WordDictionary coreDict = new WordDictionary();
    if (!coreDict.Load(coreDictFile))
    {
        Console.WriteLine("coreDict 字典装入错误!");
        return;
    }

    // Load the bigram dictionary; abort if it cannot be read.
    WordDictionary biDict = new WordDictionary();
    if (!biDict.Load(biDictFile))
    {
        Console.WriteLine("字典装入错误!");
        return;
    }

    // Wrap the sample sentence with the begin/end sentinels.
    string sentence = Predefine.SENTENCE_BEGIN + @"他说的确实在理" + Predefine.SENTENCE_END;

    // Atom segmentation.
    List<AtomNode> atoms = Segment.AtomSegment(sentence);

    // Look up the dictionary and collect every candidate word into the word net.
    RowFirstDynamicArray<ChainContent> wordNet = Segment.GenerateWordNet(atoms, coreDict);

    // Build the graph of all adjacent word pairs and print it.
    ColumnFirstDynamicArray<ChainContent> biGraph =
        Segment.BiGraphGenerate(wordNet, 0.1, biDict, coreDict);
    Console.WriteLine(biGraph.ToString());
}
/// <summary>
/// Creates a segmenter backed by the given dictionaries.
/// </summary>
/// <param name="biDict">Bigram (two-word) frequency dictionary.</param>
/// <param name="coreDict">Core word dictionary.</param>
public Segment(WordDictionary biDict, WordDictionary coreDict)
{
    this.biDict = biDict;
    this.coreDict = coreDict;

    // Fresh N-shortest-path solvers for the raw and the optimized graphs.
    rawNShortPath = new NShortPath();
    optNShortPath = new NShortPath();
}
/// <summary>
/// Creates a word segmenter: core/bigram dictionaries, POS tagger,
/// unknown-word recognizers and the underlying Segment engine.
/// </summary>
/// <param name="SmoothingParameter">Smoothing parameter used during segmentation.</param>
public WordSegment(double SmoothingParameter)
{
    m_dictCore = new WordDictionary();
    m_dictBigram = new WordDictionary();
    m_POSTagger = new Span();

    // Unknown-word recognizers: person, transliterated person, place names.
    m_uPerson = new UnknowWord();
    m_uTransPerson = new UnknowWord();
    m_uPlace = new UnknowWord();

    // Segmentation engine wired to the two dictionaries; forward its events.
    m_Seg = new Segment(m_dictBigram, m_dictCore);
    m_Seg.OnSegmentEvent += new SegmentEventHandler(this.OnSegmentEventHandler);

    m_dSmoothingPara = SmoothingParameter;//Smoothing parameter
}
//Unknown word recognition.
//pWordSegResult: word segmentation result to be role-tagged;
//graphOptimum:   the optimized segmentation graph, updated in place;
//atomSegment:    the original atom segmentation (used to map character
//                offsets back to atom indices);
//dictCore:       the core dictionary used by the role tagger.
//Returns true (always).
public bool Recognition(WordResult[] pWordSegResult, RowFirstDynamicArray<ChainContent> graphOptimum, List<AtomNode> atomSegment, WordDictionary dictCore)
{
    ChainItem<ChainContent> item;
    int nStartPos = 0, j = 0, nAtomStart, nAtomEnd, nPOSOriginal;
    double dValue;

    //Tag the segmentation with unknown recognition roles according the core
    //dictionary and unknown recognition dictionary
    m_roleTag.POSTagging(pWordSegResult, dictCore, m_dict);

    // For each recognized unknown word, convert its character span
    // (m_nUnknownWords[i, 0..1]) into an atom-index span [nAtomStart, nAtomEnd)
    // by walking atomSegment and accumulating word lengths.
    for (int i = 0; i < m_roleTag.m_nUnknownWordsCount; i++)
    {
        // Advance to the atom where the unknown word starts.
        while (j < atomSegment.Count && nStartPos < m_roleTag.m_nUnknownWords[i, 0])
            nStartPos += atomSegment[j++].sWord.Length;
        nAtomStart = j;

        // Advance to the atom just past the unknown word's end.
        while (j < atomSegment.Count && nStartPos < m_roleTag.m_nUnknownWords[i, 1])
            nStartPos += atomSegment[j++].sWord.Length;
        nAtomEnd = j;

        if (nAtomStart < nAtomEnd)
        {
            item = graphOptimum.GetElement(nAtomStart, nAtomEnd);
            if (item != null)
            {
                dValue = item.Content.eWeight;
                nPOSOriginal = item.Content.nPOS; // read but not used afterwards (kept from the C original)
            }
            else
                dValue = Predefine.INFINITE_VALUE; // no existing edge: treat as infinitely costly

            //Set the element with less frequency (lower -log possibility wins)
            if (dValue > m_roleTag.m_dWordsPossibility[i])
                graphOptimum.SetElement(nAtomStart, nAtomEnd, new ChainContent(m_roleTag.m_dWordsPossibility[i], m_nPOS, m_sUnknownFlags));
        }
    }
    return true;
}
//====================================================================
// Generate the bigram graph between every pair of adjacent candidate
// words in the word net. Each edge carries the smoothed -log
// probability of the word pair.
//
// aWord:      the word net (row-first sparse matrix of candidate words)
// smoothPara: smoothing parameter a in 0 < a < 1
// biDict:     bigram (two-word) frequency dictionary
// coreDict:   core dictionary (frequency source for unknown words)
// Returns:    column-first sparse matrix; cell (cur, next) holds the
//             bigram cost, the current word's POS and the joined pair
//             string.
//====================================================================
public static ColumnFirstDynamicArray<ChainContent> BiGraphGenerate(
    RowFirstDynamicArray<ChainContent> aWord, double smoothPara, WordDictionary biDict, WordDictionary coreDict)
{
    ColumnFirstDynamicArray<ChainContent> aBiWordNet = new ColumnFirstDynamicArray<ChainContent>();
    ChainItem<ChainContent> pCur, pNextWords;
    int nTwoWordsFreq = 0, nCurWordIndex, nNextWordIndex;
    double dCurFreqency, dValue, dTemp;
    string sTwoWords;
    StringBuilder sb = new StringBuilder();

    //Record the position map of possible words
    int[] m_npWordPosMapTable = PreparePositionMap(aWord);

    pCur = aWord.GetHead();
    while (pCur != null)
    {
        if (pCur.Content.nPOS >= 0)
            //It's not an unknown words
            dCurFreqency = pCur.Content.eWeight;
        else
            //Unknown words
            dCurFreqency = coreDict.GetFrequency(pCur.Content.sWord, 2);

        //Get the next words which begin with pCur.col
        //(note the row/column correspondence: a word ending at column c is
        //followed by words whose row is c)
        pNextWords = aWord.GetFirstElementOfRow(pCur.col);
        while (pNextWords != null && pNextWords.row == pCur.col)
        {
            // Join "current WORD_SEGMENTER next" to form the bigram key.
            sb.Remove(0, sb.Length);
            sb.Append(pCur.Content.sWord);
            sb.Append(Predefine.WORD_SEGMENTER);
            sb.Append(pNextWords.Content.sWord);
            sTwoWords = sb.ToString();

            //Two linked Words frequency
            nTwoWordsFreq = biDict.GetFrequency(sTwoWords, 3);

            //Smoothing
            dTemp = 1.0 / Predefine.MAX_FREQUENCE;

            //-log{a*P(Ci-1)+(1-a)P(Ci|Ci-1)} Note 0<a<1
            dValue = -Math.Log(smoothPara * (1.0 + dCurFreqency) / (Predefine.MAX_FREQUENCE + 80000.0) +
                (1.0 - smoothPara) * ((1.0 - dTemp) * nTwoWordsFreq / (1.0 + dCurFreqency) + dTemp));

            //Unknown words: P(Wi|Ci);while known words:1
            //(for unknown words nPOS is negative and already holds a cost term)
            if (pCur.Content.nPOS < 0)
                dValue += pCur.Content.nPOS;

            //Get the position index of current word in the position map table
            nCurWordIndex = Utility.BinarySearch(pCur.row * Predefine.MAX_SENTENCE_LEN + pCur.col, m_npWordPosMapTable);
            nNextWordIndex = Utility.BinarySearch(pNextWords.row * Predefine.MAX_SENTENCE_LEN +
                pNextWords.col, m_npWordPosMapTable);

            aBiWordNet.SetElement(nCurWordIndex, nNextWordIndex, new ChainContent(dValue, pCur.Content.nPOS, sTwoWords));
            pNextWords = pNextWords.next; //Get next word
        }
        pCur = pCur.next;
    }
    return aBiWordNet;
}
/// <summary>
/// Imports an external dictionary into a .dct dictionary. Frequencies of
/// imported words are scaled down by a ratio derived from the words both
/// dictionaries have in common (or by a caller-supplied fixed ratio).
/// </summary>
/// <param name="ImportDicFile">External dictionary file name.</param>
/// <param name="ImportEncoding">Encoding of the external dictionary file.</param>
/// <param name="SourceDicFile">Source .dct file name.</param>
/// <param name="DestDicFile">Destination .dct file name.</param>
/// <param name="DicFormat">Format of the external dictionary.</param>
/// <param name="OddLines">Lines that are invalid and not present in the source dictionary.</param>
/// <param name="AvgFrqRate">Average frequency ratio of the imported file.</param>
/// <param name="ImportFrqRate">Fixed frequency divisor; when &lt;= 0, AvgFrqRate is used instead.</param>
/// <returns>Number of words imported.</returns>
public static int ImportDictionary(string ImportDicFile, Encoding ImportEncoding, string SourceDicFile,
    string DestDicFile, DictionaryFormat DicFormat, out string[] OddLines, out double AvgFrqRate,
    double ImportFrqRate = 0)
{
    // Diff the external file against the source dictionary.
    double maxFrqRate, minFrqRate;
    WordDictionary.DicWordInfo[] newWords;
    WordDictionary.DicWordInfo[] existWords;
    FindDifferent(ImportDicFile, ImportEncoding, DicFormat, SourceDicFile,
        out OddLines, out newWords, out existWords, out maxFrqRate, out minFrqRate, out AvgFrqRate);

    // Load the source dictionary.
    WordDictionary dictionary = new WordDictionary();
    if (!dictionary.Load(SourceDicFile))
        throw new Exception("load source dic file fail");

    // Add each new word, scaling its frequency by the chosen ratio.
    foreach (WordDictionary.DicWordInfo word in newWords)
    {
        double rate = ImportFrqRate <= 0 ? AvgFrqRate : ImportFrqRate;
        int frequency = Convert.ToInt32(word.Frequence / rate);
        dictionary.AddWord(word.Word, word.Pos, frequency);
    }

    // Persist the merged dictionary and release its resources.
    dictionary.Save(DestDicFile);
    dictionary.ReleaseDict();
    return newWords.Length;
}
//====================================================================
//Merge dict2 into current dictionary and the frequency ratio from dict2 and current dict is nRatio
//
// Both dictionaries must have no pending modifications (modifyTable == null),
// otherwise the merge is refused. Per Chinese-character bucket the two
// sorted word lists are merged in a single pass:
//   - word in both:        frequency := (nRatio*f1 + f2) / (nRatio + 1)
//   - only in current:     frequency := nRatio*f1 / (nRatio + 1)
//   - only in dict2:       added via AddItem when f2 > (nRatio + 1)/10
// Returns true on success, false when a modify table exists.
//====================================================================
public bool Merge(WordDictionary dict2, int nRatio)
{
    int i, j, k, nCmpValue;
    string sWord;

    //Modification made, not to output when modify table exists.
    if (modifyTable != null || dict2.modifyTable != null)
    {
        return(false);
    }

    // One bucket per leading Chinese character.
    for (i = 0; i < Predefine.CC_NUM; i++)
    {
        j = 0;
        k = 0;
        while (j < indexTable[i].nCount && k < dict2.indexTable[i].nCount)
        {
            nCmpValue = Utility.CCStringCompare(indexTable[i].WordItems[j].sWord, dict2.indexTable[i].WordItems[k].sWord);
            if (nCmpValue == 0) //Same Words and determine the different handle
            {
                // Same surface string: break the tie on POS so that only
                // identical (word, POS) pairs are treated as equal.
                if (indexTable[i].WordItems[j].nPOS < dict2.indexTable[i].WordItems[k].nPOS)
                {
                    nCmpValue = -1;
                }
                else if (indexTable[i].WordItems[j].nPOS > dict2.indexTable[i].WordItems[k].nPOS)
                {
                    nCmpValue = 1;
                }
            }

            if (nCmpValue == 0)
            {
                // Present in both: weighted average of the two frequencies.
                indexTable[i].WordItems[j].nFrequency = (nRatio * indexTable[i].WordItems[j].nFrequency +
                    dict2.indexTable[i].WordItems[k].nFrequency) / (nRatio + 1);
                j += 1;
                k += 1;
            }
            //Get next word in the current dictionary
            else if (nCmpValue < 0)
            {
                // Only in the current dictionary: scale its frequency down.
                indexTable[i].WordItems[j].nFrequency = (nRatio * indexTable[i].WordItems[j].nFrequency) / (nRatio + 1);
                j += 1;
            }
            else //Get next word in the second dictionary
            {
                // Only in dict2: add it if the frequency passes the threshold.
                if (dict2.indexTable[i].WordItems[k].nFrequency > (nRatio + 1) / 10)
                {
                    // Prepend the bucket's Chinese character to rebuild the full word.
                    sWord = string.Format("{0}{1}", Utility.CC_ID2Char(i).ToString(), dict2.indexTable[i].WordItems[k].sWord);
                    AddItem(sWord, dict2.indexTable[i].WordItems[k].nPOS, dict2.indexTable[i].WordItems[k].nFrequency / (nRatio + 1));
                }
                k += 1;
            }
        }

        //words in current dictionary are left
        while (j < indexTable[i].nCount)
        {
            indexTable[i].WordItems[j].nFrequency = (nRatio * indexTable[i].WordItems[j].nFrequency) / (nRatio + 1);
            j += 1;
        }

        //words in Dict2 are left
        while (k < dict2.indexTable[i].nCount)
        {
            if (dict2.indexTable[i].WordItems[k].nFrequency > (nRatio + 1) / 10)
            {
                sWord = string.Format("{0}{1}", Utility.CC_ID2Char(i).ToString(), dict2.indexTable[i].WordItems[k].sWord);
                AddItem(sWord, dict2.indexTable[i].WordItems[k].nPOS, dict2.indexTable[i].WordItems[k].nFrequency / (nRatio + 1));
            }
            k += 1;
        }
    }
    return(true);
}
// Place-name recognition over the best role-tag sequence (m_nBestTag).
// Tags 1 and 2 trigger a candidate span; the span is extended through
// runs of tag 2 and then tag 3, with a penalty accumulated for over-long
// runs. Each accepted span is recorded into m_nUnknownWords (character
// positions) and m_dWordsPossibility (role cost + log penalty).
// dictCore is not used in this method; placeDict supplies role frequencies
// via ComputePossibility. Always returns true.
public bool PlaceRecognize(WordDictionary dictCore, WordDictionary placeDict)
{
    int nStart = 1, nEnd = 1, i = 1, nTemp;
    double dPanelty = 1.0; //Panelty value
    while (m_nBestTag[i] > -1)
    {
        if (m_nBestTag[i] == 1) //1 Trigger the recognition procession
        {
            nStart = i;
            nEnd = nStart + 1;
            //=========== by zhenyulu: nEnd = nStart + 1 here is somewhat forced;
            //compensated for further below
            while (m_nBestTag[nEnd] == 1) //
            {
                if (nEnd > nStart + 1)
                    dPanelty += 1.0; // penalize each extra leading tag-1
                nEnd++;
            }
            while (m_nBestTag[nEnd] == 2) //2,12,22
                nEnd++;
            nTemp = nEnd;
            while (m_nBestTag[nEnd] == 3)
            {
                if (nEnd > nTemp)
                    dPanelty += 1.0; // penalize each extra trailing tag-3
                nEnd++;
            }
        }
        else if (m_nBestTag[i] == 2) //1,11,21 Trigger the recognition
        {
            dPanelty += 1.0; // starting directly on tag 2 is itself penalized
            nStart = i;
            nEnd = nStart + 1;
            while (m_nBestTag[nEnd] == 2) //2
                nEnd++;
            nTemp = nEnd;
            while (m_nBestTag[nEnd] == 3) //2
            {
                if (nEnd > nTemp)
                    dPanelty += 1.0;
                nEnd++;
            }
        }
        if (nEnd > nStart)
        {
            //=========== by zhenyulu: undo the side effect of the forced
            //nEnd = nStart + 1 above when nEnd ran past the last word
            if (m_sWords[nEnd] == null)
                nEnd--;
            m_nUnknownWords[m_nUnknownWordsCount, 0] = m_nWordPosition[nStart];
            m_nUnknownWords[m_nUnknownWordsCount, 1] = m_nWordPosition[nEnd];
            m_dWordsPossibility[m_nUnknownWordsCount++] = ComputePossibility(nStart, nEnd - nStart + 1, placeDict) + Math.Log(dPanelty);
            nStart = nEnd;
        }
        // Skip past the consumed span, or advance one tag if nothing matched.
        if (i < nEnd)
            i = nEnd;
        else
            i = i + 1;
    }
    return true;
}
// Loads a run of words from pWordItems (starting at nIndex) into the
// tagger's working arrays (m_sWords, m_nWordPosition, m_nTags,
// m_dFrequency), collecting each word's candidate POS tags and -log
// costs:
//   - for unknown-word tag types (m_tagType != TT_NORMAL): from
//     dictUnknown, plus sentinel tags for 始##始 / 末##末, plus a
//     summed-frequency "other" tag (0) from dictCore;
//   - for normal POS tagging: from the word item itself or from dictCore.
// Words flagged in dictUnknown (transliteration case) are split into two
// pieces across consecutive iterations via bSplit.
// Returns the next start position in pWordItems, or -1 when the end of
// pWordItems has been reached.
private int GetFrom(WordResult[] pWordItems, int nIndex, WordDictionary dictCore, WordDictionary dictUnknown)
{
    WordInfo info;
    // aPOS/aFreq are unused leftovers from the C original's GetHandle API.
    int[] aPOS = new int[Predefine.MAX_POS_PER_WORD];
    int[] aFreq = new int[Predefine.MAX_POS_PER_WORD];
    int nFreq = 0, j, nRetPos = 0, nWordsIndex = 0;
    bool bSplit = false; //Need to split in Transliteration recognition
    int i = 1, nPOSCount;
    string sCurWord; //Current word
    nWordsIndex = i + nIndex - 1;
    for (i = 1; i < Predefine.MAX_WORDS_PER_SENTENCE && nWordsIndex < pWordItems.Length; i++)
    {
        if (m_tagType == TAG_TYPE.TT_NORMAL || !dictUnknown.IsExist(pWordItems[nWordsIndex].sWord, 44))
        {
            m_sWords[i] = pWordItems[nWordsIndex].sWord; //store current word
            m_nWordPosition[i + 1] = m_nWordPosition[i] + m_sWords[i].Length;
        }
        else
        {
            // Word exists in the unknown dictionary with tag 44: emit its
            // first character now and the remainder on the next iteration.
            if (!bSplit)
            {
                m_sWords[i] = pWordItems[nWordsIndex].sWord.Substring(0, 1); //store current word
                bSplit = true;
            }
            else
            {
                m_sWords[i] = pWordItems[nWordsIndex].sWord.Substring(1); //store current word
                bSplit = false;
            }
            m_nWordPosition[i + 1] = m_nWordPosition[i] + m_sWords[i].Length;
        }
        //Record the position of current word
        m_nStartPos = m_nWordPosition[i + 1]; //Move the Start POS to the ending
        if (m_tagType != TAG_TYPE.TT_NORMAL)
        {
            //Get the POSs from the unknown recognition dictionary
            sCurWord = m_sWords[i];
            // Transliterated person names after a Chinese character:
            // normalize separator characters before the lookup.
            if (m_tagType == TAG_TYPE.TT_TRANS_PERSON && i > 0 && m_sWords[i - 1] != null &&
                Utility.charType(m_sWords[i - 1].ToCharArray()[0]) == Predefine.CT_CHINESE)
            {
                if (m_sWords[i] == ".")
                    sCurWord = ".";
                else if (m_sWords[i] == "-")
                    sCurWord = "-";
            }
            info = dictUnknown.GetWordInfo(sCurWord);
            if (info != null)
            {
                nPOSCount = info.Count + 1;
                for (j = 0; j < info.Count; j++)
                {
                    //Get the POS set of sCurWord in the unknown dictionary
                    m_nTags[i, j] = info.POSs[j];
                    // -log of the smoothed emission probability for this role.
                    m_dFrequency[i, j] = -Math.Log((double)(1 + info.Frequencies[j])) +
                        Math.Log((double)(m_context.GetFrequency(0, info.POSs[j]) + nPOSCount));
                }
            }
            else
            {
                nPOSCount = 1;
                j = 0;
            }
            //Get the POS set of sCurWord in the core dictionary
            //We ignore the POS in the core dictionary and recognize them as other (0).
            //We add their frequency to get the possibility as POS 0
            if (string.Compare(m_sWords[i], "始##始") == 0)
            {
                m_nTags[i, j] = 100; // sentence-begin sentinel tag
                m_dFrequency[i, j] = 0;
                j++;
            }
            else if (string.Compare(m_sWords[i], "末##末") == 0)
            {
                m_nTags[i, j] = 101; // sentence-end sentinel tag
                m_dFrequency[i, j] = 0;
                j++;
            }
            else
            {
                //dictCore.GetHandle(m_sWords[i], &nCount, aPOS, aFreq);
                info = dictCore.GetWordInfo(m_sWords[i]);
                nFreq = 0;
                if (info != null)
                {
                    // Sum all frequencies of the word in the core dictionary.
                    for (int k = 0; k < info.Count; k++)
                    {
                        nFreq += info.Frequencies[k];
                    }
                    if (info.Count > 0)
                    {
                        m_nTags[i, j] = 0; // "other" role
                        //m_dFrequency[i][j]=(double)(1+nFreq)/(double)(m_context.GetFrequency(0,0)+1);
                        m_dFrequency[i, j] = -Math.Log((double)(1 + nFreq)) +
                            Math.Log((double)(m_context.GetFrequency(0, 0) + nPOSCount));
                        j++;
                    }
                }
            }
        }
        else //For normal POS tagging
        {
            j = 0;
            //Get the POSs from the unknown recognition dictionary
            if (pWordItems[nWordsIndex].nPOS > 0)
            {
                //The word has only one POS value;
                //its POS and frequency are recorded in the item itself.
                m_nTags[i, j] = pWordItems[nWordsIndex].nPOS;
                m_dFrequency[i, j] = -Math.Log(pWordItems[nWordsIndex].dValue) +
                    Math.Log((double)(m_context.GetFrequency(0, m_nTags[i, j]) + 1));
                if (m_dFrequency[i, j] < 0) //Not permit the value less than 0
                    m_dFrequency[i, j] = 0;
                j++;
            }
            else
            {
                //The word has multiple POSs, we should retrieve the information from Core Dictionary
                if (pWordItems[nWordsIndex].nPOS < 0)
                {
                    //A negative nPOS encodes a single known POS; record it
                    //with the precomputed value.
                    m_nTags[i, j] = -pWordItems[nWordsIndex].nPOS;
                    m_dFrequency[i, j++] = pWordItems[nWordsIndex].dValue;
                }
                //dictCore.GetHandle(m_sWords[i], &nCount, aPOS, aFreq);
                info = dictCore.GetWordInfo(m_sWords[i]);
                if (info != null)
                {
                    nPOSCount = info.Count;
                    for (; j < info.Count; j++)
                    {
                        //Get the POS set of the word in the core dictionary
                        m_nTags[i, j] = info.POSs[j];
                        m_dFrequency[i, j] = -Math.Log(1 + info.Frequencies[j]) +
                            Math.Log(m_context.GetFrequency(0, m_nTags[i, j]) + nPOSCount);
                    }
                }
            }
        }
        if (j == 0)
        {
            //We donot know the POS, so we have to guess them according lexical knowledge
            GuessPOS(i, out j); //Guess the POS of current word
        }
        m_nTags[i, j] = -1; //Set the ending POS
        // NOTE(review): m_nTags[i, j] was just set to -1 above, so the second
        // clause compares -1 against CT_SENTENCE_BEGIN; it may have been meant
        // to test m_nTags[i, 0] — kept as in the original, confirm upstream.
        if (j == 1 && m_nTags[i, j] != Predefine.CT_SENTENCE_BEGIN) //No ambuguity
        {
            //No ambuguity, so we can break from the loop
            i++;
            m_sWords[i] = null;
            break;
        }
        // Only advance the source index once both halves of a split word
        // have been emitted.
        if (!bSplit)
            nWordsIndex++;
    }
    if (nWordsIndex == pWordItems.Length)
        nRetPos = -1; //Reaching ending
    if (m_nTags[i - 1, 1] != -1) //||m_sWords[i][0]==0
    {
        //Set the virtual ending for words like "张/华/平"
        if (m_tagType != TAG_TYPE.TT_NORMAL)
            m_nTags[i, 0] = 101;
        else
            m_nTags[i, 0] = 1;
        m_dFrequency[i, 0] = 0;
        m_sWords[i] = null; //Set virtual ending
        m_nTags[i++, 1] = -1;
    }
    m_nCurLength = i; //The current word count
    if (nRetPos != -1)
        return nWordsIndex + 1; //Next start position
    return -1; //Reaching ending
}
//====================================================================
// Func Name  : GenerateWordNet
// Description: Generate the segmentation word net according
//              the original sentence
// Parameters : atomSegment: the atom-segmented sentence
//              coreDict   : core dictionary
// Returns    : row-first sparse matrix; cell (i, j) holds a candidate
//              word covering atoms i..j-1 with its weight and POS
//====================================================================
public static RowFirstDynamicArray<ChainContent> GenerateWordNet(List<AtomNode> atomSegment, WordDictionary coreDict)
{
    string sWord = "", sMaxMatchWord;
    int nPOSRet, nPOS, nTotalFreq;
    double dValue = 0;
    RowFirstDynamicArray<ChainContent> m_segGraph = new RowFirstDynamicArray<ChainContent>();
    m_segGraph.SetEmpty();

    // Pass 1: store each single atom into m_segGraph.
    for (int i = 0; i < atomSegment.Count; i++)//Init the cost array
    {
        if (atomSegment[i].nPOS == Predefine.CT_CHINESE)
            m_segGraph.SetElement(i, i + 1, new ChainContent(0, 0, atomSegment[i].sWord));
        else
        {
            // Non-Chinese atom: map it to a placeholder word and POS code.
            sWord = atomSegment[i].sWord;//init the word
            dValue = Predefine.MAX_FREQUENCE;
            switch (atomSegment[i].nPOS)
            {
                case Predefine.CT_INDEX:
                case Predefine.CT_NUM:
                    nPOS = -27904;//'m'*256
                    sWord = "未##数"; // "unknown number" placeholder
                    dValue = 0;
                    break;
                case Predefine.CT_DELIMITER:
                    nPOS = 30464;//'w'*256;
                    break;
                case Predefine.CT_LETTER:
                    nPOS = -28280; // -'n' * 256 - 'x';
                    dValue = 0;
                    sWord = "未##串"; // "unknown string" placeholder
                    break;
                case Predefine.CT_SINGLE://12021-2129-3121
                    if (Regex.IsMatch(atomSegment[i].sWord, @"^(-?\d+)(\.\d+)?$")) // matches a (possibly negative) decimal number
                    {
                        nPOS = -27904;//'m'*256
                        sWord = "未##数";
                    }
                    else
                    {
                        nPOS = -28280; // -'n' * 256 - 'x'
                        sWord = "未##串";
                    }
                    dValue = 0;
                    break;
                default:
                    nPOS = atomSegment[i].nPOS;//'?'*256;
                    break;
            }
            m_segGraph.SetElement(i, i + 1, new ChainContent(dValue, nPOS, sWord));//init the link with minimum
        }
    }

    // Pass 2: store every multi-atom word found in the core dictionary.
    for (int i = 0; i < atomSegment.Count; i++)//All the word
    {
        sWord = atomSegment[i].sWord;//Get the current atom
        int j = i + 1;
        while (j < atomSegment.Count && coreDict.GetMaxMatch(sWord, out sMaxMatchWord, out nPOSRet))
        {
            if (sMaxMatchWord == sWord) // exactly the word we are looking for
            {
                WordInfo info = coreDict.GetWordInfo(sWord); // the word may carry several POS tags

                // Sum all frequencies of the word.
                nTotalFreq = 0;
                for (int k = 0; k < info.Count; k++)
                    nTotalFreq += info.Frequencies[k];

                // Suppress special two-character words like 年内/月末 when
                // preceded by a number (e.g. 1年内, 1999年末).
                if (sWord.Length == 2 && (sWord.StartsWith("年") || sWord.StartsWith("月")) && i >= 1 &&
                    (Utility.IsAllNum(atomSegment[i - 1].sWord) || Utility.IsAllChineseNum(atomSegment[i - 1].sWord)))
                {
                    //1年内、1999年末
                    if ("末内中底前间初".IndexOf(sWord.Substring(1)) >= 0)
                        break;
                }

                // Unambiguous POS is stored directly; otherwise POS is 0.
                if (info.Count == 1)
                    m_segGraph.SetElement(i, j, new ChainContent(nTotalFreq, info.POSs[0], sWord));
                else
                    m_segGraph.SetElement(i, j, new ChainContent(nTotalFreq, 0, sWord));
            }
            // Extend the candidate with the next atom and retry.
            sWord += atomSegment[j++].sWord;
        }
    }
    return m_segGraph;
}
// Person-name recognition over the best role-tag sequence.
// The tag sequence is rendered as a string of letters ('A' + tag) and
// scanned against a fixed set of role patterns; each match is recorded
// into m_nUnknownWords / m_dWordsPossibility with cost
// -log(pattern factor) + role possibility. Always returns true.
public bool PersonRecognize(WordDictionary personDict)
{
    StringBuilder sb = new StringBuilder();
    int i;
    string sPOS = "z", sPersonName;
    // Patterns, their probability factors and their lengths share indices;
    // the empty pattern / length 0 terminates the scan.
    string[] sPatterns = { "BBCD", "BBC", "BBE", "BBZ", "BCD", "BEE", "BE", "BG", "BXD", "BZ", "CDCD", "CD", "EE", "FB", "Y", "XD", "" };
    double[] dFactor = { 0.003606, 0.000021, 0.001314, 0.000315, 0.656624, 0.000021, 0.146116, 0.009136, 0.000042, 0.038971, 0, 0.090367, 0.000273, 0.009157, 0.034324, 0.009735, 0 };
    /*------------------------------------
       Pattern counts and factors (from the training corpus):
       BBCD 343 0.003606    BBC 2 0.000021      BBE 125 0.001314
       BBZ 30 0.000315      BCD 62460 0.656624  BEE 0 0.000000
       BE 13899 0.146116    BG 869 0.009136     BXD 4 0.000042
       BZ 3707 0.038971     CD 8596 0.090367    EE 26 0.000273
       FB 871 0.009157      Y 3265 0.034324     XD 926 0.009735

       The person recognition patterns (B=surname, C/D=given-name chars,
       E=single-char given name, Z=two-char given name as one word,
       G=suffix, F=prefix, X=surname+first char fused, Y=surname+name fused):
       BBCD: surname+surname+given1+given2
       BBE:  surname+surname+single given name
       BBZ:  surname+surname+two-char given name as one word
       BCD:  surname+given1+given2
       BE:   surname+single given name
       BEE:  surname+single+single (e.g. 韩磊磊)
       BG:   surname+suffix
       BXD:  surname+(fused surname/first char)+last char
       BZ:   surname+two-char given name as one word
       B:    surname
       CD:   given1+given2
       EE:   single+single
       FB:   prefix+surname
       XD:   (fused surname/first char)+last char
       Y:    surname+single name fused as one word
    ------------------------------------*/
    int[] nPatternLen = { 4, 3, 3, 3, 3, 3, 2, 2, 3, 2, 4, 2, 2, 2, 1, 2, 0 };

    //Convert the best-tag sequence to a string of letters ('z' is a pad at index 0)
    sb.Append('z');
    for (i = 1; m_nBestTag[i] > -1; i++)
        sb.Append(Convert.ToChar(m_nBestTag[i] + Convert.ToInt32('A')));
    sPOS = sb.ToString();

    int j = 1, k, nPos;
    //Find the proper pattern from the first POS
    int nLittleFreqCount; //Counter for the person name roles with little frequency
    bool bMatched = false;
    while (j < i)
    {
        bMatched = false;
        for (k = 0; !bMatched && nPatternLen[k] > 0; k++)
        {
            // Pattern must match at position j and must not touch a '·'
            // separator on either side.
            if (string.Compare(sPatterns[k], 0, sPOS, j, nPatternLen[k]) == 0 && string.Compare(m_sWords[j - 1], "·") != 0 &&
                string.Compare(m_sWords[j + nPatternLen[k]], "·") != 0)
            {
                //Find the proper pattern k
                if (string.Compare(sPatterns[k], "FB") == 0 && (sPOS[j + 2] == 'E' || sPOS[j + 2] == 'C' || sPOS[j + 2] == 'G'))
                {
                    //Rule 1 for exclusion: prefix+surname+given-name makes
                    //the (prefix+surname) pattern invalid.
                    continue;
                }
                /* Disabled rules kept from the C original:
                if((strcmp(sPatterns[k],"BEE")==0||strcmp(sPatterns[k],"EE")==0)&&strcmp(m_sWords[j+nPatternLen[k]-1],m_sWords[j+nPatternLen[k]-2])!=0)
                {//Rule 2 for exclusion: for BEE/EE the two E chars must be equal (e.g. 韩磊磊), otherwise invalid
                continue;
                }
                if(strcmp(sPatterns[k],"B")==0&&m_nBestTag[j+1]!=12)
                {//Rule 3 for exclusion: a lone surname is invalid unless followed by a suffix (e.g. 江主席, 刘大娘)
                continue;
                }
                */
                //Get the possible name
                nPos = j; //Record the person position in the tag sequence
                sPersonName = null;
                nLittleFreqCount = 0; //Record the number of roles with little frequency
                while (nPos < j + nPatternLen[k])
                {
                    //Get the possible person name
                    //
                    if (m_nBestTag[nPos] < 4 && personDict.GetFrequency(m_sWords[nPos], m_nBestTag[nPos]) < Predefine.LITTLE_FREQUENCY)
                        nLittleFreqCount++; //The counter increase
                    sPersonName += m_sWords[nPos];
                    nPos += 1;
                }
                /* Disabled foreign-name exclusion kept from the C original:
                if(IsAllForeign(sPersonName)&&personDict.GetFrequency(m_sWords[j],1)<LITTLE_FREQUENCY)
                {//Exclusion of foreign names: if all chars are foreign-name chars, the (given1+given2) pattern is invalid
                j+=nPatternLen[k]-1;
                continue;
                }
                */
                if (string.Compare(sPatterns[k], "CDCD") == 0)
                {
                    //CDCD (given1+given2+given1+given2) is itself an exclusion
                    //rule: e.g. 女高音歌唱家迪里拜尔演唱. It applies only when
                    //foreign-name characters are present; otherwise it is
                    //dropped (e.g. 黑妞白妞姐俩拔了头筹).
                    if (Utility.GetForeignCharCount(sPersonName) > 0)
                        j += nPatternLen[k] - 1;
                    continue;
                }
                /* Disabled rules kept from the C original:
                if(strcmp(sPatterns[k],"CD")==0&&IsAllForeign(sPersonName))
                {//all-foreign CD is skipped
                j+=nPatternLen[k]-1;
                continue;
                }
                if(nLittleFreqCount==nPatternLen[k]||nLittleFreqCount==3)
                //马哈蒂尔;小扎耶德与他的中国阿姨胡彩玲受华黎明大使之邀,
                //All roles appear with low frequency, so ignore them
                continue;
                */
                // Accept the candidate: record its span and possibility.
                m_nUnknownWords[m_nUnknownWordsCount, 0] = m_nWordPosition[j];
                m_nUnknownWords[m_nUnknownWordsCount, 1] = m_nWordPosition[j + nPatternLen[k]];
                m_dWordsPossibility[m_nUnknownWordsCount] = -Math.Log(dFactor[k]) + ComputePossibility(j, nPatternLen[k], personDict); //Mutiply the factor
                m_nUnknownWordsCount += 1;
                j += nPatternLen[k];
                bMatched = true;
            }
        }
        if (!bMatched) //Not matched, add j by 1
            j += 1;
    }
    return true;
}
// Place-name recognition over the best role-tag sequence (m_nBestTag).
// Tags 1 and 2 trigger a candidate span; the span is extended through
// runs of tag 2 and then tag 3, with a penalty accumulated for over-long
// runs. Each accepted span is recorded into m_nUnknownWords (character
// positions) and m_dWordsPossibility (role cost + log penalty).
// dictCore is not used in this method; placeDict supplies role frequencies
// via ComputePossibility. Always returns true.
public bool PlaceRecognize(WordDictionary dictCore, WordDictionary placeDict)
{
    int nStart = 1, nEnd = 1, i = 1, nTemp;
    double dPanelty = 1.0; //Panelty value
    while (m_nBestTag[i] > -1)
    {
        if (m_nBestTag[i] == 1) //1 Trigger the recognition procession
        {
            nStart = i;
            nEnd = nStart + 1;
            //=========== by zhenyulu: nEnd = nStart + 1 here is somewhat forced;
            //compensated for further below
            while (m_nBestTag[nEnd] == 1) //
            {
                if (nEnd > nStart + 1)
                {
                    dPanelty += 1.0; // penalize each extra leading tag-1
                }
                nEnd++;
            }
            while (m_nBestTag[nEnd] == 2)
            {
                //2,12,22
                nEnd++;
            }
            nTemp = nEnd;
            while (m_nBestTag[nEnd] == 3)
            {
                if (nEnd > nTemp)
                {
                    dPanelty += 1.0; // penalize each extra trailing tag-3
                }
                nEnd++;
            }
        }
        else if (m_nBestTag[i] == 2) //1,11,21 Trigger the recognition
        {
            dPanelty += 1.0; // starting directly on tag 2 is itself penalized
            nStart = i;
            nEnd = nStart + 1;
            while (m_nBestTag[nEnd] == 2)
            {
                //2
                nEnd++;
            }
            nTemp = nEnd;
            while (m_nBestTag[nEnd] == 3) //2
            {
                if (nEnd > nTemp)
                {
                    dPanelty += 1.0;
                }
                nEnd++;
            }
        }
        if (nEnd > nStart)
        {
            //=========== by zhenyulu: undo the side effect of the forced
            //nEnd = nStart + 1 above when nEnd ran past the last word
            if (m_sWords[nEnd] == null)
            {
                nEnd--;
            }
            m_nUnknownWords[m_nUnknownWordsCount, 0] = m_nWordPosition[nStart];
            m_nUnknownWords[m_nUnknownWordsCount, 1] = m_nWordPosition[nEnd];
            m_dWordsPossibility[m_nUnknownWordsCount++] = ComputePossibility(nStart, nEnd - nStart + 1, placeDict) + Math.Log(dPanelty);
            nStart = nEnd;
        }
        // Skip past the consumed span, or advance one tag if nothing matched.
        if (i < nEnd)
        {
            i = nEnd;
        }
        else
        {
            i = i + 1;
        }
    }
    return(true);
}
public static void TestBiSegment()
{
    // Test sentences paired (by index) with a description of what each exercises.
    List<string> sentence = new List<string>
    {
        @"他说的的确实在理",
        @"张华平3-4月份来北京开会",
        @"1.加强管理",
        @"他出生于1980年1月1日10点",
        @"他出生于甲子年",
        @"馆内陈列周恩来和邓颖超生前使用过的物品"
    };
    List<string> description = new List<string>
    {
        @"普通分词测试",
        @"数字切分",
        @"剔除多余的“.”",
        @"日期合并",
        @"年份识别",
        @"姓名识别"
    };

    // Load the core dictionary; abort on failure.
    WordDictionary coreDict = new WordDictionary();
    if (!coreDict.Load(coreDictFile))
    {
        Console.WriteLine("coreDict 字典装入错误!");
        return;
    }

    // Load the bigram dictionary; abort on failure.
    WordDictionary biDict = new WordDictionary();
    if (!biDict.Load(biDictFile))
    {
        Console.WriteLine("字典装入错误!");
        return;
    }

    for (int i = 0; i < sentence.Count; i++)
    {
        Console.WriteLine("\r\n============ {0} ============", description[i]);

        // Wrap the sentence with the begin/end sentinels.
        string padded = Predefine.SENTENCE_BEGIN + sentence[i] + Predefine.SENTENCE_END;

        // Show the atom segmentation.
        List<AtomNode> nodes = Segment.AtomSegment(padded);
        Console.WriteLine("原子切分:");
        for (int j = 0; j < nodes.Count; j++)
            Console.Write("{0}, ", nodes[j].sWord);

        // Run the full bigram segmentation and print every result path.
        Console.WriteLine("\r\n\r\n实际切分:");
        Segment segment = new Segment(biDict, coreDict);
        segment.BiSegment(padded, 0.1, 1);
        for (int k = 0; k < segment.m_pWordSeg.Count; k++)
        {
            for (int j = 0; j < segment.m_pWordSeg[k].Length; j++)
                Console.Write("{0}, ", segment.m_pWordSeg[k][j].sWord);
            Console.WriteLine();
        }
    }
}
public static void TestGenerateWordNet()
{
    // Load the core dictionary; abort on failure.
    WordDictionary coreDict = new WordDictionary();
    if (!coreDict.Load(coreDictFile))
    {
        Console.WriteLine("字典装入错误!");
        return;
    }

    // Wrap the sample sentence with the begin/end sentinels, atom-segment
    // it, build the word net and print it.
    string sentence = Predefine.SENTENCE_BEGIN + @"人民币现在很值钱" + Predefine.SENTENCE_END;
    List<AtomNode> atoms = Segment.AtomSegment(sentence);
    RowFirstDynamicArray<ChainContent> wordNet = Segment.GenerateWordNet(atoms, coreDict);
    Console.WriteLine(wordNet.ToString());
}
public static void TestDictionary()
{
    // Load the core dictionary (without applying the modify table).
    WordDictionary dict = new WordDictionary();
    if (!dict.Load(coreDictFile, false))
    {
        Console.WriteLine("Wrong!");
        return;
    }

    // Dump every entry indexed under the Chinese characters with IDs 2..5.
    for (int id = 2; id <= 5; id++)
    {
        Console.WriteLine("====================================\r\n汉字:{0}, ID :{1}\r\n", Utility.CC_ID2Char(id), id);
        Console.WriteLine(" 词长 频率 词性 词");
        for (int i = 0; i < dict.indexTable[id].nCount; i++)
        {
            Console.WriteLine("{0,5} {1,6} {2,5} ({3}){4}",
                dict.indexTable[id].WordItems[i].nWordLen,
                dict.indexTable[id].WordItems[i].nFrequency,
                Utility.GetPOSString(dict.indexTable[id].WordItems[i].nPOS),
                Utility.CC_ID2Char(id),
                dict.indexTable[id].WordItems[i].sWord);
        }
    }
}
//====================================================================
//Merge dict2 into current dictionary and the frequency ratio from dict2 and current dict is nRatio
//
// Both dictionaries must have no pending modifications (modifyTable == null),
// otherwise the merge is refused. Per Chinese-character bucket the two
// sorted word lists are merged in a single pass:
//   - word in both:        frequency := (nRatio*f1 + f2) / (nRatio + 1)
//   - only in current:     frequency := nRatio*f1 / (nRatio + 1)
//   - only in dict2:       added via AddItem when f2 > (nRatio + 1)/10
// Returns true on success, false when a modify table exists.
//====================================================================
public bool Merge(WordDictionary dict2, int nRatio)
{
    int i, j, k, nCmpValue;
    string sWord;

    //Modification made, not to output when modify table exists.
    if (modifyTable != null || dict2.modifyTable != null)
        return false;

    // One bucket per leading Chinese character.
    for (i = 0; i < Predefine.CC_NUM; i++)
    {
        j = 0;
        k = 0;
        while (j < indexTable[i].nCount && k < dict2.indexTable[i].nCount)
        {
            nCmpValue = Utility.CCStringCompare(indexTable[i].WordItems[j].sWord, dict2.indexTable[i].WordItems[k].sWord);
            if (nCmpValue == 0) //Same Words and determine the different handle
            {
                // Same surface string: break the tie on POS so that only
                // identical (word, POS) pairs are treated as equal.
                if (indexTable[i].WordItems[j].nPOS < dict2.indexTable[i].WordItems[k].nPOS)
                    nCmpValue = -1;
                else if (indexTable[i].WordItems[j].nPOS > dict2.indexTable[i].WordItems[k].nPOS)
                    nCmpValue = 1;
            }

            if (nCmpValue == 0)
            {
                // Present in both: weighted average of the two frequencies.
                indexTable[i].WordItems[j].nFrequency = (nRatio * indexTable[i].WordItems[j].nFrequency +
                    dict2.indexTable[i].WordItems[k].nFrequency) / (nRatio + 1);
                j += 1;
                k += 1;
            }
            //Get next word in the current dictionary
            else if (nCmpValue < 0)
            {
                // Only in the current dictionary: scale its frequency down.
                indexTable[i].WordItems[j].nFrequency = (nRatio * indexTable[i].WordItems[j].nFrequency) / (nRatio + 1);
                j += 1;
            }
            else //Get next word in the second dictionary
            {
                // Only in dict2: add it if the frequency passes the threshold.
                if (dict2.indexTable[i].WordItems[k].nFrequency > (nRatio + 1) / 10)
                {
                    // Prepend the bucket's Chinese character to rebuild the full word.
                    sWord = string.Format("{0}{1}", Utility.CC_ID2Char(i).ToString(), dict2.indexTable[i].WordItems[k].sWord);
                    AddItem(sWord, dict2.indexTable[i].WordItems[k].nPOS, dict2.indexTable[i].WordItems[k].nFrequency / (nRatio + 1));
                }
                k += 1;
            }
        }

        //words in current dictionary are left
        while (j < indexTable[i].nCount)
        {
            indexTable[i].WordItems[j].nFrequency = (nRatio * indexTable[i].WordItems[j].nFrequency) / (nRatio + 1);
            j += 1;
        }

        //words in Dict2 are left
        while (k < dict2.indexTable[i].nCount)
        {
            if (dict2.indexTable[i].WordItems[k].nFrequency > (nRatio + 1) / 10)
            {
                sWord = string.Format("{0}{1}", Utility.CC_ID2Char(i).ToString(), dict2.indexTable[i].WordItems[k].sWord);
                AddItem(sWord, dict2.indexTable[i].WordItems[k].nPOS, dict2.indexTable[i].WordItems[k].nFrequency / (nRatio + 1));
            }
            k += 1;
        }
    }
    return true;
}
//====================================================================
// Func Name  : GenerateWordNet
// Description: Generate the segmentation word net according
//              the original sentence
// Parameters : atomSegment: the atom-segmented sentence
//              coreDict   : core dictionary
// Returns    : row-first sparse matrix; cell (i, j) holds a candidate
//              word covering atoms i..j-1 with its weight and POS
//
// NOTE: this copy had its string literals and comments corrupted by a
// text-encoding round trip (U+FFFD bytes); they are restored here to
// match the intact version of this method elsewhere in the project
// ("未##数", "未##串", "年"/"月", "末内中底前间初").
//====================================================================
public static RowFirstDynamicArray<ChainContent> GenerateWordNet(List<AtomNode> atomSegment, WordDictionary coreDict)
{
    string sWord = "", sMaxMatchWord;
    int nPOSRet, nPOS, nTotalFreq;
    double dValue = 0;
    RowFirstDynamicArray<ChainContent> m_segGraph = new RowFirstDynamicArray<ChainContent>();
    m_segGraph.SetEmpty();

    // Pass 1: store each single atom into m_segGraph.
    for (int i = 0; i < atomSegment.Count; i++)//Init the cost array
    {
        if (atomSegment[i].nPOS == Predefine.CT_CHINESE)
            m_segGraph.SetElement(i, i + 1, new ChainContent(0, 0, atomSegment[i].sWord));
        else
        {
            // Non-Chinese atom: map it to a placeholder word and POS code.
            sWord = atomSegment[i].sWord;//init the word
            dValue = Predefine.MAX_FREQUENCE;
            switch (atomSegment[i].nPOS)
            {
                case Predefine.CT_INDEX:
                case Predefine.CT_NUM:
                    nPOS = -27904;//'m'*256
                    sWord = "未##数"; // "unknown number" placeholder
                    dValue = 0;
                    break;
                case Predefine.CT_DELIMITER:
                    nPOS = 30464;//'w'*256;
                    break;
                case Predefine.CT_LETTER:
                    nPOS = -28280; // -'n' * 256 - 'x';
                    dValue = 0;
                    sWord = "未##串"; // "unknown string" placeholder
                    break;
                case Predefine.CT_SINGLE://12021-2129-3121
                    if (Regex.IsMatch(atomSegment[i].sWord, @"^(-?\d+)(\.\d+)?$")) // matches a (possibly negative) decimal number
                    {
                        nPOS = -27904;//'m'*256
                        sWord = "未##数";
                    }
                    else
                    {
                        nPOS = -28280; // -'n' * 256 - 'x'
                        sWord = "未##串";
                    }
                    dValue = 0;
                    break;
                default:
                    nPOS = atomSegment[i].nPOS;//'?'*256;
                    break;
            }
            m_segGraph.SetElement(i, i + 1, new ChainContent(dValue, nPOS, sWord));//init the link with minimum
        }
    }

    // Pass 2: store every multi-atom word found in the core dictionary.
    for (int i = 0; i < atomSegment.Count; i++)//All the word
    {
        sWord = atomSegment[i].sWord;//Get the current atom
        int j = i + 1;
        while (j < atomSegment.Count && coreDict.GetMaxMatch(sWord, out sMaxMatchWord, out nPOSRet))
        {
            if (sMaxMatchWord == sWord) // exactly the word we are looking for
            {
                WordInfo info = coreDict.GetWordInfo(sWord); // the word may carry several POS tags

                // Sum all frequencies of the word.
                nTotalFreq = 0;
                for (int k = 0; k < info.Count; k++)
                    nTotalFreq += info.Frequencies[k];

                // Suppress special two-character words like 年内/月末 when
                // preceded by a number (e.g. 1年内, 1999年末).
                if (sWord.Length == 2 && (sWord.StartsWith("年") || sWord.StartsWith("月")) && i >= 1 &&
                    (Utility.IsAllNum(atomSegment[i - 1].sWord) || Utility.IsAllChineseNum(atomSegment[i - 1].sWord)))
                {
                    if ("末内中底前间初".IndexOf(sWord.Substring(1)) >= 0)
                        break;
                }

                // Unambiguous POS is stored directly; otherwise POS is 0.
                if (info.Count == 1)
                    m_segGraph.SetElement(i, j, new ChainContent(nTotalFreq, info.POSs[0], sWord));
                else
                    m_segGraph.SetElement(i, j, new ChainContent(nTotalFreq, 0, sWord));
            }
            // Extend the candidate with the next atom and retry.
            sWord += atomSegment[j++].sWord;
        }
    }
    return m_segGraph;
}
//====================================================================
// Func Name  : GenerateWordNet
// Description: Build the segmentation word net for a sentence: each
//              atom becomes an arc, then every dictionary word that
//              spans consecutive atoms is added as a further arc.
// Parameters : atomSegment: atom segmentation of the sentence
//              coreDict   : core dictionary
// Returns    : row-first dynamic array holding all candidate words
//====================================================================
public static RowFirstDynamicArray<ChainContent> GenerateWordNet(List<AtomNode> atomSegment, WordDictionary coreDict)
{
    RowFirstDynamicArray<ChainContent> wordNet = new RowFirstDynamicArray<ChainContent>();
    wordNet.SetEmpty();

    // Phase 1: every atom becomes an arc (i, i+1) in the net.
    for (int i = 0; i < atomSegment.Count; i++)
    {
        AtomNode atom = atomSegment[i];
        if (atom.nPOS == Predefine.CT_CHINESE)
        {
            wordNet.SetElement(i, i + 1, new ChainContent(0, 0, atom.sWord));
            continue;
        }

        string word = atom.sWord;
        double weight = Predefine.MAX_FREQUENCE;
        int pos;
        switch (atom.nPOS)
        {
            case Predefine.CT_INDEX:
            case Predefine.CT_NUM:
                pos = -27904;       // 'm' * 256
                word = "未##数";
                weight = 0;
                break;
            case Predefine.CT_DELIMITER:
                pos = 30464;        // 'w' * 256
                break;
            case Predefine.CT_LETTER:
                pos = -28280;       // -'n' * 256 - 'x'
                weight = 0;
                word = "未##串";
                break;
            case Predefine.CT_SINGLE: //12021-2129-3121
                if (Regex.IsMatch(atom.sWord, @"^(-?\d+)(\.\d+)?$"))
                {
                    // Integer or decimal number.
                    pos = -27904;   // 'm' * 256
                    word = "未##数";
                }
                else
                {
                    pos = -28280;   // -'n' * 256 - 'x'
                    word = "未##串";
                }
                weight = 0;
                break;
            default:
                pos = atom.nPOS;
                break;
        }
        wordNet.SetElement(i, i + 1, new ChainContent(weight, pos, word));
    }

    // Phase 2: add every dictionary word assembled from consecutive atoms.
    for (int i = 0; i < atomSegment.Count; i++)
    {
        string candidate = atomSegment[i].sWord;
        int j = i + 1;
        string maxMatch;
        int posRet;
        while (j < atomSegment.Count && coreDict.GetMaxMatch(candidate, out maxMatch, out posRet))
        {
            if (maxMatch == candidate) // the exact word is present in the dictionary
            {
                WordInfo info = coreDict.GetWordInfo(candidate); // may carry several POS values

                // Total frequency over all POS values of this word.
                int totalFreq = 0;
                for (int k = 0; k < info.Count; k++)
                    totalFreq += info.Frequencies[k];

                // Suppress special time words such as "1年内", "1999年末".
                if (candidate.Length == 2
                    && (candidate.StartsWith("年") || candidate.StartsWith("月"))
                    && i >= 1
                    && (Utility.IsAllNum(atomSegment[i - 1].sWord) || Utility.IsAllChineseNum(atomSegment[i - 1].sWord)))
                {
                    if ("末内中底前间初".IndexOf(candidate.Substring(1)) >= 0)
                        break;
                }

                // Unambiguous words keep their POS; ambiguous ones get 0.
                int pos = info.Count == 1 ? info.POSs[0] : 0;
                wordNet.SetElement(i, j, new ChainContent(totalFreq, pos, candidate));
            }
            candidate += atomSegment[j++].sWord;
        }
    }

    return wordNet;
}
/// <summary>
/// Creates a segmenter backed by the given dictionaries.
/// </summary>
/// <param name="biDict">Bigram (two-word adjacency) dictionary.</param>
/// <param name="coreDict">Core word dictionary.</param>
public Segment(WordDictionary biDict, WordDictionary coreDict)
{
    this.coreDict = coreDict;
    this.biDict = biDict;
}
//====================================================================
// Build the bigram graph: every word arc of the word net becomes a
// node, and an edge is created for each pair of words that can be
// adjacent in the sentence, weighted by the smoothed bigram model.
//====================================================================
public static ColumnFirstDynamicArray<ChainContent> BiGraphGenerate(
    RowFirstDynamicArray<ChainContent> aWord, double smoothPara,
    WordDictionary biDict, WordDictionary coreDict)
{
    ColumnFirstDynamicArray<ChainContent> biWordNet = new ColumnFirstDynamicArray<ChainContent>();

    // Maps (row * MAX_SENTENCE_LEN + col) of every word arc to its node index.
    int[] wordPosMap = PreparePositionMap(aWord);

    ChainItem<ChainContent> cur = aWord.GetHead();
    while (cur != null)
    {
        // Known words (nPOS >= 0) carry their frequency in the arc weight;
        // unknown words fall back to a core-dictionary lookup.
        double curFrequency = cur.Content.nPOS >= 0
            ? cur.Content.eWeight
            : coreDict.GetFrequency(cur.Content.sWord, 2);

        // Words whose arc starts where the current one ends (row == cur.col;
        // note the special row/column correspondence of the word net).
        ChainItem<ChainContent> next = aWord.GetFirstElementOfRow(cur.col);
        while (next != null && next.row == cur.col)
        {
            string twoWords = cur.Content.sWord + Predefine.WORD_SEGMENTER + next.Content.sWord;

            // Frequency of the two words appearing adjacently.
            int twoWordsFreq = biDict.GetFrequency(twoWords, 3);

            // Smoothing: -log{a*P(Ci-1) + (1-a)*P(Ci|Ci-1)}, with 0 < a < 1.
            double eps = 1.0 / Predefine.MAX_FREQUENCE;
            double value = -Math.Log(smoothPara * (1.0 + curFrequency) / (Predefine.MAX_FREQUENCE + 80000.0)
                + (1.0 - smoothPara) * ((1.0 - eps) * twoWordsFreq / (1.0 + curFrequency) + eps));

            // Unknown words additionally contribute their (negative) nPOS term;
            // known words contribute nothing here.
            if (cur.Content.nPOS < 0)
                value += cur.Content.nPOS;

            // Translate both word arcs into node indices of the bigram graph.
            int curIndex = Utility.BinarySearch(cur.row * Predefine.MAX_SENTENCE_LEN + cur.col, wordPosMap);
            int nextIndex = Utility.BinarySearch(next.row * Predefine.MAX_SENTENCE_LEN + next.col, wordPosMap);

            biWordNet.SetElement(curIndex, nextIndex, new ChainContent(value, cur.Content.nPOS, twoWords));

            next = next.next; //Get next word
        }
        cur = cur.next;
    }
    return biWordNet;
}
/// <summary>
/// Accumulates -log P(word | best POS tag) over the words in the range
/// [nStartPos, nStartPos + nLength), using add-one smoothed frequencies.
/// </summary>
/// <param name="nStartPos">First word index.</param>
/// <param name="nLength">Number of words to include.</param>
/// <param name="dict">Dictionary used to look up word/POS frequencies.</param>
/// <returns>The summed negative log-probability.</returns>
private double ComputePossibility(int nStartPos, int nLength, WordDictionary dict)
{
    double total = 0;
    int end = nStartPos + nLength;
    for (int i = nStartPos; i < end; i++)
    {
        // Frequency of this word occurring with its best tag.
        int freq = dict.GetFrequency(m_sWords[i], m_nBestTag[i]);
        // log(count(tag) + 1) - log(count(word, tag) + 1)
        double posPoss = Math.Log((double)(m_context.GetFrequency(0, m_nBestTag[i]) + 1))
            - Math.Log((double)(freq + 1));
        total += posPoss;
    }
    return total;
}
/// <summary>
/// POS tagging with a Hidden Markov Model. Processes the word items one
/// fragment at a time: GetFrom() loads a fragment into the internal arrays,
/// GetBestPOS() runs the tag search, and the result is written back according
/// to the current tagging mode (normal tagging vs. unknown-word recognition).
/// </summary>
/// <param name="pWordItems">Segmented word items; updated in place.</param>
/// <param name="dictCore">Core dictionary.</param>
/// <param name="dictUnknown">Unknown-word recognition dictionary.</param>
/// <returns>Always true.</returns>
public bool POSTagging(WordResult[] pWordItems, WordDictionary dictCore, WordDictionary dictUnknown)
{
    //pWordItems: Items; nItemCount: the count of items;core dictionary and unknown recognition dictionary
    int i = 0, j, nStartPos;
    Reset(false);
    // GetFrom returns -1 once the item array is exhausted, ending the loop.
    while (i > -1 && i < pWordItems.Length && pWordItems[i].sWord != null)
    {
        nStartPos = i; //Start Position
        i = GetFrom(pWordItems, nStartPos, dictCore, dictUnknown);
        GetBestPOS();
        switch (m_tagType)
        {
            case TAG_TYPE.TT_NORMAL: //normal POS tagging
                // Index 0 appears to be a virtual sentence-begin slot, so the
                // fragment's words start at j = 1 — TODO confirm against GetFrom.
                j = 1;
                // NOTE(review): m_nBestTag[j] is read before the j < m_nCurLength
                // bound check — relies on the -1 sentinel being present; verify.
                while (m_nBestTag[j] != -1 && j < m_nCurLength)
                { //Store the best POS tagging
                    pWordItems[j + nStartPos - 1].nPOS = m_nBestTag[j];
                    //Let 。be 0
                    if (pWordItems[j + nStartPos - 1].dValue > 0 && dictCore.IsExist(pWordItems[j + nStartPos - 1].sWord, -1))
                        //Exist and update its frequncy as a POS value
                        pWordItems[j + nStartPos - 1].dValue = dictCore.GetFrequency(pWordItems[j + nStartPos - 1].sWord, m_nBestTag[j]);
                    j += 1;
                }
                break;
            case TAG_TYPE.TT_PERSON: //Person recognition
                PersonRecognize(dictUnknown);
                break;
            case TAG_TYPE.TT_PLACE: //Place name recognition
            case TAG_TYPE.TT_TRANS_PERSON: //Transliteration Person
                PlaceRecognize(dictCore, dictUnknown);
                break;
            default:
                break;
        }
        Reset();
    }
    return true;
}
/// <summary>
/// Recognizes Chinese person names from the best role-tag sequence produced
/// by the tagger. The role tags are flattened into a string and scanned
/// against a fixed set of role patterns; every match is recorded as an
/// unknown-word candidate with a pattern-weighted possibility score.
/// </summary>
/// <param name="personDict">Person-name role dictionary.</param>
/// <returns>Always true.</returns>
public bool PersonRecognize(WordDictionary personDict)
{
    StringBuilder sb = new StringBuilder();
    int i;
    string sPOS = "z", sPersonName;
    // Role patterns, their prior factors, and their lengths — the three
    // arrays below are parallel and terminated by the ""/0 sentinel.
    string[] sPatterns = { "BBCD", "BBC", "BBE", "BBZ", "BCD", "BEE", "BE", "BG", "BXD", "BZ", "CDCD", "CD", "EE", "FB", "Y", "XD", "" };
    double[] dFactor = { 0.003606, 0.000021, 0.001314, 0.000315, 0.656624, 0.000021, 0.146116, 0.009136, 0.000042, 0.038971, 0, 0.090367, 0.000273, 0.009157, 0.034324, 0.009735, 0 };
    /*------------------------------------
     * About parameter (pattern, corpus count, factor):
     *
     * BBCD 343   0.003606
     * BBC  2     0.000021
     * BBE  125   0.001314
     * BBZ  30    0.000315
     * BCD  62460 0.656624
     * BEE  0     0.000000
     * BE   13899 0.146116
     * BG   869   0.009136
     * BXD  4     0.000042
     * BZ   3707  0.038971
     * CD   8596  0.090367
     * EE   26    0.000273
     * FB   871   0.009157
     * Y    3265  0.034324
     * XD   926   0.009735
     *
     * The person recognition pattern set:
     * BBCD: surname + surname + given1 + given2;
     * BBE : surname + surname + one-char given name;
     * BBZ : surname + surname + two-char given name forming one word;
     * BCD : surname + given1 + given2;
     * BE  : surname + one-char given name;
     * BEE : surname + one-char + one-char given name, e.g. 韩磊磊
     * BG  : surname + suffix
     * BXD : surname + (surname + first given char forming one word) + last given char
     * BZ  : surname + two-char given name forming one word;
     * B   : surname
     * CD  : given1 + given2;
     * EE  : one-char + one-char given name;
     * FB  : prefix + surname
     * XD  : (first given char forming one word) + last given char
     * Y   : surname + one-char given name forming one word
     * ------------------------------------*/
    int[] nPatternLen = { 4, 3, 3, 3, 3, 3, 2, 2, 3, 2, 4, 2, 2, 2, 1, 2, 0 };
    //Convert to string from POS: tag k becomes the letter 'A' + k, with a
    //leading 'z' placeholder so positions line up with m_sWords indices.
    sb.Append('z');
    for (i = 1; m_nBestTag[i] > -1; i++)
    {
        sb.Append(Convert.ToChar(m_nBestTag[i] + Convert.ToInt32('A')));
    }
    sPOS = sb.ToString();
    int j = 1, k, nPos; //Find the proper pattern from the first POS
    int nLittleFreqCount; //Counter for the person name role with little frequecy
    bool bMatched = false;
    while (j < i)
    {
        bMatched = false;
        for (k = 0; !bMatched && nPatternLen[k] > 0; k++)
        {
            // Pattern must match at j, and the name must not be adjacent to a
            // transliteration separator "·" on either side.
            // NOTE(review): sPOS[j + nPatternLen[k]] / m_sWords[j + nPatternLen[k]]
            // are read near the end of the sequence — verify the sentinel slots
            // guarantee these indices stay in range.
            if (string.Compare(sPatterns[k], 0, sPOS, j, nPatternLen[k]) == 0 && string.Compare(m_sWords[j - 1], "·") != 0 && string.Compare(m_sWords[j + nPatternLen[k]], "·") != 0)
            {
                //Find the proper pattern k
                if (string.Compare(sPatterns[k], "FB") == 0 && (sPOS[j + 2] == 'E' || sPOS[j + 2] == 'C' || sPOS[j + 2] == 'G'))
                {
                    //Rule 1 for exclusion: prefix+surname+given1(given2) — the
                    //(prefix+surname) rule does not apply here;
                    continue;
                }
                /*
                 * if((strcmp(sPatterns[k],"BEE")==0||strcmp(sPatterns[k],"EE")==0)&&strcmp(m_sWords[j+nPatternLen[k]-1],m_sWords[j+nPatternLen[k]-2])!=0)
                 * {//Rule 2 for exclusion: surname+single+single — if the two EE chars differ, the rule does not apply, e.g. 韩磊磊
                 *    continue;
                 * }
                 *
                 * if(strcmp(sPatterns[k],"B")==0&&m_nBestTag[j+1]!=12)
                 * {//Rule 3 for exclusion: if the surname is not followed by a suffix, the rule does not apply, e.g. 江主席、刘大娘
                 *    continue;
                 * }
                 */
                //Get the possible name
                nPos = j; //Record the person position in the tag sequence
                sPersonName = null;
                nLittleFreqCount = 0; //Record the number of role with little frequency
                while (nPos < j + nPatternLen[k])
                { //Get the possible person name
                    //
                    if (m_nBestTag[nPos] < 4 && personDict.GetFrequency(m_sWords[nPos], m_nBestTag[nPos]) < Predefine.LITTLE_FREQUENCY)
                    {
                        nLittleFreqCount++;
                    } //The counter increase
                    sPersonName += m_sWords[nPos];
                    nPos += 1;
                }
                /*
                 * if(IsAllForeign(sPersonName)&&personDict.GetFrequency(m_sWords[j],1)<LITTLE_FREQUENCY)
                 * {//Exclusion of foreign names
                 *    //Rule 2 for exclusion: if all chars are foreign-name chars, the (given1+given2) rule does not apply
                 *    j+=nPatternLen[k]-1;
                 *    continue;
                 * }
                 */
                if (string.Compare(sPatterns[k], "CDCD") == 0)
                {
                    //Rule for exclusion
                    //The (given1+given2+given1+given2) pattern is itself an exclusion
                    //rule, e.g. 女高音歌唱家迪里拜尔演唱
                    //Rule 3 for exclusion: it applies when foreign-name chars occur;
                    //otherwise the exclusion does not apply, e.g. 黑妞白妞姐俩拔了头筹。
                    if (Utility.GetForeignCharCount(sPersonName) > 0)
                    {
                        j += nPatternLen[k] - 1;
                    }
                    continue;
                }
                /*
                 * if(strcmp(sPatterns[k],"CD")==0&&IsAllForeign(sPersonName))
                 * {//
                 *    j+=nPatternLen[k]-1;
                 *    continue;
                 * }
                 * if(nLittleFreqCount==nPatternLen[k]||nLittleFreqCount==3)
                 * //e.g. 马哈蒂尔; 小扎耶德与他的中国阿姨胡彩玲受华黎明大使之邀,
                 * //All roles appear with low frequency, so we would ignore them
                 *    continue;
                 */
                // Record the candidate: character span plus its weighted score.
                m_nUnknownWords[m_nUnknownWordsCount, 0] = m_nWordPosition[j];
                m_nUnknownWords[m_nUnknownWordsCount, 1] = m_nWordPosition[j + nPatternLen[k]];
                m_dWordsPossibility[m_nUnknownWordsCount] = -Math.Log(dFactor[k]) + ComputePossibility(j, nPatternLen[k], personDict); //Mutiply the factor
                m_nUnknownWordsCount += 1;
                j += nPatternLen[k];
                bMatched = true;
            }
        }
        if (!bMatched)
        { //Not matched, add j by 1
            j += 1;
        }
    }
    return (true);
}
/// <summary>
/// Builds the segmentation graph holding every candidate word
/// for the given sentence.
/// </summary>
/// <param name="sSentence">Sentence to segment.</param>
/// <returns>The segmentation graph, or null when the core dictionary fails to load.</returns>
public RowFirstDynamicArray<ChainContent> GetSegGraph(string sSentence)
{
    WordDictionary coreDict = new WordDictionary();
    if (!coreDict.Load(coreDictFile))
    {
        Console.WriteLine("字典装入错误!");
        return null;
    }
    //string sSentence = @"他说的确实实在";
    // Wrap the sentence with the virtual begin/end markers before atomizing.
    string wrapped = Predefine.SENTENCE_BEGIN + sSentence + Predefine.SENTENCE_END;
    List<AtomNode> atoms = Segment.AtomSegment(wrapped);
    return Segment.GenerateWordNet(atoms, coreDict);
}
/// <summary>
/// Compares an imported dictionary file against an existing dictionary.
/// </summary>
/// <param name="NewDicFile">Imported dictionary file.</param>
/// <param name="Encoding">Encoding of the imported file.</param>
/// <param name="DicFormat">Format of the imported file.</param>
/// <param name="SourceDict">Existing dictionary object.</param>
/// <param name="OddLines">Lines without POS tags whose word is also absent from the existing dictionary.</param>
/// <param name="NewWords">New words, or new POS values for existing words.</param>
/// <param name="ExistWords">Duplicated words whose POS also matches.</param>
/// <param name="MaxFrqRate">Maximum frequency ratio over duplicated words.</param>
/// <param name="MinFrqRate">Minimum frequency ratio over duplicated words.</param>
/// <param name="AvgFrqRate">Average frequency ratio over duplicated words.</param>
public static void FindDifferent(string NewDicFile, Encoding Encoding, DictionaryFormat DicFormat,
    WordDictionary SourceDict, out string[] OddLines,
    out WordDictionary.DicWordInfo[] NewWords, out WordDictionary.DicWordInfo[] ExistWords,
    out double MaxFrqRate, out double MinFrqRate, out double AvgFrqRate)
{
    // Initialize; Max/Min keep these extreme defaults when no word repeats.
    MaxFrqRate = double.MinValue;
    MinFrqRate = double.MaxValue;
    decimal SumFrqRate = 0;
    //const string[] CheckPos = new string[] { "n", "ns", "nr", "ng", "v", "j", "m", "vn", "a", "q" };

    // POS-letter translation table for the given input format.
    Dictionary<string, string> PosTrans = getPosTransformMap(DicFormat);

    // Existing dictionary flattened into word -> info for O(1) lookups.
    Dictionary<string, WordDictionary.DicWordInfo> OldWords = SourceDict.ToWordDictionary();

    List<string> Odds = new List<string>(OldWords.Count / 2);
    List<WordDictionary.DicWordInfo> Exists = new List<SharpICTCLAS.WordDictionary.DicWordInfo>(OldWords.Count / 2);
    List<WordDictionary.DicWordInfo> News = new List<WordDictionary.DicWordInfo>(OldWords.Count / 2);

    // Read the import file; for words already present, gather frequency
    // ratios so the caller can estimate a frequency conversion factor.
    foreach (string Line in File.ReadAllLines(NewDicFile, Encoding))
    {
        string Word;
        int Frq;
        string Poses;
        switch (DicFormat)
        {
            case DictionaryFormat.SogouW2006:
                string[] s = Line.Split('\t', ' ');
                Word = s[0];
                Frq = s.Length == 1 ? -1 : int.Parse(s[1]);
                // FIX: was "s.Length < 2 ? null : s[2]", which threw
                // IndexOutOfRangeException for lines containing only
                // word + frequency (s.Length == 2).
                Poses = s.Length < 3 ? null : s[2];
                break;
            case DictionaryFormat.ExcelCSV:
            default:
                int p1 = Line.IndexOf(',');
                int p2 = Line.IndexOf(',', p1 + 1);
                Word = Line.Substring(0, p1);
                Frq = int.Parse(Line.Substring(p1 + 1, p2 - p1 - 1));
                Poses = Line.Substring(p2 + 1).Trim('"').Trim();
                break;
        }

        if (string.IsNullOrEmpty(Poses))
        {
            // No POS tag: only report the line when the word is unknown.
            if (!OldWords.ContainsKey(Word.ToLower()))
                Odds.Add(Line);
            continue;
        }

        foreach (string InputPos in Poses.TrimEnd(',').Split(','))
        {
            if (string.IsNullOrEmpty(InputPos))
                continue;

            // Fall back to the raw POS letters when the map has no entry.
            string Pos = PosTrans.ContainsKey(InputPos.ToLower()) ? PosTrans[InputPos.ToLower()] : InputPos.ToLower();

            // Word already present with this POS?
            if (OldWords.ContainsKey(Word.ToLower()) && OldWords[Word.ToLower()].Pos.Contains(Pos))
            {
                int SourceFrq = OldWords[Word.ToLower()].Frequence;
                double FrqR = SourceFrq == 0 ? Frq : (double)Frq / SourceFrq;
                if (FrqR > MaxFrqRate) MaxFrqRate = FrqR;
                if (FrqR < MinFrqRate) MinFrqRate = FrqR;
                SumFrqRate += (decimal)FrqR;
                Exists.Add(new WordDictionary.DicWordInfo(Word, Pos, Frq));
            }
            else // new word, or a new POS for an existing word
            {
                News.Add(new WordDictionary.DicWordInfo(Word, Pos, Frq));
            }
        }
    }

    // Average frequency conversion factor.
    AvgFrqRate = Exists.Count > 0 ? Convert.ToDouble(SumFrqRate / Exists.Count) : 0;
    OddLines = Odds.ToArray();
    NewWords = News.ToArray();
    ExistWords = Exists.ToArray();
}
/// <summary>
/// Compares an imported dictionary file against an existing dictionary file.
/// Loads the source dictionary, delegates to the dictionary-object overload,
/// and always releases the loaded dictionary afterwards.
/// </summary>
/// <param name="NewDicFile">Imported dictionary file.</param>
/// <param name="Encoding">Encoding of the imported file.</param>
/// <param name="DicFormat">Format of the imported file.</param>
/// <param name="SourceDictFileName">Existing dictionary file.</param>
/// <param name="OddLines">Lines without POS tags whose word is also absent from the existing dictionary.</param>
/// <param name="NewWords">New words, or new POS values for existing words.</param>
/// <param name="ExistWords">Duplicated words whose POS also matches.</param>
/// <param name="MaxFrqRate">Maximum frequency ratio over duplicated words.</param>
/// <param name="MinFrqRate">Minimum frequency ratio over duplicated words.</param>
/// <param name="AvgFrqRate">Average frequency ratio over duplicated words.</param>
/// <exception cref="InvalidOperationException">The source dictionary file cannot be loaded.</exception>
public static void FindDifferent(string NewDicFile, Encoding Encoding, DictionaryFormat DicFormat,
    string SourceDictFileName, out string[] OddLines,
    out WordDictionary.DicWordInfo[] NewWords, out WordDictionary.DicWordInfo[] ExistWords,
    out double MaxFrqRate, out double MinFrqRate, out double AvgFrqRate)
{
    WordDictionary SourceDict = new WordDictionary();
    if (!SourceDict.Load(SourceDictFileName))
        // FIX: throw a specific exception type instead of the base Exception
        // (callers catching Exception still catch this).
        throw new InvalidOperationException("load source dic file fail");
    try
    {
        FindDifferent(NewDicFile, Encoding, DicFormat, SourceDict,
            out OddLines, out NewWords, out ExistWords,
            out MaxFrqRate, out MinFrqRate, out AvgFrqRate);
    }
    finally
    {
        // FIX: release the dictionary even when the comparison throws.
        SourceDict.ReleaseDict();
    }
}
/// <summary>
/// Loads the next fragment of word items (starting at nIndex) into the
/// internal tagging arrays (m_sWords, m_nTags, m_dFrequency, m_nWordPosition),
/// collecting for each word its candidate POS tags and their weights.
/// The fragment ends at the sentence cap (MAX_WORDS_PER_SENTENCE), at the end
/// of the item array, or early at an unambiguous (single-POS) word.
/// </summary>
/// <param name="pWordItems">Segmented word items of the whole text.</param>
/// <param name="nIndex">Index of the first item of this fragment.</param>
/// <param name="dictCore">Core dictionary.</param>
/// <param name="dictUnknown">Unknown-word recognition dictionary.</param>
/// <returns>The next start index, or -1 when the item array is exhausted.</returns>
private int GetFrom(WordResult[] pWordItems, int nIndex, WordDictionary dictCore, WordDictionary dictUnknown)
{
    WordInfo info;
    // aPOS/aFreq are leftovers from the C original's GetHandle API — unused here.
    int[] aPOS = new int[Predefine.MAX_POS_PER_WORD];
    int[] aFreq = new int[Predefine.MAX_POS_PER_WORD];
    int nFreq = 0, j, nRetPos = 0, nWordsIndex = 0;
    bool bSplit = false; //Need to split in Transliteration recognition
    int i = 1, nPOSCount;
    string sCurWord; //Current word
    nWordsIndex = i + nIndex - 1;
    // Slot 0 of the arrays is the virtual sentence-begin; words fill from i = 1.
    for (i = 1; i < Predefine.MAX_WORDS_PER_SENTENCE && nWordsIndex < pWordItems.Length; i++)
    {
        if (m_tagType == TAG_TYPE.TT_NORMAL || !dictUnknown.IsExist(pWordItems[nWordsIndex].sWord, 44))
        {
            m_sWords[i] = pWordItems[nWordsIndex].sWord; //store current word
            m_nWordPosition[i + 1] = m_nWordPosition[i] + m_sWords[i].Length;
        }
        else
        {
            // Splittable word (role 44): emit it as two pieces over two loop
            // iterations — first char, then the rest. While bSplit is true the
            // item index is NOT advanced (see bottom of the loop).
            if (!bSplit)
            {
                m_sWords[i] = pWordItems[nWordsIndex].sWord.Substring(0, 1); //store current word
                bSplit = true;
            }
            else
            {
                m_sWords[i] = pWordItems[nWordsIndex].sWord.Substring(1); //store current word
                bSplit = false;
            }
            m_nWordPosition[i + 1] = m_nWordPosition[i] + m_sWords[i].Length;
        }
        //Record the position of current word
        m_nStartPos = m_nWordPosition[i + 1]; //Move the Start POS to the ending
        if (m_tagType != TAG_TYPE.TT_NORMAL)
        {
            //Get the POSs from the unknown recognition dictionary
            sCurWord = m_sWords[i];
            if (m_tagType == TAG_TYPE.TT_TRANS_PERSON && i > 0 && m_sWords[i - 1] != null && Utility.charType(m_sWords[i - 1].ToCharArray()[0]) == Predefine.CT_CHINESE)
            {
                // NOTE(review): both branches assign the word's own value —
                // presumably full/half-width normalization of "." and "-" was
                // lost in the port from the C original; confirm.
                if (m_sWords[i] == ".")
                {
                    sCurWord = ".";
                }
                else if (m_sWords[i] == "-")
                {
                    sCurWord = "-";
                }
            }
            info = dictUnknown.GetWordInfo(sCurWord);
            if (info != null)
            {
                nPOSCount = info.Count + 1;
                for (j = 0; j < info.Count; j++)
                {
                    //Get the POS set of sCurWord in the unknown dictionary
                    m_nTags[i, j] = info.POSs[j];
                    // Weight: -log(freq + 1) + log(count(role) + nPOSCount)
                    m_dFrequency[i, j] = -Math.Log((double)(1 + info.Frequencies[j])) + Math.Log((double)(m_context.GetFrequency(0, info.POSs[j]) + nPOSCount));
                }
            }
            else
            {
                nPOSCount = 1;
                j = 0;
            }
            //Get the POS set of sCurWord in the core dictionary
            //We ignore the POS in the core dictionary and recognize them as other (0).
            //We add their frequency to get the possibility as POS 0
            if (string.Compare(m_sWords[i], "始##始") == 0)
            {
                // Virtual sentence-begin token gets the fixed tag 100.
                m_nTags[i, j] = 100;
                m_dFrequency[i, j] = 0;
                j++;
            }
            else if (string.Compare(m_sWords[i], "末##末") == 0)
            {
                // Virtual sentence-end token gets the fixed tag 101.
                m_nTags[i, j] = 101;
                m_dFrequency[i, j] = 0;
                j++;
            }
            else
            {
                //dictCore.GetHandle(m_sWords[i], &nCount, aPOS, aFreq);
                info = dictCore.GetWordInfo(m_sWords[i]);
                nFreq = 0;
                if (info != null)
                {
                    // Total core-dictionary frequency over all POS values.
                    for (int k = 0; k < info.Count; k++)
                    {
                        nFreq += info.Frequencies[k];
                    }
                    if (info.Count > 0)
                    {
                        m_nTags[i, j] = 0;
                        //m_dFrequency[i][j]=(double)(1+nFreq)/(double)(m_context.GetFrequency(0,0)+1);
                        m_dFrequency[i, j] = -Math.Log((double)(1 + nFreq)) + Math.Log((double)(m_context.GetFrequency(0, 0) + nPOSCount));
                        j++;
                    }
                }
            }
        }
        else //For normal POS tagging
        {
            j = 0;
            //Get the POSs from the unknown recognition dictionary
            if (pWordItems[nWordsIndex].nPOS > 0)
            {
                //The word has is only one POS value
                //We have record its POS and nFrequncy in the items.
                m_nTags[i, j] = pWordItems[nWordsIndex].nPOS;
                m_dFrequency[i, j] = -Math.Log(pWordItems[nWordsIndex].dValue) + Math.Log((double)(m_context.GetFrequency(0, m_nTags[i, j]) + 1));
                if (m_dFrequency[i, j] < 0)
                {
                    //Not permit the value less than 0
                    m_dFrequency[i, j] = 0;
                }
                j++;
            }
            else
            {
                //The word has multiple POSs, we should retrieve the information from Core Dictionary
                if (pWordItems[nWordsIndex].nPOS < 0)
                {
                    // Negative nPOS: a pre-recorded POS/weight from unknown-word
                    // recognition; store it (negated back) as the first candidate.
                    m_nTags[i, j] = -pWordItems[nWordsIndex].nPOS;
                    m_dFrequency[i, j++] = pWordItems[nWordsIndex].dValue;
                }
                //dictCore.GetHandle(m_sWords[i], &nCount, aPOS, aFreq);
                info = dictCore.GetWordInfo(m_sWords[i]);
                if (info != null)
                {
                    nPOSCount = info.Count;
                    // NOTE(review): the loop starts at the current j, so when a
                    // pre-recorded POS was stored above, the first dictionary
                    // entry is skipped and only info.Count - j entries are added
                    // — confirm this mirrors the original ICTCLAS behavior.
                    for (; j < info.Count; j++)
                    {
                        //Get the POS set of sCurWord in the unknown dictionary
                        m_nTags[i, j] = info.POSs[j];
                        m_dFrequency[i, j] = -Math.Log(1 + info.Frequencies[j]) + Math.Log(m_context.GetFrequency(0, m_nTags[i, j]) + nPOSCount);
                    }
                }
            }
        }
        if (j == 0)
        {
            //We donot know the POS, so we have to guess them according lexical knowledge
            GuessPOS(i, out j); //Guess the POS of current word
        }
        m_nTags[i, j] = -1; //Set the ending POS
        if (j == 1 && m_nTags[i, j] != Predefine.CT_SENTENCE_BEGIN) //No ambuguity
        {
            //No ambuguity, so we can break from the loop
            i++;
            m_sWords[i] = null;
            break;
        }
        // A split word stays on the same item until its second half is emitted.
        if (!bSplit)
        {
            nWordsIndex++;
        }
    }
    if (nWordsIndex == pWordItems.Length)
    {
        nRetPos = -1;
    } //Reaching ending
    if (m_nTags[i - 1, 1] != -1) //||m_sWords[i][0]==0
    {
        //Set end for words like "张/华/平"
        if (m_tagType != TAG_TYPE.TT_NORMAL)
        {
            m_nTags[i, 0] = 101;
        }
        else
        {
            m_nTags[i, 0] = 1;
        }
        m_dFrequency[i, 0] = 0;
        m_sWords[i] = null; //Set virtual ending
        m_nTags[i++, 1] = -1;
    }
    m_nCurLength = i; //The current word count
    if (nRetPos != -1)
    {
        return (nWordsIndex + 1);
    } //Next start position
    return (-1); //Reaching ending
}