//====================================================================
// Builds the final WordResult array for the chosen segmentation route.
//
// Post-processing passes run on linkedArray, in this order, before output
// (descriptions follow the original notes on each pass):
//   1. MergeContinueNumIntoOne - merge separate consecutive numbers into
//      one number.
//   2. ChangeDelimiterPOS - handling of the "--" delimiter.
//   3. SplitMiddleSlashFromDigitalWords - when the previous word is a
//      number and the current word starts with a dash followed by more
//      characters, split the dash off: "3 / -4 / 月" -> "3 / - / 4 / 月".
//   4. CheckDateElements:
//      a. a number followed by one of 月/日/时/分/秒/月份 is merged with
//         it and tagged as time;
//      b. a number usable as a year followed by 年 is merged and tagged as
//         time, otherwise it stays a number;
//      c. a number whose last character is 点 is treated as time;
//      d. if the last character is not one of "∶·./" (or half-width
//         '.' '/'), the word is a number;
//      e. if it is one of those and the word is longer than one
//         character, the last character is dropped, e.g. "1.".
//
// Each emitted word is also stored into m_graphOptimum at the node's
// (row, col). Returns null when linkedArray is empty. uniPath is not read.
//====================================================================
private static WordResult[] GenerateWord(int[] uniPath, WordLinkedArray linkedArray, RowFirstDynamicArray<ChainContent> m_graphOptimum)
{
    if (linkedArray.Count == 0)
    {
        return null;
    }

    MergeContinueNumIntoOne(ref linkedArray);
    ChangeDelimiterPOS(ref linkedArray);
    SplitMiddleSlashFromDigitalWords(ref linkedArray);
    CheckDateElements(ref linkedArray);

    // The passes above may change the node count, so size the array only now.
    WordResult[] words = new WordResult[linkedArray.Count];
    int slot = 0;
    for (WordNode node = linkedArray.first; node != null; node = node.next)
    {
        WordResult word = new WordResult();
        word.sWord = node.theWord.sWord;
        word.nPOS = node.theWord.nPOS;
        word.dValue = node.theWord.dValue;
        words[slot++] = word;

        // Mirror the word into the optimum graph at the node's coordinates.
        m_graphOptimum.SetElement(node.row, node.col, new ChainContent(word.dValue, word.nPOS, node.sWordInSegGraph));
    }

    return words;
}
//====================================================================
// If the previous word is a number and the current word starts with a
// dash (ASCII '-' or full-width U+FF0D) followed by more characters,
// the dash is split off into its own word.
// Example: "3 / -4 / 月" becomes "3 / - / 4 / 月".
//
// The list is modified in place: the current node keeps only the dash
// and a new node carrying the remainder is linked in right after it;
// linkedArray.Count is incremented for every split.
//====================================================================
private static void SplitMiddleSlashFromDigitalWords(ref WordLinkedArray linkedArray)
{
    const int PosNumber = 27904;      // 'm' * 256
    const int PosAlternate = 29696;   // 't' * 256 (presumably the time tag - confirm)
    const int PosPunctuation = 30464; // 'w' * 256

    // The original literal listed the ASCII hyphen twice; the header comment
    // names two distinct dash characters, so the second one is taken to be
    // FULLWIDTH HYPHEN-MINUS and is written as an escape so that it survives
    // re-encoding of the source file.
    const string Dashes = "-\uFF0D";

    if (linkedArray.Count < 2)
    {
        return;
    }

    WordNode pPre = linkedArray.first;
    WordNode pCur = linkedArray.first.next;

    while (pCur != null)
    {
        string word = pCur.theWord.sWord;
        int prePOS = Math.Abs(pPre.theWord.nPOS);

        // The length guard runs before word[0] is read, so an empty word
        // no longer throws.
        if ((prePOS == PosNumber || prePOS == PosAlternate)
            && word != null
            && word.Length > 1
            && Dashes.IndexOf(word[0]) >= 0
            && (Utility.IsAllNum(word) || Utility.IsAllChineseNum(word)))
        {
            // New node: everything after the dash, tagged as a number.
            WordNode newNode = new WordNode();
            newNode.row = pCur.row + 1;
            newNode.col = pCur.col;
            newNode.sWordInSegGraph = word.Substring(1);

            WordResult theWord = new WordResult();
            theWord.sWord = newNode.sWordInSegGraph;
            theWord.nPOS = PosNumber;
            theWord.dValue = pCur.theWord.dValue;
            newNode.theWord = theWord;

            // Current node: shrinks to the single dash character.
            pCur.col = pCur.row + 1;
            pCur.theWord.sWord = word.Substring(0, 1);
            pCur.theWord.nPOS = PosPunctuation;
            pCur.theWord.dValue = 0;

            newNode.next = pCur.next;
            pCur.next = newNode;
            linkedArray.Count++;
        }

        pCur = pCur.next;
        pPre = pPre.next;
    }
}
// Unknown word recognition.
//   pWordSegResult: word segmentation result, passed to the role tagger.
//   graphOptimum:   the optimized segmentation graph; updated in place.
//   atomSegment:    atom list used to convert the tagger's character
//                   positions into atom indexes.
//   dictCore:       the core dictionary.
// Always returns true.
public bool Recognition(WordResult[] pWordSegResult, RowFirstDynamicArray<ChainContent> graphOptimum, List<AtomNode> atomSegment, WordDictionary dictCore)
{
    int nStartPos = 0; // running character offset covered by atoms [0, j)
    int j = 0;         // atom cursor; never rewinds between unknown words

    // Tag the segmentation with unknown-recognition roles according to the
    // core dictionary and the unknown-recognition dictionary.
    m_roleTag.POSTagging(pWordSegResult, dictCore, m_dict);

    for (int i = 0; i < m_roleTag.m_nUnknownWordsCount; i++)
    {
        // Advance to the atom at which the unknown word starts ...
        while (j < atomSegment.Count && nStartPos < m_roleTag.m_nUnknownWords[i, 0])
        {
            nStartPos += atomSegment[j++].sWord.Length;
        }
        int nAtomStart = j;

        // ... and then to the atom just past its end.
        while (j < atomSegment.Count && nStartPos < m_roleTag.m_nUnknownWords[i, 1])
        {
            nStartPos += atomSegment[j++].sWord.Length;
        }
        int nAtomEnd = j;

        if (nAtomStart >= nAtomEnd)
        {
            continue;
        }

        ChainItem<ChainContent> item = graphOptimum.GetElement(nAtomStart, nAtomEnd);
        double dValue;
        if (item != null)
        {
            dValue = item.Content.eWeight;
        }
        else
        {
            dValue = Predefine.INFINITE_VALUE;
        }

        // Overwrite the element only when the recognized word has the smaller value.
        if (dValue > m_roleTag.m_dWordsPossibility[i])
        {
            graphOptimum.SetElement(nAtomStart, nAtomEnd, new ChainContent(m_roleTag.m_dWordsPossibility[i], m_nPOS, m_sUnknownFlags));
        }
    }

    return true;
}
// Loads a run of words from pWordItems, starting at nIndex, into the tagging
// buffers (m_sWords, m_nWordPosition, m_nTags, m_dFrequency), using slots
// 1, 2, ... of those buffers. For each word the candidate POS tags and their
// values are collected; the list of candidates is terminated with -1.
// Loading stops at the first word that ends up with exactly one candidate,
// when pWordItems is exhausted, or at Predefine.MAX_WORDS_PER_SENTENCE.
//
//   pWordItems:  the words to tag.
//   nIndex:      index in pWordItems of the first word to load.
//   dictCore:    the core dictionary.
//   dictUnknown: the unknown-recognition dictionary (consulted only when
//                m_tagType is not TT_NORMAL).
//
// Sets m_nCurLength and m_nStartPos as side effects.
// Returns the next start index in pWordItems, or -1 once the end of
// pWordItems has been reached.
private int GetFrom(WordResult[] pWordItems, int nIndex, WordDictionary dictCore, WordDictionary dictUnknown)
{
    WordInfo info;
    // NOTE(review): aPOS and aFreq are never used in this method.
    int[] aPOS = new int[Predefine.MAX_POS_PER_WORD];
    int[] aFreq = new int[Predefine.MAX_POS_PER_WORD];
    int nFreq = 0, j, nRetPos = 0, nWordsIndex = 0;
    bool bSplit = false; // Need to split in transliteration recognition
    int i = 1, nPOSCount;
    string sCurWord; // Current word

    nWordsIndex = i + nIndex - 1; // i is 1 here, so this equals nIndex
    for (i = 1; i < Predefine.MAX_WORDS_PER_SENTENCE && nWordsIndex < pWordItems.Length; i++)
    {
        // NOTE(review): the meaning of handle 44 is not visible here - confirm.
        if (m_tagType == TAG_TYPE.TT_NORMAL || !dictUnknown.IsExist(pWordItems[nWordsIndex].sWord, 44))
        {
            m_sWords[i] = pWordItems[nWordsIndex].sWord; // Store current word
            m_nWordPosition[i + 1] = m_nWordPosition[i] + m_sWords[i].Length;
        }
        else
        {
            // The word is loaded in two steps: its first character on this
            // pass, the remainder on the next pass (nWordsIndex is not
            // advanced while bSplit is true).
            if (!bSplit)
            {
                m_sWords[i] = pWordItems[nWordsIndex].sWord.Substring(0, 1); // Store current word
                bSplit = true;
            }
            else
            {
                m_sWords[i] = pWordItems[nWordsIndex].sWord.Substring(1); // Store current word
                bSplit = false;
            }
            m_nWordPosition[i + 1] = m_nWordPosition[i] + m_sWords[i].Length;
        }
        // Record the position of current word
        m_nStartPos = m_nWordPosition[i + 1]; // Move the start position to the ending
        if (m_tagType != TAG_TYPE.TT_NORMAL)
        {
            // Get the POSs from the unknown recognition dictionary
            sCurWord = m_sWords[i];
            // NOTE(review): i > 0 is always true here because the loop starts at 1.
            if (m_tagType == TAG_TYPE.TT_TRANS_PERSON && i > 0 && m_sWords[i - 1] != null && Utility.charType(m_sWords[i - 1].ToCharArray()[0]) == Predefine.CT_CHINESE)
            {
                // NOTE(review): both assignments below store the value that was just
                // compared, so they are no-ops as written. Possibly one side of each
                // pair was meant to be a full-width character - confirm.
                if (m_sWords[i] == ".")
                    sCurWord = ".";
                else if (m_sWords[i] == "-")
                    sCurWord = "-";
            }
            info = dictUnknown.GetWordInfo(sCurWord);
            if (info != null)
            {
                nPOSCount = info.Count + 1;
                for (j = 0; j < info.Count; j++)
                {
                    // Get the POS set of sCurWord in the unknown dictionary
                    m_nTags[i, j] = info.POSs[j];
                    m_dFrequency[i, j] = -Math.Log((double)(1 + info.Frequencies[j])) + Math.Log((double)(m_context.GetFrequency(0, info.POSs[j]) + nPOSCount));
                }
            }
            else
            {
                nPOSCount = 1;
                j = 0;
            }
            // Get the POS set of sCurWord in the core dictionary.
            // We ignore the POS in the core dictionary and recognize them as other (0).
            // We add their frequency to get the possibility as POS 0.
            if (string.Compare(m_sWords[i], "始##始") == 0)
            {
                // Sentence-begin marker word: tag 100, value 0.
                m_nTags[i, j] = 100;
                m_dFrequency[i, j] = 0;
                j++;
            }
            else if (string.Compare(m_sWords[i], "末##末") == 0)
            {
                // Sentence-end marker word: tag 101, value 0.
                m_nTags[i, j] = 101;
                m_dFrequency[i, j] = 0;
                j++;
            }
            else
            {
                info = dictCore.GetWordInfo(m_sWords[i]);
                nFreq = 0;
                if (info != null)
                {
                    // Sum the frequencies of every POS the core dictionary lists.
                    for (int k = 0; k < info.Count; k++)
                    {
                        nFreq += info.Frequencies[k];
                    }
                    if (info.Count > 0)
                    {
                        m_nTags[i, j] = 0;
                        m_dFrequency[i, j] = -Math.Log((double)(1 + nFreq)) + Math.Log((double)(m_context.GetFrequency(0, 0) + nPOSCount));
                        j++;
                    }
                }
            }
        }
        else // For normal POS tagging
        {
            j = 0;
            if (pWordItems[nWordsIndex].nPOS > 0)
            {
                // The word has only one POS value.
                // Its POS and frequency were already recorded in the item.
                m_nTags[i, j] = pWordItems[nWordsIndex].nPOS;
                m_dFrequency[i, j] = -Math.Log(pWordItems[nWordsIndex].dValue) + Math.Log((double)(m_context.GetFrequency(0, m_nTags[i, j]) + 1));
                if (m_dFrequency[i, j] < 0) // Values less than 0 are not permitted
                    m_dFrequency[i, j] = 0;
                j++;
            }
            else
            {
                // The word may have multiple POSs; retrieve them from the core dictionary.
                if (pWordItems[nWordsIndex].nPOS < 0)
                {
                    // A negative POS is stored negated as the first candidate,
                    // with the item's own dValue.
                    m_nTags[i, j] = -pWordItems[nWordsIndex].nPOS;
                    m_dFrequency[i, j++] = pWordItems[nWordsIndex].dValue;
                }
                info = dictCore.GetWordInfo(m_sWords[i]);
                if (info != null)
                {
                    nPOSCount = info.Count;
                    // j is not reset: when a candidate was stored above, the loop
                    // starts at 1 and info.POSs[0] is not copied.
                    for (; j < info.Count; j++)
                    {
                        // Get the POS set of the word in the core dictionary
                        m_nTags[i, j] = info.POSs[j];
                        m_dFrequency[i, j] = -Math.Log(1 + info.Frequencies[j]) + Math.Log(m_context.GetFrequency(0, m_nTags[i, j]) + nPOSCount);
                    }
                }
            }
        }
        if (j == 0)
        {
            // We do not know the POS, so we have to guess it from lexical knowledge
            GuessPOS(i, out j); // Guess the POS of current word
        }
        m_nTags[i, j] = -1; // Set the ending POS
        // NOTE(review): m_nTags[i, j] was set to -1 on the line above, so the second
        // operand only tests -1 against CT_SENTENCE_BEGIN. Possibly m_nTags[i, 0]
        // was intended - confirm before changing.
        if (j == 1 && m_nTags[i, j] != Predefine.CT_SENTENCE_BEGIN) // No ambiguity
        {
            // No ambiguity, so we can break from the loop
            i++;
            m_sWords[i] = null;
            break;
        }
        if (!bSplit)
            nWordsIndex++;
    }
    if (nWordsIndex == pWordItems.Length)
        nRetPos = -1; // Reaching ending
    if (m_nTags[i - 1, 1] != -1)
    {
        // The last loaded word still has more than one candidate, so append a
        // virtual ending word. Set end for words like "张/华/平".
        if (m_tagType != TAG_TYPE.TT_NORMAL)
            m_nTags[i, 0] = 101;
        else
            m_nTags[i, 0] = 1;
        m_dFrequency[i, 0] = 0;
        m_sWords[i] = null; // Set virtual ending
        m_nTags[i++, 1] = -1;
    }
    m_nCurLength = i; // The current word count
    if (nRetPos != -1)
        return nWordsIndex + 1; // Next start position
    return -1; // Reaching ending
}
// POS tagging with a Hidden Markov Model.
//   pWordItems:  the items to tag; in TT_NORMAL mode their nPOS (and
//                possibly dValue) fields are updated in place.
//   dictCore:    the core dictionary.
//   dictUnknown: the unknown-recognition dictionary.
// Processing stops at the end of pWordItems, at the first item whose sWord
// is null, or when GetFrom reports the end (-1). Always returns true.
public bool POSTagging(WordResult[] pWordItems, WordDictionary dictCore, WordDictionary dictUnknown)
{
    int i = 0;

    Reset(false);
    while (i > -1 && i < pWordItems.Length && pWordItems[i].sWord != null)
    {
        int nStartPos = i; // Start position of this run
        i = GetFrom(pWordItems, nStartPos, dictCore, dictUnknown);
        GetBestPOS();

        switch (m_tagType)
        {
            case TAG_TYPE.TT_NORMAL: // Normal POS tagging
                // The bound is tested before m_nBestTag[j] is read, so the
                // array is never indexed at or beyond m_nCurLength.
                for (int j = 1; j < m_nCurLength && m_nBestTag[j] != -1; j++)
                {
                    // Buffer slot j corresponds to item nStartPos + j - 1.
                    int nItem = j + nStartPos - 1;

                    // Store the best POS tag
                    pWordItems[nItem].nPOS = m_nBestTag[j];

                    // The word exists in the core dictionary: replace its value
                    // with its frequency under the chosen POS.
                    if (pWordItems[nItem].dValue > 0 && dictCore.IsExist(pWordItems[nItem].sWord, -1))
                    {
                        pWordItems[nItem].dValue = dictCore.GetFrequency(pWordItems[nItem].sWord, m_nBestTag[j]);
                    }
                }
                break;

            case TAG_TYPE.TT_PERSON: // Person recognition
                PersonRecognize(dictUnknown);
                break;

            case TAG_TYPE.TT_PLACE:        // Place name recognition
            case TAG_TYPE.TT_TRANS_PERSON: // Transliterated person
                PlaceRecognize(dictCore, dictUnknown);
                break;

            default:
                break;
        }

        Reset();
    }

    return true;
}
//====================================================================
// If the previous word is a number and the current word starts with a
// dash (ASCII '-' or full-width U+FF0D) followed by more characters,
// the dash is split off into its own word.
// Example: "3 / -4 / <month>" becomes "3 / - / 4 / <month>".
//
// The list is modified in place: the current node keeps only the dash
// and a new node carrying the remainder is linked in right after it;
// linkedArray.Count is incremented for every split.
//====================================================================
private static void SplitMiddleSlashFromDigitalWords(ref WordLinkedArray linkedArray)
{
    const int PosNumber = 27904;      // 'm' * 256
    const int PosAlternate = 29696;   // 't' * 256 (presumably the time tag - confirm)
    const int PosPunctuation = 30464; // 'w' * 256

    // The second dash of the original literal had been destroyed by an
    // encoding conversion (it contained replacement characters), so a
    // full-width dash could never match. It is restored here as FULLWIDTH
    // HYPHEN-MINUS, written as an escape so that it survives re-encoding.
    const string Dashes = "-\uFF0D";

    if (linkedArray.Count < 2)
    {
        return;
    }

    WordNode pPre = linkedArray.first;
    WordNode pCur = linkedArray.first.next;

    while (pCur != null)
    {
        string word = pCur.theWord.sWord;
        int prePOS = Math.Abs(pPre.theWord.nPOS);

        // The length guard runs before word[0] is read, so an empty word
        // no longer throws.
        if ((prePOS == PosNumber || prePOS == PosAlternate)
            && word != null
            && word.Length > 1
            && Dashes.IndexOf(word[0]) >= 0
            && (Utility.IsAllNum(word) || Utility.IsAllChineseNum(word)))
        {
            // New node: everything after the dash, tagged as a number.
            WordNode newNode = new WordNode();
            newNode.row = pCur.row + 1;
            newNode.col = pCur.col;
            newNode.sWordInSegGraph = word.Substring(1);

            WordResult theWord = new WordResult();
            theWord.sWord = newNode.sWordInSegGraph;
            theWord.nPOS = PosNumber;
            theWord.dValue = pCur.theWord.dValue;
            newNode.theWord = theWord;

            // Current node: shrinks to the single dash character.
            pCur.col = pCur.row + 1;
            pCur.theWord.sWord = word.Substring(0, 1);
            pCur.theWord.nPOS = PosPunctuation;
            pCur.theWord.dValue = 0;

            newNode.next = pCur.next;
            pCur.next = newNode;
            linkedArray.Count++;
        }

        pCur = pCur.next;
        pPre = pPre.next;
    }
}
//====================================================================
// Builds the final WordResult array for the chosen segmentation route.
//
// Four post-processing passes run on linkedArray before output, in this
// order: MergeContinueNumIntoOne (merges separate consecutive numbers
// into one number), ChangeDelimiterPOS (delimiter handling),
// SplitMiddleSlashFromDigitalWords (splits a leading dash off a word that
// follows a number, e.g. "3 / -4" -> "3 / - / 4") and CheckDateElements
// (date/time handling of numeric words).
//
// Each emitted word records the node's row in sLocation and is also
// stored into m_graphOptimum at the node's (row, col).
// Returns null when linkedArray is empty. uniPath is not read.
//====================================================================
private static WordResult[] GenerateWord(int[] uniPath, WordLinkedArray linkedArray, RowFirstDynamicArray<ChainContent> m_graphOptimum)
{
    if (linkedArray.Count == 0)
    {
        return null;
    }

    MergeContinueNumIntoOne(ref linkedArray);
    ChangeDelimiterPOS(ref linkedArray);
    SplitMiddleSlashFromDigitalWords(ref linkedArray);
    CheckDateElements(ref linkedArray);

    // The passes above may change the node count, so size the array only now.
    WordResult[] words = new WordResult[linkedArray.Count];
    int slot = 0;
    for (WordNode node = linkedArray.first; node != null; node = node.next)
    {
        WordResult word = new WordResult();
        word.sWord = node.theWord.sWord;
        word.nPOS = node.theWord.nPOS;
        word.dValue = node.theWord.dValue;
        word.sLocation = node.row;
        words[slot++] = word;

        // Mirror the word into the optimum graph at the node's coordinates.
        m_graphOptimum.SetElement(node.row, node.col, new ChainContent(word.dValue, word.nPOS, node.sWordInSegGraph));
    }

    return words;
}
//====================================================================
// Builds the final WordResult array for the chosen segmentation route.
//
// Post-processing passes run on linkedArray, in this order, before output
// (descriptions follow the original notes on each pass):
//   1. MergeContinueNumIntoOne - merge separate consecutive numbers into
//      one number.
//   2. ChangeDelimiterPOS - handling of the "--" delimiter.
//   3. SplitMiddleSlashFromDigitalWords - when the previous word is a
//      number and the current word starts with a dash followed by more
//      characters, split the dash off: "3 / -4 / 月" -> "3 / - / 4 / 月".
//   4. CheckDateElements:
//      a. a number followed by one of 月/日/时/分/秒/月份 is merged with
//         it and tagged as time;
//      b. a number usable as a year followed by 年 is merged and tagged as
//         time, otherwise it stays a number;
//      c. a number whose last character is 点 is treated as time;
//      d. if the last character is not one of "∶·./" (or half-width
//         '.' '/'), the word is a number;
//      e. if it is one of those and the word is longer than one
//         character, the last character is dropped, e.g. "1.".
//
// Each emitted word is also stored into m_graphOptimum at the node's
// (row, col). Returns null when linkedArray is empty. uniPath is not read.
//====================================================================
private static WordResult[] GenerateWord(int[] uniPath, WordLinkedArray linkedArray, RowFirstDynamicArray<ChainContent> m_graphOptimum)
{
    if (linkedArray.Count == 0)
    {
        return null;
    }

    MergeContinueNumIntoOne(ref linkedArray);
    ChangeDelimiterPOS(ref linkedArray);
    SplitMiddleSlashFromDigitalWords(ref linkedArray);
    CheckDateElements(ref linkedArray);

    // The passes above may change the node count, so size the array only now.
    WordResult[] output = new WordResult[linkedArray.Count];
    int next = 0;
    for (WordNode node = linkedArray.first; node != null; node = node.next)
    {
        WordResult entry = new WordResult();
        entry.sWord = node.theWord.sWord;
        entry.nPOS = node.theWord.nPOS;
        entry.dValue = node.theWord.dValue;
        output[next++] = entry;

        // Mirror the word into the optimum graph at the node's coordinates.
        m_graphOptimum.SetElement(node.row, node.col, new ChainContent(entry.dValue, entry.nPOS, node.sWordInSegGraph));
    }

    return output;
}
// Concatenates the words of wr, skipping the first and the last element,
// with no separator between them. Returns "" when wr has two elements or fewer.
private static string PrintResultStringOnly(WordResult[] wr)
{
    // A StringBuilder avoids re-copying the accumulated text on every
    // iteration; fully qualified so no extra using directive is required.
    System.Text.StringBuilder text = new System.Text.StringBuilder();

    for (int j = 1; j < wr.Length - 1; j++)
    {
        // Appending a null word adds nothing, matching the "{0}" formatting
        // of a null argument.
        text.Append(wr[j].sWord);
    }

    return text.ToString();
}
// Formats every element of wr except the first and the last as
// "word/POS", where POS is Utility.GetPOSString at nPosLevel.LevelOne,
// and concatenates the pieces with no separator between them.
// Returns "" when wr has two elements or fewer.
private static string PrintResult(WordResult[] wr)
{
    // A StringBuilder avoids re-copying the accumulated text on every
    // iteration; fully qualified so no extra using directive is required.
    System.Text.StringBuilder text = new System.Text.StringBuilder();

    for (int j = 1; j < wr.Length - 1; j++)
    {
        text.AppendFormat(@"{0}/{1}", wr[j].sWord, Utility.GetPOSString(wr[j].nPOS, nPosLevel.LevelOne));
    }

    return text.ToString();
}