//====================================================================
// Func Name  : GetFrom
// Description: Copy a run of words from pWordItems, starting at nIndex,
//              into the tagging buffers (m_sWords, m_nWordPosition,
//              m_nTags, m_dFrequency) together with the candidate POS
//              tags and costs of every word. The scan stops when a word
//              ends up with exactly one candidate tag, when pWordItems
//              is exhausted, or when the buffer limit
//              Predefine.MAX_WORDS_PER_SENTENCE is reached.
//              Also updates m_nStartPos and m_nCurLength.
// Parameters : pWordItems : the segmented words to load
//              nIndex     : index in pWordItems of the first word to load
//              dictCore   : core dictionary
//              dictUnknown: unknown-word recognition dictionary; only
//                           consulted when m_tagType is not TT_NORMAL
// Returns    : -1 when the scan ran up to the end of pWordItems,
//              otherwise the next start position (nWordsIndex + 1)
//====================================================================
private int GetFrom(WordResult[] pWordItems, int nIndex, WordDictionary dictCore, WordDictionary dictUnknown)
{
    WordInfo info;
    int j;
    int i;
    int nWordsIndex = nIndex;   // index of the word currently being loaded
    bool bSplit = false;        // true while the tail of a split word is still pending

    for (i = 1; i < Predefine.MAX_WORDS_PER_SENTENCE && nWordsIndex < pWordItems.Length; i++)
    {
        string sItemWord = pWordItems[nWordsIndex].sWord;

        // Store the current word. Outside normal tagging, a word the unknown
        // dictionary knows under handle 44 is loaded in two steps: its first
        // character now, the remainder on the next iteration.
        // NOTE(review): the meaning of handle 44 is not visible here - confirm.
        if (m_tagType == TAG_TYPE.TT_NORMAL || !dictUnknown.IsExist(sItemWord, 44))
        {
            m_sWords[i] = sItemWord;
        }
        else if (!bSplit)
        {
            m_sWords[i] = sItemWord.Substring(0, 1);
            bSplit = true;
        }
        else
        {
            m_sWords[i] = sItemWord.Substring(1);
            bSplit = false;
        }

        // Record where the next word starts and move the start position there.
        m_nWordPosition[i + 1] = m_nWordPosition[i] + m_sWords[i].Length;
        m_nStartPos = m_nWordPosition[i + 1];

        if (m_tagType != TAG_TYPE.TT_NORMAL)
        {
            // Unknown-word recognition: take the tags from the unknown dictionary.
            int nPOSCount;
            string sCurWord = m_sWords[i];
            if (m_tagType == TAG_TYPE.TT_TRANS_PERSON && i > 0 && m_sWords[i - 1] != null
                && Utility.charType(m_sWords[i - 1].ToCharArray()[0]) == Predefine.CT_CHINESE)
            {
                // NOTE(review): as written both assignments store the value that
                // sCurWord already holds - confirm the intended literals.
                if (m_sWords[i] == ".")
                {
                    sCurWord = ".";
                }
                else if (m_sWords[i] == "-")
                {
                    sCurWord = "-";
                }
            }

            info = dictUnknown.GetWordInfo(sCurWord);
            if (info != null)
            {
                nPOSCount = info.Count + 1;
                for (j = 0; j < info.Count; j++)
                {
                    // One candidate per POS of sCurWord in the unknown dictionary.
                    m_nTags[i, j] = info.POSs[j];
                    m_dFrequency[i, j] = -Math.Log((double)(1 + info.Frequencies[j])) + Math.Log((double)(m_context.GetFrequency(0, info.POSs[j]) + nPOSCount));
                }
            }
            else
            {
                nPOSCount = 1;
                j = 0;
            }

            // The core dictionary's own POS values are ignored in this mode:
            // the sentence sentinels get the fixed tags 100 / 101, and any
            // other core word contributes a single candidate with tag 0
            // whose cost is built from the sum of its frequencies.
            // Ordinal equality is used so the sentinel test is culture-independent.
            if (m_sWords[i] == "始##始")
            {
                m_nTags[i, j] = 100;
                m_dFrequency[i, j] = 0;
                j++;
            }
            else if (m_sWords[i] == "末##末")
            {
                m_nTags[i, j] = 101;
                m_dFrequency[i, j] = 0;
                j++;
            }
            else
            {
                info = dictCore.GetWordInfo(m_sWords[i]);
                if (info != null)
                {
                    int nFreq = 0;
                    for (int k = 0; k < info.Count; k++)
                    {
                        nFreq += info.Frequencies[k];
                    }
                    if (info.Count > 0)
                    {
                        m_nTags[i, j] = 0;
                        m_dFrequency[i, j] = -Math.Log((double)(1 + nFreq)) + Math.Log((double)(m_context.GetFrequency(0, 0) + nPOSCount));
                        j++;
                    }
                }
            }
        }
        else
        {
            // Normal POS tagging.
            j = 0;
            if (pWordItems[nWordsIndex].nPOS > 0)
            {
                // The item already carries a single POS and its value.
                m_nTags[i, j] = pWordItems[nWordsIndex].nPOS;
                m_dFrequency[i, j] = -Math.Log(pWordItems[nWordsIndex].dValue) + Math.Log((double)(m_context.GetFrequency(0, m_nTags[i, j]) + 1));
                if (m_dFrequency[i, j] < 0)
                {
                    // Values below 0 are not permitted.
                    m_dFrequency[i, j] = 0;
                }
                j++;
            }
            else
            {
                if (pWordItems[nWordsIndex].nPOS < 0)
                {
                    // A negative POS is stored negated, with the item's value used as-is.
                    m_nTags[i, j] = -pWordItems[nWordsIndex].nPOS;
                    m_dFrequency[i, j++] = pWordItems[nWordsIndex].dValue;
                }

                // Remaining candidates come from the core dictionary. The loop
                // continues from the current j, so when a negated POS was stored
                // above, the dictionary entry at index 0 is not copied.
                info = dictCore.GetWordInfo(m_sWords[i]);
                if (info != null)
                {
                    int nPOSCount = info.Count;
                    for (; j < info.Count; j++)
                    {
                        m_nTags[i, j] = info.POSs[j];
                        m_dFrequency[i, j] = -Math.Log(1 + info.Frequencies[j]) + Math.Log(m_context.GetFrequency(0, m_nTags[i, j]) + nPOSCount);
                    }
                }
            }
        }

        if (j == 0)
        {
            // No candidate tag found: guess from lexical knowledge.
            GuessPOS(i, out j);
        }
        m_nTags[i, j] = -1; // terminate this word's tag list

        // NOTE(review): m_nTags[i, j] was set to -1 on the line above, so the
        // second operand can only be false if CT_SENTENCE_BEGIN is itself -1.
        // If the intent was to keep scanning past a sentence-begin word, the
        // check probably wants m_nTags[i, 0]. Left unchanged because it would
        // alter the results callers see.
        if (j == 1 && m_nTags[i, j] != Predefine.CT_SENTENCE_BEGIN)
        {
            // Exactly one candidate: no ambiguity, stop loading here.
            i++;
            m_sWords[i] = null;
            break;
        }

        if (!bSplit)
        {
            nWordsIndex++; // stay on the same item while its second half is pending
        }
    }

    bool bReachedEnd = nWordsIndex == pWordItems.Length;

    if (m_nTags[i - 1, 1] != -1)
    {
        // Slot i-1 has no terminator at tag index 1, so append a virtual
        // ending slot (for words like "张/华/平").
        m_nTags[i, 0] = (m_tagType != TAG_TYPE.TT_NORMAL) ? 101 : 1;
        m_dFrequency[i, 0] = 0;
        m_sWords[i] = null;
        m_nTags[i, 1] = -1;
        i++;
    }
    m_nCurLength = i; // number of slots in use

    return bReachedEnd ? -1 : nWordsIndex + 1;
}
//====================================================================
// Func Name  : GenerateWordNet
// Description: Build the segmentation word net for a sentence that has
//              already been split into atoms. Every atom becomes an
//              element (i, i + 1); every run of atoms starting at i
//              that the core dictionary knows as a word becomes an
//              element (i, j), where j is one past the last atom used.
// Parameters : atomSegment: the atoms of the sentence, in order
//              coreDict   : core dictionary used to look up words
// Returns    : the populated word net
//====================================================================
public static RowFirstDynamicArray<ChainContent> GenerateWordNet(List<AtomNode> atomSegment, WordDictionary coreDict)
{
    const int numberPos = -27904;    // -('m' * 256)
    const int stringPos = -28280;    // -('n' * 256 + 'x')
    const int delimiterPos = 30464;  // 'w' * 256

    string sWord, sMaxMatchWord;
    int nPOSRet, nPOS, nTotalFreq;
    double dValue;

    RowFirstDynamicArray<ChainContent> segGraph = new RowFirstDynamicArray<ChainContent>();
    segGraph.SetEmpty();

    // Pass 1: store every single atom in the word net.
    for (int i = 0; i < atomSegment.Count; i++)
    {
        AtomNode atom = atomSegment[i];
        if (atom.nPOS == Predefine.CT_CHINESE)
        {
            segGraph.SetElement(i, i + 1, new ChainContent(0, 0, atom.sWord));
            continue;
        }

        sWord = atom.sWord;
        dValue = Predefine.MAX_FREQUENCE;
        switch (atom.nPOS)
        {
            case Predefine.CT_INDEX:
            case Predefine.CT_NUM:
                // Numbers are replaced by the number placeholder word.
                nPOS = numberPos;
                sWord = "未##数";
                dValue = 0;
                break;
            case Predefine.CT_DELIMITER:
                nPOS = delimiterPos;
                break;
            case Predefine.CT_LETTER:
                // Letter runs are replaced by the string placeholder word.
                nPOS = stringPos;
                dValue = 0;
                sWord = "未##串";
                break;
            case Predefine.CT_SINGLE:
                // e.g. 12021-2129-3121: only a plain integer or decimal counts as a number.
                if (Regex.IsMatch(atom.sWord, @"^(-?\d+)(\.\d+)?$"))
                {
                    nPOS = numberPos;
                    sWord = "未##数";
                }
                else
                {
                    nPOS = stringPos;
                    sWord = "未##串";
                }
                dValue = 0;
                break;
            default:
                nPOS = atom.nPOS;
                break;
        }
        segGraph.SetElement(i, i + 1, new ChainContent(dValue, nPOS, sWord));
    }

    // Pass 2: store every dictionary word that can be formed from consecutive atoms.
    for (int i = 0; i < atomSegment.Count; i++)
    {
        sWord = atomSegment[i].sWord; // candidate word, grown one atom at a time
        int j = i + 1;
        while (j < atomSegment.Count && coreDict.GetMaxMatch(sWord, out sMaxMatchWord, out nPOSRet))
        {
            if (sMaxMatchWord == sWord)
            {
                // Exact dictionary hit.
                // Special case: a two-character word starting with 年 or 月 directly
                // after a number (e.g. 1年内, 1999年末) must not be formed when its
                // second character is one of 末内中底前间初; stop extending from atom i.
                // Ordinal char comparisons keep this independent of the current culture.
                if (sWord.Length == 2
                    && (sWord[0] == '年' || sWord[0] == '月')
                    && i >= 1
                    && (Utility.IsAllNum(atomSegment[i - 1].sWord) || Utility.IsAllChineseNum(atomSegment[i - 1].sWord))
                    && "末内中底前间初".IndexOf(sWord[1]) >= 0)
                {
                    break;
                }

                // GetWordInfo may return null; skip the element rather than dereference it.
                WordInfo info = coreDict.GetWordInfo(sWord);
                if (info != null)
                {
                    // The word may have several POS entries: sum all their frequencies.
                    nTotalFreq = 0;
                    for (int k = 0; k < info.Count; k++)
                    {
                        nTotalFreq += info.Frequencies[k];
                    }

                    // Keep the POS only when the word has exactly one; otherwise record 0.
                    int nWordPOS = (info.Count == 1) ? info.POSs[0] : 0;
                    segGraph.SetElement(i, j, new ChainContent(nTotalFreq, nWordPOS, sWord));
                }
            }
            sWord += atomSegment[j++].sWord;
        }
    }

    return segGraph;
}
//====================================================================
// Func Name  : GenerateWordNet
// Description: Build the segmentation word net for a sentence that has
//              already been split into atoms. Every atom becomes an
//              element (i, i + 1); every run of atoms starting at i
//              that the core dictionary knows as a word becomes an
//              element (i, j), where j is one past the last atom used.
// Parameters : atomSegment: the atoms of the sentence, in order
//              coreDict   : core dictionary used to look up words
// Returns    : the populated word net
//====================================================================
public static RowFirstDynamicArray<ChainContent> GenerateWordNet(List<AtomNode> atomSegment, WordDictionary coreDict)
{
    const int numberPos = -27904;    // -('m' * 256)
    const int stringPos = -28280;    // -('n' * 256 + 'x')
    const int delimiterPos = 30464;  // 'w' * 256

    string sWord, sMaxMatchWord;
    int nPOSRet, nPOS, nTotalFreq;
    double dValue;

    RowFirstDynamicArray<ChainContent> segGraph = new RowFirstDynamicArray<ChainContent>();
    segGraph.SetEmpty();

    // Pass 1: store every single atom in the word net.
    for (int i = 0; i < atomSegment.Count; i++)
    {
        AtomNode atom = atomSegment[i];
        if (atom.nPOS == Predefine.CT_CHINESE)
        {
            segGraph.SetElement(i, i + 1, new ChainContent(0, 0, atom.sWord));
            continue;
        }

        sWord = atom.sWord;
        dValue = Predefine.MAX_FREQUENCE;
        switch (atom.nPOS)
        {
            case Predefine.CT_INDEX:
            case Predefine.CT_NUM:
                // Numbers are replaced by the number placeholder word.
                nPOS = numberPos;
                sWord = "未##数";
                dValue = 0;
                break;
            case Predefine.CT_DELIMITER:
                nPOS = delimiterPos;
                break;
            case Predefine.CT_LETTER:
                // Letter runs are replaced by the string placeholder word.
                nPOS = stringPos;
                dValue = 0;
                sWord = "未##串";
                break;
            case Predefine.CT_SINGLE:
                // e.g. 12021-2129-3121: only a plain integer or decimal counts as a number.
                if (Regex.IsMatch(atom.sWord, @"^(-?\d+)(\.\d+)?$"))
                {
                    nPOS = numberPos;
                    sWord = "未##数";
                }
                else
                {
                    nPOS = stringPos;
                    sWord = "未##串";
                }
                dValue = 0;
                break;
            default:
                nPOS = atom.nPOS;
                break;
        }
        segGraph.SetElement(i, i + 1, new ChainContent(dValue, nPOS, sWord));
    }

    // Pass 2: store every dictionary word that can be formed from consecutive atoms.
    for (int i = 0; i < atomSegment.Count; i++)
    {
        sWord = atomSegment[i].sWord; // candidate word, grown one atom at a time
        int j = i + 1;
        while (j < atomSegment.Count && coreDict.GetMaxMatch(sWord, out sMaxMatchWord, out nPOSRet))
        {
            if (sMaxMatchWord == sWord)
            {
                // Exact dictionary hit.
                // Special case: a two-character word starting with 年 or 月 directly
                // after a number (e.g. 1年内, 1999年末) must not be formed when its
                // second character is one of 末内中底前间初; stop extending from atom i.
                // Ordinal char comparisons keep this independent of the current culture.
                if (sWord.Length == 2
                    && (sWord[0] == '年' || sWord[0] == '月')
                    && i >= 1
                    && (Utility.IsAllNum(atomSegment[i - 1].sWord) || Utility.IsAllChineseNum(atomSegment[i - 1].sWord))
                    && "末内中底前间初".IndexOf(sWord[1]) >= 0)
                {
                    break;
                }

                // GetWordInfo may return null; skip the element rather than dereference it.
                WordInfo info = coreDict.GetWordInfo(sWord);
                if (info != null)
                {
                    // The word may have several POS entries: sum all their frequencies.
                    nTotalFreq = 0;
                    for (int k = 0; k < info.Count; k++)
                    {
                        nTotalFreq += info.Frequencies[k];
                    }

                    // Keep the POS only when the word has exactly one; otherwise record 0.
                    int nWordPOS = (info.Count == 1) ? info.POSs[0] : 0;
                    segGraph.SetElement(i, j, new ChainContent(nTotalFreq, nWordPOS, sWord));
                }
            }
            sWord += atomSegment[j++].sWord;
        }
    }

    return segGraph;
}
//====================================================================
// Func Name  : GenerateWordNet
// Description: Build the segmentation word net for a sentence that has
//              already been split into atoms. Every atom becomes an
//              element (i, i + 1); every run of atoms starting at i
//              that the core dictionary knows as a word becomes an
//              element (i, j), where j is one past the last atom used.
// Parameters : atomSegment: the atoms of the sentence, in order
//              coreDict   : core dictionary used to look up words
// Returns    : the populated word net
//====================================================================
public static RowFirstDynamicArray<ChainContent> GenerateWordNet(List<AtomNode> atomSegment, WordDictionary coreDict)
{
    const int numberPos = -27904;    // -('m' * 256)
    const int stringPos = -28280;    // -('n' * 256 + 'x')
    const int delimiterPos = 30464;  // 'w' * 256

    string sWord, sMaxMatchWord;
    int nPOSRet, nPOS, nTotalFreq;
    double dValue;

    RowFirstDynamicArray<ChainContent> segGraph = new RowFirstDynamicArray<ChainContent>();
    segGraph.SetEmpty();

    // Pass 1: store every single atom in the word net.
    for (int i = 0; i < atomSegment.Count; i++)
    {
        AtomNode atom = atomSegment[i];
        if (atom.nPOS == Predefine.CT_CHINESE)
        {
            segGraph.SetElement(i, i + 1, new ChainContent(0, 0, atom.sWord));
            continue;
        }

        sWord = atom.sWord;
        dValue = Predefine.MAX_FREQUENCE;
        switch (atom.nPOS)
        {
            case Predefine.CT_INDEX:
            case Predefine.CT_NUM:
                // Numbers are replaced by the number placeholder word.
                nPOS = numberPos;
                sWord = "未##数";
                dValue = 0;
                break;
            case Predefine.CT_DELIMITER:
                nPOS = delimiterPos;
                break;
            case Predefine.CT_LETTER:
                // Letter runs are replaced by the string placeholder word.
                nPOS = stringPos;
                dValue = 0;
                sWord = "未##串";
                break;
            case Predefine.CT_SINGLE:
                // e.g. 12021-2129-3121: only a plain integer or decimal counts as a number.
                if (Regex.IsMatch(atom.sWord, @"^(-?\d+)(\.\d+)?$"))
                {
                    nPOS = numberPos;
                    sWord = "未##数";
                }
                else
                {
                    nPOS = stringPos;
                    sWord = "未##串";
                }
                dValue = 0;
                break;
            default:
                nPOS = atom.nPOS;
                break;
        }
        segGraph.SetElement(i, i + 1, new ChainContent(dValue, nPOS, sWord));
    }

    // Pass 2: store every dictionary word that can be formed from consecutive atoms.
    for (int i = 0; i < atomSegment.Count; i++)
    {
        sWord = atomSegment[i].sWord; // candidate word, grown one atom at a time
        int j = i + 1;
        while (j < atomSegment.Count && coreDict.GetMaxMatch(sWord, out sMaxMatchWord, out nPOSRet))
        {
            if (sMaxMatchWord == sWord)
            {
                // Exact dictionary hit.
                // Special case: a two-character word starting with 年 or 月 directly
                // after a number (e.g. 1年内, 1999年末) must not be formed when its
                // second character is one of 末内中底前间初; stop extending from atom i.
                // Ordinal char comparisons keep this independent of the current culture.
                if (sWord.Length == 2
                    && (sWord[0] == '年' || sWord[0] == '月')
                    && i >= 1
                    && (Utility.IsAllNum(atomSegment[i - 1].sWord) || Utility.IsAllChineseNum(atomSegment[i - 1].sWord))
                    && "末内中底前间初".IndexOf(sWord[1]) >= 0)
                {
                    break;
                }

                // GetWordInfo may return null; skip the element rather than dereference it.
                WordInfo info = coreDict.GetWordInfo(sWord);
                if (info != null)
                {
                    // The word may have several POS entries: sum all their frequencies.
                    nTotalFreq = 0;
                    for (int k = 0; k < info.Count; k++)
                    {
                        nTotalFreq += info.Frequencies[k];
                    }

                    // Keep the POS only when the word has exactly one; otherwise record 0.
                    int nWordPOS = (info.Count == 1) ? info.POSs[0] : 0;
                    segGraph.SetElement(i, j, new ChainContent(nTotalFreq, nWordPOS, sWord));
                }
            }
            sWord += atomSegment[j++].sWord;
        }
    }

    return segGraph;
}
//====================================================================
// Func Name  : GetFrom
// Description: Copy a run of words from pWordItems, starting at nIndex,
//              into the tagging buffers (m_sWords, m_nWordPosition,
//              m_nTags, m_dFrequency) together with the candidate POS
//              tags and costs of every word. The scan stops when a word
//              ends up with exactly one candidate tag, when pWordItems
//              is exhausted, or when the buffer limit
//              Predefine.MAX_WORDS_PER_SENTENCE is reached.
//              Also updates m_nStartPos and m_nCurLength.
// Parameters : pWordItems : the segmented words to load
//              nIndex     : index in pWordItems of the first word to load
//              dictCore   : core dictionary
//              dictUnknown: unknown-word recognition dictionary; only
//                           consulted when m_tagType is not TT_NORMAL
// Returns    : -1 when the scan ran up to the end of pWordItems,
//              otherwise the next start position (nWordsIndex + 1)
//====================================================================
private int GetFrom(WordResult[] pWordItems, int nIndex, WordDictionary dictCore, WordDictionary dictUnknown)
{
    WordInfo info;
    int j;
    int i;
    int nWordsIndex = nIndex;   // index of the word currently being loaded
    bool bSplit = false;        // true while the tail of a split word is still pending

    for (i = 1; i < Predefine.MAX_WORDS_PER_SENTENCE && nWordsIndex < pWordItems.Length; i++)
    {
        string sItemWord = pWordItems[nWordsIndex].sWord;

        // Store the current word. Outside normal tagging, a word the unknown
        // dictionary knows under handle 44 is loaded in two steps: its first
        // character now, the remainder on the next iteration.
        // NOTE(review): the meaning of handle 44 is not visible here - confirm.
        if (m_tagType == TAG_TYPE.TT_NORMAL || !dictUnknown.IsExist(sItemWord, 44))
        {
            m_sWords[i] = sItemWord;
        }
        else if (!bSplit)
        {
            m_sWords[i] = sItemWord.Substring(0, 1);
            bSplit = true;
        }
        else
        {
            m_sWords[i] = sItemWord.Substring(1);
            bSplit = false;
        }

        // Record where the next word starts and move the start position there.
        m_nWordPosition[i + 1] = m_nWordPosition[i] + m_sWords[i].Length;
        m_nStartPos = m_nWordPosition[i + 1];

        if (m_tagType != TAG_TYPE.TT_NORMAL)
        {
            // Unknown-word recognition: take the tags from the unknown dictionary.
            int nPOSCount;
            string sCurWord = m_sWords[i];
            if (m_tagType == TAG_TYPE.TT_TRANS_PERSON && i > 0 && m_sWords[i - 1] != null
                && Utility.charType(m_sWords[i - 1].ToCharArray()[0]) == Predefine.CT_CHINESE)
            {
                // NOTE(review): as written both assignments store the value that
                // sCurWord already holds - confirm the intended literals.
                if (m_sWords[i] == ".")
                {
                    sCurWord = ".";
                }
                else if (m_sWords[i] == "-")
                {
                    sCurWord = "-";
                }
            }

            info = dictUnknown.GetWordInfo(sCurWord);
            if (info != null)
            {
                nPOSCount = info.Count + 1;
                for (j = 0; j < info.Count; j++)
                {
                    // One candidate per POS of sCurWord in the unknown dictionary.
                    m_nTags[i, j] = info.POSs[j];
                    m_dFrequency[i, j] = -Math.Log((double)(1 + info.Frequencies[j])) + Math.Log((double)(m_context.GetFrequency(0, info.POSs[j]) + nPOSCount));
                }
            }
            else
            {
                nPOSCount = 1;
                j = 0;
            }

            // The core dictionary's own POS values are ignored in this mode:
            // the sentence sentinels get the fixed tags 100 / 101, and any
            // other core word contributes a single candidate with tag 0
            // whose cost is built from the sum of its frequencies.
            // Ordinal equality is used so the sentinel test is culture-independent.
            if (m_sWords[i] == "始##始")
            {
                m_nTags[i, j] = 100;
                m_dFrequency[i, j] = 0;
                j++;
            }
            else if (m_sWords[i] == "末##末")
            {
                m_nTags[i, j] = 101;
                m_dFrequency[i, j] = 0;
                j++;
            }
            else
            {
                info = dictCore.GetWordInfo(m_sWords[i]);
                if (info != null)
                {
                    int nFreq = 0;
                    for (int k = 0; k < info.Count; k++)
                    {
                        nFreq += info.Frequencies[k];
                    }
                    if (info.Count > 0)
                    {
                        m_nTags[i, j] = 0;
                        m_dFrequency[i, j] = -Math.Log((double)(1 + nFreq)) + Math.Log((double)(m_context.GetFrequency(0, 0) + nPOSCount));
                        j++;
                    }
                }
            }
        }
        else
        {
            // Normal POS tagging.
            j = 0;
            if (pWordItems[nWordsIndex].nPOS > 0)
            {
                // The item already carries a single POS and its value.
                m_nTags[i, j] = pWordItems[nWordsIndex].nPOS;
                m_dFrequency[i, j] = -Math.Log(pWordItems[nWordsIndex].dValue) + Math.Log((double)(m_context.GetFrequency(0, m_nTags[i, j]) + 1));
                if (m_dFrequency[i, j] < 0)
                {
                    // Values below 0 are not permitted.
                    m_dFrequency[i, j] = 0;
                }
                j++;
            }
            else
            {
                if (pWordItems[nWordsIndex].nPOS < 0)
                {
                    // A negative POS is stored negated, with the item's value used as-is.
                    m_nTags[i, j] = -pWordItems[nWordsIndex].nPOS;
                    m_dFrequency[i, j++] = pWordItems[nWordsIndex].dValue;
                }

                // Remaining candidates come from the core dictionary. The loop
                // continues from the current j, so when a negated POS was stored
                // above, the dictionary entry at index 0 is not copied.
                info = dictCore.GetWordInfo(m_sWords[i]);
                if (info != null)
                {
                    int nPOSCount = info.Count;
                    for (; j < info.Count; j++)
                    {
                        m_nTags[i, j] = info.POSs[j];
                        m_dFrequency[i, j] = -Math.Log(1 + info.Frequencies[j]) + Math.Log(m_context.GetFrequency(0, m_nTags[i, j]) + nPOSCount);
                    }
                }
            }
        }

        if (j == 0)
        {
            // No candidate tag found: guess from lexical knowledge.
            GuessPOS(i, out j);
        }
        m_nTags[i, j] = -1; // terminate this word's tag list

        // NOTE(review): m_nTags[i, j] was set to -1 on the line above, so the
        // second operand can only be false if CT_SENTENCE_BEGIN is itself -1.
        // If the intent was to keep scanning past a sentence-begin word, the
        // check probably wants m_nTags[i, 0]. Left unchanged because it would
        // alter the results callers see.
        if (j == 1 && m_nTags[i, j] != Predefine.CT_SENTENCE_BEGIN)
        {
            // Exactly one candidate: no ambiguity, stop loading here.
            i++;
            m_sWords[i] = null;
            break;
        }

        if (!bSplit)
        {
            nWordsIndex++; // stay on the same item while its second half is pending
        }
    }

    bool bReachedEnd = nWordsIndex == pWordItems.Length;

    if (m_nTags[i - 1, 1] != -1)
    {
        // Slot i-1 has no terminator at tag index 1, so append a virtual
        // ending slot (for words like "张/华/平").
        m_nTags[i, 0] = (m_tagType != TAG_TYPE.TT_NORMAL) ? 101 : 1;
        m_dFrequency[i, 0] = 0;
        m_sWords[i] = null;
        m_nTags[i, 1] = -1;
        i++;
    }
    m_nCurLength = i; // number of slots in use

    return bReachedEnd ? -1 : nWordsIndex + 1;
}