/// <summary>
/// Generates the word results for a segmentation route held in
/// <paramref name="linkedArray"/>, after running the number/date post-processing passes,
/// and mirrors every resulting word into <paramref name="m_graphOptimum"/>.
/// </summary>
/// <param name="uniPath">Segmentation path; not read by this method.</param>
/// <param name="linkedArray">Linked list of words to post-process and export.</param>
/// <param name="m_graphOptimum">Graph that receives one element per exported word, at the node's (row, col).</param>
/// <returns>
/// One <c>WordResult</c> per node of the processed list, or <c>null</c> when
/// <paramref name="linkedArray"/> is empty on entry.
/// </returns>
private static WordResult[] GenerateWord(int[] uniPath, WordLinkedArray linkedArray, RowFirstDynamicArray<ChainContent> m_graphOptimum)
{
    if (linkedArray.Count == 0)
    {
        return null;
    }

    // Merge all separate, consecutive numbers into one number.
    MergeContinueNumIntoOne(ref linkedArray);

    // Handle the "--" delimiter.
    ChangeDelimiterPOS(ref linkedArray);

    // If the previous word is a number and the current word starts with a dash
    // but is longer than that single character, split the dash off.
    // Example: "3 / -4 / 月" must become "3 / - / 4 / 月".
    SplitMiddleSlashFromDigitalWords(ref linkedArray);

    // Date/time rules:
    // 1. A number followed by one of "月、日、时、分、秒、月份" is merged with it and tagged as time.
    // 2. A number usable as a year followed by "年" is merged and tagged as time; otherwise it stays a number.
    // 3. If the last Chinese character is "点", the number is treated as time.
    // 4. If the last character is none of "∶·./" nor half-width '.' or '/', it is a number.
    // 5. If the last character is one of those and the length exceeds 1, drop it (e.g. "1.").
    CheckDateElements(ref linkedArray);

    // Export the processed list.
    WordResult[] result = new WordResult[linkedArray.Count];
    int index = 0;
    for (WordNode node = linkedArray.first; node != null; node = node.next, index++)
    {
        WordResult word = new WordResult();
        word.sWord = node.theWord.sWord;
        word.nPOS = node.theWord.nPOS;
        word.dValue = node.theWord.dValue;
        result[index] = word;

        m_graphOptimum.SetElement(node.row, node.col, new ChainContent(word.dValue, word.nPOS, node.sWordInSegGraph));
    }

    return result;
}
/// <summary>
/// Unknown word recognition: tags the segmentation result with recognition roles and
/// writes every recognised unknown word into the optimised segmentation graph when it
/// is cheaper than what the graph already holds for the same atom span.
/// </summary>
/// <param name="pWordSegResult">Word segmentation result passed to the role tagger.</param>
/// <param name="graphOptimum">The optimised segmentation graph; updated in place.</param>
/// <param name="atomSegment">Atoms of the sentence; their string lengths are used to map offsets to atom indices.</param>
/// <param name="dictCore">Core dictionary passed to the role tagger.</param>
/// <returns>Always <c>true</c>.</returns>
public bool Recognition(WordResult[] pWordSegResult, RowFirstDynamicArray<ChainContent> graphOptimum, List<AtomNode> atomSegment, WordDictionary dictCore)
{
    // Running offset (sum of atom string lengths consumed so far) and the matching atom index.
    // Both persist across unknown words, so each atom is consumed at most once.
    int nStartPos = 0;
    int j = 0;

    // Tag the segmentation with unknown recognition roles according to the core
    // dictionary and the unknown recognition dictionary.
    m_roleTag.POSTagging(pWordSegResult, dictCore, m_dict);

    for (int i = 0; i < m_roleTag.m_nUnknownWordsCount; i++)
    {
        // Advance to the atom where the offset reaches m_nUnknownWords[i, 0]
        // (presumably the word's start offset -- confirm against the role tagger).
        while (j < atomSegment.Count && nStartPos < m_roleTag.m_nUnknownWords[i, 0])
        {
            nStartPos += atomSegment[j++].sWord.Length;
        }
        int nAtomStart = j;

        // Advance to the atom where the offset reaches m_nUnknownWords[i, 1]
        // (presumably the word's end offset -- confirm against the role tagger).
        while (j < atomSegment.Count && nStartPos < m_roleTag.m_nUnknownWords[i, 1])
        {
            nStartPos += atomSegment[j++].sWord.Length;
        }
        int nAtomEnd = j;

        if (nAtomStart >= nAtomEnd)
        {
            // The word maps to an empty atom span; nothing to record.
            continue;
        }

        // Weight currently stored for this span, or INFINITE_VALUE when the span is absent.
        double dValue;
        ChainItem<ChainContent> item = graphOptimum.GetElement(nAtomStart, nAtomEnd);
        if (item != null)
        {
            dValue = item.Content.eWeight;
        }
        else
        {
            dValue = Predefine.INFINITE_VALUE;
        }

        if (dValue > m_roleTag.m_dWordsPossibility[i])
        {
            // Set the element with the smaller value.
            graphOptimum.SetElement(nAtomStart, nAtomEnd, new ChainContent(m_roleTag.m_dWordsPossibility[i], m_nPOS, m_sUnknownFlags));
        }
    }

    return true;
}
/// <summary>
/// Generates the segmentation word net (graph of candidate words) from the atoms of a sentence.
/// </summary>
/// <param name="atomSegment">Atoms of the sentence, in order.</param>
/// <param name="coreDict">Core dictionary used to look up candidate words.</param>
/// <returns>
/// A graph where element (i, j) describes the candidate built from atoms i..j-1:
/// first one element per atom, then one per dictionary word found.
/// </returns>
public static RowFirstDynamicArray<ChainContent> GenerateWordNet(List<AtomNode> atomSegment, WordDictionary coreDict)
{
    string sWord = "", sMaxMatchWord;
    int nPOSRet, nPOS, nTotalFreq;
    double dValue = 0;

    RowFirstDynamicArray<ChainContent> m_segGraph = new RowFirstDynamicArray<ChainContent>();
    m_segGraph.SetEmpty();

    // Pass 1: store every atom in the graph (initialise the cost array).
    for (int i = 0; i < atomSegment.Count; i++)
    {
        if (atomSegment[i].nPOS == Predefine.CT_CHINESE)
        {
            m_segGraph.SetElement(i, i + 1, new ChainContent(0, 0, atomSegment[i].sWord));
        }
        else
        {
            sWord = atomSegment[i].sWord; // init the word
            dValue = Predefine.MAX_FREQUENCE;
            switch (atomSegment[i].nPOS)
            {
                case Predefine.CT_INDEX:
                case Predefine.CT_NUM:
                    nPOS = -27904; // -('m' * 256)
                    sWord = "未##数";
                    dValue = 0;
                    break;

                case Predefine.CT_DELIMITER:
                    nPOS = 30464; // 'w' * 256
                    break;

                case Predefine.CT_LETTER:
                    nPOS = -28280; // -('n' * 256) - 'x'
                    dValue = 0;
                    sWord = "未##串";
                    break;

                case Predefine.CT_SINGLE: // e.g. 12021-2129-3121
                    if (Regex.IsMatch(atomSegment[i].sWord, @"^(-?\d+)(\.\d+)?$"))
                    {
                        // Matches an (optionally signed) integer or decimal number.
                        nPOS = -27904; // -('m' * 256)
                        sWord = "未##数";
                    }
                    else
                    {
                        nPOS = -28280; // -('n' * 256) - 'x'
                        sWord = "未##串";
                    }
                    dValue = 0;
                    break;

                default:
                    nPOS = atomSegment[i].nPOS;
                    break;
            }

            m_segGraph.SetElement(i, i + 1, new ChainContent(dValue, nPOS, sWord)); // init the link
        }
    }

    // Pass 2: store every possible word combination in the graph.
    for (int i = 0; i < atomSegment.Count; i++)
    {
        sWord = atomSegment[i].sWord; // current candidate = atoms i..j-1
        int j = i + 1;

        // NOTE(review): because of the j < Count guard, a candidate that extends through the
        // final atom is never looked up -- presumably the final atom is a sentence-end
        // marker; confirm against the atom segmenter.
        while (j < atomSegment.Count && coreDict.GetMaxMatch(sWord, out sMaxMatchWord, out nPOSRet))
        {
            if (sMaxMatchWord == sWord)
            {
                // Exactly the word we are looking for; it may carry several parts of speech.
                WordInfo info = coreDict.GetWordInfo(sWord);

                // Sum the frequencies over all of the word's parts of speech.
                nTotalFreq = 0;
                for (int k = 0; k < info.Count; k++)
                {
                    nTotalFreq += info.Frequencies[k];
                }

                // Restrict some special words: after a numeric atom, a two-character word
                // starting with "年" or "月" and ending in one of the listed suffixes
                // (e.g. "1年内", "1999年末") stops the extension for this start atom.
                // Ordinal char comparisons are used so the result does not depend on the
                // current culture or on culture-ignorable characters.
                if (sWord.Length == 2
                    && (sWord[0] == '年' || sWord[0] == '月')
                    && i >= 1
                    && (Utility.IsAllNum(atomSegment[i - 1].sWord) || Utility.IsAllChineseNum(atomSegment[i - 1].sWord)))
                {
                    if ("末内中底前间初".IndexOf(sWord[1]) >= 0)
                    {
                        break;
                    }
                }

                // Store the word's part of speech if it has exactly one; otherwise record 0.
                if (info.Count == 1)
                {
                    m_segGraph.SetElement(i, j, new ChainContent(nTotalFreq, info.POSs[0], sWord));
                }
                else
                {
                    m_segGraph.SetElement(i, j, new ChainContent(nTotalFreq, 0, sWord));
                }
            }

            sWord += atomSegment[j++].sWord;
        }
    }

    return m_segGraph;
}