//====================================================================
// Func Name  : GenerateWordNet
// Description: Build the segmentation word net (word graph) for a
//              sentence that has already been split into atoms.
//              Element (i, j) of the result describes the candidate
//              word formed by atoms i .. j-1.
// Parameters : atomSegment: the atoms of the sentence, in order
//              coreDict   : core dictionary used to look up words
// Returns    : the populated word graph
//====================================================================
public static RowFirstDynamicArray<ChainContent> GenerateWordNet(List<AtomNode> atomSegment, WordDictionary coreDict)
{
    string sWord, sMaxMatchWord;
    int nPOSRet, nPOS, nTotalFreq;
    double dValue;

    RowFirstDynamicArray<ChainContent> m_segGraph = new RowFirstDynamicArray<ChainContent>();
    m_segGraph.SetEmpty();

    // Pass 1: store every single atom in the graph as edge (i, i + 1).
    for (int i = 0; i < atomSegment.Count; i++)
    {
        AtomNode atom = atomSegment[i];

        if (atom.nPOS == Predefine.CT_CHINESE)
        {
            m_segGraph.SetElement(i, i + 1, new ChainContent(0, 0, atom.sWord));
            continue;
        }

        sWord = atom.sWord;
        dValue = Predefine.MAX_FREQUENCE;

        switch (atom.nPOS)
        {
            case Predefine.CT_INDEX:
            case Predefine.CT_NUM:
                nPOS = -27904; // -('m' * 256)
                sWord = "未##数";
                dValue = 0;
                break;

            case Predefine.CT_DELIMITER:
                nPOS = 30464; // 'w' * 256
                break;

            case Predefine.CT_LETTER:
                nPOS = -28280; // -('n' * 256 + 'x')
                sWord = "未##串";
                dValue = 0;
                break;

            case Predefine.CT_SINGLE:
                // A plain integer or decimal is treated as a number; anything
                // else (e.g. "12021-2129-3121") is treated as a generic string.
                if (Regex.IsMatch(atom.sWord, @"^(-?\d+)(\.\d+)?$"))
                {
                    nPOS = -27904; // -('m' * 256)
                    sWord = "未##数";
                }
                else
                {
                    nPOS = -28280; // -('n' * 256 + 'x')
                    sWord = "未##串";
                }
                dValue = 0;
                break;

            default:
                nPOS = atom.nPOS;
                break;
        }

        m_segGraph.SetElement(i, i + 1, new ChainContent(dValue, nPOS, sWord));
    }

    // Pass 2: starting at every atom, keep appending the following atoms while
    // the dictionary still has an entry that begins with the accumulated text,
    // and store every accumulated text that is itself a dictionary word.
    for (int i = 0; i < atomSegment.Count; i++)
    {
        sWord = atomSegment[i].sWord;
        int j = i + 1; // sWord always covers atoms i .. j-1

        // j may reach atomSegment.Count so that a word ending on the final
        // atom is examined too; the append step below is guarded instead.
        while (j <= atomSegment.Count && coreDict.GetMaxMatch(sWord, out sMaxMatchWord, out nPOSRet))
        {
            if (sMaxMatchWord == sWord) // the accumulated text is exactly a dictionary word
            {
                // The word may carry several parts of speech.
                WordInfo info = coreDict.GetWordInfo(sWord);

                // Special case: a two-character word such as "年内" or "月底" that
                // directly follows a number (e.g. "1年内", "1999年末") is not added,
                // and extension from this atom stops.
                if (sWord.Length == 2
                    && (sWord[0] == '年' || sWord[0] == '月')
                    && i >= 1
                    && (Utility.IsAllNum(atomSegment[i - 1].sWord) || Utility.IsAllChineseNum(atomSegment[i - 1].sWord))
                    && "末内中底前间初".IndexOf(sWord[1]) >= 0)
                {
                    break;
                }

                // GetWordInfo can return null; skip the candidate rather than
                // dereferencing a missing result.
                if (info != null)
                {
                    // Sum the frequencies over all parts of speech of the word.
                    nTotalFreq = 0;
                    for (int k = 0; k < info.Count; k++)
                    {
                        nTotalFreq += info.Frequencies[k];
                    }

                    // Keep the part of speech only when it is unambiguous; otherwise record 0.
                    int nWordPOS = (info.Count == 1) ? info.POSs[0] : 0;
                    m_segGraph.SetElement(i, j, new ChainContent(nTotalFreq, nWordPOS, sWord));
                }
            }

            if (j >= atomSegment.Count)
            {
                break; // no further atom to append
            }

            sWord += atomSegment[j++].sWord;
        }
    }

    return m_segGraph;
}
// Collects every (part of speech, frequency) pair stored for a word.
// The original index table is searched first and the modify table second;
// the result comes from whichever table reports a match first.
// Returns null when PreProcessing rejects the word or neither table has it.
public WordInfo GetWordInfo(string sWord)
{
    WordInfo result = new WordInfo();
    // Assigned before PreProcessing, which receives sWord by ref.
    result.sWord = sWord;

    int firstCharId;
    string key;
    if (!PreProcessing(ref sWord, out firstCharId, out key))
    {
        return null;
    }

    // Original (index) table: consume the run of consecutive items whose
    // text equals the key, starting at the first match.
    int pos;
    if (FindFirstMatchItemInOrgTbl(firstCharId, key, out pos))
    {
        for (; pos < indexTable[firstCharId].nCount; pos++)
        {
            if (string.Compare(indexTable[firstCharId].WordItems[pos].sWord, key) != 0)
            {
                break;
            }

            result.POSs.Add(indexTable[firstCharId].WordItems[pos].nPOS);
            result.Frequencies.Add(indexTable[firstCharId].WordItems[pos].nFrequency);
            result.Count++;
        }

        return result;
    }

    // Modify table: a linked chain per first character.
    WordChain previous;
    if (!FindInModifyTable(firstCharId, key, out previous))
    {
        return null;
    }

    // Start at the node after the reported predecessor, or at the chain
    // head when no predecessor was reported.
    WordChain node = modifyTable[firstCharId].pWordItemHead;
    if (previous != null)
    {
        node = previous.next;
    }

    // The comparison here ignores case, unlike the index-table scan above.
    for (; node != null; node = node.next)
    {
        if (string.Compare(node.data.sWord, key, true) != 0)
        {
            break;
        }

        result.POSs.Add(node.data.nPOS);
        result.Frequencies.Add(node.data.nFrequency);
        result.Count++;
    }

    return result;
}