//====================================================================
// Reports the N-shortest-path stage.
// Every path is rendered as one line of ", "-terminated words and the
// whole text is handed to SendEvents as a SegmentStage.NShortPath event.
//   paths    : each entry is a list of indices into segGraph.ToListItems()
//   segGraph : the segmentation graph the indices refer to
//====================================================================
private void OnNShortPath(List<int[]> paths, RowFirstDynamicArray<ChainContent> segGraph)
{
    List<ChainItem<ChainContent>> items = segGraph.ToListItems();
    StringBuilder report = new StringBuilder();

    foreach (int[] path in paths)
    {
        foreach (int itemIndex in path)
        {
            ChainItem<ChainContent> item = items[itemIndex];
            string word = item.Content.sWord;

            switch (word)
            {
                case "未##人":
                case "未##地":
                case "未##数":
                case "未##时":
                case "未##串":
                    // Placeholder word: print the atoms it spans (row inclusive, col exclusive).
                    for (int atomIndex = item.row; atomIndex < item.col; atomIndex++)
                    {
                        report.Append(atomSegment[atomIndex].sWord);
                    }
                    report.Append(", ");
                    break;

                default:
                    report.Append(word);
                    report.Append(", ");
                    break;
            }
        }

        report.Append("\r\n");
    }

    SendEvents(new SegmentEventArgs(SegmentStage.NShortPath, report.ToString()));
}
//====================================================================
// Prepares the position map used to record where each word sits.
// Walks the chain from its head and stores, in chain order,
//   row * Predefine.MAX_SENTENCE_LEN + col
// for every item. Returns null when GetTail reports no words.
//====================================================================
private static int[] PreparePositionMap(RowFirstDynamicArray<ChainContent> aWord)
{
    // GetTail hands back the tail item and returns the word count;
    // only the count is needed here.
    ChainItem<ChainContent> tail;
    int wordCount = aWord.GetTail(out tail);

    int[] positions = wordCount > 0 ? new int[wordCount] : null;

    // Record the position of every possible word.
    int slot = 0;
    for (ChainItem<ChainContent> cur = aWord.GetHead(); cur != null; cur = cur.next)
    {
        positions[slot++] = cur.row * Predefine.MAX_SENTENCE_LEN + cur.col;
    }

    return positions;
}
//====================================================================
// Re-segments using the current optimum graph (m_graphOptimum).
// Builds the bi-word net from it, computes the N shortest paths and
// regenerates m_pWordSeg. As a side effect the previous optimum graph
// becomes segGraph and m_graphOptimum is replaced by a fresh graph that
// GenerateWord fills in.
//   nResultCount   : passed to NShortPath.Calculate
//   dSmoothingPara : smoothing parameter passed to BiGraphGenerate
// Returns the number of segmentation results stored in m_pWordSeg.
//====================================================================
public int BiOptimumSegment(int nResultCount, double dSmoothingPara)
{
    // Generate the bi-word link net and report it.
    ColumnFirstDynamicArray<ChainContent> biWordNet =
        BiGraphGenerate(m_graphOptimum, dSmoothingPara, biDict, coreDict);
    OnGenBiOptimumSegGraph(biWordNet);

    NShortPath.Calculate(biWordNet, nResultCount);
    List<int[]> shortestPaths = NShortPath.GetNPaths(Predefine.MAX_SEGMENT_NUM);

    m_pWordSeg = new List<WordResult[]>();
    segGraph = m_graphOptimum;
    m_graphOptimum = new RowFirstDynamicArray<ChainContent>();

    foreach (int[] path in shortestPaths)
    {
        WordLinkedArray linked = BiPath2LinkedArray(path, segGraph, atomSegment);
        WordResult[] words = GenerateWord(path, linked, m_graphOptimum);
        if (words == null)
        {
            continue;
        }

        m_pWordSeg.Add(words);
    }

    return m_pWordSeg.Count;
}
//====================================================================
// Runs only the first stages of segmentation and returns the bi-word
// graph: atom segmentation -> word net -> pairwise (bi-word) graph.
// Only the atom-segmentation event is raised; no backward optimisation
// or N-shortest-path step is performed here.
// Updates the atomSegment, segGraph and biGraphResult members.
//   sSentence  : the sentence to segment
//   smoothPara : smoothing parameter passed to BiGraphGenerate
//   nKind      : not used by this method (kept for signature compatibility)
// Throws InvalidOperationException when biDict or coreDict is null.
//====================================================================
public ColumnFirstDynamicArray<ChainContent> TestSegment(string sSentence, double smoothPara, int nKind)
{
    if (biDict == null || coreDict == null)
    {
        throw new InvalidOperationException("biDict 或 coreDict 尚未初始化!");
    }

    // Atom segmentation.
    atomSegment = AtomSegment(sSentence);
    OnAtomSegment(atomSegment);

    // Look up the dictionary and store every possible word candidate in the graph.
    segGraph = GenerateWordNet(atomSegment, coreDict);

    // Find every possible pairwise combination of adjacent words.
    biGraphResult = BiGraphGenerate(segGraph, smoothPara, biDict, coreDict);

    return biGraphResult;
}
//====================================================================
// Full bi-gram segmentation of a sentence.
// Stages (each followed by its notification call):
//   1. atom segmentation
//   2. word-net generation from the core dictionary
//   3. pairwise (bi-word) graph generation
//   4. backward-matching optimisation
//   5. N-shortest-path computation producing several candidate routes
// Every route is converted into a WordResult[] and collected in
// m_pWordSeg; m_graphOptimum is rebuilt by GenerateWord along the way.
//   sSentence  : the sentence to segment
//   smoothPara : smoothing parameter passed to BiGraphGenerate
//   nKind      : passed to NShortPath.Calculate
// Returns the number of segmentation results stored in m_pWordSeg.
// Throws InvalidOperationException when biDict or coreDict is null.
//====================================================================
public int BiSegment(string sSentence, double smoothPara, int nKind)
{
    if (biDict == null || coreDict == null)
    {
        throw new InvalidOperationException("biDict 或 coreDict 尚未初始化!");
    }

    // Atom segmentation.
    atomSegment = AtomSegment(sSentence);
    OnAtomSegment(atomSegment);

    // Look up the dictionary and store every possible word candidate in the graph.
    segGraph = GenerateWordNet(atomSegment, coreDict);
    OnGenSegGraph(segGraph);

    // Find every possible pairwise combination of adjacent words.
    biGraphResult = BiGraphGenerate(segGraph, smoothPara, biDict, coreDict);
    OnGenBiSegGraph(biGraphResult);

    // Backward-matching optimisation.
    biGraphResult = BackwardOptimize(biGraphResult);
    OnBackwardOptimize(biGraphResult);

    // N-shortest paths yield several candidate segmentations.
    NShortPath.Calculate(biGraphResult, nKind);
    List<int[]> spResult = NShortPath.GetNPaths(Predefine.MAX_SEGMENT_NUM);
    OnNShortPath(spResult, segGraph);

    m_pWordSeg = new List<WordResult[]>();
    m_graphOptimum = new RowFirstDynamicArray<ChainContent>();

    foreach (int[] path in spResult)
    {
        WordLinkedArray linkedArray = BiPath2LinkedArray(path, segGraph, atomSegment);
        WordResult[] tmpResult = GenerateWord(path, linkedArray, m_graphOptimum);

        // GenerateWord returns null for an empty route; skip those.
        if (tmpResult != null)
        {
            m_pWordSeg.Add(tmpResult);
        }
    }

    OnBeforeOptimize(m_pWordSeg);

    return m_pWordSeg.Count;
}
//====================================================================
// Unknown word recognition.
//   pWordSegResult : word segmentation result to be role-tagged
//   graphOptimum   : the optimised segmentation graph; updated in place
//   atomSegment    : the atoms of the sentence
//   dictCore       : the core dictionary
// After role tagging, every unknown word reported by m_roleTag is mapped
// from its [start, end) offsets (compared against accumulated atom
// lengths) to an atom span, and written into graphOptimum when its
// possibility value is lower than the value already stored for that
// span (or when the span has no element yet).
// Always returns true.
//====================================================================
public bool Recognition(WordResult[] pWordSegResult, RowFirstDynamicArray<ChainContent> graphOptimum, List<AtomNode> atomSegment, WordDictionary dictCore)
{
    // nStartPos and j are deliberately kept across iterations: the scan over
    // the atoms only ever moves forward from one unknown word to the next.
    int nStartPos = 0, j = 0;

    // Tag the segmentation with unknown-recognition roles according to the
    // core dictionary and the unknown-recognition dictionary.
    m_roleTag.POSTagging(pWordSegResult, dictCore, m_dict);

    for (int i = 0; i < m_roleTag.m_nUnknownWordsCount; i++)
    {
        // Advance to the atom where the unknown word starts.
        while (j < atomSegment.Count && nStartPos < m_roleTag.m_nUnknownWords[i, 0])
        {
            nStartPos += atomSegment[j++].sWord.Length;
        }
        int nAtomStart = j;

        // Advance to the atom just past the end of the unknown word.
        while (j < atomSegment.Count && nStartPos < m_roleTag.m_nUnknownWords[i, 1])
        {
            nStartPos += atomSegment[j++].sWord.Length;
        }
        int nAtomEnd = j;

        if (nAtomStart >= nAtomEnd)
        {
            continue;
        }

        ChainItem<ChainContent> item = graphOptimum.GetElement(nAtomStart, nAtomEnd);

        double dValue;
        if (item != null)
        {
            dValue = item.Content.eWeight;
        }
        else
        {
            // No element yet for this span: any possibility value wins.
            dValue = Predefine.INFINITE_VALUE;
        }

        if (dValue > m_roleTag.m_dWordsPossibility[i])
        {
            // Set the element with the smaller value.
            graphOptimum.SetElement(nAtomStart, nAtomEnd, new ChainContent(m_roleTag.m_dWordsPossibility[i], m_nPOS, m_sUnknownFlags));
        }
    }

    return true;
}
//====================================================================
// Converts a bi-path into a WordLinkedArray.
// Example sentence: “他说的确实在理”
//   BiPath: (0, 1, 2, 3, 6, 9, 11, 12)
//   index :  0     1  2  3  4    5  6    7  8    9  10   11 12
//   word  :  始##始 他 说 的 的确 确 确实 实 实在 在 在理 理 末##末
// Each path entry is an index into segGraph.ToListItems(). For the
// placeholder words (未##人, 未##地, 未##数, 未##时, 未##串) the node text is
// rebuilt from the atoms the item spans; otherwise the graph word is used.
//====================================================================
private static WordLinkedArray BiPath2LinkedArray(int[] biPath, RowFirstDynamicArray<ChainContent> segGraph, List<AtomNode> atomSegment)
{
    List<ChainItem<ChainContent>> items = segGraph.ToListItems();
    WordLinkedArray linked = new WordLinkedArray();
    StringBuilder surface = new StringBuilder();

    foreach (int itemIndex in biPath)
    {
        ChainItem<ChainContent> item = items[itemIndex];

        WordNode node = new WordNode();
        node.row = item.row;
        node.col = item.col;
        node.sWordInSegGraph = item.Content.sWord;
        node.theWord = new WordResult();

        switch (node.sWordInSegGraph)
        {
            case "未##人":
            case "未##地":
            case "未##数":
            case "未##时":
            case "未##串":
                // Rebuild the original text from atoms [row, col).
                surface.Length = 0;
                for (int atomIndex = node.row; atomIndex < node.col; atomIndex++)
                {
                    surface.Append(atomSegment[atomIndex].sWord);
                }
                node.theWord.sWord = surface.ToString();
                break;

            default:
                node.theWord.sWord = item.Content.sWord;
                break;
        }

        node.theWord.nPOS = item.Content.nPOS;
        node.theWord.dValue = item.Content.eWeight;

        linked.AppendNode(node);
    }

    return linked;
}
// Passes a SegmentStage.PersonAndPlaceRecognition event carrying the
// graph's ToString() text to SendEvents.
private void OnPersonAndPlaceRecognition(RowFirstDynamicArray<ChainContent> m_graphOptimum)
{
    string graphText = m_graphOptimum.ToString();
    SendEvents(new SegmentEventArgs(SegmentStage.PersonAndPlaceRecognition, graphText));
}
// Passes a SegmentStage.OptimumSegment event carrying the graph's
// ToString() text to SendEvents.
private void OnOptimumSegment(RowFirstDynamicArray<ChainContent> m_graphOptimum)
{
    string graphText = m_graphOptimum.ToString();
    SendEvents(new SegmentEventArgs(SegmentStage.OptimumSegment, graphText));
}
// Passes a SegmentStage.GenSegGraph event carrying the graph's
// ToString() text to SendEvents.
private void OnGenSegGraph(RowFirstDynamicArray<ChainContent> segGraph)
{
    string graphText = segGraph.ToString();
    SendEvents(new SegmentEventArgs(SegmentStage.GenSegGraph, graphText));
}
//====================================================================
// Generates the words according to the segmentation route.
//   uniPath         : not read by this method
//   linkedArray     : the route as a linked list of word nodes
//   m_graphOptimum  : receives one element per resulting word
// Returns null when the route is empty, otherwise one WordResult per
// node left in linkedArray after the adjustment passes below.
//====================================================================
private static WordResult[] GenerateWord(int[] uniPath, WordLinkedArray linkedArray, RowFirstDynamicArray<ChainContent> m_graphOptimum)
{
    if (linkedArray.Count == 0)
    {
        return null;
    }

    // Pass 1: merge all separate consecutive numbers into one number.
    MergeContinueNumIntoOne(ref linkedArray);

    // Pass 2: handle the delimiter "--".
    ChangeDelimiterPOS(ref linkedArray);

    // Pass 3: if the previous word is a number and the current word starts
    // with "-" and is longer than that one character, split the "-" off.
    // Example: "3 / -4 / 月" must become "3 / - / 4 / 月".
    SplitMiddleSlashFromDigitalWords(ref linkedArray);

    // Pass 4 (date elements), as described by the original notes:
    //   1. A number followed by one of "月、日、时、分、秒、月份" is merged with it
    //      and tagged as time.
    //   2. A number usable as a year followed by "年" is merged and tagged as
    //      time; otherwise it stays a number.
    //   3. If the last character is "点", the number is treated as time.
    //   4. If the last character is not one of "∶·./" or half-width '.' '/',
    //      it is a number.
    //   5. If the last character is one of those and the length is greater
    //      than 1, the last character is removed, e.g. "1.".
    CheckDateElements(ref linkedArray);

    // Emit the result and mirror every word into the optimum graph.
    WordResult[] words = new WordResult[linkedArray.Count];
    int slot = 0;

    for (WordNode node = linkedArray.first; node != null; node = node.next)
    {
        WordResult word = new WordResult();
        word.sWord = node.theWord.sWord;
        word.nPOS = node.theWord.nPOS;
        word.dValue = node.theWord.dValue;

        words[slot++] = word;

        m_graphOptimum.SetElement(node.row, node.col, new ChainContent(word.dValue, word.nPOS, node.sWordInSegGraph));
    }

    return words;
}
//====================================================================
// Generates the bi-word graph linking every pair of adjacent words.
//   aWord      : the word graph; an item spans atoms [row, col)
//   smoothPara : smoothing factor a in -log{a*P(Ci-1)+(1-a)P(Ci|Ci-1)}, 0<a<1
//   biDict     : dictionary queried for the frequency of the word pair
//   coreDict   : dictionary queried for the frequency of unknown words
// For every word and every word that starts where it ends, an element is
// stored at (index of current word, index of next word), where the
// indices are positions in the map built by PreparePositionMap.
//====================================================================
public static ColumnFirstDynamicArray<ChainContent> BiGraphGenerate(
    RowFirstDynamicArray<ChainContent> aWord, double smoothPara,
    WordDictionary biDict, WordDictionary coreDict)
{
    ColumnFirstDynamicArray<ChainContent> aBiWordNet = new ColumnFirstDynamicArray<ChainContent>();
    StringBuilder sb = new StringBuilder();

    // Record the position map of possible words.
    int[] wordPosMap = PreparePositionMap(aWord);

    // Smoothing floor; does not depend on the words, so compute it once.
    double dTemp = 1.0 / Predefine.MAX_FREQUENCE;

    for (ChainItem<ChainContent> pCur = aWord.GetHead(); pCur != null; pCur = pCur.next)
    {
        double dCurFreqency;
        if (pCur.Content.nPOS >= 0)
        {
            // Not an unknown word: its weight is its frequency.
            dCurFreqency = pCur.Content.eWeight;
        }
        else
        {
            // Unknown word: look the frequency up in the core dictionary.
            dCurFreqency = coreDict.GetFrequency(pCur.Content.sWord, 2);
        }

        // Position index of the current word in the map; it is the same for
        // every successor, so it is looked up once per word.
        int nCurWordIndex = Utility.BinarySearch(pCur.row * Predefine.MAX_SENTENCE_LEN + pCur.col, wordPosMap);

        // Successors are the words whose row equals this word's col
        // (the original note calls this "a rather special correspondence").
        ChainItem<ChainContent> pNextWords = aWord.GetFirstElementOfRow(pCur.col);
        while (pNextWords != null && pNextWords.row == pCur.col)
        {
            sb.Length = 0;
            sb.Append(pCur.Content.sWord);
            sb.Append(Predefine.WORD_SEGMENTER);
            sb.Append(pNextWords.Content.sWord);
            string sTwoWords = sb.ToString();

            // Frequency of the two linked words.
            int nTwoWordsFreq = biDict.GetFrequency(sTwoWords, 3);

            // Smoothing: -log{a*P(Ci-1)+(1-a)P(Ci|Ci-1)}, 0<a<1
            double dValue = -Math.Log(smoothPara * (1.0 + dCurFreqency) / (Predefine.MAX_FREQUENCE + 80000.0)
                                      + (1.0 - smoothPara) * ((1.0 - dTemp) * nTwoWordsFreq / (1.0 + dCurFreqency) + dTemp));

            // Unknown words: P(Wi|Ci); known words: 1.
            // NOTE(review): this adds the (negative) POS code rather than a
            // probability term; the word's weight may have been intended.
            // Left unchanged because it affects segmentation results - confirm.
            if (pCur.Content.nPOS < 0)
            {
                dValue += pCur.Content.nPOS;
            }

            int nNextWordIndex = Utility.BinarySearch(pNextWords.row * Predefine.MAX_SENTENCE_LEN + pNextWords.col, wordPosMap);

            aBiWordNet.SetElement(nCurWordIndex, nNextWordIndex, new ChainContent(dValue, pCur.Content.nPOS, sTwoWords));

            pNextWords = pNextWords.next; // Get next word
        }
    }

    return aBiWordNet;
}
//====================================================================
// Func Name  : GenerateWordNet
// Description: Generate the segmentation word net from the atoms of
//              the sentence.
// Parameters : atomSegment: the atoms of the sentence
//              coreDict   : core dictionary
// Returns    : the word graph; an element (i, j) covers atoms [i, j)
//====================================================================
public static RowFirstDynamicArray<ChainContent> GenerateWordNet(List<AtomNode> atomSegment, WordDictionary coreDict)
{
    string sWord, sMaxMatchWord;
    int nPOSRet, nPOS, nTotalFreq;
    double dValue;

    RowFirstDynamicArray<ChainContent> m_segGraph = new RowFirstDynamicArray<ChainContent>();
    m_segGraph.SetEmpty();

    // Pass 1: store every single atom in the graph.
    for (int i = 0; i < atomSegment.Count; i++)
    {
        if (atomSegment[i].nPOS == Predefine.CT_CHINESE)
        {
            m_segGraph.SetElement(i, i + 1, new ChainContent(0, 0, atomSegment[i].sWord));
            continue;
        }

        sWord = atomSegment[i].sWord;
        dValue = Predefine.MAX_FREQUENCE;

        switch (atomSegment[i].nPOS)
        {
            case Predefine.CT_INDEX:
            case Predefine.CT_NUM:
                nPOS = -27904; // -('m' * 256)
                sWord = "未##数";
                dValue = 0;
                break;

            case Predefine.CT_DELIMITER:
                nPOS = 30464; // 'w' * 256
                break;

            case Predefine.CT_LETTER:
                nPOS = -28280; // -('n' * 256) - 'x'
                dValue = 0;
                sWord = "未##串";
                break;

            case Predefine.CT_SINGLE: // e.g. 12021-2129-3121
                if (Regex.IsMatch(atomSegment[i].sWord, @"^(-?\d+)(\.\d+)?$"))
                {
                    // Optionally signed integer or decimal: treat as a number.
                    nPOS = -27904; // -('m' * 256)
                    sWord = "未##数";
                }
                else
                {
                    nPOS = -28280; // -('n' * 256) - 'x'
                    sWord = "未##串";
                }
                dValue = 0;
                break;

            default:
                nPOS = atomSegment[i].nPOS;
                break;
        }

        m_segGraph.SetElement(i, i + 1, new ChainContent(dValue, nPOS, sWord));
    }

    // Pass 2: store every possible multi-atom word in the graph.
    for (int i = 0; i < atomSegment.Count; i++)
    {
        sWord = atomSegment[i].sWord; // start from the current atom
        int j = i + 1;

        // The candidate sWord covers atoms [i, j). Because of j < Count, a
        // candidate that reaches through the final atom is never examined
        // (presumably the final atom is a sentence-end marker - confirm).
        while (j < atomSegment.Count && coreDict.GetMaxMatch(sWord, out sMaxMatchWord, out nPOSRet))
        {
            if (sMaxMatchWord == sWord)
            {
                // Exact dictionary hit; the word may carry several POS tags.
                WordInfo info = coreDict.GetWordInfo(sWord);

                // Sum the frequencies over all of its POS tags.
                nTotalFreq = 0;
                for (int k = 0; k < info.Count; k++)
                {
                    nTotalFreq += info.Frequencies[k];
                }

                // Suppress some special words: a two-character word starting
                // with 年 or 月 and ending in one of 末内中底前间初, directly after
                // a number (e.g. "1年内", "1999年末"). Stop extending from this
                // atom altogether. Characters are compared ordinally so the
                // result does not depend on the current culture.
                if (sWord.Length == 2
                    && (sWord[0] == '年' || sWord[0] == '月')
                    && i >= 1
                    && (Utility.IsAllNum(atomSegment[i - 1].sWord) || Utility.IsAllChineseNum(atomSegment[i - 1].sWord))
                    && "末内中底前间初".IndexOf(sWord[1]) >= 0)
                {
                    break;
                }

                // Keep the POS when the word has exactly one; otherwise record 0.
                if (info.Count == 1)
                {
                    m_segGraph.SetElement(i, j, new ChainContent(nTotalFreq, info.POSs[0], sWord));
                }
                else
                {
                    m_segGraph.SetElement(i, j, new ChainContent(nTotalFreq, 0, sWord));
                }
            }

            sWord += atomSegment[j++].sWord;
        }
    }

    return m_segGraph;
}