/// <summary>
/// Stores the cost graph and the number of path kinds in the static state and
/// allocates the per-node parent queues and weight tables.
/// </summary>
/// <param name="apCost">Cost graph whose column count determines the number of nodes.</param>
/// <param name="nValueKind">Number of weight/parent slots allocated for every node.</param>
private static void InitNShortPath(ColumnFirstDynamicArray<ChainContent> apCost, int nValueKind)
{
    m_apCost = apCost;          // Set the cost
    m_nValueKind = nValueKind;  // Set the value kind

    // Number of vertices.
    // Note by zhenyulu: the original program used
    //     m_nNode = Math.Max(apCost.ColumnCount, apCost.RowCount) + 1;
    // but apCost.ColumnCount should always be greater than apCost.RowCount,
    // so it was simplified to the form below.
    m_nNode = apCost.ColumnCount + 1;

    // The first node is not included, hence one slot fewer than the node count.
    int slotCount = m_nNode - 1;
    m_pParent = new CQueue[slotCount][];
    m_pWeight = new double[slotCount][];

    // One queue array and one weight array for every node.
    for (int node = 0; node < slotCount; node++)
    {
        CQueue[] parentQueues = new CQueue[nValueKind];
        m_pParent[node] = parentQueues;
        m_pWeight[node] = new double[nValueKind];

        for (int kind = 0; kind < nValueKind; kind++)
        {
            parentQueues[kind] = new CQueue();
        }
    }
}
/// <summary>
/// Builds the bi-word graph from the current optimum graph, runs the N-shortest-path
/// calculation on it and regenerates the word segmentation results.
/// </summary>
/// <param name="nResultCount">Value passed to <c>NShortPath.Calculate</c> as the number of path kinds.</param>
/// <param name="dSmoothingPara">Smoothing parameter forwarded to <c>BiGraphGenerate</c>.</param>
/// <returns>The number of segmentation results stored in <c>m_pWordSeg</c>.</returns>
public int BiOptimumSegment(int nResultCount, double dSmoothingPara)
{
    // Generate the bi-word link net
    ColumnFirstDynamicArray<ChainContent> aBiwordsNet =
        BiGraphGenerate(m_graphOptimum, dSmoothingPara, biDict, coreDict);
    OnGenBiOptimumSegGraph(aBiwordsNet);

    NShortPath.Calculate(aBiwordsNet, nResultCount);
    List<int[]> spResult = NShortPath.GetNPaths(Predefine.MAX_SEGMENT_NUM);

    // The previous optimum graph becomes the segment graph; a fresh optimum graph is
    // filled while the words are generated below.
    m_pWordSeg = new List<WordResult[]>();
    segGraph = m_graphOptimum;
    m_graphOptimum = new RowFirstDynamicArray<ChainContent>();

    foreach (int[] path in spResult)
    {
        WordLinkedArray linkedArray = BiPath2LinkedArray(path, segGraph, atomSegment);
        WordResult[] words = GenerateWord(path, linkedArray, m_graphOptimum);

        // Paths for which no word array was generated are skipped.
        if (words == null)
        {
            continue;
        }

        m_pWordSeg.Add(words);
    }

    return m_pWordSeg.Count;
}
/// <summary>
/// Runs atom segmentation, word-net generation and bi-word graph generation for a
/// sentence and returns the resulting bi-word graph.
/// Only the atom-segmentation event is raised here; the graph events and the
/// backward optimisation step are not performed by this method.
/// </summary>
/// <param name="sSentence">Sentence to segment.</param>
/// <param name="smoothPara">Smoothing parameter forwarded to <c>BiGraphGenerate</c>.</param>
/// <param name="nKind">Not used by this method.</param>
/// <returns>The bi-word graph, which is also stored in <c>biGraphResult</c>.</returns>
/// <exception cref="Exception">Thrown when <c>biDict</c> or <c>coreDict</c> is null.</exception>
public ColumnFirstDynamicArray<ChainContent> TestSegment(string sSentence, double smoothPara, int nKind)
{
    if (biDict == null || coreDict == null)
    {
        throw new Exception("biDict 或 coreDict 尚未初始化!");
    }

    // --- Atom segmentation
    atomSegment = AtomSegment(sSentence);
    OnAtomSegment(atomSegment);

    // --- Look up the dictionary and store every possible word in the linked structure
    segGraph = GenerateWordNet(atomSegment, coreDict);

    // --- Generate all possible pairwise combinations
    biGraphResult = BiGraphGenerate(segGraph, smoothPara, biDict, coreDict);

    return biGraphResult;
}
/// <summary>
/// Scales the weight of every element of the graph in place by
/// <c>backNum</c> raised to the power of the element's column index.
/// </summary>
/// <param name="test">Graph whose element weights are rescaled.</param>
/// <returns>The same <paramref name="test"/> instance, after modification.</returns>
public static ColumnFirstDynamicArray<ChainContent> BackwardOptimize(ColumnFirstDynamicArray<ChainContent> test)
{
    // Walk the linked list of elements from the head to the end.
    for (ChainItem<ChainContent> pCur = test.GetHead(); pCur != null; pCur = pCur.next)
    {
        pCur.Content.eWeight *= Math.Pow(backNum, pCur.col);
    }

    return test;
}
/// <summary>
/// Segments a sentence: atom segmentation, word-net generation, bi-word graph
/// generation, backward optimisation, N-shortest-path calculation and word
/// generation, raising the corresponding event after each stage.
/// </summary>
/// <param name="sSentence">Sentence to segment.</param>
/// <param name="smoothPara">Smoothing parameter forwarded to <c>BiGraphGenerate</c>.</param>
/// <param name="nKind">Value passed to <c>NShortPath.Calculate</c> as the number of path kinds.</param>
/// <returns>The number of segmentation results stored in <c>m_pWordSeg</c>.</returns>
/// <exception cref="Exception">Thrown when <c>biDict</c> or <c>coreDict</c> is null.</exception>
public int BiSegment(string sSentence, double smoothPara, int nKind)
{
    bool dictionariesReady = biDict != null && coreDict != null;
    if (!dictionariesReady)
    {
        throw new Exception("biDict 或 coreDict 尚未初始化!");
    }

    // --- Atom segmentation
    atomSegment = AtomSegment(sSentence);
    OnAtomSegment(atomSegment);

    // --- Look up the dictionary and store every possible word in the linked structure
    segGraph = GenerateWordNet(atomSegment, coreDict);
    OnGenSegGraph(segGraph);

    // --- Generate all possible pairwise combinations
    biGraphResult = BiGraphGenerate(segGraph, smoothPara, biDict, coreDict);
    OnGenBiSegGraph(biGraphResult);

    // --- Backward matching optimisation
    biGraphResult = BackwardOptimize(biGraphResult);
    OnBackwardOptimize(biGraphResult);

    // --- N-shortest paths yield several candidate segmentations
    NShortPath.Calculate(biGraphResult, nKind);
    List<int[]> spResult = NShortPath.GetNPaths(Predefine.MAX_SEGMENT_NUM);
    OnNShortPath(spResult, segGraph);

    m_pWordSeg = new List<WordResult[]>();
    m_graphOptimum = new RowFirstDynamicArray<ChainContent>();

    foreach (int[] path in spResult)
    {
        WordLinkedArray linkedArray = BiPath2LinkedArray(path, segGraph, atomSegment);
        WordResult[] words = GenerateWord(path, linkedArray, m_graphOptimum);

        // Paths for which no word array was generated are skipped.
        if (words == null)
        {
            continue;
        }

        m_pWordSeg.Add(words);
    }

    OnBeforeOptimize(m_pWordSeg);

    return m_pWordSeg.Count;
}
//====================================================================
// Computes the possible paths at every node, preparing the data that
// the path extraction relies on.
//====================================================================
/// <summary>
/// Initialises the static state and then, for every node from 1 up to the last one,
/// fills that node's weight slots and parent queues from the edges queued for it.
/// </summary>
/// <param name="apCost">Cost graph to process.</param>
/// <param name="nValueKind">Number of weight slots kept per node.</param>
public static void Calculate(ColumnFirstDynamicArray<ChainContent> apCost, int nValueKind)
{
    InitNShortPath(apCost, nValueKind);

    CQueue queWork = new CQueue();

    for (int nCurNode = 1; nCurNode < m_nNode; nCurNode++)
    {
        int slot = nCurNode - 1;

        // Queue every possible edge into the current node, ordered by eWeight.
        EnQueueCurNodeEdges(ref queWork, nCurNode);

        // Reset every weight slot of the current node.
        for (int kind = 0; kind < m_nValueKind; kind++)
        {
            m_pWeight[slot][kind] = Predefine.INFINITE_VALUE;
        }

        // Move the content of queWork into m_pWeight and m_pParent.
        // Each slot takes the weight of the element at the front of the queue and
        // collects every following element with exactly that weight; the fill stops
        // as soon as the queue yields null.
        QueueElement element = queWork.DeQueue();
        for (int kind = 0; kind < m_nValueKind && element != null; kind++)
        {
            double slotWeight = element.eWeight;
            m_pWeight[slot][kind] = slotWeight;

            do
            {
                m_pParent[slot][kind].EnQueue(new QueueElement(element.nParent, element.nIndex, 0));
                element = queWork.DeQueue();
            }
            while (element != null && element.eWeight == slotWeight);
        }
    }
}
/// <summary>
/// Sends a segment event carrying the string form of the given bi-word graph.
/// </summary>
/// <param name="biOptGraph">Graph whose <c>ToString()</c> output is attached to the event.</param>
private void OnGenBiOptimumSegGraph(ColumnFirstDynamicArray<ChainContent> biOptGraph)
{
    // NOTE(review): the event is reported with SegmentStage.GenBiSegGraph — confirm
    // that no dedicated stage value is intended for the optimum graph.
    string graphText = biOptGraph.ToString();
    SegmentEventArgs args = new SegmentEventArgs(SegmentStage.GenBiSegGraph, graphText);
    SendEvents(args);
}
//====================================================================
// Generates the graph of word pairs (links between adjacent words)
//====================================================================
/// <summary>
/// Builds the bi-word graph: for every word in <paramref name="aWord"/> and every word
/// whose row equals that word's column, an element holding the smoothed negative-log
/// weight of the pair is stored at the position-map indices of the two words.
/// </summary>
/// <param name="aWord">Word graph to read the words from.</param>
/// <param name="smoothPara">Smoothing parameter "a" of the weight formula (the formula comment states 0 &lt; a &lt; 1).</param>
/// <param name="biDict">Dictionary queried for the frequency of a word pair.</param>
/// <param name="coreDict">Dictionary queried for the frequency of words whose nPOS is negative.</param>
/// <returns>The newly created bi-word graph.</returns>
public static ColumnFirstDynamicArray<ChainContent> BiGraphGenerate(
    RowFirstDynamicArray<ChainContent> aWord,
    double smoothPara,
    WordDictionary biDict,
    WordDictionary coreDict)
{
    ColumnFirstDynamicArray<ChainContent> aBiWordNet = new ColumnFirstDynamicArray<ChainContent>();
    StringBuilder pairBuilder = new StringBuilder();

    // Record the position map of possible words
    int[] positionMap = PreparePositionMap(aWord);

    for (ChainItem<ChainContent> pCur = aWord.GetHead(); pCur != null; pCur = pCur.next)
    {
        double dCurFreqency;
        if (pCur.Content.nPOS >= 0)
        {
            // It's not an unknown word
            dCurFreqency = pCur.Content.eWeight;
        }
        else
        {
            // Unknown word
            dCurFreqency = coreDict.GetFrequency(pCur.Content.sWord, 2);
        }

        // Get the next words, i.e. those whose row equals pCur.col
        // (note: a rather special correspondence).
        for (ChainItem<ChainContent> pNextWords = aWord.GetFirstElementOfRow(pCur.col);
             pNextWords != null && pNextWords.row == pCur.col;
             pNextWords = pNextWords.next)
        {
            // Reuse the builder for "<current><segmenter><next>".
            pairBuilder.Length = 0;
            pairBuilder.Append(pCur.Content.sWord)
                       .Append(Predefine.WORD_SEGMENTER)
                       .Append(pNextWords.Content.sWord);
            string sTwoWords = pairBuilder.ToString();

            // Frequency of the two linked words
            int nTwoWordsFreq = biDict.GetFrequency(sTwoWords, 3);

            // Smoothing
            double dTemp = 1.0 / Predefine.MAX_FREQUENCE;

            // -log{a*P(Ci-1)+(1-a)P(Ci|Ci-1)} Note 0<a<1
            double dValue = -Math.Log(
                smoothPara * (1.0 + dCurFreqency) / (Predefine.MAX_FREQUENCE + 80000.0)
                + (1.0 - smoothPara) * ((1.0 - dTemp) * nTwoWordsFreq / (1.0 + dCurFreqency) + dTemp));

            // Unknown words: P(Wi|Ci); while known words: 1
            // NOTE(review): the value added here is nPOS itself — confirm this is the
            // intended quantity for the P(Wi|Ci) term.
            if (pCur.Content.nPOS < 0)
            {
                dValue += pCur.Content.nPOS;
            }

            // Get the position index of both words in the position map table
            int nCurWordIndex = Utility.BinarySearch(
                pCur.row * Predefine.MAX_SENTENCE_LEN + pCur.col, positionMap);
            int nNextWordIndex = Utility.BinarySearch(
                pNextWords.row * Predefine.MAX_SENTENCE_LEN + pNextWords.col, positionMap);

            aBiWordNet.SetElement(
                nCurWordIndex,
                nNextWordIndex,
                new ChainContent(dValue, pCur.Content.nPOS, sTwoWords));
        }
    }

    return aBiWordNet;
}
/// <summary>
/// Sends a segment event carrying the string form of the backward-optimised graph.
/// </summary>
/// <param name="biGraph">Graph whose <c>ToString()</c> output is attached to the event.</param>
private void OnBackwardOptimize(ColumnFirstDynamicArray<ChainContent> biGraph)
{
    string graphText = biGraph.ToString();
    SegmentEventArgs args = new SegmentEventArgs(SegmentStage.BcakwardOptimize, graphText);
    SendEvents(args);
}