/// <summary>
/// Console demo: loads the core and bigram dictionaries, runs the first three
/// segmentation stages on a fixed sample sentence, and prints the resulting
/// bigram graph. Prints an error message and returns early if either
/// dictionary fails to load.
/// </summary>
public static void TestBiGraphGenerate()
{
    // Nothing below can run without both dictionaries.
    WordDictionary coreDict = new WordDictionary();
    bool coreLoaded = coreDict.Load(coreDictFile);
    if (!coreLoaded)
    {
        Console.WriteLine("coreDict 字典装入错误!");
        return;
    }

    WordDictionary biDict = new WordDictionary();
    bool bigramLoaded = biDict.Load(biDictFile);
    if (!bigramLoaded)
    {
        Console.WriteLine("字典装入错误!");
        return;
    }

    // Wrap the sample text in the sentence begin/end markers.
    string sample = @"他说的确实在理";
    string wrapped = Predefine.SENTENCE_BEGIN + sample + Predefine.SENTENCE_END;

    // Stage 1: atomic segmentation.
    List<AtomNode> atoms = Segment.AtomSegment(wrapped);

    // Stage 2: dictionary lookup; every candidate word goes into the word net.
    RowFirstDynamicArray<ChainContent> wordNet = Segment.GenerateWordNet(atoms, coreDict);

    // Stage 3: all possible pairs of adjacent candidate words.
    ColumnFirstDynamicArray<ChainContent> biGraph =
        Segment.BiGraphGenerate(wordNet, 0.1, biDict, coreDict);

    Console.WriteLine(biGraph.ToString());
}
/// <summary>
/// Runs the bigram segmentation pipeline on <paramref name="sSentence"/>:
/// atomic segmentation, word-net generation, bigram graph generation and
/// N-shortest-path search, then converts every returned path into a word
/// array stored in <c>m_pWordSeg</c>. A stage event is raised after each step.
/// </summary>
/// <param name="sSentence">Sentence to segment; passed unchanged to <c>AtomSegment</c>.</param>
/// <param name="smoothPara">Smoothing parameter forwarded to <c>BiGraphGenerate</c>.</param>
/// <param name="nKind">Value-kind count forwarded to <c>NShortPath.Calculate</c>.</param>
/// <returns>The number of word arrays collected in <c>m_pWordSeg</c>.</returns>
/// <exception cref="Exception">Thrown when either dictionary field is null.</exception>
public int BiSegment(string sSentence, double smoothPara, int nKind)
{
    if (biDict == null || coreDict == null)
    {
        throw new Exception("biDict 或 coreDict 尚未初始化!");
    }

    // Atomic segmentation.
    atomSegment = AtomSegment(sSentence);
    OnAtomSegment(atomSegment);

    // Dictionary lookup: all candidate words go into the linked word net.
    segGraph = GenerateWordNet(atomSegment, coreDict);
    OnGenSegGraph(segGraph);

    // All possible pairs of adjacent candidate words.
    biGraphResult = BiGraphGenerate(segGraph, smoothPara, biDict, coreDict);
    OnGenBiSegGraph(biGraphResult);

    // The N-shortest-path search yields several candidate segmentations.
    NShortPath.Calculate(biGraphResult, nKind);
    List<int[]> candidatePaths = NShortPath.GetNPaths(Predefine.MAX_SEGMENT_NUM);
    OnNShortPath(candidatePaths, segGraph);

    m_pWordSeg = new List<WordResult[]>();
    m_graphOptimum = new RowFirstDynamicArray<ChainContent>();

    foreach (int[] path in candidatePaths)
    {
        WordLinkedArray linkedWords = BiPath2LinkedArray(path, segGraph, atomSegment);
        WordResult[] words = GenerateWord(path, linkedWords, m_graphOptimum);

        // Paths for which GenerateWord returns null are skipped.
        if (words != null)
        {
            m_pWordSeg.Add(words);
        }
    }

    OnBeforeOptimize(m_pWordSeg);
    return m_pWordSeg.Count;
}
/// <summary>
/// Fills <c>m_pWeight</c> and <c>m_pParent</c> for every node except the first,
/// recording up to <c>m_nValueKind</c> distinct weights per node together with
/// the parent entries that produce each weight.
/// </summary>
/// <param name="apCost">Cost graph; handed to <c>InitNShortPath</c>.</param>
/// <param name="nValueKind">Number of weight ranks kept per node; handed to <c>InitNShortPath</c>.</param>
public static void Calculate(ColumnFirstDynamicArray<ChainContent> apCost, int nValueKind)
{
    InitNShortPath(apCost, nValueKind);

    CQueue queWork = new CQueue();

    for (int nCurNode = 1; nCurNode < m_nNode; nCurNode++)
    {
        int slot = nCurNode - 1;

        // Collect the candidate edges for this node into the work queue.
        // NOTE(review): the grouping below presumably relies on the queue
        // handing elements back in ascending eWeight order — confirm in
        // EnQueueCurNodeEdges / CQueue.
        EnQueueCurNodeEdges(ref queWork, nCurNode);

        // Every rank starts out as "unreachable".
        for (int rank = 0; rank < m_nValueKind; rank++)
        {
            m_pWeight[slot][rank] = Predefine.INFINITE_VALUE;
        }

        // Drain the work queue into m_pWeight / m_pParent. Processing for this
        // node ends when either all ranks are filled or the queue runs dry.
        QueueElement edge = queWork.DeQueue();
        for (int rank = 0; rank < m_nValueKind && edge != null; rank++)
        {
            double rankWeight = edge.eWeight;
            m_pWeight[slot][rank] = rankWeight;

            // Consecutive elements with exactly the same weight share a rank.
            do
            {
                m_pParent[slot][rank].EnQueue(new QueueElement(edge.nParent, edge.nIndex, 0));
                edge = queWork.DeQueue();
            }
            while (edge != null && edge.eWeight == rankWeight);
        }
    }
}
/// <summary>
/// Stores the cost graph and value-kind count, derives the node count, and
/// allocates the per-node weight arrays and parent queues.
/// </summary>
/// <param name="apCost">Cost graph to search; kept in <c>m_apCost</c>.</param>
/// <param name="nValueKind">Number of slots allocated per node; kept in <c>m_nValueKind</c>.</param>
private void InitNShortPath(ColumnFirstDynamicArray<ChainContent> apCost, int nValueKind)
{
    m_apCost = apCost;
    m_nValueKind = nValueKind;

    // Number of vertices.
    // Note by zhenyulu: the original program used
    //   m_nNode = Math.Max(apCost.ColumnCount, apCost.RowCount) + 1;
    // but apCost.ColumnCount should always be greater than apCost.RowCount,
    // so only ColumnCount is used here.
    m_nNode = apCost.ColumnCount + 1;

    // The first node is not tracked, hence one slot fewer than m_nNode.
    int trackedNodes = m_nNode - 1;
    m_pParent = new CQueue[trackedNodes][];
    m_pWeight = new double[trackedNodes][];

    for (int node = 0; node < trackedNodes; node++)
    {
        m_pParent[node] = new CQueue[nValueKind];
        m_pWeight[node] = new double[nValueKind];

        CQueue[] queues = m_pParent[node];
        for (int kind = 0; kind < nValueKind; kind++)
        {
            queues[kind] = new CQueue();
        }
    }
}
/// <summary>
/// Stores the cost graph and value-kind count in the static search state,
/// derives the node count, and allocates the per-node weight arrays and
/// parent queues.
/// </summary>
/// <param name="apCost">Cost graph to search; kept in <c>m_apCost</c>.</param>
/// <param name="nValueKind">Number of slots allocated per node; kept in <c>m_nValueKind</c>.</param>
private static void InitNShortPath(ColumnFirstDynamicArray<ChainContent> apCost, int nValueKind)
{
    m_apCost = apCost;
    m_nValueKind = nValueKind;

    // Node count. The legacy note (by zhenyulu) records that this used to be
    //   m_nNode = Math.Max(apCost.ColumnCount, apCost.RowCount) + 1;
    // and was reduced to the ColumnCount-only form below.
    m_nNode = apCost.ColumnCount + 1;

    // The first node is not tracked, hence one slot fewer than m_nNode.
    int trackedNodes = m_nNode - 1;
    m_pParent = new CQueue[trackedNodes][];
    m_pWeight = new double[trackedNodes][];

    int node = 0;
    while (node < trackedNodes)
    {
        m_pParent[node] = new CQueue[nValueKind];
        m_pWeight[node] = new double[nValueKind];

        for (int kind = 0; kind < nValueKind; kind++)
        {
            m_pParent[node][kind] = new CQueue();
        }

        node++;
    }
}
/// <summary>
/// Raises a <c>SegmentStage.GenBiSegGraph</c> event whose payload is the
/// string form of <paramref name="biGraph"/>.
/// </summary>
/// <param name="biGraph">Bigram graph to report; must not be null because <c>ToString</c> is called on it.</param>
private void OnGenBiSegGraph(ColumnFirstDynamicArray<ChainContent> biGraph)
{
    string graphText = biGraph.ToString();
    SegmentEventArgs stageArgs = new SegmentEventArgs(SegmentStage.GenBiSegGraph, graphText);
    SendEvents(stageArgs);
}
/// <summary>
/// Runs the bigram segmentation pipeline on <paramref name="sSentence"/>:
/// atomic segmentation, word-net generation, bigram graph generation and
/// N-shortest-path search, then converts every returned path into a word
/// array stored in <c>m_pWordSeg</c>. A stage event is raised after each step.
/// </summary>
/// <param name="sSentence">Sentence to segment; passed unchanged to <c>AtomSegment</c>.</param>
/// <param name="smoothPara">Smoothing parameter forwarded to <c>BiGraphGenerate</c>.</param>
/// <param name="nKind">Value-kind count forwarded to <c>NShortPath.Calculate</c>.</param>
/// <returns>The number of word arrays collected in <c>m_pWordSeg</c>.</returns>
/// <exception cref="Exception">Thrown when either dictionary field is null.</exception>
public int BiSegment(string sSentence, double smoothPara, int nKind)
{
    // The message literal had been corrupted by an encoding round-trip (it
    // contained U+FFFD replacement characters); the readable text is restored
    // so the error is meaningful when it surfaces.
    if (biDict == null || coreDict == null)
    {
        throw new Exception("biDict 或 coreDict 尚未初始化!");
    }

    // Atomic segmentation.
    atomSegment = AtomSegment(sSentence);
    OnAtomSegment(atomSegment);

    // Dictionary lookup: all candidate words go into the linked word net.
    segGraph = GenerateWordNet(atomSegment, coreDict);
    OnGenSegGraph(segGraph);

    // All possible pairs of adjacent candidate words.
    biGraphResult = BiGraphGenerate(segGraph, smoothPara, biDict, coreDict);
    OnGenBiSegGraph(biGraphResult);

    // The N-shortest-path search yields several candidate segmentations.
    NShortPath.Calculate(biGraphResult, nKind);
    List<int[]> spResult = NShortPath.GetNPaths(Predefine.MAX_SEGMENT_NUM);
    OnNShortPath(spResult, segGraph);

    m_pWordSeg = new List<WordResult[]>();
    m_graphOptimum = new RowFirstDynamicArray<ChainContent>();

    for (int i = 0; i < spResult.Count; i++)
    {
        WordLinkedArray linkedArray = BiPath2LinkedArray(spResult[i], segGraph, atomSegment);
        WordResult[] tmpResult = GenerateWord(spResult[i], linkedArray, m_graphOptimum);

        // Paths for which GenerateWord returns null are skipped.
        if (tmpResult != null)
        {
            m_pWordSeg.Add(tmpResult);
        }
    }

    OnBeforeOptimize(m_pWordSeg);
    return m_pWordSeg.Count;
}
/// <summary>
/// Builds the bigram graph: for every word in <paramref name="aWord"/> and every
/// word that starts where it ends, stores one element holding the transition
/// cost, the first word's POS value and the joined word pair.
/// </summary>
/// <param name="aWord">Word net whose elements are walked via <c>GetHead</c> / <c>next</c>.</param>
/// <param name="smoothPara">Smoothing weight "a" in the cost formula (the original note says 0 &lt; a &lt; 1).</param>
/// <param name="biDict">Dictionary queried for the frequency of the joined word pair.</param>
/// <param name="coreDict">Dictionary queried for the frequency of words whose POS value is negative.</param>
/// <returns>The newly built bigram graph.</returns>
public static ColumnFirstDynamicArray<ChainContent> BiGraphGenerate(
    RowFirstDynamicArray<ChainContent> aWord, double smoothPara,
    WordDictionary biDict, WordDictionary coreDict)
{
    ColumnFirstDynamicArray<ChainContent> biWordNet = new ColumnFirstDynamicArray<ChainContent>();
    StringBuilder pairBuilder = new StringBuilder();

    // Position map of all candidate words; used below to turn (row, col) into an index.
    int[] positionMap = PreparePositionMap(aWord);

    // Smoothing floor used inside the cost formula.
    double dTemp = 1.0 / Predefine.MAX_FREQUENCE;

    for (ChainItem<ChainContent> cur = aWord.GetHead(); cur != null; cur = cur.next)
    {
        // Words with a non-negative POS carry their frequency in eWeight;
        // the others ("unknown words") are looked up in the core dictionary.
        double curFreq = cur.Content.nPOS >= 0
            ? cur.Content.eWeight
            : coreDict.GetFrequency(cur.Content.sWord, 2);

        // Successors are the words whose row equals the current word's column.
        for (ChainItem<ChainContent> succ = aWord.GetFirstElementOfRow(cur.col);
             succ != null && succ.row == cur.col;
             succ = succ.next)
        {
            pairBuilder.Length = 0;
            pairBuilder.Append(cur.Content.sWord);
            pairBuilder.Append(Predefine.WORD_SEGMENTER);
            pairBuilder.Append(succ.Content.sWord);
            string pairKey = pairBuilder.ToString();

            // Frequency of the two linked words.
            int pairFreq = biDict.GetFrequency(pairKey, 3);

            // -log{a*P(Ci-1)+(1-a)P(Ci|Ci-1)}, 0<a<1. Kept as one expression so
            // the floating-point evaluation matches the established results.
            double cost = -Math.Log(smoothPara * (1.0 + curFreq) / (Predefine.MAX_FREQUENCE + 80000.0) + (1.0 - smoothPara) * ((1.0 - dTemp) * pairFreq / (1.0 + curFreq) + dTemp));

            // Unknown words: P(Wi|Ci); known words: 1.
            if (cur.Content.nPOS < 0)
            {
                cost += cur.Content.nPOS;
            }

            // Indices of both words in the position map.
            int curIndex = Utility.BinarySearch(
                cur.row * Predefine.MAX_SENTENCE_LEN + cur.col, positionMap);
            int succIndex = Utility.BinarySearch(
                succ.row * Predefine.MAX_SENTENCE_LEN + succ.col, positionMap);

            biWordNet.SetElement(curIndex, succIndex,
                new ChainContent(cost, cur.Content.nPOS, pairKey));
        }
    }

    return biWordNet;
}
/// <summary>
/// Computes the possible paths at every node and prepares the data used for
/// path extraction: fills <c>m_pWeight</c> and <c>m_pParent</c> for every node
/// except the first, keeping up to <c>m_nValueKind</c> distinct weights per node.
/// </summary>
/// <param name="apCost">Cost graph; handed to <c>InitNShortPath</c>.</param>
/// <param name="nValueKind">Number of weight ranks kept per node; handed to <c>InitNShortPath</c>.</param>
public void Calculate(ColumnFirstDynamicArray<ChainContent> apCost, int nValueKind)
{
    InitNShortPath(apCost, nValueKind);

    CQueue queWork = new CQueue();

    for (int nCurNode = 1; nCurNode < m_nNode; nCurNode++)
    {
        int slot = nCurNode - 1;

        // Sort every possible edge into the current node by eWeight and push
        // it onto the work queue.
        EnQueueCurNodeEdges(ref queWork, nCurNode);

        // Initialise all weight ranks of the current node.
        for (int rank = 0; rank < m_nValueKind; rank++)
        {
            m_pWeight[slot][rank] = Predefine.INFINITE_VALUE;
        }

        // Move the queue contents into m_pWeight and m_pParent. Processing for
        // this node ends when either all ranks are filled or the queue runs dry.
        QueueElement edge = queWork.DeQueue();
        for (int rank = 0; rank < m_nValueKind && edge != null; rank++)
        {
            double rankWeight = edge.eWeight;
            m_pWeight[slot][rank] = rankWeight;

            // Consecutive elements with exactly the same weight share a rank.
            do
            {
                m_pParent[slot][rank].EnQueue(new QueueElement(edge.nParent, edge.nIndex, 0));
                edge = queWork.DeQueue();
            }
            while (edge != null && edge.eWeight == rankWeight);
        }
    }
}
/// <summary>
/// Raises a <c>SegmentStage.BcakwardOptimize</c> event whose payload is the
/// string form of <paramref name="biGraph"/>.
/// </summary>
/// <param name="biGraph">Bigram graph to report; must not be null because <c>ToString</c> is called on it.</param>
private void OnBackwardOptimize(ColumnFirstDynamicArray<ChainContent> biGraph)
{
    string graphText = biGraph.ToString();
    SegmentEventArgs stageArgs = new SegmentEventArgs(SegmentStage.BcakwardOptimize, graphText);
    SendEvents(stageArgs);
}
/// <summary>
/// Runs only the first three segmentation stages (atomic segmentation,
/// word-net generation, bigram graph generation) and returns the bigram graph.
/// Only the atom-segmentation event is raised; the word-net and bigram-graph
/// events are not raised here.
/// </summary>
/// <param name="sSentence">Sentence to segment; passed unchanged to <c>AtomSegment</c>.</param>
/// <param name="smoothPara">Smoothing parameter forwarded to <c>BiGraphGenerate</c>.</param>
/// <param name="nKind">Not used by this method; kept so existing callers keep compiling.</param>
/// <returns>The bigram graph, which is also stored in <c>biGraphResult</c>.</returns>
/// <exception cref="Exception">Thrown when either dictionary field is null.</exception>
public ColumnFirstDynamicArray<ChainContent> TestSegment(string sSentence, double smoothPara, int nKind)
{
    // The message literal had been corrupted by an encoding round-trip (it
    // contained U+FFFD replacement characters); the readable text is restored.
    if (biDict == null || coreDict == null)
    {
        throw new Exception("biDict 或 coreDict 尚未初始化!");
    }

    // Atomic segmentation.
    atomSegment = AtomSegment(sSentence);
    OnAtomSegment(atomSegment);

    // Dictionary lookup: all candidate words go into the linked word net.
    segGraph = GenerateWordNet(atomSegment, coreDict);

    // All possible pairs of adjacent candidate words.
    biGraphResult = BiGraphGenerate(segGraph, smoothPara, biDict, coreDict);

    // The graph is returned as generated; no BackwardOptimize pass is applied.
    return biGraphResult;
}
/// <summary>
/// Scales the weight of every element of <paramref name="test"/> in place by
/// <c>backNum</c> raised to the element's column index.
/// </summary>
/// <param name="test">Graph whose element weights are modified; must not be null.</param>
/// <returns>The same <paramref name="test"/> instance, after modification.</returns>
public static ColumnFirstDynamicArray<ChainContent> BackwardOptimize(ColumnFirstDynamicArray<ChainContent> test)
{
    // An unused StringBuilder allocation was dropped from this method.
    ChainItem<ChainContent> pCur = test.GetHead();
    while (pCur != null)
    {
        // The multiplier depends only on the column, so later columns are
        // scaled by higher powers of backNum.
        double multiNum = Math.Pow(backNum, pCur.col);
        pCur.Content.eWeight *= multiNum;
        pCur = pCur.next;
    }

    return test;
}
/// <summary>
/// Console demo (edited by SharpKey): initialises the dictionaries from the
/// "Data" directory under the current directory, builds the bigram graph for a
/// fixed sentence, runs the N-shortest-path calculation and prints the paths
/// per index, the best path, and at most five paths.
/// </summary>
public static void TestNShortPath()
{
    int n = 2;

    string dictPath = Path.Combine(Environment.CurrentDirectory, "Data") + Path.DirectorySeparatorChar;
    Console.WriteLine("正在初始化字典库,请稍候...");

    WordSegment wordSegment = new WordSegment();
    wordSegment.InitWordSegment(dictPath);

    // The Segment instance shares the dictionaries loaded above.
    Segment segmenter = new Segment(wordSegment.m_dictBigram, wordSegment.m_dictCore);
    ColumnFirstDynamicArray<ChainContent> apCost =
        segmenter.TestSegment("始##始这个人的确实在末##末", 0.1, 2);
    Console.WriteLine(apCost.ToString());

    NShortPath.Calculate(apCost, n);
    NShortPath.printResultByIndex();

    // ---- All paths, grouped by index ----
    Console.WriteLine("\r\n\r\n所有路径:");
    for (int index = 0; index < n; index++)
    {
        List<int[]> pathsAtIndex = NShortPath.GetPaths(index);
        foreach (int[] path in pathsAtIndex)
        {
            foreach (int node in path)
            {
                Console.Write("{0}, ", node);
            }
            Console.WriteLine();
        }
        Console.WriteLine("========================");
    }

    // ---- Best path ----
    Console.WriteLine("\r\n最佳路径:");
    int[] bestPath = NShortPath.GetBestPath();
    foreach (int node in bestPath)
    {
        Console.Write("{0}, ", node);
    }
    Console.WriteLine();

    // ---- At most five paths ----
    Console.WriteLine("\r\n最多 {0} 条路径:", 5);
    List<int[]> topPaths = NShortPath.GetNPaths(5);
    foreach (int[] path in topPaths)
    {
        foreach (int node in path)
        {
            Console.Write("{0}, ", node);
        }
        Console.WriteLine();
    }
}
/// <summary>
/// Console demo on a small hand-built graph: fills a cost array with ten
/// weighted edges, runs the N-shortest-path calculation and prints the paths
/// per index, the best path, and at most five paths.
/// </summary>
public static void TestNShortPath()
{
    int n = 2;

    // Each entry is { row, column, weight }. Entries are inserted in exactly
    // this order.
    int[][] edges = new int[][]
    {
        new int[] { 0, 1, 1 },
        new int[] { 1, 2, 1 },
        new int[] { 1, 3, 2 },
        new int[] { 2, 3, 1 },
        new int[] { 2, 4, 1 },
        new int[] { 3, 4, 1 },
        new int[] { 4, 5, 1 },
        new int[] { 3, 6, 2 },
        new int[] { 4, 6, 3 },
        new int[] { 5, 6, 1 }
    };

    ColumnFirstDynamicArray<ChainContent> apCost = new ColumnFirstDynamicArray<ChainContent>();
    foreach (int[] edge in edges)
    {
        apCost.SetElement(edge[0], edge[1], new ChainContent(edge[2]));
    }
    Console.WriteLine(apCost.ToString());

    NShortPath.Calculate(apCost, n);
    NShortPath.printResultByIndex();

    // ---- All paths, grouped by index ----
    Console.WriteLine("\r\n\r\n所有路径:");
    for (int index = 0; index < n; index++)
    {
        List<int[]> pathsAtIndex = NShortPath.GetPaths(index);
        foreach (int[] path in pathsAtIndex)
        {
            foreach (int node in path)
            {
                Console.Write("{0}, ", node);
            }
            Console.WriteLine();
        }
        Console.WriteLine("========================");
    }

    // ---- Best path ----
    Console.WriteLine("\r\n最佳路径:");
    int[] bestPath = NShortPath.GetBestPath();
    foreach (int node in bestPath)
    {
        Console.Write("{0}, ", node);
    }
    Console.WriteLine();

    // ---- At most five paths ----
    Console.WriteLine("\r\n最多 {0} 条路径:", 5);
    List<int[]> topPaths = NShortPath.GetNPaths(5);
    foreach (int[] path in topPaths)
    {
        foreach (int node in path)
        {
            Console.Write("{0}, ", node);
        }
        Console.WriteLine();
    }
}