/// <summary>
/// Backward longest-match segmentation over a word graph.
/// Starting from the last column, the lowest-numbered row that holds an
/// element in that column is taken as the next word, and the scan then
/// continues from the column equal to that row.
/// </summary>
/// <param name="m_segGraph">Word graph to walk.</param>
/// <returns>
/// The selected words joined with "/", in sentence order. If some column has
/// no element in any row, the words collected so far are returned.
/// </returns>
public string BackSplitting(RowFirstDynamicArray<ChainContent> m_segGraph)
{
    string result = "";
    int lastRow = m_segGraph.RowCount - 1;
    int col = m_segGraph.ColumnCount - 1;

    while (col > 1)
    {
        // Find the first row that has a word ending at this column.
        ChainItem<ChainContent> match = null;
        int matchRow = 0;
        for (int row = 0; row <= lastRow; row++)
        {
            match = m_segGraph.GetElement(row, col);
            if (match != null)
            {
                matchRow = row;
                break;
            }
        }

        if (match == null)
        {
            // Nothing ends at this column. The column index would never
            // change again, so stop instead of looping forever.
            break;
        }

        string word = match.Content.sWord;
        if (result == "")
        {
            result = word;
        }
        else
        {
            result = word + "/" + result;
        }

        col = matchRow;
    }

    return result;
}
/// <summary>
/// Manual check of the bigram graph stage: loads both dictionaries, segments
/// a fixed sample sentence into atoms, builds the word net and prints the
/// resulting bigram graph to the console.
/// </summary>
public static void TestBiGraphGenerate()
{
    WordDictionary coreDict = new WordDictionary();
    bool coreLoaded = coreDict.Load(coreDictFile);
    if (!coreLoaded)
    {
        Console.WriteLine("coreDict 字典装入错误!");
        return;
    }

    WordDictionary biDict = new WordDictionary();
    bool biLoaded = biDict.Load(biDictFile);
    if (!biLoaded)
    {
        Console.WriteLine("字典装入错误!");
        return;
    }

    // Wrap the sample sentence in the begin/end markers.
    string wrapped = Predefine.SENTENCE_BEGIN + @"他说的确实在理" + Predefine.SENTENCE_END;

    // Step 1: atom segmentation.
    List<AtomNode> atoms = Segment.AtomSegment(wrapped);

    // Step 2: look up the dictionary and store every candidate word in the graph.
    RowFirstDynamicArray<ChainContent> wordNet = Segment.GenerateWordNet(atoms, coreDict);

    // Step 3: build all adjacent word pairs.
    ColumnFirstDynamicArray<ChainContent> biGraph = Segment.BiGraphGenerate(wordNet, 0.1, biDict, coreDict);

    Console.WriteLine(biGraph.ToString());
}
/// <summary>
/// Builds the word graph holding every candidate segmentation of a sentence.
/// </summary>
/// <param name="sSentence">Sentence text without begin/end markers.</param>
/// <returns>
/// The word net for the sentence, or null when the core dictionary cannot be loaded.
/// </returns>
public RowFirstDynamicArray<ChainContent> GetSegGraph(string sSentence)
{
    WordDictionary coreDict = new WordDictionary();
    bool loaded = coreDict.Load(coreDictFile);
    if (!loaded)
    {
        Console.WriteLine("字典装入错误!");
        return null;
    }

    // Wrap the sentence in the begin/end markers before atom segmentation.
    string wrapped = Predefine.SENTENCE_BEGIN + sSentence + Predefine.SENTENCE_END;
    List<AtomNode> atoms = Segment.AtomSegment(wrapped);

    return Segment.GenerateWordNet(atoms, coreDict);
}
/// <summary>
/// Manual check of the word-net stage: loads the core dictionary, segments a
/// fixed sample sentence into atoms and prints the generated word net.
/// </summary>
public static void TestGenerateWordNet()
{
    WordDictionary coreDict = new WordDictionary();
    bool loaded = coreDict.Load(coreDictFile);
    if (!loaded)
    {
        Console.WriteLine("字典装入错误!");
        return;
    }

    // Wrap the sample sentence in the begin/end markers.
    string wrapped = Predefine.SENTENCE_BEGIN + @"人民币现在很值钱" + Predefine.SENTENCE_END;

    List<AtomNode> atoms = Segment.AtomSegment(wrapped);
    RowFirstDynamicArray<ChainContent> wordNet = Segment.GenerateWordNet(atoms, coreDict);

    Console.WriteLine(wordNet.ToString());
}
/// <summary>
/// Runs the bigram segmentation pipeline on a sentence: atom segmentation,
/// word-net generation, bigram graph generation and N-shortest-path search.
/// Each path is converted into a word array; the results are stored in
/// m_pWordSeg and the words are recorded in a fresh m_graphOptimum.
/// </summary>
/// <param name="sSentence">Sentence passed to AtomSegment.</param>
/// <param name="smoothPara">Smoothing parameter passed to BiGraphGenerate.</param>
/// <param name="nKind">Value passed to NShortPath.Calculate.</param>
/// <returns>Number of segmentation results stored in m_pWordSeg.</returns>
/// <exception cref="InvalidOperationException">
/// biDict or coreDict has not been initialized.
/// </exception>
public int BiSegment(string sSentence, double smoothPara, int nKind)
{
    if (biDict == null || coreDict == null)
    {
        // A specific exception type instead of the bare System.Exception;
        // callers catching Exception are unaffected.
        throw new InvalidOperationException("biDict 或 coreDict 尚未初始化!");
    }

    // Step 1: atom segmentation.
    atomSegment = AtomSegment(sSentence);
    OnAtomSegment(atomSegment);

    // Step 2: look up the dictionary and store every candidate word in the graph.
    segGraph = GenerateWordNet(atomSegment, coreDict);
    OnGenSegGraph(segGraph);

    // Step 3: build all adjacent word pairs.
    biGraphResult = BiGraphGenerate(segGraph, smoothPara, biDict, coreDict);
    OnGenBiSegGraph(biGraphResult);

    // Step 4: N-shortest-path search yields several candidate segmentations.
    NShortPath.Calculate(biGraphResult, nKind);
    List<int[]> spResult = NShortPath.GetNPaths(Predefine.MAX_SEGMENT_NUM);
    OnNShortPath(spResult, segGraph);

    m_pWordSeg = new List<WordResult[]>();
    m_graphOptimum = new RowFirstDynamicArray<ChainContent>();

    for (int i = 0; i < spResult.Count; i++)
    {
        WordLinkedArray linkedArray = BiPath2LinkedArray(spResult[i], segGraph, atomSegment);
        WordResult[] tmpResult = GenerateWord(spResult[i], linkedArray, m_graphOptimum);

        if (tmpResult != null)
        {
            m_pWordSeg.Add(tmpResult);
        }
    }

    OnBeforeOptimize(m_pWordSeg);

    return m_pWordSeg.Count;
}
/// <summary>
/// Forward longest-match segmentation over a word graph.
/// The walk starts at the item that follows element (0, 1). Within a row it
/// advances to the last item of that row, emits that word, then jumps to the
/// first element of the row whose index equals the emitted item's column.
/// The walk ends at an item that has no successor; that item is not emitted.
/// </summary>
/// <param name="m_segGraph">Word graph to walk.</param>
/// <returns>
/// The emitted words joined with "/". Returns the words collected so far
/// (possibly empty) when the graph lacks an element the walk needs.
/// </returns>
public string ForwardSplitting(RowFirstDynamicArray<ChainContent> m_segGraph)
{
    string result = "";

    ChainItem<ChainContent> head = m_segGraph.GetElement(0, 1);
    if (head == null)
    {
        // No (0, 1) element to start from.
        return result;
    }

    ChainItem<ChainContent> current = head.next;
    bool isFirstWord = true;

    // The null check on "current" covers both a missing successor of the head
    // and a row lookup that finds nothing.
    while (current != null && current.next != null)
    {
        if (current.next.row == current.row)
        {
            // A later item exists in the same row: keep moving along the row.
            current = current.next;
            continue;
        }

        // Last item of this row: emit it; only later words get a "/" prefix.
        if (isFirstWord)
        {
            result += current.Content.sWord;
            isFirstWord = false;
        }
        else
        {
            result += "/" + current.Content.sWord;
        }

        current = m_segGraph.GetFirstElementOfRow(current.col);
    }

    return result;
}
/// <summary>
/// Forward longest-match segmentation over a word graph.
/// Beginning with the item after element (0, 1), the method follows each row
/// to its last item, appends that item's word to the output, and continues
/// from the first element of the row numbered by that item's column. An item
/// without a successor ends the walk and is not appended.
/// </summary>
/// <param name="m_segGraph">Word graph to walk.</param>
/// <returns>
/// Words joined with "/". When a required element is missing from the graph
/// the words gathered up to that point are returned instead of failing.
/// </returns>
public string ForwardSplitting(RowFirstDynamicArray<ChainContent> m_segGraph)
{
    string output = "";
    bool needSeparator = false;

    ChainItem<ChainContent> start = m_segGraph.GetElement(0, 1);
    ChainItem<ChainContent> node = (start == null) ? null : start.next;

    // "node" may be null if the start element is absent, has no successor,
    // or a row lookup below returns nothing.
    while (node != null && node.next != null)
    {
        if (node.next.row != node.row)
        {
            // End of the current row: this is the word to emit.
            if (needSeparator)
            {
                output += "/" + node.Content.sWord;
            }
            else
            {
                output += node.Content.sWord;
                needSeparator = true;
            }

            node = m_segGraph.GetFirstElementOfRow(node.col);
        }
        else
        {
            node = node.next;
        }
    }

    return output;
}
/// <summary>
/// Reads the GB2312 text file at the hard-coded path character by character.
/// Text is accumulated until a character from the punctuation list is read;
/// the accumulated text is then segmented with GetSegGraph/BackSplitting and
/// written through WriteToTxt, followed by the punctuation character itself.
/// </summary>
/// <returns>
/// The remaining buffer content: the text after the last punctuation
/// character, plus a "The process failed: ..." line if an exception occurred.
/// </returns>
public static string Foo()
{
    test abc = new test();
    string biaodian = @"—()、,!“”《》『』";
    string path = @"D:\txt2.txt";
    StringBuilder outBuffer = new StringBuilder();

    try
    {
        Encoding fileEncoding = Encoding.GetEncoding("GB2312");
        using (StreamReader sr = new StreamReader(path, fileEncoding))
        {
            // Read() returns -1 at end of input, so no separate Peek() is needed.
            int next;
            while ((next = sr.Read()) >= 0)
            {
                char dfg = (char)next;

                if (biaodian.IndexOf(dfg) < 0)
                {
                    // Ordinary character: keep accumulating.
                    outBuffer.Append(dfg);
                    continue;
                }

                // Punctuation: segment the accumulated text, then write the mark.
                RowFirstDynamicArray<ChainContent> de = abc.GetSegGraph(outBuffer.ToString());
                WriteToTxt(abc.BackSplitting(de));
                WriteToTxt(Convert.ToString(dfg));
                outBuffer.Length = 0;
            }
        }
    }
    catch (Exception ex)
    {
        // Best effort: report the failure through the return value.
        outBuffer.AppendFormat("The process failed: {0}", ex.ToString()).AppendLine();
    }

    // Explicit return so every path of this string-returning method yields a value.
    return outBuffer.ToString();
}
/// <summary>
/// Backward longest-match segmentation over a word graph.
/// The scan begins at the last column. For the current column the rows are
/// tried from 0 upward; the first row holding an element supplies the next
/// word, and that row index becomes the next column to examine.
/// </summary>
/// <param name="m_segGraph">Word graph to walk.</param>
/// <returns>
/// Selected words in sentence order, separated by "/". The scan stops early,
/// returning what has been collected, if a column has no element in any row.
/// </returns>
public string BackSplitting(RowFirstDynamicArray<ChainContent> m_segGraph)
{
    string abc = "";
    int nCol = m_segGraph.ColumnCount - 1;
    int nRow = m_segGraph.RowCount - 1;

    while (nCol > 1)
    {
        bool found = false;

        for (int i = 0; i <= nRow; i++)
        {
            // Fetch once and reuse instead of querying the graph repeatedly.
            ChainItem<ChainContent> item = m_segGraph.GetElement(i, nCol);
            if (item == null)
            {
                continue;
            }

            // Words are discovered back to front, so each one is prepended.
            abc = (abc == "") ? item.Content.sWord : item.Content.sWord + "/" + abc;
            nCol = i;
            found = true;
            break;
        }

        if (!found)
        {
            // Without a match nCol cannot change, and the loop would never end.
            break;
        }
    }

    return abc;
}
/// <summary>
/// Unknown word recognition. Tags the segmentation result with recognition
/// roles via m_roleTag, maps each reported unknown word onto an atom range,
/// and writes it into the optimized graph when its possibility value is
/// lower than the weight already stored for that range.
/// </summary>
/// <param name="pWordSegResult">Word segmentation result to tag.</param>
/// <param name="graphOptimum">Optimized segmentation graph, updated in place.</param>
/// <param name="atomSegment">Atoms of the sentence, used to convert character offsets into atom indices.</param>
/// <param name="dictCore">Core dictionary passed to the role tagger.</param>
/// <returns>Always true.</returns>
public bool Recognition(WordResult[] pWordSegResult, RowFirstDynamicArray<ChainContent> graphOptimum, List<AtomNode> atomSegment, WordDictionary dictCore)
{
    int nStartPos = 0;   // characters consumed so far
    int j = 0;           // index of the next atom to consume

    // Tag the segmentation with unknown-recognition roles according to the
    // core dictionary and the unknown-recognition dictionary.
    m_roleTag.POSTagging(pWordSegResult, dictCore, m_dict);

    for (int i = 0; i < m_roleTag.m_nUnknownWordsCount; i++)
    {
        // Advance to the atom at which the unknown word starts.
        while (j < atomSegment.Count && nStartPos < m_roleTag.m_nUnknownWords[i, 0])
        {
            nStartPos += atomSegment[j++].sWord.Length;
        }
        int nAtomStart = j;

        // Advance to the atom just past the end of the unknown word.
        while (j < atomSegment.Count && nStartPos < m_roleTag.m_nUnknownWords[i, 1])
        {
            nStartPos += atomSegment[j++].sWord.Length;
        }
        int nAtomEnd = j;

        if (nAtomStart >= nAtomEnd)
        {
            continue;
        }

        // Weight currently stored for this range; a missing element counts as infinite.
        double dValue;
        ChainItem<ChainContent> item = graphOptimum.GetElement(nAtomStart, nAtomEnd);
        if (item != null)
        {
            dValue = item.Content.eWeight;
        }
        else
        {
            dValue = Predefine.INFINITE_VALUE;
        }

        // Replace the element only when the recognized word has the smaller value.
        if (dValue > m_roleTag.m_dWordsPossibility[i])
        {
            graphOptimum.SetElement(nAtomStart, nAtomEnd, new ChainContent(m_roleTag.m_dWordsPossibility[i], m_nPOS, m_sUnknownFlags));
        }
    }

    return true;
}
/// <summary>
/// Generates the segmentation word net for a sentence that has already been
/// split into atoms. Every atom is stored as an element (i, i + 1); every
/// run of consecutive atoms that the core dictionary matches exactly is
/// stored as an element (i, j).
/// </summary>
/// <param name="atomSegment">Atoms of the sentence.</param>
/// <param name="coreDict">Core dictionary used for word lookup.</param>
/// <returns>The populated word net.</returns>
public static RowFirstDynamicArray<ChainContent> GenerateWordNet(List<AtomNode> atomSegment, WordDictionary coreDict)
{
    string sWord = "", sMaxMatchWord;
    int nPOSRet, nPOS, nTotalFreq;
    double dValue = 0;

    RowFirstDynamicArray<ChainContent> m_segGraph = new RowFirstDynamicArray<ChainContent>();
    m_segGraph.SetEmpty();

    // Pass 1: store the atoms themselves in the graph.
    for (int i = 0; i < atomSegment.Count; i++)
    {
        if (atomSegment[i].nPOS == Predefine.CT_CHINESE)
        {
            m_segGraph.SetElement(i, i + 1, new ChainContent(0, 0, atomSegment[i].sWord));
        }
        else
        {
            sWord = atomSegment[i].sWord;
            dValue = Predefine.MAX_FREQUENCE;

            switch (atomSegment[i].nPOS)
            {
                case Predefine.CT_INDEX:
                case Predefine.CT_NUM:
                    nPOS = -27904;          // -('m' * 256)
                    sWord = "未##数";       // number placeholder
                    dValue = 0;
                    break;

                case Predefine.CT_DELIMITER:
                    nPOS = 30464;           // 'w' * 256
                    break;

                case Predefine.CT_LETTER:
                    nPOS = -28280;          // -('n' * 256 + 'x')
                    dValue = 0;
                    sWord = "未##串";       // string placeholder
                    break;

                case Predefine.CT_SINGLE:   // e.g. 12021-2129-3121
                    // An optionally signed integer or decimal is a number;
                    // anything else is treated as a string.
                    if (Regex.IsMatch(atomSegment[i].sWord, @"^(-?\d+)(\.\d+)?$"))
                    {
                        nPOS = -27904;      // -('m' * 256)
                        sWord = "未##数";
                    }
                    else
                    {
                        nPOS = -28280;      // -('n' * 256 + 'x')
                        sWord = "未##串";
                    }
                    dValue = 0;
                    break;

                default:
                    nPOS = atomSegment[i].nPOS;
                    break;
            }

            m_segGraph.SetElement(i, i + 1, new ChainContent(dValue, nPOS, sWord));
        }
    }

    // Pass 2: store every multi-atom word found in the dictionary.
    for (int i = 0; i < atomSegment.Count; i++)
    {
        sWord = atomSegment[i].sWord;   // start from the current atom
        int j = i + 1;

        while (j < atomSegment.Count && coreDict.GetMaxMatch(sWord, out sMaxMatchWord, out nPOSRet))
        {
            if (sMaxMatchWord == sWord)   // exact dictionary hit
            {
                WordInfo info = coreDict.GetWordInfo(sWord);

                // A word may carry several parts of speech; sum their frequencies.
                nTotalFreq = 0;
                for (int k = 0; k < info.Count; k++)
                {
                    nTotalFreq += info.Frequencies[k];
                }

                // Restrict some special words: a two-character word starting
                // with "年" or "月" that follows a number (e.g. "1年内",
                // "1999年末") is not stored, and extension from this start
                // position stops.
                if (sWord.Length == 2 && (sWord.StartsWith("年") || sWord.StartsWith("月")) && i >= 1 &&
                    (Utility.IsAllNum(atomSegment[i - 1].sWord) || Utility.IsAllChineseNum(atomSegment[i - 1].sWord)))
                {
                    if ("末内中底前间初".IndexOf(sWord.Substring(1)) >= 0)
                    {
                        break;
                    }
                }

                // Keep the part of speech only when it is unambiguous; otherwise record 0.
                if (info.Count == 1)
                {
                    m_segGraph.SetElement(i, j, new ChainContent(nTotalFreq, info.POSs[0], sWord));
                }
                else
                {
                    m_segGraph.SetElement(i, j, new ChainContent(nTotalFreq, 0, sWord));
                }
            }

            // Extend the candidate by one more atom.
            sWord += atomSegment[j++].sWord;
        }
    }

    return m_segGraph;
}
/// <summary>
/// Raises the OptimumSegment stage event, carrying the text form of the graph.
/// </summary>
/// <param name="m_graphOptimum">Graph whose ToString() output is sent with the event.</param>
private void OnOptimumSegment(RowFirstDynamicArray<ChainContent> m_graphOptimum)
{
    string graphText = m_graphOptimum.ToString();
    SegmentEventArgs args = new SegmentEventArgs(SegmentStage.OptimumSegment, graphText);
    SendEvents(args);
}
/// <summary>
/// Raises the NShortPath stage event. Each path is rendered as one line of
/// comma-separated words; a placeholder word is replaced by the original
/// atoms spanned by its graph item.
/// </summary>
/// <param name="paths">Paths, each an array of indices into the graph's item list.</param>
/// <param name="segGraph">Segmentation graph the indices refer to.</param>
private void OnNShortPath(List<int[]> paths, RowFirstDynamicArray<ChainContent> segGraph)
{
    List<ChainItem<ChainContent>> list = segGraph.ToListItems();
    StringBuilder sb = new StringBuilder();

    for (int i = 0; i < paths.Count; i++)
    {
        int[] aPath = paths[i];

        for (int j = 0; j < aPath.Length; j++)
        {
            ChainItem<ChainContent> node = list[aPath[j]];
            string theWord = node.Content.sWord;

            // NOTE(review): these literals were mis-decoded in the file and are
            // restored here as the person / place / number / time / string
            // placeholders — confirm against the placeholder definitions.
            bool isPlaceholder = theWord == "未##人" || theWord == "未##地" || theWord == "未##数" ||
                                 theWord == "未##时" || theWord == "未##串";

            if (isPlaceholder)
            {
                // Show the original text covered by the placeholder.
                for (int k = node.row; k < node.col; k++)
                {
                    sb.Append(atomSegment[k].sWord);
                }
                sb.Append(", ");
            }
            else
            {
                sb.Append(string.Format("{0}, ", theWord));
            }
        }

        sb.Append("\r\n");
    }

    SendEvents(new SegmentEventArgs(SegmentStage.NShortPath, sb.ToString()));
}
/// <summary>
/// Raises the GenSegGraph stage event, carrying the text form of the graph.
/// </summary>
/// <param name="segGraph">Graph whose ToString() output is sent with the event.</param>
private void OnGenSegGraph(RowFirstDynamicArray<ChainContent> segGraph)
{
    string graphText = segGraph.ToString();
    SegmentEventArgs args = new SegmentEventArgs(SegmentStage.GenSegGraph, graphText);
    SendEvents(args);
}
/// <summary>
/// Builds the position map that records where each word sits in the graph.
/// Every item is encoded as row * Predefine.MAX_SENTENCE_LEN + col, in the
/// order the items are linked from the head.
/// </summary>
/// <param name="aWord">Word graph to index.</param>
/// <returns>
/// The encoded positions, or null when GetTail reports no words.
/// </returns>
private static int[] PreparePositionMap(RowFirstDynamicArray<ChainContent> aWord)
{
    // GetTail returns the word count; the tail item itself is not needed here.
    ChainItem<ChainContent> tail;
    int wordCount = aWord.GetTail(out tail);

    int[] positions = null;
    if (wordCount > 0)
    {
        positions = new int[wordCount];
    }

    // Record the encoded position of every possible word.
    int slot = 0;
    for (ChainItem<ChainContent> node = aWord.GetHead(); node != null; node = node.next)
    {
        positions[slot] = node.row * Predefine.MAX_SENTENCE_LEN + node.col;
        slot++;
    }

    return positions;
}
/// <summary>
/// Generates the word array for one segmentation route. The linked array is
/// first run through four in-place adjustment helpers, then flattened into
/// WordResult entries (including each word's row as its location); every
/// word is also recorded in the optimum graph.
/// </summary>
/// <param name="uniPath">Path the linked array was built from; not read by this method.</param>
/// <param name="linkedArray">Linked list of words on the route.</param>
/// <param name="m_graphOptimum">Graph that receives one element per output word.</param>
/// <returns>The words of the route, or null when the linked array is empty.</returns>
private static WordResult[] GenerateWord(int[] uniPath, WordLinkedArray linkedArray, RowFirstDynamicArray<ChainContent> m_graphOptimum)
{
    if (linkedArray.Count == 0)
    {
        return null;
    }

    // Merge separate consecutive numbers into a single number.
    MergeContinueNumIntoOne(ref linkedArray);

    // Adjust the part of speech of delimiter words.
    ChangeDelimiterPOS(ref linkedArray);

    // If the previous word is a number and the current word starts with a
    // dash followed by more characters, split the dash off,
    // e.g. "3 / -4 / ..." becomes "3 / - / 4 / ...".
    SplitMiddleSlashFromDigitalWords(ref linkedArray);

    // Merge numbers with a following date/time unit and decide whether a
    // numeric word is a time expression or a plain number.
    CheckDateElements(ref linkedArray);

    // Emit the result.
    WordResult[] words = new WordResult[linkedArray.Count];
    int index = 0;

    for (WordNode node = linkedArray.first; node != null; node = node.next)
    {
        WordResult word = new WordResult();
        word.sWord = node.theWord.sWord;
        word.nPOS = node.theWord.nPOS;
        word.dValue = node.theWord.dValue;
        word.sLocation = node.row;
        words[index] = word;

        m_graphOptimum.SetElement(node.row, node.col, new ChainContent(word.dValue, word.nPOS, node.sWordInSegGraph));
        index++;
    }

    return words;
}
/// <summary>
/// Converts a bigram path into a linked array of words.
/// Each entry of the path is an index into the flat item list of the
/// segmentation graph. For example, for the sentence "他说的确实在理" the
/// path may look like (0, 1, 2, 3, 6, 9, 11, 12), indexing a list that
/// starts with the sentence-begin marker and ends with the sentence-end marker.
/// </summary>
/// <param name="biPath">Indices into the graph's item list.</param>
/// <param name="segGraph">Segmentation graph the indices refer to.</param>
/// <param name="atomSegment">Atoms of the sentence, used to restore the text behind placeholder words.</param>
/// <returns>One node per path entry, in path order.</returns>
private static WordLinkedArray BiPath2LinkedArray(int[] biPath, RowFirstDynamicArray<ChainContent> segGraph, List<AtomNode> atomSegment)
{
    List<ChainItem<ChainContent>> list = segGraph.ToListItems();
    StringBuilder sb = new StringBuilder();
    WordLinkedArray result = new WordLinkedArray();

    for (int i = 0; i < biPath.Length; i++)
    {
        ChainItem<ChainContent> graphItem = list[biPath[i]];

        WordNode node = new WordNode();
        node.row = graphItem.row;
        node.col = graphItem.col;
        node.sWordInSegGraph = graphItem.Content.sWord;
        node.theWord = new WordResult();

        // NOTE(review): these literals were mis-decoded in the file and are
        // restored here as the person / place / number / time / string
        // placeholders — confirm against the placeholder definitions.
        bool isPlaceholder = node.sWordInSegGraph == "未##人" || node.sWordInSegGraph == "未##地" ||
                             node.sWordInSegGraph == "未##数" || node.sWordInSegGraph == "未##时" ||
                             node.sWordInSegGraph == "未##串";

        if (isPlaceholder)
        {
            // Rebuild the original text from the atoms the item spans.
            sb.Length = 0;
            for (int j = node.row; j < node.col; j++)
            {
                sb.Append(atomSegment[j].sWord);
            }
            node.theWord.sWord = sb.ToString();
        }
        else
        {
            node.theWord.sWord = graphItem.Content.sWord;
        }

        node.theWord.nPOS = graphItem.Content.nPOS;
        node.theWord.dValue = graphItem.Content.eWeight;

        result.AppendNode(node);
    }

    return result;
}
/// <summary>
/// Runs the bigram segmentation pipeline on a sentence: atom segmentation,
/// word-net generation, bigram graph generation and N-shortest-path search.
/// Every path found is turned into a word array and kept in m_pWordSeg,
/// while the words are also written into a new m_graphOptimum.
/// </summary>
/// <param name="sSentence">Sentence passed to AtomSegment.</param>
/// <param name="smoothPara">Smoothing parameter passed to BiGraphGenerate.</param>
/// <param name="nKind">Value passed to NShortPath.Calculate.</param>
/// <returns>Number of segmentation results stored in m_pWordSeg.</returns>
/// <exception cref="InvalidOperationException">
/// biDict or coreDict has not been initialized.
/// </exception>
public int BiSegment(string sSentence, double smoothPara, int nKind)
{
    if (biDict == null || coreDict == null)
    {
        // Message text restored from its mis-decoded form; the exception type
        // is specific but still caught by handlers for Exception.
        throw new InvalidOperationException("biDict 或 coreDict 尚未初始化!");
    }

    // Atom segmentation.
    atomSegment = AtomSegment(sSentence);
    OnAtomSegment(atomSegment);

    // Look up the dictionary and store every candidate word in the graph.
    segGraph = GenerateWordNet(atomSegment, coreDict);
    OnGenSegGraph(segGraph);

    // Build all adjacent word pairs.
    biGraphResult = BiGraphGenerate(segGraph, smoothPara, biDict, coreDict);
    OnGenBiSegGraph(biGraphResult);

    // N-shortest-path search yields several candidate segmentations.
    NShortPath.Calculate(biGraphResult, nKind);
    List<int[]> spResult = NShortPath.GetNPaths(Predefine.MAX_SEGMENT_NUM);
    OnNShortPath(spResult, segGraph);

    m_pWordSeg = new List<WordResult[]>();
    m_graphOptimum = new RowFirstDynamicArray<ChainContent>();

    foreach (int[] path in spResult)
    {
        WordLinkedArray linkedArray = BiPath2LinkedArray(path, segGraph, atomSegment);
        WordResult[] tmpResult = GenerateWord(path, linkedArray, m_graphOptimum);

        if (tmpResult != null)
        {
            m_pWordSeg.Add(tmpResult);
        }
    }

    OnBeforeOptimize(m_pWordSeg);

    return m_pWordSeg.Count;
}
/// <summary>
/// Runs the first three segmentation stages (atom segmentation, word-net
/// generation, bigram graph generation) and returns the bigram graph.
/// Only the atom-segmentation event is raised; the intermediate results are
/// stored in atomSegment, segGraph and biGraphResult.
/// </summary>
/// <param name="sSentence">Sentence passed to AtomSegment.</param>
/// <param name="smoothPara">Smoothing parameter passed to BiGraphGenerate.</param>
/// <param name="nKind">Not read by this method.</param>
/// <returns>The generated bigram graph.</returns>
/// <exception cref="InvalidOperationException">
/// biDict or coreDict has not been initialized.
/// </exception>
public ColumnFirstDynamicArray<ChainContent> TestSegment(string sSentence, double smoothPara, int nKind)
{
    if (biDict == null || coreDict == null)
    {
        // Message text restored from its mis-decoded form; the exception type
        // is specific but still caught by handlers for Exception.
        throw new InvalidOperationException("biDict 或 coreDict 尚未初始化!");
    }

    // Atom segmentation.
    atomSegment = AtomSegment(sSentence);
    OnAtomSegment(atomSegment);

    // Look up the dictionary and store every candidate word in the graph.
    segGraph = GenerateWordNet(atomSegment, coreDict);

    // Build all adjacent word pairs.
    biGraphResult = BiGraphGenerate(segGraph, smoothPara, biDict, coreDict);

    return biGraphResult;
}
/// <summary>
/// Generates the bigram graph linking every pair of adjacent words in the
/// word net. For each word, every word that starts in the row equal to its
/// column is a successor; the pair gets a smoothed negative-log cost and is
/// stored at the two words' indices in the position map.
/// </summary>
/// <param name="aWord">Word net whose items are paired.</param>
/// <param name="smoothPara">Smoothing weight a in the cost formula (0 &lt; a &lt; 1).</param>
/// <param name="biDict">Dictionary queried for the frequency of the word pair.</param>
/// <param name="coreDict">Dictionary queried for the frequency of words with a negative POS.</param>
/// <returns>The bigram graph.</returns>
public static ColumnFirstDynamicArray<ChainContent> BiGraphGenerate(
    RowFirstDynamicArray<ChainContent> aWord, double smoothPara, WordDictionary biDict, WordDictionary coreDict)
{
    ColumnFirstDynamicArray<ChainContent> biNet = new ColumnFirstDynamicArray<ChainContent>();
    StringBuilder pairKey = new StringBuilder();

    // Record the position map of all possible words.
    int[] positionMap = PreparePositionMap(aWord);

    // Smoothing term; identical for every pair.
    double smoothing = 1.0 / Predefine.MAX_FREQUENCE;

    for (ChainItem<ChainContent> current = aWord.GetHead(); current != null; current = current.next)
    {
        // Negative POS marks an unknown word, whose frequency comes from the dictionary.
        double currentFreq;
        if (current.Content.nPOS < 0)
        {
            currentFreq = coreDict.GetFrequency(current.Content.sWord, 2);
        }
        else
        {
            currentFreq = current.Content.eWeight;
        }

        // Successors are the words whose row equals the current word's column.
        for (ChainItem<ChainContent> successor = aWord.GetFirstElementOfRow(current.col);
             successor != null && successor.row == current.col;
             successor = successor.next)
        {
            pairKey.Remove(0, pairKey.Length);
            pairKey.Append(current.Content.sWord);
            pairKey.Append(Predefine.WORD_SEGMENTER);
            pairKey.Append(successor.Content.sWord);
            string pairText = pairKey.ToString();

            // Frequency of the two linked words.
            int pairFreq = biDict.GetFrequency(pairText, 3);

            // -log{a*P(Ci-1) + (1-a)*P(Ci|Ci-1)}, with 0 < a < 1.
            double cost = -Math.Log(smoothPara * (1.0 + currentFreq) / (Predefine.MAX_FREQUENCE + 80000.0) + (1.0 - smoothPara) * ((1.0 - smoothing) * pairFreq / (1.0 + currentFreq) + smoothing));

            // Unknown words: the (negative) POS value is added to the cost.
            if (current.Content.nPOS < 0)
            {
                cost += current.Content.nPOS;
            }

            // Locate both words in the position map.
            int currentIndex = Utility.BinarySearch(current.row * Predefine.MAX_SENTENCE_LEN + current.col, positionMap);
            int successorIndex = Utility.BinarySearch(successor.row * Predefine.MAX_SENTENCE_LEN + successor.col, positionMap);

            biNet.SetElement(currentIndex, successorIndex, new ChainContent(cost, current.Content.nPOS, pairText));
        }
    }

    return biNet;
}
/// <summary>
/// Generates the word array for one segmentation route. The linked array is
/// adjusted in place by four helpers and then copied into WordResult
/// entries; each word is also stored in the optimum graph.
/// </summary>
/// <param name="uniPath">Path the linked array was built from; not read by this method.</param>
/// <param name="linkedArray">Linked list of words on the route.</param>
/// <param name="m_graphOptimum">Graph that receives one element per output word.</param>
/// <returns>The words of the route, or null when the linked array is empty.</returns>
private static WordResult[] GenerateWord(int[] uniPath, WordLinkedArray linkedArray, RowFirstDynamicArray<ChainContent> m_graphOptimum)
{
    if (linkedArray.Count == 0)
    {
        return null;
    }

    // Merge separate consecutive numbers into one number.
    MergeContinueNumIntoOne(ref linkedArray);

    // Handle the delimiter "--".
    ChangeDelimiterPOS(ref linkedArray);

    // If the previous word is a number and the current word starts with a
    // full-width or half-width dash and has further characters, split the
    // dash off, e.g. "3 / -4 / 月" becomes "3 / - / 4 / 月".
    SplitMiddleSlashFromDigitalWords(ref linkedArray);

    // Date and time handling:
    // 1. A number followed by one of 月, 日, 时, 分, 秒, 月份 is merged and tagged as time.
    // 2. A number usable as a year followed by 年 is merged and tagged as
    //    time; otherwise it stays a number.
    // 3. If the last character is 点, the number is treated as a time.
    // 4. If the last character is none of "∶·./" or half-width '.' '/',
    //    the word is a number.
    // 5. If the last character is one of those and the word is longer than
    //    one character, the last character is removed, e.g. "1.".
    CheckDateElements(ref linkedArray);

    // Emit the result.
    int total = linkedArray.Count;
    WordResult[] output = new WordResult[total];

    int position = 0;
    WordNode cursor = linkedArray.first;
    while (cursor != null)
    {
        WordResult entry = new WordResult();
        entry.sWord = cursor.theWord.sWord;
        entry.nPOS = cursor.theWord.nPOS;
        entry.dValue = cursor.theWord.dValue;
        output[position] = entry;
        position++;

        ChainContent content = new ChainContent(entry.dValue, entry.nPOS, cursor.sWordInSegGraph);
        m_graphOptimum.SetElement(cursor.row, cursor.col, content);

        cursor = cursor.next;
    }

    return output;
}
/// <summary>
/// Generates the segmentation word net from the atoms of a sentence. Each
/// atom becomes an element (i, i + 1); each run of consecutive atoms that
/// the core dictionary matches exactly becomes an element (i, j).
/// </summary>
/// <param name="atomSegment">Atoms of the sentence.</param>
/// <param name="coreDict">Core dictionary used for word lookup.</param>
/// <returns>The populated word net.</returns>
public static RowFirstDynamicArray<ChainContent> GenerateWordNet(List<AtomNode> atomSegment, WordDictionary coreDict)
{
    RowFirstDynamicArray<ChainContent> graph = new RowFirstDynamicArray<ChainContent>();
    graph.SetEmpty();

    // Pass 1: store the atoms themselves in the graph.
    for (int idx = 0; idx < atomSegment.Count; idx++)
    {
        if (atomSegment[idx].nPOS == Predefine.CT_CHINESE)
        {
            graph.SetElement(idx, idx + 1, new ChainContent(0, 0, atomSegment[idx].sWord));
            continue;
        }

        string label = atomSegment[idx].sWord;
        double weight = Predefine.MAX_FREQUENCE;
        int pos;

        switch (atomSegment[idx].nPOS)
        {
            case Predefine.CT_INDEX:
            case Predefine.CT_NUM:
                pos = -27904;           // -('m' * 256)
                label = "未##数";
                weight = 0;
                break;

            case Predefine.CT_DELIMITER:
                pos = 30464;            // 'w' * 256
                break;

            case Predefine.CT_LETTER:
                pos = -28280;           // -('n' * 256 + 'x')
                weight = 0;
                label = "未##串";
                break;

            case Predefine.CT_SINGLE:   // e.g. 12021-2129-3121
                // Optionally signed integer or decimal counts as a number.
                bool looksNumeric = Regex.IsMatch(atomSegment[idx].sWord, @"^(-?\d+)(\.\d+)?$");
                if (looksNumeric)
                {
                    pos = -27904;       // -('m' * 256)
                    label = "未##数";
                }
                else
                {
                    pos = -28280;       // -('n' * 256 + 'x')
                    label = "未##串";
                }
                weight = 0;
                break;

            default:
                pos = atomSegment[idx].nPOS;
                break;
        }

        graph.SetElement(idx, idx + 1, new ChainContent(weight, pos, label));
    }

    // Pass 2: store every multi-atom word found in the dictionary.
    for (int start = 0; start < atomSegment.Count; start++)
    {
        string candidate = atomSegment[start].sWord;
        int end = start + 1;
        string bestMatch;
        int matchPos;

        while (end < atomSegment.Count && coreDict.GetMaxMatch(candidate, out bestMatch, out matchPos))
        {
            if (bestMatch == candidate)   // exact dictionary hit
            {
                WordInfo info = coreDict.GetWordInfo(candidate);

                // A word may carry several parts of speech; sum their frequencies.
                int totalFreq = 0;
                for (int k = 0; k < info.Count; k++)
                {
                    totalFreq += info.Frequencies[k];
                }

                // Restrict some special words such as "1年内" or "1999年末":
                // a two-character word starting with 年 or 月 right after a
                // number is skipped and extension from this start stops.
                bool startsWithDateUnit = candidate.StartsWith("年") || candidate.StartsWith("月");
                if (candidate.Length == 2 && startsWithDateUnit && start >= 1 &&
                    (Utility.IsAllNum(atomSegment[start - 1].sWord) || Utility.IsAllChineseNum(atomSegment[start - 1].sWord)))
                {
                    if ("末内中底前间初".IndexOf(candidate.Substring(1)) >= 0)
                    {
                        break;
                    }
                }

                // Keep the part of speech only when unambiguous; otherwise record 0.
                if (info.Count != 1)
                {
                    graph.SetElement(start, end, new ChainContent(totalFreq, 0, candidate));
                }
                else
                {
                    graph.SetElement(start, end, new ChainContent(totalFreq, info.POSs[0], candidate));
                }
            }

            // Extend the candidate by one more atom.
            candidate += atomSegment[end].sWord;
            end++;
        }
    }

    return graph;
}
/// <summary>
/// Raises the PersonAndPlaceRecognition stage event, carrying the text form of the graph.
/// </summary>
/// <param name="m_graphOptimum">Graph whose ToString() output is sent with the event.</param>
private void OnPersonAndPlaceRecognition(RowFirstDynamicArray<ChainContent> m_graphOptimum)
{
    string graphText = m_graphOptimum.ToString();
    SegmentEventArgs args = new SegmentEventArgs(SegmentStage.PersonAndPlaceRecognition, graphText);
    SendEvents(args);
}
/// <summary>
/// Re-segments using the optimum graph: builds a bigram graph from
/// m_graphOptimum, runs the N-shortest-path search, and rebuilds m_pWordSeg
/// from the paths found. The previous optimum graph becomes segGraph and a
/// new, empty m_graphOptimum is filled with the words of the new results.
/// </summary>
/// <param name="nResultCount">Value passed to NShortPath.Calculate.</param>
/// <param name="dSmoothingPara">Smoothing parameter passed to BiGraphGenerate.</param>
/// <returns>Number of segmentation results stored in m_pWordSeg.</returns>
public int BiOptimumSegment(int nResultCount, double dSmoothingPara)
{
    // Generate the bi-word link net.
    ColumnFirstDynamicArray<ChainContent> biNet = BiGraphGenerate(m_graphOptimum, dSmoothingPara, biDict, coreDict);
    OnGenBiOptimumSegGraph(biNet);

    NShortPath.Calculate(biNet, nResultCount);
    List<int[]> paths = NShortPath.GetNPaths(Predefine.MAX_SEGMENT_NUM);

    m_pWordSeg = new List<WordResult[]>();

    // The old optimum graph is the source for path lookup; results go into a new one.
    segGraph = m_graphOptimum;
    m_graphOptimum = new RowFirstDynamicArray<ChainContent>();

    foreach (int[] path in paths)
    {
        WordLinkedArray words = BiPath2LinkedArray(path, segGraph, atomSegment);
        WordResult[] segmented = GenerateWord(path, words, m_graphOptimum);

        if (segmented == null)
        {
            continue;
        }

        m_pWordSeg.Add(segmented);
    }

    return m_pWordSeg.Count;
}