//====================================================================
// Merges adjacent words into one when each of them, and also their
// concatenation, is accepted by Utility.IsAllNum or
// Utility.IsAllChineseNum. The surviving node keeps its row, takes the
// column of the absorbed node, and linkedArray.Count is decremented.
// Lists with fewer than two nodes are left untouched.
//====================================================================
private static void MergeContinueNumIntoOne(ref WordLinkedArray linkedArray)
{
    if (linkedArray.Count < 2)
    {
        return;
    }

    WordNode current = linkedArray.first;

    // "following" is always re-read from current.next, so the same step
    // works both after a merge (current stays) and after a plain advance.
    for (WordNode following = current.next; following != null; following = current.next)
    {
        string left = current.theWord.sWord;
        string right = following.theWord.sWord;
        string joined = left + right;

        bool mergeable =
            (Utility.IsAllNum(left) || Utility.IsAllChineseNum(left))
            && (Utility.IsAllNum(right) || Utility.IsAllChineseNum(right))
            && (Utility.IsAllNum(joined) || Utility.IsAllChineseNum(joined));

        if (!mergeable)
        {
            current = following;
            continue;
        }

        // Absorb "following" into "current" and retry with the new neighbour.
        current.theWord.sWord = joined;
        current.col = following.col;
        current.next = following.next;
        linkedArray.Count--;
    }
}
//====================================================================
// Generates the word results for a segmentation route.
//
// The linked word list is post-processed in a fixed order (number
// merging, delimiter tagging, dash splitting, date/number tagging) and
// then copied into an array. Every emitted word is also written into
// m_graphOptimum at the node's (row, col).
//
// uniPath is not read by this method.
// Returns null when linkedArray.Count is 0.
//====================================================================
private static WordResult[] GenerateWord(int[] uniPath, WordLinkedArray linkedArray, RowFirstDynamicArray<ChainContent> m_graphOptimum)
{
    if (linkedArray.Count == 0)
    {
        return null;
    }

    // Merge consecutive separate numbers into a single number.
    MergeContinueNumIntoOne(ref linkedArray);

    // Tag the "--" style delimiters.
    ChangeDelimiterPOS(ref linkedArray);

    // If the previous word is a number and the current word starts with a
    // dash followed by more characters, split the dash off,
    // e.g. "3 / -4 / 月" becomes "3 / - / 4 / 月".
    SplitMiddleSlashFromDigitalWords(ref linkedArray);

    // Merge numbers with a following date/time unit and tag the remaining
    // numeric words as time or number.
    CheckDateElements(ref linkedArray);

    // Emit the result.
    WordResult[] words = new WordResult[linkedArray.Count];
    int index = 0;
    for (WordNode node = linkedArray.first; node != null; node = node.next)
    {
        WordResult word = new WordResult();
        word.sWord = node.theWord.sWord;
        word.nPOS = node.theWord.nPOS;
        word.dValue = node.theWord.dValue;
        words[index] = word;

        ChainContent content = new ChainContent(word.dValue, word.nPOS, node.sWordInSegGraph);
        m_graphOptimum.SetElement(node.row, node.col, content);

        index++;
    }

    return words;
}
//====================================================================
// Returns the words of the list in order, each one followed by ", "
// (so a non-empty list ends with a trailing ", ").
//====================================================================
public override string ToString()
{
    StringBuilder text = new StringBuilder();

    for (WordNode node = first; node != null; node = node.next)
    {
        text.Append(node.theWord.sWord).Append(", ");
    }

    return text.ToString();
}
//====================================================================
// Appends node at the tail of the list and increments Count.
// When both first and last are null the node also becomes the head.
//====================================================================
public void AppendNode(WordNode node)
{
    bool listIsEmpty = first == null && last == null;

    if (listIsEmpty)
    {
        first = node;
    }
    else
    {
        last.next = node;
    }

    last = node;
    Count++;
}
//====================================================================
// Walks the whole list and, for every word that is exactly one of the
// dash delimiters below, sets its POS to 30464 ('w' * 256) and its
// value to 0.
//====================================================================
private static void ChangeDelimiterPOS(ref WordLinkedArray linkedArray)
{
    for (WordNode node = linkedArray.first; node != null; node = node.next)
    {
        string text = node.theWord.sWord;
        bool isDelimiter = text == "--" || text == "—" || text == "-";
        if (!isDelimiter)
        {
            continue;
        }

        node.theWord.nPOS = 30464; // 'w' * 256
        node.theWord.dValue = 0;
    }
}
//====================================================================
// If the previous word is tagged as a number or a time, and the current
// numeric word starts with a dash (ASCII '-' or full-width U+FF0D) and
// is longer than that single character, the dash is split off into its
// own word.
// Example: "3 / -4 / 月" is split into "3 / - / 4 / 月".
//
// The existing node keeps the dash (POS 'w', value 0); a new node holding
// the remainder (POS 'm', original value) is linked in right after it and
// linkedArray.Count is incremented.
//====================================================================
private static void SplitMiddleSlashFromDigitalWords(ref WordLinkedArray linkedArray)
{
    const int PosNumber = 27904;      // 'm' * 256
    const int PosTime = 29696;        // 't' * 256
    const int PosPunctuation = 30464; // 'w' * 256

    // Written with an escape so the two dash variants stay distinguishable
    // and survive re-encoding of the source file.
    // NOTE(review): the literal previously read as two identical ASCII
    // hyphens although the rule names two different dashes - confirm that
    // U+FF0D is the intended second variant.
    const string LeadingDashes = "-\uFF0D";

    if (linkedArray.Count < 2)
    {
        return;
    }

    WordNode pPre = linkedArray.first;
    WordNode pCur = pPre.next;

    while (pCur != null)
    {
        string word = pCur.theWord.sWord;
        int prePos = Math.Abs(pPre.theWord.nPOS);

        // The length test comes first so that word[0] is never read from an
        // empty string.
        if (word != null
            && word.Length > 1
            && LeadingDashes.IndexOf(word[0]) >= 0
            && (prePos == PosNumber || prePos == PosTime)
            && (Utility.IsAllNum(word) || Utility.IsAllChineseNum(word)))
        {
            // New node: everything after the dash, starting one row further.
            WordNode newNode = new WordNode();
            newNode.row = pCur.row + 1;
            newNode.col = pCur.col;
            newNode.sWordInSegGraph = word.Substring(1);

            WordResult theWord = new WordResult();
            theWord.sWord = newNode.sWordInSegGraph;
            theWord.nPOS = PosNumber;
            theWord.dValue = pCur.theWord.dValue; // captured before it is zeroed below
            newNode.theWord = theWord;

            // Existing node: shrink to the single dash character.
            pCur.col = pCur.row + 1;
            pCur.theWord.sWord = word.Substring(0, 1);
            pCur.theWord.nPOS = PosPunctuation;
            pCur.theWord.dValue = 0;

            newNode.next = pCur.next;
            pCur.next = newNode;
            linkedArray.Count++;
        }

        pCur = pCur.next;
        pPre = pPre.next;
    }
}
//====================================================================
// Converts a BiPath into a WordLinkedArray.
//
// Each entry of biPath is an index into the item list returned by
// segGraph.ToListItems(). Example for the sentence "他说的确实在理":
//   BiPath: (0, 1, 2, 3, 6, 9, 11, 12)
//   items:  0 始##始, 1 他, 2 说, 3 的, 4 的确, 5 确, 6 确实, 7 实,
//           8 实在, 9 在, 10 在理, 11 理, 12 末##末
//
// For the placeholder words (未##人, 未##地, 未##数, 未##时, 未##串) the
// node's text is rebuilt by concatenating atomSegment[row .. col-1];
// for every other item the graph word is used as is.
//====================================================================
private static WordLinkedArray BiPath2LinkedArray(int[] biPath, RowFirstDynamicArray<ChainContent> segGraph, List<AtomNode> atomSegment)
{
    List<ChainItem<ChainContent>> graphItems = segGraph.ToListItems();
    WordLinkedArray words = new WordLinkedArray();
    StringBuilder surface = new StringBuilder();

    foreach (int itemIndex in biPath)
    {
        ChainItem<ChainContent> graphItem = graphItems[itemIndex];

        WordNode node = new WordNode();
        node.row = graphItem.row;
        node.col = graphItem.col;
        node.sWordInSegGraph = graphItem.Content.sWord;
        node.theWord = new WordResult();

        switch (node.sWordInSegGraph)
        {
            case "未##人":
            case "未##地":
            case "未##数":
            case "未##时":
            case "未##串":
                // Placeholder: recover the text from the atoms it spans.
                surface.Length = 0;
                for (int atom = node.row; atom < node.col; atom++)
                {
                    surface.Append(atomSegment[atom].sWord);
                }
                node.theWord.sWord = surface.ToString();
                break;

            default:
                node.theWord.sWord = graphItem.Content.sWord;
                break;
        }

        node.theWord.nPOS = graphItem.Content.nPOS;
        node.theWord.dValue = graphItem.Content.eWeight;
        words.AppendNode(node);
    }

    return words;
}
//====================================================================
// If the previous word is tagged as a number or a time, and the current
// numeric word starts with a dash (ASCII '-' or full-width U+FF0D) and
// is longer than that single character, the dash is split off into its
// own word.
// Example: "3 / -4 / 月" is split into "3 / - / 4 / 月".
//
// The existing node keeps the dash (POS 'w', value 0); a new node holding
// the remainder (POS 'm', original value) is linked in right after it and
// linkedArray.Count is incremented.
//====================================================================
private static void SplitMiddleSlashFromDigitalWords(ref WordLinkedArray linkedArray)
{
    const int PosNumber = 27904;      // 'm' * 256
    const int PosTime = 29696;        // 't' * 256
    const int PosPunctuation = 30464; // 'w' * 256

    // The dash set used to contain U+FFFD replacement characters (a
    // double-byte character lost to a wrong decoding), so the second dash
    // variant could never match. It is written with an escape so it
    // survives re-encoding of the source file.
    // NOTE(review): U+FF0D is reconstructed from the garbled bytes - confirm
    // against a correctly encoded copy of the source.
    const string LeadingDashes = "-\uFF0D";

    if (linkedArray.Count < 2)
    {
        return;
    }

    WordNode pPre = linkedArray.first;
    WordNode pCur = pPre.next;

    while (pCur != null)
    {
        string word = pCur.theWord.sWord;
        int prePos = Math.Abs(pPre.theWord.nPOS);

        // The length test comes first so that word[0] is never read from an
        // empty string.
        if (word != null
            && word.Length > 1
            && LeadingDashes.IndexOf(word[0]) >= 0
            && (prePos == PosNumber || prePos == PosTime)
            && (Utility.IsAllNum(word) || Utility.IsAllChineseNum(word)))
        {
            // New node: everything after the dash, starting one row further.
            WordNode newNode = new WordNode();
            newNode.row = pCur.row + 1;
            newNode.col = pCur.col;
            newNode.sWordInSegGraph = word.Substring(1);

            WordResult theWord = new WordResult();
            theWord.sWord = newNode.sWordInSegGraph;
            theWord.nPOS = PosNumber;
            theWord.dValue = pCur.theWord.dValue; // captured before it is zeroed below
            newNode.theWord = theWord;

            // Existing node: shrink to the single dash character.
            pCur.col = pCur.row + 1;
            pCur.theWord.sWord = word.Substring(0, 1);
            pCur.theWord.nPOS = PosPunctuation;
            pCur.theWord.dValue = 0;

            newNode.next = pCur.next;
            pCur.next = newNode;
            linkedArray.Count++;
        }

        pCur = pCur.next;
        pPre = pPre.next;
    }
}
//====================================================================
// Converts a BiPath into a WordLinkedArray.
//
// Each entry of biPath is an index into the item list returned by
// segGraph.ToListItems(). Example for the sentence "他说的确实在理":
//   BiPath: (0, 1, 2, 3, 6, 9, 11, 12)
//   items:  0 始##始, 1 他, 2 说, 3 的, 4 的确, 5 确, 6 确实, 7 实,
//           8 实在, 9 在, 10 在理, 11 理, 12 末##末
//
// For the placeholder words (未##人, 未##地, 未##数, 未##时, 未##串) the
// node's text is rebuilt by concatenating atomSegment[row .. col-1];
// for every other item the graph word is used as is.
//
// NOTE(review): the placeholder literals were mis-decoded (GBK bytes read
// as UTF-8, leaving U+FFFD characters and three identical literals) and
// have been restored from their byte pattern - confirm against a
// correctly encoded copy of the source.
//====================================================================
private static WordLinkedArray BiPath2LinkedArray(int[] biPath, RowFirstDynamicArray<ChainContent> segGraph, List<AtomNode> atomSegment)
{
    List<ChainItem<ChainContent>> list = segGraph.ToListItems();
    StringBuilder sb = new StringBuilder();
    WordLinkedArray result = new WordLinkedArray();

    for (int i = 0; i < biPath.Length; i++)
    {
        ChainItem<ChainContent> item = list[biPath[i]];

        WordNode node = new WordNode();
        node.row = item.row;
        node.col = item.col;
        node.sWordInSegGraph = item.Content.sWord;
        node.theWord = new WordResult();

        bool isPlaceholder =
            node.sWordInSegGraph == "未##人"
            || node.sWordInSegGraph == "未##地"
            || node.sWordInSegGraph == "未##数"
            || node.sWordInSegGraph == "未##时"
            || node.sWordInSegGraph == "未##串";

        if (isPlaceholder)
        {
            // Placeholder: recover the text from the atoms it spans.
            sb.Length = 0;
            for (int j = node.row; j < node.col; j++)
            {
                sb.Append(atomSegment[j].sWord);
            }
            node.theWord.sWord = sb.ToString();
        }
        else
        {
            node.theWord.sWord = item.Content.sWord;
        }

        node.theWord.nPOS = item.Content.nPOS;
        node.theWord.dValue = item.Content.eWeight;
        result.AppendNode(node);
    }

    return result;
}
//====================================================================
// Tags numeric words as time or number, merging date/time units.
// Only words accepted by Utility.IsAllNum or Utility.IsAllChineseNum are
// touched; the last node of the list is never examined on its own.
//
// 1. If the next word is one of 月, 日, 时, 分, 秒 or 月份, the two words
//    are merged and the result is tagged as time.
// 2. If the next word is 年: when IsYearTime accepts the current word the
//    two are merged and tagged as time, otherwise the current word is
//    tagged as a number.
// 3. Otherwise, if the current word ends with 点 it is tagged as time.
// 4. Otherwise, if its last character is not one of the separators
//    (∶ · . / in full-width or half-width form) it is tagged as a number.
// 5. Otherwise, if it is longer than one character, the trailing
//    separator is removed (e.g. "1.") and it is tagged as a number.
//====================================================================
private static void CheckDateElements(ref WordLinkedArray linkedArray)
{
    const int PosTime = -29696;   // -('t' * 256)
    const int PosNumber = -27904; // -('m' * 256)

    // Written with escapes so full-width and half-width forms stay
    // distinguishable.
    // NOTE(review): the literal previously read as "./" twice although the
    // rule names a full-width and a half-width pair - confirm that
    // U+FF0E / U+FF0F are the intended full-width characters.
    const string TrailingSeparators = "∶·./\uFF0E\uFF0F";

    if (linkedArray.Count < 2)
    {
        return;
    }

    WordNode pCur = linkedArray.first;
    WordNode pNext = pCur.next;

    while (pNext != null)
    {
        string word = pCur.theWord.sWord;

        if (Utility.IsAllNum(word) || Utility.IsAllChineseNum(word))
        {
            string nextWord = pNext.theWord.sWord;

            // Character overloads are used on purpose: they compare ordinally,
            // whereas IndexOf(string) is culture-sensitive.
            bool isDateUnit =
                (nextWord.Length == 1 && "月日时分秒".IndexOf(nextWord[0]) >= 0)
                || nextWord == "月份";
            bool isYearUnit = nextWord == "年";

            if (isDateUnit || (isYearUnit && IsYearTime(word)))
            {
                // Rules 1 and 2: absorb the unit and tag the result as time.
                pCur.theWord.sWord += nextWord;
                pCur.col = pNext.col;
                pCur.sWordInSegGraph = "未##时";
                pCur.theWord.nPOS = PosTime;
                pCur.next = pNext.next;
                linkedArray.Count--;
            }
            else if (isYearUnit)
            {
                // Rule 2, not a valid year: plain number.
                pCur.sWordInSegGraph = "未##数";
                pCur.theWord.nPOS = PosNumber;
            }
            else if (word.Length > 0) // an empty word has no last character to inspect
            {
                char lastChar = word[word.Length - 1];

                if (lastChar == '点')
                {
                    // Rule 3.
                    pCur.sWordInSegGraph = "未##时";
                    pCur.theWord.nPOS = PosTime;
                }
                else if (TrailingSeparators.IndexOf(lastChar) == -1)
                {
                    // Rule 4.
                    pCur.sWordInSegGraph = "未##数";
                    pCur.theWord.nPOS = PosNumber;
                }
                else if (word.Length > 1)
                {
                    // Rule 5.
                    pCur.theWord.sWord = word.Substring(0, word.Length - 1);
                    pCur.sWordInSegGraph = "未##数";
                    pCur.theWord.nPOS = PosNumber;
                }
            }
        }

        // pCur.next is the next unprocessed node whether or not a merge
        // happened. It is null when the merged pair was the end of the list,
        // so pNext must be derived from it rather than from the old pNext.
        pCur = pCur.next;
        pNext = pCur != null ? pCur.next : null;
    }
}