Example #1
0
        /// <summary>
        /// Renders every path as one line of comma-separated words and publishes
        /// the resulting text through <c>SendEvents</c> for the NShortPath stage.
        /// </summary>
        /// <param name="paths">Paths to render; each element of a path is an index into the item list produced by <c>segGraph.ToListItems()</c>.</param>
        /// <param name="segGraph">Segmentation graph the path indexes refer to.</param>
        private void OnNShortPath(List <int[]> paths, RowFirstDynamicArray <ChainContent> segGraph)
        {
            List <ChainItem <ChainContent> > graphItems = segGraph.ToListItems();
            StringBuilder report = new StringBuilder();

            foreach (int[] path in paths)
            {
                foreach (int itemIndex in path)
                {
                    ChainItem <ChainContent> graphItem = graphItems[itemIndex];
                    string word = graphItem.Content.sWord;

                    bool isPlaceholder;
                    switch (word)
                    {
                    case "未##人":
                    case "未##地":
                    case "未##数":
                    case "未##时":
                    case "未##串":
                        isPlaceholder = true;
                        break;

                    default:
                        isPlaceholder = false;
                        break;
                    }

                    if (!isPlaceholder)
                    {
                        sbAppendWord(report, word);
                        continue;
                    }

                    // A placeholder word is shown as the atoms it spans, [row, col).
                    for (int atomIndex = graphItem.row; atomIndex < graphItem.col; atomIndex++)
                    {
                        report.Append(atomSegment[atomIndex].sWord);
                    }
                    report.Append(", ");
                }

                // One path per line
                report.Append("\r\n");
            }

            string text = report.ToString();
            SendEvents(new SegmentEventArgs(SegmentStage.NShortPath, text));
        }

        // Appends a single word followed by the ", " separator used in the path report.
        private static void sbAppendWord(StringBuilder target, string word)
        {
            target.AppendFormat("{0}, ", word);
        }
Example #2
0
        //====================================================================
        // Prepare the position map, which records the position of each word.
        // Every word met while walking the chain from GetHead() is encoded as
        //     row * Predefine.MAX_SENTENCE_LEN + col
        // and stored in traversal order. The array is sized from the count
        // returned by GetTail; null is returned when that count is not positive
        // and the chain is empty.
        //====================================================================
        private static int[] PreparePositionMap(RowFirstDynamicArray <ChainContent> aWord)
        {
            ChainItem <ChainContent> tail;

            // GetTail hands back the tail element and returns the word count
            int wordCount = aWord.GetTail(out tail);

            int[] positionMap = wordCount > 0 ? new int[wordCount] : null;

            // Record the position of every possible word
            int slot = 0;
            for (ChainItem <ChainContent> node = aWord.GetHead(); node != null; node = node.next)
            {
                positionMap[slot] = node.row * Predefine.MAX_SENTENCE_LEN + node.col;
                slot++;
            }

            return positionMap;
        }
Example #3
0
        /// <summary>
        /// Re-segments using the current optimum graph: builds the bi-word net
        /// from <c>m_graphOptimum</c>, computes the N shortest paths over it and
        /// turns each path into a word result stored in <c>m_pWordSeg</c>.
        /// The previous optimum graph becomes <c>segGraph</c> and a fresh
        /// <c>m_graphOptimum</c> is filled while the results are generated.
        /// </summary>
        /// <param name="nResultCount">Value passed to <c>NShortPath.Calculate</c> together with the bi-word net.</param>
        /// <param name="dSmoothingPara">Smoothing parameter forwarded to <c>BiGraphGenerate</c>.</param>
        /// <returns>The number of results held in <c>m_pWordSeg</c> afterwards.</returns>
        public int BiOptimumSegment(int nResultCount, double dSmoothingPara)
        {
            // Generate the bi-word link net and report it
            ColumnFirstDynamicArray <ChainContent> biwordNet = BiGraphGenerate(m_graphOptimum, dSmoothingPara, biDict, coreDict);
            OnGenBiOptimumSegGraph(biwordNet);

            NShortPath.Calculate(biwordNet, nResultCount);
            List <int[]> shortestPaths = NShortPath.GetNPaths(Predefine.MAX_SEGMENT_NUM);

            // The old optimum graph is the lookup graph for this pass;
            // a new optimum graph is populated by GenerateWord below.
            m_pWordSeg     = new List <WordResult[]>();
            segGraph       = m_graphOptimum;
            m_graphOptimum = new RowFirstDynamicArray <ChainContent>();

            foreach (int[] path in shortestPaths)
            {
                WordLinkedArray linkedWords = BiPath2LinkedArray(path, segGraph, atomSegment);
                WordResult[]    words       = GenerateWord(path, linkedWords, m_graphOptimum);

                if (words == null)
                {
                    continue;
                }

                m_pWordSeg.Add(words);
            }

            return m_pWordSeg.Count;
        }
Example #4
0
        /// <summary>
        /// Runs the first stages of segmentation on a sentence — atom
        /// segmentation, word-net generation and bi-word graph generation — and
        /// returns the bi-word graph. The intermediate results are also stored in
        /// <c>atomSegment</c>, <c>segGraph</c> and <c>biGraphResult</c>.
        /// </summary>
        /// <param name="sSentence">Sentence to segment.</param>
        /// <param name="smoothPara">Smoothing parameter forwarded to <c>BiGraphGenerate</c>.</param>
        /// <param name="nKind">Not used by this method; kept so the signature stays unchanged.</param>
        /// <returns>The generated bi-word graph (the same object assigned to <c>biGraphResult</c>).</returns>
        /// <exception cref="Exception">Thrown when <c>biDict</c> or <c>coreDict</c> is null.</exception>
        public ColumnFirstDynamicArray <ChainContent> TestSegment(string sSentence, double smoothPara, int nKind)
        {
            if (biDict == null || coreDict == null)
            {
                throw new Exception("biDict 或 coreDict 尚未初始化!");
            }

            //--- Atom segmentation
            atomSegment = AtomSegment(sSentence);
            OnAtomSegment(atomSegment);

            //--- Look up the dictionary, adding every possible segmentation into the chained graph
            segGraph = GenerateWordNet(atomSegment, coreDict);

            //--- Find every possible pair of adjacent words
            biGraphResult = BiGraphGenerate(segGraph, smoothPara, biDict, coreDict);

            // Only the atom-segmentation event is raised here; no graph events are
            // sent and no backward optimisation is applied to the returned graph.
            return biGraphResult;
        }
Example #5
0
        /// <summary>
        /// Segments a sentence: atom segmentation, word-net generation, bi-word
        /// graph generation, backward optimisation, N-shortest-path search and
        /// finally word generation for every path. A stage event is raised after
        /// each step and the results are stored in <c>m_pWordSeg</c>.
        /// </summary>
        /// <param name="sSentence">Sentence to segment.</param>
        /// <param name="smoothPara">Smoothing parameter forwarded to <c>BiGraphGenerate</c>.</param>
        /// <param name="nKind">Value passed to <c>NShortPath.Calculate</c> together with the bi-word graph.</param>
        /// <returns>The number of results held in <c>m_pWordSeg</c> afterwards.</returns>
        /// <exception cref="Exception">Thrown when <c>biDict</c> or <c>coreDict</c> is null.</exception>
        public int BiSegment(string sSentence, double smoothPara, int nKind)
        {
            if (biDict == null || coreDict == null)
            {
                throw new Exception("biDict 或 coreDict 尚未初始化!");
            }

            //--- Atom segmentation
            atomSegment = AtomSegment(sSentence);
            OnAtomSegment(atomSegment);

            //--- Look up the dictionary, adding every possible segmentation into the chained graph
            segGraph = GenerateWordNet(atomSegment, coreDict);
            OnGenSegGraph(segGraph);

            //--- Find every possible pair of adjacent words
            biGraphResult = BiGraphGenerate(segGraph, smoothPara, biDict, coreDict);
            OnGenBiSegGraph(biGraphResult);

            //--- Backward-matching optimisation
            biGraphResult = BackwardOptimize(biGraphResult);
            OnBackwardOptimize(biGraphResult);

            //--- N-shortest-path search yields several candidate segmentations
            NShortPath.Calculate(biGraphResult, nKind);
            List <int[]> shortestPaths = NShortPath.GetNPaths(Predefine.MAX_SEGMENT_NUM);
            OnNShortPath(shortestPaths, segGraph);

            m_pWordSeg     = new List <WordResult[]>();
            m_graphOptimum = new RowFirstDynamicArray <ChainContent>();

            foreach (int[] path in shortestPaths)
            {
                WordLinkedArray linkedWords = BiPath2LinkedArray(path, segGraph, atomSegment);
                WordResult[]    words       = GenerateWord(path, linkedWords, m_graphOptimum);

                if (words == null)
                {
                    continue;
                }

                m_pWordSeg.Add(words);
            }

            OnBeforeOptimize(m_pWordSeg);

            return m_pWordSeg.Count;
        }
Example #6
0
        /// <summary>
        /// Unknown word recognition. Tags the segmentation result with
        /// <c>m_roleTag.POSTagging</c> and, for every unknown word it reports,
        /// writes an element into the optimised graph when the element currently
        /// stored for that atom span is missing or has a greater weight than the
        /// recognised word's possibility.
        /// </summary>
        /// <param name="pWordSegResult">Word segmentation result.</param>
        /// <param name="graphOptimum">The optimised segmentation graph; updated in place.</param>
        /// <param name="atomSegment">Atoms of the sentence; their word lengths are used to map positions to atom indexes.</param>
        /// <param name="dictCore">Core dictionary passed to the tagger.</param>
        /// <returns>Always <c>true</c>.</returns>
        public bool Recognition(WordResult[] pWordSegResult, RowFirstDynamicArray <ChainContent> graphOptimum,
                                List <AtomNode> atomSegment, WordDictionary dictCore)
        {
            int consumedLength = 0;   // sum of sWord.Length over the atoms passed so far
            int atomIndex      = 0;

            // Tag the segmentation with unknown-recognition roles according to the
            // core dictionary and the unknown-recognition dictionary
            m_roleTag.POSTagging(pWordSegResult, dictCore, m_dict);

            for (int i = 0; i < m_roleTag.m_nUnknownWordsCount; i++)
            {
                // NOTE(review): m_nUnknownWords[i, 0] / [i, 1] look like the start and
                // end offsets of the i-th unknown word, measured in the same unit as
                // sWord.Length — confirm against the tagger.
                while (atomIndex < atomSegment.Count && consumedLength < m_roleTag.m_nUnknownWords[i, 0])
                {
                    consumedLength += atomSegment[atomIndex++].sWord.Length;
                }
                int atomStart = atomIndex;

                while (atomIndex < atomSegment.Count && consumedLength < m_roleTag.m_nUnknownWords[i, 1])
                {
                    consumedLength += atomSegment[atomIndex++].sWord.Length;
                }
                int atomEnd = atomIndex;

                if (atomStart >= atomEnd)
                {
                    // Empty atom span: nothing to record for this word
                    continue;
                }

                // Weight of the element already stored for this span, or the
                // "infinite" sentinel when there is none
                double currentWeight;
                ChainItem <ChainContent> existing = graphOptimum.GetElement(atomStart, atomEnd);
                if (existing != null)
                {
                    currentWeight = existing.Content.eWeight;
                }
                else
                {
                    currentWeight = Predefine.INFINITE_VALUE;
                }

                if (currentWeight > m_roleTag.m_dWordsPossibility[i])
                {
                    // Replace the element with the smaller value
                    graphOptimum.SetElement(atomStart, atomEnd, new ChainContent(m_roleTag.m_dWordsPossibility[i], m_nPOS, m_sUnknownFlags));
                }
            }
            return true;
        }
Example #7
0
        //====================================================================
        // Convert a BiPath into a WordLinkedArray.
        // Example, for the sentence “他说的确实在理”:
        // BiPath:(0, 1, 2, 3, 6, 9, 11, 12)
        //    0    1   2   3   4     5   6     7   8     9   10    11  12
        // 始##始  他  说  的  的确  确  确实  实  实在  在  在理  理  末##末
        // Each path entry is an index into segGraph.ToListItems(). Placeholder
        // words (未##人, 未##地, 未##数, 未##时, 未##串) get their text rebuilt
        // from the atoms in [row, col); all other words keep the graph's text.
        //====================================================================
        private static WordLinkedArray BiPath2LinkedArray(int[] biPath, RowFirstDynamicArray <ChainContent> segGraph, List <AtomNode> atomSegment)
        {
            List <ChainItem <ChainContent> > graphItems = segGraph.ToListItems();
            StringBuilder atomText = new StringBuilder();
            WordLinkedArray linked = new WordLinkedArray();

            foreach (int itemIndex in biPath)
            {
                ChainItem <ChainContent> graphItem = graphItems[itemIndex];

                WordNode node = new WordNode();
                node.row             = graphItem.row;
                node.col             = graphItem.col;
                node.sWordInSegGraph = graphItem.Content.sWord;
                node.theWord         = new WordResult();

                bool isPlaceholder;
                switch (node.sWordInSegGraph)
                {
                case "未##人":
                case "未##地":
                case "未##数":
                case "未##时":
                case "未##串":
                    isPlaceholder = true;
                    break;

                default:
                    isPlaceholder = false;
                    break;
                }

                if (isPlaceholder)
                {
                    // Rebuild the surface text from the atoms the node spans
                    atomText.Length = 0;
                    for (int atomIndex = node.row; atomIndex < node.col; atomIndex++)
                    {
                        atomText.Append(atomSegment[atomIndex].sWord);
                    }

                    node.theWord.sWord = atomText.ToString();
                }
                else
                {
                    node.theWord.sWord = graphItem.Content.sWord;
                }

                node.theWord.nPOS   = graphItem.Content.nPOS;
                node.theWord.dValue = graphItem.Content.eWeight;

                linked.AppendNode(node);
            }

            return linked;
        }
Example #8
0
 /// <summary>
 /// Publishes the text form of the given graph through <c>SendEvents</c>,
 /// tagged with the PersonAndPlaceRecognition stage.
 /// </summary>
 /// <param name="m_graphOptimum">Graph whose <c>ToString()</c> output is sent.</param>
 private void OnPersonAndPlaceRecognition(RowFirstDynamicArray <ChainContent> m_graphOptimum)
 {
     string graphText = m_graphOptimum.ToString();
     SegmentEventArgs stageArgs = new SegmentEventArgs(SegmentStage.PersonAndPlaceRecognition, graphText);

     SendEvents(stageArgs);
 }
Example #9
0
 /// <summary>
 /// Publishes the text form of the given graph through <c>SendEvents</c>,
 /// tagged with the OptimumSegment stage.
 /// </summary>
 /// <param name="m_graphOptimum">Graph whose <c>ToString()</c> output is sent.</param>
 private void OnOptimumSegment(RowFirstDynamicArray <ChainContent> m_graphOptimum)
 {
     string graphText = m_graphOptimum.ToString();
     SegmentEventArgs stageArgs = new SegmentEventArgs(SegmentStage.OptimumSegment, graphText);

     SendEvents(stageArgs);
 }
Example #10
0
 /// <summary>
 /// Publishes the text form of the segmentation graph through
 /// <c>SendEvents</c>, tagged with the GenSegGraph stage.
 /// </summary>
 /// <param name="segGraph">Graph whose <c>ToString()</c> output is sent.</param>
 private void OnGenSegGraph(RowFirstDynamicArray <ChainContent> segGraph)
 {
     string graphText = segGraph.ToString();
     SegmentEventArgs stageArgs = new SegmentEventArgs(SegmentStage.GenSegGraph, graphText);

     SendEvents(stageArgs);
 }
Example #11
0
        //====================================================================
        // Generate the words along a segmentation route.
        // The linked array is post-processed by four helpers, then copied into
        // a WordResult array; every node is also written into m_graphOptimum at
        // (row, col). Returns null when the linked array is empty.
        // uniPath is not read by this method.
        //====================================================================
        private static WordResult[] GenerateWord(int[] uniPath, WordLinkedArray linkedArray, RowFirstDynamicArray <ChainContent> m_graphOptimum)
        {
            if (linkedArray.Count == 0)
            {
                return null;
            }

            //--------------------------------------------------------------------
            // Merge all separate consecutive numbers into one number
            MergeContinueNumIntoOne(ref linkedArray);

            //--------------------------------------------------------------------
            // The delimiter "--"
            ChangeDelimiterPOS(ref linkedArray);

            //--------------------------------------------------------------------
            // If the previous word is a number and the current word starts with a
            // hyphen (full-width or half-width) but is longer than that single
            // character, split the hyphen off from the current word.
            // Example: "3 / -4 / 月" has to become "3 / - / 4 / 月"
            SplitMiddleSlashFromDigitalWords(ref linkedArray);

            //--------------------------------------------------------------------
            // 1. If the current word is a number and the next word is one of
            //    "月, 日, 时, 分, 秒, 月份", merge them; the result's POS is time.
            // 2. If the current word is a number usable as a year and the next
            //    word is "年", merge them with POS time; otherwise POS is number.
            // 3. If the last character is "点", the current number is a time.
            // 4. If the last character of the string is not one of "∶·./" or the
            //    half-width '.' '/', it is a number.
            // 5. If the last character is one of "∶·./" or the half-width '.' '/'
            //    and the length is greater than 1, drop that last character,
            //    e.g. "1."
            CheckDateElements(ref linkedArray);

            //--------------------------------------------------------------------
            // Produce the output
            WordResult[] words = new WordResult[linkedArray.Count];
            int slot = 0;

            for (WordNode node = linkedArray.first; node != null; node = node.next)
            {
                WordResult word = new WordResult();
                word.sWord  = node.theWord.sWord;
                word.nPOS   = node.theWord.nPOS;
                word.dValue = node.theWord.dValue;
                words[slot] = word;

                ChainContent graphContent = new ChainContent(word.dValue, word.nPOS, node.sWordInSegGraph);
                m_graphOptimum.SetElement(node.row, node.col, graphContent);

                slot++;
            }

            return words;
        }
Example #12
0
        //====================================================================
        // Generate the bi-gram graph linking every pair of adjacent words.
        // For each word in aWord and each word that starts in the row equal to
        // its col, an element is stored in the result at
        // (index of current word, index of next word), where the indexes come
        // from the position map built by PreparePositionMap. The element holds
        // the smoothed -log weight, the current word's POS and the joined text
        // "current + Predefine.WORD_SEGMENTER + next".
        //====================================================================
        public static ColumnFirstDynamicArray <ChainContent> BiGraphGenerate(
            RowFirstDynamicArray <ChainContent> aWord, double smoothPara, WordDictionary biDict, WordDictionary coreDict)
        {
            ColumnFirstDynamicArray <ChainContent> biWordNet = new ColumnFirstDynamicArray <ChainContent>();
            StringBuilder pairText = new StringBuilder();

            // Record the position map of possible words
            int[] positionMap = PreparePositionMap(aWord);

            // Smoothing constant; it does not depend on the words involved
            double dTemp = 1.0 / Predefine.MAX_FREQUENCE;

            for (ChainItem <ChainContent> cur = aWord.GetHead(); cur != null; cur = cur.next)
            {
                // A negative POS marks an unknown word, whose frequency is
                // looked up in the core dictionary instead of taken from the graph
                double dCurFreqency;
                if (cur.Content.nPOS < 0)
                {
                    dCurFreqency = coreDict.GetFrequency(cur.Content.sWord, 2);
                }
                else
                {
                    dCurFreqency = cur.Content.eWeight;
                }

                // Visit the words that begin at cur.col (note: a rather special correspondence)
                for (ChainItem <ChainContent> next = aWord.GetFirstElementOfRow(cur.col);
                     next != null && next.row == cur.col;
                     next = next.next)
                {
                    pairText.Length = 0;
                    pairText.Append(cur.Content.sWord);
                    pairText.Append(Predefine.WORD_SEGMENTER);
                    pairText.Append(next.Content.sWord);
                    string sTwoWords = pairText.ToString();

                    // Frequency of the two linked words
                    int nTwoWordsFreq = biDict.GetFrequency(sTwoWords, 3);

                    //-log{a*P(Ci-1)+(1-a)P(Ci|Ci-1)} Note 0<a<1
                    double dValue = -Math.Log(smoothPara * (1.0 + dCurFreqency) / (Predefine.MAX_FREQUENCE + 80000.0)
                                              + (1.0 - smoothPara) * ((1.0 - dTemp) * nTwoWordsFreq / (1.0 + dCurFreqency) +
                                                                      dTemp));

                    // Unknown words: P(Wi|Ci); known words: 1
                    if (cur.Content.nPOS < 0)
                    {
                        dValue += cur.Content.nPOS;
                    }

                    // Locate both words in the position map table
                    int curKey  = cur.row * Predefine.MAX_SENTENCE_LEN + cur.col;
                    int nextKey = next.row * Predefine.MAX_SENTENCE_LEN + next.col;
                    int nCurWordIndex  = Utility.BinarySearch(curKey, positionMap);
                    int nNextWordIndex = Utility.BinarySearch(nextKey, positionMap);

                    biWordNet.SetElement(nCurWordIndex, nNextWordIndex, new ChainContent(dValue, cur.Content.nPOS, sTwoWords));
                }
            }

            return biWordNet;
        }
Example #13
0
        //====================================================================
        // Func Name  : GenerateWordNet
        // Description: Generate the segmentation word net from the atoms of a
        //              sentence. First every atom is stored as a one-atom
        //              element (i, i + 1); then, for every start atom, growing
        //              concatenations of atoms are looked up in the core
        //              dictionary and each exact match is stored as (i, j).
        // Parameters : atomSegment: the atoms of the sentence
        //              coreDict   : core dictionary
        // Returns    : the generated word net (a new RowFirstDynamicArray)
        //====================================================================
        public static RowFirstDynamicArray <ChainContent> GenerateWordNet(List <AtomNode> atomSegment, WordDictionary coreDict)
        {
            string sWord = "", sMaxMatchWord;
            int    nPOSRet, nPOS, nTotalFreq;
            double dValue = 0;

            RowFirstDynamicArray <ChainContent> m_segGraph = new RowFirstDynamicArray <ChainContent>();

            m_segGraph.SetEmpty();

            // Store the atoms themselves in m_segGraph
            for (int i = 0; i < atomSegment.Count; i++)//Init the cost array
            {
                if (atomSegment[i].nPOS == Predefine.CT_CHINESE)
                {
                    // Chinese atoms are stored as-is with weight 0 and POS 0
                    m_segGraph.SetElement(i, i + 1, new ChainContent(0, 0, atomSegment[i].sWord));
                }
                else
                {
                    sWord  = atomSegment[i].sWord;//init the word
                    dValue = Predefine.MAX_FREQUENCE;
                    switch (atomSegment[i].nPOS)
                    {
                    case Predefine.CT_INDEX:
                    case Predefine.CT_NUM:
                        nPOS   = -27904;// -'m' * 256
                        sWord  = "未##数";
                        dValue = 0;
                        break;

                    case Predefine.CT_DELIMITER:
                        nPOS = 30464;//'w'*256;
                        break;

                    case Predefine.CT_LETTER:
                        nPOS   = -28280; // -'n' * 256 - 'x';
                        dValue = 0;
                        sWord  = "未##串";
                        break;

                    case Predefine.CT_SINGLE://12021-2129-3121
                        if (Regex.IsMatch(atomSegment[i].sWord, @"^(-?\d+)(\.\d+)?$"))
                        {
                                                // The atom is an optionally signed number with an optional fractional part
                            {
                                nPOS  = -27904; // -'m' * 256
                                sWord = "未##数";
                            }
                        }
                        else
                        {
                            nPOS  = -28280; // -'n' * 256 - 'x'
                            sWord = "未##串";
                        }
                        dValue = 0;
                        break;

                    default:
                        // Any other atom type keeps its own nPOS and the MAX_FREQUENCE weight
                        nPOS = atomSegment[i].nPOS;//'?'*256;
                        break;
                    }
                    m_segGraph.SetElement(i, i + 1, new ChainContent(dValue, nPOS, sWord));//init the link with minimum
                }
            }

            // Store every possible multi-atom word in m_segGraph
            for (int i = 0; i < atomSegment.Count; i++) //All the word
            {
                sWord = atomSegment[i].sWord;           //Get the current atom
                int j = i + 1;

                // sWord always covers atoms [i, j); it grows by one atom per pass.
                // NOTE(review): GetMaxMatch presumably reports the longest dictionary
                // entry that begins with sWord and returns false when none exists — confirm.
                while (j < atomSegment.Count && coreDict.GetMaxMatch(sWord, out sMaxMatchWord, out nPOSRet))
                {
                    if (sMaxMatchWord == sWord)                      // exactly the word we are looking for
                    {
                        WordInfo info = coreDict.GetWordInfo(sWord); // the word may have several parts of speech

                        // Sum the frequencies of all entries of the word
                        nTotalFreq = 0;
                        for (int k = 0; k < info.Count; k++)
                        {
                            nTotalFreq += info.Frequencies[k];
                        }

                        // Suppress certain special words: a two-character word that starts
                        // with "年" or "月" and directly follows an all-digit atom
                        if (sWord.Length == 2 && (sWord.StartsWith("年") || sWord.StartsWith("月")) && i >= 1 &&
                            (Utility.IsAllNum(atomSegment[i - 1].sWord) ||
                             Utility.IsAllChineseNum(atomSegment[i - 1].sWord)))
                        {
                            // e.g. "1年内", "1999年末"
                            if ("末内中底前间初".IndexOf(sWord.Substring(1)) >= 0)
                            {
                                // Leaves the while loop: this word is not stored and no
                                // longer word starting at atom i is tried
                                break;
                            }
                        }

                        // If the word has exactly one part of speech store it, otherwise record the POS as 0
                        if (info.Count == 1)
                        {
                            m_segGraph.SetElement(i, j, new ChainContent(nTotalFreq, info.POSs[0], sWord));
                        }
                        else
                        {
                            m_segGraph.SetElement(i, j, new ChainContent(nTotalFreq, 0, sWord));
                        }
                    }

                    // Extend the candidate by the next atom
                    sWord += atomSegment[j++].sWord;
                }
            }
            return(m_segGraph);
        }