Esempio n. 1
0
        //====================================================================
        // Prepare the position map, used to record the position of each word
        //====================================================================
        /// <summary>
        /// Builds a lookup table holding one encoded position per word of
        /// <paramref name="aWord"/>, in the order the chain is traversed from its head.
        /// Each entry is <c>row * Predefine.MAX_SENTENCE_LEN + col</c>.
        /// </summary>
        /// <param name="aWord">The word chain whose entries are to be indexed.</param>
        /// <returns>
        /// The table of encoded positions, or <c>null</c> when the chain reports
        /// a word count that is not positive.
        /// </returns>
        private static int[] PreparePositionMap(RowFirstDynamicArray <ChainContent> aWord)
        {
            // GetTail yields the tail node through its out parameter and returns the
            // number of words; only the count is needed here.
            ChainItem<ChainContent> tail;
            int wordCount = aWord.GetTail(out tail);

            int[] positionMap = wordCount > 0 ? new int[wordCount] : null;

            // Walk the chain from the head, encoding (row, col) into a single int per word.
            int slot = 0;
            for (ChainItem<ChainContent> node = aWord.GetHead(); node != null; node = node.next)
            {
                positionMap[slot] = node.row * Predefine.MAX_SENTENCE_LEN + node.col;
                slot++;
            }

            return positionMap;
        }
Esempio n. 2
0
        //====================================================================
        // Generate the bigram graph table between each pair of adjacent words
        //====================================================================
        /// <summary>
        /// Builds the bigram graph for a word chain: for every word and every word that
        /// starts where it ends (next.row == current.col), stores a smoothed
        /// negative-log cost for the pair.
        /// </summary>
        /// <param name="aWord">The chain of candidate words.</param>
        /// <param name="smoothPara">
        /// Smoothing weight <c>a</c> in <c>-log{a*P(Ci-1) + (1-a)*P(Ci|Ci-1)}</c>; expected to satisfy 0 &lt; a &lt; 1.
        /// </param>
        /// <param name="biDict">Dictionary queried for the frequency of the joined word pair.</param>
        /// <param name="coreDict">Dictionary queried for the frequency of words whose nPOS is negative.</param>
        /// <returns>
        /// A graph whose element (i, j) describes the transition from the i-th to the j-th
        /// word, where i and j are indices into the position map built from <paramref name="aWord"/>.
        /// </returns>
        public static ColumnFirstDynamicArray <ChainContent> BiGraphGenerate(
            RowFirstDynamicArray <ChainContent> aWord, double smoothPara, WordDictionary biDict, WordDictionary coreDict)
        {
            ColumnFirstDynamicArray<ChainContent> biGraph = new ColumnFirstDynamicArray<ChainContent>();
            StringBuilder pairBuilder = new StringBuilder();

            // Encoded positions of all candidate words; used to turn (row, col) into graph indices.
            int[] positionMap = PreparePositionMap(aWord);

            for (ChainItem<ChainContent> current = aWord.GetHead(); current != null; current = current.next)
            {
                double currentFreq;
                if (current.Content.nPOS < 0)
                {
                    // Unknown word: take its frequency from the core dictionary.
                    currentFreq = coreDict.GetFrequency(current.Content.sWord, 2);
                }
                else
                {
                    // Known word: its weight already carries the frequency.
                    currentFreq = current.Content.eWeight;
                }

                // Candidate successors are the words whose row equals the current word's col.
                for (ChainItem<ChainContent> follower = aWord.GetFirstElementOfRow(current.col);
                     follower != null && follower.row == current.col;
                     follower = follower.next)
                {
                    // Key for the pair: "<current><segmenter><follower>".
                    pairBuilder.Length = 0;
                    pairBuilder.Append(current.Content.sWord)
                               .Append(Predefine.WORD_SEGMENTER)
                               .Append(follower.Content.sWord);
                    string pairKey = pairBuilder.ToString();

                    // Frequency of the two words appearing together.
                    int pairFreq = biDict.GetFrequency(pairKey, 3);

                    // Smoothing floor.
                    double floor = 1.0 / Predefine.MAX_FREQUENCE;

                    // -log{a*P(Ci-1) + (1-a)*P(Ci|Ci-1)}, with 0 < a < 1.
                    double unigramTerm = smoothPara * (1.0 + currentFreq) / (Predefine.MAX_FREQUENCE + 80000.0);
                    double bigramTerm = (1.0 - smoothPara) * ((1.0 - floor) * pairFreq / (1.0 + currentFreq) + floor);
                    double cost = -Math.Log(unigramTerm + bigramTerm);

                    // Unknown words: P(Wi|Ci); known words contribute 1 (nothing to add).
                    if (current.Content.nPOS < 0)
                    {
                        cost += current.Content.nPOS;
                    }

                    // Translate both words' encoded positions into indices of the position map.
                    int currentIndex = Utility.BinarySearch(
                        current.row * Predefine.MAX_SENTENCE_LEN + current.col, positionMap);
                    int followerIndex = Utility.BinarySearch(
                        follower.row * Predefine.MAX_SENTENCE_LEN + follower.col, positionMap);

                    biGraph.SetElement(currentIndex, followerIndex,
                                       new ChainContent(cost, current.Content.nPOS, pairKey));
                }
            }

            return biGraph;
        }