Example #1
0
        /// <summary>
        /// Populates this graph with one <see cref="SegTokenPair"/> for every pair of
        /// consecutive tokens found in the supplied <see cref="SegGraph"/>.
        /// </summary>
        /// <remarks>
        /// For each token, the successors are the tokens stored at the first existing start
        /// position at or after the token's <c>EndOffset</c>. Each pair is weighted with a
        /// smoothed negative log probability built from the first token's weight and the
        /// pair frequency reported by <c>bigramDict</c>.
        /// </remarks>
        /// <param name="segGraph">The token graph whose tokens are paired up.</param>
        private void GenerateBiSegGraph(SegGraph segGraph)
        {
            int lastStart = segGraph.MaxStart;

            // Constants of the smoothing formula; none of them change while iterating.
            double smoothing = 0.1;
            double frequencyCeiling = Utility.MAX_FREQUENCE + 0.0;
            double floorProbability = 1.0 / Utility.MAX_FREQUENCE;

            // Keep the ordered, indexed token list produced by the source graph.
            segTokenList = segGraph.MakeIndex();

            // Scanning begins at -1 because, per the original author's note, that is the
            // start position under which the start token is stored.
            for (int start = -1; start < lastStart; start++)
            {
                if (!segGraph.IsStartExist(start))
                {
                    continue;
                }

                IList<SegToken> currentTokens = segGraph.GetStartList(start);

                foreach (SegToken first in currentTokens)
                {
                    double firstFreq = first.Weight;

                    // Walk forward from the end of the current token to the nearest start
                    // position that holds tokens. The bound is inclusive so that the end
                    // token (stored at the sentence length, per the original author's
                    // note) can be reached.
                    IList<SegToken> followers = null;
                    for (int position = first.EndOffset; position <= lastStart; position++)
                    {
                        if (segGraph.IsStartExist(position))
                        {
                            followers = segGraph.GetStartList(position);
                            break;
                        }
                    }

                    // No successor at all: stop handling the remaining tokens of this
                    // start position and move on to the next one.
                    if (followers == null)
                    {
                        break;
                    }

                    char[] firstChars = first.CharArray;
                    int firstLength = firstChars.Length;

                    foreach (SegToken second in followers)
                    {
                        char[] secondChars = second.CharArray;

                        // Pair id layout: <first chars> WORD_SEGMENT_CHAR <second chars>.
                        char[] pairId = new char[firstLength + secondChars.Length + 1];
                        System.Array.Copy(firstChars, 0, pairId, 0, firstLength);
                        pairId[firstLength] = BigramDictionary.WORD_SEGMENT_CHAR;
                        System.Array.Copy(secondChars, 0, pairId, firstLength + 1, secondChars.Length);

                        // Frequency of the two words appearing next to each other.
                        int pairFreq = bigramDict.GetFrequency(pairId);

                        // Smoothing: -log{a*P(Ci-1) + (1-a)*P(Ci|Ci-1)}, with 0 < a < 1.
                        double unigramTerm = smoothing * (1.0 + firstFreq) / frequencyCeiling;
                        double bigramTerm = (1.0 - floorProbability) * pairFreq / (1.0 + firstFreq)
                                            + floorProbability;
                        double pairWeight = -Math.Log(unigramTerm + (1.0 - smoothing) * bigramTerm);

                        this.AddSegTokenPair(new SegTokenPair(pairId, first.Index, second.Index, pairWeight));
                    }
                }
            }
        }
Example #2
0
 /// <summary>
 /// Initializes a new <see cref="BiSegGraph"/> from the tokens of a <see cref="SegGraph"/>.
 /// </summary>
 /// <param name="segGraph">The source token graph; must not be <c>null</c>.</param>
 /// <exception cref="System.ArgumentNullException">
 /// <paramref name="segGraph"/> is <c>null</c>.
 /// </exception>
 public BiSegGraph(SegGraph segGraph)
 {
     // Fail fast with a descriptive exception instead of a NullReferenceException
     // raised from the first member access on the argument.
     if (segGraph == null)
     {
         throw new System.ArgumentNullException("segGraph");
     }

     // NOTE(review): GenerateBiSegGraph appears to assign segTokenList from
     // MakeIndex() as well, so this call looks redundant - confirm that MakeIndex()
     // is idempotent before removing either call.
     segTokenList = segGraph.MakeIndex();
     GenerateBiSegGraph(segGraph);
 }