Esempio n. 1
0
        /// <summary>
        /// Fills this graph with one <see cref="SegTokenPair"/> for every token of
        /// <paramref name="segGraph"/> combined with each token found at the first
        /// occupied start offset at or after that token's end offset.
        /// </summary>
        /// <param name="segGraph">segmentation graph whose tokens are indexed and paired up</param>
        private void GenerateBiSegGraph(SegGraph segGraph)
        {
            double smoothing = 0.1;
            int    lastStart = segGraph.MaxStart;
            double epsilon   = 1.0 / Utility.MAX_FREQUENCE;

            // Keep the ordered, indexed token list; the pairs created below refer to token indices.
            segTokenList = segGraph.MakeIndex();

            // The sentence-start token begins at offset -1, so the scan starts there.
            for (int start = -1; start < lastStart; start++)
            {
                if (!segGraph.IsStartExist(start))
                {
                    continue;
                }

                foreach (SegToken current in segGraph.GetStartList(start))
                {
                    double currentFreq = current.Weight;

                    // Look for the closest start offset, at or after the end of the current
                    // token, that actually holds tokens. The upper bound is inclusive so the
                    // tokens stored at the last start offset can be reached.
                    IList <SegToken> followers = null;
                    for (int probe = current.EndOffset; probe <= lastStart; probe++)
                    {
                        if (segGraph.IsStartExist(probe))
                        {
                            followers = segGraph.GetStartList(probe);
                            break;
                        }
                    }

                    if (followers == null)
                    {
                        // Nothing follows this token: stop handling the tokens of this start offset.
                        break;
                    }

                    char[] left    = current.CharArray;
                    int    leftLen = left.Length;

                    foreach (SegToken follower in followers)
                    {
                        char[] right    = follower.CharArray;
                        int    rightLen = right.Length;

                        // Pair id = current chars + separator + follower chars.
                        char[] pairId = new char[leftLen + rightLen + 1];
                        System.Array.Copy(left, 0, pairId, 0, leftLen);
                        pairId[leftLen] = BigramDictionary.WORD_SEGMENT_CHAR;
                        System.Array.Copy(right, 0, pairId, leftLen + 1, rightLen);

                        // Frequency of the two words appearing next to each other.
                        int pairFreq = bigramDict.GetFrequency(pairId);

                        // Smoothed weight: -log{a*P(Ci-1) + (1-a)*P(Ci|Ci-1)}, with 0 < a < 1.
                        double unigramTerm = smoothing
                                             * (1.0 + currentFreq)
                                             / (Utility.MAX_FREQUENCE + 0.0);
                        double bigramTerm = (1.0 - smoothing)
                                            * ((1.0 - epsilon) * pairFreq / (1.0 + currentFreq) + epsilon);
                        double pairWeight = -Math.Log(unigramTerm + bigramTerm);

                        this.AddSegTokenPair(new SegTokenPair(pairId, current.Index,
                                                              follower.Index, pairWeight));
                    }
                }
            }
        }
Esempio n. 2
0
        /// <summary>
        /// Create the <see cref="SegGraph"/> for a sentence.
        /// </summary>
        /// <remarks>
        /// The sentence is scanned left to right and dispatched on the character type at the
        /// current position: space-like characters are skipped; a Hanzi character yields a
        /// single-character token plus one token per longer dictionary word starting there;
        /// a run of letters (or digits) yields one placeholder token for the whole run; a
        /// delimiter yields a one-character token; any other character yields a placeholder
        /// string token. A sentence-begin token at [-1, 0) and a sentence-end token at
        /// [length, length + 1) are added last.
        /// </remarks>
        /// <param name="sentence">input sentence, without start and end markers</param>
        /// <returns><see cref="SegGraph"/> corresponding to the input sentence.</returns>
        private SegGraph CreateSegGraph(string sentence)
        {
            int i = 0, j;
            int length = sentence.Length;
            int foundIndex;

            CharType[]    charTypeArray = GetCharTypes(sentence);
            StringBuilder wordBuf       = new StringBuilder();
            SegToken      token;
            int           frequency;    // the number of times word appears.
            bool          hasFullWidth;
            WordType      wordType;

            char[] charArray;

            SegGraph segGraph = new SegGraph();

            while (i < length)
            {
                switch (charTypeArray[i])
                {
                case CharType.SPACE_LIKE:
                    i++;
                    break;

                case CharType.HANZI:
                    j = i + 1;
                    wordBuf.Length = 0;
                    // It doesn't matter if a single Chinese character (Hanzi) can form a phrase or not,
                    // it will store that single Chinese character (Hanzi) in the SegGraph.  Otherwise, it will
                    // cause word division.
                    wordBuf.Append(sentence[i]);
                    charArray = new char[] { sentence[i] };
                    frequency = wordDict.GetFrequency(charArray);
                    token     = new SegToken(charArray, i, j, WordType.CHINESE_WORD,
                                             frequency);
                    segGraph.AddToken(token);

                    // Keep extending the candidate while it is still reported as a prefix
                    // (the loop treats -1 as "no match").
                    foundIndex = wordDict.GetPrefixMatch(charArray);
                    while (j <= length && foundIndex != -1)
                    {
                        if (wordDict.IsEqual(charArray, foundIndex) && charArray.Length > 1)
                        {
                            // It is the phrase we are looking for; In other words, we have found a phrase SegToken
                            // from i to j.  It is not a monosyllabic word (single word).
                            frequency = wordDict.GetFrequency(charArray);
                            token     = new SegToken(charArray, i, j, WordType.CHINESE_WORD,
                                                     frequency);
                            segGraph.AddToken(token);
                        }

                        // Space-like characters between Hanzi do not end the candidate word.
                        while (j < length && charTypeArray[j] == CharType.SPACE_LIKE)
                        {
                            j++;
                        }

                        if (j < length && charTypeArray[j] == CharType.HANZI)
                        {
                            wordBuf.Append(sentence[j]);
                            charArray = new char[wordBuf.Length];
                            wordBuf.CopyTo(0, charArray, 0, charArray.Length);
                            // charArray was already matched as a prefix at foundIndex, so the
                            // lengthened charArray is searched for starting from foundIndex.
                            foundIndex = wordDict.GetPrefixMatch(charArray, foundIndex);
                            j++;
                        }
                        else
                        {
                            break;
                        }
                    }
                    // Advance by one character only, so the next position is examined as well.
                    i++;
                    break;

                // C# has no implicit fall-through, so both widths share one section via stacked labels.
                case CharType.FULLWIDTH_LETTER:
                case CharType.LETTER:
                    hasFullWidth = charTypeArray[i] == CharType.FULLWIDTH_LETTER;
                    j = i + 1;
                    while (j < length &&
                           (charTypeArray[j] == CharType.LETTER || charTypeArray[j] == CharType.FULLWIDTH_LETTER))
                    {
                        if (charTypeArray[j] == CharType.FULLWIDTH_LETTER)
                        {
                            hasFullWidth = true;
                        }
                        j++;
                    }
                    // Found a Token from i to j. Type is LETTER char string.
                    charArray = Utility.STRING_CHAR_ARRAY;
                    frequency = wordDict.GetFrequency(charArray);
                    wordType  = hasFullWidth ? WordType.FULLWIDTH_STRING : WordType.STRING;
                    token     = new SegToken(charArray, i, j, wordType, frequency);
                    segGraph.AddToken(token);
                    i = j;
                    break;

                // Same sharing for half-width and full-width digits.
                case CharType.FULLWIDTH_DIGIT:
                case CharType.DIGIT:
                    hasFullWidth = charTypeArray[i] == CharType.FULLWIDTH_DIGIT;
                    j = i + 1;
                    while (j < length &&
                           (charTypeArray[j] == CharType.DIGIT || charTypeArray[j] == CharType.FULLWIDTH_DIGIT))
                    {
                        if (charTypeArray[j] == CharType.FULLWIDTH_DIGIT)
                        {
                            hasFullWidth = true;
                        }
                        j++;
                    }
                    // Found a Token from i to j. Type is NUMBER char string.
                    charArray = Utility.NUMBER_CHAR_ARRAY;
                    frequency = wordDict.GetFrequency(charArray);
                    wordType  = hasFullWidth ? WordType.FULLWIDTH_NUMBER : WordType.NUMBER;
                    token     = new SegToken(charArray, i, j, wordType, frequency);
                    segGraph.AddToken(token);
                    i = j;
                    break;

                case CharType.DELIMITER:
                    j = i + 1;
                    // No need to search the weight for the punctuation.  Picking the highest frequency will work.
                    frequency = Utility.MAX_FREQUENCE;
                    charArray = new char[] { sentence[i] };
                    token     = new SegToken(charArray, i, j, WordType.DELIMITER, frequency);
                    segGraph.AddToken(token);
                    i = j;
                    break;

                default:
                    j = i + 1;
                    // Treat the unrecognized char symbol as unknown string.
                    // For example, any symbol not in GB2312 is treated as one of these.
                    charArray = Utility.STRING_CHAR_ARRAY;
                    frequency = wordDict.GetFrequency(charArray);
                    token     = new SegToken(charArray, i, j, WordType.STRING, frequency);
                    segGraph.AddToken(token);
                    i = j;
                    break;
                }
            }

            // Add the sentence-begin marker token, positioned just before the first character.
            charArray = Utility.START_CHAR_ARRAY;
            frequency = wordDict.GetFrequency(charArray);
            token     = new SegToken(charArray, -1, 0, WordType.SENTENCE_BEGIN, frequency);
            segGraph.AddToken(token);

            // Add the sentence-end marker token, positioned just after the last character.
            charArray = Utility.END_CHAR_ARRAY;
            frequency = wordDict.GetFrequency(charArray);
            token     = new SegToken(charArray, length, length + 1, WordType.SENTENCE_END,
                                     frequency);
            segGraph.AddToken(token);

            return segGraph;
        }
Esempio n. 3
0
 /// <summary>
 /// Builds a <see cref="BiSegGraph"/> from the tokens of the given <see cref="SegGraph"/>.
 /// </summary>
 /// <param name="segGraph">segmentation graph to index and turn into token pairs</param>
 public BiSegGraph(SegGraph segGraph)
 {
     // Store the indexed token list first, then derive the token pairs from the same graph.
     // NOTE(review): GenerateBiSegGraph appears to assign segTokenList from MakeIndex() again;
     // confirm MakeIndex() is idempotent before dropping either call.
     segTokenList = segGraph.MakeIndex();

     this.GenerateBiSegGraph(segGraph);
 }