Пример #1
0
        /// <summary>
        /// Finds the minimum-weight path from node 0 to node <c>ToCount</c> and returns the
        /// <see cref="SegToken"/>s found in <c>segTokenList</c> at the node indices along that path,
        /// ordered from node 0 to the last node.
        /// </summary>
        /// <remarks>
        /// For every node the cheapest incoming edge (as returned by <c>GetToList</c>) is chosen.
        /// If a node has no incoming edge, no edge is selected and dereferencing it throws a
        /// <see cref="System.NullReferenceException"/>.
        /// </remarks>
        /// <returns>the tokens along the cheapest path, in forward order.</returns>
        public virtual IList<SegToken> GetShortPath()
        {
            int lastIndex = ToCount;
            IList<PathNode> nodes = new List<PathNode>();

            // Node 0 is the origin: zero accumulated weight, and it is its own predecessor.
            nodes.Add(new PathNode { Weight = 0, PreNode = 0 });

            // Forward pass: nodes[k] records the cheapest accumulated weight to reach k
            // and the node that path came from.
            for (int target = 1; target <= lastIndex; target++)
            {
                double bestWeight = double.MaxValue;
                SegTokenPair bestEdge = null;

                foreach (SegTokenPair candidate in GetToList(target))
                {
                    double total = nodes[candidate.From].Weight + candidate.Weight;
                    if (total < bestWeight)
                    {
                        bestWeight = total;
                        bestEdge = candidate;
                    }
                }

                nodes.Add(new PathNode { Weight = bestWeight, PreNode = bestEdge.From });
            }

            // Backward pass: follow the predecessor links from the last node to node 0.
            IList<int> backwards = new List<int>();
            IList<SegToken> tokens = new List<SegToken>();
            int cursor = nodes.Count - 1;

            backwards.Add(cursor);
            while (cursor != 0)
            {
                cursor = nodes[cursor].PreNode;
                backwards.Add(cursor);
            }

            // The ids were collected end-to-start, so read them back in reverse.
            int position = backwards.Count;
            while (--position >= 0)
            {
                tokens.Add(segTokenList[backwards[position]]);
            }

            return tokens;
        }
Пример #2
0
        /// <summary>
        /// Determines whether <paramref name="obj"/> is equal to this token.
        /// See <see cref="object.Equals(object)"/>.
        /// </summary>
        /// <param name="obj">the object to compare with; may be <c>null</c>.</param>
        /// <returns>
        /// <c>true</c> when <paramref name="obj"/> is this same instance, or has the same runtime type and
        /// matching <c>CharArray</c> (compared through <c>Arrays.Equals</c>), <c>EndOffset</c>,
        /// <c>Index</c>, <c>StartOffset</c>, <c>Weight</c> and <c>WordType</c>; otherwise <c>false</c>.
        /// </returns>
        public override bool Equals(object obj)
        {
            // The very same instance is trivially equal.
            if (ReferenceEquals(this, obj))
            {
                return true;
            }

            // Null, or an object of another runtime type, can never be equal.
            if (ReferenceEquals(obj, null) || GetType() != obj.GetType())
            {
                return false;
            }

            SegToken that = (SegToken)obj;

            // Compared in a fixed order; evaluation stops at the first mismatch.
            return Arrays.Equals(CharArray, that.CharArray)
                   && EndOffset == that.EndOffset
                   && Index == that.Index
                   && StartOffset == that.StartOffset
                   && Weight == that.Weight
                   && WordType == that.WordType;
        }
Пример #3
0
        /// <summary>
        /// Registers a <see cref="SegToken"/> under its start offset. When no token list exists yet for
        /// that offset a new one is created; otherwise the token is appended to the existing list.
        /// The largest start offset seen so far is tracked in <c>maxStart</c>.
        /// </summary>
        /// <param name="token">the <see cref="SegToken"/> to add.</param>
        public virtual void AddToken(SegToken token)
        {
            int start = token.StartOffset;

            if (IsStartExist(start))
            {
                // A list is already registered for this offset: append to it.
                tokenListTable[start].Add(token);
            }
            else
            {
                // First token starting at this offset: register a fresh list holding it.
                tokenListTable[start] = new List<SegToken> { token };
            }

            if (maxStart < start)
            {
                maxStart = start;
            }
        }
Пример #4
0
        /// <summary>
        /// Create the <see cref="SegGraph"/> for a sentence.
        /// </summary>
        /// <remarks>
        /// The sentence is scanned left to right and a <see cref="SegToken"/> is added for:
        /// every single Hanzi character, every longer Hanzi phrase that the dictionary reports as an
        /// exact match, every run of letters, every run of digits, every delimiter and every other
        /// non-space character. Space-like characters are skipped. Finally a sentence-begin token
        /// (offsets -1..0) and a sentence-end token (offsets length..length+1) are added.
        /// </remarks>
        /// <param name="sentence">input sentence, without start and end markers</param>
        /// <returns><see cref="SegGraph"/> corresponding to the input sentence.</returns>
        private SegGraph CreateSegGraph(string sentence)
        {
            int i = 0, j;
            int length = sentence.Length;
            int foundIndex;

            CharType[]    charTypeArray = GetCharTypes(sentence);
            StringBuilder wordBuf       = new StringBuilder();
            SegToken      token;
            int           frequency; // the number of times word appears.
            bool          hasFullWidth;
            WordType      wordType;

            char[] charArray;

            SegGraph segGraph = new SegGraph();

            while (i < length)
            {
                switch (charTypeArray[i])
                {
                case CharType.SPACE_LIKE:
                    i++;
                    break;

                case CharType.HANZI:
                    j = i + 1;
                    wordBuf.Remove(0, wordBuf.Length);
                    // It doesn't matter if a single Chinese character (Hanzi) can form a phrase or not,
                    // it will store that single Chinese character (Hanzi) in the SegGraph.  Otherwise, it will
                    // cause word division.
                    wordBuf.Append(sentence[i]);
                    charArray = new char[] { sentence[i] };
                    frequency = wordDict.GetFrequency(charArray);
                    token     = new SegToken(charArray, i, j, WordType.CHINESE_WORD,
                                             frequency);
                    segGraph.AddToken(token);

                    // Keep lengthening the candidate for as long as the dictionary still knows it as a prefix.
                    foundIndex = wordDict.GetPrefixMatch(charArray);
                    while (j <= length && foundIndex != -1)
                    {
                        if (wordDict.IsEqual(charArray, foundIndex) && charArray.Length > 1)
                        {
                            // It is the phrase we are looking for; In other words, we have found a phrase SegToken
                            // from i to j.  It is not a monosyllabic word (single word).
                            frequency = wordDict.GetFrequency(charArray);
                            token     = new SegToken(charArray, i, j, WordType.CHINESE_WORD,
                                                     frequency);
                            segGraph.AddToken(token);
                        }

                        // Space-like characters inside a phrase are skipped over, not appended to it.
                        while (j < length && charTypeArray[j] == CharType.SPACE_LIKE)
                        {
                            j++;
                        }

                        if (j < length && charTypeArray[j] == CharType.HANZI)
                        {
                            wordBuf.Append(sentence[j]);
                            charArray = new char[wordBuf.Length];
                            wordBuf.CopyTo(0, charArray, 0, charArray.Length);
                            // charArray has been found (foundIndex != -1) as a prefix before.
                            // Therefore, charArray after it has been lengthened can only appear after foundIndex.
                            // So start searching after foundIndex.
                            foundIndex = wordDict.GetPrefixMatch(charArray, foundIndex);
                            j++;
                        }
                        else
                        {
                            break;
                        }
                    }
                    // Advance by one character only, so that phrases starting at the next Hanzi are found too.
                    i++;
                    break;

                // C# has no implicit fall-through, so both letter kinds share one section through
                // stacked labels; only the initial full-width flag depends on the starting character.
                case CharType.FULLWIDTH_LETTER:
                case CharType.LETTER:
                    hasFullWidth = charTypeArray[i] == CharType.FULLWIDTH_LETTER;

                    j = i + 1;
                    while (j < length &&
                           (charTypeArray[j] == CharType.LETTER || charTypeArray[j] == CharType.FULLWIDTH_LETTER))
                    {
                        if (charTypeArray[j] == CharType.FULLWIDTH_LETTER)
                        {
                            hasFullWidth = true;
                        }
                        j++;
                    }
                    // Found a Token from i to j. Type is LETTER char string.
                    charArray = Utility.STRING_CHAR_ARRAY;
                    frequency = wordDict.GetFrequency(charArray);
                    wordType  = hasFullWidth ? WordType.FULLWIDTH_STRING : WordType.STRING;
                    token     = new SegToken(charArray, i, j, wordType, frequency);
                    segGraph.AddToken(token);
                    i = j;
                    break;

                // Same arrangement as for letters: both digit kinds share one section.
                case CharType.FULLWIDTH_DIGIT:
                case CharType.DIGIT:
                    hasFullWidth = charTypeArray[i] == CharType.FULLWIDTH_DIGIT;

                    j = i + 1;
                    while (j < length &&
                           (charTypeArray[j] == CharType.DIGIT || charTypeArray[j] == CharType.FULLWIDTH_DIGIT))
                    {
                        if (charTypeArray[j] == CharType.FULLWIDTH_DIGIT)
                        {
                            hasFullWidth = true;
                        }
                        j++;
                    }
                    // Found a Token from i to j. Type is NUMBER char string.
                    charArray = Utility.NUMBER_CHAR_ARRAY;
                    frequency = wordDict.GetFrequency(charArray);
                    wordType  = hasFullWidth ? WordType.FULLWIDTH_NUMBER : WordType.NUMBER;
                    token     = new SegToken(charArray, i, j, wordType, frequency);
                    segGraph.AddToken(token);
                    i = j;
                    break;

                case CharType.DELIMITER:
                    j = i + 1;
                    // No need to search the weight for the punctuation.  Picking the highest frequency will work.
                    frequency = Utility.MAX_FREQUENCE;
                    charArray = new char[] { sentence[i] };
                    token     = new SegToken(charArray, i, j, WordType.DELIMITER, frequency);
                    segGraph.AddToken(token);
                    i = j;
                    break;

                default:
                    j = i + 1;
                    // Treat the unrecognized char symbol as unknown string.
                    // For example, any symbol not in GB2312 is treated as one of these.
                    charArray = Utility.STRING_CHAR_ARRAY;
                    frequency = wordDict.GetFrequency(charArray);
                    token     = new SegToken(charArray, i, j, WordType.STRING, frequency);
                    segGraph.AddToken(token);
                    i = j;
                    break;
                }
            }

            // Add two more Tokens: "beginning xx beginning"
            charArray = Utility.START_CHAR_ARRAY;
            frequency = wordDict.GetFrequency(charArray);
            token     = new SegToken(charArray, -1, 0, WordType.SENTENCE_BEGIN, frequency);
            segGraph.AddToken(token);

            // "end xx end"
            charArray = Utility.END_CHAR_ARRAY;
            frequency = wordDict.GetFrequency(charArray);
            token     = new SegToken(charArray, length, length + 1, WordType.SENTENCE_END,
                                     frequency);
            segGraph.AddToken(token);

            return(segGraph);
        }