コード例 #1
0
        /// <summary>
        /// Decodes the current element of <c>entriesIter</c> into the <c>entry</c> field
        /// and returns that field.
        /// </summary>
        /// <remarks>
        /// Entries are typically laid out as <c>inflected&lt;SEP&gt;codedBase&lt;SEP&gt;tag</c>.
        /// The same <c>entry</c> object is overwritten and returned by every call.
        /// </remarks>
        /// <returns>The <c>entry</c> field, updated with the inflected form, stem and tag bytes.</returns>
        /// <exception cref="FormatException">The current entry contains no separator byte.</exception>
        private WordData Next()
        {
            ByteBuffer entryBuffer = entriesIter.Current;

            byte[] ba = entryBuffer.Array;
            int bbSize = entryBuffer.Remaining;

            // Locate the first separator; the bytes before it are the inflected form.
            int sepPos = 0;
            while (sepPos < bbSize && ba[sepPos] != separator)
            {
                sepPos++;
            }

            if (sepPos == bbSize)
            {
                // A specific exception type instead of the bare System.Exception;
                // handlers that catch Exception still see it.
                throw new FormatException("Invalid dictionary entry format (missing separator).");
            }

            inflectedBuffer = BufferUtils.ClearAndEnsureCapacity(inflectedBuffer, sepPos);
            inflectedBuffer.Put(ba, 0, sepPos);
            inflectedBuffer.Flip();

            inflectedCharBuffer = BufferUtils.BytesToChars(decoder, inflectedBuffer, inflectedCharBuffer);
            entry.Update(inflectedBuffer, inflectedCharBuffer);

            // Copy everything after the first separator into the scratch buffer and
            // continue scanning there. The capacity is requested before the separator
            // is skipped, so it is one byte larger than the copied payload.
            temp = BufferUtils.ClearAndEnsureCapacity(temp, bbSize - sepPos);
            sepPos++;
            temp.Put(ba, sepPos, bbSize - sepPos);
            temp.Flip();

            ba = temp.Array;
            bbSize = temp.Remaining;

            /*
             * Find the next separator byte's position splitting word form and tag.
             * The scan starts after the encoder's prefix bytes.
             */
#pragma warning disable 612, 618
            Debug.Assert(sequenceEncoder.PrefixBytes <= bbSize, sequenceEncoder.GetType() + " >? " + bbSize);
            sepPos = sequenceEncoder.PrefixBytes;
#pragma warning restore 612, 618
            while (sepPos < bbSize && ba[sepPos] != separator)
            {
                sepPos++;
            }

            /*
             * Decode the stem into stem buffer, or copy the still-encoded bytes
             * verbatim when stem decoding is switched off.
             */
            if (decodeStems)
            {
                entry.stemBuffer = sequenceEncoder.Decode(entry.stemBuffer,
                                                          inflectedBuffer,
                                                          ByteBuffer.Wrap(ba, 0, sepPos));
            }
            else
            {
                entry.stemBuffer = BufferUtils.ClearAndEnsureCapacity(entry.stemBuffer, sepPos);
                entry.stemBuffer.Put(ba, 0, sepPos);
                entry.stemBuffer.Flip();
            }

            // Skip separator character, if present.
            if (sepPos < bbSize)
            {
                sepPos++;
            }

            /*
             * Copy the tag data; the tag buffer is rewritten even when the tag is empty.
             */
            int tagSize = bbSize - sepPos;
            entry.tagBuffer = BufferUtils.ClearAndEnsureCapacity(entry.tagBuffer, tagSize);
            entry.tagBuffer.Put(ba, sepPos, tagSize);
            entry.tagBuffer.Flip();

            return entry;
        }
コード例 #2
0
        /// <summary>
        /// Searches the automaton for a symbol sequence equal to <paramref name="word"/>,
        /// followed by a separator. The result is a stem (decompressed accordingly
        /// to the dictionary's specification) and an optional tag data.
        /// </summary>
        /// <remarks>
        /// The returned list is the <c>formsList</c> field, re-wrapped on every call, and its
        /// elements are taken from the pooled <c>forms</c> array, so a later call overwrites
        /// the results of an earlier one.
        /// </remarks>
        /// <param name="word">
        /// The word to look up. The dictionary's input conversion pairs, if any, are applied
        /// to it before matching.
        /// </param>
        /// <returns>
        /// The <c>formsList</c> field wrapping the matched forms. It is returned right after
        /// being reset when the word contains the separator character, cannot be encoded,
        /// or has no matching entry.
        /// </returns>
        public IList<WordData> Lookup(ICharSequence word)
        {
            byte separator = dictionaryMetadata.Separator;

#pragma warning disable 612, 618
            int prefixBytes = sequenceEncoder.PrefixBytes;
#pragma warning restore 612, 618

            if (dictionaryMetadata.InputConversionPairs.Any())
            {
                word = ApplyReplacements(word, dictionaryMetadata.InputConversionPairs);
            }

            // Reset the output list to zero length.
            formsList.Wrap(forms, 0, 0);

            // Encode word characters into bytes in the same encoding as the FSA's.
            charBuffer = BufferUtils.ClearAndEnsureCapacity(charBuffer, word.Length);
            for (int i = 0; i < word.Length; i++)
            {
                char chr = word[i];
                if (chr == separatorChar)
                {
                    // No valid input can contain the separator.
                    return formsList;
                }
                charBuffer.Put(chr);
            }
            charBuffer.Flip();

            try
            {
                byteBuffer = BufferUtils.CharsToBytes(encoder, charBuffer, byteBuffer);
            }
            catch (UnmappableInputException)
            {
                // This should be a rare occurrence, but if it happens it means there is no way
                // the dictionary can contain the input word.
                return formsList;
            }

            // Try to find a partial match in the dictionary.
            MatchResult match = matcher.Match(matchResult, byteBuffer.Array, 0, byteBuffer.Remaining, rootNode);

            if (match.Kind != MatchResult.SequenceIsAPrefix)
            {
                /*
                 * This case is somewhat confusing: we should have hit the separator
                 * first. There is no special handling for it; the (empty) list is returned.
                 */
                return formsList;
            }

            /*
             * The entire sequence exists in the dictionary. A separator should
             * be the next symbol.
             */
            int arc = fsa.GetArc(match.Node, separator);

            /*
             * The situation when the arc points to a final node should NEVER
             * happen. After all, we want the word to have SOME base form.
             */
            if (arc == 0 || fsa.IsArcFinal(arc))
            {
                return formsList;
            }

            // There is such a word in the dictionary. Return its base forms.
            int formsCount = 0;

            // The output-converted word does not depend on the individual form, so it is
            // computed once (on the first form) and shared by all forms of this lookup.
            ICharSequence outputWord = null;

            finalStatesIterator.RestartFrom(fsa.GetEndNode(arc));
            while (finalStatesIterator.MoveNext())
            {
                ByteBuffer bb = finalStatesIterator.Current;
                byte[] ba = bb.Array;
                int bbSize = bb.Remaining;

                if (formsCount >= forms.Length)
                {
                    // Grow the pool and populate every empty slot.
                    Array.Resize(ref forms, forms.Length + ExpandSize);
                    for (int k = 0; k < forms.Length; k++)
                    {
                        if (forms[k] == null)
                        {
                            forms[k] = new WordData(decoder);
                        }
                    }
                }

                if (outputWord == null)
                {
                    if (dictionaryMetadata.OutputConversionPairs.Any())
                    {
                        outputWord = ApplyReplacements(word, dictionaryMetadata.OutputConversionPairs);
                    }
                    else
                    {
                        outputWord = word;
                    }
                }

                /*
                 * Now, expand the prefix/ suffix 'compression' and store
                 * the base form.
                 */
                WordData wordData = forms[formsCount++];
                wordData.Update(byteBuffer, outputWord);

                /*
                 * Find the separator byte's position splitting the inflection instructions
                 * from the tag.
                 */
                Debug.Assert(prefixBytes <= bbSize, sequenceEncoder.GetType() + " >? " + bbSize);
                int sepPos = prefixBytes;
                while (sepPos < bbSize && ba[sepPos] != separator)
                {
                    sepPos++;
                }

                /*
                 * Decode the stem into stem buffer.
                 */
                wordData.stemBuffer = sequenceEncoder.Decode(wordData.stemBuffer,
                                                             byteBuffer,
                                                             ByteBuffer.Wrap(ba, 0, sepPos));

                // Skip separator character.
                sepPos++;

                /*
                 * Copy the tag data. The tag buffer is reset even when the entry has no tag:
                 * wordData comes from the pooled forms array, so leaving the buffer untouched
                 * could expose tag bytes written by an earlier lookup.
                 * NOTE(review): WordData.Update may already clear the buffer - not visible here;
                 * if it does, this reset is a harmless no-op.
                 */
                int tagSize = Math.Max(0, bbSize - sepPos);
                wordData.tagBuffer = BufferUtils.ClearAndEnsureCapacity(wordData.tagBuffer, tagSize);
                if (tagSize > 0)
                {
                    wordData.tagBuffer.Put(ba, sepPos, tagSize);
                }
                wordData.tagBuffer.Flip();
            }

            formsList.Wrap(forms, 0, formsCount);
            return formsList;
        }