/// <summary>
        /// Encodes <paramref name="target"/> relative to <paramref name="source"/> as one
        /// suffix-trim code byte followed by the bytes of <paramref name="target"/> that come
        /// after the prefix it shares with <paramref name="source"/>.
        /// </summary>
        /// <param name="reuse">Buffer offered for reuse; it is passed through
        /// <c>BufferUtils.ClearAndEnsureCapacity</c> and the result of that call is what gets
        /// filled and returned.</param>
        /// <param name="source">The sequence the encoding is relative to.</param>
        /// <param name="target">The sequence to encode. Expected to be array-backed with position
        /// and array offset equal to zero (asserted in debug builds).</param>
        /// <returns>The output buffer, flipped for reading.</returns>
        public virtual ByteBuffer Encode(ByteBuffer reuse, ByteBuffer source, ByteBuffer target)
        {
            int sharedPrefix  = BufferUtils.SharedPrefixLength(source, target);
            int truncateBytes = source.Remaining - sharedPrefix;

            // A trim count at or above RemoveEverything cannot be written literally: emit the
            // RemoveEverything code instead and store the whole target after it.
            if (truncateBytes >= RemoveEverything)
            {
                truncateBytes = RemoveEverything;
                sharedPrefix  = 0;
            }

            int suffixLength = target.Remaining - sharedPrefix;

            reuse = BufferUtils.ClearAndEnsureCapacity(reuse, 1 + suffixLength);

            Debug.Assert(target.HasArray &&
                         target.Position == 0 &&
                         target.ArrayOffset == 0);

            // Mask to 8 bits before narrowing. The sum may exceed 255 and the decoder relies on
            // wrap-around ((code - 'A') & 0xFF); without the mask the cast throws
            // OverflowException when the assembly is compiled in a checked context.
            byte suffixTrimCode = (byte)((truncateBytes + 'A') & 0xFF);

            reuse.Put(suffixTrimCode);
            reuse.Put(target.Array, sharedPrefix, suffixLength);
            reuse.Flip();

            return reuse;
        }
        /// <summary>
        /// Rebuilds a sequence from <paramref name="source"/> and a suffix-trim
        /// <paramref name="encoded"/> form. The first encoded byte tells how many trailing bytes
        /// of <paramref name="source"/> to drop; the remaining encoded bytes are appended.
        /// </summary>
        /// <param name="reuse">Buffer offered for reuse; the buffer returned by
        /// <c>BufferUtils.ClearAndEnsureCapacity</c> is filled and returned.</param>
        /// <param name="source">The sequence the encoding was made relative to. Expected to be
        /// array-backed with position and array offset equal to zero (asserted in debug builds).</param>
        /// <param name="encoded">The encoded form, at least one byte long, with the same layout
        /// expectations as <paramref name="source"/>.</param>
        /// <returns>The output buffer, flipped for reading.</returns>
        public virtual ByteBuffer Decode(ByteBuffer reuse, ByteBuffer source, ByteBuffer encoded)
        {
            Debug.Assert(encoded.Remaining >= 1);

            // Undo the "+ 'A'" applied by the encoder, wrapping within a byte.
            int trimCode  = (encoded.Get(encoded.Position) - 'A') & 0xFF;
            int trimCount = trimCode == RemoveEverything ? source.Remaining : trimCode;

            int keptFromSource = source.Remaining - trimCount;
            int appendedCount  = encoded.Remaining - 1;

            reuse = BufferUtils.ClearAndEnsureCapacity(reuse, keptFromSource + appendedCount);

            Debug.Assert(source.HasArray &&
                         source.Position == 0 &&
                         source.ArrayOffset == 0);

            Debug.Assert(encoded.HasArray &&
                         encoded.Position == 0 &&
                         encoded.ArrayOffset == 0);

            // Kept head of the source first, then everything after the code byte.
            reuse.Put(source.Array, 0, keptFromSource);
            reuse.Put(encoded.Array, 1, appendedCount);
            reuse.Flip();

            return reuse;
        }
        /// <summary>
        /// Rebuilds a sequence from <paramref name="source"/> and a prefix-and-suffix-trim
        /// <paramref name="encoded"/> form. The first two encoded bytes give the number of
        /// leading and trailing bytes of <paramref name="source"/> to drop; the remaining encoded
        /// bytes are appended to what is left.
        /// </summary>
        /// <param name="reuse">Buffer offered for reuse; the buffer returned by
        /// <c>BufferUtils.ClearAndEnsureCapacity</c> is filled and returned.</param>
        /// <param name="source">The sequence the encoding was made relative to. Expected to be
        /// array-backed with position and array offset equal to zero (asserted in debug builds).</param>
        /// <param name="encoded">The encoded form, at least two bytes long, with the same layout
        /// expectations as <paramref name="source"/>.</param>
        /// <returns>The output buffer, flipped for reading.</returns>
        public virtual ByteBuffer Decode(ByteBuffer reuse, ByteBuffer source, ByteBuffer encoded)
        {
            Debug.Assert(encoded.Remaining >= 2);

            int header     = encoded.Position;
            int prefixTrim = (encoded.Get(header) - 'A') & 0xFF;
            int suffixTrim = (encoded.Get(header + 1) - 'A') & 0xFF;

            // Either code being RemoveEverything means nothing of the source is kept.
            bool dropWholeSource = prefixTrim == RemoveEverything ||
                                   suffixTrim == RemoveEverything;
            if (dropWholeSource)
            {
                prefixTrim = source.Remaining;
                suffixTrim = 0;
            }

            Debug.Assert(source.HasArray &&
                         source.Position == 0 &&
                         source.ArrayOffset == 0);

            Debug.Assert(encoded.HasArray &&
                         encoded.Position == 0 &&
                         encoded.ArrayOffset == 0);

            int keptFromSource = source.Remaining - (suffixTrim + prefixTrim);
            int appendedCount  = encoded.Remaining - 2;

            reuse = BufferUtils.ClearAndEnsureCapacity(reuse, keptFromSource + appendedCount);

            reuse.Put(source.Array, prefixTrim, keptFromSource)
                 .Put(encoded.Array, 2, appendedCount)
                 .Flip();

            return reuse;
        }
 /// <summary>
 /// Copies the raw bytes of the inflected word (no charset decoding is applied)
 /// into a caller-supplied byte buffer.
 /// <para/>
 /// The buffer is cleared before the copy and flipped for reading before it is
 /// returned. When <paramref name="target"/> is <c>null</c> or too small for the
 /// word, a new buffer is allocated instead.
 /// </summary>
 /// <param name="target">Buffer to receive the word bytes, or <c>null</c> to have
 /// a new buffer allocated.</param>
 /// <returns><paramref name="target"/>, or the newly allocated buffer that replaced it.</returns>
 public ByteBuffer GetWordBytes(ByteBuffer target)
 {
     ByteBuffer result = BufferUtils.ClearAndEnsureCapacity(target, wordBuffer.Remaining);

     // Mark/Reset bracket the copy so the word buffer's position is restored afterwards.
     wordBuffer.Mark();
     result.Put(wordBuffer);
     wordBuffer.Reset();

     result.Flip();
     return result;
 }
        /// <summary>
        /// Pass-through decoding: the result is a copy of the remaining bytes of
        /// <paramref name="encoded"/>; <paramref name="source"/> is not consulted.
        /// </summary>
        /// <param name="reuse">Buffer offered for reuse; the buffer returned by
        /// <c>BufferUtils.ClearAndEnsureCapacity</c> is filled and returned.</param>
        /// <param name="source">Unused.</param>
        /// <param name="encoded">The bytes to copy. Its position is restored via
        /// <c>Mark</c>/<c>Reset</c> after the copy.</param>
        /// <returns>The output buffer, flipped for reading.</returns>
        public virtual ByteBuffer Decode(ByteBuffer reuse, ByteBuffer source, ByteBuffer encoded)
        {
            ByteBuffer output = BufferUtils.ClearAndEnsureCapacity(reuse, encoded.Remaining);

            encoded.Mark();
            output.Put(encoded);
            output.Flip();
            encoded.Reset();

            return output;
        }
        /// <summary>
        /// Pass-through encoding: the result is a copy of the remaining bytes of
        /// <paramref name="target"/>; <paramref name="source"/> is not consulted.
        /// </summary>
        /// <param name="reuse">Buffer offered for reuse; the buffer returned by
        /// <c>BufferUtils.ClearAndEnsureCapacity</c> is filled and returned.</param>
        /// <param name="source">Unused.</param>
        /// <param name="target">The bytes to copy. Its position is restored via
        /// <c>Mark</c>/<c>Reset</c> after the copy.</param>
        /// <returns>The output buffer, flipped for reading.</returns>
        public virtual ByteBuffer Encode(ByteBuffer reuse, ByteBuffer source, ByteBuffer target)
        {
            ByteBuffer output = BufferUtils.ClearAndEnsureCapacity(reuse, target.Remaining);

            target.Mark();
            output.Put(target);
            output.Flip();
            target.Reset();

            return output;
        }
        /// <summary>
        /// Encodes <paramref name="target"/> relative to <paramref name="source"/> as two code
        /// bytes (leading bytes to drop from the source, trailing bytes to drop from the source)
        /// followed by the bytes of <paramref name="target"/> that are not covered by the
        /// retained part of the source.
        /// </summary>
        /// <param name="reuse">Buffer offered for reuse; the buffer returned by
        /// <c>BufferUtils.ClearAndEnsureCapacity</c> is filled and returned.</param>
        /// <param name="source">The sequence the encoding is relative to.</param>
        /// <param name="target">The sequence to encode. Expected to be array-backed with position
        /// and array offset equal to zero (asserted in debug builds).</param>
        /// <returns>The output buffer, flipped for reading.</returns>
        public virtual ByteBuffer Encode(ByteBuffer reuse, ByteBuffer source, ByteBuffer target)
        {
            // Find the source offset whose continuation matches the longest prefix of the
            // target, considering only candidates whose trim counts can be encoded.
            int bestLength = 0;
            int bestStart  = 0;

            for (int start = 0; start < source.Remaining; start++)
            {
                int matched = BufferUtils.SharedPrefixLength(source, start, target, 0);
                if (matched <= bestLength)
                {
                    continue;
                }

                bool encodable = start < RemoveEverything &&
                                 (source.Remaining - (start + matched)) < RemoveEverything;
                if (encodable)
                {
                    bestLength = matched;
                    bestStart  = start;
                }
            }

            // What has to be cut from each end of the source to leave a prefix of the target.
            int prefixTrim = bestStart;
            int suffixTrim = source.Remaining - (bestStart + bestLength);

            if (prefixTrim >= RemoveEverything || suffixTrim >= RemoveEverything)
            {
                // Not representable: drop the whole source and store the full target.
                bestLength = 0;
                prefixTrim = RemoveEverything;
                suffixTrim = RemoveEverything;
            }

            int tailLength = target.Remaining - bestLength;

            reuse = BufferUtils.ClearAndEnsureCapacity(reuse, 2 + tailLength);

            Debug.Assert(target.HasArray &&
                         target.Position == 0 &&
                         target.ArrayOffset == 0);

            reuse.Put((byte)((prefixTrim + 'A') & 0xFF))
                 .Put((byte)((suffixTrim + 'A') & 0xFF))
                 .Put(target.Array, bestLength, tailLength)
                 .Flip();

            return reuse;
        }
        // Example #8 (0)
        /// <summary>
        /// Rebuilds a sequence from <paramref name="source"/> and an infix-and-suffix-trim
        /// <paramref name="encoded"/> form. The first three encoded bytes give the index of the
        /// infix to remove from the source, the infix length, and the number of trailing source
        /// bytes to drop; the remaining encoded bytes are appended to what is left.
        /// </summary>
        /// <param name="reuse">Buffer offered for reuse; the buffer returned by
        /// <c>BufferUtils.ClearAndEnsureCapacity</c> is filled and returned.</param>
        /// <param name="source">The sequence the encoding was made relative to. Expected to be
        /// array-backed with position and array offset equal to zero (asserted in debug builds).</param>
        /// <param name="encoded">The encoded form, at least three bytes long, with the same
        /// layout expectations as <paramref name="source"/>.</param>
        /// <returns>The output buffer, flipped for reading.</returns>
        public virtual ByteBuffer Decode(ByteBuffer reuse, ByteBuffer source, ByteBuffer encoded)
        {
            Debug.Assert(encoded.Remaining >= 3);

            int header      = encoded.Position;
            int removeAt    = (encoded.Get(header) - 'A') & 0xFF;
            int removeCount = (encoded.Get(header + 1) - 'A') & 0xFF;
            int tailTrim    = (encoded.Get(header + 2) - 'A') & 0xFF;

            // RemoveEverything in the length or suffix code: treat the entire source as the
            // removed infix so nothing of it survives.
            if (removeCount == RemoveEverything || tailTrim == RemoveEverything)
            {
                removeAt    = 0;
                removeCount = source.Remaining;
                tailTrim    = 0;
            }

            int keptAfterInfix = source.Remaining - (removeAt + removeCount + tailTrim);
            int appendedCount  = encoded.Remaining - 3;

            reuse = BufferUtils.ClearAndEnsureCapacity(reuse, removeAt + keptAfterInfix + appendedCount);

            Debug.Assert(encoded.HasArray &&
                         encoded.Position == 0 &&
                         encoded.ArrayOffset == 0);

            Debug.Assert(source.HasArray &&
                         source.Position == 0 &&
                         source.ArrayOffset == 0);

            // Source bytes before the infix, source bytes after it, then the encoded tail.
            reuse.Put(source.Array, 0, removeAt)
                 .Put(source.Array, removeAt + removeCount, keptAfterInfix)
                 .Put(encoded.Array, 3, appendedCount)
                 .Flip();

            return reuse;
        }
        // Example #9 (0)
        /// <summary>
        /// Encodes <paramref name="target"/> relative to <paramref name="source"/> as three code
        /// bytes (infix index, infix length, suffix bytes to truncate; each offset by <c>'A'</c>
        /// and masked to 8 bits) followed by the bytes of <paramref name="target"/> that come
        /// after the prefix shared with the trimmed source.
        /// </summary>
        /// <param name="reuse">Buffer offered for reuse; the buffer returned by
        /// <c>BufferUtils.ClearAndEnsureCapacity</c> is filled and returned.</param>
        /// <param name="source">The sequence the encoding is relative to. Expected to be
        /// array-backed with position and array offset equal to zero (asserted in debug builds).</param>
        /// <param name="target">The sequence to encode, with the same layout expectations as
        /// <paramref name="source"/>.</param>
        /// <returns>The output buffer, flipped for reading.</returns>
        /// <remarks>Overwrites the <c>scratch</c> field while searching for the infix.</remarks>
        public virtual ByteBuffer Encode(ByteBuffer reuse, ByteBuffer source, ByteBuffer target)
        {
            Debug.Assert(source.HasArray &&
                         source.Position == 0 &&
                         source.ArrayOffset == 0);

            Debug.Assert(target.HasArray &&
                         target.Position == 0 &&
                         target.ArrayOffset == 0);

            // Search for the infix that we can encode and remove from src
            // to get a maximum-length prefix of dst. This could be done more efficiently
            // by running a smarter longest-common-subsequence algorithm and some pruning (?).
            //
            // For now, naive loop should do.

            // There can be only two positions for the infix to delete:
            // 1) we remove leading bytes, even if they are partially matching (but a longer match
            //    exists somewhere later on).
            // 2) we leave max. matching prefix and remove non-matching bytes that follow.
            int maxInfixIndex        = 0;
            int maxSubsequenceLength = BufferUtils.SharedPrefixLength(source, target);
            int maxInfixLength       = 0;

            // The candidate array is built once, before the loop runs, so the second candidate
            // is the initial shared-prefix length even though maxSubsequenceLength is updated
            // inside the loop.
            foreach (int i in new int[] { 0, maxSubsequenceLength })
            {
                // j is the length of the infix removed at index i.
                for (int j = 1; j <= source.Remaining - i; j++)
                {
                    // Compute temporary src with the infix removed.
                    // Concatenate in scratch space for simplicity.
                    int len2 = source.Remaining - (i + j);
                    scratch = BufferUtils.ClearAndEnsureCapacity(scratch, i + len2);
                    scratch.Put(source.Array, 0, i);
                    scratch.Put(source.Array, i + j, len2);
                    scratch.Flip();

                    int sharedPrefix = BufferUtils.SharedPrefixLength(scratch, target);

                    // Only update maxSubsequenceLength if we will be able to encode it.
                    if (sharedPrefix > 0 && sharedPrefix > maxSubsequenceLength && i < RemoveEverything && j < RemoveEverything)
                    {
                        maxSubsequenceLength = sharedPrefix;
                        maxInfixIndex        = i;
                        maxInfixLength       = j;
                    }
                }
            }

            // Source bytes left over after the removed infix and the matched part.
            int truncateSuffixBytes = source.Remaining - (maxInfixLength + maxSubsequenceLength);

            // Special case: if we're removing the suffix in the infix code, move it
            // to the suffix code instead.
            if (truncateSuffixBytes == 0 &&
                maxInfixIndex + maxInfixLength == source.Remaining)
            {
                truncateSuffixBytes = maxInfixLength;
                maxInfixIndex       = maxInfixLength = 0;
            }

            // Any value at or above RemoveEverything cannot be encoded: fall back to the
            // RemoveEverything codes and store the whole target (shared length reset to 0).
            if (maxInfixIndex >= RemoveEverything ||
                maxInfixLength >= RemoveEverything ||
                truncateSuffixBytes >= RemoveEverything)
            {
                maxInfixIndex  = maxSubsequenceLength = 0;
                maxInfixLength = truncateSuffixBytes = RemoveEverything;
            }

            // Number of target bytes that must be stored after the three code bytes.
            int len1 = target.Remaining - maxSubsequenceLength;

            reuse = BufferUtils.ClearAndEnsureCapacity(reuse, 3 + len1);

            reuse.Put((byte)((maxInfixIndex + 'A') & 0xFF));
            reuse.Put((byte)((maxInfixLength + 'A') & 0xFF));
            reuse.Put((byte)((truncateSuffixBytes + 'A') & 0xFF));
            reuse.Put(target.Array, maxSubsequenceLength, len1);
            reuse.Flip();

            return(reuse);
        }
        /// <summary>
        /// Splits the current entry of <c>entriesIter</c> into inflected form, stem and tag and
        /// stores them in the shared <c>entry</c> instance.
        /// </summary>
        /// <returns>The shared <c>entry</c> instance, updated in place.</returns>
        /// <exception cref="Exception">The entry contains no separator byte.</exception>
        private WordData Next()
        {
            ByteBuffer rawEntry = entriesIter.Current;

            // Entries are typically: inflected<SEP>codedBase<SEP>tag so try to find this split.
            byte[] entryBytes  = rawEntry.Array;
            int    entryLength = rawEntry.Remaining;

            int firstSep = 0;
            while (firstSep < entryLength && entryBytes[firstSep] != separator)
            {
                firstSep++;
            }

            if (firstSep == entryLength)
            {
                throw new Exception("Invalid dictionary " + "entry format (missing separator).");
            }

            // Everything before the first separator is the inflected form.
            inflectedBuffer = BufferUtils.ClearAndEnsureCapacity(inflectedBuffer, firstSep);
            inflectedBuffer.Put(entryBytes, 0, firstSep);
            inflectedBuffer.Flip();

            inflectedCharBuffer = BufferUtils.BytesToChars(decoder, inflectedBuffer, inflectedCharBuffer);
            entry.Update(inflectedBuffer, inflectedCharBuffer);

            // Copy what follows the first separator into the temporary buffer.
            temp = BufferUtils.ClearAndEnsureCapacity(temp, entryLength - firstSep);
            int restStart = firstSep + 1;
            temp.Put(entryBytes, restStart, entryLength - restStart);
            temp.Flip();

            byte[] rest       = temp.Array;
            int    restLength = temp.Remaining;

            // Find the next separator byte's position splitting word form and tag. The scan
            // starts after the encoder's prefix bytes.
#pragma warning disable 612, 618
            Debug.Assert(sequenceEncoder.PrefixBytes <= restLength, sequenceEncoder.GetType() + " >? " + restLength);
            int stemEnd = sequenceEncoder.PrefixBytes;
#pragma warning restore 612, 618
            while (stemEnd < restLength && rest[stemEnd] != separator)
            {
                stemEnd++;
            }

            if (decodeStems)
            {
                // Decode the stem into the stem buffer.
                entry.stemBuffer = sequenceEncoder.Decode(entry.stemBuffer,
                                                          inflectedBuffer,
                                                          ByteBuffer.Wrap(rest, 0, stemEnd));
            }
            else
            {
                // Keep the stem in its encoded form.
                entry.stemBuffer = BufferUtils.ClearAndEnsureCapacity(entry.stemBuffer, stemEnd);
                entry.stemBuffer.Put(rest, 0, stemEnd);
                entry.stemBuffer.Flip();
            }

            // Skip separator character, if present.
            int tagStart  = stemEnd < restLength ? stemEnd + 1 : stemEnd;
            int tagLength = restLength - tagStart;

            // Copy the tag data.
            entry.tagBuffer = BufferUtils.ClearAndEnsureCapacity(entry.tagBuffer, tagLength);
            entry.tagBuffer.Put(rest, tagStart, tagLength);
            entry.tagBuffer.Flip();

            return entry;
        }
        /// <summary>
        /// Searches the automaton for a symbol sequence equal to <paramref name="word"/>,
        /// followed by a separator. The result is a stem (decompressed accordingly
        /// to the dictionary's specification) and an optional tag data.
        /// </summary>
        /// <param name="word">The word to look up. Input conversion pairs from the dictionary
        /// metadata, if any, are applied to it first.</param>
        /// <returns>The shared forms list, wrapped around the entries found. It is empty when
        /// the word contains the separator character, cannot be encoded in the dictionary's
        /// encoding, or has no entry in the dictionary.</returns>
        /// <remarks>
        /// The returned list and the <see cref="WordData"/> instances in it are reused by
        /// subsequent calls.
        /// </remarks>
        public IList <WordData> Lookup(string word)
        {
            byte separator = dictionaryMetadata.Separator;

#pragma warning disable 612, 618
            int prefixBytes = sequenceEncoder.PrefixBytes;
#pragma warning restore 612, 618

            if (dictionaryMetadata.InputConversionPairs.Any())
            {
                word = ApplyReplacements(word, dictionaryMetadata.InputConversionPairs);
            }

            // Reset the output list to zero length.
            formsList.Wrap(forms, 0, 0);

            // Encode word characters into bytes in the same encoding as the FSA's.
            charBuffer = BufferUtils.ClearAndEnsureCapacity(charBuffer, word.Length);
            for (int i = 0; i < word.Length; i++)
            {
                char chr = word[i];
                if (chr == separatorChar)
                {
                    // No valid input can contain the separator.
                    return formsList;
                }
                charBuffer.Put(chr);
            }
            charBuffer.Flip();

            try
            {
                byteBuffer = BufferUtils.CharsToBytes(encoder, charBuffer, byteBuffer);
            }
            catch (UnmappableInputException)
            {
                // This should be a rare occurrence, but if it happens it means there is no way
                // the dictionary can contain the input word.
                return formsList;
            }

            // Try to find a partial match in the dictionary.
            MatchResult match = matcher.Match(matchResult, byteBuffer.Array, 0, byteBuffer.Remaining, rootNode);

            if (match.Kind != MatchResult.SequenceIsAPrefix)
            {
                /*
                 * this case is somewhat confusing: we should have hit the separator
                 * first... I don't really know how to deal with it at the time
                 * being.
                 */
                return formsList;
            }

            /*
             * The entire sequence exists in the dictionary. A separator should
             * be the next symbol.
             */
            int arc = fsa.GetArc(match.Node, separator);

            /*
             * The situation when the arc points to a final node should NEVER
             * happen. After all, we want the word to have SOME base form.
             */
            if (arc == 0 || fsa.IsArcFinal(arc))
            {
                return formsList;
            }

            // There is such a word in the dictionary. Return its base forms.
            int formsCount = 0;

            finalStatesIterator.RestartFrom(fsa.GetEndNode(arc));
            while (finalStatesIterator.MoveNext())
            {
                ByteBuffer bb     = finalStatesIterator.Current;
                byte[]     ba     = bb.Array;
                int        bbSize = bb.Remaining;

                if (formsCount >= forms.Length)
                {
                    // Grow the pool of reusable WordData instances and fill the empty slots.
                    Array.Resize(ref forms, forms.Length + ExpandSize);
                    for (int k = 0; k < forms.Length; k++)
                    {
                        if (forms[k] == null)
                        {
                            forms[k] = new WordData(decoder);
                        }
                    }
                }

                /*
                 * Now, expand the prefix/ suffix 'compression' and store
                 * the base form.
                 */
                WordData wordData = forms[formsCount++];
                if (!dictionaryMetadata.OutputConversionPairs.Any())
                {
                    wordData.Update(byteBuffer, word);
                }
                else
                {
                    wordData.Update(byteBuffer, ApplyReplacements(word, dictionaryMetadata.OutputConversionPairs));
                }

                /*
                 * Find the separator byte's position splitting the inflection instructions
                 * from the tag.
                 */
                Debug.Assert(prefixBytes <= bbSize, sequenceEncoder.GetType() + " >? " + bbSize);
                int sepPos;
                for (sepPos = prefixBytes; sepPos < bbSize; sepPos++)
                {
                    if (ba[sepPos] == separator)
                    {
                        break;
                    }
                }

                /*
                 * Decode the stem into stem buffer.
                 */
                wordData.stemBuffer = sequenceEncoder.Decode(wordData.stemBuffer,
                                                             byteBuffer,
                                                             ByteBuffer.Wrap(ba, 0, sepPos));

                // Skip separator character.
                sepPos++;

                /*
                 * Copy the tag data. The WordData instance is recycled between calls, so the
                 * tag buffer is always reset -- to an empty buffer when this entry has no tag
                 * -- otherwise it could still hold the tag of an earlier result. tagSize is
                 * negative when no separator was found, hence the clamp.
                 */
                int tagSize = bbSize - sepPos;
                if (tagSize < 0)
                {
                    tagSize = 0;
                }

                wordData.tagBuffer = BufferUtils.ClearAndEnsureCapacity(wordData.tagBuffer, tagSize);
                if (tagSize > 0)
                {
                    wordData.tagBuffer.Put(ba, sepPos, tagSize);
                }
                wordData.tagBuffer.Flip();
            }

            formsList.Wrap(forms, 0, formsCount);
            return formsList;
        }