/// <summary>
/// Encodes <paramref name="target"/> relative to <paramref name="source"/> as one
/// suffix-trim code byte followed by the bytes of <paramref name="target"/> that come
/// after the prefix shared by both buffers.
/// </summary>
/// <param name="reuse">Output buffer candidate; it is passed through
/// <c>BufferUtils.ClearAndEnsureCapacity</c> and the buffer returned by that call is used.</param>
/// <param name="source">The sequence the encoding is relative to.</param>
/// <param name="target">The sequence to encode. Must be array-backed with position and
/// array offset equal to zero (asserted in debug builds).</param>
/// <returns>The output buffer, flipped for reading.</returns>
public virtual ByteBuffer Encode(ByteBuffer reuse, ByteBuffer source, ByteBuffer target)
{
    int sharedPrefix = BufferUtils.SharedPrefixLength(source, target);
    int truncateBytes = source.Remaining - sharedPrefix;
    if (truncateBytes >= RemoveEverything)
    {
        // The trim count does not fit below the RemoveEverything marker: emit the marker
        // itself and store the whole target instead of only its non-shared tail.
        truncateBytes = RemoveEverything;
        sharedPrefix = 0;
    }

    int suffixLength = target.Remaining - sharedPrefix;
    reuse = BufferUtils.ClearAndEnsureCapacity(reuse, 1 + suffixLength);

    Debug.Assert(target.HasArray && target.Position == 0 && target.ArrayOffset == 0);

    // Mask explicitly so that the narrowing to byte wraps instead of throwing
    // OverflowException when compiled in a checked context; this also matches how the
    // other encoders in this file build their code bytes.
    byte suffixTrimCode = (byte)((truncateBytes + 'A') & 0xFF);
    reuse.Put(suffixTrimCode)
        .Put(target.Array, sharedPrefix, suffixLength)
        .Flip();

    return reuse;
}
/// <summary>
/// Rebuilds a sequence from <paramref name="source"/> and a suffix-trim encoding: the
/// first byte of <paramref name="encoded"/> says how many trailing bytes of
/// <paramref name="source"/> to drop, and the remaining bytes are appended.
/// </summary>
/// <param name="reuse">Output buffer candidate; it is passed through
/// <c>BufferUtils.ClearAndEnsureCapacity</c> and the buffer returned by that call is used.</param>
/// <param name="source">The sequence the encoding is relative to. Must be array-backed with
/// position and array offset equal to zero (asserted in debug builds).</param>
/// <param name="encoded">The encoded data, at least one byte long, with the same
/// array/position requirements as <paramref name="source"/>.</param>
/// <returns>The output buffer, flipped for reading.</returns>
public virtual ByteBuffer Decode(ByteBuffer reuse, ByteBuffer source, ByteBuffer encoded)
{
    Debug.Assert(encoded.Remaining >= 1);

    // Read the code byte without moving the buffer's position.
    int decodedCount = (encoded.Get(encoded.Position) - 'A') & 0xFF;

    // The RemoveEverything marker means that nothing of the source is kept.
    int trimCount = decodedCount == RemoveEverything ? source.Remaining : decodedCount;

    int keptLength = source.Remaining - trimCount;
    int appendedLength = encoded.Remaining - 1;

    reuse = BufferUtils.ClearAndEnsureCapacity(reuse, keptLength + appendedLength);

    Debug.Assert(source.HasArray && source.Position == 0 && source.ArrayOffset == 0);
    Debug.Assert(encoded.HasArray && encoded.Position == 0 && encoded.ArrayOffset == 0);

    reuse.Put(source.Array, 0, keptLength);
    reuse.Put(encoded.Array, 1, appendedLength);
    reuse.Flip();

    return reuse;
}
/// <summary>
/// Rebuilds a sequence from <paramref name="source"/> and a prefix/suffix-trim encoding:
/// the first two bytes of <paramref name="encoded"/> say how many leading and trailing
/// bytes of <paramref name="source"/> to drop, and the remaining bytes are appended.
/// </summary>
/// <param name="reuse">Output buffer candidate; it is passed through
/// <c>BufferUtils.ClearAndEnsureCapacity</c> and the buffer returned by that call is used.</param>
/// <param name="source">The sequence the encoding is relative to. Must be array-backed with
/// position and array offset equal to zero (asserted in debug builds).</param>
/// <param name="encoded">The encoded data, at least two bytes long, with the same
/// array/position requirements as <paramref name="source"/>.</param>
/// <returns>The output buffer, flipped for reading.</returns>
public virtual ByteBuffer Decode(ByteBuffer reuse, ByteBuffer source, ByteBuffer encoded)
{
    Debug.Assert(encoded.Remaining >= 2);

    int codeStart = encoded.Position;
    int prefixTrim = (encoded.Get(codeStart) - 'A') & 0xFF;
    int suffixTrim = (encoded.Get(codeStart + 1) - 'A') & 0xFF;

    bool dropWholeSource = prefixTrim == RemoveEverything || suffixTrim == RemoveEverything;
    if (dropWholeSource)
    {
        // Either marker means nothing of the source is kept: skip all of it from the front.
        prefixTrim = source.Remaining;
        suffixTrim = 0;
    }

    Debug.Assert(source.HasArray && source.Position == 0 && source.ArrayOffset == 0);
    Debug.Assert(encoded.HasArray && encoded.Position == 0 && encoded.ArrayOffset == 0);

    int keptLength = source.Remaining - (suffixTrim + prefixTrim);
    int appendedLength = encoded.Remaining - 2;
    reuse = BufferUtils.ClearAndEnsureCapacity(reuse, keptLength + appendedLength);

    reuse.Put(source.Array, prefixTrim, keptLength)
        .Put(encoded.Array, 2, appendedLength)
        .Flip();

    return reuse;
}
/// <summary>
/// Copies the raw bytes of the inflected word (no charset decoding is applied) into a
/// caller-supplied byte buffer.
/// <para/>
/// The destination is cleared before the copy and flipped for reading before it is
/// returned. A new buffer is allocated when <paramref name="target"/> is <c>null</c> or
/// too small to hold the result.
/// </summary>
/// <param name="target">Destination byte buffer, or <c>null</c> to have a new buffer
/// allocated.</param>
/// <returns><paramref name="target"/>, or the newly allocated buffer.</returns>
public ByteBuffer GetWordBytes(ByteBuffer target)
{
    target = BufferUtils.ClearAndEnsureCapacity(target, wordBuffer.Remaining);

    // Mark before and reset after the bulk copy so the word buffer's position is
    // restored once its content has been read.
    wordBuffer.Mark();
    target.Put(wordBuffer)
        .Flip();
    wordBuffer.Reset();

    return target;
}
/// <summary>
/// Decodes by copying: the result is simply the remaining content of
/// <paramref name="encoded"/>.
/// </summary>
/// <param name="reuse">Output buffer candidate; it is passed through
/// <c>BufferUtils.ClearAndEnsureCapacity</c> and the buffer returned by that call is used.</param>
/// <param name="source">Not used by this implementation.</param>
/// <param name="encoded">The data to copy; its position is restored after the copy.</param>
/// <returns>The output buffer, flipped for reading.</returns>
public virtual ByteBuffer Decode(ByteBuffer reuse, ByteBuffer source, ByteBuffer encoded)
{
    reuse = BufferUtils.ClearAndEnsureCapacity(reuse, encoded.Remaining);

    // Mark/Reset around the bulk copy leaves the input positioned where it started.
    encoded.Mark();
    reuse.Put(encoded);
    reuse.Flip();
    encoded.Reset();

    return reuse;
}
/// <summary>
/// Encodes by copying: the result is simply the remaining content of
/// <paramref name="target"/>.
/// </summary>
/// <param name="reuse">Output buffer candidate; it is passed through
/// <c>BufferUtils.ClearAndEnsureCapacity</c> and the buffer returned by that call is used.</param>
/// <param name="source">Not used by this implementation.</param>
/// <param name="target">The data to copy; its position is restored after the copy.</param>
/// <returns>The output buffer, flipped for reading.</returns>
public virtual ByteBuffer Encode(ByteBuffer reuse, ByteBuffer source, ByteBuffer target)
{
    reuse = BufferUtils.ClearAndEnsureCapacity(reuse, target.Remaining);

    // Mark/Reset around the bulk copy leaves the input positioned where it started.
    target.Mark();
    reuse.Put(target);
    reuse.Flip();
    target.Reset();

    return reuse;
}
/// <summary>
/// Encodes <paramref name="target"/> relative to <paramref name="source"/> as two code
/// bytes (how many bytes to cut from the start and from the end of
/// <paramref name="source"/>) followed by the bytes of <paramref name="target"/> that
/// come after the part matched inside <paramref name="source"/>.
/// </summary>
/// <param name="reuse">Output buffer candidate; it is passed through
/// <c>BufferUtils.ClearAndEnsureCapacity</c> and the buffer returned by that call is used.</param>
/// <param name="source">The sequence the encoding is relative to.</param>
/// <param name="target">The sequence to encode. Must be array-backed with position and
/// array offset equal to zero (asserted in debug builds).</param>
/// <returns>The output buffer, flipped for reading.</returns>
public virtual ByteBuffer Encode(ByteBuffer reuse, ByteBuffer source, ByteBuffer target)
{
    // Try every start offset in the source and remember the longest run that matches the
    // beginning of the target and whose trim counts can still be encoded.
    int bestLength = 0;
    int bestStart = 0;
    for (int start = 0; start < source.Remaining; start++)
    {
        int matched = BufferUtils.SharedPrefixLength(source, start, target, 0);
        if (matched <= bestLength)
        {
            continue;
        }

        // Both the leading and the trailing cut must stay below the marker value.
        int trailing = source.Remaining - (start + matched);
        if (start < RemoveEverything && trailing < RemoveEverything)
        {
            bestLength = matched;
            bestStart = start;
        }
    }

    // How much to remove from the source, and where, to obtain a prefix of the target.
    int truncatePrefixBytes = bestStart;
    int truncateSuffixBytes = source.Remaining - (bestStart + bestLength);
    if (truncatePrefixBytes >= RemoveEverything || truncateSuffixBytes >= RemoveEverything)
    {
        // Not encodable: emit the marker in both codes and store the whole target.
        bestLength = 0;
        truncatePrefixBytes = RemoveEverything;
        truncateSuffixBytes = RemoveEverything;
    }

    int tailLength = target.Remaining - bestLength;
    reuse = BufferUtils.ClearAndEnsureCapacity(reuse, 2 + tailLength);

    Debug.Assert(target.HasArray && target.Position == 0 && target.ArrayOffset == 0);

    byte prefixCode = (byte)((truncatePrefixBytes + 'A') & 0xFF);
    byte suffixCode = (byte)((truncateSuffixBytes + 'A') & 0xFF);
    reuse.Put(prefixCode)
        .Put(suffixCode)
        .Put(target.Array, bestLength, tailLength)
        .Flip();

    return reuse;
}
/// <summary>
/// Rebuilds a sequence from <paramref name="source"/> and an infix/suffix-trim encoding.
/// The first three bytes of <paramref name="encoded"/> give the infix start index, the
/// infix length and the number of trailing bytes to drop from <paramref name="source"/>;
/// the remaining bytes are appended to what is left of <paramref name="source"/>.
/// </summary>
/// <param name="reuse">Output buffer candidate; it is passed through
/// <c>BufferUtils.ClearAndEnsureCapacity</c> and the buffer returned by that call is used.</param>
/// <param name="source">The sequence the encoding is relative to. Must be array-backed with
/// position and array offset equal to zero (asserted in debug builds).</param>
/// <param name="encoded">The encoded data, at least three bytes long, with the same
/// array/position requirements as <paramref name="source"/>.</param>
/// <returns>The output buffer, flipped for reading.</returns>
public virtual ByteBuffer Decode(ByteBuffer reuse, ByteBuffer source, ByteBuffer encoded)
{
    Debug.Assert(encoded.Remaining >= 3);

    int codeStart = encoded.Position;
    int infixStart = (encoded.Get(codeStart) - 'A') & 0xFF;
    int infixSize = (encoded.Get(codeStart + 1) - 'A') & 0xFF;
    int suffixTrim = (encoded.Get(codeStart + 2) - 'A') & 0xFF;

    bool dropWholeSource = infixSize == RemoveEverything || suffixTrim == RemoveEverything;
    if (dropWholeSource)
    {
        // Treat the entire source as the infix to cut, so nothing of it is kept.
        infixStart = 0;
        infixSize = source.Remaining;
        suffixTrim = 0;
    }

    int afterInfixLength = source.Remaining - (infixStart + infixSize + suffixTrim);
    int appendedLength = encoded.Remaining - 3;
    reuse = BufferUtils.ClearAndEnsureCapacity(reuse, infixStart + afterInfixLength + appendedLength);

    Debug.Assert(encoded.HasArray && encoded.Position == 0 && encoded.ArrayOffset == 0);
    Debug.Assert(source.HasArray && source.Position == 0 && source.ArrayOffset == 0);

    // Source bytes before the infix, source bytes after it (minus the trimmed suffix),
    // then the literal bytes carried by the encoding.
    reuse.Put(source.Array, 0, infixStart)
        .Put(source.Array, infixStart + infixSize, afterInfixLength)
        .Put(encoded.Array, 3, appendedLength)
        .Flip();

    return reuse;
}
/// <summary>
/// Encodes <paramref name="target"/> relative to <paramref name="source"/> as three code
/// bytes (infix start index, infix length, number of trailing bytes to cut from
/// <paramref name="source"/>) followed by the bytes of <paramref name="target"/> that come
/// after the part shared with the reduced <paramref name="source"/>.
/// </summary>
/// <param name="reuse">Output buffer candidate; it is passed through
/// <c>BufferUtils.ClearAndEnsureCapacity</c> and the buffer returned by that call is used.</param>
/// <param name="source">The sequence the encoding is relative to. Must be array-backed with
/// position and array offset equal to zero (asserted in debug builds).</param>
/// <param name="target">The sequence to encode, with the same array/position requirements
/// as <paramref name="source"/>.</param>
/// <returns>The output buffer, flipped for reading.</returns>
public virtual ByteBuffer Encode(ByteBuffer reuse, ByteBuffer source, ByteBuffer target)
{
    Debug.Assert(source.HasArray && source.Position == 0 && source.ArrayOffset == 0);
    Debug.Assert(target.HasArray && target.Position == 0 && target.ArrayOffset == 0);

    // Look for the infix whose removal from the source leaves the longest prefix of the
    // target. A smarter longest-common-subsequence search with pruning could be more
    // efficient; a naive loop is used for now.
    //
    // Only two start positions are tried for the infix:
    // 1) index 0: leading bytes are removed, even if they partially match (a longer match
    //    may exist later on);
    // 2) right after the longest matching prefix: the non-matching bytes that follow it
    //    are removed.
    int bestInfixIndex = 0;
    int bestInfixLength = 0;
    int bestSharedLength = BufferUtils.SharedPrefixLength(source, target);

    // The candidate start positions are fixed here, before the search updates the best
    // shared length.
    int[] infixStarts = { 0, bestSharedLength };
    foreach (int infixStart in infixStarts)
    {
        for (int infixLength = 1; infixLength <= source.Remaining - infixStart; infixLength++)
        {
            // Build the source with the candidate infix removed in scratch space.
            int tailLength = source.Remaining - (infixStart + infixLength);
            scratch = BufferUtils.ClearAndEnsureCapacity(scratch, infixStart + tailLength);
            scratch.Put(source.Array, 0, infixStart);
            scratch.Put(source.Array, infixStart + infixLength, tailLength);
            scratch.Flip();

            int shared = BufferUtils.SharedPrefixLength(scratch, target);

            // Keep the candidate only if it is an improvement that can also be encoded.
            bool improves = shared > 0 && shared > bestSharedLength;
            bool encodable = infixStart < RemoveEverything && infixLength < RemoveEverything;
            if (improves && encodable)
            {
                bestSharedLength = shared;
                bestInfixIndex = infixStart;
                bestInfixLength = infixLength;
            }
        }
    }

    int truncateSuffixBytes = source.Remaining - (bestInfixLength + bestSharedLength);

    // Special case: when the infix being removed is actually the end of the source,
    // express it through the suffix code instead of the infix code.
    if (truncateSuffixBytes == 0 && bestInfixIndex + bestInfixLength == source.Remaining)
    {
        truncateSuffixBytes = bestInfixLength;
        bestInfixIndex = 0;
        bestInfixLength = 0;
    }

    bool notEncodable = bestInfixIndex >= RemoveEverything
        || bestInfixLength >= RemoveEverything
        || truncateSuffixBytes >= RemoveEverything;
    if (notEncodable)
    {
        // Emit the marker in the length and suffix codes and store the whole target.
        bestInfixIndex = 0;
        bestSharedLength = 0;
        bestInfixLength = RemoveEverything;
        truncateSuffixBytes = RemoveEverything;
    }

    int literalLength = target.Remaining - bestSharedLength;
    reuse = BufferUtils.ClearAndEnsureCapacity(reuse, 3 + literalLength);

    reuse.Put((byte)((bestInfixIndex + 'A') & 0xFF))
        .Put((byte)((bestInfixLength + 'A') & 0xFF))
        .Put((byte)((truncateSuffixBytes + 'A') & 0xFF))
        .Put(target.Array, bestSharedLength, literalLength)
        .Flip();

    return reuse;
}
/// <summary>
/// Splits the current entry of <c>entriesIter</c> into its inflected form, stem and tag
/// parts and stores them in the shared <c>entry</c> instance.
/// </summary>
/// <returns>The shared <c>entry</c> instance, updated for the current entry.</returns>
/// <exception cref="Exception">The entry contains no separator byte.</exception>
private WordData Next()
{
    ByteBuffer entryBuffer = entriesIter.Current;

    /*
     * Entries are typically: inflected<SEP>codedBase<SEP>tag so try to find this split.
     */
    byte[] ba = entryBuffer.Array;
    int bbSize = entryBuffer.Remaining;

    int sepPos = 0;
    while (sepPos < bbSize && ba[sepPos] != separator)
    {
        sepPos++;
    }

    if (sepPos == bbSize)
    {
        throw new Exception("Invalid dictionary " + "entry format (missing separator).");
    }

    // Everything before the first separator is the inflected form.
    inflectedBuffer = BufferUtils.ClearAndEnsureCapacity(inflectedBuffer, sepPos);
    inflectedBuffer.Put(ba, 0, sepPos);
    inflectedBuffer.Flip();

    inflectedCharBuffer = BufferUtils.BytesToChars(decoder, inflectedBuffer, inflectedCharBuffer);
    entry.Update(inflectedBuffer, inflectedCharBuffer);

    // Copy what follows the first separator into the temporary buffer and continue
    // working on that copy.
    temp = BufferUtils.ClearAndEnsureCapacity(temp, bbSize - sepPos);
    sepPos++;
    temp.Put(ba, sepPos, bbSize - sepPos);
    temp.Flip();

    ba = temp.Array;
    bbSize = temp.Remaining;

    /*
     * Find the next separator byte's position splitting word form and tag.
     * The search starts after the encoder's prefix bytes.
     */
#pragma warning disable 612, 618
    Debug.Assert(sequenceEncoder.PrefixBytes <= bbSize, sequenceEncoder.GetType() + " >? " + bbSize);
    sepPos = sequenceEncoder.PrefixBytes;
#pragma warning restore 612, 618
    while (sepPos < bbSize && ba[sepPos] != separator)
    {
        sepPos++;
    }

    /*
     * Decode the stem into stem buffer, or copy it in its encoded form when stem
     * decoding is turned off.
     */
    if (decodeStems)
    {
        entry.stemBuffer = sequenceEncoder.Decode(
            entry.stemBuffer,
            inflectedBuffer,
            ByteBuffer.Wrap(ba, 0, sepPos));
    }
    else
    {
        entry.stemBuffer = BufferUtils.ClearAndEnsureCapacity(entry.stemBuffer, sepPos);
        entry.stemBuffer.Put(ba, 0, sepPos);
        entry.stemBuffer.Flip();
    }

    // Skip separator character, if present.
    if (sepPos < bbSize)
    {
        sepPos++;
    }

    /*
     * Decode the tag data: whatever remains after the second separator.
     */
    int tagLength = bbSize - sepPos;
    entry.tagBuffer = BufferUtils.ClearAndEnsureCapacity(entry.tagBuffer, tagLength);
    entry.tagBuffer.Put(ba, sepPos, tagLength);
    entry.tagBuffer.Flip();

    return entry;
}
/// <summary>
/// Searches the automaton for a symbol sequence equal to <paramref name="word"/>,
/// followed by a separator. The result is a stem (decompressed accordingly
/// to the dictionary's specification) and an optional tag data.
/// </summary>
/// <param name="word">The word to look up.</param>
/// <returns>The instance's forms list, wrapping the entries found for
/// <paramref name="word"/>; it has zero length when nothing matched.</returns>
public IList<WordData> Lookup(string word)
{
    byte separator = dictionaryMetadata.Separator;
#pragma warning disable 612, 618
    int prefixBytes = sequenceEncoder.PrefixBytes;
#pragma warning restore 612, 618

    if (dictionaryMetadata.InputConversionPairs.Any())
    {
        word = ApplyReplacements(word, dictionaryMetadata.InputConversionPairs);
    }

    // Reset the output list to zero length.
    formsList.Wrap(forms, 0, 0);

    // Encode word characters into bytes in the same encoding as the FSA's.
    charBuffer = BufferUtils.ClearAndEnsureCapacity(charBuffer, word.Length);
    foreach (char chr in word)
    {
        if (chr == separatorChar)
        {
            // No valid input can contain the separator.
            return formsList;
        }

        charBuffer.Put(chr);
    }
    charBuffer.Flip();

    try
    {
        byteBuffer = BufferUtils.CharsToBytes(encoder, charBuffer, byteBuffer);
    }
    catch (UnmappableInputException)
    {
        // This should be a rare occurrence, but if it happens it means there is no way
        // the dictionary can contain the input word.
        return formsList;
    }

    // Try to find a partial match in the dictionary.
    MatchResult match = matcher.Match(matchResult, byteBuffer.Array, 0, byteBuffer.Remaining, rootNode);

    if (match.Kind != MatchResult.SequenceIsAPrefix)
    {
        /*
         * this case is somewhat confusing: we should have hit the separator
         * first... I don't really know how to deal with it at the time
         * being.
         */
        return formsList;
    }

    /*
     * The entire sequence exists in the dictionary. A separator should
     * be the next symbol.
     */
    int arc = fsa.GetArc(match.Node, separator);

    /*
     * The situation when the arc points to a final node should NEVER
     * happen. After all, we want the word to have SOME base form.
     */
    if (arc == 0 || fsa.IsArcFinal(arc))
    {
        return formsList;
    }

    // There is such a word in the dictionary. Return its base forms.
    int formsCount = 0;

    finalStatesIterator.RestartFrom(fsa.GetEndNode(arc));
    while (finalStatesIterator.MoveNext())
    {
        ByteBuffer bb = finalStatesIterator.Current;
        byte[] ba = bb.Array;
        int bbSize = bb.Remaining;

        if (formsCount >= forms.Length)
        {
            // Grow the reusable array and fill every empty slot with a fresh instance.
            Array.Resize(ref forms, forms.Length + ExpandSize);
            for (int k = 0; k < forms.Length; k++)
            {
                if (forms[k] == null)
                {
                    forms[k] = new WordData(decoder);
                }
            }
        }

        /*
         * Now, expand the prefix/ suffix 'compression' and store
         * the base form.
         */
        WordData wordData = forms[formsCount++];
        if (dictionaryMetadata.OutputConversionPairs.Any())
        {
            wordData.Update(byteBuffer, ApplyReplacements(word, dictionaryMetadata.OutputConversionPairs));
        }
        else
        {
            wordData.Update(byteBuffer, word);
        }

        /*
         * Find the separator byte's position splitting the inflection instructions
         * from the tag.
         */
        Debug.Assert(prefixBytes <= bbSize, sequenceEncoder.GetType() + " >? " + bbSize);
        int sepPos = prefixBytes;
        while (sepPos < bbSize && ba[sepPos] != separator)
        {
            sepPos++;
        }

        /*
         * Decode the stem into stem buffer.
         */
        wordData.stemBuffer = sequenceEncoder.Decode(
            wordData.stemBuffer,
            byteBuffer,
            ByteBuffer.Wrap(ba, 0, sepPos));

        // Skip separator character.
        sepPos++;

        /*
         * Decode the tag data. The tag buffer is only written when tag bytes are present.
         */
        int tagSize = bbSize - sepPos;
        if (tagSize > 0)
        {
            wordData.tagBuffer = BufferUtils.ClearAndEnsureCapacity(wordData.tagBuffer, tagSize);
            wordData.tagBuffer.Put(ba, sepPos, tagSize);
            wordData.tagBuffer.Flip();
        }
    }

    formsList.Wrap(forms, 0, formsCount);
    return formsList;
}