/// <summary>
/// Encodes <paramref name="dstString"/> against <paramref name="srcString"/> with
/// <paramref name="coder"/>, decodes the result again and fails the test when the decoded
/// bytes do not equal the UTF-8 bytes of <paramref name="dstString"/>.
/// </summary>
/// <param name="coder">The sequence encoder under test.</param>
/// <param name="srcString">The source sequence; passed to both the encode and the decode call.</param>
/// <param name="dstString">The target sequence the round trip is expected to reproduce.</param>
private void assertRoundtripEncode(ISequenceEncoder coder, String srcString, String dstString)
{
    ByteBuffer sourceBytes = ByteBuffer.Wrap(Encoding.UTF8.GetBytes(srcString));
    ByteBuffer expectedBytes = ByteBuffer.Wrap(Encoding.UTF8.GetBytes(dstString));

    // Each call is handed a reusable buffer whose capacity comes from Random.Next(30).
    ByteBuffer encodeScratch = ByteBuffer.Allocate(Random.Next(30));
    ByteBuffer encodedBytes = coder.Encode(encodeScratch, sourceBytes, expectedBytes);

    ByteBuffer decodeScratch = ByteBuffer.Allocate(Random.Next(30));
    ByteBuffer decodedBytes = coder.Decode(decodeScratch, sourceBytes, encodedBytes);

    if (decodedBytes.Equals(expectedBytes))
    {
        return;
    }

    // Dump all four buffers so the failing round trip can be diagnosed from the output.
    var report = Console.Out;
    report.WriteLine("src: " + BufferUtils.ToString(sourceBytes, Encoding.UTF8));
    report.WriteLine("dst: " + BufferUtils.ToString(expectedBytes, Encoding.UTF8));
    report.WriteLine("enc: " + BufferUtils.ToString(encodedBytes, Encoding.UTF8));
    report.WriteLine("dec: " + BufferUtils.ToString(decodedBytes, Encoding.UTF8));
    fail("Mismatch.");
}
/// <summary>
/// Decodes the entry that <c>entriesIter</c> is currently positioned on into the
/// <c>entry</c> field and returns that field.
/// </summary>
/// <remarks>
/// Entries are expected to look like <c>inflected&lt;SEP&gt;codedBase&lt;SEP&gt;tag</c>.
/// The first separator is mandatory; the second separator and the tag are optional.
/// When <c>decodeStems</c> is <c>false</c> the coded base is copied to the stem buffer
/// verbatim instead of being decoded by the sequence encoder.
/// </remarks>
/// <returns>The <c>entry</c> field, updated with the word, stem and tag buffers.</returns>
/// <exception cref="System.IO.InvalidDataException">
/// The current entry does not contain a separator byte.
/// </exception>
private WordData Next()
{
    ByteBuffer entryBuffer = entriesIter.Current;

    /*
     * Entries are typically: inflected<SEP>codedBase<SEP>tag so try to find this split.
     */
    // NOTE(review): the backing array is indexed from 0, which presumes the buffer's
    // content starts at array index 0 - confirm against the entries iterator.
    byte[] ba = entryBuffer.Array;
    int bbSize = entryBuffer.Remaining;

    int sepPos;
    for (sepPos = 0; sepPos < bbSize; sepPos++)
    {
        if (ba[sepPos] == separator)
        {
            break;
        }
    }

    if (sepPos == bbSize)
    {
        // A specific exception type instead of the reserved System.Exception (CA2201);
        // it still derives from Exception, so existing catch blocks keep working.
        throw new System.IO.InvalidDataException("Invalid dictionary entry format (missing separator).");
    }

    // Everything before the first separator is the inflected form.
    inflectedBuffer = BufferUtils.ClearAndEnsureCapacity(inflectedBuffer, sepPos);
    inflectedBuffer.Put(ba, 0, sepPos);
    inflectedBuffer.Flip();

    inflectedCharBuffer = BufferUtils.BytesToChars(decoder, inflectedBuffer, inflectedCharBuffer);
    entry.Update(inflectedBuffer, inflectedCharBuffer);

    // Copy the remainder (after the separator) into the scratch buffer. The capacity is
    // requested before the separator is skipped, so it is one byte larger than needed.
    temp = BufferUtils.ClearAndEnsureCapacity(temp, bbSize - sepPos);
    sepPos++;
    temp.Put(ba, sepPos, bbSize - sepPos);
    temp.Flip();

    ba = temp.Array;
    bbSize = temp.Remaining;

    /*
     * Find the next separator byte's position splitting word form and tag.
     * The scan starts after the encoder's prefix bytes.
     */
#pragma warning disable 612, 618
    int prefixBytes = sequenceEncoder.PrefixBytes;
#pragma warning restore 612, 618
    Debug.Assert(prefixBytes <= bbSize, sequenceEncoder.GetType() + " >? " + bbSize);
    for (sepPos = prefixBytes; sepPos < bbSize; sepPos++)
    {
        if (ba[sepPos] == separator)
        {
            break;
        }
    }

    /*
     * Decode the stem into stem buffer.
     */
    if (decodeStems)
    {
        entry.stemBuffer = sequenceEncoder.Decode(entry.stemBuffer, inflectedBuffer, ByteBuffer.Wrap(ba, 0, sepPos));
    }
    else
    {
        entry.stemBuffer = BufferUtils.ClearAndEnsureCapacity(entry.stemBuffer, sepPos);
        entry.stemBuffer.Put(ba, 0, sepPos);
        entry.stemBuffer.Flip();
    }

    // Skip separator character, if present.
    if (sepPos + 1 <= bbSize)
    {
        sepPos++;
    }

    /*
     * Decode the tag data. When no second separator was found the tag is empty.
     */
    entry.tagBuffer = BufferUtils.ClearAndEnsureCapacity(entry.tagBuffer, bbSize - sepPos);
    entry.tagBuffer.Put(ba, sepPos, bbSize - sepPos);
    entry.tagBuffer.Flip();

    return entry;
}
/// <summary>
/// Looks up <paramref name="word"/> in the automaton. The word's encoded bytes must be
/// followed by a separator; each sequence found after it yields a stem (decoded by the
/// dictionary's sequence encoder) and, when present, tag data.
/// </summary>
/// <param name="word">The word to look up. Input conversion pairs, if any, are applied first.</param>
/// <returns>
/// The <c>formsList</c> field wrapped around the forms found; it has zero length when the
/// word contains the separator character, cannot be encoded, or is not in the dictionary.
/// </returns>
public IList<WordData> Lookup(ICharSequence word)
{
    byte separator = dictionaryMetadata.Separator;
#pragma warning disable 612, 618
    int prefixBytes = sequenceEncoder.PrefixBytes;
#pragma warning restore 612, 618

    if (dictionaryMetadata.InputConversionPairs.Any())
    {
        word = ApplyReplacements(word, dictionaryMetadata.InputConversionPairs);
    }

    // Start out with an empty result list.
    formsList.Wrap(forms, 0, 0);

    // Copy the word's characters so they can be converted to the FSA's byte encoding.
    charBuffer = BufferUtils.ClearAndEnsureCapacity(charBuffer, word.Length);
    for (int index = 0; index < word.Length; index++)
    {
        char current = word[index];
        if (current == separatorChar)
        {
            // No valid input can contain the separator.
            return formsList;
        }

        charBuffer.Put(current);
    }
    charBuffer.Flip();

    try
    {
        byteBuffer = BufferUtils.CharsToBytes(encoder, charBuffer, byteBuffer);
    }
    catch (UnmappableInputException)
    {
        // Should be rare: the word cannot be expressed in the dictionary's encoding,
        // so the dictionary cannot contain it.
        return formsList;
    }

    // Try to find a partial match in the dictionary.
    MatchResult match = matcher.Match(matchResult, byteBuffer.Array, 0, byteBuffer.Remaining, rootNode);
    if (match.Kind != MatchResult.SequenceIsAPrefix)
    {
        /*
         * This case is somewhat confusing: we should have hit the separator
         * first... There is no special handling for it at the moment.
         */
        return formsList;
    }

    /*
     * The entire sequence exists in the dictionary. A separator should
     * be the next symbol.
     */
    int separatorArc = fsa.GetArc(match.Node, separator);

    /*
     * An arc pointing to a final node should NEVER happen: the word is expected
     * to have SOME base form. Such an arc is treated like a missing one.
     */
    if (separatorArc == 0 || fsa.IsArcFinal(separatorArc))
    {
        return formsList;
    }

    // There is such a word in the dictionary. Collect its base forms.
    int found = 0;
    finalStatesIterator.RestartFrom(fsa.GetEndNode(separatorArc));
    while (finalStatesIterator.MoveNext())
    {
        ByteBuffer sequence = finalStatesIterator.Current;
        byte[] sequenceBytes = sequence.Array;
        int sequenceLength = sequence.Remaining;

        if (found >= forms.Length)
        {
            // Grow the reusable pool and fill every empty slot with a fresh instance.
            Array.Resize(ref forms, forms.Length + ExpandSize);
            for (int slot = 0; slot < forms.Length; slot++)
            {
                if (forms[slot] == null)
                {
                    forms[slot] = new WordData(decoder);
                }
            }
        }

        /*
         * Now, expand the prefix/suffix 'compression' and store the base form.
         */
        WordData result = forms[found];
        found++;

        if (dictionaryMetadata.OutputConversionPairs.Any())
        {
            result.Update(byteBuffer, ApplyReplacements(word, dictionaryMetadata.OutputConversionPairs));
        }
        else
        {
            result.Update(byteBuffer, word);
        }

        /*
         * Find the separator byte's position splitting the inflection instructions
         * from the tag. The scan starts after the encoder's prefix bytes.
         */
        Debug.Assert(prefixBytes <= sequenceLength, sequenceEncoder.GetType() + " >? " + sequenceLength);
        int stemEnd = prefixBytes;
        while (stemEnd < sequenceLength && sequenceBytes[stemEnd] != separator)
        {
            stemEnd++;
        }

        /*
         * Decode the stem into stem buffer.
         */
        result.stemBuffer = sequenceEncoder.Decode(result.stemBuffer, byteBuffer, ByteBuffer.Wrap(sequenceBytes, 0, stemEnd));

        /*
         * Decode the tag data, which starts right after the separator. The tag buffer
         * is only written when at least one tag byte exists.
         */
        int tagStart = stemEnd + 1;
        int tagLength = sequenceLength - tagStart;
        if (tagLength > 0)
        {
            result.tagBuffer = BufferUtils.ClearAndEnsureCapacity(result.tagBuffer, tagLength);
            result.tagBuffer.Put(sequenceBytes, tagStart, tagLength);
            result.tagBuffer.Flip();
        }
    }

    formsList.Wrap(forms, 0, found);
    return formsList;
}