/// <summary>
/// Pulls the next token from <c>m_input</c> and replaces its term text with the
/// result of <c>CyrillicLatinConverter.cir2lat</c> applied to the original text.
/// </summary>
/// <returns>
/// <c>true</c> if <c>m_input</c> produced a token (its term has been rewritten);
/// <c>false</c> if <c>m_input</c> had no further token.
/// </returns>
public sealed override bool IncrementToken()
{
    // Nothing to convert once the upstream has run dry.
    if (!m_input.IncrementToken())
    {
        return false;
    }

    // Snapshot the term before clearing it, then write the converted form back.
    string original = termAttribute.ToString();
    termAttribute.SetEmpty();
    termAttribute.Append(CyrillicLatinConverter.cir2lat(original));
    return true;
}
/// <summary>
/// Emits the next edge n-gram of the input. The first call reads and trims the
/// input; each successful call produces a gram of the current size (starting at
/// <c>minGram</c>) taken from the front or the back of the text, then grows the
/// size by one.
/// </summary>
/// <returns>
/// <c>true</c> if a gram was written to the attributes; <c>false</c> when the
/// trimmed input is empty, or the gram size exceeds <c>maxGram</c> or the input length.
/// </returns>
public override bool IncrementToken()
{
    ClearAttributes();

    if (started)
    {
        // Every gram after the first gets a position increment of 0.
        posIncrAtt.PositionIncrement = 0;
    }
    else
    {
        // First call: read the input up front.
        started = true;
        gramSize = minGram;

        // Front grams need at most maxGram chars; otherwise read up to 1024.
        int limit = (side == Side.FRONT) ? maxGram : 1024;
        char[] buffer = new char[Math.Min(1024, limit)];
        charsRead = 0;

        // TODO: refactor to a shared readFully somewhere:
        bool exhausted = false;
        while (charsRead < limit)
        {
            int got = input.Read(buffer, charsRead, buffer.Length - charsRead);
            if (got <= 0)
            {
                exhausted = true;
                break;
            }

            charsRead += got;
            if (charsRead == buffer.Length && charsRead < limit)
            {
                buffer = ArrayUtil.Grow(buffer);
            }
        }

        inStr = new string(buffer, 0, charsRead).Trim();

        if (!exhausted)
        {
            // Keep consuming (and counting) the rest of the input so that
            // charsRead reflects the full input length for end-offset reporting.
            char[] discard = new char[1024];
            int drained;
            while ((drained = input.Read(discard, 0, discard.Length)) > 0)
            {
                charsRead += drained;
            }
        }

        inLen = inStr.Length;
        if (inLen == 0)
        {
            return false;
        }

        posIncrAtt.PositionIncrement = 1;
    }

    // Stop when the gram no longer fits in the input or the size range is used up.
    if (gramSize > inLen || gramSize > maxGram)
    {
        return false;
    }

    // Take gramSize chars from the front or from the back.
    int start = (side == Side.FRONT) ? 0 : inLen - gramSize;
    int end = start + gramSize;
    termAtt.SetEmpty().Append(inStr, start, end);
    offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(end));
    gramSize++;
    return true;
}
/// <summary>
/// Emits the next n-gram of the input. The first call reads up to 1024 chars of
/// input and trims them; grams are then produced position by position for the
/// current gram size, and the size is increased (starting at <c>minGram</c>)
/// each time the end of the text is reached.
/// </summary>
/// <returns>
/// <c>true</c> if a gram was written to the attributes; <c>false</c> when the
/// trimmed input is empty, the gram size exceeds <c>maxGram</c>, or the gram
/// size exceeds the input length.
/// </returns>
public override bool IncrementToken()
{
    ClearAttributes();
    if (!started)
    {
        started = true;
        gramSize = minGram;
        char[] chars = new char[1024];
        charsRead = 0;
        // TODO: refactor to a shared readFully somewhere:
        while (charsRead < chars.Length)
        {
            int inc = input.Read(chars, charsRead, chars.Length - charsRead);
            // .NET's TextReader.Read returns 0 (not -1) at end of input, so any
            // non-positive count must end the loop; testing only for -1 would
            // spin forever on input shorter than the buffer.
            if (inc <= 0)
            {
                break;
            }
            charsRead += inc;
        }
        inStr = new string(chars, 0, charsRead).Trim(); // remove any leading/trailing whitespace

        if (charsRead == chars.Length)
        {
            // The buffer filled up, so there may be more input. Read extra
            // throwaway chars so that charsRead reflects the full input length
            // and the correct end offset can be reported:
            var throwaway = new char[1024];
            while (true)
            {
                int inc = input.Read(throwaway, 0, throwaway.Length);
                // Same end-of-input convention as above: 0 (or less) means done.
                if (inc <= 0)
                {
                    break;
                }
                charsRead += inc;
            }
        }

        inLen = inStr.Length;
        if (inLen == 0)
        {
            return false;
        }
    }

    if (pos + gramSize > inLen) // if we hit the end of the string
    {
        pos = 0; // reset to beginning of string
        gramSize++; // increase n-gram size
        if (gramSize > maxGram) // we are done
        {
            return false;
        }
        if (pos + gramSize > inLen) // the larger gram no longer fits in the input
        {
            return false;
        }
    }

    int oldPos = pos;
    pos++;
    termAtt.SetEmpty().Append(inStr, oldPos, oldPos + gramSize);
    offsetAtt.SetOffset(CorrectOffset(oldPos), CorrectOffset(oldPos + gramSize));
    return true;
}