/// <summary>
/// Returns the character found at the current buffer position and moves that position
/// forward by one. Characters recognized as iteration marks are returned in normalized form.
/// </summary>
/// <returns>
/// The (possibly normalized) character that was read, or -1 when the buffer has no
/// character at the current position.
/// </returns>
public override int Read()
{
    int next = buffer.Get(bufferPosition);

    if (next == -1)
    {
        // End of input: release everything buffered before the current position
        // and hand the end marker straight back to the caller.
        buffer.FreeBefore(bufferPosition);
        return next;
    }

    char current = (char)next;

    // Surrogate code units are skipped: the iteration mark span end is moved
    // to just past the current position.
    if (char.IsSurrogate(current))
    {
        iterationMarkSpanEndPosition = bufferPosition + 1;
    }

    // A full stop lets the rolling buffer be released up to the current position,
    // and likewise moves the span end past it.
    if (current == FULL_STOP_PUNCTUATION)
    {
        buffer.FreeBefore(bufferPosition);
        iterationMarkSpanEndPosition = bufferPosition + 1;
    }

    // Iteration marks are replaced by their normalized character; everything else
    // passes through untouched. This runs before the position is advanced.
    char result = IsIterationMark(current) ? NormalizeIterationMark(current) : current;

    bufferPosition++;
    return result;
}
/// <summary>
/// Produces the next output character. Pending replacement text is drained first;
/// otherwise the longest mapping that matches at the current input offset is looked up
/// and queued as the new replacement, and if nothing matches the buffered input
/// character is passed through unchanged.
/// </summary>
/// <returns>
/// The next character, or -1 when the buffer returns -1 at the current input offset.
/// </returns>
public override int Read()
{
    for (;;)
    {
        // Hand out any replacement text that is still pending before touching more input.
        bool replacementPending = replacement != null && replacementPointer < replacement.Length;
        if (replacementPending)
        {
            return replacement.Chars[replacement.Offset + replacementPointer++];
        }

        // TODO: a more efficient approach would be Aho/Corasick's algorithm
        // (http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm)
        // or this generalization: www.cis.uni-muenchen.de/people/Schulz/Pub/dictle5.ps
        //
        // I think this would be (almost?) equivalent to 1) adding epsilon arcs from all
        // final nodes back to the init node in the FST, 2) adding a .* (skip any char)
        // loop on the initial node, and 3) determinizing that. Then we would not have to
        // restart matching at each position.

        int matchedLength = -1;
        CharsRef matchedOutput = null;

        int head = buffer.Get(inputOff);

        // LUCENENET fix: check that the dictionary contains the key before reading it.
        if (head != -1
            && cachedRootArcs.TryGetValue((char)head, out FST.Arc<CharsRef> arc)
            && arc != null)
        {
            if (!FST.TargetHasArcs(arc))
            {
                // Fast path for a single character match.
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(arc.IsFinal);
                }

                matchedLength = 1;
                matchedOutput = arc.Output;
            }
            else
            {
                CharsRef accumulated = arc.Output;

                // 'consumed' counts the input characters covered by the arcs followed so far.
                for (int consumed = 1; ; consumed++)
                {
                    if (arc.IsFinal)
                    {
                        // Match (target node is final). Remember it, but stay greedy and
                        // keep going in case a longer match exists.
                        matchedLength = consumed;
                        matchedOutput = outputs.Add(accumulated, arc.NextFinalOutput);
                    }

                    if (!FST.TargetHasArcs(arc))
                    {
                        break;
                    }

                    int following = buffer.Get(inputOff + consumed);
                    if (following == -1)
                    {
                        break;
                    }

                    arc = map.FindTargetArc(following, arc, scratchArc, fstReader);
                    if (arc == null)
                    {
                        // Dead end
                        break;
                    }

                    accumulated = outputs.Add(accumulated, arc.Output);
                }
            }
        }

        if (matchedOutput == null)
        {
            // Nothing matched here: pass the input character through, releasing the
            // buffer behind it once it has been consumed.
            int passthrough = buffer.Get(inputOff);
            if (passthrough != -1)
            {
                inputOff++;
                buffer.FreeBefore(inputOff);
            }

            return passthrough;
        }

        inputOff += matchedLength;

        int lengthDelta = matchedLength - matchedOutput.Length;
        if (lengthDelta != 0)
        {
            // Captured once, before any offset correction is recorded below.
            int priorCumulativeDiff = LastCumulativeDiff;

            if (lengthDelta > 0)
            {
                // Replacement is shorter than the matched input.
                AddOffCorrectMap(inputOff - lengthDelta - priorCumulativeDiff, priorCumulativeDiff + lengthDelta);
            }
            else
            {
                // Replacement is longer than the matched input: remap the "extra"
                // chars all back to the same input offset.
                int outputStart = inputOff - priorCumulativeDiff;
                int extraCount = -lengthDelta;
                for (int extra = 0; extra < extraCount; extra++)
                {
                    AddOffCorrectMap(outputStart + extra, priorCumulativeDiff - extra - 1);
                }
            }
        }

        // Queue the replacement; the next loop iteration starts returning it.
        replacement = matchedOutput;
        replacementPointer = 0;
    }
}
/// <summary>
/// Randomized check of <c>RollingCharBuffer</c>: for many random strings, interleaves
/// reads of the next character, re-reads of still-available characters, slice reads and
/// occasional <c>FreeBefore</c> calls, comparing every result against the source string.
/// </summary>
public virtual void Test()
{
    var iterations = AtLeast(1000);
    var buffer = new RollingCharBuffer();
    var random = Random();

    for (var iter = 0; iter < iterations; iter++)
    {
        // Half of the time use a short string, otherwise a potentially long one.
        var stringLen = random.NextBoolean() ? random.Next(50) : random.Next(20000);
        string s = stringLen == 0 ? "" : TestUtil.RandomUnicodeString(random, stringLen);

        if (VERBOSE)
        {
            Console.WriteLine("\nTEST: iter=" + iter + " s.length()=" + s.Length);
        }

        buffer.Reset(new StringReader(s));

        // nextRead:   index of the next character not yet requested from the buffer.
        // availCount: number of characters before nextRead that have not been freed.
        var nextRead = 0;
        var availCount = 0;

        while (nextRead < s.Length)
        {
            if (VERBOSE)
            {
                Console.WriteLine(" cycle nextRead=" + nextRead + " avail=" + availCount);
            }

            if (availCount == 0 || random.NextBoolean())
            {
                // Read next char
                if (VERBOSE)
                {
                    Console.WriteLine(" new char");
                }

                assertEquals(s[nextRead], buffer.Get(nextRead));
                nextRead++;
                availCount++;
            }
            else if (random.NextBoolean())
            {
                // Read previous char
                var pos = TestUtil.NextInt(random, nextRead - availCount, nextRead - 1);
                if (VERBOSE)
                {
                    Console.WriteLine(" old char pos=" + pos);
                }

                assertEquals(s[pos], buffer.Get(pos));
            }
            else
            {
                // Read slice
                int length = availCount == 1 ? 1 : TestUtil.NextInt(random, 1, availCount);

                // The slice starts at the oldest available character, shifted by a random
                // amount only when the slice does not already cover everything available.
                int start = nextRead - availCount;
                if (length != availCount)
                {
                    start += random.Next(availCount - length);
                }

                if (VERBOSE)
                {
                    Console.WriteLine(" slice start=" + start + " length=" + length);
                }

                assertEquals(s.Substring(start, length), new string(buffer.Get(start, length)));
            }

            // Occasionally free a random number of the oldest available characters.
            if (availCount > 0 && random.Next(20) == 17)
            {
                var toFree = random.Next(availCount);
                if (VERBOSE)
                {
                    Console.WriteLine(" free " + toFree + " (avail=" + (availCount - toFree) + ")");
                }

                buffer.FreeBefore(nextRead - (availCount - toFree));
                availCount -= toFree;
            }
        }
    }
}