Example #1
0
        /// <summary>
        /// Returns the next character of the filtered stream and moves the read position forward by one.
        /// Characters recognized by <c>IsIterationMark</c> are replaced with the result of
        /// <c>NormalizeIterationMark</c> before being returned.
        /// </summary>
        /// <returns>The next character as an <see cref="int"/>, or -1 once the buffer reports end of input.</returns>
        public override int Read()
        {
            int next = buffer.Get(bufferPosition);

            if (next == -1)
            {
                // End of input: release everything buffered before the current position.
                buffer.FreeBefore(bufferPosition);
                return next;
            }

            char current = (char)next;

            // A surrogate code unit (high or low) moves the iteration mark span end
            // to just past the current position.
            if (char.IsSurrogate(current))
            {
                iterationMarkSpanEndPosition = bufferPosition + 1;
            }

            // A full stop both frees the rolling buffer up to the current position
            // and moves the iteration mark span end past it.
            if (current == FULL_STOP_PUNCTUATION)
            {
                buffer.FreeBefore(bufferPosition);
                iterationMarkSpanEndPosition = bufferPosition + 1;
            }

            // Normalization must happen before the position is advanced.
            char result = IsIterationMark(current) ? NormalizeIterationMark(current) : current;

            bufferPosition++;
            return result;
        }
Example #2
0
        /// <summary>
        /// Reads the next character of the mapped output.
        /// <para/>
        /// Characters of a pending replacement are returned first. Otherwise the method looks for the
        /// longest match starting at the current input offset; on a match the input offset is advanced
        /// past the matched text, offset corrections are recorded when the replacement length differs
        /// from the match length, and the replacement becomes pending. When nothing matches, the
        /// input character is returned unchanged.
        /// </summary>
        /// <returns>The next output character, or -1 when the underlying buffer has no more input.</returns>
        public override int Read()
        {
            while (true)
            {
                // Drain any pending replacement before consuming more input.
                if (replacement != null && replacementPointer < replacement.Length)
                {
                    return replacement.Chars[replacement.Offset + replacementPointer++];
                }

                // TODO: a more efficient approach would be Aho/Corasick's
                // algorithm
                // (http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm)
                // or this generalization: www.cis.uni-muenchen.de/people/Schulz/Pub/dictle5.ps
                //
                // I think this would be (almost?) equivalent to 1) adding
                // epsilon arcs from all final nodes back to the init
                // node in the FST, 2) adding a .* (skip any char)
                // loop on the initial node, and 3) determinizing
                // that.  Then we would not have to restart matching
                // at each position.

                int lastMatchLen = -1;
                CharsRef lastMatch = null;

                int firstCH = buffer.Get(inputOff);
                if (firstCH != -1)
                {
                    // LUCENENET fix: Check the dictionary to ensure it contains a key before reading it.
                    char key = (char)firstCH;
                    if (cachedRootArcs.TryGetValue(key, out FST.Arc<CharsRef> arc) && arc != null)
                    {
                        if (!FST.TargetHasArcs(arc))
                        {
                            // Fast pass for single character match:
                            if (Debugging.AssertsEnabled)
                            {
                                Debugging.Assert(arc.IsFinal);
                            }
                            lastMatchLen = 1;
                            lastMatch = arc.Output;
                        }
                        else
                        {
                            int lookahead = 0;
                            CharsRef output = arc.Output;
                            while (true)
                            {
                                lookahead++;

                                if (arc.IsFinal)
                                {
                                    // Match! (to node is final)
                                    lastMatchLen = lookahead;
                                    lastMatch = outputs.Add(output, arc.NextFinalOutput);
                                    // Greedy: keep searching to see if there's a
                                    // longer match...
                                }

                                if (!FST.TargetHasArcs(arc))
                                {
                                    break;
                                }

                                int ch = buffer.Get(inputOff + lookahead);
                                if (ch == -1)
                                {
                                    // Ran out of input while looking ahead
                                    break;
                                }
                                if ((arc = map.FindTargetArc(ch, arc, scratchArc, fstReader)) == null)
                                {
                                    // Dead end
                                    break;
                                }
                                output = outputs.Add(output, arc.Output);
                            }
                        }
                    }
                }

                if (lastMatch != null)
                {
                    inputOff += lastMatchLen;
                    int diff = lastMatchLen - lastMatch.Length;

                    if (diff != 0)
                    {
                        int prevCumulativeDiff = LastCumulativeDiff;
                        if (diff > 0)
                        {
                            // Replacement is shorter than matched input:
                            AddOffCorrectMap(inputOff - diff - prevCumulativeDiff, prevCumulativeDiff + diff);
                        }
                        else
                        {
                            // Replacement is longer than matched input: remap
                            // the "extra" chars all back to the same input
                            // offset:
                            int outputStart = inputOff - prevCumulativeDiff;
                            for (int extraIDX = 0; extraIDX < -diff; extraIDX++)
                            {
                                AddOffCorrectMap(outputStart + extraIDX, prevCumulativeDiff - extraIDX - 1);
                            }
                        }
                    }

                    // Loop around so the top of the loop starts emitting the replacement
                    // (an empty replacement simply falls through to the next match attempt).
                    replacement = lastMatch;
                    replacementPointer = 0;
                }
                else
                {
                    // No match: pass the input character through (or -1 at end of input).
                    int ret = buffer.Get(inputOff);
                    if (ret != -1)
                    {
                        inputOff++;
                        buffer.FreeBefore(inputOff);
                    }
                    return ret;
                }
            }
        }
Example #3
0
        /// <summary>
        /// Randomized test of <c>RollingCharBuffer</c>. For each iteration a random string is fed to the
        /// buffer, then single characters and slices are read back at random and compared with the
        /// source string, with random calls to <c>FreeBefore</c> shrinking the readable window.
        /// </summary>
        public virtual void Test()
        {
            var iterations = AtLeast(1000);

            var buffer = new RollingCharBuffer();

            var random = Random();

            for (var round = 0; round < iterations; round++)
            {
                var stringLen = random.NextBoolean() ? random.Next(50) : random.Next(20000);

                string s = stringLen == 0
                    ? ""
                    : TestUtil.RandomUnicodeString(random, stringLen);

                if (VERBOSE)
                {
                    Console.WriteLine("\nTEST: iter=" + round + " s.length()=" + s.Length);
                }

                buffer.Reset(new StringReader(s));

                // readPos: index of the next not-yet-read char.
                // available: number of already-read chars that have not been freed.
                var readPos = 0;
                var available = 0;
                while (readPos < s.Length)
                {
                    if (VERBOSE)
                    {
                        Console.WriteLine("  cycle nextRead=" + readPos + " avail=" + available);
                    }

                    if (available == 0 || random.NextBoolean())
                    {
                        // Advance: read a char that has not been read before
                        if (VERBOSE)
                        {
                            Console.WriteLine("    new char");
                        }
                        assertEquals(s[readPos], buffer.Get(readPos));
                        readPos++;
                        available++;
                    }
                    else if (random.NextBoolean())
                    {
                        // Re-read a single char from the still-available window
                        var pos = TestUtil.NextInt(random, readPos - available, readPos - 1);
                        if (VERBOSE)
                        {
                            Console.WriteLine("    old char pos=" + pos);
                        }
                        assertEquals(s[pos], buffer.Get(pos));
                    }
                    else
                    {
                        // Re-read a slice from the still-available window
                        int windowStart = readPos - available;
                        int length = available == 1 ? 1 : TestUtil.NextInt(random, 1, available);
                        int start = length == available
                            ? windowStart
                            : windowStart + random.Next(available - length);
                        if (VERBOSE)
                        {
                            Console.WriteLine("    slice start=" + start + " length=" + length);
                        }
                        assertEquals(s.Substring(start, length), new string(buffer.Get(start, length)));
                    }

                    // Occasionally release a random-sized prefix of the available window
                    if (available > 0 && random.Next(20) == 17)
                    {
                        var toFree = random.Next(available);
                        var remaining = available - toFree;
                        if (VERBOSE)
                        {
                            Console.WriteLine("    free " + toFree + " (avail=" + remaining + ")");
                        }
                        buffer.FreeBefore(readPos - remaining);
                        available = remaining;
                    }
                }
            }
        }