예제 #1
0
            /// <summary>
            /// Returns the value mapped to the given key or <code>null</code> if the key is not in the FST dictionary.
            /// </summary>
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: public org.apache.lucene.util.BytesRef get(char[] buffer, int bufferLen, org.apache.lucene.util.fst.FST.Arc<org.apache.lucene.util.BytesRef> scratchArc, org.apache.lucene.util.fst.FST.BytesReader fstReader) throws java.io.IOException
            public BytesRef get(char[] buffer, int bufferLen, FST.Arc <BytesRef> scratchArc, FST.BytesReader fstReader)
            {
                BytesRef pendingOutput = fst.outputs.NoOutput;
                BytesRef matchOutput   = null;
                int      bufUpto       = 0;

                fst.getFirstArc(scratchArc);
                while (bufUpto < bufferLen)
                {
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen);
                    int codePoint = char.codePointAt(buffer, bufUpto, bufferLen);
                    if (fst.findTargetArc(ignoreCase ? char.ToLower(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null)
                    {
                        return(null);
                    }
                    pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
                    bufUpto      += char.charCount(codePoint);
                }
                if (scratchArc.Final)
                {
                    matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
                }
                return(matchOutput);
            }
예제 #2
0
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: private void parse() throws java.io.IOException
        private void parse()
        {
            //System.out.println("\nS: parse");

            Debug.Assert(inputSkipCount == 0);

            int curNextRead = nextRead;

            // Holds the longest match we've seen so far:
            BytesRef matchOutput      = null;
            int      matchInputLength = 0;
            int      matchEndOffset   = -1;

            BytesRef pendingOutput = fst.outputs.NoOutput;

            fst.getFirstArc(scratchArc);

            Debug.Assert(scratchArc.output == fst.outputs.NoOutput);

            int tokenCount = 0;

            while (true)
            {
                // Pull next token's chars:
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final char[] buffer;
                char[] buffer;
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final int bufferLen;
                int bufferLen;
                //System.out.println("  cycle nextRead=" + curNextRead + " nextWrite=" + nextWrite);

                int inputEndOffset = 0;

                if (curNextRead == nextWrite)
                {
                    // We used up our lookahead buffer of input tokens
                    // -- pull next real input token:

                    if (finished)
                    {
                        break;
                    }
                    else
                    {
                        //System.out.println("  input.incrToken");
                        Debug.Assert(futureInputs[nextWrite].consumed);
                        // Not correct: a syn match whose output is longer
                        // than its input can set future inputs keepOrig
                        // to true:
                        //assert !futureInputs[nextWrite].keepOrig;
                        if (input.incrementToken())
                        {
                            buffer    = termAtt.buffer();
                            bufferLen = termAtt.length();
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final PendingInput input = futureInputs[nextWrite];
                            PendingInput input = futureInputs[nextWrite];
                            lastStartOffset = input.startOffset = offsetAtt.startOffset();
                            lastEndOffset   = input.endOffset = offsetAtt.endOffset();
                            inputEndOffset  = input.endOffset;
                            //System.out.println("  new token=" + new String(buffer, 0, bufferLen));
                            if (nextRead != nextWrite)
                            {
                                capture();
                            }
                            else
                            {
                                input.consumed = false;
                            }
                        }
                        else
                        {
                            // No more input tokens
                            //System.out.println("      set end");
                            finished = true;
                            break;
                        }
                    }
                }
                else
                {
                    // Still in our lookahead
                    buffer         = futureInputs[curNextRead].term.chars;
                    bufferLen      = futureInputs[curNextRead].term.length;
                    inputEndOffset = futureInputs[curNextRead].endOffset;
                    //System.out.println("  old token=" + new String(buffer, 0, bufferLen));
                }

                tokenCount++;

                // Run each char in this token through the FST:
                int bufUpto = 0;
                while (bufUpto < bufferLen)
                {
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen);
                    int codePoint = char.codePointAt(buffer, bufUpto, bufferLen);
                    if (fst.findTargetArc(ignoreCase ? char.ToLower(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null)
                    {
                        //System.out.println("    stop");
                        goto byTokenBreak;
                    }

                    // Accum the output
                    pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
                    //System.out.println("    char=" + buffer[bufUpto] + " output=" + pendingOutput + " arc.output=" + scratchArc.output);
                    bufUpto += char.charCount(codePoint);
                }

                // OK, entire token matched; now see if this is a final
                // state:
                if (scratchArc.Final)
                {
                    matchOutput      = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
                    matchInputLength = tokenCount;
                    matchEndOffset   = inputEndOffset;
                    //System.out.println("  found matchLength=" + matchInputLength + " output=" + matchOutput);
                }

                // See if the FST wants to continue matching (ie, needs to
                // see the next input token):
                if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) == null)
                {
                    // No further rules can match here; we're done
                    // searching for matching rules starting at the
                    // current input position.
                    break;
                }
                else
                {
                    // More matching is possible -- accum the output (if
                    // any) of the WORD_SEP arc:
                    pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
                    if (nextRead == nextWrite)
                    {
                        capture();
                    }
                }

                curNextRead = rollIncr(curNextRead);
                byTokenContinue :;
            }
            byTokenBreak :

            if (nextRead == nextWrite && !finished)
            {
                //System.out.println("  skip write slot=" + nextWrite);
                nextWrite = rollIncr(nextWrite);
            }

            if (matchOutput != null)
            {
                //System.out.println("  add matchLength=" + matchInputLength + " output=" + matchOutput);
                inputSkipCount = matchInputLength;
                addOutput(matchOutput, matchInputLength, matchEndOffset);
            }
            else if (nextRead != nextWrite)
            {
                // Even though we had no match here, we set to 1
                // because we need to skip current input token before
                // trying to match again:
                inputSkipCount = 1;
            }
            else
            {
                Debug.Assert(finished);
            }

            //System.out.println("  parse done inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);
        }
예제 #3
0
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: @Override public int read() throws java.io.IOException
        public override int read()
        {
            //System.out.println("\nread");
            while (true)
            {
                if (replacement != null && replacementPointer < replacement.length)
                {
                    //System.out.println("  return repl[" + replacementPointer + "]=" + replacement.chars[replacement.offset + replacementPointer]);
                    return(replacement.chars[replacement.offset + replacementPointer++]);
                }

                // TODO: a more efficient approach would be Aho/Corasick's
                // algorithm
                // (http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm)
                // or this generalizatio: www.cis.uni-muenchen.de/people/Schulz/Pub/dictle5.ps
                //
                // I think this would be (almost?) equivalent to 1) adding
                // epsilon arcs from all final nodes back to the init
                // node in the FST, 2) adding a .* (skip any char)
                // loop on the initial node, and 3) determinizing
                // that.  Then we would not have to restart matching
                // at each position.

                int      lastMatchLen = -1;
                CharsRef lastMatch    = null;

//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final int firstCH = buffer.get(inputOff);
                int firstCH = buffer.get(inputOff);
                if (firstCH != -1)
                {
                    FST.Arc <CharsRef> arc = cachedRootArcs[Convert.ToChar((char)firstCH)];
                    if (arc != null)
                    {
                        if (!FST.targetHasArcs(arc))
                        {
                            // Fast pass for single character match:
                            Debug.Assert(arc.Final);
                            lastMatchLen = 1;
                            lastMatch    = arc.output;
                        }
                        else
                        {
                            int      lookahead = 0;
                            CharsRef output    = arc.output;
                            while (true)
                            {
                                lookahead++;

                                if (arc.Final)
                                {
                                    // Match! (to node is final)
                                    lastMatchLen = lookahead;
                                    lastMatch    = outputs.add(output, arc.nextFinalOutput);
                                    // Greedy: keep searching to see if there's a
                                    // longer match...
                                }

                                if (!FST.targetHasArcs(arc))
                                {
                                    break;
                                }

                                int ch = buffer.get(inputOff + lookahead);
                                if (ch == -1)
                                {
                                    break;
                                }
                                if ((arc = map.findTargetArc(ch, arc, scratchArc, fstReader)) == null)
                                {
                                    // Dead end
                                    break;
                                }
                                output = outputs.add(output, arc.output);
                            }
                        }
                    }
                }

                if (lastMatch != null)
                {
                    inputOff += lastMatchLen;
                    //System.out.println("  match!  len=" + lastMatchLen + " repl=" + lastMatch);

//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final int diff = lastMatchLen - lastMatch.length;
                    int diff = lastMatchLen - lastMatch.length;

                    if (diff != 0)
                    {
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final int prevCumulativeDiff = getLastCumulativeDiff();
                        int prevCumulativeDiff = LastCumulativeDiff;
                        if (diff > 0)
                        {
                            // Replacement is shorter than matched input:
                            addOffCorrectMap(inputOff - diff - prevCumulativeDiff, prevCumulativeDiff + diff);
                        }
                        else
                        {
                            // Replacement is longer than matched input: remap
                            // the "extra" chars all back to the same input
                            // offset:
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final int outputStart = inputOff - prevCumulativeDiff;
                            int outputStart = inputOff - prevCumulativeDiff;
                            for (int extraIDX = 0; extraIDX < -diff; extraIDX++)
                            {
                                addOffCorrectMap(outputStart + extraIDX, prevCumulativeDiff - extraIDX - 1);
                            }
                        }
                    }

                    replacement        = lastMatch;
                    replacementPointer = 0;
                }
                else
                {
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final int ret = buffer.get(inputOff);
                    int ret = buffer.get(inputOff);
                    if (ret != -1)
                    {
                        inputOff++;
                        buffer.freeBefore(inputOff);
                    }
                    return(ret);
                }
            }
        }