Example #1
        /// <param name="input"> input tokenstream </param>
        /// <param name="synonyms"> synonym map </param>
        /// <param name="ignoreCase"> case-folds input for matching with <see cref="Character.ToLower(int, CultureInfo)"/>
        ///                   using <see cref="CultureInfo.InvariantCulture"/>.
        ///                   Note, if you set this to <c>true</c>, it's your responsibility to lowercase
        ///                   the input entries when you create the <see cref="SynonymMap"/>.</param>
        public SynonymFilter(TokenStream input, SynonymMap synonyms, bool ignoreCase)
            : base(input)
        {
            termAtt    = AddAttribute<ICharTermAttribute>();
            posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            posLenAtt  = AddAttribute<IPositionLengthAttribute>();
            typeAtt    = AddAttribute<ITypeAttribute>();
            offsetAtt  = AddAttribute<IOffsetAttribute>();

            this.synonyms   = synonyms;
            this.ignoreCase = ignoreCase;
            this.fst        = synonyms.Fst;
            if (fst == null)
            {
                throw new ArgumentException("fst must be non-null");
            }
            this.fstReader = fst.GetBytesReader();

            // Must be 1+ so that when roll buffer is at full
            // lookahead we can distinguish this full buffer from
            // the empty buffer:
            rollBufferSize = 1 + synonyms.MaxHorizontalContext;

            futureInputs  = new PendingInput[rollBufferSize];
            futureOutputs = new PendingOutputs[rollBufferSize];
            for (int pos = 0; pos < rollBufferSize; pos++)
            {
                futureInputs[pos]  = new PendingInput();
                futureOutputs[pos] = new PendingOutputs();
            }

            //System.out.println("FSTFilt maxH=" + synonyms.maxHorizontalContext);

            scratchArc = new FST.Arc<BytesRef>();
        }
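
A minimal construction-and-use sketch, not part of the original snippets. It assumes the Lucene.NET 4.8 analysis APIs (SynonymMap.Builder, SynonymMap.Builder.Join, WhitespaceTokenizer, LuceneVersion.LUCENE_48); the rule text and tokenizer choice are illustrative only. The map entries are lowercased up front because the filter is constructed with ignoreCase: true, as the doc comment above requires.

// Sketch only: builds a one-rule SynonymMap and runs a SynonymFilter over it.
// Assumes Lucene.NET 4.8; rule text and input string are made up for illustration.
using System;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.Synonym;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Util;

// Entries are added lowercased because the filter below uses ignoreCase: true.
var builder = new SynonymMap.Builder(true); // true = de-duplicate identical rules
builder.Add(
    new CharsRef("dns"),
    SynonymMap.Builder.Join(new[] { "domain", "name", "system" }, new CharsRef()),
    true); // keep the original token alongside the injected synonym

SynonymMap map = builder.Build();

// Wrap any TokenStream; a whitespace tokenizer over a literal string keeps the sketch short.
TokenStream ts = new WhitespaceTokenizer(LuceneVersion.LUCENE_48, new StringReader("DNS lookup"));
ts = new SynonymFilter(ts, map, ignoreCase: true);

var termAtt = ts.AddAttribute<ICharTermAttribute>();
ts.Reset();
while (ts.IncrementToken())
{
    Console.WriteLine(termAtt.ToString());
}
ts.End();
ts.Dispose();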
Example #2
        /// <param name="input"> input tokenstream </param>
        /// <param name="synonyms"> synonym map </param>
        /// <param name="ignoreCase"> case-folds input for matching with <seealso cref="Character.ToLowerCase(int)"/>.
        ///                   Note, if you set this to <c>true</c>, it's your responsibility to lowercase
        ///                   the input entries when you create the <seealso cref="SynonymMap"/> </param>
        public SynonymFilter(TokenStream input, SynonymMap synonyms, bool ignoreCase) : base(input)
        {
            this.synonyms   = synonyms;
            this.ignoreCase = ignoreCase;
            this.fst        = synonyms.fst;
            if (fst == null)
            {
                throw new System.ArgumentException("fst must be non-null");
            }
            this.fstReader = fst.BytesReader;

            // Must be 1+ so that when roll buffer is at full
            // lookahead we can distinguish this full buffer from
            // the empty buffer:
            rollBufferSize = 1 + synonyms.maxHorizontalContext;

            futureInputs  = new PendingInput[rollBufferSize];
            futureOutputs = new PendingOutputs[rollBufferSize];
            for (int pos = 0; pos < rollBufferSize; pos++)
            {
                futureInputs[pos]  = new PendingInput();
                futureOutputs[pos] = new PendingOutputs();
            }

            //System.out.println("FSTFilt maxH=" + synonyms.maxHorizontalContext);

            scratchArc = new FST.Arc<BytesRef>();
        }
Example #3
        private void Capture()
        {
            captureCount++;
            //System.out.println("  capture slot=" + nextWrite);
            PendingInput input = futureInputs[nextWrite];

            input.state    = CaptureState();
            input.consumed = false;
            input.term.CopyChars(termAtt.Buffer, 0, termAtt.Length);

            nextWrite = RollIncr(nextWrite);

            // Buffer head should never catch up to tail:
            Debug.Assert(nextWrite != nextRead);
        }
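
Capture() advances nextWrite with RollIncr, which is not included in these snippets. A sketch of that helper, consistent with how the roll buffer is sized and used above (treat the exact body as an assumption):

        // Sketch: modular increment over the roll buffer. Not copied from the original
        // source; any implementation must wrap at rollBufferSize so nextRead/nextWrite
        // chase each other around a circular buffer without colliding.
        private int RollIncr(int count)
        {
            count++;
            if (count == rollBufferSize)
            {
                return 0;
            }
            return count;
        }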
Example #4
        private void capture()
        {
            captureCount++;
            //System.out.println("  capture slot=" + nextWrite);
            PendingInput input = futureInputs[nextWrite];

            input.state    = CaptureState();
            input.consumed = false;
            input.term.CopyChars(termAtt.Buffer, 0, termAtt.Length);

            nextWrite = rollIncr(nextWrite);

            // Buffer head should never catch up to tail:
            Debug.Assert(nextWrite != nextRead);
        }
Example #6
        public override bool IncrementToken()
        {
            //System.out.println("\nS: incrToken inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);

            while (true)
            {
                // First play back any buffered future inputs/outputs
                // w/o running parsing again:
                while (inputSkipCount != 0)
                {
                    // At each position, we first output the original
                    // token

                    // TODO: maybe just a PendingState class, holding
                    // both input & outputs?
                    PendingInput   input   = futureInputs[nextRead];
                    PendingOutputs outputs = futureOutputs[nextRead];

                    //System.out.println("  cycle nextRead=" + nextRead + " nextWrite=" + nextWrite + " inputSkipCount="+ inputSkipCount + " input.keepOrig=" + input.keepOrig + " input.consumed=" + input.consumed + " input.state=" + input.state);

                    if (!input.consumed && (input.keepOrig || !input.matched))
                    {
                        if (input.state != null)
                        {
                            // Return a previously saved token (because we
                            // had to lookahead):
                            RestoreState(input.state);
                        }
                        else
                        {
                            // Pass-through case: return token we just pulled
                            // but didn't capture:
                            if (Debugging.AssertsEnabled)
                            {
                                Debugging.Assert(inputSkipCount == 1, "inputSkipCount={0} nextRead={1}", inputSkipCount, nextRead);
                            }
                        }
                        input.Reset();
                        if (outputs.count > 0)
                        {
                            outputs.posIncr = 0;
                        }
                        else
                        {
                            nextRead = RollIncr(nextRead);
                            inputSkipCount--;
                        }
                        //System.out.println("  return token=" + termAtt.toString());
                        return true;
                    }
                    else if (outputs.upto < outputs.count)
                    {
                        // Still have pending outputs to replay at this
                        // position
                        input.Reset();
                        int      posIncr = outputs.posIncr;
                        CharsRef output  = outputs.PullNext();
                        ClearAttributes();
                        termAtt.CopyBuffer(output.Chars, output.Offset, output.Length);
                        typeAtt.Type = TYPE_SYNONYM;
                        int endOffset = outputs.LastEndOffset;
                        if (endOffset == -1)
                        {
                            endOffset = input.endOffset;
                        }
                        offsetAtt.SetOffset(input.startOffset, endOffset);
                        posIncrAtt.PositionIncrement = posIncr;
                        posLenAtt.PositionLength     = outputs.LastPosLength;
                        if (outputs.count == 0)
                        {
                            // Done with the buffered input and all outputs at
                            // this position
                            nextRead = RollIncr(nextRead);
                            inputSkipCount--;
                        }
                        //System.out.println("  return token=" + termAtt.toString());
                        return true;
                    }
                    else
                    {
                        // Done with the buffered input and all outputs at
                        // this position
                        input.Reset();
                        nextRead = RollIncr(nextRead);
                        inputSkipCount--;
                    }
                }

                if (finished && nextRead == nextWrite)
                {
                    // End case: if any output syns went beyond end of
                    // input stream, enumerate them now:
                    PendingOutputs outputs = futureOutputs[nextRead];
                    if (outputs.upto < outputs.count)
                    {
                        int      posIncr = outputs.posIncr;
                        CharsRef output  = outputs.PullNext();
                        futureInputs[nextRead].Reset();
                        if (outputs.count == 0)
                        {
                            nextWrite = nextRead = RollIncr(nextRead);
                        }
                        ClearAttributes();
                        // Keep offset from last input token:
                        offsetAtt.SetOffset(lastStartOffset, lastEndOffset);
                        termAtt.CopyBuffer(output.Chars, output.Offset, output.Length);
                        typeAtt.Type = TYPE_SYNONYM;
                        //System.out.println("  set posIncr=" + outputs.posIncr + " outputs=" + outputs);
                        posIncrAtt.PositionIncrement = posIncr;
                        //System.out.println("  return token=" + termAtt.toString());
                        return true;
                    }
                    else
                    {
                        return false;
                    }
                }

                // Find new synonym matches:
                Parse();
            }
        }
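
IncrementToken() surfaces everything through the attributes wired up in the constructor. Below is a variant of the consumption loop from the sketch after Example #1 (same 'ts', assuming the same Lucene.NET 4.8 attribute interfaces), also reading the type, position increment, and position length that IncrementToken() populates:

// Variant of the consumption loop from the sketch after Example #1: same 'ts',
// but also inspecting the type/position attributes set by IncrementToken().
var termAtt   = ts.AddAttribute<ICharTermAttribute>();
var typeAtt   = ts.AddAttribute<ITypeAttribute>();
var posIncAtt = ts.AddAttribute<IPositionIncrementAttribute>();
var posLenAtt = ts.AddAttribute<IPositionLengthAttribute>();

ts.Reset();
while (ts.IncrementToken())
{
    // Injected tokens carry TYPE_SYNONYM ("SYNONYM") in typeAtt.Type and typically
    // stack on the original token with PositionIncrement == 0.
    Console.WriteLine($"{termAtt} type={typeAtt.Type} " +
                      $"posIncr={posIncAtt.PositionIncrement} posLen={posLenAtt.PositionLength}");
}
ts.End();
ts.Dispose();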
Example #7
        private void Parse()
        {
            //System.out.println("\nS: parse");

            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(inputSkipCount == 0);
            }

            int curNextRead = nextRead;

            // Holds the longest match we've seen so far:
            BytesRef matchOutput      = null;
            int      matchInputLength = 0;
            int      matchEndOffset   = -1;

            BytesRef pendingOutput = fst.Outputs.NoOutput;

            fst.GetFirstArc(scratchArc);

            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(scratchArc.Output == fst.Outputs.NoOutput);
            }

            int tokenCount = 0;

            while (true)
            {
                // Pull next token's chars:
                char[] buffer;
                int    bufferLen;
                //System.out.println("  cycle nextRead=" + curNextRead + " nextWrite=" + nextWrite);

                int inputEndOffset = 0;

                if (curNextRead == nextWrite)
                {
                    // We used up our lookahead buffer of input tokens
                    // -- pull next real input token:

                    if (finished)
                    {
                        break;
                    }
                    else
                    {
                        //System.out.println("  input.incrToken");
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(futureInputs[nextWrite].consumed);
                        }
                        // Not correct: a syn match whose output is longer
                        // than its input can set future inputs keepOrig
                        // to true:
                        //assert !futureInputs[nextWrite].keepOrig;
                        if (m_input.IncrementToken())
                        {
                            buffer    = termAtt.Buffer;
                            bufferLen = termAtt.Length;
                            PendingInput pendingInput = futureInputs[nextWrite];
                            lastStartOffset = pendingInput.startOffset = offsetAtt.StartOffset;
                            lastEndOffset   = pendingInput.endOffset = offsetAtt.EndOffset;
                            inputEndOffset  = pendingInput.endOffset;
                            //System.out.println("  new token=" + new String(buffer, 0, bufferLen));
                            if (nextRead != nextWrite)
                            {
                                Capture();
                            }
                            else
                            {
                                pendingInput.consumed = false;
                            }
                        }
                        else
                        {
                            // No more input tokens
                            //System.out.println("      set end");
                            finished = true;
                            break;
                        }
                    }
                }
                else
                {
                    // Still in our lookahead
                    buffer         = futureInputs[curNextRead].term.Chars;
                    bufferLen      = futureInputs[curNextRead].term.Length;
                    inputEndOffset = futureInputs[curNextRead].endOffset;
                    //System.out.println("  old token=" + new String(buffer, 0, bufferLen));
                }

                tokenCount++;

                // Run each char in this token through the FST:
                int bufUpto = 0;
                while (bufUpto < bufferLen)
                {
                    int codePoint = Character.CodePointAt(buffer, bufUpto, bufferLen);
                    if (fst.FindTargetArc(ignoreCase ? Character.ToLower(codePoint, CultureInfo.InvariantCulture) : codePoint, scratchArc, scratchArc, fstReader) == null)
                    {
                        //System.out.println("    stop");
                        goto byTokenBreak;
                    }

                    // Accum the output
                    pendingOutput = fst.Outputs.Add(pendingOutput, scratchArc.Output);
                    //System.out.println("    char=" + buffer[bufUpto] + " output=" + pendingOutput + " arc.output=" + scratchArc.output);
                    bufUpto += Character.CharCount(codePoint);
                }

                // OK, entire token matched; now see if this is a final
                // state:
                if (scratchArc.IsFinal)
                {
                    matchOutput      = fst.Outputs.Add(pendingOutput, scratchArc.NextFinalOutput);
                    matchInputLength = tokenCount;
                    matchEndOffset   = inputEndOffset;
                    //System.out.println("  found matchLength=" + matchInputLength + " output=" + matchOutput);
                }

                // See if the FST wants to continue matching (ie, needs to
                // see the next input token):
                if (fst.FindTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) == null)
                {
                    // No further rules can match here; we're done
                    // searching for matching rules starting at the
                    // current input position.
                    break;
                }
                else
                {
                    // More matching is possible -- accum the output (if
                    // any) of the WORD_SEP arc:
                    pendingOutput = fst.Outputs.Add(pendingOutput, scratchArc.Output);
                    if (nextRead == nextWrite)
                    {
                        Capture();
                    }
                }

                curNextRead = RollIncr(curNextRead);
            }
byTokenBreak:

            if (nextRead == nextWrite && !finished)
            {
                //System.out.println("  skip write slot=" + nextWrite);
                nextWrite = RollIncr(nextWrite);
            }

            if (matchOutput != null)
            {
                //System.out.println("  add matchLength=" + matchInputLength + " output=" + matchOutput);
                inputSkipCount = matchInputLength;
                AddOutput(matchOutput, matchInputLength, matchEndOffset);
            }
            else if (nextRead != nextWrite)
            {
                // Even though we had no match here, we set to 1
                // because we need to skip current input token before
                // trying to match again:
                inputSkipCount = 1;
            }
            else
            {
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(finished);
                }
            }

            //System.out.println("  parse done inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);
        }
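
Parse() walks each buffered token through the FST one Unicode code point at a time (Character.CodePointAt / Character.CharCount), so surrogate pairs are matched as single FST labels. An equivalent walk using only BCL helpers, for illustration (the method name is hypothetical and the FST call is elided):

// Illustration only: BCL equivalent of the per-token code-point loop inside Parse().
static void WalkCodePoints(char[] buffer, int bufferLen)
{
    string s = new string(buffer, 0, bufferLen);
    int i = 0;
    while (i < s.Length)
    {
        int codePoint = char.ConvertToUtf32(s, i);   // plays the role of Character.CodePointAt
        // ... Parse() feeds codePoint (lowercased when ignoreCase is set) to fst.FindTargetArc ...
        i += char.IsSurrogatePair(s, i) ? 2 : 1;     // plays the role of Character.CharCount
    }
}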
        /// <param name="input"> input tokenstream </param>
        /// <param name="synonyms"> synonym map </param>
        /// <param name="ignoreCase"> case-folds input for matching with <seealso cref="Character.ToLowerCase(int)"/>.
        ///                   Note, if you set this to <c>true</c>, it's your responsibility to lowercase
        ///                   the input entries when you create the <seealso cref="SynonymMap"/> </param>
        public SynonymFilter(TokenStream input, SynonymMap synonyms, bool ignoreCase)
            : base(input)
        {
            termAtt = AddAttribute<ICharTermAttribute>();
            posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            posLenAtt = AddAttribute<IPositionLengthAttribute>();
            typeAtt = AddAttribute<ITypeAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();

            this.synonyms = synonyms;
            this.ignoreCase = ignoreCase;
            this.fst = synonyms.fst;
            if (fst == null)
            {
                throw new System.ArgumentException("fst must be non-null");
            }
            this.fstReader = fst.BytesReader;

            // Must be 1+ so that when roll buffer is at full
            // lookahead we can distinguish this full buffer from
            // the empty buffer:
            rollBufferSize = 1 + synonyms.maxHorizontalContext;

            futureInputs = new PendingInput[rollBufferSize];
            futureOutputs = new PendingOutputs[rollBufferSize];
            for (int pos = 0; pos < rollBufferSize; pos++)
            {
                futureInputs[pos] = new PendingInput();
                futureOutputs[pos] = new PendingOutputs();
            }

            //System.out.println("FSTFilt maxH=" + synonyms.maxHorizontalContext);

            scratchArc = new FST.Arc<BytesRef>();
        }