/// <param name="input"> input tokenstream </param>
/// <param name="synonyms"> synonym map </param>
/// <param name="ignoreCase"> case-folds input for matching with <see cref="Character.ToLower(int, CultureInfo)"/>
/// using <see cref="CultureInfo.InvariantCulture"/>.
/// Note, if you set this to <c>true</c>, it's your responsibility to lowercase
/// the input entries when you create the <see cref="SynonymMap"/>.</param>
public SynonymFilter(TokenStream input, SynonymMap synonyms, bool ignoreCase)
    : base(input)
{
    // Wire up the token attributes this filter reads and writes.
    termAtt = AddAttribute<ICharTermAttribute>();
    posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
    posLenAtt = AddAttribute<IPositionLengthAttribute>();
    typeAtt = AddAttribute<ITypeAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();

    this.synonyms = synonyms;
    this.ignoreCase = ignoreCase;

    this.fst = synonyms.Fst;
    if (this.fst == null)
    {
        throw new ArgumentException("fst must be non-null");
    }
    this.fstReader = fst.GetBytesReader();

    // Must be 1+ so that when the roll buffer is at full lookahead we can
    // distinguish a completely full buffer from an empty one:
    rollBufferSize = 1 + synonyms.MaxHorizontalContext;

    futureInputs = new PendingInput[rollBufferSize];
    futureOutputs = new PendingOutputs[rollBufferSize];
    for (int slot = 0; slot < rollBufferSize; slot++)
    {
        futureInputs[slot] = new PendingInput();
        futureOutputs[slot] = new PendingOutputs();
    }

    scratchArc = new FST.Arc<BytesRef>();
}
/// <param name="input"> input tokenstream </param>
/// <param name="synonyms"> synonym map </param>
/// <param name="ignoreCase"> case-folds input for matching with <see cref="Character.ToLower(int, CultureInfo)"/>
/// using <see cref="CultureInfo.InvariantCulture"/>.
/// Note, if you set this to <c>true</c>, it's your responsibility to lowercase
/// the input entries when you create the <see cref="SynonymMap"/>.</param>
public SynonymFilter(TokenStream input, SynonymMap synonyms, bool ignoreCase)
    : base(input)
{
    // Initialize the token attributes this filter reads/writes.  The
    // converted original omitted these even though the filter uses them
    // (e.g. in Capture()), which would leave them null at run time.
    termAtt = AddAttribute<ICharTermAttribute>();
    posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
    posLenAtt = AddAttribute<IPositionLengthAttribute>();
    typeAtt = AddAttribute<ITypeAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();

    this.synonyms = synonyms;
    this.ignoreCase = ignoreCase;

    // PascalCase property access, matching the other constructor in this file.
    this.fst = synonyms.Fst;
    if (fst == null)
    {
        throw new ArgumentException("fst must be non-null");
    }
    this.fstReader = fst.GetBytesReader();

    // Must be 1+ so that when roll buffer is at full
    // lookahead we can distinguish this full buffer from
    // the empty buffer:
    rollBufferSize = 1 + synonyms.MaxHorizontalContext;
    futureInputs = new PendingInput[rollBufferSize];
    futureOutputs = new PendingOutputs[rollBufferSize];
    for (int pos = 0; pos < rollBufferSize; pos++)
    {
        futureInputs[pos] = new PendingInput();
        futureOutputs[pos] = new PendingOutputs();
    }

    // C# has no Java-style diamond operator; the type argument must be explicit.
    scratchArc = new FST.Arc<BytesRef>();
}
/// <summary>
/// Saves the current token (term text plus full captured attribute state)
/// into the lookahead roll-buffer slot at <c>nextWrite</c>, then advances
/// the write pointer, so the token can be replayed later.
/// </summary>
private void Capture()
{
    captureCount++;
    //System.out.println("  capture slot=" + nextWrite);
    PendingInput slot = futureInputs[nextWrite];

    slot.consumed = false;
    slot.state = CaptureState();
    slot.term.CopyChars(termAtt.Buffer, 0, termAtt.Length);

    nextWrite = RollIncr(nextWrite);

    // The buffer head should never catch up to the tail:
    Debug.Assert(nextWrite != nextRead);
}
/// <summary>
/// Saves the current token (term text plus captured attribute state) into
/// the lookahead roll-buffer slot at <c>nextWrite</c>.
/// </summary>
// NOTE(review): this appears to be a Java-to-C# conversion duplicate of
// Capture() elsewhere in this file; the original lowercase name is kept so
// any existing callers still resolve, but consider removing one copy.
private void capture()
{
    captureCount++;
    //System.out.println("  capture slot=" + nextWrite);
    PendingInput input = futureInputs[nextWrite];

    input.state = CaptureState();
    input.consumed = false;
    // termAtt.Buffer / termAtt.Length are the C# equivalents of the
    // original Java buffer()/length() calls left in by the converter.
    input.term.CopyChars(termAtt.Buffer, 0, termAtt.Length);

    nextWrite = RollIncr(nextWrite);

    // Buffer head should never catch up to tail:
    Debug.Assert(nextWrite != nextRead);
}
// NOTE(review): this method looks out of place in a synonym filter.  The
// name "__transport_in_fakelag" does not follow C# conventions, and
// PendingInput in this file is a per-slot buffer class with no static Add
// member visible here — presumably this was pasted in from unrelated
// transport/networking code.  Verify whether anything calls it; if not,
// it should be removed rather than documented.
public void __transport_in_fakelag(string xmlstring)
{
    PendingInput.Add(xmlstring);
}
/// <summary>
/// Produces the next output token.  Tokens are first replayed from the
/// lookahead roll buffer (filled by <c>Parse()</c>): the original token is
/// emitted when kept, then any pending synonym outputs at that position.
/// When the buffer is drained and input remains, <c>Parse()</c> is invoked
/// to pull more input and find new synonym matches.
/// </summary>
public override bool IncrementToken()
{
    //System.out.println("\nS: incrToken inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);
    while (true)
    {
        // First play back any buffered future inputs/outputs
        // w/o running parsing again:
        while (inputSkipCount != 0)
        {
            // At each position, we first output the original
            // token

            // TODO: maybe just a PendingState class, holding
            // both input & outputs?
            PendingInput input = futureInputs[nextRead];
            PendingOutputs outputs = futureOutputs[nextRead];

            //System.out.println("  cycle nextRead=" + nextRead + " nextWrite=" + nextWrite + " inputSkipCount="+ inputSkipCount + " input.keepOrig=" + input.keepOrig + " input.consumed=" + input.consumed + " input.state=" + input.state);

            // Emit the original token when it was not consumed yet and is
            // either explicitly kept or had no synonym match:
            if (!input.consumed && (input.keepOrig || !input.matched))
            {
                if (input.state != null)
                {
                    // Return a previously saved token (because we
                    // had to lookahead):
                    RestoreState(input.state);
                }
                else
                {
                    // Pass-through case: return token we just pulled
                    // but didn't capture:
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(inputSkipCount == 1, "inputSkipCount={0} nextRead={1}", inputSkipCount, nextRead);
                    }
                }
                input.Reset();
                if (outputs.count > 0)
                {
                    // Synonym outputs remain at this position; they must be
                    // emitted with posIncr 0 (stacked on the original token):
                    outputs.posIncr = 0;
                }
                else
                {
                    // Nothing else pending here — advance to the next slot:
                    nextRead = RollIncr(nextRead);
                    inputSkipCount--;
                }
                //System.out.println("  return token=" + termAtt.toString());
                return (true);
            }
            else if (outputs.upto < outputs.count)
            {
                // Still have pending outputs to replay at this
                // position
                input.Reset();
                int posIncr = outputs.posIncr;
                CharsRef output = outputs.PullNext();
                ClearAttributes();
                termAtt.CopyBuffer(output.Chars, output.Offset, output.Length);
                typeAtt.Type = TYPE_SYNONYM;
                int endOffset = outputs.LastEndOffset;
                if (endOffset == -1)
                {
                    // No explicit end offset recorded for this output —
                    // fall back to the buffered input token's end offset:
                    endOffset = input.endOffset;
                }
                offsetAtt.SetOffset(input.startOffset, endOffset);
                posIncrAtt.PositionIncrement = posIncr;
                posLenAtt.PositionLength = outputs.LastPosLength;
                if (outputs.count == 0)
                {
                    // Done with the buffered input and all outputs at
                    // this position
                    nextRead = RollIncr(nextRead);
                    inputSkipCount--;
                }
                //System.out.println("  return token=" + termAtt.toString());
                return (true);
            }
            else
            {
                // Done with the buffered input and all outputs at
                // this position
                input.Reset();
                nextRead = RollIncr(nextRead);
                inputSkipCount--;
            }
        }

        if (finished && nextRead == nextWrite)
        {
            // End case: if any output syns went beyond end of
            // input stream, enumerate them now:
            PendingOutputs outputs = futureOutputs[nextRead];
            if (outputs.upto < outputs.count)
            {
                int posIncr = outputs.posIncr;
                CharsRef output = outputs.PullNext();
                futureInputs[nextRead].Reset();
                if (outputs.count == 0)
                {
                    nextWrite = nextRead = RollIncr(nextRead);
                }
                ClearAttributes();
                // Keep offset from last input token:
                offsetAtt.SetOffset(lastStartOffset, lastEndOffset);
                termAtt.CopyBuffer(output.Chars, output.Offset, output.Length);
                typeAtt.Type = TYPE_SYNONYM;
                //System.out.println("  set posIncr=" + outputs.posIncr + " outputs=" + outputs);
                posIncrAtt.PositionIncrement = posIncr;
                //System.out.println("  return token=" + termAtt.toString());
                return (true);
            }
            else
            {
                return (false);
            }
        }

        // Find new synonym matches:
        Parse();
    }
}
/// <summary>
/// Pulls input tokens (buffering them in the roll buffer) and walks them
/// through the synonym FST to find the longest match starting at the
/// current read position.  On a match, records the synonym outputs via
/// <c>AddOutput</c> and sets <c>inputSkipCount</c> so
/// <c>IncrementToken()</c> replays the buffered tokens/outputs.
/// </summary>
private void Parse()
{
    //System.out.println("\nS: parse");

    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(inputSkipCount == 0);
    }

    int curNextRead = nextRead;

    // Holds the longest match we've seen so far:
    BytesRef matchOutput = null;
    int matchInputLength = 0;
    int matchEndOffset = -1;

    BytesRef pendingOutput = fst.Outputs.NoOutput;
    fst.GetFirstArc(scratchArc);
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(scratchArc.Output == fst.Outputs.NoOutput);
    }

    int tokenCount = 0;

    while (true)
    {
        // Pull next token's chars:
        char[] buffer;
        int bufferLen;
        //System.out.println("  cycle nextRead=" + curNextRead + " nextWrite=" + nextWrite);

        int inputEndOffset = 0;

        if (curNextRead == nextWrite)
        {
            // We used up our lookahead buffer of input tokens
            // -- pull next real input token:
            if (finished)
            {
                break;
            }
            else
            {
                //System.out.println("  input.incrToken");
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(futureInputs[nextWrite].consumed);
                }
                // Not correct: a syn match whose output is longer
                // than its input can set future inputs keepOrig
                // to true:
                //assert !futureInputs[nextWrite].keepOrig;
                if (m_input.IncrementToken())
                {
                    buffer = termAtt.Buffer;
                    bufferLen = termAtt.Length;
                    PendingInput pendingInput = futureInputs[nextWrite];
                    // Track offsets of the last real input token so the
                    // end-of-stream path in IncrementToken() can reuse them:
                    lastStartOffset = pendingInput.startOffset = offsetAtt.StartOffset;
                    lastEndOffset = pendingInput.endOffset = offsetAtt.EndOffset;
                    inputEndOffset = pendingInput.endOffset;
                    //System.out.println("  new token=" + new String(buffer, 0, bufferLen));
                    if (nextRead != nextWrite)
                    {
                        Capture();
                    }
                    else
                    {
                        pendingInput.consumed = false;
                    }
                }
                else
                {
                    // No more input tokens
                    //System.out.println("  set end");
                    finished = true;
                    break;
                }
            }
        }
        else
        {
            // Still in our lookahead
            buffer = futureInputs[curNextRead].term.Chars;
            bufferLen = futureInputs[curNextRead].term.Length;
            inputEndOffset = futureInputs[curNextRead].endOffset;
            //System.out.println("  old token=" + new String(buffer, 0, bufferLen));
        }

        tokenCount++;

        // Run each char in this token through the FST:
        int bufUpto = 0;
        while (bufUpto < bufferLen)
        {
            int codePoint = Character.CodePointAt(buffer, bufUpto, bufferLen);
            // No arc for this code point => no rule matches from here on:
            if (fst.FindTargetArc(ignoreCase ? Character.ToLower(codePoint, CultureInfo.InvariantCulture) : codePoint, scratchArc, scratchArc, fstReader) == null)
            {
                //System.out.println("    stop");
                goto byTokenBreak;
            }

            // Accum the output
            pendingOutput = fst.Outputs.Add(pendingOutput, scratchArc.Output);
            //System.out.println("    char=" + buffer[bufUpto] + " output=" + pendingOutput + " arc.output=" + scratchArc.output);
            bufUpto += Character.CharCount(codePoint);
        }

        // OK, entire token matched; now see if this is a final
        // state:
        if (scratchArc.IsFinal)
        {
            matchOutput = fst.Outputs.Add(pendingOutput, scratchArc.NextFinalOutput);
            matchInputLength = tokenCount;
            matchEndOffset = inputEndOffset;
            //System.out.println("  found matchLength=" + matchInputLength + " output=" + matchOutput);
        }

        // See if the FST wants to continue matching (ie, needs to
        // see the next input token):
        if (fst.FindTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) == null)
        {
            // No further rules can match here; we're done
            // searching for matching rules starting at the
            // current input position.
            break;
        }
        else
        {
            // More matching is possible -- accum the output (if
            // any) of the WORD_SEP arc:
            pendingOutput = fst.Outputs.Add(pendingOutput, scratchArc.Output);
            if (nextRead == nextWrite)
            {
                Capture();
            }
        }

        curNextRead = RollIncr(curNextRead);
    }

byTokenBreak:

    if (nextRead == nextWrite && !finished)
    {
        //System.out.println("  skip write slot=" + nextWrite);
        nextWrite = RollIncr(nextWrite);
    }

    if (matchOutput != null)
    {
        //System.out.println("  add matchLength=" + matchInputLength + " output=" + matchOutput);
        inputSkipCount = matchInputLength;
        AddOutput(matchOutput, matchInputLength, matchEndOffset);
    }
    else if (nextRead != nextWrite)
    {
        // Even though we had no match here, we set to 1
        // because we need to skip current input token before
        // trying to match again:
        inputSkipCount = 1;
    }
    else
    {
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(finished);
        }
    }

    //System.out.println("  parse done inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);
}
/// <param name="input"> input tokenstream </param>
/// <param name="synonyms"> synonym map </param>
/// <param name="ignoreCase"> case-folds input for matching with <see cref="Character.ToLower(int, CultureInfo)"/>
/// using <see cref="CultureInfo.InvariantCulture"/>.
/// Note, if you set this to <c>true</c>, it's your responsibility to lowercase
/// the input entries when you create the <see cref="SynonymMap"/>.</param>
public SynonymFilter(TokenStream input, SynonymMap synonyms, bool ignoreCase)
    : base(input)
{
    // Wire up the token attributes this filter reads and writes.
    termAtt = AddAttribute<ICharTermAttribute>();
    posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
    posLenAtt = AddAttribute<IPositionLengthAttribute>();
    typeAtt = AddAttribute<ITypeAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();

    this.synonyms = synonyms;
    this.ignoreCase = ignoreCase;

    // PascalCase members (Fst, GetBytesReader, MaxHorizontalContext),
    // consistent with the rest of the file; the converted original kept
    // the Java field names.
    this.fst = synonyms.Fst;
    if (fst == null)
    {
        throw new ArgumentException("fst must be non-null");
    }
    this.fstReader = fst.GetBytesReader();

    // Must be 1+ so that when roll buffer is at full
    // lookahead we can distinguish this full buffer from
    // the empty buffer:
    rollBufferSize = 1 + synonyms.MaxHorizontalContext;

    futureInputs = new PendingInput[rollBufferSize];
    futureOutputs = new PendingOutputs[rollBufferSize];
    for (int pos = 0; pos < rollBufferSize; pos++)
    {
        futureInputs[pos] = new PendingInput();
        futureOutputs[pos] = new PendingOutputs();
    }

    //System.out.println("FSTFilt maxH=" + synonyms.maxHorizontalContext);
    scratchArc = new FST.Arc<BytesRef>();
}