/// <summary>
/// Returns the value mapped to the given key or <code>null</code> if the key is not in the FST dictionary.
/// </summary>
/// <remarks>
/// The key is walked one Unicode code point at a time: a high surrogate immediately followed by a
/// low surrogate (both inside <paramref name="bufferLen"/>) is combined into a single code point,
/// any other char is used as-is. When <c>ignoreCase</c> is set, each code point is lower-cased
/// before it is looked up.
/// The Java original declared <c>throws java.io.IOException</c>.
/// </remarks>
/// <param name="buffer">UTF-16 chars holding the key, starting at index 0.</param>
/// <param name="bufferLen">Number of chars of <paramref name="buffer"/> that belong to the key.</param>
/// <param name="scratchArc">Arc that is reset to the FST's first arc and then advanced in place; its previous contents are overwritten.</param>
/// <param name="fstReader">Reader passed through to <c>fst.findTargetArc</c>.</param>
/// <returns>
/// The accumulated output plus the final output of the arc reached after the last code point, if
/// that arc is final; otherwise <code>null</code>.
/// </returns>
public BytesRef get(char[] buffer, int bufferLen, FST.Arc<BytesRef> scratchArc, FST.BytesReader fstReader)
{
    BytesRef pendingOutput = fst.outputs.NoOutput;
    BytesRef matchOutput = null;
    int bufUpto = 0;
    fst.getFirstArc(scratchArc);

    while (bufUpto < bufferLen)
    {
        // Equivalent of Java's Character.codePointAt(buffer, bufUpto, bufferLen) and
        // Character.charCount(codePoint): System.Char has no such members, so the
        // surrogate pair is decoded by hand.
        char lead = buffer[bufUpto];
        int codePoint = lead;
        int charCount = 1;
        if (char.IsHighSurrogate(lead) && bufUpto + 1 < bufferLen && char.IsLowSurrogate(buffer[bufUpto + 1]))
        {
            codePoint = char.ConvertToUtf32(lead, buffer[bufUpto + 1]);
            charCount = 2;
        }

        // Equivalent of Java's Character.toLowerCase(int). char.ToLower has no int overload,
        // so supplementary code points take a round trip through a string.
        // Invariant casing keeps the lookup independent of the current thread culture.
        // NOTE(review): confirm the dictionary keys were lower-cased the same way when built.
        int label = codePoint;
        if (ignoreCase)
        {
            label = charCount == 1
                ? char.ToLowerInvariant(lead)
                : char.ConvertToUtf32(char.ConvertFromUtf32(codePoint).ToLowerInvariant(), 0);
        }

        if (fst.findTargetArc(label, scratchArc, scratchArc, fstReader) == null)
        {
            // No arc for this code point: the key is not in the dictionary.
            return null;
        }

        pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
        bufUpto += charCount;
    }

    // The whole key was consumed; it only counts as a hit when the arc we stopped on is final.
    if (scratchArc.Final)
    {
        matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
    }

    return matchOutput;
}
/// <summary>
/// Feeds tokens, starting at <c>nextRead</c>, through the FST and records the longest rule match
/// that begins at the current input position.
/// </summary>
/// <remarks>
/// Tokens are read from the lookahead slots in <c>futureInputs</c>; once <c>curNextRead</c> catches
/// up with <c>nextWrite</c> a fresh token is pulled from the wrapped input instead.
/// On exit <c>inputSkipCount</c> is the number of input tokens covered by the match, or 1 when
/// nothing matched but buffered tokens remain. When a match was found it is handed to
/// <c>addOutput</c>.
/// The Java original declared <c>throws java.io.IOException</c>.
/// </remarks>
private void parse()
{
    Debug.Assert(inputSkipCount == 0);

    int curNextRead = nextRead;

    // Holds the longest match we've seen so far:
    BytesRef matchOutput = null;
    int matchInputLength = 0;
    int matchEndOffset = -1;

    BytesRef pendingOutput = fst.outputs.NoOutput;
    fst.getFirstArc(scratchArc);

    Debug.Assert(scratchArc.output == fst.outputs.NoOutput);

    int tokenCount = 0;

    while (true)
    {
        // Pull next token's chars:
        char[] buffer;
        int bufferLen;
        int inputEndOffset = 0;

        if (curNextRead == nextWrite)
        {
            // We used up our lookahead buffer of input tokens
            // -- pull next real input token:
            if (finished)
            {
                break;
            }
            else
            {
                Debug.Assert(futureInputs[nextWrite].consumed);
                // Asserting !futureInputs[nextWrite].keepOrig here would not be correct:
                // a syn match whose output is longer than its input can set future
                // inputs keepOrig to true.
                if (input.incrementToken())
                {
                    buffer = termAtt.buffer();
                    bufferLen = termAtt.length();

                    // Named "pending" so it does not give "input" a second meaning next to
                    // the member used in the condition above.
                    PendingInput pending = futureInputs[nextWrite];
                    lastStartOffset = pending.startOffset = offsetAtt.startOffset();
                    lastEndOffset = pending.endOffset = offsetAtt.endOffset();
                    inputEndOffset = pending.endOffset;

                    if (nextRead != nextWrite)
                    {
                        capture();
                    }
                    else
                    {
                        pending.consumed = false;
                    }
                }
                else
                {
                    // No more input tokens
                    finished = true;
                    break;
                }
            }
        }
        else
        {
            // Still in our lookahead
            buffer = futureInputs[curNextRead].term.chars;
            bufferLen = futureInputs[curNextRead].term.length;
            inputEndOffset = futureInputs[curNextRead].endOffset;
        }

        tokenCount++;

        // Run each char in this token through the FST:
        int bufUpto = 0;
        while (bufUpto < bufferLen)
        {
            // Equivalent of Java's Character.codePointAt(buffer, bufUpto, bufferLen) and
            // Character.charCount(codePoint): System.Char has no such members, so the
            // surrogate pair is decoded by hand.
            char lead = buffer[bufUpto];
            int codePoint = lead;
            int charCount = 1;
            if (char.IsHighSurrogate(lead) && bufUpto + 1 < bufferLen && char.IsLowSurrogate(buffer[bufUpto + 1]))
            {
                codePoint = char.ConvertToUtf32(lead, buffer[bufUpto + 1]);
                charCount = 2;
            }

            // Equivalent of Java's Character.toLowerCase(int); invariant casing keeps the
            // lookup independent of the current thread culture.
            // NOTE(review): confirm the FST keys were lower-cased the same way when built.
            int label = codePoint;
            if (ignoreCase)
            {
                label = charCount == 1
                    ? char.ToLowerInvariant(lead)
                    : char.ConvertToUtf32(char.ConvertFromUtf32(codePoint).ToLowerInvariant(), 0);
            }

            if (fst.findTargetArc(label, scratchArc, scratchArc, fstReader) == null)
            {
                // Dead end inside this token: leave the token loop as well.
                goto byTokenBreak;
            }

            // Accum the output
            pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
            bufUpto += charCount;
        }

        // OK, entire token matched; now see if this is a final
        // state:
        if (scratchArc.Final)
        {
            matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
            matchInputLength = tokenCount;
            matchEndOffset = inputEndOffset;
        }

        // See if the FST wants to continue matching (ie, needs to
        // see the next input token):
        if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) == null)
        {
            // No further rules can match here; we're done
            // searching for matching rules starting at the
            // current input position.
            break;
        }
        else
        {
            // More matching is possible -- accum the output (if
            // any) of the WORD_SEP arc:
            pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
            if (nextRead == nextWrite)
            {
                capture();
            }
        }

        curNextRead = rollIncr(curNextRead);
    }

byTokenBreak:
    if (nextRead == nextWrite && !finished)
    {
        // Skip the write slot that was just filled.
        nextWrite = rollIncr(nextWrite);
    }

    if (matchOutput != null)
    {
        inputSkipCount = matchInputLength;
        addOutput(matchOutput, matchInputLength, matchEndOffset);
    }
    else if (nextRead != nextWrite)
    {
        // Even though we had no match here, we set to 1
        // because we need to skip current input token before
        // trying to match again:
        inputSkipCount = 1;
    }
    else
    {
        Debug.Assert(finished);
    }
}
/// <summary>
/// Returns the next char of the mapped stream: chars of a pending replacement first, otherwise
/// either the start of a newly matched replacement or the unmodified input char.
/// </summary>
/// <remarks>
/// Matching is greedy: starting at <c>inputOff</c> the longest input sequence that ends on a
/// final arc wins. When the replacement length differs from the matched length, offset
/// corrections are registered through <c>addOffCorrectMap</c>.
/// The Java original declared <c>throws java.io.IOException</c>.
/// </remarks>
/// <returns>
/// A replacement char, or whatever <c>buffer.get(inputOff)</c> yields when nothing matches
/// (-1 is passed through unchanged; presumably it marks end of input).
/// </returns>
public override int read()
{
    while (true)
    {
        // Drain a replacement that is still being handed out.
        if (replacement != null && replacementPointer < replacement.length)
        {
            return replacement.chars[replacement.offset + replacementPointer++];
        }

        // TODO: a more efficient approach would be Aho/Corasick's
        // algorithm
        // (http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm)
        // or this generalization: www.cis.uni-muenchen.de/people/Schulz/Pub/dictle5.ps
        //
        // I think this would be (almost?) equivalent to 1) adding
        // epsilon arcs from all final nodes back to the init
        // node in the FST, 2) adding a .* (skip any char)
        // loop on the initial node, and 3) determinizing
        // that. Then we would not have to restart matching
        // at each position.

        int matchedLength = -1;
        CharsRef matchedOutput = null;

        int leadChar = buffer.get(inputOff);

        // NOTE(review): if cachedRootArcs is a .NET dictionary, this indexer throws for a
        // missing key, while the null check below expects a miss to yield null -- confirm
        // the declared type.
        FST.Arc<CharsRef> arc = null;
        if (leadChar != -1)
        {
            arc = cachedRootArcs[(char)leadChar];
        }

        if (arc != null)
        {
            if (FST.targetHasArcs(arc))
            {
                int consumed = 0;
                CharsRef accumulated = arc.output;
                for (;;)
                {
                    consumed++;

                    if (arc.Final)
                    {
                        // Match! (to node is final)
                        // Greedy: remember it and keep searching for a longer match.
                        matchedLength = consumed;
                        matchedOutput = outputs.add(accumulated, arc.nextFinalOutput);
                    }

                    if (!FST.targetHasArcs(arc))
                    {
                        break;
                    }

                    int nextChar = buffer.get(inputOff + consumed);
                    if (nextChar == -1)
                    {
                        break;
                    }

                    arc = map.findTargetArc(nextChar, arc, scratchArc, fstReader);
                    if (arc == null)
                    {
                        // Dead end
                        break;
                    }

                    accumulated = outputs.add(accumulated, arc.output);
                }
            }
            else
            {
                // Fast pass for single character match:
                Debug.Assert(arc.Final);
                matchedLength = 1;
                matchedOutput = arc.output;
            }
        }

        if (matchedOutput == null)
        {
            // Nothing matched here: pass the input char through untouched.
            int passthrough = buffer.get(inputOff);
            if (passthrough != -1)
            {
                inputOff++;
                buffer.freeBefore(inputOff);
            }
            return passthrough;
        }

        inputOff += matchedLength;

        int lengthDelta = matchedLength - matchedOutput.length;
        if (lengthDelta > 0)
        {
            // Replacement is shorter than matched input:
            int priorDiff = LastCumulativeDiff;
            addOffCorrectMap(inputOff - lengthDelta - priorDiff, priorDiff + lengthDelta);
        }
        else if (lengthDelta < 0)
        {
            // Replacement is longer than matched input: remap
            // the "extra" chars all back to the same input
            // offset:
            int priorDiff = LastCumulativeDiff;
            int outputStart = inputOff - priorDiff;
            for (int extra = 0; extra < -lengthDelta; extra++)
            {
                addOffCorrectMap(outputStart + extra, priorDiff - extra - 1);
            }
        }

        // Hand the replacement out on the next loop iteration.
        replacement = matchedOutput;
        replacementPointer = 0;
    }
}