/// <summary>
/// Advances the wrapped stream until a token passes the filter.
/// A token passes when it is not contained in <c>stopTable</c> and its first character is
/// either a lowercase/uppercase letter (and the term is longer than one character) or is
/// classified as <c>OTHER_LETTER</c>.
/// </summary>
/// <returns><c>true</c> when a passing token is current; <c>false</c> when the input has no more tokens.</returns>
public override bool incrementToken()
{
    while (input.incrementToken())
    {
        char[] text = termAtt.buffer();
        int termLength = termAtt.length();

        // why not key off token type here assuming ChineseTokenizer comes first?
        if (stopTable.contains(text, 0, termLength))
        {
            continue;
        }

        var category = char.getType(text[0]);
        if (category == char.LOWERCASE_LETTER || category == char.UPPERCASE_LETTER)
        {
            // English word/token should be larger than 1 character.
            if (termLength > 1)
            {
                return true;
            }
        }
        else if (category == char.OTHER_LETTER)
        {
            // One Chinese character as one Chinese word.
            // Chinese word extraction to be added later here.
            return true;
        }
    }

    return false;
}
/// <summary>
/// Tells whether the code point count of the current term lies between <c>min</c> and
/// <c>max</c> (both inclusive).
/// The term's char length and half of it are used as cheap upper/lower bounds first;
/// code points are only counted when those bounds are inconclusive.
/// </summary>
public override bool accept()
{
    int upperBound = termAtt.length();
    int lowerBound = upperBound >> 1;

    if (lowerBound >= min && upperBound <= max)
    {
        // definitely within range
        return true;
    }

    if (lowerBound > max || upperBound < min)
    {
        // definitely not
        return false;
    }

    // we must count to be sure
    int len = char.codePointCount(termAtt.buffer(), 0, termAtt.length());
    return !(len < min || len > max);
}
/// <summary>
/// Moves to the next input token and, unless it is flagged as a keyword, replaces its
/// term with the result produced by <c>stemmer</c>.
/// </summary>
/// <returns><c>true</c> when a token is available; <c>false</c> when the input is exhausted.</returns>
public override bool incrementToken()
{
    if (!input.incrementToken())
    {
        return false;
    }

    if (keywordAttr.Keyword)
    {
        // keyword tokens are passed through untouched
        return true;
    }

    char[] termBuffer = termAtt.buffer();
    int length = termAtt.length();
    stemmer.setCurrent(termBuffer, length);
    stemmer.stem();

    char[] stemmed = stemmer.CurrentBuffer;
    int stemmedLength = stemmer.CurrentBufferLength;
    if (stemmed == termBuffer)
    {
        // the stemmer worked in place on the term buffer: only the length changes
        termAtt.Length = stemmedLength;
    }
    else
    {
        termAtt.copyBuffer(stemmed, 0, stemmedLength);
    }

    return true;
}
/// <summary>
/// Stores the current token in the <c>futureInputs</c> slot at <c>nextWrite</c>
/// (captured state, a copy of the term chars, <c>consumed</c> reset to false) and
/// advances <c>nextWrite</c> via <c>rollIncr</c>.
/// </summary>
private void capture()
{
    captureCount++;
    //System.out.println("  capture slot=" + nextWrite);
    PendingInput slot = futureInputs[nextWrite];
    slot.state = captureState();
    slot.consumed = false;
    slot.term.copyChars(termAtt.buffer(), 0, termAtt.length());

    nextWrite = rollIncr(nextWrite);

    // Buffer head should never catch up to tail:
    Debug.Assert(nextWrite != nextRead);
}
/// <summary>
/// Emits tokens from the input, joining runs of tokens that end with '-' with the token
/// that follows them. The parts are accumulated in <c>hyphenated</c> and handed to
/// <c>unhyphenate()</c> once the run ends.
/// </summary>
/// <returns><c>true</c> when a token was produced; <c>false</c> when nothing is left.</returns>
public override bool incrementToken()
{
    while (!exhausted && input.incrementToken())
    {
        char[] term = termAttribute.buffer();
        int termLength = termAttribute.length();
        lastEndOffset = offsetAttribute.endOffset();

        bool endsWithHyphen = termLength > 0 && term[termLength - 1] == '-';
        if (endsWithHyphen)
        {
            // a hyphenated word: capture the state of the first token only
            if (savedState == null)
            {
                savedState = captureState();
            }
            hyphenated.Append(term, 0, termLength - 1);
            continue;
        }

        if (savedState != null)
        {
            // the final portion of a hyphenated word
            hyphenated.Append(term, 0, termLength);
            unhyphenate();
        }

        // either an ordinary token or the completed hyphenated word
        return true;
    }

    exhausted = true;

    if (savedState == null)
    {
        return false;
    }

    // the final term ends with a hyphen
    // add back the hyphen, for backwards compatibility.
    hyphenated.Append('-');
    unhyphenate();
    return true;
}
/// <summary>
/// Lower-cases the current term in place, with special handling for
/// <c>LATIN_CAPITAL_LETTER_I</c> and a following <c>COMBINING_DOT_ABOVE</c>.
/// </summary>
/// <returns><c>true</c> when a token is available; <c>false</c> when the input is exhausted.</returns>
public override bool incrementToken()
{
    if (!input.incrementToken())
    {
        return false;
    }

    bool iOrAfter = false;
    char[] buffer = termAtt.buffer();
    int length = termAtt.length();

    int i = 0;
    while (i < length)
    {
        int ch = char.codePointAt(buffer, i, length);

        iOrAfter = (ch == LATIN_CAPITAL_LETTER_I
                    || (iOrAfter && char.getType(ch) == char.NON_SPACING_MARK));

        if (iOrAfter) // all the special I turkish handling happens here.
        {
            if (ch == COMBINING_DOT_ABOVE)
            {
                // remove COMBINING_DOT_ABOVE to mimic composed lowercase;
                // the index is not advanced because the buffer shifted left
                length = delete(buffer, i, length);
                continue;
            }

            if (ch == LATIN_CAPITAL_LETTER_I)
            {
                // i itself, it depends if it is followed by COMBINING_DOT_ABOVE
                // if it is, we will make it small i and later remove the dot
                if (isBeforeDot(buffer, i + 1, length))
                {
                    buffer[i] = (char)LATIN_SMALL_LETTER_I;
                }
                else
                {
                    buffer[i] = (char)LATIN_SMALL_LETTER_DOTLESS_I;
                    // below is an optimization. no COMBINING_DOT_ABOVE follows,
                    // so don't waste time calculating Character.getType(), etc
                    iOrAfter = false;
                }
                i++;
                continue;
            }
        }

        i += char.toChars(char.ToLower(ch), buffer, i);
    }

    termAtt.Length = length;
    return true;
}
/// <summary>
/// Reverses the current term in place. When <c>marker</c> differs from <c>NOMARKER</c>,
/// the marker is appended to the term before reversing.
/// </summary>
/// <returns><c>true</c> when a token is available; <c>false</c> when the input is exhausted.</returns>
public override bool incrementToken()
{
    if (!input.incrementToken())
    {
        return false;
    }

    int len = termAtt.length();
    if (marker != NOMARKER)
    {
        // make room for one more char and put the marker at the end
        termAtt.resizeBuffer(len + 1);
        termAtt.buffer()[len] = marker;
        len++;
    }

    reverse(matchVersion, termAtt.buffer(), 0, len);
    termAtt.Length = len;
    return true;
}
/// <summary>
/// Moves to the next input token and upper-cases its term buffer in place through
/// <c>charUtils.ToUpper</c>.
/// </summary>
/// <returns><c>true</c> when a token is available; <c>false</c> when the input is exhausted.</returns>
public override bool incrementToken()
{
    bool hasToken = input.incrementToken();
    if (hasToken)
    {
        charUtils.ToUpper(termAtt.buffer(), 0, termAtt.length());
    }
    return hasToken;
}
/// <summary>
/// Strips leading and trailing whitespace from the current term. When
/// <c>updateOffsets</c> is set and the term length equals the offset span, the offsets
/// are narrowed by the number of characters removed.
/// </summary>
/// <returns><c>true</c> when a token is available (even an empty one); <c>false</c> when the input is exhausted.</returns>
public override bool incrementToken()
{
    if (!input.incrementToken())
    {
        return false;
    }

    char[] termBuffer = termAtt.buffer();
    int len = termAtt.length();

    //TODO: Is this the right behavior or should we return false?  Currently, "  ", returns true, so I think this should
    //also return true
    if (len == 0)
    {
        return true;
    }

    // eat the first characters
    int start = 0;
    while (start < len && char.IsWhiteSpace(termBuffer[start]))
    {
        start++;
    }

    // eat the end characters
    int end = len;
    int endOff = 0;
    while (end >= start && char.IsWhiteSpace(termBuffer[end - 1]))
    {
        endOff++;
        end--;
    }

    if (start <= 0 && end >= len)
    {
        // nothing was trimmed
        return true;
    }

    bool hasText = start < end;
    if (hasText)
    {
        termAtt.copyBuffer(termBuffer, start, (end - start));
    }
    else
    {
        termAtt.setEmpty();
    }

    if (updateOffsets && len == offsetAtt.endOffset() - offsetAtt.startOffset())
    {
        int newStart = offsetAtt.startOffset() + start;
        int newEnd = offsetAtt.endOffset() - (hasText ? endOff : 0);
        offsetAtt.setOffset(newStart, newEnd);
    }

    return true;
}
private const char oe_se = '\u00F6'; //ö

/// <summary>
/// Folds the current term in place: the characters compared against
/// <c>aa</c>/<c>ae_se</c>/<c>ae</c> become 'a', their upper-case counterparts become 'A',
/// <c>oe</c>/<c>oe_se</c> become 'o' and <c>OE</c>/<c>OE_se</c> become 'O'. In addition,
/// the second letter of the pairs a+{a,e,o} and o+{e,o} (any case) is deleted.
/// </summary>
/// <returns><c>true</c> when a token is available; <c>false</c> when the input is exhausted.</returns>
public override bool incrementToken()
{
    if (!input.incrementToken())
    {
        return false;
    }

    char[] buffer = charTermAttribute.buffer();
    int length = charTermAttribute.length();

    for (int i = 0; i < length; i++)
    {
        char current = buffer[i];

        if (current == aa || current == ae_se || current == ae)
        {
            buffer[i] = 'a';
        }
        else if (current == AA || current == AE_se || current == AE)
        {
            buffer[i] = 'A';
        }
        else if (current == oe || current == oe_se)
        {
            buffer[i] = 'o';
        }
        else if (current == OE || current == OE_se)
        {
            buffer[i] = 'O';
        }
        else if (length - 1 > i)
        {
            // look at the following character to detect two-letter spellings
            char next = buffer[i + 1];
            bool currentIsA = current == 'a' || current == 'A';
            bool currentIsO = current == 'o' || current == 'O';
            bool nextIsA = next == 'a' || next == 'A';
            bool nextIsEOrO = next == 'e' || next == 'E' || next == 'o' || next == 'O';

            if ((currentIsA && (nextIsA || nextIsEOrO)) || (currentIsO && nextIsEOrO))
            {
                length = StemmerUtil.delete(buffer, i + 1, length);
            }
        }
    }

    charTermAttribute.Length = length;
    return true;
}
/// <summary>
/// Runs <paramref name="text"/> through <c>analyzer</c> and writes the resulting terms
/// into <paramref name="reuse"/>, separated by <c>SynonymMap.WORD_SEPARATOR</c>.
/// </summary>
/// <param name="text">the text to analyze</param>
/// <param name="reuse">receives the result; it is reset to length 0 first, so it and its chars must not be null</param>
/// <returns><paramref name="reuse"/>, filled with the analyzed terms</returns>
/// <exception cref="System.ArgumentException">
/// when a token is empty, when a token's position increment is not 1, or when no token
/// was produced at all
/// </exception>
public virtual CharsRef analyze(string text, CharsRef reuse)
{
    IOException priorException = null;
    TokenStream ts = analyzer.tokenStream("", text);
    try
    {
        CharTermAttribute termAtt = ts.addAttribute(typeof(CharTermAttribute));
        PositionIncrementAttribute posIncAtt = ts.addAttribute(typeof(PositionIncrementAttribute));
        ts.reset();
        reuse.length = 0;

        while (ts.incrementToken())
        {
            int length = termAtt.length();
            if (length == 0)
            {
                throw new System.ArgumentException("term: " + text + " analyzed to a zero-length token");
            }
            if (posIncAtt.PositionIncrement != 1)
            {
                throw new System.ArgumentException("term: " + text + " analyzed to a token with posinc != 1");
            }

            reuse.grow(reuse.length + length + 1); // current + word + separator

            int writeAt = reuse.offset + reuse.length;
            if (reuse.length > 0)
            {
                // not the first word: put a separator in front of it
                reuse.chars[writeAt] = SynonymMap.WORD_SEPARATOR;
                writeAt++;
                reuse.length++;
            }

            Array.Copy(termAtt.buffer(), 0, reuse.chars, writeAt, length);
            reuse.length += length;
        }

        ts.end();
    }
    catch (IOException e)
    {
        priorException = e;
    }
    finally
    {
        IOUtils.closeWhileHandlingException(priorException, ts);
    }

    if (reuse.length == 0)
    {
        throw new System.ArgumentException("term: " + text + " was completely eliminated by analyzer");
    }

    return reuse;
}
/// <summary>
/// Recursively follows <c>submap</c> entries with the upcoming tokens, looking for the
/// longest sequence that ends at a node with synonyms. Tokens that took part in the
/// match are prepended to <c>matched</c>; a token that did not is pushed back.
/// </summary>
/// <param name="map">the node to continue matching from</param>
/// <returns>the deepest matching node that has synonyms, or <c>null</c> when there is none</returns>
private SlowSynonymMap match(SlowSynonymMap map)
{
    SlowSynonymMap result = null;

    AttributeSource tok = map.submap != null ? nextTok() : null;
    if (tok != null)
    {
        // clone ourselves.
        if (tok == this)
        {
            tok = cloneAttributes();
        }

        // check for positionIncrement!=1?  if>1, should not match, if==0, check multiple at this level?
        CharTermAttribute termAtt = tok.getAttribute(typeof(CharTermAttribute));
        SlowSynonymMap subMap = map.submap.get(termAtt.buffer(), 0, termAtt.length());

        if (subMap != null)
        {
            // recurse
            result = match(subMap);
        }

        if (result == null)
        {
            // push back unmatched token
            pushTok(tok);
        }
        else
        {
            matched.AddFirst(tok);
        }
    }

    // if no longer sequence matched, so if this node has synonyms, it's the match.
    if (result == null && map.synonyms != null)
    {
        return map;
    }

    return result;
}
/// <summary>
/// Removes a trailing possessive ('s or 'S) from the current term. From
/// <c>Version.LUCENE_36</c> on, U+2019 and U+FF07 are accepted as the apostrophe too.
/// </summary>
/// <returns><c>true</c> when a token is available; <c>false</c> when the input is exhausted.</returns>
public override bool incrementToken()
{
    if (!input.incrementToken())
    {
        return false;
    }

    char[] buffer = termAtt.buffer();
    int bufferLength = termAtt.length();

    if (bufferLength >= 2)
    {
        char quote = buffer[bufferLength - 2];
        char last = buffer[bufferLength - 1];

        bool isApostrophe = quote == '\''
            || (matchVersion.onOrAfter(Version.LUCENE_36) && (quote == '\u2019' || quote == '\uFF07'));

        if (isApostrophe && (last == 's' || last == 'S'))
        {
            termAtt.Length = bufferLength - 2; // Strip last 2 characters off
        }
    }

    return true;
}
/// <summary>
/// Normalizes the current token according to its type: tokens of
/// <c>APOSTROPHE_TYPE</c> lose a trailing 's / 'S, and tokens of <c>ACRONYM_TYPE</c>
/// lose every '.' character. Other tokens are left unchanged.
/// </summary>
/// <returns><c>true</c> when a token is available; <c>false</c> when the input is exhausted.</returns>
public bool incrementTokenClassic()
{
    if (!input.incrementToken())
    {
        return false;
    }

    char[] buffer = termAtt.buffer();
    int bufferLength = termAtt.length();
    string type = typeAtt.type();

    if (type == APOSTROPHE_TYPE
        && bufferLength >= 2
        && buffer[bufferLength - 2] == '\''
        && (buffer[bufferLength - 1] == 's' || buffer[bufferLength - 1] == 'S'))
    {
        // remove 's: strip last 2 characters off
        termAtt.Length = bufferLength - 2;
    }
    else if (type == ACRONYM_TYPE)
    {
        // remove dots by compacting the remaining characters to the front
        int kept = 0;
        int read = 0;
        while (read < bufferLength)
        {
            char c = buffer[read];
            read++;
            if (c == '.')
            {
                continue;
            }
            buffer[kept] = c;
            kept++;
        }
        termAtt.Length = kept;
    }

    return true;
}
/// <summary>
/// Normalizes the current term in place with a small state machine (<c>N</c>, <c>U</c>,
/// <c>V</c>): 'ä', 'ö', 'ü' become 'a', 'o', 'u'; 'ß' is expanded to "ss"; an 'e' that
/// is seen while the state is <c>U</c> is deleted.
/// </summary>
/// <returns><c>true</c> when a token is available; <c>false</c> when the input is exhausted.</returns>
public override bool incrementToken()
{
    if (!input.incrementToken())
    {
        return false;
    }

    int state = N;
    char[] buffer = termAtt.buffer();
    int length = termAtt.length();

    for (int i = 0; i < length; i++)
    {
        char c = buffer[i];
        switch (c)
        {
            case 'a':
            case 'o':
                state = U;
                break;

            case 'u':
                if (state == N)
                {
                    state = U;
                }
                else
                {
                    state = V;
                }
                break;

            case 'e':
                if (state == U)
                {
                    length = StemmerUtil.delete(buffer, i, length);
                    // stay on the same index: the following chars moved one to the left
                    i--;
                }
                state = V;
                break;

            case 'i':
            case 'q':
            case 'y':
                state = V;
                break;

            case 'ä':
                buffer[i] = 'a';
                state = V;
                break;

            case 'ö':
                buffer[i] = 'o';
                state = V;
                break;

            case 'ü':
                buffer[i] = 'u';
                state = V;
                break;

            case 'ß':
                buffer[i] = 's';
                i++;
                // grow by one and shift the tail right to make room for the second 's'
                buffer = termAtt.resizeBuffer(1 + length);
                if (i < length)
                {
                    Array.Copy(buffer, i, buffer, i + 1, (length - i));
                }
                buffer[i] = 's';
                length++;
                state = N;
                break;

            default:
                state = N;
                break;
        }
    }

    termAtt.Length = length;
    return true;
}
/// <summary>
/// Emits the next n-gram of the current input term, pulling a new term from the input
/// whenever the previous one has been fully processed.
/// From <c>Version.LUCENE_44</c> on, grams are measured in code points, iterated by
/// start position first, and keep the offsets of the whole term. Before that, grams are
/// measured in chars and iterated by gram size first.
/// </summary>
/// <returns><c>true</c> when a gram was produced; <c>false</c> when the input is exhausted.</returns>
public override bool incrementToken()
{
    for (;;)
    {
        if (curTermBuffer == null)
        {
            if (!input.incrementToken())
            {
                return false;
            }

            // remember everything about the new term before the attributes get overwritten
            curTermBuffer = termAtt.buffer().clone();
            curTermLength = termAtt.length();
            curCodePointCount = charUtils.codePointCount(termAtt);
            curGramSize = minGram;
            curPos = 0;
            curPosInc = posIncAtt.PositionIncrement;
            curPosLen = posLenAtt.PositionLength;
            tokStart = offsetAtt.startOffset();
            tokEnd = offsetAtt.endOffset();
            // if length by start + end offsets doesn't match the term text then assume
            // this is a synonym and don't adjust the offsets.
            hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
        }

        if (version.onOrAfter(Version.LUCENE_44))
        {
            if (curGramSize > maxGram || (curPos + curGramSize) > curCodePointCount)
            {
                // all sizes done for this position: go to the next one
                ++curPos;
                curGramSize = minGram;
            }

            if ((curPos + curGramSize) <= curCodePointCount)
            {
                clearAttributes();
                int start = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
                int end = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
                termAtt.copyBuffer(curTermBuffer, start, end - start);
                posIncAtt.PositionIncrement = curPosInc;
                curPosInc = 0;
                posLenAtt.PositionLength = curPosLen;
                offsetAtt.setOffset(tokStart, tokEnd);
                curGramSize++;
                return true;
            }
        }
        else
        {
            while (curGramSize <= maxGram)
            {
                if (curPos + curGramSize <= curTermLength) // while there is input
                {
                    clearAttributes();
                    termAtt.copyBuffer(curTermBuffer, curPos, curGramSize);
                    if (hasIllegalOffsets)
                    {
                        offsetAtt.setOffset(tokStart, tokEnd);
                    }
                    else
                    {
                        offsetAtt.setOffset(tokStart + curPos, tokStart + curPos + curGramSize);
                    }
                    curPos++;
                    return true;
                }

                curGramSize++; // increase n-gram size
                curPos = 0;
            }
        }

        // this term is finished: fetch the next one on the next iteration
        curTermBuffer = null;
    }
}
/*
 * Need to worry about multiple scenarios:
 *  - need to go for the longest match
 *    a b => foo      #shouldn't match if "a b" is followed by "c d"
 *    a b c d => bar
 *  - need to backtrack - retry matches for tokens already read
 *     a b c d => foo
 *       b c => bar
 *     If the input stream is "a b c x", one will consume "a b c d"
 *     trying to match the first rule... all but "a" should be
 *     pushed back so a match may be made on "b c".
 *  - don't try and match generated tokens (thus need separate queue)
 *    matching is not recursive.
 *  - handle optional generation of original tokens in all these cases,
 *    merging token streams to preserve token positions.
 *  - preserve original positionIncrement of first matched token
 */

/// <summary>
/// Produces the next token: a pending generated token if there is one, otherwise the
/// next input token, which is first checked against <c>map</c>. When a synonym rule
/// matches, the replacement tokens (merged with the original tokens if the rule asks
/// for them) are queued in <c>replacement</c> and returned on the following iterations.
/// </summary>
/// <returns><c>true</c> when a token was produced; <c>false</c> when no token is left.</returns>
public override bool incrementToken()
{
    while (true)
    {
        // if there are any generated tokens, return them... don't try any
        // matches against them, as we specifically don't want recursion.
        // NOTE(review): hasNext()/next() are Java iterator calls left by the converter, while
        // 'replacement' is assigned from List<T>.GetEnumerator() below. The field's declared
        // type is outside this block, so it is left untouched here - confirm and align.
        if (replacement != null && replacement.hasNext())
        {
            copy(this, replacement.next());
            return true;
        }

        // common case fast-path of first token not matching anything
        AttributeSource firstTok = nextTok();
        if (firstTok == null)
        {
            return false;
        }

        CharTermAttribute termAtt = firstTok.addAttribute(typeof(CharTermAttribute));
        SlowSynonymMap result = map.submap != null ? map.submap.get(termAtt.buffer(), 0, termAtt.length()) : null;
        if (result == null)
        {
            copy(this, firstTok);
            return true;
        }

        // fast-path failed, clone ourselves if needed
        if (firstTok == this)
        {
            firstTok = cloneAttributes();
        }

        // OK, we matched a token, so find the longest match.
        // C# has no diamond operator, so the type argument is spelled out.
        matched = new LinkedList<AttributeSource>();

        result = match(result);

        if (result == null)
        {
            // no match, simply return the first token read.
            copy(this, firstTok);
            return true;
        }

        // reuse, or create new one each time?
        List<AttributeSource> generated = new List<AttributeSource>(result.synonyms.Length + matched.Count + 1);

        //
        // there was a match... let's generate the new tokens, merging
        // in the matched tokens (position increments need adjusting)
        //
        AttributeSource lastTok = matched.Count == 0 ? firstTok : matched.Last.Value;
        bool includeOrig = result.includeOrig();

        AttributeSource origTok = includeOrig ? firstTok : null;
        PositionIncrementAttribute firstPosIncAtt = firstTok.addAttribute(typeof(PositionIncrementAttribute));
        int origPos = firstPosIncAtt.PositionIncrement; // position of origTok in the original stream
        int repPos = 0; // curr position in replacement token stream
        int pos = 0; // current position in merged token stream

        for (int i = 0; i < result.synonyms.Length; i++)
        {
            Token repTok = result.synonyms[i];
            AttributeSource newTok = firstTok.cloneAttributes();
            CharTermAttribute newTermAtt = newTok.addAttribute(typeof(CharTermAttribute));
            OffsetAttribute newOffsetAtt = newTok.addAttribute(typeof(OffsetAttribute));
            PositionIncrementAttribute newPosIncAtt = newTok.addAttribute(typeof(PositionIncrementAttribute));

            // the generated token spans from the first matched token to the last one
            OffsetAttribute lastOffsetAtt = lastTok.addAttribute(typeof(OffsetAttribute));
            newOffsetAtt.setOffset(newOffsetAtt.startOffset(), lastOffsetAtt.endOffset());
            newTermAtt.copyBuffer(repTok.buffer(), 0, repTok.length());

            repPos += repTok.PositionIncrement;
            if (i == 0) // make position of first token equal to original
            {
                repPos = origPos;
            }

            // if necessary, insert original tokens and adjust position increment
            while (origTok != null && origPos <= repPos)
            {
                PositionIncrementAttribute origPosInc = origTok.addAttribute(typeof(PositionIncrementAttribute));
                origPosInc.PositionIncrement = origPos - pos;
                generated.Add(origTok);
                pos += origPosInc.PositionIncrement;

                origTok = pollFirstMatched();
                if (origTok != null)
                {
                    origPosInc = origTok.addAttribute(typeof(PositionIncrementAttribute));
                    origPos += origPosInc.PositionIncrement;
                }
            }

            newPosIncAtt.PositionIncrement = repPos - pos;
            generated.Add(newTok);
            pos += newPosIncAtt.PositionIncrement;
        }

        // finish up any leftover original tokens
        while (origTok != null)
        {
            PositionIncrementAttribute origPosInc = origTok.addAttribute(typeof(PositionIncrementAttribute));
            origPosInc.PositionIncrement = origPos - pos;
            generated.Add(origTok);
            pos += origPosInc.PositionIncrement;

            origTok = pollFirstMatched();
            if (origTok != null)
            {
                origPosInc = origTok.addAttribute(typeof(PositionIncrementAttribute));
                origPos += origPosInc.PositionIncrement;
            }
        }

        // what if we replaced a longer sequence with a shorter one?
        // a/0 b/5 =>  foo/0
        // should I re-create the gap on the next buffered token?

        replacement = generated.GetEnumerator();
        // Now return to the top of the loop to read and return the first
        // generated token.. The reason this is done is that we may have generated
        // nothing at all, and may need to continue with more matching logic.
    }
}

/// <summary>
/// Removes and returns the first element of <c>matched</c>, or returns <c>null</c> when
/// the list is empty. Needed because <c>LinkedList&lt;T&gt;.RemoveFirst()</c> returns
/// void and therefore cannot be used as an expression.
/// </summary>
private AttributeSource pollFirstMatched()
{
    if (matched.Count == 0)
    {
        return null;
    }

    AttributeSource first = matched.First.Value;
    matched.RemoveFirst();
    return first;
}
/// <summary>
/// Emits the next path prefix. Each token consists of the previously emitted token
/// (<c>resultToken</c>) plus the characters read up to the next <c>delimiter</c>;
/// delimiters are written as <c>replacement</c>. Components are skipped (and only
/// advance <c>startPosition</c>) until <c>skipped</c> exceeds <c>skip</c>.
/// </summary>
/// <returns><c>true</c> when a token was produced; <c>false</c> at the end of the input.</returns>
public override bool incrementToken()
{
    clearAttributes();
    termAtt.append(resultToken);
    posAtt.PositionIncrement = resultToken.Length == 0 ? 1 : 0;

    int length = 0;
    bool added = false;
    if (endDelimiter)
    {
        // the previous call stopped on a delimiter: it starts this component
        termAtt.append(replacement);
        length++;
        endDelimiter = false;
        added = true;
    }

    while (true)
    {
        int c = input.read();
        if (c < 0)
        {
            // end of input
            if (skipped <= skip)
            {
                return false;
            }

            length += resultToken.Length;
            termAtt.Length = length;
            offsetAtt.setOffset(correctOffset(startPosition), correctOffset(startPosition + length));
            if (added)
            {
                resultToken.Length = 0;
                resultToken.Append(termAtt.buffer(), 0, length);
            }
            return added;
        }
        charsRead++;

        // decide which character this input char contributes to the token
        char toAppend;
        if (!added)
        {
            added = true;
            skipped++;
            toAppend = c == delimiter ? replacement : (char)c;
        }
        else if (c == delimiter)
        {
            if (skipped > skip)
            {
                // component complete: remember the delimiter for the next call
                endDelimiter = true;
                break;
            }
            skipped++;
            toAppend = replacement;
        }
        else
        {
            toAppend = (char)c;
        }

        if (skipped > skip)
        {
            termAtt.append(toAppend);
            length++;
        }
        else
        {
            // still inside the skipped part of the path
            startPosition++;
        }
    }

    length += resultToken.Length;
    termAtt.Length = length;
    offsetAtt.setOffset(correctOffset(startPosition), correctOffset(startPosition + length));
    resultToken.Length = 0;
    resultToken.Append(termAtt.buffer(), 0, length);
    return true;
}
/// <summary>
/// Splits tokens whose first character is in the THAI Unicode block into the segments
/// reported by <c>breaker</c>. The first segment is emitted by truncating the current
/// term; later segments are copied out of a clone of the original token on following
/// calls. Other tokens (and empty ones) pass through unchanged.
/// </summary>
/// <returns><c>true</c> when a token was produced; <c>false</c> otherwise.</returns>
public override bool incrementToken()
{
    if (hasMoreTokensInClone)
    {
        // Locals here are named differently from the 'firstEnd' local further down:
        // C# does not allow a nested block to reuse the name of a local declared in the
        // enclosing method block (error CS0136), even if the outer one is declared later.
        int segmentStart = breaker.current();
        int segmentEnd = breaker.next();
        if (segmentEnd != BreakIterator.DONE)
        {
            clonedToken.copyTo(this);
            termAtt.copyBuffer(clonedTermAtt.buffer(), segmentStart, segmentEnd - segmentStart);
            if (hasIllegalOffsets)
            {
                offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.endOffset());
            }
            else
            {
                offsetAtt.setOffset(clonedOffsetAtt.startOffset() + segmentStart, clonedOffsetAtt.startOffset() + segmentEnd);
            }
            if (handlePosIncr)
            {
                posAtt.PositionIncrement = 1;
            }
            return true;
        }
        hasMoreTokensInClone = false;
    }

    if (!input.incrementToken())
    {
        return false;
    }

    if (termAtt.length() == 0 || char.UnicodeBlock.of(termAtt.charAt(0)) != char.UnicodeBlock.THAI)
    {
        return true;
    }

    hasMoreTokensInClone = true;

    // if length by start + end offsets doesn't match the term text then assume
    // this is a synonym and don't adjust the offsets.
    hasIllegalOffsets = offsetAtt.endOffset() - offsetAtt.startOffset() != termAtt.length();

    // we lazy init the cloned token, as in ctor not all attributes may be added
    if (clonedToken == null)
    {
        clonedToken = cloneAttributes();
        clonedTermAtt = clonedToken.getAttribute(typeof(CharTermAttribute));
        clonedOffsetAtt = clonedToken.getAttribute(typeof(OffsetAttribute));
    }
    else
    {
        this.copyTo(clonedToken);
    }

    // reinit CharacterIterator
    charIterator.setText(clonedTermAtt.buffer(), 0, clonedTermAtt.length());
    breaker.Text = charIterator;

    int firstEnd = breaker.next();
    if (firstEnd != BreakIterator.DONE)
    {
        termAtt.Length = firstEnd;
        if (hasIllegalOffsets)
        {
            offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.endOffset());
        }
        else
        {
            offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.startOffset() + firstEnd);
        }
        // position increment keeps as it is for first token
        return true;
    }

    return false;
}
/// <summary>
/// Asserts that <paramref name="term"/> has the same length as
/// <paramref name="expected"/> and that its buffer starts with the same characters.
/// </summary>
/// <param name="term">the term attribute under test</param>
/// <param name="expected">the expected term text</param>
private void assertEquals(CharTermAttribute term, string expected)
{
    assertEquals(expected.Length, term.length());

    char[] buffer = term.buffer();
    int chIDX = 0;
    foreach (char expectedChar in expected)
    {
        assertEquals(expectedChar, buffer[chIDX]);
        chIDX++;
    }
}
/// <summary>
/// Emits the next n-gram from the buffered code points. The code point buffer is
/// compacted and refilled from <c>input</c> as needed; a gram is retried at the next
/// position when it would contain a non-token char, or when <c>edgesOnly</c> is set
/// and the previous char is a token char.
/// </summary>
/// <returns><c>true</c> when a gram was produced; <c>false</c> when no further gram fits.</returns>
public override bool incrementToken()
{
    clearAttributes();

    // termination of this loop is guaranteed by the fact that every iteration
    // either advances the buffer (calls consumes()) or increases gramSize
    for (;;)
    {
        // compact
        if (bufferStart >= bufferEnd - maxGram - 1 && !exhausted)
        {
            Array.Copy(buffer, bufferStart, buffer, 0, bufferEnd - bufferStart);
            bufferEnd -= bufferStart;
            lastCheckedChar -= bufferStart;
            lastNonTokenChar -= bufferStart;
            bufferStart = 0;

            // fill in remaining space
            exhausted = !charUtils.fill(charBuffer, input, buffer.Length - bufferEnd);
            // convert to code points
            bufferEnd += charUtils.toCodePoints(charBuffer.Buffer, 0, charBuffer.Length, buffer, bufferEnd);
        }

        // should we go to the next offset?
        bool gramExhausted = gramSize > maxGram || (bufferStart + gramSize) > bufferEnd;
        if (gramExhausted)
        {
            if (bufferStart + 1 + minGram > bufferEnd)
            {
                Debug.Assert(exhausted);
                return false;
            }
            consume();
            gramSize = minGram;
        }

        updateLastNonTokenChar();

        // retry if the token to be emitted was going to not only contain token chars
        bool termContainsNonTokenChar = lastNonTokenChar >= bufferStart && lastNonTokenChar < (bufferStart + gramSize);
        bool isEdgeAndPreviousCharIsTokenChar = edgesOnly && lastNonTokenChar != bufferStart - 1;
        if (termContainsNonTokenChar || isEdgeAndPreviousCharIsTokenChar)
        {
            consume();
            gramSize = minGram;
            continue;
        }

        int charLength = charUtils.toChars(buffer, bufferStart, gramSize, termAtt.buffer(), 0);
        termAtt.Length = charLength;
        posIncAtt.PositionIncrement = 1;
        posLenAtt.PositionLength = 1;
        offsetAtt.setOffset(correctOffset(offset), correctOffset(offset + charLength));
        ++gramSize;
        return true;
    }
}
/// <summary>
/// Replaces the current term with the encoded form of its collation key: the key bytes
/// obtained from <c>collator</c> are encoded into the term buffer by
/// <c>IndexableBinaryStringTools</c>.
/// </summary>
/// <returns><c>true</c> when a token is available; <c>false</c> when the input is exhausted.</returns>
public override bool IncrementToken()
{
    if (!input.IncrementToken())
    {
        return false;
    }

    var collationKey = collator.GetCollationKey(termAtt.ToString()).toByteArray();
    int keyLength = collationKey.Length;
    int encodedLength = IndexableBinaryStringTools.getEncodedLength(collationKey, 0, keyLength);

    // size the term to exactly hold the encoded key before writing into its buffer
    termAtt.resizeBuffer(encodedLength);
    termAtt.Length = encodedLength;
    IndexableBinaryStringTools.encode(collationKey, 0, keyLength, termAtt.buffer(), 0, encodedLength);
    return true;
}
/// <summary>
/// Emits the capture groups found by <c>matchers</c> in each input term. Pending
/// captures of the current term are emitted first (position increment 0, on top of the
/// restored state); otherwise the next input token is read. That token is returned
/// unchanged when <c>preserveOriginal</c> is set or nothing is captured, and is
/// otherwise replaced by its first capture.
/// </summary>
/// <returns><c>true</c> when a token was produced; <c>false</c> when the input is exhausted.</returns>
public override bool incrementToken()
{
    if (currentMatcher != -1 && nextCapture())
    {
        Debug.Assert(state != null);
        clearAttributes();
        restoreState(state);

        var matcher = matchers[currentMatcher];
        int start = matcher.start(currentGroup[currentMatcher]);
        int end = matcher.end(currentGroup[currentMatcher]);

        posAttr.PositionIncrement = 0;
        charTermAttr.copyBuffer(spare.chars, start, end - start);
        currentGroup[currentMatcher]++;
        return true;
    }

    if (!input.incrementToken())
    {
        return false;
    }

    // keep a copy of the term and the token state for the captures emitted later
    char[] buffer = charTermAttr.buffer();
    int length = charTermAttr.length();
    spare.copyChars(buffer, 0, length);
    state = captureState();

    for (int m = 0; m < matchers.Length; m++)
    {
        matchers[m].reset(spare);
        currentGroup[m] = -1;
    }

    if (preserveOriginal)
    {
        currentMatcher = 0;
        return true;
    }

    if (nextCapture())
    {
        var matcher = matchers[currentMatcher];
        int start = matcher.start(currentGroup[currentMatcher]);
        int end = matcher.end(currentGroup[currentMatcher]);

        if (start == 0)
        {
            // if we start at 0 we can simply set the length and save the copy
            charTermAttr.Length = end;
        }
        else
        {
            charTermAttr.copyBuffer(spare.chars, start, end - start);
        }
        currentGroup[currentMatcher]++;
    }

    return true;
}
/// <summary>
/// Accepts the current token only when its term is not contained in <c>stopWords</c>.
/// </summary>
/// <returns><c>true</c> to keep the token; <c>false</c> when its term is a stop word.</returns>
protected internal override bool Accept()
{
    bool isStopWord = stopWords.contains(termAtt.buffer(), 0, termAtt.length());
    return !isStopWord;
}