/// <summary>
/// Advances to the next token found by <c>matcher</c> over <c>str</c>.
/// With a non-negative <c>group</c>, every non-empty match of that group is a token;
/// otherwise matches are treated as delimiters and the text between them is emitted
/// (String.split() style).
/// </summary>
/// <returns>true when a token was written to the attributes; false once exhausted.</returns>
public override bool incrementToken()
{
    if (index >= str.Length)
    {
        return false;
    }

    clearAttributes();

    if (group < 0)
    {
        // Split mode: emit the text that sits between consecutive delimiter matches.
        while (matcher.find())
        {
            int delimiterStart = matcher.start();
            bool hasText = delimiterStart - index > 0;
            if (hasText)
            {
                termAtt.setEmpty().append(str, index, delimiterStart);
                offsetAtt.setOffset(correctOffset(index), correctOffset(delimiterStart));
            }

            // Always resume scanning just past the delimiter.
            index = matcher.end();
            if (hasText)
            {
                return true;
            }
        }

        // No more delimiters: whatever remains (if anything) is the final token.
        int inputEnd = str.Length;
        bool hasTail = inputEnd - index != 0;
        if (hasTail)
        {
            termAtt.setEmpty().append(str, index, inputEnd);
            offsetAtt.setOffset(correctOffset(index), correctOffset(inputEnd));
        }

        index = int.MaxValue; // mark exhausted
        return hasTail;
    }

    // Group mode: each non-empty occurrence of the requested group is a token.
    while (matcher.find())
    {
        index = matcher.start(group);
        int groupEnd = matcher.end(group);
        if (index != groupEnd)
        {
            termAtt.setEmpty().append(str, index, groupEnd);
            offsetAtt.setOffset(correctOffset(index), correctOffset(groupEnd));
            return true;
        }
    }

    index = int.MaxValue; // mark exhausted
    return false;
}
/// <summary>
/// Scans <c>str</c> from <c>pos</c> for the next run of token characters,
/// optionally lower-cases it, and skips any run that <c>isStopWord</c> rejects.
/// </summary>
/// <returns>true when a token was emitted; false when the end of <c>str</c> was reached.</returns>
/// <exception cref="System.InvalidOperationException">Thrown when <c>str</c> is null.</exception>
public override bool incrementToken()
{
    if (str == null)
    {
        throw new System.InvalidOperationException("Consumer did not call reset().");
    }

    clearAttributes();

    // Work on local copies of the instance state inside the scan loops.
    string source = str;
    int limit = source.Length;
    int cursor = pos;
    bool letterMode = isLetter;
    int tokenStart = 0;
    string token;

    while (true)
    {
        token = null;

        // Skip everything that is not part of a token.
        while (cursor < limit && !isTokenChar(source[cursor], letterMode))
        {
            cursor++;
        }

        if (cursor >= limit)
        {
            break; // ran off the end without finding another token
        }

        // Consume the token itself.
        tokenStart = cursor;
        while (cursor < limit && isTokenChar(source[cursor], letterMode))
        {
            cursor++;
        }

        token = source.Substring(tokenStart, cursor - tokenStart);
        if (toLowerCase)
        {
            token = token.ToLower(locale);
        }

        if (!isStopWord(token))
        {
            break; // keep this token
        }
    }

    pos = cursor;
    if (token == null)
    {
        return false;
    }

    termAtt.setEmpty().append(token);
    offsetAtt.setOffset(correctOffset(tokenStart), correctOffset(cursor));
    return true;
}
/// <summary>
/// Merges a run of consecutive scanner tokens of type <paramref name="tokenType"/> into a
/// single term flagged with <c>UNTOKENIZED_TOKEN_FLAG</c>, while capturing the attribute
/// state of every individual token in the run and exposing those states through <c>tokens</c>.
/// </summary>
/// <param name="tokenType">Scanner token type that the run must consist of.</param>
/// <param name="type">Type string handed to <c>setupSavedToken</c> for each saved token.</param>
private void collapseAndSaveTokens(int tokenType, string type)
{
    StringBuilder collapsed = new StringBuilder(32);
    int numAdded = scanner.setText(collapsed);
    //TODO: how to know how much whitespace to add
    int theStart = scanner.yychar();
    int lastPos = theStart + numAdded;
    int numSeen = 0;
    IList<AttributeSource.State> savedStates = new List<AttributeSource.State>();

    // The first token of the run is saved with a position increment of 0.
    setupSavedToken(0, type);
    savedStates.Add(captureState());

    // Keep pulling tokens while they have the same type and still belong to the same
    // wiki item (i.e. we have not transitioned to a new item of the same type).
    int tmpTokType;
    while (true)
    {
        tmpTokType = scanner.NextToken;
        if (tmpTokType == WikipediaTokenizerImpl.YYEOF
            || tmpTokType != tokenType
            || scanner.NumWikiTokensSeen <= numSeen)
        {
            break;
        }

        int currPos = scanner.yychar();

        // Pad with one space per character skipped between the previous token and this one.
        int gap = currPos - lastPos;
        if (gap > 0)
        {
            collapsed.Append(' ', gap);
        }

        numAdded = scanner.setText(collapsed);
        setupSavedToken(scanner.PositionIncrement, type);
        savedStates.Add(captureState());
        lastPos = currPos + numAdded;
        numSeen++;
    }

    // TODO: this is inefficient
    string s = collapsed.ToString().Trim();
    termAtt.setEmpty().append(s);
    offsetAtt.setOffset(correctOffset(theStart), correctOffset(theStart + s.Length));
    flagsAtt.Flags = UNTOKENIZED_TOKEN_FLAG;

    // The loop above has already advanced to the token after the run, so hand it
    // back to the scanner unless the input ended.
    if (tmpTokType != WikipediaTokenizerImpl.YYEOF)
    {
        scanner.yypushback(scanner.yylength());
    }

    tokens = savedStates.GetEnumerator();
}
/// <summary>
/// Populates every attribute with the same fixed values on each call.
/// </summary>
/// <returns>Always true.</returns>
public override bool incrementToken()
{
    clearAttributes();

    // Term text and its offsets.
    termAtt.setEmpty().append("accents");
    offsetAtt.setOffset(2, 7);

    // Remaining attributes, each set to a fixed value.
    typeAtt.Type = "wrd";
    posIncAtt.PositionIncrement = 3;
    sbyte[] payloadBytes = new sbyte[] { 0, 1, 2, 3 };
    payloadAtt.Payload = new BytesRef(payloadBytes);
    flagsAtt.Flags = 77;

    return true;
}
/// <summary>
/// Pulls the next token from <c>input</c> and strips leading and trailing whitespace from
/// its term. When <c>updateOffsets</c> is set and the offset span equals the term length,
/// the offsets are narrowed to match the trimmed text.
/// </summary>
/// <returns>false when <c>input</c> has no more tokens; otherwise true.</returns>
public override bool incrementToken()
{
    if (!input.incrementToken())
    {
        return false;
    }

    char[] chars = termAtt.buffer();
    int length = termAtt.length();

    //TODO: Is this the right behavior or should we return false? Currently, " ", returns true,
    //so I think this should also return true
    if (length == 0)
    {
        return true;
    }

    // Count whitespace at the front.
    int first = 0;
    while (first < length && char.IsWhiteSpace(chars[first]))
    {
        first++;
    }

    // Walk back from the end over trailing whitespace, counting what is removed.
    int last = length;
    int trimmedFromEnd = 0;
    while (last >= first && char.IsWhiteSpace(chars[last - 1]))
    {
        last--;
        trimmedFromEnd++;
    }

    bool changed = first > 0 || last < length;
    if (!changed)
    {
        return true;
    }

    bool hasContent = first < last;
    if (hasContent)
    {
        termAtt.copyBuffer(chars, first, last - first);
    }
    else
    {
        termAtt.setEmpty();
    }

    if (updateOffsets && length == offsetAtt.endOffset() - offsetAtt.startOffset())
    {
        int newStart = offsetAtt.startOffset() + first;
        int newEnd = offsetAtt.endOffset() - (hasContent ? trimmedFromEnd : 0);
        offsetAtt.setOffset(newStart, newEnd);
    }

    return true;
}
/// <summary>
/// Consumes <paramref name="in"/> and returns its terms joined by single spaces.
/// TODO: rewrite tests not to use string comparison.
/// </summary>
/// <param name="in">Token stream to drain; it is reset before and closed after consumption.</param>
/// <returns>The space-separated term text of every token produced by the stream.</returns>
private static string tsToString(TokenStream @in)
{
    CharTermAttribute termAtt = @in.addAttribute(typeof(CharTermAttribute));

    // Extra safety: wipe the attributes and plant a bogus term so that a stream which
    // fails to set its own term on incrementToken() shows up in the output.
    @in.clearAttributes();
    termAtt.setEmpty().append("bogusTerm");

    @in.reset();

    StringBuilder joined = new StringBuilder();
    while (@in.incrementToken())
    {
        if (joined.Length > 0)
        {
            joined.Append(' ');
        }

        joined.Append(termAtt.ToString());

        // Re-plant the bogus term before asking for the next token.
        @in.clearAttributes();
        termAtt.setEmpty().append("bogusTerm");
    }

    @in.close();
    return joined.ToString();
}
/// <summary>
/// Emits the next <c>Token</c> from the <c>toks</c> iterator, copying its term text,
/// offsets and position increment into this stream's attributes.
/// </summary>
/// <returns>true when a token was emitted; false when the iterator is exhausted.</returns>
public override bool incrementToken()
{
    if (!toks.hasNext())
    {
        return false;
    }

    clearAttributes();

    Token current = toks.next();
    termAtt.setEmpty().append(current);
    offsetAtt.setOffset(current.startOffset(), current.endOffset());
    posIncAtt.PositionIncrement = current.PositionIncrement;
    return true;
}
/// <summary>
/// Emits the next non-empty stretch of <c>str</c> lying between matches of <c>matcher</c>
/// (or between the last match and the end of the text), lower-casing it when
/// <c>toLowerCase</c> is set.
/// </summary>
/// <returns>true when a token was emitted; false when no text remains.</returns>
/// <exception cref="System.InvalidOperationException">Thrown when <c>initialized</c> is false.</exception>
public override bool incrementToken()
{
    if (!initialized)
    {
        throw new System.InvalidOperationException("Consumer did not call reset().");
    }

    if (matcher == null)
    {
        return false;
    }

    clearAttributes();

    // Looping handles empty spans at the leading and trailing boundaries.
    for (;;)
    {
        int spanStart = pos;
        int spanEnd;

        bool found = matcher.find();
        if (found)
        {
            spanEnd = matcher.start();
            pos = matcher.end();
        }
        else
        {
            spanEnd = str.Length;
            matcher = null; // we're finished
        }

        if (spanStart == spanEnd)
        {
            // Empty span: stop if there was no match, otherwise try the next one.
            if (!found)
            {
                return false;
            }

            continue;
        }

        string text = str.Substring(spanStart, spanEnd - spanStart);
        if (toLowerCase)
        {
            text = text.ToLower(locale);
        }

        termAtt.setEmpty().append(text);
        offsetAtt.setOffset(correctOffset(spanStart), correctOffset(spanEnd));
        return true;
    }
}
/// <summary>
/// Emits the next entry of the <c>tokens</c> array, copying its term, offsets, position
/// increment, flags, type and payload into this stream's attributes.
/// </summary>
/// <returns>true when a token was emitted; false once <c>index</c> has passed the last entry.</returns>
public override bool incrementToken()
{
    if (index >= tokens.Length)
    {
        return false;
    }

    clearAttributes();

    Token current = tokens[index++];
    termAtt.setEmpty().append(current);
    offsetAtt.setOffset(current.startOffset(), current.endOffset());
    posIncAtt.PositionIncrement = current.PositionIncrement;
    flagsAtt.Flags = current.Flags;
    typeAtt.Type = current.type();
    payloadAtt.Payload = current.Payload;
    return true;
}
/// <summary>
/// Advances <c>input</c> and replaces the term with its stem, unless the token is marked
/// as a keyword or stemming leaves the term unchanged.
/// </summary>
/// <returns> Returns true for the next token in the stream, or false at EOS </returns>
public override bool incrementToken()
{
    if (!input.incrementToken())
    {
        return false;
    }

    // Tokens flagged as keywords are passed through without stemming.
    if (keywordAttr.Keyword)
    {
        return true;
    }

    string original = termAtt.ToString();
    string stemmed = stemmer.stem(original);

    // Only rewrite the term when the stemmer actually changed it.
    bool unchanged = stemmed == null || stemmed.Equals(original);
    if (!unchanged)
    {
        termAtt.setEmpty().append(stemmed);
    }

    return true;
}
/// <summary>
/// Passes tokens through from <c>input</c>; after each token whose term is "the", emits an
/// additional token restored from the captured state with term "hte" and a position
/// increment of 0.
/// </summary>
/// <returns>true when a token was emitted; false when <c>input</c> is exhausted.</returns>
public override bool incrementToken()
{
    // A state captured on the previous call means an extra token is pending.
    if (bufferedState != null)
    {
        restoreState(bufferedState);
        posIncAtt.PositionIncrement = 0;
        termAtt.setEmpty().append("hte");
        bufferedState = null;
        return true;
    }

    if (!input.incrementToken())
    {
        return false;
    }

    // Remember the state of "the" so the extra token can be produced on the next call.
    bool isThe = termAtt.ToString().Equals("the");
    if (isThe)
    {
        bufferedState = captureState();
    }

    return true;
}
/// <summary>
/// Emits the next n-gram of the input. On the first call it reads up to 1024 characters
/// from <c>input</c>, trims them and stores the result in <c>inStr</c>; grams are then
/// produced for each size from <c>minGram</c> up to <c>maxGram</c>, sliding across the
/// text one position at a time.
/// </summary>
/// <returns>true when an n-gram was emitted; false at the end of the stream.</returns>
public override bool incrementToken()
{
    clearAttributes();

    if (!started)
    {
        started = true;
        gramSize = minGram;

        // TODO: refactor to a shared readFully somewhere:
        char[] buffer = new char[1024];
        charsRead = 0;
        while (charsRead < buffer.Length)
        {
            int got = input.read(buffer, charsRead, buffer.Length - charsRead);
            if (got == -1)
            {
                break;
            }

            charsRead += got;
        }

        // Trim() strips whitespace from both ends of the text that was read.
        inStr = new string(buffer, 0, charsRead).Trim();

        if (charsRead == buffer.Length)
        {
            // The buffer filled up: drain the rest of the input into a throwaway buffer
            // so that on end() we report the correct offset.
            char[] throwaway = new char[1024];
            for (int skipped = input.read(throwaway, 0, throwaway.Length);
                 skipped != -1;
                 skipped = input.read(throwaway, 0, throwaway.Length))
            {
                charsRead += skipped;
            }
        }

        inLen = inStr.Length;
        if (inLen == 0)
        {
            return false;
        }
    }

    // Hit the end of the string at this gram size: restart from the beginning with the
    // next larger size, and stop when that size exceeds maxGram or no longer fits.
    if (pos + gramSize > inLen)
    {
        pos = 0;
        gramSize++;
        if (gramSize > maxGram || pos + gramSize > inLen)
        {
            return false;
        }
    }

    int gramStart = pos;
    int gramEnd = gramStart + gramSize;
    pos++;

    termAtt.setEmpty().append(inStr, gramStart, gramEnd);
    offsetAtt.setOffset(correctOffset(gramStart), correctOffset(gramEnd));
    return true;
}