/// <summary>Advances to the next token, or returns false at end of stream.
/// <p/>Strips a trailing <tt>'s</tt> from words of apostrophe type.
/// <p/>Strips the dots out of acronym-typed tokens.
/// </summary>
public override bool IncrementToken()
{
    if (!input.IncrementToken())
    {
        return false;
    }

    char[] term = termAtt.TermBuffer();
    int len = termAtt.TermLength();
    System.String tokenType = typeAtt.Type();

    // NOTE: matches the original's reference comparison — the type constants
    // are assumed to be shared string instances, so identity is sufficient.
    if (ReferenceEquals(tokenType, APOSTROPHE_TYPE)
        && len >= 2
        && term[len - 2] == '\''
        && (term[len - 1] == 's' || term[len - 1] == 'S'))
    {
        // Drop the trailing apostrophe + s/S ("John's" -> "John").
        termAtt.SetTermLength(len - 2);
    }
    else if (ReferenceEquals(tokenType, ACRONYM_TYPE))
    {
        // Compact the term in place, skipping every '.'.
        int dst = 0;
        for (int src = 0; src < len; src++)
        {
            char c = term[src];
            if (c != '.')
            {
                term[dst++] = c;
            }
        }
        termAtt.SetTermLength(dst);
    }

    return true;
}
/// <summary>Emits the entire remaining input as a single token; subsequent
/// calls return false.</summary>
public override bool IncrementToken()
{
    if (done)
    {
        return false;
    }

    ClearAttributes();
    done = true;

    // Slurp the whole reader into the term buffer, growing it whenever full.
    char[] term = termAtt.TermBuffer();
    int filled = 0;
    for (;;)
    {
        int read = input.Read(term, filled, term.Length - filled);
        if (read == 0)
        {
            break; // TextReader.Read returns 0 at end of input
        }
        filled += read;
        if (filled == term.Length)
        {
            term = termAtt.ResizeTermBuffer(1 + term.Length);
        }
    }

    termAtt.SetTermLength(filled);
    finalOffset = CorrectOffset(filled);
    offsetAtt.SetOffset(CorrectOffset(0), finalOffset);
    return true;
}
/// <summary>Advances to the next token, folding accented characters where
/// present; returns false at end of stream.</summary>
public override bool IncrementToken()
{
    if (!input.IncrementToken())
    {
        return false;
    }

    char[] term = termAtt.TermBuffer();
    int len = termAtt.TermLength();

    // Fast path: most tokens are plain ASCII and are passed through
    // untouched. Only rewrite when a character in the candidate range
    // ['\u00c0', '\uFB06'] is found.
    for (int i = 0; i < len; i++)
    {
        char c = term[i];
        if (c >= '\u00c0' && c <= '\uFB06')
        {
            RemoveAccents(term, len);
            termAtt.SetTermBuffer(output, 0, outputPos);
            break;
        }
    }

    return true;
}
/// <summary>Advances to the next token and stems its term in place;
/// returns false at end of stream.</summary>
public override bool IncrementToken()
{
    if (!input.IncrementToken())
    {
        return false;
    }

    // Only rewrite the term buffer when the stemmer reports success.
    bool stemmed = stemmer.Stem(termAtt.TermBuffer(), 0, termAtt.TermLength());
    if (stemmed)
    {
        termAtt.SetTermBuffer(stemmer.GetResultBuffer(), 0, stemmer.GetResultLength());
    }

    return true;
}
/// <summary>Advances to the next token and lower-cases its term in place;
/// returns false at end of stream.</summary>
public override bool IncrementToken()
{
    if (!input.IncrementToken())
    {
        return false;
    }

    // NOTE(review): Char.ToLower is culture-sensitive (e.g. Turkish 'I');
    // kept as-is to preserve the original port's behavior — confirm whether
    // invariant casing was intended.
    char[] term = termAtt.TermBuffer();
    for (int i = termAtt.TermLength() - 1; i >= 0; i--)
    {
        term[i] = System.Char.ToLower(term[i]);
    }

    return true;
}
/// <summary>Advances past stop words, returning true at the first token
/// whose term is not in the stop set; false once the stream is exhausted.</summary>
public override bool IncrementToken()
{
    // Count the position increments of every dropped stop word so they can
    // be folded into the surviving token.
    int skipped = 0;
    while (input.IncrementToken())
    {
        bool isStopWord = stopWords.Contains(termAtt.TermBuffer(), 0, termAtt.TermLength());
        if (!isStopWord)
        {
            if (enablePositionIncrements)
            {
                posIncrAtt.SetPositionIncrement(posIncrAtt.GetPositionIncrement() + skipped);
            }
            return true;
        }
        skipped += posIncrAtt.GetPositionIncrement();
    }

    // Reached end of stream without finding a non-stop word.
    return false;
}
/// <summary>Advances to the next token, attaching a payload that tags the
/// term as Entity or Noise (plus its position) unless the term is listed in
/// the no-payload set; returns false at end of stream.</summary>
public override bool IncrementToken()
{
    if (!input.IncrementToken())
    {
        return false;
    }

    System.String term = new System.String(termAtt.TermBuffer(), 0, termAtt.TermLength());
    if (!nopayload.Contains(term))
    {
        // Same payload shape as before: "<term>:<kind>:<pos>" in UTF-8.
        System.String kind = entities.Contains(term) ? ":Entity:" : ":Noise:";
        payloadAtt.SetPayload(new Payload(System.Text.UTF8Encoding.UTF8.GetBytes(term + kind + pos)));
    }

    pos += posIncrAtt.GetPositionIncrement();
    return true;
}
/// <summary>Emits at most one token, built from a single character read from
/// the input; returns false on every call after the first.</summary>
public override bool IncrementToken()
{
    // BUGFIX: the read used to execute before this guard, so every call
    // after exhaustion still consumed a character from the shared reader.
    if (done)
    {
        return false;
    }

    int count = input.Read((System.Char[])buffer, 0, buffer.Length);
    ClearAttributes();
    done = true;
    if (count == 1)
    {
        // Exactly one character available: emit it as the term.
        termAtt.TermBuffer()[0] = buffer[0];
        termAtt.SetTermLength(1);
    }
    else
    {
        // Nothing read (end of input): emit an empty term.
        termAtt.SetTermLength(0);
    }
    return true;
}
/// <summary>Scans the buffered input for the next maximal run of token
/// characters (per <c>IsTokenChar</c>), normalizing each character into the
/// term buffer and recording corrected start/end offsets. Returns false when
/// the input is exhausted and no partial token is pending.</summary>
public override bool IncrementToken()
{
    ClearAttributes();
    int length = 0;           // number of chars accumulated into the term so far
    int start = bufferIndex;  // provisional token start; fixed up on first token char
    char[] buffer = termAtt.TermBuffer();
    while (true)
    {
        // Refill the I/O buffer when it has been fully consumed.
        if (bufferIndex >= dataLen)
        {
            offset += dataLen;
            dataLen = input.Read((System.Char[])ioBuffer, 0, ioBuffer.Length);
            if (dataLen <= 0)
            {
                dataLen = 0; // so next offset += dataLen won't decrement offset
                if (length > 0)
                {
                    // Flush the token accumulated before end-of-input.
                    break;
                }
                else
                {
                    // End of input and nothing pending: no more tokens.
                    return (false);
                }
            }
            bufferIndex = 0;
        }

        char c = ioBuffer[bufferIndex++];

        if (IsTokenChar(c))
        {
            // if it's a token char
            if (length == 0)
            {
                // start of token — compute its absolute offset in the input
                start = offset + bufferIndex - 1;
            }
            else if (length == buffer.Length)
            {
                // term buffer full: grow it before appending
                buffer = termAtt.ResizeTermBuffer(1 + length);
            }

            buffer[length++] = Normalize(c); // buffer it, normalized

            if (length == MAX_WORD_LEN)
            {
                // buffer overflow! truncate the token at the maximum length
                break;
            }
        }
        else if (length > 0)
        {
            // at non-Letter w/ chars
            break; // return 'em
        }
    }

    termAtt.SetTermLength(length);
    offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + length));
    return (true);
}
// Primary entry point (for first TermsHash)
/// <summary>Adds the current term (read from <c>termAtt</c>) to the postings
/// hash: repairs invalid UTF-16 while hashing, probes the open-addressed
/// table, interns new terms into the char pool and allocates their int/byte
/// stream slices, then notifies the consumer via NewTerm/AddTerm.
/// BUGFIX vs. original: removed a stray empty statement after the
/// TermBuffer() call; all other tokens are unchanged.</summary>
internal override void Add()
{
    System.Diagnostics.Debug.Assert(!postingsCompacted);

    // We are first in the chain so we must "intern" the
    // term text into textStart address

    // Get the text of this term.
    char[] tokenText = termAtt.TermBuffer();
    int tokenTextLen = termAtt.TermLength();

    // Compute hashcode & replace any invalid UTF16 sequences.
    // The scan runs back-to-front so a low surrogate can look at the
    // preceding char to decide whether it is part of a valid pair.
    int downto = tokenTextLen;
    int code = 0;
    while (downto > 0)
    {
        char ch = tokenText[--downto];

        if (ch >= UnicodeUtil.UNI_SUR_LOW_START && ch <= UnicodeUtil.UNI_SUR_LOW_END)
        {
            if (0 == downto)
            {
                // Unpaired
                ch = tokenText[downto] = (char)(UnicodeUtil.UNI_REPLACEMENT_CHAR);
            }
            else
            {
                char ch2 = tokenText[downto - 1];
                if (ch2 >= UnicodeUtil.UNI_SUR_HIGH_START && ch2 <= UnicodeUtil.UNI_SUR_HIGH_END)
                {
                    // OK: high followed by low. This is a valid
                    // surrogate pair.
                    code = ((code * 31) + ch) * 31 + ch2;
                    downto--;
                    continue;
                }
                else
                {
                    // Unpaired
                    ch = tokenText[downto] = (char)(UnicodeUtil.UNI_REPLACEMENT_CHAR);
                }
            }
        }
        else if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && (ch <= UnicodeUtil.UNI_SUR_HIGH_END || ch == 0xffff))
        {
            // Unpaired or 0xffff
            ch = tokenText[downto] = (char)(UnicodeUtil.UNI_REPLACEMENT_CHAR);
        }

        code = (code * 31) + ch;
    }

    int hashPos = code & postingsHashMask;

    // Locate RawPostingList in hash
    p = postingsHash[hashPos];

    if (p != null && !PostingEquals(tokenText, tokenTextLen))
    {
        // Conflict: keep searching different locations in
        // the hash table.
        int inc = ((code >> 8) + code) | 1;
        do
        {
            code += inc;
            hashPos = code & postingsHashMask;
            p = postingsHash[hashPos];
        }
        while (p != null && !PostingEquals(tokenText, tokenTextLen));
    }

    if (p == null)
    {
        // First time we are seeing this token since we last
        // flushed the hash.
        int textLen1 = 1 + tokenTextLen; // +1 for the 0xffff terminator below

        if (textLen1 + charPool.charUpto > DocumentsWriter.CHAR_BLOCK_SIZE)
        {
            if (textLen1 > DocumentsWriter.CHAR_BLOCK_SIZE)
            {
                // Just skip this term, to remain as robust as
                // possible during indexing.  A TokenFilter
                // can be inserted into the analyzer chain if
                // other behavior is wanted (pruning the term
                // to a prefix, throwing an exception, etc).
                if (docState.maxTermPrefix == null)
                {
                    docState.maxTermPrefix = new System.String(tokenText, 0, 30);
                }

                consumer.SkippingLongTerm();
                return;
            }
            charPool.NextBuffer();
        }

        // Refill?
        if (0 == perThread.freePostingsCount)
        {
            perThread.MorePostings();
        }

        // Pull next free RawPostingList from free list
        p = perThread.freePostings[--perThread.freePostingsCount];
        System.Diagnostics.Debug.Assert(p != null);

        // Intern the term text into the char pool, terminated by 0xffff.
        char[] text = charPool.buffer;
        int textUpto = charPool.charUpto;
        p.textStart = textUpto + charPool.charOffset;
        charPool.charUpto += textLen1;
        Array.Copy(tokenText, 0, text, textUpto, tokenTextLen);
        text[textUpto + tokenTextLen] = (char)(0xffff);

        System.Diagnostics.Debug.Assert(postingsHash[hashPos] == null);
        postingsHash[hashPos] = p;
        numPostings++;

        if (numPostings == postingsHashHalfSize)
        {
            RehashPostings(2 * postingsHashSize);
        }

        // Init stream slices
        if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE)
        {
            intPool.NextBuffer();
        }

        if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt * ByteBlockPool.FIRST_LEVEL_SIZE)
        {
            bytePool.NextBuffer();
        }

        intUptos = intPool.buffer;
        intUptoStart = intPool.intUpto;
        intPool.intUpto += streamCount;

        p.intStart = intUptoStart + intPool.intOffset;

        // One byte-pool slice per stream; record each slice's start address.
        for (int i = 0; i < streamCount; i++)
        {
            int upto = bytePool.NewSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
            intUptos[intUptoStart + i] = upto + bytePool.byteOffset;
        }
        p.byteStart = intUptos[intUptoStart];

        consumer.NewTerm(p);
    }
    else
    {
        // Seen-before term: restore its int-stream write positions.
        intUptos = intPool.buffers[p.intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
        intUptoStart = p.intStart & DocumentsWriter.INT_BLOCK_MASK;
        consumer.AddTerm(p);
    }

    if (doNextCall)
    {
        nextPerField.Add(p.textStart);
    }
}