/// <summary>Returns the next token in the stream, or null at EOS.
/// <p>Removes <tt>'s</tt> from the end of words.
/// <p>Removes dots from acronyms.
/// </summary>
public override Token Next(/* in */ Token reusableToken)
{
    System.Diagnostics.Debug.Assert(reusableToken != null);
    Token nextToken = input.Next(reusableToken);
    if (nextToken == null)
        return null;

    char[] buffer = nextToken.TermBuffer();
    int bufferLength = nextToken.TermLength();
    System.String type = nextToken.Type();

    if (type == APOSTROPHE_TYPE &&
        bufferLength >= 2 &&
        buffer[bufferLength - 2] == '\'' &&
        (buffer[bufferLength - 1] == 's' || buffer[bufferLength - 1] == 'S'))
    {
        // Strip last 2 characters off
        nextToken.SetTermLength(bufferLength - 2);
    }
    else if (type == ACRONYM_TYPE)
    {
        // remove dots
        int upto = 0;
        for (int i = 0; i < bufferLength; i++)
        {
            char c = buffer[i];
            if (c != '.')
                buffer[upto++] = c;
        }
        nextToken.SetTermLength(upto);
    }

    return nextToken;
}
/// <summary>Returns the next token in the stream, or null at EOS.
/// <p>Removes <tt>'s</tt> from the end of words.
/// <p>Removes dots from acronyms.
/// </summary>
public override Token Next(Token result)
{
    Token t = input.Next(result);
    if (t == null)
        return null;

    char[] buffer = t.TermBuffer();
    int bufferLength = t.TermLength();
    System.String type = t.Type();

    if (type == APOSTROPHE_TYPE &&
        bufferLength >= 2 &&
        buffer[bufferLength - 2] == '\'' &&
        (buffer[bufferLength - 1] == 's' || buffer[bufferLength - 1] == 'S'))
    {
        // Strip last 2 characters off
        t.SetTermLength(bufferLength - 2);
    }
    else if (type == ACRONYM_TYPE)
    {
        // remove dots
        int upto = 0;
        for (int i = 0; i < bufferLength; i++)
        {
            char c = buffer[i];
            if (c != '.')
                buffer[upto++] = c;
        }
        t.SetTermLength(upto);
    }

    return t;
}
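// Both variants of Next above apply the same in-place term mutation; only the
// reusable-token plumbing differs. The following standalone sketch (hypothetical
// helper names, no Lucene types) illustrates just that mutation: dropping a
// trailing 's / 'S and compacting dots out of an acronym buffer.
using System;

static class TermMutationSketch
{
    // Mirrors the APOSTROPHE_TYPE branch: drop a trailing 's or 'S in place
    // and return the new logical length of the term buffer.
    internal static int StripPossessive(char[] buffer, int length)
    {
        if (length >= 2 && buffer[length - 2] == '\'' &&
            (buffer[length - 1] == 's' || buffer[length - 1] == 'S'))
            return length - 2;
        return length;
    }

    // Mirrors the ACRONYM_TYPE branch: remove dots by compacting the buffer.
    internal static int RemoveDots(char[] buffer, int length)
    {
        int upto = 0;
        for (int i = 0; i < length; i++)
            if (buffer[i] != '.')
                buffer[upto++] = buffer[i];
        return upto;
    }

    static void Main()
    {
        char[] possessive = "Lucene's".ToCharArray();
        Console.WriteLine(new string(possessive, 0, StripPossessive(possessive, possessive.Length))); // Lucene

        char[] acronym = "I.B.M.".ToCharArray();
        Console.WriteLine(new string(acronym, 0, RemoveDots(acronym, acronym.Length)));               // IBM
    }
}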
// Primary entry point (for first TermsHash)
internal override void add(Token token)
{
    System.Diagnostics.Debug.Assert(!postingsCompacted);

    // We are first in the chain so we must "intern" the
    // term text into textStart address

    // Get the text of this term.
    char[] tokenText = token.TermBuffer();
    int tokenTextLen = token.TermLength();

    // Compute hashcode & replace any invalid UTF16 sequences
    int downto = tokenTextLen;
    int code = 0;
    while (downto > 0)
    {
        char ch = tokenText[--downto];

        if (ch >= UnicodeUtil.UNI_SUR_LOW_START && ch <= UnicodeUtil.UNI_SUR_LOW_END)
        {
            if (0 == downto)
            {
                // Unpaired
                ch = tokenText[downto] = (char) UnicodeUtil.UNI_REPLACEMENT_CHAR;
            }
            else
            {
                char ch2 = tokenText[downto - 1];
                if (ch2 >= UnicodeUtil.UNI_SUR_HIGH_START && ch2 <= UnicodeUtil.UNI_SUR_HIGH_END)
                {
                    // OK: high followed by low.  This is a valid
                    // surrogate pair.
                    code = ((code * 31) + ch) * 31 + ch2;
                    downto--;
                    continue;
                }
                else
                {
                    // Unpaired
                    ch = tokenText[downto] = (char) UnicodeUtil.UNI_REPLACEMENT_CHAR;
                }
            }
        }
        else if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && ch <= UnicodeUtil.UNI_SUR_HIGH_END)
        {
            // Unpaired
            ch = tokenText[downto] = (char) UnicodeUtil.UNI_REPLACEMENT_CHAR;
        }

        code = (code * 31) + ch;
    }

    int hashPos = code & postingsHashMask;

    // Locate RawPostingList in hash
    p = postingsHash[hashPos];

    if (p != null && !postingEquals(tokenText, tokenTextLen))
    {
        // Conflict: keep searching different locations in
        // the hash table.
        int inc = ((code >> 8) + code) | 1;
        do
        {
            code += inc;
            hashPos = code & postingsHashMask;
            p = postingsHash[hashPos];
        }
        while (p != null && !postingEquals(tokenText, tokenTextLen));
    }

    if (p == null)
    {
        // First time we are seeing this token since we last
        // flushed the hash.
        int textLen1 = 1 + tokenTextLen;
        if (textLen1 + charPool.charUpto > DocumentsWriter.CHAR_BLOCK_SIZE)
        {
            if (textLen1 > DocumentsWriter.CHAR_BLOCK_SIZE)
            {
                // Just skip this term, to remain as robust as
                // possible during indexing.  A TokenFilter
                // can be inserted into the analyzer chain if
                // other behavior is wanted (pruning the term
                // to a prefix, throwing an exception, etc).
                if (docState.maxTermPrefix == null)
                    docState.maxTermPrefix = new System.String(tokenText, 0, 30);

                consumer.skippingLongTerm(token);
                return;
            }
            charPool.nextBuffer();
        }

        // Refill?
        if (0 == perThread.freePostingsCount)
            perThread.morePostings();

        // Pull next free RawPostingList from free list
        p = perThread.freePostings[--perThread.freePostingsCount];
        System.Diagnostics.Debug.Assert(p != null);

        char[] text = charPool.buffer;
        int textUpto = charPool.charUpto;
        p.textStart = textUpto + charPool.charOffset;
        charPool.charUpto += textLen1;
        System.Array.Copy(tokenText, 0, text, textUpto, tokenTextLen);
        text[textUpto + tokenTextLen] = (char) 0xffff;

        System.Diagnostics.Debug.Assert(postingsHash[hashPos] == null);
        postingsHash[hashPos] = p;
        numPostings++;

        if (numPostings == postingsHashHalfSize)
            rehashPostings(2 * postingsHashSize);

        // Init stream slices
        if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE)
            intPool.nextBuffer();

        if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt * ByteBlockPool.FIRST_LEVEL_SIZE)
            bytePool.NextBuffer();

        intUptos = intPool.buffer;
        intUptoStart = intPool.intUpto;
        intPool.intUpto += streamCount;

        p.intStart = intUptoStart + intPool.intOffset;

        for (int i = 0; i < streamCount; i++)
        {
            int upto = bytePool.NewSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
            intUptos[intUptoStart + i] = upto + bytePool.byteOffset;
        }
        p.byteStart = intUptos[intUptoStart];

        consumer.newTerm(token, p);
    }
    else
    {
        intUptos = intPool.buffers[p.intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
        intUptoStart = p.intStart & DocumentsWriter.INT_BLOCK_MASK;
        consumer.addTerm(token, p);
    }

    if (doNextCall)
        nextPerField.add(token, p.textStart);
}
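// The hash lookup in add() is open addressing over a power-of-two table: the
// initial slot is code & postingsHashMask, and on a collision the probe steps
// by ((code >> 8) + code) | 1. Because that increment is odd and the table size
// is a power of two, the probe sequence eventually visits every slot. A minimal
// standalone sketch of the same probe (hypothetical names, string keys instead
// of RawPostingList entries, and no surrogate repair):
using System;

class OpenAddressingSketch
{
    static void Main()
    {
        string[] table = new string[16];           // table size must be a power of two
        int mask = table.Length - 1;

        Insert(table, mask, "apache");
        Insert(table, mask, "lucene");
        Insert(table, mask, "net");

        Console.WriteLine(Array.IndexOf(table, "lucene") >= 0);  // True
    }

    static void Insert(string[] table, int mask, string key)
    {
        // Same multiplicative (31-based) hash as add(), computed back to front.
        int code = 0;
        for (int i = key.Length - 1; i >= 0; i--)
            code = (code * 31) + key[i];

        int hashPos = code & mask;
        if (table[hashPos] != null && table[hashPos] != key)
        {
            // Odd increment => the probe cycles through every slot of the table.
            int inc = ((code >> 8) + code) | 1;
            do
            {
                code += inc;
                hashPos = code & mask;
            }
            while (table[hashPos] != null && table[hashPos] != key);
        }
        table[hashPos] = key;
    }
}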
/// <summary>This is the hotspot of indexing: it's called once
/// for every term of every document.  Its job is to
/// update the postings byte stream (Postings hash)
/// based on the occurrence of a single term.
/// </summary>
private void AddPosition(Token token)
{
    Payload payload = token.GetPayload();

    // Get the text of this term.  Term can either
    // provide a String token or offset into a char[]
    // array
    char[] tokenText = token.TermBuffer();
    int tokenTextLen = token.TermLength();

    int code = 0;

    // Compute hashcode
    int downto = tokenTextLen;
    while (downto > 0)
        code = (code * 31) + tokenText[--downto];

    // System.out.println("  addPosition: buffer=" + new String(tokenText, 0, tokenTextLen) + " pos=" + position + " offsetStart=" + (offset+token.startOffset()) + " offsetEnd=" + (offset + token.endOffset()) + " docID=" + docID + " doPos=" + doVectorPositions + " doOffset=" + doVectorOffsets);

    int hashPos = code & postingsHashMask;

    System.Diagnostics.Debug.Assert(!postingsCompacted);

    // Locate Posting in hash
    Enclosing_Instance.p = postingsHash[hashPos];

    if (Enclosing_Instance.p != null && !Enclosing_Instance.PostingEquals(tokenText, tokenTextLen))
    {
        // Conflict: keep searching different locations in
        // the hash table.
        int inc = ((code >> 8) + code) | 1;
        do
        {
            code += inc;
            hashPos = code & postingsHashMask;
            Enclosing_Instance.p = postingsHash[hashPos];
        }
        while (Enclosing_Instance.p != null && !Enclosing_Instance.PostingEquals(tokenText, tokenTextLen));
    }

    int proxCode;

    // If we hit an exception below, it's possible the
    // posting list or term vectors data will be
    // partially written and thus inconsistent if
    // flushed, so we have to abort all documents
    // since the last flush:
    try
    {
        if (Enclosing_Instance.p != null)
        {
            // term seen since last flush

            if (Enclosing_Instance.docID != Enclosing_Instance.p.lastDocID)
            {
                // term not yet seen in this doc

                // System.out.println("    seen before (new docID=" + docID + ") freqUpto=" + p.freqUpto + " proxUpto=" + p.proxUpto);

                System.Diagnostics.Debug.Assert(Enclosing_Instance.p.docFreq > 0);

                // Now that we know doc freq for previous doc,
                // write it & lastDocCode
                Enclosing_Instance.freqUpto = Enclosing_Instance.p.freqUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_MASK;
                Enclosing_Instance.freq = Enclosing_Instance.postingsPool.buffers[Enclosing_Instance.p.freqUpto >> Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_SHIFT];

                if (1 == Enclosing_Instance.p.docFreq)
                    Enclosing_Instance.WriteFreqVInt(Enclosing_Instance.p.lastDocCode | 1);
                else
                {
                    Enclosing_Instance.WriteFreqVInt(Enclosing_Instance.p.lastDocCode);
                    Enclosing_Instance.WriteFreqVInt(Enclosing_Instance.p.docFreq);
                }
                Enclosing_Instance.p.freqUpto = Enclosing_Instance.freqUpto + (Enclosing_Instance.p.freqUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_NOT_MASK);

                if (doVectors)
                {
                    Enclosing_Instance.vector = AddNewVector();
                    if (doVectorOffsets)
                    {
                        offsetStartCode = offsetStart = offset + token.StartOffset();
                        offsetEnd = offset + token.EndOffset();
                    }
                }

                proxCode = position;

                Enclosing_Instance.p.docFreq = 1;

                // Store code so we can write this after we're
                // done with this new doc
                Enclosing_Instance.p.lastDocCode = (Enclosing_Instance.docID - Enclosing_Instance.p.lastDocID) << 1;
                Enclosing_Instance.p.lastDocID = Enclosing_Instance.docID;
            }
            else
            {
                // term already seen in this doc

                // System.out.println("    seen before (same docID=" + docID + ") proxUpto=" + p.proxUpto);

                Enclosing_Instance.p.docFreq++;

                proxCode = position - Enclosing_Instance.p.lastPosition;

                if (doVectors)
                {
                    Enclosing_Instance.vector = Enclosing_Instance.p.vector;
                    if (Enclosing_Instance.vector == null)
                        Enclosing_Instance.vector = AddNewVector();
                    if (doVectorOffsets)
                    {
                        offsetStart = offset + token.StartOffset();
                        offsetEnd = offset + token.EndOffset();
                        offsetStartCode = offsetStart - Enclosing_Instance.vector.lastOffset;
                    }
                }
            }
        }
        else
        {
            // term not seen before

            // System.out.println("    never seen docID=" + docID);

            // Refill?
            if (0 == Enclosing_Instance.postingsFreeCount)
            {
                Enclosing_Instance.Enclosing_Instance.GetPostings(Enclosing_Instance.postingsFreeList);
                Enclosing_Instance.postingsFreeCount = Enclosing_Instance.postingsFreeList.Length;
            }

            int textLen1 = 1 + tokenTextLen;
            if (textLen1 + Enclosing_Instance.charPool.byteUpto > Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_SIZE)
            {
                if (textLen1 > Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_SIZE)
                {
                    // Just skip this term, to remain as robust as
                    // possible during indexing.  A TokenFilter
                    // can be inserted into the analyzer chain if
                    // other behavior is wanted (pruning the term
                    // to a prefix, throwing an exception, etc).
                    if (Enclosing_Instance.maxTermPrefix == null)
                        Enclosing_Instance.maxTermPrefix = new System.String(tokenText, 0, 30);

                    // Still increment position:
                    position++;
                    return;
                }
                Enclosing_Instance.charPool.NextBuffer();
            }

            char[] text = Enclosing_Instance.charPool.buffer;
            int textUpto = Enclosing_Instance.charPool.byteUpto;

            // Pull next free Posting from free list
            Enclosing_Instance.p = Enclosing_Instance.postingsFreeList[--Enclosing_Instance.postingsFreeCount];

            Enclosing_Instance.p.textStart = textUpto + Enclosing_Instance.charPool.byteOffset;
            Enclosing_Instance.charPool.byteUpto += textLen1;

            Array.Copy(tokenText, 0, text, textUpto, tokenTextLen);

            text[textUpto + tokenTextLen] = (char) 0xffff;

            System.Diagnostics.Debug.Assert(postingsHash[hashPos] == null);

            postingsHash[hashPos] = Enclosing_Instance.p;
            numPostings++;

            if (numPostings == postingsHashHalfSize)
                RehashPostings(2 * postingsHashSize);

            // Init first slice for freq & prox streams
            int firstSize = Lucene.Net.Index.DocumentsWriter.levelSizeArray[0];

            int upto1 = Enclosing_Instance.postingsPool.NewSlice(firstSize);
            Enclosing_Instance.p.freqStart = Enclosing_Instance.p.freqUpto = Enclosing_Instance.postingsPool.byteOffset + upto1;

            int upto2 = Enclosing_Instance.postingsPool.NewSlice(firstSize);
            Enclosing_Instance.p.proxStart = Enclosing_Instance.p.proxUpto = Enclosing_Instance.postingsPool.byteOffset + upto2;

            Enclosing_Instance.p.lastDocCode = Enclosing_Instance.docID << 1;
            Enclosing_Instance.p.lastDocID = Enclosing_Instance.docID;
            Enclosing_Instance.p.docFreq = 1;

            if (doVectors)
            {
                Enclosing_Instance.vector = AddNewVector();
                if (doVectorOffsets)
                {
                    offsetStart = offsetStartCode = offset + token.StartOffset();
                    offsetEnd = offset + token.EndOffset();
                }
            }

            proxCode = position;
        }

        Enclosing_Instance.proxUpto = Enclosing_Instance.p.proxUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_MASK;
        Enclosing_Instance.prox = Enclosing_Instance.postingsPool.buffers[Enclosing_Instance.p.proxUpto >> Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_SHIFT];
        System.Diagnostics.Debug.Assert(Enclosing_Instance.prox != null);

        if (payload != null && payload.length > 0)
        {
            Enclosing_Instance.WriteProxVInt((proxCode << 1) | 1);
            Enclosing_Instance.WriteProxVInt(payload.length);
            Enclosing_Instance.WriteProxBytes(payload.data, payload.offset, payload.length);
            fieldInfo.storePayloads = true;
        }
        else
            Enclosing_Instance.WriteProxVInt(proxCode << 1);

        Enclosing_Instance.p.proxUpto = Enclosing_Instance.proxUpto + (Enclosing_Instance.p.proxUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_NOT_MASK);

        Enclosing_Instance.p.lastPosition = position++;

        if (doVectorPositions)
        {
            Enclosing_Instance.posUpto = Enclosing_Instance.vector.posUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_MASK;
            Enclosing_Instance.pos = Enclosing_Instance.vectorsPool.buffers[Enclosing_Instance.vector.posUpto >> Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_SHIFT];
            Enclosing_Instance.WritePosVInt(proxCode);
            Enclosing_Instance.vector.posUpto = Enclosing_Instance.posUpto + (Enclosing_Instance.vector.posUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_NOT_MASK);
        }

        if (doVectorOffsets)
        {
            Enclosing_Instance.offsetUpto = Enclosing_Instance.vector.offsetUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_MASK;
            Enclosing_Instance.offsets = Enclosing_Instance.vectorsPool.buffers[Enclosing_Instance.vector.offsetUpto >> Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_SHIFT];
            Enclosing_Instance.WriteOffsetVInt(offsetStartCode);
            Enclosing_Instance.WriteOffsetVInt(offsetEnd - offsetStart);
            Enclosing_Instance.vector.lastOffset = offsetEnd;
            Enclosing_Instance.vector.offsetUpto = Enclosing_Instance.offsetUpto + (Enclosing_Instance.vector.offsetUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_NOT_MASK);
        }
    }
    catch (System.Exception t)
    {
        throw new AbortException(t, Enclosing_Instance.Enclosing_Instance);
    }
}
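// AddPosition defers a document's entry in the freq stream: lastDocCode keeps the
// doc-ID delta shifted left by one, and the entry is only written once the term is
// seen again in a later document ("Now that we know doc freq for previous doc...").
// A frequency of 1 is folded into the low bit; otherwise the frequency follows as
// a second value. The real code writes these as VInts; the sketch below
// (hypothetical names) uses plain ints to show just the convention.
using System;
using System.Collections.Generic;

class FreqStreamSketch
{
    static void Main()
    {
        var codes = new List<int>();

        Write(codes, lastDocID: 0, docID: 5, docFreq: 1);  // doc 5, freq 1 -> one value, low bit set
        Write(codes, lastDocID: 5, docID: 9, docFreq: 3);  // doc 9, freq 3 -> delta code, then freq

        Console.WriteLine(string.Join(",", codes));        // 11,8,3
    }

    static void Write(List<int> stream, int lastDocID, int docID, int docFreq)
    {
        int lastDocCode = (docID - lastDocID) << 1;        // doc delta in the upper bits
        if (docFreq == 1)
            stream.Add(lastDocCode | 1);                   // freq==1 folded into the low bit
        else
        {
            stream.Add(lastDocCode);
            stream.Add(docFreq);
        }
    }
}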