internal void writeProx(Token t, FreqProxTermsWriter.PostingList p, int proxCode)
{
    Payload payload = t.GetPayload();

    if (payload != null && payload.length > 0)
    {
        // Low bit of the prox VInt flags that payload bytes follow
        termsHashPerField.writeVInt(1, (proxCode << 1) | 1);
        termsHashPerField.writeVInt(1, payload.length);
        termsHashPerField.writeBytes(1, payload.data, payload.offset, payload.length);
        hasPayloads = true;
    }
    else
    {
        // No payload: low bit stays clear
        termsHashPerField.writeVInt(1, proxCode << 1);
    }

    // Remember this position so the next occurrence can be delta-coded
    p.lastPosition = fieldState.position;
}
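// The sketch below is illustrative only and is not part of Lucene.Net: it shows
// how a reader would undo the packing performed by writeProx above. Bit 0 of the
// VInt written to the prox stream flags whether payload bytes follow; the
// remaining bits hold proxCode, the position delta since the term's previous
// occurrence in the document (or the absolute position for its first occurrence).
// For example, a term occurring 3 positions after its previous occurrence and
// carrying a payload is written as (3 << 1) | 1 = 7. All names here are hypothetical.
internal static class ProxCodeExample
{
    public static void Decode(int code, out int positionDelta, out bool hasPayload)
    {
        hasPayload = (code & 1) != 0; // payload flag lives in the low bit
        positionDelta = code >> 1;    // position delta occupies the upper bits
    }
}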
/// <summary>This is the hotspot of indexing: it's called once
/// for every term of every document. Its job is to
/// update the postings byte stream (Postings hash)
/// based on the occurrence of a single term.
/// </summary>
private void AddPosition(Token token)
{
    Payload payload = token.GetPayload();

    // Get the text of this term.  Term can either
    // provide a String token or offset into a char[]
    // array
    char[] tokenText = token.TermBuffer();
    int tokenTextLen = token.TermLength();

    int code = 0;

    // Compute hashcode
    int downto = tokenTextLen;
    while (downto > 0)
        code = (code * 31) + tokenText[--downto];

    // System.out.println("  addPosition: buffer=" + new String(tokenText, 0, tokenTextLen) + " pos=" + position + " offsetStart=" + (offset + token.startOffset()) + " offsetEnd=" + (offset + token.endOffset()) + " docID=" + docID + " doPos=" + doVectorPositions + " doOffset=" + doVectorOffsets);

    int hashPos = code & postingsHashMask;

    System.Diagnostics.Debug.Assert(!postingsCompacted);

    // Locate Posting in hash
    Enclosing_Instance.p = postingsHash[hashPos];

    if (Enclosing_Instance.p != null && !Enclosing_Instance.PostingEquals(tokenText, tokenTextLen))
    {
        // Conflict: keep searching different locations in
        // the hash table.
        int inc = ((code >> 8) + code) | 1;
        do
        {
            code += inc;
            hashPos = code & postingsHashMask;
            Enclosing_Instance.p = postingsHash[hashPos];
        }
        while (Enclosing_Instance.p != null && !Enclosing_Instance.PostingEquals(tokenText, tokenTextLen));
    }

    int proxCode;

    // If we hit an exception below, it's possible the
    // posting list or term vectors data will be
    // partially written and thus inconsistent if
    // flushed, so we have to abort all documents
    // since the last flush:
    try
    {
        if (Enclosing_Instance.p != null)
        {
            // term seen since last flush

            if (Enclosing_Instance.docID != Enclosing_Instance.p.lastDocID)
            {
                // term not yet seen in this doc

                // System.out.println("    seen before (new docID=" + docID + ") freqUpto=" + p.freqUpto + " proxUpto=" + p.proxUpto);

                System.Diagnostics.Debug.Assert(Enclosing_Instance.p.docFreq > 0);

                // Now that we know doc freq for previous doc,
                // write it & lastDocCode
                Enclosing_Instance.freqUpto = Enclosing_Instance.p.freqUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_MASK;
                Enclosing_Instance.freq = Enclosing_Instance.postingsPool.buffers[Enclosing_Instance.p.freqUpto >> Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_SHIFT];
                if (1 == Enclosing_Instance.p.docFreq)
                    Enclosing_Instance.WriteFreqVInt(Enclosing_Instance.p.lastDocCode | 1);
                else
                {
                    Enclosing_Instance.WriteFreqVInt(Enclosing_Instance.p.lastDocCode);
                    Enclosing_Instance.WriteFreqVInt(Enclosing_Instance.p.docFreq);
                }
                Enclosing_Instance.p.freqUpto = Enclosing_Instance.freqUpto + (Enclosing_Instance.p.freqUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_NOT_MASK);

                if (doVectors)
                {
                    Enclosing_Instance.vector = AddNewVector();
                    if (doVectorOffsets)
                    {
                        offsetStartCode = offsetStart = offset + token.StartOffset();
                        offsetEnd = offset + token.EndOffset();
                    }
                }

                proxCode = position;

                Enclosing_Instance.p.docFreq = 1;

                // Store code so we can write this after we're
                // done with this new doc
                Enclosing_Instance.p.lastDocCode = (Enclosing_Instance.docID - Enclosing_Instance.p.lastDocID) << 1;
                Enclosing_Instance.p.lastDocID = Enclosing_Instance.docID;
            }
            else
            {
                // term already seen in this doc

                // System.out.println("    seen before (same docID=" + docID + ") proxUpto=" + p.proxUpto);

                Enclosing_Instance.p.docFreq++;

                proxCode = position - Enclosing_Instance.p.lastPosition;

                if (doVectors)
                {
                    Enclosing_Instance.vector = Enclosing_Instance.p.vector;
                    if (Enclosing_Instance.vector == null)
                        Enclosing_Instance.vector = AddNewVector();
                    if (doVectorOffsets)
                    {
                        offsetStart = offset + token.StartOffset();
                        offsetEnd = offset + token.EndOffset();
                        offsetStartCode = offsetStart - Enclosing_Instance.vector.lastOffset;
                    }
                }
            }
        }
        else
        {
            // term not seen before

            // System.out.println("    never seen docID=" + docID);

            // Refill?
            if (0 == Enclosing_Instance.postingsFreeCount)
            {
                Enclosing_Instance.Enclosing_Instance.GetPostings(Enclosing_Instance.postingsFreeList);
                Enclosing_Instance.postingsFreeCount = Enclosing_Instance.postingsFreeList.Length;
            }

            int textLen1 = 1 + tokenTextLen;
            if (textLen1 + Enclosing_Instance.charPool.byteUpto > Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_SIZE)
            {
                if (textLen1 > Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_SIZE)
                {
                    // Just skip this term, to remain as robust as
                    // possible during indexing.  A TokenFilter
                    // can be inserted into the analyzer chain if
                    // other behavior is wanted (pruning the term
                    // to a prefix, throwing an exception, etc).
                    if (Enclosing_Instance.maxTermPrefix == null)
                        Enclosing_Instance.maxTermPrefix = new System.String(tokenText, 0, 30);

                    // Still increment position:
                    position++;
                    return;
                }
                Enclosing_Instance.charPool.NextBuffer();
            }
            char[] text = Enclosing_Instance.charPool.buffer;
            int textUpto = Enclosing_Instance.charPool.byteUpto;

            // Pull next free Posting from free list
            Enclosing_Instance.p = Enclosing_Instance.postingsFreeList[--Enclosing_Instance.postingsFreeCount];

            Enclosing_Instance.p.textStart = textUpto + Enclosing_Instance.charPool.byteOffset;
            Enclosing_Instance.charPool.byteUpto += textLen1;

            Array.Copy(tokenText, 0, text, textUpto, tokenTextLen);

            text[textUpto + tokenTextLen] = (char) (0xffff);

            System.Diagnostics.Debug.Assert(postingsHash[hashPos] == null);

            postingsHash[hashPos] = Enclosing_Instance.p;
            numPostings++;

            if (numPostings == postingsHashHalfSize)
                RehashPostings(2 * postingsHashSize);

            // Init first slice for freq & prox streams
            int firstSize = Lucene.Net.Index.DocumentsWriter.levelSizeArray[0];

            int upto1 = Enclosing_Instance.postingsPool.NewSlice(firstSize);
            Enclosing_Instance.p.freqStart = Enclosing_Instance.p.freqUpto = Enclosing_Instance.postingsPool.byteOffset + upto1;

            int upto2 = Enclosing_Instance.postingsPool.NewSlice(firstSize);
            Enclosing_Instance.p.proxStart = Enclosing_Instance.p.proxUpto = Enclosing_Instance.postingsPool.byteOffset + upto2;

            Enclosing_Instance.p.lastDocCode = Enclosing_Instance.docID << 1;
            Enclosing_Instance.p.lastDocID = Enclosing_Instance.docID;
            Enclosing_Instance.p.docFreq = 1;

            if (doVectors)
            {
                Enclosing_Instance.vector = AddNewVector();
                if (doVectorOffsets)
                {
                    offsetStart = offsetStartCode = offset + token.StartOffset();
                    offsetEnd = offset + token.EndOffset();
                }
            }

            proxCode = position;
        }

        Enclosing_Instance.proxUpto = Enclosing_Instance.p.proxUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_MASK;
        Enclosing_Instance.prox = Enclosing_Instance.postingsPool.buffers[Enclosing_Instance.p.proxUpto >> Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_SHIFT];
        System.Diagnostics.Debug.Assert(Enclosing_Instance.prox != null);

        if (payload != null && payload.length > 0)
        {
            Enclosing_Instance.WriteProxVInt((proxCode << 1) | 1);
            Enclosing_Instance.WriteProxVInt(payload.length);
            Enclosing_Instance.WriteProxBytes(payload.data, payload.offset, payload.length);
            fieldInfo.storePayloads = true;
        }
        else
            Enclosing_Instance.WriteProxVInt(proxCode << 1);

        Enclosing_Instance.p.proxUpto = Enclosing_Instance.proxUpto + (Enclosing_Instance.p.proxUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_NOT_MASK);

        Enclosing_Instance.p.lastPosition = position++;

        if (doVectorPositions)
        {
            Enclosing_Instance.posUpto = Enclosing_Instance.vector.posUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_MASK;
            Enclosing_Instance.pos = Enclosing_Instance.vectorsPool.buffers[Enclosing_Instance.vector.posUpto >> Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_SHIFT];
            Enclosing_Instance.WritePosVInt(proxCode);
            Enclosing_Instance.vector.posUpto = Enclosing_Instance.posUpto + (Enclosing_Instance.vector.posUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_NOT_MASK);
        }

        if (doVectorOffsets)
        {
            Enclosing_Instance.offsetUpto = Enclosing_Instance.vector.offsetUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_MASK;
            Enclosing_Instance.offsets = Enclosing_Instance.vectorsPool.buffers[Enclosing_Instance.vector.offsetUpto >> Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_SHIFT];
            Enclosing_Instance.WriteOffsetVInt(offsetStartCode);
            Enclosing_Instance.WriteOffsetVInt(offsetEnd - offsetStart);
            Enclosing_Instance.vector.lastOffset = offsetEnd;
            Enclosing_Instance.vector.offsetUpto = Enclosing_Instance.offsetUpto + (Enclosing_Instance.vector.offsetUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_NOT_MASK);
        }
    }
    catch (System.Exception t)
    {
        throw new AbortException(t, Enclosing_Instance.Enclosing_Instance);
    }
}
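// Standalone sketch, for illustration only (all names hypothetical, not part of
// Lucene.Net): the term-lookup scheme used at the top of AddPosition. The term
// text is hashed with the usual 31-based rolling hash, and collisions are
// resolved by open addressing; the probe step ((code >> 8) + code) | 1 is odd,
// hence coprime with the power-of-two table size, so the walk eventually visits
// every slot.
internal static class PostingsHashExample
{
    public static int FindSlot(object[] table, char[] tokenText, int tokenTextLen,
                               System.Predicate<object> equalsTerm)
    {
        int mask = table.Length - 1; // table length is assumed to be a power of two

        // 31-based hash over the term's characters, as in AddPosition
        int code = 0;
        for (int i = tokenTextLen - 1; i >= 0; i--)
            code = (code * 31) + tokenText[i];

        int hashPos = code & mask;
        if (table[hashPos] != null && !equalsTerm(table[hashPos]))
        {
            // Collision: keep probing with an odd increment until we find the
            // matching entry or an empty slot.
            int inc = ((code >> 8) + code) | 1;
            do
            {
                code += inc;
                hashPos = code & mask;
            }
            while (table[hashPos] != null && !equalsTerm(table[hashPos]));
        }
        return hashPos;
    }
}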