internal override void NewTerm(int termID)
{
    // First time we're seeing this term since the last flush
    Debug.Assert(docState.TestPoint("FreqProxTermsWriterPerField.newTerm start"));

    FreqProxPostingsArray postings = (FreqProxPostingsArray)termsHashPerField.postingsArray;
    postings.lastDocIDs[termID] = docState.docID;
    if (!hasFreq)
    {
        postings.lastDocCodes[termID] = docState.docID;
    }
    else
    {
        postings.lastDocCodes[termID] = docState.docID << 1;
        postings.termFreqs[termID] = 1;
        if (hasProx)
        {
            WriteProx(termID, fieldState.Position);
            if (hasOffsets)
            {
                WriteOffsets(termID, fieldState.Offset);
            }
        }
        else
        {
            Debug.Assert(!hasOffsets);
        }
    }
    fieldState.MaxTermFrequency = Math.Max(1, fieldState.MaxTermFrequency);
    fieldState.UniqueTermCount++;
}
internal override void CopyTo(ParallelPostingsArray toArray, int numToCopy)
{
    Debug.Assert(toArray is FreqProxPostingsArray);
    FreqProxPostingsArray to = (FreqProxPostingsArray)toArray;

    base.CopyTo(toArray, numToCopy);

    Array.Copy(LastDocIDs, 0, to.LastDocIDs, 0, numToCopy);
    Array.Copy(LastDocCodes, 0, to.LastDocCodes, 0, numToCopy);
    if (LastPositions != null)
    {
        Debug.Assert(to.LastPositions != null);
        Array.Copy(LastPositions, 0, to.LastPositions, 0, numToCopy);
    }
    if (LastOffsets != null)
    {
        Debug.Assert(to.LastOffsets != null);
        Array.Copy(LastOffsets, 0, to.LastOffsets, 0, numToCopy);
    }
    if (TermFreqs != null)
    {
        Debug.Assert(to.TermFreqs != null);
        Array.Copy(TermFreqs, 0, to.TermFreqs, 0, numToCopy);
    }
}
internal override void NewTerm(int termID)
{
    // First time we're seeing this term since the last flush
    Debug.Assert(DocState.TestPoint("FreqProxTermsWriterPerField.newTerm start"));

    FreqProxPostingsArray postings = (FreqProxPostingsArray)TermsHashPerField.PostingsArray;
    postings.LastDocIDs[termID] = DocState.DocID;
    if (!HasFreq)
    {
        postings.LastDocCodes[termID] = DocState.DocID;
    }
    else
    {
        postings.LastDocCodes[termID] = DocState.DocID << 1;
        postings.TermFreqs[termID] = 1;
        if (HasProx)
        {
            WriteProx(termID, FieldState.Position_Renamed);
            if (HasOffsets)
            {
                WriteOffsets(termID, FieldState.Offset_Renamed);
            }
        }
        else
        {
            Debug.Assert(!HasOffsets);
        }
    }
    FieldState.MaxTermFrequency_Renamed = Math.Max(1, FieldState.MaxTermFrequency_Renamed);
    FieldState.UniqueTermCount_Renamed++;
}
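// Illustration (a minimal sketch, not part of the original source): NewTerm stores
// docID << 1 in lastDocCodes, reserving the low bit. When the document is later
// closed out (in AddTerm or Flush), the code is written with the low bit set if the
// term frequency was exactly 1; otherwise the frequency follows as a separate value.
// The class below is hypothetical and ignores the actual vInt byte packing.
internal static class DocCodeSketch
{
    // Append one (docDelta, termFreq) pair the way the in-RAM freq stream does.
    public static int Encode(int[] stream, int pos, int docDelta, int termFreq)
    {
        if (termFreq == 1)
        {
            stream[pos++] = (docDelta << 1) | 1; // single-occurrence doc: one value
        }
        else
        {
            stream[pos++] = docDelta << 1;       // low bit clear...
            stream[pos++] = termFreq;            // ...so the frequency follows
        }
        return pos;
    }

    // Decode one pair starting at pos; returns the index of the next pair.
    public static int Decode(int[] stream, int pos, out int docDelta, out int termFreq)
    {
        int code = stream[pos++];
        docDelta = (int)((uint)code >> 1);
        termFreq = (code & 1) != 0 ? 1 : stream[pos++];
        return pos;
    }
}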
internal void WriteProx(int termID, int proxCode)
{
    //System.out.println("writeProx termID=" + termID + " proxCode=" + proxCode);
    Debug.Assert(HasProx);
    BytesRef payload;
    if (PayloadAttribute == null)
    {
        payload = null;
    }
    else
    {
        payload = PayloadAttribute.Payload;
    }

    if (payload != null && payload.Length > 0)
    {
        TermsHashPerField.WriteVInt(1, (proxCode << 1) | 1);
        TermsHashPerField.WriteVInt(1, payload.Length);
        TermsHashPerField.WriteBytes(1, payload.Bytes, payload.Offset, payload.Length);
        HasPayloads = true;
    }
    else
    {
        TermsHashPerField.WriteVInt(1, proxCode << 1);
    }

    FreqProxPostingsArray postings = (FreqProxPostingsArray)TermsHashPerField.PostingsArray;
    postings.LastPositions[termID] = FieldState.Position_Renamed;
}
internal void WriteProx(int termID, int proxCode)
{
    //System.out.println("writeProx termID=" + termID + " proxCode=" + proxCode);
    if (Debugging.AssertsEnabled) { Debugging.Assert(hasProx); }
    BytesRef payload;
    if (payloadAttribute == null)
    {
        payload = null;
    }
    else
    {
        payload = payloadAttribute.Payload;
    }

    if (payload != null && payload.Length > 0)
    {
        termsHashPerField.WriteVInt32(1, (proxCode << 1) | 1);
        termsHashPerField.WriteVInt32(1, payload.Length);
        termsHashPerField.WriteBytes(1, payload.Bytes, payload.Offset, payload.Length);
        hasPayloads = true;
    }
    else
    {
        termsHashPerField.WriteVInt32(1, proxCode << 1);
    }

    FreqProxPostingsArray postings = (FreqProxPostingsArray)termsHashPerField.postingsArray;
    postings.lastPositions[termID] = fieldState.Position;
}
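// Illustration (a minimal sketch, not part of the original source): WriteProx emits,
// on stream 1, a position delta whose low bit flags whether a payload follows (its
// length, then its raw bytes). Packing the flag into the low bit saves one value per
// position in the common no-payload case. The helper below is hypothetical and only
// lists the integer codes; vInt packing and the payload bytes themselves are omitted.
internal static class ProxCodeSketch
{
    // Returns the integer values written for one position occurrence.
    public static int[] Encode(int positionDelta, byte[] payload)
    {
        if (payload != null && payload.Length > 0)
        {
            // flag bit set, then the payload length; payload bytes follow on the stream
            return new int[] { (positionDelta << 1) | 1, payload.Length };
        }
        return new int[] { positionDelta << 1 };
    }
}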
internal void WriteOffsets(int termID, int offsetAccum)
{
    Debug.Assert(HasOffsets);
    int startOffset = offsetAccum + OffsetAttribute.StartOffset();
    int endOffset = offsetAccum + OffsetAttribute.EndOffset();
    FreqProxPostingsArray postings = (FreqProxPostingsArray)TermsHashPerField.PostingsArray;
    Debug.Assert(startOffset - postings.LastOffsets[termID] >= 0);
    TermsHashPerField.WriteVInt(1, startOffset - postings.LastOffsets[termID]);
    TermsHashPerField.WriteVInt(1, endOffset - startOffset);
    postings.LastOffsets[termID] = startOffset;
}
internal void WriteOffsets(int termID, int offsetAccum)
{
    if (Debugging.AssertsEnabled) { Debugging.Assert(hasOffsets); }
    int startOffset = offsetAccum + offsetAttribute.StartOffset;
    int endOffset = offsetAccum + offsetAttribute.EndOffset;
    FreqProxPostingsArray postings = (FreqProxPostingsArray)termsHashPerField.postingsArray;
    if (Debugging.AssertsEnabled) { Debugging.Assert(startOffset - postings.lastOffsets[termID] >= 0); }
    termsHashPerField.WriteVInt32(1, startOffset - postings.lastOffsets[termID]);
    termsHashPerField.WriteVInt32(1, endOffset - startOffset);
    postings.lastOffsets[termID] = startOffset;
}
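// Illustration (a minimal sketch, not part of the original source): WriteOffsets
// stores each occurrence as two deltas, (startOffset - lastStartOffset) and
// (endOffset - startOffset), so absolute offsets are rebuilt by accumulation when
// the stream is read back, as Flush does below. Hypothetical helper:
internal static class OffsetDeltaSketch
{
    // deltas holds pairs: startDelta, length, startDelta, length, ...
    // Returns absolute (start, end) pairs in the same order.
    public static int[] DecodeAbsolute(int[] deltas)
    {
        int[] result = new int[deltas.Length];
        int lastStart = 0;
        for (int i = 0; i < deltas.Length; i += 2)
        {
            int start = lastStart + deltas[i];
            int end = start + deltas[i + 1];
            result[i] = start;
            result[i + 1] = end;
            lastStart = start;
        }
        return result;
    }
}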
internal override void NewTerm(int termID)
{
    // First time we're seeing this term since the last flush
    // LUCENENET: .NET doesn't support asserts in release mode
    if (Lucene.Net.Diagnostics.Debugging.AssertsEnabled)
    {
        docState.TestPoint("FreqProxTermsWriterPerField.newTerm start");
    }
    FreqProxPostingsArray postings = (FreqProxPostingsArray)termsHashPerField.postingsArray;
    postings.lastDocIDs[termID] = docState.docID;
    if (!hasFreq)
    {
        postings.lastDocCodes[termID] = docState.docID;
    }
    else
    {
        postings.lastDocCodes[termID] = docState.docID << 1;
        postings.termFreqs[termID] = 1;
        if (hasProx)
        {
            WriteProx(termID, fieldState.Position);
            if (hasOffsets)
            {
                WriteOffsets(termID, fieldState.Offset);
            }
        }
        else
        {
            if (Lucene.Net.Diagnostics.Debugging.AssertsEnabled)
            {
                Lucene.Net.Diagnostics.Debugging.Assert(!hasOffsets);
            }
        }
    }
    fieldState.MaxTermFrequency = Math.Max(1, fieldState.MaxTermFrequency);
    fieldState.UniqueTermCount++;
}
internal override void CopyTo(ParallelPostingsArray toArray, int numToCopy)
{
    if (Debugging.AssertsEnabled) { Debugging.Assert(toArray is FreqProxPostingsArray); }
    FreqProxPostingsArray to = (FreqProxPostingsArray)toArray;

    base.CopyTo(toArray, numToCopy);

    Array.Copy(lastDocIDs, 0, to.lastDocIDs, 0, numToCopy);
    Array.Copy(lastDocCodes, 0, to.lastDocCodes, 0, numToCopy);
    if (lastPositions != null)
    {
        if (Debugging.AssertsEnabled) { Debugging.Assert(to.lastPositions != null); }
        Array.Copy(lastPositions, 0, to.lastPositions, 0, numToCopy);
    }
    if (lastOffsets != null)
    {
        if (Debugging.AssertsEnabled) { Debugging.Assert(to.lastOffsets != null); }
        Array.Copy(lastOffsets, 0, to.lastOffsets, 0, numToCopy);
    }
    if (termFreqs != null)
    {
        if (Debugging.AssertsEnabled) { Debugging.Assert(to.termFreqs != null); }
        Array.Copy(termFreqs, 0, to.termFreqs, 0, numToCopy);
    }
}
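// Illustration (a minimal sketch, not part of the original source): CopyTo is the
// kind of hook a parallel-array structure calls when it grows: a larger instance is
// allocated and the first numToCopy slots of every column are copied across. Whether
// the surrounding codebase grows exactly this way is an assumption here; the class
// and method names below are hypothetical.
internal class ParallelColumnsSketch
{
    public int[] LastDocIDs;
    public int[] TermFreqs;

    public ParallelColumnsSketch(int size)
    {
        LastDocIDs = new int[size];
        TermFreqs = new int[size];
    }

    public ParallelColumnsSketch Grow(int usedSlots)
    {
        // allocate a bigger instance, then copy column by column,
        // mirroring what the overridden CopyTo above does
        ParallelColumnsSketch bigger = new ParallelColumnsSketch(LastDocIDs.Length * 2);
        System.Array.Copy(LastDocIDs, 0, bigger.LastDocIDs, 0, usedSlots);
        System.Array.Copy(TermFreqs, 0, bigger.TermFreqs, 0, usedSlots);
        return bigger;
    }
}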
/* Walk through all unique text tokens (Posting
 * instances) found in this field and serialize them
 * into a single RAM segment. */
internal void Flush(string fieldName, FieldsConsumer consumer, SegmentWriteState state)
{
    if (!fieldInfo.Indexed)
    {
        return; // nothing to flush, don't bother the codec with the unindexed field
    }

    TermsConsumer termsConsumer = consumer.AddField(fieldInfo);
    IComparer<BytesRef> termComp = termsConsumer.Comparator;

    // CONFUSING: this.indexOptions holds the index options
    // that were current when we first saw this field. But
    // it's possible this has changed, eg when other
    // documents are indexed that cause a "downgrade" of the
    // IndexOptions. So we must decode the in-RAM buffer
    // according to this.indexOptions, but then write the
    // new segment to the directory according to
    // currentFieldIndexOptions:
    FieldInfo.IndexOptions? currentFieldIndexOptions = fieldInfo.FieldIndexOptions;
    Debug.Assert(currentFieldIndexOptions != null);

    bool writeTermFreq = currentFieldIndexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS;
    bool writePositions = currentFieldIndexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
    bool writeOffsets = currentFieldIndexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;

    bool readTermFreq = this.HasFreq;
    bool readPositions = this.HasProx;
    bool readOffsets = this.HasOffsets;

    //System.out.println("flush readTF=" + readTermFreq + " readPos=" + readPositions + " readOffs=" + readOffsets);

    // Make sure FieldInfo.update is working correctly!:
    Debug.Assert(!writeTermFreq || readTermFreq);
    Debug.Assert(!writePositions || readPositions);
    Debug.Assert(!writeOffsets || readOffsets);
    Debug.Assert(!writeOffsets || writePositions);

    IDictionary<Term, int?> segDeletes;
    if (state.SegUpdates != null && state.SegUpdates.Terms.Count > 0)
    {
        segDeletes = state.SegUpdates.Terms;
    }
    else
    {
        segDeletes = null;
    }

    int[] termIDs = TermsHashPerField.SortPostings(termComp);
    int numTerms = TermsHashPerField.BytesHash.Size();
    BytesRef text = new BytesRef();
    FreqProxPostingsArray postings = (FreqProxPostingsArray)TermsHashPerField.PostingsArray;
    ByteSliceReader freq = new ByteSliceReader();
    ByteSliceReader prox = new ByteSliceReader();

    FixedBitSet visitedDocs = new FixedBitSet(state.SegmentInfo.DocCount);
    long sumTotalTermFreq = 0;
    long sumDocFreq = 0;

    Term protoTerm = new Term(fieldName);
    for (int i = 0; i < numTerms; i++)
    {
        int termID = termIDs[i];

        // Get BytesRef
        int textStart = postings.TextStarts[termID];
        TermsHashPerField.BytePool.SetBytesRef(text, textStart);

        TermsHashPerField.InitReader(freq, termID, 0);
        if (readPositions || readOffsets)
        {
            TermsHashPerField.InitReader(prox, termID, 1);
        }

        // TODO: really TermsHashPerField should take over most
        // of this loop, including merge sort of terms from
        // multiple threads and interacting with the
        // TermsConsumer, only calling out to us (passing us the
        // DocsConsumer) to handle delivery of docs/positions

        PostingsConsumer postingsConsumer = termsConsumer.StartTerm(text);

        int? delDocLimit;
        if (segDeletes != null)
        {
            protoTerm.Bytes_Renamed = text;
            int? docIDUpto;
            segDeletes.TryGetValue(protoTerm, out docIDUpto);
            if (docIDUpto != null)
            {
                delDocLimit = docIDUpto;
            }
            else
            {
                delDocLimit = 0;
            }
        }
        else
        {
            delDocLimit = 0;
        }

        // Now termStates has numToMerge FieldMergeStates
        // which all share the same term. Now we must
        // interleave the docID streams.
        int docFreq = 0;
        long totalTermFreq = 0;
        int docID = 0;

        while (true)
        {
            //System.out.println("  cycle");
            int termFreq;
            if (freq.Eof())
            {
                if (postings.LastDocCodes[termID] != -1)
                {
                    // Return last doc
                    docID = postings.LastDocIDs[termID];
                    if (readTermFreq)
                    {
                        termFreq = postings.TermFreqs[termID];
                    }
                    else
                    {
                        termFreq = -1;
                    }
                    postings.LastDocCodes[termID] = -1;
                }
                else
                {
                    // EOF
                    break;
                }
            }
            else
            {
                int code = freq.ReadVInt();
                if (!readTermFreq)
                {
                    docID += code;
                    termFreq = -1;
                }
                else
                {
                    docID += (int)((uint)code >> 1);
                    if ((code & 1) != 0)
                    {
                        termFreq = 1;
                    }
                    else
                    {
                        termFreq = freq.ReadVInt();
                    }
                }

                Debug.Assert(docID != postings.LastDocIDs[termID]);
            }

            docFreq++;
            Debug.Assert(docID < state.SegmentInfo.DocCount, "doc=" + docID + " maxDoc=" + state.SegmentInfo.DocCount);

            // NOTE: we could check here if the docID was
            // deleted, and skip it. However, this is somewhat
            // dangerous because it can yield non-deterministic
            // behavior since we may see the docID before we see
            // the term that caused it to be deleted. this
            // would mean some (but not all) of its postings may
            // make it into the index, which'd alter the docFreq
            // for those terms. We could fix this by doing two
            // passes, ie first sweep marks all del docs, and
            // 2nd sweep does the real flush, but I suspect
            // that'd add too much time to flush.
            visitedDocs.Set(docID);
            postingsConsumer.StartDoc(docID, writeTermFreq ? termFreq : -1);
            if (docID < delDocLimit)
            {
                // Mark it deleted. TODO: we could also skip
                // writing its postings; this would be
                // deterministic (just for this Term's docs).

                // TODO: can we do this reach-around in a cleaner way????
                if (state.LiveDocs == null)
                {
                    state.LiveDocs = DocState.DocWriter.Codec.LiveDocsFormat().NewLiveDocs(state.SegmentInfo.DocCount);
                }
                if (state.LiveDocs.Get(docID))
                {
                    state.DelCountOnFlush++;
                    state.LiveDocs.Clear(docID);
                }
            }

            totalTermFreq += termFreq;

            // Carefully copy over the prox + payload info,
            // changing the format to match Lucene's segment
            // format.

            if (readPositions || readOffsets)
            {
                // we did record positions (& maybe payload) and/or offsets
                int position = 0;
                int offset = 0;
                for (int j = 0; j < termFreq; j++)
                {
                    BytesRef thisPayload;

                    if (readPositions)
                    {
                        int code = prox.ReadVInt();
                        position += (int)((uint)code >> 1);

                        if ((code & 1) != 0)
                        {
                            // this position has a payload
                            int payloadLength = prox.ReadVInt();

                            if (Payload == null)
                            {
                                Payload = new BytesRef();
                                Payload.Bytes = new sbyte[payloadLength];
                            }
                            else if (Payload.Bytes.Length < payloadLength)
                            {
                                Payload.Grow(payloadLength);
                            }

                            prox.ReadBytes(Payload.Bytes, 0, payloadLength);
                            Payload.Length = payloadLength;
                            thisPayload = Payload;
                        }
                        else
                        {
                            thisPayload = null;
                        }

                        if (readOffsets)
                        {
                            int startOffset = offset + prox.ReadVInt();
                            int endOffset = startOffset + prox.ReadVInt();
                            if (writePositions)
                            {
                                if (writeOffsets)
                                {
                                    Debug.Assert(startOffset >= 0 && endOffset >= startOffset, "startOffset=" + startOffset + ",endOffset=" + endOffset + ",offset=" + offset);
                                    postingsConsumer.AddPosition(position, thisPayload, startOffset, endOffset);
                                }
                                else
                                {
                                    postingsConsumer.AddPosition(position, thisPayload, -1, -1);
                                }
                            }
                            offset = startOffset;
                        }
                        else if (writePositions)
                        {
                            postingsConsumer.AddPosition(position, thisPayload, -1, -1);
                        }
                    }
                }
            }

            postingsConsumer.FinishDoc();
        }
        termsConsumer.FinishTerm(text, new TermStats(docFreq, writeTermFreq ? totalTermFreq : -1));
        sumTotalTermFreq += totalTermFreq;
        sumDocFreq += docFreq;
    }

    termsConsumer.Finish(writeTermFreq ? sumTotalTermFreq : -1, sumDocFreq, visitedDocs.Cardinality());
}
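// Illustration (a minimal sketch, not part of the original source): a worked example
// of the per-term state Flush consumes. Suppose a term appears in doc 3 with freq 2
// and in doc 7 with freq 1, and the field indexes freqs. Only closed-out documents
// are in the byte stream; the most recent document is still pending in the postings
// arrays. The values below are hypothetical and ignore vInt byte packing.
internal static class FlushStateSketch
{
    // Reproduces the (docID, freq) pairs Flush recovers from the in-RAM
    // stream plus the still-pending last document.
    public static int[] Example()
    {
        // Stream 0 (docs+freqs) after doc 7 was started but not yet closed out:
        // doc 3 (freq 2) was written as code = 3 << 1 = 6 (low bit clear),
        // followed by the frequency 2.
        int[] freqStream = new int[] { 6, 2 };

        // Pending state for the last document seen (doc 7, freq 1):
        int lastDocID = 7;
        int pendingTermFreq = 1;

        // Drain the stream first, then emit the pending document, just as the
        // while(true) loop above does once freq.Eof() is hit.
        int docID = 0;
        int code = freqStream[0];
        docID += (int)((uint)code >> 1);                    // -> 3
        int termFreq = (code & 1) != 0 ? 1 : freqStream[1]; // -> 2

        return new int[] { docID, termFreq, lastDocID, pendingTermFreq }; // 3, 2, 7, 1
    }
}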
internal override void AddTerm(int termID)
{
    Debug.Assert(DocState.TestPoint("FreqProxTermsWriterPerField.addTerm start"));

    FreqProxPostingsArray postings = (FreqProxPostingsArray)TermsHashPerField.PostingsArray;

    Debug.Assert(!HasFreq || postings.TermFreqs[termID] > 0);

    if (!HasFreq)
    {
        Debug.Assert(postings.TermFreqs == null);
        if (DocState.DocID != postings.LastDocIDs[termID])
        {
            Debug.Assert(DocState.DocID > postings.LastDocIDs[termID]);
            TermsHashPerField.WriteVInt(0, postings.LastDocCodes[termID]);
            postings.LastDocCodes[termID] = DocState.DocID - postings.LastDocIDs[termID];
            postings.LastDocIDs[termID] = DocState.DocID;
            FieldState.UniqueTermCount_Renamed++;
        }
    }
    else if (DocState.DocID != postings.LastDocIDs[termID])
    {
        Debug.Assert(DocState.DocID > postings.LastDocIDs[termID], "id: " + DocState.DocID + " postings ID: " + postings.LastDocIDs[termID] + " termID: " + termID);
        // Term not yet seen in the current doc but previously
        // seen in other doc(s) since the last flush

        // Now that we know doc freq for previous doc,
        // write it & lastDocCode
        if (1 == postings.TermFreqs[termID])
        {
            TermsHashPerField.WriteVInt(0, postings.LastDocCodes[termID] | 1);
        }
        else
        {
            TermsHashPerField.WriteVInt(0, postings.LastDocCodes[termID]);
            TermsHashPerField.WriteVInt(0, postings.TermFreqs[termID]);
        }
        postings.TermFreqs[termID] = 1;
        FieldState.MaxTermFrequency_Renamed = Math.Max(1, FieldState.MaxTermFrequency_Renamed);
        postings.LastDocCodes[termID] = (DocState.DocID - postings.LastDocIDs[termID]) << 1;
        postings.LastDocIDs[termID] = DocState.DocID;
        if (HasProx)
        {
            WriteProx(termID, FieldState.Position_Renamed);
            if (HasOffsets)
            {
                postings.LastOffsets[termID] = 0;
                WriteOffsets(termID, FieldState.Offset_Renamed);
            }
        }
        else
        {
            Debug.Assert(!HasOffsets);
        }
        FieldState.UniqueTermCount_Renamed++;
    }
    else
    {
        FieldState.MaxTermFrequency_Renamed = Math.Max(FieldState.MaxTermFrequency_Renamed, ++postings.TermFreqs[termID]);
        if (HasProx)
        {
            WriteProx(termID, FieldState.Position_Renamed - postings.LastPositions[termID]);
        }
        if (HasOffsets)
        {
            WriteOffsets(termID, FieldState.Offset_Renamed);
        }
    }
}
internal override void AddTerm(int termID)
{
    Debug.Assert(docState.TestPoint("FreqProxTermsWriterPerField.addTerm start"));

    FreqProxPostingsArray postings = (FreqProxPostingsArray)termsHashPerField.postingsArray;

    Debug.Assert(!hasFreq || postings.termFreqs[termID] > 0);

    if (!hasFreq)
    {
        Debug.Assert(postings.termFreqs == null);
        if (docState.docID != postings.lastDocIDs[termID])
        {
            Debug.Assert(docState.docID > postings.lastDocIDs[termID]);
            termsHashPerField.WriteVInt32(0, postings.lastDocCodes[termID]);
            postings.lastDocCodes[termID] = docState.docID - postings.lastDocIDs[termID];
            postings.lastDocIDs[termID] = docState.docID;
            fieldState.UniqueTermCount++;
        }
    }
    else if (docState.docID != postings.lastDocIDs[termID])
    {
        Debug.Assert(docState.docID > postings.lastDocIDs[termID], "id: " + docState.docID + " postings ID: " + postings.lastDocIDs[termID] + " termID: " + termID);
        // Term not yet seen in the current doc but previously
        // seen in other doc(s) since the last flush

        // Now that we know doc freq for previous doc,
        // write it & lastDocCode
        if (1 == postings.termFreqs[termID])
        {
            termsHashPerField.WriteVInt32(0, postings.lastDocCodes[termID] | 1);
        }
        else
        {
            termsHashPerField.WriteVInt32(0, postings.lastDocCodes[termID]);
            termsHashPerField.WriteVInt32(0, postings.termFreqs[termID]);
        }
        postings.termFreqs[termID] = 1;
        fieldState.MaxTermFrequency = Math.Max(1, fieldState.MaxTermFrequency);
        postings.lastDocCodes[termID] = (docState.docID - postings.lastDocIDs[termID]) << 1;
        postings.lastDocIDs[termID] = docState.docID;
        if (hasProx)
        {
            WriteProx(termID, fieldState.Position);
            if (hasOffsets)
            {
                postings.lastOffsets[termID] = 0;
                WriteOffsets(termID, fieldState.Offset);
            }
        }
        else
        {
            Debug.Assert(!hasOffsets);
        }
        fieldState.UniqueTermCount++;
    }
    else
    {
        fieldState.MaxTermFrequency = Math.Max(fieldState.MaxTermFrequency, ++postings.termFreqs[termID]);
        if (hasProx)
        {
            WriteProx(termID, fieldState.Position - postings.lastPositions[termID]);
        }
        if (hasOffsets)
        {
            WriteOffsets(termID, fieldState.Offset);
        }
    }
}
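// Illustration (a minimal sketch, not part of the original source): within a single
// document, AddTerm hands WriteProx the delta from the previous occurrence
// (fieldState.Position - lastPositions[termID]); only the first occurrence, written
// by NewTerm, uses the absolute position. The hypothetical helper below reproduces
// that sequence of proxCodes for one document.
internal static class PositionDeltaSketch
{
    // positions: absolute token positions of a term's occurrences in one
    // document, in increasing order. Returns the values handed to WriteProx
    // (before the payload-flag shift is applied).
    public static int[] ProxCodes(int[] positions)
    {
        int[] codes = new int[positions.Length];
        int last = 0;
        for (int i = 0; i < positions.Length; i++)
        {
            codes[i] = positions[i] - last; // first code equals the absolute position
            last = positions[i];
        }
        return codes;
    }
}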