/// <summary>
/// Merges in the stored fields from the readers in
/// <paramref name="mergeState"/>. The default implementation skips
/// over deleted documents, and uses <see cref="StartDocument(int)"/>,
/// <see cref="WriteField(FieldInfo, IndexableField)"/>, and <see cref="Finish(FieldInfos, int)"/>,
/// returning the number of documents that were written.
/// Implementations can override this method for more sophisticated
/// merging (bulk-byte copying, etc.).
/// </summary>
public virtual int Merge(MergeState mergeState)
{
    int docCount = 0;
    foreach (AtomicReader reader in mergeState.Readers)
    {
        int maxDoc = reader.MaxDoc;
        Bits liveDocs = reader.LiveDocs;
        for (int i = 0; i < maxDoc; i++)
        {
            if (liveDocs != null && !liveDocs.Get(i))
            {
                // skip deleted docs
                continue;
            }
            // TODO: this could be more efficient using
            // FieldVisitor instead of loading/writing entire
            // doc; ie we just have to renumber the field number
            // on the fly?

            // NOTE: it's very important to first assign to doc then pass it to
            // fieldsWriter.AddDocument; see LUCENE-1282
            Document doc = reader.Document(i);
            AddDocument(doc, mergeState.FieldInfos);
            docCount++;
            mergeState.checkAbort.Work(300);
        }
    }
    Finish(mergeState.FieldInfos, docCount);
    return docCount;
}
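// For orientation, a minimal sketch (kept in comments, not part of this class) of what the
// AddDocument call above typically expands to in the Lucene 4.x base StoredFieldsWriter:
// count the stored fields, emit StartDocument, write each stored field, then FinishDocument.
// Member names such as FieldType/IsStored/Name are approximations of the Lucene.NET API,
// so treat this purely as an illustration of the per-document protocol:
//
//   public virtual void AddDocument(Document doc, FieldInfos fieldInfos)
//   {
//       int storedCount = 0;
//       foreach (IndexableField field in doc)
//       {
//           if (field.FieldType.IsStored) storedCount++;   // only stored fields are written
//       }
//       StartDocument(storedCount);
//       foreach (IndexableField field in doc)
//       {
//           if (field.FieldType.IsStored)
//           {
//               WriteField(fieldInfos.FieldInfo(field.Name), field);   // looks up the merged FieldInfo by name
//           }
//       }
//       FinishDocument();
//   }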
/// <summary>
/// Called during merging to merge all <see cref="Fields"/> from
/// sub-readers. This must recurse to merge all postings
/// (terms, docs, positions, etc.). A <see cref="PostingsFormat"/>
/// can override this default implementation to do its own merging.
/// </summary>
public virtual void Merge(MergeState mergeState, Fields fields)
{
    foreach (string field in fields)
    {
        FieldInfo info = mergeState.FieldInfos.FieldInfo(field);
        Debug.Assert(info != null, "FieldInfo for field is null: " + field);
        Terms terms = fields.Terms(field);
        if (terms != null)
        {
            TermsConsumer termsConsumer = AddField(info);
            termsConsumer.Merge(mergeState, info.FieldIndexOptions, terms.Iterator(null));
        }
    }
}
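// For context only: during a merge the segment merger drives this method roughly as in the
// (simplified, hypothetical) snippet below, where mergedFields is a multi-reader Fields view
// over the sub-readers and segmentWriteState describes the destination segment; the variable
// names are illustrative, not taken from this file:
//
//   FieldsConsumer consumer = postingsFormat.FieldsConsumer(segmentWriteState);
//   consumer.Merge(mergeState, mergedFields);
//   // ...which lands here and fans out to AddField + TermsConsumer.Merge per field.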
/// <summary>
/// Copies term vectors for each live document of <paramref name="reader"/>, bulk-copying raw
/// bytes when a matching <see cref="Lucene40TermVectorsReader"/> is available; returns the
/// number of documents copied.
/// </summary>
private int CopyVectorsWithDeletions(MergeState mergeState, Lucene40TermVectorsReader matchingVectorsReader, AtomicReader reader, int[] rawDocLengths, int[] rawDocLengths2)
{
    int maxDoc = reader.MaxDoc;
    Bits liveDocs = reader.LiveDocs;
    int totalNumDocs = 0;
    if (matchingVectorsReader != null)
    {
        // We can bulk-copy because the fieldInfos are "congruent"
        for (int docNum = 0; docNum < maxDoc; )
        {
            if (!liveDocs.Get(docNum))
            {
                // skip deleted docs
                ++docNum;
                continue;
            }
            // We can optimize this case (doing a bulk byte copy) since the field
            // numbers are identical
            int start = docNum, numDocs = 0;
            do
            {
                docNum++;
                numDocs++;
                if (docNum >= maxDoc)
                {
                    break;
                }
                if (!liveDocs.Get(docNum))
                {
                    docNum++;
                    break;
                }
            } while (numDocs < MAX_RAW_MERGE_DOCS);

            matchingVectorsReader.RawDocs(rawDocLengths, rawDocLengths2, start, numDocs);
            AddRawDocuments(matchingVectorsReader, rawDocLengths, rawDocLengths2, numDocs);
            totalNumDocs += numDocs;
            mergeState.checkAbort.Work(300 * numDocs);
        }
    }
    else
    {
        for (int docNum = 0; docNum < maxDoc; docNum++)
        {
            if (!liveDocs.Get(docNum))
            {
                // skip deleted docs
                continue;
            }
            // NOTE: it's very important to first assign to vectors then pass it to
            // termVectorsWriter.AddAllDocVectors; see LUCENE-1282
            Fields vectors = reader.GetTermVectors(docNum);
            AddAllDocVectors(vectors, mergeState);
            totalNumDocs++;
            mergeState.checkAbort.Work(300);
        }
    }
    return totalNumDocs;
}
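// A small worked example of the chunking above (numbers invented for illustration): with
// maxDoc = 6 and only document 2 deleted, the outer loop issues two raw copies —
// RawDocs(..., start: 0, numDocs: 2) covering docs 0-1 (the do/while stops once it sees doc 2
// is deleted and skips past it), then RawDocs(..., start: 3, numDocs: 3) covering docs 3-5
// (it stops when docNum reaches maxDoc). Each run of live documents is copied as one
// byte-range operation, capped at MAX_RAW_MERGE_DOCS documents per call.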
/// <summary>
/// Copies term vectors for every document of a reader that has no deletions; returns
/// <c>maxDoc</c>.
/// </summary>
private int CopyVectorsNoDeletions(MergeState mergeState, Lucene40TermVectorsReader matchingVectorsReader, AtomicReader reader, int[] rawDocLengths, int[] rawDocLengths2)
{
    int maxDoc = reader.MaxDoc;
    if (matchingVectorsReader != null)
    {
        // We can bulk-copy because the fieldInfos are "congruent"
        int docCount = 0;
        while (docCount < maxDoc)
        {
            int len = Math.Min(MAX_RAW_MERGE_DOCS, maxDoc - docCount);
            matchingVectorsReader.RawDocs(rawDocLengths, rawDocLengths2, docCount, len);
            AddRawDocuments(matchingVectorsReader, rawDocLengths, rawDocLengths2, len);
            docCount += len;
            mergeState.checkAbort.Work(300 * len);
        }
    }
    else
    {
        for (int docNum = 0; docNum < maxDoc; docNum++)
        {
            // NOTE: it's very important to first assign to vectors then pass it to
            // termVectorsWriter.AddAllDocVectors; see LUCENE-1282
            Fields vectors = reader.GetTermVectors(docNum);
            AddAllDocVectors(vectors, mergeState);
            mergeState.checkAbort.Work(300);
        }
    }
    return maxDoc;
}
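// Quick arithmetic for the bulk path above (constant value assumed for illustration): if
// MAX_RAW_MERGE_DOCS were 4192 and the segment had maxDoc = 10000, the while loop would issue
// three raw copies of 4192, 4192, and 1616 documents; checkAbort.Work is charged 300 units
// per document on both the bulk and the document-at-a-time paths.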
/// <summary>
/// Merges term vectors from all readers, bulk-copying raw bytes whenever the source segment
/// also uses the Lucene 4.0 term vectors format.
/// </summary>
public override int Merge(MergeState mergeState)
{
    // Used for bulk-reading raw bytes for term vectors
    int[] rawDocLengths = new int[MAX_RAW_MERGE_DOCS];
    int[] rawDocLengths2 = new int[MAX_RAW_MERGE_DOCS];

    int idx = 0;
    int numDocs = 0;
    for (int i = 0; i < mergeState.Readers.Count; i++)
    {
        AtomicReader reader = mergeState.Readers[i];
        SegmentReader matchingSegmentReader = mergeState.MatchingSegmentReaders[idx++];
        Lucene40TermVectorsReader matchingVectorsReader = null;
        if (matchingSegmentReader != null)
        {
            TermVectorsReader vectorsReader = matchingSegmentReader.TermVectorsReader;
            if (vectorsReader != null && vectorsReader is Lucene40TermVectorsReader)
            {
                matchingVectorsReader = (Lucene40TermVectorsReader)vectorsReader;
            }
        }
        if (reader.LiveDocs != null)
        {
            numDocs += CopyVectorsWithDeletions(mergeState, matchingVectorsReader, reader, rawDocLengths, rawDocLengths2);
        }
        else
        {
            numDocs += CopyVectorsNoDeletions(mergeState, matchingVectorsReader, reader, rawDocLengths, rawDocLengths2);
        }
    }
    Finish(mergeState.FieldInfos, numDocs);
    return numDocs;
}
/// <summary>
/// Copies stored fields for each live document of <paramref name="reader"/>, bulk-copying raw
/// bytes when a matching <see cref="Lucene40StoredFieldsReader"/> is available; returns the
/// number of documents copied.
/// </summary>
private int CopyFieldsWithDeletions(MergeState mergeState, AtomicReader reader, Lucene40StoredFieldsReader matchingFieldsReader, int[] rawDocLengths)
{
    int docCount = 0;
    int maxDoc = reader.MaxDoc;
    Bits liveDocs = reader.LiveDocs;
    Debug.Assert(liveDocs != null);
    if (matchingFieldsReader != null)
    {
        // We can bulk-copy because the fieldInfos are "congruent"
        for (int j = 0; j < maxDoc; )
        {
            if (!liveDocs.Get(j))
            {
                // skip deleted docs
                ++j;
                continue;
            }
            // We can optimize this case (doing a bulk byte copy) since the field
            // numbers are identical
            int start = j, numDocs = 0;
            do
            {
                j++;
                numDocs++;
                if (j >= maxDoc)
                {
                    break;
                }
                if (!liveDocs.Get(j))
                {
                    j++;
                    break;
                }
            } while (numDocs < MAX_RAW_MERGE_DOCS);

            IndexInput stream = matchingFieldsReader.RawDocs(rawDocLengths, start, numDocs);
            AddRawDocuments(stream, rawDocLengths, numDocs);
            docCount += numDocs;
            mergeState.checkAbort.Work(300 * numDocs);
        }
    }
    else
    {
        for (int j = 0; j < maxDoc; j++)
        {
            if (!liveDocs.Get(j))
            {
                // skip deleted docs
                continue;
            }
            // TODO: this could be more efficient using
            // FieldVisitor instead of loading/writing entire
            // doc; ie we just have to renumber the field number
            // on the fly?

            // NOTE: it's very important to first assign to doc then pass it to
            // fieldsWriter.AddDocument; see LUCENE-1282
            Document doc = reader.Document(j);
            AddDocument(doc, mergeState.FieldInfos);
            docCount++;
            mergeState.checkAbort.Work(300);
        }
    }
    return docCount;
}
/// <summary>
/// Copies stored fields for every document of a reader that has no deletions; returns the
/// number of documents copied.
/// </summary>
private int CopyFieldsNoDeletions(MergeState mergeState, AtomicReader reader, Lucene40StoredFieldsReader matchingFieldsReader, int[] rawDocLengths)
{
    int maxDoc = reader.MaxDoc;
    int docCount = 0;
    if (matchingFieldsReader != null)
    {
        // We can bulk-copy because the fieldInfos are "congruent"
        while (docCount < maxDoc)
        {
            int len = Math.Min(MAX_RAW_MERGE_DOCS, maxDoc - docCount);
            IndexInput stream = matchingFieldsReader.RawDocs(rawDocLengths, docCount, len);
            AddRawDocuments(stream, rawDocLengths, len);
            docCount += len;
            mergeState.checkAbort.Work(300 * len);
        }
    }
    else
    {
        for (; docCount < maxDoc; docCount++)
        {
            // NOTE: it's very important to first assign to doc then pass it to
            // fieldsWriter.AddDocument; see LUCENE-1282
            Document doc = reader.Document(docCount);
            AddDocument(doc, mergeState.FieldInfos);
            mergeState.checkAbort.Work(300);
        }
    }
    return docCount;
}
/// <summary>
/// Merges stored fields from all readers, bulk-copying raw bytes whenever the source segment
/// also uses the Lucene 4.0 stored fields format.
/// </summary>
public override int Merge(MergeState mergeState)
{
    int docCount = 0;
    // Used for bulk-reading raw bytes for stored fields
    int[] rawDocLengths = new int[MAX_RAW_MERGE_DOCS];
    int idx = 0;

    foreach (AtomicReader reader in mergeState.Readers)
    {
        SegmentReader matchingSegmentReader = mergeState.MatchingSegmentReaders[idx++];
        Lucene40StoredFieldsReader matchingFieldsReader = null;
        if (matchingSegmentReader != null)
        {
            StoredFieldsReader fieldsReader = matchingSegmentReader.FieldsReader;
            // we can only bulk-copy if the matching reader is also a Lucene40FieldsReader
            if (fieldsReader != null && fieldsReader is Lucene40StoredFieldsReader)
            {
                matchingFieldsReader = (Lucene40StoredFieldsReader)fieldsReader;
            }
        }
        if (reader.LiveDocs != null)
        {
            docCount += CopyFieldsWithDeletions(mergeState, reader, matchingFieldsReader, rawDocLengths);
        }
        else
        {
            docCount += CopyFieldsNoDeletions(mergeState, reader, matchingFieldsReader, rawDocLengths);
        }
    }
    Finish(mergeState.FieldInfos, docCount);
    return docCount;
}
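// Side note: the eligibility check above can equivalently be written with a single 'as' cast;
// shown only as a stylistic alternative, not as a change to this class:
//
//   matchingFieldsReader = matchingSegmentReader == null
//       ? null
//       : matchingSegmentReader.FieldsReader as Lucene40StoredFieldsReader;
//
// Either way, a segment written by a different codec (or with no stored-fields reader) leaves
// matchingFieldsReader null and falls back to the slower document-at-a-time copy.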
/// <summary>
/// Safe (but slowish) default method to write every
/// vector field in the document.
/// </summary>
protected internal void AddAllDocVectors(Fields vectors, MergeState mergeState)
{
    if (vectors == null)
    {
        StartDocument(0);
        FinishDocument();
        return;
    }

    int numFields = vectors.Size;
    if (numFields == -1)
    {
        // count manually! TODO: Maybe enforce that Fields.size() returns something valid?
        numFields = 0;
        foreach (string it in vectors)
        {
            numFields++;
        }
    }
    StartDocument(numFields);

    string lastFieldName = null;
    TermsEnum termsEnum = null;
    DocsAndPositionsEnum docsAndPositionsEnum = null;

    int fieldCount = 0;
    foreach (string fieldName in vectors)
    {
        fieldCount++;
        FieldInfo fieldInfo = mergeState.FieldInfos.FieldInfo(fieldName);

        Debug.Assert(lastFieldName == null || fieldName.CompareTo(lastFieldName) > 0, "lastFieldName=" + lastFieldName + " fieldName=" + fieldName);
        lastFieldName = fieldName;

        Terms terms = vectors.Terms(fieldName);
        if (terms == null)
        {
            // FieldsEnum shouldn't lie...
            continue;
        }

        bool hasPositions = terms.HasPositions();
        bool hasOffsets = terms.HasOffsets();
        bool hasPayloads = terms.HasPayloads();
        Debug.Assert(!hasPayloads || hasPositions);

        int numTerms = (int)terms.Size();
        if (numTerms == -1)
        {
            // count manually; Terms.Size() is not required to return a valid value
            numTerms = 0;
            termsEnum = terms.Iterator(termsEnum);
            while (termsEnum.Next() != null)
            {
                numTerms++;
            }
        }

        StartField(fieldInfo, numTerms, hasPositions, hasOffsets, hasPayloads);
        termsEnum = terms.Iterator(termsEnum);

        int termCount = 0;
        while (termsEnum.Next() != null)
        {
            termCount++;

            int freq = (int)termsEnum.TotalTermFreq();

            StartTerm(termsEnum.Term(), freq);

            if (hasPositions || hasOffsets)
            {
                docsAndPositionsEnum = termsEnum.DocsAndPositions(null, docsAndPositionsEnum);
                Debug.Assert(docsAndPositionsEnum != null);

                int docID = docsAndPositionsEnum.NextDoc();
                Debug.Assert(docID != DocIdSetIterator.NO_MORE_DOCS);
                Debug.Assert(docsAndPositionsEnum.Freq() == freq);

                for (int posUpto = 0; posUpto < freq; posUpto++)
                {
                    int pos = docsAndPositionsEnum.NextPosition();
                    int startOffset = docsAndPositionsEnum.StartOffset();
                    int endOffset = docsAndPositionsEnum.EndOffset();

                    BytesRef payload = docsAndPositionsEnum.Payload;

                    Debug.Assert(!hasPositions || pos >= 0);
                    AddPosition(pos, startOffset, endOffset, payload);
                }
            }
            FinishTerm();
        }
        Debug.Assert(termCount == numTerms);
        FinishField();
    }
    Debug.Assert(fieldCount == numFields);
    FinishDocument();
}
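// To make the call protocol above concrete: a document with one vectorized field "body" holding
// the single term "hello" (freq 1, positions indexed, no offsets or payloads) would drive the
// writer through the following sequence (bodyFieldInfo and all values are hypothetical):
//
//   StartDocument(1);
//   StartField(bodyFieldInfo, 1 /* numTerms */, true /* positions */, false /* offsets */, false /* payloads */);
//   StartTerm(new BytesRef("hello"), 1 /* freq */);
//   AddPosition(0 /* pos */, -1, -1, null);   // offsets are -1 when not indexed; payload is null
//   FinishTerm();
//   FinishField();
//   FinishDocument();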
/// <summary>
/// Merges in the term vectors from the readers in
/// <paramref name="mergeState"/>. The default implementation skips
/// over deleted documents, and uses <see cref="StartDocument(int)"/>,
/// <see cref="StartField(FieldInfo, int, bool, bool, bool)"/>,
/// <see cref="StartTerm(BytesRef, int)"/>, <see cref="AddPosition(int, int, int, BytesRef)"/>,
/// and <see cref="Finish(FieldInfos, int)"/>,
/// returning the number of documents that were written.
/// Implementations can override this method for more sophisticated
/// merging (bulk-byte copying, etc.).
/// </summary>
public virtual int Merge(MergeState mergeState)
{
    int docCount = 0;
    for (int i = 0; i < mergeState.Readers.Count; i++)
    {
        AtomicReader reader = mergeState.Readers[i];
        int maxDoc = reader.MaxDoc;
        Bits liveDocs = reader.LiveDocs;
        for (int docID = 0; docID < maxDoc; docID++)
        {
            if (liveDocs != null && !liveDocs.Get(docID))
            {
                // skip deleted docs
                continue;
            }
            // NOTE: it's very important to first assign to vectors then pass it to
            // termVectorsWriter.AddAllDocVectors; see LUCENE-1282
            Fields vectors = reader.GetTermVectors(docID);
            AddAllDocVectors(vectors, mergeState);
            docCount++;
            mergeState.checkAbort.Work(300);
        }
    }
    Finish(mergeState.FieldInfos, docCount);
    return docCount;
}