public override void VisitDocument(int docID, StoredFieldVisitor visitor) { fieldsStream.Seek(indexReader.GetStartPointer(docID)); int docBase = fieldsStream.ReadVInt32(); int chunkDocs = fieldsStream.ReadVInt32(); if (docID < docBase || docID >= docBase + chunkDocs || docBase + chunkDocs > numDocs) { throw new CorruptIndexException("Corrupted: docID=" + docID + ", docBase=" + docBase + ", chunkDocs=" + chunkDocs + ", numDocs=" + numDocs + " (resource=" + fieldsStream + ")"); } int numStoredFields, offset, length, totalLength; if (chunkDocs == 1) { numStoredFields = fieldsStream.ReadVInt32(); offset = 0; length = fieldsStream.ReadVInt32(); totalLength = length; } else { int bitsPerStoredFields = fieldsStream.ReadVInt32(); if (bitsPerStoredFields == 0) { numStoredFields = fieldsStream.ReadVInt32(); } else if (bitsPerStoredFields > 31) { throw new CorruptIndexException("bitsPerStoredFields=" + bitsPerStoredFields + " (resource=" + fieldsStream + ")"); } else { long filePointer = fieldsStream.GetFilePointer(); PackedInt32s.Reader reader = PackedInt32s.GetDirectReaderNoHeader(fieldsStream, PackedInt32s.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerStoredFields); numStoredFields = (int)(reader.Get(docID - docBase)); fieldsStream.Seek(filePointer + PackedInt32s.Format.PACKED.ByteCount(packedIntsVersion, chunkDocs, bitsPerStoredFields)); } int bitsPerLength = fieldsStream.ReadVInt32(); if (bitsPerLength == 0) { length = fieldsStream.ReadVInt32(); offset = (docID - docBase) * length; totalLength = chunkDocs * length; } else if (bitsPerLength > 31) { throw new CorruptIndexException("bitsPerLength=" + bitsPerLength + " (resource=" + fieldsStream + ")"); } else { PackedInt32s.IReaderIterator it = PackedInt32s.GetReaderIteratorNoHeader(fieldsStream, PackedInt32s.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerLength, 1); int off = 0; for (int i = 0; i < docID - docBase; ++i) { off += (int)it.Next(); } offset = off; length = (int)it.Next(); off += length; for (int i = docID - docBase + 1; i < chunkDocs; ++i) { off += (int)it.Next(); } totalLength = off; } } if ((length == 0) != (numStoredFields == 0)) { throw new CorruptIndexException("length=" + length + ", numStoredFields=" + numStoredFields + " (resource=" + fieldsStream + ")"); } if (numStoredFields == 0) { // nothing to do return; } DataInput documentInput; if (version >= CompressingStoredFieldsWriter.VERSION_BIG_CHUNKS && totalLength >= 2 * chunkSize) { if (Debugging.AssertsEnabled) { Debugging.Assert(chunkSize > 0); Debugging.Assert(offset < chunkSize); } decompressor.Decompress(fieldsStream, chunkSize, offset, Math.Min(length, chunkSize - offset), bytes); documentInput = new DataInputAnonymousInnerClassHelper(this, length); } else { BytesRef bytes = totalLength <= BUFFER_REUSE_THRESHOLD ?
this.bytes : new BytesRef(); decompressor.Decompress(fieldsStream, totalLength, offset, length, bytes); if (Debugging.AssertsEnabled) { Debugging.Assert(bytes.Length == length); } documentInput = new ByteArrayDataInput(bytes.Bytes, bytes.Offset, bytes.Length); } for (int fieldIDX = 0; fieldIDX < numStoredFields; fieldIDX++) { long infoAndBits = documentInput.ReadVInt64(); int fieldNumber = (int)((long)((ulong)infoAndBits >> CompressingStoredFieldsWriter.TYPE_BITS)); FieldInfo fieldInfo = fieldInfos.FieldInfo(fieldNumber); int bits = (int)(infoAndBits & CompressingStoredFieldsWriter.TYPE_MASK); if (Debugging.AssertsEnabled) { Debugging.Assert(bits <= CompressingStoredFieldsWriter.NUMERIC_DOUBLE, "bits={0:x}", bits); } switch (visitor.NeedsField(fieldInfo)) { case StoredFieldVisitor.Status.YES: ReadField(documentInput, visitor, fieldInfo, bits); break; case StoredFieldVisitor.Status.NO: SkipField(documentInput, bits); break; case StoredFieldVisitor.Status.STOP: return; } } }
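// A minimal standalone sketch of the offset arithmetic above: when bitsPerLength != 0, the
// document's slice inside the decompressed chunk is recovered by prefix-summing the per-document
// lengths. A plain int[] stands in for the PackedInt32s reader iterator; the class and method
// names below are hypothetical, not Lucene.NET APIs.
using System;

internal static class ChunkOffsetSketch
{
    // Returns (offset, length, totalLength) for the docInChunk-th document of a chunk,
    // mirroring the loop over it.Next() in VisitDocument.
    public static (int Offset, int Length, int TotalLength) Locate(int[] lengths, int docInChunk)
    {
        int off = 0;
        for (int i = 0; i < docInChunk; i++)
        {
            off += lengths[i];              // bytes of the documents stored before this one
        }
        int offset = off;
        int length = lengths[docInChunk];   // this document's slice
        off += length;
        for (int i = docInChunk + 1; i < lengths.Length; i++)
        {
            off += lengths[i];              // remaining documents; only needed for totalLength
        }
        return (offset, length, off);
    }

    public static void Main()
    {
        // A chunk of 4 documents with lengths 10, 0, 7, 3: document #2 starts at byte 10,
        // is 7 bytes long, and the whole chunk decompresses to 20 bytes.
        var (offset, length, total) = Locate(new[] { 10, 0, 7, 3 }, 2);
        Console.WriteLine($"offset={offset} length={length} totalLength={total}");
    }
}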
public override void AddSortedField(FieldInfo field, IEnumerable <BytesRef> values, IEnumerable <long?> docToOrd) { int valueCount = 0; BytesRef lastValue = null; foreach (BytesRef b in values) { if (Debugging.AssertsEnabled) { Debugging.Assert(b != null); } if (Debugging.AssertsEnabled) { Debugging.Assert(b.IsValid()); } if (valueCount > 0) { if (Debugging.AssertsEnabled) { Debugging.Assert(b.CompareTo(lastValue) > 0); } } lastValue = BytesRef.DeepCopyOf(b); valueCount++; } if (Debugging.AssertsEnabled) { Debugging.Assert(valueCount <= maxDoc); } FixedBitSet seenOrds = new FixedBitSet(valueCount); int count = 0; foreach (long?v in docToOrd) { if (Debugging.AssertsEnabled) { Debugging.Assert(v != null); } int ord = (int)v.Value; if (Debugging.AssertsEnabled) { Debugging.Assert(ord >= -1 && ord < valueCount); } if (ord >= 0) { seenOrds.Set(ord); } count++; } if (Debugging.AssertsEnabled) { Debugging.Assert(count == maxDoc); } if (Debugging.AssertsEnabled) { Debugging.Assert(seenOrds.Cardinality() == valueCount); } CheckIterator(values.GetEnumerator(), valueCount, false); CheckIterator(docToOrd.GetEnumerator(), maxDoc, false); @in.AddSortedField(field, values, docToOrd); }
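// A simplified, standalone version of the two invariants asserted above: the sorted values must be
// strictly increasing, and every ordinal in [0, valueCount) must be referenced by at least one
// document (an ord of -1 means "no value for this document"). Strings stand in for BytesRef;
// nothing here is the actual asserting-codec API.
using System;
using System.Collections.Generic;

internal static class SortedDocValuesInvariants
{
    public static void Validate(IList<string> values, IList<int> docToOrd)
    {
        for (int i = 1; i < values.Count; i++)
        {
            if (string.CompareOrdinal(values[i - 1], values[i]) >= 0)
                throw new InvalidOperationException($"values out of order at index {i}");
        }

        bool[] seenOrds = new bool[values.Count];
        foreach (int ord in docToOrd)
        {
            if (ord < -1 || ord >= values.Count)
                throw new InvalidOperationException($"ord {ord} out of range");
            if (ord >= 0) seenOrds[ord] = true;   // -1 = document has no value
        }

        if (Array.IndexOf(seenOrds, false) >= 0)
            throw new InvalidOperationException("some value is never referenced by any document");
    }

    public static void Main()
    {
        Validate(new[] { "apple", "banana", "cherry" }, new[] { 1, -1, 0, 2, 1 });   // passes
        Console.WriteLine("ok");
    }
}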
public virtual ApplyDeletesResult ApplyDeletesAndUpdates(IndexWriter.ReaderPool readerPool, IList <SegmentCommitInfo> infos) { UninterruptableMonitor.Enter(this); try { long t0 = J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond; // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results if (infos.Count == 0) { return(new ApplyDeletesResult(false, nextGen++, null)); } if (Debugging.AssertsEnabled) { Debugging.Assert(CheckDeleteStats()); } if (!Any()) { if (infoStream.IsEnabled("BD")) { infoStream.Message("BD", "applyDeletes: no deletes; skipping"); } return(new ApplyDeletesResult(false, nextGen++, null)); } if (infoStream.IsEnabled("BD")) { infoStream.Message("BD", "applyDeletes: infos=" + string.Format(J2N.Text.StringFormatter.InvariantCulture, "{0}", infos) + " packetCount=" + updates.Count); } long gen = nextGen++; JCG.List <SegmentCommitInfo> infos2 = new JCG.List <SegmentCommitInfo>(); infos2.AddRange(infos); infos2.Sort(sortSegInfoByDelGen); CoalescedUpdates coalescedUpdates = null; bool anyNewDeletes = false; int infosIDX = infos2.Count - 1; int delIDX = updates.Count - 1; IList <SegmentCommitInfo> allDeleted = null; while (infosIDX >= 0) { //System.out.println("BD: cycle delIDX=" + delIDX + " infoIDX=" + infosIDX); FrozenBufferedUpdates packet = delIDX >= 0 ? updates[delIDX] : null; SegmentCommitInfo info = infos2[infosIDX]; long segGen = info.BufferedDeletesGen; if (packet != null && segGen < packet.DelGen) { // System.out.println(" coalesce"); if (coalescedUpdates == null) { coalescedUpdates = new CoalescedUpdates(); } if (!packet.isSegmentPrivate) { /* * Only coalesce if we are NOT on a segment private del packet: the segment private del packet * must only applied to segments with the same delGen. Yet, if a segment is already deleted * from the SI since it had no more documents remaining after some del packets younger than * its segPrivate packet (higher delGen) have been applied, the segPrivate packet has not been * removed. 
*/ coalescedUpdates.Update(packet); } delIDX--; } else if (packet != null && segGen == packet.DelGen) { if (Debugging.AssertsEnabled) { Debugging.Assert(packet.isSegmentPrivate, "Packet and Segments deletegen can only match on a segment private del packet gen={0}", segGen); } //System.out.println(" eq"); // Lock order: IW -> BD -> RP if (Debugging.AssertsEnabled) { Debugging.Assert(readerPool.InfoIsLive(info)); } ReadersAndUpdates rld = readerPool.Get(info, true); SegmentReader reader = rld.GetReader(IOContext.READ); int delCount = 0; bool segAllDeletes; try { DocValuesFieldUpdates.Container dvUpdates = new DocValuesFieldUpdates.Container(); if (coalescedUpdates != null) { //System.out.println(" del coalesced"); delCount += (int)ApplyTermDeletes(coalescedUpdates.TermsIterable(), rld, reader); delCount += (int)ApplyQueryDeletes(coalescedUpdates.QueriesIterable(), rld, reader); ApplyDocValuesUpdates(coalescedUpdates.numericDVUpdates, rld, reader, dvUpdates); ApplyDocValuesUpdates(coalescedUpdates.binaryDVUpdates, rld, reader, dvUpdates); } //System.out.println(" del exact"); // Don't delete by Term here; DocumentsWriterPerThread // already did that on flush: delCount += (int)ApplyQueryDeletes(packet.GetQueriesEnumerable(), rld, reader); ApplyDocValuesUpdates(packet.numericDVUpdates, rld, reader, dvUpdates); ApplyDocValuesUpdates(packet.binaryDVUpdates, rld, reader, dvUpdates); if (dvUpdates.Any()) { rld.WriteFieldUpdates(info.Info.Dir, dvUpdates); } int fullDelCount = rld.Info.DelCount + rld.PendingDeleteCount; if (Debugging.AssertsEnabled) { Debugging.Assert(fullDelCount <= rld.Info.Info.DocCount); } segAllDeletes = fullDelCount == rld.Info.Info.DocCount; } finally { rld.Release(reader); readerPool.Release(rld); } anyNewDeletes |= delCount > 0; if (segAllDeletes) { if (allDeleted == null) { allDeleted = new JCG.List <SegmentCommitInfo>(); } allDeleted.Add(info); } if (infoStream.IsEnabled("BD")) { infoStream.Message("BD", "seg=" + info + " segGen=" + segGen + " segDeletes=[" + packet + "]; coalesced deletes=[" + (coalescedUpdates == null ? "null" : coalescedUpdates.ToString()) + "] newDelCount=" + delCount + (segAllDeletes ? " 100% deleted" : "")); } if (coalescedUpdates == null) { coalescedUpdates = new CoalescedUpdates(); } /* * Since we are on a segment private del packet we must not * update the coalescedDeletes here! We can simply advance to the * next packet and seginfo. 
*/ delIDX--; infosIDX--; info.SetBufferedDeletesGen(gen); } else { //System.out.println(" gt"); if (coalescedUpdates != null) { // Lock order: IW -> BD -> RP if (Debugging.AssertsEnabled) { Debugging.Assert(readerPool.InfoIsLive(info)); } ReadersAndUpdates rld = readerPool.Get(info, true); SegmentReader reader = rld.GetReader(IOContext.READ); int delCount = 0; bool segAllDeletes; try { delCount += (int)ApplyTermDeletes(coalescedUpdates.TermsIterable(), rld, reader); delCount += (int)ApplyQueryDeletes(coalescedUpdates.QueriesIterable(), rld, reader); DocValuesFieldUpdates.Container dvUpdates = new DocValuesFieldUpdates.Container(); ApplyDocValuesUpdates(coalescedUpdates.numericDVUpdates, rld, reader, dvUpdates); ApplyDocValuesUpdates(coalescedUpdates.binaryDVUpdates, rld, reader, dvUpdates); if (dvUpdates.Any()) { rld.WriteFieldUpdates(info.Info.Dir, dvUpdates); } int fullDelCount = rld.Info.DelCount + rld.PendingDeleteCount; if (Debugging.AssertsEnabled) { Debugging.Assert(fullDelCount <= rld.Info.Info.DocCount); } segAllDeletes = fullDelCount == rld.Info.Info.DocCount; } finally { rld.Release(reader); readerPool.Release(rld); } anyNewDeletes |= delCount > 0; if (segAllDeletes) { if (allDeleted == null) { allDeleted = new JCG.List <SegmentCommitInfo>(); } allDeleted.Add(info); } if (infoStream.IsEnabled("BD")) { infoStream.Message("BD", "seg=" + info + " segGen=" + segGen + " coalesced deletes=[" + coalescedUpdates + "] newDelCount=" + delCount + (segAllDeletes ? " 100% deleted" : "")); } } info.SetBufferedDeletesGen(gen); infosIDX--; } } if (Debugging.AssertsEnabled) { Debugging.Assert(CheckDeleteStats()); } if (infoStream.IsEnabled("BD")) { infoStream.Message("BD", "applyDeletes took " + ((J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond) - t0) + " msec"); // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results } // assert infos != segmentInfos || !any() : "infos=" + infos + " segmentInfos=" + segmentInfos + " any=" + any; return(new ApplyDeletesResult(anyNewDeletes, gen, allDeleted)); } finally { UninterruptableMonitor.Exit(this); } }
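// A simplified, standalone sketch of the two-pointer walk above: segments are sorted by their
// buffered-deletes generation and scanned newest-to-oldest against the delete packets. The
// PacketSketch/SegmentSketch records and the Console output are hypothetical stand-ins for the
// real FrozenBufferedUpdates / SegmentCommitInfo machinery.
using System;
using System.Collections.Generic;
using System.Linq;

internal sealed class PacketSketch { public long DelGen; public bool SegmentPrivate; }
internal sealed class SegmentSketch { public string Name; public long BufferedDeletesGen; }

internal static class ApplyDeletesWalkSketch
{
    public static void Walk(List<SegmentSketch> segments, List<PacketSketch> packets)
    {
        segments = segments.OrderBy(s => s.BufferedDeletesGen).ToList();   // same idea as sortSegInfoByDelGen
        int segIdx = segments.Count - 1;
        int pktIdx = packets.Count - 1;
        while (segIdx >= 0)
        {
            PacketSketch packet = pktIdx >= 0 ? packets[pktIdx] : null;
            SegmentSketch seg = segments[segIdx];
            if (packet != null && seg.BufferedDeletesGen < packet.DelGen)
            {
                // Packet is newer than the segment: fold it into the coalesced updates
                // (unless it is segment-private) and keep scanning older packets.
                Console.WriteLine($"coalesce packet delGen={packet.DelGen} (private={packet.SegmentPrivate})");
                pktIdx--;
            }
            else if (packet != null && seg.BufferedDeletesGen == packet.DelGen)
            {
                // Exact match: the segment-private packet plus any coalesced deletes apply here.
                Console.WriteLine($"apply private packet delGen={packet.DelGen} to {seg.Name}");
                pktIdx--;
                segIdx--;
            }
            else
            {
                // Only the coalesced (older) deletes apply to this segment.
                Console.WriteLine($"apply coalesced deletes to {seg.Name}");
                segIdx--;
            }
        }
    }

    public static void Main()
    {
        Walk(
            new List<SegmentSketch> { new SegmentSketch { Name = "_0", BufferedDeletesGen = 3 }, new SegmentSketch { Name = "_1", BufferedDeletesGen = 5 } },
            new List<PacketSketch> { new PacketSketch { DelGen = 4, SegmentPrivate = false }, new PacketSketch { DelGen = 5, SegmentPrivate = true } });
    }
}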
// algorithm: treat sentence snippets as miniature documents // we can intersect these with the postings lists via BreakIterator.preceding(offset),s // score each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq)) private Passage[] HighlightDoc(string field, BytesRef[] terms, int contentLength, BreakIterator bi, int doc, TermsEnum termsEnum, DocsAndPositionsEnum[] postings, int n) { PassageScorer scorer = GetScorer(field); if (scorer == null) { throw new NullReferenceException("PassageScorer cannot be null"); } JCG.PriorityQueue <OffsetsEnum> pq = new JCG.PriorityQueue <OffsetsEnum>(); float[] weights = new float[terms.Length]; // initialize postings for (int i = 0; i < terms.Length; i++) { DocsAndPositionsEnum de = postings[i]; int pDoc; if (de == EMPTY) { continue; } else if (de == null) { postings[i] = EMPTY; // initially if (!termsEnum.SeekExact(terms[i])) { continue; // term not found } de = postings[i] = termsEnum.DocsAndPositions(null, null, DocsAndPositionsFlags.OFFSETS); if (de == null) { // no positions available throw new ArgumentException("field '" + field + "' was indexed without offsets, cannot highlight"); } pDoc = de.Advance(doc); } else { pDoc = de.DocID; if (pDoc < doc) { pDoc = de.Advance(doc); } } if (doc == pDoc) { weights[i] = scorer.Weight(contentLength, de.Freq); de.NextPosition(); pq.Add(new OffsetsEnum(de, i)); } } pq.Add(new OffsetsEnum(EMPTY, int.MaxValue)); // a sentinel for termination JCG.PriorityQueue <Passage> passageQueue = new JCG.PriorityQueue <Passage>(n, Comparer <Passage> .Create((left, right) => { if (left.score < right.score) { return(-1); } else if (left.score > right.score) { return(1); } else { return(left.startOffset - right.startOffset); } })); Passage current = new Passage(); while (pq.TryDequeue(out OffsetsEnum off)) { DocsAndPositionsEnum dp = off.dp; int start = dp.StartOffset; if (start == -1) { throw new ArgumentException("field '" + field + "' was indexed without offsets, cannot highlight"); } int end = dp.EndOffset; // LUCENE-5166: this hit would span the content limit... however more valid // hits may exist (they are sorted by start). so we pretend like we never // saw this term, it won't cause a passage to be added to passageQueue or anything. 
if (Debugging.AssertsEnabled) { Debugging.Assert(EMPTY.StartOffset == int.MaxValue); } if (start < contentLength && end > contentLength) { continue; } if (start >= current.endOffset) { if (current.startOffset >= 0) { // finalize current current.score *= scorer.Norm(current.startOffset); // new sentence: first add 'current' to queue if (passageQueue.Count == n && current.score < passageQueue.Peek().score) { current.Reset(); // can't compete, just reset it } else { passageQueue.Enqueue(current); if (passageQueue.Count > n) { current = passageQueue.Dequeue(); current.Reset(); } else { current = new Passage(); } } } // if we exceed limit, we are done if (start >= contentLength) { Passage[] passages = passageQueue.ToArray(); foreach (Passage p in passages) { p.Sort(); } // sort in ascending order ArrayUtil.TimSort(passages, Comparer <Passage> .Create((left, right) => left.startOffset - right.startOffset)); return(passages); } // advance breakiterator if (Debugging.AssertsEnabled) { Debugging.Assert(BreakIterator.Done < 0); } current.startOffset = Math.Max(bi.Preceding(start + 1), 0); current.endOffset = Math.Min(bi.Next(), contentLength); } int tf = 0; while (true) { tf++; BytesRef term = terms[off.id]; if (term == null) { // multitermquery match, pull from payload term = off.dp.GetPayload(); if (Debugging.AssertsEnabled) { Debugging.Assert(term != null); } } current.AddMatch(start, end, term); if (off.pos == dp.Freq) { break; // removed from pq } else { off.pos++; dp.NextPosition(); start = dp.StartOffset; end = dp.EndOffset; } if (start >= current.endOffset || end > contentLength) { pq.Enqueue(off); break; } } current.score += weights[off.id] * scorer.Tf(tf, current.endOffset - current.startOffset); } // Dead code but compiler disagrees: if (Debugging.AssertsEnabled) { Debugging.Assert(false); } return(null); }
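// A standalone sketch of the scoring formula noted in the comment at the top of HighlightDoc:
// score = Norm(passage.startOffset) * sum over matched terms of Weight(term) * Tf(freq, passageLength).
// The Norm/Tf/Weight bodies below are illustrative placeholders only, not the real PassageScorer math.
using System;

internal static class PassageScoringSketch
{
    private static float Norm(int passageStart) => 1f + 1f / (1f + (float)Math.Log(1 + passageStart));
    private static float Tf(int freq, int passageLength) => (float)Math.Sqrt(freq);   // placeholder
    private static float Weight(int contentLength, int termFreq) => 1f;               // placeholder

    public static void Main()
    {
        int passageStart = 120, passageEnd = 260, contentLength = 5000;
        int[] termFreqsInPassage = { 3, 1 };   // pretend two query terms matched 3x and 1x

        float score = 0;
        foreach (int freq in termFreqsInPassage)
        {
            score += Weight(contentLength, freq) * Tf(freq, passageEnd - passageStart);
        }
        score *= Norm(passageStart);   // applied once, when the passage is finalized

        Console.WriteLine($"passage [{passageStart},{passageEnd}) score = {score}");
    }
}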
public override Scorer GetScorer(AtomicReaderContext context, IBits acceptDocs) { if (Debugging.AssertsEnabled) { Debugging.Assert(outerInstance.termArrays.Count > 0); } AtomicReader reader = (context.AtomicReader); IBits liveDocs = acceptDocs; PhraseQuery.PostingsAndFreq[] postingsFreqs = new PhraseQuery.PostingsAndFreq[outerInstance.termArrays.Count]; Terms fieldTerms = reader.GetTerms(outerInstance.field); if (fieldTerms is null) { return(null); } // Reuse single TermsEnum below: TermsEnum termsEnum = fieldTerms.GetEnumerator(); for (int pos = 0; pos < postingsFreqs.Length; pos++) { Term[] terms = outerInstance.termArrays[pos]; DocsAndPositionsEnum postingsEnum; int docFreq; if (terms.Length > 1) { postingsEnum = new UnionDocsAndPositionsEnum(liveDocs, context, terms, termContexts, termsEnum); // coarse -- this overcounts since a given doc can // have more than one term: docFreq = 0; for (int termIdx = 0; termIdx < terms.Length; termIdx++) { Term term = terms[termIdx]; TermState termState = termContexts[term].Get(context.Ord); if (termState is null) { // Term not in reader continue; } termsEnum.SeekExact(term.Bytes, termState); docFreq += termsEnum.DocFreq; } if (docFreq == 0) { // None of the terms are in this reader return(null); } } else { Term term = terms[0]; TermState termState = termContexts[term].Get(context.Ord); if (termState is null) { // Term not in reader return(null); } termsEnum.SeekExact(term.Bytes, termState); postingsEnum = termsEnum.DocsAndPositions(liveDocs, null, DocsAndPositionsFlags.NONE); if (postingsEnum is null) { // term does exist, but has no positions if (Debugging.AssertsEnabled) { Debugging.Assert(termsEnum.Docs(liveDocs, null, DocsFlags.NONE) != null, "termstate found but no term exists in reader"); } throw IllegalStateException.Create("field \"" + term.Field + "\" was indexed without position data; cannot run PhraseQuery (term=" + term.Text + ")"); } docFreq = termsEnum.DocFreq; } postingsFreqs[pos] = new PhraseQuery.PostingsAndFreq(postingsEnum, docFreq, (int)outerInstance.positions[pos], terms); } // sort by increasing docFreq order if (outerInstance.slop == 0) { ArrayUtil.TimSort(postingsFreqs); } if (outerInstance.slop == 0) { ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity.GetSimScorer(stats, context)); if (s.noDocs) { return(null); } else { return(s); } } else { return(new SloppyPhraseScorer(this, postingsFreqs, outerInstance.slop, similarity.GetSimScorer(stats, context))); } }
internal virtual DocValuesConsumer GetInstance(FieldInfo field) { DocValuesFormat format = null; if (field.DocValuesGen != -1) { string formatName = field.GetAttribute(PER_FIELD_FORMAT_KEY); // this means the field never existed in that segment, yet is applied updates if (formatName != null) { format = DocValuesFormat.ForName(formatName); } } if (format == null) { format = outerInstance.GetDocValuesFormatForField(field.Name); } if (format == null) { throw new InvalidOperationException("invalid null DocValuesFormat for field=\"" + field.Name + "\""); } string formatName_ = format.Name; string previousValue = field.PutAttribute(PER_FIELD_FORMAT_KEY, formatName_); if (Debugging.AssertsEnabled) { Debugging.Assert(field.DocValuesGen != -1 || previousValue == null, () => "formatName=" + formatName_ + " prevValue=" + previousValue); } int?suffix = null; ConsumerAndSuffix consumer; if (!formats.TryGetValue(format, out consumer) || consumer == null) { // First time we are seeing this format; create a new instance if (field.DocValuesGen != -1) { string suffixAtt = field.GetAttribute(PER_FIELD_SUFFIX_KEY); // even when dvGen is != -1, it can still be a new field, that never // existed in the segment, and therefore doesn't have the recorded // attributes yet. if (suffixAtt != null) { suffix = Convert.ToInt32(suffixAtt, CultureInfo.InvariantCulture); } } if (suffix == null) { // bump the suffix if (!suffixes.TryGetValue(formatName_, out suffix) || suffix == null) { suffix = 0; } else { suffix = suffix + 1; } } suffixes[formatName_] = suffix; string segmentSuffix = GetFullSegmentSuffix(segmentWriteState.SegmentSuffix, GetSuffix(formatName_, Convert.ToString(suffix, CultureInfo.InvariantCulture))); consumer = new ConsumerAndSuffix(); consumer.Consumer = format.FieldsConsumer(new SegmentWriteState(segmentWriteState, segmentSuffix)); consumer.Suffix = suffix.Value; // LUCENENET NOTE: At this point suffix cannot be null formats[format] = consumer; } else { // we've already seen this format, so just grab its suffix if (Debugging.AssertsEnabled) { Debugging.Assert(suffixes.ContainsKey(formatName_)); } suffix = consumer.Suffix; } previousValue = field.PutAttribute(PER_FIELD_SUFFIX_KEY, Convert.ToString(suffix, CultureInfo.InvariantCulture)); if (Debugging.AssertsEnabled) { Debugging.Assert(field.DocValuesGen != -1 || previousValue == null, () => "suffix=" + Convert.ToString(suffix, CultureInfo.InvariantCulture) + " prevValue=" + previousValue); } // TODO: we should only provide the "slice" of FIS // that this DVF actually sees ... return(consumer.Consumer); }
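// A standalone sketch of the suffix bookkeeping above: the first consumer created for a given
// format name in a segment gets suffix 0, the next distinct instance of the same name gets 1, and
// so on; the segment suffix then embeds the format name and that number. The dictionary and method
// below are hypothetical stand-ins for the suffixes map and GetFullSegmentSuffix.
using System;
using System.Collections.Generic;

internal static class FormatSuffixSketch
{
    private static readonly Dictionary<string, int> suffixes = new Dictionary<string, int>();

    // Called only when a new consumer is created for the format (the "first time we are
    // seeing this format" branch above); repeated lookups reuse the stored suffix instead.
    public static string NextSegmentSuffix(string formatName)
    {
        int suffix = suffixes.TryGetValue(formatName, out int previous) ? previous + 1 : 0;
        suffixes[formatName] = suffix;
        return formatName + "_" + suffix;
    }

    public static void Main()
    {
        Console.WriteLine(NextSegmentSuffix("Lucene45"));   // Lucene45_0
        Console.WriteLine(NextSegmentSuffix("Lucene45"));   // Lucene45_1
        Console.WriteLine(NextSegmentSuffix("Memory"));     // Memory_0
    }
}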
public override void Write(Directory directory, string segmentName, string segmentSuffix, FieldInfos infos, IOContext context) { string fileName = IndexFileNames.SegmentFileName(segmentName, "", Lucene40FieldInfosFormat.FIELD_INFOS_EXTENSION); IndexOutput output = directory.CreateOutput(fileName, context); bool success = false; try { CodecUtil.WriteHeader(output, Lucene40FieldInfosFormat.CODEC_NAME, Lucene40FieldInfosFormat.FORMAT_CURRENT); output.WriteVInt32(infos.Count); foreach (FieldInfo fi in infos) { IndexOptions indexOptions = fi.IndexOptions; sbyte bits = 0x0; if (fi.HasVectors) { bits |= Lucene40FieldInfosFormat.STORE_TERMVECTOR; } if (fi.OmitsNorms) { bits |= Lucene40FieldInfosFormat.OMIT_NORMS; } if (fi.HasPayloads) { bits |= Lucene40FieldInfosFormat.STORE_PAYLOADS; } if (fi.IsIndexed) { bits |= Lucene40FieldInfosFormat.IS_INDEXED; if (Debugging.AssertsEnabled) { Debugging.Assert(indexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 || !fi.HasPayloads); } if (indexOptions == IndexOptions.DOCS_ONLY) { bits |= Lucene40FieldInfosFormat.OMIT_TERM_FREQ_AND_POSITIONS; } else if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) { bits |= Lucene40FieldInfosFormat.STORE_OFFSETS_IN_POSTINGS; } else if (indexOptions == IndexOptions.DOCS_AND_FREQS) { bits |= Lucene40FieldInfosFormat.OMIT_POSITIONS; } } output.WriteString(fi.Name); output.WriteVInt32(fi.Number); output.WriteByte((byte)bits); // pack the DV types in one byte byte dv = DocValuesByte(fi.DocValuesType, fi.GetAttribute(Lucene40FieldInfosReader.LEGACY_DV_TYPE_KEY)); byte nrm = DocValuesByte(fi.NormType, fi.GetAttribute(Lucene40FieldInfosReader.LEGACY_NORM_TYPE_KEY)); if (Debugging.AssertsEnabled) { Debugging.Assert((dv & (~0xF)) == 0 && (nrm & (~0x0F)) == 0); } var val = (byte)(0xff & ((nrm << 4) | (byte)dv)); output.WriteByte(val); output.WriteStringStringMap(fi.Attributes); } success = true; } finally { if (success) { output.Dispose(); } else { IOUtils.DisposeWhileHandlingException(output); } } }
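// A standalone sketch of the flag packing performed in Write(): each per-field boolean is OR'd
// into a single byte before it is written next to the field name and number. The bit values follow
// the documented Lucene40 field-infos layout but are reproduced here only for illustration.
using System;

internal static class FieldInfoBitsSketch
{
    private const byte IS_INDEXED = 0x01;
    private const byte STORE_TERMVECTOR = 0x02;
    private const byte OMIT_NORMS = 0x10;
    private const byte STORE_PAYLOADS = 0x20;

    public static byte Pack(bool indexed, bool vectors, bool omitNorms, bool payloads)
    {
        byte bits = 0;
        if (vectors) bits |= STORE_TERMVECTOR;
        if (omitNorms) bits |= OMIT_NORMS;
        if (payloads) bits |= STORE_PAYLOADS;
        if (indexed) bits |= IS_INDEXED;
        return bits;
    }

    public static void Main()
    {
        // An indexed field with term vectors and payloads packs to 0x23.
        byte bits = Pack(indexed: true, vectors: true, omitNorms: false, payloads: true);
        Console.WriteLine("0x" + bits.ToString("X2"));

        // The doc-values and norm types are packed the same way into a second byte: (nrm << 4) | dv.
        byte dv = 0x2, nrm = 0x1;
        Console.WriteLine("0x" + ((byte)(0xff & ((nrm << 4) | dv))).ToString("X2"));   // 0x12
    }
}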
// Look for seek type 3 ("pop"): if the delta from // prev -> current was replacing an S with an E, // we must now seek to beyond that E. this seek // "finishes" the dance at this character // position. private bool DoPop() { if (DEBUG_SURROGATES) { Console.WriteLine(" try pop"); } if (Debugging.AssertsEnabled) { Debugging.Assert(newSuffixStart <= prevTerm.Length); Debugging.Assert(newSuffixStart < scratchTerm.Length || newSuffixStart == 0); } if (prevTerm.Length > newSuffixStart && IsNonBMPChar(prevTerm.Bytes, newSuffixStart) && IsHighBMPChar(scratchTerm.Bytes, newSuffixStart)) { // Seek type 2 -- put 0xFF at this position: scratchTerm.Bytes[newSuffixStart] = 0xff; scratchTerm.Length = newSuffixStart + 1; if (DEBUG_SURROGATES) { Console.WriteLine(" seek to term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString()) + " " + scratchTerm.ToString()); } // TODO: more efficient seek? can we simply swap // the enums? outerInstance.TermsDict.SeekEnum(termEnum, new Term(fieldInfo.Name, scratchTerm), true); Term t2 = termEnum.Term(); // We could hit EOF or different field since this // was a seek "forward": if (t2 != null && t2.Field == internedFieldName) { if (DEBUG_SURROGATES) { Console.WriteLine(" got term=" + UnicodeUtil.ToHexString(t2.Text()) + " " + t2.Bytes); } BytesRef b2 = t2.Bytes; if (Debugging.AssertsEnabled) { Debugging.Assert(b2.Offset == 0); } // Set newSuffixStart -- we can't use // termEnum's since the above seek may have // done no scanning (eg, term was precisely // and index term, or, was in the term seek // cache): scratchTerm.CopyBytes(b2); SetNewSuffixStart(prevTerm, scratchTerm); return(true); } else if (newSuffixStart != 0 || scratchTerm.Length != 0) { if (DEBUG_SURROGATES) { Console.WriteLine(" got term=null (or next field)"); } newSuffixStart = 0; scratchTerm.Length = 0; return(true); } } return(false); }
// Pre-flex indices store terms in UTF16 sort order, but // certain queries require Unicode codepoint order; this // method carefully seeks around surrogates to handle // this impedance mismatch private void SurrogateDance() { if (!unicodeSortOrder) { return; } // We are invoked after TIS.next() (by UTF16 order) to // possibly seek to a different "next" (by unicode // order) term. // We scan only the "delta" from the last term to the // current term, in UTF8 bytes. We look at 1) the bytes // stripped from the prior term, and then 2) the bytes // appended to that prior term's prefix. // We don't care about specific UTF8 sequences, just // the "category" of the UTF16 character. Category S // is a high/low surrogate pair (it non-BMP). // Category E is any BMP char > UNI_SUR_LOW_END (and < // U+FFFF). Category A is the rest (any unicode char // <= UNI_SUR_HIGH_START). // The core issue is that pre-flex indices sort the // characters as ASE, while flex must sort as AES. So // when scanning, when we hit S, we must 1) seek // forward to E and enum the terms there, then 2) seek // back to S and enum all terms there, then 3) seek to // after E. Three different seek points (1, 2, 3). // We can easily detect S in UTF8: if a byte has // prefix 11110 (0xf0), then that byte and the // following 3 bytes encode a single unicode codepoint // in S. Similarly, we can detect E: if a byte has // prefix 1110111 (0xee), then that byte and the // following 2 bytes encode a single unicode codepoint // in E. // Note that this is really a recursive process -- // maybe the char at pos 2 needs to dance, but any // point in its dance, suddenly pos 4 needs to dance // so you must finish pos 4 before returning to pos // 2. But then during pos 4's dance maybe pos 7 needs // to dance, etc. However, despite being recursive, // we don't need to hold any state because the state // can always be derived by looking at prior term & // current term. // TODO: can we avoid this copy? if (termEnum.Term() == null || termEnum.Term().Field != internedFieldName) { scratchTerm.Length = 0; } else { scratchTerm.CopyBytes(termEnum.Term().Bytes); } if (DEBUG_SURROGATES) { Console.WriteLine(" dance"); Console.WriteLine(" prev=" + UnicodeUtil.ToHexString(prevTerm.Utf8ToString())); Console.WriteLine(" " + prevTerm.ToString()); Console.WriteLine(" term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString())); Console.WriteLine(" " + scratchTerm.ToString()); } // this code assumes TermInfosReader/SegmentTermEnum // always use BytesRef.offset == 0 if (Debugging.AssertsEnabled) { Debugging.Assert(prevTerm.Offset == 0); Debugging.Assert(scratchTerm.Offset == 0); } // Need to loop here because we may need to do multiple // pops, and possibly a continue in the end, ie: // // cont // pop, cont // pop, pop, cont // <nothing> // while (true) { if (DoContinue()) { break; } else { if (!DoPop()) { break; } } } if (DEBUG_SURROGATES) { Console.WriteLine(" finish bmp ends"); } DoPushes(); }
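// A standalone demonstration of the impedance mismatch described above: UTF-16 code-unit (ordinal)
// order and Unicode code-point / UTF-8 byte order disagree once supplementary characters are
// involved, because surrogate code units (0xD800-0xDFFF) sort below BMP characters such as
// U+E000..U+FFFF. Plain .NET strings are used here, not Lucene terms.
using System;
using System.Text;

internal static class SortOrderMismatchDemo
{
    private static int CompareUtf8(string x, string y)
    {
        byte[] a = Encoding.UTF8.GetBytes(x);
        byte[] b = Encoding.UTF8.GetBytes(y);
        int len = Math.Min(a.Length, b.Length);
        for (int i = 0; i < len; i++)
        {
            if (a[i] != b[i]) return a[i] - b[i];
        }
        return a.Length - b.Length;
    }

    public static void Main()
    {
        string bmp = "\uFB01";                                   // U+FB01, a BMP char above the surrogate block
        string supplementary = char.ConvertFromUtf32(0x1F600);   // U+1F600, stored as a surrogate pair (0xD83D 0xDE00)

        // UTF-16 code-unit comparison: 0xFB01 > 0xD83D, so the BMP char sorts AFTER.
        Console.WriteLine(string.CompareOrdinal(bmp, supplementary) > 0);   // True

        // Code-point / UTF-8 comparison: U+FB01 < U+1F600, so the BMP char sorts BEFORE.
        Console.WriteLine(CompareUtf8(bmp, supplementary) < 0);             // True
    }
}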
public override BytesRef Next() { if (DEBUG_SURROGATES) { Console.WriteLine("TE.next()"); } if (skipNext) { if (DEBUG_SURROGATES) { Console.WriteLine(" skipNext=true"); } skipNext = false; if (termEnum.Term() == null) { return(null); // PreFlex codec interns field names: } else if (termEnum.Term().Field != internedFieldName) { return(null); } else { return(current = termEnum.Term().Bytes); } } // TODO: can we use STE's prevBuffer here? prevTerm.CopyBytes(termEnum.Term().Bytes); if (termEnum.Next() && termEnum.Term().Field == internedFieldName) { newSuffixStart = termEnum.newSuffixStart; if (DEBUG_SURROGATES) { Console.WriteLine(" newSuffixStart=" + newSuffixStart); } SurrogateDance(); Term t = termEnum.Term(); if (t == null || t.Field != internedFieldName) { // PreFlex codec interns field names; verify: if (Debugging.AssertsEnabled) { Debugging.Assert(t == null || !t.Field.Equals(internedFieldName, StringComparison.Ordinal)); } current = null; } else { current = t.Bytes; } return(current); } else { // this field is exhausted, but we have to give // surrogateDance a chance to seek back: if (DEBUG_SURROGATES) { Console.WriteLine(" force cont"); } //newSuffixStart = prevTerm.length; newSuffixStart = 0; SurrogateDance(); Term t = termEnum.Term(); if (t == null || t.Field != internedFieldName) { // PreFlex codec interns field names; verify: if (Debugging.AssertsEnabled) { Debugging.Assert(t == null || !t.Field.Equals(internedFieldName, StringComparison.Ordinal)); } return(null); } else { current = t.Bytes; return(current); } } }
// Swap in S, in place of E: private bool SeekToNonBMP(SegmentTermEnum te, BytesRef term, int pos) { int savLength = term.Length; if (Debugging.AssertsEnabled) { Debugging.Assert(term.Offset == 0); } // The 3 bytes starting at downTo make up 1 // unicode character: if (Debugging.AssertsEnabled) { Debugging.Assert(IsHighBMPChar(term.Bytes, pos)); } // NOTE: we cannot make this assert, because // AutomatonQuery legitimately sends us malformed UTF8 // (eg the UTF8 bytes with just 0xee) // assert term.length >= pos + 3: "term.length=" + term.length + " pos+3=" + (pos+3) + " byte=" + Integer.toHexString(term.bytes[pos]) + " term=" + term.toString(); // Save the bytes && length, since we need to // restore this if seek "back" finds no matching // terms if (term.Bytes.Length < 4 + pos) { term.Grow(4 + pos); } scratch[0] = (sbyte)term.Bytes[pos]; scratch[1] = (sbyte)term.Bytes[pos + 1]; scratch[2] = (sbyte)term.Bytes[pos + 2]; term.Bytes[pos] = 0xf0; term.Bytes[pos + 1] = 0x90; term.Bytes[pos + 2] = 0x80; term.Bytes[pos + 3] = 0x80; term.Length = 4 + pos; if (DEBUG_SURROGATES) { Console.WriteLine(" try seek term=" + UnicodeUtil.ToHexString(term.Utf8ToString())); } // Seek "back": outerInstance.TermsDict.SeekEnum(te, new Term(fieldInfo.Name, term), true); // Test if the term we seek'd to in fact found a // surrogate pair at the same position as the E: Term t2 = te.Term(); // Cannot be null (or move to next field) because at // "worst" it'd seek to the same term we are on now, // unless we are being called from seek if (t2 == null || t2.Field != internedFieldName) { return(false); } if (DEBUG_SURROGATES) { Console.WriteLine(" got term=" + UnicodeUtil.ToHexString(t2.Text())); } // Now test if prefix is identical and we found // a non-BMP char at the same position: BytesRef b2 = t2.Bytes; if (Debugging.AssertsEnabled) { Debugging.Assert(b2.Offset == 0); } bool matches; if (b2.Length >= term.Length && IsNonBMPChar(b2.Bytes, pos)) { matches = true; for (int i = 0; i < pos; i++) { if (term.Bytes[i] != b2.Bytes[i]) { matches = false; break; } } } else { matches = false; } // Restore term: term.Length = savLength; term.Bytes[pos] = (byte)scratch[0]; term.Bytes[pos + 1] = (byte)scratch[1]; term.Bytes[pos + 2] = (byte)scratch[2]; return(matches); }
public override Scorer GetScorer(AtomicReaderContext context, IBits acceptDocs) { if (Debugging.AssertsEnabled) { Debugging.Assert(outerInstance.terms.Count > 0); } AtomicReader reader = context.AtomicReader; IBits liveDocs = acceptDocs; PostingsAndFreq[] postingsFreqs = new PostingsAndFreq[outerInstance.terms.Count]; Terms fieldTerms = reader.GetTerms(outerInstance.field); if (fieldTerms is null) { return(null); } // Reuse single TermsEnum below: TermsEnum te = fieldTerms.GetEnumerator(); for (int i = 0; i < outerInstance.terms.Count; i++) { Term t = outerInstance.terms[i]; TermState state = states[i].Get(context.Ord); if (state is null) // term doesnt exist in this segment { if (Debugging.AssertsEnabled) { Debugging.Assert(TermNotInReader(reader, t), "no termstate found but term exists in reader"); } return(null); } te.SeekExact(t.Bytes, state); DocsAndPositionsEnum postingsEnum = te.DocsAndPositions(liveDocs, null, DocsAndPositionsFlags.NONE); // PhraseQuery on a field that did not index // positions. if (postingsEnum is null) { if (Debugging.AssertsEnabled) { Debugging.Assert(te.SeekExact(t.Bytes), "termstate found but no term exists in reader"); } // term does exist, but has no positions throw IllegalStateException.Create("field \"" + t.Field + "\" was indexed without position data; cannot run PhraseQuery (term=" + t.Text + ")"); } postingsFreqs[i] = new PostingsAndFreq(postingsEnum, te.DocFreq, outerInstance.positions[i], t); } // sort by increasing docFreq order if (outerInstance.slop == 0) { ArrayUtil.TimSort(postingsFreqs); } if (outerInstance.slop == 0) // optimize exact case { ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity.GetSimScorer(stats, context)); if (s.noDocs) { return(null); } else { return(s); } } else { return(new SloppyPhraseScorer(this, postingsFreqs, outerInstance.slop, similarity.GetSimScorer(stats, context))); } }
/// <summary> /// Sole constructor. </summary> public CompressingStoredFieldsReader(Directory d, SegmentInfo si, string segmentSuffix, FieldInfos fn, IOContext context, string formatName, CompressionMode compressionMode) { this.compressionMode = compressionMode; string segment = si.Name; bool success = false; fieldInfos = fn; numDocs = si.DocCount; ChecksumIndexInput indexStream = null; try { string indexStreamFN = IndexFileNames.SegmentFileName(segment, segmentSuffix, Lucene40StoredFieldsWriter.FIELDS_INDEX_EXTENSION); string fieldsStreamFN = IndexFileNames.SegmentFileName(segment, segmentSuffix, Lucene40StoredFieldsWriter.FIELDS_EXTENSION); // Load the index into memory indexStream = d.OpenChecksumInput(indexStreamFN, context); string codecNameIdx = formatName + CompressingStoredFieldsWriter.CODEC_SFX_IDX; version = CodecUtil.CheckHeader(indexStream, codecNameIdx, CompressingStoredFieldsWriter.VERSION_START, CompressingStoredFieldsWriter.VERSION_CURRENT); if (Debugging.AssertsEnabled) { Debugging.Assert(CodecUtil.HeaderLength(codecNameIdx) == indexStream.GetFilePointer()); } indexReader = new CompressingStoredFieldsIndexReader(indexStream, si); long maxPointer = -1; if (version >= CompressingStoredFieldsWriter.VERSION_CHECKSUM) { maxPointer = indexStream.ReadVInt64(); CodecUtil.CheckFooter(indexStream); } else { #pragma warning disable 612, 618 CodecUtil.CheckEOF(indexStream); #pragma warning restore 612, 618 } indexStream.Dispose(); indexStream = null; // Open the data file and read metadata fieldsStream = d.OpenInput(fieldsStreamFN, context); if (version >= CompressingStoredFieldsWriter.VERSION_CHECKSUM) { if (maxPointer + CodecUtil.FooterLength() != fieldsStream.Length) { throw new CorruptIndexException("Invalid fieldsStream maxPointer (file truncated?): maxPointer=" + maxPointer + ", length=" + fieldsStream.Length); } } else { maxPointer = fieldsStream.Length; } this.maxPointer = maxPointer; string codecNameDat = formatName + CompressingStoredFieldsWriter.CODEC_SFX_DAT; int fieldsVersion = CodecUtil.CheckHeader(fieldsStream, codecNameDat, CompressingStoredFieldsWriter.VERSION_START, CompressingStoredFieldsWriter.VERSION_CURRENT); if (version != fieldsVersion) { throw new CorruptIndexException("Version mismatch between stored fields index and data: " + version + " != " + fieldsVersion); } if (Debugging.AssertsEnabled) { Debugging.Assert(CodecUtil.HeaderLength(codecNameDat) == fieldsStream.GetFilePointer()); } if (version >= CompressingStoredFieldsWriter.VERSION_BIG_CHUNKS) { chunkSize = fieldsStream.ReadVInt32(); } else { chunkSize = -1; } packedIntsVersion = fieldsStream.ReadVInt32(); decompressor = compressionMode.NewDecompressor(); this.bytes = new BytesRef(); success = true; } finally { if (!success) { IOUtils.DisposeWhileHandlingException(this, indexStream); } } }
/// <summary> /// Go to the chunk containing the provided <paramref name="doc"/> ID. /// </summary> internal void Next(int doc) { if (Debugging.AssertsEnabled) { Debugging.Assert(doc >= this.docBase + this.chunkDocs, "{0} {1} {2}", doc, this.docBase, this.chunkDocs); } fieldsStream.Seek(outerInstance.indexReader.GetStartPointer(doc)); int docBase = fieldsStream.ReadVInt32(); int chunkDocs = fieldsStream.ReadVInt32(); if (docBase < this.docBase + this.chunkDocs || docBase + chunkDocs > outerInstance.numDocs) { throw new CorruptIndexException($"Corrupted: current docBase={this.docBase}, current numDocs={this.chunkDocs}, new docBase={docBase}, new numDocs={chunkDocs} (resource={fieldsStream})"); } this.docBase = docBase; this.chunkDocs = chunkDocs; if (chunkDocs > numStoredFields.Length) { int newLength = ArrayUtil.Oversize(chunkDocs, 4); numStoredFields = new int[newLength]; lengths = new int[newLength]; } if (chunkDocs == 1) { numStoredFields[0] = fieldsStream.ReadVInt32(); lengths[0] = fieldsStream.ReadVInt32(); } else { int bitsPerStoredFields = fieldsStream.ReadVInt32(); if (bitsPerStoredFields == 0) { Arrays.Fill(numStoredFields, 0, chunkDocs, fieldsStream.ReadVInt32()); } else if (bitsPerStoredFields > 31) { throw new CorruptIndexException("bitsPerStoredFields=" + bitsPerStoredFields + " (resource=" + fieldsStream + ")"); } else { PackedInt32s.IReaderIterator it = PackedInt32s.GetReaderIteratorNoHeader(fieldsStream, PackedInt32s.Format.PACKED, outerInstance.packedIntsVersion, chunkDocs, bitsPerStoredFields, 1); for (int i = 0; i < chunkDocs; ++i) { numStoredFields[i] = (int)it.Next(); } } int bitsPerLength = fieldsStream.ReadVInt32(); if (bitsPerLength == 0) { Arrays.Fill(lengths, 0, chunkDocs, fieldsStream.ReadVInt32()); } else if (bitsPerLength > 31) { throw new CorruptIndexException($"bitsPerLength={bitsPerLength}"); } else { PackedInt32s.IReaderIterator it = PackedInt32s.GetReaderIteratorNoHeader(fieldsStream, PackedInt32s.Format.PACKED, outerInstance.packedIntsVersion, chunkDocs, bitsPerLength, 1); for (int i = 0; i < chunkDocs; ++i) { lengths[i] = (int)it.Next(); } } } }
private void TestOne(Random r, ByteRunAutomaton a, int startCode, int endCode, int iters) { // Verify correct ints are accepted int nonSurrogateCount; bool ovSurStart; if (endCode < UnicodeUtil.UNI_SUR_HIGH_START || startCode > UnicodeUtil.UNI_SUR_LOW_END) { // no overlap w/ surrogates nonSurrogateCount = endCode - startCode + 1; ovSurStart = false; } else if (IsSurrogate(startCode)) { // start of range overlaps surrogates nonSurrogateCount = endCode - startCode + 1 - (UnicodeUtil.UNI_SUR_LOW_END - startCode + 1); ovSurStart = false; } else if (IsSurrogate(endCode)) { // end of range overlaps surrogates ovSurStart = true; nonSurrogateCount = endCode - startCode + 1 - (endCode - UnicodeUtil.UNI_SUR_HIGH_START + 1); } else { // range completely subsumes surrogates ovSurStart = true; nonSurrogateCount = endCode - startCode + 1 - (UnicodeUtil.UNI_SUR_LOW_END - UnicodeUtil.UNI_SUR_HIGH_START + 1); } if (Debugging.AssertsEnabled) { Debugging.Assert(nonSurrogateCount > 0); } for (int iter = 0; iter < iters; iter++) { // pick random code point in-range int code = startCode + r.Next(nonSurrogateCount); if (IsSurrogate(code)) { if (ovSurStart) { code = UnicodeUtil.UNI_SUR_LOW_END + 1 + (code - UnicodeUtil.UNI_SUR_HIGH_START); } else { code = UnicodeUtil.UNI_SUR_LOW_END + 1 + (code - startCode); } } if (Debugging.AssertsEnabled) { Debugging.Assert(code >= startCode && code <= endCode, () => "code=" + code + " start=" + startCode + " end=" + endCode); } if (Debugging.AssertsEnabled) { Debugging.Assert(!IsSurrogate(code)); } Assert.IsTrue(Matches(a, code), "DFA for range " + startCode + "-" + endCode + " failed to match code=" + code); } // Verify invalid ints are not accepted int invalidRange = MAX_UNICODE - (endCode - startCode + 1); if (invalidRange > 0) { for (int iter = 0; iter < iters; iter++) { int x = TestUtil.NextInt32(r, 0, invalidRange - 1); int code; if (x >= startCode) { code = endCode + 1 + x - startCode; } else { code = x; } if ((code >= UnicodeUtil.UNI_SUR_HIGH_START && code <= UnicodeUtil.UNI_SUR_HIGH_END) | (code >= UnicodeUtil.UNI_SUR_LOW_START && code <= UnicodeUtil.UNI_SUR_LOW_END)) { iter--; continue; } Assert.IsFalse(Matches(a, code), "DFA for range " + startCode + "-" + endCode + " matched invalid code=" + code); } } }
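// A standalone sketch of the remapping used in the first loop above: a random index into the
// non-surrogate count of [startCode, endCode] is turned into a real code point by jumping over the
// 2048-code-point surrogate block when the raw value lands inside it. The constants mirror
// UnicodeUtil; everything else is a simplified stand-in for the test.
using System;

internal static class SurrogateSkipSketch
{
    private const int UNI_SUR_HIGH_START = 0xD800;
    private const int UNI_SUR_LOW_END = 0xDFFF;

    private static bool IsSurrogate(int code) => code >= UNI_SUR_HIGH_START && code <= UNI_SUR_LOW_END;

    // Picks the i-th non-surrogate code point of [startCode, endCode] for the case where the
    // range extends past the surrogate block (the ovSurStart == true branches above).
    public static int Pick(int startCode, int i)
    {
        int code = startCode + i;
        if (IsSurrogate(code))
        {
            code = UNI_SUR_LOW_END + 1 + (code - UNI_SUR_HIGH_START);   // skip the surrogate block
        }
        return code;
    }

    public static void Main()
    {
        Console.WriteLine(Pick(0xD000, 0x100).ToString("X"));   // D100: below the block, unchanged
        Console.WriteLine(Pick(0xD000, 0x900).ToString("X"));   // raw D900 is a surrogate, remapped to E100
    }
}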
// Look for seek type 1 ("push"): if the newly added // suffix contains any S, we must try to seek to the // corresponding E. If we find a match, we go there; // else we keep looking for additional S's in the new // suffix. this "starts" the dance, at this character // position: private void DoPushes() { int upTo = newSuffixStart; if (DEBUG_SURROGATES) { Console.WriteLine(" try push newSuffixStart=" + newSuffixStart + " scratchLen=" + scratchTerm.Length); } while (upTo < scratchTerm.Length) { if (IsNonBMPChar(scratchTerm.Bytes, upTo) && (upTo > newSuffixStart || (upTo >= prevTerm.Length || (!IsNonBMPChar(prevTerm.Bytes, upTo) && !IsHighBMPChar(prevTerm.Bytes, upTo))))) { // A non-BMP char (4 bytes UTF8) starts here: if (Debugging.AssertsEnabled) { Debugging.Assert(scratchTerm.Length >= upTo + 4); } int savLength = scratchTerm.Length; scratch[0] = (sbyte)scratchTerm.Bytes[upTo]; scratch[1] = (sbyte)scratchTerm.Bytes[upTo + 1]; scratch[2] = (sbyte)scratchTerm.Bytes[upTo + 2]; scratchTerm.Bytes[upTo] = (byte)UTF8_HIGH_BMP_LEAD; scratchTerm.Bytes[upTo + 1] = 0x80; scratchTerm.Bytes[upTo + 2] = 0x80; scratchTerm.Length = upTo + 3; if (DEBUG_SURROGATES) { Console.WriteLine(" try seek 1 pos=" + upTo + " term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString()) + " " + scratchTerm.ToString() + " len=" + scratchTerm.Length); } // Seek "forward": // TODO: more efficient seek? outerInstance.TermsDict.SeekEnum(seekTermEnum, new Term(fieldInfo.Name, scratchTerm), true); scratchTerm.Bytes[upTo] = (byte)scratch[0]; scratchTerm.Bytes[upTo + 1] = (byte)scratch[1]; scratchTerm.Bytes[upTo + 2] = (byte)scratch[2]; scratchTerm.Length = savLength; // Did we find a match? Term t2 = seekTermEnum.Term(); if (DEBUG_SURROGATES) { if (t2 == null) { Console.WriteLine(" hit term=null"); } else { Console.WriteLine(" hit term=" + UnicodeUtil.ToHexString(t2.Text()) + " " + (t2 == null ? null : t2.Bytes)); } } // Since this was a seek "forward", we could hit // EOF or a different field: bool matches; if (t2 != null && t2.Field == internedFieldName) { BytesRef b2 = t2.Bytes; if (Debugging.AssertsEnabled) { Debugging.Assert(b2.Offset == 0); } if (b2.Length >= upTo + 3 && IsHighBMPChar(b2.Bytes, upTo)) { matches = true; for (int i = 0; i < upTo; i++) { if (scratchTerm.Bytes[i] != b2.Bytes[i]) { matches = false; break; } } } else { matches = false; } } else { matches = false; } if (matches) { if (DEBUG_SURROGATES) { Console.WriteLine(" matches!"); } // OK seek "back" // TODO: more efficient seek? outerInstance.TermsDict.SeekEnum(termEnum, seekTermEnum.Term(), true); scratchTerm.CopyBytes(seekTermEnum.Term().Bytes); // +3 because we don't need to check the char // at upTo: we know it's > BMP upTo += 3; // NOTE: we keep iterating, now, since this // can easily "recurse". Ie, after seeking // forward at a certain char position, we may // find another surrogate in our [new] suffix // and must then do another seek (recurse) } else { upTo++; } } else { upTo++; } } }
internal override int Transition(int absState, int position, int vector) { // null absState should never be passed in if (Debugging.AssertsEnabled) { Debugging.Assert(absState != -1); } // decode absState -> state, offset int state = absState / (m_w + 1); int offset = absState % (m_w + 1); if (Debugging.AssertsEnabled) { Debugging.Assert(offset >= 0); } if (position == m_w) { if (state < 3) { int loc = vector * 3 + state; offset += Unpack(offsetIncrs0, loc, 1); state = Unpack(toStates0, loc, 2) - 1; } } else if (position == m_w - 1) { if (state < 5) { int loc = vector * 5 + state; offset += Unpack(offsetIncrs1, loc, 1); state = Unpack(toStates1, loc, 3) - 1; } } else if (position == m_w - 2) { if (state < 11) { int loc = vector * 11 + state; offset += Unpack(offsetIncrs2, loc, 2); state = Unpack(toStates2, loc, 4) - 1; } } else if (position == m_w - 3) { if (state < 21) { int loc = vector * 21 + state; offset += Unpack(offsetIncrs3, loc, 2); state = Unpack(toStates3, loc, 5) - 1; } } else if (position == m_w - 4) { if (state < 30) { int loc = vector * 30 + state; offset += Unpack(offsetIncrs4, loc, 3); state = Unpack(toStates4, loc, 5) - 1; } } else { if (state < 30) { int loc = vector * 30 + state; offset += Unpack(offsetIncrs5, loc, 3); state = Unpack(toStates5, loc, 5) - 1; } } if (state == -1) { // null state return(-1); } else { // translate back to abs return(state * (m_w + 1) + offset); } }
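// A standalone sketch of the state encoding used by Transition(): an "absolute" state packs the
// parametric state id together with a word offset as absState = state * (w + 1) + offset, so the
// two pieces are recovered with integer division and modulo. Names below are hypothetical.
using System;

internal static class AbsStateEncodingSketch
{
    public static int Encode(int state, int offset, int w) => state * (w + 1) + offset;

    public static (int State, int Offset) Decode(int absState, int w)
        => (absState / (w + 1), absState % (w + 1));

    public static void Main()
    {
        int w = 7;   // word length
        int abs = Encode(state: 4, offset: 3, w: w);
        var (state, offset) = Decode(abs, w);
        Console.WriteLine($"abs={abs} state={state} offset={offset}");   // abs=35 state=4 offset=3
    }
}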
public override SeekStatus SeekCeil(BytesRef term) { if (DEBUG_SURROGATES) { Console.WriteLine("TE.seek target=" + UnicodeUtil.ToHexString(term.Utf8ToString())); } skipNext = false; TermInfosReader tis = outerInstance.TermsDict; Term t0 = new Term(fieldInfo.Name, term); if (Debugging.AssertsEnabled) { Debugging.Assert(termEnum != null); } tis.SeekEnum(termEnum, t0, false); Term t = termEnum.Term(); if (t != null && t.Field == internedFieldName && term.BytesEquals(t.Bytes)) { // If we found an exact match, no need to do the // surrogate dance if (DEBUG_SURROGATES) { Console.WriteLine(" seek exact match"); } current = t.Bytes; return(SeekStatus.FOUND); } else if (t == null || t.Field != internedFieldName) { // TODO: maybe we can handle this like the next() // into null? set term as prevTerm then dance? if (DEBUG_SURROGATES) { Console.WriteLine(" seek hit EOF"); } // We hit EOF; try end-case surrogate dance: if we // find an E, try swapping in S, backwards: scratchTerm.CopyBytes(term); if (Debugging.AssertsEnabled) { Debugging.Assert(scratchTerm.Offset == 0); } for (int i = scratchTerm.Length - 1; i >= 0; i--) { if (IsHighBMPChar(scratchTerm.Bytes, i)) { if (DEBUG_SURROGATES) { Console.WriteLine(" found E pos=" + i + "; try seek"); } if (SeekToNonBMP(seekTermEnum, scratchTerm, i)) { scratchTerm.CopyBytes(seekTermEnum.Term().Bytes); outerInstance.TermsDict.SeekEnum(termEnum, seekTermEnum.Term(), false); newSuffixStart = 1 + i; DoPushes(); // Found a match // TODO: faster seek? current = termEnum.Term().Bytes; return(SeekStatus.NOT_FOUND); } } } if (DEBUG_SURROGATES) { Console.WriteLine(" seek END"); } current = null; return(SeekStatus.END); } else { // We found a non-exact but non-null term; this one // is fun -- just treat it like next, by pretending // requested term was prev: prevTerm.CopyBytes(term); if (DEBUG_SURROGATES) { Console.WriteLine(" seek hit non-exact term=" + UnicodeUtil.ToHexString(t.Text())); } BytesRef br = t.Bytes; if (Debugging.AssertsEnabled) { Debugging.Assert(br.Offset == 0); } SetNewSuffixStart(term, br); SurrogateDance(); Term t2 = termEnum.Term(); if (t2 == null || t2.Field != internedFieldName) { // PreFlex codec interns field names; verify: if (Debugging.AssertsEnabled) { Debugging.Assert(t2 == null || !t2.Field.Equals(internedFieldName, StringComparison.Ordinal)); } current = null; return(SeekStatus.END); } else { current = t2.Bytes; if (Debugging.AssertsEnabled) { Debugging.Assert(!unicodeSortOrder || term.CompareTo(current) < 0, () => "term=" + UnicodeUtil.ToHexString(term.Utf8ToString()) + " vs current=" + UnicodeUtil.ToHexString(current.Utf8ToString())); } return(SeekStatus.NOT_FOUND); } } }
private void Build(State start, State end, UTF8Sequence startUTF8, UTF8Sequence endUTF8, int upto) { // Break into start, middle, end: if (startUTF8.ByteAt(upto) == endUTF8.ByteAt(upto)) { // Degen case: lead with the same byte: if (upto == startUTF8.len - 1 && upto == endUTF8.len - 1) { // Super degen: just single edge, one UTF8 byte: start.AddTransition(new Transition(startUTF8.ByteAt(upto), endUTF8.ByteAt(upto), end)); return; } else { if (Debugging.AssertsEnabled) { Debugging.Assert(startUTF8.len > upto + 1); Debugging.Assert(endUTF8.len > upto + 1); } State n = NewUTF8State(); // Single value leading edge start.AddTransition(new Transition(startUTF8.ByteAt(upto), n)); // type=single // Recurse for the rest Build(n, end, startUTF8, endUTF8, 1 + upto); } } else if (startUTF8.len == endUTF8.len) { if (upto == startUTF8.len - 1) { start.AddTransition(new Transition(startUTF8.ByteAt(upto), endUTF8.ByteAt(upto), end)); // type=startend } else { Start(start, end, startUTF8, upto, false); if (endUTF8.ByteAt(upto) - startUTF8.ByteAt(upto) > 1) { // There is a middle All(start, end, startUTF8.ByteAt(upto) + 1, endUTF8.ByteAt(upto) - 1, startUTF8.len - upto - 1); } End(start, end, endUTF8, upto, false); } } else { // start Start(start, end, startUTF8, upto, true); // possibly middle, spanning multiple num bytes int byteCount = 1 + startUTF8.len - upto; int limit = endUTF8.len - upto; while (byteCount < limit) { // wasteful: we only need first byte, and, we should // statically encode this first byte: tmpUTF8a.Set(startCodes[byteCount - 1]); tmpUTF8b.Set(endCodes[byteCount - 1]); All(start, end, tmpUTF8a.ByteAt(0), tmpUTF8b.ByteAt(0), tmpUTF8a.len - 1); byteCount++; } // end End(start, end, endUTF8, upto, true); } }
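// A standalone illustration of why Build() splits a code-point range into start / middle / end
// pieces: the two endpoints can require different numbers of UTF-8 bytes, so the automaton needs
// one sub-range per byte length in between. Only standard .NET encoding calls are used here.
using System;
using System.Text;

internal static class Utf8RangeDemo
{
    private static string Utf8Hex(int codePoint)
        => BitConverter.ToString(Encoding.UTF8.GetBytes(char.ConvertFromUtf32(codePoint)));

    public static void Main()
    {
        // The range U+007A .. U+1F600 spans 1-, 2-, 3- and 4-byte UTF-8 sequences.
        Console.WriteLine(Utf8Hex(0x007A));    // 7A            (1 byte)
        Console.WriteLine(Utf8Hex(0x0080));    // C2-80         (2 bytes)
        Console.WriteLine(Utf8Hex(0x0800));    // E0-A0-80      (3 bytes)
        Console.WriteLine(Utf8Hex(0x1F600));   // F0-9F-98-80   (4 bytes)
    }
}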
internal void MarkForFullFlush() { DocumentsWriterDeleteQueue flushingQueue; lock (this) { if (Debugging.AssertsEnabled) { Debugging.Assert(!fullFlush, "called DWFC#markForFullFlush() while full flush is still running"); Debugging.Assert(fullFlushBuffer.Count == 0, () => "full flush buffer should be empty: " + fullFlushBuffer); } fullFlush = true; flushingQueue = documentsWriter.deleteQueue; // Set a new delete queue - all subsequent DWPT will use this queue until // we do another full flush DocumentsWriterDeleteQueue newQueue = new DocumentsWriterDeleteQueue(flushingQueue.generation + 1); documentsWriter.deleteQueue = newQueue; } int limit = perThreadPool.NumThreadStatesActive; for (int i = 0; i < limit; i++) { ThreadState next = perThreadPool.GetThreadState(i); next.@Lock(); try { if (!next.IsInitialized) { if (closed && next.IsActive) { perThreadPool.DeactivateThreadState(next); } continue; } if (Debugging.AssertsEnabled) { Debugging.Assert(next.dwpt.deleteQueue == flushingQueue || next.dwpt.deleteQueue == documentsWriter.deleteQueue, () => " flushingQueue: " + flushingQueue + " currentqueue: " + documentsWriter.deleteQueue + " perThread queue: " + next.dwpt.deleteQueue + " numDocsInRam: " + next.dwpt.NumDocsInRAM); } if (next.dwpt.deleteQueue != flushingQueue) { // this one is already a new DWPT continue; } AddFlushableState(next); } finally { next.Unlock(); } } lock (this) { /* make sure we move all DWPT that are where concurrently marked as * pending and moved to blocked are moved over to the flushQueue. There is * a chance that this happens since we marking DWPT for full flush without * blocking indexing.*/ PruneBlockedQueue(flushingQueue); if (Debugging.AssertsEnabled) { Debugging.Assert(AssertBlockedFlushes(documentsWriter.deleteQueue)); } //FlushQueue.AddAll(FullFlushBuffer); foreach (var dwpt in fullFlushBuffer) { flushQueue.Enqueue(dwpt); } fullFlushBuffer.Clear(); UpdateStallState(); } if (Debugging.AssertsEnabled) { Debugging.Assert(AssertActiveDeleteQueue(documentsWriter.deleteQueue)); } }
private IDictionary <int, object> HighlightField(string field, string[] contents, BreakIterator bi, BytesRef[] terms, int[] docids, IList <AtomicReaderContext> leaves, int maxPassages, Query query) { IDictionary <int, object> highlights = new Dictionary <int, object>(); PassageFormatter fieldFormatter = GetFormatter(field); if (fieldFormatter == null) { throw new NullReferenceException("PassageFormatter cannot be null"); } // check if we should do any multiterm processing Analyzer analyzer = GetIndexAnalyzer(field); CharacterRunAutomaton[] automata = Arrays.Empty <CharacterRunAutomaton>(); if (analyzer != null) { automata = MultiTermHighlighting.ExtractAutomata(query, field); } // resize 'terms', where the last term is the multiterm matcher if (automata.Length > 0) { BytesRef[] newTerms = new BytesRef[terms.Length + 1]; System.Array.Copy(terms, 0, newTerms, 0, terms.Length); terms = newTerms; } // we are processing in increasing docid order, so we only need to reinitialize stuff on segment changes // otherwise, we will just advance() existing enums to the new document in the same segment. DocsAndPositionsEnum[] postings = null; TermsEnum termsEnum = null; int lastLeaf = -1; for (int i = 0; i < docids.Length; i++) { string content = contents[i]; if (content.Length == 0) { continue; // nothing to do } bi.SetText(content); int doc = docids[i]; int leaf = ReaderUtil.SubIndex(doc, leaves); AtomicReaderContext subContext = leaves[leaf]; AtomicReader r = subContext.AtomicReader; if (Debugging.AssertsEnabled) { Debugging.Assert(leaf >= lastLeaf); // increasing order } // if the segment has changed, we must initialize new enums. if (leaf != lastLeaf) { Terms t = r.GetTerms(field); if (t != null) { termsEnum = t.GetEnumerator(); postings = new DocsAndPositionsEnum[terms.Length]; } } if (termsEnum == null) { continue; // no terms for this field, nothing to do } // if there are multi-term matches, we have to initialize the "fake" enum for each document if (automata.Length > 0) { DocsAndPositionsEnum dp = MultiTermHighlighting.GetDocsEnum(analyzer.GetTokenStream(field, content), automata); dp.Advance(doc - subContext.DocBase); postings[terms.Length - 1] = dp; // last term is the multiterm matcher } Passage[] passages = HighlightDoc(field, terms, content.Length, bi, doc - subContext.DocBase, termsEnum, postings, maxPassages); if (passages.Length == 0) { // no passages were returned, so ask for a default summary passages = GetEmptyHighlight(field, bi, maxPassages); } if (passages.Length > 0) { highlights[doc] = fieldFormatter.Format(passages, content); } lastLeaf = leaf; } return(highlights); }
public override int NextPosition() { if (lazyProxPointer != -1) { proxIn.Seek(lazyProxPointer); lazyProxPointer = -1; } if (payloadPending && payloadLength > 0) { // payload of last position was never retrieved -- skip it proxIn.Seek(proxIn.Position + payloadLength); // LUCENENET specific: Renamed from getFilePointer() to match FileStream payloadPending = false; } // scan over any docs that were iterated without their positions while (posPendingCount > freq) { int code = proxIn.ReadVInt32(); if (storePayloads) { if ((code & 1) != 0) { // new payload length payloadLength = proxIn.ReadVInt32(); if (Debugging.AssertsEnabled) { Debugging.Assert(payloadLength >= 0); } } if (Debugging.AssertsEnabled) { Debugging.Assert(payloadLength != -1); } } if (storeOffsets) { if ((proxIn.ReadVInt32() & 1) != 0) { // new offset length offsetLength = proxIn.ReadVInt32(); } } if (storePayloads) { proxIn.Seek(proxIn.Position + payloadLength); // LUCENENET specific: Renamed from getFilePointer() to match FileStream } posPendingCount--; position = 0; startOffset = 0; payloadPending = false; //System.out.println("StandardR.D&PE skipPos"); } // read next position if (payloadPending && payloadLength > 0) { // payload wasn't retrieved for last position proxIn.Seek(proxIn.Position + payloadLength); // LUCENENET specific: Renamed from getFilePointer() to match FileStream } int code_ = proxIn.ReadVInt32(); if (storePayloads) { if ((code_ & 1) != 0) { // new payload length payloadLength = proxIn.ReadVInt32(); if (Debugging.AssertsEnabled) { Debugging.Assert(payloadLength >= 0); } } if (Debugging.AssertsEnabled) { Debugging.Assert(payloadLength != -1); } payloadPending = true; code_ = code_.TripleShift(1); } position += code_; if (storeOffsets) { int offsetCode = proxIn.ReadVInt32(); if ((offsetCode & 1) != 0) { // new offset length offsetLength = proxIn.ReadVInt32(); } startOffset += offsetCode.TripleShift(1); } posPendingCount--; if (Debugging.AssertsEnabled) { Debugging.Assert(posPendingCount >= 0, "NextPosition() was called too many times (more than Freq times) posPendingCount={0}", posPendingCount); } //System.out.println("StandardR.D&PE nextPos return pos=" + position); return(position); }
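// A standalone sketch of the position decoding performed above: with payloads enabled, each value
// read from the proximity stream is (positionDelta << 1) | newPayloadLengthFlag, and a set flag
// means a payload-length vint follows. A queue of ints stands in for proxIn.ReadVInt32(); nothing
// here touches the real Lucene40 postings files.
using System;
using System.Collections.Generic;

internal static class ProxDecodeSketch
{
    public static void Main()
    {
        // Encoded stream: delta 5 with a new payload length of 3, then delta 2 reusing that length.
        var stream = new Queue<int>(new[] { (5 << 1) | 1, 3, 2 << 1 });

        int position = 0;
        int payloadLength = -1;
        while (stream.Count > 0)
        {
            int code = stream.Dequeue();
            if ((code & 1) != 0)
            {
                payloadLength = stream.Dequeue();   // low bit set: a new payload length follows
            }
            position += code >> 1;                  // the reader uses TripleShift(1); >> is equivalent for non-negative codes
            Console.WriteLine($"position={position} payloadLength={payloadLength}");
        }
        // Prints: position=5 payloadLength=3, then position=7 payloadLength=3.
    }
}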
/// <summary> /// Safe (but, slowish) default method to write every /// vector field in the document. /// </summary> protected void AddAllDocVectors(Fields vectors, MergeState mergeState) { if (vectors == null) { StartDocument(0); FinishDocument(); return; } int numFields = vectors.Count; if (numFields == -1) { // count manually! TODO: Maybe enforce that Fields.size() returns something valid? numFields = 0; //for (IEnumerator<string> it = vectors.Iterator(); it.hasNext();) foreach (string it in vectors) { numFields++; } } StartDocument(numFields); string lastFieldName = null; TermsEnum termsEnum = null; DocsAndPositionsEnum docsAndPositionsEnum = null; int fieldCount = 0; foreach (string fieldName in vectors) { fieldCount++; FieldInfo fieldInfo = mergeState.FieldInfos.FieldInfo(fieldName); if (Debugging.AssertsEnabled) { Debugging.Assert(lastFieldName == null || fieldName.CompareToOrdinal(lastFieldName) > 0, "lastFieldName={0} fieldName={1}", lastFieldName, fieldName); } lastFieldName = fieldName; Terms terms = vectors.GetTerms(fieldName); if (terms == null) { // FieldsEnum shouldn't lie... continue; } bool hasPositions = terms.HasPositions; bool hasOffsets = terms.HasOffsets; bool hasPayloads = terms.HasPayloads; if (Debugging.AssertsEnabled) { Debugging.Assert(!hasPayloads || hasPositions); } int numTerms = (int)terms.Count; if (numTerms == -1) { // count manually. It is stupid, but needed, as Terms.size() is not a mandatory statistics function numTerms = 0; termsEnum = terms.GetEnumerator(termsEnum); while (termsEnum.MoveNext()) { numTerms++; } } StartField(fieldInfo, numTerms, hasPositions, hasOffsets, hasPayloads); termsEnum = terms.GetEnumerator(termsEnum); int termCount = 0; while (termsEnum.MoveNext()) { termCount++; int freq = (int)termsEnum.TotalTermFreq; StartTerm(termsEnum.Term, freq); if (hasPositions || hasOffsets) { docsAndPositionsEnum = termsEnum.DocsAndPositions(null, docsAndPositionsEnum); if (Debugging.AssertsEnabled) { Debugging.Assert(docsAndPositionsEnum != null); } int docID = docsAndPositionsEnum.NextDoc(); if (Debugging.AssertsEnabled) { Debugging.Assert(docID != DocIdSetIterator.NO_MORE_DOCS); Debugging.Assert(docsAndPositionsEnum.Freq == freq); } for (int posUpto = 0; posUpto < freq; posUpto++) { int pos = docsAndPositionsEnum.NextPosition(); int startOffset = docsAndPositionsEnum.StartOffset; int endOffset = docsAndPositionsEnum.EndOffset; BytesRef payload = docsAndPositionsEnum.GetPayload(); if (Debugging.AssertsEnabled) { Debugging.Assert(!hasPositions || pos >= 0); } AddPosition(pos, startOffset, endOffset, payload); } } FinishTerm(); } if (Debugging.AssertsEnabled) { Debugging.Assert(termCount == numTerms); } FinishField(); } if (Debugging.AssertsEnabled) { Debugging.Assert(fieldCount == numFields); } FinishDocument(); }
public override void Warm(AtomicReader reader) { long startTime = J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond; // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results int indexedCount = 0; int docValuesCount = 0; int normsCount = 0; foreach (FieldInfo info in reader.FieldInfos) { if (info.IsIndexed) { reader.GetTerms(info.Name); indexedCount++; if (info.HasNorms) { reader.GetNormValues(info.Name); normsCount++; } } if (info.HasDocValues) { switch (info.DocValuesType) { case DocValuesType.NUMERIC: reader.GetNumericDocValues(info.Name); break; case DocValuesType.BINARY: reader.GetBinaryDocValues(info.Name); break; case DocValuesType.SORTED: reader.GetSortedDocValues(info.Name); break; case DocValuesType.SORTED_SET: reader.GetSortedSetDocValues(info.Name); break; default: if (Debugging.AssertsEnabled) { Debugging.Assert(false); // unknown dv type } break; } docValuesCount++; } } reader.Document(0); reader.GetTermVectors(0); if (infoStream.IsEnabled("SMSW")) { infoStream.Message("SMSW", "Finished warming segment: " + reader + ", indexed=" + indexedCount + ", docValues=" + docValuesCount + ", norms=" + normsCount + ", time=" + ((J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond) - startTime)); // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results } }
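// A minimal wiring sketch, assuming the standard IndexWriterConfig.MergedSegmentWarmer hook and
// InfoStream.Default: newly merged segments are then warmed by the method above before NRT
// readers start serving them.
private static IndexWriter CreateWarmedWriter(Directory directory, Analyzer analyzer)
{
    IndexWriterConfig config = new IndexWriterConfig(LuceneVersion.LUCENE_48, analyzer);
    config.MergedSegmentWarmer = new SimpleMergedSegmentWarmer(InfoStream.Default);
    return new IndexWriter(directory, config);
}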
// for debugging /* * private String toString(BytesRef b) { * try { * return b.utf8ToString() + " " + b; * } catch (Throwable t) { * return b.toString(); * } * } */ /// <summary> /// It's OK to add the same input twice in a row with /// different outputs, as long as outputs impls the merge /// method. Note that input is fully consumed after this /// method is returned (so caller is free to reuse), but /// output is not. So if your outputs are changeable (eg /// <see cref="ByteSequenceOutputs"/> or /// <see cref="Int32SequenceOutputs"/>) then you cannot reuse across /// calls. /// </summary> public virtual void Add(Int32sRef input, T output) { /* * if (DEBUG) { * BytesRef b = new BytesRef(input.length); * for(int x=0;x<input.length;x++) { * b.bytes[x] = (byte) input.ints[x]; * } * b.length = input.length; * if (output == NO_OUTPUT) { * System.out.println("\nFST ADD: input=" + toString(b) + " " + b); * } else { * System.out.println("\nFST ADD: input=" + toString(b) + " " + b + " output=" + fst.outputs.outputToString(output)); * } * } */ // De-dup NO_OUTPUT since it must be a singleton: if (output.Equals(NO_OUTPUT)) { output = NO_OUTPUT; } if (Debugging.AssertsEnabled) { Debugging.Assert(lastInput.Length == 0 || input.CompareTo(lastInput) >= 0, () => "inputs are added out of order lastInput=" + lastInput + " vs input=" + input); Debugging.Assert(ValidOutput(output)); } //System.out.println("\nadd: " + input); if (input.Length == 0) { // empty input: only allowed as first input. we have // to special case this because the packed FST // format cannot represent the empty input since // 'finalness' is stored on the incoming arc, not on // the node frontier[0].InputCount++; frontier[0].IsFinal = true; fst.EmptyOutput = output; return; } // compare shared prefix length int pos1 = 0; int pos2 = input.Offset; int pos1Stop = Math.Min(lastInput.Length, input.Length); while (true) { frontier[pos1].InputCount++; //System.out.println(" incr " + pos1 + " ct=" + frontier[pos1].inputCount + " n=" + frontier[pos1]); if (pos1 >= pos1Stop || lastInput.Int32s[pos1] != input.Int32s[pos2]) { break; } pos1++; pos2++; } int prefixLenPlus1 = pos1 + 1; if (frontier.Length < input.Length + 1) { UnCompiledNode <T>[] next = new UnCompiledNode <T> [ArrayUtil.Oversize(input.Length + 1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; Array.Copy(frontier, 0, next, 0, frontier.Length); for (int idx = frontier.Length; idx < next.Length; idx++) { next[idx] = new UnCompiledNode <T>(this, idx); } frontier = next; } // minimize/compile states from previous input's // orphan'd suffix DoFreezeTail(prefixLenPlus1); // init tail states for current input for (int idx = prefixLenPlus1; idx <= input.Length; idx++) { frontier[idx - 1].AddArc(input.Int32s[input.Offset + idx - 1], frontier[idx]); frontier[idx].InputCount++; } UnCompiledNode <T> lastNode = frontier[input.Length]; if (lastInput.Length != input.Length || prefixLenPlus1 != input.Length + 1) { lastNode.IsFinal = true; lastNode.Output = NO_OUTPUT; } // push conflicting outputs forward, only as far as // needed for (int idx = 1; idx < prefixLenPlus1; idx++) { UnCompiledNode <T> node = frontier[idx]; UnCompiledNode <T> parentNode = frontier[idx - 1]; T lastOutput = parentNode.GetLastOutput(input.Int32s[input.Offset + idx - 1]); if (Debugging.AssertsEnabled) { Debugging.Assert(ValidOutput(lastOutput)); } T commonOutputPrefix; T wordSuffix; if (!lastOutput.Equals(NO_OUTPUT)) { commonOutputPrefix = fst.Outputs.Common(output, lastOutput); if (Debugging.AssertsEnabled) { 
Debugging.Assert(ValidOutput(commonOutputPrefix)); } wordSuffix = fst.Outputs.Subtract(lastOutput, commonOutputPrefix); if (Debugging.AssertsEnabled) { Debugging.Assert(ValidOutput(wordSuffix)); } parentNode.SetLastOutput(input.Int32s[input.Offset + idx - 1], commonOutputPrefix); node.PrependOutput(wordSuffix); } else { commonOutputPrefix = /*wordSuffix =*/ NO_OUTPUT; // LUCENENET: Removed unnecessary assignment } output = fst.Outputs.Subtract(output, commonOutputPrefix); if (Debugging.AssertsEnabled) { Debugging.Assert(ValidOutput(output)); } } if (lastInput.Length == input.Length && prefixLenPlus1 == 1 + input.Length) { // same input more than 1 time in a row, mapping to // multiple outputs lastNode.Output = fst.Outputs.Merge(lastNode.Output, output); } else { // this new arc is private to this new input; set its // arc output to the leftover output: frontier[prefixLenPlus1 - 1].SetLastOutput(input.Int32s[input.Offset + prefixLenPlus1 - 1], output); } // save last input lastInput.CopyInt32s(input); //System.out.println(" count[0]=" + frontier[0].inputCount); }
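// A hedged end-to-end sketch of the Add() contract documented above: inputs must arrive in
// sorted order, the Int32sRef scratch is reusable after each call, and the outputs here
// (PositiveInt32Outputs) are not changeable, so reuse across calls is safe. The sample data
// and helper names follow the common Lucene.NET FST pattern rather than the original file.
private static FST<long?> BuildSampleFst()
{
    string[] sortedTerms = { "cat", "dog", "dogs" }; // must already be in sorted order
    long[] weights = { 5, 7, 12 };
    PositiveInt32Outputs outputs = PositiveInt32Outputs.Singleton;
    Builder<long?> builder = new Builder<long?>(FST.INPUT_TYPE.BYTE1, outputs);
    Int32sRef scratchInts = new Int32sRef();
    for (int i = 0; i < sortedTerms.Length; i++)
    {
        builder.Add(Util.ToInt32sRef(new BytesRef(sortedTerms[i]), scratchInts), weights[i]);
    }
    return builder.Finish(); // afterwards Util.Get(fst, new BytesRef("dog")) would return 7
}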
public virtual void _run() { for (int iter = 0; iter < NUM_TEST_ITER; iter++) { FieldData field = fields[Random.Next(fields.Length)]; TermsEnum termsEnum = termsDict.GetTerms(field.fieldInfo.Name).GetEnumerator(); #pragma warning disable 612, 618 if (si.Codec is Lucene3xCodec) #pragma warning restore 612, 618 { // code below expects unicode sort order continue; } int upto = 0; // Test straight enum of the terms: while (termsEnum.MoveNext()) { BytesRef term = termsEnum.Term; BytesRef expected = new BytesRef(field.terms[upto++].text2); Assert.IsTrue(expected.BytesEquals(term), "expected=" + expected + " vs actual " + term); } Assert.AreEqual(upto, field.terms.Length); // Test random seek: TermData term2 = field.terms[Random.Next(field.terms.Length)]; TermsEnum.SeekStatus status = termsEnum.SeekCeil(new BytesRef(term2.text2)); Assert.AreEqual(status, TermsEnum.SeekStatus.FOUND); Assert.AreEqual(term2.docs.Length, termsEnum.DocFreq); if (field.omitTF) { this.VerifyDocs(term2.docs, term2.positions, TestUtil.Docs(Random, termsEnum, null, null, DocsFlags.NONE), false); } else { this.VerifyDocs(term2.docs, term2.positions, termsEnum.DocsAndPositions(null, null), true); } // Test random seek by ord: int idx = Random.Next(field.terms.Length); term2 = field.terms[idx]; bool success = false; try { termsEnum.SeekExact(idx); success = true; } catch (Exception uoe) when(uoe.IsUnsupportedOperationException()) { // ok -- skip it } if (success) { Assert.AreEqual(status, TermsEnum.SeekStatus.FOUND); Assert.IsTrue(termsEnum.Term.BytesEquals(new BytesRef(term2.text2))); Assert.AreEqual(term2.docs.Length, termsEnum.DocFreq); if (field.omitTF) { this.VerifyDocs(term2.docs, term2.positions, TestUtil.Docs(Random, termsEnum, null, null, DocsFlags.NONE), false); } else { this.VerifyDocs(term2.docs, term2.positions, termsEnum.DocsAndPositions(null, null), true); } } // Test seek to non-existent terms: if (Verbose) { Console.WriteLine("TEST: seek non-exist terms"); } for (int i = 0; i < 100; i++) { string text2 = TestUtil.RandomUnicodeString(Random) + "."; status = termsEnum.SeekCeil(new BytesRef(text2)); Assert.IsTrue(status == TermsEnum.SeekStatus.NOT_FOUND || status == TermsEnum.SeekStatus.END); } // Seek to each term, backwards: if (Verbose) { Console.WriteLine("TEST: seek terms backwards"); } for (int i = field.terms.Length - 1; i >= 0; i--) { Assert.AreEqual(TermsEnum.SeekStatus.FOUND, termsEnum.SeekCeil(new BytesRef(field.terms[i].text2)), Thread.CurrentThread.Name + ": field=" + field.fieldInfo.Name + " term=" + field.terms[i].text2); Assert.AreEqual(field.terms[i].docs.Length, termsEnum.DocFreq); } // Seek to each term by ord, backwards for (int i = field.terms.Length - 1; i >= 0; i--) { try { termsEnum.SeekExact(i); Assert.AreEqual(field.terms[i].docs.Length, termsEnum.DocFreq); Assert.IsTrue(termsEnum.Term.BytesEquals(new BytesRef(field.terms[i].text2))); } catch (Exception uoe) when(uoe.IsUnsupportedOperationException()) { } } // Seek to non-existent empty-string term status = termsEnum.SeekCeil(new BytesRef("")); Assert.IsNotNull(status); //Assert.AreEqual(TermsEnum.SeekStatus.NOT_FOUND, status); // Make sure we're now pointing to first term Assert.IsTrue(termsEnum.Term.BytesEquals(new BytesRef(field.terms[0].text2))); // Test docs enum termsEnum.SeekCeil(new BytesRef("")); upto = 0; do { term2 = field.terms[upto]; if (Random.Next(3) == 1) { DocsEnum docs; DocsEnum docsAndFreqs; DocsAndPositionsEnum postings; if (!field.omitTF) { postings = termsEnum.DocsAndPositions(null, null); if (postings != null) { 
docs = docsAndFreqs = postings; } else { docs = docsAndFreqs = TestUtil.Docs(Random, termsEnum, null, null, DocsFlags.FREQS); } } else { postings = null; docsAndFreqs = null; docs = TestUtil.Docs(Random, termsEnum, null, null, DocsFlags.NONE); } Assert.IsNotNull(docs); int upto2 = -1; bool ended = false; while (upto2 < term2.docs.Length - 1) { // Maybe skip: int left = term2.docs.Length - upto2; int doc; if (Random.Next(3) == 1 && left >= 1) { int inc = 1 + Random.Next(left - 1); upto2 += inc; if (Random.Next(2) == 1) { doc = docs.Advance(term2.docs[upto2]); Assert.AreEqual(term2.docs[upto2], doc); } else { doc = docs.Advance(1 + term2.docs[upto2]); if (doc == DocIdSetIterator.NO_MORE_DOCS) { // skipped past last doc if (Debugging.AssertsEnabled) { Debugging.Assert(upto2 == term2.docs.Length - 1); } ended = true; break; } else { // skipped to next doc if (Debugging.AssertsEnabled) { Debugging.Assert(upto2 < term2.docs.Length - 1); } if (doc >= term2.docs[1 + upto2]) { upto2++; } } } } else { doc = docs.NextDoc(); Assert.IsTrue(doc != -1); upto2++; } Assert.AreEqual(term2.docs[upto2], doc); if (!field.omitTF) { Assert.AreEqual(term2.positions[upto2].Length, postings.Freq); if (Random.Next(2) == 1) { this.VerifyPositions(term2.positions[upto2], postings); } } } if (!ended) { Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, docs.NextDoc()); } } upto++; } while (termsEnum.MoveNext()); Assert.AreEqual(upto, field.terms.Length); } }
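// A hedged summary sketch of the SeekCeil contract the test above exercises; "terms" is an
// assumed Terms instance for some field.
private static void DescribeSeek(Terms terms, string text)
{
    TermsEnum te = terms.GetEnumerator();
    switch (te.SeekCeil(new BytesRef(text)))
    {
        case TermsEnum.SeekStatus.FOUND:
            // positioned exactly on "text"; Term, DocFreq, Docs/DocsAndPositions are all valid
            break;
        case TermsEnum.SeekStatus.NOT_FOUND:
            // positioned on the smallest term greater than "text"
            break;
        case TermsEnum.SeekStatus.END:
            // every term is smaller than "text"; the enum is exhausted
            break;
    }
}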
public override void AddSortedSetField(FieldInfo field, IEnumerable <BytesRef> values, IEnumerable <long?> docToOrdCount, IEnumerable <long?> ords) { long valueCount = 0; BytesRef lastValue = null; foreach (BytesRef b in values) { if (Debugging.AssertsEnabled) { Debugging.Assert(b != null); } if (Debugging.AssertsEnabled) { Debugging.Assert(b.IsValid()); } if (valueCount > 0) { if (Debugging.AssertsEnabled) { Debugging.Assert(b.CompareTo(lastValue) > 0); } } lastValue = BytesRef.DeepCopyOf(b); valueCount++; } int docCount = 0; long ordCount = 0; Int64BitSet seenOrds = new Int64BitSet(valueCount); using IEnumerator <long?> ordIterator = ords.GetEnumerator(); foreach (long?v in docToOrdCount) { if (Debugging.AssertsEnabled) { Debugging.Assert(v != null); } int count = (int)v.Value; if (Debugging.AssertsEnabled) { Debugging.Assert(count >= 0); } docCount++; ordCount += count; long lastOrd = -1; for (int i = 0; i < count; i++) { ordIterator.MoveNext(); long?o = ordIterator.Current; if (Debugging.AssertsEnabled) { Debugging.Assert(o != null); } long ord = o.Value; if (Debugging.AssertsEnabled) { Debugging.Assert(ord >= 0 && ord < valueCount); } if (Debugging.AssertsEnabled) { Debugging.Assert(ord > lastOrd, "ord={0},lastOrd={1}", ord, lastOrd); } seenOrds.Set(ord); lastOrd = ord; } } if (Debugging.AssertsEnabled) { Debugging.Assert(ordIterator.MoveNext() == false); } if (Debugging.AssertsEnabled) { Debugging.Assert(docCount == maxDoc); } if (Debugging.AssertsEnabled) { Debugging.Assert(seenOrds.Cardinality() == valueCount); } CheckIterator(values.GetEnumerator(), valueCount, false); CheckIterator(docToOrdCount.GetEnumerator(), maxDoc, false); CheckIterator(ords.GetEnumerator(), ordCount, false); @in.AddSortedSetField(field, values, docToOrdCount, ords); }
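// A hedged indexing-side sketch of the data this asserting wrapper later validates: each
// document may add several SortedSetDocValuesField values, and the codec eventually receives a
// sorted, deduplicated value set plus per-document ord counts and strictly ascending ords, as
// checked above. "writer" is an assumed IndexWriter.
private static void AddTaggedDocument(IndexWriter writer)
{
    Document doc = new Document();
    doc.Add(new SortedSetDocValuesField("tags", new BytesRef("lucene")));
    doc.Add(new SortedSetDocValuesField("tags", new BytesRef("search")));
    writer.AddDocument(doc);
}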
public override TopDocs Rescore(IndexSearcher searcher, TopDocs firstPassTopDocs, int topN) { ScoreDoc[] hits = (ScoreDoc[])firstPassTopDocs.ScoreDocs.Clone(); Array.Sort(hits, Comparer <ScoreDoc> .Create((a, b) => a.Doc - b.Doc)); IList <AtomicReaderContext> leaves = searcher.IndexReader.Leaves; Weight weight = searcher.CreateNormalizedWeight(query); // Now merge sort docIDs from hits, with reader's leaves: int hitUpto = 0; int readerUpto = -1; int endDoc = 0; int docBase = 0; Scorer scorer = null; while (hitUpto < hits.Length) { ScoreDoc hit = hits[hitUpto]; int docID = hit.Doc; AtomicReaderContext readerContext = null; while (docID >= endDoc) { readerUpto++; readerContext = leaves[readerUpto]; endDoc = readerContext.DocBase + readerContext.Reader.MaxDoc; } if (readerContext != null) { // We advanced to another segment: docBase = readerContext.DocBase; scorer = weight.GetScorer(readerContext, null); } int targetDoc = docID - docBase; int actualDoc = scorer.DocID; if (actualDoc < targetDoc) { actualDoc = scorer.Advance(targetDoc); } if (actualDoc == targetDoc) { // Query did match this doc: hit.Score = Combine(hit.Score, true, scorer.GetScore()); } else { // Query did not match this doc: if (Debugging.AssertsEnabled) { Debugging.Assert(actualDoc > targetDoc); } hit.Score = Combine(hit.Score, false, 0.0f); } hitUpto++; } // TODO: we should do a partial sort (of only topN) // instead, but typically the number of hits is // smallish: Array.Sort(hits, Comparer <ScoreDoc> .Create((a, b) => { // Sort by score descending, then docID ascending: if (a.Score > b.Score) { return(-1); } else if (a.Score < b.Score) { return(1); } else { // this subtraction can't overflow int // because docIDs are >= 0: return(a.Doc - b.Doc); } })); if (topN < hits.Length) { ScoreDoc[] subset = new ScoreDoc[topN]; Array.Copy(hits, 0, subset, 0, topN); hits = subset; } return(new TopDocs(firstPassTopDocs.TotalHits, hits, hits[0].Score)); }
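// A hedged usage sketch: run a cheap first pass, then rescore only its top hits with a more
// expensive query. The static QueryRescorer.Rescore helper and its (searcher, topDocs, query,
// weight, topN) signature are assumed from the usual Lucene 4.8 API, not taken from this file.
private static TopDocs SearchAndRescore(IndexSearcher searcher, Query cheapQuery, Query expensiveQuery)
{
    TopDocs firstPass = searcher.Search(cheapQuery, 100);
    return QueryRescorer.Rescore(searcher, firstPass, expensiveQuery, 2.0, 10);
}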
// Delete by Term private long ApplyTermDeletes(IEnumerable <Term> termsIter, ReadersAndUpdates rld, SegmentReader reader) { UninterruptableMonitor.Enter(this); try { long delCount = 0; Fields fields = reader.Fields; if (fields == null) { // this reader has no postings return(0); } TermsEnum termsEnum = null; string currentField = null; DocsEnum docs = null; if (Debugging.AssertsEnabled) { Debugging.Assert(CheckDeleteTerm(null)); } bool any = false; //System.out.println(Thread.currentThread().getName() + " del terms reader=" + reader); foreach (Term term in termsIter) { // Since we visit terms sorted, we gain performance // by re-using the same TermsEnum and seeking only // forwards if (!string.Equals(term.Field, currentField, StringComparison.Ordinal)) { if (Debugging.AssertsEnabled) { Debugging.Assert(currentField == null || currentField.CompareToOrdinal(term.Field) < 0); } currentField = term.Field; Terms terms = fields.GetTerms(currentField); if (terms != null) { termsEnum = terms.GetEnumerator(termsEnum); } else { termsEnum = null; } } if (termsEnum == null) { continue; } if (Debugging.AssertsEnabled) { Debugging.Assert(CheckDeleteTerm(term)); } // System.out.println(" term=" + term); if (termsEnum.SeekExact(term.Bytes)) { // we don't need term frequencies for this DocsEnum docsEnum = termsEnum.Docs(rld.LiveDocs, docs, DocsFlags.NONE); //System.out.println("BDS: got docsEnum=" + docsEnum); if (docsEnum != null) { while (true) { int docID = docsEnum.NextDoc(); //System.out.println(Thread.currentThread().getName() + " del term=" + term + " doc=" + docID); if (docID == DocIdSetIterator.NO_MORE_DOCS) { break; } if (!any) { rld.InitWritableLiveDocs(); any = true; } // NOTE: there is no limit check on the docID // when deleting by Term (unlike by Query) // because on flush we apply all Term deletes to // each segment. So all Term deleting here is // against prior segments: if (rld.Delete(docID)) { delCount++; } } } } } return(delCount); } finally { UninterruptableMonitor.Exit(this); } }
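// A hedged sketch of the public entry point whose buffered deletes eventually reach the method
// above: DeleteDocuments records the Term, and the delete is resolved against each segment's
// postings on flush, exactly as the per-segment comment in ApplyTermDeletes notes.
private static void DeleteById(IndexWriter writer, string id)
{
    writer.DeleteDocuments(new Term("id", id));
    writer.Commit(); // forces the buffered delete to be applied
}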
/// <summary> /// Used by near real-time search </summary> internal static DirectoryReader Open(IndexWriter writer, SegmentInfos infos, bool applyAllDeletes) { // IndexWriter synchronizes externally before calling // us, which ensures infos will not change; so there's // no need to process segments in reverse order int numSegments = infos.Count; IList <SegmentReader> readers = new List <SegmentReader>(); Directory dir = writer.Directory; SegmentInfos segmentInfos = (SegmentInfos)infos.Clone(); int infosUpto = 0; bool success = false; try { for (int i = 0; i < numSegments; i++) { // NOTE: important that we use infos not // segmentInfos here, so that we are passing the // actual instance of SegmentInfoPerCommit in // IndexWriter's segmentInfos: SegmentCommitInfo info = infos.Info(i); if (Debugging.AssertsEnabled) { Debugging.Assert(info.Info.Dir == dir); } ReadersAndUpdates rld = writer.readerPool.Get(info, true); try { SegmentReader reader = rld.GetReadOnlyClone(IOContext.READ); if (reader.NumDocs > 0 || writer.KeepFullyDeletedSegments) { // Steal the ref: readers.Add(reader); infosUpto++; } else { reader.DecRef(); segmentInfos.Remove(infosUpto); } } finally { writer.readerPool.Release(rld); } } writer.IncRefDeleter(segmentInfos); StandardDirectoryReader result = new StandardDirectoryReader(dir, readers.ToArray(), writer, segmentInfos, writer.Config.ReaderTermsIndexDivisor, applyAllDeletes); success = true; return(result); } finally { if (!success) { foreach (SegmentReader r in readers) { try { r.DecRef(); } #pragma warning disable 168 catch (Exception th) #pragma warning restore 168 { // ignore any exception that is thrown here to not mask any original // exception. } } } } }
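// A hedged NRT sketch of the public API that funnels into this Open overload: open a reader
// directly from the writer, then refresh it cheaply with OpenIfChanged when the index changes.
private static DirectoryReader RefreshNrtReader(IndexWriter writer, DirectoryReader current)
{
    if (current == null)
    {
        return DirectoryReader.Open(writer, true); // true = apply all deletes
    }
    DirectoryReader updated = DirectoryReader.OpenIfChanged(current, writer, true);
    if (updated != null)
    {
        current.Dispose(); // caller owned the old reader; release it
        return updated;
    }
    return current; // nothing changed
}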