public AssertingAtomicReader(AtomicReader @in)
    : base(@in)
{
    // check some basic reader sanity
    Debug.Assert(@in.MaxDoc() >= 0);
    Debug.Assert(@in.NumDocs() <= @in.MaxDoc());
    Debug.Assert(@in.NumDeletedDocs() + @in.NumDocs() == @in.MaxDoc());
    Debug.Assert([email protected]() || @in.NumDeletedDocs() > 0 && @in.NumDocs() < @in.MaxDoc());
}
/// <summary>
/// Creates a <seealso cref="DocMap"/> instance appropriate for
/// this reader.
/// </summary>
public static DocMap Build(AtomicReader reader)
{
    int maxDoc = reader.MaxDoc();
    if (!reader.HasDeletions())
    {
        return new NoDelDocMap(maxDoc);
    }
    Bits liveDocs = reader.LiveDocs;
    return Build(maxDoc, liveDocs);
}
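// Illustrative only: a stand-alone sketch of the remapping a deletion-aware doc map performs
// (this is not the MergeState.DocMap implementation, and BuildOldToNewMap is a hypothetical
// helper). Surviving docIDs are compacted into a dense range; deleted docIDs map to -1, which
// is why the NoDelDocMap shortcut above is worthwhile when a reader has no deletions.
internal static int[] BuildOldToNewMap(bool[] liveDocs)
{
    var oldToNew = new int[liveDocs.Length];
    int newDocId = 0;
    for (int oldDocId = 0; oldDocId < liveDocs.Length; oldDocId++)
    {
        // deleted documents get no slot after the merge
        oldToNew[oldDocId] = liveDocs[oldDocId] ? newDocId++ : -1;
    }
    return oldToNew;
}
// e.g. liveDocs = { true, false, true, true }  =>  oldToNew = { 0, -1, 1, 2 }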
internal SlowMinShouldMatchScorer(BooleanWeight weight, AtomicReader reader, IndexSearcher searcher)
    : base(weight)
{
    this.Dv = reader.GetSortedSetDocValues("dv");
    this.MaxDoc = reader.MaxDoc();
    BooleanQuery bq = (BooleanQuery)weight.Query;
    this.MinNrShouldMatch = bq.MinimumNumberShouldMatch;
    this.Sims = new SimScorer[(int)Dv.ValueCount];
    foreach (BooleanClause clause in bq.Clauses)
    {
        Debug.Assert(!clause.Prohibited);
        Debug.Assert(!clause.Required);
        Term term = ((TermQuery)clause.Query).Term;
        long ord = Dv.LookupTerm(term.Bytes());
        if (ord >= 0)
        {
            bool success = Ords.Add(ord);
            Debug.Assert(success); // no dups
            TermContext context = TermContext.Build(reader.Context, term);
            SimWeight w = weight.Similarity.ComputeWeight(1f, searcher.CollectionStatistics("field"), searcher.TermStatistics(term, context));
            var dummy = w.ValueForNormalization; // ignored
            w.Normalize(1F, 1F);
            Sims[(int)ord] = weight.Similarity.DoSimScorer(w, (AtomicReaderContext)reader.Context);
        }
    }
}
public AllDeletedFilterReader(AtomicReader @in)
    : base(@in)
{
    LiveDocs_Renamed = new Bits_MatchNoBits(@in.MaxDoc());
    Debug.Assert(MaxDoc() == 0 || HasDeletions());
}
/// <summary>
/// Call this only once (if you subclass!)
/// </summary>
protected internal virtual void Uninvert(AtomicReader reader, Bits liveDocs, BytesRef termPrefix)
{
    FieldInfo info = reader.FieldInfos.FieldInfo(Field);
    if (info != null && info.HasDocValues())
    {
        throw new InvalidOperationException("Type mismatch: " + Field + " was indexed as " + info.DocValuesType);
    }
    //System.out.println("DTO uninvert field=" + field + " prefix=" + termPrefix);
    long startTime = Environment.TickCount; // elapsed ms, used for the timing stats at the end
    Prefix = termPrefix == null ? null : BytesRef.DeepCopyOf(termPrefix);

    int maxDoc = reader.MaxDoc();
    int[] index = new int[maxDoc]; // immediate term numbers, or the index into the byte[] representing the last number
    int[] lastTerm = new int[maxDoc]; // last term we saw for this document
    sbyte[][] bytes = new sbyte[maxDoc][]; // list of term numbers for the doc (delta encoded vInts)

    Fields fields = reader.Fields();
    if (fields == null)
    {
        // No terms
        return;
    }
    Terms terms = fields.Terms(Field);
    if (terms == null)
    {
        // No terms
        return;
    }
    TermsEnum te = terms.Iterator(null);
    BytesRef seekStart = termPrefix != null ? termPrefix : new BytesRef();
    //System.out.println("seekStart=" + seekStart.utf8ToString());
    if (te.SeekCeil(seekStart) == TermsEnum.SeekStatus.END)
    {
        // No terms match
        return;
    }

    // If we need our "term index wrapper", these will be
    // init'd below:
    IList<BytesRef> indexedTerms = null;
    PagedBytes indexedTermsBytes = null;

    bool testedOrd = false;

    // we need a minimum of 9 bytes, but round up to 12 since the space would
    // be wasted with most allocators anyway.
    sbyte[] tempArr = new sbyte[12];

    //
    // enumerate all terms, and build an intermediate form of the un-inverted field.
    //
    // During this intermediate form, every document has a (potential) byte[]
    // and the int[maxDoc()] array either contains the termNumber list directly
    // or the *end* offset of the termNumber list in its byte array (for faster
    // appending and faster creation of the final form).
    //
    // idea... if things are too large while building, we could do a range of docs
    // at a time (but it would be a fair amount slower to build)
    // could also do ranges in parallel to take advantage of multiple CPUs
    //
    // OPTIONAL: remap the largest df terms to the lowest 128 (single byte)
    // values. this requires going over the field first to find the most
    // frequent terms ahead of time.
    int termNum = 0;
    DocsEnum = null;

    // Loop begins with te positioned to first term (we call
    // seek above):
    for (; ; )
    {
        BytesRef t = te.Term();
        if (t == null || (termPrefix != null && !StringHelper.StartsWith(t, termPrefix)))
        {
            break;
        }
        //System.out.println("visit term=" + t.utf8ToString() + " " + t + " termNum=" + termNum);

        if (!testedOrd)
        {
            try
            {
                OrdBase = (int)te.Ord();
                //System.out.println("got ordBase=" + ordBase);
            }
            catch (System.NotSupportedException uoe)
            {
                // Reader cannot provide ord support, so we wrap
                // our own support by creating our own terms index:
                indexedTerms = new List<BytesRef>();
                indexedTermsBytes = new PagedBytes(15);
                //System.out.println("NO ORDS");
            }
            testedOrd = true;
        }

        VisitTerm(te, termNum);

        if (indexedTerms != null && (termNum & IndexIntervalMask) == 0)
        {
            // Index this term
            SizeOfIndexedStrings += t.Length;
            BytesRef indexedTerm = new BytesRef();
            indexedTermsBytes.Copy(t, indexedTerm);
            // TODO: really should 1) strip off useless suffix,
            // and 2) use FST not array/PagedBytes
            indexedTerms.Add(indexedTerm);
        }

        int df = te.DocFreq();
        if (df <= MaxTermDocFreq)
        {
            DocsEnum = te.Docs(liveDocs, DocsEnum, DocsEnum.FLAG_NONE);

            // dF, but takes deletions into account
            int actualDF = 0;

            for (; ; )
            {
                int doc = DocsEnum.NextDoc();
                if (doc == DocIdSetIterator.NO_MORE_DOCS)
                {
                    break;
                }
                //System.out.println(" chunk=" + chunk + " docs");
                actualDF++;
                TermInstances++;

                //System.out.println(" docID=" + doc);
                // add TNUM_OFFSET to the term number to make room for special reserved values:
                // 0 (end term) and 1 (index into byte array follows)
                int delta = termNum - lastTerm[doc] + TNUM_OFFSET;
                lastTerm[doc] = termNum;
                int val = index[doc];

                if ((val & 0xff) == 1)
                {
                    // index into byte array (actually the end of
                    // the doc-specific byte[] when building)
                    int pos = (int)((uint)val >> 8);
                    int ilen = VIntSize(delta);
                    sbyte[] arr = bytes[doc];
                    int newend = pos + ilen;
                    if (newend > arr.Length)
                    {
                        // We avoid a doubling strategy to lower memory usage.
                        // this faceting method isn't for docs with many terms.
                        // In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit boundary.
                        // TODO: figure out what array lengths we can round up to w/o actually using more memory
                        // (how much space does a byte[] take up? Is data preceded by a 32 bit length only?
                        // It should be safe to round up to the nearest 32 bits in any case.
                        int newLen = (newend + 3) & unchecked((int)0xfffffffc); // 4 byte alignment
                        sbyte[] newarr = new sbyte[newLen];
                        Array.Copy(arr, 0, newarr, 0, pos);
                        arr = newarr;
                        bytes[doc] = newarr;
                    }
                    pos = WriteInt(delta, arr, pos);
                    index[doc] = (pos << 8) | 1; // update pointer to end index in byte[]
                }
                else
                {
                    // OK, this int has data in it... find the end (a zero starting byte - not
                    // part of another number, hence not following a byte with the high bit set).
                    int ipos;
                    if (val == 0)
                    {
                        ipos = 0;
                    }
                    else if ((val & 0x0000ff80) == 0)
                    {
                        ipos = 1;
                    }
                    else if ((val & 0x00ff8000) == 0)
                    {
                        ipos = 2;
                    }
                    else if ((val & 0xff800000) == 0)
                    {
                        ipos = 3;
                    }
                    else
                    {
                        ipos = 4;
                    }

                    //System.out.println(" ipos=" + ipos);
                    int endPos = WriteInt(delta, tempArr, ipos);
                    //System.out.println(" endpos=" + endPos);
                    if (endPos <= 4)
                    {
                        //System.out.println(" fits!");
                        // value will fit in the integer... move bytes back
                        for (int j = ipos; j < endPos; j++)
                        {
                            val |= (tempArr[j] & 0xff) << (j << 3);
                        }
                        index[doc] = val;
                    }
                    else
                    {
                        // value won't fit... move integer into byte[]
                        for (int j = 0; j < ipos; j++)
                        {
                            tempArr[j] = (sbyte)val;
                            val = (int)((uint)val >> 8);
                        }
                        // point at the end index in the byte[]
                        index[doc] = (endPos << 8) | 1;
                        bytes[doc] = tempArr;
                        tempArr = new sbyte[12];
                    }
                }
            }
            SetActualDocFreq(termNum, actualDF);
        }

        termNum++;
        if (te.Next() == null)
        {
            break;
        }
    }

    NumTermsInField = termNum;

    long midPoint = Environment.TickCount;

    if (TermInstances == 0)
    {
        // we didn't invert anything
        // lower memory consumption.
        Tnums = null;
    }
    else
    {
        this.Index = index;

        //
        // transform intermediate form into the final form, building a single byte[]
        // at a time, and releasing the intermediate byte[]s as we go to avoid
        // increasing the memory footprint.
        //
        for (int pass = 0; pass < 256; pass++)
        {
            sbyte[] target = Tnums[pass];
            int pos = 0; // end in target;
            if (target != null)
            {
                pos = target.Length;
            }
            else
            {
                target = new sbyte[4096];
            }

            // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx
            // where pp is the pass (which array we are building), and xx is all values.
            // each pass shares the same byte[] for termNumber lists.
            for (int docbase = pass << 16; docbase < maxDoc; docbase += (1 << 24))
            {
                int lim = Math.Min(docbase + (1 << 16), maxDoc);
                for (int doc = docbase; doc < lim; doc++)
                {
                    //System.out.println(" pass=" + pass + " process docID=" + doc);
                    int val = index[doc];
                    if ((val & 0xff) == 1)
                    {
                        int len = (int)((uint)val >> 8);
                        //System.out.println(" ptr pos=" + pos);
                        index[doc] = (pos << 8) | 1; // change index to point to start of array
                        if ((pos & 0xff000000) != 0)
                        {
                            // we only have 24 bits for the array index
                            throw new InvalidOperationException("Too many values for UnInvertedField faceting on field " + Field);
                        }
                        sbyte[] arr = bytes[doc];
                        /*
                        for(byte b : arr) {
                          //System.out.println(" b=" + Integer.toHexString((int) b));
                        }
                        */
                        bytes[doc] = null; // IMPORTANT: allow GC to avoid OOM
                        if (target.Length <= pos + len)
                        {
                            int newlen = target.Length;
                            /***
                             * we don't have to worry about the array getting too large
                             * since the "pos" param will overflow first (only 24 bits available)
                             * if ((newlen<<1) <= 0) {
                             *   // overflow...
                             *   newlen = Integer.MAX_VALUE;
                             *   if (newlen <= pos + len) {
                             *     throw new SolrException(400,"Too many terms to uninvert field!");
                             *   }
                             * } else {
                             *   while (newlen <= pos + len) newlen<<=1;  // doubling strategy
                             * }
                             ***/
                            while (newlen <= pos + len) // doubling strategy
                            {
                                newlen <<= 1;
                            }
                            sbyte[] newtarget = new sbyte[newlen];
                            Array.Copy(target, 0, newtarget, 0, pos);
                            target = newtarget;
                        }
                        Array.Copy(arr, 0, target, pos, len);
                        pos += len + 1; // skip single byte at end and leave it 0 for terminator
                    }
                }
            }

            // shrink array
            if (pos < target.Length)
            {
                sbyte[] newtarget = new sbyte[pos];
                Array.Copy(target, 0, newtarget, 0, pos);
                target = newtarget;
            }

            Tnums[pass] = target;

            if ((pass << 16) > maxDoc)
            {
                break;
            }
        }
    }
    if (indexedTerms != null)
    {
        IndexedTermsArray = indexedTerms.ToArray();
    }

    long endTime = Environment.TickCount;

    Total_time = (int)(endTime - startTime);
    Phase1_time = (int)(midPoint - startTime);
}
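// The intermediate form built above leans on two conventions that are easy to miss in the code:
// term numbers are delta-coded per document with TNUM_OFFSET added so the byte values 0 and 1
// stay reserved (0 terminates a list, 1 marks "index[doc] points into the byte[]"), and each
// delta is written as a variable-length int whose final byte has the high (0x80) bit clear, so
// a 0x00 byte can only ever be a terminator. The following is a minimal, stand-alone sketch of
// that encoding under those assumptions; it is illustrative only and is not the actual
// WriteInt/VIntSize used by DocTermOrds (the TNUM_OFFSET value of 2 is likewise an assumption:
// the smallest offset that clears the two reserved byte values).
using System;
using System.Collections.Generic;

internal static class TermNumberEncodingSketch
{
    private const int TNUM_OFFSET = 2; // assumed: reserves the byte values 0 and 1

    private static void WriteVInt(int x, List<byte> output)
    {
        // Emit higher 7-bit groups first, skipping leading zero groups. Every byte except
        // the last carries the 0x80 continuation bit, so 0x00 can only be a terminator.
        for (int shift = 28; shift > 0; shift -= 7)
        {
            int group = (int)((uint)x >> shift);
            if (group != 0)
            {
                output.Add((byte)((group & 0x7f) | 0x80));
            }
        }
        output.Add((byte)(x & 0x7f));
    }

    internal static byte[] Encode(int[] sortedTermNumbers)
    {
        var output = new List<byte>();
        int previous = 0;
        foreach (int termNum in sortedTermNumbers)
        {
            WriteVInt(termNum - previous + TNUM_OFFSET, output); // delta plus reserved-value offset
            previous = termNum;
        }
        output.Add(0); // list terminator, as in the final Tnums arrays
        return output.ToArray();
    }

    internal static List<int> Decode(byte[] bytes)
    {
        var termNumbers = new List<int>();
        int pos = 0;
        int previous = 0;
        while (bytes[pos] != 0) // stop at the 0x00 terminator
        {
            int delta = 0;
            byte b;
            do
            {
                b = bytes[pos++];
                delta = (delta << 7) | (b & 0x7f);
            } while ((b & 0x80) != 0);
            previous += delta - TNUM_OFFSET; // undo the offset and the delta coding
            termNumbers.Add(previous);
        }
        return termNumbers;
    }

    internal static void Main()
    {
        int[] termNums = { 3, 5, 200, 70000 };
        byte[] encoded = Encode(termNums);
        Console.WriteLine(string.Join(",", Decode(encoded))); // prints 3,5,200,70000
    }
}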
private void Verify(AtomicReader r, int[][] idToOrds, BytesRef[] termsArray, BytesRef prefixRef)
{
    DocTermOrds dto = new DocTermOrds(r, r.LiveDocs, "field", prefixRef, int.MaxValue, TestUtil.NextInt(Random(), 2, 10));

    FieldCache_Fields.Ints docIDToID = FieldCache_Fields.DEFAULT.GetInts(r, "id", false);
    /*
    for(int docID=0;docID<subR.MaxDoc();docID++) {
      System.out.println(" docID=" + docID + " id=" + docIDToID[docID]);
    }
    */

    if (VERBOSE)
    {
        Console.WriteLine("TEST: verify prefix=" + (prefixRef == null ? "null" : prefixRef.Utf8ToString()));
        Console.WriteLine("TEST: all TERMS:");
        TermsEnum allTE = MultiFields.GetTerms(r, "field").Iterator(null);
        int ord = 0;
        while (allTE.Next() != null)
        {
            Console.WriteLine(" ord=" + (ord++) + " term=" + allTE.Term().Utf8ToString());
        }
    }

    //final TermsEnum te = subR.Fields().Terms("field").iterator();
    TermsEnum te = dto.GetOrdTermsEnum(r);
    if (dto.NumTerms() == 0)
    {
        if (prefixRef == null)
        {
            Assert.IsNull(MultiFields.GetTerms(r, "field"));
        }
        else
        {
            Terms terms = MultiFields.GetTerms(r, "field");
            if (terms != null)
            {
                TermsEnum termsEnum = terms.Iterator(null);
                TermsEnum.SeekStatus result = termsEnum.SeekCeil(prefixRef);
                if (result != TermsEnum.SeekStatus.END)
                {
                    Assert.IsFalse(StringHelper.StartsWith(termsEnum.Term(), prefixRef), "term=" + termsEnum.Term().Utf8ToString() + " matches prefix=" + prefixRef.Utf8ToString());
                }
                else
                {
                    // ok
                }
            }
            else
            {
                // ok
            }
        }
        return;
    }

    if (VERBOSE)
    {
        Console.WriteLine("TEST: TERMS:");
        te.SeekExact(0);
        while (true)
        {
            Console.WriteLine(" ord=" + te.Ord() + " term=" + te.Term().Utf8ToString());
            if (te.Next() == null)
            {
                break;
            }
        }
    }

    SortedSetDocValues iter = dto.GetIterator(r);
    for (int docID = 0; docID < r.MaxDoc(); docID++)
    {
        if (VERBOSE)
        {
            Console.WriteLine("TEST: docID=" + docID + " of " + r.MaxDoc() + " (id=" + docIDToID.Get(docID) + ")");
        }
        iter.Document = docID;
        int[] answers = idToOrds[docIDToID.Get(docID)];
        int upto = 0;
        long ord;
        while ((ord = iter.NextOrd()) != SortedSetDocValues.NO_MORE_ORDS)
        {
            te.SeekExact(ord);
            BytesRef expected = termsArray[answers[upto++]];
            if (VERBOSE)
            {
                Console.WriteLine(" exp=" + expected.Utf8ToString() + " actual=" + te.Term().Utf8ToString());
            }
            Assert.AreEqual(expected, te.Term(), "expected=" + expected.Utf8ToString() + " actual=" + te.Term().Utf8ToString() + " ord=" + ord);
        }
        Assert.AreEqual(answers.Length, upto);
    }
}