/// <summary> /// Returns <see cref="DocsAndPositionsEnum"/> for the specified /// field & term, with control over whether offsets and payloads are /// required. Some codecs may be able to optimize /// their implementation when offsets and/or payloads are not /// required. This will return <c>null</c> if the field or term does not /// exist or positions were not indexed. See /// <see cref="TermsEnum.DocsAndPositions(IBits, DocsAndPositionsEnum, DocsAndPositionsFlags)"/>. /// </summary> public static DocsAndPositionsEnum GetTermPositionsEnum(IndexReader r, IBits liveDocs, string field, BytesRef term, DocsAndPositionsFlags flags) { Debug.Assert(field != null); Debug.Assert(term != null); Terms terms = GetTerms(r, field); if (terms != null) { TermsEnum termsEnum = terms.GetIterator(null); if (termsEnum.SeekExact(term)) { return(termsEnum.DocsAndPositions(liveDocs, null, flags)); } } return(null); }
/// <summary> /// Returns <see cref="DocsAndPositionsEnum"/> for the specified /// term. This will return <c>null</c> if the /// field or term does not exist or positions weren't indexed. </summary> /// <seealso cref="TermsEnum.DocsAndPositions(IBits, DocsAndPositionsEnum)"/> public DocsAndPositionsEnum GetTermPositionsEnum(Term term) // LUCENENET specific: Renamed from TermPositionsEnum() { Debug.Assert(term.Field != null); Debug.Assert(term.Bytes != null); Fields fields = Fields; if (fields != null) { Terms terms = fields.GetTerms(term.Field); if (terms != null) { TermsEnum termsEnum = terms.GetIterator(null); if (termsEnum.SeekExact(term.Bytes)) { return(termsEnum.DocsAndPositions(LiveDocs, null)); } } } return(null); }
/// <summary> /// Call this only once (if you subclass!) </summary> protected virtual void Uninvert(AtomicReader reader, IBits liveDocs, BytesRef termPrefix) { FieldInfo info = reader.FieldInfos.FieldInfo(m_field); if (info != null && info.HasDocValues) { throw new InvalidOperationException("Type mismatch: " + m_field + " was indexed as " + info.DocValuesType); } //System.out.println("DTO uninvert field=" + field + " prefix=" + termPrefix); long startTime = Environment.TickCount; m_prefix = termPrefix == null ? null : BytesRef.DeepCopyOf(termPrefix); int maxDoc = reader.MaxDoc; int[] index = new int[maxDoc]; // immediate term numbers, or the index into the byte[] representing the last number int[] lastTerm = new int[maxDoc]; // last term we saw for this document var bytes = new sbyte[maxDoc][]; // list of term numbers for the doc (delta encoded vInts) Fields fields = reader.Fields; if (fields == null) { // No terms return; } Terms terms = fields.GetTerms(m_field); if (terms == null) { // No terms return; } TermsEnum te = terms.GetIterator(null); BytesRef seekStart = termPrefix != null ? termPrefix : new BytesRef(); //System.out.println("seekStart=" + seekStart.utf8ToString()); if (te.SeekCeil(seekStart) == TermsEnum.SeekStatus.END) { // No terms match return; } // If we need our "term index wrapper", these will be // init'd below: IList <BytesRef> indexedTerms = null; PagedBytes indexedTermsBytes = null; bool testedOrd = false; // we need a minimum of 9 bytes, but round up to 12 since the space would // be wasted with most allocators anyway. var tempArr = new sbyte[12]; // // enumerate all terms, and build an intermediate form of the un-inverted field. // // During this intermediate form, every document has a (potential) byte[] // and the int[maxDoc()] array either contains the termNumber list directly // or the *end* offset of the termNumber list in it's byte array (for faster // appending and faster creation of the final form). // // idea... if things are too large while building, we could do a range of docs // at a time (but it would be a fair amount slower to build) // could also do ranges in parallel to take advantage of multiple CPUs // OPTIONAL: remap the largest df terms to the lowest 128 (single byte) // values. this requires going over the field first to find the most // frequent terms ahead of time. int termNum = 0; m_docsEnum = null; // Loop begins with te positioned to first term (we call // seek above): for (; ;) { BytesRef t = te.Term; if (t == null || (termPrefix != null && !StringHelper.StartsWith(t, termPrefix))) { break; } //System.out.println("visit term=" + t.utf8ToString() + " " + t + " termNum=" + termNum); if (!testedOrd) { try { m_ordBase = (int)te.Ord; //System.out.println("got ordBase=" + ordBase); } #pragma warning disable 168 catch (NotSupportedException uoe) #pragma warning restore 168 { // Reader cannot provide ord support, so we wrap // our own support by creating our own terms index: indexedTerms = new List <BytesRef>(); indexedTermsBytes = new PagedBytes(15); //System.out.println("NO ORDS"); } testedOrd = true; } VisitTerm(te, termNum); if (indexedTerms != null && (termNum & indexIntervalMask) == 0) { // Index this term m_sizeOfIndexedStrings += t.Length; BytesRef indexedTerm = new BytesRef(); indexedTermsBytes.Copy(t, indexedTerm); // TODO: really should 1) strip off useless suffix, // and 2) use FST not array/PagedBytes indexedTerms.Add(indexedTerm); } int df = te.DocFreq; if (df <= m_maxTermDocFreq) { m_docsEnum = te.Docs(liveDocs, m_docsEnum, DocsFlags.NONE); // dF, but takes deletions into account int actualDF = 0; for (; ;) { int doc = m_docsEnum.NextDoc(); if (doc == DocIdSetIterator.NO_MORE_DOCS) { break; } //System.out.println(" chunk=" + chunk + " docs"); actualDF++; m_termInstances++; //System.out.println(" docID=" + doc); // add TNUM_OFFSET to the term number to make room for special reserved values: // 0 (end term) and 1 (index into byte array follows) int delta = termNum - lastTerm[doc] + TNUM_OFFSET; lastTerm[doc] = termNum; int val = index[doc]; if ((val & 0xff) == 1) { // index into byte array (actually the end of // the doc-specific byte[] when building) int pos = (int)((uint)val >> 8); int ilen = VInt32Size(delta); var arr = bytes[doc]; int newend = pos + ilen; if (newend > arr.Length) { // We avoid a doubling strategy to lower memory usage. // this faceting method isn't for docs with many terms. // In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit boundary. // TODO: figure out what array lengths we can round up to w/o actually using more memory // (how much space does a byte[] take up? Is data preceded by a 32 bit length only? // It should be safe to round up to the nearest 32 bits in any case. int newLen = (newend + 3) & unchecked ((int)0xfffffffc); // 4 byte alignment var newarr = new sbyte[newLen]; Array.Copy(arr, 0, newarr, 0, pos); arr = newarr; bytes[doc] = newarr; } pos = WriteInt32(delta, arr, pos); index[doc] = (pos << 8) | 1; // update pointer to end index in byte[] } else { // OK, this int has data in it... find the end (a zero starting byte - not // part of another number, hence not following a byte with the high bit set). int ipos; if (val == 0) { ipos = 0; } else if ((val & 0x0000ff80) == 0) { ipos = 1; } else if ((val & 0x00ff8000) == 0) { ipos = 2; } else if ((val & 0xff800000) == 0) { ipos = 3; } else { ipos = 4; } //System.out.println(" ipos=" + ipos); int endPos = WriteInt32(delta, tempArr, ipos); //System.out.println(" endpos=" + endPos); if (endPos <= 4) { //System.out.println(" fits!"); // value will fit in the integer... move bytes back for (int j = ipos; j < endPos; j++) { val |= (tempArr[j] & 0xff) << (j << 3); } index[doc] = val; } else { // value won't fit... move integer into byte[] for (int j = 0; j < ipos; j++) { tempArr[j] = (sbyte)val; val = (int)((uint)val >> 8); } // point at the end index in the byte[] index[doc] = (endPos << 8) | 1; bytes[doc] = tempArr; tempArr = new sbyte[12]; } } } SetActualDocFreq(termNum, actualDF); } termNum++; if (te.Next() == null) { break; } } m_numTermsInField = termNum; long midPoint = Environment.TickCount; if (m_termInstances == 0) { // we didn't invert anything // lower memory consumption. m_tnums = null; } else { this.m_index = index; // // transform intermediate form into the final form, building a single byte[] // at a time, and releasing the intermediate byte[]s as we go to avoid // increasing the memory footprint. // for (int pass = 0; pass < 256; pass++) { var target = m_tnums[pass]; var pos = 0; // end in target; if (target != null) { pos = target.Length; } else { target = new sbyte[4096]; } // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx // where pp is the pass (which array we are building), and xx is all values. // each pass shares the same byte[] for termNumber lists. for (int docbase = pass << 16; docbase < maxDoc; docbase += (1 << 24)) { int lim = Math.Min(docbase + (1 << 16), maxDoc); for (int doc = docbase; doc < lim; doc++) { //System.out.println(" pass="******" process docID=" + doc); int val = index[doc]; if ((val & 0xff) == 1) { int len = (int)((uint)val >> 8); //System.out.println(" ptr pos=" + pos); index[doc] = (pos << 8) | 1; // change index to point to start of array if ((pos & 0xff000000) != 0) { // we only have 24 bits for the array index throw new InvalidOperationException("Too many values for UnInvertedField faceting on field " + m_field); } var arr = bytes[doc]; /* * for(byte b : arr) { * //System.out.println(" b=" + Integer.toHexString((int) b)); * } */ bytes[doc] = null; // IMPORTANT: allow GC to avoid OOM if (target.Length <= pos + len) { int newlen = target.Length; //* we don't have to worry about the array getting too large // since the "pos" param will overflow first (only 24 bits available) // if ((newlen<<1) <= 0) { // // overflow... // newlen = Integer.MAX_VALUE; // if (newlen <= pos + len) { // throw new SolrException(400,"Too many terms to uninvert field!"); // } // } else { // while (newlen <= pos + len) newlen<<=1; // doubling strategy // } // while (newlen <= pos + len) // doubling strategy { newlen <<= 1; } var newtarget = new sbyte[newlen]; Array.Copy(target, 0, newtarget, 0, pos); target = newtarget; } Array.Copy(arr, 0, target, pos, len); pos += len + 1; // skip single byte at end and leave it 0 for terminator } } } // shrink array if (pos < target.Length) { var newtarget = new sbyte[pos]; Array.Copy(target, 0, newtarget, 0, pos); target = newtarget; } m_tnums[pass] = target; if ((pass << 16) > maxDoc) { break; } } } if (indexedTerms != null) { m_indexedTermsArray = new BytesRef[indexedTerms.Count]; indexedTerms.CopyTo(m_indexedTermsArray, 0); } long endTime = Environment.TickCount; m_total_time = (int)(endTime - startTime); m_phase1_time = (int)(midPoint - startTime); }
// DocValues updates private void ApplyDocValuesUpdates <T1>(IEnumerable <T1> updates, ReadersAndUpdates rld, SegmentReader reader, DocValuesFieldUpdates.Container dvUpdatesContainer) where T1 : DocValuesUpdate { lock (this) { Fields fields = reader.Fields; if (fields == null) { // this reader has no postings return; } // TODO: we can process the updates per DV field, from last to first so that // if multiple terms affect same document for the same field, we add an update // only once (that of the last term). To do that, we can keep a bitset which // marks which documents have already been updated. So e.g. if term T1 // updates doc 7, and then we process term T2 and it updates doc 7 as well, // we don't apply the update since we know T1 came last and therefore wins // the update. // We can also use that bitset as 'liveDocs' to pass to TermEnum.docs(), so // that these documents aren't even returned. string currentField = null; TermsEnum termsEnum = null; DocsEnum docs = null; //System.out.println(Thread.currentThread().getName() + " numericDVUpdate reader=" + reader); foreach (DocValuesUpdate update in updates) { Term term = update.term; int limit = update.docIDUpto; // TODO: we traverse the terms in update order (not term order) so that we // apply the updates in the correct order, i.e. if two terms udpate the // same document, the last one that came in wins, irrespective of the // terms lexical order. // we can apply the updates in terms order if we keep an updatesGen (and // increment it with every update) and attach it to each NumericUpdate. Note // that we cannot rely only on docIDUpto because an app may send two updates // which will get same docIDUpto, yet will still need to respect the order // those updates arrived. if (!string.Equals(term.Field, currentField, StringComparison.Ordinal)) { // if we change the code to process updates in terms order, enable this assert // assert currentField == null || currentField.CompareToOrdinal(term.Field) < 0; currentField = term.Field; Terms terms = fields.GetTerms(currentField); if (terms != null) { termsEnum = terms.GetIterator(termsEnum); } else { termsEnum = null; continue; // no terms in that field } } if (termsEnum == null) { continue; } // System.out.println(" term=" + term); if (termsEnum.SeekExact(term.Bytes)) { // we don't need term frequencies for this DocsEnum docsEnum = termsEnum.Docs(rld.LiveDocs, docs, DocsFlags.NONE); //System.out.println("BDS: got docsEnum=" + docsEnum); DocValuesFieldUpdates dvUpdates = dvUpdatesContainer.GetUpdates(update.field, update.type); if (dvUpdates == null) { dvUpdates = dvUpdatesContainer.NewUpdates(update.field, update.type, reader.MaxDoc); } int doc; while ((doc = docsEnum.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { //System.out.println(Thread.currentThread().getName() + " numericDVUpdate term=" + term + " doc=" + docID); if (doc >= limit) { break; // no more docs that can be updated for this term } dvUpdates.Add(doc, update.value); } } } } }
// Delete by Term private long ApplyTermDeletes(IEnumerable <Term> termsIter, ReadersAndUpdates rld, SegmentReader reader) { lock (this) { long delCount = 0; Fields fields = reader.Fields; if (fields == null) { // this reader has no postings return(0); } TermsEnum termsEnum = null; string currentField = null; DocsEnum docs = null; Debug.Assert(CheckDeleteTerm(null)); bool any = false; //System.out.println(Thread.currentThread().getName() + " del terms reader=" + reader); foreach (Term term in termsIter) { // Since we visit terms sorted, we gain performance // by re-using the same TermsEnum and seeking only // forwards if (!string.Equals(term.Field, currentField, StringComparison.Ordinal)) { Debug.Assert(currentField == null || currentField.CompareToOrdinal(term.Field) < 0); currentField = term.Field; Terms terms = fields.GetTerms(currentField); if (terms != null) { termsEnum = terms.GetIterator(termsEnum); } else { termsEnum = null; } } if (termsEnum == null) { continue; } Debug.Assert(CheckDeleteTerm(term)); // System.out.println(" term=" + term); if (termsEnum.SeekExact(term.Bytes)) { // we don't need term frequencies for this DocsEnum docsEnum = termsEnum.Docs(rld.LiveDocs, docs, DocsFlags.NONE); //System.out.println("BDS: got docsEnum=" + docsEnum); if (docsEnum != null) { while (true) { int docID = docsEnum.NextDoc(); //System.out.println(Thread.currentThread().getName() + " del term=" + term + " doc=" + docID); if (docID == DocIdSetIterator.NO_MORE_DOCS) { break; } if (!any) { rld.InitWritableLiveDocs(); any = true; } // NOTE: there is no limit check on the docID // when deleting by Term (unlike by Query) // because on flush we apply all Term deletes to // each segment. So all Term deleting here is // against prior segments: if (rld.Delete(docID)) { delCount++; } } } } } return(delCount); } }
public override TermsEnum GetIterator(TermsEnum reuse) { return(m_input.GetIterator(reuse)); }