public override void SetUp() { base.SetUp(); _dir = NewDirectory(); _indexWriter = new RandomIndexWriter(Random(), _dir, new MockAnalyzer(Random()), Similarity, TimeZone); FieldType ft = new FieldType(TextField.TYPE_STORED); ft.StoreTermVectors = true; ft.StoreTermVectorOffsets = true; ft.StoreTermVectorPositions = true; Analyzer analyzer = new MockAnalyzer(Random()); Document doc; for (int i = 0; i < 100; i++) { doc = new Document(); doc.Add(new Field(_idFieldName, Random().ToString(), ft)); doc.Add(new Field(_textFieldName, new StringBuilder(Random().ToString()).Append(Random().ToString()).Append(Random().ToString()).ToString(), ft)); doc.Add(new Field(_classFieldName, Random().ToString(), ft)); _indexWriter.AddDocument(doc, analyzer); } _indexWriter.Commit(); _originalIndex = SlowCompositeReaderWrapper.Wrap(_indexWriter.Reader); }
public override void SetUp() { base.SetUp(); Document doc; Rd1 = NewDirectory(); IndexWriter iw1 = new IndexWriter(Rd1, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random()))); doc = new Document(); doc.Add(NewTextField("field1", "the quick brown fox jumps", Field.Store.YES)); doc.Add(NewTextField("field2", "the quick brown fox jumps", Field.Store.YES)); iw1.AddDocument(doc); iw1.Dispose(); Rd2 = NewDirectory(); IndexWriter iw2 = new IndexWriter(Rd2, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random()))); doc = new Document(); doc.Add(NewTextField("field1", "the fox jumps over the lazy dog", Field.Store.YES)); doc.Add(NewTextField("field3", "the fox jumps over the lazy dog", Field.Store.YES)); iw2.AddDocument(doc); iw2.Dispose(); this.Ir1 = SlowCompositeReaderWrapper.Wrap(DirectoryReader.Open(Rd1)); this.Ir2 = SlowCompositeReaderWrapper.Wrap(DirectoryReader.Open(Rd2)); }
/// <summary> /// called only from static open() methods </summary> internal StandardDirectoryReader(Directory directory, AtomicReader[] readers, IndexWriter writer, SegmentInfos sis, int termInfosIndexDivisor, bool applyAllDeletes) : base(directory, readers) { this.Writer = writer; this.SegmentInfos = sis; this.TermInfosIndexDivisor = termInfosIndexDivisor; this.ApplyAllDeletes = applyAllDeletes; }
/// <summary> /// Creates a new <seealso cref="AtomicReaderContext"/> /// </summary> internal AtomicReaderContext(CompositeReaderContext parent, AtomicReader reader, int ord, int docBase, int leafOrd, int leafDocBase) : base(parent, ord, docBase) { this.Ord = leafOrd; this.DocBase = leafDocBase; this.reader = reader; this.leaves = IsTopLevel ? new[] { this } : null; //LUCENE TO-DO suspicious }
/// <summary> /// Get the wrapped instance by <code>reader</code> as long as this reader is /// an instance of <seealso cref="FilterAtomicReader"/>. /// </summary> public static AtomicReader Unwrap(AtomicReader reader) { while (reader is FilterAtomicReader) { reader = ((FilterAtomicReader)reader).@in; } return reader; }
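// A minimal usage sketch (hypothetical helper, not part of the sources above; "leafReader" is any AtomicReader you already hold):
// Unwrap peels off every FilterAtomicReader layer, for example an AssertingAtomicReader wrapper, and returns the innermost reader.
private static AtomicReader InnermostReader(AtomicReader leafReader)
{
    AtomicReader wrapped = new AssertingAtomicReader(leafReader); // any FilterAtomicReader subclass would do
    return FilterAtomicReader.Unwrap(wrapped); // returns the original leafReader instance
}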
public AssertingAtomicReader(AtomicReader @in) : base(@in) { // check some basic reader sanity Debug.Assert(@in.MaxDoc >= 0); Debug.Assert(@in.NumDocs <= @in.MaxDoc); Debug.Assert(@in.NumDeletedDocs + @in.NumDocs == @in.MaxDoc); Debug.Assert(!@in.HasDeletions || @in.NumDeletedDocs > 0 && @in.NumDocs < @in.MaxDoc); }
public AssertingAtomicReader(AtomicReader @in) : base(@in) { // check some basic reader sanity Debug.Assert(@in.MaxDoc() >= 0); Debug.Assert(@in.NumDocs() <= @in.MaxDoc()); Debug.Assert(@in.NumDeletedDocs() + @in.NumDocs() == @in.MaxDoc()); Debug.Assert(!@in.HasDeletions() || @in.NumDeletedDocs() > 0 && @in.NumDocs() < @in.MaxDoc()); }
public CachedDistanceFunctionValue(AtomicReader reader, ShapeFieldCacheDistanceValueSource enclosingInstance) { cache = enclosingInstance.provider.GetCache(reader); this.enclosingInstance = enclosingInstance; from = enclosingInstance.from; calculator = enclosingInstance.ctx.GetDistCalc(); nullValue = (enclosingInstance.ctx.IsGeo() ? 180 : double.MaxValue); }
public override void Warm(AtomicReader reader) { long startTime = Environment.TickCount; /* DateTime.Now.Millisecond only yields the 0-999 ms component and cannot measure elapsed time */ int indexedCount = 0; int docValuesCount = 0; int normsCount = 0; foreach (FieldInfo info in reader.FieldInfos) { if (info.Indexed) { reader.Terms(info.Name); indexedCount++; if (info.HasNorms()) { reader.GetNormValues(info.Name); normsCount++; } } if (info.HasDocValues()) { switch (info.DocValuesType) { case DocValuesType_e.NUMERIC: reader.GetNumericDocValues(info.Name); break; case DocValuesType_e.BINARY: reader.GetBinaryDocValues(info.Name); break; case DocValuesType_e.SORTED: reader.GetSortedDocValues(info.Name); break; case DocValuesType_e.SORTED_SET: reader.GetSortedSetDocValues(info.Name); break; default: Debug.Assert(false); // unknown dv type break; } docValuesCount++; } } reader.Document(0); reader.GetTermVectors(0); if (InfoStream.IsEnabled("SMSW")) { InfoStream.Message("SMSW", "Finished warming segment: " + reader + ", indexed=" + indexedCount + ", docValues=" + docValuesCount + ", norms=" + normsCount + ", time=" + (Environment.TickCount - startTime)); } }
/// <summary> /// Creates a <seealso cref="DocMap"/> instance appropriate for /// this reader. /// </summary> public static DocMap Build(AtomicReader reader) { int maxDoc = reader.MaxDoc(); if (!reader.HasDeletions()) { return new NoDelDocMap(maxDoc); } Bits liveDocs = reader.LiveDocs; return Build(maxDoc, liveDocs); }
/// <summary> /// Creates this, pulling doc values from the specified /// field. /// </summary> public DefaultSortedSetDocValuesReaderState(IndexReader reader, string field = FacetsConfig.DEFAULT_INDEX_FIELD_NAME) { this.field = field; this.origReader = reader; // We need this to create thread-safe MultiSortedSetDV // per collector: topReader = SlowCompositeReaderWrapper.Wrap(reader); SortedSetDocValues dv = topReader.GetSortedSetDocValues(field); if (dv == null) { throw new System.ArgumentException("field \"" + field + "\" was not indexed with SortedSetDocValues"); } if (dv.ValueCount > int.MaxValue) { throw new System.ArgumentException("can only handle valueCount < Integer.MAX_VALUE; got " + dv.ValueCount); } valueCount = (int)dv.ValueCount; // TODO: we can make this more efficient if eg we can be // "involved" when IOrdinalMap is being created? Ie see // each term/ord it's assigning as it goes... string lastDim = null; int startOrd = -1; // TODO: this approach can work for full hierarchy?; // TaxoReader can't do this since ords are not in // "sorted order" ... but we should generalize this to // support arbitrary hierarchy: for (int ord = 0; ord < valueCount; ord++) { BytesRef term = new BytesRef(); dv.LookupOrd(ord, term); string[] components = FacetsConfig.StringToPath(term.Utf8ToString()); if (components.Length != 2) { throw new System.ArgumentException("this class can only handle 2 level hierarchy (dim/value); got: " + Arrays.ToString(components) + " " + term.Utf8ToString()); } if (!components[0].Equals(lastDim)) { if (lastDim != null) { prefixToOrdRange[lastDim] = new OrdRange(startOrd, ord - 1); } startOrd = ord; lastDim = components[0]; } } if (lastDim != null) { prefixToOrdRange[lastDim] = new OrdRange(startOrd, valueCount - 1); } }
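// A hedged usage sketch (the "indexReader"/"searcher" variables and the FacetsCollector/SortedSetDocValuesFacetCounts classes from the facet module are assumptions, not taken from the snippet above):
// build the reader state once per IndexReader and reuse it to count dim/value facets.
var state = new DefaultSortedSetDocValuesReaderState(indexReader); // uses FacetsConfig.DEFAULT_INDEX_FIELD_NAME
FacetsCollector fc = new FacetsCollector();
searcher.Search(new MatchAllDocsQuery(), fc);
Facets facets = new SortedSetDocValuesFacetCounts(state, fc);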
public DistanceFunctionValue(DistanceValueSource outerInstance, AtomicReader reader) { this.outerInstance = outerInstance; ptX = FieldCache.DEFAULT.GetDoubles(reader, outerInstance.strategy.FieldNameX, true); ptY = FieldCache.DEFAULT.GetDoubles(reader, outerInstance.strategy.FieldNameY, true); validX = FieldCache.DEFAULT.GetDocsWithField(reader, outerInstance.strategy.FieldNameX); validY = FieldCache.DEFAULT.GetDocsWithField(reader, outerInstance.strategy.FieldNameY); from = outerInstance.from; calculator = outerInstance.strategy.SpatialContext.DistCalc; nullValue = (outerInstance.strategy.SpatialContext.IsGeo ? 180 * outerInstance.multiplier : double.MaxValue); }
public virtual DocsAndPositionsEnum GetDocsAndPositions(AtomicReader reader, BytesRef bytes, Bits liveDocs) { Terms terms = reader.Terms(FieldName); if (terms != null) { TermsEnum te = terms.Iterator(null); if (te.SeekExact(bytes)) { return te.DocsAndPositions(liveDocs, null); } } return null; }
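// A hedged usage sketch ("owner" standing in for whichever object exposes GetDocsAndPositions above, plus "leaf" and "liveDocs", are assumptions):
// walk every matching document and position of a single term.
DocsAndPositionsEnum dpe = owner.GetDocsAndPositions(leaf, new BytesRef("lucene"), liveDocs);
if (dpe != null)
{
    while (dpe.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
    {
        for (int i = 0; i < dpe.Freq(); i++)
        {
            int position = dpe.NextPosition(); // positions come back in order for the current document
        }
    }
}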
public DistanceFunctionValue(DistanceValueSource enclosingInstance, AtomicReader reader) { this.enclosingInstance = enclosingInstance; ptX = FieldCache.DEFAULT.GetDoubles(reader, enclosingInstance.strategy.FieldNameX, true); ptY = FieldCache.DEFAULT.GetDoubles(reader, enclosingInstance.strategy.FieldNameY, true); validX = FieldCache.DEFAULT.GetDocsWithField(reader, enclosingInstance.strategy.FieldNameX); validY = FieldCache.DEFAULT.GetDocsWithField(reader, enclosingInstance.strategy.FieldNameY); from = enclosingInstance.from; calculator = enclosingInstance.strategy.SpatialContext.GetDistCalc(); nullValue = (enclosingInstance.strategy.SpatialContext.IsGeo() ? 180 : double.MaxValue); }
public BBoxSimilarityValueSourceFunctionValue(AtomicReader reader, BBoxSimilarityValueSource enclosingInstance) { _enclosingInstance = enclosingInstance; rect = _enclosingInstance.strategy.SpatialContext.MakeRectangle(0, 0, 0, 0); //reused minX = FieldCache.DEFAULT.GetDoubles(reader, enclosingInstance.strategy.field_minX, true); minY = FieldCache.DEFAULT.GetDoubles(reader, enclosingInstance.strategy.field_minY, true); maxX = FieldCache.DEFAULT.GetDoubles(reader, enclosingInstance.strategy.field_maxX, true); maxY = FieldCache.DEFAULT.GetDoubles(reader, enclosingInstance.strategy.field_maxY, true); validMinX = FieldCache.DEFAULT.GetDocsWithField(reader, enclosingInstance.strategy.field_minX); validMaxX = FieldCache.DEFAULT.GetDocsWithField(reader, enclosingInstance.strategy.field_maxX); }
public override void SetUp() { base.SetUp(); directory = NewDirectory(); RandomIndexWriter writer = new RandomIndexWriter(Random(), directory, new MockAnalyzer(Random(), MockTokenizer.WHITESPACE, false), Similarity, TimeZone); AddDoc(writer, @"admin guest", @"010", @"20040101", @"Y"); AddDoc(writer, @"guest", @"020", @"20040101", @"Y"); AddDoc(writer, @"guest", @"020", @"20050101", @"Y"); AddDoc(writer, @"admin", @"020", @"20050101", @"Maybe"); AddDoc(writer, @"admin guest", @"030", @"20050101", @"N"); reader = SlowCompositeReaderWrapper.Wrap(writer.Reader); writer.Dispose(); }
private FixedBitSet CorrectBits(AtomicReader reader, Bits acceptDocs) { FixedBitSet bits = new FixedBitSet(reader.MaxDoc); //assume all are INvalid Terms terms = reader.Fields.Terms(fieldName); if (terms == null) { return bits; } TermsEnum termsEnum = terms.Iterator(null); DocsEnum docs = null; while (true) { BytesRef currTerm = termsEnum.Next(); if (currTerm == null) { break; } else { docs = termsEnum.Docs(acceptDocs, docs, DocsEnum.FLAG_NONE); int doc = docs.NextDoc(); if (doc != DocIdSetIterator.NO_MORE_DOCS) { if (keepMode == KeepMode.KM_USE_FIRST_OCCURRENCE) { bits.Set(doc); } else { int lastDoc = doc; while (true) { lastDoc = doc; doc = docs.NextDoc(); if (doc == DocIdSetIterator.NO_MORE_DOCS) { break; } } bits.Set(lastDoc); } } } } return bits; }
public static void CheckNorms(AtomicReader reader) { // test omit norms for (int i = 0; i < DocHelper.Fields.Length; i++) { IndexableField f = DocHelper.Fields[i]; if (f.FieldType().Indexed) { Assert.AreEqual(reader.GetNormValues(f.Name()) != null, !f.FieldType().OmitNorms); Assert.AreEqual(reader.GetNormValues(f.Name()) != null, !DocHelper.NoNorms.ContainsKey(f.Name())); if (reader.GetNormValues(f.Name()) == null) { // test for norms of null NumericDocValues norms = MultiDocValues.GetNormValues(reader, f.Name()); Assert.IsNull(norms); } } } }
public static void AssertSplit(AtomicReader originalIndex, double testRatio, double crossValidationRatio, params string[] fieldNames) { BaseDirectoryWrapper trainingIndex = NewDirectory(); BaseDirectoryWrapper testIndex = NewDirectory(); BaseDirectoryWrapper crossValidationIndex = NewDirectory(); try { DatasetSplitter datasetSplitter = new DatasetSplitter(testRatio, crossValidationRatio); datasetSplitter.Split(originalIndex, trainingIndex, testIndex, crossValidationIndex, new MockAnalyzer(Random()), fieldNames); NotNull(trainingIndex); NotNull(testIndex); NotNull(crossValidationIndex); DirectoryReader trainingReader = DirectoryReader.Open(trainingIndex); True((int)(originalIndex.MaxDoc * (1d - testRatio - crossValidationRatio)) == trainingReader.MaxDoc); DirectoryReader testReader = DirectoryReader.Open(testIndex); True((int)(originalIndex.MaxDoc * testRatio) == testReader.MaxDoc); DirectoryReader cvReader = DirectoryReader.Open(crossValidationIndex); True((int)(originalIndex.MaxDoc * crossValidationRatio) == cvReader.MaxDoc); trainingReader.Dispose(); testReader.Dispose(); cvReader.Dispose(); CloseQuietly(trainingReader); CloseQuietly(testReader); CloseQuietly(cvReader); } finally { trainingIndex.Dispose(); testIndex.Dispose(); crossValidationIndex.Dispose(); } }
protected override DocIdSet DocIdSetToCache(DocIdSet docIdSet, AtomicReader reader) { if (docIdSet == null) { return EMPTY_DOCIDSET; } if (docIdSet is FixedBitSet) { // this is different from CachingWrapperFilter: even when the DocIdSet is // cacheable, we convert it to a FixedBitSet since we require all the // cached filters to be FixedBitSets return docIdSet; } DocIdSetIterator it = docIdSet.GetIterator(); if (it == null) { return EMPTY_DOCIDSET; } FixedBitSet copy = new FixedBitSet(reader.MaxDoc); copy.Or(it); return copy; }
/// <summary> /// Call this only once (if you subclass!) </summary> protected virtual void Uninvert(AtomicReader reader, IBits liveDocs, BytesRef termPrefix) { FieldInfo info = reader.FieldInfos.FieldInfo(m_field); if (info != null && info.HasDocValues) { throw IllegalStateException.Create("Type mismatch: " + m_field + " was indexed as " + info.DocValuesType); } //System.out.println("DTO uninvert field=" + field + " prefix=" + termPrefix); long startTime = J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond; // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results m_prefix = termPrefix == null ? null : BytesRef.DeepCopyOf(termPrefix); int maxDoc = reader.MaxDoc; int[] index = new int[maxDoc]; // immediate term numbers, or the index into the byte[] representing the last number int[] lastTerm = new int[maxDoc]; // last term we saw for this document var bytes = new sbyte[maxDoc][]; // list of term numbers for the doc (delta encoded vInts) Fields fields = reader.Fields; if (fields == null) { // No terms return; } Terms terms = fields.GetTerms(m_field); if (terms == null) { // No terms return; } TermsEnum te = terms.GetEnumerator(); BytesRef seekStart = termPrefix ?? new BytesRef(); //System.out.println("seekStart=" + seekStart.utf8ToString()); if (te.SeekCeil(seekStart) == TermsEnum.SeekStatus.END) { // No terms match return; } // If we need our "term index wrapper", these will be // init'd below: IList <BytesRef> indexedTerms = null; PagedBytes indexedTermsBytes = null; bool testedOrd = false; // we need a minimum of 9 bytes, but round up to 12 since the space would // be wasted with most allocators anyway. var tempArr = new sbyte[12]; // // enumerate all terms, and build an intermediate form of the un-inverted field. // // During this intermediate form, every document has a (potential) byte[] // and the int[maxDoc()] array either contains the termNumber list directly // or the *end* offset of the termNumber list in it's byte array (for faster // appending and faster creation of the final form). // // idea... if things are too large while building, we could do a range of docs // at a time (but it would be a fair amount slower to build) // could also do ranges in parallel to take advantage of multiple CPUs // OPTIONAL: remap the largest df terms to the lowest 128 (single byte) // values. this requires going over the field first to find the most // frequent terms ahead of time. 
int termNum = 0; m_docsEnum = null; // Loop begins with te positioned to first term (we call // seek above): for (; ;) { BytesRef t = te.Term; if (t == null || (termPrefix != null && !StringHelper.StartsWith(t, termPrefix))) { break; } //System.out.println("visit term=" + t.utf8ToString() + " " + t + " termNum=" + termNum); if (!testedOrd) { try { m_ordBase = (int)te.Ord; //System.out.println("got ordBase=" + ordBase); } catch (Exception uoe) when(uoe.IsUnsupportedOperationException()) { // Reader cannot provide ord support, so we wrap // our own support by creating our own terms index: indexedTerms = new JCG.List <BytesRef>(); indexedTermsBytes = new PagedBytes(15); //System.out.println("NO ORDS"); } testedOrd = true; } VisitTerm(te, termNum); if (indexedTerms != null && (termNum & indexIntervalMask) == 0) { // Index this term m_sizeOfIndexedStrings += t.Length; BytesRef indexedTerm = new BytesRef(); indexedTermsBytes.Copy(t, indexedTerm); // TODO: really should 1) strip off useless suffix, // and 2) use FST not array/PagedBytes indexedTerms.Add(indexedTerm); } int df = te.DocFreq; if (df <= m_maxTermDocFreq) { m_docsEnum = te.Docs(liveDocs, m_docsEnum, DocsFlags.NONE); // dF, but takes deletions into account int actualDF = 0; for (; ;) { int doc = m_docsEnum.NextDoc(); if (doc == DocIdSetIterator.NO_MORE_DOCS) { break; } //System.out.println(" chunk=" + chunk + " docs"); actualDF++; m_termInstances++; //System.out.println(" docID=" + doc); // add TNUM_OFFSET to the term number to make room for special reserved values: // 0 (end term) and 1 (index into byte array follows) int delta = termNum - lastTerm[doc] + TNUM_OFFSET; lastTerm[doc] = termNum; int val = index[doc]; if ((val & 0xff) == 1) { // index into byte array (actually the end of // the doc-specific byte[] when building) int pos = val.TripleShift(8); int ilen = VInt32Size(delta); var arr = bytes[doc]; int newend = pos + ilen; if (newend > arr.Length) { // We avoid a doubling strategy to lower memory usage. // this faceting method isn't for docs with many terms. // In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit boundary. // TODO: figure out what array lengths we can round up to w/o actually using more memory // (how much space does a byte[] take up? Is data preceded by a 32 bit length only? // It should be safe to round up to the nearest 32 bits in any case. int newLen = (newend + 3) & unchecked ((int)0xfffffffc); // 4 byte alignment var newarr = new sbyte[newLen]; Array.Copy(arr, 0, newarr, 0, pos); arr = newarr; bytes[doc] = newarr; } pos = WriteInt32(delta, arr, pos); index[doc] = (pos << 8) | 1; // update pointer to end index in byte[] } else { // OK, this int has data in it... find the end (a zero starting byte - not // part of another number, hence not following a byte with the high bit set). int ipos; if (val == 0) { ipos = 0; } else if ((val & 0x0000ff80) == 0) { ipos = 1; } else if ((val & 0x00ff8000) == 0) { ipos = 2; } else if ((val & 0xff800000) == 0) { ipos = 3; } else { ipos = 4; } //System.out.println(" ipos=" + ipos); int endPos = WriteInt32(delta, tempArr, ipos); //System.out.println(" endpos=" + endPos); if (endPos <= 4) { //System.out.println(" fits!"); // value will fit in the integer... move bytes back for (int j = ipos; j < endPos; j++) { val |= (tempArr[j] & 0xff) << (j << 3); } index[doc] = val; } else { // value won't fit... 
move integer into byte[] for (int j = 0; j < ipos; j++) { tempArr[j] = (sbyte)val; val = val.TripleShift(8); } // point at the end index in the byte[] index[doc] = (endPos << 8) | 1; bytes[doc] = tempArr; tempArr = new sbyte[12]; } } } SetActualDocFreq(termNum, actualDF); } termNum++; if (!te.MoveNext()) { break; } } m_numTermsInField = termNum; long midPoint = J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond; // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results if (m_termInstances == 0) { // we didn't invert anything // lower memory consumption. m_tnums = null; } else { this.m_index = index; // // transform intermediate form into the final form, building a single byte[] // at a time, and releasing the intermediate byte[]s as we go to avoid // increasing the memory footprint. // for (int pass = 0; pass < 256; pass++) { var target = m_tnums[pass]; var pos = 0; // end in target; if (target != null) { pos = target.Length; } else { target = new sbyte[4096]; } // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx // where pp is the pass (which array we are building), and xx is all values. // each pass shares the same byte[] for termNumber lists. for (int docbase = pass << 16; docbase < maxDoc; docbase += (1 << 24)) { int lim = Math.Min(docbase + (1 << 16), maxDoc); for (int doc = docbase; doc < lim; doc++) { //System.out.println(" pass="******" process docID=" + doc); int val = index[doc]; if ((val & 0xff) == 1) { int len = val.TripleShift(8); //System.out.println(" ptr pos=" + pos); index[doc] = (pos << 8) | 1; // change index to point to start of array if ((pos & 0xff000000) != 0) { // we only have 24 bits for the array index throw IllegalStateException.Create("Too many values for UnInvertedField faceting on field " + m_field); } var arr = bytes[doc]; /* * for(byte b : arr) { * //System.out.println(" b=" + Integer.toHexString((int) b)); * } */ bytes[doc] = null; // IMPORTANT: allow GC to avoid OOM if (target.Length <= pos + len) { int newlen = target.Length; //* we don't have to worry about the array getting too large // since the "pos" param will overflow first (only 24 bits available) // if ((newlen<<1) <= 0) { // // overflow... // newlen = Integer.MAX_VALUE; // if (newlen <= pos + len) { // throw new SolrException(400,"Too many terms to uninvert field!"); // } // } else { // while (newlen <= pos + len) newlen<<=1; // doubling strategy // } // while (newlen <= pos + len) // doubling strategy { newlen <<= 1; } var newtarget = new sbyte[newlen]; Array.Copy(target, 0, newtarget, 0, pos); target = newtarget; } Array.Copy(arr, 0, target, pos, len); pos += len + 1; // skip single byte at end and leave it 0 for terminator } } } // shrink array if (pos < target.Length) { var newtarget = new sbyte[pos]; Array.Copy(target, 0, newtarget, 0, pos); target = newtarget; } m_tnums[pass] = target; if ((pass << 16) > maxDoc) { break; } } } if (indexedTerms != null) { m_indexedTermsArray = new BytesRef[indexedTerms.Count]; indexedTerms.CopyTo(m_indexedTermsArray, 0); } long endTime = J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond; // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results m_total_time = (int)(endTime - startTime); m_phase1_time = (int)(midPoint - startTime); }
public ThreadAnonymousInnerClassHelper(TestDocValuesWithThreads outerInstance, IList <long?> numbers, IList <BytesRef> binary, IList <BytesRef> sorted, int numDocs, AtomicReader ar, CountdownEvent startingGun, Random threadRandom) { this.OuterInstance = outerInstance; this.Numbers = numbers; this.Binary = binary; this.Sorted = sorted; this.NumDocs = numDocs; this.Ar = ar; this.StartingGun = startingGun; this.ThreadRandom = threadRandom; }
public virtual void TestNumericField() { using Directory dir = NewDirectory(); DirectoryReader r = null; try { var numDocs = AtLeast(500); var answers = new Number[numDocs]; using (var w = new RandomIndexWriter(Random, dir)) { NumericType[] typeAnswers = new NumericType[numDocs]; for (int id = 0; id < numDocs; id++) { Document doc = new Document(); Field nf; Field sf; Number answer; NumericType typeAnswer; if (Random.NextBoolean()) { // float/double if (Random.NextBoolean()) { float f = Random.NextSingle(); answer = Single.GetInstance(f); nf = new SingleField("nf", f, Field.Store.NO); sf = new StoredField("nf", f); typeAnswer = NumericType.SINGLE; } else { double d = Random.NextDouble(); answer = Double.GetInstance(d); nf = new DoubleField("nf", d, Field.Store.NO); sf = new StoredField("nf", d); typeAnswer = NumericType.DOUBLE; } } else { // int/long if (Random.NextBoolean()) { int i = Random.Next(); answer = Int32.GetInstance(i); nf = new Int32Field("nf", i, Field.Store.NO); sf = new StoredField("nf", i); typeAnswer = NumericType.INT32; } else { long l = Random.NextInt64(); answer = Int64.GetInstance(l); nf = new Int64Field("nf", l, Field.Store.NO); sf = new StoredField("nf", l); typeAnswer = NumericType.INT64; } } doc.Add(nf); doc.Add(sf); answers[id] = answer; typeAnswers[id] = typeAnswer; FieldType ft = new FieldType(Int32Field.TYPE_STORED); ft.NumericPrecisionStep = int.MaxValue; doc.Add(new Int32Field("id", id, ft)); w.AddDocument(doc); } r = w.GetReader(); } // w.Dispose(); Assert.AreEqual(numDocs, r.NumDocs); foreach (AtomicReaderContext ctx in r.Leaves) { AtomicReader sub = ctx.AtomicReader; FieldCache.Int32s ids = FieldCache.DEFAULT.GetInt32s(sub, "id", false); for (int docID = 0; docID < sub.NumDocs; docID++) { Document doc = sub.Document(docID); Field f = doc.GetField <Field>("nf"); Assert.IsTrue(f is StoredField, "got f=" + f); #pragma warning disable 612, 618 Assert.AreEqual(answers[ids.Get(docID)], f.GetNumericValue()); #pragma warning restore 612, 618 } } } finally { r?.Dispose(); } }
internal Iterator(DocTermOrds outerInstance, AtomicReader reader) { this.outerInstance = outerInstance; this.reader = reader; this.te = GetTermsEnum(); }
/// <summary> /// Construct a <see cref="FilterAtomicReader"/> based on the specified base reader. /// <para/> /// Note that base reader is closed if this <see cref="FilterAtomicReader"/> is closed. /// </summary> /// <param name="input"> specified base reader. </param> public FilterAtomicReader(AtomicReader input) : base() { this.m_input = input; input.RegisterParentReader(this); }
private void Verify(AtomicReader r, int[][] idToOrds, BytesRef[] termsArray, BytesRef prefixRef) { DocTermOrds dto = new DocTermOrds(r, r.LiveDocs, "field", prefixRef, int.MaxValue, TestUtil.NextInt32(Random, 2, 10)); FieldCache.Int32s docIDToID = FieldCache.DEFAULT.GetInt32s(r, "id", false); /* * for(int docID=0;docID<subR.MaxDoc;docID++) { * System.out.println(" docID=" + docID + " id=" + docIDToID[docID]); * } */ if (Verbose) { Console.WriteLine("TEST: verify prefix=" + (prefixRef == null ? "null" : prefixRef.Utf8ToString())); Console.WriteLine("TEST: all TERMS:"); TermsEnum allTE = MultiFields.GetTerms(r, "field").GetIterator(null); int ord = 0; while (allTE.Next() != null) { Console.WriteLine(" ord=" + (ord++) + " term=" + allTE.Term.Utf8ToString()); } } //final TermsEnum te = subR.Fields.Terms("field").iterator(); TermsEnum te = dto.GetOrdTermsEnum(r); if (dto.NumTerms == 0) { if (prefixRef == null) { Assert.IsNull(MultiFields.GetTerms(r, "field")); } else { Terms terms = MultiFields.GetTerms(r, "field"); if (terms != null) { TermsEnum termsEnum = terms.GetIterator(null); TermsEnum.SeekStatus result = termsEnum.SeekCeil(prefixRef); if (result != TermsEnum.SeekStatus.END) { Assert.IsFalse(StringHelper.StartsWith(termsEnum.Term, prefixRef), "term=" + termsEnum.Term.Utf8ToString() + " matches prefix=" + prefixRef.Utf8ToString()); } else { // ok } } else { // ok } } return; } if (Verbose) { Console.WriteLine("TEST: TERMS:"); te.SeekExact(0); while (true) { Console.WriteLine(" ord=" + te.Ord + " term=" + te.Term.Utf8ToString()); if (te.Next() == null) { break; } } } SortedSetDocValues iter = dto.GetIterator(r); for (int docID = 0; docID < r.MaxDoc; docID++) { if (Verbose) { Console.WriteLine("TEST: docID=" + docID + " of " + r.MaxDoc + " (id=" + docIDToID.Get(docID) + ")"); } iter.SetDocument(docID); int[] answers = idToOrds[docIDToID.Get(docID)]; int upto = 0; long ord; while ((ord = iter.NextOrd()) != SortedSetDocValues.NO_MORE_ORDS) { te.SeekExact(ord); BytesRef expected = termsArray[answers[upto++]]; if (Verbose) { Console.WriteLine(" exp=" + expected.Utf8ToString() + " actual=" + te.Term.Utf8ToString()); } Assert.AreEqual(expected, te.Term, "expected=" + expected.Utf8ToString() + " actual=" + te.Term.Utf8ToString() + " ord=" + ord); } Assert.AreEqual(answers.Length, upto); } }
/// <summary> /// Wrap one of the parent <see cref="DirectoryReader"/>'s subreaders </summary> /// <param name="reader"> the subreader to wrap </param> /// <returns> a wrapped/filtered <see cref="AtomicReader"/> </returns> public abstract AtomicReader Wrap(AtomicReader reader);
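// A hedged sketch of one possible implementation (the class name and the use of AssertingAtomicReader are assumptions, not taken from the sources above):
// a FilterDirectoryReader.SubReaderWrapper that wraps every leaf it is handed; identity implementations of Wrap appear further below.
private sealed class AssertingSubReaderWrapper : FilterDirectoryReader.SubReaderWrapper
{
    public override AtomicReader Wrap(AtomicReader reader)
    {
        return new AssertingAtomicReader(reader);
    }
}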
public virtual void TestManyReopensAndFields() { Directory dir = NewDirectory(); Random random = Random(); IndexWriterConfig conf = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)); LogMergePolicy lmp = NewLogMergePolicy(); lmp.MergeFactor = 3; // merge often conf.SetMergePolicy(lmp); IndexWriter writer = new IndexWriter(dir, conf); bool isNRT = random.NextBoolean(); DirectoryReader reader; if (isNRT) { reader = DirectoryReader.Open(writer, true); } else { writer.Commit(); reader = DirectoryReader.Open(dir); } int numFields = random.Next(4) + 3; // 3-7 int numNDVFields = random.Next(numFields / 2) + 1; // 1-3 long[] fieldValues = new long[numFields]; bool[] fieldHasValue = new bool[numFields]; Arrays.Fill(fieldHasValue, true); for (int i = 0; i < fieldValues.Length; i++) { fieldValues[i] = 1; } int numRounds = AtLeast(15); int docID = 0; for (int i = 0; i < numRounds; i++) { int numDocs = AtLeast(5); // System.out.println("[" + Thread.currentThread().getName() + "]: round=" + i + ", numDocs=" + numDocs); for (int j = 0; j < numDocs; j++) { Document doc = new Document(); doc.Add(new StringField("id", "doc-" + docID, Store.NO)); doc.Add(new StringField("key", "all", Store.NO)); // update key // add all fields with their current value for (int f = 0; f < fieldValues.Length; f++) { if (f < numNDVFields) { doc.Add(new NumericDocValuesField("f" + f, fieldValues[f])); } else { doc.Add(new BinaryDocValuesField("f" + f, TestBinaryDocValuesUpdates.ToBytes(fieldValues[f]))); } } writer.AddDocument(doc); ++docID; } // if field's value was unset before, unset it from all new added documents too for (int field = 0; field < fieldHasValue.Length; field++) { if (!fieldHasValue[field]) { if (field < numNDVFields) { writer.UpdateNumericDocValue(new Term("key", "all"), "f" + field, null); } else { writer.UpdateBinaryDocValue(new Term("key", "all"), "f" + field, null); } } } int fieldIdx = random.Next(fieldValues.Length); string updateField = "f" + fieldIdx; if (random.NextBoolean()) { // System.out.println("[" + Thread.currentThread().getName() + "]: unset field '" + updateField + "'"); fieldHasValue[fieldIdx] = false; if (fieldIdx < numNDVFields) { writer.UpdateNumericDocValue(new Term("key", "all"), updateField, null); } else { writer.UpdateBinaryDocValue(new Term("key", "all"), updateField, null); } } else { fieldHasValue[fieldIdx] = true; if (fieldIdx < numNDVFields) { writer.UpdateNumericDocValue(new Term("key", "all"), updateField, ++fieldValues[fieldIdx]); } else { writer.UpdateBinaryDocValue(new Term("key", "all"), updateField, TestBinaryDocValuesUpdates.ToBytes(++fieldValues[fieldIdx])); } // System.out.println("[" + Thread.currentThread().getName() + "]: updated field '" + updateField + "' to value " + fieldValues[fieldIdx]); } if (random.NextDouble() < 0.2) { int deleteDoc = random.Next(docID); // might also delete an already deleted document, ok! 
writer.DeleteDocuments(new Term("id", "doc-" + deleteDoc)); // System.out.println("[" + Thread.currentThread().getName() + "]: deleted document: doc-" + deleteDoc); } // verify reader if (!isNRT) { writer.Commit(); } // System.out.println("[" + Thread.currentThread().getName() + "]: reopen reader: " + reader); DirectoryReader newReader = DirectoryReader.OpenIfChanged(reader); Assert.IsNotNull(newReader); reader.Dispose(); reader = newReader; // System.out.println("[" + Thread.currentThread().getName() + "]: reopened reader: " + reader); Assert.IsTrue(reader.NumDocs > 0); // we delete at most one document per round BytesRef scratch = new BytesRef(); foreach (AtomicReaderContext context in reader.Leaves) { AtomicReader r = context.AtomicReader; // System.out.println(((SegmentReader) r).getSegmentName()); IBits liveDocs = r.LiveDocs; for (int field = 0; field < fieldValues.Length; field++) { string f = "f" + field; BinaryDocValues bdv = r.GetBinaryDocValues(f); NumericDocValues ndv = r.GetNumericDocValues(f); IBits docsWithField = r.GetDocsWithField(f); if (field < numNDVFields) { Assert.IsNotNull(ndv); Assert.IsNull(bdv); } else { Assert.IsNull(ndv); Assert.IsNotNull(bdv); } int maxDoc = r.MaxDoc; for (int doc = 0; doc < maxDoc; doc++) { if (liveDocs == null || liveDocs.Get(doc)) { // System.out.println("doc=" + (doc + context.DocBase) + " f='" + f + "' vslue=" + getValue(bdv, doc, scratch)); if (fieldHasValue[field]) { Assert.IsTrue(docsWithField.Get(doc)); if (field < numNDVFields) { Assert.AreEqual(fieldValues[field], ndv.Get(doc), "invalid value for doc=" + doc + ", field=" + f + ", reader=" + r); } else { Assert.AreEqual(fieldValues[field], TestBinaryDocValuesUpdates.GetValue(bdv, doc, scratch), "invalid value for doc=" + doc + ", field=" + f + ", reader=" + r); } } else { Assert.IsFalse(docsWithField.Get(doc)); } } } } } // System.out.println(); } IOUtils.Dispose(writer, reader, dir); }
[Test, LongRunningTest, HasTimeout] // LUCENENET TODO: Can this test be optimized to run faster on .NET Core 1.0? public virtual void TestTonsOfUpdates() { // LUCENE-5248: make sure that when there are many updates, we don't use too much RAM Directory dir = NewDirectory(); Random random = Random(); IndexWriterConfig conf = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)); conf.SetRAMBufferSizeMB(IndexWriterConfig.DEFAULT_RAM_BUFFER_SIZE_MB); conf.SetMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH); // don't flush by doc IndexWriter writer = new IndexWriter(dir, conf); // test data: lots of documents (few 10Ks) and lots of update terms (few hundreds) int numDocs = AtLeast(20000); int numBinaryFields = AtLeast(5); int numTerms = TestUtil.NextInt(random, 10, 100); // terms should affect many docs HashSet <string> updateTerms = new HashSet <string>(); while (updateTerms.Count < numTerms) { updateTerms.Add(TestUtil.RandomSimpleString(random)); } // System.out.println("numDocs=" + numDocs + " numBinaryFields=" + numBinaryFields + " numTerms=" + numTerms); // build a large index with many BDV fields and update terms for (int i = 0; i < numDocs; i++) { Document doc = new Document(); int numUpdateTerms = TestUtil.NextInt(random, 1, numTerms / 10); for (int j = 0; j < numUpdateTerms; j++) { doc.Add(new StringField("upd", RandomInts.RandomFrom(random, updateTerms), Store.NO)); } for (int j = 0; j < numBinaryFields; j++) { long val = random.Next(); doc.Add(new BinaryDocValuesField("f" + j, TestBinaryDocValuesUpdates.ToBytes(val))); doc.Add(new NumericDocValuesField("cf" + j, val * 2)); } writer.AddDocument(doc); } writer.Commit(); // commit so there's something to apply to // set to flush every 2048 bytes (approximately every 12 updates), so we get // many flushes during binary updates writer.Config.SetRAMBufferSizeMB(2048.0 / 1024 / 1024); int numUpdates = AtLeast(100); // System.out.println("numUpdates=" + numUpdates); for (int i = 0; i < numUpdates; i++) { int field = random.Next(numBinaryFields); Term updateTerm = new Term("upd", RandomInts.RandomFrom(random, updateTerms)); long value = random.Next(); writer.UpdateBinaryDocValue(updateTerm, "f" + field, TestBinaryDocValuesUpdates.ToBytes(value)); writer.UpdateNumericDocValue(updateTerm, "cf" + field, value * 2); } writer.Dispose(); DirectoryReader reader = DirectoryReader.Open(dir); BytesRef scratch = new BytesRef(); foreach (AtomicReaderContext context in reader.Leaves) { for (int i = 0; i < numBinaryFields; i++) { AtomicReader r = context.AtomicReader; BinaryDocValues f = r.GetBinaryDocValues("f" + i); NumericDocValues cf = r.GetNumericDocValues("cf" + i); for (int j = 0; j < r.MaxDoc; j++) { Assert.AreEqual(cf.Get(j), TestBinaryDocValuesUpdates.GetValue(f, j, scratch) * 2, "reader=" + r + ", field=f" + i + ", doc=" + j); } } } reader.Dispose(); dir.Dispose(); }
public virtual void TestStressMultiThreading() { Directory dir = NewDirectory(); IndexWriterConfig conf = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())); IndexWriter writer = new IndexWriter(dir, conf); // create index int numThreads = TestUtil.NextInt(Random(), 3, 6); int numDocs = AtLeast(2000); for (int i = 0; i < numDocs; i++) { Document doc = new Document(); doc.Add(new StringField("id", "doc" + i, Store.NO)); double group = Random().NextDouble(); string g; if (group < 0.1) { g = "g0"; } else if (group < 0.5) { g = "g1"; } else if (group < 0.8) { g = "g2"; } else { g = "g3"; } doc.Add(new StringField("updKey", g, Store.NO)); for (int j = 0; j < numThreads; j++) { long value = Random().Next(); doc.Add(new BinaryDocValuesField("f" + j, TestBinaryDocValuesUpdates.ToBytes(value))); doc.Add(new NumericDocValuesField("cf" + j, value * 2)); // control, always updated to f * 2 } writer.AddDocument(doc); } CountdownEvent done = new CountdownEvent(numThreads); AtomicInt32 numUpdates = new AtomicInt32(AtLeast(100)); // same thread updates a field as well as reopens ThreadClass[] threads = new ThreadClass[numThreads]; for (int i = 0; i < threads.Length; i++) { string f = "f" + i; string cf = "cf" + i; threads[i] = new ThreadAnonymousInnerClassHelper(this, "UpdateThread-" + i, writer, numDocs, done, numUpdates, f, cf); } foreach (ThreadClass t in threads) { t.Start(); } done.Wait(); writer.Dispose(); DirectoryReader reader = DirectoryReader.Open(dir); BytesRef scratch = new BytesRef(); foreach (AtomicReaderContext context in reader.Leaves) { AtomicReader r = context.AtomicReader; for (int i = 0; i < numThreads; i++) { BinaryDocValues bdv = r.GetBinaryDocValues("f" + i); NumericDocValues control = r.GetNumericDocValues("cf" + i); IBits docsWithBdv = r.GetDocsWithField("f" + i); IBits docsWithControl = r.GetDocsWithField("cf" + i); IBits liveDocs = r.LiveDocs; for (int j = 0; j < r.MaxDoc; j++) { if (liveDocs == null || liveDocs.Get(j)) { Assert.AreEqual(docsWithBdv.Get(j), docsWithControl.Get(j)); if (docsWithBdv.Get(j)) { long ctrlValue = control.Get(j); long bdvValue = TestBinaryDocValuesUpdates.GetValue(bdv, j, scratch) * 2; // if (ctrlValue != bdvValue) { // System.out.println("seg=" + r + ", f=f" + i + ", doc=" + j + ", group=" + r.Document(j).Get("updKey") + ", ctrlValue=" + ctrlValue + ", bdvBytes=" + scratch); // } Assert.AreEqual(ctrlValue, bdvValue); } } } } } reader.Dispose(); dir.Dispose(); }
public virtual void TestIntersectStartTerm() { Directory dir = NewDirectory(); IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())); iwc.SetMergePolicy(new LogDocMergePolicy()); RandomIndexWriter w = new RandomIndexWriter(Random(), dir, iwc); Document doc = new Document(); doc.Add(NewStringField("field", "abc", Field.Store.NO)); w.AddDocument(doc); doc = new Document(); doc.Add(NewStringField("field", "abd", Field.Store.NO)); w.AddDocument(doc); doc = new Document(); doc.Add(NewStringField("field", "acd", Field.Store.NO)); w.AddDocument(doc); doc = new Document(); doc.Add(NewStringField("field", "bcd", Field.Store.NO)); w.AddDocument(doc); w.ForceMerge(1); DirectoryReader r = w.Reader; w.Dispose(); AtomicReader sub = GetOnlySegmentReader(r); Terms terms = sub.Fields.Terms("field"); Automaton automaton = (new RegExp(".*d", RegExp.NONE)).ToAutomaton(); CompiledAutomaton ca = new CompiledAutomaton(automaton, false, false); TermsEnum te; // should seek to startTerm te = terms.Intersect(ca, new BytesRef("aad")); Assert.AreEqual("abd", te.Next().Utf8ToString()); Assert.AreEqual(1, te.Docs(null, null, DocsEnum.FLAG_NONE).NextDoc()); Assert.AreEqual("acd", te.Next().Utf8ToString()); Assert.AreEqual(2, te.Docs(null, null, DocsEnum.FLAG_NONE).NextDoc()); Assert.AreEqual("bcd", te.Next().Utf8ToString()); Assert.AreEqual(3, te.Docs(null, null, DocsEnum.FLAG_NONE).NextDoc()); Assert.IsNull(te.Next()); // should fail to find ceil label on second arc, rewind te = terms.Intersect(ca, new BytesRef("add")); Assert.AreEqual("bcd", te.Next().Utf8ToString()); Assert.AreEqual(3, te.Docs(null, null, DocsEnum.FLAG_NONE).NextDoc()); Assert.IsNull(te.Next()); // should reach end te = terms.Intersect(ca, new BytesRef("bcd")); Assert.IsNull(te.Next()); te = terms.Intersect(ca, new BytesRef("ddd")); Assert.IsNull(te.Next()); r.Dispose(); dir.Dispose(); }
/// <summary> /// Expert: create a <see cref="ParallelAtomicReader"/> based on the provided /// <paramref name="readers"/> and <paramref name="storedFieldsReaders"/>; when a document is /// loaded, only <paramref name="storedFieldsReaders"/> will be used. /// </summary> public ParallelAtomicReader(bool closeSubReaders, AtomicReader[] readers, AtomicReader[] storedFieldsReaders) { this.closeSubReaders = closeSubReaders; if (readers.Length == 0 && storedFieldsReaders.Length > 0) { throw new ArgumentException("There must be at least one main reader if storedFieldsReaders are used."); } this.parallelReaders = (AtomicReader[])readers.Clone(); this.storedFieldsReaders = (AtomicReader[])storedFieldsReaders.Clone(); if (parallelReaders.Length > 0) { AtomicReader first = parallelReaders[0]; this.maxDoc = first.MaxDoc; this.numDocs = first.NumDocs; this.hasDeletions = first.HasDeletions; } else { this.maxDoc = this.numDocs = 0; this.hasDeletions = false; } completeReaderSet.UnionWith(this.parallelReaders); completeReaderSet.UnionWith(this.storedFieldsReaders); // check compatibility: foreach (AtomicReader reader in completeReaderSet) { if (reader.MaxDoc != maxDoc) { throw new ArgumentException("All readers must have same MaxDoc: " + maxDoc + "!=" + reader.MaxDoc); } } // TODO: make this read-only in a cleaner way? FieldInfos.Builder builder = new FieldInfos.Builder(); // build FieldInfos and fieldToReader map: foreach (AtomicReader reader in this.parallelReaders) { FieldInfos readerFieldInfos = reader.FieldInfos; foreach (FieldInfo fieldInfo in readerFieldInfos) { // NOTE: first reader having a given field "wins": if (!fieldToReader.ContainsKey(fieldInfo.Name)) { builder.Add(fieldInfo); fieldToReader[fieldInfo.Name] = reader; if (fieldInfo.HasVectors) { tvFieldToReader[fieldInfo.Name] = reader; } } } } fieldInfos = builder.Finish(); // build Fields instance foreach (AtomicReader reader in this.parallelReaders) { Fields readerFields = reader.Fields; if (readerFields != null) { foreach (string field in readerFields) { // only add if the reader responsible for that field name is the current: if (fieldToReader[field].Equals(reader)) { this.fields.AddField(field, readerFields.GetTerms(field)); } } } } // do this finally so any Exceptions occurred before don't affect refcounts: foreach (AtomicReader reader in completeReaderSet) { if (!closeSubReaders) { reader.IncRef(); } reader.RegisterParentReader(this); } }
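// A minimal usage sketch (hypothetical; ir1 and ir2 stand for two AtomicReaders over parallel indexes, such as Ir1/Ir2 built in the SetUp further above):
// combine readers that cover the same documents with different fields, loading stored fields only from the first reader.
AtomicReader parallel = new ParallelAtomicReader(
    true,                              // closeSubReaders
    new AtomicReader[] { ir1, ir2 },   // readers merged for fields, norms, vectors
    new AtomicReader[] { ir1 });       // storedFieldsReaders used when a document is loaded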
/// <summary> /// Inverts only terms starting w/ prefix, and only terms /// whose docFreq (not taking deletions into account) is /// <= <paramref name="maxTermDocFreq"/> /// </summary> public DocTermOrds(AtomicReader reader, IBits liveDocs, string field, BytesRef termPrefix, int maxTermDocFreq) : this(reader, liveDocs, field, termPrefix, maxTermDocFreq, DEFAULT_INDEX_INTERVAL_BITS) { }
public ThreadAnonymousClass(TestDocValuesWithThreads outerInstance, IList <long?> numbers, IList <BytesRef> binary, IList <BytesRef> sorted, int numDocs, AtomicReader ar, CountdownEvent startingGun, Random threadRandom) { this.outerInstance = outerInstance; this.numbers = numbers; this.binary = binary; this.sorted = sorted; this.numDocs = numDocs; this.ar = ar; this.startingGun = startingGun; this.threadRandom = threadRandom; }
public override AtomicReader Wrap(AtomicReader reader) { return reader; }
public virtual void Test2() { Random random = Random; int NUM_DOCS = AtLeast(100); Directory dir = NewDirectory(); RandomIndexWriter writer = new RandomIndexWriter( #if FEATURE_INSTANCE_TESTDATA_INITIALIZATION this, #endif random, dir); bool allowDups = random.NextBoolean(); HashSet <string> seen = new HashSet <string>(); if (VERBOSE) { Console.WriteLine("TEST: NUM_DOCS=" + NUM_DOCS + " allowDups=" + allowDups); } int numDocs = 0; IList <BytesRef> docValues = new List <BytesRef>(); // TODO: deletions while (numDocs < NUM_DOCS) { string s; if (random.NextBoolean()) { s = TestUtil.RandomSimpleString(random); } else { s = TestUtil.RandomUnicodeString(random); } BytesRef br = new BytesRef(s); if (!allowDups) { if (seen.Contains(s)) { continue; } seen.Add(s); } if (VERBOSE) { Console.WriteLine(" " + numDocs + ": s=" + s); } Document doc = new Document(); doc.Add(new SortedDocValuesField("stringdv", br)); doc.Add(new NumericDocValuesField("id", numDocs)); docValues.Add(br); writer.AddDocument(doc); numDocs++; if (random.Next(40) == 17) { // force flush writer.GetReader().Dispose(); } } writer.ForceMerge(1); DirectoryReader r = writer.GetReader(); writer.Dispose(); AtomicReader sr = GetOnlySegmentReader(r); long END_TIME = Environment.TickCount + (TEST_NIGHTLY ? 30 : 1); int NUM_THREADS = TestUtil.NextInt32(LuceneTestCase.Random, 1, 10); ThreadClass[] threads = new ThreadClass[NUM_THREADS]; for (int thread = 0; thread < NUM_THREADS; thread++) { threads[thread] = new ThreadAnonymousInnerClassHelper2(random, docValues, sr, END_TIME); threads[thread].Start(); } foreach (ThreadClass thread in threads) { thread.Join(); } r.Dispose(); dir.Dispose(); }
public virtual void TestSortedTermsEnum() { Directory directory = NewDirectory(); Analyzer analyzer = new MockAnalyzer(Random); IndexWriterConfig iwconfig = NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer); iwconfig.SetMergePolicy(NewLogMergePolicy()); RandomIndexWriter iwriter = new RandomIndexWriter(Random, directory, iwconfig); Document doc = new Document(); doc.Add(new StringField("field", "hello", Field.Store.NO)); iwriter.AddDocument(doc); doc = new Document(); doc.Add(new StringField("field", "world", Field.Store.NO)); iwriter.AddDocument(doc); doc = new Document(); doc.Add(new StringField("field", "beer", Field.Store.NO)); iwriter.AddDocument(doc); iwriter.ForceMerge(1); DirectoryReader ireader = iwriter.GetReader(); iwriter.Dispose(); AtomicReader ar = GetOnlySegmentReader(ireader); SortedSetDocValues dv = FieldCache.DEFAULT.GetDocTermOrds(ar, "field"); Assert.AreEqual(3, dv.ValueCount); TermsEnum termsEnum = dv.GetTermsEnum(); // next() Assert.AreEqual("beer", termsEnum.Next().Utf8ToString()); Assert.AreEqual(0, termsEnum.Ord); Assert.AreEqual("hello", termsEnum.Next().Utf8ToString()); Assert.AreEqual(1, termsEnum.Ord); Assert.AreEqual("world", termsEnum.Next().Utf8ToString()); Assert.AreEqual(2, termsEnum.Ord); // seekCeil() Assert.AreEqual(SeekStatus.NOT_FOUND, termsEnum.SeekCeil(new BytesRef("ha!"))); Assert.AreEqual("hello", termsEnum.Term.Utf8ToString()); Assert.AreEqual(1, termsEnum.Ord); Assert.AreEqual(SeekStatus.FOUND, termsEnum.SeekCeil(new BytesRef("beer"))); Assert.AreEqual("beer", termsEnum.Term.Utf8ToString()); Assert.AreEqual(0, termsEnum.Ord); Assert.AreEqual(SeekStatus.END, termsEnum.SeekCeil(new BytesRef("zzz"))); // seekExact() Assert.IsTrue(termsEnum.SeekExact(new BytesRef("beer"))); Assert.AreEqual("beer", termsEnum.Term.Utf8ToString()); Assert.AreEqual(0, termsEnum.Ord); Assert.IsTrue(termsEnum.SeekExact(new BytesRef("hello"))); Assert.AreEqual("hello", termsEnum.Term.Utf8ToString()); Assert.AreEqual(1, termsEnum.Ord); Assert.IsTrue(termsEnum.SeekExact(new BytesRef("world"))); Assert.AreEqual("world", termsEnum.Term.Utf8ToString()); Assert.AreEqual(2, termsEnum.Ord); Assert.IsFalse(termsEnum.SeekExact(new BytesRef("bogus"))); // seek(ord) termsEnum.SeekExact(0); Assert.AreEqual("beer", termsEnum.Term.Utf8ToString()); Assert.AreEqual(0, termsEnum.Ord); termsEnum.SeekExact(1); Assert.AreEqual("hello", termsEnum.Term.Utf8ToString()); Assert.AreEqual(1, termsEnum.Ord); termsEnum.SeekExact(2); Assert.AreEqual("world", termsEnum.Term.Utf8ToString()); Assert.AreEqual(2, termsEnum.Ord); ireader.Dispose(); directory.Dispose(); }
/// <summary> /// Test term vectors. /// @lucene.experimental /// </summary> public static Status.TermVectorStatus TestTermVectors(AtomicReader reader, TextWriter infoStream) { return TestTermVectors(reader, infoStream, false, false); }
public virtual void TestRandomWithPrefix() { Directory dir = NewDirectory(); ISet <string> prefixes = new JCG.HashSet <string>(); int numPrefix = TestUtil.NextInt32(Random, 2, 7); if (Verbose) { Console.WriteLine("TEST: use " + numPrefix + " prefixes"); } while (prefixes.Count < numPrefix) { prefixes.Add(TestUtil.RandomRealisticUnicodeString(Random)); //prefixes.Add(TestUtil.RandomSimpleString(random)); } string[] prefixesArray = prefixes.ToArray(/*new string[prefixes.Count]*/); int NUM_TERMS = AtLeast(20); ISet <BytesRef> terms = new JCG.HashSet <BytesRef>(); while (terms.Count < NUM_TERMS) { string s = prefixesArray[Random.Next(prefixesArray.Length)] + TestUtil.RandomRealisticUnicodeString(Random); //final String s = prefixesArray[random.nextInt(prefixesArray.Length)] + TestUtil.RandomSimpleString(random); if (s.Length > 0) { terms.Add(new BytesRef(s)); } } BytesRef[] termsArray = terms.ToArray(); Array.Sort(termsArray); int NUM_DOCS = AtLeast(100); IndexWriterConfig conf = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)); // Sometimes swap in codec that impls ord(): if (Random.Next(10) == 7) { Codec codec = TestUtil.AlwaysPostingsFormat(PostingsFormat.ForName("Lucene41WithOrds")); conf.SetCodec(codec); } RandomIndexWriter w = new RandomIndexWriter(Random, dir, conf); int[][] idToOrds = new int[NUM_DOCS][]; ISet <int?> ordsForDocSet = new JCG.HashSet <int?>(); for (int id = 0; id < NUM_DOCS; id++) { Document doc = new Document(); doc.Add(new Int32Field("id", id, Field.Store.NO)); int termCount = TestUtil.NextInt32(Random, 0, 20 * RandomMultiplier); while (ordsForDocSet.Count < termCount) { ordsForDocSet.Add(Random.Next(termsArray.Length)); } int[] ordsForDoc = new int[termCount]; int upto = 0; if (Verbose) { Console.WriteLine("TEST: doc id=" + id); } foreach (int ord in ordsForDocSet) { ordsForDoc[upto++] = ord; Field field = NewStringField("field", termsArray[ord].Utf8ToString(), Field.Store.NO); if (Verbose) { Console.WriteLine(" f=" + termsArray[ord].Utf8ToString()); } doc.Add(field); } ordsForDocSet.Clear(); Array.Sort(ordsForDoc); idToOrds[id] = ordsForDoc; w.AddDocument(doc); } DirectoryReader r = w.GetReader(); w.Dispose(); if (Verbose) { Console.WriteLine("TEST: reader=" + r); } AtomicReader slowR = SlowCompositeReaderWrapper.Wrap(r); foreach (string prefix in prefixesArray) { BytesRef prefixRef = prefix == null ? null : new BytesRef(prefix); int[][] idToOrdsPrefix = new int[NUM_DOCS][]; for (int id = 0; id < NUM_DOCS; id++) { int[] docOrds = idToOrds[id]; IList <int?> newOrds = new List <int?>(); foreach (int ord in idToOrds[id]) { if (StringHelper.StartsWith(termsArray[ord], prefixRef)) { newOrds.Add(ord); } } int[] newOrdsArray = new int[newOrds.Count]; int upto = 0; foreach (int ord in newOrds) { newOrdsArray[upto++] = ord; } idToOrdsPrefix[id] = newOrdsArray; } foreach (AtomicReaderContext ctx in r.Leaves) { if (Verbose) { Console.WriteLine("\nTEST: sub=" + ctx.Reader); } Verify((AtomicReader)ctx.Reader, idToOrdsPrefix, termsArray, prefixRef); } // Also test top-level reader: its enum does not support // ord, so this forces the OrdWrapper to run: if (Verbose) { Console.WriteLine("TEST: top reader"); } Verify(slowR, idToOrdsPrefix, termsArray, prefixRef); } FieldCache.DEFAULT.PurgeByCacheKey(slowR.CoreCacheKey); r.Dispose(); dir.Dispose(); }
public override AtomicReader Wrap(AtomicReader reader) { return reader; }
internal Iterator(DocTermOrds outerInstance, AtomicReader reader) { this.OuterInstance = outerInstance; this.Reader = reader; this.Te = TermsEnum(); }
internal virtual AtomicReader[] Wrap(IList<AtomicReader> readers) { AtomicReader[] wrapped = new AtomicReader[readers.Count]; for (int i = 0; i < readers.Count; i++) { wrapped[i] = Wrap(readers[i]); } return wrapped; }
public virtual void Test2() { Random random = Random; int NUM_DOCS = AtLeast(100); Directory dir = NewDirectory(); RandomIndexWriter writer = new RandomIndexWriter( #if FEATURE_INSTANCE_TESTDATA_INITIALIZATION this, #endif random, dir); bool allowDups = random.NextBoolean(); ISet <string> seen = new JCG.HashSet <string>(); if (Verbose) { Console.WriteLine("TEST: NUM_DOCS=" + NUM_DOCS + " allowDups=" + allowDups); } int numDocs = 0; IList <BytesRef> docValues = new List <BytesRef>(); // TODO: deletions while (numDocs < NUM_DOCS) { string s; if (random.NextBoolean()) { s = TestUtil.RandomSimpleString(random); } else { s = TestUtil.RandomUnicodeString(random); } BytesRef br = new BytesRef(s); if (!allowDups) { if (seen.Contains(s)) { continue; } seen.Add(s); } if (Verbose) { Console.WriteLine(" " + numDocs + ": s=" + s); } Document doc = new Document(); doc.Add(new SortedDocValuesField("stringdv", br)); doc.Add(new NumericDocValuesField("id", numDocs)); docValues.Add(br); writer.AddDocument(doc); numDocs++; if (random.Next(40) == 17) { // force flush writer.GetReader().Dispose(); } } writer.ForceMerge(1); DirectoryReader r = writer.GetReader(); writer.Dispose(); AtomicReader sr = GetOnlySegmentReader(r); long END_TIME = (J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond) + (TestNightly ? 30 : 1); // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results int NUM_THREADS = TestUtil.NextInt32(LuceneTestCase.Random, 1, 10); ThreadJob[] threads = new ThreadJob[NUM_THREADS]; for (int thread = 0; thread < NUM_THREADS; thread++) { threads[thread] = new ThreadAnonymousClass2(random, docValues, sr, END_TIME); threads[thread].Start(); } foreach (ThreadJob thread in threads) { thread.Join(); } r.Dispose(); dir.Dispose(); }
public ThreadAnonymousInnerClassHelper2(Random random, IList <BytesRef> docValues, AtomicReader sr, long END_TIME) { this.Random = random; this.DocValues = docValues; this.Sr = sr; this.END_TIME = END_TIME; }
/// <summary> /// Test stored fields. /// @lucene.experimental /// </summary> public static Status.StoredFieldStatus TestStoredFields(AtomicReader reader, TextWriter infoStream) { Status.StoredFieldStatus status = new Status.StoredFieldStatus(); try { if (infoStream != null) { infoStream.Write(" test: stored fields......."); } // Scan stored fields for all documents Bits liveDocs = reader.LiveDocs; for (int j = 0; j < reader.MaxDoc; ++j) { // Intentionally pull even deleted documents to // make sure they too are not corrupt: Document doc = reader.Document(j); if (liveDocs == null || liveDocs.Get(j)) { status.DocCount++; status.TotFields += doc.Fields.Count; } } // Validate docCount if (status.DocCount != reader.NumDocs) { throw new Exception("docCount=" + reader.NumDocs + " but saw " + status.DocCount + " undeleted docs"); } Msg(infoStream, "OK [" + status.TotFields + " total field count; avg " + ((((float)status.TotFields) / status.DocCount)).ToString(CultureInfo.InvariantCulture.NumberFormat) + " fields per doc]"); } catch (Exception e) { Msg(infoStream, "ERROR [" + Convert.ToString(e.Message) + "]"); status.Error = e; if (infoStream != null) { // LUCENENET NOTE: Some tests rely on the error type being in // the message. We can't get the error type with StackTrace, we // need ToString() for that. infoStream.WriteLine(e.ToString()); //infoStream.WriteLine(e.StackTrace); } } return status; }
/// <summary> /// Inverts all terms </summary> public DocTermOrds(AtomicReader reader, IBits liveDocs, string field) : this(reader, liveDocs, field, null, int.MaxValue) { }
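// A minimal usage sketch (hypothetical; "atomicReader" and the field name "category" are assumptions):
// invert every term of a field once, then walk the per-document ordinals through the SortedSetDocValues view, as the Verify test above also does.
DocTermOrds dto = new DocTermOrds(atomicReader, atomicReader.LiveDocs, "category");
SortedSetDocValues ords = dto.GetIterator(atomicReader);
ords.SetDocument(0); // position the view on the first document
long ord;
while ((ord = ords.NextOrd()) != SortedSetDocValues.NO_MORE_ORDS)
{
    // resolve ord back to its term via dto.GetOrdTermsEnum(atomicReader) and SeekExact(ord) if needed
}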
/// <summary> /// Test term vectors. /// @lucene.experimental /// </summary> public static Status.TermVectorStatus TestTermVectors(AtomicReader reader, TextWriter infoStream, bool verbose, bool crossCheckTermVectors) { Status.TermVectorStatus status = new Status.TermVectorStatus(); FieldInfos fieldInfos = reader.FieldInfos; Bits onlyDocIsDeleted = new FixedBitSet(1); try { if (infoStream != null) { infoStream.Write(" test: term vectors........"); } DocsEnum docs = null; DocsAndPositionsEnum postings = null; // Only used if crossCheckTermVectors is true: DocsEnum postingsDocs = null; DocsAndPositionsEnum postingsPostings = null; Bits liveDocs = reader.LiveDocs; Fields postingsFields; // TODO: testTermsIndex if (crossCheckTermVectors) { postingsFields = reader.Fields; } else { postingsFields = null; } TermsEnum termsEnum = null; TermsEnum postingsTermsEnum = null; for (int j = 0; j < reader.MaxDoc; ++j) { // Intentionally pull/visit (but don't count in // stats) deleted documents to make sure they too // are not corrupt: Fields tfv = reader.GetTermVectors(j); // TODO: can we make a IS(FIR) that searches just // this term vector... to pass for searcher? if (tfv != null) { // First run with no deletions: CheckFields(tfv, null, 1, fieldInfos, false, true, infoStream, verbose); // Again, with the one doc deleted: CheckFields(tfv, onlyDocIsDeleted, 1, fieldInfos, false, true, infoStream, verbose); // Only agg stats if the doc is live: bool doStats = liveDocs == null || liveDocs.Get(j); if (doStats) { status.DocCount++; } foreach (string field in tfv) { if (doStats) { status.TotVectors++; } // Make sure FieldInfo thinks this field is vector'd: FieldInfo fieldInfo = fieldInfos.FieldInfo(field); if (!fieldInfo.HasVectors()) { throw new Exception("docID=" + j + " has term vectors for field=" + field + " but FieldInfo has storeTermVector=false"); } if (crossCheckTermVectors) { Terms terms = tfv.Terms(field); termsEnum = terms.Iterator(termsEnum); bool postingsHasFreq = fieldInfo.FieldIndexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS; bool postingsHasPayload = fieldInfo.HasPayloads(); bool vectorsHasPayload = terms.HasPayloads(); Terms postingsTerms = postingsFields.Terms(field); if (postingsTerms == null) { throw new Exception("vector field=" + field + " does not exist in postings; doc=" + j); } postingsTermsEnum = postingsTerms.Iterator(postingsTermsEnum); bool hasProx = terms.HasOffsets() || terms.HasPositions(); BytesRef term = null; while ((term = termsEnum.Next()) != null) { if (hasProx) { postings = termsEnum.DocsAndPositions(null, postings); Debug.Assert(postings != null); docs = null; } else { docs = termsEnum.Docs(null, docs); Debug.Assert(docs != null); postings = null; } DocsEnum docs2; if (hasProx) { Debug.Assert(postings != null); docs2 = postings; } else { Debug.Assert(docs != null); docs2 = docs; } DocsEnum postingsDocs2; if (!postingsTermsEnum.SeekExact(term)) { throw new Exception("vector term=" + term + " field=" + field + " does not exist in postings; doc=" + j); } postingsPostings = postingsTermsEnum.DocsAndPositions(null, postingsPostings); if (postingsPostings == null) { // Term vectors were indexed w/ pos but postings were not postingsDocs = postingsTermsEnum.Docs(null, postingsDocs); if (postingsDocs == null) { throw new Exception("vector term=" + term + " field=" + field + " does not exist in postings; doc=" + j); } } if (postingsPostings != null) { postingsDocs2 = postingsPostings; } else { postingsDocs2 = postingsDocs; } int advanceDoc = postingsDocs2.Advance(j); if 
(advanceDoc != j) { throw new Exception("vector term=" + term + " field=" + field + ": doc=" + j + " was not found in postings (got: " + advanceDoc + ")"); } int doc = docs2.NextDoc(); if (doc != 0) { throw new Exception("vector for doc " + j + " didn't return docID=0: got docID=" + doc); } if (postingsHasFreq) { int tf = docs2.Freq(); if (postingsHasFreq && postingsDocs2.Freq() != tf) { throw new Exception("vector term=" + term + " field=" + field + " doc=" + j + ": freq=" + tf + " differs from postings freq=" + postingsDocs2.Freq()); } if (hasProx) { for (int i = 0; i < tf; i++) { int pos = postings.NextPosition(); if (postingsPostings != null) { int postingsPos = postingsPostings.NextPosition(); if (terms.HasPositions() && pos != postingsPos) { throw new Exception("vector term=" + term + " field=" + field + " doc=" + j + ": pos=" + pos + " differs from postings pos=" + postingsPos); } } // Call the methods to at least make // sure they don't throw exc: int startOffset = postings.StartOffset(); int endOffset = postings.EndOffset(); // TODO: these are too anal...? /* if (endOffset < startOffset) { throw new RuntimeException("vector startOffset=" + startOffset + " is > endOffset=" + endOffset); } if (startOffset < lastStartOffset) { throw new RuntimeException("vector startOffset=" + startOffset + " is < prior startOffset=" + lastStartOffset); } lastStartOffset = startOffset; */ if (postingsPostings != null) { int postingsStartOffset = postingsPostings.StartOffset(); int postingsEndOffset = postingsPostings.EndOffset(); if (startOffset != -1 && postingsStartOffset != -1 && startOffset != postingsStartOffset) { throw new Exception("vector term=" + term + " field=" + field + " doc=" + j + ": startOffset=" + startOffset + " differs from postings startOffset=" + postingsStartOffset); } if (endOffset != -1 && postingsEndOffset != -1 && endOffset != postingsEndOffset) { throw new Exception("vector term=" + term + " field=" + field + " doc=" + j + ": endOffset=" + endOffset + " differs from postings endOffset=" + postingsEndOffset); } } BytesRef payload = postings.Payload; if (payload != null) { Debug.Assert(vectorsHasPayload); } if (postingsHasPayload && vectorsHasPayload) { Debug.Assert(postingsPostings != null); if (payload == null) { // we have payloads, but not at this position. // postings has payloads too, it should not have one at this position if (postingsPostings.Payload != null) { throw new Exception("vector term=" + term + " field=" + field + " doc=" + j + " has no payload but postings does: " + postingsPostings.Payload); } } else { // we have payloads, and one at this position // postings should also have one at this position, with the same bytes. if (postingsPostings.Payload == null) { throw new Exception("vector term=" + term + " field=" + field + " doc=" + j + " has payload=" + payload + " but postings does not."); } BytesRef postingsPayload = postingsPostings.Payload; if (!payload.Equals(postingsPayload)) { throw new Exception("vector term=" + term + " field=" + field + " doc=" + j + " has payload=" + payload + " but differs from postings payload=" + postingsPayload); } } } } } } } } } } } float vectorAvg = status.DocCount == 0 ? 
0 : status.TotVectors / (float)status.DocCount; Msg(infoStream, "OK [" + status.TotVectors + " total vector count; avg " + vectorAvg.ToString(CultureInfo.InvariantCulture.NumberFormat) + " term/freq vector fields per doc]"); } catch (Exception e) { Msg(infoStream, "ERROR [" + Convert.ToString(e.Message) + "]"); status.Error = e; if (infoStream != null) { // LUCENENET NOTE: Some tests rely on the error type being in // the message. We can't get the error type with StackTrace, we // need ToString() for that. infoStream.WriteLine(e.ToString()); //infoStream.WriteLine(e.StackTrace); } } return status; }
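A hedged driver for the check above, assuming the method is the public static CheckIndex.TestTermVectors shown here and that "dir" already holds an index; Console.Out stands in for the usual info stream:

// Sketch: cross-check term vectors against postings on every leaf.
using (DirectoryReader reader = DirectoryReader.Open(dir))
{
    foreach (AtomicReaderContext ctx in reader.Leaves)
    {
        CheckIndex.Status.TermVectorStatus status = CheckIndex.TestTermVectors(
            (AtomicReader)ctx.Reader, Console.Out, verbose: false, crossCheckTermVectors: true);
        if (status.Error != null)
        {
            throw new InvalidOperationException("term vectors are corrupt", status.Error);
        }
    }
}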
public virtual void TestIgnoreStoredFields() { Directory dir1 = GetDir1(Random); Directory dir2 = GetDir2(Random); CompositeReader ir1 = DirectoryReader.Open(dir1); CompositeReader ir2 = DirectoryReader.Open(dir2); // with overlapping ParallelCompositeReader pr = new ParallelCompositeReader(false, new CompositeReader[] { ir1, ir2 }, new CompositeReader[] { ir1 }); Assert.AreEqual("v1", pr.Document(0).Get("f1")); Assert.AreEqual("v1", pr.Document(0).Get("f2")); Assert.IsNull(pr.Document(0).Get("f3")); Assert.IsNull(pr.Document(0).Get("f4")); // check that fields are there AtomicReader slow = SlowCompositeReaderWrapper.Wrap(pr); Assert.IsNotNull(slow.GetTerms("f1")); Assert.IsNotNull(slow.GetTerms("f2")); Assert.IsNotNull(slow.GetTerms("f3")); Assert.IsNotNull(slow.GetTerms("f4")); pr.Dispose(); // no stored fields at all pr = new ParallelCompositeReader(false, new CompositeReader[] { ir2 }, new CompositeReader[0]); Assert.IsNull(pr.Document(0).Get("f1")); Assert.IsNull(pr.Document(0).Get("f2")); Assert.IsNull(pr.Document(0).Get("f3")); Assert.IsNull(pr.Document(0).Get("f4")); // check that fields are there slow = SlowCompositeReaderWrapper.Wrap(pr); Assert.IsNull(slow.GetTerms("f1")); Assert.IsNull(slow.GetTerms("f2")); Assert.IsNotNull(slow.GetTerms("f3")); Assert.IsNotNull(slow.GetTerms("f4")); pr.Dispose(); // without overlapping pr = new ParallelCompositeReader(true, new CompositeReader[] { ir2 }, new CompositeReader[] { ir1 }); Assert.AreEqual("v1", pr.Document(0).Get("f1")); Assert.AreEqual("v1", pr.Document(0).Get("f2")); Assert.IsNull(pr.Document(0).Get("f3")); Assert.IsNull(pr.Document(0).Get("f4")); // check that fields are there slow = SlowCompositeReaderWrapper.Wrap(pr); Assert.IsNull(slow.GetTerms("f1")); Assert.IsNull(slow.GetTerms("f2")); Assert.IsNotNull(slow.GetTerms("f3")); Assert.IsNotNull(slow.GetTerms("f4")); pr.Dispose(); // no main readers try { new ParallelCompositeReader(true, new CompositeReader[0], new CompositeReader[] { ir1 }); Assert.Fail("didn't get expected exception: need a non-empty main-reader array"); } #pragma warning disable 168 catch (System.ArgumentException iae) #pragma warning restore 168 { // pass } dir1.Dispose(); dir2.Dispose(); }
/// <summary> /// Wrap one of the parent DirectoryReader's subreaders </summary> /// <param name="reader"> the subreader to wrap </param> /// <returns> a wrapped/filtered AtomicReader </returns> public abstract AtomicReader Wrap(AtomicReader reader);
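A minimal concrete wrapper as a sketch, assuming this is FilterDirectoryReader.SubReaderWrapper.Wrap; the class below is illustrative (it reuses the test framework's AssertingAtomicReader to add sanity checks to each leaf) and is not part of the source:

// Hypothetical SubReaderWrapper: wrap every subreader so Debug.Assert
// sanity checks run against each segment.
internal sealed class AssertingSubReaderWrapper : FilterDirectoryReader.SubReaderWrapper
{
    public override AtomicReader Wrap(AtomicReader reader)
    {
        return new AssertingAtomicReader(reader);
    }
}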
/// <summary> /// <para>Construct a FilterAtomicReader based on the specified base reader.</para> /// <para>Note that the base reader is closed if this FilterAtomicReader is closed.</para> </summary> /// <param name="in"> the base reader to wrap. </param> public FilterAtomicReader(AtomicReader @in) : base() { this.@in = @in; @in.RegisterParentReader(this); }
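A minimal subclass sketch: delegation comes entirely from the base class, so an override only needs the behavior it changes. The class name and the ToString choice are illustrative:

// Illustrative FilterAtomicReader subclass; @in is the wrapped reader
// assigned by the constructor above (field name per this port).
internal sealed class VerboseAtomicReader : FilterAtomicReader
{
    public VerboseAtomicReader(AtomicReader @in)
        : base(@in)
    {
    }

    public override string ToString()
    {
        return "VerboseAtomicReader(" + @in + ")";
    }
}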
/// <summary> /// Invoked on the <seealso cref="AtomicReader"/> for the newly /// merged segment, before that segment is made visible /// to near-real-time readers. /// </summary> public abstract void Warm(AtomicReader reader);
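A hedged implementation sketch, assuming this hook is IndexWriter.IndexReaderWarmer and is registered through IndexWriterConfig.MergedSegmentWarmer as in Lucene 4.x:

// Illustrative warmer: visit every document's stored fields once so the
// newly merged segment is pulled through the page cache before
// near-real-time readers see it.
internal sealed class StoredFieldsWarmer : IndexWriter.IndexReaderWarmer
{
    public override void Warm(AtomicReader reader)
    {
        for (int docID = 0; docID < reader.MaxDoc; docID++)
        {
            reader.Document(docID); // loads (and discards) stored fields
        }
    }
}
// Registration (property name assumed): iwc.MergedSegmentWarmer = new StoredFieldsWarmer();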
public FilterAtomicReaderAnonymousInnerClassHelper(TestIndexReaderClose outerInstance, AtomicReader wrap, bool throwOnClose) : base(wrap) { this.OuterInstance = outerInstance; this.ThrowOnClose = throwOnClose; }
public FakeDeleteAtomicIndexReader(AtomicReader reader) : base(reader) { UndeleteAll(); // initialize main bitset }
/// <summary> /// Inverts only terms starting w/ prefix, and only terms /// whose docFreq (not taking deletions into account) is /// <= <paramref name="maxTermDocFreq"/>, with a custom indexing interval /// (default is every 128th term). /// </summary> public DocTermOrds(AtomicReader reader, IBits liveDocs, string field, BytesRef termPrefix, int maxTermDocFreq, int indexIntervalBits) : this(field, maxTermDocFreq, indexIntervalBits) { Uninvert(reader, liveDocs, termPrefix); }
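As a sketch of the tuning knobs (the field name, limit, and interval are illustrative): passing indexIntervalBits = 5 indexes every 32nd term (2^5) instead of the 128-term default, trading memory for faster ord lookups.

// Uninvert "tags", skip terms appearing in more than 1,000 docs, and
// index every 32nd term (2^5 = 32).
DocTermOrds ords = new DocTermOrds(reader, reader.LiveDocs, "tags", null, 1000, 5);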
public virtual void TestRandom() { // token -> docID -> tokens IDictionary<string, IDictionary<int?, IList<Token>>> actualTokens = new Dictionary<string, IDictionary<int?, IList<Token>>>(); Directory dir = NewDirectory(); RandomIndexWriter w = new RandomIndexWriter(Random, dir, iwc); int numDocs = AtLeast(20); //final int numDocs = AtLeast(5); FieldType ft = new FieldType(TextField.TYPE_NOT_STORED); // TODO: randomize what IndexOptions we use; also test // changing this up in one IW buffered segment...: ft.IndexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS; if (Random.NextBoolean()) { ft.StoreTermVectors = true; ft.StoreTermVectorOffsets = Random.NextBoolean(); ft.StoreTermVectorPositions = Random.NextBoolean(); } for (int docCount = 0; docCount < numDocs; docCount++) { Document doc = new Document(); doc.Add(new Int32Field("id", docCount, Field.Store.NO)); IList<Token> tokens = new List<Token>(); int numTokens = AtLeast(100); //final int numTokens = AtLeast(20); int pos = -1; int offset = 0; //System.out.println("doc id=" + docCount); for (int tokenCount = 0; tokenCount < numTokens; tokenCount++) { string text; if (Random.NextBoolean()) { text = "a"; } else if (Random.NextBoolean()) { text = "b"; } else if (Random.NextBoolean()) { text = "c"; } else { text = "d"; } int posIncr = Random.NextBoolean() ? 1 : Random.Next(5); if (tokenCount == 0 && posIncr == 0) { posIncr = 1; } int offIncr = Random.NextBoolean() ? 0 : Random.Next(5); int tokenOffset = Random.Next(5); Token token = MakeToken(text, posIncr, offset + offIncr, offset + offIncr + tokenOffset); if (!actualTokens.TryGetValue(text, out IDictionary<int?, IList<Token>> postingsByDoc)) { actualTokens[text] = postingsByDoc = new Dictionary<int?, IList<Token>>(); } if (!postingsByDoc.TryGetValue(docCount, out IList<Token> postings)) { postingsByDoc[docCount] = postings = new List<Token>(); } postings.Add(token); tokens.Add(token); pos += posIncr; // stuff abs position into type: token.Type = "" + pos; offset += offIncr + tokenOffset; //System.out.println(" " + token + " posIncr=" + token.getPositionIncrement() + " pos=" + pos + " off=" + token.StartOffset + "/" + token.EndOffset + " (freq=" + postingsByDoc.Get(docCount).Size() + ")"); } doc.Add(new Field("content", new CannedTokenStream(tokens.ToArray()), ft)); w.AddDocument(doc); } DirectoryReader r = w.GetReader(); w.Dispose(); string[] terms = new string[] { "a", "b", "c", "d" }; foreach (AtomicReaderContext ctx in r.Leaves) { // TODO: improve this AtomicReader sub = (AtomicReader)ctx.Reader; //System.out.println("\nsub=" + sub); TermsEnum termsEnum = sub.Fields.GetTerms("content").GetIterator(null); DocsEnum docs = null; DocsAndPositionsEnum docsAndPositions = null; DocsAndPositionsEnum docsAndPositionsAndOffsets = null; FieldCache.Int32s docIDToID = FieldCache.DEFAULT.GetInt32s(sub, "id", false); foreach (string term in terms) { //System.out.println(" term=" + term); if (termsEnum.SeekExact(new BytesRef(term))) { docs = termsEnum.Docs(null, docs); Assert.IsNotNull(docs); int doc; //System.out.println(" doc/freq"); while ((doc = docs.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { IList<Token> expected = actualTokens[term][docIDToID.Get(doc)]; //System.out.println(" doc=" + docIDToID.Get(doc) + " docID=" + doc + " " + expected.Size() + " freq"); Assert.IsNotNull(expected); Assert.AreEqual(expected.Count, docs.Freq); } // explicitly exclude offsets here docsAndPositions = termsEnum.DocsAndPositions(null, docsAndPositions, DocsAndPositionsFlags.PAYLOADS); Assert.IsNotNull(docsAndPositions); //System.out.println(" doc/freq/pos"); while ((doc = docsAndPositions.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { IList<Token> expected = actualTokens[term][docIDToID.Get(doc)]; //System.out.println(" doc=" + docIDToID.Get(doc) + " " + expected.Size() + " freq"); Assert.IsNotNull(expected); Assert.AreEqual(expected.Count, docsAndPositions.Freq); foreach (Token token in expected) { int pos = Convert.ToInt32(token.Type); //System.out.println(" pos=" + pos); Assert.AreEqual(pos, docsAndPositions.NextPosition()); } } docsAndPositionsAndOffsets = termsEnum.DocsAndPositions(null, docsAndPositions); Assert.IsNotNull(docsAndPositionsAndOffsets); //System.out.println(" doc/freq/pos/offs"); while ((doc = docsAndPositionsAndOffsets.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { IList<Token> expected = actualTokens[term][docIDToID.Get(doc)]; //System.out.println(" doc=" + docIDToID.Get(doc) + " " + expected.Size() + " freq"); Assert.IsNotNull(expected); Assert.AreEqual(expected.Count, docsAndPositionsAndOffsets.Freq); foreach (Token token in expected) { int pos = Convert.ToInt32(token.Type); //System.out.println(" pos=" + pos); Assert.AreEqual(pos, docsAndPositionsAndOffsets.NextPosition()); Assert.AreEqual(token.StartOffset, docsAndPositionsAndOffsets.StartOffset); Assert.AreEqual(token.EndOffset, docsAndPositionsAndOffsets.EndOffset); } } } } // TODO: test advance: } r.Dispose(); dir.Dispose(); }
public virtual void Test() { IList<string> postingsList = new List<string>(); int numTerms = AtLeast(300); int maxTermsPerDoc = TestUtil.NextInt32(Random, 10, 20); bool isSimpleText = "SimpleText".Equals(TestUtil.GetPostingsFormat("field"), StringComparison.Ordinal); IndexWriterConfig iwc = NewIndexWriterConfig(Random, TEST_VERSION_CURRENT, new MockAnalyzer(Random)); if ((isSimpleText || iwc.MergePolicy is MockRandomMergePolicy) && (TestNightly || RandomMultiplier > 1)) { // Otherwise test can take way too long (> 2 hours) //numTerms /= 2; // LUCENENET specific - To keep this under the 1 hour free limit // of Azure DevOps, this was reduced from /2 to /6. numTerms /= 6; } if (Verbose) { Console.WriteLine("maxTermsPerDoc=" + maxTermsPerDoc); Console.WriteLine("numTerms=" + numTerms); } for (int i = 0; i < numTerms; i++) { string term = Convert.ToString(i, CultureInfo.InvariantCulture); for (int j = 0; j < i; j++) { postingsList.Add(term); } } postingsList.Shuffle(Random); ConcurrentQueue<string> postings = new ConcurrentQueue<string>(postingsList); Directory dir = NewFSDirectory(CreateTempDir("bagofpostings")); RandomIndexWriter iw = new RandomIndexWriter(Random, dir, iwc); int threadCount = TestUtil.NextInt32(Random, 1, 5); if (Verbose) { Console.WriteLine("config: " + iw.IndexWriter.Config); Console.WriteLine("threadCount=" + threadCount); } ThreadJob[] threads = new ThreadJob[threadCount]; CountdownEvent startingGun = new CountdownEvent(1); for (int threadID = 0; threadID < threadCount; threadID++) { threads[threadID] = new ThreadAnonymousClass(this, maxTermsPerDoc, postings, iw, startingGun); threads[threadID].Start(); } startingGun.Signal(); foreach (ThreadJob t in threads) { t.Join(); } iw.ForceMerge(1); DirectoryReader ir = iw.GetReader(); Assert.AreEqual(1, ir.Leaves.Count); AtomicReader air = (AtomicReader)ir.Leaves[0].Reader; Terms terms = air.GetTerms("field"); // numTerms-1 because there cannot be a term 0 with 0 postings: #pragma warning disable 612, 618 Assert.AreEqual(numTerms - 1, air.Fields.UniqueTermCount); if (iwc.Codec is Lucene3xCodec == false) #pragma warning restore 612, 618 { Assert.AreEqual(numTerms - 1, terms.Count); } TermsEnum termsEnum = terms.GetEnumerator(); while (termsEnum.MoveNext()) { int value = Convert.ToInt32(termsEnum.Term.Utf8ToString(), CultureInfo.InvariantCulture); Assert.AreEqual(value, termsEnum.DocFreq); // don't really need to check more than this, as CheckIndex // will verify that docFreq == actual number of documents seen // from a docsAndPositionsEnum. } ir.Dispose(); iw.Dispose(); dir.Dispose(); }
public virtual void Test() { IList<string> postingsList = new List<string>(); int numTerms = AtLeast(300); int maxTermsPerDoc = TestUtil.NextInt(Random(), 10, 20); bool isSimpleText = "SimpleText".Equals(TestUtil.GetPostingsFormat("field")); IndexWriterConfig iwc = NewIndexWriterConfig(Random(), TEST_VERSION_CURRENT, new MockAnalyzer(Random())); if ((isSimpleText || iwc.MergePolicy is MockRandomMergePolicy) && (TEST_NIGHTLY || RANDOM_MULTIPLIER > 1)) { // Otherwise test can take way too long (> 2 hours) numTerms /= 2; } if (VERBOSE) { Console.WriteLine("maxTermsPerDoc=" + maxTermsPerDoc); Console.WriteLine("numTerms=" + numTerms); } for (int i = 0; i < numTerms; i++) { string term = Convert.ToString(i); for (int j = 0; j < i; j++) { postingsList.Add(term); } } postingsList = CollectionsHelper.Shuffle(postingsList); ConcurrentQueue<string> postings = new ConcurrentQueue<string>(postingsList); Directory dir = NewFSDirectory(CreateTempDir(GetFullMethodName())); RandomIndexWriter iw = new RandomIndexWriter(Random(), dir, iwc); int threadCount = TestUtil.NextInt(Random(), 1, 5); if (VERBOSE) { Console.WriteLine("config: " + iw.w.Config); Console.WriteLine("threadCount=" + threadCount); } Field prototype = NewTextField("field", "", Field.Store.NO); FieldType fieldType = new FieldType((FieldType)prototype.FieldType); if (Random().NextBoolean()) { fieldType.OmitNorms = true; } int options = Random().Next(3); if (options == 0) { fieldType.IndexOptions = FieldInfo.IndexOptions.DOCS_AND_FREQS; // we don't actually need positions fieldType.StoreTermVectors = true; // but enforce term vectors when we do this so we check SOMETHING } else if (options == 1 && !DoesntSupportOffsets.Contains(TestUtil.GetPostingsFormat("field"))) { fieldType.IndexOptions = FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS; } // else just positions ThreadClass[] threads = new ThreadClass[threadCount]; CountdownEvent startingGun = new CountdownEvent(1); for (int threadID = 0; threadID < threadCount; threadID++) { Random threadRandom = new Random(Random().Next()); Document document = new Document(); Field field = new Field("field", "", fieldType); document.Add(field); threads[threadID] = new ThreadAnonymousInnerClassHelper(this, numTerms, maxTermsPerDoc, postings, iw, startingGun, threadRandom, document, field); threads[threadID].Start(); } startingGun.Signal(); foreach (ThreadClass t in threads) { t.Join(); } iw.ForceMerge(1); DirectoryReader ir = iw.Reader; Assert.AreEqual(1, ir.Leaves.Count); AtomicReader air = (AtomicReader)ir.Leaves[0].Reader; Terms terms = air.Terms("field"); // numTerms-1 because there cannot be a term 0 with 0 postings: Assert.AreEqual(numTerms - 1, terms.Size()); TermsEnum termsEnum = terms.Iterator(null); BytesRef termBR; while ((termBR = termsEnum.Next()) != null) { int value = Convert.ToInt32(termBR.Utf8ToString()); Assert.AreEqual(value, termsEnum.TotalTermFreq()); // don't really need to check more than this, as CheckIndex // will verify that totalTermFreq == total number of positions seen // from a docsAndPositionsEnum. } ir.Dispose(); iw.Dispose(); dir.Dispose(); }
/// <summary> /// Inverts only terms starting w/ prefix </summary> public DocTermOrds(AtomicReader reader, IBits liveDocs, string field, BytesRef termPrefix) : this(reader, liveDocs, field, termPrefix, int.MaxValue) { }
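A one-line sketch of the prefix-restricted form (the field and prefix are illustrative), useful when only a subtree of a hierarchical field needs uninverting:

// Only terms starting with "us/" in a hierarchical "location" field
// are uninverted; everything else is skipped.
DocTermOrds ords = new DocTermOrds(reader, reader.LiveDocs, "location", new BytesRef("us/"));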
public AllDeletedFilterReader(AtomicReader @in) : base(@in) { LiveDocs_Renamed = new Bits_MatchNoBits(@in.MaxDoc); Debug.Assert(MaxDoc == 0 || HasDeletions); }
public void TestNumericField() { Directory dir = NewDirectory(); var w = new RandomIndexWriter(Random(), dir); var numDocs = AtLeast(500); var answers = new object[numDocs]; FieldType.NumericType[] typeAnswers = new FieldType.NumericType[numDocs]; for (int id = 0; id < numDocs; id++) { Document doc = new Document(); Field nf; Field sf; object answer; FieldType.NumericType typeAnswer; if (Random().NextBoolean()) { // float/double if (Random().NextBoolean()) { float f = Random().NextFloat(); answer = Convert.ToSingle(f); nf = new FloatField("nf", f, Field.Store.NO); sf = new StoredField("nf", f); typeAnswer = FieldType.NumericType.FLOAT; } else { double d = Random().NextDouble(); answer = Convert.ToDouble(d); nf = new DoubleField("nf", d, Field.Store.NO); sf = new StoredField("nf", d); typeAnswer = FieldType.NumericType.DOUBLE; } } else { // int/long if (Random().NextBoolean()) { int i = Random().Next(); answer = Convert.ToInt32(i); nf = new IntField("nf", i, Field.Store.NO); sf = new StoredField("nf", i); typeAnswer = FieldType.NumericType.INT; } else { long l = Random().NextLong(); answer = Convert.ToInt64(l); nf = new LongField("nf", l, Field.Store.NO); sf = new StoredField("nf", l); typeAnswer = FieldType.NumericType.LONG; } } doc.Add(nf); doc.Add(sf); answers[id] = answer; typeAnswers[id] = typeAnswer; FieldType ft = new FieldType(IntField.TYPE_STORED); ft.NumericPrecisionStep = int.MaxValue; doc.Add(new IntField("id", id, ft)); w.AddDocument(doc); } DirectoryReader r = w.Reader; w.Dispose(); Assert.AreEqual(numDocs, r.NumDocs); foreach (AtomicReaderContext ctx in r.Leaves) { AtomicReader sub = (AtomicReader)ctx.Reader; FieldCache.Ints ids = FieldCache.DEFAULT.GetInts(sub, "id", false); for (int docID = 0; docID < sub.NumDocs; docID++) { Document doc = sub.Document(docID); Field f = (Field)doc.GetField("nf"); Assert.IsTrue(f is StoredField, "got f=" + f); Assert.AreEqual(answers[ids.Get(docID)], f.NumericValue); } } r.Dispose(); dir.Dispose(); }