/// <summary>
/// Indexes random line-file documents until at least <c>AtLeast(100000)</c> "body"
/// characters are indexed, samples roughly 10 terms from the "body" field and records
/// the single-threaded <see cref="TopDocs"/> for each, then re-runs the same term
/// queries concurrently from several threads (via <c>ThreadAnonymousClass</c>) to
/// verify the searcher returns consistent results under concurrency.
/// </summary>
public virtual void Test()
{
    Directory dir = NewDirectory();
    MockAnalyzer analyzer = new MockAnalyzer(Random);
    // Random (possibly tiny) max token length exercises token-splitting code paths.
    analyzer.MaxTokenLength = TestUtil.NextInt32(Random, 1, IndexWriter.MAX_TERM_LENGTH);
    RandomIndexWriter w = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
        this,
#endif
        Random, dir, analyzer);
    LineFileDocs docs = new LineFileDocs(Random, DefaultCodecSupportsDocValues);
    int charsToIndex = AtLeast(100000);
    int charsIndexed = 0;
    //System.out.println("bytesToIndex=" + charsToIndex);
    while (charsIndexed < charsToIndex)
    {
        Document doc = docs.NextDoc();
        charsIndexed += doc.Get("body").Length;
        w.AddDocument(doc);
        //System.out.println("  bytes=" + charsIndexed + " add: " + doc);
    }
    IndexReader r = w.GetReader();
    //System.out.println("numDocs=" + r.NumDocs);
    w.Dispose();

    IndexSearcher s = NewSearcher(r);
    Terms terms = MultiFields.GetFields(r).GetTerms("body");
    int termCount = 0;
    TermsEnum termsEnum = terms.GetEnumerator();
    while (termsEnum.MoveNext())
    {
        termCount++;
    }
    Assert.IsTrue(termCount > 0);

    // Target ~10 terms to search:
    double chance = 10.0 / termCount;
    termsEnum = terms.GetEnumerator(termsEnum);
    IDictionary<BytesRef, TopDocs> answers = new Dictionary<BytesRef, TopDocs>();
    while (termsEnum.MoveNext())
    {
        if (Random.NextDouble() <= chance)
        {
            // Deep copy: the enum re-uses its BytesRef on each MoveNext().
            BytesRef term = BytesRef.DeepCopyOf(termsEnum.Term);
            answers[term] = s.Search(new TermQuery(new Term("body", term)), 100);
        }
    }

    if (answers.Count > 0)
    {
        // LUCENENET: CountdownEvent owns a wait handle (IDisposable); dispose it
        // deterministically even if a thread assertion causes Join() to throw.
        using (CountdownEvent startingGun = new CountdownEvent(1))
        {
            int numThreads = TestUtil.NextInt32(Random, 2, 5);
            ThreadJob[] threads = new ThreadJob[numThreads];
            for (int threadID = 0; threadID < numThreads; threadID++)
            {
                ThreadJob thread = new ThreadAnonymousClass(this, s, answers, startingGun);
                threads[threadID] = thread;
                thread.Start();
            }
            // Release all searcher threads at once.
            startingGun.Signal();
            foreach (ThreadJob thread in threads)
            {
                thread.Join();
            }
        }
    }
    r.Dispose();
    dir.Dispose();
}
/// <summary>
/// Build the suggest index, using up to the specified
/// amount of temporary RAM while building.  Note that
/// the weights for the suggestions are ignored.
/// </summary>
/// <param name="enumerator">Source of suggestion surface forms. Payloads and contexts are not supported.</param>
/// <param name="ramBufferSizeMB">Maximum RAM (in MB) the temporary <see cref="IndexWriter"/> may buffer while building.</param>
/// <exception cref="ArgumentException">If the enumerator has payloads or contexts, or yields no suggestions,
/// or a token contains the separator byte.</exception>
/// <exception cref="InvalidOperationException">If the temporary index directory cannot be removed afterwards.</exception>
public virtual void Build(IInputEnumerator enumerator, double ramBufferSizeMB)
{
    // This suggester only consumes surface forms; reject unsupported features up front.
    if (enumerator.HasPayloads)
    {
        throw new ArgumentException("this suggester doesn't support payloads");
    }
    if (enumerator.HasContexts)
    {
        throw new ArgumentException("this suggester doesn't support contexts");
    }

    string prefix = this.GetType().Name;
    var directory = OfflineSorter.DefaultTempDir();

    // LUCENENET specific - using GetRandomFileName() instead of picking a random int
    DirectoryInfo tempIndexPath; // LUCENENET: IDE0059: Remove unnecessary value assignment
    while (true)
    {
        // Keep generating random names until the directory is confirmed on disk.
        // NOTE(review): DirectoryInfo.Create() is a no-op if the path already exists,
        // so a pre-existing directory of the same random name would be reused — TODO confirm intended.
        tempIndexPath = new DirectoryInfo(Path.Combine(directory.FullName, prefix + ".index." + Path.GetFileNameWithoutExtension(Path.GetRandomFileName())));
        tempIndexPath.Create();
        if (System.IO.Directory.Exists(tempIndexPath.FullName))
        {
            break;
        }
    }

    Directory dir = FSDirectory.Open(tempIndexPath);
    try
    {
#pragma warning disable 612, 618
        IndexWriterConfig iwc = new IndexWriterConfig(LuceneVersion.LUCENE_CURRENT, indexAnalyzer);
#pragma warning restore 612, 618
        iwc.SetOpenMode(OpenMode.CREATE);
        iwc.SetRAMBufferSizeMB(ramBufferSizeMB);
        IndexWriter writer = new IndexWriter(dir, iwc);

        var ft = new FieldType(TextField.TYPE_NOT_STORED);
        // TODO: if only we had IndexOptions.TERMS_ONLY...
        ft.IndexOptions = IndexOptions.DOCS_AND_FREQS;
        ft.OmitNorms = true;
        ft.Freeze();

        // One reusable document/field; only the field value changes per suggestion.
        Document doc = new Document();
        Field field = new Field("body", "", ft);
        doc.Add(field);

        totTokens = 0;
        IndexReader reader = null;

        bool success = false;
        count = 0;
        try
        {
            // Index each surface form as the "body" of one document; the analyzer
            // produces the terms (ngrams) that end up in the temporary index.
            while (enumerator.MoveNext())
            {
                BytesRef surfaceForm = enumerator.Current;
                field.SetStringValue(surfaceForm.Utf8ToString());
                writer.AddDocument(doc);
                count++;
            }
            reader = DirectoryReader.Open(writer, false);

            Terms terms = MultiFields.GetTerms(reader, "body");
            if (terms == null)
            {
                throw new ArgumentException("need at least one suggestion");
            }

            // Move all ngrams into an FST:
            TermsEnum termsEnum = terms.GetEnumerator(null);
            Outputs<long?> outputs = PositiveInt32Outputs.Singleton;
            Builder<long?> builder = new Builder<long?>(FST.INPUT_TYPE.BYTE1, outputs);

            Int32sRef scratchInts = new Int32sRef();
            while (termsEnum.MoveNext())
            {
                BytesRef term = termsEnum.Term;
                int ngramCount = CountGrams(term);
                if (ngramCount > grams)
                {
                    throw new ArgumentException("tokens must not contain separator byte; got token=" + term + " but gramCount=" + ngramCount + ", which is greater than expected max ngram size=" + grams);
                }
                if (ngramCount == 1)
                {
                    // Sum of unigram frequencies is the normalization total for scoring.
                    totTokens += termsEnum.TotalTermFreq;
                }

                builder.Add(Lucene.Net.Util.Fst.Util.ToInt32sRef(term, scratchInts), EncodeWeight(termsEnum.TotalTermFreq));
            }

            fst = builder.Finish();
            if (fst == null)
            {
                throw new ArgumentException("need at least one suggestion");
            }
            //System.out.println("FST: " + fst.getNodeCount() + " nodes");

            /*
             * PrintWriter pw = new PrintWriter("/x/tmp/out.dot");
             * Util.toDot(fst, pw, true, true);
             * pw.close();
             */

            success = true;
        }
        finally
        {
            // On failure, suppress secondary dispose exceptions so the original propagates.
            if (success)
            {
                IOUtils.Dispose(writer, reader);
            }
            else
            {
                IOUtils.DisposeWhileHandlingException(writer, reader);
            }
        }
    }
    finally
    {
        try
        {
            IOUtils.Dispose(dir);
        }
        finally
        {
            // LUCENENET specific - since we are removing the entire directory anyway,
            // it doesn't make sense to first do a loop in order remove the files.
            // Let the System.IO.Directory.Delete() method handle that.
            // We also need to dispose the Directory instance first before deleting from disk.
            try
            {
                System.IO.Directory.Delete(tempIndexPath.FullName, true);
            }
            catch (Exception e)
            {
                throw new InvalidOperationException("failed to remove " + tempIndexPath, e);
            }
        }
    }
}
/// <summary>
/// Exercises the <see cref="IFieldCache"/> entry points: for each numeric type it checks
/// that repeated requests (with and without an explicit parser) return the cached instance,
/// and that cached values match what was indexed; then verifies GetDocsWithField,
/// GetTermsIndex (including TermsEnum seeking), GetTerms, and GetDocTermOrds, plus
/// graceful behavior on a non-existent field.
/// </summary>
public virtual void Test()
{
#pragma warning disable 612, 618
    IFieldCache cache = FieldCache.DEFAULT;

    // doubles: same cache entry must be returned regardless of setDocsWithField flag or explicit parser.
    FieldCache.Doubles doubles = cache.GetDoubles(reader, "theDouble", Random.NextBoolean());
    Assert.AreSame(doubles, cache.GetDoubles(reader, "theDouble", Random.NextBoolean()), "Second request to cache return same array");
    Assert.AreSame(doubles, cache.GetDoubles(reader, "theDouble", FieldCache.DEFAULT_DOUBLE_PARSER, Random.NextBoolean()), "Second request with explicit parser return same array");
    for (int i = 0; i < NUM_DOCS; i++)
    {
        Assert.IsTrue(doubles.Get(i) == (double.MaxValue - i), doubles.Get(i) + " does not equal: " + (double.MaxValue - i));
    }

    // longs
    FieldCache.Int64s longs = cache.GetInt64s(reader, "theLong", Random.NextBoolean());
    Assert.AreSame(longs, cache.GetInt64s(reader, "theLong", Random.NextBoolean()), "Second request to cache return same array");
    Assert.AreSame(longs, cache.GetInt64s(reader, "theLong", FieldCache.DEFAULT_INT64_PARSER, Random.NextBoolean()), "Second request with explicit parser return same array");
    for (int i = 0; i < NUM_DOCS; i++)
    {
        Assert.IsTrue(longs.Get(i) == (long.MaxValue - i), longs.Get(i) + " does not equal: " + (long.MaxValue - i) + " i=" + i);
    }

    // bytes (sbyte cast because the cache stores unsigned bytes)
    FieldCache.Bytes bytes = cache.GetBytes(reader, "theByte", Random.NextBoolean());
    Assert.AreSame(bytes, cache.GetBytes(reader, "theByte", Random.NextBoolean()), "Second request to cache return same array");
    Assert.AreSame(bytes, cache.GetBytes(reader, "theByte", FieldCache.DEFAULT_BYTE_PARSER, Random.NextBoolean()), "Second request with explicit parser return same array");
    for (int i = 0; i < NUM_DOCS; i++)
    {
        Assert.IsTrue((sbyte)bytes.Get(i) == (sbyte)(sbyte.MaxValue - i), (sbyte)bytes.Get(i) + " does not equal: " + (sbyte.MaxValue - i));
    }

    // shorts
    FieldCache.Int16s shorts = cache.GetInt16s(reader, "theShort", Random.NextBoolean());
    Assert.AreSame(shorts, cache.GetInt16s(reader, "theShort", Random.NextBoolean()), "Second request to cache return same array");
    Assert.AreSame(shorts, cache.GetInt16s(reader, "theShort", FieldCache.DEFAULT_INT16_PARSER, Random.NextBoolean()), "Second request with explicit parser return same array");
    for (int i = 0; i < NUM_DOCS; i++)
    {
        Assert.IsTrue(shorts.Get(i) == (short)(short.MaxValue - i), shorts.Get(i) + " does not equal: " + (short.MaxValue - i));
    }

    // ints
    FieldCache.Int32s ints = cache.GetInt32s(reader, "theInt", Random.NextBoolean());
    Assert.AreSame(ints, cache.GetInt32s(reader, "theInt", Random.NextBoolean()), "Second request to cache return same array");
    Assert.AreSame(ints, cache.GetInt32s(reader, "theInt", FieldCache.DEFAULT_INT32_PARSER, Random.NextBoolean()), "Second request with explicit parser return same array");
    for (int i = 0; i < NUM_DOCS; i++)
    {
        Assert.IsTrue(ints.Get(i) == (int.MaxValue - i), ints.Get(i) + " does not equal: " + (int.MaxValue - i));
    }

    // floats
    FieldCache.Singles floats = cache.GetSingles(reader, "theFloat", Random.NextBoolean());
    Assert.AreSame(floats, cache.GetSingles(reader, "theFloat", Random.NextBoolean()), "Second request to cache return same array");
    Assert.AreSame(floats, cache.GetSingles(reader, "theFloat", FieldCache.DEFAULT_SINGLE_PARSER, Random.NextBoolean()), "Second request with explicit parser return same array");
    for (int i = 0; i < NUM_DOCS; i++)
    {
        Assert.IsTrue(floats.Get(i) == (float.MaxValue - i), floats.Get(i) + " does not equal: " + (float.MaxValue - i));
    }
#pragma warning restore 612, 618

    // "theLong" is set on every doc, so docsWithField must be the MatchAllBits optimization.
    IBits docsWithField = cache.GetDocsWithField(reader, "theLong");
    Assert.AreSame(docsWithField, cache.GetDocsWithField(reader, "theLong"), "Second request to cache return same array");
    Assert.IsTrue(docsWithField is Bits.MatchAllBits, "docsWithField(theLong) must be class Bits.MatchAllBits");
    Assert.IsTrue(docsWithField.Length == NUM_DOCS, "docsWithField(theLong) Size: " + docsWithField.Length + " is not: " + NUM_DOCS);
    for (int i = 0; i < docsWithField.Length; i++)
    {
        Assert.IsTrue(docsWithField.Get(i));
    }

    // "sparse" is only set on every other doc, so MatchAllBits must NOT be used.
    docsWithField = cache.GetDocsWithField(reader, "sparse");
    Assert.AreSame(docsWithField, cache.GetDocsWithField(reader, "sparse"), "Second request to cache return same array");
    Assert.IsFalse(docsWithField is Bits.MatchAllBits, "docsWithField(sparse) must not be class Bits.MatchAllBits");
    Assert.IsTrue(docsWithField.Length == NUM_DOCS, "docsWithField(sparse) Size: " + docsWithField.Length + " is not: " + NUM_DOCS);
    for (int i = 0; i < docsWithField.Length; i++)
    {
        Assert.AreEqual(i % 2 == 0, docsWithField.Get(i));
    }

    // getTermsIndex
    SortedDocValues termsIndex = cache.GetTermsIndex(reader, "theRandomUnicodeString");
    Assert.AreSame(termsIndex, cache.GetTermsIndex(reader, "theRandomUnicodeString"), "Second request to cache return same array");
    BytesRef br = new BytesRef();
    for (int i = 0; i < NUM_DOCS; i++)
    {
        BytesRef term;
        int ord = termsIndex.GetOrd(i);
        if (ord == -1)
        {
            // doc had no value for this field
            term = null;
        }
        else
        {
            termsIndex.LookupOrd(ord, br);
            term = br;
        }
        string s = term == null ? null : term.Utf8ToString();
        Assert.IsTrue(unicodeStrings[i] == null || unicodeStrings[i].Equals(s, StringComparison.Ordinal), "for doc " + i + ": " + s + " does not equal: " + unicodeStrings[i]);
    }

    // Enumerating the terms index must yield the same term for each ord as LookupOrd.
    int nTerms = termsIndex.ValueCount;
    TermsEnum tenum = termsIndex.GetTermsEnum();
    BytesRef val = new BytesRef();
    for (int i = 0; i < nTerms; i++)
    {
        tenum.MoveNext();
        BytesRef val1 = tenum.Term;
        termsIndex.LookupOrd(i, val);
        // System.out.println("i="+i);
        Assert.AreEqual(val, val1);
    }

    // seek the enum around (note this isn't a great test here)
    int num = AtLeast(100);
    for (int i = 0; i < num; i++)
    {
        int k = Random.Next(nTerms);
        termsIndex.LookupOrd(k, val);
        Assert.AreEqual(TermsEnum.SeekStatus.FOUND, tenum.SeekCeil(val));
        Assert.AreEqual(val, tenum.Term);
    }

    for (int i = 0; i < nTerms; i++)
    {
        termsIndex.LookupOrd(i, val);
        Assert.AreEqual(TermsEnum.SeekStatus.FOUND, tenum.SeekCeil(val));
        Assert.AreEqual(val, tenum.Term);
    }

    // test bad field
    termsIndex = cache.GetTermsIndex(reader, "bogusfield");

    // getTerms
    BinaryDocValues terms = cache.GetTerms(reader, "theRandomUnicodeString", true);
    Assert.AreSame(terms, cache.GetTerms(reader, "theRandomUnicodeString", true), "Second request to cache return same array");
    IBits bits = cache.GetDocsWithField(reader, "theRandomUnicodeString");
    for (int i = 0; i < NUM_DOCS; i++)
    {
        terms.Get(i, br);
        BytesRef term;
        if (!bits.Get(i))
        {
            term = null;
        }
        else
        {
            term = br;
        }
        string s = term == null ? null : term.Utf8ToString();
        Assert.IsTrue(unicodeStrings[i] == null || unicodeStrings[i].Equals(s, StringComparison.Ordinal), "for doc " + i + ": " + s + " does not equal: " + unicodeStrings[i]);
    }

    // test bad field
    terms = cache.GetTerms(reader, "bogusfield", false);

    // getDocTermOrds
    SortedSetDocValues termOrds = cache.GetDocTermOrds(reader, "theRandomUnicodeMultiValuedField");
    int numEntries = cache.GetCacheEntries().Length;
    // ask for it again, and check that we didnt create any additional entries:
    termOrds = cache.GetDocTermOrds(reader, "theRandomUnicodeMultiValuedField");
    Assert.AreEqual(numEntries, cache.GetCacheEntries().Length);

    for (int i = 0; i < NUM_DOCS; i++)
    {
        termOrds.SetDocument(i);
        // this will remove identical terms. A DocTermOrds doesn't return duplicate ords for a docId
        ISet<BytesRef> values = new JCG.LinkedHashSet<BytesRef>(multiValued[i]);
        foreach (BytesRef v in values)
        {
            if (v == null)
            {
                // why does this test use null values... instead of an empty list: confusing
                break;
            }
            long ord = termOrds.NextOrd();
            if (Debugging.AssertsEnabled) Debugging.Assert(ord != SortedSetDocValues.NO_MORE_ORDS);
            BytesRef scratch = new BytesRef();
            termOrds.LookupOrd(ord, scratch);
            Assert.AreEqual(v, scratch);
        }
        Assert.AreEqual(SortedSetDocValues.NO_MORE_ORDS, termOrds.NextOrd());
    }

    // test bad field
    termOrds = cache.GetDocTermOrds(reader, "bogusfield");
    Assert.IsTrue(termOrds.ValueCount == 0);

    FieldCache.DEFAULT.PurgeByCacheKey(reader.CoreCacheKey);
}
/// <summary>
/// Search-thread body: until <c>stopTimeMS</c>, repeatedly acquires the current
/// searcher, verifies that merged segments carry "merge" diagnostics and were warmed,
/// smoke-tests the searcher, and runs ~30 term queries sampled from the "body" field.
/// Any exception marks the test failed and is rethrown.
/// NOTE(review): Environment.TickCount wraps after ~24.9 days of uptime — presumably
/// acceptable for a bounded test run; verify if reused elsewhere.
/// </summary>
public override void Run()
{
    if (Verbose)
    {
        Console.WriteLine(Thread.CurrentThread.Name + ": launch search thread");
    }
    while (Environment.TickCount < stopTimeMS)
    {
        try
        {
            IndexSearcher s = outerInstance.GetCurrentSearcher();
            try
            {
                // Verify 1) IW is correctly setting
                // diagnostics, and 2) segment warming for
                // merged segments is actually happening:
                foreach (AtomicReaderContext sub in s.IndexReader.Leaves)
                {
                    SegmentReader segReader = (SegmentReader)sub.Reader;
                    IDictionary<string, string> diagnostics = segReader.SegmentInfo.Info.Diagnostics;
                    assertNotNull(diagnostics);
                    diagnostics.TryGetValue("source", out string source);
                    assertNotNull(source);
                    if (source.Equals("merge", StringComparison.Ordinal))
                    {
                        assertTrue("sub reader " + sub + " wasn't warmed: warmed=" + outerInstance.warmed + " diagnostics=" + diagnostics + " si=" + segReader.SegmentInfo,
                            // LUCENENET: ConditionalWeakTable doesn't have ContainsKey, so we normalize to TryGetValue
                            !outerInstance.m_assertMergedSegmentsWarmed || outerInstance.warmed.TryGetValue(segReader.core, out BooleanRef _));
                    }
                }
                if (s.IndexReader.NumDocs > 0)
                {
                    outerInstance.SmokeTestSearcher(s);
                    Fields fields = MultiFields.GetFields(s.IndexReader);
                    if (fields == null)
                    {
                        // empty index; try again with the next searcher
                        continue;
                    }
                    Terms terms = fields.GetTerms("body");
                    if (terms == null)
                    {
                        continue;
                    }
                    TermsEnum termsEnum = terms.GetEnumerator();
                    int seenTermCount = 0;
                    int shift;
                    int trigger;
                    // shift/trigger sample roughly 30 evenly-spaced terms out of the
                    // (estimated) total term count from the previous full enumeration.
                    if (totTermCount < 30)
                    {
                        shift = 0;
                        trigger = 1;
                    }
                    else
                    {
                        trigger = totTermCount / 30;
                        shift = Random.Next(trigger);
                    }
                    while (Environment.TickCount < stopTimeMS)
                    {
                        if (!termsEnum.MoveNext())
                        {
                            // Enumeration completed: record the observed term count
                            // so the next pass samples with a better estimate.
                            totTermCount.Value = seenTermCount;
                            break;
                        }
                        seenTermCount++;
                        // search 30 terms
                        if ((seenTermCount + shift) % trigger == 0)
                        {
                            //if (VERBOSE) {
                            //System.out.println(Thread.currentThread().getName() + " now search body:" + term.Utf8ToString());
                            //}
                            totHits.AddAndGet(outerInstance.RunQuery(s, new TermQuery(new Term("body", termsEnum.Term))));
                        }
                    }
                    //if (VERBOSE) {
                    //System.out.println(Thread.currentThread().getName() + ": search done");
                    //}
                }
            }
            finally
            {
                // Always release the acquired searcher, even on continue/throw.
                outerInstance.ReleaseSearcher(s);
            }
        }
        catch (Exception t)
        {
            Console.WriteLine(Thread.CurrentThread.Name + ": hit exc");
            outerInstance.m_failed.Value = (true);
            Console.WriteLine(t.ToString());
            throw new Exception(t.ToString(), t);
        }
    }
}
/// <summary>
/// Asserts that two <see cref="Terms"/> instances answer <c>SeekExact</c> and
/// <c>SeekCeil</c> identically. Seek targets are sampled from the left side
/// (exact terms, and terms re-packed at a non-zero offset), shuffled, and then
/// sought on both enums with statuses and landed terms compared.
/// </summary>
private void AssertTermsSeeking(Terms leftTerms, Terms rightTerms)
{
    TermsEnum leftEnum = null;
    TermsEnum rightEnum = null;

    // just an upper bound on how many seek targets we collect
    int numTests = AtLeast(20);
    Random random = Random;

    // Sample targets from the left enum; give up after 10 full sweeps.
    ISet<BytesRef> seekTargets = new JCG.HashSet<BytesRef>();
    for (int pass = 0; pass < 10 && seekTargets.Count < numTests; pass++)
    {
        leftEnum = leftTerms.GetEnumerator(leftEnum);
        while (leftEnum.MoveNext())
        {
            BytesRef candidate = leftEnum.Term;
            switch (random.Next(10))
            {
                case 0:
                    // the term itself (deep copy: the enum re-uses its BytesRef)
                    seekTargets.Add(BytesRef.DeepCopyOf(candidate));
                    break;

                case 1:
                    // truncated subsequence of the term.
                    // NOTE(review): the truncated copy is never added to the target set
                    // (matches the original); the random call is kept for seed parity.
                    BytesRef truncated = BytesRef.DeepCopyOf(candidate);
                    if (truncated.Length > 0)
                    {
                        truncated.Length = random.Next(truncated.Length);
                    }
                    break;

                case 2:
                    // the term, but re-packed so it carries a non-zero offset
                    var padded = new byte[candidate.Length + 5];
                    Array.Copy(candidate.Bytes, candidate.Offset, padded, 5, candidate.Length);
                    seekTargets.Add(new BytesRef(padded, 5, candidate.Length));
                    break;
            }
        }
    }

    IList<BytesRef> shuffled = new JCG.List<BytesRef>(seekTargets);
    shuffled.Shuffle(Random);

    foreach (BytesRef target in shuffled)
    {
        leftEnum = leftTerms.GetEnumerator(leftEnum);
        rightEnum = rightTerms.GetEnumerator(rightEnum);

        // SeekExact must agree; asked twice to exercise the enums' post-seek state.
        Assert.AreEqual(leftEnum.SeekExact(target), rightEnum.SeekExact(target));
        Assert.AreEqual(leftEnum.SeekExact(target), rightEnum.SeekExact(target));

        // SeekCeil must agree on status and, unless END, on the landed term;
        // performed twice, mirroring the exact-seek double-check above.
        for (int repeat = 0; repeat < 2; repeat++)
        {
            SeekStatus leftStatus = leftEnum.SeekCeil(target);
            SeekStatus rightStatus = rightEnum.SeekCeil(target);
            Assert.AreEqual(leftStatus, rightStatus);
            if (leftStatus != SeekStatus.END)
            {
                Assert.AreEqual(leftEnum.Term, rightEnum.Term);
            }
        }
    }
}
/// <summary>
/// Verifies the deprecated 3.x-style <see cref="Field"/> constructor API still behaves:
/// indexes one document using every Store/Index/TermVector combination, then checks
/// stored-field retrieval, searchability of analyzed vs. not-analyzed fields, and the
/// presence/absence of term-vector positions per field.
/// </summary>
public virtual void TestTransitionAPI()
{
    Directory dir = NewDirectory();
    RandomIndexWriter w = new RandomIndexWriter(Random, dir);

    Documents.Document doc = new Documents.Document();
    // All of these Field constructors are obsolete; that is the point of the test.
#pragma warning disable 612, 618
    doc.Add(new Field("stored", "abc", Field.Store.YES, Field.Index.NO));
    doc.Add(new Field("stored_indexed", "abc xyz", Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.Add(new Field("stored_tokenized", "abc xyz", Field.Store.YES, Field.Index.ANALYZED));
    doc.Add(new Field("indexed", "abc xyz", Field.Store.NO, Field.Index.NOT_ANALYZED));
    doc.Add(new Field("tokenized", "abc xyz", Field.Store.NO, Field.Index.ANALYZED));
    doc.Add(new Field("tokenized_reader", new StringReader("abc xyz")));
    doc.Add(new Field("tokenized_tokenstream", w.IndexWriter.Analyzer.GetTokenStream("tokenized_tokenstream", new StringReader("abc xyz"))));
    doc.Add(new Field("binary", new byte[10]));
    doc.Add(new Field("tv", "abc xyz", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES));
    doc.Add(new Field("tv_pos", "abc xyz", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS));
    doc.Add(new Field("tv_off", "abc xyz", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_OFFSETS));
    doc.Add(new Field("tv_pos_off", "abc xyz", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
#pragma warning restore 612, 618
    w.AddDocument(doc);
    IndexReader r = w.GetReader();
    w.Dispose();

    doc = r.Document(0);
    // 4 stored fields
    Assert.AreEqual(4, doc.Fields.Count);
    Assert.AreEqual("abc", doc.Get("stored"));
    Assert.AreEqual("abc xyz", doc.Get("stored_indexed"));
    Assert.AreEqual("abc xyz", doc.Get("stored_tokenized"));
    BytesRef br = doc.GetBinaryValue("binary");
    Assert.IsNotNull(br);
    Assert.AreEqual(10, br.Length);

    IndexSearcher s = new IndexSearcher(r);
    // NOT_ANALYZED fields index the whole value as one term; ANALYZED fields split into tokens.
    Assert.AreEqual(1, s.Search(new TermQuery(new Term("stored_indexed", "abc xyz")), 1).TotalHits);
    Assert.AreEqual(1, s.Search(new TermQuery(new Term("stored_tokenized", "abc")), 1).TotalHits);
    Assert.AreEqual(1, s.Search(new TermQuery(new Term("stored_tokenized", "xyz")), 1).TotalHits);
    Assert.AreEqual(1, s.Search(new TermQuery(new Term("indexed", "abc xyz")), 1).TotalHits);
    Assert.AreEqual(1, s.Search(new TermQuery(new Term("tokenized", "abc")), 1).TotalHits);
    Assert.AreEqual(1, s.Search(new TermQuery(new Term("tokenized", "xyz")), 1).TotalHits);
    Assert.AreEqual(1, s.Search(new TermQuery(new Term("tokenized_reader", "abc")), 1).TotalHits);
    Assert.AreEqual(1, s.Search(new TermQuery(new Term("tokenized_reader", "xyz")), 1).TotalHits);
    Assert.AreEqual(1, s.Search(new TermQuery(new Term("tokenized_tokenstream", "abc")), 1).TotalHits);
    Assert.AreEqual(1, s.Search(new TermQuery(new Term("tokenized_tokenstream", "xyz")), 1).TotalHits);

    foreach (string field in new string[] { "tv", "tv_pos", "tv_off", "tv_pos_off" })
    {
        Fields tvFields = r.GetTermVectors(0);
        Terms tvs = tvFields.GetTerms(field);
        Assert.IsNotNull(tvs);
        Assert.AreEqual(2, tvs.Count);
        TermsEnum tvsEnum = tvs.GetEnumerator();
        Assert.IsTrue(tvsEnum.MoveNext());
        Assert.AreEqual(new BytesRef("abc"), tvsEnum.Term);
        DocsAndPositionsEnum dpEnum = tvsEnum.DocsAndPositions(null, null);
        // plain "tv" stores no positions/offsets, so no positions enum is available
        if (field.Equals("tv", StringComparison.Ordinal))
        {
            Assert.IsNull(dpEnum);
        }
        else
        {
            Assert.IsNotNull(dpEnum);
        }
        Assert.IsTrue(tvsEnum.MoveNext());
        Assert.AreEqual(new BytesRef("xyz"), tvsEnum.Term);
        Assert.IsFalse(tvsEnum.MoveNext());
    }

    r.Dispose();
    dir.Dispose();
}
/// <summary>
/// Tests <see cref="MultiPhraseQuery"/> prefix expansion: expands "pi*" and "blue*" via
/// term enumeration, adds the expansions as a single phrase position, and verifies hit
/// counts, query <c>ToString()</c> rendering, slop behavior, and that mixing fields in
/// one MultiPhraseQuery throws.
/// </summary>
public virtual void TestPhrasePrefix()
{
    Directory indexStore = NewDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(Random, indexStore);
    Add("blueberry pie", writer);
    Add("blueberry strudel", writer);
    Add("blueberry pizza", writer);
    Add("blueberry chewing gum", writer);
    Add("bluebird pizza", writer);
    Add("bluebird foobar pizza", writer);
    Add("piccadilly circus", writer);

    IndexReader reader = writer.GetReader();
    IndexSearcher searcher = NewSearcher(reader);

    // search for "blueberry pi*":
    MultiPhraseQuery query1 = new MultiPhraseQuery();
    // search for "strawberry pi*":
    MultiPhraseQuery query2 = new MultiPhraseQuery();
    query1.Add(new Term("body", "blueberry"));
    query2.Add(new Term("body", "strawberry"));
    LinkedList<Term> termsWithPrefix = new LinkedList<Term>();

    // this TermEnum gives "piccadilly", "pie" and "pizza".
    string prefix = "pi";
    TermsEnum te = MultiFields.GetFields(reader).GetTerms("body").GetEnumerator();
    te.SeekCeil(new BytesRef(prefix));
    // Terms are sorted, so we can stop at the first non-matching term.
    do
    {
        string s = te.Term.Utf8ToString();
        if (s.StartsWith(prefix, StringComparison.Ordinal))
        {
            termsWithPrefix.AddLast(new Term("body", s));
        }
        else
        {
            break;
        }
    } while (te.MoveNext());

    // All expansions are added at the same phrase position.
    query1.Add(termsWithPrefix.ToArray(/*new Term[0]*/));
    Assert.AreEqual("body:\"blueberry (piccadilly pie pizza)\"", query1.ToString());
    query2.Add(termsWithPrefix.ToArray(/*new Term[0]*/));
    Assert.AreEqual("body:\"strawberry (piccadilly pie pizza)\"", query2.ToString());

    ScoreDoc[] result;
    result = searcher.Search(query1, null, 1000).ScoreDocs;
    Assert.AreEqual(2, result.Length);
    result = searcher.Search(query2, null, 1000).ScoreDocs;
    Assert.AreEqual(0, result.Length);

    // search for "blue* pizza":
    MultiPhraseQuery query3 = new MultiPhraseQuery();
    termsWithPrefix.Clear();
    prefix = "blue";
    te.SeekCeil(new BytesRef(prefix));
    // No early break here: scans to the end, collecting only "blue*" terms.
    do
    {
        if (te.Term.Utf8ToString().StartsWith(prefix, StringComparison.Ordinal))
        {
            termsWithPrefix.AddLast(new Term("body", te.Term.Utf8ToString()));
        }
    } while (te.MoveNext());
    query3.Add(termsWithPrefix.ToArray(/*new Term[0]*/));
    query3.Add(new Term("body", "pizza"));

    result = searcher.Search(query3, null, 1000).ScoreDocs;
    Assert.AreEqual(2, result.Length); // blueberry pizza, bluebird pizza
    Assert.AreEqual("body:\"(blueberry bluebird) pizza\"", query3.ToString());

    // test slop:
    query3.Slop = 1;
    result = searcher.Search(query3, null, 1000).ScoreDocs;

    // just make sure no exc:
    searcher.Explain(query3, 0);

    Assert.AreEqual(3, result.Length); // blueberry pizza, bluebird pizza, bluebird
    // foobar pizza

    MultiPhraseQuery query4 = new MultiPhraseQuery();
    try
    {
        query4.Add(new Term("field1", "foo"));
        query4.Add(new Term("field2", "foobar"));
        Assert.Fail();
    }
    catch (Exception e) when (e.IsIllegalArgumentException())
    {
        // okay, all terms must belong to the same field
    }

    writer.Dispose();
    reader.Dispose();
    indexStore.Dispose();
}
/// <summary>
/// Builds a random index, finds the 5 highest- and 5 lowest-document-frequency terms
/// in "body", then checks that a <see cref="CommonTermsQuery"/> (with the cutoff set
/// between the two groups) returns exactly the same hits as an equivalent
/// <see cref="BooleanQuery"/> over just the low-frequency terms.
/// </summary>
public void TestRandomIndex()
{
    Directory dir = NewDirectory();
    MockAnalyzer analyzer = new MockAnalyzer(Random);
    analyzer.MaxTokenLength = TestUtil.NextInt32(Random, 1, IndexWriter.MAX_TERM_LENGTH);
    RandomIndexWriter w = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
        this,
#endif
        Random, dir, analyzer);
    CreateRandomIndex(AtLeast(50), w, Random.NextInt64());
    DirectoryReader reader = w.GetReader();
    AtomicReader wrapper = SlowCompositeReaderWrapper.Wrap(reader);
    string field = @"body";
    Terms terms = wrapper.GetTerms(field);
    // Bounded priority queues keep the 5 lowest / 5 highest DocFreq terms seen.
    var lowFreqQueue = new PriorityQueueAnonymousClass(5);
    var highFreqQueue = new PriorityQueueAnonymousClass1(5);
    try
    {
        TermsEnum iterator = terms.GetEnumerator();
        while (iterator.MoveNext())
        {
            if (highFreqQueue.Count < 5)
            {
                highFreqQueue.Add(new TermAndFreq(BytesRef.DeepCopyOf(iterator.Term), iterator.DocFreq));
                lowFreqQueue.Add(new TermAndFreq(BytesRef.DeepCopyOf(iterator.Term), iterator.DocFreq));
            }
            else
            {
                // Replace the queue head in place, then restore heap order.
                if (highFreqQueue.Top.freq < iterator.DocFreq)
                {
                    highFreqQueue.Top.freq = iterator.DocFreq;
                    highFreqQueue.Top.term = BytesRef.DeepCopyOf(iterator.Term);
                    highFreqQueue.UpdateTop();
                }

                if (lowFreqQueue.Top.freq > iterator.DocFreq)
                {
                    lowFreqQueue.Top.freq = iterator.DocFreq;
                    lowFreqQueue.Top.term = BytesRef.DeepCopyOf(iterator.Term);
                    lowFreqQueue.UpdateTop();
                }
            }
        }
        int lowFreq = lowFreqQueue.Top.freq;
        int highFreq = highFreqQueue.Top.freq;
        // Need a real gap so the cutoff (highFreq - 1) separates the two groups.
        AssumeTrue(@"unlucky index", highFreq - 1 > lowFreq);
        List<TermAndFreq> highTerms = QueueToList(highFreqQueue);
        List<TermAndFreq> lowTerms = QueueToList(lowFreqQueue);

        IndexSearcher searcher = NewSearcher(reader);
        Occur lowFreqOccur = RandomOccur(Random);
        BooleanQuery verifyQuery = new BooleanQuery();
        CommonTermsQuery cq = new CommonTermsQuery(RandomOccur(Random), lowFreqOccur, highFreq - 1, Random.NextBoolean());
        // The verify query only contains the low-frequency terms: above the cutoff,
        // CommonTermsQuery hits are expected to match the low-frequency clause set.
        foreach (TermAndFreq termAndFreq in lowTerms)
        {
            cq.Add(new Term(field, termAndFreq.term));
            verifyQuery.Add(new BooleanClause(new TermQuery(new Term(field, termAndFreq.term)), lowFreqOccur));
        }
        foreach (TermAndFreq termAndFreq in highTerms)
        {
            cq.Add(new Term(field, termAndFreq.term));
        }

        TopDocs cqSearch = searcher.Search(cq, reader.MaxDoc);
        TopDocs verifySearch = searcher.Search(verifyQuery, reader.MaxDoc);
        assertEquals(verifySearch.TotalHits, cqSearch.TotalHits);
        // The two result sets must be identical doc-id sets, not just equal in size.
        var hits = new JCG.HashSet<int>();
        foreach (ScoreDoc doc in verifySearch.ScoreDocs)
        {
            hits.Add(doc.Doc);
        }

        foreach (ScoreDoc doc in cqSearch.ScoreDocs)
        {
            assertTrue(hits.Remove(doc.Doc));
        }

        assertTrue(hits.Count == 0);

        /*
         * need to force merge here since QueryUtils adds checks based
         * on leave readers which have different statistics than the top
         * level reader if we have more than one segment. This could
         * result in a different query / results.
         */
        w.ForceMerge(1);
        DirectoryReader reader2 = w.GetReader();
        QueryUtils.Check(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
            this,
#endif
            Random, cq, NewSearcher(reader2));
        reader2.Dispose();
    }
    finally
    {
        reader.Dispose();
        wrapper.Dispose();
        w.Dispose();
        dir.Dispose();
    }
}
//public static void main( string[] args ) throws Exception { // Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_CURRENT); // QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, "f", analyzer ); // Query query = parser.parse( "a x:b" ); // FieldQuery fieldQuery = new FieldQuery( query, true, false ); // Directory dir = new RAMDirectory(); // IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)); // Document doc = new Document(); // IndexableFieldType ft = new IndexableFieldType(TextField.TYPE_STORED); // ft.setStoreTermVectors(true); // ft.setStoreTermVectorOffsets(true); // ft.setStoreTermVectorPositions(true); // doc.add( new Field( "f", ft, "a a a b b c a b b c d e f" ) ); // doc.add( new Field( "f", ft, "b a b a f" ) ); // writer.addDocument( doc ); // writer.close(); // IndexReader reader = IndexReader.open(dir1); // new FieldTermStack( reader, 0, "f", fieldQuery ); // reader.close(); //} /// <summary> /// a constructor. 
/// </summary>
/// <param name="reader"><see cref="IndexReader"/> of the index</param>
/// <param name="docId">document id to be highlighted</param>
/// <param name="fieldName">field of the document to be highlighted</param>
/// <param name="fieldQuery"><see cref="FieldQuery"/> object</param>
/// <exception cref="IOException">If there is a low-level I/O error</exception>
public FieldTermStack(IndexReader reader, int docId, string fieldName, FieldQuery fieldQuery)
{
    this.fieldName = fieldName;

    ISet<string> termSet = fieldQuery.GetTermSet(fieldName);
    // just return to make null snippet if un-matched fieldName specified when fieldMatch == true
    if (termSet is null)
    {
        return;
    }

    Fields vectors = reader.GetTermVectors(docId);
    if (vectors is null)
    {
        // null snippet
        return;
    }

    Terms vector = vectors.GetTerms(fieldName);
    if (vector is null)
    {
        // null snippet
        return;
    }

    CharsRef spare = new CharsRef();
    TermsEnum termsEnum = vector.GetEnumerator();
    DocsAndPositionsEnum dpEnum = null;
    BytesRef text;

    int numDocs = reader.MaxDoc;

    // Collect a TermInfo (position + offsets + IDF-style weight) for every
    // occurrence of every query term in this document's term vector.
    while (termsEnum.MoveNext())
    {
        text = termsEnum.Term;
        UnicodeUtil.UTF8toUTF16(text, spare);
        string term = spare.ToString();
        if (!termSet.Contains(term))
        {
            // not a query term; skip
            continue;
        }
        dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
        if (dpEnum is null)
        {
            // null snippet
            return;
        }

        dpEnum.NextDoc();

        // For weight look here: http://lucene.apache.org/core/3_6_0/api/core/org/apache/lucene/search/DefaultSimilarity.html
        float weight = (float)(Math.Log(numDocs / (double)(reader.DocFreq(new Term(fieldName, text)) + 1)) + 1.0);

        int freq = dpEnum.Freq;

        for (int i = 0; i < freq; i++)
        {
            int pos = dpEnum.NextPosition();

            if (dpEnum.StartOffset < 0)
            {
                return; // no offsets, null snippet
            }
            termList.Add(new TermInfo(term, dpEnum.StartOffset, dpEnum.EndOffset, pos, weight));
        }
    }

    // sort by position
    CollectionUtil.TimSort(termList);

    // now look for dups at the same position, linking them together
    // (duplicates are removed from the flat list and chained via SetNext into a
    // circular ring headed by the first TermInfo at that position)
    int currentPos = -1;
    TermInfo previous = null;
    TermInfo first = null;

    for (int i = 0; i < termList.Count;)
    {
        TermInfo current = termList[i];

        if (current.Position == currentPos)
        {
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(previous != null);
            }
            previous.SetNext(current);
            previous = current;
            //iterator.Remove();
            // LUCENENET NOTE: Remove, but don't advance the i position (since removing will advance to the next item)
            termList.RemoveAt(i);
        }
        else
        {
            if (previous != null)
            {
                // close the ring for the previous position group
                previous.SetNext(first);
            }
            previous = first = current;
            currentPos = current.Position;
            // LUCENENET NOTE: Only increment the position if we don't do a delete.
            i++;
        }
    }

    if (previous != null)
    {
        // close the ring for the final position group
        previous.SetNext(first);
    }
}
/// <summary>
/// Low level api. Returns a token stream generated from a <see cref="Terms"/>. This
/// can be used to feed the highlighter with a pre-parsed token
/// stream. The <see cref="Terms"/> must have offsets available.
/// <para/>
/// In my tests the speeds to recreate 1000 token streams using this method are:
/// <list type="bullet">
/// <item><description>
/// with TermVector offset only data stored - 420 milliseconds
/// </description></item>
/// <item><description>
/// with TermVector offset AND position data stored - 271 milliseconds
/// (nb timings for TermVector with position data are based on a tokenizer with contiguous
/// positions - no overlaps or gaps)
/// </description></item>
/// <item><description>
/// The cost of not using TermPositionVector to store
/// pre-parsed content and using an analyzer to re-parse the original content:
/// - reanalyzing the original content - 980 milliseconds
/// </description></item>
/// </list>
///
/// The re-analyze timings will typically vary depending on -
/// <list type="number">
/// <item><description>
/// The complexity of the analyzer code (timings above were using a
/// stemmer/lowercaser/stopword combo)
/// </description></item>
/// <item><description>
/// The number of other fields (Lucene reads ALL fields off the disk
/// when accessing just one document field - can cost dear!)
/// </description></item>
/// <item><description>
/// Use of compression on field storage - could be faster due to compression (less disk IO)
/// or slower (more CPU burn) depending on the content.
/// </description></item>
/// </list>
/// </summary>
/// <param name="tpv">term vector terms for a single document's field</param>
/// <param name="tokenPositionsGuaranteedContiguous">true if the token position numbers have no overlaps or gaps. If looking
/// to eek out the last drops of performance, set to true. If in doubt, set to false.</param>
/// <exception cref="ArgumentException">if no offsets are available</exception>
public static TokenStream GetTokenStream(Terms tpv, bool tokenPositionsGuaranteedContiguous)
{
    if (!tpv.HasOffsets)
    {
        throw new ArgumentException("Cannot create TokenStream from Terms without offsets");
    }

    if (!tokenPositionsGuaranteedContiguous && tpv.HasPositions)
    {
        // positions are stored but may have gaps/overlaps; use the dedicated stream
        return (new TokenStreamFromTermPositionVector(tpv));
    }

    bool hasPayloads = tpv.HasPayloads;

    // code to reconstruct the original sequence of Tokens
    // First pass: count total token occurrences so we can size the array.
    TermsEnum termsEnum = tpv.GetEnumerator();
    int totalTokens = 0;
    while (termsEnum.MoveNext())
    {
        totalTokens += (int)termsEnum.TotalTermFreq;
    }
    Token[] tokensInOriginalOrder = new Token[totalTokens];
    List<Token> unsortedTokens = null;
    // Second pass: materialize a Token for every occurrence of every term.
    termsEnum = tpv.GetEnumerator();
    DocsAndPositionsEnum dpEnum = null;
    while (termsEnum.MoveNext())
    {
        dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
        if (dpEnum == null)
        {
            throw new ArgumentException("Required TermVector Offset information was not found");
        }
        string term = termsEnum.Term.Utf8ToString();

        dpEnum.NextDoc();
        int freq = dpEnum.Freq;
        for (int posUpto = 0; posUpto < freq; posUpto++)
        {
            int pos = dpEnum.NextPosition();
            if (dpEnum.StartOffset < 0)
            {
                throw new ArgumentException("Required TermVector Offset information was not found");
            }
            Token token = new Token(term, dpEnum.StartOffset, dpEnum.EndOffset);
            if (hasPayloads)
            {
                // Must make a deep copy of the returned payload,
                // since D&PEnum API is allowed to re-use on every
                // call:
                token.Payload = BytesRef.DeepCopyOf(dpEnum.GetPayload());
            }

            if (tokenPositionsGuaranteedContiguous && pos != -1)
            {
                // We have positions stored and a guarantee that the token position
                // information is contiguous

                // This may be fast BUT wont work if Tokenizers used which create >1
                // token in same position or
                // creates jumps in position numbers - this code would fail under those
                // circumstances

                // tokens stored with positions - can use this to index straight into
                // sorted array
                tokensInOriginalOrder[pos] = token;
            }
            else
            {
                // tokens NOT stored with positions or not guaranteed contiguous - must
                // add to list and sort later
                if (unsortedTokens == null)
                {
                    unsortedTokens = new List<Token>();
                }
                unsortedTokens.Add(token);
            }
        }
    }

    // If the field has been stored without position data we must perform a sort
    if (unsortedTokens != null)
    {
        tokensInOriginalOrder = unsortedTokens.ToArray();
        ArrayUtil.TimSort(tokensInOriginalOrder, new TokenComparer());
        //tokensInOriginalOrder = tokensInOriginalOrder
        //    .OrderBy(t => t, new TokenComparer() )
        //    .ToArray();
    }
    return (new StoredTokenStream(tokensInOriginalOrder));
}
/// <summary>
/// Safe (but, slowish) default method to write every
/// vector field in the document.
/// </summary>
/// <param name="vectors">the term vectors for one document, or <c>null</c> when the document has none</param>
/// <param name="mergeState">merge state used to resolve <see cref="FieldInfo"/>s by name</param>
protected void AddAllDocVectors(Fields vectors, MergeState mergeState)
{
    if (vectors == null)
    {
        // document has no vectors: still emit an (empty) per-document entry
        StartDocument(0);
        FinishDocument();
        return;
    }

    int numFields = vectors.Count;
    if (numFields == -1)
    {
        // count manually! TODO: Maybe enforce that Fields.size() returns something valid?
        numFields = 0;
        //for (IEnumerator<string> it = vectors.Iterator(); it.hasNext();)
        foreach (string it in vectors)
        {
            numFields++;
        }
    }

    StartDocument(numFields);

    string lastFieldName = null;

    // enumerators are reused across fields/terms to avoid per-iteration allocation
    TermsEnum termsEnum = null;
    DocsAndPositionsEnum docsAndPositionsEnum = null;

    int fieldCount = 0;
    foreach (string fieldName in vectors)
    {
        fieldCount++;
        FieldInfo fieldInfo = mergeState.FieldInfos.FieldInfo(fieldName);

        // fields must arrive in strictly increasing name order
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(lastFieldName == null || fieldName.CompareToOrdinal(lastFieldName) > 0, "lastFieldName={0} fieldName={1}", lastFieldName, fieldName);
        }
        lastFieldName = fieldName;

        Terms terms = vectors.GetTerms(fieldName);
        if (terms == null)
        {
            // FieldsEnum shouldn't lie...
            continue;
        }

        bool hasPositions = terms.HasPositions;
        bool hasOffsets = terms.HasOffsets;
        bool hasPayloads = terms.HasPayloads;
        if (Debugging.AssertsEnabled)
        {
            // payloads can only exist when positions do
            Debugging.Assert(!hasPayloads || hasPositions);
        }

        int numTerms = (int)terms.Count;
        if (numTerms == -1)
        {
            // count manually. It is stupid, but needed, as Terms.size() is not a mandatory statistics function
            numTerms = 0;
            termsEnum = terms.GetEnumerator(termsEnum);
            while (termsEnum.MoveNext())
            {
                numTerms++;
            }
        }

        StartField(fieldInfo, numTerms, hasPositions, hasOffsets, hasPayloads);
        termsEnum = terms.GetEnumerator(termsEnum);

        int termCount = 0;
        while (termsEnum.MoveNext())
        {
            termCount++;

            // for a single-doc term vector, TotalTermFreq is the within-doc freq
            int freq = (int)termsEnum.TotalTermFreq;

            StartTerm(termsEnum.Term, freq);

            if (hasPositions || hasOffsets)
            {
                docsAndPositionsEnum = termsEnum.DocsAndPositions(null, docsAndPositionsEnum);
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(docsAndPositionsEnum != null);
                }

                int docID = docsAndPositionsEnum.NextDoc();
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(docID != DocIdSetIterator.NO_MORE_DOCS);
                    Debugging.Assert(docsAndPositionsEnum.Freq == freq);
                }

                for (int posUpto = 0; posUpto < freq; posUpto++)
                {
                    int pos = docsAndPositionsEnum.NextPosition();
                    int startOffset = docsAndPositionsEnum.StartOffset;
                    int endOffset = docsAndPositionsEnum.EndOffset;

                    BytesRef payload = docsAndPositionsEnum.GetPayload();

                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(!hasPositions || pos >= 0);
                    }
                    AddPosition(pos, startOffset, endOffset, payload);
                }
            }
            FinishTerm();
        }
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(termCount == numTerms);
        }
        FinishField();
    }
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(fieldCount == numFields);
    }
    FinishDocument();
}
protected internal override void NextTerm() { m_mergeTerm = tenum.MoveNext() ? tenum.Term : null; }
/// <summary>
/// checks the terms enum sequentially
/// if deep is false, it does a 'shallow' test that doesnt go down to the docsenums
/// </summary>
/// <param name="leftTermsEnum">expected enumerator</param>
/// <param name="rightTermsEnum">actual enumerator, advanced in lockstep with the left</param>
/// <param name="deep">when true, also compares docs/positions enums under every flag combination,
/// both with and without a random liveDocs filter, and with skipping</param>
public virtual void AssertTermsEnum(TermsEnum leftTermsEnum, TermsEnum rightTermsEnum, bool deep)
{
    IBits randomBits = new RandomBits(MAXDOC, Random.NextDouble(), Random);
    // enums are intentionally reused across calls to exercise the reuse paths of the API
    DocsAndPositionsEnum leftPositions = null;
    DocsAndPositionsEnum rightPositions = null;
    DocsEnum leftDocs = null;
    DocsEnum rightDocs = null;

    while (leftTermsEnum.MoveNext())
    {
        Assert.IsTrue(rightTermsEnum.MoveNext());
        Assert.AreEqual(leftTermsEnum.Term, rightTermsEnum.Term);
        AssertTermStats(leftTermsEnum, rightTermsEnum);
        if (deep)
        {
            // with payloads + off
            AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions),
                rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions));
            AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions),
                rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions));

            AssertPositionsSkipping(leftTermsEnum.DocFreq,
                leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions),
                rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions));
            AssertPositionsSkipping(leftTermsEnum.DocFreq,
                leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions),
                rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions));

            // with payloads only
            AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions, DocsAndPositionsFlags.PAYLOADS),
                rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions, DocsAndPositionsFlags.PAYLOADS));
            AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions, DocsAndPositionsFlags.PAYLOADS),
                rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions, DocsAndPositionsFlags.PAYLOADS));

            AssertPositionsSkipping(leftTermsEnum.DocFreq,
                leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions, DocsAndPositionsFlags.PAYLOADS),
                rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions, DocsAndPositionsFlags.PAYLOADS));
            AssertPositionsSkipping(leftTermsEnum.DocFreq,
                leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions, DocsAndPositionsFlags.PAYLOADS),
                rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions, DocsAndPositionsFlags.PAYLOADS));

            // with offsets only
            AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions, DocsAndPositionsFlags.OFFSETS),
                rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions, DocsAndPositionsFlags.OFFSETS));
            AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions, DocsAndPositionsFlags.OFFSETS),
                rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions, DocsAndPositionsFlags.OFFSETS));

            AssertPositionsSkipping(leftTermsEnum.DocFreq,
                leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions, DocsAndPositionsFlags.OFFSETS),
                rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions, DocsAndPositionsFlags.OFFSETS));
            AssertPositionsSkipping(leftTermsEnum.DocFreq,
                leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions, DocsAndPositionsFlags.OFFSETS),
                rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions, DocsAndPositionsFlags.OFFSETS));

            // with positions only
            AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions, DocsAndPositionsFlags.NONE),
                rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions, DocsAndPositionsFlags.NONE));
            AssertDocsAndPositionsEnum(leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions, DocsAndPositionsFlags.NONE),
                rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions, DocsAndPositionsFlags.NONE));

            AssertPositionsSkipping(leftTermsEnum.DocFreq,
                leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions, DocsAndPositionsFlags.NONE),
                rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions, DocsAndPositionsFlags.NONE));
            AssertPositionsSkipping(leftTermsEnum.DocFreq,
                leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions, DocsAndPositionsFlags.NONE),
                rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions, DocsAndPositionsFlags.NONE));

            // with freqs:
            AssertDocsEnum(leftDocs = leftTermsEnum.Docs(null, leftDocs),
                rightDocs = rightTermsEnum.Docs(null, rightDocs));
            AssertDocsEnum(leftDocs = leftTermsEnum.Docs(randomBits, leftDocs),
                rightDocs = rightTermsEnum.Docs(randomBits, rightDocs));

            // w/o freqs:
            AssertDocsEnum(leftDocs = leftTermsEnum.Docs(null, leftDocs, DocsFlags.NONE),
                rightDocs = rightTermsEnum.Docs(null, rightDocs, DocsFlags.NONE));
            AssertDocsEnum(leftDocs = leftTermsEnum.Docs(randomBits, leftDocs, DocsFlags.NONE),
                rightDocs = rightTermsEnum.Docs(randomBits, rightDocs, DocsFlags.NONE));

            // with freqs:
            AssertDocsSkipping(leftTermsEnum.DocFreq,
                leftDocs = leftTermsEnum.Docs(null, leftDocs),
                rightDocs = rightTermsEnum.Docs(null, rightDocs));
            AssertDocsSkipping(leftTermsEnum.DocFreq,
                leftDocs = leftTermsEnum.Docs(randomBits, leftDocs),
                rightDocs = rightTermsEnum.Docs(randomBits, rightDocs));

            // w/o freqs:
            AssertDocsSkipping(leftTermsEnum.DocFreq,
                leftDocs = leftTermsEnum.Docs(null, leftDocs, DocsFlags.NONE),
                rightDocs = rightTermsEnum.Docs(null, rightDocs, DocsFlags.NONE));
            AssertDocsSkipping(leftTermsEnum.DocFreq,
                leftDocs = leftTermsEnum.Docs(randomBits, leftDocs, DocsFlags.NONE),
                rightDocs = rightTermsEnum.Docs(randomBits, rightDocs, DocsFlags.NONE));
        }
    }
    // right enum must be exhausted at exactly the same point as the left
    Assert.IsFalse(rightTermsEnum.MoveNext());
}
public virtual void TestPhrasePrefix() { Directory indexStore = NewDirectory(); RandomIndexWriter writer = new RandomIndexWriter(Random, indexStore); Document doc1 = new Document(); Document doc2 = new Document(); Document doc3 = new Document(); Document doc4 = new Document(); Document doc5 = new Document(); doc1.Add(NewTextField("body", "blueberry pie", Field.Store.YES)); doc2.Add(NewTextField("body", "blueberry strudel", Field.Store.YES)); doc3.Add(NewTextField("body", "blueberry pizza", Field.Store.YES)); doc4.Add(NewTextField("body", "blueberry chewing gum", Field.Store.YES)); doc5.Add(NewTextField("body", "piccadilly circus", Field.Store.YES)); writer.AddDocument(doc1); writer.AddDocument(doc2); writer.AddDocument(doc3); writer.AddDocument(doc4); writer.AddDocument(doc5); IndexReader reader = writer.GetReader(); writer.Dispose(); IndexSearcher searcher = NewSearcher(reader); // PhrasePrefixQuery query1 = new PhrasePrefixQuery(); MultiPhraseQuery query1 = new MultiPhraseQuery(); // PhrasePrefixQuery query2 = new PhrasePrefixQuery(); MultiPhraseQuery query2 = new MultiPhraseQuery(); query1.Add(new Term("body", "blueberry")); query2.Add(new Term("body", "strawberry")); LinkedList <Term> termsWithPrefix = new LinkedList <Term>(); // this TermEnum gives "piccadilly", "pie" and "pizza". string prefix = "pi"; TermsEnum te = MultiFields.GetFields(reader).GetTerms("body").GetEnumerator(); te.SeekCeil(new BytesRef(prefix)); do { string s = te.Term.Utf8ToString(); if (s.StartsWith(prefix, StringComparison.Ordinal)) { termsWithPrefix.AddLast(new Term("body", s)); } else { break; } } while (te.MoveNext()); query1.Add(termsWithPrefix.ToArray(/*new Term[0]*/)); query2.Add(termsWithPrefix.ToArray(/*new Term[0]*/)); ScoreDoc[] result; result = searcher.Search(query1, null, 1000).ScoreDocs; Assert.AreEqual(2, result.Length); result = searcher.Search(query2, null, 1000).ScoreDocs; Assert.AreEqual(0, result.Length); reader.Dispose(); indexStore.Dispose(); }
///<summary>Constructor</summary> /// <param name="vector"> /// Terms that contains the data for /// creating the <see cref="TokenStream"/>. Must have positions and offsets. /// </param> public TokenStreamFromTermPositionVector(Terms vector) { termAttribute = AddAttribute <ICharTermAttribute>(); positionIncrementAttribute = AddAttribute <IPositionIncrementAttribute>(); offsetAttribute = AddAttribute <IOffsetAttribute>(); payloadAttribute = AddAttribute <IPayloadAttribute>(); bool hasOffsets = vector.HasOffsets; bool hasPayloads = vector.HasPayloads; TermsEnum termsEnum = vector.GetEnumerator(); BytesRef text; DocsAndPositionsEnum dpEnum = null; while (termsEnum.MoveNext()) { text = termsEnum.Term; dpEnum = termsEnum.DocsAndPositions(null, dpEnum); dpEnum.NextDoc(); int freq = dpEnum.Freq; for (int j = 0; j < freq; j++) { int pos = dpEnum.NextPosition(); Token token; if (hasOffsets) { token = new Token(text.Utf8ToString(), dpEnum.StartOffset, dpEnum.EndOffset); } else { token = new Token(); token.SetEmpty().Append(text.Utf8ToString()); } if (hasPayloads) { // Must make a deep copy of the returned payload, // since D&PEnum API is allowed to re-use on every // call: token.Payload = BytesRef.DeepCopyOf(dpEnum.GetPayload()); } // Yes - this is the position, not the increment! This is for // sorting. This value // will be corrected before use. token.PositionIncrement = pos; this.positionedTokens.Add(token); } } CollectionUtil.TimSort(this.positionedTokens, tokenComparer); int lastPosition = -1; foreach (Token token in this.positionedTokens) { int thisPosition = token.PositionIncrement; token.PositionIncrement = thisPosition - lastPosition; lastPosition = thisPosition; } this.tokensAtCurrentPosition = this.positionedTokens.GetEnumerator(); }
/// <summary>
/// Merges all terms from <paramref name="termsEnum"/> into this consumer: for each
/// term it merges the (remapped) postings at the level of detail given by
/// <paramref name="indexOptions"/>, accumulates field statistics, periodically
/// reports work to the merge-abort checker, and finally calls <c>Finish</c>.
/// </summary>
/// <param name="mergeState">merge state (doc count, abort checker, doc-id remapping)</param>
/// <param name="indexOptions">level of postings detail to merge (docs, +freqs, +positions, +offsets)</param>
/// <param name="termsEnum">source terms, iterated exactly once</param>
public virtual void Merge(MergeState mergeState, IndexOptions indexOptions, TermsEnum termsEnum)
{
    BytesRef term;
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(termsEnum != null);
    }
    long sumTotalTermFreq = 0;
    long sumDocFreq = 0;
    long sumDFsinceLastAbortCheck = 0;
    FixedBitSet visitedDocs = new FixedBitSet(mergeState.SegmentInfo.DocCount);

    if (indexOptions == IndexOptions.DOCS_ONLY)
    {
        if (docsEnum == null)
        {
            docsEnum = new MappingMultiDocsEnum();
        }
        docsEnum.MergeState = mergeState;

        MultiDocsEnum docsEnumIn = null;

        while (termsEnum.MoveNext())
        {
            term = termsEnum.Term;
            // We can pass null for liveDocs, because the
            // mapping enum will skip the non-live docs:
            docsEnumIn = (MultiDocsEnum)termsEnum.Docs(null, docsEnumIn, DocsFlags.NONE);
            if (docsEnumIn != null)
            {
                docsEnum.Reset(docsEnumIn);
                PostingsConsumer postingsConsumer = StartTerm(term);
                TermStats stats = postingsConsumer.Merge(mergeState, indexOptions, docsEnum, visitedDocs);
                if (stats.DocFreq > 0)
                {
                    FinishTerm(term, stats);
                    // no freqs stored, so DocFreq stands in here; Finish() below passes -1 anyway
                    sumTotalTermFreq += stats.DocFreq;
                    sumDFsinceLastAbortCheck += stats.DocFreq;
                    sumDocFreq += stats.DocFreq;
                    if (sumDFsinceLastAbortCheck > 60000)
                    {
                        mergeState.CheckAbort.Work(sumDFsinceLastAbortCheck / 5.0);
                        sumDFsinceLastAbortCheck = 0;
                    }
                }
            }
        }
    }
    else if (indexOptions == IndexOptions.DOCS_AND_FREQS)
    {
        if (docsAndFreqsEnum == null)
        {
            docsAndFreqsEnum = new MappingMultiDocsEnum();
        }
        docsAndFreqsEnum.MergeState = mergeState;

        MultiDocsEnum docsAndFreqsEnumIn = null;

        while (termsEnum.MoveNext())
        {
            term = termsEnum.Term;
            // We can pass null for liveDocs, because the
            // mapping enum will skip the non-live docs:
            docsAndFreqsEnumIn = (MultiDocsEnum)termsEnum.Docs(null, docsAndFreqsEnumIn);
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(docsAndFreqsEnumIn != null);
            }
            docsAndFreqsEnum.Reset(docsAndFreqsEnumIn);
            PostingsConsumer postingsConsumer = StartTerm(term);
            TermStats stats = postingsConsumer.Merge(mergeState, indexOptions, docsAndFreqsEnum, visitedDocs);
            if (stats.DocFreq > 0)
            {
                FinishTerm(term, stats);
                sumTotalTermFreq += stats.TotalTermFreq;
                sumDFsinceLastAbortCheck += stats.DocFreq;
                sumDocFreq += stats.DocFreq;
                if (sumDFsinceLastAbortCheck > 60000)
                {
                    mergeState.CheckAbort.Work(sumDFsinceLastAbortCheck / 5.0);
                    sumDFsinceLastAbortCheck = 0;
                }
            }
        }
    }
    else if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    {
        if (postingsEnum == null)
        {
            postingsEnum = new MappingMultiDocsAndPositionsEnum();
        }
        postingsEnum.MergeState = mergeState;

        MultiDocsAndPositionsEnum postingsEnumIn = null;

        while (termsEnum.MoveNext())
        {
            term = termsEnum.Term;
            // We can pass null for liveDocs, because the
            // mapping enum will skip the non-live docs:
            postingsEnumIn = (MultiDocsAndPositionsEnum)termsEnum.DocsAndPositions(null, postingsEnumIn, DocsAndPositionsFlags.PAYLOADS);
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(postingsEnumIn != null);
            }
            postingsEnum.Reset(postingsEnumIn);

            PostingsConsumer postingsConsumer = StartTerm(term);
            TermStats stats = postingsConsumer.Merge(mergeState, indexOptions, postingsEnum, visitedDocs);
            if (stats.DocFreq > 0)
            {
                FinishTerm(term, stats);
                sumTotalTermFreq += stats.TotalTermFreq;
                sumDFsinceLastAbortCheck += stats.DocFreq;
                sumDocFreq += stats.DocFreq;
                if (sumDFsinceLastAbortCheck > 60000)
                {
                    mergeState.CheckAbort.Work(sumDFsinceLastAbortCheck / 5.0);
                    sumDFsinceLastAbortCheck = 0;
                }
            }
        }
    }
    else
    {
        // the only remaining option is the richest one: offsets (which imply positions)
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
        }
        if (postingsEnum == null)
        {
            postingsEnum = new MappingMultiDocsAndPositionsEnum();
        }
        postingsEnum.MergeState = mergeState;

        MultiDocsAndPositionsEnum postingsEnumIn = null;

        while (termsEnum.MoveNext())
        {
            term = termsEnum.Term;
            // We can pass null for liveDocs, because the
            // mapping enum will skip the non-live docs:
            postingsEnumIn = (MultiDocsAndPositionsEnum)termsEnum.DocsAndPositions(null, postingsEnumIn);
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(postingsEnumIn != null);
            }
            postingsEnum.Reset(postingsEnumIn);

            PostingsConsumer postingsConsumer = StartTerm(term);
            TermStats stats = postingsConsumer.Merge(mergeState, indexOptions, postingsEnum, visitedDocs);
            if (stats.DocFreq > 0)
            {
                FinishTerm(term, stats);
                sumTotalTermFreq += stats.TotalTermFreq;
                sumDFsinceLastAbortCheck += stats.DocFreq;
                sumDocFreq += stats.DocFreq;
                if (sumDFsinceLastAbortCheck > 60000)
                {
                    mergeState.CheckAbort.Work(sumDFsinceLastAbortCheck / 5.0);
                    sumDFsinceLastAbortCheck = 0;
                }
            }
        }
    }
    // DOCS_ONLY fields have no meaningful sumTotalTermFreq, so pass -1
    Finish(indexOptions == IndexOptions.DOCS_ONLY ? -1 : sumTotalTermFreq, sumDocFreq, visitedDocs.Cardinality());
}
/// <summary>
/// Randomized sharded-search test: starts several searcher nodes, then for the
/// configured run time repeatedly issues fresh or follow-on queries against a
/// random node's shard searcher and asserts the results match a mock
/// <see cref="MultiReader"/>-based searcher built over the same node versions.
/// <see cref="SearcherExpiredException"/> is expected and tolerated throughout.
/// </summary>
public virtual void TestSimple()
{
    int numNodes = TestUtil.NextInt32(Random, 1, 10);

    double runTimeSec = AtLeast(3);

    int minDocsToMakeTerms = TestUtil.NextInt32(Random, 5, 20);

    int maxSearcherAgeSeconds = TestUtil.NextInt32(Random, 1, 3);

    if (Verbose)
    {
        Console.WriteLine("TEST: numNodes=" + numNodes + " runTimeSec=" + runTimeSec + " maxSearcherAgeSeconds=" + maxSearcherAgeSeconds);
    }

    Start(numNodes, runTimeSec, maxSearcherAgeSeconds);

    JCG.List<PreviousSearchState> priorSearches = new JCG.List<PreviousSearchState>();
    IList<BytesRef> terms = null;
    while (J2N.Time.NanoTime() < endTimeNanos)
    {
        // ~1 in 7 iterations re-run a previously recorded search (paging scenario)
        bool doFollowon = priorSearches.Count > 0 && Random.Next(7) == 1;

        // Pick a random node; we will run the query on this node:
        int myNodeID = Random.Next(numNodes);

        NodeState.ShardIndexSearcher localShardSearcher;

        PreviousSearchState prevSearchState;

        if (doFollowon)
        {
            // Pretend user issued a followon query:
            prevSearchState = priorSearches[Random.Next(priorSearches.Count)];

            if (Verbose)
            {
                Console.WriteLine("\nTEST: follow-on query age=" + ((J2N.Time.NanoTime() - prevSearchState.SearchTimeNanos) / 1000000000.0));
            }

            try
            {
                localShardSearcher = m_nodes[myNodeID].Acquire(prevSearchState.Versions);
            }
            catch (SearcherExpiredException see)
            {
                // Expected, sometimes; in a "real" app we would
                // either forward this error to the user ("too
                // much time has passed; please re-run your
                // search") or sneakily just switch to newest
                // searcher w/o telling them...
                if (Verbose)
                {
                    Console.WriteLine(" searcher expired during local shard searcher init: " + see);
                }
                priorSearches.Remove(prevSearchState);
                continue;
            }
        }
        else
        {
            if (Verbose)
            {
                Console.WriteLine("\nTEST: fresh query");
            }
            // Do fresh query:
            localShardSearcher = m_nodes[myNodeID].Acquire();
            prevSearchState = null;
        }

        IndexReader[] subs = new IndexReader[numNodes];

        PreviousSearchState searchState = null;

        try
        {
            // Mock: now make a single reader (MultiReader) from all node
            // searchers. In a real shard env you can't do this... we
            // do it to confirm results from the shard searcher
            // are correct:
            int docCount = 0;
            try
            {
                for (int nodeID = 0; nodeID < numNodes; nodeID++)
                {
                    long subVersion = localShardSearcher.GetNodeVersions()[nodeID];
                    IndexSearcher sub = m_nodes[nodeID].Searchers.Acquire(subVersion);
                    if (sub is null)
                    {
                        // this node's searcher version is gone: release everything
                        // acquired so far and treat the whole acquire as expired
                        nodeID--;
                        while (nodeID >= 0)
                        {
                            subs[nodeID].DecRef();
                            subs[nodeID] = null;
                            nodeID--;
                        }
                        throw new SearcherExpiredException("nodeID=" + nodeID + " version=" + subVersion);
                    }
                    subs[nodeID] = sub.IndexReader;
                    docCount += subs[nodeID].MaxDoc;
                }
            }
            catch (SearcherExpiredException see)
            {
                // Expected
                if (Verbose)
                {
                    Console.WriteLine(" searcher expired during mock reader init: " + see);
                }
                continue;
            }

            IndexReader mockReader = new MultiReader(subs);
            IndexSearcher mockSearcher = new IndexSearcher(mockReader);

            Query query;
            Sort sort;

            if (prevSearchState != null)
            {
                // follow-on queries reuse the exact prior query/sort
                query = prevSearchState.Query;
                sort = prevSearchState.Sort;
            }
            else
            {
                // lazily initialize the pool of candidate terms once enough docs exist
                if (terms is null && docCount > minDocsToMakeTerms)
                {
                    // TODO: try to "focus" on high freq terms sometimes too
                    // TODO: maybe also periodically reset the terms...?
                    TermsEnum termsEnum = MultiFields.GetTerms(mockReader, "body").GetEnumerator();
                    terms = new JCG.List<BytesRef>();
                    while (termsEnum.MoveNext())
                    {
                        terms.Add(BytesRef.DeepCopyOf(termsEnum.Term));
                    }
                    if (Verbose)
                    {
                        Console.WriteLine("TEST: init terms: " + terms.Count + " terms");
                    }
                    if (terms.Count == 0)
                    {
                        terms = null;
                    }
                }

                if (Verbose)
                {
                    Console.WriteLine(" maxDoc=" + mockReader.MaxDoc);
                }

                if (terms != null)
                {
                    // random TermQuery or 1-2 char PrefixQuery over a random known term
                    if (Random.NextBoolean())
                    {
                        query = new TermQuery(new Term("body", terms[Random.Next(terms.Count)]));
                    }
                    else
                    {
                        string t = terms[Random.Next(terms.Count)].Utf8ToString();
                        string prefix;
                        if (t.Length <= 1)
                        {
                            prefix = t;
                        }
                        else
                        {
                            prefix = t.Substring(0, TestUtil.NextInt32(Random, 1, 2));
                        }
                        query = new PrefixQuery(new Term("body", prefix));
                    }

                    if (Random.NextBoolean())
                    {
                        sort = null;
                    }
                    else
                    {
                        // TODO: sort by more than 1 field
                        int what = Random.Next(3);
                        if (what == 0)
                        {
                            sort = new Sort(SortField.FIELD_SCORE);
                        }
                        else if (what == 1)
                        {
                            // TODO: this sort doesn't merge
                            // correctly... it's tricky because you
                            // could have > 2.1B docs across all shards:
                            //sort = new Sort(SortField.FIELD_DOC);
                            sort = null;
                        }
                        else if (what == 2)
                        {
                            sort = new Sort(new SortField[] { new SortField("docid", SortFieldType.INT32, Random.NextBoolean()) });
                        }
                        else
                        {
                            sort = new Sort(new SortField[] { new SortField("title", SortFieldType.STRING, Random.NextBoolean()) });
                        }
                    }
                }
                else
                {
                    // not enough docs yet to have any terms to query
                    query = null;
                    sort = null;
                }
            }

            if (query != null)
            {
                try
                {
                    searchState = AssertSame(mockSearcher, localShardSearcher, query, sort, prevSearchState);
                }
                catch (SearcherExpiredException see)
                {
                    // Expected; in a "real" app we would
                    // either forward this error to the user ("too
                    // much time has passed; please re-run your
                    // search") or sneakily just switch to newest
                    // searcher w/o telling them...
                    if (Verbose)
                    {
                        Console.WriteLine(" searcher expired during search: " + see);
                        Console.Out.Write(see.StackTrace);
                    }
                    // We can't do this in general: on a very slow
                    // computer it's possible the local searcher
                    // expires before we can finish our search:
                    // assert prevSearchState != null;
                    if (prevSearchState != null)
                    {
                        priorSearches.Remove(prevSearchState);
                    }
                }
            }
        }
        finally
        {
            //m_nodes[myNodeID].Release(localShardSearcher);
            NodeState.Release(localShardSearcher); // LUCENENET: Made Release() static per CA1822 for performance
            foreach (IndexReader sub in subs)
            {
                if (sub != null)
                {
                    sub.DecRef();
                }
            }
        }

        // occasionally remember this search so it can be replayed as a follow-on;
        // cap the remembered set at ~200 entries, keeping a random 100
        if (searchState != null && searchState.SearchAfterLocal != null && Random.Next(5) == 3)
        {
            priorSearches.Add(searchState);
            if (priorSearches.Count > 200)
            {
                priorSearches.Shuffle(Random);
                priorSearches.RemoveRange(100, priorSearches.Count - 100); // LUCENENET: Converted end index to length
            }
        }
    }

    Finish();
}
public virtual void Test10kPulsed() { // we always run this test with pulsing codec. Codec cp = TestUtil.AlwaysPostingsFormat(new Pulsing41PostingsFormat(1)); DirectoryInfo f = CreateTempDir("10kpulsed"); BaseDirectoryWrapper dir = NewFSDirectory(f); dir.CheckIndexOnDispose = false; // we do this ourselves explicitly RandomIndexWriter iw = new RandomIndexWriter(Random, dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)).SetCodec(cp)); Document document = new Document(); FieldType ft = new FieldType(TextField.TYPE_STORED); switch (TestUtil.NextInt32(Random, 0, 2)) { case 0: ft.IndexOptions = IndexOptions.DOCS_ONLY; break; case 1: ft.IndexOptions = IndexOptions.DOCS_AND_FREQS; break; default: ft.IndexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS; break; } Field field = NewField("field", "", ft); document.Add(field); //NumberFormat df = new DecimalFormat("00000", new DecimalFormatSymbols(Locale.ROOT)); for (int i = 0; i < 10050; i++) { //field.StringValue = df.format(i); field.SetStringValue(i.ToString("00000", CultureInfo.InvariantCulture)); iw.AddDocument(document); } IndexReader ir = iw.GetReader(); iw.Dispose(); TermsEnum te = MultiFields.GetTerms(ir, "field").GetEnumerator(); DocsEnum de = null; for (int i = 0; i < 10050; i++) { //string expected = df.format(i); string expected = i.ToString("00000", CultureInfo.InvariantCulture); te.MoveNext(); assertEquals(expected, te.Term.Utf8ToString()); de = TestUtil.Docs(Random, te, null, de, DocsFlags.NONE); assertTrue(de.NextDoc() != DocIdSetIterator.NO_MORE_DOCS); assertEquals(DocIdSetIterator.NO_MORE_DOCS, de.NextDoc()); } ir.Dispose(); TestUtil.CheckIndex(dir); dir.Dispose(); }
/// <summary>
/// Compares a regular index reader against a <c>MemoryIndex</c>-backed reader
/// field by field: norms, term statistics, the full term dictionary, and (when
/// positions are indexed) every doc/freq/position/offset of every posting.
/// </summary>
/// <param name="other">the reference (on-disk) index</param>
/// <param name="memIndexReader">the memory-index reader under test</param>
private void DuellReaders(CompositeReader other, AtomicReader memIndexReader)
{
    AtomicReader competitor = SlowCompositeReaderWrapper.Wrap(other);
    Fields memFields = memIndexReader.Fields;
    foreach (string field in competitor.Fields)
    {
        Terms memTerms = memFields.GetTerms(field);
        Terms iwTerms = memIndexReader.GetTerms(field);
        if (iwTerms == null)
        {
            assertNull(memTerms);
        }
        else
        {
            NumericDocValues normValues = competitor.GetNormValues(field);
            NumericDocValues memNormValues = memIndexReader.GetNormValues(field);
            if (normValues != null)
            {
                // mem idx always computes norms on the fly
                assertNotNull(memNormValues);
                assertEquals(normValues.Get(0), memNormValues.Get(0));
            }

            assertNotNull(memTerms);
            assertEquals(iwTerms.DocCount, memTerms.DocCount);
            assertEquals(iwTerms.SumDocFreq, memTerms.SumDocFreq);
            assertEquals(iwTerms.SumTotalTermFreq, memTerms.SumTotalTermFreq);
            TermsEnum iwTermsIter = iwTerms.GetEnumerator();
            TermsEnum memTermsIter = memTerms.GetEnumerator();
            if (iwTerms.HasPositions)
            {
                // only compare offsets when both sides actually store them
                bool offsets = iwTerms.HasOffsets && memTerms.HasOffsets;

                while (iwTermsIter.MoveNext())
                {
                    assertTrue(memTermsIter.MoveNext());
                    assertEquals(iwTermsIter.Term, memTermsIter.Term);
                    DocsAndPositionsEnum iwDocsAndPos = iwTermsIter.DocsAndPositions(null, null);
                    DocsAndPositionsEnum memDocsAndPos = memTermsIter.DocsAndPositions(null, null);
                    while (iwDocsAndPos.NextDoc() != DocsAndPositionsEnum.NO_MORE_DOCS)
                    {
                        assertEquals(iwDocsAndPos.DocID, memDocsAndPos.NextDoc());
                        assertEquals(iwDocsAndPos.Freq, memDocsAndPos.Freq);
                        for (int i = 0; i < iwDocsAndPos.Freq; i++)
                        {
                            assertEquals("term: " + iwTermsIter.Term.Utf8ToString(), iwDocsAndPos.NextPosition(), memDocsAndPos.NextPosition());
                            if (offsets)
                            {
                                assertEquals(iwDocsAndPos.StartOffset, memDocsAndPos.StartOffset);
                                assertEquals(iwDocsAndPos.EndOffset, memDocsAndPos.EndOffset);
                            }
                        }
                    }
                }
            }
            else
            {
                while (iwTermsIter.MoveNext())
                {
                    // NOTE(review): unlike the positions branch above, memTermsIter is
                    // never advanced with MoveNext() before .Term is read here — this
                    // looks suspicious; confirm against upstream Lucene whether a
                    // MoveNext()/next() call was lost in the port.
                    assertEquals(iwTermsIter.Term, memTermsIter.Term);
                    DocsEnum iwDocsAndPos = iwTermsIter.Docs(null, null);
                    DocsEnum memDocsAndPos = memTermsIter.Docs(null, null);
                    while (iwDocsAndPos.NextDoc() != DocsAndPositionsEnum.NO_MORE_DOCS)
                    {
                        assertEquals(iwDocsAndPos.DocID, memDocsAndPos.NextDoc());
                        assertEquals(iwDocsAndPos.Freq, memDocsAndPos.Freq);
                    }
                }
            }
        }
    }
}