private void DoTestSeekDoesNotExist(Random r, int numField, IList <Term> fieldTerms, Term[] fieldTermsArray, IndexReader reader) { IDictionary <string, TermsEnum> tes = new Dictionary <string, TermsEnum>(); if (Verbose) { Console.WriteLine("TEST: top random seeks"); } { int num = AtLeast(100); for (int iter = 0; iter < num; iter++) { // seek to random spot string field = ("f" + r.Next(numField)).Intern(); Term tx = new Term(field, GetRandomString(r)); int spot = Array.BinarySearch(fieldTermsArray, tx); if (spot < 0) { if (Verbose) { Console.WriteLine("TEST: non-exist seek to " + field + ":" + UnicodeUtil.ToHexString(tx.Text)); } // term does not exist: if (!tes.TryGetValue(field, out TermsEnum te)) { te = MultiFields.GetTerms(reader, field).GetEnumerator(); tes[field] = te; } if (Verbose) { Console.WriteLine(" got enum"); } spot = -spot - 1; if (spot == fieldTerms.Count || !fieldTerms[spot].Field.Equals(field, StringComparison.Ordinal)) { Assert.AreEqual(TermsEnum.SeekStatus.END, te.SeekCeil(tx.Bytes)); } else { Assert.AreEqual(TermsEnum.SeekStatus.NOT_FOUND, te.SeekCeil(tx.Bytes)); if (Verbose) { Console.WriteLine(" got term=" + UnicodeUtil.ToHexString(te.Term.Utf8ToString())); Console.WriteLine(" exp term=" + UnicodeUtil.ToHexString(fieldTerms[spot].Text)); } Assert.AreEqual(fieldTerms[spot].Bytes, te.Term); // now .next() this many times: int ct = TestUtil.NextInt32(r, 5, 100); for (int i = 0; i < ct; i++) { if (Verbose) { Console.WriteLine("TEST: now next()"); } if (1 + spot + i >= fieldTerms.Count) { break; } Term term = fieldTerms[1 + spot + i]; if (!term.Field.Equals(field, StringComparison.Ordinal)) { Assert.IsFalse(te.MoveNext()); break; } else { Assert.IsTrue(te.MoveNext()); BytesRef t = te.Term; if (Verbose) { Console.WriteLine(" got term=" + (t is null ? null : UnicodeUtil.ToHexString(t.Utf8ToString()))); Console.WriteLine(" exp=" + UnicodeUtil.ToHexString(term.Text.ToString())); } Assert.AreEqual(term.Bytes, t); } } } } } } }
public virtual void Test10kPulsed() { // we always run this test with pulsing codec. Codec cp = TestUtil.AlwaysPostingsFormat(new Pulsing41PostingsFormat(1)); DirectoryInfo f = CreateTempDir("10kpulsed"); BaseDirectoryWrapper dir = NewFSDirectory(f); dir.CheckIndexOnDispose = false; // we do this ourselves explicitly RandomIndexWriter iw = new RandomIndexWriter(Random, dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)).SetCodec(cp)); Document document = new Document(); FieldType ft = new FieldType(TextField.TYPE_STORED); switch (TestUtil.NextInt32(Random, 0, 2)) { case 0: ft.IndexOptions = IndexOptions.DOCS_ONLY; break; case 1: ft.IndexOptions = IndexOptions.DOCS_AND_FREQS; break; default: ft.IndexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS; break; } Field field = NewField("field", "", ft); document.Add(field); //NumberFormat df = new DecimalFormat("00000", new DecimalFormatSymbols(Locale.ROOT)); // LUCENENET specific: Use .ToString formating instead for (int i = 0; i < 10050; i++) { //field.StringValue = df.format(i); field.SetStringValue(i.ToString("00000", CultureInfo.InvariantCulture)); iw.AddDocument(document); } IndexReader ir = iw.GetReader(); iw.Dispose(); TermsEnum te = MultiFields.GetTerms(ir, "field").GetEnumerator(); DocsEnum de = null; for (int i = 0; i < 10050; i++) { //string expected = df.format(i); string expected = i.ToString("00000", CultureInfo.InvariantCulture); te.MoveNext(); assertEquals(expected, te.Term.Utf8ToString()); de = TestUtil.Docs(Random, te, null, de, DocsFlags.NONE); assertTrue(de.NextDoc() != DocIdSetIterator.NO_MORE_DOCS); assertEquals(DocIdSetIterator.NO_MORE_DOCS, de.NextDoc()); } ir.Dispose(); TestUtil.CheckIndex(dir); dir.Dispose(); }
// randomly seeks to term that we know exists, then next's // from there private void DoTestSeekExists(Random r, IList <Term> fieldTerms, IndexReader reader) { IDictionary <string, TermsEnum> tes = new Dictionary <string, TermsEnum>(); // Test random seek to existing term, then enum: if (Verbose) { Console.WriteLine("\nTEST: top now seek"); } int num = AtLeast(100); for (int iter = 0; iter < num; iter++) { // pick random field+term int spot = r.Next(fieldTerms.Count); Term term = fieldTerms[spot]; string field = term.Field; if (Verbose) { Console.WriteLine("TEST: exist seek field=" + field + " term=" + UnicodeUtil.ToHexString(term.Text)); } // seek to it if (!tes.TryGetValue(field, out TermsEnum te)) { te = MultiFields.GetTerms(reader, field).GetEnumerator(); tes[field] = te; } if (Verbose) { Console.WriteLine(" done get enum"); } // seek should find the term Assert.AreEqual(TermsEnum.SeekStatus.FOUND, te.SeekCeil(term.Bytes)); // now .next() this many times: int ct = TestUtil.NextInt32(r, 5, 100); for (int i = 0; i < ct; i++) { if (Verbose) { Console.WriteLine("TEST: now next()"); } if (1 + spot + i >= fieldTerms.Count) { break; } term = fieldTerms[1 + spot + i]; if (!term.Field.Equals(field, StringComparison.Ordinal)) { Assert.IsFalse(te.MoveNext()); break; } else { Assert.IsTrue(te.MoveNext()); BytesRef t = te.Term; if (Verbose) { Console.WriteLine(" got term=" + (t is null ? null : UnicodeUtil.ToHexString(t.Utf8ToString()))); Console.WriteLine(" exp=" + UnicodeUtil.ToHexString(term.Text.ToString())); } Assert.AreEqual(term.Bytes, t); } } } }
private void AddTerms(IndexReader reader, FieldVals f) { if (f.queryString == null) { return; } Terms terms = MultiFields.GetTerms(reader, f.fieldName); if (terms == null) { return; } TokenStream ts = analyzer.GetTokenStream(f.fieldName, f.queryString); try { ICharTermAttribute termAtt = ts.AddAttribute <ICharTermAttribute>(); int corpusNumDocs = reader.NumDocs; ISet <string> processedTerms = new JCG.HashSet <string>(); ts.Reset(); while (ts.IncrementToken()) { string term = termAtt.ToString(); if (!processedTerms.Contains(term)) { processedTerms.Add(term); ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term float minScore = 0; Term startTerm = new Term(f.fieldName, term); AttributeSource atts = new AttributeSource(); IMaxNonCompetitiveBoostAttribute maxBoostAtt = atts.AddAttribute <IMaxNonCompetitiveBoostAttribute>(); #pragma warning disable 612, 618 SlowFuzzyTermsEnum fe = new SlowFuzzyTermsEnum(terms, atts, startTerm, f.minSimilarity, f.prefixLength); #pragma warning restore 612, 618 //store the df so all variants use same idf int df = reader.DocFreq(startTerm); int numVariants = 0; int totalVariantDocFreqs = 0; BytesRef possibleMatch; IBoostAttribute boostAtt = fe.Attributes.AddAttribute <IBoostAttribute>(); while ((possibleMatch = fe.Next()) != null) { numVariants++; totalVariantDocFreqs += fe.DocFreq; float score = boostAtt.Boost; if (variantsQ.Count < MAX_VARIANTS_PER_TERM || score > minScore) { ScoreTerm st = new ScoreTerm(new Term(startTerm.Field, BytesRef.DeepCopyOf(possibleMatch)), score, startTerm); variantsQ.InsertWithOverflow(st); minScore = variantsQ.Top.Score; // maintain minScore } maxBoostAtt.MaxNonCompetitiveBoost = variantsQ.Count >= MAX_VARIANTS_PER_TERM ? minScore : float.NegativeInfinity; } if (numVariants > 0) { int avgDf = totalVariantDocFreqs / numVariants; if (df == 0) //no direct match we can use as df for all variants { df = avgDf; //use avg df of all variants } // take the top variants (scored by edit distance) and reset the score // to include an IDF factor then add to the global queue for ranking // overall top query terms int size = variantsQ.Count; for (int i = 0; i < size; i++) { ScoreTerm st = variantsQ.Pop(); st.Score = (st.Score * st.Score) * sim.Idf(df, corpusNumDocs); q.InsertWithOverflow(st); } } } } ts.End(); } finally { IOUtils.DisposeWhileHandlingException(ts); } }
/// <summary> /// Build the suggest index, using up to the specified /// amount of temporary RAM while building. Note that /// the weights for the suggestions are ignored. /// </summary> public virtual void Build(IInputEnumerator enumerator, double ramBufferSizeMB) { // LUCENENET: Added guard clause for null if (enumerator is null) { throw new ArgumentNullException(nameof(enumerator)); } if (enumerator.HasPayloads) { throw new ArgumentException("this suggester doesn't support payloads"); } if (enumerator.HasContexts) { throw new ArgumentException("this suggester doesn't support contexts"); } string prefix = this.GetType().Name; var directory = OfflineSorter.DefaultTempDir(); // LUCENENET specific - using GetRandomFileName() instead of picking a random int DirectoryInfo tempIndexPath; // LUCENENET: IDE0059: Remove unnecessary value assignment while (true) { tempIndexPath = new DirectoryInfo(Path.Combine(directory.FullName, prefix + ".index." + Path.GetFileNameWithoutExtension(Path.GetRandomFileName()))); tempIndexPath.Create(); if (System.IO.Directory.Exists(tempIndexPath.FullName)) { break; } } Directory dir = FSDirectory.Open(tempIndexPath); try { #pragma warning disable 612, 618 IndexWriterConfig iwc = new IndexWriterConfig(LuceneVersion.LUCENE_CURRENT, indexAnalyzer); #pragma warning restore 612, 618 iwc.SetOpenMode(OpenMode.CREATE); iwc.SetRAMBufferSizeMB(ramBufferSizeMB); IndexWriter writer = new IndexWriter(dir, iwc); var ft = new FieldType(TextField.TYPE_NOT_STORED); // TODO: if only we had IndexOptions.TERMS_ONLY... ft.IndexOptions = IndexOptions.DOCS_AND_FREQS; ft.OmitNorms = true; ft.Freeze(); Document doc = new Document(); Field field = new Field("body", "", ft); doc.Add(field); totTokens = 0; IndexReader reader = null; bool success = false; count = 0; try { while (enumerator.MoveNext()) { BytesRef surfaceForm = enumerator.Current; field.SetStringValue(surfaceForm.Utf8ToString()); writer.AddDocument(doc); count++; } reader = DirectoryReader.Open(writer, false); Terms terms = MultiFields.GetTerms(reader, "body"); if (terms is null) { throw new ArgumentException("need at least one suggestion"); } // Move all ngrams into an FST: TermsEnum termsEnum = terms.GetEnumerator(null); Outputs <long?> outputs = PositiveInt32Outputs.Singleton; Builder <long?> builder = new Builder <long?>(FST.INPUT_TYPE.BYTE1, outputs); Int32sRef scratchInts = new Int32sRef(); while (termsEnum.MoveNext()) { BytesRef term = termsEnum.Term; int ngramCount = CountGrams(term); if (ngramCount > grams) { throw new ArgumentException("tokens must not contain separator byte; got token=" + term + " but gramCount=" + ngramCount + ", which is greater than expected max ngram size=" + grams); } if (ngramCount == 1) { totTokens += termsEnum.TotalTermFreq; } builder.Add(Lucene.Net.Util.Fst.Util.ToInt32sRef(term, scratchInts), EncodeWeight(termsEnum.TotalTermFreq)); } fst = builder.Finish(); if (fst is null) { throw new ArgumentException("need at least one suggestion"); } //System.out.println("FST: " + fst.getNodeCount() + " nodes"); /* * PrintWriter pw = new PrintWriter("/x/tmp/out.dot"); * Util.toDot(fst, pw, true, true); * pw.close(); */ success = true; } finally { if (success) { IOUtils.Dispose(writer, reader); } else { IOUtils.DisposeWhileHandlingException(writer, reader); } } } finally { try { IOUtils.Dispose(dir); } finally { // LUCENENET specific - since we are removing the entire directory anyway, // it doesn't make sense to first do a loop in order remove the files. // Let the System.IO.Directory.Delete() method handle that. // We also need to dispose the Directory instance first before deleting from disk. try { System.IO.Directory.Delete(tempIndexPath.FullName, true); } catch (Exception e) { throw IllegalStateException.Create("failed to remove " + tempIndexPath, e); } } } }
public virtual void TestSimple() { int numNodes = TestUtil.NextInt32(Random, 1, 10); double runTimeSec = AtLeast(3); int minDocsToMakeTerms = TestUtil.NextInt32(Random, 5, 20); int maxSearcherAgeSeconds = TestUtil.NextInt32(Random, 1, 3); if (Verbose) { Console.WriteLine("TEST: numNodes=" + numNodes + " runTimeSec=" + runTimeSec + " maxSearcherAgeSeconds=" + maxSearcherAgeSeconds); } Start(numNodes, runTimeSec, maxSearcherAgeSeconds); JCG.List <PreviousSearchState> priorSearches = new JCG.List <PreviousSearchState>(); IList <BytesRef> terms = null; while (J2N.Time.NanoTime() < endTimeNanos) { bool doFollowon = priorSearches.Count > 0 && Random.Next(7) == 1; // Pick a random node; we will run the query on this node: int myNodeID = Random.Next(numNodes); NodeState.ShardIndexSearcher localShardSearcher; PreviousSearchState prevSearchState; if (doFollowon) { // Pretend user issued a followon query: prevSearchState = priorSearches[Random.Next(priorSearches.Count)]; if (Verbose) { Console.WriteLine("\nTEST: follow-on query age=" + ((J2N.Time.NanoTime() - prevSearchState.SearchTimeNanos) / 1000000000.0)); } try { localShardSearcher = m_nodes[myNodeID].Acquire(prevSearchState.Versions); } catch (SearcherExpiredException see) { // Expected, sometimes; in a "real" app we would // either forward this error to the user ("too // much time has passed; please re-run your // search") or sneakily just switch to newest // searcher w/o telling them... if (Verbose) { Console.WriteLine(" searcher expired during local shard searcher init: " + see); } priorSearches.Remove(prevSearchState); continue; } } else { if (Verbose) { Console.WriteLine("\nTEST: fresh query"); } // Do fresh query: localShardSearcher = m_nodes[myNodeID].Acquire(); prevSearchState = null; } IndexReader[] subs = new IndexReader[numNodes]; PreviousSearchState searchState = null; try { // Mock: now make a single reader (MultiReader) from all node // searchers. In a real shard env you can't do this... we // do it to confirm results from the shard searcher // are correct: int docCount = 0; try { for (int nodeID = 0; nodeID < numNodes; nodeID++) { long subVersion = localShardSearcher.GetNodeVersions()[nodeID]; IndexSearcher sub = m_nodes[nodeID].Searchers.Acquire(subVersion); if (sub == null) { nodeID--; while (nodeID >= 0) { subs[nodeID].DecRef(); subs[nodeID] = null; nodeID--; } throw new SearcherExpiredException("nodeID=" + nodeID + " version=" + subVersion); } subs[nodeID] = sub.IndexReader; docCount += subs[nodeID].MaxDoc; } } catch (SearcherExpiredException see) { // Expected if (Verbose) { Console.WriteLine(" searcher expired during mock reader init: " + see); } continue; } IndexReader mockReader = new MultiReader(subs); IndexSearcher mockSearcher = new IndexSearcher(mockReader); Query query; Sort sort; if (prevSearchState != null) { query = prevSearchState.Query; sort = prevSearchState.Sort; } else { if (terms == null && docCount > minDocsToMakeTerms) { // TODO: try to "focus" on high freq terms sometimes too // TODO: maybe also periodically reset the terms...? TermsEnum termsEnum = MultiFields.GetTerms(mockReader, "body").GetEnumerator(); terms = new JCG.List <BytesRef>(); while (termsEnum.MoveNext()) { terms.Add(BytesRef.DeepCopyOf(termsEnum.Term)); } if (Verbose) { Console.WriteLine("TEST: init terms: " + terms.Count + " terms"); } if (terms.Count == 0) { terms = null; } } if (Verbose) { Console.WriteLine(" maxDoc=" + mockReader.MaxDoc); } if (terms != null) { if (Random.NextBoolean()) { query = new TermQuery(new Term("body", terms[Random.Next(terms.Count)])); } else { string t = terms[Random.Next(terms.Count)].Utf8ToString(); string prefix; if (t.Length <= 1) { prefix = t; } else { prefix = t.Substring(0, TestUtil.NextInt32(Random, 1, 2)); } query = new PrefixQuery(new Term("body", prefix)); } if (Random.NextBoolean()) { sort = null; } else { // TODO: sort by more than 1 field int what = Random.Next(3); if (what == 0) { sort = new Sort(SortField.FIELD_SCORE); } else if (what == 1) { // TODO: this sort doesn't merge // correctly... it's tricky because you // could have > 2.1B docs across all shards: //sort = new Sort(SortField.FIELD_DOC); sort = null; } else if (what == 2) { sort = new Sort(new SortField[] { new SortField("docid", SortFieldType.INT32, Random.NextBoolean()) }); } else { sort = new Sort(new SortField[] { new SortField("title", SortFieldType.STRING, Random.NextBoolean()) }); } } } else { query = null; sort = null; } } if (query != null) { try { searchState = AssertSame(mockSearcher, localShardSearcher, query, sort, prevSearchState); } catch (SearcherExpiredException see) { // Expected; in a "real" app we would // either forward this error to the user ("too // much time has passed; please re-run your // search") or sneakily just switch to newest // searcher w/o telling them... if (Verbose) { Console.WriteLine(" searcher expired during search: " + see); Console.Out.Write(see.StackTrace); } // We can't do this in general: on a very slow // computer it's possible the local searcher // expires before we can finish our search: // assert prevSearchState != null; if (prevSearchState != null) { priorSearches.Remove(prevSearchState); } } } } finally { //m_nodes[myNodeID].Release(localShardSearcher); NodeState.Release(localShardSearcher); // LUCENENET: Made Release() static per CA1822 for performance foreach (IndexReader sub in subs) { if (sub != null) { sub.DecRef(); } } } if (searchState != null && searchState.SearchAfterLocal != null && Random.Next(5) == 3) { priorSearches.Add(searchState); if (priorSearches.Count > 200) { priorSearches.Shuffle(Random); priorSearches.RemoveRange(100, priorSearches.Count - 100); // LUCENENET: Converted end index to length } } } Finish(); }
/// <summary> /// Build the suggest index, using up to the specified /// amount of temporary RAM while building. Note that /// the weights for the suggestions are ignored. /// </summary> public virtual void Build(InputIterator iterator, double ramBufferSizeMB) { if (iterator.HasPayloads()) { throw new System.ArgumentException("this suggester doesn't support payloads"); } if (iterator.HasContexts()) { throw new System.ArgumentException("this suggester doesn't support contexts"); } string prefix = this.GetType().Name; var directory = OfflineSorter.DefaultTempDir(); // TODO: messy ... java7 has Files.createTempDirectory // ... but 4.x is java6: File tempIndexPath = null; Random random = new Random(); while (true) { tempIndexPath = new File(directory, prefix + ".index." + random.Next(int.MaxValue)); if (tempIndexPath.mkdir()) { break; } } Directory dir = FSDirectory.Open(tempIndexPath); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_CURRENT, indexAnalyzer); iwc.OpenMode = IndexWriterConfig.OpenMode.CREATE; iwc.RAMBufferSizeMB = ramBufferSizeMB; IndexWriter writer = new IndexWriter(dir, iwc); FieldType ft = new FieldType(TextField.TYPE_NOT_STORED); // TODO: if only we had IndexOptions.TERMS_ONLY... ft.IndexOptions = FieldInfo.IndexOptions.DOCS_AND_FREQS; ft.OmitNorms = true; ft.Freeze(); Document doc = new Document(); Field field = new Field("body", "", ft); doc.Add(field); totTokens = 0; IndexReader reader = null; bool success = false; count = 0; try { while (true) { BytesRef surfaceForm = iterator.Next(); if (surfaceForm == null) { break; } field.StringValue = surfaceForm.Utf8ToString(); writer.AddDocument(doc); count++; } reader = DirectoryReader.Open(writer, false); Terms terms = MultiFields.GetTerms(reader, "body"); if (terms == null) { throw new System.ArgumentException("need at least one suggestion"); } // Move all ngrams into an FST: TermsEnum termsEnum = terms.Iterator(null); Outputs <long?> outputs = PositiveIntOutputs.Singleton; Builder <long?> builder = new Builder <long?>(FST.INPUT_TYPE.BYTE1, outputs); IntsRef scratchInts = new IntsRef(); while (true) { BytesRef term = termsEnum.next(); if (term == null) { break; } int ngramCount = countGrams(term); if (ngramCount > grams) { throw new System.ArgumentException("tokens must not contain separator byte; got token=" + term + " but gramCount=" + ngramCount + ", which is greater than expected max ngram size=" + grams); } if (ngramCount == 1) { totTokens += termsEnum.TotalTermFreq(); } builder.Add(Util.ToIntsRef(term, scratchInts), encodeWeight(termsEnum.TotalTermFreq())); } fst = builder.Finish(); if (fst == null) { throw new System.ArgumentException("need at least one suggestion"); } //System.out.println("FST: " + fst.getNodeCount() + " nodes"); /* * PrintWriter pw = new PrintWriter("/x/tmp/out.dot"); * Util.toDot(fst, pw, true, true); * pw.close(); */ success = true; } finally { try { if (success) { IOUtils.Close(writer, reader); } else { IOUtils.CloseWhileHandlingException(writer, reader); } } finally { foreach (string file in dir.ListAll()) { File path = new File(tempIndexPath, file); if (path.Delete() == false) { throw new InvalidOperationException("failed to remove " + path); } } if (tempIndexPath.Delete() == false) { throw new InvalidOperationException("failed to remove " + tempIndexPath); } dir.Dispose(); } } }