public virtual void Test2()
{
    Random random = Random;
    int NUM_DOCS = AtLeast(100);
    Directory dir = NewDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
        this,
#endif
        random, dir);
    bool allowDups = random.NextBoolean();
    ISet<string> seen = new JCG.HashSet<string>();
    if (Verbose)
    {
        Console.WriteLine("TEST: NUM_DOCS=" + NUM_DOCS + " allowDups=" + allowDups);
    }
    int numDocs = 0;
    IList<BytesRef> docValues = new JCG.List<BytesRef>();

    // TODO: deletions
    while (numDocs < NUM_DOCS)
    {
        string s;
        if (random.NextBoolean())
        {
            s = TestUtil.RandomSimpleString(random);
        }
        else
        {
            s = TestUtil.RandomUnicodeString(random);
        }
        BytesRef br = new BytesRef(s);

        if (!allowDups)
        {
            if (seen.Contains(s))
            {
                continue;
            }
            seen.Add(s);
        }

        if (Verbose)
        {
            Console.WriteLine(" " + numDocs + ": s=" + s);
        }

        Document doc = new Document();
        doc.Add(new SortedDocValuesField("stringdv", br));
        doc.Add(new NumericDocValuesField("id", numDocs));
        docValues.Add(br);
        writer.AddDocument(doc);
        numDocs++;

        if (random.Next(40) == 17)
        {
            // force flush
            writer.GetReader().Dispose();
        }
    }

    writer.ForceMerge(1);
    DirectoryReader r = writer.GetReader();
    writer.Dispose();

    AtomicReader sr = GetOnlySegmentReader(r);

    long END_TIME = (J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond) + (TestNightly ? 30 : 1); // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results

    int NUM_THREADS = TestUtil.NextInt32(LuceneTestCase.Random, 1, 10);
    ThreadJob[] threads = new ThreadJob[NUM_THREADS];
    for (int thread = 0; thread < NUM_THREADS; thread++)
    {
        threads[thread] = new ThreadAnonymousClass2(random, docValues, sr, END_TIME);
        threads[thread].Start();
    }

    foreach (ThreadJob thread in threads)
    {
        thread.Join();
    }

    r.Dispose();
    dir.Dispose();
}
public void Test()
{
    RandomIndexWriter writer;
    DirectoryReader indexReader;
    int numParents = AtLeast(200);
    IndexWriterConfig cfg = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random));
    cfg.SetMergePolicy(NewLogMergePolicy());
    using (writer = new RandomIndexWriter(Random, NewDirectory(), cfg))
    {
        Document parentDoc = new Document();
        NumericDocValuesField parentVal = new NumericDocValuesField("parent_val", 0L);
        parentDoc.Add(parentVal);
        StringField parent = new StringField("parent", "true", Field.Store.YES);
        parentDoc.Add(parent);
        for (int i = 0; i < numParents; ++i)
        {
            IList<Document> documents = new JCG.List<Document>();
            int numChildren = Random.Next(10);
            for (int j = 0; j < numChildren; ++j)
            {
                Document childDoc = new Document();
                childDoc.Add(new NumericDocValuesField("child_val", Random.Next(5)));
                documents.Add(childDoc);
            }
            parentVal.SetInt64Value(Random.Next(50));
            documents.Add(parentDoc);
            writer.AddDocuments(documents);
        }
        writer.ForceMerge(1);
        indexReader = writer.GetReader();
    }

    AtomicReader reader = GetOnlySegmentReader(indexReader);
    Filter parentsFilter = new FixedBitSetCachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("parent", "true"))));
    FixedBitSet parentBits = (FixedBitSet)parentsFilter.GetDocIdSet(reader.AtomicContext, null);
    NumericDocValues parentValues = reader.GetNumericDocValues("parent_val");
    NumericDocValues childValues = reader.GetNumericDocValues("child_val");
    Sort parentSort = new Sort(new SortField("parent_val", SortFieldType.INT64));
    Sort childSort = new Sort(new SortField("child_val", SortFieldType.INT64));
    Sort sort = new Sort(new SortField("custom", new BlockJoinComparerSource(parentsFilter, parentSort, childSort)));
    Sorter sorter = new Sorter(sort);
    Sorter.DocMap docMap = sorter.Sort(reader);
    assertEquals(reader.MaxDoc, docMap.Count);

    int[] children = new int[1];
    int numChildren2 = 0;
    int previousParent = -1;
    for (int i = 0; i < docMap.Count; ++i)
    {
        int oldID = docMap.NewToOld(i);
        if (parentBits.Get(oldID))
        {
            // check that we have the right children
            for (int j = 0; j < numChildren2; ++j)
            {
                assertEquals(oldID, parentBits.NextSetBit(children[j]));
            }
            // check that children are sorted
            for (int j = 1; j < numChildren2; ++j)
            {
                int doc1 = children[j - 1];
                int doc2 = children[j];
                if (childValues.Get(doc1) == childValues.Get(doc2))
                {
                    assertTrue(doc1 < doc2); // sort is stable
                }
                else
                {
                    assertTrue(childValues.Get(doc1) < childValues.Get(doc2));
                }
            }
            // check that parents are sorted
            if (previousParent != -1)
            {
                if (parentValues.Get(previousParent) == parentValues.Get(oldID))
                {
                    assertTrue(previousParent < oldID);
                }
                else
                {
                    assertTrue(parentValues.Get(previousParent) < parentValues.Get(oldID));
                }
            }
            // reset
            previousParent = oldID;
            numChildren2 = 0;
        }
        else
        {
            children = ArrayUtil.Grow(children, numChildren2 + 1);
            children[numChildren2++] = oldID;
        }
    }

    indexReader.Dispose();
    writer.IndexWriter.Directory.Dispose();
}
/// <summary>
/// Splits a backslash escaped string on the separator.
/// <para/>
/// Current backslash escaping supported:
/// <para/> \n \t \r \b \f are escaped the same as a .NET string
/// <para/> Other characters following a backslash are produced verbatim (\c => c)
/// </summary>
/// <param name="s"> the string to split </param>
/// <param name="separator"> the separator to split on </param>
/// <param name="decode"> decode backslash escaping </param>
public static IList<string> SplitSmart(string s, string separator, bool decode)
{
    IList<string> lst = new JCG.List<string>(2);
    StringBuilder sb = new StringBuilder();
    int pos = 0, end = s.Length;
    while (pos < end)
    {
        //if (s.StartsWith(separator,pos))
        if (s.Substring(pos).StartsWith(separator, StringComparison.Ordinal))
        {
            if (sb.Length > 0)
            {
                lst.Add(sb.ToString());
                sb = new StringBuilder();
            }
            pos += separator.Length;
            continue;
        }
        char ch = s[pos++];
        if (ch == '\\')
        {
            if (!decode)
            {
                sb.Append(ch);
            }
            if (pos >= end) // ERROR, or let it go?
            {
                break;
            }
            ch = s[pos++];
            if (decode)
            {
                switch (ch)
                {
                    case 'n':
                        ch = '\n';
                        break;
                    case 't':
                        ch = '\t';
                        break;
                    case 'r':
                        ch = '\r';
                        break;
                    case 'b':
                        ch = '\b';
                        break;
                    case 'f':
                        ch = '\f';
                        break;
                }
            }
        }
        sb.Append(ch);
    }
    if (sb.Length > 0)
    {
        lst.Add(sb.ToString());
    }
    return lst;
}
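A minimal usage sketch of the method above, called unqualified as from within its containing class (the input string and expected output are illustrative):

// Split on "," with backslash decoding: the escaped comma stays inside one
// token, "\n" decodes to a newline, and empty segments are dropped.
IList<string> parts = SplitSmart("one,two\\,three,four\\n", ",", decode: true);
// parts => ["one", "two,three", "four\n"]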
private void QueryToSpanQuery(Query query, ICollection<byte[]> payloads)
{
    if (query is BooleanQuery booleanQuery)
    {
        BooleanClause[] queryClauses = booleanQuery.GetClauses();

        for (int i = 0; i < queryClauses.Length; i++)
        {
            if (!queryClauses[i].IsProhibited)
            {
                QueryToSpanQuery(queryClauses[i].Query, payloads);
            }
        }
    }
    else if (query is PhraseQuery phraseQuery)
    {
        Term[] phraseQueryTerms = phraseQuery.GetTerms();
        SpanQuery[] clauses = new SpanQuery[phraseQueryTerms.Length];
        for (int i = 0; i < phraseQueryTerms.Length; i++)
        {
            clauses[i] = new SpanTermQuery(phraseQueryTerms[i]);
        }

        int slop = phraseQuery.Slop;
        bool inorder = false;

        if (slop == 0)
        {
            inorder = true;
        }

        SpanNearQuery sp = new SpanNearQuery(clauses, slop, inorder) { Boost = query.Boost };
        GetPayloads(payloads, sp);
    }
    else if (query is TermQuery termQuery)
    {
        SpanTermQuery stq = new SpanTermQuery(termQuery.Term) { Boost = query.Boost };
        GetPayloads(payloads, stq);
    }
    else if (query is SpanQuery spanQuery)
    {
        GetPayloads(payloads, spanQuery);
    }
    else if (query is FilteredQuery filteredQuery)
    {
        QueryToSpanQuery(filteredQuery.Query, payloads);
    }
    else if (query is DisjunctionMaxQuery disjunctionMaxQuery)
    {
        foreach (var q in disjunctionMaxQuery)
        {
            QueryToSpanQuery(q, payloads);
        }
    }
    else if (query is MultiPhraseQuery mpq)
    {
        IList<Term[]> termArrays = mpq.GetTermArrays();
        int[] positions = mpq.GetPositions();
        if (positions.Length > 0)
        {
            int maxPosition = positions[positions.Length - 1];
            for (int i = 0; i < positions.Length - 1; ++i)
            {
                if (positions[i] > maxPosition)
                {
                    maxPosition = positions[i];
                }
            }

            // LUCENENET: Changed from Query to SpanQuery to eliminate the O(n) cast
            // required to instantiate SpanOrQuery below
            IList<SpanQuery>[] disjunctLists = new JCG.List<SpanQuery>[maxPosition + 1];
            int distinctPositions = 0;

            for (int i = 0; i < termArrays.Count; ++i)
            {
                Term[] termArray = termArrays[i];
                IList<SpanQuery> disjuncts = disjunctLists[positions[i]]; // LUCENENET: Changed from Query to SpanQuery
                if (disjuncts is null)
                {
                    disjuncts = (disjunctLists[positions[i]] = new JCG.List<SpanQuery>(termArray.Length)); // LUCENENET: Changed from Query to SpanQuery
                    ++distinctPositions;
                }
                foreach (Term term in termArray)
                {
                    disjuncts.Add(new SpanTermQuery(term));
                }
            }

            int positionGaps = 0;
            int position = 0;
            SpanQuery[] clauses = new SpanQuery[distinctPositions];
            for (int i = 0; i < disjunctLists.Length; ++i)
            {
                IList<SpanQuery> disjuncts = disjunctLists[i]; // LUCENENET: Changed from Query to SpanQuery
                if (disjuncts != null)
                {
                    clauses[position++] = new SpanOrQuery(disjuncts);
                }
                else
                {
                    ++positionGaps;
                }
            }

            int slop = mpq.Slop;
            bool inorder = (slop == 0);

            SpanNearQuery sp = new SpanNearQuery(clauses, slop + positionGaps, inorder);
            sp.Boost = query.Boost;
            GetPayloads(payloads, sp);
        }
    }
}
public virtual TokenInfoDictionaryWriter BuildDictionary(IList<string> csvFiles)
{
    TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(10 * 1024 * 1024);

    // all lines in the file
    Console.WriteLine(" parse...");
    JCG.List<string[]> lines = new JCG.List<string[]>(400000);
    foreach (string file in csvFiles)
    {
        using Stream inputStream = new FileStream(file, FileMode.Open, FileAccess.Read);
        Encoding decoder = Encoding.GetEncoding(encoding);
        TextReader reader = new StreamReader(inputStream, decoder);

        string line = null;
        while ((line = reader.ReadLine()) != null)
        {
            string[] entry = CSVUtil.Parse(line);

            if (entry.Length < 13)
            {
                Console.WriteLine("Entry in CSV is not valid: " + line);
                continue;
            }

            string[] formatted = FormatEntry(entry);
            lines.Add(formatted);

            // NFKC normalize dictionary entry
            if (normalizeEntries)
            {
                //if (normalizer.isNormalized(entry[0])){
                if (entry[0].IsNormalized(NormalizationForm.FormKC))
                {
                    continue;
                }
                string[] normalizedEntry = new string[entry.Length];
                for (int i = 0; i < entry.Length; i++)
                {
                    //normalizedEntry[i] = normalizer.normalize(entry[i]);
                    normalizedEntry[i] = entry[i].Normalize(NormalizationForm.FormKC);
                }
                formatted = FormatEntry(normalizedEntry);
                lines.Add(formatted);
            }
        }
    }

    Console.WriteLine(" sort...");

    // sort by term: we sorted the files already and use a stable sort.
    lines.Sort(Comparer<string[]>.Create((left, right) => left[0].CompareToOrdinal(right[0])));

    Console.WriteLine(" encode...");

    PositiveInt32Outputs fstOutput = PositiveInt32Outputs.Singleton;
    Builder<long?> fstBuilder = new Builder<long?>(Lucene.Net.Util.Fst.FST.INPUT_TYPE.BYTE2, 0, 0, true, true, int.MaxValue, fstOutput, null, true, PackedInt32s.DEFAULT, true, 15);
    Int32sRef scratch = new Int32sRef();
    long ord = -1; // first ord will be 0
    string lastValue = null;

    // build tokeninfo dictionary
    foreach (string[] entry in lines)
    {
        int next = dictionary.Put(entry);

        if (next == offset)
        {
            Console.WriteLine("Failed to process line: " + Collections.ToString(entry));
            continue;
        }

        string token = entry[0];
        if (!token.Equals(lastValue, StringComparison.Ordinal))
        {
            // new word to add to fst
            ord++;
            lastValue = token;
            scratch.Grow(token.Length);
            scratch.Length = token.Length;
            for (int i = 0; i < token.Length; i++)
            {
                scratch.Int32s[i] = (int)token[i];
            }
            fstBuilder.Add(scratch, ord);
        }

        dictionary.AddMapping((int)ord, offset);
        offset = next;
    }

    FST<long?> fst = fstBuilder.Finish();

    Console.WriteLine(" " + fst.NodeCount + " nodes, " + fst.ArcCount + " arcs, " + fst.GetSizeInBytes() + " bytes... ");

    dictionary.SetFST(fst);

    Console.WriteLine(" done");

    return dictionary;
}
/// <summary>
/// Low level api to get the most relevant (formatted) sections of the document.
/// This method has been made public to allow visibility of score information held in <see cref="TextFragment"/> objects.
/// Thanks to Jason Calabrese for help in redefining the interface.
/// </summary>
/// <exception cref="IOException">If there is a low-level I/O error</exception>
/// <exception cref="InvalidTokenOffsetsException">thrown if any token's EndOffset exceeds the provided text's length</exception>
public TextFragment[] GetBestTextFragments(
    TokenStream tokenStream,
    string text,
    bool mergeContiguousFragments,
    int maxNumFragments)
{
    var docFrags = new JCG.List<TextFragment>();
    var newText = new StringBuilder();

    var termAtt = tokenStream.AddAttribute<ICharTermAttribute>();
    var offsetAtt = tokenStream.AddAttribute<IOffsetAttribute>();
    tokenStream.Reset();
    var currentFrag = new TextFragment(newText, newText.Length, docFrags.Count);

    if (_fragmentScorer is QueryScorer queryScorer)
    {
        queryScorer.SetMaxDocCharsToAnalyze(_maxDocCharsToAnalyze);
    }

    var newStream = _fragmentScorer.Init(tokenStream);
    if (newStream != null)
    {
        tokenStream = newStream;
    }
    _fragmentScorer.StartFragment(currentFrag);
    docFrags.Add(currentFrag);

    var fragQueue = new FragmentQueue(maxNumFragments);

    try
    {
        string tokenText;
        int startOffset;
        int endOffset;
        int lastEndOffset = 0;
        _textFragmenter.Start(text, tokenStream);

        var tokenGroup = new TokenGroup(tokenStream);

        for (bool next = tokenStream.IncrementToken();
             next && (offsetAtt.StartOffset < _maxDocCharsToAnalyze);
             next = tokenStream.IncrementToken())
        {
            if ((offsetAtt.EndOffset > text.Length) || (offsetAtt.StartOffset > text.Length))
            {
                throw new InvalidTokenOffsetsException("Token " + termAtt.ToString()
                    + " exceeds length of provided text sized " + text.Length);
            }
            if ((tokenGroup.NumTokens > 0) && (tokenGroup.IsDistinct()))
            {
                //the current token is distinct from previous tokens -
                // markup the cached token group info
                startOffset = tokenGroup.MatchStartOffset;
                endOffset = tokenGroup.MatchEndOffset;
                tokenText = text.Substring(startOffset, endOffset - startOffset);
                string markedUpText = _formatter.HighlightTerm(_encoder.EncodeText(tokenText), tokenGroup);
                //store any whitespace etc from between this and last group
                if (startOffset > lastEndOffset)
                {
                    newText.Append(_encoder.EncodeText(text.Substring(lastEndOffset, startOffset - lastEndOffset)));
                }
                newText.Append(markedUpText);
                lastEndOffset = Math.Max(endOffset, lastEndOffset);
                tokenGroup.Clear();

                //check if current token marks the start of a new fragment
                if (_textFragmenter.IsNewFragment())
                {
                    currentFrag.Score = _fragmentScorer.FragmentScore;
                    //record stats for a new fragment
                    currentFrag.TextEndPos = newText.Length;
                    currentFrag = new TextFragment(newText, newText.Length, docFrags.Count);
                    _fragmentScorer.StartFragment(currentFrag);
                    docFrags.Add(currentFrag);
                }
            }

            tokenGroup.AddToken(_fragmentScorer.GetTokenScore());

            //                if(lastEndOffset>maxDocBytesToAnalyze)
            //                {
            //                    break;
            //                }
        }
        currentFrag.Score = _fragmentScorer.FragmentScore;

        if (tokenGroup.NumTokens > 0)
        {
            //flush the accumulated text (same code as in above loop)
            startOffset = tokenGroup.MatchStartOffset;
            endOffset = tokenGroup.MatchEndOffset;
            tokenText = text.Substring(startOffset, endOffset - startOffset);
            var markedUpText = _formatter.HighlightTerm(_encoder.EncodeText(tokenText), tokenGroup);
            //store any whitespace etc from between this and last group
            if (startOffset > lastEndOffset)
            {
                newText.Append(_encoder.EncodeText(text.Substring(lastEndOffset, startOffset - lastEndOffset)));
            }
            newText.Append(markedUpText);
            lastEndOffset = Math.Max(lastEndOffset, endOffset);
        }

        //Test what remains of the original text beyond the point where we stopped analyzing
        if (
            // if there is text beyond the last token considered..
            (lastEndOffset < text.Length)
            &&
            // and that text is not too large...
            (text.Length <= _maxDocCharsToAnalyze)
            )
        {
            //append it to the last fragment
            newText.Append(_encoder.EncodeText(text.Substring(lastEndOffset)));
        }

        currentFrag.TextEndPos = newText.Length;

        //sort the most relevant sections of the text
        foreach (var f in docFrags)
        {
            currentFrag = f;

            //If you are running with a version of Lucene before 11th Sept 03
            // you do not have PriorityQueue.insert() - so uncomment the code below
            /*
            if (currentFrag.getScore() >= minScore)
            {
                fragQueue.put(currentFrag);
                if (fragQueue.size() > maxNumFragments)
                { // if hit queue overfull
                    fragQueue.pop(); // remove lowest in hit queue
                    minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
                }
            }
            */
            //The above code caused a problem as a result of Christoph Goller's 11th Sept 03
            //fix to PriorityQueue. The correct method to use here is the new "insert" method
            // USE ABOVE CODE IF THIS DOES NOT COMPILE!
            fragQueue.InsertWithOverflow(currentFrag);
        }

        //return the most relevant fragments
        var frag = new TextFragment[fragQueue.Count];
        for (int i = frag.Length - 1; i >= 0; i--)
        {
            frag[i] = fragQueue.Pop();
        }

        //merge any contiguous fragments to improve readability
        if (mergeContiguousFragments)
        {
            MergeContiguousFragments(frag);
            JCG.List<TextFragment> fragTexts = new JCG.List<TextFragment>();
            for (int i = 0; i < frag.Length; i++)
            {
                if ((frag[i] != null) && (frag[i].Score > 0))
                {
                    fragTexts.Add(frag[i]);
                }
            }
            frag = new TextFragment[fragTexts.Count];
            fragTexts.CopyTo(frag);
        }

        return frag;
    }
    finally
    {
        if (tokenStream != null)
        {
            try
            {
                tokenStream.End();
                tokenStream.Dispose();
            }
            catch (Exception e) when (e.IsException())
            {
            }
        }
    }
}
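A minimal end-to-end sketch of calling the method above (the analyzer, field name, query, and fragment count are illustrative assumptions, not part of the original code):

string text = "Apache Lucene is a search library; Lucene.NET brings Lucene to .NET.";
Query query = new TermQuery(new Term("body", "lucene"));
var analyzer = new StandardAnalyzer(LuceneVersion.LUCENE_48);
var highlighter = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(query, "body"));
// GetBestTextFragments ends and disposes the token stream itself (see the finally block above)
TokenStream ts = analyzer.GetTokenStream("body", text);
TextFragment[] frags = highlighter.GetBestTextFragments(ts, text, mergeContiguousFragments: true, maxNumFragments: 3);
foreach (TextFragment frag in frags)
{
    if (frag != null && frag.Score > 0)
    {
        Console.WriteLine(frag.ToString());
    }
}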
public virtual void TestRandom()
{
    string[] tokens = GetRandomTokens(10);
    Store.Directory indexDir = NewDirectory();
    Store.Directory taxoDir = NewDirectory();

    RandomIndexWriter w = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
        this,
#endif
        Random, indexDir);
    var tw = new DirectoryTaxonomyWriter(taxoDir);
    FacetsConfig config = new FacetsConfig();
    int numDocs = AtLeast(1000);
    int numDims = TestUtil.NextInt32(Random, 1, 7);
    IList<TestDoc> testDocs = GetRandomDocs(tokens, numDocs, numDims);
    foreach (TestDoc testDoc in testDocs)
    {
        Document doc = new Document();
        doc.Add(NewStringField("content", testDoc.content, Field.Store.NO));
        testDoc.value = Random.NextSingle();
        doc.Add(new SingleDocValuesField("value", testDoc.value));
        for (int j = 0; j < numDims; j++)
        {
            if (testDoc.dims[j] != null)
            {
                doc.Add(new FacetField("dim" + j, testDoc.dims[j]));
            }
        }
        w.AddDocument(config.Build(tw, doc));
    }

    // NRT open
    IndexSearcher searcher = NewSearcher(w.GetReader());

    // NRT open
    var tr = new DirectoryTaxonomyReader(tw);

    ValueSource values = new SingleFieldSource("value");

    int iters = AtLeast(100);
    for (int iter = 0; iter < iters; iter++)
    {
        string searchToken = tokens[Random.Next(tokens.Length)];
        if (Verbose)
        {
            Console.WriteLine("\nTEST: iter content=" + searchToken);
        }
        FacetsCollector fc = new FacetsCollector();
        FacetsCollector.Search(searcher, new TermQuery(new Term("content", searchToken)), 10, fc);
        Facets facets = new TaxonomyFacetSumValueSource(tr, config, fc, values);

        // Slow, yet hopefully bug-free, faceting:
        var expectedValues = new JCG.List<Dictionary<string, float?>>(numDims);
        for (int i = 0; i < numDims; i++)
        {
            expectedValues.Add(new Dictionary<string, float?>());
        }

        foreach (TestDoc doc in testDocs)
        {
            if (doc.content.Equals(searchToken, StringComparison.Ordinal))
            {
                for (int j = 0; j < numDims; j++)
                {
                    if (doc.dims[j] != null)
                    {
                        if (!expectedValues[j].TryGetValue(doc.dims[j], out float? v) || v == null)
                        {
                            expectedValues[j][doc.dims[j]] = doc.value;
                        }
                        else
                        {
                            expectedValues[j][doc.dims[j]] = (float)v + doc.value;
                        }
                    }
                }
            }
        }

        JCG.List<FacetResult> expected = new JCG.List<FacetResult>();
        for (int i = 0; i < numDims; i++)
        {
            JCG.List<LabelAndValue> labelValues = new JCG.List<LabelAndValue>();
            float totValue = 0;
            foreach (KeyValuePair<string, float?> ent in expectedValues[i])
            {
                labelValues.Add(new LabelAndValue(ent.Key, ent.Value.Value));
                totValue += ent.Value.Value;
            }
            SortLabelValues(labelValues);
            if (totValue > 0)
            {
                expected.Add(new FacetResult("dim" + i, new string[0], totValue, labelValues.ToArray(), labelValues.Count));
            }
        }

        // Sort by highest value, tie break by value:
        SortFacetResults(expected);

        IList<FacetResult> actual = facets.GetAllDims(10);

        // Messy: fixup ties
        SortTies(actual);

        if (Verbose)
        {
            Console.WriteLine("expected=\n" + expected.ToString());
            Console.WriteLine("actual=\n" + actual.ToString());
        }

        AssertFloatValuesEquals(expected, actual);
    }

    IOUtils.Dispose(w, tw, searcher.IndexReader, tr, indexDir, taxoDir);
}
/// <summary>
/// Internal helper method used by <see cref="Check"/> that iterates over
/// the keys of <paramref name="readerFieldToValIds"/> and generates a <see cref="ICollection{T}"/>
/// of <see cref="Insanity"/> instances whenever two (or more) <see cref="ReaderField"/> instances are
/// found that have an ancestry relationship.
/// </summary>
/// <seealso cref="InsanityType.SUBREADER"/>
private static ICollection<Insanity> CheckSubreaders(MapOfSets<int, FieldCache.CacheEntry> valIdToItems, MapOfSets<ReaderField, int> readerFieldToValIds) // LUCENENET: CA1822: Mark members as static
{
    JCG.List<Insanity> insanity = new JCG.List<Insanity>(23);

    Dictionary<ReaderField, ISet<ReaderField>> badChildren = new Dictionary<ReaderField, ISet<ReaderField>>(17);
    MapOfSets<ReaderField, ReaderField> badKids = new MapOfSets<ReaderField, ReaderField>(badChildren); // wrapper

    IDictionary<int, ISet<FieldCache.CacheEntry>> viToItemSets = valIdToItems.Map;
    IDictionary<ReaderField, ISet<int>> rfToValIdSets = readerFieldToValIds.Map;

    HashSet<ReaderField> seen = new HashSet<ReaderField>();

    //IDictionary<ReaderField, ISet<int>>.KeyCollection readerFields = rfToValIdSets.Keys;
    foreach (ReaderField rf in rfToValIdSets.Keys)
    {
        if (seen.Contains(rf))
        {
            continue;
        }

        IList<object> kids = GetAllDescendantReaderKeys(rf.ReaderKey);
        foreach (object kidKey in kids)
        {
            ReaderField kid = new ReaderField(kidKey, rf.FieldName);

            // LUCENENET: Eliminated extra lookup by using TryGetValue instead of ContainsKey
            if (badChildren.TryGetValue(kid, out ISet<ReaderField> badKid))
            {
                // we've already processed this kid as RF and found other problems
                // track those problems as our own
                badKids.Put(rf, kid);
                badKids.PutAll(rf, badKid);
                badChildren.Remove(kid);
            }
            else if (rfToValIdSets.ContainsKey(kid))
            {
                // we have cache entries for the kid
                badKids.Put(rf, kid);
            }
            seen.Add(kid);
        }
        seen.Add(rf);
    }

    // every mapping in badKids represents an Insanity
    foreach (ReaderField parent in badChildren.Keys)
    {
        ISet<ReaderField> kids = badChildren[parent];

        JCG.List<FieldCache.CacheEntry> badEntries = new JCG.List<FieldCache.CacheEntry>(kids.Count * 2);

        // put parent entr(ies) in first
        {
            foreach (int value in rfToValIdSets[parent])
            {
                badEntries.AddRange(viToItemSets[value]);
            }
        }

        // now the entries for the descendants
        foreach (ReaderField kid in kids)
        {
            foreach (int value in rfToValIdSets[kid])
            {
                badEntries.AddRange(viToItemSets[value]);
            }
        }

        FieldCache.CacheEntry[] badness = badEntries.ToArray();

        insanity.Add(new Insanity(InsanityType.SUBREADER, "Found caches for descendants of " + parent.ToString(), badness));
    }

    return insanity;
}
private void TestCase(int itrsWithVal, int specifiedValsOnItr, bool removeDups)
{
    // Build a random number of lists
    IList<int?> expected = new JCG.List<int?>();
    Random random = new Random(Random.Next());
    int numLists = itrsWithVal + random.Next(1000 - itrsWithVal);
    IList<int>[] lists = new IList<int>[numLists];
    for (int i = 0; i < numLists; i++)
    {
        lists[i] = new JCG.List<int>();
    }
    int start = random.Next(1000000);
    int end = start + VALS_TO_MERGE / itrsWithVal / Math.Abs(specifiedValsOnItr);
    for (int i = start; i < end; i++)
    {
        int maxList = lists.Length;
        int maxValsOnItr = 0;
        int sumValsOnItr = 0;
        for (int itrWithVal = 0; itrWithVal < itrsWithVal; itrWithVal++)
        {
            int list = random.Next(maxList);
            int valsOnItr = specifiedValsOnItr < 0 ? (1 + random.Next(-specifiedValsOnItr)) : specifiedValsOnItr;
            maxValsOnItr = Math.Max(maxValsOnItr, valsOnItr);
            sumValsOnItr += valsOnItr;
            for (int valOnItr = 0; valOnItr < valsOnItr; valOnItr++)
            {
                lists[list].Add(i);
            }
            maxList = maxList - 1;
            ArrayUtil.Swap(lists, list, maxList);
        }
        int maxCount = removeDups ? maxValsOnItr : sumValsOnItr;
        for (int count = 0; count < maxCount; count++)
        {
            expected.Add(i);
        }
    }

    // Now check that they get merged cleanly
    IEnumerator<int>[] itrs = new IEnumerator<int>[numLists];
    for (int i = 0; i < numLists; i++)
    {
        itrs[i] = lists[i].GetEnumerator();
    }
    try
    {
        MergedEnumerator<int> mergedItr = new MergedEnumerator<int>(removeDups, itrs);
        IEnumerator<int?> expectedItr = expected.GetEnumerator();
        while (expectedItr.MoveNext())
        {
            Assert.IsTrue(mergedItr.MoveNext());
            Assert.AreEqual(expectedItr.Current, mergedItr.Current);
        }
        Assert.IsFalse(mergedItr.MoveNext());
    }
    finally
    {
        IOUtils.Dispose(itrs);
    }
}
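For context, a small sketch of the type under test (the constructor shape is taken from the call above; the input sequences are illustrative and must already be sorted, and the type is assumed accessible from calling code):

IEnumerator<int> a = ((IEnumerable<int>)new[] { 1, 3, 5 }).GetEnumerator();
IEnumerator<int> b = ((IEnumerable<int>)new[] { 2, 3, 6 }).GetEnumerator();
// true = remove duplicates; the two sorted sequences merge into 1 2 3 5 6
var merged = new MergedEnumerator<int>(true, new[] { a, b });
while (merged.MoveNext())
{
    Console.Write(merged.Current + " ");
}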
public virtual void TestRandomStoredFields()
{
    using Directory dir = NewDirectory();
    Random rand = Random;
    using RandomIndexWriter w = new RandomIndexWriter(rand, dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)).SetMaxBufferedDocs(TestUtil.NextInt32(rand, 5, 20)));
    //w.w.setNoCFSRatio(0.0);
    int docCount = AtLeast(200);
    int fieldCount = TestUtil.NextInt32(rand, 1, 5);

    IList<int> fieldIDs = new JCG.List<int>();

    FieldType customType = new FieldType(TextField.TYPE_STORED);
    customType.IsTokenized = false;
    Field idField = NewField("id", "", customType);

    for (int i = 0; i < fieldCount; i++)
    {
        fieldIDs.Add(i);
    }

    IDictionary<string, Document> docs = new Dictionary<string, Document>();

    if (Verbose)
    {
        Console.WriteLine("TEST: build index docCount=" + docCount);
    }

    FieldType customType2 = new FieldType();
    customType2.IsStored = true;
    for (int i = 0; i < docCount; i++)
    {
        Document doc = new Document();
        doc.Add(idField);
        string id = "" + i;
        idField.SetStringValue(id);
        docs[id] = doc;
        if (Verbose)
        {
            Console.WriteLine("TEST: add doc id=" + id);
        }

        foreach (int field in fieldIDs)
        {
            string s;
            if (rand.Next(4) != 3)
            {
                s = TestUtil.RandomUnicodeString(rand, 1000);
                doc.Add(NewField("f" + field, s, customType2));
            }
            else
            {
                s = null;
            }
        }
        w.AddDocument(doc);
        if (rand.Next(50) == 17)
        {
            // mixup binding of field name -> Number every so often
            fieldIDs.Shuffle(Random);
        }
        if (rand.Next(5) == 3 && i > 0)
        {
            string delID = "" + rand.Next(i);
            if (Verbose)
            {
                Console.WriteLine("TEST: delete doc id=" + delID);
            }
            w.DeleteDocuments(new Term("id", delID));
            docs.Remove(delID);
        }
    }

    if (Verbose)
    {
        Console.WriteLine("TEST: " + docs.Count + " docs in index; now load fields");
    }
    if (docs.Count > 0)
    {
        string[] idsList = docs.Keys.ToArray(/*new string[docs.Count]*/);

        for (int x = 0; x < 2; x++)
        {
            using (IndexReader r = w.GetReader())
            {
                IndexSearcher s = NewSearcher(r);

                if (Verbose)
                {
                    Console.WriteLine("TEST: cycle x=" + x + " r=" + r);
                }

                int num = AtLeast(1000);
                for (int iter = 0; iter < num; iter++)
                {
                    string testID = idsList[rand.Next(idsList.Length)];
                    if (Verbose)
                    {
                        Console.WriteLine("TEST: test id=" + testID);
                    }
                    TopDocs hits = s.Search(new TermQuery(new Term("id", testID)), 1);
                    Assert.AreEqual(1, hits.TotalHits);
                    Document doc = r.Document(hits.ScoreDocs[0].Doc);
                    Document docExp = docs[testID];
                    for (int i = 0; i < fieldCount; i++)
                    {
                        assertEquals("doc " + testID + ", field f" + fieldCount + " is wrong", docExp.Get("f" + i), doc.Get("f" + i));
                    }
                }
            } // r.Dispose();
            w.ForceMerge(1);
        }
    }
}
/// <summary>
/// Tests a CacheEntry[] for indication of "insane" cache usage.
/// <para>
/// <b>NOTE:</b> FieldCache CreationPlaceholder objects are ignored.
/// (:TODO: is this a bad idea? are we masking a real problem?)
/// </para>
/// </summary>
public Insanity[] Check(params FieldCache.CacheEntry[] cacheEntries)
{
    if (null == cacheEntries || 0 == cacheEntries.Length)
    {
        return Arrays.Empty<Insanity>();
    }

    if (estimateRam)
    {
        for (int i = 0; i < cacheEntries.Length; i++)
        {
            cacheEntries[i].EstimateSize();
        }
    }

    // the indirect mapping lets MapOfSet dedup identical valIds for us

    // maps the (valId) identityhashCode of cache values to
    // sets of CacheEntry instances
    MapOfSets<int, FieldCache.CacheEntry> valIdToItems = new MapOfSets<int, FieldCache.CacheEntry>(new Dictionary<int, ISet<FieldCache.CacheEntry>>(17));
    // maps ReaderField keys to Sets of ValueIds
    MapOfSets<ReaderField, int> readerFieldToValIds = new MapOfSets<ReaderField, int>(new Dictionary<ReaderField, ISet<int>>(17));

    // any keys that we know result in more than one valId
    ISet<ReaderField> valMismatchKeys = new JCG.HashSet<ReaderField>();

    // iterate over all the cacheEntries to get the mappings we'll need
    for (int i = 0; i < cacheEntries.Length; i++)
    {
        FieldCache.CacheEntry item = cacheEntries[i];
        object val = item.Value;

        // It's OK to have dup entries, where one is eg
        // float[] and the other is the Bits (from
        // getDocWithField())
        if (val is IBits)
        {
            continue;
        }

        if (val is FieldCache.ICreationPlaceholder)
        {
            continue;
        }

        ReaderField rf = new ReaderField(item.ReaderKey, item.FieldName);

        int valId = RuntimeHelpers.GetHashCode(val);

        // indirect mapping, so the MapOfSet will dedup identical valIds for us
        valIdToItems.Put(valId, item);
        if (1 < readerFieldToValIds.Put(rf, valId))
        {
            valMismatchKeys.Add(rf);
        }
    }

    JCG.List<Insanity> insanity = new JCG.List<Insanity>(valMismatchKeys.Count * 3);

    insanity.AddRange(CheckValueMismatch(valIdToItems, readerFieldToValIds, valMismatchKeys));
    insanity.AddRange(CheckSubreaders(valIdToItems, readerFieldToValIds));

    return insanity.ToArray();
}
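A minimal diagnostic sketch (assuming the static CheckSanity convenience overload that Lucene's checker exposes alongside the instance Check method shown above):

FieldCacheSanityChecker.Insanity[] insanities = FieldCacheSanityChecker.CheckSanity(FieldCache.DEFAULT);
foreach (FieldCacheSanityChecker.Insanity insanity in insanities)
{
    Console.WriteLine(insanity); // reports entries that redundantly cache the same reader/field
}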
public virtual void runTestQuery(SpatialMatchConcern concern, SpatialTestQuery q)
{
    String msg = q.toString(); //"Query: " + q.args.toString(ctx);
    SearchResults got = executeQuery(makeQuery(q), Math.Max(100, q.ids.Count + 1));
    if (storeShape && got.numFound > 0)
    {
        //check stored value is there
        assertNotNull(got.results[0].document.Get(strategy.FieldName));
    }

    if (concern.orderIsImportant)
    {
        IEnumerator<String> ids = q.ids.GetEnumerator();
        foreach (SearchResult r in got.results)
        {
            String id = r.document.Get("id");
            if (!ids.MoveNext())
            {
                fail(msg + " :: Did not get enough results. Expected " + q.ids + ", got: " + got.toDebugString());
            }
            assertEquals("out of order: " + msg, ids.Current, id);
        }

        if (ids.MoveNext())
        {
            fail(msg + " :: expected more results than we got: " + ids.Current);
        }
    }
    else
    {
        // We are looking at how the results overlap
        if (concern.resultsAreSuperset)
        {
            ISet<string> found = new JCG.HashSet<string>();
            foreach (SearchResult r in got.results)
            {
                found.Add(r.document.Get("id"));
            }
            foreach (String s in q.ids)
            {
                if (!found.Contains(s))
                {
                    fail("Results are missing id: " + s + " :: " + found);
                }
            }
        }
        else
        {
            IList<string> found = new JCG.List<string>();
            foreach (SearchResult r in got.results)
            {
                found.Add(r.document.Get("id"));
            }

            // sort both so that the order is not important
            CollectionUtil.TimSort(q.ids);
            CollectionUtil.TimSort(found);
            assertEquals(msg, q.ids.ToString(), found.ToString());
        }
    }
}
private void AssertTermsSeeking(Terms leftTerms, Terms rightTerms)
{
    TermsEnum leftEnum = null;
    TermsEnum rightEnum = null;

    // just an upper bound
    int numTests = AtLeast(20);
    Random random = Random;

    // collect this number of terms from the left side
    ISet<BytesRef> tests = new JCG.HashSet<BytesRef>();
    int numPasses = 0;
    while (numPasses < 10 && tests.Count < numTests)
    {
        leftEnum = leftTerms.GetEnumerator(leftEnum);
        BytesRef term = null;
        while (leftEnum.MoveNext())
        {
            term = leftEnum.Term;
            int code = random.Next(10);
            if (code == 0)
            {
                // the term
                tests.Add(BytesRef.DeepCopyOf(term));
            }
            else if (code == 1)
            {
                // truncated subsequence of term
                term = BytesRef.DeepCopyOf(term);
                if (term.Length > 0)
                {
                    // truncate it
                    term.Length = random.Next(term.Length);
                }
            }
            else if (code == 2)
            {
                // term, but ensure a non-zero offset
                var newbytes = new byte[term.Length + 5];
                Array.Copy(term.Bytes, term.Offset, newbytes, 5, term.Length);
                tests.Add(new BytesRef(newbytes, 5, term.Length));
            }
        }
        numPasses++;
    }

    IList<BytesRef> shuffledTests = new JCG.List<BytesRef>(tests);
    shuffledTests.Shuffle(Random);

    foreach (BytesRef b in shuffledTests)
    {
        leftEnum = leftTerms.GetEnumerator(leftEnum);
        rightEnum = rightTerms.GetEnumerator(rightEnum);

        Assert.AreEqual(leftEnum.SeekExact(b), rightEnum.SeekExact(b));
        Assert.AreEqual(leftEnum.SeekExact(b), rightEnum.SeekExact(b));

        SeekStatus leftStatus;
        SeekStatus rightStatus;

        leftStatus = leftEnum.SeekCeil(b);
        rightStatus = rightEnum.SeekCeil(b);
        Assert.AreEqual(leftStatus, rightStatus);
        if (leftStatus != SeekStatus.END)
        {
            Assert.AreEqual(leftEnum.Term, rightEnum.Term);
        }

        leftStatus = leftEnum.SeekCeil(b);
        rightStatus = rightEnum.SeekCeil(b);
        Assert.AreEqual(leftStatus, rightStatus);
        if (leftStatus != SeekStatus.END)
        {
            Assert.AreEqual(leftEnum.Term, rightEnum.Term);
        }
    }
}
/// <summary>
/// Enumerates all minimal prefix paths in the automaton that also intersect the <see cref="FST"/>,
/// accumulating the <see cref="FST"/> end node and output for each path.
/// </summary>
public static IList<Path<T>> IntersectPrefixPaths<T>(Automaton a, FST<T> fst)
    where T : class // LUCENENET specific - added class constraint because we are comparing reference equality
{
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(a.IsDeterministic);
    }
    IList<Path<T>> queue = new JCG.List<Path<T>>();
    IList<Path<T>> endNodes = new JCG.List<Path<T>>();
    queue.Add(new Path<T>(a.GetInitialState(), fst.GetFirstArc(new FST.Arc<T>()), fst.Outputs.NoOutput, new Int32sRef()));

    FST.Arc<T> scratchArc = new FST.Arc<T>();
    FST.BytesReader fstReader = fst.GetBytesReader();

    while (queue.Count != 0)
    {
        Path<T> path = queue[queue.Count - 1];
        queue.Remove(path);

        if (path.State.Accept)
        {
            endNodes.Add(path);
            // we can stop here if we accept this path,
            // we accept all further paths too
            continue;
        }

        Int32sRef currentInput = path.Input;
        foreach (Transition t in path.State.GetTransitions())
        {
            int min = t.Min;
            int max = t.Max;
            if (min == max)
            {
                FST.Arc<T> nextArc = fst.FindTargetArc(t.Min, path.FstNode, scratchArc, fstReader);
                if (nextArc != null)
                {
                    Int32sRef newInput = new Int32sRef(currentInput.Length + 1);
                    newInput.CopyInt32s(currentInput);
                    newInput.Int32s[currentInput.Length] = t.Min;
                    newInput.Length = currentInput.Length + 1;
                    queue.Add(new Path<T>(t.Dest, new FST.Arc<T>().CopyFrom(nextArc), fst.Outputs.Add(path.Output, nextArc.Output), newInput));
                }
            }
            else
            {
                // TODO: if this transition's TO state is accepting, and
                // it accepts the entire range possible in the FST (ie. 0 to 255),
                // we can simply use the prefix as the accepted state instead of
                // looking up all the ranges and terminate early
                // here.  This just shifts the work from one queue
                // (this one) to another (the completion search
                // done in AnalyzingSuggester).
                FST.Arc<T> nextArc = Lucene.Net.Util.Fst.Util.ReadCeilArc(min, fst, path.FstNode, scratchArc, fstReader);
                while (nextArc != null && nextArc.Label <= max)
                {
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(nextArc.Label <= max);
                        Debugging.Assert(nextArc.Label >= min, "{0} {1}", nextArc.Label, min);
                    }
                    Int32sRef newInput = new Int32sRef(currentInput.Length + 1);
                    newInput.CopyInt32s(currentInput);
                    newInput.Int32s[currentInput.Length] = nextArc.Label;
                    newInput.Length = currentInput.Length + 1;
                    queue.Add(new Path<T>(t.Dest, new FST.Arc<T>().CopyFrom(nextArc), fst.Outputs.Add(path.Output, nextArc.Output), newInput));
                    int label = nextArc.Label; // used in assert
                    nextArc = nextArc.IsLast ? null : fst.ReadNextRealArc(nextArc, fstReader);
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(nextArc is null || label < nextArc.Label, "last: {0} next: {1}", label, nextArc?.Label);
                    }
                }
            }
        }
    }
    return endNodes;
}
private void DoTest(int inputMode, Int32sRef[] terms)
{
    Array.Sort(terms);

    // Up to two positive ints, shared, generally but not
    // monotonically increasing
    {
        if (Verbose)
        {
            Console.WriteLine("TEST: now test UpToTwoPositiveIntOutputs");
        }
        UpToTwoPositiveInt64Outputs outputs = UpToTwoPositiveInt64Outputs.GetSingleton(true);
        IList<InputOutput<object>> pairs = new JCG.List<InputOutput<object>>(terms.Length);
        long lastOutput = 0;
        for (int idx = 0; idx < terms.Length; idx++)
        {
            // Sometimes go backwards
            long value = lastOutput + TestUtil.NextInt32(Random, -100, 1000);
            while (value < 0)
            {
                value = lastOutput + TestUtil.NextInt32(Random, -100, 1000);
            }

            object output;
            if (Random.Next(5) == 3)
            {
                long value2 = lastOutput + TestUtil.NextInt32(Random, -100, 1000);
                while (value2 < 0)
                {
                    value2 = lastOutput + TestUtil.NextInt32(Random, -100, 1000);
                }
                IList<long> values = new JCG.List<long>();
                values.Add(value);
                values.Add(value2);
                output = values;
            }
            else
            {
                output = outputs.Get(value);
            }
            pairs.Add(new InputOutput<object>(terms[idx], output));
        }
        new FSTTesterHelper<object>(Random, dir, inputMode, pairs, outputs, false).DoTest(false);

        // ListOfOutputs(PositiveIntOutputs), generally but not
        // monotonically increasing
        {
            if (Verbose)
            {
                Console.WriteLine("TEST: now test OneOrMoreOutputs");
            }
            PositiveInt32Outputs _outputs = PositiveInt32Outputs.Singleton;
            ListOfOutputs<long?> outputs2 = new ListOfOutputs<long?>(_outputs);
            IList<InputOutput<object>> pairs2 = new JCG.List<InputOutput<object>>(terms.Length);
            long lastOutput2 = 0;
            for (int idx = 0; idx < terms.Length; idx++)
            {
                int outputCount = TestUtil.NextInt32(Random, 1, 7);
                IList<long?> values = new JCG.List<long?>();
                for (int i = 0; i < outputCount; i++)
                {
                    // Sometimes go backwards
                    long value = lastOutput2 + TestUtil.NextInt32(Random, -100, 1000);
                    while (value < 0)
                    {
                        value = lastOutput2 + TestUtil.NextInt32(Random, -100, 1000);
                    }
                    values.Add(value);
                    lastOutput2 = value;
                }

                object output;
                if (values.Count == 1)
                {
                    output = values[0];
                }
                else
                {
                    output = values;
                }
                pairs2.Add(new InputOutput<object>(terms[idx], output));
            }
            new FSTTester<object>(Random, dir, inputMode, pairs2, outputs2, false).DoTest(false);
        }
    }
}
/// <summary>
/// Extracts all <see cref="MultiTermQuery"/>s for <paramref name="field"/>, and returns equivalent
/// automata that will match terms.
/// </summary>
internal static CharacterRunAutomaton[] ExtractAutomata(Query query, string field)
{
    JCG.List<CharacterRunAutomaton> list = new JCG.List<CharacterRunAutomaton>();
    if (query is BooleanQuery booleanQuery)
    {
        foreach (BooleanClause clause in booleanQuery.GetClauses())
        {
            if (!clause.IsProhibited)
            {
                list.AddRange(ExtractAutomata(clause.Query, field));
            }
        }
    }
    else if (query is DisjunctionMaxQuery disjunctionMaxQuery)
    {
        foreach (Query sub in disjunctionMaxQuery.Disjuncts)
        {
            list.AddRange(ExtractAutomata(sub, field));
        }
    }
    else if (query is SpanOrQuery spanOrQuery)
    {
        foreach (Query sub in spanOrQuery.GetClauses())
        {
            list.AddRange(ExtractAutomata(sub, field));
        }
    }
    else if (query is SpanNearQuery spanNearQuery)
    {
        foreach (Query sub in spanNearQuery.GetClauses())
        {
            list.AddRange(ExtractAutomata(sub, field));
        }
    }
    else if (query is SpanNotQuery spanNotQuery)
    {
        list.AddRange(ExtractAutomata(spanNotQuery.Include, field));
    }
    else if (query is SpanPositionCheckQuery spanPositionCheckQuery)
    {
        list.AddRange(ExtractAutomata(spanPositionCheckQuery.Match, field));
    }
    else if (query is ISpanMultiTermQueryWrapper spanMultiTermQueryWrapper)
    {
        list.AddRange(ExtractAutomata(spanMultiTermQueryWrapper.WrappedQuery, field));
    }
    else if (query is AutomatonQuery aq)
    {
        if (aq.Field.Equals(field, StringComparison.Ordinal))
        {
            list.Add(new CharacterRunAutomatonToStringAnonymousClass(aq.Automaton, () => aq.ToString()));
        }
    }
    else if (query is PrefixQuery pq)
    {
        Term prefix = pq.Prefix;
        if (prefix.Field.Equals(field, StringComparison.Ordinal))
        {
            list.Add(new CharacterRunAutomatonToStringAnonymousClass(
                BasicOperations.Concatenate(BasicAutomata.MakeString(prefix.Text), BasicAutomata.MakeAnyString()),
                () => pq.ToString()));
        }
    }
    else if (query is FuzzyQuery fq)
    {
        if (fq.Field.Equals(field, StringComparison.Ordinal))
        {
            string utf16 = fq.Term.Text;
            int[] termText = new int[utf16.CodePointCount(0, utf16.Length)];
            for (int cp, i = 0, j = 0; i < utf16.Length; i += Character.CharCount(cp))
            {
                termText[j++] = cp = utf16.CodePointAt(i);
            }
            int termLength = termText.Length;
            int prefixLength = Math.Min(fq.PrefixLength, termLength);
            string suffix = UnicodeUtil.NewString(termText, prefixLength, termText.Length - prefixLength);
            LevenshteinAutomata builder = new LevenshteinAutomata(suffix, fq.Transpositions);
            Automaton automaton = builder.ToAutomaton(fq.MaxEdits);
            if (prefixLength > 0)
            {
                Automaton prefix = BasicAutomata.MakeString(UnicodeUtil.NewString(termText, 0, prefixLength));
                automaton = BasicOperations.Concatenate(prefix, automaton);
            }
            list.Add(new CharacterRunAutomatonToStringAnonymousClass(automaton, () => fq.ToString()));
        }
    }
    else if (query is TermRangeQuery tq)
    {
        if (tq.Field.Equals(field, StringComparison.Ordinal))
        {
            // this is *not* an automaton, but it's very simple
            list.Add(new SimpleCharacterRunAutomatonAnonymousClass(BasicAutomata.MakeEmpty(), tq));
        }
    }
    return list.ToArray(/*new CharacterRunAutomaton[list.size()]*/);
}
public virtual void TestCRTReopen()
{
    //test behaving badly

    //should be high enough
    int maxStaleSecs = 20;

    //build crap data just to store it.
    string s = " abcdefghijklmnopqrstuvwxyz ";
    char[] chars = s.ToCharArray();
    StringBuilder builder = new StringBuilder(2048);
    for (int i = 0; i < 2048; i++)
    {
        builder.Append(chars[Random.Next(chars.Length)]);
    }
    string content = builder.ToString();

    SnapshotDeletionPolicy sdp = new SnapshotDeletionPolicy(new KeepOnlyLastCommitDeletionPolicy());
    Directory dir = new NRTCachingDirectory(NewFSDirectory(CreateTempDir("nrt")), 5, 128);
    IndexWriterConfig config = new IndexWriterConfig(
#pragma warning disable 612, 618
        Version.LUCENE_46,
#pragma warning restore 612, 618
        new MockAnalyzer(Random));
    config.SetIndexDeletionPolicy(sdp);
    config.SetOpenMode(OpenMode.CREATE_OR_APPEND);
    IndexWriter iw = new IndexWriter(dir, config);
    SearcherManager sm = new SearcherManager(iw, true, new SearcherFactory());
    TrackingIndexWriter tiw = new TrackingIndexWriter(iw);
    ControlledRealTimeReopenThread<IndexSearcher> controlledRealTimeReopenThread =
        new ControlledRealTimeReopenThread<IndexSearcher>(tiw, sm, maxStaleSecs, 0);

    controlledRealTimeReopenThread.IsBackground = true;
    controlledRealTimeReopenThread.Start();

    IList<ThreadJob> commitThreads = new JCG.List<ThreadJob>();

    for (int i = 0; i < 500; i++)
    {
        if (i > 0 && i % 50 == 0)
        {
            ThreadJob commitThread = new RunnableAnonymousClass(this, sdp, dir, iw);
            commitThread.Start();
            commitThreads.Add(commitThread);
        }
        Document d = new Document();
        d.Add(new TextField("count", i + "", Field.Store.NO));
        d.Add(new TextField("content", content, Field.Store.YES));
        long start = J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond; // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results
        long l = tiw.AddDocument(d);
        controlledRealTimeReopenThread.WaitForGeneration(l);
        long wait = (J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond) - start; // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results
        assertTrue("waited too long for generation " + wait, wait < (maxStaleSecs * 1000));
        IndexSearcher searcher = sm.Acquire();
        TopDocs td = searcher.Search(new TermQuery(new Term("count", i + "")), 10);
        sm.Release(searcher);
        assertEquals(1, td.TotalHits);
    }

    foreach (ThreadJob commitThread in commitThreads)
    {
        commitThread.Join();
    }

    controlledRealTimeReopenThread.Dispose();
    sm.Dispose();
    iw.Dispose();
    dir.Dispose();
}
public override Scorer GetScorer(AtomicReaderContext context, IBits acceptDocs)
{
    IList<Scorer> required = new JCG.List<Scorer>();
    IList<Scorer> prohibited = new JCG.List<Scorer>();
    IList<Scorer> optional = new JCG.List<Scorer>();
    IEnumerator<BooleanClause> cIter = outerInstance.clauses.GetEnumerator();
    foreach (Weight w in m_weights)
    {
        cIter.MoveNext();
        BooleanClause c = cIter.Current;
        Scorer subScorer = w.GetScorer(context, acceptDocs);
        if (subScorer == null)
        {
            if (c.IsRequired)
            {
                return null;
            }
        }
        else if (c.IsRequired)
        {
            required.Add(subScorer);
        }
        else if (c.IsProhibited)
        {
            prohibited.Add(subScorer);
        }
        else
        {
            optional.Add(subScorer);
        }
    }

    if (required.Count == 0 && optional.Count == 0)
    {
        // no required and optional clauses.
        return null;
    }
    else if (optional.Count < outerInstance.m_minNrShouldMatch)
    {
        // either >1 req scorer, or there are 0 req scorers and at least 1
        // optional scorer. Therefore if there are not enough optional scorers
        // no documents will be matched by the query
        return null;
    }

    // simple conjunction
    if (optional.Count == 0 && prohibited.Count == 0)
    {
        float coord = disableCoord ? 1.0f : Coord(required.Count, m_maxCoord);
        return new ConjunctionScorer(this, required.ToArray(), coord);
    }

    // simple disjunction
    if (required.Count == 0 && prohibited.Count == 0 && outerInstance.m_minNrShouldMatch <= 1 && optional.Count > 1)
    {
        var coord = new float[optional.Count + 1];
        for (int i = 0; i < coord.Length; i++)
        {
            coord[i] = disableCoord ? 1.0f : Coord(i, m_maxCoord);
        }
        return new DisjunctionSumScorer(this, optional.ToArray(), coord);
    }

    // Return a BooleanScorer2
    return new BooleanScorer2(this, disableCoord, outerInstance.m_minNrShouldMatch, required, prohibited, optional, m_maxCoord);
}
/// <summary>
/// Low level api. Returns a token stream generated from a <see cref="Terms"/>. This
/// can be used to feed the highlighter with a pre-parsed token
/// stream. The <see cref="Terms"/> must have offsets available.
/// <para/>
/// In my tests the speeds to recreate 1000 token streams using this method are:
/// <list type="bullet">
///     <item><description>
///     with TermVector offset only data stored - 420 milliseconds
///     </description></item>
///     <item><description>
///     with TermVector offset AND position data stored - 271 milliseconds
///     (nb timings for TermVector with position data are based on a tokenizer with contiguous
///     positions - no overlaps or gaps)
///     </description></item>
///     <item><description>
///     The cost of not using TermPositionVector to store
///     pre-parsed content and using an analyzer to re-parse the original content:
///     - reanalyzing the original content - 980 milliseconds
///     </description></item>
/// </list>
///
/// The re-analyze timings will typically vary depending on -
/// <list type="number">
///     <item><description>
///     The complexity of the analyzer code (timings above were using a
///     stemmer/lowercaser/stopword combo)
///     </description></item>
///     <item><description>
///     The number of other fields (Lucene reads ALL fields off the disk
///     when accessing just one document field - can cost dear!)
///     </description></item>
///     <item><description>
///     Use of compression on field storage - could be faster due to compression (less disk IO)
///     or slower (more CPU burn) depending on the content.
///     </description></item>
/// </list>
/// </summary>
/// <param name="tpv">the <see cref="Terms"/> from a term vector; offsets must be available</param>
/// <param name="tokenPositionsGuaranteedContiguous">true if the token position numbers have no overlaps or gaps. If looking
/// to eke out the last drops of performance, set to true. If in doubt, set to false.</param>
/// <exception cref="ArgumentException">if no offsets are available</exception>
public static TokenStream GetTokenStream(Terms tpv, bool tokenPositionsGuaranteedContiguous)
{
    if (!tpv.HasOffsets)
    {
        throw new ArgumentException("Cannot create TokenStream from Terms without offsets");
    }

    if (!tokenPositionsGuaranteedContiguous && tpv.HasPositions)
    {
        return new TokenStreamFromTermPositionVector(tpv);
    }

    bool hasPayloads = tpv.HasPayloads;

    // code to reconstruct the original sequence of Tokens
    TermsEnum termsEnum = tpv.GetEnumerator();
    int totalTokens = 0;
    while (termsEnum.MoveNext())
    {
        totalTokens += (int)termsEnum.TotalTermFreq;
    }
    Token[] tokensInOriginalOrder = new Token[totalTokens];
    JCG.List<Token> unsortedTokens = null;
    termsEnum = tpv.GetEnumerator();
    DocsAndPositionsEnum dpEnum = null;
    while (termsEnum.MoveNext())
    {
        dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
        if (dpEnum is null)
        {
            throw new ArgumentException("Required TermVector Offset information was not found");
        }
        string term = termsEnum.Term.Utf8ToString();

        dpEnum.NextDoc();
        int freq = dpEnum.Freq;
        for (int posUpto = 0; posUpto < freq; posUpto++)
        {
            int pos = dpEnum.NextPosition();
            if (dpEnum.StartOffset < 0)
            {
                throw new ArgumentException("Required TermVector Offset information was not found");
            }
            Token token = new Token(term, dpEnum.StartOffset, dpEnum.EndOffset);
            if (hasPayloads)
            {
                // Must make a deep copy of the returned payload,
                // since D&PEnum API is allowed to re-use on every
                // call:
                token.Payload = BytesRef.DeepCopyOf(dpEnum.GetPayload());
            }

            if (tokenPositionsGuaranteedContiguous && pos != -1)
            {
                // We have positions stored and a guarantee that the token position
                // information is contiguous

                // This may be fast BUT wont work if Tokenizers used which create >1
                // token in same position or
                // creates jumps in position numbers - this code would fail under those
                // circumstances

                // tokens stored with positions - can use this to index straight into
                // sorted array
                tokensInOriginalOrder[pos] = token;
            }
            else
            {
                // tokens NOT stored with positions or not guaranteed contiguous - must
                // add to list and sort later
                if (unsortedTokens is null)
                {
                    unsortedTokens = new JCG.List<Token>();
                }
                unsortedTokens.Add(token);
            }
        }
    }

    // If the field has been stored without position data we must perform a sort
    if (unsortedTokens != null)
    {
        tokensInOriginalOrder = unsortedTokens.ToArray();
        ArrayUtil.TimSort(tokensInOriginalOrder, TokenComparer.Default);
        //tokensInOriginalOrder = tokensInOriginalOrder
        //    .OrderBy(t => t, new TokenComparer())
        //    .ToArray();
    }
    return new StoredTokenStream(tokensInOriginalOrder);
}
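A hedged usage sketch of the method above, here assumed to live on the highlighter module's TokenSources class (the index path, field name, and document id are illustrative, and the "body" field is assumed indexed with term vectors storing offsets):

using DirectoryReader reader = DirectoryReader.Open(FSDirectory.Open(new DirectoryInfo("/path/to/index"))); // illustrative path
int docId = 0; // illustrative document id
Terms tpv = reader.GetTermVector(docId, "body");
if (tpv != null)
{
    // false = do not assume contiguous positions; the safe default per the doc comment above
    TokenStream ts = TokenSources.GetTokenStream(tpv, tokenPositionsGuaranteedContiguous: false);
    // ts can now be fed to a highlighter instead of re-analyzing the stored text
}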
public override Query Rewrite(IndexReader reader)
{
    // ArrayList spanClauses = new ArrayList();
    if (contents is TermQuery)
    {
        return contents;
    }
    // Build a sequence of Span clauses arranged in a SpanNear - child
    // clauses can be complex
    // Booleans e.g. nots and ors etc
    int numNegatives = 0;
    if (!(contents is BooleanQuery))
    {
        throw new ArgumentException("Unknown query type \"" + contents.GetType().Name
            + "\" found in phrase query string \"" + phrasedQueryStringContents + "\"");
    }
    BooleanQuery bq = (BooleanQuery)contents;
    BooleanClause[] bclauses = bq.GetClauses();
    SpanQuery[] allSpanClauses = new SpanQuery[bclauses.Length];
    // For all clauses e.g. one* two~
    for (int i = 0; i < bclauses.Length; i++)
    {
        // HashSet bclauseterms=new HashSet();
        Query qc = bclauses[i].Query;
        // Rewrite this clause e.g one* becomes (one OR onerous)
        qc = qc.Rewrite(reader);
        if (bclauses[i].Occur.Equals(Occur.MUST_NOT))
        {
            numNegatives++;
        }

        if (qc is BooleanQuery booleanQuery)
        {
            IList<SpanQuery> sc = new JCG.List<SpanQuery>();
            AddComplexPhraseClause(sc, booleanQuery);
            if (sc.Count > 0)
            {
                allSpanClauses[i] = sc[0];
            }
            else
            {
                // Insert fake term e.g. phrase query was for "Fred Smithe*" and
                // there were no "Smithe*" terms - need to
                // prevent match on just "Fred".
                allSpanClauses[i] = new SpanTermQuery(new Term(field, "Dummy clause because no terms found - must match nothing"));
            }
        }
        else
        {
            if (qc is TermQuery tq)
            {
                allSpanClauses[i] = new SpanTermQuery(tq.Term);
            }
            else
            {
                throw new ArgumentException("Unknown query type \"" + qc.GetType().Name
                    + "\" found in phrase query string \"" + phrasedQueryStringContents + "\"");
            }
        }
    }
    if (numNegatives == 0)
    {
        // The simple case - no negative elements in phrase
        return new SpanNearQuery(allSpanClauses, slopFactor, inOrder);
    }
    // Complex case - we have mixed positives and negatives in the
    // sequence.
    // Need to return a SpanNotQuery
    JCG.List<SpanQuery> positiveClauses = new JCG.List<SpanQuery>();
    for (int j = 0; j < allSpanClauses.Length; j++)
    {
        if (!bclauses[j].Occur.Equals(Occur.MUST_NOT))
        {
            positiveClauses.Add(allSpanClauses[j]);
        }
    }

    SpanQuery[] includeClauses = positiveClauses.ToArray();

    SpanQuery include; // LUCENENET: IDE0059: Remove unnecessary value assignment
    if (includeClauses.Length == 1)
    {
        include = includeClauses[0]; // only one positive clause
    }
    else
    {
        // need to increase slop factor based on gaps introduced by
        // negatives
        include = new SpanNearQuery(includeClauses, slopFactor + numNegatives, inOrder);
    }
    // Use sequence of positive and negative values as the exclude.
    SpanNearQuery exclude = new SpanNearQuery(allSpanClauses, slopFactor, inOrder);
    SpanNotQuery snot = new SpanNotQuery(include, exclude);
    return snot;
}
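A small sketch of a query whose phrase clauses exercise the rewrite above (assuming this method belongs to the complex phrase query produced by ComplexPhraseQueryParser; analyzer, version, and field are illustrative):

var analyzer = new StandardAnalyzer(LuceneVersion.LUCENE_48);
var parser = new ComplexPhraseQueryParser(LuceneVersion.LUCENE_48, "body", analyzer);
// wildcards and fuzzy terms are allowed inside the quoted phrase
Query q = parser.Parse("\"fred smith*\"~2");
// q.Rewrite(reader) then produces the SpanNearQuery/SpanNotQuery built above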
public override MergeSpecification FindForcedMerges(SegmentInfos segmentInfos, int maxSegmentCount, IDictionary<SegmentCommitInfo, bool> segmentsToMerge)
{
    // first find all old segments
    IDictionary<SegmentCommitInfo, bool> oldSegments = new Dictionary<SegmentCommitInfo, bool>();
    foreach (SegmentCommitInfo si in segmentInfos.Segments)
    {
        if (segmentsToMerge.TryGetValue(si, out bool v) && ShouldUpgradeSegment(si))
        {
            oldSegments[si] = v;
        }
    }

    if (Verbose())
    {
        Message("findForcedMerges: segmentsToUpgrade=" + oldSegments);
    }

    if (oldSegments.Count == 0)
    {
        return null;
    }

    MergeSpecification spec = m_base.FindForcedMerges(segmentInfos, maxSegmentCount, oldSegments);

    if (spec != null)
    {
        // remove all segments that are in merge specification from oldSegments,
        // the resulting set contains all segments that are left over
        // and will be merged to one additional segment:
        foreach (OneMerge om in spec.Merges)
        {
            foreach (SegmentCommitInfo sipc in om.Segments)
            {
                oldSegments.Remove(sipc);
            }
        }
    }

    if (oldSegments.Count > 0)
    {
        if (Verbose())
        {
            Message("findForcedMerges: " + m_base.GetType().Name + " does not want to merge all old segments, merge remaining ones into new segment: " + oldSegments);
        }
        IList<SegmentCommitInfo> newInfos = new JCG.List<SegmentCommitInfo>();
        foreach (SegmentCommitInfo si in segmentInfos.Segments)
        {
            if (oldSegments.ContainsKey(si))
            {
                newInfos.Add(si);
            }
        }
        // add the final merge
        if (spec == null)
        {
            spec = new MergeSpecification();
        }
        spec.Add(new OneMerge(newInfos));
    }

    return spec;
}
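A minimal wiring sketch (assuming the standard UpgradeIndexMergePolicy wrapper that this override belongs to; the index path, analyzer, and version are illustrative):

var dir = FSDirectory.Open(new DirectoryInfo("/path/to/index")); // illustrative path
var analyzer = new StandardAnalyzer(LuceneVersion.LUCENE_48);
var config = new IndexWriterConfig(LuceneVersion.LUCENE_48, analyzer);
config.SetMergePolicy(new UpgradeIndexMergePolicy(new TieredMergePolicy()));
using (var writer = new IndexWriter(dir, config))
{
    writer.ForceMerge(1); // also upgrades any segments written by older format versions
}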
public Int64RangeCounter(Int64Range[] ranges)
{
    // Maps all range inclusive endpoints to int flags; 1
    // = start of interval, 2 = end of interval. We need to
    // track the start vs end case separately because if a
    // given point is both, then it must be its own
    // elementary interval:
    IDictionary<long, int> endsMap = new Dictionary<long, int>
    {
        [long.MinValue] = 1,
        [long.MaxValue] = 2
    };

    foreach (Int64Range range in ranges)
    {
        if (!endsMap.TryGetValue(range.minIncl, out int cur))
        {
            endsMap[range.minIncl] = 1;
        }
        else
        {
            endsMap[range.minIncl] = cur | 1;
        }
        if (!endsMap.TryGetValue(range.maxIncl, out cur))
        {
            endsMap[range.maxIncl] = 2;
        }
        else
        {
            endsMap[range.maxIncl] = cur | 2;
        }
    }

    var endsList = new JCG.List<long>(endsMap.Keys);
    endsList.Sort();

    // Build elementaryIntervals (a 1D Venn diagram):
    IList<InclusiveRange> elementaryIntervals = new JCG.List<InclusiveRange>();
    int upto0 = 1;
    long v = endsList[0];
    long prev;
    if (endsMap[v] == 3)
    {
        elementaryIntervals.Add(new InclusiveRange(v, v));
        prev = v + 1;
    }
    else
    {
        prev = v;
    }

    while (upto0 < endsList.Count)
    {
        v = endsList[upto0];
        int flags = endsMap[v];
        //System.out.println("  v=" + v + " flags=" + flags);
        if (flags == 3)
        {
            // This point is both an end and a start; we need to
            // separate it:
            if (v > prev)
            {
                elementaryIntervals.Add(new InclusiveRange(prev, v - 1));
            }
            elementaryIntervals.Add(new InclusiveRange(v, v));
            prev = v + 1;
        }
        else if (flags == 1)
        {
            // This point is only the start of an interval;
            // attach it to next interval:
            if (v > prev)
            {
                elementaryIntervals.Add(new InclusiveRange(prev, v - 1));
            }
            prev = v;
        }
        else
        {
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(flags == 2);
            }
            // This point is only the end of an interval; attach
            // it to last interval:
            elementaryIntervals.Add(new InclusiveRange(prev, v));
            prev = v + 1;
        }
        //System.out.println("    ints=" + elementaryIntervals);
        upto0++;
    }

    // Build binary tree on top of intervals:
    root = Split(0, elementaryIntervals.Count, elementaryIntervals);

    // Set outputs, so we know which range to output for
    // each node in the tree:
    for (int i = 0; i < ranges.Length; i++)
    {
        root.AddOutputs(i, ranges[i]);
    }

    // Set boundaries (ends of each elementary interval):
    boundaries = new long[elementaryIntervals.Count];
    for (int i = 0; i < boundaries.Length; i++)
    {
        boundaries[i] = elementaryIntervals[i].End;
    }

    leafCounts = new int[boundaries.Length];

    //System.out.println("ranges: " + Arrays.toString(ranges));
    //System.out.println("intervals: " + elementaryIntervals);
    //System.out.println("boundaries: " + Arrays.toString(boundaries));
    //System.out.println("root:\n" + root);
}
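A worked illustration of the elementary-interval construction above (the range labels and bounds are illustrative, and the counter type is assumed visible to the caller):

Int64Range[] ranges =
{
    new Int64Range("small", 0, true, 10, true), // [0, 10]
    new Int64Range("large", 5, true, 20, true), // [5, 20]
};
// The endpoint flags produce the sorted elementary intervals
//   [long.MinValue, -1], [0, 4], [5, 10], [11, 20], [21, long.MaxValue]
// so "small" is counted from [0, 4] + [5, 10] and "large" from [5, 10] + [11, 20].
var counter = new Int64RangeCounter(ranges);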
/// <summary>
/// Query should be rewritten for wild/fuzzy support.
/// </summary>
/// <param name="query"> rewritten query </param>
/// <returns> payloads Collection </returns>
/// <exception cref="IOException"> if there is a low-level I/O error </exception>
public virtual ICollection<byte[]> GetPayloadsForQuery(Query query)
{
    var payloads = new JCG.List<byte[]>();
    QueryToSpanQuery(query, payloads);
    return payloads;
}
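A minimal usage sketch (assuming this method's containing PayloadSpanUtil class and an index whose "body" field was indexed with payloads; the index path, field, and term are illustrative):

using DirectoryReader reader = DirectoryReader.Open(FSDirectory.Open(new DirectoryInfo("/path/to/index"))); // illustrative path
var psu = new PayloadSpanUtil(reader.Context);
ICollection<byte[]> payloads = psu.GetPayloadsForQuery(new TermQuery(new Term("body", "lucene")));
foreach (byte[] payload in payloads)
{
    Console.WriteLine(BitConverter.ToString(payload));
}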
public override void Flush(IDictionary<string, TermsHashConsumerPerField> fieldsToFlush, SegmentWriteState state)
{
    // Gather all FieldData's that have postings, across all
    // ThreadStates
    IList<FreqProxTermsWriterPerField> allFields = new JCG.List<FreqProxTermsWriterPerField>();

    foreach (TermsHashConsumerPerField f in fieldsToFlush.Values)
    {
        FreqProxTermsWriterPerField perField = (FreqProxTermsWriterPerField)f;
        if (perField.termsHashPerField.bytesHash.Count > 0)
        {
            allFields.Add(perField);
        }
    }

    int numAllFields = allFields.Count;

    // Sort by field name
    CollectionUtil.IntroSort(allFields);

    FieldsConsumer consumer = state.SegmentInfo.Codec.PostingsFormat.FieldsConsumer(state);

    bool success = false;

    try
    {
        TermsHash termsHash = null;

        /*
        Current writer chain:
            FieldsConsumer
            -> IMPL: FormatPostingsTermsDictWriter
                -> TermsConsumer
                    -> IMPL: FormatPostingsTermsDictWriter.TermsWriter
                        -> DocsConsumer
                            -> IMPL: FormatPostingsDocsWriter
                                -> PositionsConsumer
                                    -> IMPL: FormatPostingsPositionsWriter
        */

        for (int fieldNumber = 0; fieldNumber < numAllFields; fieldNumber++)
        {
            FieldInfo fieldInfo = allFields[fieldNumber].fieldInfo;

            FreqProxTermsWriterPerField fieldWriter = allFields[fieldNumber];

            // If this field has postings then add them to the
            // segment
            fieldWriter.Flush(fieldInfo.Name, consumer, state);

            TermsHashPerField perField = fieldWriter.termsHashPerField;
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(termsHash == null || termsHash == perField.termsHash);
            }
            termsHash = perField.termsHash;
            int numPostings = perField.bytesHash.Count;
            perField.Reset();
            perField.ShrinkHash(/* numPostings // LUCENENET: Not used */);
            fieldWriter.Reset();
        }

        if (termsHash != null)
        {
            termsHash.Reset();
        }
        success = true;
    }
    finally
    {
        if (success)
        {
            IOUtils.Dispose(consumer);
        }
        else
        {
            IOUtils.DisposeWhileHandlingException(consumer);
        }
    }
}
public static void Main(string[] args) { if (args.Length < 5) { // LUCENENET specific - our wrapper console shows the correct usage throw new ArgumentException(); //Console.Error.WriteLine("Usage: MultiPassIndexSplitter -out <outputDir> -num <numParts> [-seq] <inputIndex1> [<inputIndex2 ...]"); //Console.Error.WriteLine("\tinputIndex\tpath to input index, multiple values are ok"); //Console.Error.WriteLine("\t-out outputDir\tpath to output directory to contain partial indexes"); //Console.Error.WriteLine("\t-num numParts\tnumber of parts to produce"); //Console.Error.WriteLine("\t-seq\tsequential docid-range split (default is round-robin)"); //Environment.Exit(-1); } IList <IndexReader> indexes = new JCG.List <IndexReader>(); try { string outDir = null; int numParts = -1; bool seq = false; for (int i = 0; i < args.Length; i++) { if (args[i].Equals("-out", StringComparison.Ordinal)) { outDir = args[++i]; } else if (args[i].Equals("-num", StringComparison.Ordinal)) { numParts = Convert.ToInt32(args[++i], CultureInfo.InvariantCulture); } else if (args[i].Equals("-seq", StringComparison.Ordinal)) { seq = true; } else { DirectoryInfo file = new DirectoryInfo(args[i]); if (!file.Exists) { Console.Error.WriteLine("Invalid input path - skipping: " + file); continue; } using Store.Directory dir = FSDirectory.Open(new DirectoryInfo(args[i])); try { if (!DirectoryReader.IndexExists(dir)) { Console.Error.WriteLine("Invalid input index - skipping: " + file); continue; } } catch (Exception e) when(e.IsException()) { Console.Error.WriteLine("Invalid input index - skipping: " + file); continue; } indexes.Add(DirectoryReader.Open(dir)); } } if (outDir is null) { throw new Exception("Required argument missing: -out outputDir"); } if (numParts < 2) { throw new Exception("Invalid value of required argument: -num numParts"); } if (indexes.Count == 0) { throw new Exception("No input indexes to process"); } DirectoryInfo @out = new DirectoryInfo(outDir); @out.Create(); if (!new DirectoryInfo(outDir).Exists) { throw new Exception("Can't create output directory: " + @out); } Store.Directory[] dirs = new Store.Directory[numParts]; try { for (int i = 0; i < numParts; i++) { dirs[i] = FSDirectory.Open(new DirectoryInfo(Path.Combine(@out.FullName, "part-" + i))); } MultiPassIndexSplitter splitter = new MultiPassIndexSplitter(); IndexReader input; if (indexes.Count == 1) { input = indexes[0]; } else { input = new MultiReader(indexes.ToArray()); } #pragma warning disable 612, 618 splitter.Split(LuceneVersion.LUCENE_CURRENT, input, dirs, seq); #pragma warning restore 612, 618 } finally { // LUCENENET specific - properly dispose directories to prevent resource leaks IOUtils.Dispose(dirs); } } finally { // LUCENENET specific - properly dispose index readers to prevent resource leaks IOUtils.Dispose(indexes); } }
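For reference, a typical invocation of the tool looks like the following; the paths and part count are examples only:

// Splits one source index into three parts using sequential docid ranges:
//   MultiPassIndexSplitter -out /tmp/split-parts -num 3 -seq /indexes/source
// The equivalent programmatic call:
MultiPassIndexSplitter.Main(new[] { "-out", "/tmp/split-parts", "-num", "3", "-seq", "/indexes/source" });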
/// <summary> /// Retrieve suggestions. /// </summary> public virtual IList <LookupResult> DoLookup(string key, IEnumerable <BytesRef> contexts, int num) { // LUCENENET: Added guard clause for null if (key is null) { throw new ArgumentNullException(nameof(key)); } if (contexts != null) { throw new ArgumentException("this suggester doesn't support contexts"); } TokenStream ts = queryAnalyzer.GetTokenStream("", key); try { ITermToBytesRefAttribute termBytesAtt = ts.AddAttribute <ITermToBytesRefAttribute>(); IOffsetAttribute offsetAtt = ts.AddAttribute <IOffsetAttribute>(); IPositionLengthAttribute posLenAtt = ts.AddAttribute <IPositionLengthAttribute>(); IPositionIncrementAttribute posIncAtt = ts.AddAttribute <IPositionIncrementAttribute>(); ts.Reset(); var lastTokens = new BytesRef[grams]; //System.out.println("lookup: key='" + key + "'"); // Run full analysis, but save only the // last 1gram, last 2gram, etc.: BytesRef tokenBytes = termBytesAtt.BytesRef; int maxEndOffset = -1; bool sawRealToken = false; while (ts.IncrementToken()) { termBytesAtt.FillBytesRef(); sawRealToken |= tokenBytes.Length > 0; // TODO: this is somewhat iffy; today, ShingleFilter // sets posLen to the gram count; maybe we should make // a separate dedicated att for this? int gramCount = posLenAtt.PositionLength; if (Debugging.AssertsEnabled) { Debugging.Assert(gramCount <= grams); } // Safety: make sure the recalculated count "agrees": if (CountGrams(tokenBytes) != gramCount) { throw new ArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + CountGrams(tokenBytes)); } maxEndOffset = Math.Max(maxEndOffset, offsetAtt.EndOffset); lastTokens[gramCount - 1] = BytesRef.DeepCopyOf(tokenBytes); } ts.End(); if (!sawRealToken) { throw new ArgumentException("no tokens produced by analyzer, or the only tokens were empty strings"); } // Carefully fill last tokens with _ tokens; // ShingleFilter apparently won't emit "only hole" // tokens: int endPosInc = posIncAtt.PositionIncrement; // Note this will also be true if input is the empty // string (in which case we saw no tokens and // maxEndOffset is still -1), which in fact works out OK // because we fill the unigram with an empty BytesRef // below: bool lastTokenEnded = offsetAtt.EndOffset > maxEndOffset || endPosInc > 0; //System.out.println("maxEndOffset=" + maxEndOffset + " vs " + offsetAtt.EndOffset); if (lastTokenEnded) { //System.out.println(" lastTokenEnded"); // If user hit space after the last token, then // "upgrade" all tokens. 
This way "foo " will suggest // all bigrams starting w/ foo, and not any unigrams // starting with "foo": for (int i = grams - 1; i > 0; i--) { BytesRef token = lastTokens[i - 1]; if (token is null) { continue; } token.Grow(token.Length + 1); token.Bytes[token.Length] = separator; token.Length++; lastTokens[i] = token; } lastTokens[0] = new BytesRef(); } var arc = new FST.Arc <Int64>(); var bytesReader = fst.GetBytesReader(); // Try highest order models first, and if they return // results, return that; else, fallback: double backoff = 1.0; JCG.List <LookupResult> results = new JCG.List <LookupResult>(num); // We only add a given suffix once, from the highest // order model that saw it; for subsequent lower order // models we skip it: var seen = new JCG.HashSet <BytesRef>(); for (int gram = grams - 1; gram >= 0; gram--) { BytesRef token = lastTokens[gram]; // Don't make unigram predictions from empty string: if (token is null || (token.Length == 0 && key.Length > 0)) { // Input didn't have enough tokens: //System.out.println(" gram=" + gram + ": skip: not enough input"); continue; } if (endPosInc > 0 && gram <= endPosInc) { // Skip hole-only predictions; in theory we // shouldn't have to do this, but we'd need to fix // ShingleFilter to produce only-hole tokens: //System.out.println(" break: only holes now"); break; } //System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString()); // TODO: we could add fuzziness here // match the prefix portion exactly //Pair<Long,BytesRef> prefixOutput = null; Int64 prefixOutput = null; try { prefixOutput = LookupPrefix(fst, bytesReader, token, arc); } catch (Exception bogus) when(bogus.IsIOException()) { throw RuntimeException.Create(bogus); } //System.out.println(" prefixOutput=" + prefixOutput); if (prefixOutput is null) { // This model never saw this prefix, e.g. the // trigram model never saw context "purple mushroom" backoff *= ALPHA; continue; } // TODO: we could do this division at build time, and // bake it into the FST? // Denominator for computing scores from current // model's predictions: long contextCount = totTokens; BytesRef lastTokenFragment = null; for (int i = token.Length - 1; i >= 0; i--) { if (token.Bytes[token.Offset + i] == separator) { BytesRef context = new BytesRef(token.Bytes, token.Offset, i); long? output = Lucene.Net.Util.Fst.Util.Get(fst, Lucene.Net.Util.Fst.Util.ToInt32sRef(context, new Int32sRef())); if (Debugging.AssertsEnabled) { Debugging.Assert(output != null); } contextCount = DecodeWeight(output); lastTokenFragment = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1); break; } } BytesRef finalLastToken; if (lastTokenFragment is null) { finalLastToken = BytesRef.DeepCopyOf(token); } else { finalLastToken = BytesRef.DeepCopyOf(lastTokenFragment); } if (Debugging.AssertsEnabled) { Debugging.Assert(finalLastToken.Offset == 0); } CharsRef spare = new CharsRef(); // complete top-N Util.Fst.Util.TopResults <Int64> completions = null; try { // Because we store multiple models in one FST // (1gram, 2gram, 3gram), we must restrict the // search so that it only considers the current // model. 
For highest order model, this is not // necessary since all completions in the FST // must be from this model, but for lower order // models we have to filter out the higher order // ones: // Must do num+seen.size() for queue depth because we may // reject up to seen.size() paths in acceptResult(): Util.Fst.Util.TopNSearcher <Int64> searcher = new TopNSearcherAnonymousClass(this, fst, num, num + seen.Count, weightComparer, seen, finalLastToken); // since this search is initialized with a single start node // it is okay to start with an empty input path here searcher.AddStartPaths(arc, prefixOutput, true, new Int32sRef()); completions = searcher.Search(); if (Debugging.AssertsEnabled) { Debugging.Assert(completions.IsComplete); } } catch (Exception bogus) when(bogus.IsIOException()) { throw RuntimeException.Create(bogus); } int prefixLength = token.Length; BytesRef suffix = new BytesRef(8); //System.out.println(" " + completions.length + " completions"); foreach (Util.Fst.Util.Result <Int64> completion in completions) { token.Length = prefixLength; // append suffix Util.Fst.Util.ToBytesRef(completion.Input, suffix); token.Append(suffix); //System.out.println(" completion " + token.utf8ToString()); // Skip this path if a higher-order model already // saw/predicted its last token: BytesRef lastToken = token; for (int i = token.Length - 1; i >= 0; i--) { if (token.Bytes[token.Offset + i] == separator) { if (Debugging.AssertsEnabled) { Debugging.Assert(token.Length - i - 1 > 0); } lastToken = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1); break; } } if (seen.Contains(lastToken)) { //System.out.println(" skip dup " + lastToken.utf8ToString()); goto nextCompletionContinue; } seen.Add(BytesRef.DeepCopyOf(lastToken)); spare.Grow(token.Length); UnicodeUtil.UTF8toUTF16(token, spare); LookupResult result = new LookupResult(spare.ToString(), // LUCENENET NOTE: We need to calculate this as decimal because when using double it can sometimes // return numbers that are greater than long.MaxValue, which results in a negative long number. (long)(long.MaxValue * (decimal)backoff * ((decimal)DecodeWeight(completion.Output)) / contextCount)); results.Add(result); if (Debugging.AssertsEnabled) { Debugging.Assert(results.Count == seen.Count); } //System.out.println(" add result=" + result); nextCompletionContinue :; } backoff *= ALPHA; } results.Sort(Comparer <Lookup.LookupResult> .Create((a, b) => { if (a.Value > b.Value) { return(-1); } else if (a.Value < b.Value) { return(1); } else { // Tie break by UTF16 sort order: return(a.Key.CompareToOrdinal(b.Key)); } })); if (results.Count > num) { results.RemoveRange(num, results.Count - num); // LUCENENET: Converted end index to length } return(results); } finally { IOUtils.DisposeWhileHandlingException(ts); } }
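A usage sketch for DoLookup, assuming a suggester already built from some corpus; the analyzer, dictionary source, and field name below are illustrative, not prescribed:

// Hedged sketch: LuceneDictionary over a "body" field is just one possible source.
var suggester = new FreeTextSuggester(new StandardAnalyzer(LuceneVersion.LUCENE_48));
suggester.Build(new LuceneDictionary(reader, "body"));
// The trailing space matters: per the "upgrade all tokens" logic above, "foo "
// requests bigrams starting with foo rather than unigram completions of foo.
// contexts must be null because this suggester rejects context filtering.
IList<LookupResult> hits = suggester.DoLookup("foo ", (IEnumerable<BytesRef>)null, 5);
foreach (LookupResult hit in hits)
{
    Console.WriteLine(hit.Key + " => " + hit.Value);
}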
public static IList <string> SplitWS(string s, bool decode) { IList <string> lst = new JCG.List <string>(2); StringBuilder sb = new StringBuilder(); int pos = 0, end = s.Length; while (pos < end) { char ch = s[pos++]; if (char.IsWhiteSpace(ch)) { if (sb.Length > 0) { lst.Add(sb.ToString()); sb = new StringBuilder(); } continue; } if (ch == '\\') { if (!decode) { sb.Append(ch); } if (pos >= end) // ERROR, or let it go? { break; } ch = s[pos++]; if (decode) { switch (ch) { case 'n': ch = '\n'; break; case 't': ch = '\t'; break; case 'r': ch = '\r'; break; case 'b': ch = '\b'; break; case 'f': ch = '\f'; break; } } } sb.Append(ch); } if (sb.Length > 0) { lst.Add(sb.ToString()); } return(lst); }
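Two quick calls showing the escape handling (string contents shown as C# literals):

// The input below contains a literal backslash followed by 't'.
IList<string> decoded = SplitWS("one two\\tthree", true);
// decoded => { "one", "two\tthree" }: the escape is consumed during the scan,
// so the decoded tab lands inside the second token and never splits it.
IList<string> raw = SplitWS("one two\\tthree", false);
// raw => { "one", "two\\tthree" }: with decode=false the backslash is kept as-is.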
public virtual void TestBS2DisjunctionNextVsAdvance() { Directory d = NewDirectory(); RandomIndexWriter w = new RandomIndexWriter(Random, d); int numDocs = AtLeast(300); for (int docUpto = 0; docUpto < numDocs; docUpto++) { string contents = "a"; if (Random.Next(20) <= 16) { contents += " b"; } if (Random.Next(20) <= 8) { contents += " c"; } if (Random.Next(20) <= 4) { contents += " d"; } if (Random.Next(20) <= 2) { contents += " e"; } if (Random.Next(20) <= 1) { contents += " f"; } Document doc = new Document(); doc.Add(new TextField("field", contents, Field.Store.NO)); w.AddDocument(doc); } w.ForceMerge(1); IndexReader r = w.GetReader(); IndexSearcher s = NewSearcher(r); w.Dispose(); for (int iter = 0; iter < 10 * RandomMultiplier; iter++) { if (Verbose) { Console.WriteLine("iter=" + iter); } IList <string> terms = new JCG.List <string> { "a", "b", "c", "d", "e", "f" }; int numTerms = TestUtil.NextInt32(Random, 1, terms.Count); while (terms.Count > numTerms) { terms.RemoveAt(Random.Next(terms.Count)); } if (Verbose) { Console.WriteLine(" terms=" + terms); } BooleanQuery q = new BooleanQuery(); foreach (string term in terms) { q.Add(new BooleanClause(new TermQuery(new Term("field", term)), Occur.SHOULD)); } Weight weight = s.CreateNormalizedWeight(q); Scorer scorer = weight.GetScorer(s.m_leafContexts[0], null); // First pass: just use .NextDoc() to gather all hits IList <ScoreDoc> hits = new JCG.List <ScoreDoc>(); while (scorer.NextDoc() != DocIdSetIterator.NO_MORE_DOCS) { hits.Add(new ScoreDoc(scorer.DocID, scorer.GetScore())); } if (Verbose) { Console.WriteLine(" " + hits.Count + " hits"); } // Now, randomly next/advance through the list and // verify exact match: for (int iter2 = 0; iter2 < 10; iter2++) { weight = s.CreateNormalizedWeight(q); scorer = weight.GetScorer(s.m_leafContexts[0], null); if (Verbose) { Console.WriteLine(" iter2=" + iter2); } int upto = -1; while (upto < hits.Count) { int nextUpto; int nextDoc; int left = hits.Count - upto; if (left == 1 || Random.nextBoolean()) { // next nextUpto = 1 + upto; nextDoc = scorer.NextDoc(); } else { // advance int inc = TestUtil.NextInt32(Random, 1, left - 1); nextUpto = inc + upto; nextDoc = scorer.Advance(hits[nextUpto].Doc); } if (nextUpto == hits.Count) { Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, nextDoc); } else { ScoreDoc hit = hits[nextUpto]; Assert.AreEqual(hit.Doc, nextDoc); // LUCENENET: For some reason, on x86 in .NET Framework with optimizations enabled, using == (as Lucene did) doesn't work, but using AreEqual with an epsilon of 0f does. // Test for precise float equality: Assert.AreEqual(hit.Score, scorer.GetScore(), 0f, "doc " + hit.Doc + " has wrong score: expected=" + hit.Score + " actual=" + scorer.GetScore()); } upto = nextUpto; } } } r.Dispose(); d.Dispose(); }
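The contract this test exercises can be stated compactly: Advance(target) must land on exactly the document that repeated NextDoc() calls would first reach at or beyond target. A minimal standalone model of that contract over a sorted array (a sketch, not a Lucene scorer):

using System;

// Models the DocIdSetIterator contract: Advance(target) behaves like calling
// NextDoc() until the returned doc is >= target.
public class ArrayDocIdIterator
{
    public const int NO_MORE_DOCS = int.MaxValue;
    private readonly int[] docs; // doc ids, sorted ascending
    private int upto = -1;

    public ArrayDocIdIterator(int[] sortedDocs) { docs = sortedDocs; }

    public int NextDoc()
    {
        upto++;
        return upto < docs.Length ? docs[upto] : NO_MORE_DOCS;
    }

    public int Advance(int target)
    {
        int doc;
        do { doc = NextDoc(); } while (doc < target); // NO_MORE_DOCS ends the loop
        return doc;
    }
}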
/// <summary> /// Auto-completes a given prefix query using Depth-First Search with the end /// of prefix as source node each time finding a new leaf to get a complete key /// to be added in the suggest list. /// </summary> /// <param name="root"> /// a reference to root node of TST. </param> /// <param name="s"> /// prefix query to be auto-completed. </param> /// <param name="x"> /// index of current character to be searched while traversing through /// the prefix in TST. </param> /// <returns> suggest list of auto-completed keys for the given prefix query. </returns> public virtual IList <TernaryTreeNode> PrefixCompletion(TernaryTreeNode root, string s, int x) { TernaryTreeNode p = root; JCG.List <TernaryTreeNode> suggest = new JCG.List <TernaryTreeNode>(); while (p != null) { if (s[x] < p.splitchar) { p = p.loKid; } else if (s[x] == p.splitchar) { if (x == s.Length - 1) { break; } else { x++; } p = p.eqKid; } else { p = p.hiKid; } } if (p == null) { return(suggest); } if (p.eqKid == null && p.token == null) { return(suggest); } if (p.eqKid == null && p.token != null) { suggest.Add(p); return(suggest); } if (p.token != null) { suggest.Add(p); } p = p.eqKid; var st = new Stack <TernaryTreeNode>(); st.Push(p); while (st.Count > 0) { TernaryTreeNode top = st.Peek(); st.Pop(); if (top.token != null) { suggest.Add(top); } if (top.eqKid != null) { st.Push(top.eqKid); } if (top.loKid != null) { st.Push(top.loKid); } if (top.hiKid != null) { st.Push(top.hiKid); } } return(suggest); }
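A usage sketch; the Insert call is an assumption here, mirroring the companion method of the upstream TSTAutocomplete class (root, key, value, start index), which returns the possibly-new root:

// Hedged sketch: builds a small tree, then completes the prefix "mo".
var tst = new TSTAutocomplete();
TernaryTreeNode root = null;
foreach (string word in new[] { "mop", "moth", "mouse", "map" })
{
    root = tst.Insert(root, word, word, 0);
}
IList<TernaryTreeNode> matches = tst.PrefixCompletion(root, "mo", 0);
foreach (TernaryTreeNode n in matches)
{
    Console.WriteLine(n.token); // mop, moth, mouse (in DFS order, not alphabetical)
}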
/// <summary> /// Call this only once (if you subclass!) </summary> protected virtual void Uninvert(AtomicReader reader, IBits liveDocs, BytesRef termPrefix) { FieldInfo info = reader.FieldInfos.FieldInfo(m_field); if (info != null && info.HasDocValues) { throw IllegalStateException.Create("Type mismatch: " + m_field + " was indexed as " + info.DocValuesType); } //System.out.println("DTO uninvert field=" + field + " prefix=" + termPrefix); long startTime = J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond; // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results m_prefix = termPrefix == null ? null : BytesRef.DeepCopyOf(termPrefix); int maxDoc = reader.MaxDoc; int[] index = new int[maxDoc]; // immediate term numbers, or the index into the byte[] representing the last number int[] lastTerm = new int[maxDoc]; // last term we saw for this document var bytes = new sbyte[maxDoc][]; // list of term numbers for the doc (delta encoded vInts) Fields fields = reader.Fields; if (fields == null) { // No terms return; } Terms terms = fields.GetTerms(m_field); if (terms == null) { // No terms return; } TermsEnum te = terms.GetEnumerator(); BytesRef seekStart = termPrefix ?? new BytesRef(); //System.out.println("seekStart=" + seekStart.utf8ToString()); if (te.SeekCeil(seekStart) == TermsEnum.SeekStatus.END) { // No terms match return; } // If we need our "term index wrapper", these will be // init'd below: IList <BytesRef> indexedTerms = null; PagedBytes indexedTermsBytes = null; bool testedOrd = false; // we need a minimum of 9 bytes, but round up to 12 since the space would // be wasted with most allocators anyway. var tempArr = new sbyte[12]; // // enumerate all terms, and build an intermediate form of the un-inverted field. // // During this intermediate form, every document has a (potential) byte[] // and the int[maxDoc()] array either contains the termNumber list directly // or the *end* offset of the termNumber list in its byte array (for faster // appending and faster creation of the final form). // // idea... if things are too large while building, we could do a range of docs // at a time (but it would be a fair amount slower to build) // could also do ranges in parallel to take advantage of multiple CPUs // OPTIONAL: remap the largest df terms to the lowest 128 (single byte) // values. This requires going over the field first to find the most // frequent terms ahead of time. 
int termNum = 0; m_docsEnum = null; // Loop begins with te positioned to first term (we call // seek above): for (; ;) { BytesRef t = te.Term; if (t == null || (termPrefix != null && !StringHelper.StartsWith(t, termPrefix))) { break; } //System.out.println("visit term=" + t.utf8ToString() + " " + t + " termNum=" + termNum); if (!testedOrd) { try { m_ordBase = (int)te.Ord; //System.out.println("got ordBase=" + ordBase); } catch (Exception uoe) when(uoe.IsUnsupportedOperationException()) { // Reader cannot provide ord support, so we wrap // our own support by creating our own terms index: indexedTerms = new JCG.List <BytesRef>(); indexedTermsBytes = new PagedBytes(15); //System.out.println("NO ORDS"); } testedOrd = true; } VisitTerm(te, termNum); if (indexedTerms != null && (termNum & indexIntervalMask) == 0) { // Index this term m_sizeOfIndexedStrings += t.Length; BytesRef indexedTerm = new BytesRef(); indexedTermsBytes.Copy(t, indexedTerm); // TODO: really should 1) strip off useless suffix, // and 2) use FST not array/PagedBytes indexedTerms.Add(indexedTerm); } int df = te.DocFreq; if (df <= m_maxTermDocFreq) { m_docsEnum = te.Docs(liveDocs, m_docsEnum, DocsFlags.NONE); // dF, but takes deletions into account int actualDF = 0; for (; ;) { int doc = m_docsEnum.NextDoc(); if (doc == DocIdSetIterator.NO_MORE_DOCS) { break; } //System.out.println(" chunk=" + chunk + " docs"); actualDF++; m_termInstances++; //System.out.println(" docID=" + doc); // add TNUM_OFFSET to the term number to make room for special reserved values: // 0 (end term) and 1 (index into byte array follows) int delta = termNum - lastTerm[doc] + TNUM_OFFSET; lastTerm[doc] = termNum; int val = index[doc]; if ((val & 0xff) == 1) { // index into byte array (actually the end of // the doc-specific byte[] when building) int pos = val.TripleShift(8); int ilen = VInt32Size(delta); var arr = bytes[doc]; int newend = pos + ilen; if (newend > arr.Length) { // We avoid a doubling strategy to lower memory usage. // this faceting method isn't for docs with many terms. // In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit boundary. // TODO: figure out what array lengths we can round up to w/o actually using more memory // (how much space does a byte[] take up? Is data preceded by a 32 bit length only? // It should be safe to round up to the nearest 32 bits in any case. int newLen = (newend + 3) & unchecked ((int)0xfffffffc); // 4 byte alignment var newarr = new sbyte[newLen]; Array.Copy(arr, 0, newarr, 0, pos); arr = newarr; bytes[doc] = newarr; } pos = WriteInt32(delta, arr, pos); index[doc] = (pos << 8) | 1; // update pointer to end index in byte[] } else { // OK, this int has data in it... find the end (a zero starting byte - not // part of another number, hence not following a byte with the high bit set). int ipos; if (val == 0) { ipos = 0; } else if ((val & 0x0000ff80) == 0) { ipos = 1; } else if ((val & 0x00ff8000) == 0) { ipos = 2; } else if ((val & 0xff800000) == 0) { ipos = 3; } else { ipos = 4; } //System.out.println(" ipos=" + ipos); int endPos = WriteInt32(delta, tempArr, ipos); //System.out.println(" endpos=" + endPos); if (endPos <= 4) { //System.out.println(" fits!"); // value will fit in the integer... move bytes back for (int j = ipos; j < endPos; j++) { val |= (tempArr[j] & 0xff) << (j << 3); } index[doc] = val; } else { // value won't fit... 
move integer into byte[] for (int j = 0; j < ipos; j++) { tempArr[j] = (sbyte)val; val = val.TripleShift(8); } // point at the end index in the byte[] index[doc] = (endPos << 8) | 1; bytes[doc] = tempArr; tempArr = new sbyte[12]; } } } SetActualDocFreq(termNum, actualDF); } termNum++; if (!te.MoveNext()) { break; } } m_numTermsInField = termNum; long midPoint = J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond; // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results if (m_termInstances == 0) { // we didn't invert anything // lower memory consumption. m_tnums = null; } else { this.m_index = index; // // transform intermediate form into the final form, building a single byte[] // at a time, and releasing the intermediate byte[]s as we go to avoid // increasing the memory footprint. // for (int pass = 0; pass < 256; pass++) { var target = m_tnums[pass]; var pos = 0; // end in target; if (target != null) { pos = target.Length; } else { target = new sbyte[4096]; } // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx // where pp is the pass (which array we are building), and xx is all values. // each pass shares the same byte[] for termNumber lists. for (int docbase = pass << 16; docbase < maxDoc; docbase += (1 << 24)) { int lim = Math.Min(docbase + (1 << 16), maxDoc); for (int doc = docbase; doc < lim; doc++) { //System.out.println(" pass=" + pass + " process docID=" + doc); int val = index[doc]; if ((val & 0xff) == 1) { int len = val.TripleShift(8); //System.out.println(" ptr pos=" + pos); index[doc] = (pos << 8) | 1; // change index to point to start of array if ((pos & 0xff000000) != 0) { // we only have 24 bits for the array index throw IllegalStateException.Create("Too many values for UnInvertedField faceting on field " + m_field); } var arr = bytes[doc]; /* * for(byte b : arr) { * //System.out.println(" b=" + Integer.toHexString((int) b)); * } */ bytes[doc] = null; // IMPORTANT: allow GC to avoid OOM if (target.Length <= pos + len) { int newlen = target.Length; //* we don't have to worry about the array getting too large // since the "pos" param will overflow first (only 24 bits available) // if ((newlen<<1) <= 0) { // // overflow... // newlen = Integer.MAX_VALUE; // if (newlen <= pos + len) { // throw new SolrException(400,"Too many terms to uninvert field!"); // } // } else { // while (newlen <= pos + len) newlen<<=1; // doubling strategy // } while (newlen <= pos + len) // doubling strategy { newlen <<= 1; } var newtarget = new sbyte[newlen]; Array.Copy(target, 0, newtarget, 0, pos); target = newtarget; } Array.Copy(arr, 0, target, pos, len); pos += len + 1; // skip single byte at end and leave it 0 for terminator } } } // shrink array if (pos < target.Length) { var newtarget = new sbyte[pos]; Array.Copy(target, 0, newtarget, 0, pos); target = newtarget; } m_tnums[pass] = target; if ((pass << 16) > maxDoc) { break; } } } if (indexedTerms != null) { m_indexedTermsArray = new BytesRef[indexedTerms.Count]; indexedTerms.CopyTo(m_indexedTermsArray, 0); } long endTime = J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond; // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results m_total_time = (int)(endTime - startTime); m_phase1_time = (int)(midPoint - startTime); }
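The two helpers the inner loop relies on, VInt32Size and WriteInt32, are compact. The sketch below is an assumption consistent with the call sites above and with the upstream DocTermOrds encoding: most-significant 7-bit groups are written first, every byte except the last has its high bit set, and the return value is the new end position (TripleShift is J2N's unsigned right shift, used throughout this file):

// Hedged sketch of the vInt helpers, not the verified originals.
private static int VInt32Size(int x)
{
    if ((x & (-1 << 7)) == 0) return 1;   // value fits in 7 bits
    if ((x & (-1 << 14)) == 0) return 2;
    if ((x & (-1 << 21)) == 0) return 3;
    if ((x & (-1 << 28)) == 0) return 4;
    return 5;
}

private static int WriteInt32(int x, sbyte[] arr, int pos)
{
    int a = x.TripleShift(7 * 4);
    if (a != 0) arr[pos++] = (sbyte)((a & 0x7f) | 0x80); // continuation byte
    a = x.TripleShift(7 * 3);
    if (a != 0) arr[pos++] = (sbyte)((a & 0x7f) | 0x80);
    a = x.TripleShift(7 * 2);
    if (a != 0) arr[pos++] = (sbyte)((a & 0x7f) | 0x80);
    a = x.TripleShift(7);
    if (a != 0) arr[pos++] = (sbyte)((a & 0x7f) | 0x80);
    arr[pos++] = (sbyte)(x & 0x7f); // final byte: high bit clear
    return pos; // new end offset, matching how the callers track "endPos"
}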