internal SeekingTermSetTermsEnum(TermsEnum tenum, BytesRefHash terms, int[] ords)
    : base(tenum)
{
    Terms = terms;
    Ords = ords;
    _comparer = BytesRef.UTF8SortedAsUnicodeComparer;
    _lastElement = terms.Size() - 1;
    _lastTerm = terms.Get(ords[_lastElement], new BytesRef());
    _seekTerm = terms.Get(ords[_upto], _spare);
}
protected override AcceptStatus Accept(BytesRef term)
{
    if (_comparer.Compare(term, _lastTerm) > 0)
    {
        return AcceptStatus.END;
    }
    BytesRef currentTerm = Terms.Get(Ords[_upto], _spare);
    if (_comparer.Compare(term, currentTerm) == 0)
    {
        if (_upto == _lastElement)
        {
            return AcceptStatus.YES;
        }
        _seekTerm = Terms.Get(Ords[++_upto], _spare);
        return AcceptStatus.YES_AND_SEEK;
    }
    if (_upto == _lastElement)
    {
        return AcceptStatus.NO;
    }
    // Our current term doesn't match the given term; we may be behind it by more than one
    // step. Keep incrementing until we are equal or higher.
    int cmp;
    do
    {
        if (_upto == _lastElement)
        {
            return AcceptStatus.NO;
        }
        // Typically the terms dict is a superset of the query's terms, so it's unusual that
        // we have to skip many of our terms; hence we don't do a binary search here.
        _seekTerm = Terms.Get(Ords[++_upto], _spare);
    } while ((cmp = _comparer.Compare(_seekTerm, term)) < 0);
    if (cmp == 0)
    {
        if (_upto == _lastElement)
        {
            return AcceptStatus.YES;
        }
        _seekTerm = Terms.Get(Ords[++_upto], _spare);
        return AcceptStatus.YES_AND_SEEK;
    }
    return AcceptStatus.NO_AND_SEEK;
}
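// A hedged usage sketch (not from the surrounding source): SeekingTermSetTermsEnum follows the
// FilteredTermsEnum protocol, so it is consumed like any TermsEnum; YES_AND_SEEK / NO_AND_SEEK
// let it leapfrog the wrapped enum straight to the next query term. The method name, the
// "field" name, and the termsHash/sortedOrds parameters are illustrative assumptions.
private static void VisitMatchingTerms(IndexReader reader, BytesRefHash termsHash, int[] sortedOrds)
{
    Terms fieldTerms = MultiFields.GetTerms(reader, "field"); // "field" is a placeholder name
    TermsEnum filtered = new SeekingTermSetTermsEnum(fieldTerms.GetIterator(null), termsHash, sortedOrds);
    BytesRef matched;
    while ((matched = filtered.Next()) != null)
    {
        // each 'matched' is an index term equal to one of the query's terms
    }
}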
internal int BinarySearch(BytesRef b, BytesRef bytesRef, int low, int high, BytesRefHash hash, int[] ords, IComparer<BytesRef> comparer)
{
    int mid; // LUCENENET: IDE0059: Remove unnecessary value assignment
    while (low <= high)
    {
        mid = (low + high).TripleShift(1);
        hash.Get(ords[mid], bytesRef);
        int cmp = comparer.Compare(bytesRef, b);
        if (cmp < 0)
        {
            low = mid + 1;
        }
        else if (cmp > 0)
        {
            high = mid - 1;
        }
        else
        {
            return mid;
        }
    }
    if (Debugging.AssertsEnabled) Debugging.Assert(comparer.Compare(bytesRef, b) != 0);
    return -(low + 1);
}
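// A minimal sketch of exercising BinarySearch from inside the declaring class; the sample
// terms are made up, and hash.Count assumes the 4.8 API (older snippets here use Size()).
private int FindBanana()
{
    var hash = new BytesRefHash();
    foreach (var s in new[] { "apple", "banana", "cherry" })
    {
        hash.Add(new BytesRef(s));
    }
    int[] ords = hash.Sort(BytesRef.UTF8SortedAsUnicodeComparer);
    var scratch = new BytesRef();
    // Returns the position in ords when found, or -(insertionPoint + 1) when not,
    // mirroring Array.BinarySearch semantics.
    return BinarySearch(new BytesRef("banana"), scratch, 0, hash.Count - 1, hash, ords,
        BytesRef.UTF8SortedAsUnicodeComparer);
}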
private IEnumerable<BytesRef> GetBytesRefEnumberable(int valueCount, int[] sortedValues)
{
    for (int i = 0; i < valueCount; ++i)
    {
        // Allocate a fresh scratch per element so a consumer can safely hold the
        // previously yielded value while reading the next one.
        var scratch = new BytesRef();
        yield return hash.Get(sortedValues[i], scratch);
    }
}
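// Hypothetical consumption of the iterator above (the method name is illustrative):
private void ConsumeSortedValues()
{
    int[] sortedValues = hash.Sort(BytesRef.UTF8SortedAsUnicodeComparer);
    foreach (BytesRef value in GetBytesRefEnumberable(hash.Count, sortedValues))
    {
        // values arrive in comparer order, e.g. ready to hand to a DocValuesConsumer
    }
}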
/// <summary>
/// Returns a <see cref="StemmerOverrideMap"/> to be used with the <see cref="StemmerOverrideFilter"/>.
/// </summary>
/// <returns> a <see cref="StemmerOverrideMap"/> to be used with the <see cref="StemmerOverrideFilter"/> </returns>
/// <exception cref="IOException"> if an <see cref="IOException"/> occurs </exception>
public virtual StemmerOverrideMap Build()
{
    ByteSequenceOutputs outputs = ByteSequenceOutputs.Singleton;
    Builder<BytesRef> builder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);
    int[] sort = hash.Sort(BytesRef.UTF8SortedAsUnicodeComparer);
    Int32sRef intsSpare = new Int32sRef();
    int size = hash.Count;
    for (int i = 0; i < size; i++)
    {
        int id = sort[i];
        BytesRef bytesRef = hash.Get(id, spare);
        UnicodeUtil.UTF8toUTF32(bytesRef, intsSpare);
        builder.Add(intsSpare, new BytesRef(outputValues[id]));
    }
    return new StemmerOverrideMap(builder.Finish(), ignoreCase);
}
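// For context, a hedged sketch of the builder's public API end to end; the override pair and
// the PorterStemFilter wrapping are illustrative, not taken from this source.
private static TokenStream StemWithOverrides(TokenStream input)
{
    var overrides = new StemmerOverrideFilter.Builder(true); // true = ignoreCase
    overrides.Add("running", "run"); // surface form -> forced stem
    StemmerOverrideMap map = overrides.Build();
    // Terms matched by the map are keyword-marked, so the downstream stemmer leaves them alone.
    return new PorterStemFilter(new StemmerOverrideFilter(input, map));
}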
public override Query Rewrite(IndexReader reader, MultiTermQuery query)
{
    // Get the enum and start visiting terms. If we
    // exhaust the enum before hitting either of the
    // cutoffs, we use ConstantBooleanQueryRewrite; else,
    // ConstantFilterRewrite:
    int docCountCutoff = (int)((docCountPercent / 100.0) * reader.MaxDoc);
    int termCountLimit = Math.Min(BooleanQuery.MaxClauseCount, termCountCutoff);

    CutOffTermCollector col = new CutOffTermCollector(docCountCutoff, termCountLimit);
    CollectTerms(reader, query, col);
    int size = col.pendingTerms.Count;
    if (col.hasCutOff)
    {
        return MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE.Rewrite(reader, query);
    }
    else
    {
        BooleanQuery bq = GetTopLevelQuery();
        if (size > 0)
        {
            BytesRefHash pendingTerms = col.pendingTerms;
            int[] sort = pendingTerms.Sort(col.termsEnum.Comparer);
            for (int i = 0; i < size; i++)
            {
                int pos = sort[i];
                // docFreq is not used for constant score here, we pass 1
                // to explicitly set a fake value, so it's not calculated
                AddClause(bq, new Term(query.m_field, pendingTerms.Get(pos, new BytesRef())), 1, 1.0f, col.array.termState[pos]);
            }
        }
        // Strip scores
        Query result = new ConstantScoreQuery(bq);
        result.Boost = query.Boost;
        return result;
    }
}
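// Hedged illustration of the decision this method implements: with few matching terms the
// MultiTermQuery becomes a constant-score BooleanQuery over the collected terms; past either
// cutoff it falls back to the filter rewrite. Field and prefix are made up.
private static Query RewritePrefix(IndexReader reader)
{
    var prefix = new PrefixQuery(new Term("body", "lucen"));
    return MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT.Rewrite(reader, prefix);
}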
// [Test] // LUCENENET NOTE: For now, we are overriding this test in every subclass to pull it into the right context for the subclass
public virtual void TestRandomSortedBytes()
{
    Directory dir = NewDirectory();
    IndexWriterConfig cfg = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random()));
    if (!DefaultCodecSupportsDocsWithField())
    {
        // if the codec doesn't support missing, we expect missing to be mapped to byte[]
        // by the impersonator, but we have to give it a chance to merge them to this
        cfg.SetMergePolicy(NewLogMergePolicy());
    }
    RandomIndexWriter w = new RandomIndexWriter(Random(), dir, cfg);
    int numDocs = AtLeast(100);
    BytesRefHash hash = new BytesRefHash();
    IDictionary<string, string> docToString = new Dictionary<string, string>();
    int maxLength = TestUtil.NextInt(Random(), 1, 50);
    for (int i = 0; i < numDocs; i++)
    {
        Document doc = new Document();
        doc.Add(NewTextField("id", "" + i, Field.Store.YES));
        string @string = TestUtil.RandomRealisticUnicodeString(Random(), 1, maxLength);
        BytesRef br = new BytesRef(@string);
        doc.Add(new SortedDocValuesField("field", br));
        hash.Add(br);
        docToString["" + i] = @string;
        w.AddDocument(doc);
    }
    if (Rarely())
    {
        w.Commit();
    }
    int numDocsNoValue = AtLeast(10);
    for (int i = 0; i < numDocsNoValue; i++)
    {
        Document doc = new Document();
        doc.Add(NewTextField("id", "noValue", Field.Store.YES));
        w.AddDocument(doc);
    }
    if (!DefaultCodecSupportsDocsWithField())
    {
        BytesRef bytesRef = new BytesRef();
        hash.Add(bytesRef); // add empty value for the gaps
    }
    if (Rarely())
    {
        w.Commit();
    }
    if (!DefaultCodecSupportsDocsWithField())
    {
        // if the codec doesn't support missing, we expect missing to be mapped to byte[]
        // by the impersonator, but we have to give it a chance to merge them to this
        w.ForceMerge(1);
    }
    for (int i = 0; i < numDocs; i++)
    {
        Document doc = new Document();
        string id = "" + i + numDocs;
        doc.Add(NewTextField("id", id, Field.Store.YES));
        string @string = TestUtil.RandomRealisticUnicodeString(Random(), 1, maxLength);
        BytesRef br = new BytesRef(@string);
        hash.Add(br);
        docToString[id] = @string;
        doc.Add(new SortedDocValuesField("field", br));
        w.AddDocument(doc);
    }
    w.Commit();
    IndexReader reader = w.Reader;
    SortedDocValues docValues = MultiDocValues.GetSortedValues(reader, "field");
    int[] sort = hash.Sort(BytesRef.UTF8SortedAsUnicodeComparer);
    BytesRef expected = new BytesRef();
    BytesRef actual = new BytesRef();
    Assert.AreEqual(hash.Size(), docValues.ValueCount);
    for (int i = 0; i < hash.Size(); i++)
    {
        hash.Get(sort[i], expected);
        docValues.LookupOrd(i, actual);
        Assert.AreEqual(expected.Utf8ToString(), actual.Utf8ToString());
        int ord = docValues.LookupTerm(expected);
        Assert.AreEqual(i, ord);
    }
    AtomicReader slowR = SlowCompositeReaderWrapper.Wrap(reader);
    ISet<KeyValuePair<string, string>> entrySet = docToString.EntrySet();
    foreach (KeyValuePair<string, string> entry in entrySet)
    {
        // pk lookup
        DocsEnum termDocsEnum = slowR.TermDocsEnum(new Term("id", entry.Key));
        int docId = termDocsEnum.NextDoc();
        expected = new BytesRef(entry.Value);
        docValues.Get(docId, actual);
        Assert.AreEqual(expected, actual);
    }
    reader.Dispose();
    w.Dispose();
    dir.Dispose();
}
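// The invariant the ord loop above asserts, reduced to a hypothetical two-term setup:
// Sort returns ids in comparer order, and Get round-trips each stored term.
public virtual void TestSortRoundTrip()
{
    var h = new BytesRefHash();
    h.Add(new BytesRef("b"));
    h.Add(new BytesRef("a"));
    int[] sorted = h.Sort(BytesRef.UTF8SortedAsUnicodeComparer);
    var spare = new BytesRef();
    Assert.AreEqual("a", h.Get(sorted[0], spare).Utf8ToString()); // "a" sorts before "b"
    Assert.AreEqual("b", h.Get(sorted[1], spare).Utf8ToString());
}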