/// <summary>
/// Sorts hashed terms into ascending order, reusing memory along the
/// way. Note that sorting is lazily delayed until required (often it's
/// not required at all). If a sorted view is required then hashing +
/// sort + binary search is still faster and smaller than TreeMap usage
/// (which would be an alternative and somewhat more elegant approach,
/// apart from more sophisticated Tries / prefix trees).
/// </summary>
public void SortTerms()
{
    if (sortedTerms is null)
    {
        sortedTerms = terms.Sort(BytesRef.UTF8SortedAsUnicodeComparer);
    }
}
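The summary above describes the hash + sort + binary-search pattern only in prose. The following is a minimal sketch of that pattern, not taken from the Lucene.NET sources; the SortedTermsLookup class and FindTerm method are hypothetical names, and only the public BytesRefHash/BytesRef API from Lucene.Net.Util is assumed.

// A minimal sketch, assuming only BytesRefHash.Get/Sort and
// BytesRef.UTF8SortedAsUnicodeComparer; class and method names are hypothetical.
using System.Collections.Generic;
using Lucene.Net.Util;

public static class SortedTermsLookup
{
    /// <summary>
    /// Binary-searches the ids returned by <see cref="BytesRefHash.Sort"/> and
    /// returns the sorted ordinal of <paramref name="term"/>, or -1 if absent.
    /// </summary>
    public static int FindTerm(BytesRefHash hash, int[] sortedIds, BytesRef term)
    {
        IComparer<BytesRef> comparer = BytesRef.UTF8SortedAsUnicodeComparer;
        BytesRef scratch = new BytesRef();
        int lo = 0, hi = sortedIds.Length - 1;
        while (lo <= hi)
        {
            int mid = lo + ((hi - lo) / 2);
            hash.Get(sortedIds[mid], scratch);        // resolve the mid id to its bytes
            int cmp = comparer.Compare(scratch, term);
            if (cmp < 0)
            {
                lo = mid + 1;
            }
            else if (cmp > 0)
            {
                hi = mid - 1;
            }
            else
            {
                return mid;                           // found: mid is the sorted ordinal
            }
        }
        return -1;                                    // not present
    }
}

With a helper like this, the array produced by SortTerms() can answer "is this term present, and at which sorted position" without building a TreeMap-style structure.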
public override void Flush(SegmentWriteState state, DocValuesConsumer dvConsumer)
{
    int maxDoc = state.SegmentInfo.DocCount;
    int maxCountPerDoc = maxCount;
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(pendingCounts.Count == maxDoc);
    }
    int valueCount = hash.Count;

    int[] sortedValues = hash.Sort(BytesRef.UTF8SortedAsUnicodeComparer);
    int[] ordMap = new int[valueCount];

    for (int ord = 0; ord < valueCount; ord++)
    {
        ordMap[sortedValues[ord]] = ord;
    }

    dvConsumer.AddSortedSetField(fieldInfo,
        GetBytesRefEnumberable(valueCount, sortedValues),

        // doc -> ordCount
        GetOrdsEnumberable(maxDoc),

        // ords
        GetOrdCountEnumberable(maxCountPerDoc, ordMap));
}
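The ordMap loop above is easy to misread: Sort() returns the hash ids in term order, and ordMap is the inverse permutation, mapping a hash id (assigned in insertion order at index time) back to its final sorted ordinal. Below is a tiny self-contained sketch with hypothetical values, not taken from the writer code, showing both arrays.

// A minimal sketch, assuming only the public BytesRefHash API; the term values
// and the resulting array contents shown in comments are illustrative.
using Lucene.Net.Util;

var hash = new BytesRefHash();
hash.Add(new BytesRef("banana"));   // hash id 0 (insertion order)
hash.Add(new BytesRef("apple"));    // hash id 1
hash.Add(new BytesRef("cherry"));   // hash id 2

int valueCount = hash.Count;        // 3 (read before Sort, as in the writer above)
int[] sortedValues = hash.Sort(BytesRef.UTF8SortedAsUnicodeComparer);
// sortedValues == { 1, 0, 2 }: the hash ids in term order (apple, banana, cherry)

int[] ordMap = new int[valueCount];
for (int ord = 0; ord < valueCount; ord++)
{
    ordMap[sortedValues[ord]] = ord;
}
// ordMap == { 1, 0, 2 }: hash id 0 ("banana") -> sorted ord 1,
// hash id 1 ("apple") -> sorted ord 0, hash id 2 ("cherry") -> sorted ord 2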
private readonly Query _fromQuery; // Used for equals() only

/// <summary>
/// Constructs a <see cref="TermsQuery"/> over a set of previously collected terms.
/// </summary>
/// <param name="field">The field that should contain the terms specified in the <paramref name="terms"/> parameter.</param>
/// <param name="fromQuery">The query from which the terms were collected; used only for equality comparisons.</param>
/// <param name="terms">The terms that matching documents should have. The terms must be sorted by natural order.</param>
internal TermsQuery(string field, Query fromQuery, BytesRefHash terms)
    : base(field)
{
    _fromQuery = fromQuery;
    _terms = terms;
    _ords = terms.Sort(BytesRef.UTF8SortedAsUnicodeComparer);
}
internal TermsIncludingScoreQuery(string field, bool multipleValuesPerDocument, BytesRefHash terms,
    float[] scores, Query originalQuery)
{
    _field = field;
    _multipleValuesPerDocument = multipleValuesPerDocument;
    _terms = terms;
    _scores = scores;
    _originalQuery = originalQuery;
    _ords = terms.Sort(BytesRef.UTF8SortedAsUnicodeComparer);
    _unwrittenOriginalQuery = originalQuery;
}
/// <summary>
/// Returns a <see cref="StemmerOverrideMap"/> to be used with the <see cref="StemmerOverrideFilter"/>
/// </summary>
/// <returns> a <see cref="StemmerOverrideMap"/> to be used with the <see cref="StemmerOverrideFilter"/> </returns>
/// <exception cref="IOException"> if an <see cref="IOException"/> occurs </exception>
public virtual StemmerOverrideMap Build()
{
    ByteSequenceOutputs outputs = ByteSequenceOutputs.Singleton;
    Builder<BytesRef> builder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);
    int[] sort = hash.Sort(BytesRef.UTF8SortedAsUnicodeComparer);
    Int32sRef intsSpare = new Int32sRef();
    int size = hash.Count;
    for (int i = 0; i < size; i++)
    {
        int id = sort[i];
        BytesRef bytesRef = hash.Get(id, spare);
        UnicodeUtil.UTF8toUTF32(bytesRef, intsSpare);
        builder.Add(intsSpare, new BytesRef(outputValues[id]));
    }
    return new StemmerOverrideMap(builder.Finish(), ignoreCase);
}
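The hash is sorted before the loop because the FST Builder requires its inputs to be added in ascending order; iterating the sorted ids satisfies that while keeping each entry paired with its output via the original hash id. The following is a hedged usage sketch only: it assumes the StemmerOverrideFilter.Builder type with an Add(string, string) overload and the StemmerOverrideFilter(TokenStream, StemmerOverrideMap) constructor from Lucene.Net.Analysis.Miscellaneous; the tokenizer wiring is illustrative, not prescribed by this class.

// A usage sketch under the assumptions stated above; override pairs are examples.
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.Miscellaneous;
using Lucene.Net.Util;

var builder = new StemmerOverrideFilter.Builder(true);   // true = ignore case
builder.Add("running", "run");   // exact-match override: "running" always maps to "run"
builder.Add("mice", "mouse");

StemmerOverrideMap map = builder.Build();   // sorts the hashed inputs and builds the FST

TokenStream ts = new WhitespaceTokenizer(LuceneVersion.LUCENE_48, new StringReader("mice running"));
ts = new StemmerOverrideFilter(ts, map);    // overridden tokens are kept from further stemming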
internal override void Flush(SegmentWriteState state, DocValuesConsumer dvConsumer)
{
    int maxDoc = state.SegmentInfo.DocCount;

    Debug.Assert(Pending.Size() == maxDoc);
    int valueCount = Hash.Size();

    int[] sortedValues = Hash.Sort(BytesRef.UTF8SortedAsUnicodeComparer);
    int[] ordMap = new int[valueCount];

    for (int ord = 0; ord < valueCount; ord++)
    {
        ordMap[sortedValues[ord]] = ord;
    }

    dvConsumer.AddSortedField(FieldInfo,
        GetBytesRefEnumberable(valueCount, sortedValues),

        // doc -> ord
        GetOrdsEnumberable(maxDoc, ordMap));
}
public override Query Rewrite(IndexReader reader, MultiTermQuery query)
{
    // Get the enum and start visiting terms. If we
    // exhaust the enum before hitting either of the
    // cutoffs, we use ConstantBooleanQueryRewrite; else,
    // ConstantFilterRewrite:
    int docCountCutoff = (int)((docCountPercent / 100.0) * reader.MaxDoc);
    int termCountLimit = Math.Min(BooleanQuery.MaxClauseCount, termCountCutoff);

    CutOffTermCollector col = new CutOffTermCollector(docCountCutoff, termCountLimit);
    CollectTerms(reader, query, col);
    int size = col.pendingTerms.Count;
    if (col.hasCutOff)
    {
        return MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE.Rewrite(reader, query);
    }
    else
    {
        BooleanQuery bq = GetTopLevelQuery();
        if (size > 0)
        {
            BytesRefHash pendingTerms = col.pendingTerms;
            int[] sort = pendingTerms.Sort(col.termsEnum.Comparer);
            for (int i = 0; i < size; i++)
            {
                int pos = sort[i];
                // docFreq is not used for constant score here, we pass 1
                // to explicitly set a fake value, so it's not calculated
                AddClause(bq, new Term(query.m_field, pendingTerms.Get(pos, new BytesRef())), 1, 1.0f, col.array.termState[pos]);
            }
        }
        // Strip scores
        Query result = new ConstantScoreQuery(bq);
        result.Boost = query.Boost;
        return result;
    }
}
// [Test] // LUCENENET NOTE: For now, we are overriding this test in every subclass to pull it into the right context for the subclass
public virtual void TestRandomSortedBytes()
{
    Directory dir = NewDirectory();
    IndexWriterConfig cfg = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random()));
    if (!DefaultCodecSupportsDocsWithField())
    {
        // if the codec doesn't support missing, we expect missing to be mapped to byte[]
        // by the impersonator, but we have to give it a chance to merge them to this
        cfg.SetMergePolicy(NewLogMergePolicy());
    }
    RandomIndexWriter w = new RandomIndexWriter(Random(), dir, cfg);
    int numDocs = AtLeast(100);
    BytesRefHash hash = new BytesRefHash();
    IDictionary<string, string> docToString = new Dictionary<string, string>();
    int maxLength = TestUtil.NextInt(Random(), 1, 50);
    for (int i = 0; i < numDocs; i++)
    {
        Document doc = new Document();
        doc.Add(NewTextField("id", "" + i, Field.Store.YES));
        string @string = TestUtil.RandomRealisticUnicodeString(Random(), 1, maxLength);
        BytesRef br = new BytesRef(@string);
        doc.Add(new SortedDocValuesField("field", br));
        hash.Add(br);
        docToString["" + i] = @string;
        w.AddDocument(doc);
    }
    if (Rarely())
    {
        w.Commit();
    }
    int numDocsNoValue = AtLeast(10);
    for (int i = 0; i < numDocsNoValue; i++)
    {
        Document doc = new Document();
        doc.Add(NewTextField("id", "noValue", Field.Store.YES));
        w.AddDocument(doc);
    }
    if (!DefaultCodecSupportsDocsWithField())
    {
        BytesRef bytesRef = new BytesRef();
        hash.Add(bytesRef); // add empty value for the gaps
    }
    if (Rarely())
    {
        w.Commit();
    }
    if (!DefaultCodecSupportsDocsWithField())
    {
        // if the codec doesn't support missing, we expect missing to be mapped to byte[]
        // by the impersonator, but we have to give it a chance to merge them to this
        w.ForceMerge(1);
    }
    for (int i = 0; i < numDocs; i++)
    {
        Document doc = new Document();
        string id = "" + i + numDocs;
        doc.Add(NewTextField("id", id, Field.Store.YES));
        string @string = TestUtil.RandomRealisticUnicodeString(Random(), 1, maxLength);
        BytesRef br = new BytesRef(@string);
        hash.Add(br);
        docToString[id] = @string;
        doc.Add(new SortedDocValuesField("field", br));
        w.AddDocument(doc);
    }
    w.Commit();
    IndexReader reader = w.Reader;
    SortedDocValues docValues = MultiDocValues.GetSortedValues(reader, "field");
    int[] sort = hash.Sort(BytesRef.UTF8SortedAsUnicodeComparer);
    BytesRef expected = new BytesRef();
    BytesRef actual = new BytesRef();
    Assert.AreEqual(hash.Size(), docValues.ValueCount);
    for (int i = 0; i < hash.Size(); i++)
    {
        hash.Get(sort[i], expected);
        docValues.LookupOrd(i, actual);
        Assert.AreEqual(expected.Utf8ToString(), actual.Utf8ToString());
        int ord = docValues.LookupTerm(expected);
        Assert.AreEqual(i, ord);
    }
    AtomicReader slowR = SlowCompositeReaderWrapper.Wrap(reader);
    ISet<KeyValuePair<string, string>> entrySet = docToString.EntrySet();
    foreach (KeyValuePair<string, string> entry in entrySet)
    {
        // pk lookup
        DocsEnum termDocsEnum = slowR.TermDocsEnum(new Term("id", entry.Key));
        int docId = termDocsEnum.NextDoc();
        expected = new BytesRef(entry.Value);
        docValues.Get(docId, actual);
        Assert.AreEqual(expected, actual);
    }
    reader.Dispose();
    w.Dispose();
    dir.Dispose();
}