Example #1
 /// <summary>
 /// Sorts hashed terms into ascending order, reusing memory along the
 /// way. Note that sorting is lazily deferred until required (often it's
 /// not required at all). If a sorted view is required, then hashing +
 /// sort + binary search is still faster and smaller than using a
 /// TreeMap (which would be an alternative, somewhat more elegant
 /// approach, apart from more sophisticated tries / prefix trees).
 /// </summary>
 public void SortTerms()
 {
     if (sortedTerms is null)
     {
         sortedTerms = terms.Sort(BytesRef.UTF8SortedAsUnicodeComparer);
     }
 }
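The pattern the summary describes — hash terms as they arrive, sort the ids once on demand, then binary-search the sorted view — can be sketched as follows. This is a minimal, hypothetical sketch assuming Lucene.NET's BytesRefHash/BytesRef APIs; FindTerm is not part of the library:

using Lucene.Net.Util;

public static class SortedHashLookup
{
    // Binary search over the sorted view produced by BytesRefHash.Sort.
    // sortedIds[i] is the hash id of the i-th smallest term, so a hit
    // returns the term's sorted ordinal; -1 means the term is absent.
    public static int FindTerm(BytesRefHash hash, int[] sortedIds, BytesRef target)
    {
        BytesRef scratch = new BytesRef();
        int lo = 0, hi = sortedIds.Length - 1;
        while (lo <= hi)
        {
            int mid = lo + ((hi - lo) / 2);
            hash.Get(sortedIds[mid], scratch); // fills scratch with the term bytes
            int cmp = BytesRef.UTF8SortedAsUnicodeComparer.Compare(scratch, target);
            if (cmp == 0) return mid;
            if (cmp < 0)  lo = mid + 1;
            else          hi = mid - 1;
        }
        return -1;
    }
}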
Example #2
        public override void Flush(SegmentWriteState state, DocValuesConsumer dvConsumer)
        {
            int maxDoc         = state.SegmentInfo.DocCount;
            int maxCountPerDoc = maxCount;

            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(pendingCounts.Count == maxDoc);
            }
            int valueCount = hash.Count;

            int[] sortedValues = hash.Sort(BytesRef.UTF8SortedAsUnicodeComparer);
            int[] ordMap       = new int[valueCount];

            for (int ord = 0; ord < valueCount; ord++)
            {
                ordMap[sortedValues[ord]] = ord;
            }

            dvConsumer.AddSortedSetField(fieldInfo,
                                         // ord -> value
                                         GetBytesRefEnumberable(valueCount, sortedValues),

                                         // doc -> ordCount
                                         GetOrdsEnumberable(maxDoc),

                                         // ords
                                         GetOrdCountEnumberable(maxCountPerDoc, ordMap));
        }
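The ordMap loop above inverts the permutation returned by Sort: sortedValues[ord] is the hash id stored at sorted position ord, so ordMap[id] yields the sorted ordinal of hash id id. For example, if sortedValues is [2, 0, 1], then ordMap becomes [1, 2, 0]: hash id 0 sorts to position 1, id 1 to position 2, and id 2 to position 0. The per-document ords recorded during indexing (raw hash ids) can then be rewritten as sorted ordinals when they are handed to the consumer.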
Example #3
        private readonly Query _fromQuery; // Used for equals() only

        /// <summary>
        /// Creates a query that matches documents containing the given terms in the given field.
        /// </summary>
        /// <param name="field">The field that should contain terms that are specified in the <paramref name="terms"/> parameter.</param>
        /// <param name="fromQuery">The original from query; retained for <c>Equals()</c>/<c>GetHashCode()</c> only.</param>
        /// <param name="terms">The terms that matching documents should have. The terms must be sorted by natural order.</param>
        internal TermsQuery(string field, Query fromQuery, BytesRefHash terms)
            : base(field)
        {
            _fromQuery = fromQuery;
            _terms     = terms;
            _ords      = terms.Sort(BytesRef.UTF8SortedAsUnicodeComparer);
        }
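Sorting the hash at construction time gives the query a stable, Unicode-ordered walk over its terms: _ords[i] is the hash id of the i-th smallest term, which lets the query later advance a TermsEnum through the terms in increasing order, the order in which Lucene's term dictionaries are traversed.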
Example #4
 internal TermsIncludingScoreQuery(string field, bool multipleValuesPerDocument, BytesRefHash terms,
                                   float[] scores, Query originalQuery)
 {
     _field = field;
     _multipleValuesPerDocument = multipleValuesPerDocument;
     _terms                  = terms;
     _scores                 = scores;
     _originalQuery          = originalQuery;
     _ords                   = terms.Sort(BytesRef.UTF8SortedAsUnicodeComparer);
     _unwrittenOriginalQuery = originalQuery;
 }
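The same construction-time sort appears here; in addition, the _scores array is parallel to the hash ids, so a term found at sorted position i maps back through _ords[i] both to its bytes in _terms and to its score in _scores.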
Example #5
            /// <summary>
            /// Returns a <see cref="StemmerOverrideMap"/> to be used with the <see cref="StemmerOverrideFilter"/> </summary>
            /// <returns> a <see cref="StemmerOverrideMap"/> to be used with the <see cref="StemmerOverrideFilter"/> </returns>
            /// <exception cref="IOException"> if an <see cref="IOException"/> occurs; </exception>
            public virtual StemmerOverrideMap Build()
            {
                ByteSequenceOutputs outputs = ByteSequenceOutputs.Singleton;
                Builder<BytesRef>   builder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);

                int[]     sort      = hash.Sort(BytesRef.UTF8SortedAsUnicodeComparer);
                Int32sRef intsSpare = new Int32sRef();
                int       size      = hash.Count;

                for (int i = 0; i < size; i++)
                {
                    int      id       = sort[i];
                    BytesRef bytesRef = hash.Get(id, spare);
                    UnicodeUtil.UTF8toUTF32(bytesRef, intsSpare);
                    builder.Add(intsSpare, new BytesRef(outputValues[id]));
                }
                return new StemmerOverrideMap(builder.Finish(), ignoreCase);
            }
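The sort here is not merely an optimization: an FST Builder requires its inputs to be added in sorted order, which is why hash.Sort runs before the Add loop. A hedged usage sketch of the surrounding StemmerOverrideFilter.Builder (names from Lucene.Net.Analysis.Miscellaneous; exact signatures may vary by version):

using Lucene.Net.Analysis.Miscellaneous;

// Build a small override map and plug it into the filter.
var builder = new StemmerOverrideFilter.Builder(ignoreCase: true);
builder.Add("running", "run"); // dictionary-style override: input -> output
builder.Add("ran", "run");
StemmerOverrideMap map = builder.Build();

// TokenStream input = ...;  // some upstream analyzer chain
// TokenStream filtered = new StemmerOverrideFilter(input, map);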
Example #6
        internal override void Flush(SegmentWriteState state, DocValuesConsumer dvConsumer)
        {
            int maxDoc = state.SegmentInfo.DocCount;

            Debug.Assert(Pending.Size() == maxDoc);
            int valueCount = Hash.Size();

            int[] sortedValues = Hash.Sort(BytesRef.UTF8SortedAsUnicodeComparer);
            int[] ordMap       = new int[valueCount];

            for (int ord = 0; ord < valueCount; ord++)
            {
                ordMap[sortedValues[ord]] = ord;
            }

            dvConsumer.AddSortedField(FieldInfo, GetBytesRefEnumberable(valueCount, sortedValues),
                                      // doc -> ord
                                      GetOrdsEnumberable(maxDoc, ordMap));
        }
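This is the single-valued counterpart of Example #2's flush: AddSortedField receives exactly one ord per document, but the ord remapping through ordMap is identical — hash ids recorded at index time are rewritten into sorted ordinals.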
Example #7
        public override Query Rewrite(IndexReader reader, MultiTermQuery query)
        {
            // Get the enum and start visiting terms.  If we
            // exhaust the enum before hitting either of the
            // cutoffs, we use ConstantBooleanQueryRewrite; else,
            // ConstantFilterRewrite:
            int docCountCutoff = (int)((docCountPercent / 100.0) * reader.MaxDoc);
            int termCountLimit = Math.Min(BooleanQuery.MaxClauseCount, termCountCutoff);

            CutOffTermCollector col = new CutOffTermCollector(docCountCutoff, termCountLimit);

            CollectTerms(reader, query, col);
            int size = col.pendingTerms.Count;

            if (col.hasCutOff)
            {
                return MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE.Rewrite(reader, query);
            }
            else
            {
                BooleanQuery bq = GetTopLevelQuery();
                if (size > 0)
                {
                    BytesRefHash pendingTerms = col.pendingTerms;
                    int[]        sort         = pendingTerms.Sort(col.termsEnum.Comparer);
                    for (int i = 0; i < size; i++)
                    {
                        int pos = sort[i];
                        // docFreq is not used for constant score here; we pass 1
                        // to explicitly set a fake value, so it's not calculated
                        AddClause(bq, new Term(query.m_field, pendingTerms.Get(pos, new BytesRef())), 1, 1.0f, col.array.termState[pos]);
                    }
                }
                // Strip scores
                Query result = new ConstantScoreQuery(bq);
                result.Boost = query.Boost;
                return result;
            }
        }
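Two details worth noting in the rewrite above: the pending terms are sorted with the collector's own termsEnum.Comparer (rather than a hard-coded comparer), so clauses are added in the term dictionary's order, and wrapping the BooleanQuery in a ConstantScoreQuery strips per-term scoring while result.Boost preserves the original query's boost.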
Example #8
        // [Test] // LUCENENET NOTE: For now, we are overriding this test in every subclass to pull it into the right context for the subclass
        public virtual void TestRandomSortedBytes()
        {
            Directory dir = NewDirectory();
            IndexWriterConfig cfg = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random()));
            if (!DefaultCodecSupportsDocsWithField())
            {
                // if the codec doesn't support missing, we expect missing to be mapped to byte[]
                // by the impersonator, but we have to give it a chance to merge them to this
                cfg.SetMergePolicy(NewLogMergePolicy());
            }
            RandomIndexWriter w = new RandomIndexWriter(Random(), dir, cfg);
            int numDocs = AtLeast(100);
            BytesRefHash hash = new BytesRefHash();
            IDictionary<string, string> docToString = new Dictionary<string, string>();
            int maxLength = TestUtil.NextInt(Random(), 1, 50);
            for (int i = 0; i < numDocs; i++)
            {
                Document doc = new Document();
                doc.Add(NewTextField("id", "" + i, Field.Store.YES));
                string @string = TestUtil.RandomRealisticUnicodeString(Random(), 1, maxLength);
                BytesRef br = new BytesRef(@string);
                doc.Add(new SortedDocValuesField("field", br));
                hash.Add(br);
                docToString["" + i] = @string;
                w.AddDocument(doc);
            }
            if (Rarely())
            {
                w.Commit();
            }
            int numDocsNoValue = AtLeast(10);
            for (int i = 0; i < numDocsNoValue; i++)
            {
                Document doc = new Document();
                doc.Add(NewTextField("id", "noValue", Field.Store.YES));
                w.AddDocument(doc);
            }
            if (!DefaultCodecSupportsDocsWithField())
            {
                BytesRef bytesRef = new BytesRef();
                hash.Add(bytesRef); // add empty value for the gaps
            }
            if (Rarely())
            {
                w.Commit();
            }
            if (!DefaultCodecSupportsDocsWithField())
            {
                // if the codec doesn't support missing, we expect missing to be mapped to byte[]
                // by the impersonator, but we have to give it a chance to merge them to this
                w.ForceMerge(1);
            }
            for (int i = 0; i < numDocs; i++)
            {
                Document doc = new Document();
                string id = "" + i + numDocs;
                doc.Add(NewTextField("id", id, Field.Store.YES));
                string @string = TestUtil.RandomRealisticUnicodeString(Random(), 1, maxLength);
                BytesRef br = new BytesRef(@string);
                hash.Add(br);
                docToString[id] = @string;
                doc.Add(new SortedDocValuesField("field", br));
                w.AddDocument(doc);
            }
            w.Commit();
            IndexReader reader = w.Reader;
            SortedDocValues docValues = MultiDocValues.GetSortedValues(reader, "field");
            int[] sort = hash.Sort(BytesRef.UTF8SortedAsUnicodeComparer);
            BytesRef expected = new BytesRef();
            BytesRef actual = new BytesRef();
            Assert.AreEqual(hash.Size(), docValues.ValueCount);
            for (int i = 0; i < hash.Size(); i++)
            {
                hash.Get(sort[i], expected);
                docValues.LookupOrd(i, actual);
                Assert.AreEqual(expected.Utf8ToString(), actual.Utf8ToString());
                int ord = docValues.LookupTerm(expected);
                Assert.AreEqual(i, ord);
            }
            AtomicReader slowR = SlowCompositeReaderWrapper.Wrap(reader);
            ISet<KeyValuePair<string, string>> entrySet = docToString.EntrySet();

            foreach (KeyValuePair<string, string> entry in entrySet)
            {
                // pk lookup
                DocsEnum termDocsEnum = slowR.TermDocsEnum(new Term("id", entry.Key));
                int docId = termDocsEnum.NextDoc();
                expected = new BytesRef(entry.Value);
                docValues.Get(docId, actual);
                Assert.AreEqual(expected, actual);
            }

            reader.Dispose();
            w.Dispose();
            dir.Dispose();
        }
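The central assertion in this test ties the two orderings together: after hash.Sort(BytesRef.UTF8SortedAsUnicodeComparer), the term at sorted position i must equal docValues.LookupOrd(i) — that is, SortedDocValues assigns ordinals in exactly the UTF-8/Unicode order that BytesRefHash.Sort produces, and LookupTerm must invert LookupOrd.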