internal SeekingTermSetTermsEnum(TermsEnum tenum, BytesRefHash terms, int[] ords)
    : base(tenum)
{
    Terms = terms;
    Ords = ords;
    _comparator = BytesRef.UTF8SortedAsUnicodeComparer;
    _lastElement = terms.Size() - 1;
    _lastTerm = terms.Get(ords[_lastElement], new BytesRef());
    _seekTerm = terms.Get(ords[_upto], _spare);
}
/// <summary>
/// Returns a <seealso cref="StemmerOverrideMap"/> to be used with the <seealso cref="StemmerOverrideFilter"/>
/// </summary>
/// <returns> a <seealso cref="StemmerOverrideMap"/> to be used with the <seealso cref="StemmerOverrideFilter"/> </returns>
/// <exception cref="IOException"> if an <seealso cref="IOException"/> occurs </exception>
public virtual StemmerOverrideMap Build()
{
    ByteSequenceOutputs outputs = ByteSequenceOutputs.Singleton;
    Lucene.Net.Util.Fst.Builder<BytesRef> builder = new Lucene.Net.Util.Fst.Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);
    // The FST builder requires its inputs in ascending order, so sort the hash first.
    int[] sort = hash.Sort(BytesRef.UTF8SortedAsUnicodeComparer);
    IntsRef intsSpare = new IntsRef();
    int size = hash.Size();
    for (int i = 0; i < size; i++)
    {
        int id = sort[i];
        BytesRef bytesRef = hash.Get(id, spare);
        UnicodeUtil.UTF8toUTF32(bytesRef, intsSpare);
        builder.Add(intsSpare, new BytesRef(outputValues[id]));
    }
    return new StemmerOverrideMap(builder.Finish(), ignoreCase);
}
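// For context, a minimal standalone sketch of the same build pattern with inline
// data. It reuses only the Lucene.Net.Util.Fst calls already present in Build()
// above; FstBuildDemo and its hard-coded entries are hypothetical. The keys must
// reach Builder.Add in ascending unicode order, which is why Build() sorts the
// hash before feeding it to the builder.
using Lucene.Net.Util;
using Lucene.Net.Util.Fst;

public static class FstBuildDemo
{
    public static FST<BytesRef> BuildTinyFst()
    {
        ByteSequenceOutputs outputs = ByteSequenceOutputs.Singleton;
        var builder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);
        IntsRef intsSpare = new IntsRef();
        // Entries are pre-sorted ("running" < "walked"); Builder.Add copies its
        // input, so the spare IntsRef can be reused across iterations.
        string[][] entries = { new[] { "running", "run" }, new[] { "walked", "walk" } };
        foreach (string[] entry in entries)
        {
            UnicodeUtil.UTF8toUTF32(new BytesRef(entry[0]), intsSpare);
            builder.Add(intsSpare, new BytesRef(entry[1]));
        }
        return builder.Finish();
    }
}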
internal override void Flush(SegmentWriteState state, DocValuesConsumer dvConsumer)
{
    int maxDoc = state.SegmentInfo.DocCount;
    Debug.Assert(Pending.Size() == maxDoc);
    int valueCount = Hash.Size();
    int[] sortedValues = Hash.Sort(BytesRef.UTF8SortedAsUnicodeComparer);
    int[] ordMap = new int[valueCount];
    for (int ord = 0; ord < valueCount; ord++)
    {
        ordMap[sortedValues[ord]] = ord;
    }
    dvConsumer.AddSortedField(FieldInfo,
        GetBytesRefEnumberable(valueCount, sortedValues),
        // doc -> ord
        GetOrdsEnumberable(maxDoc, ordMap));
}
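// A small illustration of the ordMap inversion above, with hypothetical values in
// place of Hash.Sort's output (OrdMapDemo is not part of the original code).
// sortedValues maps sorted position (ord) -> hash id; ordMap inverts that
// permutation so a document's stored hash id can be rewritten as its final ord.
using System;

public static class OrdMapDemo
{
    public static void Main()
    {
        int[] sortedValues = { 2, 0, 1 }; // e.g. hash ids listed in sorted term order
        int[] ordMap = new int[sortedValues.Length];
        for (int ord = 0; ord < sortedValues.Length; ord++)
        {
            ordMap[sortedValues[ord]] = ord; // invert: hash id -> sorted ord
        }
        Console.WriteLine(string.Join(",", ordMap)); // prints 1,2,0
    }
}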
public override bool Collect(BytesRef bytes)
{
    int e = Terms.Add(bytes);
    TermState state = TermsEnum.TermState();
    Debug.Assert(state != null);
    if (e < 0)
    {
        // duplicate term: update docFreq
        int pos = (-e) - 1;
        Array.TermState[pos].Register(state, ReaderContext.Ord, TermsEnum.DocFreq(), TermsEnum.TotalTermFreq());
        Debug.Assert(Array.Boost[pos] == BoostAtt.Boost, "boost should be equal in all segment TermsEnums");
    }
    else
    {
        // new entry: we populate the entry initially
        Array.Boost[e] = BoostAtt.Boost;
        Array.TermState[e] = new TermContext(TopReaderContext, state, ReaderContext.Ord, TermsEnum.DocFreq(), TermsEnum.TotalTermFreq());
        OuterInstance.CheckMaxClauseCount(Terms.Size());
    }
    return true;
}
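// Minimal sketch of the BytesRefHash.Add return convention that Collect relies on,
// as used above: a new entry returns its id (>= 0), while a duplicate returns
// (-id - 1), so (-e) - 1 recovers the existing entry's id. AddReturnDemo is
// hypothetical; the BytesRefHash calls mirror those in the surrounding examples.
using System;
using Lucene.Net.Util;

public static class AddReturnDemo
{
    public static void Main()
    {
        BytesRefHash hash = new BytesRefHash();
        int first = hash.Add(new BytesRef("term"));  // fresh entry: non-negative id
        int second = hash.Add(new BytesRef("term")); // duplicate: negative marker
        int pos = (-second) - 1;                     // recovers the original id
        Console.WriteLine(pos == first);             // prints True
    }
}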
// [Test]
// LUCENENET NOTE: For now, we are overriding this test in every subclass to pull it into the right context for the subclass.
public virtual void TestRandomSortedBytes()
{
    Directory dir = NewDirectory();
    IndexWriterConfig cfg = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random()));
    if (!DefaultCodecSupportsDocsWithField())
    {
        // if the codec doesn't support missing, we expect missing to be mapped to byte[]
        // by the impersonator, but we have to give it a chance to merge them to this
        cfg.SetMergePolicy(NewLogMergePolicy());
    }
    RandomIndexWriter w = new RandomIndexWriter(Random(), dir, cfg);
    int numDocs = AtLeast(100);
    BytesRefHash hash = new BytesRefHash();
    IDictionary<string, string> docToString = new Dictionary<string, string>();
    int maxLength = TestUtil.NextInt(Random(), 1, 50);
    for (int i = 0; i < numDocs; i++)
    {
        Document doc = new Document();
        doc.Add(NewTextField("id", "" + i, Field.Store.YES));
        string @string = TestUtil.RandomRealisticUnicodeString(Random(), 1, maxLength);
        BytesRef br = new BytesRef(@string);
        doc.Add(new SortedDocValuesField("field", br));
        hash.Add(br);
        docToString["" + i] = @string;
        w.AddDocument(doc);
    }
    if (Rarely())
    {
        w.Commit();
    }
    int numDocsNoValue = AtLeast(10);
    for (int i = 0; i < numDocsNoValue; i++)
    {
        Document doc = new Document();
        doc.Add(NewTextField("id", "noValue", Field.Store.YES));
        w.AddDocument(doc);
    }
    if (!DefaultCodecSupportsDocsWithField())
    {
        BytesRef bytesRef = new BytesRef();
        hash.Add(bytesRef); // add empty value for the gaps
    }
    if (Rarely())
    {
        w.Commit();
    }
    if (!DefaultCodecSupportsDocsWithField())
    {
        // if the codec doesn't support missing, we expect missing to be mapped to byte[]
        // by the impersonator, but we have to give it a chance to merge them to this
        w.ForceMerge(1);
    }
    for (int i = 0; i < numDocs; i++)
    {
        Document doc = new Document();
        string id = "" + i + numDocs;
        doc.Add(NewTextField("id", id, Field.Store.YES));
        string @string = TestUtil.RandomRealisticUnicodeString(Random(), 1, maxLength);
        BytesRef br = new BytesRef(@string);
        hash.Add(br);
        docToString[id] = @string;
        doc.Add(new SortedDocValuesField("field", br));
        w.AddDocument(doc);
    }
    w.Commit();
    IndexReader reader = w.Reader;
    SortedDocValues docValues = MultiDocValues.GetSortedValues(reader, "field");
    int[] sort = hash.Sort(BytesRef.UTF8SortedAsUnicodeComparer);
    BytesRef expected = new BytesRef();
    BytesRef actual = new BytesRef();
    Assert.AreEqual(hash.Size(), docValues.ValueCount);
    for (int i = 0; i < hash.Size(); i++)
    {
        hash.Get(sort[i], expected);
        docValues.LookupOrd(i, actual);
        Assert.AreEqual(expected.Utf8ToString(), actual.Utf8ToString());
        int ord = docValues.LookupTerm(expected);
        Assert.AreEqual(i, ord);
    }
    AtomicReader slowR = SlowCompositeReaderWrapper.Wrap(reader);
    ISet<KeyValuePair<string, string>> entrySet = docToString.EntrySet();
    foreach (KeyValuePair<string, string> entry in entrySet)
    {
        // pk lookup
        DocsEnum termDocsEnum = slowR.TermDocsEnum(new Term("id", entry.Key));
        int docId = termDocsEnum.NextDoc();
        expected = new BytesRef(entry.Value);
        docValues.Get(docId, actual);
        Assert.AreEqual(expected, actual);
    }
    reader.Dispose();
    w.Dispose();
    dir.Dispose();
}
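// The test above exercises the full BytesRefHash life cycle. As a standalone recap,
// here is a minimal sketch of the Add -> Sort -> Get pattern shared by all of the
// examples in this section (SortedIterationDemo and its sample strings are
// hypothetical; the API calls are the ones used above).
using System;
using Lucene.Net.Util;

public static class SortedIterationDemo
{
    public static void Main()
    {
        BytesRefHash hash = new BytesRefHash();
        foreach (string s in new[] { "banana", "apple", "cherry", "apple" })
        {
            hash.Add(new BytesRef(s)); // the duplicate "apple" is not stored twice
        }

        // Sort returns hash ids ordered by the comparer; Get fills a reusable spare.
        int[] sort = hash.Sort(BytesRef.UTF8SortedAsUnicodeComparer);
        BytesRef spare = new BytesRef();
        for (int i = 0; i < hash.Size(); i++)
        {
            hash.Get(sort[i], spare);
            Console.WriteLine(spare.Utf8ToString()); // apple, banana, cherry
        }
    }
}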