public override BytesRef Encode(char[] buffer, int offset, int length)
{
    float payload = float.Parse(new string(buffer, offset, length)); // TODO: improve this so that we don't have to new Strings
    byte[] bytes = PayloadHelper.EncodeFloat(payload);
    BytesRef result = new BytesRef(bytes);
    return result;
}
public override BytesRef Encode(char[] buffer, int offset, int length)
{
    int payload = ArrayUtil.ParseInt(buffer, offset, length); // TODO: improve this so that we don't have to new Strings
    byte[] bytes = PayloadHelper.EncodeInt(payload);
    BytesRef result = new BytesRef(bytes);
    return result;
}
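A minimal caller-side sketch (not part of the sources above): encoders like these are normally plugged into DelimitedPayloadTokenFilter, which splits tokens such as "quick|2.5" on a delimiter and stores the encoded trailing text as the token's payload. The wiring below assumes the Lucene.NET 4.8 analysis APIs.

using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.Payloads;
using Lucene.Net.Util;

// reader is assumed to be a System.IO.TextReader over text like "quick|2.5 fox|1.0"
TokenStream stream = new WhitespaceTokenizer(LuceneVersion.LUCENE_48, reader);
// after this filter, each token carries a 4-byte float payload decoded from the text after '|'
stream = new DelimitedPayloadTokenFilter(stream, '|', new FloatEncoder());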
internal virtual byte[] Decompress(byte[] compressed, int originalLength, int offset, int length) { Decompressor decompressor = Mode.NewDecompressor(); BytesRef bytes = new BytesRef(); decompressor.Decompress(new ByteArrayDataInput(compressed), originalLength, offset, length, bytes); return Arrays.CopyOfRange(bytes.Bytes, bytes.Offset, bytes.Offset + bytes.Length); }
public void TestBlendingType()
{
    BytesRef pl = new BytesRef("lake");
    long w = 20;

    Input[] keys = new Input[] {
        new Input("top of the lake", w, pl)
    };

    DirectoryInfo tempDir = CreateTempDir("BlendedInfixSuggesterTest");
    Analyzer a = new StandardAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET);

    // BlenderType.LINEAR is used by default (remove position*10%)
    BlendedInfixSuggester suggester = new BlendedInfixSuggester(TEST_VERSION_CURRENT, NewFSDirectory(tempDir), a);
    suggester.Build(new InputArrayIterator(keys));

    assertEquals(w, GetInResults(suggester, "top", pl, 1));
    assertEquals((int)(w * (1 - 0.10 * 2)), GetInResults(suggester, "the", pl, 1));
    assertEquals((int)(w * (1 - 0.10 * 3)), GetInResults(suggester, "lake", pl, 1));

    suggester.Dispose();

    // BlenderType.RECIPROCAL is using 1/(1+p) * w where w is weight and p the position of the word
    suggester = new BlendedInfixSuggester(TEST_VERSION_CURRENT, NewFSDirectory(tempDir), a, a,
        AnalyzingInfixSuggester.DEFAULT_MIN_PREFIX_CHARS, BlendedInfixSuggester.BlenderType.POSITION_RECIPROCAL, 1);
    suggester.Build(new InputArrayIterator(keys));

    assertEquals(w, GetInResults(suggester, "top", pl, 1));
    assertEquals((int)(w * 1 / (1 + 2)), GetInResults(suggester, "the", pl, 1));
    assertEquals((int)(w * 1 / (1 + 3)), GetInResults(suggester, "lake", pl, 1));

    suggester.Dispose();
}
public DocFreqValueSource(string field, string val, string indexedField, BytesRef indexedBytes) { this.field = field; this.val = val; this.indexedField = indexedField; this.indexedBytes = indexedBytes; }
public virtual void TestSimpleDictionary() { using (System.IO.Stream affixStream = this.GetType().getResourceAsStream("simple.aff")) { using (System.IO.Stream dictStream = this.GetType().getResourceAsStream("simple.dic")) { Dictionary dictionary = new Dictionary(affixStream, dictStream); assertEquals(3, dictionary.LookupSuffix(new char[] { 'e' }, 0, 1).Length); assertEquals(1, dictionary.LookupPrefix(new char[] { 's' }, 0, 1).Length); IntsRef ordList = dictionary.LookupWord(new char[] { 'o', 'l', 'r' }, 0, 3); assertNotNull(ordList); assertEquals(1, ordList.Length); BytesRef @ref = new BytesRef(); dictionary.flagLookup.Get(ordList.Ints[0], @ref); char[] flags = Dictionary.DecodeFlags(@ref); assertEquals(1, flags.Length); ordList = dictionary.LookupWord(new char[] { 'l', 'u', 'c', 'e', 'n' }, 0, 5); assertNotNull(ordList); assertEquals(1, ordList.Length); dictionary.flagLookup.Get(ordList.Ints[0], @ref); flags = Dictionary.DecodeFlags(@ref); assertEquals(1, flags.Length); } } }
internal TermStats(string field, BytesRef termtext, int df, long tf) { this.termtext = BytesRef.DeepCopyOf(termtext); this.Field = field; this.DocFreq = df; this.TotalTermFreq = tf; }
public virtual void TestFarsiRangeFilterCollating(Analyzer analyzer, BytesRef firstBeg, BytesRef firstEnd, BytesRef secondBeg, BytesRef secondEnd)
{
    Directory dir = NewDirectory();
    IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
    Document doc = new Document();
    doc.Add(new TextField("content", "\u0633\u0627\u0628", Field.Store.YES));
    doc.Add(new StringField("body", "body", Field.Store.YES));
    writer.AddDocument(doc);
    writer.Dispose();
    IndexReader reader = DirectoryReader.Open(dir);
    IndexSearcher searcher = new IndexSearcher(reader);
    Query query = new TermQuery(new Term("body", "body"));

    // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
    // orders the U+0698 character before the U+0633 character, so the single
    // index Term below should NOT be returned by a TermRangeFilter with a Farsi
    // Collator (or an Arabic one for the case when Farsi searcher not
    // supported).
    ScoreDoc[] result = searcher.Search(query, new TermRangeFilter("content", firstBeg, firstEnd, true, true), 1).ScoreDocs;
    Assert.AreEqual(0, result.Length, "The index Term should not be included.");

    result = searcher.Search(query, new TermRangeFilter("content", secondBeg, secondEnd, true, true), 1).ScoreDocs;
    Assert.AreEqual(1, result.Length, "The index Term should be included.");

    reader.Dispose();
    dir.Dispose();
}
public MockVariableLengthPayloadFilter(Random random, TokenStream @in) : base(@in) { this.Random = random; this.Payload = new BytesRef(Bytes); this.PayloadAtt = AddAttribute<IPayloadAttribute>(); }
private void SumValues(IList<FacetsCollector.MatchingDocs> matchingDocs)
{
    //System.out.println("count matchingDocs=" + matchingDocs + " facetsField=" + facetsFieldName);
    foreach (FacetsCollector.MatchingDocs hits in matchingDocs)
    {
        BinaryDocValues dv = hits.context.AtomicReader.GetBinaryDocValues(IndexFieldName);
        if (dv == null) // this reader does not have DocValues for the requested category list
        {
            continue;
        }
        DocIdSetIterator docs = hits.bits.GetIterator();

        int doc;
        while ((doc = docs.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS)
        {
            //System.out.println("  doc=" + doc);
            // TODO: use OrdinalsReader? we'd need to add a
            // BytesRef getAssociation()?
            BytesRef bytesRef = new BytesRef();
            dv.Get(doc, bytesRef);
            byte[] bytes = bytesRef.Bytes;
            int end = bytesRef.Offset + bytesRef.Length;
            int offset = bytesRef.Offset;
            while (offset < end)
            {
                int ord = ((bytes[offset] & 0xFF) << 24) | ((bytes[offset + 1] & 0xFF) << 16) | ((bytes[offset + 2] & 0xFF) << 8) | (bytes[offset + 3] & 0xFF);
                offset += 4;
                int value = ((bytes[offset] & 0xFF) << 24) | ((bytes[offset + 1] & 0xFF) << 16) | ((bytes[offset + 2] & 0xFF) << 8) | (bytes[offset + 3] & 0xFF);
                offset += 4;
                values[ord] += value;
            }
        }
    }
}
public virtual void TestEmpty() { BytesRef b = new BytesRef(); Assert.AreEqual(BytesRef.EMPTY_BYTES, b.Bytes); Assert.AreEqual(0, b.Offset); Assert.AreEqual(0, b.Length); }
public virtual void TestAppend()
{
    var bytes = new[] { (byte)'a', (byte)'b', (byte)'c', (byte)'d' };
    BytesRef b = new BytesRef(bytes, 1, 3); // bcd
    b.Append(new BytesRef("e"));
    Assert.AreEqual("bcde", b.Utf8ToString());
}
/// <summary>
/// Creates a new iterator, buffering entries from the specified iterator </summary>
public BufferedInputIterator(InputIterator source)
{
    BytesRef spare;
    int freqIndex = 0;
    hasPayloads = source.HasPayloads;
    hasContexts_Renamed = source.HasContexts;
    while ((spare = source.Next()) != null)
    {
        entries.Append(spare);
        if (hasPayloads)
        {
            payloads.Append(source.Payload);
        }
        if (hasContexts_Renamed)
        {
            contextSets.Add(source.Contexts);
        }
        if (freqIndex >= freqs.Length)
        {
            freqs = ArrayUtil.Grow(freqs, freqs.Length + 1);
        }
        freqs[freqIndex++] = source.Weight;
    }
    comp = source.Comparator;
}
public virtual void TestCopyBytes()
{
    sbyte[] bytes = new sbyte[] { (sbyte)'a', (sbyte)'b', (sbyte)'c', (sbyte)'d' };
    BytesRef b = new BytesRef(bytes, 1, 3); // bcd
    b.CopyBytes(new BytesRef("bcde"));
    Assert.AreEqual("bcde", b.Utf8ToString());
}
public SrndPrefixQuery(string prefix, bool quoted, char truncator) : base(quoted) { this.prefix = prefix; prefixRef = new BytesRef(prefix); this.truncator = truncator; }
public virtual void TestSize() { BytesRef @ref = new BytesRef(); int num = AtLeast(2); for (int j = 0; j < num; j++) { int mod = 1 + Random().Next(39); for (int i = 0; i < 797; i++) { string str; do { str = TestUtil.RandomRealisticUnicodeString(Random(), 1000); } while (str.Length == 0); @ref.CopyChars(str); int count = Hash.Size(); int key = Hash.Add(@ref); if (key < 0) { Assert.AreEqual(Hash.Size(), count); } else { Assert.AreEqual(Hash.Size(), count + 1); } if (i % mod == 0) { Hash.Clear(); Assert.AreEqual(0, Hash.Size()); Hash.Reinit(); } } } }
public static void Main(string[] args) { FileInfo input = new FileInfo("/home/dweiss/tmp/shuffled.dict"); int buckets = 20; int shareMaxTail = 10; ExternalRefSorter sorter = new ExternalRefSorter(new OfflineSorter()); FSTCompletionBuilder builder = new FSTCompletionBuilder(buckets, sorter, shareMaxTail); TextReader reader = new StreamReader( new FileStream(input.FullName, FileMode.Open), Encoding.UTF8); BytesRef scratch = new BytesRef(); string line; int count = 0; while ((line = reader.ReadLine()) != null) { scratch.CopyChars(line); builder.Add(scratch, count % buckets); if ((count++ % 100000) == 0) { Console.WriteLine("Line: " + count); } } Console.WriteLine("Building FSTCompletion."); FSTCompletion completion = builder.Build(); FileInfo fstFile = new FileInfo("completion.fst"); Console.WriteLine("Done. Writing automaton: " + fstFile.FullName); completion.FST.Save(fstFile); sorter.Dispose(); }
protected virtual void TruncatedToPrefixAndPattern()
{
    int i = 0;
    while ((i < truncated.Length) && MatchingChar(truncated[i]))
    {
        i++;
    }
    prefix = truncated.Substring(0, i);
    prefixRef = new BytesRef(prefix);

    StringBuilder re = new StringBuilder();

    // LUCENENET NOTE: To mimic Java's matches() method, we alter
    // the Regex to match the entire string. This makes the Regex
    // fail fast when not at the beginning of the string, which is
    // more efficient than testing the length after a successful match.
    // http://stackoverflow.com/a/12547528/181087
    re.Append(@"\A(?:");

    while (i < truncated.Length)
    {
        AppendRegExpForChar(truncated[i], re);
        i++;
    }
    re.Append(@")\z");
    pattern = new Regex(re.ToString(), RegexOptions.Compiled);
}
public void TestBlendedSort()
{
    BytesRef payload = new BytesRef("star");

    Input[] keys = new Input[] {
        new Input("star wars: episode v - the empire strikes back", 8, payload)
    };

    DirectoryInfo tempDir = CreateTempDir("BlendedInfixSuggesterTest");

    Analyzer a = new StandardAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET);
    BlendedInfixSuggester suggester = new BlendedInfixSuggester(TEST_VERSION_CURRENT, NewFSDirectory(tempDir), a, a,
        AnalyzingInfixSuggester.DEFAULT_MIN_PREFIX_CHARS, BlendedInfixSuggester.BlenderType.POSITION_LINEAR, BlendedInfixSuggester.DEFAULT_NUM_FACTOR);
    suggester.Build(new InputArrayIterator(keys));

    // we query for star wars and check that the weight
    // is smaller when we search for tokens that are far from the beginning
    long w0 = GetInResults(suggester, "star ", payload, 1);
    long w1 = GetInResults(suggester, "war", payload, 1);
    long w2 = GetInResults(suggester, "empire ba", payload, 1);
    long w3 = GetInResults(suggester, "back", payload, 1);
    long w4 = GetInResults(suggester, "bacc", payload, 1);

    assertTrue(w0 > w1);
    assertTrue(w1 > w2);
    assertTrue(w2 > w3);
    assertTrue(w4 < 0);

    suggester.Dispose();
}
public virtual void AddValue(int docID, BytesRef value)
{
    if (value == null)
    {
        throw new System.ArgumentException("field \"" + FieldInfo.Name + "\": null value not allowed");
    }
    if (value.Length > (ByteBlockPool.BYTE_BLOCK_SIZE - 2))
    {
        throw new System.ArgumentException("DocValuesField \"" + FieldInfo.Name + "\" is too large, must be <= " + (ByteBlockPool.BYTE_BLOCK_SIZE - 2));
    }

    if (docID != CurrentDoc)
    {
        FinishCurrentDoc();
    }

    // Fill in any holes:
    while (CurrentDoc < docID)
    {
        PendingCounts.Add(0); // no values
        CurrentDoc++;
    }

    AddOneValue(value);
    UpdateBytesUsed();
}
public virtual void TestFixedSorted([ValueSource(typeof(ConcurrentMergeSchedulers), "Values")]IConcurrentMergeScheduler scheduler) { BaseDirectoryWrapper dir = NewFSDirectory(CreateTempDir("2BFixedSorted")); if (dir is MockDirectoryWrapper) { ((MockDirectoryWrapper)dir).Throttling = MockDirectoryWrapper.Throttling_e.NEVER; } IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())) .SetMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH) .SetRAMBufferSizeMB(256.0) .SetMergeScheduler(scheduler) .SetMergePolicy(NewLogMergePolicy(false, 10)) .SetOpenMode(IndexWriterConfig.OpenMode_e.CREATE)); Document doc = new Document(); var bytes = new byte[2]; BytesRef data = new BytesRef(bytes); SortedDocValuesField dvField = new SortedDocValuesField("dv", data); doc.Add(dvField); for (int i = 0; i < int.MaxValue; i++) { bytes[0] = (byte)(i >> 8); bytes[1] = (byte)i; w.AddDocument(doc); if (i % 100000 == 0) { Console.WriteLine("indexed: " + i); Console.Out.Flush(); } } w.ForceMerge(1); w.Dispose(); Console.WriteLine("verifying..."); Console.Out.Flush(); DirectoryReader r = DirectoryReader.Open(dir); int expectedValue = 0; foreach (AtomicReaderContext context in r.Leaves) { AtomicReader reader = context.AtomicReader; BytesRef scratch = new BytesRef(); BinaryDocValues dv = reader.GetSortedDocValues("dv"); for (int i = 0; i < reader.MaxDoc; i++) { bytes[0] = (byte)(expectedValue >> 8); bytes[1] = (byte)expectedValue; dv.Get(i, scratch); Assert.AreEqual(data, scratch); expectedValue++; } } r.Dispose(); dir.Dispose(); }
public TermStatistics(BytesRef term, long docFreq, long totalTermFreq)
{
    Debug.Assert(docFreq >= 0);
    Debug.Assert(totalTermFreq == -1 || totalTermFreq >= docFreq); // #positions must be >= #postings
    this.Term_Renamed = term;
    this.DocFreq_Renamed = docFreq;
    this.TotalTermFreq_Renamed = totalTermFreq;
}
/// <summary>
/// Constructs a query selecting all terms greater/equal than <code>lowerTerm</code>
/// but less/equal than <code>upperTerm</code>.
///
/// <para>
/// If an endpoint is null, it is said
/// to be "open". Either or both endpoints may be open. Open endpoints may not
/// be exclusive (you can't select all but the first or last term without
/// explicitly specifying the term to exclude.)
/// </para>
/// </summary>
/// <param name="field"> The field that holds both lower and upper terms. </param>
/// <param name="lowerTerm">
///          The term text at the lower end of the range </param>
/// <param name="upperTerm">
///          The term text at the upper end of the range </param>
/// <param name="includeLower">
///          If true, the <code>lowerTerm</code> is included in the range. </param>
/// <param name="includeUpper">
///          If true, the <code>upperTerm</code> is included in the range. </param>
public TermRangeQuery(string field, BytesRef lowerTerm, BytesRef upperTerm, bool includeLower, bool includeUpper)
    : base(field)
{
    this.LowerTerm_Renamed = lowerTerm;
    this.UpperTerm_Renamed = upperTerm;
    this.IncludeLower = includeLower;
    this.IncludeUpper = includeUpper;
}
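A short usage sketch (not from the sources above; field/term values are hypothetical): a range over raw term bytes, plus the NewStringRange convenience factory Lucene.NET provides for plain string endpoints.

using Lucene.Net.Search;
using Lucene.Net.Util;

// Terms from "apple" (inclusive) up to "mango" (exclusive):
Query byBytes = new TermRangeQuery("title", new BytesRef("apple"), new BytesRef("mango"), true, false);
// Equivalent factory for string endpoints:
Query byString = TermRangeQuery.NewStringRange("title", "apple", "mango", true, false);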
private DocTermOrdsRangeFilter(string field, BytesRef lowerVal, BytesRef upperVal, bool includeLower, bool includeUpper) { this.Field_Renamed = field; this.LowerVal_Renamed = lowerVal; this.UpperVal_Renamed = upperVal; this.IncludeLower = includeLower; this.IncludeUpper = includeUpper; }
public void Add(BytesRef utf8) { if (closed) { throw new InvalidOperationException(); } buffer.Append(utf8); }
public void Add(BytesRef utf8) { if (writer == null) { throw new InvalidOperationException(); } writer.Write(utf8); }
public DocTermOrdsRangeFilterAnonymousInnerClassHelper(string field, BytesRef lowerVal, BytesRef upperVal, bool includeLower, bool includeUpper) : base(field, lowerVal, upperVal, includeLower, includeUpper) { this.Field = field; this.LowerVal = lowerVal; this.UpperVal = upperVal; this.IncludeLower = includeLower; this.IncludeUpper = includeUpper; }
internal Iterator(int size, PagedGrowableWriter offsets, PagedGrowableWriter lengths, PagedMutable docs, BytesRef values, FixedBitSet docsWithField) { this.Offsets = offsets; this.Size = size; this.Lengths = lengths; this.Docs = docs; this.DocsWithField = docsWithField; Value_Renamed = (BytesRef)values.Clone(); }
public Input(BytesRef term, long v, BytesRef payload, bool hasPayloads, IEnumerable<BytesRef> contexts, bool hasContexts) { this.term = term; this.v = v; this.payload = payload; this.hasPayloads = hasPayloads; this.contexts = contexts; this.hasContexts = hasContexts; }
/// <summary>
/// Factory method for creating the right implementation based on whether the facet field contains
/// multiple tokens per document.
/// </summary>
/// <param name="groupField">The group field</param>
/// <param name="facetField">The facet field</param>
/// <param name="facetFieldMultivalued">Whether the facet field has multiple tokens per document</param>
/// <param name="facetPrefix">The facet prefix a facet entry should start with to be included.</param>
/// <param name="initialSize">
/// The initial allocation size of the internal int set and group facet list which should roughly
/// match the total number of expected unique groups. Be aware that the heap usage is
/// 4 bytes * initialSize.
/// </param>
/// <returns><code>TermGroupFacetCollector</code> implementation</returns>
public static TermGroupFacetCollector CreateTermGroupFacetCollector(string groupField, string facetField, bool facetFieldMultivalued, BytesRef facetPrefix, int initialSize)
{
    if (facetFieldMultivalued)
    {
        return new MV(groupField, facetField, facetPrefix, initialSize);
    }
    return new SV(groupField, facetField, facetPrefix, initialSize);
}
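A hedged usage sketch (collector wiring and MergeSegmentResults assumed from the Lucene.NET 4.8 grouping API; field names are hypothetical): the factory result is an ordinary collector, so it can be passed straight to IndexSearcher.Search and then asked to merge per-segment results.

// Single-valued facet field, no prefix filter, size hint of 128 groups.
var collector = TermGroupFacetCollector.CreateTermGroupFacetCollector("author", "category", false, null, 128);
searcher.Search(query, collector); // searcher and query defined elsewhere
// Top 10 facet entries with at least 1 count, ordered by count:
var result = collector.MergeSegmentResults(10, 1, true);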
/// <summary>
/// Call this only once (if you subclass!) </summary>
protected virtual void Uninvert(AtomicReader reader, IBits liveDocs, BytesRef termPrefix)
{
    FieldInfo info = reader.FieldInfos.FieldInfo(m_field);
    if (info != null && info.HasDocValues)
    {
        throw new InvalidOperationException("Type mismatch: " + m_field + " was indexed as " + info.DocValuesType);
    }
    //System.out.println("DTO uninvert field=" + field + " prefix=" + termPrefix);
    long startTime = Environment.TickCount;
    m_prefix = termPrefix == null ? null : BytesRef.DeepCopyOf(termPrefix);

    int maxDoc = reader.MaxDoc;
    int[] index = new int[maxDoc]; // immediate term numbers, or the index into the byte[] representing the last number
    int[] lastTerm = new int[maxDoc]; // last term we saw for this document
    var bytes = new sbyte[maxDoc][]; // list of term numbers for the doc (delta encoded vInts)

    Fields fields = reader.Fields;
    if (fields == null)
    {
        // No terms
        return;
    }
    Terms terms = fields.GetTerms(m_field);
    if (terms == null)
    {
        // No terms
        return;
    }

    TermsEnum te = terms.GetIterator(null);
    BytesRef seekStart = termPrefix != null ? termPrefix : new BytesRef();
    //System.out.println("seekStart=" + seekStart.utf8ToString());
    if (te.SeekCeil(seekStart) == TermsEnum.SeekStatus.END)
    {
        // No terms match
        return;
    }

    // If we need our "term index wrapper", these will be
    // init'd below:
    IList<BytesRef> indexedTerms = null;
    PagedBytes indexedTermsBytes = null;

    bool testedOrd = false;

    // we need a minimum of 9 bytes, but round up to 12 since the space would
    // be wasted with most allocators anyway.
    var tempArr = new sbyte[12];

    //
    // enumerate all terms, and build an intermediate form of the un-inverted field.
    //
    // During this intermediate form, every document has a (potential) byte[]
    // and the int[maxDoc()] array either contains the termNumber list directly
    // or the *end* offset of the termNumber list in its byte array (for faster
    // appending and faster creation of the final form).
    //
    // idea... if things are too large while building, we could do a range of docs
    // at a time (but it would be a fair amount slower to build)
    // could also do ranges in parallel to take advantage of multiple CPUs
    // OPTIONAL: remap the largest df terms to the lowest 128 (single byte)
    // values. this requires going over the field first to find the most
    // frequent terms ahead of time.

    int termNum = 0;
    m_docsEnum = null;

    // Loop begins with te positioned to first term (we call
    // seek above):
    for (; ;)
    {
        BytesRef t = te.Term;
        if (t == null || (termPrefix != null && !StringHelper.StartsWith(t, termPrefix)))
        {
            break;
        }
        //System.out.println("visit term=" + t.utf8ToString() + " " + t + " termNum=" + termNum);

        if (!testedOrd)
        {
            try
            {
                m_ordBase = (int)te.Ord;
                //System.out.println("got ordBase=" + ordBase);
            }
#pragma warning disable 168
            catch (NotSupportedException uoe)
#pragma warning restore 168
            {
                // Reader cannot provide ord support, so we wrap
                // our own support by creating our own terms index:
                indexedTerms = new List<BytesRef>();
                indexedTermsBytes = new PagedBytes(15);
                //System.out.println("NO ORDS");
            }
            testedOrd = true;
        }

        VisitTerm(te, termNum);

        if (indexedTerms != null && (termNum & indexIntervalMask) == 0)
        {
            // Index this term
            m_sizeOfIndexedStrings += t.Length;
            BytesRef indexedTerm = new BytesRef();
            indexedTermsBytes.Copy(t, indexedTerm);
            // TODO: really should 1) strip off useless suffix,
            // and 2) use FST not array/PagedBytes
            indexedTerms.Add(indexedTerm);
        }

        int df = te.DocFreq;
        if (df <= m_maxTermDocFreq)
        {
            m_docsEnum = te.Docs(liveDocs, m_docsEnum, DocsFlags.NONE);

            // dF, but takes deletions into account
            int actualDF = 0;

            for (; ;)
            {
                int doc = m_docsEnum.NextDoc();
                if (doc == DocIdSetIterator.NO_MORE_DOCS)
                {
                    break;
                }
                //System.out.println("  chunk=" + chunk + " docs");

                actualDF++;
                m_termInstances++;

                //System.out.println("    docID=" + doc);
                // add TNUM_OFFSET to the term number to make room for special reserved values:
                // 0 (end term) and 1 (index into byte array follows)
                int delta = termNum - lastTerm[doc] + TNUM_OFFSET;
                lastTerm[doc] = termNum;
                int val = index[doc];

                if ((val & 0xff) == 1)
                {
                    // index into byte array (actually the end of
                    // the doc-specific byte[] when building)
                    int pos = (int)((uint)val >> 8);
                    int ilen = VInt32Size(delta);
                    var arr = bytes[doc];
                    int newend = pos + ilen;
                    if (newend > arr.Length)
                    {
                        // We avoid a doubling strategy to lower memory usage.
                        // this faceting method isn't for docs with many terms.
                        // In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit boundary.
                        // TODO: figure out what array lengths we can round up to w/o actually using more memory
                        // (how much space does a byte[] take up? Is data preceded by a 32 bit length only?
                        // It should be safe to round up to the nearest 32 bits in any case.
                        int newLen = (newend + 3) & unchecked((int)0xfffffffc); // 4 byte alignment
                        var newarr = new sbyte[newLen];
                        Array.Copy(arr, 0, newarr, 0, pos);
                        arr = newarr;
                        bytes[doc] = newarr;
                    }
                    pos = WriteInt32(delta, arr, pos);
                    index[doc] = (pos << 8) | 1; // update pointer to end index in byte[]
                }
                else
                {
                    // OK, this int has data in it... find the end (a zero starting byte - not
                    // part of another number, hence not following a byte with the high bit set).
                    int ipos;
                    if (val == 0)
                    {
                        ipos = 0;
                    }
                    else if ((val & 0x0000ff80) == 0)
                    {
                        ipos = 1;
                    }
                    else if ((val & 0x00ff8000) == 0)
                    {
                        ipos = 2;
                    }
                    else if ((val & 0xff800000) == 0)
                    {
                        ipos = 3;
                    }
                    else
                    {
                        ipos = 4;
                    }

                    //System.out.println("      ipos=" + ipos);

                    int endPos = WriteInt32(delta, tempArr, ipos);
                    //System.out.println("      endpos=" + endPos);
                    if (endPos <= 4)
                    {
                        //System.out.println("      fits!");
                        // value will fit in the integer... move bytes back
                        for (int j = ipos; j < endPos; j++)
                        {
                            val |= (tempArr[j] & 0xff) << (j << 3);
                        }
                        index[doc] = val;
                    }
                    else
                    {
                        // value won't fit... move integer into byte[]
                        for (int j = 0; j < ipos; j++)
                        {
                            tempArr[j] = (sbyte)val;
                            val = (int)((uint)val >> 8);
                        }
                        // point at the end index in the byte[]
                        index[doc] = (endPos << 8) | 1;
                        bytes[doc] = tempArr;
                        tempArr = new sbyte[12];
                    }
                }
            }
            SetActualDocFreq(termNum, actualDF);
        }

        termNum++;
        if (te.Next() == null)
        {
            break;
        }
    }

    m_numTermsInField = termNum;

    long midPoint = Environment.TickCount;

    if (m_termInstances == 0)
    {
        // we didn't invert anything
        // lower memory consumption.
        m_tnums = null;
    }
    else
    {
        this.m_index = index;

        //
        // transform intermediate form into the final form, building a single byte[]
        // at a time, and releasing the intermediate byte[]s as we go to avoid
        // increasing the memory footprint.
        //
        for (int pass = 0; pass < 256; pass++)
        {
            var target = m_tnums[pass];
            var pos = 0; // end in target
            if (target != null)
            {
                pos = target.Length;
            }
            else
            {
                target = new sbyte[4096];
            }

            // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx
            // where pp is the pass (which array we are building), and xx is all values.
            // each pass shares the same byte[] for termNumber lists.
            for (int docbase = pass << 16; docbase < maxDoc; docbase += (1 << 24))
            {
                int lim = Math.Min(docbase + (1 << 16), maxDoc);
                for (int doc = docbase; doc < lim; doc++)
                {
                    //System.out.println("  pass=" + pass + " process docID=" + doc);
                    int val = index[doc];
                    if ((val & 0xff) == 1)
                    {
                        int len = (int)((uint)val >> 8);
                        //System.out.println("    ptr pos=" + pos);
                        index[doc] = (pos << 8) | 1; // change index to point to start of array
                        if ((pos & 0xff000000) != 0)
                        {
                            // we only have 24 bits for the array index
                            throw new InvalidOperationException("Too many values for UnInvertedField faceting on field " + m_field);
                        }
                        var arr = bytes[doc];
                        /*
                        for(byte b : arr) {
                          //System.out.println("      b=" + Integer.toHexString((int) b));
                        }
                        */
                        bytes[doc] = null; // IMPORTANT: allow GC to avoid OOM
                        if (target.Length <= pos + len)
                        {
                            int newlen = target.Length;
                            // we don't have to worry about the array getting too large
                            // since the "pos" param will overflow first (only 24 bits available)
                            // if ((newlen<<1) <= 0) {
                            //   // overflow...
                            //   newlen = Integer.MAX_VALUE;
                            //   if (newlen <= pos + len) {
                            //     throw new SolrException(400,"Too many terms to uninvert field!");
                            //   }
                            // } else {
                            //   while (newlen <= pos + len) newlen<<=1;  // doubling strategy
                            // }
                            while (newlen <= pos + len) // doubling strategy
                            {
                                newlen <<= 1;
                            }
                            var newtarget = new sbyte[newlen];
                            Array.Copy(target, 0, newtarget, 0, pos);
                            target = newtarget;
                        }
                        Array.Copy(arr, 0, target, pos, len);
                        pos += len + 1; // skip single byte at end and leave it 0 for terminator
                    }
                }
            }

            // shrink array
            if (pos < target.Length)
            {
                var newtarget = new sbyte[pos];
                Array.Copy(target, 0, newtarget, 0, pos);
                target = newtarget;
            }

            m_tnums[pass] = target;

            if ((pass << 16) > maxDoc)
            {
                break;
            }
        }
    }
    if (indexedTerms != null)
    {
        m_indexedTermsArray = new BytesRef[indexedTerms.Count];
        indexedTerms.CopyTo(m_indexedTermsArray, 0);
    }

    long endTime = Environment.TickCount;

    m_total_time = (int)(endTime - startTime);
    m_phase1_time = (int)(midPoint - startTime);
}
// Swap in S, in place of E:
private bool SeekToNonBMP(SegmentTermEnum te, BytesRef term, int pos)
{
    int savLength = term.Length;

    Debug.Assert(term.Offset == 0);

    // The 3 bytes starting at downTo make up 1
    // unicode character:
    Debug.Assert(IsHighBMPChar(term.Bytes, pos));

    // NOTE: we cannot make this assert, because
    // AutomatonQuery legitimately sends us malformed UTF8
    // (eg the UTF8 bytes with just 0xee)
    // assert term.length >= pos + 3: "term.length=" + term.length + " pos+3=" + (pos+3) + " byte=" + Integer.toHexString(term.bytes[pos]) + " term=" + term.toString();

    // Save the bytes && length, since we need to
    // restore this if seek "back" finds no matching
    // terms
    if (term.Bytes.Length < 4 + pos)
    {
        term.Grow(4 + pos);
    }

    scratch[0] = (sbyte)term.Bytes[pos];
    scratch[1] = (sbyte)term.Bytes[pos + 1];
    scratch[2] = (sbyte)term.Bytes[pos + 2];

    term.Bytes[pos] = 0xf0;
    term.Bytes[pos + 1] = 0x90;
    term.Bytes[pos + 2] = 0x80;
    term.Bytes[pos + 3] = 0x80;
    term.Length = 4 + pos;

    if (DEBUG_SURROGATES)
    {
        Console.WriteLine("      try seek term=" + UnicodeUtil.ToHexString(term.Utf8ToString()));
    }

    // Seek "back":
    outerInstance.TermsDict.SeekEnum(te, new Term(fieldInfo.Name, term), true);

    // Test if the term we seek'd to in fact found a
    // surrogate pair at the same position as the E:
    Term t2 = te.Term();

    // Cannot be null (or move to next field) because at
    // "worst" it'd seek to the same term we are on now,
    // unless we are being called from seek
    if (t2 == null || t2.Field != internedFieldName)
    {
        return false;
    }

    if (DEBUG_SURROGATES)
    {
        Console.WriteLine("      got term=" + UnicodeUtil.ToHexString(t2.Text()));
    }

    // Now test if prefix is identical and we found
    // a non-BMP char at the same position:
    BytesRef b2 = t2.Bytes;
    Debug.Assert(b2.Offset == 0);

    bool matches;
    if (b2.Length >= term.Length && IsNonBMPChar(b2.Bytes, pos))
    {
        matches = true;
        for (int i = 0; i < pos; i++)
        {
            if (term.Bytes[i] != b2.Bytes[i])
            {
                matches = false;
                break;
            }
        }
    }
    else
    {
        matches = false;
    }

    // Restore term:
    term.Length = savLength;
    term.Bytes[pos] = (byte)scratch[0];
    term.Bytes[pos + 1] = (byte)scratch[1];
    term.Bytes[pos + 2] = (byte)scratch[2];

    return matches;
}
public LiteralValueSource(string str) { this.m_str = str; this.m_bytesRef = new BytesRef(str); }
// Look for seek type 1 ("push"): if the newly added // suffix contains any S, we must try to seek to the // corresponding E. If we find a match, we go there; // else we keep looking for additional S's in the new // suffix. this "starts" the dance, at this character // position: private void DoPushes() { int upTo = newSuffixStart; if (DEBUG_SURROGATES) { Console.WriteLine(" try push newSuffixStart=" + newSuffixStart + " scratchLen=" + scratchTerm.Length); } while (upTo < scratchTerm.Length) { if (IsNonBMPChar(scratchTerm.Bytes, upTo) && (upTo > newSuffixStart || (upTo >= prevTerm.Length || (!IsNonBMPChar(prevTerm.Bytes, upTo) && !IsHighBMPChar(prevTerm.Bytes, upTo))))) { // A non-BMP char (4 bytes UTF8) starts here: Debug.Assert(scratchTerm.Length >= upTo + 4); int savLength = scratchTerm.Length; scratch[0] = (sbyte)scratchTerm.Bytes[upTo]; scratch[1] = (sbyte)scratchTerm.Bytes[upTo + 1]; scratch[2] = (sbyte)scratchTerm.Bytes[upTo + 2]; scratchTerm.Bytes[upTo] = (byte)UTF8_HIGH_BMP_LEAD; scratchTerm.Bytes[upTo + 1] = 0x80; scratchTerm.Bytes[upTo + 2] = 0x80; scratchTerm.Length = upTo + 3; if (DEBUG_SURROGATES) { Console.WriteLine(" try seek 1 pos=" + upTo + " term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString()) + " " + scratchTerm.ToString() + " len=" + scratchTerm.Length); } // Seek "forward": // TODO: more efficient seek? outerInstance.TermsDict.SeekEnum(seekTermEnum, new Term(fieldInfo.Name, scratchTerm), true); scratchTerm.Bytes[upTo] = (byte)scratch[0]; scratchTerm.Bytes[upTo + 1] = (byte)scratch[1]; scratchTerm.Bytes[upTo + 2] = (byte)scratch[2]; scratchTerm.Length = savLength; // Did we find a match? Term t2 = seekTermEnum.Term(); if (DEBUG_SURROGATES) { if (t2 == null) { Console.WriteLine(" hit term=null"); } else { Console.WriteLine(" hit term=" + UnicodeUtil.ToHexString(t2.Text()) + " " + (t2 == null ? null : t2.Bytes)); } } // Since this was a seek "forward", we could hit // EOF or a different field: bool matches; if (t2 != null && t2.Field == internedFieldName) { BytesRef b2 = t2.Bytes; Debug.Assert(b2.Offset == 0); if (b2.Length >= upTo + 3 && IsHighBMPChar(b2.Bytes, upTo)) { matches = true; for (int i = 0; i < upTo; i++) { if (scratchTerm.Bytes[i] != b2.Bytes[i]) { matches = false; break; } } } else { matches = false; } } else { matches = false; } if (matches) { if (DEBUG_SURROGATES) { Console.WriteLine(" matches!"); } // OK seek "back" // TODO: more efficient seek? outerInstance.TermsDict.SeekEnum(termEnum, seekTermEnum.Term(), true); scratchTerm.CopyBytes(seekTermEnum.Term().Bytes); // +3 because we don't need to check the char // at upTo: we know it's > BMP upTo += 3; // NOTE: we keep iterating, now, since this // can easily "recurse". Ie, after seeking // forward at a certain char position, we may // find another surrogate in our [new] suffix // and must then do another seek (recurse) } else { upTo++; } } else { upTo++; } } }
public override void AddPosition(int position, int startOffset, int endOffset, BytesRef payload) { if (Debugging.AssertsEnabled) { Debugging.Assert(curField.flags != 0); } curField.AddPosition(position, startOffset, endOffset - startOffset, payload == null ? 0 : payload.Length); if (curField.hasPayloads && payload != null) { payloadBytes.WriteBytes(payload.Bytes, payload.Offset, payload.Length); } }
/// <summary>
/// The default implementation returns <code>1</code>
/// </summary>
public override float ScorePayload(int doc, int start, int end, BytesRef payload)
{
    return 1;
}
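A hedged sketch of where ScorePayload comes into play (PayloadTermQuery and AveragePayloadFunction assumed from Lucene.NET 4.8's Lucene.Net.Search.Payloads namespace, not from the source above): payload-aware queries ask the similarity to score each matched payload and combine the per-position results into the document score.

using Lucene.Net.Index;
using Lucene.Net.Search;
using Lucene.Net.Search.Payloads;

// Each position match contributes a ScorePayload(...) value; the function
// below averages them into the final payload score for the term.
Query q = new PayloadTermQuery(new Term("body", "lucene"), new AveragePayloadFunction());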
public override SeekStatus SeekCeil(BytesRef term)
{
    if (DEBUG_SURROGATES)
    {
        Console.WriteLine("TE.seek target=" + UnicodeUtil.ToHexString(term.Utf8ToString()));
    }
    skipNext = false;
    TermInfosReader tis = outerInstance.TermsDict;
    Term t0 = new Term(fieldInfo.Name, term);

    Debug.Assert(termEnum != null);

    tis.SeekEnum(termEnum, t0, false);

    Term t = termEnum.Term();

    if (t != null && t.Field == internedFieldName && term.BytesEquals(t.Bytes))
    {
        // If we found an exact match, no need to do the
        // surrogate dance
        if (DEBUG_SURROGATES)
        {
            Console.WriteLine("  seek exact match");
        }
        current = t.Bytes;
        return SeekStatus.FOUND;
    }
    else if (t == null || t.Field != internedFieldName)
    {
        // TODO: maybe we can handle this like the next()
        // into null? set term as prevTerm then dance?

        if (DEBUG_SURROGATES)
        {
            Console.WriteLine("  seek hit EOF");
        }

        // We hit EOF; try end-case surrogate dance: if we
        // find an E, try swapping in S, backwards:
        scratchTerm.CopyBytes(term);

        Debug.Assert(scratchTerm.Offset == 0);

        for (int i = scratchTerm.Length - 1; i >= 0; i--)
        {
            if (IsHighBMPChar(scratchTerm.Bytes, i))
            {
                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("    found E pos=" + i + "; try seek");
                }

                if (SeekToNonBMP(seekTermEnum, scratchTerm, i))
                {
                    scratchTerm.CopyBytes(seekTermEnum.Term().Bytes);
                    outerInstance.TermsDict.SeekEnum(termEnum, seekTermEnum.Term(), false);

                    newSuffixStart = 1 + i;

                    DoPushes();

                    // Found a match
                    // TODO: faster seek?
                    current = termEnum.Term().Bytes;
                    return SeekStatus.NOT_FOUND;
                }
            }
        }

        if (DEBUG_SURROGATES)
        {
            Console.WriteLine("  seek END");
        }

        current = null;
        return SeekStatus.END;
    }
    else
    {
        // We found a non-exact but non-null term; this one
        // is fun -- just treat it like next, by pretending
        // requested term was prev:
        prevTerm.CopyBytes(term);

        if (DEBUG_SURROGATES)
        {
            Console.WriteLine("  seek hit non-exact term=" + UnicodeUtil.ToHexString(t.Text()));
        }

        BytesRef br = t.Bytes;
        Debug.Assert(br.Offset == 0);

        SetNewSuffixStart(term, br);

        SurrogateDance();

        Term t2 = termEnum.Term();
        if (t2 == null || t2.Field != internedFieldName)
        {
            // PreFlex codec interns field names; verify:
            Debug.Assert(t2 == null || !t2.Field.Equals(internedFieldName, StringComparison.Ordinal));
            current = null;
            return SeekStatus.END;
        }
        else
        {
            current = t2.Bytes;
            Debug.Assert(!unicodeSortOrder || term.CompareTo(current) < 0, "term=" + UnicodeUtil.ToHexString(term.Utf8ToString()) + " vs current=" + UnicodeUtil.ToHexString(current.Utf8ToString()));
            return SeekStatus.NOT_FOUND;
        }
    }
}
public void TestRandomIndex() { Directory dir = NewDirectory(); MockAnalyzer analyzer = new MockAnalyzer(Random()); analyzer.MaxTokenLength = TestUtil.NextInt(Random(), 1, IndexWriter.MAX_TERM_LENGTH); RandomIndexWriter w = new RandomIndexWriter(Random(), dir, analyzer, Similarity, TimeZone); CreateRandomIndex(AtLeast(50), w, Random().NextLong()); DirectoryReader reader = w.Reader; AtomicReader wrapper = SlowCompositeReaderWrapper.Wrap(reader); string field = @"body"; Terms terms = wrapper.GetTerms(field); var lowFreqQueue = new AnonymousPriorityQueue(this, 5); Util.PriorityQueue <TermAndFreq> highFreqQueue = new AnonymousPriorityQueue1(this, 5); try { TermsEnum iterator = terms.GetIterator(null); while (iterator.Next() != null) { if (highFreqQueue.Count < 5) { highFreqQueue.Add(new TermAndFreq(BytesRef.DeepCopyOf(iterator.Term), iterator.DocFreq)); lowFreqQueue.Add(new TermAndFreq(BytesRef.DeepCopyOf(iterator.Term), iterator.DocFreq)); } else { if (highFreqQueue.Top.freq < iterator.DocFreq) { highFreqQueue.Top.freq = iterator.DocFreq; highFreqQueue.Top.term = BytesRef.DeepCopyOf(iterator.Term); highFreqQueue.UpdateTop(); } if (lowFreqQueue.Top.freq > iterator.DocFreq) { lowFreqQueue.Top.freq = iterator.DocFreq; lowFreqQueue.Top.term = BytesRef.DeepCopyOf(iterator.Term); lowFreqQueue.UpdateTop(); } } } int lowFreq = lowFreqQueue.Top.freq; int highFreq = highFreqQueue.Top.freq; AssumeTrue(@"unlucky index", highFreq - 1 > lowFreq); List <TermAndFreq> highTerms = QueueToList(highFreqQueue); List <TermAndFreq> lowTerms = QueueToList(lowFreqQueue); IndexSearcher searcher = NewSearcher(reader); Occur lowFreqOccur = RandomOccur(Random()); BooleanQuery verifyQuery = new BooleanQuery(); CommonTermsQuery cq = new CommonTermsQuery(RandomOccur(Random()), lowFreqOccur, highFreq - 1, Random().NextBoolean()); foreach (TermAndFreq termAndFreq in lowTerms) { cq.Add(new Term(field, termAndFreq.term)); verifyQuery.Add(new BooleanClause(new TermQuery(new Term(field, termAndFreq.term)), lowFreqOccur)); } foreach (TermAndFreq termAndFreq in highTerms) { cq.Add(new Term(field, termAndFreq.term)); } TopDocs cqSearch = searcher.Search(cq, reader.MaxDoc); TopDocs verifySearch = searcher.Search(verifyQuery, reader.MaxDoc); assertEquals(verifySearch.TotalHits, cqSearch.TotalHits); var hits = new HashSet <int>(); foreach (ScoreDoc doc in verifySearch.ScoreDocs) { hits.Add(doc.Doc); } foreach (ScoreDoc doc in cqSearch.ScoreDocs) { assertTrue(hits.Remove(doc.Doc)); } assertTrue(hits.Count == 0); w.ForceMerge(1); DirectoryReader reader2 = w.Reader; QueryUtils.Check(Random(), cq, NewSearcher(reader2), Similarity); reader2.Dispose(); } finally { reader.Dispose(); wrapper.Dispose(); w.Dispose(); dir.Dispose(); } }
public TermAndFreq(BytesRef term, int freq) { this.term = term; this.freq = freq; }
public override bool IsIndexTerm(BytesRef term, TermStats stats) { return(rand.Next(gap) == gap / 2); }
public override SeekStatus SeekCeil(BytesRef target)
{
    // already here
    if (term != null && term.Equals(target))
    {
        return SeekStatus.FOUND;
    }

    int startIdx = Array.BinarySearch(outerInstance.m_indexedTermsArray, target);

    if (startIdx >= 0)
    {
        // we hit the term exactly... lucky us!
        TermsEnum.SeekStatus seekStatus = termsEnum.SeekCeil(target);
        Debug.Assert(seekStatus == TermsEnum.SeekStatus.FOUND);
        ord = startIdx << outerInstance.indexIntervalBits;
        SetTerm();
        Debug.Assert(term != null);
        return SeekStatus.FOUND;
    }

    // we didn't hit the term exactly
    startIdx = -startIdx - 1;

    if (startIdx == 0)
    {
        // our target occurs *before* the first term
        TermsEnum.SeekStatus seekStatus = termsEnum.SeekCeil(target);
        Debug.Assert(seekStatus == TermsEnum.SeekStatus.NOT_FOUND);
        ord = 0;
        SetTerm();
        Debug.Assert(term != null);
        return SeekStatus.NOT_FOUND;
    }

    // back up to the start of the block
    startIdx--;

    if ((ord >> outerInstance.indexIntervalBits) == startIdx && term != null && term.CompareTo(target) <= 0)
    {
        // we are already in the right block and the current term is before the term we want,
        // so we don't need to seek.
    }
    else
    {
        // seek to the right block
        TermsEnum.SeekStatus seekStatus = termsEnum.SeekCeil(outerInstance.m_indexedTermsArray[startIdx]);
        Debug.Assert(seekStatus == TermsEnum.SeekStatus.FOUND);
        ord = startIdx << outerInstance.indexIntervalBits;
        SetTerm();
        Debug.Assert(term != null); // should be non-null since it's in the index
    }

    while (term != null && term.CompareTo(target) < 0)
    {
        Next();
    }

    if (term == null)
    {
        return SeekStatus.END;
    }
    else if (term.CompareTo(target) == 0)
    {
        return SeekStatus.FOUND;
    }
    else
    {
        return SeekStatus.NOT_FOUND;
    }
}
public virtual void TestSimple()
{
    int numNodes = TestUtil.NextInt32(Random, 1, 10);
    double runTimeSec = AtLeast(3);

    int minDocsToMakeTerms = TestUtil.NextInt32(Random, 5, 20);

    int maxSearcherAgeSeconds = TestUtil.NextInt32(Random, 1, 3);

    if (Verbose)
    {
        Console.WriteLine("TEST: numNodes=" + numNodes + " runTimeSec=" + runTimeSec + " maxSearcherAgeSeconds=" + maxSearcherAgeSeconds);
    }

    Start(numNodes, runTimeSec, maxSearcherAgeSeconds);

    List<PreviousSearchState> priorSearches = new List<PreviousSearchState>();
    List<BytesRef> terms = null;
    while (Time.NanoTime() < endTimeNanos)
    {
        bool doFollowon = priorSearches.Count > 0 && Random.Next(7) == 1;

        // Pick a random node; we will run the query on this node:
        int myNodeID = Random.Next(numNodes);

        NodeState.ShardIndexSearcher localShardSearcher;

        PreviousSearchState prevSearchState;

        if (doFollowon)
        {
            // Pretend user issued a followon query:
            prevSearchState = priorSearches[Random.Next(priorSearches.Count)];

            if (Verbose)
            {
                Console.WriteLine("\nTEST: follow-on query age=" + ((Time.NanoTime() - prevSearchState.SearchTimeNanos) / 1000000000.0));
            }

            try
            {
                localShardSearcher = m_nodes[myNodeID].Acquire(prevSearchState.Versions);
            }
            catch (SearcherExpiredException see)
            {
                // Expected, sometimes; in a "real" app we would
                // either forward this error to the user ("too
                // much time has passed; please re-run your
                // search") or sneakily just switch to newest
                // searcher w/o telling them...
                if (Verbose)
                {
                    Console.WriteLine("  searcher expired during local shard searcher init: " + see);
                }
                priorSearches.Remove(prevSearchState);
                continue;
            }
        }
        else
        {
            if (Verbose)
            {
                Console.WriteLine("\nTEST: fresh query");
            }
            // Do fresh query:
            localShardSearcher = m_nodes[myNodeID].Acquire();
            prevSearchState = null;
        }

        IndexReader[] subs = new IndexReader[numNodes];

        PreviousSearchState searchState = null;

        try
        {
            // Mock: now make a single reader (MultiReader) from all node
            // searchers. In a real shard env you can't do this... we
            // do it to confirm results from the shard searcher
            // are correct:
            int docCount = 0;
            try
            {
                for (int nodeID = 0; nodeID < numNodes; nodeID++)
                {
                    long subVersion = localShardSearcher.GetNodeVersions()[nodeID];
                    IndexSearcher sub = m_nodes[nodeID].Searchers.Acquire(subVersion);
                    if (sub == null)
                    {
                        nodeID--;
                        while (nodeID >= 0)
                        {
                            subs[nodeID].DecRef();
                            subs[nodeID] = null;
                            nodeID--;
                        }
                        throw new SearcherExpiredException("nodeID=" + nodeID + " version=" + subVersion);
                    }
                    subs[nodeID] = sub.IndexReader;
                    docCount += subs[nodeID].MaxDoc;
                }
            }
            catch (SearcherExpiredException see)
            {
                // Expected
                if (Verbose)
                {
                    Console.WriteLine("  searcher expired during mock reader init: " + see);
                }
                continue;
            }

            IndexReader mockReader = new MultiReader(subs);
            IndexSearcher mockSearcher = new IndexSearcher(mockReader);

            Query query;
            Sort sort;

            if (prevSearchState != null)
            {
                query = prevSearchState.Query;
                sort = prevSearchState.Sort;
            }
            else
            {
                if (terms == null && docCount > minDocsToMakeTerms)
                {
                    // TODO: try to "focus" on high freq terms sometimes too
                    // TODO: maybe also periodically reset the terms...?
                    TermsEnum termsEnum = MultiFields.GetTerms(mockReader, "body").GetEnumerator();
                    terms = new List<BytesRef>();
                    while (termsEnum.MoveNext())
                    {
                        terms.Add(BytesRef.DeepCopyOf(termsEnum.Term));
                    }
                    if (Verbose)
                    {
                        Console.WriteLine("TEST: init terms: " + terms.Count + " terms");
                    }
                    if (terms.Count == 0)
                    {
                        terms = null;
                    }
                }

                if (Verbose)
                {
                    Console.WriteLine("  maxDoc=" + mockReader.MaxDoc);
                }

                if (terms != null)
                {
                    if (Random.NextBoolean())
                    {
                        query = new TermQuery(new Term("body", terms[Random.Next(terms.Count)]));
                    }
                    else
                    {
                        string t = terms[Random.Next(terms.Count)].Utf8ToString();
                        string prefix;
                        if (t.Length <= 1)
                        {
                            prefix = t;
                        }
                        else
                        {
                            prefix = t.Substring(0, TestUtil.NextInt32(Random, 1, 2));
                        }
                        query = new PrefixQuery(new Term("body", prefix));
                    }

                    if (Random.NextBoolean())
                    {
                        sort = null;
                    }
                    else
                    {
                        // TODO: sort by more than 1 field
                        int what = Random.Next(3);
                        if (what == 0)
                        {
                            sort = new Sort(SortField.FIELD_SCORE);
                        }
                        else if (what == 1)
                        {
                            // TODO: this sort doesn't merge
                            // correctly... it's tricky because you
                            // could have > 2.1B docs across all shards:
                            //sort = new Sort(SortField.FIELD_DOC);
                            sort = null;
                        }
                        else if (what == 2)
                        {
                            sort = new Sort(new SortField[] { new SortField("docid", SortFieldType.INT32, Random.NextBoolean()) });
                        }
                        else
                        {
                            sort = new Sort(new SortField[] { new SortField("title", SortFieldType.STRING, Random.NextBoolean()) });
                        }
                    }
                }
                else
                {
                    query = null;
                    sort = null;
                }
            }

            if (query != null)
            {
                try
                {
                    searchState = AssertSame(mockSearcher, localShardSearcher, query, sort, prevSearchState);
                }
                catch (SearcherExpiredException see)
                {
                    // Expected; in a "real" app we would
                    // either forward this error to the user ("too
                    // much time has passed; please re-run your
                    // search") or sneakily just switch to newest
                    // searcher w/o telling them...
                    if (Verbose)
                    {
                        Console.WriteLine("  searcher expired during search: " + see);
                        Console.Out.Write(see.StackTrace);
                    }
                    // We can't do this in general: on a very slow
                    // computer it's possible the local searcher
                    // expires before we can finish our search:
                    // assert prevSearchState != null;
                    if (prevSearchState != null)
                    {
                        priorSearches.Remove(prevSearchState);
                    }
                }
            }
        }
        finally
        {
            m_nodes[myNodeID].Release(localShardSearcher);
            foreach (IndexReader sub in subs)
            {
                if (sub != null)
                {
                    sub.DecRef();
                }
            }
        }

        if (searchState != null && searchState.SearchAfterLocal != null && Random.Next(5) == 3)
        {
            priorSearches.Add(searchState);
            if (priorSearches.Count > 200)
            {
                priorSearches.Shuffle(Random);
                priorSearches.SubList(100, priorSearches.Count).Clear();
            }
        }
    }

    Finish();
}
/// <summary>
/// Inverts only terms starting w/ prefix, and only terms
/// whose docFreq (not taking deletions into account) is
/// <= <paramref name="maxTermDocFreq"/>, with a custom indexing interval
/// (default is every 128th term).
/// </summary>
public DocTermOrds(AtomicReader reader, IBits liveDocs, string field, BytesRef termPrefix, int maxTermDocFreq, int indexIntervalBits)
    : this(field, maxTermDocFreq, indexIntervalBits)
{
    Uninvert(reader, liveDocs, termPrefix);
}
/// <summary> /// Builds an <see cref="SynonymMap"/> and returns it. /// </summary> public virtual SynonymMap Build() { ByteSequenceOutputs outputs = ByteSequenceOutputs.Singleton; // TODO: are we using the best sharing options? var builder = new Builder <BytesRef>(FST.INPUT_TYPE.BYTE4, outputs); BytesRef scratch = new BytesRef(64); ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput(); ISet <int?> dedupSet; if (dedup) { dedupSet = new JCG.HashSet <int?>(); } else { dedupSet = null; } var spare = new byte[5]; ICollection <CharsRef> keys = workingSet.Keys; CharsRef[] sortedKeys = keys.ToArray(); #pragma warning disable 612, 618 System.Array.Sort(sortedKeys, CharsRef.UTF16SortedAsUTF8Comparer); #pragma warning restore 612, 618 Int32sRef scratchIntsRef = new Int32sRef(); //System.out.println("fmap.build"); for (int keyIdx = 0; keyIdx < sortedKeys.Length; keyIdx++) { CharsRef input = sortedKeys[keyIdx]; MapEntry output = workingSet[input]; int numEntries = output.ords.Count; // output size, assume the worst case int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry scratch.Grow(estimatedSize); scratchOutput.Reset(scratch.Bytes, scratch.Offset, scratch.Bytes.Length); Debug.Assert(scratch.Offset == 0); // now write our output data: int count = 0; for (int i = 0; i < numEntries; i++) { if (dedupSet != null) { // box once int?ent = output.ords[i]; if (dedupSet.Contains(ent)) { continue; } dedupSet.Add(ent); } scratchOutput.WriteVInt32(output.ords[i]); count++; } int pos = scratchOutput.Position; scratchOutput.WriteVInt32(count << 1 | (output.includeOrig ? 0 : 1)); int pos2 = scratchOutput.Position; int vIntLen = pos2 - pos; // Move the count + includeOrig to the front of the byte[]: Array.Copy(scratch.Bytes, pos, spare, 0, vIntLen); Array.Copy(scratch.Bytes, 0, scratch.Bytes, vIntLen, pos); Array.Copy(spare, 0, scratch.Bytes, 0, vIntLen); if (dedupSet != null) { dedupSet.Clear(); } scratch.Length = scratchOutput.Position - scratch.Offset; //System.out.println(" add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count); builder.Add(Lucene.Net.Util.Fst.Util.ToUTF32(input.ToString(), scratchIntsRef), BytesRef.DeepCopyOf(scratch)); } FST <BytesRef> fst = builder.Finish(); return(new SynonymMap(fst, words, maxHorizontalContext)); }
/// <summary>
/// Inverts only terms starting w/ prefix, and only terms
/// whose docFreq (not taking deletions into account) is
/// <= <paramref name="maxTermDocFreq"/>
/// </summary>
public DocTermOrds(AtomicReader reader, IBits liveDocs, string field, BytesRef termPrefix, int maxTermDocFreq)
    : this(reader, liveDocs, field, termPrefix, maxTermDocFreq, DEFAULT_INDEX_INTERVAL_BITS)
{
}
public override void FinishTerm(BytesRef term, TermStats stats) { }
/// <summary>
/// Inverts only terms starting w/ prefix </summary>
public DocTermOrds(AtomicReader reader, IBits liveDocs, string field, BytesRef termPrefix)
    : this(reader, liveDocs, field, termPrefix, int.MaxValue)
{
}
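A heavily hedged usage sketch (the GetIterator/SortedSetDocValues surface is assumed from Lucene.NET 4.8 and not shown in the sources above; atomicReader and the "tags" field are hypothetical): uninvert a multi-valued field restricted to one prefix and walk the ordinals of a document.

using Lucene.Net.Index;
using Lucene.Net.Util;

// Only terms starting with "sport" are uninverted.
var dto = new DocTermOrds(atomicReader, atomicReader.LiveDocs, "tags", new BytesRef("sport"));
SortedSetDocValues ords = dto.GetIterator(atomicReader);
ords.SetDocument(0);
BytesRef scratch = new BytesRef();
long ord;
while ((ord = ords.NextOrd()) != SortedSetDocValues.NO_MORE_ORDS)
{
    ords.LookupOrd(ord, scratch);
    Console.WriteLine(scratch.Utf8ToString()); // each distinct matching tag on doc 0
}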
public override PostingsConsumer StartTerm(BytesRef term) { return(_postingsWriter.Reset(term)); }
// Look for seek type 3 ("pop"): if the delta from // prev -> current was replacing an S with an E, // we must now seek to beyond that E. this seek // "finishes" the dance at this character // position. private bool DoPop() { if (DEBUG_SURROGATES) { Console.WriteLine(" try pop"); } Debug.Assert(newSuffixStart <= prevTerm.Length); Debug.Assert(newSuffixStart < scratchTerm.Length || newSuffixStart == 0); if (prevTerm.Length > newSuffixStart && IsNonBMPChar(prevTerm.Bytes, newSuffixStart) && IsHighBMPChar(scratchTerm.Bytes, newSuffixStart)) { // Seek type 2 -- put 0xFF at this position: scratchTerm.Bytes[newSuffixStart] = 0xff; scratchTerm.Length = newSuffixStart + 1; if (DEBUG_SURROGATES) { Console.WriteLine(" seek to term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString()) + " " + scratchTerm.ToString()); } // TODO: more efficient seek? can we simply swap // the enums? outerInstance.TermsDict.SeekEnum(termEnum, new Term(fieldInfo.Name, scratchTerm), true); Term t2 = termEnum.Term(); // We could hit EOF or different field since this // was a seek "forward": if (t2 != null && t2.Field == internedFieldName) { if (DEBUG_SURROGATES) { Console.WriteLine(" got term=" + UnicodeUtil.ToHexString(t2.Text()) + " " + t2.Bytes); } BytesRef b2 = t2.Bytes; Debug.Assert(b2.Offset == 0); // Set newSuffixStart -- we can't use // termEnum's since the above seek may have // done no scanning (eg, term was precisely // and index term, or, was in the term seek // cache): scratchTerm.CopyBytes(b2); SetNewSuffixStart(prevTerm, scratchTerm); return(true); } else if (newSuffixStart != 0 || scratchTerm.Length != 0) { if (DEBUG_SURROGATES) { Console.WriteLine(" got term=null (or next field)"); } newSuffixStart = 0; scratchTerm.Length = 0; return(true); } } return(false); }
public override BytesRef Next()
{
    if (DEBUG_SURROGATES)
    {
        Console.WriteLine("TE.next()");
    }
    if (skipNext)
    {
        if (DEBUG_SURROGATES)
        {
            Console.WriteLine("  skipNext=true");
        }
        skipNext = false;
        if (termEnum.Term() == null)
        {
            return null;
        }
        // PreFlex codec interns field names:
        else if (termEnum.Term().Field != internedFieldName)
        {
            return null;
        }
        else
        {
            return current = termEnum.Term().Bytes;
        }
    }

    // TODO: can we use STE's prevBuffer here?
    prevTerm.CopyBytes(termEnum.Term().Bytes);

    if (termEnum.Next() && termEnum.Term().Field == internedFieldName)
    {
        newSuffixStart = termEnum.newSuffixStart;
        if (DEBUG_SURROGATES)
        {
            Console.WriteLine("  newSuffixStart=" + newSuffixStart);
        }
        SurrogateDance();
        Term t = termEnum.Term();
        if (t == null || t.Field != internedFieldName)
        {
            // PreFlex codec interns field names; verify:
            Debug.Assert(t == null || !t.Field.Equals(internedFieldName, StringComparison.Ordinal));
            current = null;
        }
        else
        {
            current = t.Bytes;
        }
        return current;
    }
    else
    {
        // this field is exhausted, but we have to give
        // surrogateDance a chance to seek back:
        if (DEBUG_SURROGATES)
        {
            Console.WriteLine("  force cont");
        }
        //newSuffixStart = prevTerm.length;
        newSuffixStart = 0;
        SurrogateDance();

        Term t = termEnum.Term();
        if (t == null || t.Field != internedFieldName)
        {
            // PreFlex codec interns field names; verify:
            Debug.Assert(t == null || !t.Field.Equals(internedFieldName, StringComparison.Ordinal));
            return null;
        }
        else
        {
            current = t.Bytes;
            return current;
        }
    }
}
public TermFreqValueSource(string field, string val, string indexedField, BytesRef indexedBytes) : base(field, val, indexedField, indexedBytes) { }
public override void LookupOrd(int ord, BytesRef result) { @in.LookupOrd(ord, result); }
private void InitializeInstanceFields() { p = new BytesRef(data, 0, 1); }
public virtual void Test()
{
    int[] ints = new int[7];
    IntsRef input = new IntsRef(ints, 0, ints.Length);
    int seed = Random().Next();

    Directory dir = new MMapDirectory(CreateTempDir("2BFST"));

    for (int doPackIter = 0; doPackIter < 2; doPackIter++)
    {
        bool doPack = doPackIter == 1;

        // Build FST w/ NoOutputs and stop when nodeCount > 2.2B
        if (!doPack)
        {
            Console.WriteLine("\nTEST: 3B nodes; doPack=false output=NO_OUTPUTS");
            Outputs<object> outputs = NoOutputs.Singleton;
            object NO_OUTPUT = outputs.NoOutput;
            Builder<object> b = new Builder<object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, int.MaxValue, outputs, null, doPack, PackedInts.COMPACT, true, 15);

            int count = 0;
            Random r = new Random(seed);
            int[] ints2 = new int[200];
            IntsRef input2 = new IntsRef(ints2, 0, ints2.Length);
            while (true)
            {
                //System.out.println("add: " + input + " -> " + output);
                for (int i = 10; i < ints2.Length; i++)
                {
                    ints2[i] = r.Next(256);
                }
                b.Add(input2, NO_OUTPUT);
                count++;
                if (count % 100000 == 0)
                {
                    Console.WriteLine(count + ": " + b.FstSizeInBytes() + " bytes; " + b.TotStateCount + " nodes");
                }
                if (b.TotStateCount > int.MaxValue + 100L * 1024 * 1024)
                {
                    break;
                }
                NextInput(r, ints2);
            }

            FST<object> fst = b.Finish();

            for (int verify = 0; verify < 2; verify++)
            {
                Console.WriteLine("\nTEST: now verify [fst size=" + fst.SizeInBytes() + "; nodeCount=" + fst.NodeCount + "; arcCount=" + fst.ArcCount + "]");

                Arrays.Fill(ints2, 0);
                r = new Random(seed);

                for (int i = 0; i < count; i++)
                {
                    if (i % 1000000 == 0)
                    {
                        Console.WriteLine(i + "...: ");
                    }
                    for (int j = 10; j < ints2.Length; j++)
                    {
                        ints2[j] = r.Next(256);
                    }
                    Assert.AreEqual(NO_OUTPUT, Util.Get(fst, input2));
                    NextInput(r, ints2);
                }

                Console.WriteLine("\nTEST: enum all input/outputs");
                IntsRefFSTEnum<object> fstEnum = new IntsRefFSTEnum<object>(fst);

                Arrays.Fill(ints2, 0);
                r = new Random(seed);
                int upto = 0;
                while (true)
                {
                    IntsRefFSTEnum<object>.InputOutput<object> pair = fstEnum.Next();
                    if (pair == null)
                    {
                        break;
                    }
                    for (int j = 10; j < ints2.Length; j++)
                    {
                        ints2[j] = r.Next(256);
                    }
                    Assert.AreEqual(input2, pair.Input);
                    Assert.AreEqual(NO_OUTPUT, pair.Output);
                    upto++;
                    NextInput(r, ints2);
                }
                Assert.AreEqual(count, upto);

                if (verify == 0)
                {
                    Console.WriteLine("\nTEST: save/load FST and re-verify");
                    IndexOutput @out = dir.CreateOutput("fst", IOContext.DEFAULT);
                    fst.Save(@out);
                    @out.Dispose();
                    IndexInput @in = dir.OpenInput("fst", IOContext.DEFAULT);
                    fst = new FST<object>(@in, outputs);
                    @in.Dispose();
                }
                else
                {
                    dir.DeleteFile("fst");
                }
            }
        }

        // Build FST w/ ByteSequenceOutputs and stop when FST
        // size = 3GB
        {
            Console.WriteLine("\nTEST: 3 GB size; doPack=" + doPack + " outputs=bytes");
            Outputs<BytesRef> outputs = ByteSequenceOutputs.Singleton;
            Builder<BytesRef> b = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, int.MaxValue, outputs, null, doPack, PackedInts.COMPACT, true, 15);

            var outputBytes = new byte[20];
            BytesRef output = new BytesRef(outputBytes);
            Arrays.Fill(ints, 0);
            int count = 0;
            Random r = new Random(seed);
            while (true)
            {
                r.NextBytes(outputBytes);
                //System.out.println("add: " + input + " -> " + output);
                b.Add(input, BytesRef.DeepCopyOf(output));
                count++;
                if (count % 1000000 == 0)
                {
                    Console.WriteLine(count + "...: " + b.FstSizeInBytes() + " bytes");
                }
                if (b.FstSizeInBytes() > LIMIT)
                {
                    break;
                }
                NextInput(r, ints);
            }

            FST<BytesRef> fst = b.Finish();

            for (int verify = 0; verify < 2; verify++)
            {
                Console.WriteLine("\nTEST: now verify [fst size=" + fst.SizeInBytes() + "; nodeCount=" + fst.NodeCount + "; arcCount=" + fst.ArcCount + "]");

                r = new Random(seed);
                Arrays.Fill(ints, 0);

                for (int i = 0; i < count; i++)
                {
                    if (i % 1000000 == 0)
                    {
                        Console.WriteLine(i + "...: ");
                    }
                    r.NextBytes(outputBytes);
                    Assert.AreEqual(output, Util.Get(fst, input));
                    NextInput(r, ints);
                }

                Console.WriteLine("\nTEST: enum all input/outputs");
                IntsRefFSTEnum<BytesRef> fstEnum = new IntsRefFSTEnum<BytesRef>(fst);

                Arrays.Fill(ints, 0);
                r = new Random(seed);
                int upto = 0;
                while (true)
                {
                    IntsRefFSTEnum<BytesRef>.InputOutput<BytesRef> pair = fstEnum.Next();
                    if (pair == null)
                    {
                        break;
                    }
                    Assert.AreEqual(input, pair.Input);
                    r.NextBytes(outputBytes);
                    Assert.AreEqual(output, pair.Output);
                    upto++;
                    NextInput(r, ints);
                }
                Assert.AreEqual(count, upto);

                if (verify == 0)
                {
                    Console.WriteLine("\nTEST: save/load FST and re-verify");
                    IndexOutput @out = dir.CreateOutput("fst", IOContext.DEFAULT);
                    fst.Save(@out);
                    @out.Dispose();
                    IndexInput @in = dir.OpenInput("fst", IOContext.DEFAULT);
                    fst = new FST<BytesRef>(@in, outputs);
                    @in.Dispose();
                }
                else
                {
                    dir.DeleteFile("fst");
                }
            }
        }

        // Build FST w/ PositiveIntOutputs and stop when FST
        // size = 3GB
        {
            Console.WriteLine("\nTEST: 3 GB size; doPack=" + doPack + " outputs=long");
            Outputs<long?> outputs = PositiveIntOutputs.Singleton;
            Builder<long?> b = new Builder<long?>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, int.MaxValue, outputs, null, doPack, PackedInts.COMPACT, true, 15);

            long output = 1;

            Arrays.Fill(ints, 0);
            int count = 0;
            Random r = new Random(seed);
            while (true)
            {
                //System.out.println("add: " + input + " -> " + output);
                b.Add(input, output);
                output += 1 + r.Next(10);
                count++;
                if (count % 1000000 == 0)
                {
                    Console.WriteLine(count + "...: " + b.FstSizeInBytes() + " bytes");
                }
                if (b.FstSizeInBytes() > LIMIT)
                {
                    break;
                }
                NextInput(r, ints);
            }

            FST<long?> fst = b.Finish();

            for (int verify = 0; verify < 2; verify++)
            {
                Console.WriteLine("\nTEST: now verify [fst size=" + fst.SizeInBytes() + "; nodeCount=" + fst.NodeCount + "; arcCount=" + fst.ArcCount + "]");

                Arrays.Fill(ints, 0);

                output = 1;
                r = new Random(seed);
                for (int i = 0; i < count; i++)
                {
                    if (i % 1000000 == 0)
                    {
                        Console.WriteLine(i + "...: ");
                    }

                    // forward lookup:
                    Assert.AreEqual(output, (long)Util.Get(fst, input));
                    // reverse lookup:
                    Assert.AreEqual(input, Util.GetByOutput(fst, output));
                    output += 1 + r.Next(10);
                    NextInput(r, ints);
                }

                Console.WriteLine("\nTEST: enum all input/outputs");
                IntsRefFSTEnum<long?> fstEnum = new IntsRefFSTEnum<long?>(fst);

                Arrays.Fill(ints, 0);
                r = new Random(seed);
                int upto = 0;
                output = 1;
                while (true)
                {
                    IntsRefFSTEnum<long?>.InputOutput<long?> pair = fstEnum.Next();
                    if (pair == null)
                    {
                        break;
                    }
                    Assert.AreEqual(input, pair.Input);
                    Assert.AreEqual(output, pair.Output.Value);
                    output += 1 + r.Next(10);
                    upto++;
                    NextInput(r, ints);
                }
                Assert.AreEqual(count, upto);

                if (verify == 0)
                {
                    Console.WriteLine("\nTEST: save/load FST and re-verify");
                    IndexOutput @out = dir.CreateOutput("fst", IOContext.DEFAULT);
                    fst.Save(@out);
                    @out.Dispose();
                    IndexInput @in = dir.OpenInput("fst", IOContext.DEFAULT);
                    fst = new FST<long?>(@in, outputs);
                    @in.Dispose();
                }
                else
                {
                    dir.DeleteFile("fst");
                }
            }
        }
    }
    dir.Dispose();
}
public virtual void Test2BOrds()
{
    BaseDirectoryWrapper dir = NewFSDirectory(CreateTempDir("2BOrds"));
    if (dir is MockDirectoryWrapper)
    {
        ((MockDirectoryWrapper)dir).Throttling = MockDirectoryWrapper.Throttling_e.NEVER;
    }

    IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random()))
        .SetMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
        .SetRAMBufferSizeMB(256.0)
        .SetMergeScheduler(new ConcurrentMergeScheduler())
        .SetMergePolicy(NewLogMergePolicy(false, 10))
        .SetOpenMode(IndexWriterConfig.OpenMode_e.CREATE));

    Document doc = new Document();
    sbyte[] bytes = new sbyte[4];
    BytesRef data = new BytesRef(bytes);
    SortedDocValuesField dvField = new SortedDocValuesField("dv", data);
    doc.Add(dvField);

    for (int i = 0; i < int.MaxValue; i++)
    {
        // Pack the counter big-endian so every document gets a distinct,
        // monotonically increasing binary value.
        bytes[0] = (sbyte)(i >> 24);
        bytes[1] = (sbyte)(i >> 16);
        bytes[2] = (sbyte)(i >> 8);
        bytes[3] = (sbyte)i;
        w.AddDocument(doc);
        if (i % 100000 == 0)
        {
            Console.WriteLine("indexed: " + i);
            Console.Out.Flush();
        }
    }

    w.ForceMerge(1);
    w.Dispose();

    Console.WriteLine("verifying...");
    Console.Out.Flush();

    DirectoryReader r = DirectoryReader.Open(dir);
    int counter = 0;
    foreach (AtomicReaderContext context in r.Leaves)
    {
        AtomicReader reader = context.AtomicReader;
        BytesRef scratch = new BytesRef();
        BinaryDocValues dv = reader.GetSortedDocValues("dv");
        for (int i = 0; i < reader.MaxDoc; i++)
        {
            bytes[0] = (sbyte)(counter >> 24);
            bytes[1] = (sbyte)(counter >> 16);
            bytes[2] = (sbyte)(counter >> 8);
            bytes[3] = (sbyte)counter;
            counter++;
            dv.Get(i, scratch);
            Assert.AreEqual(data, scratch);
        }
    }

    r.Dispose();
    dir.Dispose();
}
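The big-endian packing above is what makes verification work: for non-negative values, unsigned byte-wise comparison of the packed bytes matches numeric comparison, so the sorted doc values come back in document order after the merge. A standalone helper showing the same encoding (the name EncodeBigEndian is illustrative, not from the test):

// Packs a non-negative int big-endian so that byte-wise comparison of the
// resulting BytesRef agrees with numeric comparison of the inputs.
private static BytesRef EncodeBigEndian(int i)
{
    var b = new byte[4];
    b[0] = (byte)(i >> 24);
    b[1] = (byte)(i >> 16);
    b[2] = (byte)(i >> 8);
    b[3] = (byte)i;
    return new BytesRef(b);
}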
public override int LookupTerm(BytesRef key)
{
    return (int)@in.LookupTerm(key);
}
public override bool Collect(BytesRef bytes)
{
    float boost = boostAtt.Boost;

    // make sure within a single seg we always collect
    // terms in order
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(CompareToLastTerm(bytes));
    }

    //System.out.println("TTR.collect term=" + bytes.utf8ToString() + " boost=" + boost + " ord=" + readerContext.ord);
    // ignore uncompetitive hits
    if (stQueue.Count == maxSize)
    {
        ScoreTerm t = stQueue.Peek();
        if (boost < t.Boost)
        {
            return true;
        }
        if (boost == t.Boost && termComp.Compare(bytes, t.Bytes) > 0)
        {
            return true;
        }
    }

    TermState state = termsEnum.GetTermState();
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(state != null);
    }

    if (visitedTerms.TryGetValue(bytes, out ScoreTerm t2))
    {
        // if the term is already in the PQ, only update docFreq of term in PQ
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(t2.Boost == boost, "boost should be equal in all segment TermsEnums");
        }
        t2.TermState.Register(state, m_readerContext.Ord, termsEnum.DocFreq, termsEnum.TotalTermFreq);
    }
    else
    {
        // add new entry in PQ, we must clone the term, else it may get overwritten!
        st.Bytes.CopyBytes(bytes);
        st.Boost = boost;
        visitedTerms[st.Bytes] = st;
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(st.TermState.DocFreq == 0);
        }
        st.TermState.Register(state, m_readerContext.Ord, termsEnum.DocFreq, termsEnum.TotalTermFreq);
        stQueue.Add(st);

        // possibly drop entries from queue
        if (stQueue.Count > maxSize)
        {
            st = stQueue.Dequeue();
            visitedTerms.Remove(st.Bytes);
            st.TermState.Clear(); // reset the termstate!
        }
        else
        {
            st = new ScoreTerm(termComp, new TermContext(m_topReaderContext));
        }
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(stQueue.Count <= maxSize, "the PQ size must be limited to maxSize");
        }

        // set maxBoostAtt with values to help FuzzyTermsEnum to optimize
        if (stQueue.Count == maxSize)
        {
            t2 = stQueue.Peek();
            maxBoostAtt.MaxNonCompetitiveBoost = t2.Boost;
            maxBoostAtt.CompetitiveTerm = t2.Bytes;
        }
    }

    return true;
}
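The queue handling in Collect is a standard bounded priority queue: once the queue is full, a candidate only competes with the current weakest entry, and an insertion evicts that entry. A standalone sketch of the same idea, assuming .NET 6+'s System.Collections.Generic.PriorityQueue; the names here are illustrative and this omits the tie-breaking and state reuse the real code performs:

using System.Collections.Generic;

// Min-heap on boost: the head is always the weakest entry, so an
// incoming term only has to beat queue.Peek() to be competitive.
var queue = new PriorityQueue<string, float>();
const int maxSize = 50;

void Offer(string term, float boost)
{
    if (queue.Count == maxSize)
    {
        queue.TryPeek(out _, out float worstBoost);
        if (boost <= worstBoost)
        {
            return; // uncompetitive, skip early (like the Boost check above)
        }
        queue.Dequeue(); // evict the weakest entry
    }
    queue.Enqueue(term, boost);
}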
public override bool BytesVal(int doc, BytesRef target)
{
    target.CopyBytes(outerInstance.m_bytesRef);
    return true;
}
public virtual void Test([ValueSource(typeof(ConcurrentMergeSchedulers), "Values")] IConcurrentMergeScheduler scheduler)
{
    MockDirectoryWrapper dir = new MockDirectoryWrapper(Random(), new MMapDirectory(CreateTempDir("4GBStoredFields")));
    dir.Throttling = MockDirectoryWrapper.Throttling_e.NEVER;

    var config = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random()))
        .SetMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
        .SetRAMBufferSizeMB(256.0)
        .SetMergeScheduler(scheduler)
        .SetMergePolicy(NewLogMergePolicy(false, 10))
        .SetOpenMode(IndexWriterConfig.OpenMode_e.CREATE);
    IndexWriter w = new IndexWriter(dir, config);

    MergePolicy mp = w.Config.MergePolicy;
    if (mp is LogByteSizeMergePolicy)
    {
        // 1 petabyte:
        ((LogByteSizeMergePolicy)mp).MaxMergeMB = 1024 * 1024 * 1024;
    }

    Document doc = new Document();
    FieldType ft = new FieldType();
    ft.Indexed = false;
    ft.Stored = true;
    ft.Freeze();
    int valueLength = RandomInts.NextIntBetween(Random(), 1 << 13, 1 << 20);
    var value = new byte[valueLength];
    for (int i = 0; i < valueLength; ++i)
    {
        // random so that even compressing codecs can't compress it
        value[i] = (byte)Random().Next(256);
    }
    Field f = new Field("fld", value, ft);
    doc.Add(f);

    int numDocs = (int)((1L << 32) / valueLength + 100);
    for (int i = 0; i < numDocs; ++i)
    {
        w.AddDocument(doc);
        if (VERBOSE && i % (numDocs / 10) == 0)
        {
            Console.WriteLine(i + " of " + numDocs + "...");
        }
    }
    w.ForceMerge(1);
    w.Dispose();

    if (VERBOSE)
    {
        bool found = false;
        foreach (string file in dir.ListAll())
        {
            if (file.EndsWith(".fdt"))
            {
                long fileLength = dir.FileLength(file);
                if (fileLength >= 1L << 32)
                {
                    found = true;
                }
                Console.WriteLine("File length of " + file + " : " + fileLength);
            }
        }
        if (!found)
        {
            Console.WriteLine("No .fdt file larger than 4GB, test bug?");
        }
    }

    DirectoryReader rd = DirectoryReader.Open(dir);
    Document sd = rd.Document(numDocs - 1);
    Assert.IsNotNull(sd);
    Assert.AreEqual(1, sd.Fields.Count);
    BytesRef valueRef = sd.GetBinaryValue("fld");
    Assert.IsNotNull(valueRef);
    Assert.AreEqual(new BytesRef(value), valueRef);

    rd.Dispose();
    dir.Dispose();
}
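For a sense of scale: numDocs is chosen as (2^32 / valueLength) + 100, so with a valueLength of 2^16 (65,536 bytes) the test writes 65,636 documents, roughly 4 GB plus 6.5 MB of raw stored bytes. Because the random values are incompressible, the merged .fdt stored-fields file must cross the 4 GB boundary, which is exactly the condition the VERBOSE block checks for.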
/// <summary>
/// Provide spelling corrections based on several parameters.
/// </summary>
/// <param name="term"> The term to suggest spelling corrections for </param>
/// <param name="numSug"> The maximum number of spelling corrections </param>
/// <param name="ir"> The index reader to fetch the candidate spelling corrections from </param>
/// <param name="docfreq"> The minimum document frequency a potential suggestion needs to have in order to be included </param>
/// <param name="editDistance"> The maximum edit distance candidates are allowed to have </param>
/// <param name="accuracy"> The minimum accuracy a suggested spelling correction needs to have in order to be included </param>
/// <param name="spare"> A CharsRef scratch buffer </param>
/// <returns> A collection of spelling corrections sorted by <code>ScoreTerm</code>'s natural order. </returns>
/// <exception cref="System.IO.IOException"> If I/O related errors occur </exception>
protected internal virtual IEnumerable<ScoreTerm> SuggestSimilar(Term term, int numSug, IndexReader ir, int docfreq, int editDistance, float accuracy, CharsRef spare)
{
    var atts = new AttributeSource();
    IMaxNonCompetitiveBoostAttribute maxBoostAtt = atts.AddAttribute<IMaxNonCompetitiveBoostAttribute>();
    Terms terms = MultiFields.GetTerms(ir, term.Field);
    if (terms == null)
    {
        return new List<ScoreTerm>();
    }
    FuzzyTermsEnum e = new FuzzyTermsEnum(terms, atts, term, editDistance, Math.Max(minPrefix, editDistance - 1), true);

    var stQueue = new Support.PriorityQueue<ScoreTerm>();

    BytesRef queryTerm = new BytesRef(term.Text());
    BytesRef candidateTerm;
    ScoreTerm st = new ScoreTerm();
    IBoostAttribute boostAtt = e.Attributes.AddAttribute<IBoostAttribute>();
    while ((candidateTerm = e.Next()) != null)
    {
        float boost = boostAtt.Boost;
        // ignore uncompetitive hits
        if (stQueue.Count >= numSug && boost <= stQueue.Peek().Boost)
        {
            continue;
        }

        // ignore exact match of the same term
        if (queryTerm.BytesEquals(candidateTerm))
        {
            continue;
        }

        int df = e.DocFreq;

        // check docFreq if required
        if (df <= docfreq)
        {
            continue;
        }

        float score;
        string termAsString;
        if (distance == INTERNAL_LEVENSHTEIN)
        {
            // delay creating strings until the end
            termAsString = null;
            // undo FuzzyTermsEnum's scale factor for a real scaled lev score
            score = boost / e.ScaleFactor + e.MinSimilarity;
        }
        else
        {
            UnicodeUtil.UTF8toUTF16(candidateTerm, spare);
            termAsString = spare.ToString();
            score = distance.GetDistance(term.Text(), termAsString);
        }

        if (score < accuracy)
        {
            continue;
        }

        // add new entry in PQ
        st.Term = BytesRef.DeepCopyOf(candidateTerm);
        st.Boost = boost;
        st.Docfreq = df;
        st.TermAsString = termAsString;
        st.Score = score;
        stQueue.Offer(st);
        // possibly drop entries from queue
        st = (stQueue.Count > numSug) ? stQueue.Poll() : new ScoreTerm();
        maxBoostAtt.MaxNonCompetitiveBoost = (stQueue.Count >= numSug) ? stQueue.Peek().Boost : float.NegativeInfinity;
    }

    return stQueue;
}
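Callers normally reach this protected method through the public SuggestSimilar overloads on DirectSpellChecker. A hedged usage sketch, assuming the Lucene.Net.Search.Spell API and an already-open IndexReader named reader; the field name, query term, and SuggestWord member names mirror the Java API and are assumptions here:

using System;
using Lucene.Net.Index;
using Lucene.Net.Search.Spell;

// Ask for up to five corrections of the (misspelled) term "lucen".
// The public overload supplies docfreq/editDistance/accuracy defaults
// before delegating to the protected SuggestSimilar above.
var spellChecker = new DirectSpellChecker();
SuggestWord[] suggestions = spellChecker.SuggestSimilar(new Term("body", "lucen"), 5, reader);
foreach (SuggestWord suggestion in suggestions)
{
    Console.WriteLine(suggestion.String + " freq=" + suggestion.Freq + " score=" + suggestion.Score);
}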