/// <summary>
/// Seeks to the smallest term that is >= <paramref name="text"/>, using the
/// sorted doc values' ordinal lookup instead of scanning terms.
/// </summary>
/// <param name="text">Target term bytes.</param>
/// <returns>FOUND on exact match, NOT_FOUND positioned on the ceiling term, or END.</returns>
public override SeekStatus SeekCeil(BytesRef text)
{
    long ord = values.LookupTerm(text);
    if (ord >= 0)
    {
        // Exact match: LookupTerm returned the term's ordinal directly.
        currentOrd = ord;
        term.Offset = 0;
        // TODO: is there a cleaner way?
        // term.bytes may be pointing to codec-private byte[]
        // storage, so we must force new byte[] allocation:
        term.Bytes = new byte[text.Length];
        term.CopyBytes(text);
        return (SeekStatus.FOUND);
    }
    else
    {
        // Negative result encodes the insertion point: -ord - 1 is the
        // ordinal of the first term greater than the target.
        currentOrd = -ord - 1;
        if (currentOrd == values.ValueCount)
        {
            // Insertion point is past the last term: enum exhausted.
            return (SeekStatus.END);
        }
        else
        {
            // TODO: hmm can we avoid this "extra" lookup?:
            values.LookupOrd(currentOrd, term);
            return (SeekStatus.NOT_FOUND);
        }
    }
}
// Seek type 2 "continue" (back to the start of the
// surrogates): scan the stripped suffix from the
// prior term, backwards. If there was an E in that
// part, then we try to seek back to S. If that
// seek finds a matching term, we go there.
/// <summary>
/// Scans the prior term's stripped suffix backwards looking for a high-BMP
/// byte; when one is found, tries to re-seek the enum before it via
/// SeekToNonBMP.
/// </summary>
/// <returns>true if a seek was actually performed.</returns>
private bool DoContinue()
{
    if (DEBUG_SURROGATES)
    {
        Console.WriteLine("  try cont");
    }

    int downTo = prevTerm.Length - 1;

    bool didSeek = false;

    // Only scan the part of prevTerm beyond the suffix shared with scratchTerm.
    int limit = Math.Min(newSuffixStart, scratchTerm.Length - 1);

    while (downTo > limit)
    {
        if (IsHighBMPChar(prevTerm.Bytes, downTo))
        {
            if (DEBUG_SURROGATES)
            {
                Console.WriteLine("    found E pos=" + downTo + " vs len=" + prevTerm.Length);
            }

            if (SeekToNonBMP(seekTermEnum, prevTerm, downTo))
            {
                // TODO: more efficient seek?
                outerInstance.TermsDict.SeekEnum(termEnum, seekTermEnum.Term(), true);
                //newSuffixStart = downTo+4;
                newSuffixStart = downTo;
                scratchTerm.CopyBytes(termEnum.Term().Bytes);
                didSeek = true;
                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("      seek!");
                }
                break;
            }
            else
            {
                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("      no seek");
                }
            }
        }

        // Shorten prevTerm in place so that we don't redo
        // this loop if we come back here:
        if ((prevTerm.Bytes[downTo] & 0xc0) == 0xc0 || (prevTerm.Bytes[downTo] & 0x80) == 0)
        {
            // downTo sits on a UTF-8 lead byte (or ASCII), so it is a safe
            // truncation point.
            prevTerm.Length = downTo;
        }

        downTo--;
    }

    return (didSeek);
}
/// <summary>
/// Copies the state of <paramref name="term"/> into this instance; a null
/// argument resets this instance instead.
/// </summary>
public void Set(Term term)
{
    if (term is null)
    {
        Reset();
        return;
    }

    // Copy the raw term bytes, then the field bookkeeping.
    Bytes.CopyBytes(term.Bytes);
    CurrentFieldNumber = -1;
    Field = String.Intern(term.Field);
    this.Term = term;
}
/// <summary>
/// Decides whether <paramref name="text"/> becomes an indexed term. The
/// policy is always consulted first (it may track state); the first term
/// per field is indexed regardless of the policy's answer.
/// </summary>
public override bool CheckIndexTerm(BytesRef text, TermStats stats)
{
    // NOTE: we must force the first term per field to be
    // indexed, in case policy doesn't:
    bool shouldIndex = _vgtiw._policy.IsIndexTerm(text, stats) || _first;
    if (!shouldIndex)
    {
        // Not an index term: remember it for later bookkeeping.
        _lastTerm.CopyBytes(text);
        return false;
    }
    _first = false;
    return true;
}
/// <summary>
/// Verifies that the "f" field's terms enumerate in strictly increasing
/// byte order, that every enumerated term is a member of
/// <paramref name="allTerms"/>, and that each seen term can be found again
/// via SeekCeil.
/// </summary>
/// <param name="isTop">When true, also asserts every added term was enumerated.</param>
private void CheckTermsOrder(IndexReader r, ISet<string> allTerms, bool isTop)
{
    TermsEnum terms = MultiFields.GetFields(r).GetTerms("f").GetEnumerator();

    BytesRef last = new BytesRef();
    ISet<string> seenTerms = new JCG.HashSet<string>();

    while (terms.MoveNext())
    {
        BytesRef term = terms.Term;

        // Terms must come back in strictly increasing order.
        Assert.IsTrue(last.CompareTo(term) < 0);
        last.CopyBytes(term);

        string s = term.Utf8ToString();
        Assert.IsTrue(allTerms.Contains(s), "term " + TermDesc(s) + " was not added to index (count=" + allTerms.Count + ")");
        seenTerms.Add(s);
    }

    if (isTop)
    {
        Assert.IsTrue(allTerms.SetEquals(seenTerms));
    }

    // Test seeking: each term we actually saw must be findable exactly.
    IEnumerator<string> it = seenTerms.GetEnumerator();
    while (it.MoveNext())
    {
        BytesRef tr = new BytesRef(it.Current);
        Assert.AreEqual(TermsEnum.SeekStatus.FOUND, terms.SeekCeil(tr), "seek failed for term=" + TermDesc(tr.Utf8ToString()));
    }
}
/// <summary>
/// Copies the value bytes and existence flag from another
/// MutableValueStr into this instance.
/// </summary>
public override void Copy(MutableValue source)
{
    var other = (MutableValueStr)source;
    Value.CopyBytes(other.Value);
    Exists = other.Exists;
}
/// <summary>
/// Returns the next possible term to seek to, or null when no further
/// string can match the automaton.
/// </summary>
protected override BytesRef NextSeekTerm(BytesRef term)
{
    //System.out.println("ATE.nextSeekTerm term=" + term);
    if (term is null)
    {
        Debug.Assert(seekBytesRef.Length == 0);
        // The empty term is a valid candidate when the automaton accepts
        // its initial state.
        if (runAutomaton.IsAccept(runAutomaton.InitialState))
        {
            return seekBytesRef;
        }
    }
    else
    {
        seekBytesRef.CopyBytes(term);
    }

    // Advance seekBytesRef to the next candidate string, if any remain.
    return NextString() ? seekBytesRef : null;
}
/// <summary>
/// Writes a term delta-encoded against the previously written term:
/// shared-prefix length, suffix length, suffix bytes, then field number.
/// </summary>
private void WriteTerm(int fieldNumber, BytesRef term)
{
    //System.out.println("  tiw.write field=" + fieldNumber + " term=" + term.utf8ToString());

    // TODO: UTF16toUTF8 could tell us this prefix
    // Compute the length of the prefix shared with the last term:
    int limit = Math.Min(term.Length, LastTerm.Length);
    int start = 0;
    while (start < limit && term.Bytes[start + term.Offset] == LastTerm.Bytes[start + LastTerm.Offset])
    {
        start++;
    }

    int suffixLength = term.Length - start;
    Output.WriteVInt32(start);         // write shared prefix length
    Output.WriteVInt32(suffixLength);  // write delta length
    Output.WriteBytes(term.Bytes, start + term.Offset, suffixLength); // write delta bytes
    Output.WriteVInt32(fieldNumber);   // write field num
    LastTerm.CopyBytes(term);
}
/// <summary>
/// Builds the final automaton from a list of entries.
/// </summary>
/// <param name="sorter">Supplies the entries in sorted order.</param>
/// <returns>The finished FST, or null when the sorter yielded no entries.</returns>
private FST<object> BuildAutomaton(IBytesRefSorter sorter)
{
    // Build the automaton. No outputs are attached: this is an acceptor.
    Outputs<object> outputs = NoOutputs.Singleton;
    object empty = outputs.NoOutput;
    Builder<object> builder = new Builder<object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, shareMaxTailLength, outputs, null, false, PackedInt32s.DEFAULT, true, 15);

    BytesRef scratch = new BytesRef();
    BytesRef entry;
    Int32sRef scratchIntsRef = new Int32sRef();
    int count = 0;
    IBytesRefIterator iter = sorter.GetIterator();
    while ((entry = iter.Next()) != null)
    {
        count++;
        // Input is sorted, so comparing against the previous entry is
        // sufficient to skip consecutive duplicates.
        if (scratch.CompareTo(entry) != 0)
        {
            builder.Add(Util.Fst.Util.ToInt32sRef(entry, scratchIntsRef), empty);
            scratch.CopyBytes(entry);
        }
    }

    return (count == 0 ? null : builder.Finish());
}
/// <summary>
/// Verifies that CopyBytes onto a BytesRef viewing a sub-range of an
/// array replaces the view's contents with the source bytes.
/// </summary>
public virtual void TestCopyBytes()
{
    sbyte[] backing = { (sbyte)'a', (sbyte)'b', (sbyte)'c', (sbyte)'d' };
    BytesRef target = new BytesRef(backing, 1, 3); // view over "bcd"
    target.CopyBytes(new BytesRef("bcde"));
    Assert.AreEqual("bcde", target.Utf8ToString());
}
/// <summary>
/// Publishes the contents of termBuffer into term, forcing a fresh byte[]
/// so term never aliases the buffer's storage.
/// </summary>
private void SetTerm()
{
    // TODO: is there a cleaner way
    term.Offset = 0;
    term.Bytes = new byte[termBuffer.Length];
    term.CopyBytes(termBuffer);
}
/// <summary>
/// Decides whether <paramref name="text"/> becomes an indexed term; the
/// first term per field is always indexed even if the policy declines.
/// </summary>
public override bool CheckIndexTerm(BytesRef text, TermStats stats)
{
    //System.out.println("VGW: index term=" + text.utf8ToString());
    // The policy is always consulted first so it can observe every term.
    if (!outerInstance.policy.IsIndexTerm(text, stats) && !first)
    {
        // Not an index term: remember it for later bookkeeping.
        lastTerm.CopyBytes(text);
        return false;
    }
    //System.out.println("  YES");
    first = false;
    return true;
}
/// <summary>
/// Indexes every _termIndexInterval'th term, starting with the very first
/// term. Also records the term immediately preceding the next index term
/// so the wasted suffix can be computed.
/// </summary>
public override bool CheckIndexTerm(BytesRef text, TermStats stats)
{
    // First term is first indexed term:
    //System.output.println("FGW: checkIndexTerm text=" + text.utf8ToString());
    if (0 == (_numTerms++ % _fgtiw._termIndexInterval))
    {
        return (true);
    }

    // save last term just before next index term so we
    // can compute wasted suffix
    // (_numTerms was already incremented above, so this fires on the call
    // right before the next index term)
    if (0 == _numTerms % _fgtiw._termIndexInterval)
    {
        _lastTerm.CopyBytes(text);
    }

    return (false);
}
/// <summary>
/// Randomized test: interleaves deletes into the queue with two
/// independently updated slices, checking each slice sees exactly the
/// deletes added since it last caught up, and that the frozen global
/// buffer holds every unique term.
/// (NOTE(review): "Delte" typo kept — renaming would change the public test name.)
/// </summary>
public virtual void TestUpdateDelteSlices()
{
    DocumentsWriterDeleteQueue queue = new DocumentsWriterDeleteQueue();
    int size = 200 + Random.Next(500) * RANDOM_MULTIPLIER;
    int?[] ids = new int?[size];
    for (int i = 0; i < ids.Length; i++)
    {
        ids[i] = Random.Next();
    }
    DeleteSlice slice1 = queue.NewSlice();
    DeleteSlice slice2 = queue.NewSlice();
    BufferedUpdates bd1 = new BufferedUpdates();
    BufferedUpdates bd2 = new BufferedUpdates();
    int last1 = 0;
    int last2 = 0;
    ISet<Term> uniqueValues = new JCG.HashSet<Term>();
    for (int j = 0; j < ids.Length; j++)
    {
        int? i = ids[j];
        // create an array here since we compare identity below against tailItem
        Term[] term = new Term[] { new Term("id", i.ToString()) };
        uniqueValues.Add(term[0]);
        queue.AddDelete(term);
        // Randomly (and always on the last element) catch slice1 up:
        if (Random.Next(20) == 0 || j == ids.Length - 1)
        {
            queue.UpdateSlice(slice1);
            Assert.IsTrue(slice1.IsTailItem(term));
            slice1.Apply(bd1, j);
            AssertAllBetween(last1, j, bd1, ids);
            last1 = j + 1;
        }
        // Independently catch slice2 up at a different random rate:
        if (Random.Next(10) == 5 || j == ids.Length - 1)
        {
            queue.UpdateSlice(slice2);
            Assert.IsTrue(slice2.IsTailItem(term));
            slice2.Apply(bd2, j);
            AssertAllBetween(last2, j, bd2, ids);
            last2 = j + 1;
        }
        Assert.AreEqual(j + 1, queue.NumGlobalTermDeletes);
    }
    assertEquals(uniqueValues, new JCG.HashSet<Term>(bd1.terms.Keys));
    assertEquals(uniqueValues, new JCG.HashSet<Term>(bd2.terms.Keys));
    var frozenSet = new JCG.HashSet<Term>();
    foreach (Term t in queue.FreezeGlobalBuffer(null).GetTermsEnumerable())
    {
        // Copy the bytes into a fresh BytesRef — presumably the enumerator
        // reuses its buffer; verify against GetTermsEnumerable if changing.
        BytesRef bytesRef = new BytesRef();
        bytesRef.CopyBytes(t.Bytes);
        frozenSet.Add(new Term(t.Field, bytesRef));
    }
    assertEquals(uniqueValues, frozenSet);
    Assert.AreEqual(0, queue.NumGlobalTermDeletes, "num deletes must be 0 after freeze");
}
// Currently used only by assert statement
/// <summary>
/// Compares the previously written (field, term) against the incoming one:
/// first by field name (ordinal), then by term text in UTF-16 code-unit
/// order (the legacy index sort order). Negative result means the previous
/// entry sorts first, as required.
/// </summary>
private int CompareToLastTerm(int fieldNumber, BytesRef term)
{
    if (lastFieldNumber != fieldNumber)
    {
        int cmp = FieldName(fieldInfos, lastFieldNumber).CompareToOrdinal(FieldName(fieldInfos, fieldNumber));
        // If there is a field named "" (empty string) then we
        // will get 0 on this comparison, yet, it's "OK". But
        // it's not OK if two different field numbers map to
        // the same name.
        if (cmp != 0 || lastFieldNumber != -1)
        {
            return (cmp);
        }
    }

    scratchBytes.CopyBytes(term);
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(lastTerm.Offset == 0);
    }
    // Convert both terms to UTF-16 so the comparison happens in code-unit
    // order rather than raw UTF-8 byte order.
    UnicodeUtil.UTF8toUTF16(lastTerm.Bytes, 0, lastTerm.Length, utf16Result1);
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(scratchBytes.Offset == 0);
    }
    UnicodeUtil.UTF8toUTF16(scratchBytes.Bytes, 0, scratchBytes.Length, utf16Result2);

    int len;
    if (utf16Result1.Length < utf16Result2.Length)
    {
        len = utf16Result1.Length;
    }
    else
    {
        len = utf16Result2.Length;
    }

    for (int i = 0; i < len; i++)
    {
        char ch1 = utf16Result1.Chars[i];
        char ch2 = utf16Result2.Chars[i];
        if (ch1 != ch2)
        {
            return (ch1 - ch2);
        }
    }

    if (utf16Result1.Length == 0 && lastFieldNumber == -1)
    {
        // If there is a field named "" (empty string) with a term text of "" (empty string) then we
        // will get 0 on this comparison, yet, it's "OK".
        return (-1);
    }
    return (utf16Result1.Length - utf16Result2.Length);
}
/// <summary>
/// Reads all term vectors for the current field into TermAndPostings,
/// decoding prefix-compressed term bytes and, when stored, delta-encoded
/// positions and offsets.
/// </summary>
internal virtual void ReadVectors()
{
    TermAndPostings = new TermAndPostings[NumTerms];
    BytesRef lastTerm = new BytesRef();
    for (int i = 0; i < NumTerms; i++)
    {
        TermAndPostings t = new TermAndPostings();
        BytesRef term = new BytesRef();
        // Start from the previous term's bytes, then overwrite the suffix:
        term.CopyBytes(lastTerm);
        int start = Tvf.ReadVInt();    // prefix length shared with last term
        int deltaLen = Tvf.ReadVInt(); // length of the new suffix
        term.Length = start + deltaLen;
        term.Grow(term.Length);
        Tvf.ReadBytes(term.Bytes, start, deltaLen);
        t.Term = term;
        int freq = Tvf.ReadVInt();
        t.Freq = freq;

        if (StorePositions)
        {
            int[] positions = new int[freq];
            int pos = 0;
            for (int posUpto = 0; posUpto < freq; posUpto++)
            {
                // Positions are delta-encoded.
                int delta = Tvf.ReadVInt();
                if (delta == -1)
                {
                    delta = 0; // LUCENE-1542 correction
                }
                pos += delta;
                positions[posUpto] = pos;
            }
            t.Positions = positions;
        }

        if (StoreOffsets)
        {
            int[] startOffsets = new int[freq];
            int[] endOffsets = new int[freq];
            int offset = 0;
            for (int posUpto = 0; posUpto < freq; posUpto++)
            {
                // Start offset is a delta from the previous end offset;
                // end offset is a delta from this start offset.
                startOffsets[posUpto] = offset + Tvf.ReadVInt();
                offset = endOffsets[posUpto] = startOffsets[posUpto] + Tvf.ReadVInt();
            }
            t.StartOffsets = startOffsets;
            t.EndOffsets = endOffsets;
        }
        lastTerm.CopyBytes(term);
        TermAndPostings[i] = t;
    }
}
/// <summary>
/// Stress test: several threads concurrently push deletes into the queue;
/// afterwards each thread's slice — and the frozen global buffer — must
/// contain exactly the set of unique terms that were added.
/// </summary>
public virtual void TestStressDeleteQueue()
{
    DocumentsWriterDeleteQueue queue = new DocumentsWriterDeleteQueue();
    ISet<Term> uniqueValues = new JCG.HashSet<Term>();
    int size = 10000 + Random.Next(500) * RANDOM_MULTIPLIER;
    int?[] ids = new int?[size];
    for (int i = 0; i < ids.Length; i++)
    {
        ids[i] = Random.Next();
        uniqueValues.Add(new Term("id", ids[i].ToString()));
    }
    // Latch releases all worker threads at once to maximize contention.
    CountdownEvent latch = new CountdownEvent(1);
    AtomicInt32 index = new AtomicInt32(0);
    int numThreads = 2 + Random.Next(5);
    UpdateThread[] threads = new UpdateThread[numThreads];
    for (int i = 0; i < threads.Length; i++)
    {
        threads[i] = new UpdateThread(queue, index, ids, latch);
        threads[i].Start();
    }
    latch.Signal();
    for (int i = 0; i < threads.Length; i++)
    {
        threads[i].Join();
    }

    foreach (UpdateThread updateThread in threads)
    {
        DeleteSlice slice = updateThread.Slice;
        queue.UpdateSlice(slice);
        BufferedUpdates deletes = updateThread.Deletes;
        slice.Apply(deletes, BufferedUpdates.MAX_INT32);
        assertEquals(uniqueValues, new JCG.HashSet<Term>(deletes.terms.Keys));
    }
    queue.TryApplyGlobalSlice();
    ISet<Term> frozenSet = new JCG.HashSet<Term>();
    foreach (Term t in queue.FreezeGlobalBuffer(null).GetTermsEnumerable())
    {
        // Copy bytes into a fresh BytesRef — presumably the enumerator
        // reuses its buffer; verify against GetTermsEnumerable if changing.
        BytesRef bytesRef = new BytesRef();
        bytesRef.CopyBytes(t.Bytes);
        frozenSet.Add(new Term(t.Field, bytesRef));
    }
    Assert.AreEqual(0, queue.NumGlobalTermDeletes, "num deletes must be 0 after freeze");
    Assert.AreEqual(uniqueValues.Count, frozenSet.Count);
    assertEquals(uniqueValues, frozenSet);
}
// single straight enum
/// <summary>
/// Enumerates every field/term in the reader in order, checking each
/// against the expected list, asserting strictly ascending term order and
/// the total unique-term count.
/// </summary>
private void DoTestStraightEnum(IList<Term> fieldTerms, IndexReader reader, int uniqueTermCount)
{
    if (Verbose)
    {
        Console.WriteLine("\nTEST: top now enum reader=" + reader);
    }
    Fields fields = MultiFields.GetFields(reader);
    {
        // Test straight enum:
        int termCount = 0;
        foreach (string field in fields)
        {
            Terms terms = fields.GetTerms(field);
            Assert.IsNotNull(terms);
            TermsEnum termsEnum = terms.GetEnumerator();
            BytesRef text;
            BytesRef lastText = null;
            while (termsEnum.MoveNext())
            {
                text = termsEnum.Term;
                Term exp = fieldTerms[termCount];
                if (Verbose)
                {
                    Console.WriteLine("  got term=" + field + ":" + UnicodeUtil.ToHexString(text.Utf8ToString()));
                    Console.WriteLine("  exp=" + exp.Field + ":" + UnicodeUtil.ToHexString(exp.Text));
                    Console.WriteLine();
                }
                if (lastText == null)
                {
                    // First term for this field: take a private copy.
                    lastText = BytesRef.DeepCopyOf(text);
                }
                else
                {
                    // Every subsequent term must sort strictly after the last.
                    Assert.IsTrue(lastText.CompareTo(text) < 0);
                    lastText.CopyBytes(text);
                }
                Assert.AreEqual(exp.Field, field);
                Assert.AreEqual(exp.Bytes, text);
                termCount++;
            }
            if (Verbose)
            {
                Console.WriteLine("  no more terms for field=" + field);
            }
        }
        Assert.AreEqual(uniqueTermCount, termCount);
    }
}
/// <summary>
/// Advances to the next frequent term, copies it into the reusable spare
/// and returns it; returns null when the enum is absent or exhausted.
/// </summary>
public BytesRef Next()
{
    if (termsEnum == null)
    {
        return null;
    }

    for (BytesRef candidate = termsEnum.Next(); candidate != null; candidate = termsEnum.Next())
    {
        if (IsFrequent(termsEnum.DocFreq()))
        {
            freq = termsEnum.DocFreq();
            spare.CopyBytes(candidate);
            return spare;
        }
    }
    return null;
}
/// <summary>
/// Asserts terms arrive in strictly increasing order and records the
/// latest one; always returns true so it can be used inside an assert.
/// </summary>
private bool CompareToLastTerm(BytesRef t)
{
    if (t == null)
    {
        // End of stream: clear the tracked term.
        lastTerm = null;
    }
    else if (lastTerm == null)
    {
        // First term seen: take a private copy.
        lastTerm = BytesRef.DeepCopyOf(t);
    }
    else
    {
        Debug.Assert(termsEnum.Comparer.Compare(lastTerm, t) < 0, "lastTerm=" + lastTerm + " t=" + t);
        lastTerm.CopyBytes(t);
    }
    return true;
}
/// <summary>
/// Moves to the next frequent term; copies it into the reusable spare,
/// exposes it via current, and reports whether a term was found.
/// </summary>
public bool MoveNext()
{
    if (termsEnum != null)
    {
        while (termsEnum.MoveNext())
        {
            if (!IsFrequent(termsEnum.DocFreq))
            {
                continue;
            }
            freq = termsEnum.DocFreq;
            spare.CopyBytes(termsEnum.Term);
            current = spare;
            return true;
        }
    }
    current = null;
    return false;
}
/// <summary>
/// Asserts terms arrive in strictly increasing order and records the
/// latest one; always returns true so it can be used inside an assert.
/// </summary>
private bool CompareToLastTerm(BytesRef t)
{
    if (t is null)
    {
        // End of stream: clear the tracked term.
        lastTerm = null;
    }
    else if (lastTerm is null)
    {
        // First term seen: take a private copy.
        lastTerm = BytesRef.DeepCopyOf(t);
    }
    else
    {
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(termsEnum.Comparer.Compare(lastTerm, t) < 0, "lastTerm={0} t={1}", lastTerm, t);
        }
        lastTerm.CopyBytes(t);
    }
    return true;
}
/// <summary>
/// Returns the next term. The underlying cursor was already advanced once
/// at construction, so the first call re-uses that element rather than
/// moving again.
/// </summary>
public BytesRef Next()
{
    // LUCENENET NOTE: We moved the cursor when
    // the instance was created. Make sure we don't
    // move it again until the second call to Next().
    bool reuseInitialElement = first && current != null;
    if (reuseInitialElement)
    {
        first = false;
    }
    else
    {
        if (!i.MoveNext())
        {
            return null;
        }
        current = i.Current;
    }
    spare.CopyBytes(current.term);
    return spare;
}
/// <summary>
/// Writes a term header (shared-prefix length, suffix length, suffix
/// bytes, frequency) and resets per-term position/offset state.
/// </summary>
public override void StartTerm(BytesRef term, int freq)
{
    // Prefix-compress against the previously written term.
    int prefix = StringHelper.BytesDifference(lastTerm, term);
    int suffixLen = term.Length - prefix;
    tvf.WriteVInt32(prefix);
    tvf.WriteVInt32(suffixLen);
    tvf.WriteBytes(term.Bytes, term.Offset + prefix, suffixLen);
    tvf.WriteVInt32(freq);
    lastTerm.CopyBytes(term);
    lastPosition = lastOffset = 0;

    if (positions && offsets)
    {
        // we might need to buffer if its a non-bulk merge
        offsetStartBuffer = ArrayUtil.Grow(offsetStartBuffer, freq);
        offsetEndBuffer = ArrayUtil.Grow(offsetEndBuffer, freq);
        offsetIndex = 0;
        offsetFreq = freq;
    }
}
/// <summary>
/// Writes a term header (shared-prefix length, suffix length, suffix
/// bytes, frequency) and resets buffered per-term state.
/// </summary>
public override void StartTerm(BytesRef term, int freq)
{
    // Prefix-compress against the previously written term.
    int prefix = StringHelper.BytesDifference(LastTerm, term);
    int suffix = term.Length - prefix;
    Tvf.WriteVInt(prefix);
    Tvf.WriteVInt(suffix);
    Tvf.WriteBytes(term.Bytes, term.Offset + prefix, suffix);
    Tvf.WriteVInt(freq);
    LastTerm.CopyBytes(term);
    LastPosition = LastOffset = 0;

    if (Offsets && Positions)
    {
        // we might need to buffer if its a non-bulk merge
        OffsetStartBuffer = ArrayUtil.Grow(OffsetStartBuffer, freq);
        OffsetEndBuffer = ArrayUtil.Grow(OffsetEndBuffer, freq);
    }
    BufferedIndex = 0;
    BufferedFreq = freq;
    PayloadData.Length = 0;
}
/// <summary>
/// Copies the enclosing instance's bytesRef value into
/// <paramref name="target"/>; always reports that a value exists.
/// The <paramref name="doc"/> parameter is not used — the same value is
/// produced for every document.
/// </summary>
public override bool BytesVal(int doc, BytesRef target)
{
    target.CopyBytes(outerInstance.bytesRef);
    return true;
}
/// <summary>
/// Decodes and returns the next term in this vector field, or null when
/// all terms are exhausted. Also decodes the term's positions, payloads
/// and offsets when those were stored.
/// </summary>
public override BytesRef Next()
{
    if (nextTerm >= numTerms)
    {
        return (null);
    }
    // Start from the previous term's bytes, then overwrite the suffix:
    term.CopyBytes(lastTerm);
    int start = tvf.ReadVInt32();    // prefix length shared with last term
    int deltaLen = tvf.ReadVInt32(); // length of the new suffix
    term.Length = start + deltaLen;
    term.Grow(term.Length);
    tvf.ReadBytes(term.Bytes, start, deltaLen);
    freq = tvf.ReadVInt32();

    if (storePayloads)
    {
        positions = new int[freq];
        payloadOffsets = new int[freq];
        int totalPayloadLength = 0;
        int pos = 0;
        for (int posUpto = 0; posUpto < freq; posUpto++)
        {
            // Low bit of the code flags a payload-length change; the
            // remaining bits are the position delta.
            int code = tvf.ReadVInt32();
            pos += (int)((uint)code >> 1);
            positions[posUpto] = pos;
            if ((code & 1) != 0)
            {
                // length change
                lastPayloadLength = tvf.ReadVInt32();
            }
            payloadOffsets[posUpto] = totalPayloadLength;
            totalPayloadLength += lastPayloadLength;
            Debug.Assert(totalPayloadLength >= 0);
        }
        payloadData = new byte[totalPayloadLength];
        tvf.ReadBytes(payloadData, 0, payloadData.Length);
    } // no payloads
    else if (storePositions)
    {
        // TODO: we could maybe reuse last array, if we can
        // somehow be careful about consumer never using two
        // D&PEnums at once...
        positions = new int[freq];
        int pos = 0;
        for (int posUpto = 0; posUpto < freq; posUpto++)
        {
            // Positions are delta-encoded.
            pos += tvf.ReadVInt32();
            positions[posUpto] = pos;
        }
    }

    if (storeOffsets)
    {
        startOffsets = new int[freq];
        endOffsets = new int[freq];
        int offset = 0;
        for (int posUpto = 0; posUpto < freq; posUpto++)
        {
            // Start offset is a delta from the previous end offset; end
            // offset is a delta from this start offset.
            startOffsets[posUpto] = offset + tvf.ReadVInt32();
            offset = endOffsets[posUpto] = startOffsets[posUpto] + tvf.ReadVInt32();
        }
    }

    lastTerm.CopyBytes(term);
    nextTerm++;
    return (term);
}
/// <summary>
/// Seeks to the smallest term >= <paramref name="target"/>. First tries to
/// avoid a terms-index seek (when the target falls between the current
/// term and the next indexed term), then scans within the block: the
/// block's common prefix is compared first, then each term's suffix.
/// </summary>
/// <remarks>
/// TODO: we may want an alternate mode here which is
/// "if you are about to return NOT_FOUND I won't use
/// the terms data from that"; eg FuzzyTermsEnum will
/// (usually) just immediately call seek again if we
/// return NOT_FOUND so it's a waste for us to fill in
/// the term that was actually NOT_FOUND
/// </remarks>
public override SeekStatus SeekCeil(BytesRef target)
{
    if (indexEnum == null)
    {
        throw new InvalidOperationException("terms index was not loaded");
    }

    //System.out.println("BTR.seek seg=" + segment + " target=" + fieldInfo.name + ":" + target.utf8ToString() + " " + target + " current=" + term().utf8ToString() + " " + term() + " indexIsCurrent=" + indexIsCurrent + " didIndexNext=" + didIndexNext + " seekPending=" + seekPending + " divisor=" + indexReader.getDivisor() + " this=" + this);
    if (didIndexNext)
    {
        if (nextIndexTerm == null)
        {
            //System.out.println("  nextIndexTerm=null");
        }
        else
        {
            //System.out.println("  nextIndexTerm=" + nextIndexTerm.utf8ToString());
        }
    }

    bool doSeek = true;

    // See if we can avoid seeking, because target term
    // is after current term but before next index term:
    if (indexIsCurrent)
    {
        int cmp = BytesRef.UTF8SortedAsUnicodeComparer.Compare(term, target);

        if (cmp == 0)
        {
            // Already at the requested term
            return (SeekStatus.FOUND);
        }
        else if (cmp < 0)
        {
            // Target term is after current term
            if (!didIndexNext)
            {
                if (indexEnum.Next() == -1)
                {
                    nextIndexTerm = null;
                }
                else
                {
                    nextIndexTerm = indexEnum.Term;
                }
                //System.out.println("  now do index next() nextIndexTerm=" + (nextIndexTerm == null ? "null" : nextIndexTerm.utf8ToString()));
                didIndexNext = true;
            }

            if (nextIndexTerm == null || BytesRef.UTF8SortedAsUnicodeComparer.Compare(target, nextIndexTerm) < 0)
            {
                // Optimization: requested term is within the
                // same term block we are now in; skip seeking
                // (but do scanning):
                doSeek = false;
                //System.out.println("  skip seek: nextIndexTerm=" + (nextIndexTerm == null ? "null" : nextIndexTerm.utf8ToString()));
            }
        }
    }

    if (doSeek)
    {
        //System.out.println("  seek");

        // Ask terms index to find biggest indexed term (=
        // first term in a block) that's <= our text:
        input.Seek(indexEnum.Seek(target));
        bool result = NextBlock();

        // Block must exist since, at least, the indexed term
        // is in the block:
        Debug.Assert(result);

        indexIsCurrent = true;
        didIndexNext = false;
        blocksSinceSeek = 0;

        if (doOrd)
        {
            state.Ord = indexEnum.Ord - 1;
        }

        term.CopyBytes(indexEnum.Term);
        //System.out.println("  seek: term=" + term.utf8ToString());
    }
    else
    {
        //System.out.println("  skip seek");
        if (state.TermBlockOrd == blockTermCount && !NextBlock())
        {
            indexIsCurrent = false;
            return (SeekStatus.END);
        }
    }

    seekPending = false;

    int common = 0;

    // Scan within block. We could do this by calling
    // _next() and testing the resulting term, but this
    // is wasteful. Instead, we first confirm the
    // target matches the common prefix of this block,
    // and then we scan the term bytes directly from the
    // termSuffixesreader's byte[], saving a copy into
    // the BytesRef term per term. Only when we return
    // do we then copy the bytes into the term.
    while (true)
    {
        // First, see if target term matches common prefix
        // in this block:
        if (common < termBlockPrefix)
        {
            int cmp = (term.Bytes[common] & 0xFF) - (target.Bytes[target.Offset + common] & 0xFF);
            if (cmp < 0)
            {
                // TODO: maybe we should store common prefix
                // in block header? (instead of relying on
                // last term of previous block)

                // Target's prefix is after the common block
                // prefix, so term cannot be in this block
                // but it could be in next block. We
                // must scan to end-of-block to set common
                // prefix for next block:
                if (state.TermBlockOrd < blockTermCount)
                {
                    while (state.TermBlockOrd < blockTermCount - 1)
                    {
                        state.TermBlockOrd++;
                        state.Ord++;
                        termSuffixesReader.SkipBytes(termSuffixesReader.ReadVInt32());
                    }
                    int suffix = termSuffixesReader.ReadVInt32();
                    term.Length = termBlockPrefix + suffix;
                    if (term.Bytes.Length < term.Length)
                    {
                        term.Grow(term.Length);
                    }
                    termSuffixesReader.ReadBytes(term.Bytes, termBlockPrefix, suffix);
                }
                state.Ord++;

                if (!NextBlock())
                {
                    indexIsCurrent = false;
                    return (SeekStatus.END);
                }
                common = 0;
            }
            else if (cmp > 0)
            {
                // Target's prefix is before the common prefix
                // of this block, so we position to start of
                // block and return NOT_FOUND:
                Debug.Assert(state.TermBlockOrd == 0);

                int suffix = termSuffixesReader.ReadVInt32();
                term.Length = termBlockPrefix + suffix;
                if (term.Bytes.Length < term.Length)
                {
                    term.Grow(term.Length);
                }
                termSuffixesReader.ReadBytes(term.Bytes, termBlockPrefix, suffix);
                return (SeekStatus.NOT_FOUND);
            }
            else
            {
                common++;
            }

            continue;
        }

        // Test every term in this block
        while (true)
        {
            state.TermBlockOrd++;
            state.Ord++;

            int suffix = termSuffixesReader.ReadVInt32();

            // We know the prefix matches, so just compare the new suffix:
            int termLen = termBlockPrefix + suffix;
            int bytePos = termSuffixesReader.Position;

            bool next = false;
            int limit = target.Offset + (termLen < target.Length ? termLen : target.Length);
            int targetPos = target.Offset + termBlockPrefix;
            while (targetPos < limit)
            {
                int cmp = (termSuffixes[bytePos++] & 0xFF) - (target.Bytes[targetPos++] & 0xFF);
                if (cmp < 0)
                {
                    // Current term is still before the target;
                    // keep scanning
                    next = true;
                    break;
                }
                else if (cmp > 0)
                {
                    // Done! Current term is after target. Stop
                    // here, fill in real term, return NOT_FOUND.
                    term.Length = termBlockPrefix + suffix;
                    if (term.Bytes.Length < term.Length)
                    {
                        term.Grow(term.Length);
                    }
                    termSuffixesReader.ReadBytes(term.Bytes, termBlockPrefix, suffix);
                    //System.out.println("  NOT_FOUND");
                    return (SeekStatus.NOT_FOUND);
                }
            }

            if (!next && target.Length <= termLen)
            {
                term.Length = termBlockPrefix + suffix;
                if (term.Bytes.Length < term.Length)
                {
                    term.Grow(term.Length);
                }
                termSuffixesReader.ReadBytes(term.Bytes, termBlockPrefix, suffix);

                if (target.Length == termLen)
                {
                    // Done! Exact match. Stop here, fill in
                    // real term, return FOUND.
                    //System.out.println("  FOUND");
                    return (SeekStatus.FOUND);
                }
                else
                {
                    //System.out.println("  NOT_FOUND");
                    return (SeekStatus.NOT_FOUND);
                }
            }

            if (state.TermBlockOrd == blockTermCount)
            {
                // Must pre-fill term for next block's common prefix
                term.Length = termBlockPrefix + suffix;
                if (term.Bytes.Length < term.Length)
                {
                    term.Grow(term.Length);
                }
                termSuffixesReader.ReadBytes(term.Bytes, termBlockPrefix, suffix);
                break;
            }
            else
            {
                termSuffixesReader.SkipBytes(suffix);
            }
        }

        // The purpose of the terms dict index is to seek
        // the enum to the closest index term before the
        // term we are looking for. So, we should never
        // cross another index term (besides the first
        // one) while we are scanning:
        Debug.Assert(indexIsCurrent);

        if (!NextBlock())
        {
            //System.out.println("  END");
            indexIsCurrent = false;
            return (SeekStatus.END);
        }
        common = 0;
    }
}
/// <summary>
/// Builds the suggester: analyzes each input into its finite strings,
/// writes (analyzed form, weight, surface form[, payload]) records to a
/// temp file, offline-sorts them, and constructs the FST. Temp files are
/// always deleted; readers/writers are disposed even on failure.
/// </summary>
/// <exception cref="ArgumentException">
/// Contexts are supplied, an analyzed/surface form exceeds the encodable
/// length, or a surface form contains the reserved U+001F separator.
/// </exception>
public override void Build(IInputEnumerator enumerator)
{
    if (enumerator.HasContexts)
    {
        throw new ArgumentException("this suggester doesn't support contexts");
    }
    string prefix = this.GetType().Name;
    var directory = OfflineSorter.DefaultTempDir();
    var tempInput = FileSupport.CreateTempFile(prefix, ".input", directory);
    var tempSorted = FileSupport.CreateTempFile(prefix, ".sorted", directory);

    hasPayloads = enumerator.HasPayloads;

    var writer = new OfflineSorter.ByteSequencesWriter(tempInput);
    OfflineSorter.ByteSequencesReader reader = null;
    var scratch = new BytesRef();

    TokenStreamToAutomaton ts2a = GetTokenStreamToAutomaton();

    bool success = false;
    count = 0;
    byte[] buffer = new byte[8];
    try
    {
        var output = new ByteArrayDataOutput(buffer);
        BytesRef surfaceForm;

        while (enumerator.MoveNext())
        {
            surfaceForm = enumerator.Current;
            ISet<Int32sRef> paths = ToFiniteStrings(surfaceForm, ts2a);

            maxAnalyzedPathsForOneInput = Math.Max(maxAnalyzedPathsForOneInput, paths.Count);

            foreach (Int32sRef path in paths)
            {
                Util.Fst.Util.ToBytesRef(path, scratch);

                // length of the analyzed text (FST input)
                if (scratch.Length > ushort.MaxValue - 2)
                {
                    throw new ArgumentException("cannot handle analyzed forms > " + (ushort.MaxValue - 2) + " in length (got " + scratch.Length + ")");
                }
                ushort analyzedLength = (ushort)scratch.Length;

                // compute the required length:
                // analyzed sequence + weight (4) + surface + analyzedLength (short)
                int requiredLength = analyzedLength + 4 + surfaceForm.Length + 2;

                BytesRef payload;

                if (hasPayloads)
                {
                    if (surfaceForm.Length > (ushort.MaxValue - 2))
                    {
                        throw new ArgumentException("cannot handle surface form > " + (ushort.MaxValue - 2) + " in length (got " + surfaceForm.Length + ")");
                    }
                    payload = enumerator.Payload;
                    // payload + surfaceLength (short)
                    requiredLength += payload.Length + 2;
                }
                else
                {
                    payload = null;
                }

                buffer = ArrayUtil.Grow(buffer, requiredLength);

                output.Reset(buffer);

                output.WriteInt16((short)analyzedLength);

                output.WriteBytes(scratch.Bytes, scratch.Offset, scratch.Length);

                output.WriteInt32(EncodeWeight(enumerator.Weight));

                if (hasPayloads)
                {
                    // The separator byte is reserved so surface form and
                    // payload can be split apart again at lookup time.
                    for (int i = 0; i < surfaceForm.Length; i++)
                    {
                        if (surfaceForm.Bytes[i] == PAYLOAD_SEP)
                        {
                            throw new ArgumentException(
                                "surface form cannot contain unit separator character U+001F; this character is reserved");
                        }
                    }
                    output.WriteInt16((short)surfaceForm.Length);
                    output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
                    output.WriteBytes(payload.Bytes, payload.Offset, payload.Length);
                }
                else
                {
                    output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
                }

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(output.Position == requiredLength, () => output.Position + " vs " + requiredLength);
                }

                writer.Write(buffer, 0, output.Position);
            }
            count++;
        }
        writer.Dispose();

        // Sort all input/output pairs (required by FST.Builder):
        (new OfflineSorter(new AnalyzingComparer(hasPayloads))).Sort(tempInput, tempSorted);

        // Free disk space:
        tempInput.Delete();

        reader = new OfflineSorter.ByteSequencesReader(tempSorted);

        var outputs = new PairOutputs<long?, BytesRef>(PositiveInt32Outputs.Singleton, ByteSequenceOutputs.Singleton);
        var builder = new Builder<PairOutputs<long?, BytesRef>.Pair>(FST.INPUT_TYPE.BYTE1, outputs);

        // Build FST:
        BytesRef previousAnalyzed = null;
        BytesRef analyzed = new BytesRef();
        BytesRef surface = new BytesRef();
        Int32sRef scratchInts = new Int32sRef();
        var input = new ByteArrayDataInput();

        // Used to remove duplicate surface forms (but we
        // still index the highest-weight one). We clear
        // this when we see a new analyzed form, so it cannot
        // grow unbounded (at most 256 entries):
        var seenSurfaceForms = new JCG.HashSet<BytesRef>();

        var dedup = 0;
        while (reader.Read(scratch))
        {
            // Decode one sorted record: analyzedLength, analyzed bytes,
            // weight, then surface form (and payload when present).
            input.Reset(scratch.Bytes, scratch.Offset, scratch.Length);
            ushort analyzedLength = (ushort)input.ReadInt16();
            analyzed.Grow(analyzedLength + 2);
            input.ReadBytes(analyzed.Bytes, 0, analyzedLength);
            analyzed.Length = analyzedLength;

            long cost = input.ReadInt32();

            surface.Bytes = scratch.Bytes;
            if (hasPayloads)
            {
                surface.Length = (ushort)input.ReadInt16();
                surface.Offset = input.Position;
            }
            else
            {
                surface.Offset = input.Position;
                surface.Length = scratch.Length - surface.Offset;
            }

            if (previousAnalyzed == null)
            {
                previousAnalyzed = new BytesRef();
                previousAnalyzed.CopyBytes(analyzed);
                seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
            }
            else if (analyzed.Equals(previousAnalyzed))
            {
                dedup++;
                if (dedup >= maxSurfaceFormsPerAnalyzedForm)
                {
                    // More than maxSurfaceFormsPerAnalyzedForm
                    // dups: skip the rest:
                    continue;
                }
                if (seenSurfaceForms.Contains(surface))
                {
                    continue;
                }
                seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
            }
            else
            {
                dedup = 0;
                previousAnalyzed.CopyBytes(analyzed);
                seenSurfaceForms.Clear();
                seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
            }

            // TODO: I think we can avoid the extra 2 bytes when
            // there is no dup (dedup==0), but we'd have to fix
            // the exactFirst logic ... which would be sort of
            // hairy because we'd need to special case the two
            // (dup/not dup)...

            // NOTE: must be byte 0 so we sort before whatever
            // is next
            analyzed.Bytes[analyzed.Offset + analyzed.Length] = 0;
            analyzed.Bytes[analyzed.Offset + analyzed.Length + 1] = (byte)dedup;
            analyzed.Length += 2;

            Util.Fst.Util.ToInt32sRef(analyzed, scratchInts);
            //System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
            if (!hasPayloads)
            {
                builder.Add(scratchInts, outputs.NewPair(cost, BytesRef.DeepCopyOf(surface)));
            }
            else
            {
                // Output = surface form + separator + payload bytes.
                int payloadOffset = input.Position + surface.Length;
                int payloadLength = scratch.Length - payloadOffset;
                BytesRef br = new BytesRef(surface.Length + 1 + payloadLength);
                Array.Copy(surface.Bytes, surface.Offset, br.Bytes, 0, surface.Length);
                br.Bytes[surface.Length] = PAYLOAD_SEP;
                Array.Copy(scratch.Bytes, payloadOffset, br.Bytes, surface.Length + 1, payloadLength);
                br.Length = br.Bytes.Length;
                builder.Add(scratchInts, outputs.NewPair(cost, br));
            }
        }
        fst = builder.Finish();

        //Util.dotToFile(fst, "/tmp/suggest.dot");

        success = true;
    }
    finally
    {
        if (success)
        {
            IOUtils.Dispose(reader, writer);
        }
        else
        {
            IOUtils.DisposeWhileHandlingException(reader, writer);
        }

        tempInput.Delete();
        tempSorted.Delete();
    }
}
/// <summary>
/// expert: writes a value dictionary for a sorted/sortedset field
/// </summary>
protected internal virtual void AddTermsDict(FieldInfo field, IEnumerable<BytesRef> values)
{
    // Probe the values once: if every term has the same length we can use
    // plain binary storage with direct addressing and skip the term index.
    int shortest = int.MaxValue;
    int longest = int.MinValue;
    foreach (BytesRef term in values)
    {
        shortest = Math.Min(shortest, term.Length);
        longest = Math.Max(longest, term.Length);
    }

    if (shortest == longest)
    {
        // Fixed-width terms: no index needed, address by multiplication.
        AddBinaryField(field, values);
        return;
    }

    // Variable-width terms: prefix-compressed blocks.
    // header
    Meta.WriteVInt(field.Number);
    Meta.WriteByte((byte)Lucene45DocValuesFormat.BINARY);
    Meta.WriteVInt(BINARY_PREFIX_COMPRESSED);
    Meta.WriteLong(-1L);

    // Terms share prefixes within a block; every ADDRESS_INTERVAL'th term is
    // written in full so its file offset can serve as a seek point. The
    // per-block offsets are buffered and appended after the term data.
    long dataStartFP = Data.FilePointer;
    RAMOutputStream indexBytes = new RAMOutputStream();
    MonotonicBlockPackedWriter addressWriter = new MonotonicBlockPackedWriter(indexBytes, BLOCK_SIZE);

    BytesRef previousTerm = new BytesRef();
    long numTerms = 0;
    foreach (BytesRef term in values)
    {
        if (numTerms % ADDRESS_INTERVAL == 0)
        {
            // Record the block start; resetting the previous term forces the
            // first term of the block to be abs-encoded (zero shared prefix).
            addressWriter.Add(Data.FilePointer - dataStartFP);
            previousTerm.Length = 0;
        }
        // prefix-code against the prior term in this block
        int prefixLen = StringHelper.BytesDifference(previousTerm, term);
        Data.WriteVInt(prefixLen);
        Data.WriteVInt(term.Length - prefixLen);
        Data.WriteBytes(term.Bytes, term.Offset + prefixLen, term.Length - prefixLen);
        previousTerm.CopyBytes(term);
        numTerms++;
    }

    long indexFP = Data.FilePointer;
    // Flush the packed seek-point addresses behind the term data.
    addressWriter.Finish();
    indexBytes.WriteTo(Data);
    indexBytes = null;
    addressWriter = null;

    // Metadata trailer consumed by the reader to locate everything above.
    Meta.WriteVInt(shortest);
    Meta.WriteVInt(longest);
    Meta.WriteVLong(numTerms);
    Meta.WriteLong(dataStartFP);
    Meta.WriteVInt(ADDRESS_INTERVAL);
    Meta.WriteLong(indexFP);
    Meta.WriteVInt(PackedInts.VERSION_CURRENT);
    Meta.WriteVInt(BLOCK_SIZE);
}
/// <summary>
/// expert: writes a value dictionary for a sorted/sortedset field
/// </summary>
/// <remarks>
/// Chooses between two layouts: fixed-width terms are delegated to
/// <c>AddBinaryField</c> (direct addressing, no index); variable-width terms
/// are written prefix-compressed in blocks, with a packed address written for
/// every ADDRESS_INTERVAL'th term so readers can seek into the dictionary.
/// NOTE(review): <paramref name="values"/> is enumerated twice (length probe,
/// then write) — callers must supply a re-iterable sequence; confirm upstream.
/// </remarks>
protected internal virtual void AddTermsDict(FieldInfo field, IEnumerable<BytesRef> values)
{
    // first check if its a "fixed-length" terms dict
    int minLength = int.MaxValue;
    int maxLength = int.MinValue;
    foreach (BytesRef v in values)
    {
        minLength = Math.Min(minLength, v.Length);
        maxLength = Math.Max(maxLength, v.Length);
    }
    if (minLength == maxLength)
    {
        // no index needed: direct addressing by mult
        AddBinaryField(field, values);
    }
    else
    {
        // header
        Meta.WriteVInt(field.Number);
        Meta.WriteByte((byte)Lucene45DocValuesFormat.BINARY);
        Meta.WriteVInt(BINARY_PREFIX_COMPRESSED);
        Meta.WriteLong(-1L);

        // now write the bytes: sharing prefixes within a block
        long startFP = Data.FilePointer;
        // currently, we have to store the delta from expected for every 1/nth term
        // we could avoid this, but its not much and less overall RAM than the previous approach!
        // (addresses buffered in RAM first, then appended after the term data)
        RAMOutputStream addressBuffer = new RAMOutputStream();
        MonotonicBlockPackedWriter termAddresses = new MonotonicBlockPackedWriter(addressBuffer, BLOCK_SIZE);

        BytesRef lastTerm = new BytesRef();
        long count = 0;
        foreach (BytesRef v in values)
        {
            if (count % ADDRESS_INTERVAL == 0)
            {
                // record this block's start offset relative to startFP
                termAddresses.Add(Data.FilePointer - startFP);
                // force the first term in a block to be abs-encoded
                lastTerm.Length = 0;
            }

            // prefix-code: shared-prefix length, suffix length, then suffix bytes
            int sharedPrefix = StringHelper.BytesDifference(lastTerm, v);
            Data.WriteVInt(sharedPrefix);
            Data.WriteVInt(v.Length - sharedPrefix);
            Data.WriteBytes(v.Bytes, v.Offset + sharedPrefix, v.Length - sharedPrefix);
            lastTerm.CopyBytes(v);
            count++;
        }

        long indexStartFP = Data.FilePointer;
        // write addresses of indexed terms
        termAddresses.Finish();
        addressBuffer.WriteTo(Data);
        addressBuffer = null;
        termAddresses = null;

        // metadata trailer: lengths, term count, file pointers, packing params
        Meta.WriteVInt(minLength);
        Meta.WriteVInt(maxLength);
        Meta.WriteVLong(count);
        Meta.WriteLong(startFP);
        Meta.WriteVInt(ADDRESS_INTERVAL);
        Meta.WriteLong(indexStartFP);
        Meta.WriteVInt(PackedInts.VERSION_CURRENT);
        Meta.WriteVInt(BLOCK_SIZE);
    }
}
public override SeekStatus SeekCeil(BytesRef term) { queue.Clear(); numTop = 0; lastSeekExact = false; bool seekOpt = false; if (lastSeek != null && termComp.Compare(lastSeek, term) <= 0) { seekOpt = true; } lastSeekScratch.CopyBytes(term); lastSeek = lastSeekScratch; for (int i = 0; i < numSubs; i++) { SeekStatus status; // LUCENE-2130: if we had just seek'd already, prior // to this seek, and the new seek term is after the // previous one, don't try to re-seek this sub if its // current term is already beyond this new seek term. // Doing so is a waste because this sub will simply // seek to the same spot. if (seekOpt) { BytesRef curTerm = currentSubs[i].Current; if (curTerm != null) { int cmp = termComp.Compare(term, curTerm); if (cmp == 0) { status = SeekStatus.FOUND; } else if (cmp < 0) { status = SeekStatus.NOT_FOUND; } else { status = currentSubs[i].Terms.SeekCeil(term); } } else { status = SeekStatus.END; } } else { status = currentSubs[i].Terms.SeekCeil(term); } if (status == SeekStatus.FOUND) { top[numTop++] = currentSubs[i]; current = currentSubs[i].Current = currentSubs[i].Terms.Term; } else { if (status == SeekStatus.NOT_FOUND) { currentSubs[i].Current = currentSubs[i].Terms.Term; if (Debugging.AssertsEnabled) { Debugging.Assert(currentSubs[i].Current != null); } queue.Add(currentSubs[i]); } else { // enum exhausted currentSubs[i].Current = null; } } } if (numTop > 0) { // at least one sub had exact match to the requested term return(SeekStatus.FOUND); } else if (queue.Count > 0) { // no sub had exact match, but at least one sub found // a term after the requested term -- advance to that // next term: PullTop(); return(SeekStatus.NOT_FOUND); } else { return(SeekStatus.END); } }
/// <summary> /// Builds the final automaton from a list of entries. /// </summary> private FST<object> BuildAutomaton(BytesRefSorter sorter) { // Build the automaton. Outputs<object> outputs = NoOutputs.Singleton; object empty = outputs.NoOutput; Builder<object> builder = new Builder<object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, shareMaxTailLength, outputs, null, false, PackedInts.DEFAULT, true, 15); BytesRef scratch = new BytesRef(); BytesRef entry; IntsRef scratchIntsRef = new IntsRef(); int count = 0; BytesRefIterator iter = sorter.GetEnumerator(); while ((entry = iter.Next()) != null) { count++; if (scratch.CompareTo(entry) != 0) { builder.Add(Util.Fst.Util.ToIntsRef(entry, scratchIntsRef), empty); scratch.CopyBytes(entry); } } return count == 0 ? null : builder.Finish(); }
public virtual void TestUpdateDelteSlices() { DocumentsWriterDeleteQueue queue = new DocumentsWriterDeleteQueue(); int size = 200 + Random().Next(500) * RANDOM_MULTIPLIER; int?[] ids = new int?[size]; for (int i = 0; i < ids.Length; i++) { ids[i] = Random().Next(); } DeleteSlice slice1 = queue.NewSlice(); DeleteSlice slice2 = queue.NewSlice(); BufferedUpdates bd1 = new BufferedUpdates(); BufferedUpdates bd2 = new BufferedUpdates(); int last1 = 0; int last2 = 0; HashSet<Term> uniqueValues = new HashSet<Term>(); for (int j = 0; j < ids.Length; j++) { int? i = ids[j]; // create an array here since we compare identity below against tailItem Term[] term = new Term[] { new Term("id", i.ToString()) }; uniqueValues.Add(term[0]); queue.AddDelete(term); if (Random().Next(20) == 0 || j == ids.Length - 1) { queue.UpdateSlice(slice1); Assert.IsTrue(slice1.IsTailItem(term)); slice1.Apply(bd1, j); AssertAllBetween(last1, j, bd1, ids); last1 = j + 1; } if (Random().Next(10) == 5 || j == ids.Length - 1) { queue.UpdateSlice(slice2); Assert.IsTrue(slice2.IsTailItem(term)); slice2.Apply(bd2, j); AssertAllBetween(last2, j, bd2, ids); last2 = j + 1; } Assert.AreEqual(j + 1, queue.NumGlobalTermDeletes()); } Assert.AreEqual(uniqueValues, bd1.Terms_Nunit().Keys); Assert.AreEqual(uniqueValues, bd2.Terms_Nunit().Keys); HashSet<Term> frozenSet = new HashSet<Term>(); foreach (Term t in queue.FreezeGlobalBuffer(null).TermsIterable()) { BytesRef bytesRef = new BytesRef(); bytesRef.CopyBytes(t.Bytes()); frozenSet.Add(new Term(t.Field(), bytesRef)); } Assert.AreEqual(uniqueValues, frozenSet); Assert.AreEqual(0, queue.NumGlobalTermDeletes(), "num deletes must be 0 after freeze"); }
private void CheckTermsOrder(IndexReader r, ISet<string> allTerms, bool isTop) { TermsEnum terms = MultiFields.GetFields(r).Terms("f").Iterator(null); BytesRef last = new BytesRef(); HashSet<string> seenTerms = new HashSet<string>(); while (true) { BytesRef term = terms.Next(); if (term == null) { break; } Assert.IsTrue(last.CompareTo(term) < 0); last.CopyBytes(term); string s = term.Utf8ToString(); Assert.IsTrue(allTerms.Contains(s), "term " + TermDesc(s) + " was not added to index (count=" + allTerms.Count + ")"); seenTerms.Add(s); } if (isTop) { Assert.IsTrue(allTerms.SetEquals(seenTerms)); } // Test seeking: IEnumerator<string> it = seenTerms.GetEnumerator(); while (it.MoveNext()) { BytesRef tr = new BytesRef(it.Current); Assert.AreEqual(TermsEnum.SeekStatus.FOUND, terms.SeekCeil(tr), "seek failed for term=" + TermDesc(tr.Utf8ToString())); } }
public override bool BytesVal(int doc, BytesRef target) { target.CopyBytes(outerInstance.m_bytesRef); return(true); }
public virtual void TestStressDeleteQueue() { DocumentsWriterDeleteQueue queue = new DocumentsWriterDeleteQueue(); HashSet<Term> uniqueValues = new HashSet<Term>(); int size = 10000 + Random().Next(500) * RANDOM_MULTIPLIER; int?[] ids = new int?[size]; for (int i = 0; i < ids.Length; i++) { ids[i] = Random().Next(); uniqueValues.Add(new Term("id", ids[i].ToString())); } CountDownLatch latch = new CountDownLatch(1); AtomicInteger index = new AtomicInteger(0); int numThreads = 2 + Random().Next(5); UpdateThread[] threads = new UpdateThread[numThreads]; for (int i = 0; i < threads.Length; i++) { threads[i] = new UpdateThread(queue, index, ids, latch); threads[i].Start(); } latch.countDown(); for (int i = 0; i < threads.Length; i++) { threads[i].Join(); } foreach (UpdateThread updateThread in threads) { DeleteSlice slice = updateThread.Slice; queue.UpdateSlice(slice); BufferedUpdates deletes = updateThread.Deletes; slice.Apply(deletes, BufferedUpdates.MAX_INT); Assert.AreEqual(uniqueValues, deletes.Terms_Nunit().Keys); } queue.TryApplyGlobalSlice(); HashSet<Term> frozenSet = new HashSet<Term>(); foreach (Term t in queue.FreezeGlobalBuffer(null).TermsIterable()) { BytesRef bytesRef = new BytesRef(); bytesRef.CopyBytes(t.Bytes()); frozenSet.Add(new Term(t.Field(), bytesRef)); } Assert.AreEqual(0, queue.NumGlobalTermDeletes(), "num deletes must be 0 after freeze"); Assert.AreEqual(uniqueValues.Count, frozenSet.Count); Assert.AreEqual(uniqueValues, frozenSet); }