public virtual void TestReadAndWrite() { Counter bytesUsed = Util.Counter.NewCounter(); ByteBlockPool pool = new ByteBlockPool(new ByteBlockPool.DirectTrackingAllocator(bytesUsed)); pool.NextBuffer(); bool reuseFirst = Random().NextBoolean(); for (int j = 0; j < 2; j++) { IList<BytesRef> list = new List<BytesRef>(); int maxLength = AtLeast(500); int numValues = AtLeast(100); BytesRef @ref = new BytesRef(); for (int i = 0; i < numValues; i++) { string value = TestUtil.RandomRealisticUnicodeString(Random(), maxLength); list.Add(new BytesRef(value)); @ref.CopyChars(value); pool.Append(@ref); } // verify long position = 0; foreach (BytesRef expected in list) { @ref.Grow(expected.Length); @ref.Length = expected.Length; pool.ReadBytes(position, @ref.Bytes, @ref.Offset, @ref.Length); Assert.AreEqual(expected, @ref); position += @ref.Length; } pool.Reset(Random().NextBoolean(), reuseFirst); if (reuseFirst) { Assert.AreEqual(ByteBlockPool.BYTE_BLOCK_SIZE, bytesUsed.Get()); } else { Assert.AreEqual(0, bytesUsed.Get()); pool.NextBuffer(); // prepare for next iter } } }
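// NOTE: a recurring pattern throughout the examples in this listing is "grow, set Length, then fill Bytes in place".
// The following stand-alone sketch is illustrative only; it assumes the Lucene.Net.Util.BytesRef API used above
// and is not taken from any of these projects.
using System;
using System.Text;
using Lucene.Net.Util;

public static class GrowThenFillDemo
{
    public static void Main()
    {
        byte[] source = Encoding.UTF8.GetBytes("hello");

        // Ensure capacity first, then set the logical length and copy directly into Bytes,
        // mirroring how the readers below fill terms and payloads.
        var term = new BytesRef();
        term.Grow(source.Length);
        term.Length = source.Length;
        Array.Copy(source, 0, term.Bytes, term.Offset, source.Length);

        Console.WriteLine(term.Utf8ToString()); // prints: hello
    }
}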
private void LoadTerms() { PositiveInt32Outputs posIntOutputs = PositiveInt32Outputs.Singleton; var outputsInner = new PairOutputs <Int64, Int64>(posIntOutputs, posIntOutputs); var outputs = new PairOutputs <Int64, PairOutputs <Int64, Int64> .Pair>(posIntOutputs, outputsInner); var b = new Builder <PairOutputs <Int64, PairOutputs <Int64, Int64> .Pair> .Pair>(FST.INPUT_TYPE.BYTE1, outputs); IndexInput @in = (IndexInput)outerInstance.input.Clone(); @in.Seek(termsStart); BytesRef lastTerm = new BytesRef(10); long lastDocsStart = -1; int docFreq = 0; long totalTermFreq = 0; FixedBitSet visitedDocs = new FixedBitSet(maxDoc); Int32sRef scratchIntsRef = new Int32sRef(); while (true) { SimpleTextUtil.ReadLine(@in, scratch); if (scratch.Equals(SimpleTextFieldsWriter.END) || StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.FIELD)) { if (lastDocsStart != -1) { b.Add(Util.ToInt32sRef(lastTerm, scratchIntsRef), outputs.NewPair(lastDocsStart, outputsInner.NewPair((long)docFreq, totalTermFreq))); sumTotalTermFreq += totalTermFreq; } break; } else if (StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.DOC)) { docFreq++; sumDocFreq++; UnicodeUtil.UTF8toUTF16(scratch.Bytes, scratch.Offset + SimpleTextFieldsWriter.DOC.Length, scratch.Length - SimpleTextFieldsWriter.DOC.Length, scratchUTF16); int docID = ArrayUtil.ParseInt32(scratchUTF16.Chars, 0, scratchUTF16.Length); visitedDocs.Set(docID); } else if (StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.FREQ)) { UnicodeUtil.UTF8toUTF16(scratch.Bytes, scratch.Offset + SimpleTextFieldsWriter.FREQ.Length, scratch.Length - SimpleTextFieldsWriter.FREQ.Length, scratchUTF16); totalTermFreq += ArrayUtil.ParseInt32(scratchUTF16.Chars, 0, scratchUTF16.Length); } else if (StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.TERM)) { if (lastDocsStart != -1) { b.Add(Util.ToInt32sRef(lastTerm, scratchIntsRef), outputs.NewPair(lastDocsStart, outputsInner.NewPair((long)docFreq, totalTermFreq))); } lastDocsStart = @in.Position; // LUCENENET specific: Renamed from getFilePointer() to match FileStream int len = scratch.Length - SimpleTextFieldsWriter.TERM.Length; if (len > lastTerm.Length) { lastTerm.Grow(len); } System.Array.Copy(scratch.Bytes, SimpleTextFieldsWriter.TERM.Length, lastTerm.Bytes, 0, len); lastTerm.Length = len; docFreq = 0; sumTotalTermFreq += totalTermFreq; totalTermFreq = 0; termCount++; } } docCount = visitedDocs.Cardinality; fst = b.Finish(); /* * PrintStream ps = new PrintStream("out.dot"); * fst.toDot(ps); * ps.close(); * System.out.println("SAVED out.dot"); */ //System.out.println("FST " + fst.sizeInBytes()); }
public override BytesRef Next() { if (nextTerm >= numTerms) { return(null); } term.CopyBytes(lastTerm); int start = tvf.ReadVInt32(); int deltaLen = tvf.ReadVInt32(); term.Length = start + deltaLen; term.Grow(term.Length); tvf.ReadBytes(term.Bytes, start, deltaLen); freq = tvf.ReadVInt32(); if (storePayloads) { positions = new int[freq]; payloadOffsets = new int[freq]; int totalPayloadLength = 0; int pos = 0; for (int posUpto = 0; posUpto < freq; posUpto++) { int code = tvf.ReadVInt32(); pos += (int)((uint)code >> 1); positions[posUpto] = pos; if ((code & 1) != 0) { // length change lastPayloadLength = tvf.ReadVInt32(); } payloadOffsets[posUpto] = totalPayloadLength; totalPayloadLength += lastPayloadLength; if (Debugging.AssertsEnabled) { Debugging.Assert(totalPayloadLength >= 0); } } payloadData = new byte[totalPayloadLength]; tvf.ReadBytes(payloadData, 0, payloadData.Length); } // no payloads else if (storePositions) { // TODO: we could maybe reuse last array, if we can // somehow be careful about consumer never using two // D&PEnums at once... positions = new int[freq]; int pos = 0; for (int posUpto = 0; posUpto < freq; posUpto++) { pos += tvf.ReadVInt32(); positions[posUpto] = pos; } } if (storeOffsets) { startOffsets = new int[freq]; endOffsets = new int[freq]; int offset = 0; for (int posUpto = 0; posUpto < freq; posUpto++) { startOffsets[posUpto] = offset + tvf.ReadVInt32(); offset = endOffsets[posUpto] = startOffsets[posUpto] + tvf.ReadVInt32(); } } lastTerm.CopyBytes(term); nextTerm++; return(term); }
// Swap in S, in place of E: private bool SeekToNonBMP(SegmentTermEnum te, BytesRef term, int pos) { int savLength = term.Length; if (Debugging.AssertsEnabled) { Debugging.Assert(term.Offset == 0); } // The 3 bytes starting at downTo make up 1 // unicode character: if (Debugging.AssertsEnabled) { Debugging.Assert(IsHighBMPChar(term.Bytes, pos)); } // NOTE: we cannot make this assert, because // AutomatonQuery legitimately sends us malformed UTF8 // (eg the UTF8 bytes with just 0xee) // assert term.length >= pos + 3: "term.length=" + term.length + " pos+3=" + (pos+3) + " byte=" + Integer.toHexString(term.bytes[pos]) + " term=" + term.toString(); // Save the bytes && length, since we need to // restore this if seek "back" finds no matching // terms if (term.Bytes.Length < 4 + pos) { term.Grow(4 + pos); } scratch[0] = (sbyte)term.Bytes[pos]; scratch[1] = (sbyte)term.Bytes[pos + 1]; scratch[2] = (sbyte)term.Bytes[pos + 2]; term.Bytes[pos] = 0xf0; term.Bytes[pos + 1] = 0x90; term.Bytes[pos + 2] = 0x80; term.Bytes[pos + 3] = 0x80; term.Length = 4 + pos; if (DEBUG_SURROGATES) { Console.WriteLine(" try seek term=" + UnicodeUtil.ToHexString(term.Utf8ToString())); } // Seek "back": outerInstance.TermsDict.SeekEnum(te, new Term(fieldInfo.Name, term), true); // Test if the term we seek'd to in fact found a // surrogate pair at the same position as the E: Term t2 = te.Term(); // Cannot be null (or move to next field) because at // "worst" it'd seek to the same term we are on now, // unless we are being called from seek if (t2 is null || t2.Field != internedFieldName) { return(false); } if (DEBUG_SURROGATES) { Console.WriteLine(" got term=" + UnicodeUtil.ToHexString(t2.Text)); } // Now test if prefix is identical and we found // a non-BMP char at the same position: BytesRef b2 = t2.Bytes; if (Debugging.AssertsEnabled) { Debugging.Assert(b2.Offset == 0); } bool matches; if (b2.Length >= term.Length && IsNonBMPChar(b2.Bytes, pos)) { matches = true; for (int i = 0; i < pos; i++) { if (term.Bytes[i] != b2.Bytes[i]) { matches = false; break; } } } else { matches = false; } // Restore term: term.Length = savLength; term.Bytes[pos] = (byte)scratch[0]; term.Bytes[pos + 1] = (byte)scratch[1]; term.Bytes[pos + 2] = (byte)scratch[2]; return(matches); }
// TODO: this currently requires a determinized machine,
// but it need not -- we can speed it up by walking the
// NFA instead. it'd still be fail fast.
public static BytesRef GetCommonPrefixBytesRef(Automaton a)
{
    if (a.IsSingleton)
    {
        return new BytesRef(a.singleton);
    }
    BytesRef @ref = new BytesRef(10);
    HashSet<State> visited = new HashSet<State>();
    State s = a.Initial;
    bool done;
    do
    {
        done = true;
        visited.Add(s);
        if (!s.accept && s.NumTransitions() == 1)
        {
            var iter = s.Transitions.GetEnumerator();
            iter.MoveNext();
            Transition t = iter.Current;
            if (t.Min_Renamed == t.Max_Renamed && !visited.Contains(t.To))
            {
                // Extend the prefix by one byte and append the transition's label
                @ref.Grow(@ref.Length + 1);
                @ref.Length++;
                @ref.Bytes[@ref.Length - 1] = (byte)t.Min_Renamed;
                s = t.To;
                done = false;
            }
        }
    } while (!done);
    return @ref;
}
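// For context, a hypothetical usage sketch of the helper above. It assumes the method lives in
// Lucene.Net.Util.Automaton.SpecialOperations and that BasicAutomata/BasicOperations from the same
// namespace are available; none of this is taken from the original project.
using System;
using Lucene.Net.Util;
using Lucene.Net.Util.Automaton;

Automaton a = BasicOperations.Union(BasicAutomata.MakeString("foobar"), BasicAutomata.MakeString("foobaz"));
BasicOperations.Determinize(a); // the TODO above notes a determinized machine is currently required
BytesRef prefix = SpecialOperations.GetCommonPrefixBytesRef(a);
Console.WriteLine(prefix.Utf8ToString()); // expected: "fooba"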
public override Fields Get(int doc) { // LUCENENET specific: Use StringComparer.Ordinal to get the same ordering as Java var fields = new JCG.SortedDictionary <string, SimpleTVTerms>(StringComparer.Ordinal); _input.Seek(_offsets[doc]); ReadLine(); if (Debugging.AssertsEnabled) { Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.NUMFIELDS)); } var numFields = ParseInt32At(SimpleTextTermVectorsWriter.NUMFIELDS.Length); if (numFields == 0) { return(null); // no vectors for this doc } for (var i = 0; i < numFields; i++) { ReadLine(); if (Debugging.AssertsEnabled) { Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELD)); } // skip fieldNumber: ParseInt32At(SimpleTextTermVectorsWriter.FIELD.Length); ReadLine(); if (Debugging.AssertsEnabled) { Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELDNAME)); } var fieldName = ReadString(SimpleTextTermVectorsWriter.FIELDNAME.Length, _scratch); ReadLine(); if (Debugging.AssertsEnabled) { Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELDPOSITIONS)); } var positions = Convert.ToBoolean(ReadString(SimpleTextTermVectorsWriter.FIELDPOSITIONS.Length, _scratch), CultureInfo.InvariantCulture); ReadLine(); if (Debugging.AssertsEnabled) { Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELDOFFSETS)); } var offsets = Convert.ToBoolean(ReadString(SimpleTextTermVectorsWriter.FIELDOFFSETS.Length, _scratch), CultureInfo.InvariantCulture); ReadLine(); if (Debugging.AssertsEnabled) { Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELDPAYLOADS)); } var payloads = Convert.ToBoolean(ReadString(SimpleTextTermVectorsWriter.FIELDPAYLOADS.Length, _scratch), CultureInfo.InvariantCulture); ReadLine(); if (Debugging.AssertsEnabled) { Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELDTERMCOUNT)); } var termCount = ParseInt32At(SimpleTextTermVectorsWriter.FIELDTERMCOUNT.Length); var terms = new SimpleTVTerms(offsets, positions, payloads); fields.Add(fieldName, terms); for (var j = 0; j < termCount; j++) { ReadLine(); if (Debugging.AssertsEnabled) { Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.TERMTEXT)); } var term = new BytesRef(); var termLength = _scratch.Length - SimpleTextTermVectorsWriter.TERMTEXT.Length; term.Grow(termLength); term.Length = termLength; Array.Copy(_scratch.Bytes, _scratch.Offset + SimpleTextTermVectorsWriter.TERMTEXT.Length, term.Bytes, term.Offset, termLength); var postings = new SimpleTVPostings(); terms.terms.Add(term, postings); ReadLine(); if (Debugging.AssertsEnabled) { Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.TERMFREQ)); } postings.freq = ParseInt32At(SimpleTextTermVectorsWriter.TERMFREQ.Length); if (!positions && !offsets) { continue; } if (positions) { postings.positions = new int[postings.freq]; if (payloads) { postings.payloads = new BytesRef[postings.freq]; } } if (offsets) { postings.startOffsets = new int[postings.freq]; postings.endOffsets = new int[postings.freq]; } for (var k = 0; k < postings.freq; k++) { if (positions) { ReadLine(); if (Debugging.AssertsEnabled) { Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.POSITION)); } postings.positions[k] = ParseInt32At(SimpleTextTermVectorsWriter.POSITION.Length); if (payloads) { ReadLine(); if (Debugging.AssertsEnabled) { Debugging.Assert(StringHelper.StartsWith(_scratch, 
SimpleTextTermVectorsWriter.PAYLOAD)); } if (_scratch.Length - SimpleTextTermVectorsWriter.PAYLOAD.Length == 0) { postings.payloads[k] = null; } else { var payloadBytes = new byte[_scratch.Length - SimpleTextTermVectorsWriter.PAYLOAD.Length]; Array.Copy(_scratch.Bytes, _scratch.Offset + SimpleTextTermVectorsWriter.PAYLOAD.Length, payloadBytes, 0, payloadBytes.Length); postings.payloads[k] = new BytesRef(payloadBytes); } } } if (!offsets) { continue; } ReadLine(); if (Debugging.AssertsEnabled) { Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.STARTOFFSET)); } postings.startOffsets[k] = ParseInt32At(SimpleTextTermVectorsWriter.STARTOFFSET.Length); ReadLine(); if (Debugging.AssertsEnabled) { Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.ENDOFFSET)); } postings.endOffsets[k] = ParseInt32At(SimpleTextTermVectorsWriter.ENDOFFSET.Length); } } } return(new SimpleTVFields(this, fields)); }
/// <remarks> /// TODO: we may want an alternate mode here which is /// "if you are about to return NOT_FOUND I won't use /// the terms data from that"; eg FuzzyTermsEnum will /// (usually) just immediately call seek again if we /// return NOT_FOUND so it's a waste for us to fill in /// the term that was actually NOT_FOUND /// </remarks> public override SeekStatus SeekCeil(BytesRef target) { if (_indexEnum == null) { throw new InvalidOperationException("terms index was not loaded"); } var doSeek = true; // See if we can avoid seeking, because target term // is after current term but before next index term: if (_indexIsCurrent) { var cmp = BytesRef.UTF8SortedAsUnicodeComparer.Compare(_term, target); if (cmp == 0) { return(SeekStatus.FOUND); // Already at the requested term } if (cmp < 0) { // Target term is after current term if (!_didIndexNext) { _nextIndexTerm = _indexEnum.Next == -1 ? null : _indexEnum.Term; _didIndexNext = true; } if (_nextIndexTerm == null || BytesRef.UTF8SortedAsUnicodeComparer.Compare(target, _nextIndexTerm) < 0) { // Optimization: requested term is within the // same term block we are now in; skip seeking // (but do scanning): doSeek = false; } } } if (doSeek) { //System.out.println(" seek"); // Ask terms index to find biggest indexed term (= // first term in a block) that's <= our text: _input.Seek(_indexEnum.Seek(target)); var result = NextBlock(); // Block must exist since, at least, the indexed term // is in the block: Debug.Assert(result); _indexIsCurrent = true; _didIndexNext = false; _blocksSinceSeek = 0; if (_doOrd) { _state.Ord = _indexEnum.Ord - 1; } _term.CopyBytes(_indexEnum.Term); } else { if (_state.TermBlockOrd == _blockTermCount && !NextBlock()) { _indexIsCurrent = false; return(SeekStatus.END); } } _seekPending = false; var common = 0; // Scan within block. We could do this by calling // _next() and testing the resulting term, but this // is wasteful. Instead, we first confirm the // target matches the common prefix of this block, // and then we scan the term bytes directly from the // termSuffixesreader's byte[], saving a copy into // the BytesRef term per term. Only when we return // do we then copy the bytes into the term. while (true) { // First, see if target term matches common prefix // in this block: if (common < _termBlockPrefix) { var cmp = (_term.Bytes[common] & 0xFF) - (target.Bytes[target.Offset + common] & 0xFF); if (cmp < 0) { // TODO: maybe we should store common prefix // in block header? (instead of relying on // last term of previous block) // Target's prefix is after the common block // prefix, so term cannot be in this block // but it could be in next block. 
// We must scan to end-of-block to set common prefix for next block:
if (_state.TermBlockOrd < _blockTermCount) { while (_state.TermBlockOrd < _blockTermCount - 1) { _state.TermBlockOrd++; _state.Ord++; _termSuffixesReader.SkipBytes(_termSuffixesReader.ReadVInt()); } var suffix = _termSuffixesReader.ReadVInt(); _term.Length = _termBlockPrefix + suffix; if (_term.Bytes.Length < _term.Length) { _term.Grow(_term.Length); } _termSuffixesReader.ReadBytes(_term.Bytes, _termBlockPrefix, suffix); } _state.Ord++; if (!NextBlock()) { _indexIsCurrent = false; return(SeekStatus.END); } common = 0; }
else if (cmp > 0) {
// Target's prefix is before the common prefix of this block, so we position to start of block and return NOT_FOUND:
Debug.Assert(_state.TermBlockOrd == 0); var suffix = _termSuffixesReader.ReadVInt(); _term.Length = _termBlockPrefix + suffix; if (_term.Bytes.Length < _term.Length) { _term.Grow(_term.Length); } _termSuffixesReader.ReadBytes(_term.Bytes, _termBlockPrefix, suffix); return(SeekStatus.NOT_FOUND); }
else { common++; } continue; }
// Test every term in this block
while (true) { _state.TermBlockOrd++; _state.Ord++; var suffix = _termSuffixesReader.ReadVInt();
// We know the prefix matches, so just compare the new suffix:
var termLen = _termBlockPrefix + suffix; var bytePos = _termSuffixesReader.Position; var next = false; var limit = target.Offset + (termLen < target.Length ? termLen : target.Length); var targetPos = target.Offset + _termBlockPrefix; while (targetPos < limit) { var cmp = (_termSuffixes[bytePos++] & 0xFF) - (target.Bytes[targetPos++] & 0xFF); if (cmp < 0) {
// Current term is still before the target; keep scanning
next = true; break; } if (cmp <= 0) { continue; }
// Done! Current term is after target. Stop here, fill in real term, return NOT_FOUND.
_term.Length = _termBlockPrefix + suffix; if (_term.Bytes.Length < _term.Length) { _term.Grow(_term.Length); } _termSuffixesReader.ReadBytes(_term.Bytes, _termBlockPrefix, suffix); return(SeekStatus.NOT_FOUND); } if (!next && target.Length <= termLen) { _term.Length = _termBlockPrefix + suffix; if (_term.Bytes.Length < _term.Length) { _term.Grow(_term.Length); } _termSuffixesReader.ReadBytes(_term.Bytes, _termBlockPrefix, suffix); return(target.Length == termLen ? SeekStatus.FOUND : SeekStatus.NOT_FOUND); } if (_state.TermBlockOrd == _blockTermCount) {
// Must pre-fill term for next block's common prefix
_term.Length = _termBlockPrefix + suffix; if (_term.Bytes.Length < _term.Length) { _term.Grow(_term.Length); } _termSuffixesReader.ReadBytes(_term.Bytes, _termBlockPrefix, suffix); break; } _termSuffixesReader.SkipBytes(suffix); }
// The purpose of the terms dict index is to seek the enum to the closest index term before the
// term we are looking for. So, we should never cross another index term (besides the first
// one) while we are scanning:
Debug.Assert(_indexIsCurrent); if (!NextBlock()) { _indexIsCurrent = false; return(SeekStatus.END); } common = 0; } }
/// <summary> /// Returns the next string in lexicographic order that will not put /// the machine into a reject state. /// <para/> /// This method traverses the DFA from the given position in the string, /// starting at the given state. /// <para/> /// If this cannot satisfy the machine, returns <c>false</c>. This method will /// walk the minimal path, in lexicographic order, as long as possible. /// <para/> /// If this method returns <c>false</c>, then there might still be more solutions, /// it is necessary to backtrack to find out. /// </summary> /// <param name="state"> current non-reject state </param> /// <param name="position"> useful portion of the string </param> /// <returns> <c>true</c> if more possible solutions exist for the DFA from this /// position </returns> private bool NextString(int state, int position) { /* * the next lexicographic character must be greater than the existing * character, if it exists. */ int c = 0; if (position < seekBytesRef.Length) { c = seekBytesRef.Bytes[position] & 0xff; // if the next byte is 0xff and is not part of the useful portion, // then by definition it puts us in a reject state, and therefore this // path is dead. there cannot be any higher transitions. backtrack. if (c++ == 0xff) { return(false); } } seekBytesRef.Length = position; visited[state] = curGen; Transition[] transitions = allTransitions[state]; // find the minimal path (lexicographic order) that is >= c for (int i = 0; i < transitions.Length; i++) { Transition transition = transitions[i]; if (transition.Max >= c) { int nextChar = Math.Max(c, transition.Min); // append either the next sequential char, or the minimum transition seekBytesRef.Grow(seekBytesRef.Length + 1); seekBytesRef.Length++; seekBytesRef.Bytes[seekBytesRef.Length - 1] = (byte)nextChar; state = transition.Dest.Number; /* * as long as is possible, continue down the minimal path in * lexicographic order. if a loop or accept state is encountered, stop. */ while (visited[state] != curGen && !runAutomaton.IsAccept(state)) { visited[state] = curGen; /* * Note: we work with a DFA with no transitions to dead states. * so the below is ok, if it is not an accept state, * then there MUST be at least one transition. */ transition = allTransitions[state][0]; state = transition.Dest.Number; // append the minimum transition seekBytesRef.Grow(seekBytesRef.Length + 1); seekBytesRef.Length++; seekBytesRef.Bytes[seekBytesRef.Length - 1] = (byte)transition.Min; // we found a loop, record it for faster enumeration if ((finite == false) && !linear && visited[state] == curGen) { SetLinear(seekBytesRef.Length - 1); } } return(true); } } return(false); }
public override int NextPosition() { int pos; if (_readPositions) { SimpleTextUtil.ReadLine(_in, _scratch); // LUCENENET specific - use wrapper BytesRefFormatter struct to defer building the string unless string.Format() is called if (Debugging.AssertsEnabled) { Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.POS), "got line={0}", new BytesRefFormatter(_scratch, BytesRefFormat.UTF8)); } UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.POS.Length, _scratch.Length - SimpleTextFieldsWriter.POS.Length, _scratchUtf162); pos = ArrayUtil.ParseInt32(_scratchUtf162.Chars, 0, _scratchUtf162.Length); } else { pos = -1; } if (_readOffsets) { SimpleTextUtil.ReadLine(_in, _scratch); // LUCENENET specific - use wrapper BytesRefFormatter struct to defer building the string unless string.Format() is called if (Debugging.AssertsEnabled) { Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.START_OFFSET), "got line={0}", new BytesRefFormatter(_scratch, BytesRefFormat.UTF8)); } UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.START_OFFSET.Length, _scratch.Length - SimpleTextFieldsWriter.START_OFFSET.Length, _scratchUtf162); _startOffset = ArrayUtil.ParseInt32(_scratchUtf162.Chars, 0, _scratchUtf162.Length); SimpleTextUtil.ReadLine(_in, _scratch); // LUCENENET specific - use wrapper BytesRefFormatter struct to defer building the string unless string.Format() is called if (Debugging.AssertsEnabled) { Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.END_OFFSET), "got line={0}", new BytesRefFormatter(_scratch, BytesRefFormat.UTF8)); } UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.END_OFFSET.Length, _scratch.Length - SimpleTextFieldsWriter.END_OFFSET.Length, _scratchUtf162); _endOffset = ArrayUtil.ParseInt32(_scratchUtf162.Chars, 0, _scratchUtf162.Length); } long fp = _in.Position; // LUCENENET specific: Renamed from getFilePointer() to match FileStream SimpleTextUtil.ReadLine(_in, _scratch); if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.PAYLOAD)) { int len = _scratch.Length - SimpleTextFieldsWriter.PAYLOAD.Length; if (_scratch2.Bytes.Length < len) { _scratch2.Grow(len); } Array.Copy(_scratch.Bytes, SimpleTextFieldsWriter.PAYLOAD.Length, _scratch2.Bytes, 0, len); _scratch2.Length = len; _payload = _scratch2; } else { _payload = null; _in.Seek(fp); } return(pos); }
/// <summary> /// Finds largest term accepted by this Automaton, that's /// <= the provided input term. The result is placed in /// output; it's fine for output and input to point to /// the same BytesRef. The returned result is either the /// provided output, or null if there is no floor term /// (ie, the provided input term is before the first term /// accepted by this Automaton). /// </summary> public virtual BytesRef Floor(BytesRef input, BytesRef output) { output.Offset = 0; //if (DEBUG) System.out.println("CA.floor input=" + input.utf8ToString()); int state = RunAutomaton.InitialState; // Special case empty string: if (input.Length == 0) { if (RunAutomaton.IsAccept(state)) { output.Length = 0; return output; } else { return null; } } IList<int> stack = new List<int>(); int idx = 0; while (true) { int label = ((sbyte)input.Bytes[input.Offset + idx]) & 0xff; int nextState = RunAutomaton.Step(state, label); //if (DEBUG) System.out.println(" cycle label=" + (char) label + " nextState=" + nextState); if (idx == input.Length - 1) { if (nextState != -1 && RunAutomaton.IsAccept(nextState)) { // Input string is accepted if (idx >= output.Bytes.Length) { output.Grow(1 + idx); } output.Bytes[idx] = (byte)label; output.Length = input.Length; //if (DEBUG) System.out.println(" input is accepted; return term=" + output.utf8ToString()); return output; } else { nextState = -1; } } if (nextState == -1) { // Pop back to a state that has a transition // <= our label: while (true) { Transition[] transitions = SortedTransitions[state]; if (transitions.Length == 0) { Debug.Assert(RunAutomaton.IsAccept(state)); output.Length = idx; //if (DEBUG) System.out.println(" return " + output.utf8ToString()); return output; } else if (label - 1 < transitions[0].Min_Renamed) { if (RunAutomaton.IsAccept(state)) { output.Length = idx; //if (DEBUG) System.out.println(" return " + output.utf8ToString()); return output; } // pop if (stack.Count == 0) { //if (DEBUG) System.out.println(" pop ord=" + idx + " return null"); return null; } else { state = stack[stack.Count - 1]; stack.RemoveAt(stack.Count - 1); idx--; //if (DEBUG) System.out.println(" pop ord=" + (idx+1) + " label=" + (char) label + " first trans.min=" + (char) transitions[0].min); label = input.Bytes[input.Offset + idx] & 0xff; } } else { //if (DEBUG) System.out.println(" stop pop ord=" + idx + " first trans.min=" + (char) transitions[0].min); break; } } //if (DEBUG) System.out.println(" label=" + (char) label + " idx=" + idx); return AddTail(state, output, idx, label); } else { if (idx >= output.Bytes.Length) { output.Grow(1 + idx); } output.Bytes[idx] = (byte)label; stack.Add(state); state = nextState; idx++; } } }
//private static final boolean DEBUG = BlockTreeTermsWriter.DEBUG; private BytesRef AddTail(int state, BytesRef term, int idx, int leadLabel) { // Find biggest transition that's < label // TODO: use binary search here Transition maxTransition = null; foreach (Transition transition in SortedTransitions[state]) { if (transition.Min_Renamed < leadLabel) { maxTransition = transition; } } Debug.Assert(maxTransition != null); // Append floorLabel int floorLabel; if (maxTransition.Max_Renamed > leadLabel - 1) { floorLabel = leadLabel - 1; } else { floorLabel = maxTransition.Max_Renamed; } if (idx >= term.Bytes.Length) { term.Grow(1 + idx); } //if (DEBUG) System.out.println(" add floorLabel=" + (char) floorLabel + " idx=" + idx); term.Bytes[idx] = (byte)floorLabel; state = maxTransition.To.Number; idx++; // Push down to last accept state while (true) { Transition[] transitions = SortedTransitions[state]; if (transitions.Length == 0) { Debug.Assert(RunAutomaton.IsAccept(state)); term.Length = idx; //if (DEBUG) System.out.println(" return " + term.utf8ToString()); return term; } else { // We are pushing "top" -- so get last label of // last transition: Debug.Assert(transitions.Length != 0); Transition lastTransition = transitions[transitions.Length - 1]; if (idx >= term.Bytes.Length) { term.Grow(1 + idx); } //if (DEBUG) System.out.println(" push maxLabel=" + (char) lastTransition.max + " idx=" + idx); term.Bytes[idx] = (byte)lastTransition.Max_Renamed; state = lastTransition.To.Number; idx++; } } }
private IEnumerable<BytesRef> GetBytesIterator(int maxDocParam)
{
    // Use yield return instead of a custom IEnumerable
    AppendingDeltaPackedLongBuffer.Iterator lengthsIterator = Lengths.GetIterator();
    int size = (int)Lengths.Size();
    DataInput bytesIterator = Bytes.DataInput;
    int maxDoc = maxDocParam;
    int upto = 0;
    while (upto < maxDoc)
    {
        BytesRef v = null;
        if (upto < size)
        {
            int length = (int)lengthsIterator.Next();
            var value = new BytesRef();
            value.Grow(length);
            value.Length = length;
            bytesIterator.ReadBytes(value.Bytes, value.Offset, value.Length);
            if (DocsWithField.Get(upto))
            {
                v = value;
            }
        }
        upto++;
        yield return v;
    }
}
private IEnumerable<BytesRef> GetBytesIterator(int maxDocParam)
{
    // Use yield return instead of a custom IEnumerable
    BytesRef value = new BytesRef();
    AppendingDeltaPackedLongBuffer.Iterator lengthsIterator = (AppendingDeltaPackedLongBuffer.Iterator)Lengths.GetIterator();
    int size = (int)Lengths.Size();
    DataInput bytesIterator = Bytes.DataInput;
    int maxDoc = maxDocParam;
    int upto = 0;
    long byteOffset = 0L;
    while (upto < maxDoc)
    {
        if (upto < size)
        {
            int length = (int)lengthsIterator.Next();
            value.Grow(length);
            value.Length = length;
            //LUCENE TODO: This modification is slightly fishy, 4x port uses ByteBlockPool
            bytesIterator.ReadBytes(/*byteOffset,*/ value.Bytes, value.Offset, value.Length);
            byteOffset += length;
        }
        else
        {
            // This is to handle last N documents not having
            // this DV field in the end of the segment:
            value.Length = 0;
        }
        upto++;
        yield return value;
    }
}
internal virtual void LoadTerms()
{
    PositiveIntOutputs posIntOutputs = PositiveIntOutputs.Singleton;
    PairOutputs<long?, long?> outputsInner = new PairOutputs<long?, long?>(posIntOutputs, posIntOutputs);
    PairOutputs<long?, PairOutputs<long?, long?>.Pair> outputs = new PairOutputs<long?, PairOutputs<long?, long?>.Pair>(posIntOutputs, outputsInner);
    var b = new Builder<PairOutputs<long?, PairOutputs<long?, long?>.Pair>.Pair>(FST.INPUT_TYPE.BYTE1, outputs);
    IndexInput @in = (IndexInput)outerInstance._input.Clone();
    @in.Seek(termsStart);
    BytesRef lastTerm = new BytesRef(10);
    long lastDocsStart = -1;
    int docFreq = 0;
    long totalTermFreq = 0;
    FixedBitSet visitedDocs = new FixedBitSet(maxDoc);
    IntsRef scratchIntsRef = new IntsRef();
    while (true)
    {
        SimpleTextUtil.ReadLine(@in, scratch);
        if (scratch.Equals(END) || StringHelper.StartsWith(scratch, FIELD))
        {
            if (lastDocsStart != -1)
            {
                b.Add(Util.ToIntsRef(lastTerm, scratchIntsRef), outputs.NewPair(lastDocsStart, outputsInner.NewPair((long)docFreq, totalTermFreq)));
                sumTotalTermFreq += totalTermFreq;
            }
            break;
        }
        else if (StringHelper.StartsWith(scratch, DOC))
        {
            docFreq++;
            sumDocFreq++;
            UnicodeUtil.UTF8toUTF16(scratch.Bytes, scratch.Offset + DOC.Length, scratch.Length - DOC.Length, scratchUTF16);
            int docID = ArrayUtil.ParseInt(scratchUTF16.Chars, 0, scratchUTF16.Length);
            visitedDocs.Set(docID);
        }
        else if (StringHelper.StartsWith(scratch, FREQ))
        {
            UnicodeUtil.UTF8toUTF16(scratch.Bytes, scratch.Offset + FREQ.Length, scratch.Length - FREQ.Length, scratchUTF16);
            totalTermFreq += ArrayUtil.ParseInt(scratchUTF16.Chars, 0, scratchUTF16.Length);
        }
        else if (StringHelper.StartsWith(scratch, TERM))
        {
            if (lastDocsStart != -1)
            {
                b.Add(Util.ToIntsRef(lastTerm, scratchIntsRef), outputs.NewPair(lastDocsStart, outputsInner.NewPair((long)docFreq, totalTermFreq)));
            }
            lastDocsStart = @in.FilePointer;
            int len = scratch.Length - TERM.Length;
            if (len > lastTerm.Length)
            {
                lastTerm.Grow(len);
            }
            Array.Copy(scratch.Bytes, TERM.Length, lastTerm.Bytes, 0, len);
            lastTerm.Length = len;
            docFreq = 0;
            sumTotalTermFreq += totalTermFreq;
            totalTermFreq = 0;
            termCount++;
        }
    }
    docCount = visitedDocs.Cardinality();
    fst = b.Finish();
}
public override Fields Get(int doc) { var fields = new SortedDictionary <string, SimpleTVTerms>(); _input.Seek(_offsets[doc]); ReadLine(); Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.NUMFIELDS)); var numFields = ParseIntAt(SimpleTextTermVectorsWriter.NUMFIELDS.Length); if (numFields == 0) { return(null); // no vectors for this doc } for (var i = 0; i < numFields; i++) { ReadLine(); Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELD)); // skip fieldNumber: ParseIntAt(SimpleTextTermVectorsWriter.FIELD.Length); ReadLine(); Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELDNAME)); var fieldName = ReadString(SimpleTextTermVectorsWriter.FIELDNAME.Length, _scratch); ReadLine(); Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELDPOSITIONS)); var positions = Convert.ToBoolean(ReadString(SimpleTextTermVectorsWriter.FIELDPOSITIONS.Length, _scratch)); ReadLine(); Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELDOFFSETS)); var offsets = Convert.ToBoolean(ReadString(SimpleTextTermVectorsWriter.FIELDOFFSETS.Length, _scratch)); ReadLine(); Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELDPAYLOADS)); var payloads = Convert.ToBoolean(ReadString(SimpleTextTermVectorsWriter.FIELDPAYLOADS.Length, _scratch)); ReadLine(); Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELDTERMCOUNT)); var termCount = ParseIntAt(SimpleTextTermVectorsWriter.FIELDTERMCOUNT.Length); var terms = new SimpleTVTerms(offsets, positions, payloads); fields.Add(fieldName, terms); for (var j = 0; j < termCount; j++) { ReadLine(); Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.TERMTEXT)); var term = new BytesRef(); var termLength = _scratch.Length - SimpleTextTermVectorsWriter.TERMTEXT.Length; term.Grow(termLength); term.Length = termLength; Array.Copy(_scratch.Bytes, _scratch.Offset + SimpleTextTermVectorsWriter.TERMTEXT.Length, term.Bytes, term.Offset, termLength); var postings = new SimpleTVPostings(); terms.TERMS.Add(term, postings); ReadLine(); Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.TERMFREQ)); postings.FREQ = ParseIntAt(SimpleTextTermVectorsWriter.TERMFREQ.Length); if (!positions && !offsets) { continue; } if (positions) { postings.POSITIONS = new int[postings.FREQ]; if (payloads) { postings.PAYLOADS = new BytesRef[postings.FREQ]; } } if (offsets) { postings.START_OFFSETS = new int[postings.FREQ]; postings.END_OFFSETS = new int[postings.FREQ]; } for (var k = 0; k < postings.FREQ; k++) { if (positions) { ReadLine(); Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.POSITION)); postings.POSITIONS[k] = ParseIntAt(SimpleTextTermVectorsWriter.POSITION.Length); if (payloads) { ReadLine(); Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.PAYLOAD)); if (_scratch.Length - SimpleTextTermVectorsWriter.PAYLOAD.Length == 0) { postings.PAYLOADS[k] = null; } else { var payloadBytes = new byte[_scratch.Length - SimpleTextTermVectorsWriter.PAYLOAD.Length]; Array.Copy(_scratch.Bytes, _scratch.Offset + SimpleTextTermVectorsWriter.PAYLOAD.Length, payloadBytes, 0, payloadBytes.Length); postings.PAYLOADS[k] = new BytesRef(payloadBytes); } } } if (!offsets) { continue; } ReadLine(); Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.STARTOFFSET)); postings.START_OFFSETS[k] = 
ParseIntAt(SimpleTextTermVectorsWriter.STARTOFFSET.Length); ReadLine(); Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.ENDOFFSET)); postings.END_OFFSETS[k] = ParseIntAt(SimpleTextTermVectorsWriter.ENDOFFSET.Length); } } } return(new SimpleTVFields(this, fields)); }
/// <summary> /// Builds an <seealso cref="SynonymMap"/> and returns it. /// </summary> public virtual SynonymMap Build() { ByteSequenceOutputs outputs = ByteSequenceOutputs.Singleton; // TODO: are we using the best sharing options? var builder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, outputs); BytesRef scratch = new BytesRef(64); ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput(); HashSet<int?> dedupSet; if (dedup) { dedupSet = new HashSet<int?>(); } else { dedupSet = null; } var spare = new sbyte[5]; Dictionary<CharsRef, MapEntry>.KeyCollection keys = workingSet.Keys; CharsRef[] sortedKeys = keys.ToArray(); Arrays.Sort(sortedKeys, CharsRef.UTF16SortedAsUTF8Comparator); IntsRef scratchIntsRef = new IntsRef(); //System.out.println("fmap.build"); for (int keyIdx = 0; keyIdx < sortedKeys.Length; keyIdx++) { CharsRef input = sortedKeys[keyIdx]; MapEntry output = workingSet[input]; int numEntries = output.ords.Count; // output size, assume the worst case int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry scratch.Grow(estimatedSize); scratchOutput.Reset(scratch.Bytes, scratch.Offset, scratch.Bytes.Length); Debug.Assert(scratch.Offset == 0); // now write our output data: int count = 0; for (int i = 0; i < numEntries; i++) { if (dedupSet != null) { // box once int? ent = output.ords[i]; if (dedupSet.Contains(ent)) { continue; } dedupSet.Add(ent); } scratchOutput.WriteVInt(output.ords[i]); count++; } int pos = scratchOutput.Position; scratchOutput.WriteVInt(count << 1 | (output.includeOrig ? 0 : 1)); int pos2 = scratchOutput.Position; int vIntLen = pos2 - pos; // Move the count + includeOrig to the front of the byte[]: Array.Copy(scratch.Bytes, pos, spare, 0, vIntLen); Array.Copy(scratch.Bytes, 0, scratch.Bytes, vIntLen, pos); Array.Copy(spare, 0, scratch.Bytes, 0, vIntLen); if (dedupSet != null) { dedupSet.Clear(); } scratch.Length = scratchOutput.Position - scratch.Offset; //System.out.println(" add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count); builder.Add(Util.ToUTF32(input, scratchIntsRef), BytesRef.DeepCopyOf(scratch)); } FST<BytesRef> fst = builder.Finish(); return new SynonymMap(fst, words, maxHorizontalContext); }
/// <summary> /// Finds largest term accepted by this Automaton, that's /// <= the provided input term. The result is placed in /// output; it's fine for output and input to point to /// the same BytesRef. The returned result is either the /// provided output, or null if there is no floor term /// (ie, the provided input term is before the first term /// accepted by this Automaton). /// </summary> public virtual BytesRef Floor(BytesRef input, BytesRef output) { output.Offset = 0; //if (DEBUG) System.out.println("CA.floor input=" + input.utf8ToString()); int state = RunAutomaton.InitialState; // Special case empty string: if (input.Length == 0) { if (RunAutomaton.IsAccept(state)) { output.Length = 0; return(output); } else { return(null); } } IList <int> stack = new List <int>(); int idx = 0; while (true) { int label = input.Bytes[input.Offset + idx] & 0xff; int nextState = RunAutomaton.Step(state, label); //if (DEBUG) System.out.println(" cycle label=" + (char) label + " nextState=" + nextState); if (idx == input.Length - 1) { if (nextState != -1 && RunAutomaton.IsAccept(nextState)) { // Input string is accepted if (idx >= output.Bytes.Length) { output.Grow(1 + idx); } output.Bytes[idx] = (sbyte)label; output.Length = input.Length; //if (DEBUG) System.out.println(" input is accepted; return term=" + output.utf8ToString()); return(output); } else { nextState = -1; } } if (nextState == -1) { // Pop back to a state that has a transition // <= our label: while (true) { Transition[] transitions = SortedTransitions[state]; if (transitions.Length == 0) { Debug.Assert(RunAutomaton.IsAccept(state)); output.Length = idx; //if (DEBUG) System.out.println(" return " + output.utf8ToString()); return(output); } else if (label - 1 < transitions[0].Min_Renamed) { if (RunAutomaton.IsAccept(state)) { output.Length = idx; //if (DEBUG) System.out.println(" return " + output.utf8ToString()); return(output); } // pop if (stack.Count == 0) { //if (DEBUG) System.out.println(" pop ord=" + idx + " return null"); return(null); } else { state = stack[stack.Count - 1]; stack.RemoveAt(stack.Count - 1); idx--; //if (DEBUG) System.out.println(" pop ord=" + (idx+1) + " label=" + (char) label + " first trans.min=" + (char) transitions[0].min); label = input.Bytes[input.Offset + idx] & 0xff; } } else { //if (DEBUG) System.out.println(" stop pop ord=" + idx + " first trans.min=" + (char) transitions[0].min); break; } } //if (DEBUG) System.out.println(" label=" + (char) label + " idx=" + idx); return(AddTail(state, output, idx, label)); } else { if (idx >= output.Bytes.Length) { output.Grow(1 + idx); } output.Bytes[idx] = (sbyte)label; stack.Add(state); state = nextState; idx++; } } }
public override Fields Get(int doc) { var fields = new SortedDictionary<string, SimpleTVTerms>(); _input.Seek(_offsets[doc]); ReadLine(); Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.NUMFIELDS)); var numFields = ParseIntAt(SimpleTextTermVectorsWriter.NUMFIELDS.Length); if (numFields == 0) { return null; // no vectors for this doc } for (var i = 0; i < numFields; i++) { ReadLine(); Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELD)); // skip fieldNumber: ParseIntAt(SimpleTextTermVectorsWriter.FIELD.Length); ReadLine(); Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELDNAME)); var fieldName = ReadString(SimpleTextTermVectorsWriter.FIELDNAME.Length, _scratch); ReadLine(); Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELDPOSITIONS)); var positions = Convert.ToBoolean(ReadString(SimpleTextTermVectorsWriter.FIELDPOSITIONS.Length, _scratch), CultureInfo.InvariantCulture); ReadLine(); Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELDOFFSETS)); var offsets = Convert.ToBoolean(ReadString(SimpleTextTermVectorsWriter.FIELDOFFSETS.Length, _scratch), CultureInfo.InvariantCulture); ReadLine(); Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELDPAYLOADS)); var payloads = Convert.ToBoolean(ReadString(SimpleTextTermVectorsWriter.FIELDPAYLOADS.Length, _scratch), CultureInfo.InvariantCulture); ReadLine(); Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELDTERMCOUNT)); var termCount = ParseIntAt(SimpleTextTermVectorsWriter.FIELDTERMCOUNT.Length); var terms = new SimpleTVTerms(offsets, positions, payloads); fields.Add(fieldName, terms); for (var j = 0; j < termCount; j++) { ReadLine(); Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.TERMTEXT)); var term = new BytesRef(); var termLength = _scratch.Length - SimpleTextTermVectorsWriter.TERMTEXT.Length; term.Grow(termLength); term.Length = termLength; Array.Copy(_scratch.Bytes, _scratch.Offset + SimpleTextTermVectorsWriter.TERMTEXT.Length, term.Bytes, term.Offset, termLength); var postings = new SimpleTVPostings(); terms.TERMS.Add(term, postings); ReadLine(); Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.TERMFREQ)); postings.FREQ = ParseIntAt(SimpleTextTermVectorsWriter.TERMFREQ.Length); if (!positions && !offsets) continue; if (positions) { postings.POSITIONS = new int[postings.FREQ]; if (payloads) { postings.PAYLOADS = new BytesRef[postings.FREQ]; } } if (offsets) { postings.START_OFFSETS = new int[postings.FREQ]; postings.END_OFFSETS = new int[postings.FREQ]; } for (var k = 0; k < postings.FREQ; k++) { if (positions) { ReadLine(); Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.POSITION)); postings.POSITIONS[k] = ParseIntAt(SimpleTextTermVectorsWriter.POSITION.Length); if (payloads) { ReadLine(); Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.PAYLOAD)); if (_scratch.Length - SimpleTextTermVectorsWriter.PAYLOAD.Length == 0) { postings.PAYLOADS[k] = null; } else { var payloadBytes = new byte[_scratch.Length - SimpleTextTermVectorsWriter.PAYLOAD.Length]; Array.Copy(_scratch.Bytes, _scratch.Offset + SimpleTextTermVectorsWriter.PAYLOAD.Length, payloadBytes, 0, payloadBytes.Length); postings.PAYLOADS[k] = new BytesRef(payloadBytes); } } } if (!offsets) continue; ReadLine(); Debug.Assert(StringHelper.StartsWith(_scratch, 
SimpleTextTermVectorsWriter.STARTOFFSET)); postings.START_OFFSETS[k] = ParseIntAt(SimpleTextTermVectorsWriter.STARTOFFSET.Length); ReadLine(); Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.ENDOFFSET)); postings.END_OFFSETS[k] = ParseIntAt(SimpleTextTermVectorsWriter.ENDOFFSET.Length); } } } return new SimpleTVFields(this, fields); }
/// <summary> /// Builds an <seealso cref="SynonymMap"/> and returns it. /// </summary> public virtual SynonymMap Build() { ByteSequenceOutputs outputs = ByteSequenceOutputs.Singleton; // TODO: are we using the best sharing options? var builder = new Builder <BytesRef>(FST.INPUT_TYPE.BYTE4, outputs); BytesRef scratch = new BytesRef(64); ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput(); HashSet <int?> dedupSet; if (dedup) { dedupSet = new HashSet <int?>(); } else { dedupSet = null; } var spare = new byte[5]; IEnumerable <CharsRef> keys = workingSet.Keys; CharsRef[] sortedKeys = keys.ToArray(); #pragma warning disable 612, 618 System.Array.Sort(sortedKeys, CharsRef.UTF16SortedAsUTF8Comparer); #pragma warning restore 612, 618 IntsRef scratchIntsRef = new IntsRef(); //System.out.println("fmap.build"); for (int keyIdx = 0; keyIdx < sortedKeys.Length; keyIdx++) { CharsRef input = sortedKeys[keyIdx]; MapEntry output = workingSet[input]; int numEntries = output.ords.Count; // output size, assume the worst case int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry scratch.Grow(estimatedSize); scratchOutput.Reset(scratch.Bytes, scratch.Offset, scratch.Bytes.Length); Debug.Assert(scratch.Offset == 0); // now write our output data: int count = 0; for (int i = 0; i < numEntries; i++) { if (dedupSet != null) { // box once int?ent = output.ords[i]; if (dedupSet.Contains(ent)) { continue; } dedupSet.Add(ent); } scratchOutput.WriteVInt(output.ords[i]); count++; } int pos = scratchOutput.Position; scratchOutput.WriteVInt(count << 1 | (output.includeOrig ? 0 : 1)); int pos2 = scratchOutput.Position; int vIntLen = pos2 - pos; // Move the count + includeOrig to the front of the byte[]: Array.Copy(scratch.Bytes, pos, spare, 0, vIntLen); Array.Copy(scratch.Bytes, 0, scratch.Bytes, vIntLen, pos); Array.Copy(spare, 0, scratch.Bytes, 0, vIntLen); if (dedupSet != null) { dedupSet.Clear(); } scratch.Length = scratchOutput.Position - scratch.Offset; //System.out.println(" add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count); builder.Add(Lucene.Net.Util.Fst.Util.ToUTF32(input.ToString(), scratchIntsRef), BytesRef.DeepCopyOf(scratch)); } FST <BytesRef> fst = builder.Finish(); return(new SynonymMap(fst, words, maxHorizontalContext)); }
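// A hypothetical usage sketch of Build() above (assumes the SynonymMap.Builder API from
// Lucene.Net.Analysis.Synonym; the synonym pair is made up for illustration):
using Lucene.Net.Analysis.Synonym;
using Lucene.Net.Util;

var builder = new SynonymMap.Builder(true); // dedup: true matches the dedupSet handling above
builder.Add(new CharsRef("sea"), new CharsRef("ocean"), true); // true = keep the original token
SynonymMap map = builder.Build(); // runs the FST construction shown above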
/// <summary> /// Compare the fields of the terms first, and if not equals return from /// compare. If equal compare terms. /// </summary> /// <param name="term"> /// the term to compare. </param> /// <param name="termIndex"> /// the position of the term in the input to compare </param> /// <param name="input"> /// the input buffer. </param> /// <returns> int. </returns> /// <exception cref="IOException"> If there is a low-level I/O error. </exception> private int CompareTo(Term term, int termIndex, PagedBytesDataInput input, BytesRef reuse) { // if term field does not equal mid's field index, then compare fields // else if they are equal, compare term's string values... int c = CompareField(term, termIndex, input); if (c == 0) { reuse.Length = input.ReadVInt(); reuse.Grow(reuse.Length); input.ReadBytes(reuse.Bytes, 0, reuse.Length); return Comparator.Compare(term.Bytes(), reuse); } return c; }
/// <summary> /// Retrieve suggestions. /// </summary> public virtual IList <LookupResult> DoLookup(string key, IEnumerable <BytesRef> contexts, int num) { if (contexts != null) { throw new System.ArgumentException("this suggester doesn't support contexts"); } TokenStream ts = queryAnalyzer.GetTokenStream("", key.ToString()); try { ITermToBytesRefAttribute termBytesAtt = ts.AddAttribute <ITermToBytesRefAttribute>(); IOffsetAttribute offsetAtt = ts.AddAttribute <IOffsetAttribute>(); IPositionLengthAttribute posLenAtt = ts.AddAttribute <IPositionLengthAttribute>(); IPositionIncrementAttribute posIncAtt = ts.AddAttribute <IPositionIncrementAttribute>(); ts.Reset(); var lastTokens = new BytesRef[grams]; //System.out.println("lookup: key='" + key + "'"); // Run full analysis, but save only the // last 1gram, last 2gram, etc.: BytesRef tokenBytes = termBytesAtt.BytesRef; int maxEndOffset = -1; bool sawRealToken = false; while (ts.IncrementToken()) { termBytesAtt.FillBytesRef(); sawRealToken |= tokenBytes.Length > 0; // TODO: this is somewhat iffy; today, ShingleFilter // sets posLen to the gram count; maybe we should make // a separate dedicated att for this? int gramCount = posLenAtt.PositionLength; Debug.Assert(gramCount <= grams); // Safety: make sure the recalculated count "agrees": if (CountGrams(tokenBytes) != gramCount) { throw new System.ArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + CountGrams(tokenBytes)); } maxEndOffset = Math.Max(maxEndOffset, offsetAtt.EndOffset); lastTokens[gramCount - 1] = BytesRef.DeepCopyOf(tokenBytes); } ts.End(); if (!sawRealToken) { throw new System.ArgumentException("no tokens produced by analyzer, or the only tokens were empty strings"); } // Carefully fill last tokens with _ tokens; // ShingleFilter appraently won't emit "only hole" // tokens: int endPosInc = posIncAtt.PositionIncrement; // Note this will also be true if input is the empty // string (in which case we saw no tokens and // maxEndOffset is still -1), which in fact works out OK // because we fill the unigram with an empty BytesRef // below: bool lastTokenEnded = offsetAtt.EndOffset > maxEndOffset || endPosInc > 0; //System.out.println("maxEndOffset=" + maxEndOffset + " vs " + offsetAtt.EndOffset); if (lastTokenEnded) { //System.out.println(" lastTokenEnded"); // If user hit space after the last token, then // "upgrade" all tokens. 
This way "foo " will suggest // all bigrams starting w/ foo, and not any unigrams // starting with "foo": for (int i = grams - 1; i > 0; i--) { BytesRef token = lastTokens[i - 1]; if (token == null) { continue; } token.Grow(token.Length + 1); token.Bytes[token.Length] = separator; token.Length++; lastTokens[i] = token; } lastTokens[0] = new BytesRef(); } var arc = new FST.Arc <long?>(); var bytesReader = fst.GetBytesReader(); // Try highest order models first, and if they return // results, return that; else, fallback: double backoff = 1.0; List <LookupResult> results = new List <LookupResult>(num); // We only add a given suffix once, from the highest // order model that saw it; for subsequent lower order // models we skip it: var seen = new HashSet <BytesRef>(); for (int gram = grams - 1; gram >= 0; gram--) { BytesRef token = lastTokens[gram]; // Don't make unigram predictions from empty string: if (token == null || (token.Length == 0 && key.Length > 0)) { // Input didn't have enough tokens: //System.out.println(" gram=" + gram + ": skip: not enough input"); continue; } if (endPosInc > 0 && gram <= endPosInc) { // Skip hole-only predictions; in theory we // shouldn't have to do this, but we'd need to fix // ShingleFilter to produce only-hole tokens: //System.out.println(" break: only holes now"); break; } //System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString()); // TODO: we could add fuzziness here // match the prefix portion exactly //Pair<Long,BytesRef> prefixOutput = null; long?prefixOutput = null; try { prefixOutput = LookupPrefix(fst, bytesReader, token, arc); } catch (IOException bogus) { throw new Exception(bogus.ToString(), bogus); } //System.out.println(" prefixOutput=" + prefixOutput); if (prefixOutput == null) { // This model never saw this prefix, e.g. the // trigram model never saw context "purple mushroom" backoff *= ALPHA; continue; } // TODO: we could do this division at build time, and // bake it into the FST? // Denominator for computing scores from current // model's predictions: long contextCount = totTokens; BytesRef lastTokenFragment = null; for (int i = token.Length - 1; i >= 0; i--) { if (token.Bytes[token.Offset + i] == separator) { BytesRef context = new BytesRef(token.Bytes, token.Offset, i); long? output = Lucene.Net.Util.Fst.Util.Get(fst, Lucene.Net.Util.Fst.Util.ToInt32sRef(context, new Int32sRef())); Debug.Assert(output != null); contextCount = DecodeWeight(output); lastTokenFragment = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1); break; } } BytesRef finalLastToken; if (lastTokenFragment == null) { finalLastToken = BytesRef.DeepCopyOf(token); } else { finalLastToken = BytesRef.DeepCopyOf(lastTokenFragment); } Debug.Assert(finalLastToken.Offset == 0); CharsRef spare = new CharsRef(); // complete top-N Util.Fst.Util.TopResults <long?> completions = null; try { // Because we store multiple models in one FST // (1gram, 2gram, 3gram), we must restrict the // search so that it only considers the current // model. 
For highest order model, this is not // necessary since all completions in the FST // must be from this model, but for lower order // models we have to filter out the higher order // ones: // Must do num+seen.size() for queue depth because we may // reject up to seen.size() paths in acceptResult(): Util.Fst.Util.TopNSearcher <long?> searcher = new TopNSearcherAnonymousInnerClassHelper(this, fst, num, num + seen.Count, weightComparer, seen, finalLastToken); // since this search is initialized with a single start node // it is okay to start with an empty input path here searcher.AddStartPaths(arc, prefixOutput, true, new Int32sRef()); completions = searcher.Search(); Debug.Assert(completions.IsComplete); } catch (IOException bogus) { throw new Exception(bogus.ToString(), bogus); } int prefixLength = token.Length; BytesRef suffix = new BytesRef(8); //System.out.println(" " + completions.length + " completions"); foreach (Util.Fst.Util.Result <long?> completion in completions) { token.Length = prefixLength; // append suffix Util.Fst.Util.ToBytesRef(completion.Input, suffix); token.Append(suffix); //System.out.println(" completion " + token.utf8ToString()); // Skip this path if a higher-order model already // saw/predicted its last token: BytesRef lastToken = token; for (int i = token.Length - 1; i >= 0; i--) { if (token.Bytes[token.Offset + i] == separator) { Debug.Assert(token.Length - i - 1 > 0); lastToken = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1); break; } } if (seen.Contains(lastToken)) { //System.out.println(" skip dup " + lastToken.utf8ToString()); goto nextCompletionContinue; } seen.Add(BytesRef.DeepCopyOf(lastToken)); spare.Grow(token.Length); UnicodeUtil.UTF8toUTF16(token, spare); LookupResult result = new LookupResult(spare.ToString(), // LUCENENET NOTE: We need to calculate this as decimal because when using double it can sometimes // return numbers that are greater than long.MaxValue, which results in a negative long number. (long)(long.MaxValue * (decimal)backoff * ((decimal)DecodeWeight(completion.Output)) / contextCount)); results.Add(result); Debug.Assert(results.Count == seen.Count); //System.out.println(" add result=" + result); nextCompletionContinue :; } backoff *= ALPHA; } results.Sort(new ComparerAnonymousInnerClassHelper(this)); if (results.Count > num) { results.SubList(num, results.Count).Clear(); } return(results); } finally { IOUtils.DisposeWhileHandlingException(ts); } }
// Swap in S, in place of E: internal virtual bool SeekToNonBMP(SegmentTermEnum te, BytesRef term, int pos) { int savLength = term.Length; Debug.Assert(term.Offset == 0); // The 3 bytes starting at downTo make up 1 // unicode character: Debug.Assert(IsHighBMPChar(term.Bytes, pos)); // NOTE: we cannot make this assert, because // AutomatonQuery legitimately sends us malformed UTF8 // (eg the UTF8 bytes with just 0xee) // assert term.length >= pos + 3: "term.length=" + term.length + " pos+3=" + (pos+3) + " byte=" + Integer.toHexString(term.bytes[pos]) + " term=" + term.toString(); // Save the bytes && length, since we need to // restore this if seek "back" finds no matching // terms if (term.Bytes.Length < 4 + pos) { term.Grow(4 + pos); } Scratch[0] = (sbyte)term.Bytes[pos]; Scratch[1] = (sbyte)term.Bytes[pos + 1]; Scratch[2] = (sbyte)term.Bytes[pos + 2]; term.Bytes[pos] = unchecked((byte)0xf0); term.Bytes[pos + 1] = unchecked((byte)0x90); term.Bytes[pos + 2] = unchecked((byte)0x80); term.Bytes[pos + 3] = unchecked((byte)0x80); term.Length = 4 + pos; if (DEBUG_SURROGATES) { Console.WriteLine(" try seek term=" + UnicodeUtil.ToHexString(term.Utf8ToString())); } // Seek "back": OuterInstance.TermsDict.SeekEnum(te, new Term(fieldInfo.Name, term), true); // Test if the term we seek'd to in fact found a // surrogate pair at the same position as the E: Term t2 = te.Term(); // Cannot be null (or move to next field) because at // "worst" it'd seek to the same term we are on now, // unless we are being called from seek if (t2 == null || t2.Field() != InternedFieldName) { return false; } if (DEBUG_SURROGATES) { Console.WriteLine(" got term=" + UnicodeUtil.ToHexString(t2.Text())); } // Now test if prefix is identical and we found // a non-BMP char at the same position: BytesRef b2 = t2.Bytes(); Debug.Assert(b2.Offset == 0); bool matches; if (b2.Length >= term.Length && IsNonBMPChar(b2.Bytes, pos)) { matches = true; for (int i = 0; i < pos; i++) { if (term.Bytes[i] != b2.Bytes[i]) { matches = false; break; } } } else { matches = false; } // Restore term: term.Length = savLength; term.Bytes[pos] = (byte)Scratch[0]; term.Bytes[pos + 1] = (byte)Scratch[1]; term.Bytes[pos + 2] = (byte)Scratch[2]; return matches; }
private void LoadTerms() { var posIntOutputs = PositiveInt32Outputs.Singleton; var outputsInner = new PairOutputs <long?, long?>(posIntOutputs, posIntOutputs); var outputs = new PairOutputs <long?, PairOutputs <long?, long?> .Pair>(posIntOutputs, outputsInner); // honestly, wtf kind of generic mess is this. var b = new Builder <PairOutputs <long?, PairOutputs <long?, long?> .Pair> .Pair>(FST.INPUT_TYPE.BYTE1, outputs); var input = (IndexInput)_outerInstance._input.Clone(); input.Seek(_termsStart); var lastTerm = new BytesRef(10); long lastDocsStart = -1; int docFreq = 0; long totalTermFreq = 0; var visitedDocs = new FixedBitSet(_maxDoc); var scratchIntsRef = new Int32sRef(); while (true) { SimpleTextUtil.ReadLine(input, _scratch); if (_scratch.Equals(SimpleTextFieldsWriter.END) || StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.FIELD)) { if (lastDocsStart != -1) { b.Add(Util.ToInt32sRef(lastTerm, scratchIntsRef), outputs.NewPair(lastDocsStart, outputsInner.NewPair(docFreq, totalTermFreq))); _sumTotalTermFreq += totalTermFreq; } break; } if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.DOC)) { docFreq++; _sumDocFreq++; UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.DOC.Length, _scratch.Length - SimpleTextFieldsWriter.DOC.Length, _scratchUtf16); int docId = ArrayUtil.ParseInt32(_scratchUtf16.Chars, 0, _scratchUtf16.Length); visitedDocs.Set(docId); } else if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.FREQ)) { UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.FREQ.Length, _scratch.Length - SimpleTextFieldsWriter.FREQ.Length, _scratchUtf16); totalTermFreq += ArrayUtil.ParseInt32(_scratchUtf16.Chars, 0, _scratchUtf16.Length); } else if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.TERM)) { if (lastDocsStart != -1) { b.Add(Util.ToInt32sRef(lastTerm, scratchIntsRef), outputs.NewPair(lastDocsStart, outputsInner.NewPair(docFreq, totalTermFreq))); } lastDocsStart = input.GetFilePointer(); int len = _scratch.Length - SimpleTextFieldsWriter.TERM.Length; if (len > lastTerm.Length) { lastTerm.Grow(len); } Array.Copy(_scratch.Bytes, SimpleTextFieldsWriter.TERM.Length, lastTerm.Bytes, 0, len); lastTerm.Length = len; docFreq = 0; _sumTotalTermFreq += totalTermFreq; totalTermFreq = 0; _termCount++; } } _docCount = visitedDocs.Cardinality(); _fst = b.Finish(); }
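For reference, the nested PairOutputs above give each term an output of the form (docsStart, (docFreq, totalTermFreq)). A plain-collection sketch (no FST, hypothetical values) of that mapping; note the FST builder additionally requires terms to be added in sorted order, which the SimpleText file already guarantees:

using System;
using System.Collections.Generic;

// Sketch of what the nested pair outputs encode per term: the file pointer of its
// DOC lines plus its (docFreq, totalTermFreq) statistics.
public static class TermStatsSketch
{
    public static void Main()
    {
        var terms = new SortedDictionary<string, (long DocsStart, int DocFreq, long TotalTermFreq)>(StringComparer.Ordinal)
        {
            ["apple"] = (DocsStart: 128, DocFreq: 3, TotalTermFreq: 7),
            ["banana"] = (DocsStart: 512, DocFreq: 1, TotalTermFreq: 1),
        };
        foreach (var kv in terms)
            Console.WriteLine($"{kv.Key} -> start={kv.Value.DocsStart} df={kv.Value.DocFreq} ttf={kv.Value.TotalTermFreq}");
    }
}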
public override int NextPosition() { int pos; if (_readPositions) { SimpleTextUtil.ReadLine(_in, _scratch); if (Debugging.AssertsEnabled) { Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.POS), () => "got line=" + _scratch.Utf8ToString()); } UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.POS.Length, _scratch.Length - SimpleTextFieldsWriter.POS.Length, _scratchUtf162); pos = ArrayUtil.ParseInt32(_scratchUtf162.Chars, 0, _scratchUtf162.Length); } else { pos = -1; } if (_readOffsets) { SimpleTextUtil.ReadLine(_in, _scratch); if (Debugging.AssertsEnabled) { Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.START_OFFSET), () => "got line=" + _scratch.Utf8ToString()); } UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.START_OFFSET.Length, _scratch.Length - SimpleTextFieldsWriter.START_OFFSET.Length, _scratchUtf162); _startOffset = ArrayUtil.ParseInt32(_scratchUtf162.Chars, 0, _scratchUtf162.Length); SimpleTextUtil.ReadLine(_in, _scratch); if (Debugging.AssertsEnabled) { Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.END_OFFSET), () => "got line=" + _scratch.Utf8ToString()); } UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.END_OFFSET.Length, _scratch.Length - SimpleTextFieldsWriter.END_OFFSET.Length, _scratchUtf162); _endOffset = ArrayUtil.ParseInt32(_scratchUtf162.Chars, 0, _scratchUtf162.Length); } long fp = _in.GetFilePointer(); SimpleTextUtil.ReadLine(_in, _scratch); if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.PAYLOAD)) { int len = _scratch.Length - SimpleTextFieldsWriter.PAYLOAD.Length; if (_scratch2.Bytes.Length < len) { _scratch2.Grow(len); } Array.Copy(_scratch.Bytes, SimpleTextFieldsWriter.PAYLOAD.Length, _scratch2.Bytes, 0, len); _scratch2.Length = len; _payload = _scratch2; } else { _payload = null; _in.Seek(fp); } return(pos); }
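The payload handling above peeks one line ahead and seeks back when the line is not a PAYLOAD entry. A standalone sketch of that peek-and-rewind pattern over a plain Stream (the prefixes and helper names here are illustrative, not the SimpleText constants):

using System;
using System.IO;
using System.Text;

public static class PeekLineSketch
{
    public static void Main()
    {
        using var stream = new MemoryStream(Encoding.ASCII.GetBytes("payload abc\ndoc 7\n"));

        Console.WriteLine(ReadLine(stream));           // "payload abc": this line is ours, consume it

        long fp = stream.Position;                     // remember where the next line starts
        string line = ReadLine(stream);
        if (!line.StartsWith("payload ", StringComparison.Ordinal))
            stream.Position = fp;                      // not ours: seek back and leave it for the caller

        Console.WriteLine(stream.Position == fp);      // True: "doc 7" was left unconsumed
    }

    // Byte-oriented ReadLine so Stream.Position stays meaningful (StreamReader would buffer ahead).
    private static string ReadLine(Stream s)
    {
        var sb = new StringBuilder();
        int b;
        while ((b = s.ReadByte()) != -1 && b != '\n') sb.Append((char)b);
        return sb.ToString();
    }
}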
//private static final boolean DEBUG = BlockTreeTermsWriter.DEBUG; private BytesRef AddTail(int state, BytesRef term, int idx, int leadLabel) { // Find biggest transition that's < label // TODO: use binary search here Transition maxTransition = null; foreach (Transition transition in sortedTransitions[state]) { if (transition.min < leadLabel) { maxTransition = transition; } } if (Debugging.AssertsEnabled) { Debugging.Assert(maxTransition != null); } // Append floorLabel int floorLabel; if (maxTransition.max > leadLabel - 1) { floorLabel = leadLabel - 1; } else { floorLabel = maxTransition.max; } if (idx >= term.Bytes.Length) { term.Grow(1 + idx); } //if (DEBUG) System.out.println(" add floorLabel=" + (char) floorLabel + " idx=" + idx); term.Bytes[idx] = (byte)floorLabel; state = maxTransition.to.Number; idx++; // Push down to last accept state while (true) { Transition[] transitions = sortedTransitions[state]; if (transitions.Length == 0) { if (Debugging.AssertsEnabled) { Debugging.Assert(RunAutomaton.IsAccept(state)); } term.Length = idx; //if (DEBUG) System.out.println(" return " + term.utf8ToString()); return(term); } else { // We are pushing "top" -- so get last label of // last transition: if (Debugging.AssertsEnabled) { Debugging.Assert(transitions.Length != 0); } Transition lastTransition = transitions[transitions.Length - 1]; if (idx >= term.Bytes.Length) { term.Grow(1 + idx); } //if (DEBUG) System.out.println(" push maxLabel=" + (char) lastTransition.max + " idx=" + idx); term.Bytes[idx] = (byte)lastTransition.max; state = lastTransition.to.Number; idx++; } } }
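The TODO above suggests replacing the linear scan for the floor transition with a binary search. A standalone sketch of that search over transition minimums sorted ascending (array names are hypothetical):

using System;

// Among transitions sorted by their min label, find the last one whose min is
// strictly less than the lead label; the linear scan in AddTail does the same in O(n).
public static class FloorTransitionSketch
{
    // Returns the index of the greatest value strictly less than target, or -1.
    public static int FloorIndex(int[] sortedMins, int target)
    {
        int lo = 0, hi = sortedMins.Length - 1, result = -1;
        while (lo <= hi)
        {
            int mid = lo + ((hi - lo) >> 1);
            if (sortedMins[mid] < target) { result = mid; lo = mid + 1; }
            else hi = mid - 1;
        }
        return result;
    }

    public static void Main()
    {
        int[] mins = { 0, 10, 50, 97, 120 };
        Console.WriteLine(FloorIndex(mins, 98));   // 3  (min=97 is the floor transition)
        Console.WriteLine(FloorIndex(mins, 0));    // -1 (nothing below the lead label)
    }
}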
/// <summary> /// Returns the <i>n'th</i> element of this <seealso cref="BytesRefArray"/> </summary> /// <param name="spare"> a spare <seealso cref="BytesRef"/> instance </param> /// <param name="index"> the element's index to retrieve </param> /// <returns> the <i>n'th</i> element of this <seealso cref="BytesRefArray"/> </returns> public BytesRef Get(BytesRef spare, int index) { if (LastElement > index) { int offset = Offsets[index]; int length = index == LastElement - 1 ? CurrentOffset - offset : Offsets[index + 1] - offset; Debug.Assert(spare.Offset == 0); spare.Grow(length); spare.Length = length; Pool.ReadBytes(offset, spare.Bytes, spare.Offset, spare.Length); return spare; } throw new System.IndexOutOfRangeException("index " + index + " must be less than the size: " + LastElement); }
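The length computation above derives each element's size from the gap between adjacent offsets, with the last element bounded by the current append offset. A plain-array sketch of the same idea:

using System;
using System.Text;

// Sketch of offset-table slicing: every element's length is the distance to the
// next recorded offset, and the last element ends at the current append offset.
public static class OffsetsSketch
{
    public static void Main()
    {
        byte[] pool = Encoding.ASCII.GetBytes("foobarbazqux");
        int[] offsets = { 0, 3, 6, 9 };    // start of each appended element
        int currentOffset = pool.Length;   // where the next element would start
        int lastElement = offsets.Length;

        for (int index = 0; index < lastElement; index++)
        {
            int offset = offsets[index];
            int length = index == lastElement - 1 ? currentOffset - offset : offsets[index + 1] - offset;
            Console.WriteLine(Encoding.ASCII.GetString(pool, offset, length));  // foo, bar, baz, qux
        }
    }
}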
/* Walk through all unique text tokens (Posting * instances) found in this field and serialize them * into a single RAM segment. */ internal void Flush(string fieldName, FieldsConsumer consumer, SegmentWriteState state) { if (!fieldInfo.Indexed) { return; // nothing to flush, don't bother the codec with the unindexed field } TermsConsumer termsConsumer = consumer.AddField(fieldInfo); IComparer <BytesRef> termComp = termsConsumer.Comparator; // CONFUSING: this.indexOptions holds the index options // that were current when we first saw this field. But // it's possible this has changed, eg when other // documents are indexed that cause a "downgrade" of the // IndexOptions. So we must decode the in-RAM buffer // according to this.indexOptions, but then write the // new segment to the directory according to // currentFieldIndexOptions: FieldInfo.IndexOptions?currentFieldIndexOptions = fieldInfo.FieldIndexOptions; Debug.Assert(currentFieldIndexOptions != null); bool writeTermFreq = currentFieldIndexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS; bool writePositions = currentFieldIndexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS; bool writeOffsets = currentFieldIndexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS; bool readTermFreq = this.HasFreq; bool readPositions = this.HasProx; bool readOffsets = this.HasOffsets; //System.out.println("flush readTF=" + readTermFreq + " readPos=" + readPositions + " readOffs=" + readOffsets); // Make sure FieldInfo.update is working correctly!: Debug.Assert(!writeTermFreq || readTermFreq); Debug.Assert(!writePositions || readPositions); Debug.Assert(!writeOffsets || readOffsets); Debug.Assert(!writeOffsets || writePositions); IDictionary <Term, int?> segDeletes; if (state.SegUpdates != null && state.SegUpdates.Terms.Count > 0) { segDeletes = state.SegUpdates.Terms; } else { segDeletes = null; } int[] termIDs = TermsHashPerField.SortPostings(termComp); int numTerms = TermsHashPerField.BytesHash.Size(); BytesRef text = new BytesRef(); FreqProxPostingsArray postings = (FreqProxPostingsArray)TermsHashPerField.PostingsArray; ByteSliceReader freq = new ByteSliceReader(); ByteSliceReader prox = new ByteSliceReader(); FixedBitSet visitedDocs = new FixedBitSet(state.SegmentInfo.DocCount); long sumTotalTermFreq = 0; long sumDocFreq = 0; Term protoTerm = new Term(fieldName); for (int i = 0; i < numTerms; i++) { int termID = termIDs[i]; // Get BytesRef int textStart = postings.TextStarts[termID]; TermsHashPerField.BytePool.SetBytesRef(text, textStart); TermsHashPerField.InitReader(freq, termID, 0); if (readPositions || readOffsets) { TermsHashPerField.InitReader(prox, termID, 1); } // TODO: really TermsHashPerField should take over most // of this loop, including merge sort of terms from // multiple threads and interacting with the // TermsConsumer, only calling out to us (passing us the // DocsConsumer) to handle delivery of docs/positions PostingsConsumer postingsConsumer = termsConsumer.StartTerm(text); int?delDocLimit; if (segDeletes != null) { protoTerm.Bytes_Renamed = text; int?docIDUpto; segDeletes.TryGetValue(protoTerm, out docIDUpto); if (docIDUpto != null) { delDocLimit = docIDUpto; } else { delDocLimit = 0; } } else { delDocLimit = 0; } // Now termStates has numToMerge FieldMergeStates // which all share the same term. Now we must // interleave the docID streams. 
int docFreq = 0; long totalTermFreq = 0; int docID = 0; while (true) { //System.out.println(" cycle"); int termFreq; if (freq.Eof()) { if (postings.LastDocCodes[termID] != -1) { // Return last doc docID = postings.LastDocIDs[termID]; if (readTermFreq) { termFreq = postings.TermFreqs[termID]; } else { termFreq = -1; } postings.LastDocCodes[termID] = -1; } else { // EOF break; } } else { int code = freq.ReadVInt(); if (!readTermFreq) { docID += code; termFreq = -1; } else { docID += (int)((uint)code >> 1); if ((code & 1) != 0) { termFreq = 1; } else { termFreq = freq.ReadVInt(); } } Debug.Assert(docID != postings.LastDocIDs[termID]); } docFreq++; Debug.Assert(docID < state.SegmentInfo.DocCount, "doc=" + docID + " maxDoc=" + state.SegmentInfo.DocCount); // NOTE: we could check here if the docID was // deleted, and skip it. However, this is somewhat // dangerous because it can yield non-deterministic // behavior since we may see the docID before we see // the term that caused it to be deleted. this // would mean some (but not all) of its postings may // make it into the index, which'd alter the docFreq // for those terms. We could fix this by doing two // passes, ie first sweep marks all del docs, and // 2nd sweep does the real flush, but I suspect // that'd add too much time to flush. visitedDocs.Set(docID); postingsConsumer.StartDoc(docID, writeTermFreq ? termFreq : -1); if (docID < delDocLimit) { // Mark it deleted. TODO: we could also skip // writing its postings; this would be // deterministic (just for this Term's docs). // TODO: can we do this reach-around in a cleaner way???? if (state.LiveDocs == null) { state.LiveDocs = DocState.DocWriter.Codec.LiveDocsFormat().NewLiveDocs(state.SegmentInfo.DocCount); } if (state.LiveDocs.Get(docID)) { state.DelCountOnFlush++; state.LiveDocs.Clear(docID); } } totalTermFreq += termFreq; // Carefully copy over the prox + payload info, // changing the format to match Lucene's segment // format. if (readPositions || readOffsets) { // we did record positions (& maybe payload) and/or offsets int position = 0; int offset = 0; for (int j = 0; j < termFreq; j++) { BytesRef thisPayload; if (readPositions) { int code = prox.ReadVInt(); position += (int)((uint)code >> 1); if ((code & 1) != 0) { // this position has a payload int payloadLength = prox.ReadVInt(); if (Payload == null) { Payload = new BytesRef(); Payload.Bytes = new sbyte[payloadLength]; } else if (Payload.Bytes.Length < payloadLength) { Payload.Grow(payloadLength); } prox.ReadBytes(Payload.Bytes, 0, payloadLength); Payload.Length = payloadLength; thisPayload = Payload; } else { thisPayload = null; } if (readOffsets) { int startOffset = offset + prox.ReadVInt(); int endOffset = startOffset + prox.ReadVInt(); if (writePositions) { if (writeOffsets) { Debug.Assert(startOffset >= 0 && endOffset >= startOffset, "startOffset=" + startOffset + ",endOffset=" + endOffset + ",offset=" + offset); postingsConsumer.AddPosition(position, thisPayload, startOffset, endOffset); } else { postingsConsumer.AddPosition(position, thisPayload, -1, -1); } } offset = startOffset; } else if (writePositions) { postingsConsumer.AddPosition(position, thisPayload, -1, -1); } } } } postingsConsumer.FinishDoc(); } termsConsumer.FinishTerm(text, new TermStats(docFreq, writeTermFreq ? totalTermFreq : -1)); sumTotalTermFreq += totalTermFreq; sumDocFreq += docFreq; } termsConsumer.Finish(writeTermFreq ? sumTotalTermFreq : -1, sumDocFreq, visitedDocs.Cardinality()); }
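The freq decoding in this loop reads a code whose low bit means "frequency is exactly 1" and whose remaining bits hold the doc-ID delta. A standalone round-trip sketch of that encoding (plain int lists instead of the byte-slice VInt streams the real code uses):

using System;
using System.Collections.Generic;

// Each posting is a code with the doc delta in the high bits; the low bit set
// means freq == 1, otherwise an explicit freq value follows.
public static class DocDeltaFreqSketch
{
    public static List<int> Encode((int DocId, int Freq)[] postings)
    {
        var output = new List<int>();
        int lastDocId = 0;
        foreach (var (docId, freq) in postings)
        {
            int delta = docId - lastDocId;
            lastDocId = docId;
            if (freq == 1)
                output.Add((delta << 1) | 1);   // low bit set: freq is implicitly 1
            else
            {
                output.Add(delta << 1);         // low bit clear: explicit freq follows
                output.Add(freq);
            }
        }
        return output;
    }

    public static void Main()
    {
        var codes = Encode(new[] { (DocId: 5, Freq: 1), (DocId: 9, Freq: 3) });
        int i = 0, docId = 0;
        while (i < codes.Count)
        {
            int code = codes[i++];
            docId += (int)((uint)code >> 1);
            int freq = (code & 1) != 0 ? 1 : codes[i++];
            Console.WriteLine($"doc={docId} freq={freq}");   // doc=5 freq=1, then doc=9 freq=3
        }
    }
}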
/// <remarks> /// TODO: we may want an alternate mode here which is /// "if you are about to return NOT_FOUND I won't use /// the terms data from that"; eg FuzzyTermsEnum will /// (usually) just immediately call seek again if we /// return NOT_FOUND so it's a waste for us to fill in /// the term that was actually NOT_FOUND /// </remarks> public override SeekStatus SeekCeil(BytesRef target) { if (indexEnum == null) { throw new InvalidOperationException("terms index was not loaded"); } //System.out.println("BTR.seek seg=" + segment + " target=" + fieldInfo.name + ":" + target.utf8ToString() + " " + target + " current=" + term().utf8ToString() + " " + term() + " indexIsCurrent=" + indexIsCurrent + " didIndexNext=" + didIndexNext + " seekPending=" + seekPending + " divisor=" + indexReader.getDivisor() + " this=" + this); if (didIndexNext) { if (nextIndexTerm == null) { //System.out.println(" nextIndexTerm=null"); } else { //System.out.println(" nextIndexTerm=" + nextIndexTerm.utf8ToString()); } } bool doSeek = true; // See if we can avoid seeking, because target term // is after current term but before next index term: if (indexIsCurrent) { int cmp = BytesRef.UTF8SortedAsUnicodeComparer.Compare(term, target); if (cmp == 0) { // Already at the requested term return(SeekStatus.FOUND); } else if (cmp < 0) { // Target term is after current term if (!didIndexNext) { if (indexEnum.Next() == -1) { nextIndexTerm = null; } else { nextIndexTerm = indexEnum.Term; } //System.out.println(" now do index next() nextIndexTerm=" + (nextIndexTerm == null ? "null" : nextIndexTerm.utf8ToString())); didIndexNext = true; } if (nextIndexTerm == null || BytesRef.UTF8SortedAsUnicodeComparer.Compare(target, nextIndexTerm) < 0) { // Optimization: requested term is within the // same term block we are now in; skip seeking // (but do scanning): doSeek = false; //System.out.println(" skip seek: nextIndexTerm=" + (nextIndexTerm == null ? "null" : nextIndexTerm.utf8ToString())); } } } if (doSeek) { //System.out.println(" seek"); // Ask terms index to find biggest indexed term (= // first term in a block) that's <= our text: input.Seek(indexEnum.Seek(target)); bool result = NextBlock(); // Block must exist since, at least, the indexed term // is in the block: if (Debugging.AssertsEnabled) { Debugging.Assert(result); } indexIsCurrent = true; didIndexNext = false; blocksSinceSeek = 0; if (doOrd) { state.Ord = indexEnum.Ord - 1; } term.CopyBytes(indexEnum.Term); //System.out.println(" seek: term=" + term.utf8ToString()); } else { //System.out.println(" skip seek"); if (state.TermBlockOrd == blockTermCount && !NextBlock()) { indexIsCurrent = false; return(SeekStatus.END); } } seekPending = false; int common = 0; // Scan within block. We could do this by calling // _next() and testing the resulting term, but this // is wasteful. Instead, we first confirm the // target matches the common prefix of this block, // and then we scan the term bytes directly from the // termSuffixesreader's byte[], saving a copy into // the BytesRef term per term. Only when we return // do we then copy the bytes into the term. while (true) { // First, see if target term matches common prefix // in this block: if (common < termBlockPrefix) { int cmp = (term.Bytes[common] & 0xFF) - (target.Bytes[target.Offset + common] & 0xFF); if (cmp < 0) { // TODO: maybe we should store common prefix // in block header? 
(instead of relying on // last term of previous block) // Target's prefix is after the common block // prefix, so term cannot be in this block // but it could be in next block. We // must scan to end-of-block to set common // prefix for next block: if (state.TermBlockOrd < blockTermCount) { while (state.TermBlockOrd < blockTermCount - 1) { state.TermBlockOrd++; state.Ord++; termSuffixesReader.SkipBytes(termSuffixesReader.ReadVInt32()); } int suffix = termSuffixesReader.ReadVInt32(); term.Length = termBlockPrefix + suffix; if (term.Bytes.Length < term.Length) { term.Grow(term.Length); } termSuffixesReader.ReadBytes(term.Bytes, termBlockPrefix, suffix); } state.Ord++; if (!NextBlock()) { indexIsCurrent = false; return(SeekStatus.END); } common = 0; } else if (cmp > 0) { // Target's prefix is before the common prefix // of this block, so we position to start of // block and return NOT_FOUND: if (Debugging.AssertsEnabled) { Debugging.Assert(state.TermBlockOrd == 0); } int suffix = termSuffixesReader.ReadVInt32(); term.Length = termBlockPrefix + suffix; if (term.Bytes.Length < term.Length) { term.Grow(term.Length); } termSuffixesReader.ReadBytes(term.Bytes, termBlockPrefix, suffix); return(SeekStatus.NOT_FOUND); } else { common++; } continue; } // Test every term in this block while (true) { state.TermBlockOrd++; state.Ord++; int suffix = termSuffixesReader.ReadVInt32(); // We know the prefix matches, so just compare the new suffix: int termLen = termBlockPrefix + suffix; int bytePos = termSuffixesReader.Position; bool next = false; int limit = target.Offset + (termLen < target.Length ? termLen : target.Length); int targetPos = target.Offset + termBlockPrefix; while (targetPos < limit) { int cmp = (termSuffixes[bytePos++] & 0xFF) - (target.Bytes[targetPos++] & 0xFF); if (cmp < 0) { // Current term is still before the target; // keep scanning next = true; break; } else if (cmp > 0) { // Done! Current term is after target. Stop // here, fill in real term, return NOT_FOUND. term.Length = termBlockPrefix + suffix; if (term.Bytes.Length < term.Length) { term.Grow(term.Length); } termSuffixesReader.ReadBytes(term.Bytes, termBlockPrefix, suffix); //System.out.println(" NOT_FOUND"); return(SeekStatus.NOT_FOUND); } } if (!next && target.Length <= termLen) { term.Length = termBlockPrefix + suffix; if (term.Bytes.Length < term.Length) { term.Grow(term.Length); } termSuffixesReader.ReadBytes(term.Bytes, termBlockPrefix, suffix); if (target.Length == termLen) { // Done! Exact match. Stop here, fill in // real term, return FOUND. //System.out.println(" FOUND"); return(SeekStatus.FOUND); } else { //System.out.println(" NOT_FOUND"); return(SeekStatus.NOT_FOUND); } } if (state.TermBlockOrd == blockTermCount) { // Must pre-fill term for next block's common prefix term.Length = termBlockPrefix + suffix; if (term.Bytes.Length < term.Length) { term.Grow(term.Length); } termSuffixesReader.ReadBytes(term.Bytes, termBlockPrefix, suffix); break; } else { termSuffixesReader.SkipBytes(suffix); } } // The purpose of the terms dict index is to seek // the enum to the closest index term before the // term we are looking for. So, we should never // cross another index term (besides the first // one) while we are scanning: if (Debugging.AssertsEnabled) { Debugging.Assert(indexIsCurrent); } if (!NextBlock()) { //System.out.println(" END"); indexIsCurrent = false; return(SeekStatus.END); } common = 0; } }
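The scan above first checks the target against the block's shared term prefix and then compares only suffix bytes per term. A much-simplified standalone sketch of that two-step comparison (the literals and byte arrays here are illustrative; this is not the BlockTerms file format):

using System;
using System.Text;

// All terms in a block share a common prefix, so the target is compared against
// that prefix once and then only per-term suffix bytes are compared.
public static class PrefixBlockScanSketch
{
    public static void Main()
    {
        byte[] prefix = Encoding.UTF8.GetBytes("app");
        byte[][] suffixes = { Encoding.UTF8.GetBytes("le"), Encoding.UTF8.GetBytes("ly"), Encoding.UTF8.GetBytes("rove") };
        byte[] target = Encoding.UTF8.GetBytes("apply");

        // 1) Compare the target against the shared prefix.
        for (int i = 0; i < prefix.Length; i++)
        {
            int cmp = prefix[i] - (i < target.Length ? target[i] : -1);
            if (cmp != 0) { Console.WriteLine(cmp < 0 ? "target is after this block" : "target is before this block"); return; }
        }

        // 2) Prefix matches: scan terms, comparing only bytes past the prefix.
        foreach (byte[] suffix in suffixes)
        {
            int termLen = prefix.Length + suffix.Length;
            int cmp = 0;
            for (int i = prefix.Length; i < Math.Min(termLen, target.Length) && cmp == 0; i++)
                cmp = suffix[i - prefix.Length] - target[i];
            if (cmp == 0) cmp = termLen - target.Length;            // shared bytes equal: shorter term sorts first
            if (cmp == 0) { Console.WriteLine("FOUND " + Encoding.UTF8.GetString(prefix) + Encoding.UTF8.GetString(suffix)); return; }
            if (cmp > 0) { Console.WriteLine("NOT_FOUND (current term is already past the target)"); return; }
        }
        Console.WriteLine("NOT_FOUND (target is after the last term in the block)");
    }
}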
public override void Build(IInputEnumerator enumerator) { if (enumerator.HasContexts) { throw new ArgumentException("this suggester doesn't support contexts"); } string prefix = this.GetType().Name; var directory = OfflineSorter.DefaultTempDir(); var tempInput = FileSupport.CreateTempFile(prefix, ".input", directory); var tempSorted = FileSupport.CreateTempFile(prefix, ".sorted", directory); hasPayloads = enumerator.HasPayloads; var writer = new OfflineSorter.ByteSequencesWriter(tempInput); OfflineSorter.ByteSequencesReader reader = null; var scratch = new BytesRef(); TokenStreamToAutomaton ts2a = GetTokenStreamToAutomaton(); bool success = false; count = 0; byte[] buffer = new byte[8]; try { var output = new ByteArrayDataOutput(buffer); BytesRef surfaceForm; while (enumerator.MoveNext()) { surfaceForm = enumerator.Current; ISet <Int32sRef> paths = ToFiniteStrings(surfaceForm, ts2a); maxAnalyzedPathsForOneInput = Math.Max(maxAnalyzedPathsForOneInput, paths.Count); foreach (Int32sRef path in paths) { Util.Fst.Util.ToBytesRef(path, scratch); // length of the analyzed text (FST input) if (scratch.Length > ushort.MaxValue - 2) { throw new ArgumentException("cannot handle analyzed forms > " + (ushort.MaxValue - 2) + " in length (got " + scratch.Length + ")"); } ushort analyzedLength = (ushort)scratch.Length; // compute the required length: // analyzed sequence + weight (4) + surface + analyzedLength (short) int requiredLength = analyzedLength + 4 + surfaceForm.Length + 2; BytesRef payload; if (hasPayloads) { if (surfaceForm.Length > (ushort.MaxValue - 2)) { throw new ArgumentException("cannot handle surface form > " + (ushort.MaxValue - 2) + " in length (got " + surfaceForm.Length + ")"); } payload = enumerator.Payload; // payload + surfaceLength (short) requiredLength += payload.Length + 2; } else { payload = null; } buffer = ArrayUtil.Grow(buffer, requiredLength); output.Reset(buffer); output.WriteInt16((short)analyzedLength); output.WriteBytes(scratch.Bytes, scratch.Offset, scratch.Length); output.WriteInt32(EncodeWeight(enumerator.Weight)); if (hasPayloads) { for (int i = 0; i < surfaceForm.Length; i++) { if (surfaceForm.Bytes[i] == PAYLOAD_SEP) { throw new ArgumentException( "surface form cannot contain unit separator character U+001F; this character is reserved"); } } output.WriteInt16((short)surfaceForm.Length); output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length); output.WriteBytes(payload.Bytes, payload.Offset, payload.Length); } else { output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length); } if (Debugging.AssertsEnabled) { Debugging.Assert(output.Position == requiredLength, () => output.Position + " vs " + requiredLength); } writer.Write(buffer, 0, output.Position); } count++; } writer.Dispose(); // Sort all input/output pairs (required by FST.Builder): (new OfflineSorter(new AnalyzingComparer(hasPayloads))).Sort(tempInput, tempSorted); // Free disk space: tempInput.Delete(); reader = new OfflineSorter.ByteSequencesReader(tempSorted); var outputs = new PairOutputs <long?, BytesRef>(PositiveInt32Outputs.Singleton, ByteSequenceOutputs.Singleton); var builder = new Builder <PairOutputs <long?, BytesRef> .Pair>(FST.INPUT_TYPE.BYTE1, outputs); // Build FST: BytesRef previousAnalyzed = null; BytesRef analyzed = new BytesRef(); BytesRef surface = new BytesRef(); Int32sRef scratchInts = new Int32sRef(); var input = new ByteArrayDataInput(); // Used to remove duplicate surface forms (but we // still index the hightest-weight one). 
We clear // this when we see a new analyzed form, so it cannot // grow unbounded (at most 256 entries): var seenSurfaceForms = new JCG.HashSet <BytesRef>(); var dedup = 0; while (reader.Read(scratch)) { input.Reset(scratch.Bytes, scratch.Offset, scratch.Length); ushort analyzedLength = (ushort)input.ReadInt16(); analyzed.Grow(analyzedLength + 2); input.ReadBytes(analyzed.Bytes, 0, analyzedLength); analyzed.Length = analyzedLength; long cost = input.ReadInt32(); surface.Bytes = scratch.Bytes; if (hasPayloads) { surface.Length = (ushort)input.ReadInt16(); surface.Offset = input.Position; } else { surface.Offset = input.Position; surface.Length = scratch.Length - surface.Offset; } if (previousAnalyzed == null) { previousAnalyzed = new BytesRef(); previousAnalyzed.CopyBytes(analyzed); seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface)); } else if (analyzed.Equals(previousAnalyzed)) { dedup++; if (dedup >= maxSurfaceFormsPerAnalyzedForm) { // More than maxSurfaceFormsPerAnalyzedForm // dups: skip the rest: continue; } if (seenSurfaceForms.Contains(surface)) { continue; } seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface)); } else { dedup = 0; previousAnalyzed.CopyBytes(analyzed); seenSurfaceForms.Clear(); seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface)); } // TODO: I think we can avoid the extra 2 bytes when // there is no dup (dedup==0), but we'd have to fix // the exactFirst logic ... which would be sort of // hairy because we'd need to special case the two // (dup/not dup)... // NOTE: must be byte 0 so we sort before whatever // is next analyzed.Bytes[analyzed.Offset + analyzed.Length] = 0; analyzed.Bytes[analyzed.Offset + analyzed.Length + 1] = (byte)dedup; analyzed.Length += 2; Util.Fst.Util.ToInt32sRef(analyzed, scratchInts); //System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString()); if (!hasPayloads) { builder.Add(scratchInts, outputs.NewPair(cost, BytesRef.DeepCopyOf(surface))); } else { int payloadOffset = input.Position + surface.Length; int payloadLength = scratch.Length - payloadOffset; BytesRef br = new BytesRef(surface.Length + 1 + payloadLength); Array.Copy(surface.Bytes, surface.Offset, br.Bytes, 0, surface.Length); br.Bytes[surface.Length] = PAYLOAD_SEP; Array.Copy(scratch.Bytes, payloadOffset, br.Bytes, surface.Length + 1, payloadLength); br.Length = br.Bytes.Length; builder.Add(scratchInts, outputs.NewPair(cost, br)); } } fst = builder.Finish(); //Util.dotToFile(fst, "/tmp/suggest.dot"); success = true; } finally { if (success) { IOUtils.Dispose(reader, writer); } else { IOUtils.DisposeWhileHandlingException(reader, writer); } tempInput.Delete(); tempSorted.Delete(); } }
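Each input to the offline sort above is packed as analyzed length, analyzed bytes, weight, then (with payloads) surface length, surface bytes, and payload bytes. A standalone sketch of that record layout using BinaryWriter (which writes little-endian, unlike the big-endian ByteArrayDataOutput in the real code; names and values are hypothetical):

using System;
using System.IO;
using System.Text;

// Sketch of the sort-record layout:
// [analyzedLength:short][analyzed bytes][weight:int32][surfaceLength:short][surface][payload]
public static class SortRecordSketch
{
    public static byte[] Pack(byte[] analyzed, int weight, byte[] surface, byte[] payload)
    {
        using var ms = new MemoryStream();
        using (var w = new BinaryWriter(ms, Encoding.UTF8, leaveOpen: true))
        {
            w.Write((short)analyzed.Length);
            w.Write(analyzed);
            w.Write(weight);
            w.Write((short)surface.Length);
            w.Write(surface);
            w.Write(payload);
        }
        return ms.ToArray();
    }

    public static void Main()
    {
        byte[] record = Pack(
            analyzed: Encoding.UTF8.GetBytes("weather tomorrow"),
            weight: 42,
            surface: Encoding.UTF8.GetBytes("Weather tomorrow?"),
            payload: Array.Empty<byte>());
        Console.WriteLine(record.Length);   // 2 + 16 + 4 + 2 + 17 + 0 = 41
    }
}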
public override BytesRef Next() { if (NextTerm >= NumTerms) { return(null); } Term_Renamed.CopyBytes(LastTerm); int start = Tvf.ReadVInt(); int deltaLen = Tvf.ReadVInt(); Term_Renamed.Length = start + deltaLen; Term_Renamed.Grow(Term_Renamed.Length); Tvf.ReadBytes(Term_Renamed.Bytes, start, deltaLen); Freq = Tvf.ReadVInt(); if (StorePayloads) { Positions = new int[Freq]; PayloadOffsets = new int[Freq]; int totalPayloadLength = 0; int pos = 0; for (int posUpto = 0; posUpto < Freq; posUpto++) { int code = Tvf.ReadVInt(); pos += (int)((uint)code >> 1); Positions[posUpto] = pos; if ((code & 1) != 0) { // length change LastPayloadLength = Tvf.ReadVInt(); } PayloadOffsets[posUpto] = totalPayloadLength; totalPayloadLength += LastPayloadLength; Debug.Assert(totalPayloadLength >= 0); } PayloadData = new byte[totalPayloadLength]; Tvf.ReadBytes(PayloadData, 0, PayloadData.Length); } // no payloads else if (StorePositions) { // TODO: we could maybe reuse last array, if we can // somehow be careful about consumer never using two // D&PEnums at once... Positions = new int[Freq]; int pos = 0; for (int posUpto = 0; posUpto < Freq; posUpto++) { pos += Tvf.ReadVInt(); Positions[posUpto] = pos; } } if (StoreOffsets) { StartOffsets = new int[Freq]; EndOffsets = new int[Freq]; int offset = 0; for (int posUpto = 0; posUpto < Freq; posUpto++) { StartOffsets[posUpto] = offset + Tvf.ReadVInt(); offset = EndOffsets[posUpto] = StartOffsets[posUpto] + Tvf.ReadVInt(); } } LastTerm.CopyBytes(Term_Renamed); NextTerm++; return(Term_Renamed); }
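Next() above front-codes terms: each entry stores how many leading bytes it shares with the previous term plus the new suffix, so the reader keeps the last term and only patches its tail. A standalone string-based sketch of that decoding:

using System;
using System.Text;

// Front-coding sketch: each term is (shared prefix length, suffix) relative to the previous term.
public static class FrontCodingSketch
{
    public static void Main()
    {
        // Encoded form of the sorted terms "apple", "apply", "banana".
        (int PrefixLen, string Suffix)[] encoded = { (0, "apple"), (4, "y"), (0, "banana") };

        var last = new StringBuilder();
        foreach (var (prefixLen, suffix) in encoded)
        {
            last.Length = prefixLen;   // keep the shared prefix of the previous term
            last.Append(suffix);       // append the delta bytes
            Console.WriteLine(last);   // apple, apply, banana
        }
    }
}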