public virtual void TestReadAndWrite()
 {
     Counter bytesUsed = Util.Counter.NewCounter();
     ByteBlockPool pool = new ByteBlockPool(new ByteBlockPool.DirectTrackingAllocator(bytesUsed));
     pool.NextBuffer();
     bool reuseFirst = Random().NextBoolean();
     for (int j = 0; j < 2; j++)
     {
         IList<BytesRef> list = new List<BytesRef>();
         int maxLength = AtLeast(500);
         int numValues = AtLeast(100);
         BytesRef @ref = new BytesRef();
         for (int i = 0; i < numValues; i++)
         {
             string value = TestUtil.RandomRealisticUnicodeString(Random(), maxLength);
             list.Add(new BytesRef(value));
             @ref.CopyChars(value);
             pool.Append(@ref);
         }
         // verify
         long position = 0;
         foreach (BytesRef expected in list)
         {
             @ref.Grow(expected.Length);
             @ref.Length = expected.Length;
             pool.ReadBytes(position, @ref.Bytes, @ref.Offset, @ref.Length);
             Assert.AreEqual(expected, @ref);
             position += @ref.Length;
         }
         pool.Reset(Random().NextBoolean(), reuseFirst);
         if (reuseFirst)
         {
             Assert.AreEqual(ByteBlockPool.BYTE_BLOCK_SIZE, bytesUsed.Get());
         }
         else
         {
             Assert.AreEqual(0, bytesUsed.Get());
             pool.NextBuffer(); // prepare for next iter
         }
     }
 }
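
A minimal sketch of the same append/read round trip outside the test harness (assumes the Lucene.Net.Util types exercised above; positions are absolute offsets from the first byte ever appended):

        // Append one value into the pool, then read it back by absolute position.
        Counter bytesUsed = Counter.NewCounter();
        var pool = new ByteBlockPool(new ByteBlockPool.DirectTrackingAllocator(bytesUsed));
        pool.NextBuffer(); // the pool starts without a buffer; allocate the first one

        var value = new BytesRef("hello");
        pool.Append(value); // copies the bytes, transparently spanning block boundaries

        var read = new BytesRef(value.Length) { Length = value.Length };
        pool.ReadBytes(0, read.Bytes, read.Offset, read.Length); // first value starts at position 0
        // read now holds the same bytes as value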
Example #2
            private void LoadTerms()
            {
                PositiveInt32Outputs posIntOutputs = PositiveInt32Outputs.Singleton;
                var outputsInner = new PairOutputs <Int64, Int64>(posIntOutputs, posIntOutputs);
                var outputs      = new PairOutputs <Int64, PairOutputs <Int64, Int64> .Pair>(posIntOutputs,
                                                                                             outputsInner);
                var        b   = new Builder <PairOutputs <Int64, PairOutputs <Int64, Int64> .Pair> .Pair>(FST.INPUT_TYPE.BYTE1, outputs);
                IndexInput @in = (IndexInput)outerInstance.input.Clone();

                @in.Seek(termsStart);
                BytesRef    lastTerm       = new BytesRef(10);
                long        lastDocsStart  = -1;
                int         docFreq        = 0;
                long        totalTermFreq  = 0;
                FixedBitSet visitedDocs    = new FixedBitSet(maxDoc);
                Int32sRef   scratchIntsRef = new Int32sRef();

                while (true)
                {
                    SimpleTextUtil.ReadLine(@in, scratch);
                    if (scratch.Equals(SimpleTextFieldsWriter.END) || StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.FIELD))
                    {
                        if (lastDocsStart != -1)
                        {
                            b.Add(Util.ToInt32sRef(lastTerm, scratchIntsRef),
                                  outputs.NewPair(lastDocsStart,
                                                  outputsInner.NewPair((long)docFreq, totalTermFreq)));
                            sumTotalTermFreq += totalTermFreq;
                        }
                        break;
                    }
                    else if (StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.DOC))
                    {
                        docFreq++;
                        sumDocFreq++;
                        UnicodeUtil.UTF8toUTF16(scratch.Bytes, scratch.Offset + SimpleTextFieldsWriter.DOC.Length, scratch.Length - SimpleTextFieldsWriter.DOC.Length, scratchUTF16);
                        int docID = ArrayUtil.ParseInt32(scratchUTF16.Chars, 0, scratchUTF16.Length);
                        visitedDocs.Set(docID);
                    }
                    else if (StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.FREQ))
                    {
                        UnicodeUtil.UTF8toUTF16(scratch.Bytes, scratch.Offset + SimpleTextFieldsWriter.FREQ.Length, scratch.Length - SimpleTextFieldsWriter.FREQ.Length, scratchUTF16);
                        totalTermFreq += ArrayUtil.ParseInt32(scratchUTF16.Chars, 0, scratchUTF16.Length);
                    }
                    else if (StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.TERM))
                    {
                        if (lastDocsStart != -1)
                        {
                            b.Add(Util.ToInt32sRef(lastTerm, scratchIntsRef), outputs.NewPair(lastDocsStart,
                                                                                              outputsInner.NewPair((long)docFreq, totalTermFreq)));
                        }
                        lastDocsStart = @in.Position; // LUCENENET specific: Renamed from getFilePointer() to match FileStream
                        int len = scratch.Length - SimpleTextFieldsWriter.TERM.Length;
                        if (len > lastTerm.Length)
                        {
                            lastTerm.Grow(len);
                        }
                        System.Array.Copy(scratch.Bytes, SimpleTextFieldsWriter.TERM.Length, lastTerm.Bytes, 0, len);
                        lastTerm.Length   = len;
                        docFreq           = 0;
                        sumTotalTermFreq += totalTermFreq;
                        totalTermFreq     = 0;
                        termCount++;
                    }
                }
                docCount = visitedDocs.Cardinality;
                fst      = b.Finish();

                /*
                 * PrintStream ps = new PrintStream("out.dot");
                 * fst.toDot(ps);
                 * ps.close();
                 * System.out.println("SAVED out.dot");
                 */
                //System.out.println("FST " + fst.sizeInBytes());
            }
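
Once LoadTerms returns, the FST maps each term's bytes to its (docsStart, (docFreq, totalTermFreq)) outputs. A hedged lookup sketch (Util.Get here is Lucene.Net.Util.Fst.Util.Get, the same Util used above; "wikipedia" is a hypothetical term):

                var pair = Util.Get(fst, new BytesRef("wikipedia")); // null if the term is absent
                if (pair != null)
                {
                    var docsStart     = pair.Output1;         // file pointer to this term's DOC lines
                    var docFreq       = pair.Output2.Output1; // number of documents containing the term
                    var totalTermFreq = pair.Output2.Output2; // total occurrences across those documents
                }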
            public override BytesRef Next()
            {
                if (nextTerm >= numTerms)
                {
                    return(null);
                }
                term.CopyBytes(lastTerm);
                int start    = tvf.ReadVInt32();
                int deltaLen = tvf.ReadVInt32();

                term.Length = start + deltaLen;
                term.Grow(term.Length);
                tvf.ReadBytes(term.Bytes, start, deltaLen);
                freq = tvf.ReadVInt32();

                if (storePayloads)
                {
                    positions      = new int[freq];
                    payloadOffsets = new int[freq];
                    int totalPayloadLength = 0;
                    int pos = 0;
                    for (int posUpto = 0; posUpto < freq; posUpto++)
                    {
                        int code = tvf.ReadVInt32();
                        pos += (int)((uint)code >> 1);
                        positions[posUpto] = pos;
                        if ((code & 1) != 0)
                        {
                            // length change
                            lastPayloadLength = tvf.ReadVInt32();
                        }
                        payloadOffsets[posUpto] = totalPayloadLength;
                        totalPayloadLength     += lastPayloadLength;
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(totalPayloadLength >= 0);
                        }
                    }
                    payloadData = new byte[totalPayloadLength];
                    tvf.ReadBytes(payloadData, 0, payloadData.Length);
                }
                else if (storePositions) // no payloads
                {
                    // TODO: we could maybe reuse last array, if we can
                    // somehow be careful about consumer never using two
                    // D&PEnums at once...
                    positions = new int[freq];
                    int pos = 0;
                    for (int posUpto = 0; posUpto < freq; posUpto++)
                    {
                        pos += tvf.ReadVInt32();
                        positions[posUpto] = pos;
                    }
                }

                if (storeOffsets)
                {
                    startOffsets = new int[freq];
                    endOffsets   = new int[freq];
                    int offset = 0;
                    for (int posUpto = 0; posUpto < freq; posUpto++)
                    {
                        startOffsets[posUpto] = offset + tvf.ReadVInt32();
                        offset = endOffsets[posUpto] = startOffsets[posUpto] + tvf.ReadVInt32();
                    }
                }

                lastTerm.CopyBytes(term);
                nextTerm++;
                return(term);
            }
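
Next() decodes terms with shared-prefix compression: keep the first start bytes of the previous term, then append deltaLen freshly read bytes. A worked example with hypothetical values:

                // lastTerm = "apple", start = 4, deltaLen = 1, suffix byte = 'y'
                var lastTerm = new BytesRef("apple");
                var term = new BytesRef();
                term.CopyBytes(lastTerm);      // start from the previous term
                int start = 4, deltaLen = 1;   // shared prefix length, suffix length
                term.Length = start + deltaLen;
                term.Grow(term.Length);
                term.Bytes[start] = (byte)'y'; // in Next() this byte comes from tvf.ReadBytes
                // term is now "apply": four bytes reused from "apple" plus one new byte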
Example #4
            // Swap in S, in place of E:
            private bool SeekToNonBMP(SegmentTermEnum te, BytesRef term, int pos)
            {
                int savLength = term.Length;

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(term.Offset == 0);
                }

                // The 3 bytes starting at pos make up 1
                // unicode character:
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(IsHighBMPChar(term.Bytes, pos));
                }

                // NOTE: we cannot make this assert, because
                // AutomatonQuery legitimately sends us malformed UTF8
                // (eg the UTF8 bytes with just 0xee)
                // assert term.length >= pos + 3: "term.length=" + term.length + " pos+3=" + (pos+3) + " byte=" + Integer.toHexString(term.bytes[pos]) + " term=" + term.toString();

                // Save the bytes && length, since we need to
                // restore this if seek "back" finds no matching
                // terms
                if (term.Bytes.Length < 4 + pos)
                {
                    term.Grow(4 + pos);
                }

                scratch[0] = (sbyte)term.Bytes[pos];
                scratch[1] = (sbyte)term.Bytes[pos + 1];
                scratch[2] = (sbyte)term.Bytes[pos + 2];

                term.Bytes[pos]     = 0xf0;
                term.Bytes[pos + 1] = 0x90;
                term.Bytes[pos + 2] = 0x80;
                term.Bytes[pos + 3] = 0x80;
                term.Length         = 4 + pos;

                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("      try seek term=" + UnicodeUtil.ToHexString(term.Utf8ToString()));
                }

                // Seek "back":
                outerInstance.TermsDict.SeekEnum(te, new Term(fieldInfo.Name, term), true);

                // Test if the term we seek'd to in fact found a
                // surrogate pair at the same position as the E:
                Term t2 = te.Term();

                // Cannot be null (or move to next field) because at
                // "worst" it'd seek to the same term we are on now,
                // unless we are being called from seek
                if (t2 is null || t2.Field != internedFieldName)
                {
                    return(false);
                }

                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("      got term=" + UnicodeUtil.ToHexString(t2.Text));
                }

                // Now test if prefix is identical and we found
                // a non-BMP char at the same position:
                BytesRef b2 = t2.Bytes;

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(b2.Offset == 0);
                }

                bool matches;

                if (b2.Length >= term.Length && IsNonBMPChar(b2.Bytes, pos))
                {
                    matches = true;
                    for (int i = 0; i < pos; i++)
                    {
                        if (term.Bytes[i] != b2.Bytes[i])
                        {
                            matches = false;
                            break;
                        }
                    }
                }
                else
                {
                    matches = false;
                }

                // Restore term:
                term.Length         = savLength;
                term.Bytes[pos]     = (byte)scratch[0];
                term.Bytes[pos + 1] = (byte)scratch[1];
                term.Bytes[pos + 2] = (byte)scratch[2];

                return(matches);
            }
        // TODO: this currently requires a determinized machine,
        // but it need not -- we can speed it up by walking the
        // NFA instead.  it'd still be fail fast.
        public static BytesRef GetCommonPrefixBytesRef(Automaton a)
        {
            if (a.IsSingleton)
            {
                return new BytesRef(a.singleton);
            }
            BytesRef @ref = new BytesRef(10);
            HashSet<State> visited = new HashSet<State>();
            State s = a.Initial;
            bool done;
            do
            {
                done = true;
                visited.Add(s);
                if (!s.accept && s.NumTransitions() == 1)
                {
                    var iter = s.Transitions.GetEnumerator();
                    iter.MoveNext();
                    Transition t = iter.Current;

                    if (t.Min_Renamed == t.Max_Renamed && !visited.Contains(t.To))
                    {
                        @ref.Grow(++@ref.Length);
                        @ref.Bytes[@ref.Length - 1] = (byte)t.Min_Renamed;
                        s = t.To;
                        done = false;
                    }
                }
            } while (!done);
            return @ref;
        }
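
A hedged usage sketch (assuming this method sits in SpecialOperations next to the other automaton utilities, as in Lucene 4.x, and that the automata are over byte labels):

        // Singleton fast path: the prefix of a single-string automaton is the whole string.
        Automaton a1 = BasicAutomata.MakeString("foo");
        BytesRef p1 = SpecialOperations.GetCommonPrefixBytesRef(a1); // "foo"

        // "foo" followed by anything still shares the prefix "foo".
        Automaton a2 = BasicOperations.Concatenate(BasicAutomata.MakeString("foo"),
                                                   BasicAutomata.MakeAnyString());
        BytesRef p2 = SpecialOperations.GetCommonPrefixBytesRef(a2); // "foo"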
        public override Fields Get(int doc)
        {
            // LUCENENET specific: Use StringComparer.Ordinal to get the same ordering as Java
            var fields = new JCG.SortedDictionary <string, SimpleTVTerms>(StringComparer.Ordinal);

            _input.Seek(_offsets[doc]);
            ReadLine();
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.NUMFIELDS));
            }
            var numFields = ParseInt32At(SimpleTextTermVectorsWriter.NUMFIELDS.Length);

            if (numFields == 0)
            {
                return(null); // no vectors for this doc
            }
            for (var i = 0; i < numFields; i++)
            {
                ReadLine();
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELD));
                }
                // skip fieldNumber:
                ParseInt32At(SimpleTextTermVectorsWriter.FIELD.Length);

                ReadLine();
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELDNAME));
                }
                var fieldName = ReadString(SimpleTextTermVectorsWriter.FIELDNAME.Length, _scratch);

                ReadLine();
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELDPOSITIONS));
                }
                var positions = Convert.ToBoolean(ReadString(SimpleTextTermVectorsWriter.FIELDPOSITIONS.Length, _scratch), CultureInfo.InvariantCulture);

                ReadLine();
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELDOFFSETS));
                }
                var offsets = Convert.ToBoolean(ReadString(SimpleTextTermVectorsWriter.FIELDOFFSETS.Length, _scratch), CultureInfo.InvariantCulture);

                ReadLine();
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELDPAYLOADS));
                }
                var payloads = Convert.ToBoolean(ReadString(SimpleTextTermVectorsWriter.FIELDPAYLOADS.Length, _scratch), CultureInfo.InvariantCulture);

                ReadLine();
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELDTERMCOUNT));
                }
                var termCount = ParseInt32At(SimpleTextTermVectorsWriter.FIELDTERMCOUNT.Length);

                var terms = new SimpleTVTerms(offsets, positions, payloads);
                fields.Add(fieldName, terms);

                for (var j = 0; j < termCount; j++)
                {
                    ReadLine();
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.TERMTEXT));
                    }
                    var term       = new BytesRef();
                    var termLength = _scratch.Length - SimpleTextTermVectorsWriter.TERMTEXT.Length;
                    term.Grow(termLength);
                    term.Length = termLength;
                    Array.Copy(_scratch.Bytes, _scratch.Offset + SimpleTextTermVectorsWriter.TERMTEXT.Length, term.Bytes, term.Offset, termLength);

                    var postings = new SimpleTVPostings();
                    terms.terms.Add(term, postings);

                    ReadLine();
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.TERMFREQ));
                    }
                    postings.freq = ParseInt32At(SimpleTextTermVectorsWriter.TERMFREQ.Length);

                    if (!positions && !offsets)
                    {
                        continue;
                    }

                    if (positions)
                    {
                        postings.positions = new int[postings.freq];
                        if (payloads)
                        {
                            postings.payloads = new BytesRef[postings.freq];
                        }
                    }

                    if (offsets)
                    {
                        postings.startOffsets = new int[postings.freq];
                        postings.endOffsets   = new int[postings.freq];
                    }

                    for (var k = 0; k < postings.freq; k++)
                    {
                        if (positions)
                        {
                            ReadLine();
                            if (Debugging.AssertsEnabled)
                            {
                                Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.POSITION));
                            }
                            postings.positions[k] = ParseInt32At(SimpleTextTermVectorsWriter.POSITION.Length);
                            if (payloads)
                            {
                                ReadLine();
                                if (Debugging.AssertsEnabled)
                                {
                                    Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.PAYLOAD));
                                }
                                if (_scratch.Length - SimpleTextTermVectorsWriter.PAYLOAD.Length == 0)
                                {
                                    postings.payloads[k] = null;
                                }
                                else
                                {
                                    var payloadBytes = new byte[_scratch.Length - SimpleTextTermVectorsWriter.PAYLOAD.Length];
                                    Array.Copy(_scratch.Bytes, _scratch.Offset + SimpleTextTermVectorsWriter.PAYLOAD.Length, payloadBytes, 0,
                                               payloadBytes.Length);
                                    postings.payloads[k] = new BytesRef(payloadBytes);
                                }
                            }
                        }

                        if (!offsets)
                        {
                            continue;
                        }

                        ReadLine();
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.STARTOFFSET));
                        }
                        postings.startOffsets[k] = ParseInt32At(SimpleTextTermVectorsWriter.STARTOFFSET.Length);

                        ReadLine();
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.ENDOFFSET));
                        }
                        postings.endOffsets[k] = ParseInt32At(SimpleTextTermVectorsWriter.ENDOFFSET.Length);
                    }
                }
            }
            return(new SimpleTVFields(this, fields));
        }
Example #7
                /// <remarks>
                /// TODO: we may want an alternate mode here which is
                /// "if you are about to return NOT_FOUND I won't use
                /// the terms data from that"; eg FuzzyTermsEnum will
                /// (usually) just immediately call seek again if we
                /// return NOT_FOUND so it's a waste for us to fill in
                /// the term that was actually NOT_FOUND
                /// </remarks>
                public override SeekStatus SeekCeil(BytesRef target)
                {
                    if (_indexEnum == null)
                    {
                        throw new InvalidOperationException("terms index was not loaded");
                    }

                    var doSeek = true;

                    // See if we can avoid seeking, because target term
                    // is after current term but before next index term:
                    if (_indexIsCurrent)
                    {
                        var cmp = BytesRef.UTF8SortedAsUnicodeComparer.Compare(_term, target);

                        if (cmp == 0)
                        {
                            return(SeekStatus.FOUND);     // Already at the requested term
                        }
                        if (cmp < 0)
                        {
                            // Target term is after current term
                            if (!_didIndexNext)
                            {
                                _nextIndexTerm = _indexEnum.Next == -1 ? null : _indexEnum.Term;
                                _didIndexNext  = true;
                            }

                            if (_nextIndexTerm == null ||
                                BytesRef.UTF8SortedAsUnicodeComparer.Compare(target, _nextIndexTerm) < 0)
                            {
                                // Optimization: requested term is within the
                                // same term block we are now in; skip seeking
                                // (but do scanning):
                                doSeek = false;
                            }
                        }
                    }

                    if (doSeek)
                    {
                        //System.out.println("  seek");

                        // Ask terms index to find biggest indexed term (=
                        // first term in a block) that's <= our text:
                        _input.Seek(_indexEnum.Seek(target));
                        var result = NextBlock();

                        // Block must exist since, at least, the indexed term
                        // is in the block:
                        Debug.Assert(result);

                        _indexIsCurrent  = true;
                        _didIndexNext    = false;
                        _blocksSinceSeek = 0;

                        if (_doOrd)
                        {
                            _state.Ord = _indexEnum.Ord - 1;
                        }

                        _term.CopyBytes(_indexEnum.Term);
                    }
                    else
                    {
                        if (_state.TermBlockOrd == _blockTermCount && !NextBlock())
                        {
                            _indexIsCurrent = false;
                            return(SeekStatus.END);
                        }
                    }

                    _seekPending = false;

                    var common = 0;

                    // Scan within block.  We could do this by calling
                    // _next() and testing the resulting term, but this
                    // is wasteful.  Instead, we first confirm the
                    // target matches the common prefix of this block,
                    // and then we scan the term bytes directly from the
                    // termSuffixesReader's byte[], saving a copy into
                    // the BytesRef term per term.  Only when we return
                    // do we then copy the bytes into the term.

                    while (true)
                    {
                        // First, see if target term matches common prefix
                        // in this block:
                        if (common < _termBlockPrefix)
                        {
                            var cmp = (_term.Bytes[common] & 0xFF) - (target.Bytes[target.Offset + common] & 0xFF);
                            if (cmp < 0)
                            {
                                // TODO: maybe we should store common prefix
                                // in block header?  (instead of relying on
                                // last term of previous block)

                                // Target's prefix is after the common block
                                // prefix, so term cannot be in this block
                                // but it could be in next block.  We
                                // must scan to end-of-block to set common
                                // prefix for next block:
                                if (_state.TermBlockOrd < _blockTermCount)
                                {
                                    while (_state.TermBlockOrd < _blockTermCount - 1)
                                    {
                                        _state.TermBlockOrd++;
                                        _state.Ord++;
                                        _termSuffixesReader.SkipBytes(_termSuffixesReader.ReadVInt());
                                    }
                                    var suffix = _termSuffixesReader.ReadVInt();
                                    _term.Length = _termBlockPrefix + suffix;
                                    if (_term.Bytes.Length < _term.Length)
                                    {
                                        _term.Grow(_term.Length);
                                    }
                                    _termSuffixesReader.ReadBytes(_term.Bytes, _termBlockPrefix, suffix);
                                }
                                _state.Ord++;

                                if (!NextBlock())
                                {
                                    _indexIsCurrent = false;
                                    return(SeekStatus.END);
                                }
                                common = 0;
                            }
                            else if (cmp > 0)
                            {
                                // Target's prefix is before the common prefix
                                // of this block, so we position to start of
                                // block and return NOT_FOUND:
                                Debug.Assert(_state.TermBlockOrd == 0);

                                var suffix = _termSuffixesReader.ReadVInt();
                                _term.Length = _termBlockPrefix + suffix;
                                if (_term.Bytes.Length < _term.Length)
                                {
                                    _term.Grow(_term.Length);
                                }
                                _termSuffixesReader.ReadBytes(_term.Bytes, _termBlockPrefix, suffix);
                                return(SeekStatus.NOT_FOUND);
                            }
                            else
                            {
                                common++;
                            }

                            continue;
                        }

                        // Test every term in this block
                        while (true)
                        {
                            _state.TermBlockOrd++;
                            _state.Ord++;

                            var suffix = _termSuffixesReader.ReadVInt();

                            // We know the prefix matches, so just compare the new suffix:

                            var termLen = _termBlockPrefix + suffix;
                            var bytePos = _termSuffixesReader.Position;

                            var next = false;

                            var limit     = target.Offset + (termLen < target.Length ? termLen : target.Length);
                            var targetPos = target.Offset + _termBlockPrefix;
                            while (targetPos < limit)
                            {
                                var cmp = (_termSuffixes[bytePos++] & 0xFF) - (target.Bytes[targetPos++] & 0xFF);
                                if (cmp < 0)
                                {
                                    // Current term is still before the target;
                                    // keep scanning
                                    next = true;
                                    break;
                                }

                                if (cmp <= 0)
                                {
                                    continue;
                                }

                                // Done!  Current term is after target. Stop
                                // here, fill in real term, return NOT_FOUND.
                                _term.Length = _termBlockPrefix + suffix;
                                if (_term.Bytes.Length < _term.Length)
                                {
                                    _term.Grow(_term.Length);
                                }
                                _termSuffixesReader.ReadBytes(_term.Bytes, _termBlockPrefix, suffix);
                                return(SeekStatus.NOT_FOUND);
                            }

                            if (!next && target.Length <= termLen)
                            {
                                _term.Length = _termBlockPrefix + suffix;
                                if (_term.Bytes.Length < _term.Length)
                                {
                                    _term.Grow(_term.Length);
                                }
                                _termSuffixesReader.ReadBytes(_term.Bytes, _termBlockPrefix, suffix);

                                return(target.Length == termLen ? SeekStatus.FOUND : SeekStatus.NOT_FOUND);
                            }

                            if (_state.TermBlockOrd == _blockTermCount)
                            {
                                // Must pre-fill term for next block's common prefix
                                _term.Length = _termBlockPrefix + suffix;
                                if (_term.Bytes.Length < _term.Length)
                                {
                                    _term.Grow(_term.Length);
                                }
                                _termSuffixesReader.ReadBytes(_term.Bytes, _termBlockPrefix, suffix);
                                break;
                            }

                            _termSuffixesReader.SkipBytes(suffix);
                        }

                        // The purpose of the terms dict index is to seek
                        // the enum to the closest index term before the
                        // term we are looking for.  So, we should never
                        // cross another index term (besides the first
                        // one) while we are scanning:

                        Debug.Assert(_indexIsCurrent);

                        if (!NextBlock())
                        {
                            _indexIsCurrent = false;
                            return(SeekStatus.END);
                        }
                        common = 0;
                    }
                }
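
The contract implemented above is the standard TermsEnum.SeekCeil one. A consumer-side sketch (termsEnum stands for any TermsEnum over the same field):

                    switch (termsEnum.SeekCeil(new BytesRef("foo")))
                    {
                        case TermsEnum.SeekStatus.FOUND:
                            break; // exact match; the enum is positioned on "foo"
                        case TermsEnum.SeekStatus.NOT_FOUND:
                            break; // positioned on the smallest term greater than "foo"
                        case TermsEnum.SeekStatus.END:
                            break; // every term in the field is less than "foo"
                    }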
        /// <summary>
        /// Returns the next string in lexicographic order that will not put
        /// the machine into a reject state.
        /// <para/>
        /// This method traverses the DFA from the given position in the string,
        /// starting at the given state.
        /// <para/>
        /// If this cannot satisfy the machine, returns <c>false</c>. This method will
        /// walk the minimal path, in lexicographic order, as long as possible.
        /// <para/>
        /// If this method returns <c>false</c>, then there might still be more solutions,
        /// it is necessary to backtrack to find out.
        /// </summary>
        /// <param name="state"> current non-reject state </param>
        /// <param name="position"> useful portion of the string </param>
        /// <returns> <c>true</c> if more possible solutions exist for the DFA from this
        ///         position </returns>
        private bool NextString(int state, int position)
        {
            /*
             * the next lexicographic character must be greater than the existing
             * character, if it exists.
             */
            int c = 0;

            if (position < seekBytesRef.Length)
            {
                c = seekBytesRef.Bytes[position] & 0xff;
                // if the next byte is 0xff and is not part of the useful portion,
                // then by definition it puts us in a reject state, and therefore this
                // path is dead. there cannot be any higher transitions. backtrack.
                if (c++ == 0xff)
                {
                    return(false);
                }
            }

            seekBytesRef.Length = position;
            visited[state]      = curGen;

            Transition[] transitions = allTransitions[state];

            // find the minimal path (lexicographic order) that is >= c

            for (int i = 0; i < transitions.Length; i++)
            {
                Transition transition = transitions[i];
                if (transition.Max >= c)
                {
                    int nextChar = Math.Max(c, transition.Min);
                    // append either the next sequential char, or the minimum transition
                    seekBytesRef.Grow(seekBytesRef.Length + 1);
                    seekBytesRef.Length++;
                    seekBytesRef.Bytes[seekBytesRef.Length - 1] = (byte)nextChar;
                    state = transition.Dest.Number;

                    /*
                     * as long as is possible, continue down the minimal path in
                     * lexicographic order. if a loop or accept state is encountered, stop.
                     */
                    while (visited[state] != curGen && !runAutomaton.IsAccept(state))
                    {
                        visited[state] = curGen;

                        /*
                         * Note: we work with a DFA with no transitions to dead states.
                         * so the below is ok, if it is not an accept state,
                         * then there MUST be at least one transition.
                         */
                        transition = allTransitions[state][0];
                        state      = transition.Dest.Number;

                        // append the minimum transition
                        seekBytesRef.Grow(seekBytesRef.Length + 1);
                        seekBytesRef.Length++;
                        seekBytesRef.Bytes[seekBytesRef.Length - 1] = (byte)transition.Min;

                        // we found a loop, record it for faster enumeration
                        if (!finite && !linear && visited[state] == curGen)
                        {
                            SetLinear(seekBytesRef.Length - 1);
                        }
                    }
                    return(true);
                }
            }
            return(false);
        }
Example #9
            public override int NextPosition()
            {
                int pos;

                if (_readPositions)
                {
                    SimpleTextUtil.ReadLine(_in, _scratch);
                    // LUCENENET specific - use wrapper BytesRefFormatter struct to defer building the string unless string.Format() is called
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.POS), "got line={0}", new BytesRefFormatter(_scratch, BytesRefFormat.UTF8));
                    }
                    UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.POS.Length, _scratch.Length - SimpleTextFieldsWriter.POS.Length,
                                            _scratchUtf162);
                    pos = ArrayUtil.ParseInt32(_scratchUtf162.Chars, 0, _scratchUtf162.Length);
                }
                else
                {
                    pos = -1;
                }

                if (_readOffsets)
                {
                    SimpleTextUtil.ReadLine(_in, _scratch);
                    // LUCENENET specific - use wrapper BytesRefFormatter struct to defer building the string unless string.Format() is called
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.START_OFFSET), "got line={0}", new BytesRefFormatter(_scratch, BytesRefFormat.UTF8));
                    }
                    UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.START_OFFSET.Length,
                                            _scratch.Length - SimpleTextFieldsWriter.START_OFFSET.Length, _scratchUtf162);
                    _startOffset = ArrayUtil.ParseInt32(_scratchUtf162.Chars, 0, _scratchUtf162.Length);
                    SimpleTextUtil.ReadLine(_in, _scratch);
                    // LUCENENET specific - use wrapper BytesRefFormatter struct to defer building the string unless string.Format() is called
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.END_OFFSET), "got line={0}", new BytesRefFormatter(_scratch, BytesRefFormat.UTF8));
                    }
                    UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.END_OFFSET.Length,
                                            _scratch.Length - SimpleTextFieldsWriter.END_OFFSET.Length, _scratchUtf162);
                    _endOffset = ArrayUtil.ParseInt32(_scratchUtf162.Chars, 0, _scratchUtf162.Length);
                }

                long fp = _in.Position; // LUCENENET specific: Renamed from getFilePointer() to match FileStream

                SimpleTextUtil.ReadLine(_in, _scratch);
                if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.PAYLOAD))
                {
                    int len = _scratch.Length - SimpleTextFieldsWriter.PAYLOAD.Length;
                    if (_scratch2.Bytes.Length < len)
                    {
                        _scratch2.Grow(len);
                    }
                    Array.Copy(_scratch.Bytes, SimpleTextFieldsWriter.PAYLOAD.Length, _scratch2.Bytes, 0, len);
                    _scratch2.Length = len;
                    _payload         = _scratch2;
                }
                else
                {
                    _payload = null;
                    _in.Seek(fp);
                }
                return(pos);
            }
        /// <summary>
        /// Finds largest term accepted by this Automaton, that's
        ///  &lt;= the provided input term.  The result is placed in
        ///  output; it's fine for output and input to point to
        ///  the same BytesRef.  The returned result is either the
        ///  provided output, or null if there is no floor term
        ///  (ie, the provided input term is before the first term
        ///  accepted by this Automaton).
        /// </summary>
        public virtual BytesRef Floor(BytesRef input, BytesRef output)
        {
            output.Offset = 0;
            //if (DEBUG) System.out.println("CA.floor input=" + input.utf8ToString());

            int state = RunAutomaton.InitialState;

            // Special case empty string:
            if (input.Length == 0)
            {
                if (RunAutomaton.IsAccept(state))
                {
                    output.Length = 0;
                    return output;
                }
                else
                {
                    return null;
                }
            }

            IList<int> stack = new List<int>();

            int idx = 0;
            while (true)
            {
                int label = ((sbyte)input.Bytes[input.Offset + idx]) & 0xff;
                int nextState = RunAutomaton.Step(state, label);
                //if (DEBUG) System.out.println("  cycle label=" + (char) label + " nextState=" + nextState);

                if (idx == input.Length - 1)
                {
                    if (nextState != -1 && RunAutomaton.IsAccept(nextState))
                    {
                        // Input string is accepted
                        if (idx >= output.Bytes.Length)
                        {
                            output.Grow(1 + idx);
                        }
                        output.Bytes[idx] = (byte)label;
                        output.Length = input.Length;
                        //if (DEBUG) System.out.println("  input is accepted; return term=" + output.utf8ToString());
                        return output;
                    }
                    else
                    {
                        nextState = -1;
                    }
                }

                if (nextState == -1)
                {
                    // Pop back to a state that has a transition
                    // <= our label:
                    while (true)
                    {
                        Transition[] transitions = SortedTransitions[state];
                        if (transitions.Length == 0)
                        {
                            Debug.Assert(RunAutomaton.IsAccept(state));
                            output.Length = idx;
                            //if (DEBUG) System.out.println("  return " + output.utf8ToString());
                            return output;
                        }
                        else if (label - 1 < transitions[0].Min_Renamed)
                        {
                            if (RunAutomaton.IsAccept(state))
                            {
                                output.Length = idx;
                                //if (DEBUG) System.out.println("  return " + output.utf8ToString());
                                return output;
                            }
                            // pop
                            if (stack.Count == 0)
                            {
                                //if (DEBUG) System.out.println("  pop ord=" + idx + " return null");
                                return null;
                            }
                            else
                            {
                                state = stack[stack.Count - 1];
                                stack.RemoveAt(stack.Count - 1);
                                idx--;
                                //if (DEBUG) System.out.println("  pop ord=" + (idx+1) + " label=" + (char) label + " first trans.min=" + (char) transitions[0].min);
                                label = input.Bytes[input.Offset + idx] & 0xff;
                            }
                        }
                        else
                        {
                            //if (DEBUG) System.out.println("  stop pop ord=" + idx + " first trans.min=" + (char) transitions[0].min);
                            break;
                        }
                    }

                    //if (DEBUG) System.out.println("  label=" + (char) label + " idx=" + idx);

                    return AddTail(state, output, idx, label);
                }
                else
                {
                    if (idx >= output.Bytes.Length)
                    {
                        output.Grow(1 + idx);
                    }
                    output.Bytes[idx] = (byte)label;
                    stack.Add(state);
                    state = nextState;
                    idx++;
                }
            }
        }
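
A hedged usage sketch for Floor (assuming the Lucene 4.x CompiledAutomaton wrapper; the accepted terms "bar" and "foo" are hypothetical):

            var ca = new CompiledAutomaton(BasicOperations.Union(
                BasicAutomata.MakeString("bar"),
                BasicAutomata.MakeString("foo")));
            BytesRef floor = ca.Floor(new BytesRef("food"), new BytesRef()); // -> "foo"
            BytesRef none  = ca.Floor(new BytesRef("app"),  new BytesRef()); // -> null: "app" sorts before "bar"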
        //private static final boolean DEBUG = BlockTreeTermsWriter.DEBUG;

        private BytesRef AddTail(int state, BytesRef term, int idx, int leadLabel)
        {
            // Find biggest transition that's < label
            // TODO: use binary search here
            Transition maxTransition = null;
            foreach (Transition transition in SortedTransitions[state])
            {
                if (transition.Min_Renamed < leadLabel)
                {
                    maxTransition = transition;
                }
            }

            Debug.Assert(maxTransition != null);

            // Append floorLabel
            int floorLabel;
            if (maxTransition.Max_Renamed > leadLabel - 1)
            {
                floorLabel = leadLabel - 1;
            }
            else
            {
                floorLabel = maxTransition.Max_Renamed;
            }
            if (idx >= term.Bytes.Length)
            {
                term.Grow(1 + idx);
            }
            //if (DEBUG) System.out.println("  add floorLabel=" + (char) floorLabel + " idx=" + idx);
            term.Bytes[idx] = (byte)floorLabel;

            state = maxTransition.To.Number;
            idx++;

            // Push down to last accept state
            while (true)
            {
                Transition[] transitions = SortedTransitions[state];
                if (transitions.Length == 0)
                {
                    Debug.Assert(RunAutomaton.IsAccept(state));
                    term.Length = idx;
                    //if (DEBUG) System.out.println("  return " + term.utf8ToString());
                    return term;
                }
                else
                {
                    // We are pushing "top" -- so get last label of
                    // last transition:
                    Debug.Assert(transitions.Length != 0);
                    Transition lastTransition = transitions[transitions.Length - 1];
                    if (idx >= term.Bytes.Length)
                    {
                        term.Grow(1 + idx);
                    }
                    //if (DEBUG) System.out.println("  push maxLabel=" + (char) lastTransition.max + " idx=" + idx);
                    term.Bytes[idx] = (byte)lastTransition.Max_Renamed;
                    state = lastTransition.To.Number;
                    idx++;
                }
            }
        }
        private IEnumerable<BytesRef> GetBytesIterator(int maxDocParam)
        {
            // Use yield return instead of a custom IEnumerable implementation

            AppendingDeltaPackedLongBuffer.Iterator lengthsIterator = Lengths.GetIterator();
            int size = (int)Lengths.Size();
            DataInput bytesIterator = Bytes.DataInput;
            int maxDoc = maxDocParam;
            int upto = 0;

            while (upto < maxDoc)
            {
                BytesRef v = null;
                if (upto < size)
                {
                    int length = (int)lengthsIterator.Next();
                    var value = new BytesRef();
                    value.Grow(length);
                    value.Length = length;
                    bytesIterator.ReadBytes(value.Bytes, value.Offset, value.Length);

                    if (DocsWithField.Get(upto))
                    {
                        v = value;
                    }
                }

                upto++;
                yield return v;
            }
        }
        private IEnumerable<BytesRef> GetBytesIterator(int maxDocParam)
        {
            // Use yield return instead of a custom IEnumerable implementation

            BytesRef value = new BytesRef();
            AppendingDeltaPackedLongBuffer.Iterator lengthsIterator = (AppendingDeltaPackedLongBuffer.Iterator)Lengths.GetIterator();
            int size = (int)Lengths.Size();
            DataInput bytesIterator = Bytes.DataInput;
            int maxDoc = maxDocParam;
            int upto = 0;
            long byteOffset = 0L;

            while (upto < maxDoc)
            {
                if (upto < size)
                {
                    int length = (int)lengthsIterator.Next();
                    value.Grow(length);
                    value.Length = length;
                    //LUCENE TODO: This modification is slightly fishy, 4x port uses ByteBlockPool
                    bytesIterator.ReadBytes(/*byteOffset,*/ value.Bytes, value.Offset, value.Length);
                    byteOffset += length;
                }
                else
                {
                    // This is to handle last N documents not having
                    // this DV field in the end of the segment:
                    value.Length = 0;
                }

                upto++;
                yield return value;
            }
        }
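
Both iterators are consumed the same way; the difference is that the first yields null for documents without a value, while the second reuses a single BytesRef and yields it empty. A hedged consumer sketch (maxDoc and the enclosing writer are assumptions):

            foreach (BytesRef v in GetBytesIterator(maxDoc))
            {
                // hand v to the doc-values consumer here; v may be null (first variant)
                // or zero-length (second variant) for documents without a value
            }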
Example #14
            internal virtual void LoadTerms()
            {
                PositiveIntOutputs posIntOutputs = PositiveIntOutputs.Singleton;
                PairOutputs<long?, long?> outputsInner = new PairOutputs<long?, long?>(posIntOutputs, posIntOutputs);
                PairOutputs<long?, PairOutputs<long?, long?>.Pair> outputs =
                    new PairOutputs<long?, PairOutputs<long?, long?>.Pair>(posIntOutputs, outputsInner);

                Builder<PairOutputs<long?, PairOutputs<long?, long?>.Pair>.Pair> b =
                    new Builder<PairOutputs<long?, PairOutputs<long?, long?>.Pair>.Pair>(FST.INPUT_TYPE.BYTE1, outputs);
                IndexInput @in = (IndexInput)outerInstance._input.Clone();

                @in.Seek(termsStart);

                BytesRef    lastTerm      = new BytesRef(10);
                long        lastDocsStart = -1;
                int         docFreq       = 0;
                long        totalTermFreq = 0;
                FixedBitSet visitedDocs   = new FixedBitSet(maxDoc);

                IntsRef scratchIntsRef = new IntsRef();

                while (true)
                {
                    SimpleTextUtil.ReadLine(@in, scratch);
                    if (scratch.Equals(END) || StringHelper.StartsWith(scratch, FIELD))
                    {
                        if (lastDocsStart != -1)
                        {
                            b.Add(Util.ToIntsRef(lastTerm, scratchIntsRef),
                                  outputs.NewPair(lastDocsStart, outputsInner.NewPair((long)docFreq, totalTermFreq)));
                            sumTotalTermFreq += totalTermFreq;
                        }
                        break;
                    }
                    else if (StringHelper.StartsWith(scratch, DOC))
                    {
                        docFreq++;
                        sumDocFreq++;
                        UnicodeUtil.UTF8toUTF16(scratch.Bytes, scratch.Offset + DOC.Length, scratch.Length - DOC.Length,
                                                scratchUTF16);
                        int docID = ArrayUtil.ParseInt(scratchUTF16.Chars, 0, scratchUTF16.Length);
                        visitedDocs.Set(docID);
                    }
                    else if (StringHelper.StartsWith(scratch, FREQ))
                    {
                        UnicodeUtil.UTF8toUTF16(scratch.Bytes, scratch.Offset + FREQ.Length,
                                                scratch.Length - FREQ.Length, scratchUTF16);
                        totalTermFreq += ArrayUtil.ParseInt(scratchUTF16.Chars, 0, scratchUTF16.Length);
                    }
                    else if (StringHelper.StartsWith(scratch, TERM))
                    {
                        if (lastDocsStart != -1)
                        {
                            b.Add(Util.ToIntsRef(lastTerm, scratchIntsRef),
                                  outputs.NewPair(lastDocsStart, outputsInner.NewPair((long)docFreq, totalTermFreq)));
                        }
                        lastDocsStart = @in.FilePointer;
                        int len = scratch.Length - TERM.Length;
                        if (len > lastTerm.Length)
                        {
                            lastTerm.Grow(len);
                        }
                        Array.Copy(scratch.Bytes, TERM.Length, lastTerm.Bytes, 0, len);
                        lastTerm.Length   = len;
                        docFreq           = 0;
                        sumTotalTermFreq += totalTermFreq;
                        totalTermFreq     = 0;
                        termCount++;
                    }
                }
                docCount = visitedDocs.Cardinality();
                fst      = b.Finish();
            }
Example #15
        public override Fields Get(int doc)
        {
            var fields = new SortedDictionary <string, SimpleTVTerms>();

            _input.Seek(_offsets[doc]);
            ReadLine();
            Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.NUMFIELDS));
            var numFields = ParseIntAt(SimpleTextTermVectorsWriter.NUMFIELDS.Length);

            if (numFields == 0)
            {
                return(null); // no vectors for this doc
            }
            for (var i = 0; i < numFields; i++)
            {
                ReadLine();
                Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELD));
                // skip fieldNumber:
                ParseIntAt(SimpleTextTermVectorsWriter.FIELD.Length);

                ReadLine();
                Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELDNAME));
                var fieldName = ReadString(SimpleTextTermVectorsWriter.FIELDNAME.Length, _scratch);

                ReadLine();
                Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELDPOSITIONS));
                var positions = Convert.ToBoolean(ReadString(SimpleTextTermVectorsWriter.FIELDPOSITIONS.Length, _scratch));

                ReadLine();
                Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELDOFFSETS));
                var offsets = Convert.ToBoolean(ReadString(SimpleTextTermVectorsWriter.FIELDOFFSETS.Length, _scratch));

                ReadLine();
                Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELDPAYLOADS));
                var payloads = Convert.ToBoolean(ReadString(SimpleTextTermVectorsWriter.FIELDPAYLOADS.Length, _scratch));

                ReadLine();
                Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELDTERMCOUNT));
                var termCount = ParseIntAt(SimpleTextTermVectorsWriter.FIELDTERMCOUNT.Length);

                var terms = new SimpleTVTerms(offsets, positions, payloads);
                fields.Add(fieldName, terms);

                for (var j = 0; j < termCount; j++)
                {
                    ReadLine();
                    Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.TERMTEXT));
                    var term       = new BytesRef();
                    var termLength = _scratch.Length - SimpleTextTermVectorsWriter.TERMTEXT.Length;
                    term.Grow(termLength);
                    term.Length = termLength;
                    Array.Copy(_scratch.Bytes, _scratch.Offset + SimpleTextTermVectorsWriter.TERMTEXT.Length, term.Bytes, term.Offset, termLength);

                    var postings = new SimpleTVPostings();
                    terms.TERMS.Add(term, postings);

                    ReadLine();
                    Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.TERMFREQ));
                    postings.FREQ = ParseIntAt(SimpleTextTermVectorsWriter.TERMFREQ.Length);

                    if (!positions && !offsets)
                    {
                        continue;
                    }

                    if (positions)
                    {
                        postings.POSITIONS = new int[postings.FREQ];
                        if (payloads)
                        {
                            postings.PAYLOADS = new BytesRef[postings.FREQ];
                        }
                    }

                    if (offsets)
                    {
                        postings.START_OFFSETS = new int[postings.FREQ];
                        postings.END_OFFSETS   = new int[postings.FREQ];
                    }

                    for (var k = 0; k < postings.FREQ; k++)
                    {
                        if (positions)
                        {
                            ReadLine();
                            Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.POSITION));
                            postings.POSITIONS[k] = ParseIntAt(SimpleTextTermVectorsWriter.POSITION.Length);
                            if (payloads)
                            {
                                ReadLine();
                                Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.PAYLOAD));
                                if (_scratch.Length - SimpleTextTermVectorsWriter.PAYLOAD.Length == 0)
                                {
                                    postings.PAYLOADS[k] = null;
                                }
                                else
                                {
                                    var payloadBytes = new byte[_scratch.Length - SimpleTextTermVectorsWriter.PAYLOAD.Length];
                                    Array.Copy(_scratch.Bytes, _scratch.Offset + SimpleTextTermVectorsWriter.PAYLOAD.Length, payloadBytes, 0,
                                               payloadBytes.Length);
                                    postings.PAYLOADS[k] = new BytesRef(payloadBytes);
                                }
                            }
                        }

                        if (!offsets)
                        {
                            continue;
                        }

                        ReadLine();
                        Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.STARTOFFSET));
                        postings.START_OFFSETS[k] = ParseIntAt(SimpleTextTermVectorsWriter.STARTOFFSET.Length);

                        ReadLine();
                        Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.ENDOFFSET));
                        postings.END_OFFSETS[k] = ParseIntAt(SimpleTextTermVectorsWriter.ENDOFFSET.Length);
                    }
                }
            }
            return(new SimpleTVFields(this, fields));
        }
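A hedged usage sketch for the reader above: Get(doc) returns a per-document Fields view over the term vectors, or null when the document stored none. The reader and document variables here are illustrative assumptions, not part of the example.

            // Sketch only: "termVectorsReader" and "docId" are assumed to exist.
            Fields vectors = termVectorsReader.Get(docId);
            if (vectors != null)
            {
                foreach (string field in vectors) // Fields enumerates its field names
                {
                    Terms terms = vectors.GetTerms(field);
                    Console.WriteLine(field + ": " + terms.Count + " terms");
                }
            }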
Example #16
		/// <summary>
		/// Builds an <seealso cref="SynonymMap"/> and returns it.
		/// </summary>
		public virtual SynonymMap Build()
		{
		  ByteSequenceOutputs outputs = ByteSequenceOutputs.Singleton;
		  // TODO: are we using the best sharing options?
		  var builder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);

		  BytesRef scratch = new BytesRef(64);
		  ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();

		  HashSet<int?> dedupSet;

		  if (dedup)
		  {
			dedupSet = new HashSet<int?>();
		  }
		  else
		  {
			dedupSet = null;
		  }

		  
		  var spare = new sbyte[5];

		  Dictionary<CharsRef, MapEntry>.KeyCollection keys = workingSet.Keys;
		  CharsRef[] sortedKeys = keys.ToArray();
		  Arrays.Sort(sortedKeys, CharsRef.UTF16SortedAsUTF8Comparator);

		  IntsRef scratchIntsRef = new IntsRef();

		  //System.out.println("fmap.build");
		  for (int keyIdx = 0; keyIdx < sortedKeys.Length; keyIdx++)
		  {
			CharsRef input = sortedKeys[keyIdx];
			MapEntry output = workingSet[input];

			int numEntries = output.ords.Count;
			// output size, assume the worst case
			int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry

			scratch.Grow(estimatedSize);
			scratchOutput.Reset(scratch.Bytes, scratch.Offset, scratch.Bytes.Length);
			Debug.Assert(scratch.Offset == 0);

			// now write our output data:
			int count = 0;
			for (int i = 0; i < numEntries; i++)
			{
			  if (dedupSet != null)
			  {
				// box once
				int? ent = output.ords[i];
				if (dedupSet.Contains(ent))
				{
				  continue;
				}
				dedupSet.Add(ent);
			  }
			  scratchOutput.WriteVInt(output.ords[i]);
			  count++;
			}

			int pos = scratchOutput.Position;
			scratchOutput.WriteVInt(count << 1 | (output.includeOrig ? 0 : 1));
			int pos2 = scratchOutput.Position;
			int vIntLen = pos2 - pos;

			// Move the count + includeOrig to the front of the byte[]:
			Array.Copy(scratch.Bytes, pos, spare, 0, vIntLen);
			Array.Copy(scratch.Bytes, 0, scratch.Bytes, vIntLen, pos);
			Array.Copy(spare, 0, scratch.Bytes, 0, vIntLen);

			if (dedupSet != null)
			{
			  dedupSet.Clear();
			}

			scratch.Length = scratchOutput.Position - scratch.Offset;
			//System.out.println("  add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count);
			builder.Add(Util.ToUTF32(input, scratchIntsRef), BytesRef.DeepCopyOf(scratch));
		  }

		  FST<BytesRef> fst = builder.Finish();
		  return new SynonymMap(fst, words, maxHorizontalContext);
		}
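A minimal usage sketch for Build(): register one rule through SynonymMap.Builder, then build the FST-backed map. The rule text is made up, and the exact Builder/Add signatures are assumptions based on the Lucene 4.x synonym API.

		  // Sketch only:
		  var mapBuilder = new SynonymMap.Builder(true);                     // true = dedup outputs
		  mapBuilder.Add(new CharsRef("fast"), new CharsRef("quick"), true); // true = keep original
		  SynonymMap map = mapBuilder.Build();                               // runs the FST construction above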
        /// <summary>
        /// Finds the largest term accepted by this Automaton that is
        ///  &lt;= the provided input term.  The result is placed in
        ///  output; it's fine for output and input to point to
        ///  the same BytesRef.  The returned result is either the
        ///  provided output, or null if there is no floor term
        ///  (i.e., the provided input term is before the first term
        ///  accepted by this Automaton).
        /// </summary>
        public virtual BytesRef Floor(BytesRef input, BytesRef output)
        {
            output.Offset = 0;
            //if (DEBUG) System.out.println("CA.floor input=" + input.utf8ToString());

            int state = RunAutomaton.InitialState;

            // Special case empty string:
            if (input.Length == 0)
            {
                if (RunAutomaton.IsAccept(state))
                {
                    output.Length = 0;
                    return(output);
                }
                else
                {
                    return(null);
                }
            }

            IList <int> stack = new List <int>();

            int idx = 0;

            while (true)
            {
                int label     = input.Bytes[input.Offset + idx] & 0xff;
                int nextState = RunAutomaton.Step(state, label);
                //if (DEBUG) System.out.println("  cycle label=" + (char) label + " nextState=" + nextState);

                if (idx == input.Length - 1)
                {
                    if (nextState != -1 && RunAutomaton.IsAccept(nextState))
                    {
                        // Input string is accepted
                        if (idx >= output.Bytes.Length)
                        {
                            output.Grow(1 + idx);
                        }
                        output.Bytes[idx] = (sbyte)label;
                        output.Length     = input.Length;
                        //if (DEBUG) System.out.println("  input is accepted; return term=" + output.utf8ToString());
                        return(output);
                    }
                    else
                    {
                        nextState = -1;
                    }
                }

                if (nextState == -1)
                {
                    // Pop back to a state that has a transition
                    // <= our label:
                    while (true)
                    {
                        Transition[] transitions = SortedTransitions[state];
                        if (transitions.Length == 0)
                        {
                            Debug.Assert(RunAutomaton.IsAccept(state));
                            output.Length = idx;
                            //if (DEBUG) System.out.println("  return " + output.utf8ToString());
                            return(output);
                        }
                        else if (label - 1 < transitions[0].Min)
                        {
                            if (RunAutomaton.IsAccept(state))
                            {
                                output.Length = idx;
                                //if (DEBUG) System.out.println("  return " + output.utf8ToString());
                                return(output);
                            }
                            // pop
                            if (stack.Count == 0)
                            {
                                //if (DEBUG) System.out.println("  pop ord=" + idx + " return null");
                                return(null);
                            }
                            else
                            {
                                state = stack[stack.Count - 1];
                                stack.RemoveAt(stack.Count - 1);
                                idx--;
                                //if (DEBUG) System.out.println("  pop ord=" + (idx+1) + " label=" + (char) label + " first trans.min=" + (char) transitions[0].min);
                                label = input.Bytes[input.Offset + idx] & 0xff;
                            }
                        }
                        else
                        {
                            //if (DEBUG) System.out.println("  stop pop ord=" + idx + " first trans.min=" + (char) transitions[0].min);
                            break;
                        }
                    }

                    //if (DEBUG) System.out.println("  label=" + (char) label + " idx=" + idx);

                    return(AddTail(state, output, idx, label));
                }
                else
                {
                    if (idx >= output.Bytes.Length)
                    {
                        output.Grow(1 + idx);
                    }
                    output.Bytes[idx] = (sbyte)label;
                    stack.Add(state);
                    state = nextState;
                    idx++;
                }
            }
        }
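A hedged sketch of Floor() in use: compile an automaton that accepts a single term, then ask for the largest accepted term that is less than or equal to some input. BasicAutomata and the CompiledAutomaton constructor are assumptions based on the Lucene 4.x automaton API.

            // Sketch only:
            var automaton = BasicAutomata.MakeString("foo");
            var compiled  = new CompiledAutomaton(automaton);
            BytesRef result = compiled.Floor(new BytesRef("foo"), new BytesRef());
            // result is "foo": the input itself is accepted. An input that sorts
            // below every accepted term would return null instead.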
        public override Fields Get(int doc)
        {
            var fields = new SortedDictionary<string, SimpleTVTerms>();

            _input.Seek(_offsets[doc]);
            ReadLine();
            Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.NUMFIELDS));
            var numFields = ParseIntAt(SimpleTextTermVectorsWriter.NUMFIELDS.Length);
            if (numFields == 0)
            {
                return null; // no vectors for this doc
            }
            for (var i = 0; i < numFields; i++)
            {
                ReadLine();
                Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELD));
                // skip fieldNumber:
                ParseIntAt(SimpleTextTermVectorsWriter.FIELD.Length);

                ReadLine();
                Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELDNAME));
                var fieldName = ReadString(SimpleTextTermVectorsWriter.FIELDNAME.Length, _scratch);

                ReadLine();
                Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELDPOSITIONS));
                var positions = Convert.ToBoolean(ReadString(SimpleTextTermVectorsWriter.FIELDPOSITIONS.Length, _scratch), CultureInfo.InvariantCulture);

                ReadLine();
                Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELDOFFSETS));
                var offsets = Convert.ToBoolean(ReadString(SimpleTextTermVectorsWriter.FIELDOFFSETS.Length, _scratch), CultureInfo.InvariantCulture);

                ReadLine();
                Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELDPAYLOADS));
                var payloads = Convert.ToBoolean(ReadString(SimpleTextTermVectorsWriter.FIELDPAYLOADS.Length, _scratch), CultureInfo.InvariantCulture);

                ReadLine();
                Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELDTERMCOUNT));
                var termCount = ParseIntAt(SimpleTextTermVectorsWriter.FIELDTERMCOUNT.Length);

                var terms = new SimpleTVTerms(offsets, positions, payloads);
                fields.Add(fieldName, terms);

                for (var j = 0; j < termCount; j++)
                {
                    ReadLine();
                    Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.TERMTEXT));
                    var term = new BytesRef();
                    var termLength = _scratch.Length - SimpleTextTermVectorsWriter.TERMTEXT.Length;
                    term.Grow(termLength);
                    term.Length = termLength;
                    Array.Copy(_scratch.Bytes, _scratch.Offset + SimpleTextTermVectorsWriter.TERMTEXT.Length, term.Bytes, term.Offset, termLength);

                    var postings = new SimpleTVPostings();
                    terms.TERMS.Add(term, postings);

                    ReadLine();
                    Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.TERMFREQ));
                    postings.FREQ = ParseIntAt(SimpleTextTermVectorsWriter.TERMFREQ.Length);

                    if (!positions && !offsets) continue;

                    if (positions)
                    {
                        postings.POSITIONS = new int[postings.FREQ];
                        if (payloads)
                        {
                            postings.PAYLOADS = new BytesRef[postings.FREQ];
                        }
                    }

                    if (offsets)
                    {
                        postings.START_OFFSETS = new int[postings.FREQ];
                        postings.END_OFFSETS = new int[postings.FREQ];
                    }

                    for (var k = 0; k < postings.FREQ; k++)
                    {
                        if (positions)
                        {
                            ReadLine();
                            Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.POSITION));
                            postings.POSITIONS[k] = ParseIntAt(SimpleTextTermVectorsWriter.POSITION.Length);
                            if (payloads)
                            {
                                ReadLine();
                                Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.PAYLOAD));
                                if (_scratch.Length - SimpleTextTermVectorsWriter.PAYLOAD.Length == 0)
                                {
                                    postings.PAYLOADS[k] = null;
                                }
                                else
                                {
                                    var payloadBytes = new byte[_scratch.Length - SimpleTextTermVectorsWriter.PAYLOAD.Length];
                                    Array.Copy(_scratch.Bytes, _scratch.Offset + SimpleTextTermVectorsWriter.PAYLOAD.Length, payloadBytes, 0,
                                        payloadBytes.Length);
                                    postings.PAYLOADS[k] = new BytesRef(payloadBytes);
                                }
                            }
                        }

                        if (!offsets) continue;

                        ReadLine();
                        Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.STARTOFFSET));
                        postings.START_OFFSETS[k] = ParseIntAt(SimpleTextTermVectorsWriter.STARTOFFSET.Length);

                        ReadLine();
                        Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.ENDOFFSET));
                        postings.END_OFFSETS[k] = ParseIntAt(SimpleTextTermVectorsWriter.ENDOFFSET.Length);
                    }
                }
            }
            return new SimpleTVFields(this, fields);
        }
Example #19
            /// <summary>
            /// Builds an <seealso cref="SynonymMap"/> and returns it.
            /// </summary>
            public virtual SynonymMap Build()
            {
                ByteSequenceOutputs outputs = ByteSequenceOutputs.Singleton;
                // TODO: are we using the best sharing options?
                var builder = new Builder <BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);

                BytesRef            scratch       = new BytesRef(64);
                ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();

                HashSet <int?> dedupSet;

                if (dedup)
                {
                    dedupSet = new HashSet <int?>();
                }
                else
                {
                    dedupSet = null;
                }


                var spare = new byte[5];

                IEnumerable <CharsRef> keys = workingSet.Keys;

                CharsRef[] sortedKeys = keys.ToArray();
#pragma warning disable 612, 618
                System.Array.Sort(sortedKeys, CharsRef.UTF16SortedAsUTF8Comparer);
#pragma warning restore 612, 618


                IntsRef scratchIntsRef = new IntsRef();

                //System.out.println("fmap.build");
                for (int keyIdx = 0; keyIdx < sortedKeys.Length; keyIdx++)
                {
                    CharsRef input  = sortedKeys[keyIdx];
                    MapEntry output = workingSet[input];

                    int numEntries = output.ords.Count;
                    // output size, assume the worst case
                    int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry

                    scratch.Grow(estimatedSize);
                    scratchOutput.Reset(scratch.Bytes, scratch.Offset, scratch.Bytes.Length);
                    Debug.Assert(scratch.Offset == 0);

                    // now write our output data:
                    int count = 0;
                    for (int i = 0; i < numEntries; i++)
                    {
                        if (dedupSet != null)
                        {
                            // box once
                            int? ent = output.ords[i];
                            if (dedupSet.Contains(ent))
                            {
                                continue;
                            }
                            dedupSet.Add(ent);
                        }
                        scratchOutput.WriteVInt(output.ords[i]);
                        count++;
                    }

                    int pos = scratchOutput.Position;
                    scratchOutput.WriteVInt(count << 1 | (output.includeOrig ? 0 : 1));
                    int pos2    = scratchOutput.Position;
                    int vIntLen = pos2 - pos;

                    // Move the count + includeOrig to the front of the byte[]:
                    Array.Copy(scratch.Bytes, pos, spare, 0, vIntLen);
                    Array.Copy(scratch.Bytes, 0, scratch.Bytes, vIntLen, pos);
                    Array.Copy(spare, 0, scratch.Bytes, 0, vIntLen);

                    if (dedupSet != null)
                    {
                        dedupSet.Clear();
                    }

                    scratch.Length = scratchOutput.Position - scratch.Offset;
                    //System.out.println("  add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count);
                    builder.Add(Lucene.Net.Util.Fst.Util.ToUTF32(input.ToString(), scratchIntsRef), BytesRef.DeepCopyOf(scratch));
                }

                FST <BytesRef> fst = builder.Finish();
                return(new SynonymMap(fst, words, maxHorizontalContext));
            }
 /// <summary>
 /// Compares the fields of the terms first; if the fields are not equal,
 /// returns that comparison. If they are equal, compares the term bytes.
 /// </summary>
 /// <param name="term">
 ///          the term to compare. </param>
 /// <param name="termIndex">
 ///          the position of the term in the input to compare </param>
 /// <param name="input">
 ///          the input buffer. </param>
 /// <returns> int. </returns>
 /// <exception cref="IOException"> If there is a low-level I/O error. </exception>
 private int CompareTo(Term term, int termIndex, PagedBytesDataInput input, BytesRef reuse)
 {
     // if term field does not equal mid's field index, then compare fields
     // else if they are equal, compare term's string values...
     int c = CompareField(term, termIndex, input);
     if (c == 0)
     {
         reuse.Length = input.ReadVInt();
         reuse.Grow(reuse.Length);
         input.ReadBytes(reuse.Bytes, 0, reuse.Length);
         return Comparator.Compare(term.Bytes(), reuse);
     }
     return c;
 }
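A sketch of the binary search this comparator serves; _indexSize stands in for the reader's real field and is an assumption:

 // Sketch only: locate the index slot whose term is the floor of "term".
 private int GetIndexOffset(Term term, PagedBytesDataInput input, BytesRef reuse)
 {
     int lo = 0, hi = _indexSize - 1; // _indexSize: assumed member
     while (hi >= lo)
     {
         int mid = (lo + hi) / 2;
         int delta = CompareTo(term, mid, input, reuse);
         if (delta < 0) hi = mid - 1;
         else if (delta > 0) lo = mid + 1;
         else return mid;
     }
     return hi; // floor slot when there is no exact match
 }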
        /// <summary>
        /// Retrieve suggestions.
        /// </summary>
        public virtual IList <LookupResult> DoLookup(string key, IEnumerable <BytesRef> contexts, int num)
        {
            if (contexts != null)
            {
                throw new System.ArgumentException("this suggester doesn't support contexts");
            }

            TokenStream ts = queryAnalyzer.GetTokenStream("", key.ToString());

            try
            {
                ITermToBytesRefAttribute    termBytesAtt = ts.AddAttribute <ITermToBytesRefAttribute>();
                IOffsetAttribute            offsetAtt    = ts.AddAttribute <IOffsetAttribute>();
                IPositionLengthAttribute    posLenAtt    = ts.AddAttribute <IPositionLengthAttribute>();
                IPositionIncrementAttribute posIncAtt    = ts.AddAttribute <IPositionIncrementAttribute>();
                ts.Reset();

                var lastTokens = new BytesRef[grams];
                //System.out.println("lookup: key='" + key + "'");

                // Run full analysis, but save only the
                // last 1gram, last 2gram, etc.:
                BytesRef tokenBytes   = termBytesAtt.BytesRef;
                int      maxEndOffset = -1;
                bool     sawRealToken = false;
                while (ts.IncrementToken())
                {
                    termBytesAtt.FillBytesRef();
                    sawRealToken |= tokenBytes.Length > 0;
                    // TODO: this is somewhat iffy; today, ShingleFilter
                    // sets posLen to the gram count; maybe we should make
                    // a separate dedicated att for this?
                    int gramCount = posLenAtt.PositionLength;

                    Debug.Assert(gramCount <= grams);

                    // Safety: make sure the recalculated count "agrees":
                    if (CountGrams(tokenBytes) != gramCount)
                    {
                        throw new System.ArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + CountGrams(tokenBytes));
                    }
                    maxEndOffset = Math.Max(maxEndOffset, offsetAtt.EndOffset);
                    lastTokens[gramCount - 1] = BytesRef.DeepCopyOf(tokenBytes);
                }
                ts.End();

                if (!sawRealToken)
                {
                    throw new System.ArgumentException("no tokens produced by analyzer, or the only tokens were empty strings");
                }

                // Carefully fill last tokens with _ tokens;
                // ShingleFilter apparently won't emit "only hole"
                // tokens:
                int endPosInc = posIncAtt.PositionIncrement;

                // Note this will also be true if input is the empty
                // string (in which case we saw no tokens and
                // maxEndOffset is still -1), which in fact works out OK
                // because we fill the unigram with an empty BytesRef
                // below:
                bool lastTokenEnded = offsetAtt.EndOffset > maxEndOffset || endPosInc > 0;
                //System.out.println("maxEndOffset=" + maxEndOffset + " vs " + offsetAtt.EndOffset);

                if (lastTokenEnded)
                {
                    //System.out.println("  lastTokenEnded");
                    // If user hit space after the last token, then
                    // "upgrade" all tokens.  This way "foo " will suggest
                    // all bigrams starting w/ foo, and not any unigrams
                    // starting with "foo":
                    for (int i = grams - 1; i > 0; i--)
                    {
                        BytesRef token = lastTokens[i - 1];
                        if (token == null)
                        {
                            continue;
                        }
                        token.Grow(token.Length + 1);
                        token.Bytes[token.Length] = separator;
                        token.Length++;
                        lastTokens[i] = token;
                    }
                    lastTokens[0] = new BytesRef();
                }

                var arc = new FST.Arc <long?>();

                var bytesReader = fst.GetBytesReader();

                // Try highest order models first, and if they return
                // results, return that; else, fallback:
                double backoff = 1.0;

                List <LookupResult> results = new List <LookupResult>(num);

                // We only add a given suffix once, from the highest
                // order model that saw it; for subsequent lower order
                // models we skip it:
                var seen = new HashSet <BytesRef>();

                for (int gram = grams - 1; gram >= 0; gram--)
                {
                    BytesRef token = lastTokens[gram];
                    // Don't make unigram predictions from empty string:
                    if (token == null || (token.Length == 0 && key.Length > 0))
                    {
                        // Input didn't have enough tokens:
                        //System.out.println("  gram=" + gram + ": skip: not enough input");
                        continue;
                    }

                    if (endPosInc > 0 && gram <= endPosInc)
                    {
                        // Skip hole-only predictions; in theory we
                        // shouldn't have to do this, but we'd need to fix
                        // ShingleFilter to produce only-hole tokens:
                        //System.out.println("  break: only holes now");
                        break;
                    }

                    //System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString());

                    // TODO: we could add fuzziness here
                    // match the prefix portion exactly
                    //Pair<Long,BytesRef> prefixOutput = null;
                    long? prefixOutput = null;
                    try
                    {
                        prefixOutput = LookupPrefix(fst, bytesReader, token, arc);
                    }
                    catch (IOException bogus)
                    {
                        throw new Exception(bogus.ToString(), bogus);
                    }
                    //System.out.println("  prefixOutput=" + prefixOutput);

                    if (prefixOutput == null)
                    {
                        // This model never saw this prefix, e.g. the
                        // trigram model never saw context "purple mushroom"
                        backoff *= ALPHA;
                        continue;
                    }

                    // TODO: we could do this division at build time, and
                    // bake it into the FST?

                    // Denominator for computing scores from current
                    // model's predictions:
                    long contextCount = totTokens;

                    BytesRef lastTokenFragment = null;

                    for (int i = token.Length - 1; i >= 0; i--)
                    {
                        if (token.Bytes[token.Offset + i] == separator)
                        {
                            BytesRef context = new BytesRef(token.Bytes, token.Offset, i);
                            long?    output  = Lucene.Net.Util.Fst.Util.Get(fst, Lucene.Net.Util.Fst.Util.ToInt32sRef(context, new Int32sRef()));
                            Debug.Assert(output != null);
                            contextCount      = DecodeWeight(output);
                            lastTokenFragment = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                            break;
                        }
                    }

                    BytesRef finalLastToken;

                    if (lastTokenFragment == null)
                    {
                        finalLastToken = BytesRef.DeepCopyOf(token);
                    }
                    else
                    {
                        finalLastToken = BytesRef.DeepCopyOf(lastTokenFragment);
                    }
                    Debug.Assert(finalLastToken.Offset == 0);

                    CharsRef spare = new CharsRef();

                    // complete top-N
                    Util.Fst.Util.TopResults <long?> completions = null;
                    try
                    {
                        // Because we store multiple models in one FST
                        // (1gram, 2gram, 3gram), we must restrict the
                        // search so that it only considers the current
                        // model.  For highest order model, this is not
                        // necessary since all completions in the FST
                        // must be from this model, but for lower order
                        // models we have to filter out the higher order
                        // ones:

                        // Must do num+seen.size() for queue depth because we may
                        // reject up to seen.size() paths in acceptResult():
                        Util.Fst.Util.TopNSearcher <long?> searcher = new TopNSearcherAnonymousInnerClassHelper(this, fst, num, num + seen.Count, weightComparer, seen, finalLastToken);

                        // since this search is initialized with a single start node
                        // it is okay to start with an empty input path here
                        searcher.AddStartPaths(arc, prefixOutput, true, new Int32sRef());

                        completions = searcher.Search();
                        Debug.Assert(completions.IsComplete);
                    }
                    catch (IOException bogus)
                    {
                        throw new Exception(bogus.ToString(), bogus);
                    }

                    int prefixLength = token.Length;

                    BytesRef suffix = new BytesRef(8);
                    //System.out.println("    " + completions.length + " completions");

                    foreach (Util.Fst.Util.Result <long?> completion in completions)
                    {
                        token.Length = prefixLength;
                        // append suffix
                        Util.Fst.Util.ToBytesRef(completion.Input, suffix);
                        token.Append(suffix);

                        //System.out.println("    completion " + token.utf8ToString());

                        // Skip this path if a higher-order model already
                        // saw/predicted its last token:
                        BytesRef lastToken = token;
                        for (int i = token.Length - 1; i >= 0; i--)
                        {
                            if (token.Bytes[token.Offset + i] == separator)
                            {
                                Debug.Assert(token.Length - i - 1 > 0);
                                lastToken = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                                break;
                            }
                        }
                        if (seen.Contains(lastToken))
                        {
                            //System.out.println("      skip dup " + lastToken.utf8ToString());
                            goto nextCompletionContinue;
                        }
                        seen.Add(BytesRef.DeepCopyOf(lastToken));
                        spare.Grow(token.Length);
                        UnicodeUtil.UTF8toUTF16(token, spare);
                        LookupResult result = new LookupResult(spare.ToString(),
                                                               // LUCENENET NOTE: We need to calculate this as decimal because when using double it can sometimes
                                                               // return numbers that are greater than long.MaxValue, which results in a negative long number.
                                                               (long)(long.MaxValue * (decimal)backoff * ((decimal)DecodeWeight(completion.Output)) / contextCount));
                        results.Add(result);
                        Debug.Assert(results.Count == seen.Count);
                        //System.out.println("  add result=" + result);
                        nextCompletionContinue: ;
                    }
                    backoff *= ALPHA;
                }

                results.Sort(new ComparerAnonymousInnerClassHelper(this));

                if (results.Count > num)
                {
                    results.SubList(num, results.Count).Clear();
                }

                return(results);
            }
            finally
            {
                IOUtils.DisposeWhileHandlingException(ts);
            }
        }
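A hedged caller's-eye sketch for DoLookup: given an already-built FreeTextSuggester, a trailing space in the key "upgrades" the query to higher-order grams, as the comments above describe. The suggester variable is assumed.

            // Sketch only:
            IList<LookupResult> hits = suggester.DoLookup("purple mush", null, 5);
            foreach (LookupResult hit in hits)
            {
                Console.WriteLine(hit.Key + " score=" + hit.Value);
            }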
            // Swap in S, in place of E:
            internal virtual bool SeekToNonBMP(SegmentTermEnum te, BytesRef term, int pos)
            {
                int savLength = term.Length;

                Debug.Assert(term.Offset == 0);

                // The 3 bytes starting at pos make up 1
                // Unicode character:
                Debug.Assert(IsHighBMPChar(term.Bytes, pos));

                // NOTE: we cannot make this assert, because
                // AutomatonQuery legitimately sends us malformed UTF8
                // (eg the UTF8 bytes with just 0xee)
                // assert term.length >= pos + 3: "term.length=" + term.length + " pos+3=" + (pos+3) + " byte=" + Integer.toHexString(term.bytes[pos]) + " term=" + term.toString();

                // Save the bytes && length, since we need to
                // restore this if seek "back" finds no matching
                // terms
                if (term.Bytes.Length < 4 + pos)
                {
                    term.Grow(4 + pos);
                }

                Scratch[0] = (sbyte)term.Bytes[pos];
                Scratch[1] = (sbyte)term.Bytes[pos + 1];
                Scratch[2] = (sbyte)term.Bytes[pos + 2];

                term.Bytes[pos] = unchecked((byte)0xf0);
                term.Bytes[pos + 1] = unchecked((byte)0x90);
                term.Bytes[pos + 2] = unchecked((byte)0x80);
                term.Bytes[pos + 3] = unchecked((byte)0x80);
                term.Length = 4 + pos;

                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("      try seek term=" + UnicodeUtil.ToHexString(term.Utf8ToString()));
                }

                // Seek "back":
                OuterInstance.TermsDict.SeekEnum(te, new Term(fieldInfo.Name, term), true);

                // Test if the term we seek'd to in fact found a
                // surrogate pair at the same position as the E:
                Term t2 = te.Term();

                // Cannot be null (or move to next field) because at
                // "worst" it'd seek to the same term we are on now,
                // unless we are being called from seek
                if (t2 == null || t2.Field() != InternedFieldName)
                {
                    return false;
                }

                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("      got term=" + UnicodeUtil.ToHexString(t2.Text()));
                }

                // Now test if prefix is identical and we found
                // a non-BMP char at the same position:
                BytesRef b2 = t2.Bytes();
                Debug.Assert(b2.Offset == 0);

                bool matches;
                if (b2.Length >= term.Length && IsNonBMPChar(b2.Bytes, pos))
                {
                    matches = true;
                    for (int i = 0; i < pos; i++)
                    {
                        if (term.Bytes[i] != b2.Bytes[i])
                        {
                            matches = false;
                            break;
                        }
                    }
                }
                else
                {
                    matches = false;
                }

                // Restore term:
                term.Length = savLength;
                term.Bytes[pos] = (byte)Scratch[0];
                term.Bytes[pos + 1] = (byte)Scratch[1];
                term.Bytes[pos + 2] = (byte)Scratch[2];

                return matches;
            }
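Why the bytes 0xF0 0x90 0x80 0x80: they are the UTF-8 encoding of U+10000, the smallest non-BMP code point, so writing them over the high-BMP character seeks to the first possible surrogate-pair term at that position. This is checkable in plain C#:

                // Self-contained check of the constant used above:
                byte[] utf8 = System.Text.Encoding.UTF8.GetBytes(char.ConvertFromUtf32(0x10000));
                Console.WriteLine(BitConverter.ToString(utf8)); // prints F0-90-80-80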
            private void LoadTerms()
            {
                var posIntOutputs = PositiveInt32Outputs.Singleton;
                var outputsInner  = new PairOutputs <long?, long?>(posIntOutputs, posIntOutputs);
                var outputs       = new PairOutputs <long?, PairOutputs <long?, long?> .Pair>(posIntOutputs, outputsInner);

                // honestly, wtf kind of generic mess is this.
                var b     = new Builder <PairOutputs <long?, PairOutputs <long?, long?> .Pair> .Pair>(FST.INPUT_TYPE.BYTE1, outputs);
                var input = (IndexInput)_outerInstance._input.Clone();

                input.Seek(_termsStart);

                var  lastTerm      = new BytesRef(10);
                long lastDocsStart = -1;
                int  docFreq       = 0;
                long totalTermFreq = 0;
                var  visitedDocs   = new FixedBitSet(_maxDoc);

                var scratchIntsRef = new Int32sRef();

                while (true)
                {
                    SimpleTextUtil.ReadLine(input, _scratch);
                    if (_scratch.Equals(SimpleTextFieldsWriter.END) || StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.FIELD))
                    {
                        if (lastDocsStart != -1)
                        {
                            b.Add(Util.ToInt32sRef(lastTerm, scratchIntsRef),
                                  outputs.NewPair(lastDocsStart, outputsInner.NewPair(docFreq, totalTermFreq)));
                            _sumTotalTermFreq += totalTermFreq;
                        }
                        break;
                    }

                    if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.DOC))
                    {
                        docFreq++;
                        _sumDocFreq++;
                        UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.DOC.Length, _scratch.Length - SimpleTextFieldsWriter.DOC.Length,
                                                _scratchUtf16);
                        int docId = ArrayUtil.ParseInt32(_scratchUtf16.Chars, 0, _scratchUtf16.Length);
                        visitedDocs.Set(docId);
                    }
                    else if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.FREQ))
                    {
                        UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.FREQ.Length,
                                                _scratch.Length - SimpleTextFieldsWriter.FREQ.Length, _scratchUtf16);
                        totalTermFreq += ArrayUtil.ParseInt32(_scratchUtf16.Chars, 0, _scratchUtf16.Length);
                    }
                    else if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.TERM))
                    {
                        if (lastDocsStart != -1)
                        {
                            b.Add(Util.ToInt32sRef(lastTerm, scratchIntsRef),
                                  outputs.NewPair(lastDocsStart, outputsInner.NewPair(docFreq, totalTermFreq)));
                        }
                        lastDocsStart = input.GetFilePointer();
                        int len = _scratch.Length - SimpleTextFieldsWriter.TERM.Length;
                        if (len > lastTerm.Length)
                        {
                            lastTerm.Grow(len);
                        }
                        Array.Copy(_scratch.Bytes, SimpleTextFieldsWriter.TERM.Length, lastTerm.Bytes, 0, len);
                        lastTerm.Length    = len;
                        docFreq            = 0;
                        _sumTotalTermFreq += totalTermFreq;
                        totalTermFreq      = 0;
                        _termCount++;
                    }
                }
                _docCount = visitedDocs.Cardinality();
                _fst      = b.Finish();
            }
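A hedged read-back sketch: the finished FST maps each term to a pair (docsStart, (docFreq, totalTermFreq)). Util.Get can fetch one entry; the term literal and the Output1/Output2 accessor names are assumptions.

                // Sketch only:
                var entry = Util.Get(_fst, new BytesRef("someterm"));
                if (entry != null)
                {
                    long docsStart     = entry.Output1.Value;
                    long docFreq       = entry.Output2.Output1.Value;
                    long totalTermFreq = entry.Output2.Output2.Value;
                }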
Example #24
            public override int NextPosition()
            {
                int pos;

                if (_readPositions)
                {
                    SimpleTextUtil.ReadLine(_in, _scratch);
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.POS), () => "got line=" + _scratch.Utf8ToString());
                    }
                    UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.POS.Length, _scratch.Length - SimpleTextFieldsWriter.POS.Length,
                                            _scratchUtf162);
                    pos = ArrayUtil.ParseInt32(_scratchUtf162.Chars, 0, _scratchUtf162.Length);
                }
                else
                {
                    pos = -1;
                }

                if (_readOffsets)
                {
                    SimpleTextUtil.ReadLine(_in, _scratch);
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.START_OFFSET), () => "got line=" + _scratch.Utf8ToString());
                    }
                    UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.START_OFFSET.Length,
                                            _scratch.Length - SimpleTextFieldsWriter.START_OFFSET.Length, _scratchUtf162);
                    _startOffset = ArrayUtil.ParseInt32(_scratchUtf162.Chars, 0, _scratchUtf162.Length);
                    SimpleTextUtil.ReadLine(_in, _scratch);
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.END_OFFSET), () => "got line=" + _scratch.Utf8ToString());
                    }
                    UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.END_OFFSET.Length,
                                            _scratch.Length - SimpleTextFieldsWriter.END_OFFSET.Length, _scratchUtf162);
                    _endOffset = ArrayUtil.ParseInt32(_scratchUtf162.Chars, 0, _scratchUtf162.Length);
                }

                long fp = _in.GetFilePointer();

                SimpleTextUtil.ReadLine(_in, _scratch);
                if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.PAYLOAD))
                {
                    int len = _scratch.Length - SimpleTextFieldsWriter.PAYLOAD.Length;
                    if (_scratch2.Bytes.Length < len)
                    {
                        _scratch2.Grow(len);
                    }
                    Array.Copy(_scratch.Bytes, SimpleTextFieldsWriter.PAYLOAD.Length, _scratch2.Bytes, 0, len);
                    _scratch2.Length = len;
                    _payload         = _scratch2;
                }
                else
                {
                    _payload = null;
                    _in.Seek(fp);
                }
                return(pos);
            }
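A hedged sketch of the consuming side: after NextDoc(), callers invoke NextPosition() exactly Freq times, and each call may expose the offsets and payload parsed above. The docsAndPositions variable is assumed.

                // Sketch only:
                while (docsAndPositions.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
                {
                    for (int i = 0; i < docsAndPositions.Freq; i++)
                    {
                        int position = docsAndPositions.NextPosition();
                        BytesRef payload = docsAndPositions.GetPayload();
                        // offsets, when indexed, via StartOffset / EndOffset
                    }
                }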
Example #25
        //private static final boolean DEBUG = BlockTreeTermsWriter.DEBUG;

        private BytesRef AddTail(int state, BytesRef term, int idx, int leadLabel)
        {
            // Find biggest transition that's < label
            // TODO: use binary search here
            Transition maxTransition = null;

            foreach (Transition transition in sortedTransitions[state])
            {
                if (transition.min < leadLabel)
                {
                    maxTransition = transition;
                }
            }

            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(maxTransition != null);
            }

            // Append floorLabel
            int floorLabel;

            if (maxTransition.max > leadLabel - 1)
            {
                floorLabel = leadLabel - 1;
            }
            else
            {
                floorLabel = maxTransition.max;
            }
            if (idx >= term.Bytes.Length)
            {
                term.Grow(1 + idx);
            }
            //if (DEBUG) System.out.println("  add floorLabel=" + (char) floorLabel + " idx=" + idx);
            term.Bytes[idx] = (byte)floorLabel;

            state = maxTransition.to.Number;
            idx++;

            // Push down to last accept state
            while (true)
            {
                Transition[] transitions = sortedTransitions[state];
                if (transitions.Length == 0)
                {
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(RunAutomaton.IsAccept(state));
                    }
                    term.Length = idx;
                    //if (DEBUG) System.out.println("  return " + term.utf8ToString());
                    return(term);
                }
                else
                {
                    // We are pushing "top" -- so get last label of
                    // last transition:
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(transitions.Length != 0);
                    }
                    Transition lastTransition = transitions[transitions.Length - 1];
                    if (idx >= term.Bytes.Length)
                    {
                        term.Grow(1 + idx);
                    }
                    //if (DEBUG) System.out.println("  push maxLabel=" + (char) lastTransition.max + " idx=" + idx);
                    term.Bytes[idx] = (byte)lastTransition.max;
                    state           = lastTransition.to.Number;
                    idx++;
                }
            }
        }
Example #26
 /// <summary>
 /// Returns the <i>n'th</i> element of this <seealso cref="BytesRefArray"/> </summary>
 /// <param name="spare"> a spare <seealso cref="BytesRef"/> instance </param>
 /// <param name="index"> the elements index to retrieve </param>
 /// <returns> the <i>n'th</i> element of this <seealso cref="BytesRefArray"/> </returns>
 public BytesRef Get(BytesRef spare, int index)
 {
     if (LastElement > index)
     {
         int offset = Offsets[index];
         int length = index == LastElement - 1 ? CurrentOffset - offset : Offsets[index + 1] - offset;
         Debug.Assert(spare.Offset == 0);
         spare.Grow(length);
         spare.Length = length;
         Pool.ReadBytes(offset, spare.Bytes, spare.Offset, spare.Length);
         return spare;
     }
     throw new System.IndexOutOfRangeException("index " + index + " must be less than the size: " + LastElement);
 }
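A minimal round-trip sketch for Get, assuming the surrounding BytesRefArray API:

     // Sketch only:
     var array = new BytesRefArray(Counter.NewCounter());
     array.Append(new BytesRef("alpha"));
     array.Append(new BytesRef("beta"));
     var spare = new BytesRef();
     Console.WriteLine(array.Get(spare, 1).Utf8ToString()); // prints "beta"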
Example #27
        /* Walk through all unique text tokens (Posting
         * instances) found in this field and serialize them
         * into a single RAM segment. */

        internal void Flush(string fieldName, FieldsConsumer consumer, SegmentWriteState state)
        {
            if (!fieldInfo.Indexed)
            {
                return; // nothing to flush, don't bother the codec with the unindexed field
            }

            TermsConsumer        termsConsumer = consumer.AddField(fieldInfo);
            IComparer <BytesRef> termComp      = termsConsumer.Comparator;

            // CONFUSING: this.indexOptions holds the index options
            // that were current when we first saw this field.  But
            // it's possible this has changed, eg when other
            // documents are indexed that cause a "downgrade" of the
            // IndexOptions.  So we must decode the in-RAM buffer
            // according to this.indexOptions, but then write the
            // new segment to the directory according to
            // currentFieldIndexOptions:
            FieldInfo.IndexOptions? currentFieldIndexOptions = fieldInfo.FieldIndexOptions;
            Debug.Assert(currentFieldIndexOptions != null);

            bool writeTermFreq  = currentFieldIndexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS;
            bool writePositions = currentFieldIndexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
            bool writeOffsets   = currentFieldIndexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;

            bool readTermFreq  = this.HasFreq;
            bool readPositions = this.HasProx;
            bool readOffsets   = this.HasOffsets;

            //System.out.println("flush readTF=" + readTermFreq + " readPos=" + readPositions + " readOffs=" + readOffsets);

            // Make sure FieldInfo.update is working correctly!:
            Debug.Assert(!writeTermFreq || readTermFreq);
            Debug.Assert(!writePositions || readPositions);
            Debug.Assert(!writeOffsets || readOffsets);

            Debug.Assert(!writeOffsets || writePositions);

            IDictionary <Term, int?> segDeletes;

            if (state.SegUpdates != null && state.SegUpdates.Terms.Count > 0)
            {
                segDeletes = state.SegUpdates.Terms;
            }
            else
            {
                segDeletes = null;
            }

            int[]    termIDs  = TermsHashPerField.SortPostings(termComp);
            int      numTerms = TermsHashPerField.BytesHash.Size();
            BytesRef text     = new BytesRef();
            FreqProxPostingsArray postings = (FreqProxPostingsArray)TermsHashPerField.PostingsArray;
            ByteSliceReader       freq     = new ByteSliceReader();
            ByteSliceReader       prox     = new ByteSliceReader();

            FixedBitSet visitedDocs      = new FixedBitSet(state.SegmentInfo.DocCount);
            long        sumTotalTermFreq = 0;
            long        sumDocFreq       = 0;

            Term protoTerm = new Term(fieldName);

            for (int i = 0; i < numTerms; i++)
            {
                int termID = termIDs[i];
                // Get BytesRef
                int textStart = postings.TextStarts[termID];
                TermsHashPerField.BytePool.SetBytesRef(text, textStart);

                TermsHashPerField.InitReader(freq, termID, 0);
                if (readPositions || readOffsets)
                {
                    TermsHashPerField.InitReader(prox, termID, 1);
                }

                // TODO: really TermsHashPerField should take over most
                // of this loop, including merge sort of terms from
                // multiple threads and interacting with the
                // TermsConsumer, only calling out to us (passing us the
                // DocsConsumer) to handle delivery of docs/positions

                PostingsConsumer postingsConsumer = termsConsumer.StartTerm(text);

                int? delDocLimit;
                if (segDeletes != null)
                {
                    protoTerm.Bytes = text;
                    int? docIDUpto;
                    segDeletes.TryGetValue(protoTerm, out docIDUpto);
                    if (docIDUpto != null)
                    {
                        delDocLimit = docIDUpto;
                    }
                    else
                    {
                        delDocLimit = 0;
                    }
                }
                else
                {
                    delDocLimit = 0;
                }

                // Now termStates has numToMerge FieldMergeStates
                // which all share the same term.  Now we must
                // interleave the docID streams.
                int  docFreq       = 0;
                long totalTermFreq = 0;
                int  docID         = 0;

                while (true)
                {
                    //System.out.println("  cycle");
                    int termFreq;
                    if (freq.Eof())
                    {
                        if (postings.LastDocCodes[termID] != -1)
                        {
                            // Return last doc
                            docID = postings.LastDocIDs[termID];
                            if (readTermFreq)
                            {
                                termFreq = postings.TermFreqs[termID];
                            }
                            else
                            {
                                termFreq = -1;
                            }
                            postings.LastDocCodes[termID] = -1;
                        }
                        else
                        {
                            // EOF
                            break;
                        }
                    }
                    else
                    {
                        int code = freq.ReadVInt();
                        if (!readTermFreq)
                        {
                            docID   += code;
                            termFreq = -1;
                        }
                        else
                        {
                            docID += (int)((uint)code >> 1);
                            if ((code & 1) != 0)
                            {
                                termFreq = 1;
                            }
                            else
                            {
                                termFreq = freq.ReadVInt();
                            }
                        }

                        Debug.Assert(docID != postings.LastDocIDs[termID]);
                    }

                    docFreq++;
                    Debug.Assert(docID < state.SegmentInfo.DocCount, "doc=" + docID + " maxDoc=" + state.SegmentInfo.DocCount);

                    // NOTE: we could check here if the docID was
                    // deleted, and skip it.  However, this is somewhat
                    // dangerous because it can yield non-deterministic
                    // behavior since we may see the docID before we see
                    // the term that caused it to be deleted.  this
                    // would mean some (but not all) of its postings may
                    // make it into the index, which'd alter the docFreq
                    // for those terms.  We could fix this by doing two
                    // passes, ie first sweep marks all del docs, and
                    // 2nd sweep does the real flush, but I suspect
                    // that'd add too much time to flush.
                    visitedDocs.Set(docID);
                    postingsConsumer.StartDoc(docID, writeTermFreq ? termFreq : -1);
                    if (docID < delDocLimit)
                    {
                        // Mark it deleted.  TODO: we could also skip
                        // writing its postings; this would be
                        // deterministic (just for this Term's docs).

                        // TODO: can we do this reach-around in a cleaner way????
                        if (state.LiveDocs == null)
                        {
                            state.LiveDocs = DocState.DocWriter.Codec.LiveDocsFormat().NewLiveDocs(state.SegmentInfo.DocCount);
                        }
                        if (state.LiveDocs.Get(docID))
                        {
                            state.DelCountOnFlush++;
                            state.LiveDocs.Clear(docID);
                        }
                    }

                    totalTermFreq += termFreq;

                    // Carefully copy over the prox + payload info,
                    // changing the format to match Lucene's segment
                    // format.

                    if (readPositions || readOffsets)
                    {
                        // we did record positions (& maybe payload) and/or offsets
                        int position = 0;
                        int offset   = 0;
                        for (int j = 0; j < termFreq; j++)
                        {
                            BytesRef thisPayload;

                            if (readPositions)
                            {
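                                // prox code layout: positionDelta << 1 | (payload present ? 1 : 0)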
                                int code = prox.ReadVInt();
                                position += (int)((uint)code >> 1);

                                if ((code & 1) != 0)
                                {
                                    // this position has a payload
                                    int payloadLength = prox.ReadVInt();

                                    if (Payload == null)
                                    {
                                        Payload       = new BytesRef();
                                        Payload.Bytes = new sbyte[payloadLength];
                                    }
                                    else if (Payload.Bytes.Length < payloadLength)
                                    {
                                        Payload.Grow(payloadLength);
                                    }

                                    prox.ReadBytes(Payload.Bytes, 0, payloadLength);
                                    Payload.Length = payloadLength;
                                    thisPayload    = Payload;
                                }
                                else
                                {
                                    thisPayload = null;
                                }

                                if (readOffsets)
                                {
                                    int startOffset = offset + prox.ReadVInt();
                                    int endOffset   = startOffset + prox.ReadVInt();
                                    if (writePositions)
                                    {
                                        if (writeOffsets)
                                        {
                                            Debug.Assert(startOffset >= 0 && endOffset >= startOffset, "startOffset=" + startOffset + ",endOffset=" + endOffset + ",offset=" + offset);
                                            postingsConsumer.AddPosition(position, thisPayload, startOffset, endOffset);
                                        }
                                        else
                                        {
                                            postingsConsumer.AddPosition(position, thisPayload, -1, -1);
                                        }
                                    }
                                    offset = startOffset;
                                }
                                else if (writePositions)
                                {
                                    postingsConsumer.AddPosition(position, thisPayload, -1, -1);
                                }
                            }
                        }
                    }
                    postingsConsumer.FinishDoc();
                }
                termsConsumer.FinishTerm(text, new TermStats(docFreq, writeTermFreq ? totalTermFreq : -1));
                sumTotalTermFreq += totalTermFreq;
                sumDocFreq       += docFreq;
            }

            termsConsumer.Finish(writeTermFreq ? sumTotalTermFreq : -1, sumDocFreq, visitedDocs.Cardinality());
        }
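
The loop above decodes the packed doc-delta/frequency VInts written by the in-memory postings: each entry is (docDelta << 1), with the low bit set when termFreq == 1 and an explicit second VInt carrying the frequency otherwise. A minimal round-trip sketch of that scheme (a hypothetical DocFreqCodec helper that models each VInt as a plain int for brevity; not the codec's actual writer):

using System.Collections.Generic;

static class DocFreqCodec
{
    // Pack (docDelta, termFreq) the way the flush loop above unpacks it.
    public static void Encode(IList<int> vints, int docDelta, int termFreq)
    {
        if (termFreq == 1)
        {
            vints.Add((docDelta << 1) | 1); // low bit set: freq == 1 implied
        }
        else
        {
            vints.Add(docDelta << 1);       // low bit clear: freq follows
            vints.Add(termFreq);
        }
    }

    public static (int DocDelta, int TermFreq) Decode(IReadOnlyList<int> vints, ref int pos)
    {
        int code = vints[pos++];
        int docDelta = (int)((uint)code >> 1);
        int termFreq = (code & 1) != 0 ? 1 : vints[pos++];
        return (docDelta, termFreq);
    }
}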
Exemple #28
                /// <remarks>
                /// TODO: we may want an alternate mode here which is
                /// "if you are about to return NOT_FOUND I won't use
                /// the terms data from that"; e.g. FuzzyTermsEnum will
                /// (usually) just immediately call seek again if we
                /// return NOT_FOUND, so it's a waste for us to fill in
                /// the term that was actually NOT_FOUND.
                /// </remarks>
                public override SeekStatus SeekCeil(BytesRef target)
                {
                    if (indexEnum == null)
                    {
                        throw new InvalidOperationException("terms index was not loaded");
                    }

                    //System.out.println("BTR.seek seg=" + segment + " target=" + fieldInfo.name + ":" + target.utf8ToString() + " " + target + " current=" + term().utf8ToString() + " " + term() + " indexIsCurrent=" + indexIsCurrent + " didIndexNext=" + didIndexNext + " seekPending=" + seekPending + " divisor=" + indexReader.getDivisor() + " this="  + this);
                    if (didIndexNext)
                    {
                        if (nextIndexTerm == null)
                        {
                            //System.out.println("  nextIndexTerm=null");
                        }
                        else
                        {
                            //System.out.println("  nextIndexTerm=" + nextIndexTerm.utf8ToString());
                        }
                    }

                    bool doSeek = true;

                    // See if we can avoid seeking, because target term
                    // is after current term but before next index term:
                    if (indexIsCurrent)
                    {
                        int cmp = BytesRef.UTF8SortedAsUnicodeComparer.Compare(term, target);

                        if (cmp == 0)
                        {
                            // Already at the requested term
                            return(SeekStatus.FOUND);
                        }
                        else if (cmp < 0)
                        {
                            // Target term is after current term
                            if (!didIndexNext)
                            {
                                if (indexEnum.Next() == -1)
                                {
                                    nextIndexTerm = null;
                                }
                                else
                                {
                                    nextIndexTerm = indexEnum.Term;
                                }
                                //System.out.println("  now do index next() nextIndexTerm=" + (nextIndexTerm == null ? "null" : nextIndexTerm.utf8ToString()));
                                didIndexNext = true;
                            }

                            if (nextIndexTerm == null || BytesRef.UTF8SortedAsUnicodeComparer.Compare(target, nextIndexTerm) < 0)
                            {
                                // Optimization: requested term is within the
                                // same term block we are now in; skip seeking
                                // (but do scanning):
                                doSeek = false;
                                //System.out.println("  skip seek: nextIndexTerm=" + (nextIndexTerm == null ? "null" : nextIndexTerm.utf8ToString()));
                            }
                        }
                    }

                    if (doSeek)
                    {
                        //System.out.println("  seek");

                        // Ask terms index to find biggest indexed term (=
                        // first term in a block) that's <= our text:
                        input.Seek(indexEnum.Seek(target));
                        bool result = NextBlock();

                        // Block must exist since, at least, the indexed term
                        // is in the block:
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(result);
                        }

                        indexIsCurrent  = true;
                        didIndexNext    = false;
                        blocksSinceSeek = 0;

                        if (doOrd)
                        {
                            state.Ord = indexEnum.Ord - 1;
                        }

                        term.CopyBytes(indexEnum.Term);
                        //System.out.println("  seek: term=" + term.utf8ToString());
                    }
                    else
                    {
                        //System.out.println("  skip seek");
                        if (state.TermBlockOrd == blockTermCount && !NextBlock())
                        {
                            indexIsCurrent = false;
                            return(SeekStatus.END);
                        }
                    }

                    seekPending = false;

                    int common = 0;

                    // Scan within block.  We could do this by calling
                    // _next() and testing the resulting term, but this
                    // is wasteful.  Instead, we first confirm the
                    // target matches the common prefix of this block,
                    // and then we scan the term bytes directly from the
                    // termSuffixesReader's byte[], saving a copy into
                    // the BytesRef term per term.  Only when we return
                    // do we then copy the bytes into the term.

                    while (true)
                    {
                        // First, see if target term matches common prefix
                        // in this block:
                        if (common < termBlockPrefix)
                        {
                            int cmp = (term.Bytes[common] & 0xFF) - (target.Bytes[target.Offset + common] & 0xFF);
                            if (cmp < 0)
                            {
                                // TODO: maybe we should store common prefix
                                // in block header?  (instead of relying on
                                // last term of previous block)

                                // Target's prefix is after the common block
                                // prefix, so term cannot be in this block
                                // but it could be in next block.  We
                                // must scan to end-of-block to set common
                                // prefix for next block:
                                if (state.TermBlockOrd < blockTermCount)
                                {
                                    while (state.TermBlockOrd < blockTermCount - 1)
                                    {
                                        state.TermBlockOrd++;
                                        state.Ord++;
                                        termSuffixesReader.SkipBytes(termSuffixesReader.ReadVInt32());
                                    }
                                    int suffix = termSuffixesReader.ReadVInt32();
                                    term.Length = termBlockPrefix + suffix;
                                    if (term.Bytes.Length < term.Length)
                                    {
                                        term.Grow(term.Length);
                                    }
                                    termSuffixesReader.ReadBytes(term.Bytes, termBlockPrefix, suffix);
                                }
                                state.Ord++;

                                if (!NextBlock())
                                {
                                    indexIsCurrent = false;
                                    return(SeekStatus.END);
                                }
                                common = 0;
                            }
                            else if (cmp > 0)
                            {
                                // Target's prefix is before the common prefix
                                // of this block, so we position to start of
                                // block and return NOT_FOUND:
                                if (Debugging.AssertsEnabled)
                                {
                                    Debugging.Assert(state.TermBlockOrd == 0);
                                }

                                int suffix = termSuffixesReader.ReadVInt32();
                                term.Length = termBlockPrefix + suffix;
                                if (term.Bytes.Length < term.Length)
                                {
                                    term.Grow(term.Length);
                                }
                                termSuffixesReader.ReadBytes(term.Bytes, termBlockPrefix, suffix);
                                return(SeekStatus.NOT_FOUND);
                            }
                            else
                            {
                                common++;
                            }

                            continue;
                        }

                        // Test every term in this block
                        while (true)
                        {
                            state.TermBlockOrd++;
                            state.Ord++;

                            int suffix = termSuffixesReader.ReadVInt32();

                            // We know the prefix matches, so just compare the new suffix:
                            int termLen = termBlockPrefix + suffix;
                            int bytePos = termSuffixesReader.Position;

                            bool next      = false;
                            int  limit     = target.Offset + (termLen < target.Length ? termLen : target.Length);
                            int  targetPos = target.Offset + termBlockPrefix;
                            while (targetPos < limit)
                            {
                                int cmp = (termSuffixes[bytePos++] & 0xFF) - (target.Bytes[targetPos++] & 0xFF);
                                if (cmp < 0)
                                {
                                    // Current term is still before the target;
                                    // keep scanning
                                    next = true;
                                    break;
                                }
                                else if (cmp > 0)
                                {
                                    // Done!  Current term is after target. Stop
                                    // here, fill in real term, return NOT_FOUND.
                                    term.Length = termBlockPrefix + suffix;
                                    if (term.Bytes.Length < term.Length)
                                    {
                                        term.Grow(term.Length);
                                    }
                                    termSuffixesReader.ReadBytes(term.Bytes, termBlockPrefix, suffix);
                                    //System.out.println("  NOT_FOUND");
                                    return(SeekStatus.NOT_FOUND);
                                }
                            }

                            if (!next && target.Length <= termLen)
                            {
                                term.Length = termBlockPrefix + suffix;
                                if (term.Bytes.Length < term.Length)
                                {
                                    term.Grow(term.Length);
                                }
                                termSuffixesReader.ReadBytes(term.Bytes, termBlockPrefix, suffix);

                                if (target.Length == termLen)
                                {
                                    // Done!  Exact match.  Stop here, fill in
                                    // real term, return FOUND.
                                    //System.out.println("  FOUND");
                                    return(SeekStatus.FOUND);
                                }
                                else
                                {
                                    //System.out.println("  NOT_FOUND");
                                    return(SeekStatus.NOT_FOUND);
                                }
                            }

                            if (state.TermBlockOrd == blockTermCount)
                            {
                                // Must pre-fill term for next block's common prefix
                                term.Length = termBlockPrefix + suffix;
                                if (term.Bytes.Length < term.Length)
                                {
                                    term.Grow(term.Length);
                                }
                                termSuffixesReader.ReadBytes(term.Bytes, termBlockPrefix, suffix);
                                break;
                            }
                            else
                            {
                                termSuffixesReader.SkipBytes(suffix);
                            }
                        }

                        // The purpose of the terms dict index is to seek
                        // the enum to the closest index term before the
                        // term we are looking for.  So, we should never
                        // cross another index term (besides the first
                        // one) while we are scanning:

                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(indexIsCurrent);
                        }

                        if (!NextBlock())
                        {
                            //System.out.println("  END");
                            indexIsCurrent = false;
                            return(SeekStatus.END);
                        }
                        common = 0;
                    }
                }
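
SeekCeil above exploits the block layout: each block stores one common prefix plus per-term suffixes, so the target is checked against the prefix once and only suffix bytes are compared per candidate term. A self-contained sketch of that comparison in the same unsigned-byte (UTF-8) order used by BytesRef.UTF8SortedAsUnicodeComparer (a hypothetical helper, not part of the codec):

using System;

static class PrefixedTermCompare
{
    // Compares the stored term (prefix + suffix) against target:
    // < 0 if it sorts before target, 0 if equal, > 0 if after.
    public static int Compare(byte[] prefix, byte[] suffix, byte[] target)
    {
        int termLen = prefix.Length + suffix.Length;
        int limit = Math.Min(termLen, target.Length);
        for (int i = 0; i < limit; i++)
        {
            int termByte = i < prefix.Length ? prefix[i] : suffix[i - prefix.Length];
            int cmp = (termByte & 0xFF) - (target[i] & 0xFF);
            if (cmp != 0)
            {
                return cmp;
            }
        }
        return termLen - target.Length; // shorter term sorts first
    }
}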
Exemple #29
        public override void Build(IInputEnumerator enumerator)
        {
            if (enumerator.HasContexts)
            {
                throw new ArgumentException("this suggester doesn't support contexts");
            }
            string prefix     = this.GetType().Name;
            var    directory  = OfflineSorter.DefaultTempDir();
            var    tempInput  = FileSupport.CreateTempFile(prefix, ".input", directory);
            var    tempSorted = FileSupport.CreateTempFile(prefix, ".sorted", directory);

            hasPayloads = enumerator.HasPayloads;

            var writer = new OfflineSorter.ByteSequencesWriter(tempInput);

            OfflineSorter.ByteSequencesReader reader = null;
            var scratch = new BytesRef();

            TokenStreamToAutomaton ts2a = GetTokenStreamToAutomaton();

            bool success = false;

            count = 0;
            byte[] buffer = new byte[8];
            try
            {
                var      output = new ByteArrayDataOutput(buffer);
                BytesRef surfaceForm;

                while (enumerator.MoveNext())
                {
                    surfaceForm = enumerator.Current;
                    ISet <Int32sRef> paths = ToFiniteStrings(surfaceForm, ts2a);

                    maxAnalyzedPathsForOneInput = Math.Max(maxAnalyzedPathsForOneInput, paths.Count);

                    foreach (Int32sRef path in paths)
                    {
                        Util.Fst.Util.ToBytesRef(path, scratch);

                        // length of the analyzed text (FST input)
                        if (scratch.Length > ushort.MaxValue - 2)
                        {
                            throw new ArgumentException("cannot handle analyzed forms > " + (ushort.MaxValue - 2) +
                                                        " in length (got " + scratch.Length + ")");
                        }
                        ushort analyzedLength = (ushort)scratch.Length;

                        // compute the required length, in write order:
                        // analyzedLength (short) + analyzed sequence + weight (4) + surface
                        int requiredLength = analyzedLength + 4 + surfaceForm.Length + 2;

                        BytesRef payload;

                        if (hasPayloads)
                        {
                            if (surfaceForm.Length > (ushort.MaxValue - 2))
                            {
                                throw new ArgumentException("cannot handle surface form > " + (ushort.MaxValue - 2) +
                                                            " in length (got " + surfaceForm.Length + ")");
                            }
                            payload = enumerator.Payload;
                            // payload + surfaceLength (short)
                            requiredLength += payload.Length + 2;
                        }
                        else
                        {
                            payload = null;
                        }

                        buffer = ArrayUtil.Grow(buffer, requiredLength);

                        output.Reset(buffer);

                        output.WriteInt16((short)analyzedLength);

                        output.WriteBytes(scratch.Bytes, scratch.Offset, scratch.Length);

                        output.WriteInt32(EncodeWeight(enumerator.Weight));

                        if (hasPayloads)
                        {
                            for (int i = 0; i < surfaceForm.Length; i++)
                            {
                                if (surfaceForm.Bytes[i] == PAYLOAD_SEP)
                                {
                                    throw new ArgumentException(
                                              "surface form cannot contain unit separator character U+001F; this character is reserved");
                                }
                            }
                            output.WriteInt16((short)surfaceForm.Length);
                            output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
                            output.WriteBytes(payload.Bytes, payload.Offset, payload.Length);
                        }
                        else
                        {
                            output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
                        }
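
                        // Resulting record layout written to the sorter:
                        //   with payloads:    [analyzedLength:2][analyzed][weight:4][surfaceLength:2][surface][payload]
                        //   without payloads: [analyzedLength:2][analyzed][weight:4][surface]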

                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(output.Position == requiredLength, () => output.Position + " vs " + requiredLength);
                        }

                        writer.Write(buffer, 0, output.Position);
                    }
                    count++;
                }
                writer.Dispose();

                // Sort all input/output pairs (required by FST.Builder):
                (new OfflineSorter(new AnalyzingComparer(hasPayloads))).Sort(tempInput, tempSorted);

                // Free disk space:
                tempInput.Delete();

                reader = new OfflineSorter.ByteSequencesReader(tempSorted);

                var outputs = new PairOutputs <long?, BytesRef>(PositiveInt32Outputs.Singleton,
                                                                ByteSequenceOutputs.Singleton);
                var builder = new Builder <PairOutputs <long?, BytesRef> .Pair>(FST.INPUT_TYPE.BYTE1, outputs);

                // Build FST:
                BytesRef  previousAnalyzed = null;
                BytesRef  analyzed         = new BytesRef();
                BytesRef  surface          = new BytesRef();
                Int32sRef scratchInts      = new Int32sRef();
                var       input            = new ByteArrayDataInput();

                // Used to remove duplicate surface forms (but we
                // still index the highest-weight one).  We clear
                // this when we see a new analyzed form, so it cannot
                // grow unbounded (at most 256 entries):
                var seenSurfaceForms = new JCG.HashSet <BytesRef>();

                var dedup = 0;
                while (reader.Read(scratch))
                {
                    input.Reset(scratch.Bytes, scratch.Offset, scratch.Length);
                    ushort analyzedLength = (ushort)input.ReadInt16();
                    analyzed.Grow(analyzedLength + 2);
                    input.ReadBytes(analyzed.Bytes, 0, analyzedLength);
                    analyzed.Length = analyzedLength;

                    long cost = input.ReadInt32();

                    surface.Bytes = scratch.Bytes;
                    if (hasPayloads)
                    {
                        surface.Length = (ushort)input.ReadInt16();
                        surface.Offset = input.Position;
                    }
                    else
                    {
                        surface.Offset = input.Position;
                        surface.Length = scratch.Length - surface.Offset;
                    }

                    if (previousAnalyzed == null)
                    {
                        previousAnalyzed = new BytesRef();
                        previousAnalyzed.CopyBytes(analyzed);
                        seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
                    }
                    else if (analyzed.Equals(previousAnalyzed))
                    {
                        dedup++;
                        if (dedup >= maxSurfaceFormsPerAnalyzedForm)
                        {
                            // More than maxSurfaceFormsPerAnalyzedForm
                            // dups: skip the rest:
                            continue;
                        }
                        if (seenSurfaceForms.Contains(surface))
                        {
                            continue;
                        }
                        seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
                    }
                    else
                    {
                        dedup = 0;
                        previousAnalyzed.CopyBytes(analyzed);
                        seenSurfaceForms.Clear();
                        seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
                    }

                    // TODO: I think we can avoid the extra 2 bytes when
                    // there is no dup (dedup==0), but we'd have to fix
                    // the exactFirst logic ... which would be sort of
                    // hairy because we'd need to special case the two
                    // (dup/not dup)...

                    // NOTE: must be byte 0 so we sort before whatever
                    // is next
                    analyzed.Bytes[analyzed.Offset + analyzed.Length]     = 0;
                    analyzed.Bytes[analyzed.Offset + analyzed.Length + 1] = (byte)dedup;
                    analyzed.Length += 2;
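
                    // e.g. duplicate surface forms for one analyzed form get distinct
                    // FST inputs: analyzed+{0,0}, analyzed+{0,1}, analyzed+{0,2}, ...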

                    Util.Fst.Util.ToInt32sRef(analyzed, scratchInts);
                    //System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
                    if (!hasPayloads)
                    {
                        builder.Add(scratchInts, outputs.NewPair(cost, BytesRef.DeepCopyOf(surface)));
                    }
                    else
                    {
                        int      payloadOffset = input.Position + surface.Length;
                        int      payloadLength = scratch.Length - payloadOffset;
                        BytesRef br            = new BytesRef(surface.Length + 1 + payloadLength);
                        Array.Copy(surface.Bytes, surface.Offset, br.Bytes, 0, surface.Length);
                        br.Bytes[surface.Length] = PAYLOAD_SEP;
                        Array.Copy(scratch.Bytes, payloadOffset, br.Bytes, surface.Length + 1, payloadLength);
                        br.Length = br.Bytes.Length;
                        builder.Add(scratchInts, outputs.NewPair(cost, br));
                    }
                }
                fst = builder.Finish();

                //Util.dotToFile(fst, "/tmp/suggest.dot");

                success = true;
            }
            finally
            {
                if (success)
                {
                    IOUtils.Dispose(reader, writer);
                }
                else
                {
                    IOUtils.DisposeWhileHandlingException(reader, writer);
                }

                tempInput.Delete();
                tempSorted.Delete();
            }
        }
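
For context, a minimal build-and-lookup sketch against this Build method, assuming Lucene.NET 4.8's suggest API (AnalyzingSuggester, Input, InputArrayEnumerator, and Lookup.DoLookup; exact names can differ between beta releases):

using Lucene.Net.Analysis.Standard;
using Lucene.Net.Search.Suggest;
using Lucene.Net.Search.Suggest.Analyzing;
using Lucene.Net.Util;

var analyzer = new StandardAnalyzer(LuceneVersion.LUCENE_48);
var suggester = new AnalyzingSuggester(analyzer);

// Build() consumes an IInputEnumerator; InputArrayEnumerator wraps a fixed list:
suggester.Build(new InputArrayEnumerator(new[]
{
    new Input("lucene in action", 10),
    new Input("lucene for dummies", 5),
}));

// Top-2 completions for the prefix "lucene", ordered by weight:
foreach (var result in suggester.DoLookup("lucene", false, 2))
{
    System.Console.WriteLine($"{result.Key} ({result.Value})");
}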
            public override BytesRef Next()
            {
                if (NextTerm >= NumTerms)
                {
                    return(null);
                }
                Term_Renamed.CopyBytes(LastTerm);
                int start    = Tvf.ReadVInt();
                int deltaLen = Tvf.ReadVInt();

                Term_Renamed.Length = start + deltaLen;
                Term_Renamed.Grow(Term_Renamed.Length);
                Tvf.ReadBytes(Term_Renamed.Bytes, start, deltaLen);
                Freq = Tvf.ReadVInt();

                if (StorePayloads)
                {
                    Positions      = new int[Freq];
                    PayloadOffsets = new int[Freq];
                    int totalPayloadLength = 0;
                    int pos = 0;
                    for (int posUpto = 0; posUpto < Freq; posUpto++)
                    {
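                        // code layout: positionDelta << 1 | (payload length changed ? 1 : 0)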
                        int code = Tvf.ReadVInt();
                        pos += (int)((uint)code >> 1);
                        Positions[posUpto] = pos;
                        if ((code & 1) != 0)
                        {
                            // length change
                            LastPayloadLength = Tvf.ReadVInt();
                        }
                        PayloadOffsets[posUpto] = totalPayloadLength;
                        totalPayloadLength     += LastPayloadLength;
                        Debug.Assert(totalPayloadLength >= 0);
                    }
                    PayloadData = new byte[totalPayloadLength];
                    Tvf.ReadBytes(PayloadData, 0, PayloadData.Length);
                }
                else if (StorePositions) // positions but no payloads
                {
                    // TODO: we could maybe reuse last array, if we can
                    // somehow be careful about consumer never using two
                    // D&PEnums at once...
                    Positions = new int[Freq];
                    int pos = 0;
                    for (int posUpto = 0; posUpto < Freq; posUpto++)
                    {
                        pos += Tvf.ReadVInt();
                        Positions[posUpto] = pos;
                    }
                }

                if (StoreOffsets)
                {
                    StartOffsets = new int[Freq];
                    EndOffsets   = new int[Freq];
                    int offset = 0;
                    for (int posUpto = 0; posUpto < Freq; posUpto++)
                    {
                        StartOffsets[posUpto] = offset + Tvf.ReadVInt();
                        offset = EndOffsets[posUpto] = StartOffsets[posUpto] + Tvf.ReadVInt();
                    }
                }

                LastTerm.CopyBytes(Term_Renamed);
                NextTerm++;
                return(Term_Renamed);
            }