Пример #1
0
 /// <summary>
 /// Reads the 8-byte weight stored at the tail of <paramref name="scratch"/> and
 /// shortens <paramref name="scratch"/> so that it no longer covers those bytes.
 /// </summary>
 /// <param name="scratch">Encoded entry; its <c>Length</c> is reduced by 8.</param>
 /// <param name="tmpInput">Reusable reader; it is reset onto <c>scratch.Bytes</c>.</param>
 /// <returns>The value returned by <c>ReadInt64</c> at the tail position.</returns>
 protected internal virtual long Decode(BytesRef scratch, ByteArrayDataInput tmpInput)
 {
     // The weight occupies the last sizeof(long) bytes of the entry.
     int weightStart = scratch.Length - sizeof(long);

     tmpInput.Reset(scratch.Bytes);
     tmpInput.SkipBytes(weightStart);

     // Drop the weight from the visible part of the entry before reading it.
     scratch.Length = weightStart;

     long weight = tmpInput.ReadInt64();
     return weight;
 }
Пример #2
0
        /// <summary>
        /// Decodes the set of contexts stored at the tail of <paramref name="scratch"/>.
        /// </summary>
        /// <remarks>
        /// The last two bytes hold the number of contexts. Each context is then read from
        /// back to front: a two-byte length, preceded by that many context bytes.
        /// <c>scratch.Length</c> is reduced by every byte that is consumed.
        /// </remarks>
        /// <param name="scratch">Encoded entry; shortened as contexts are decoded.</param>
        /// <param name="tmpInput">Reusable reader; it is reset onto <c>scratch.Bytes</c>.</param>
        /// <returns>A set containing one <see cref="BytesRef"/> per decoded context.</returns>
        protected internal virtual ISet<BytesRef> DecodeContexts(BytesRef scratch, ByteArrayDataInput tmpInput)
        {
            tmpInput.Reset(scratch.Bytes);

            // The context count sits in the final two bytes.
            int countPos = scratch.Length - 2;
            tmpInput.SkipBytes(countPos);
            int contextCount = (ushort)tmpInput.ReadInt16();
            scratch.Length = countPos;

            var contexts = new JCG.HashSet<BytesRef>();

            for (int remaining = contextCount; remaining > 0; remaining--)
            {
                // Length of the context that ends right before this length field.
                int lengthPos = scratch.Length - 2;
                tmpInput.Position = lengthPos;
                int contextLength = (ushort)tmpInput.ReadInt16();
                scratch.Length = lengthPos;

                // Copy the context bytes into a fresh BytesRef.
                int contextStart = lengthPos - contextLength;
                tmpInput.Position = contextStart;
                var context = new BytesRef(contextLength);
                tmpInput.ReadBytes(context.Bytes, 0, contextLength);
                context.Length = contextLength;
                contexts.Add(context);

                scratch.Length = contextStart;
            }

            // LUCENENET NOTE: The result was at one point reversed because of test failures.
            // JCG.HashSet<T> is used now (its Equals() implementation respects set equality),
            // so the original, non-reversed implementation is back in place.
            return contexts;
        }
Пример #3
0
        /// <summary>
        /// Decodes the set of contexts stored at the tail of <paramref name="scratch"/> and
        /// returns them in the reverse of the order in which they were read.
        /// </summary>
        /// <remarks>
        /// The last two bytes hold the number of contexts. Each context is then read from
        /// back to front: a two-byte length, preceded by that many context bytes.
        /// <c>scratch.Length</c> is reduced by every byte that is consumed.
        /// </remarks>
        /// <param name="scratch">Encoded entry; shortened as contexts are decoded.</param>
        /// <param name="tmpInput">Reusable reader; it is reset onto <c>scratch.Bytes</c>.</param>
        /// <returns>A new set built from the decoded contexts enumerated in reverse.</returns>
        protected internal virtual ISet<BytesRef> DecodeContexts(BytesRef scratch, ByteArrayDataInput tmpInput)
        {
            tmpInput.Reset(scratch.Bytes);

            // The context count sits in the final two bytes.
            int countPos = scratch.Length - 2;
            tmpInput.SkipBytes(countPos);
            int contextCount = (ushort)tmpInput.ReadInt16();
            scratch.Length = countPos;

            var contextSet = new JCG.HashSet<BytesRef>();

            int decoded = 0;
            while (decoded < contextCount)
            {
                // Length of the context that ends right before this length field.
                int lengthPos = scratch.Length - 2;
                tmpInput.Position = lengthPos;
                int contextLength = (ushort)tmpInput.ReadInt16();
                scratch.Length = lengthPos;

                // Copy the context bytes into a fresh BytesRef.
                int contextStart = lengthPos - contextLength;
                tmpInput.Position = contextStart;
                var context = new BytesRef(contextLength);
                tmpInput.ReadBytes(context.Bytes, 0, contextLength);
                context.Length = contextLength;
                contextSet.Add(context);

                scratch.Length = contextStart;
                decoded++;
            }

            // LUCENENET TODO: The data is written forward, yet it comes back reversed when
            // read here (the reason is not known). The order is therefore flipped before
            // returning. Once the underlying problem is found and fixed, this can become
            // a plain "return contextSet;".
            return new JCG.HashSet<BytesRef>(contextSet.Reverse());
        }
Пример #4
0
 /// <summary>
 /// Advances <c>WordNum</c> by <paramref name="count"/>, consuming the pending
 /// all-ones run first and skipping whatever is left over in the underlying input.
 /// </summary>
 /// <param name="count">
 /// Number of words to skip; asserted to lie between 0 and
 /// <c>AllOnesLength + DirtyLength</c>.
 /// </param>
 internal virtual void SkipDirtyBytes(int count)
 {
     Debug.Assert(count >= 0);
     Debug.Assert(count <= AllOnesLength + DirtyLength);

     WordNum += count;

     // Fully covered by the all-ones run: nothing has to be read from the input.
     if (count <= AllOnesLength)
     {
         AllOnesLength -= count;
         return;
     }

     // Use up the all-ones run, then skip the remainder in the input.
     int dirtyToSkip = count - AllOnesLength;
     AllOnesLength = 0;
     @in.SkipBytes(dirtyToSkip);
     DirtyLength -= dirtyToSkip;
 }
Пример #5
0
            /// <summary>
            /// Orders two encoded entries: first by analyzed form, then by cost, and
            /// finally by surface form.
            /// </summary>
            /// <param name="a">First encoded entry.</param>
            /// <param name="b">Second encoded entry.</param>
            /// <returns>
            /// A negative value when <paramref name="a"/> sorts first, a positive value when
            /// <paramref name="b"/> sorts first, and zero when all three keys compare equal.
            /// </returns>
            public int Compare(BytesRef a, BytesRef b)
            {
                // Analyzed form: a 16-bit length prefix followed by that many bytes.
                readerA.Reset(a.Bytes, a.Offset, a.Length);
                int analyzedLengthA = (ushort)readerA.ReadInt16();
                scratchA.Bytes = a.Bytes;
                scratchA.Offset = readerA.Position;
                scratchA.Length = analyzedLengthA;

                readerB.Reset(b.Bytes, b.Offset, b.Length);
                int analyzedLengthB = (ushort)readerB.ReadInt16();
                scratchB.Bytes = b.Bytes;
                scratchB.Offset = readerB.Position;
                scratchB.Length = analyzedLengthB;

                int analyzedOrder = scratchA.CompareTo(scratchB);
                if (analyzedOrder != 0)
                {
                    return analyzedOrder;
                }

                // Step both readers over the analyzed bytes that were just compared.
                readerA.SkipBytes(scratchA.Length);
                readerB.SkipBytes(scratchB.Length);

                // Cost: a 32-bit value widened to long.
                long costA = readerA.ReadInt32();
                long costB = readerB.ReadInt32();

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(DecodeWeight(costA) >= 0);
                    Debugging.Assert(DecodeWeight(costB) >= 0);
                }

                if (costA != costB)
                {
                    return costA < costB ? -1 : 1;
                }

                // Surface form.
                if (hasPayloads)
                {
                    // With payloads the surface form carries its own 16-bit length prefix.
                    int surfaceLengthA = (ushort)readerA.ReadInt16();
                    int surfaceLengthB = (ushort)readerB.ReadInt16();

                    scratchA.Length = surfaceLengthA;
                    scratchB.Length = surfaceLengthB;
                    scratchA.Offset = readerA.Position;
                    scratchB.Offset = readerB.Position;
                }
                else
                {
                    // Without payloads the surface form runs up to the entry length.
                    int surfaceStartA = readerA.Position;
                    int surfaceStartB = readerB.Position;

                    scratchA.Offset = surfaceStartA;
                    scratchB.Offset = surfaceStartB;
                    scratchA.Length = a.Length - surfaceStartA;
                    scratchB.Length = b.Length - surfaceStartB;
                }

                return scratchA.CompareTo(scratchB);
            }
Пример #6
0
 /// <summary>
 /// Advances <c>wordNum</c> by <paramref name="count"/>. The pending all-ones run is
 /// consumed first; any remainder is skipped in the underlying input and deducted
 /// from <c>dirtyLength</c>.
 /// </summary>
 /// <param name="count">
 /// Number of words to skip; asserted (when asserts are enabled) to lie between 0 and
 /// <c>allOnesLength + dirtyLength</c>.
 /// </param>
 internal virtual void SkipDirtyBytes(int count)
 {
     if (Debugging.AssertsEnabled)
     {
         Debugging.Assert(count >= 0);
         Debugging.Assert(count <= allOnesLength + dirtyLength);
     }

     wordNum += count;

     // Fast path: the request fits inside the all-ones run.
     if (count <= allOnesLength)
     {
         allOnesLength -= count;
         return;
     }

     // Exhaust the all-ones run and skip what is left in the input.
     int remainder = count - allOnesLength;
     allOnesLength = 0;
     @in.SkipBytes(remainder);
     dirtyLength -= remainder;
 }
Пример #7
0
            /// <summary>
            /// Decodes the next position from <c>_postings</c>, updating the payload and
            /// offset bookkeeping along the way.
            /// </summary>
            /// <returns>The updated value of <c>_position</c>.</returns>
            public override int NextPosition()
            {
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(_posPending > 0);
                }

                _posPending--;

                int positionDelta;
                if (!_storePayloads)
                {
                    positionDelta = _postings.ReadVInt32();
                }
                else
                {
                    // Step over the previous payload if it was never retrieved.
                    if (!_payloadRetrieved)
                    {
                        _postings.SkipBytes(_payloadLength);
                    }

                    // Low bit set means a new payload length follows; the rest is the delta.
                    int positionCode = _postings.ReadVInt32();
                    bool payloadLengthFollows = (positionCode & 1) != 0;
                    if (payloadLengthFollows)
                    {
                        _payloadLength = _postings.ReadVInt32();
                    }

                    positionDelta = positionCode.TripleShift(1);
                    _payloadRetrieved = false;
                }

                _position += positionDelta;

                if (_storeOffsets)
                {
                    // Same encoding as above: low bit set means a new offset length follows.
                    int offsetCode = _postings.ReadVInt32();
                    bool offsetLengthFollows = (offsetCode & 1) != 0;
                    if (offsetLengthFollows)
                    {
                        _offsetLength = _postings.ReadVInt32();
                    }

                    _startOffset += offsetCode.TripleShift(1);
                }

                return _position;
            }
Пример #8
0
        /// <summary>
        /// Decodes the payload stored at the tail of <paramref name="scratch"/>.
        /// </summary>
        /// <remarks>
        /// The last two bytes hold the payload length and the payload bytes sit directly
        /// in front of them. <c>scratch.Length</c> is reduced by the payload plus its
        /// two-byte length field.
        /// </remarks>
        /// <param name="scratch">Encoded entry; shortened by the bytes that are consumed.</param>
        /// <param name="tmpInput">Reusable reader; it is reset onto <c>scratch.Bytes</c>.</param>
        /// <returns>A new <see cref="BytesRef"/> holding a copy of the payload bytes.</returns>
        protected internal virtual BytesRef DecodePayload(BytesRef scratch, ByteArrayDataInput tmpInput)
        {
            // Position of the two-byte payload length at the very end of the entry.
            int lengthPos = scratch.Length - 2;

            tmpInput.Reset(scratch.Bytes);
            tmpInput.SkipBytes(lengthPos);
            int payloadSize = (ushort)tmpInput.ReadInt16();

            // The payload ends where its length field starts.
            int payloadStart = lengthPos - payloadSize;
            tmpInput.Position = payloadStart;

            var payload = new BytesRef(payloadSize);
            tmpInput.ReadBytes(payload.Bytes, 0, payloadSize);
            payload.Length = payloadSize;

            // Drop both the length field and the payload from the entry.
            scratch.Length = payloadStart;
            return payload;
        }
Пример #9
0
            /// <summary>
            /// Orders two encoded entries: first by token, then by total entry length
            /// (shorter first), and finally by surface form.
            /// </summary>
            /// <param name="a">First encoded entry.</param>
            /// <param name="b">Second encoded entry.</param>
            /// <returns>
            /// A negative value when <paramref name="a"/> sorts first, a positive value when
            /// <paramref name="b"/> sorts first, and zero when all keys compare equal.
            /// </returns>
            public virtual int Compare(BytesRef a, BytesRef b)
            {
                readerA.Reset(a.Bytes, a.Offset, a.Length);
                readerB.Reset(b.Bytes, b.Offset, b.Length);

                // Token: a 16-bit length prefix followed by that many bytes.
                int tokenLengthA = (ushort)readerA.ReadInt16();
                scratchA.Bytes = a.Bytes;
                scratchA.Offset = readerA.Position;
                scratchA.Length = tokenLengthA;

                int tokenLengthB = (ushort)readerB.ReadInt16();
                scratchB.Bytes = b.Bytes;
                scratchB.Offset = readerB.Position;
                scratchB.Length = tokenLengthB;

                int tokenOrder = scratchA.CompareTo(scratchB);
                if (tokenOrder != 0)
                {
                    return tokenOrder;
                }

                // Step both readers over the token bytes that were just compared.
                readerA.SkipBytes(scratchA.Length);
                readerB.SkipBytes(scratchB.Length);

                // Entry length: smaller surface forms are sorted first.
                int lengthOrder = a.Length - b.Length;
                if (lengthOrder != 0)
                {
                    return lengthOrder;
                }

                // Surface form: everything from the reader position up to the entry length.
                int surfaceStartA = readerA.Position;
                int surfaceStartB = readerB.Position;

                scratchA.Offset = surfaceStartA;
                scratchA.Length = a.Length - surfaceStartA;
                scratchB.Offset = surfaceStartB;
                scratchB.Length = b.Length - surfaceStartB;

                return scratchA.CompareTo(scratchB);
            }
Пример #10
0
        /// <summary>
        /// Decodes the set of contexts stored at the tail of <paramref name="scratch"/>.
        /// </summary>
        /// <remarks>
        /// The last two bytes hold the number of contexts. Each context is then read from
        /// back to front: a two-byte length, preceded by that many context bytes.
        /// <c>scratch.Length</c> is reduced by every byte that is consumed.
        /// <para/>
        /// Both the count and the lengths are interpreted as unsigned 16-bit values, so
        /// values in the range 32768..65535 are decoded correctly instead of turning
        /// negative.
        /// </remarks>
        /// <param name="scratch">Encoded entry; shortened as contexts are decoded.</param>
        /// <param name="tmpInput">Reusable reader; it is reset onto <c>scratch.Bytes</c>.</param>
        /// <returns>A set containing one <see cref="BytesRef"/> per decoded context.</returns>
        protected internal virtual HashSet <BytesRef> DecodeContexts(BytesRef scratch, ByteArrayDataInput tmpInput)
        {
            tmpInput.Reset(scratch.Bytes);
            tmpInput.SkipBytes(scratch.Length - 2); // skip to context set size

            // Read unsigned: a signed short would make counts above 32767 negative
            // and silently skip the loop below.
            ushort ctxSetSize = (ushort)tmpInput.ReadShort();

            scratch.Length -= 2;

            var contextSet = new HashSet <BytesRef>();

            // int counter: a 16-bit counter is not needed and avoids any wrap-around concerns.
            for (int i = 0; i < ctxSetSize; i++)
            {
                tmpInput.Position = scratch.Length - 2;

                // Read unsigned: a negative length would yield an invalid buffer
                // capacity and a read position past the length field.
                ushort curContextLength = (ushort)tmpInput.ReadShort();
                scratch.Length   -= 2;
                tmpInput.Position = scratch.Length - curContextLength;
                BytesRef contextSpare = new BytesRef(curContextLength);
                tmpInput.ReadBytes(contextSpare.Bytes, 0, curContextLength);
                contextSpare.Length = curContextLength;
                contextSet.Add(contextSpare);
                scratch.Length -= curContextLength;
            }
            return(contextSet);
        }
Пример #11
0
                /// <summary>
                /// Positions this enum on <paramref name="target"/> or, when it is absent, on the
                /// first term found that sorts after it, scanning the term suffixes block by block.
                /// </summary>
                /// <remarks>
                /// TODO: we may want an alternate mode here which is
                /// "if you are about to return NOT_FOUND I won't use
                /// the terms data from that"; eg FuzzyTermsEnum will
                /// (usually) just immediately call seek again if we
                /// return NOT_FOUND so it's a waste for us to fill in
                /// the term that was actually NOT_FOUND
                /// </remarks>
                /// <param name="target">The term to seek to.</param>
                /// <returns>
                /// <c>SeekStatus.FOUND</c> when a term equal to <paramref name="target"/> is reached,
                /// <c>SeekStatus.NOT_FOUND</c> when scanning stops on a term that sorts after it
                /// (that term is copied into <c>term</c>), and <c>SeekStatus.END</c> when
                /// <c>NextBlock()</c> reports that no further block exists.
                /// </returns>
                /// <exception cref="InvalidOperationException">Thrown when <c>indexEnum</c> is <c>null</c>.</exception>
                public override SeekStatus SeekCeil(BytesRef target)
                {
                    if (indexEnum == null)
                    {
                        throw new InvalidOperationException("terms index was not loaded");
                    }

                    //System.out.println("BTR.seek seg=" + segment + " target=" + fieldInfo.name + ":" + target.utf8ToString() + " " + target + " current=" + term().utf8ToString() + " " + term() + " indexIsCurrent=" + indexIsCurrent + " didIndexNext=" + didIndexNext + " seekPending=" + seekPending + " divisor=" + indexReader.getDivisor() + " this="  + this);
                    // Tracing scaffold only: both branches below contain nothing but
                    // commented-out debug output.
                    if (didIndexNext)
                    {
                        if (nextIndexTerm == null)
                        {
                            //System.out.println("  nextIndexTerm=null");
                        }
                        else
                        {
                            //System.out.println("  nextIndexTerm=" + nextIndexTerm.utf8ToString());
                        }
                    }

                    bool doSeek = true;

                    // See if we can avoid seeking, because target term
                    // is after current term but before next index term:
                    if (indexIsCurrent)
                    {
                        int cmp = BytesRef.UTF8SortedAsUnicodeComparer.Compare(term, target);

                        if (cmp == 0)
                        {
                            // Already at the requested term
                            return(SeekStatus.FOUND);
                        }
                        else if (cmp < 0)
                        {
                            // Target term is after current term
                            if (!didIndexNext)
                            {
                                // Advance the index enum once and remember the result so
                                // that later calls can reuse it (guarded by didIndexNext).
                                if (indexEnum.Next() == -1)
                                {
                                    nextIndexTerm = null;
                                }
                                else
                                {
                                    nextIndexTerm = indexEnum.Term;
                                }
                                //System.out.println("  now do index next() nextIndexTerm=" + (nextIndexTerm == null ? "null" : nextIndexTerm.utf8ToString()));
                                didIndexNext = true;
                            }

                            if (nextIndexTerm == null || BytesRef.UTF8SortedAsUnicodeComparer.Compare(target, nextIndexTerm) < 0)
                            {
                                // Optimization: requested term is within the
                                // same term block we are now in; skip seeking
                                // (but do scanning):
                                doSeek = false;
                                //System.out.println("  skip seek: nextIndexTerm=" + (nextIndexTerm == null ? "null" : nextIndexTerm.utf8ToString()));
                            }
                        }
                    }

                    if (doSeek)
                    {
                        //System.out.println("  seek");

                        // Ask terms index to find biggest indexed term (=
                        // first term in a block) that's <= our text:
                        input.Seek(indexEnum.Seek(target));
                        bool result = NextBlock();

                        // Block must exist since, at least, the indexed term
                        // is in the block:
                        Debug.Assert(result);

                        indexIsCurrent  = true;
                        didIndexNext    = false;
                        blocksSinceSeek = 0;

                        if (doOrd)
                        {
                            // One less than the index term's ord: the scan loop below
                            // increments state.Ord before it examines each term.
                            state.Ord = indexEnum.Ord - 1;
                        }

                        // Seed term with the index term; its leading bytes are compared
                        // against the target as the block's common prefix below.
                        term.CopyBytes(indexEnum.Term);
                        //System.out.println("  seek: term=" + term.utf8ToString());
                    }
                    else
                    {
                        //System.out.println("  skip seek");
                        // All terms of the current block were consumed already: load the
                        // next block, or report END when there is none.
                        if (state.TermBlockOrd == blockTermCount && !NextBlock())
                        {
                            indexIsCurrent = false;
                            return(SeekStatus.END);
                        }
                    }

                    seekPending = false;

                    // Number of leading prefix bytes of the current block already
                    // verified to match the target.
                    int common = 0;

                    // Scan within block.  We could do this by calling
                    // _next() and testing the resulting term, but this
                    // is wasteful.  Instead, we first confirm the
                    // target matches the common prefix of this block,
                    // and then we scan the term bytes directly from the
                    // termSuffixesreader's byte[], saving a copy into
                    // the BytesRef term per term.  Only when we return
                    // do we then copy the bytes into the term.

                    while (true)
                    {
                        // First, see if target term matches common prefix
                        // in this block:
                        if (common < termBlockPrefix)
                        {
                            int cmp = (term.Bytes[common] & 0xFF) - (target.Bytes[target.Offset + common] & 0xFF);
                            if (cmp < 0)
                            {
                                // TODO: maybe we should store common prefix
                                // in block header?  (instead of relying on
                                // last term of previous block)

                                // Target's prefix is after the common block
                                // prefix, so term cannot be in this block
                                // but it could be in next block.  We
                                // must scan to end-of-block to set common
                                // prefix for next block:
                                if (state.TermBlockOrd < blockTermCount)
                                {
                                    // Skip every remaining term except the last one ...
                                    while (state.TermBlockOrd < blockTermCount - 1)
                                    {
                                        state.TermBlockOrd++;
                                        state.Ord++;
                                        termSuffixesReader.SkipBytes(termSuffixesReader.ReadVInt32());
                                    }
                                    // ... and materialize the last term of the block into term.
                                    int suffix = termSuffixesReader.ReadVInt32();
                                    term.Length = termBlockPrefix + suffix;
                                    if (term.Bytes.Length < term.Length)
                                    {
                                        term.Grow(term.Length);
                                    }
                                    termSuffixesReader.ReadBytes(term.Bytes, termBlockPrefix, suffix);
                                }
                                state.Ord++;

                                if (!NextBlock())
                                {
                                    indexIsCurrent = false;
                                    return(SeekStatus.END);
                                }
                                // New block: its prefix has to be verified from the start.
                                common = 0;
                            }
                            else if (cmp > 0)
                            {
                                // Target's prefix is before the common prefix
                                // of this block, so we position to start of
                                // block and return NOT_FOUND:
                                Debug.Assert(state.TermBlockOrd == 0);

                                int suffix = termSuffixesReader.ReadVInt32();
                                term.Length = termBlockPrefix + suffix;
                                if (term.Bytes.Length < term.Length)
                                {
                                    term.Grow(term.Length);
                                }
                                termSuffixesReader.ReadBytes(term.Bytes, termBlockPrefix, suffix);
                                return(SeekStatus.NOT_FOUND);
                            }
                            else
                            {
                                // This prefix byte matches; check the next one.
                                common++;
                            }

                            continue;
                        }

                        // Test every term in this block
                        while (true)
                        {
                            state.TermBlockOrd++;
                            state.Ord++;

                            int suffix = termSuffixesReader.ReadVInt32();

                            // We know the prefix matches, so just compare the new suffix:
                            int termLen = termBlockPrefix + suffix;
                            int bytePos = termSuffixesReader.Position;

                            // Compare only up to the shorter of the two terms; bytePos walks
                            // the raw suffix bytes without advancing termSuffixesReader.
                            bool next      = false;
                            int  limit     = target.Offset + (termLen < target.Length ? termLen : target.Length);
                            int  targetPos = target.Offset + termBlockPrefix;
                            while (targetPos < limit)
                            {
                                int cmp = (termSuffixes[bytePos++] & 0xFF) - (target.Bytes[targetPos++] & 0xFF);
                                if (cmp < 0)
                                {
                                    // Current term is still before the target;
                                    // keep scanning
                                    next = true;
                                    break;
                                }
                                else if (cmp > 0)
                                {
                                    // Done!  Current term is after target. Stop
                                    // here, fill in real term, return NOT_FOUND.
                                    term.Length = termBlockPrefix + suffix;
                                    if (term.Bytes.Length < term.Length)
                                    {
                                        term.Grow(term.Length);
                                    }
                                    termSuffixesReader.ReadBytes(term.Bytes, termBlockPrefix, suffix);
                                    //System.out.println("  NOT_FOUND");
                                    return(SeekStatus.NOT_FOUND);
                                }
                            }

                            // All compared bytes were equal and the target is not longer than
                            // this term: either an exact match, or the target is a proper
                            // prefix of this term (so this term sorts after the target).
                            if (!next && target.Length <= termLen)
                            {
                                term.Length = termBlockPrefix + suffix;
                                if (term.Bytes.Length < term.Length)
                                {
                                    term.Grow(term.Length);
                                }
                                termSuffixesReader.ReadBytes(term.Bytes, termBlockPrefix, suffix);

                                if (target.Length == termLen)
                                {
                                    // Done!  Exact match.  Stop here, fill in
                                    // real term, return FOUND.
                                    //System.out.println("  FOUND");
                                    return(SeekStatus.FOUND);
                                }
                                else
                                {
                                    //System.out.println("  NOT_FOUND");
                                    return(SeekStatus.NOT_FOUND);
                                }
                            }

                            if (state.TermBlockOrd == blockTermCount)
                            {
                                // Must pre-fill term for next block's common prefix
                                term.Length = termBlockPrefix + suffix;
                                if (term.Bytes.Length < term.Length)
                                {
                                    term.Grow(term.Length);
                                }
                                termSuffixesReader.ReadBytes(term.Bytes, termBlockPrefix, suffix);
                                break;
                            }
                            else
                            {
                                // Not the last term of the block: step over its suffix bytes.
                                termSuffixesReader.SkipBytes(suffix);
                            }
                        }

                        // The purpose of the terms dict index is to seek
                        // the enum to the closest index term before the
                        // term we are looking for.  So, we should never
                        // cross another index term (besides the first
                        // one) while we are scanning:

                        Debug.Assert(indexIsCurrent);

                        if (!NextBlock())
                        {
                            //System.out.println("  END");
                            indexIsCurrent = false;
                            return(SeekStatus.END);
                        }
                        // New block: its prefix has to be verified from the start.
                        common = 0;
                    }
                }
Пример #12
0
            /// <summary>
            /// Advances to the next document that is accepted by <c>liveDocs</c>, stepping over
            /// the position data of every document that is rejected.
            /// </summary>
            /// <returns>
            /// The id of the document now positioned on, or <c>NO_MORE_DOCS</c> once
            /// <c>docUpto</c> has reached <c>numDocs</c>.
            /// </returns>
            public override int NextDoc()
            {
                // Consume any positions of the current document that are still pending.
                while (posPending > 0)
                {
                    NextPosition();
                }

                while (docUpto != numDocs)
                {
                    docUpto++;

                    // Low bit set means freq == 1; the remaining bits are the doc id delta.
                    int docCode = @in.ReadVInt32();
                    int docDelta = (int)((uint)docCode >> 1);
                    accum += docDelta;

                    if ((docCode & 1) == 0)
                    {
                        freq = @in.ReadVInt32();
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(freq > 0);
                        }
                    }
                    else
                    {
                        freq = 1;
                    }

                    bool accepted = liveDocs == null || liveDocs.Get(accum);
                    if (accepted)
                    {
                        pos         = 0;
                        startOffset = storeOffsets ? 0 : -1;
                        posPending  = freq;
                        return docID = accum;
                    }

                    // Rejected document: step over all of its positions.
                    for (int remaining = freq; remaining > 0; remaining--)
                    {
                        // The position code is always read; its low bit only matters
                        // when payloads are stored.
                        int positionCode = @in.ReadVInt32();
                        if (storePayloads && (positionCode & 1) != 0)
                        {
                            payloadLength = @in.ReadVInt32();
                        }

                        if (storeOffsets)
                        {
                            int offsetCode = @in.ReadVInt32();
                            if ((offsetCode & 1) != 0)
                            {
                                // new offset length
                                offsetLength = @in.ReadVInt32();
                            }
                        }

                        if (storePayloads)
                        {
                            @in.SkipBytes(payloadLength);
                        }
                    }
                }

                return docID = NO_MORE_DOCS;
            }
Пример #13
0
            /// <summary>
            /// Advances to the next document that is accepted by <c>liveDocs</c>. Position,
            /// offset and payload data of each document is read past, as dictated by
            /// <c>indexOptions</c> and <c>storePayloads</c>.
            /// </summary>
            /// <returns>
            /// The id of the document now positioned on, or <c>NO_MORE_DOCS</c> once
            /// <c>docUpto</c> has reached <c>numDocs</c>.
            /// </returns>
            public override int NextDoc()
            {
                while (docUpto != numDocs)
                {
                    docUpto++;

                    if (indexOptions == IndexOptions.DOCS_ONLY)
                    {
                        // Only the doc id delta is stored.
                        accum += @in.ReadVInt32();
                    }
                    else
                    {
                        // Low bit set means freq == 1; the remaining bits are the doc id delta.
                        int docCode = @in.ReadVInt32();
                        accum += (int)((uint)docCode >> 1);

                        if ((docCode & 1) == 0)
                        {
                            freq = @in.ReadVInt32();
                            if (Debugging.AssertsEnabled)
                            {
                                Debugging.Assert(freq > 0);
                            }
                        }
                        else
                        {
                            freq = 1;
                        }

                        bool hasOffsets = indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
                        bool hasPositions = hasOffsets || indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;

                        if (hasPositions)
                        {
                            // Skip positions, plus offsets and payloads where present.
                            for (int remaining = freq; remaining > 0; remaining--)
                            {
                                int positionCode = @in.ReadVInt32();
                                if (storePayloads && (positionCode & 1) != 0)
                                {
                                    payloadLen = @in.ReadVInt32();
                                }

                                if (hasOffsets && (@in.ReadVInt32() & 1) != 0)
                                {
                                    // new offset length
                                    @in.ReadVInt32();
                                }

                                if (storePayloads)
                                {
                                    @in.SkipBytes(payloadLen);
                                }
                            }
                        }
                    }

                    bool accepted = liveDocs == null || liveDocs.Get(accum);
                    if (accepted)
                    {
                        return docID = accum;
                    }
                }

                return docID = NO_MORE_DOCS;
            }
Пример #14
0
        /// <summary>
        /// Looks up the affix-stripped word in the dictionary under the given affix rule and collects
        /// every dictionary form that accepts the rule. When the rule allows cross products, the word
        /// is handed back to <c>Stem</c> so that further affixes can be stripped.
        /// </summary>
        /// <param name="strippedWord"> Word from which the affix has been removed and the strip added back </param>
        /// <param name="length"> Number of valid characters in <paramref name="strippedWord"/> </param>
        /// <param name="affix"> Index of the affix rule; its record is read from the affix data at offset <c>8 * affix</c> </param>
        /// <param name="prefixFlag"> Flag of a prefix that was stripped earlier; a negative value disables the check.
        ///                   A suffix cannot simply be accepted after a prefix: the dictionary form (or the suffix
        ///                   continuation class) must be compatible with that prefix as well. </param>
        /// <param name="recursionDepth"> Current recursion depth </param>
        /// <param name="prefix"> <c>true</c> when the affix being removed is a prefix, <c>false</c> when it is a suffix </param>
        /// <param name="circumfix"> <c>true</c> when a previously removed prefix carried the dictionary's circumfix flag </param>
        /// <returns> The stems found for the word; an empty list when there are none </returns>
        internal IList <CharsRef> ApplyAffix(char[] strippedWord, int length, int affix, int prefixFlag, int recursionDepth, bool prefix, bool circumfix)
        {
            // TODO: just pass this in from before, no need to decode it twice
            // Affix record layout as read here: flag (2 bytes), strip (2 bytes, skipped),
            // condition (2 bytes, low bit = cross product), append (2 bytes).
            affixReader.Position = 8 * affix;
            char affixFlag = (char)(affixReader.ReadShort() & 0xffff);
            affixReader.SkipBytes(2); // strip
            int conditionWord = (char)(affixReader.ReadShort() & 0xffff);
            bool allowsCrossProduct = (conditionWord & 1) == 1;
            char append = (char)(affixReader.ReadShort() & 0xffff);

            var stems = new List<CharsRef>();

            IntsRef forms = dictionary.LookupWord(strippedWord, 0, length);
            for (int formIndex = 0; forms != null && formIndex < forms.Length; formIndex++)
            {
                dictionary.flagLookup.Get(forms.Ints[forms.Offset + formIndex], scratch);
                char[] wordFlags = Dictionary.DecodeFlags(scratch);
                if (!Dictionary.HasFlag(wordFlags, affixFlag))
                {
                    // this dictionary form does not accept the affix
                    continue;
                }

                // confusing: in this one exception, we already chained the first prefix against the second,
                // so it does not need to be checked against the word
                bool isChainedPrefix = dictionary.complexPrefixes && recursionDepth == 1 && prefix;
                bool needsCrossCheck = !isChainedPrefix && prefixFlag >= 0 && !Dictionary.HasFlag(wordFlags, (char)prefixFlag);
                if (needsCrossCheck)
                {
                    // see if we can chain the prefix through the suffix continuation class (only if it has any!)
                    dictionary.flagLookup.Get(append, scratch);
                    char[] continuationFlags = Dictionary.DecodeFlags(scratch);
                    if (!HasCrossCheckedFlag((char)prefixFlag, continuationFlags, false))
                    {
                        continue;
                    }
                }

                // if circumfix was previously set by a prefix, this suffix must carry it too, and vice versa
                if (dictionary.circumfix != -1)
                {
                    dictionary.flagLookup.Get(append, scratch);
                    char[] circumfixCandidates = Dictionary.DecodeFlags(scratch);
                    bool suffixHasCircumfix = Dictionary.HasFlag(circumfixCandidates, (char)dictionary.circumfix);
                    if (suffixHasCircumfix != circumfix)
                    {
                        continue;
                    }
                }

                stems.Add(NewStem(strippedWord, length));
            }

            // if a circumfix flag is defined in the dictionary, and we are a prefix, we need to check if we have that flag
            if (prefix && !circumfix && dictionary.circumfix != -1)
            {
                dictionary.flagLookup.Get(append, scratch);
                char[] prefixContinuationFlags = Dictionary.DecodeFlags(scratch);
                circumfix = Dictionary.HasFlag(prefixContinuationFlags, (char)dictionary.circumfix);
            }

            if (!allowsCrossProduct)
            {
                return stems;
            }

            int nextDepth = recursionDepth + 1;
            bool mayStackSuffixes = dictionary.complexPrefixes == false && dictionary.twoStageAffix;

            switch (recursionDepth)
            {
                case 0:
                    if (prefix)
                    {
                        // we took away the first prefix.
                        // COMPLEXPREFIXES = true:  combine with a second prefix and another suffix
                        // COMPLEXPREFIXES = false: combine with a suffix
                        stems.AddRange(Stem(strippedWord, length, affix, affixFlag, affixFlag, nextDepth, dictionary.complexPrefixes && dictionary.twoStageAffix, true, true, circumfix));
                    }
                    else if (mayStackSuffixes)
                    {
                        // we took away a suffix.
                        // COMPLEXPREFIXES = true: we don't recurse! only one suffix allowed
                        // COMPLEXPREFIXES = false: combine with another suffix
                        stems.AddRange(Stem(strippedWord, length, affix, affixFlag, prefixFlag, nextDepth, false, true, false, circumfix));
                    }
                    break;

                case 1:
                    if (prefix)
                    {
                        if (dictionary.complexPrefixes)
                        {
                            // we took away the second prefix: go look for another suffix
                            stems.AddRange(Stem(strippedWord, length, affix, affixFlag, affixFlag, nextDepth, false, true, true, circumfix));
                        }
                    }
                    else if (mayStackSuffixes)
                    {
                        // we took away a prefix, then a suffix: go look for another suffix
                        stems.AddRange(Stem(strippedWord, length, affix, affixFlag, prefixFlag, nextDepth, false, true, false, circumfix));
                    }
                    break;
            }

            return stems;
        }
Пример #15
0
            /// <summary>
            /// Decodes postings from the buffered input until it reaches a document that is live
            /// (or until the input is exhausted), skipping over any position, offset and payload
            /// data stored for each document along the way.
            /// </summary>
            /// <returns> The id of the next live document, or <c>NO_MORE_DOCS</c> when the input is exhausted </returns>
            public override int NextDoc()
            {
                for (;;)
                {
                    if (_postings.Eof)
                    {
                        _docId = NO_MORE_DOCS;
                        return _docId;
                    }

                    var docCode = _postings.ReadVInt32();
                    if (_indexOptions == IndexOptions.DOCS_ONLY)
                    {
                        // the whole value is the doc delta
                        _accum += docCode;
                    }
                    else
                    {
                        // low bit flags "freq == 1"; the remaining bits are the doc delta
                        _accum += docCode.TripleShift(1);
                        if ((docCode & 1) != 0)
                        {
                            _freq = 1;
                        }
                        else
                        {
                            _freq = _postings.ReadVInt32();
                        }

                        // LUCENENET specific - to avoid boxing, changed from CompareTo() to IndexOptionsComparer.Compare()
                        var hasPositions = IndexOptionsComparer.Default.Compare(_indexOptions, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
                        if (hasPositions)
                        {
                            // Skip positions (and payloads/offsets when they are stored)
                            for (var remaining = _freq; remaining > 0; remaining--)
                            {
                                // TODO: skipVInt
                                var posCode = _postings.ReadVInt32();
                                if (_storePayloads && (posCode & 1) != 0)
                                {
                                    // new payload length
                                    _payloadLength = _postings.ReadVInt32();
                                }

                                if (_storeOffsets && (_postings.ReadVInt32() & 1) != 0)
                                {
                                    // new offset length
                                    _postings.ReadVInt32();
                                }

                                if (_storePayloads && _payloadLength != 0)
                                {
                                    _postings.SkipBytes(_payloadLength);
                                }
                            }
                        }
                    }

                    if (_liveDocs is null || _liveDocs.Get(_accum))
                    {
                        _docId = _accum;
                        return _docId;
                    }
                }
            }
Пример #16
0
                /// <summary>
                /// Seeks to <paramref name="target"/>. When the target is not present, the enumerator is
                /// left on the first term encountered that sorts after it. The terms index is consulted
                /// to pick the starting block unless the target falls between the current term and the
                /// next index term, in which case only a forward scan is performed.
                /// </summary>
                /// <remarks>
                /// TODO: we may want an alternate mode here which is
                /// "if you are about to return NOT_FOUND I won't use
                /// the terms data from that"; eg FuzzyTermsEnum will
                /// (usually) just immediately call seek again if we
                /// return NOT_FOUND so it's a waste for us to fill in
                /// the term that was actually NOT_FOUND
                /// </remarks>
                /// <param name="target"> The term to seek to; its bytes are read starting at <c>target.Offset</c> </param>
                /// <returns>
                /// <c>SeekStatus.FOUND</c> when positioned exactly on the target,
                /// <c>SeekStatus.NOT_FOUND</c> when positioned on a term after the target, or
                /// <c>SeekStatus.END</c> when no further block could be loaded.
                /// </returns>
                /// <exception cref="InvalidOperationException"> The terms index enumerator is <c>null</c> </exception>
                public override SeekStatus SeekCeil(BytesRef target)
                {
                    if (_indexEnum == null)
                    {
                        throw new InvalidOperationException("terms index was not loaded");
                    }

                    var doSeek = true;

                    // See if we can avoid seeking, because target term
                    // is after current term but before next index term:
                    if (_indexIsCurrent)
                    {
                        var cmp = BytesRef.UTF8SortedAsUnicodeComparer.Compare(_term, target);

                        if (cmp == 0)
                        {
                            return(SeekStatus.FOUND);     // Already at the requested term
                        }
                        if (cmp < 0)
                        {
                            // Target term is after current term
                            if (!_didIndexNext)
                            {
                                // Advance the index enum once and remember the result so that
                                // later calls can reuse it (null means there is no next index term)
                                _nextIndexTerm = _indexEnum.Next == -1 ? null : _indexEnum.Term;
                                _didIndexNext  = true;
                            }

                            if (_nextIndexTerm == null ||
                                BytesRef.UTF8SortedAsUnicodeComparer.Compare(target, _nextIndexTerm) < 0)
                            {
                                // Optimization: requested term is within the
                                // same term block we are now in; skip seeking
                                // (but do scanning):
                                doSeek = false;
                            }
                        }
                    }

                    if (doSeek)
                    {
                        //System.out.println("  seek");

                        // Ask terms index to find biggest indexed term (=
                        // first term in a block) that's <= our text:
                        _input.Seek(_indexEnum.Seek(target));
                        var result = NextBlock();

                        // Block must exist since, at least, the indexed term
                        // is in the block:
                        Debug.Assert(result);

                        _indexIsCurrent  = true;
                        _didIndexNext    = false;
                        _blocksSinceSeek = 0;

                        if (_doOrd)
                        {
                            // One less than the index term's ordinal: the scan loop below
                            // increments Ord before it examines each term
                            _state.Ord = _indexEnum.Ord - 1;
                        }

                        // Seed the current term with the index term; the common-prefix
                        // comparison below reads its leading bytes from _term
                        _term.CopyBytes(_indexEnum.Term);
                    }
                    else
                    {
                        // Not seeking: if every term of the current block has been consumed,
                        // move on to the next block, or report END when there is none
                        if (_state.TermBlockOrd == _blockTermCount && !NextBlock())
                        {
                            _indexIsCurrent = false;
                            return(SeekStatus.END);
                        }
                    }

                    _seekPending = false;

                    // Number of leading bytes of the block's common prefix already
                    // verified to be equal between _term and target
                    var common = 0;

                    // Scan within block.  We could do this by calling
                    // _next() and testing the resulting term, but this
                    // is wasteful.  Instead, we first confirm the
                    // target matches the common prefix of this block,
                    // and then we scan the term bytes directly from the
                    // _termSuffixesReader's byte[], saving a copy into
                    // the BytesRef term per term.  Only when we return
                    // do we then copy the bytes into the term.

                    while (true)
                    {
                        // First, see if target term matches common prefix
                        // in this block:
                        if (common < _termBlockPrefix)
                        {
                            // Unsigned byte comparison at position 'common'
                            var cmp = (_term.Bytes[common] & 0xFF) - (target.Bytes[target.Offset + common] & 0xFF);
                            if (cmp < 0)
                            {
                                // TODO: maybe we should store common prefix
                                // in block header?  (instead of relying on
                                // last term of previous block)

                                // Target's prefix is after the common block
                                // prefix, so term cannot be in this block
                                // but it could be in next block.  We
                                // must scan to end-of-block to set common
                                // prefix for next block:
                                if (_state.TermBlockOrd < _blockTermCount)
                                {
                                    // Skip every remaining term except the last one...
                                    while (_state.TermBlockOrd < _blockTermCount - 1)
                                    {
                                        _state.TermBlockOrd++;
                                        _state.Ord++;
                                        _termSuffixesReader.SkipBytes(_termSuffixesReader.ReadVInt());
                                    }
                                    // ...and materialize the last term of the block into _term
                                    var suffix = _termSuffixesReader.ReadVInt();
                                    _term.Length = _termBlockPrefix + suffix;
                                    if (_term.Bytes.Length < _term.Length)
                                    {
                                        _term.Grow(_term.Length);
                                    }
                                    _termSuffixesReader.ReadBytes(_term.Bytes, _termBlockPrefix, suffix);
                                }
                                _state.Ord++;

                                if (!NextBlock())
                                {
                                    _indexIsCurrent = false;
                                    return(SeekStatus.END);
                                }
                                // New block: restart the prefix comparison from the first byte
                                common = 0;
                            }
                            else if (cmp > 0)
                            {
                                // Target's prefix is before the common prefix
                                // of this block, so we position to start of
                                // block and return NOT_FOUND:
                                Debug.Assert(_state.TermBlockOrd == 0);

                                var suffix = _termSuffixesReader.ReadVInt();
                                _term.Length = _termBlockPrefix + suffix;
                                if (_term.Bytes.Length < _term.Length)
                                {
                                    _term.Grow(_term.Length);
                                }
                                _termSuffixesReader.ReadBytes(_term.Bytes, _termBlockPrefix, suffix);
                                return(SeekStatus.NOT_FOUND);
                            }
                            else
                            {
                                // Bytes are equal at this position; check the next prefix byte
                                common++;
                            }

                            continue;
                        }

                        // Test every term in this block
                        while (true)
                        {
                            _state.TermBlockOrd++;
                            _state.Ord++;

                            var suffix = _termSuffixesReader.ReadVInt();

                            // We know the prefix matches, so just compare the new suffix:

                            var termLen = _termBlockPrefix + suffix;
                            var bytePos = _termSuffixesReader.Position;

                            // Set when the current term compares before the target
                            var next = false;

                            // Compare only up to the shorter of the two lengths
                            var limit     = target.Offset + (termLen < target.Length ? termLen : target.Length);
                            var targetPos = target.Offset + _termBlockPrefix;
                            while (targetPos < limit)
                            {
                                var cmp = (_termSuffixes[bytePos++] & 0xFF) - (target.Bytes[targetPos++] & 0xFF);
                                if (cmp < 0)
                                {
                                    // Current term is still before the target;
                                    // keep scanning
                                    next = true;
                                    break;
                                }

                                // cmp < 0 was handled above, so this is the cmp == 0 case:
                                // bytes match so far, move on to the next byte
                                if (cmp <= 0)
                                {
                                    continue;
                                }

                                // Done!  Current term is after target. Stop
                                // here, fill in real term, return NOT_FOUND.
                                _term.Length = _termBlockPrefix + suffix;
                                if (_term.Bytes.Length < _term.Length)
                                {
                                    _term.Grow(_term.Length);
                                }
                                _termSuffixesReader.ReadBytes(_term.Bytes, _termBlockPrefix, suffix);
                                return(SeekStatus.NOT_FOUND);
                            }

                            // All compared bytes were equal and the target is no longer than this
                            // term: equal lengths mean an exact match, otherwise this term extends
                            // the target and is therefore the first term after it
                            if (!next && target.Length <= termLen)
                            {
                                _term.Length = _termBlockPrefix + suffix;
                                if (_term.Bytes.Length < _term.Length)
                                {
                                    _term.Grow(_term.Length);
                                }
                                _termSuffixesReader.ReadBytes(_term.Bytes, _termBlockPrefix, suffix);

                                return(target.Length == termLen ? SeekStatus.FOUND : SeekStatus.NOT_FOUND);
                            }

                            if (_state.TermBlockOrd == _blockTermCount)
                            {
                                // Must pre-fill term for next block's common prefix
                                _term.Length = _termBlockPrefix + suffix;
                                if (_term.Bytes.Length < _term.Length)
                                {
                                    _term.Grow(_term.Length);
                                }
                                _termSuffixesReader.ReadBytes(_term.Bytes, _termBlockPrefix, suffix);
                                break;
                            }

                            // Current term is before the target: step over its suffix bytes
                            _termSuffixesReader.SkipBytes(suffix);
                        }

                        // The purpose of the terms dict index is to seek
                        // the enum to the closest index term before the
                        // term we are looking for.  So, we should never
                        // cross another index term (besides the first
                        // one) while we are scanning:

                        Debug.Assert(_indexIsCurrent);

                        if (!NextBlock())
                        {
                            _indexIsCurrent = false;
                            return(SeekStatus.END);
                        }
                        common = 0;
                    }
                }
Пример #17
0
            /// <summary>
            /// Reads postings from the buffered input until a live document is decoded,
            /// stepping over the position, offset and payload data of every document it passes.
            /// </summary>
            /// <returns> The id of the next live document, or <c>NO_MORE_DOCS</c> once the input has been fully consumed </returns>
            public override int NextDoc()
            {
                while (!_postings.Eof())
                {
                    var docCode = _postings.ReadVInt();
                    if (_indexOptions == FieldInfo.IndexOptions.DOCS_ONLY)
                    {
                        // only doc deltas are stored
                        _accum += docCode;
                    }
                    else
                    {
                        // shift off the low bit, which signals a frequency of one
                        _accum += (int)((uint)docCode >> 1);
                        if ((docCode & 1) != 0)
                        {
                            _freq = 1;
                        }
                        else
                        {
                            _freq = _postings.ReadVInt();
                        }

                        var hasPositions = _indexOptions.Value.CompareTo(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
                        if (hasPositions)
                        {
                            // Skip positions, together with any offsets and payloads
                            var remaining = _freq;
                            while (remaining-- > 0)
                            {
                                // TODO: skipVInt
                                var posCode = _postings.ReadVInt();
                                if (_storePayloads && (posCode & 1) != 0)
                                {
                                    // new payload length
                                    _payloadLength = _postings.ReadVInt();
                                }

                                if (_storeOffsets && (_postings.ReadVInt() & 1) != 0)
                                {
                                    // new offset length
                                    _postings.ReadVInt();
                                }

                                if (_storePayloads && _payloadLength != 0)
                                {
                                    _postings.SkipBytes(_payloadLength);
                                }
                            }
                        }
                    }

                    if (_liveDocs == null || _liveDocs.Get(_accum))
                    {
                        return _docId = _accum;
                    }
                }

                return _docId = NO_MORE_DOCS;
            }