Example #1
        public override void VisitDocument(int docID, StoredFieldVisitor visitor)
        {
            fieldsStream.Seek(indexReader.GetStartPointer(docID));

            int docBase   = fieldsStream.ReadVInt32();
            int chunkDocs = fieldsStream.ReadVInt32();

            if (docID < docBase || docID >= docBase + chunkDocs || docBase + chunkDocs > numDocs)
            {
                throw new CorruptIndexException("Corrupted: docID=" + docID + ", docBase=" + docBase + ", chunkDocs=" + chunkDocs + ", numDocs=" + numDocs + " (resource=" + fieldsStream + ")");
            }

            int numStoredFields, offset, length, totalLength;

            if (chunkDocs == 1)
            {
                numStoredFields = fieldsStream.ReadVInt32();
                offset          = 0;
                length          = fieldsStream.ReadVInt32();
                totalLength     = length;
            }
            else
            {
                int bitsPerStoredFields = fieldsStream.ReadVInt32();
                if (bitsPerStoredFields == 0)
                {
                    numStoredFields = fieldsStream.ReadVInt32();
                }
                else if (bitsPerStoredFields > 31)
                {
                    throw new CorruptIndexException("bitsPerStoredFields=" + bitsPerStoredFields + " (resource=" + fieldsStream + ")");
                }
                else
                {
                    long filePointer           = fieldsStream.GetFilePointer();
                    PackedInt32s.Reader reader = PackedInt32s.GetDirectReaderNoHeader(fieldsStream, PackedInt32s.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerStoredFields);
                    numStoredFields = (int)(reader.Get(docID - docBase));
                    fieldsStream.Seek(filePointer + PackedInt32s.Format.PACKED.ByteCount(packedIntsVersion, chunkDocs, bitsPerStoredFields));
                }

                int bitsPerLength = fieldsStream.ReadVInt32();
                if (bitsPerLength == 0)
                {
                    length      = fieldsStream.ReadVInt32();
                    offset      = (docID - docBase) * length;
                    totalLength = chunkDocs * length;
                }
                else if (bitsPerLength > 31)
                {
                    throw new CorruptIndexException("bitsPerLength=" + bitsPerLength + " (resource=" + fieldsStream + ")");
                }
                else
                {
                    PackedInt32s.IReaderIterator it = PackedInt32s.GetReaderIteratorNoHeader(fieldsStream, PackedInt32s.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerLength, 1);
                    int off = 0;
                    for (int i = 0; i < docID - docBase; ++i)
                    {
                        off += (int)it.Next();
                    }
                    offset = off;
                    length = (int)it.Next();
                    off   += length;
                    for (int i = docID - docBase + 1; i < chunkDocs; ++i)
                    {
                        off += (int)it.Next();
                    }
                    totalLength = off;
                }
            }

            if ((length == 0) != (numStoredFields == 0))
            {
                throw new CorruptIndexException("length=" + length + ", numStoredFields=" + numStoredFields + " (resource=" + fieldsStream + ")");
            }
            if (numStoredFields == 0)
            {
                // nothing to do
                return;
            }

            DataInput documentInput;

            if (version >= CompressingStoredFieldsWriter.VERSION_BIG_CHUNKS && totalLength >= 2 * chunkSize)
            {
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(chunkSize > 0);
                    Debugging.Assert(offset < chunkSize);
                }

                decompressor.Decompress(fieldsStream, chunkSize, offset, Math.Min(length, chunkSize - offset), bytes);
                documentInput = new DataInputAnonymousInnerClassHelper(this, length);
            }
            else
            {
                BytesRef bytes = totalLength <= BUFFER_REUSE_THRESHOLD ? this.bytes : new BytesRef();
                decompressor.Decompress(fieldsStream, totalLength, offset, length, bytes);
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(bytes.Length == length);
                }
                documentInput = new ByteArrayDataInput(bytes.Bytes, bytes.Offset, bytes.Length);
            }

            for (int fieldIDX = 0; fieldIDX < numStoredFields; fieldIDX++)
            {
                long      infoAndBits = documentInput.ReadVInt64();
                int       fieldNumber = (int)((long)((ulong)infoAndBits >> CompressingStoredFieldsWriter.TYPE_BITS));
                FieldInfo fieldInfo   = fieldInfos.FieldInfo(fieldNumber);

                int bits = (int)(infoAndBits & CompressingStoredFieldsWriter.TYPE_MASK);
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(bits <= CompressingStoredFieldsWriter.NUMERIC_DOUBLE, "bits={0:x}", bits);
                }

                switch (visitor.NeedsField(fieldInfo))
                {
                case StoredFieldVisitor.Status.YES:
                    ReadField(documentInput, visitor, fieldInfo, bits);
                    break;

                case StoredFieldVisitor.Status.NO:
                    SkipField(documentInput, bits);
                    break;

                case StoredFieldVisitor.Status.STOP:
                    return;
                }
            }
        }
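
The offset/length arithmetic in the multi-doc branch above can be modeled in isolation. The sketch below (illustrative names, not part of Lucene.NET) reproduces how a document's byte offset, its length, and the chunk's total decompressed length fall out of a single pass over the per-document lengths:

using System;

static class ChunkOffsetsSketch
{
    // Stand-alone model of the packed-lengths loop above: given the
    // per-document lengths stored for one chunk and the target document's
    // index within that chunk, recover the document's byte offset, its
    // length, and the chunk's total decompressed length.
    public static (int Offset, int Length, int TotalLength) Locate(int[] lengths, int docInChunk)
    {
        int off = 0;
        for (int i = 0; i < docInChunk; i++)                  // bytes of all preceding docs
        {
            off += lengths[i];
        }
        int offset = off;
        int length = lengths[docInChunk];
        off += length;
        for (int i = docInChunk + 1; i < lengths.Length; i++) // remaining docs
        {
            off += lengths[i];
        }
        return (offset, length, off);
    }

    static void Main()
    {
        // A chunk of 4 docs with lengths 10, 0, 7, 3: doc 2 starts at byte 10.
        var (offset, length, total) = Locate(new[] { 10, 0, 7, 3 }, 2);
        Console.WriteLine($"offset={offset} length={length} total={total}"); // offset=10 length=7 total=20
    }
}
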
Example #2
            public override void AddSortedField(FieldInfo field, IEnumerable <BytesRef> values, IEnumerable <long?> docToOrd)
            {
                int      valueCount = 0;
                BytesRef lastValue  = null;

                foreach (BytesRef b in values)
                {
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(b != null);
                    }
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(b.IsValid());
                    }
                    if (valueCount > 0)
                    {
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(b.CompareTo(lastValue) > 0);
                        }
                    }
                    lastValue = BytesRef.DeepCopyOf(b);
                    valueCount++;
                }
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(valueCount <= maxDoc);
                }

                FixedBitSet seenOrds = new FixedBitSet(valueCount);

                int count = 0;

                foreach (long? v in docToOrd)
                {
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(v != null);
                    }
                    int ord = (int)v.Value;
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(ord >= -1 && ord < valueCount);
                    }
                    if (ord >= 0)
                    {
                        seenOrds.Set(ord);
                    }
                    count++;
                }

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(count == maxDoc);
                }
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(seenOrds.Cardinality() == valueCount);
                }
                CheckIterator(values.GetEnumerator(), valueCount, false);
                CheckIterator(docToOrd.GetEnumerator(), maxDoc, false);
                @in.AddSortedField(field, values, docToOrd);
            }
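
The assertions above amount to three invariants: values arrive in strictly increasing order with no duplicates, every ord is either -1 (missing) or a valid value index, and every value is referenced by at least one document (the seenOrds cardinality check). A minimal stand-alone sketch of the same checks, with illustrative names:

using System;
using System.Collections.Generic;

static class SortedFieldCheckSketch
{
    // Stand-in for the assertions above: values must be strictly increasing,
    // every ord must be -1 or index a real value, and every value must be
    // referenced by at least one document.
    public static void Check(IReadOnlyList<string> values, IEnumerable<int> docToOrd)
    {
        for (int i = 1; i < values.Count; i++)
        {
            if (string.CompareOrdinal(values[i], values[i - 1]) <= 0)
                throw new InvalidOperationException($"values out of order at {i}");
        }

        var seen = new bool[values.Count];
        foreach (int ord in docToOrd)
        {
            if (ord < -1 || ord >= values.Count)
                throw new InvalidOperationException($"ord {ord} out of range");
            if (ord >= 0) seen[ord] = true;
        }
        if (Array.IndexOf(seen, false) >= 0)
            throw new InvalidOperationException("unreferenced value (seenOrds cardinality mismatch)");
    }

    static void Main()
    {
        Check(new[] { "apple", "pear" }, new[] { 1, -1, 0, 0 }); // passes
        Console.WriteLine("ok");
    }
}
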
Example #3
        public virtual ApplyDeletesResult ApplyDeletesAndUpdates(IndexWriter.ReaderPool readerPool, IList <SegmentCommitInfo> infos)
        {
            UninterruptableMonitor.Enter(this);
            try
            {
                long t0 = J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond; // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results

                if (infos.Count == 0)
                {
                    return(new ApplyDeletesResult(false, nextGen++, null));
                }

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(CheckDeleteStats());
                }

                if (!Any())
                {
                    if (infoStream.IsEnabled("BD"))
                    {
                        infoStream.Message("BD", "applyDeletes: no deletes; skipping");
                    }
                    return(new ApplyDeletesResult(false, nextGen++, null));
                }

                if (infoStream.IsEnabled("BD"))
                {
                    infoStream.Message("BD", "applyDeletes: infos=" + string.Format(J2N.Text.StringFormatter.InvariantCulture, "{0}", infos) + " packetCount=" + updates.Count);
                }

                long gen = nextGen++;

                JCG.List <SegmentCommitInfo> infos2 = new JCG.List <SegmentCommitInfo>();
                infos2.AddRange(infos);
                infos2.Sort(sortSegInfoByDelGen);

                CoalescedUpdates coalescedUpdates = null;
                bool             anyNewDeletes    = false;

                int infosIDX = infos2.Count - 1;
                int delIDX   = updates.Count - 1;

                IList <SegmentCommitInfo> allDeleted = null;

                while (infosIDX >= 0)
                {
                    //System.out.println("BD: cycle delIDX=" + delIDX + " infoIDX=" + infosIDX);

                    FrozenBufferedUpdates packet = delIDX >= 0 ? updates[delIDX] : null;
                    SegmentCommitInfo     info   = infos2[infosIDX];
                    long segGen = info.BufferedDeletesGen;

                    if (packet != null && segGen < packet.DelGen)
                    {
                        //        System.out.println("  coalesce");
                        if (coalescedUpdates == null)
                        {
                            coalescedUpdates = new CoalescedUpdates();
                        }
                        if (!packet.isSegmentPrivate)
                        {
                            /*
                             * Only coalesce if we are NOT on a segment private del packet: the segment private del packet
                             * must only be applied to segments with the same delGen.  Yet, if a segment is already deleted
                             * from the SI since it had no more documents remaining after some del packets younger than
                             * its segPrivate packet (higher delGen) have been applied, the segPrivate packet has not been
                             * removed.
                             */
                            coalescedUpdates.Update(packet);
                        }

                        delIDX--;
                    }
                    else if (packet != null && segGen == packet.DelGen)
                    {
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(packet.isSegmentPrivate, "Packet and Segments deletegen can only match on a segment private del packet gen={0}", segGen);
                        }
                        //System.out.println("  eq");

                        // Lock order: IW -> BD -> RP
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(readerPool.InfoIsLive(info));
                        }
                        ReadersAndUpdates rld    = readerPool.Get(info, true);
                        SegmentReader     reader = rld.GetReader(IOContext.READ);
                        int  delCount            = 0;
                        bool segAllDeletes;
                        try
                        {
                            DocValuesFieldUpdates.Container dvUpdates = new DocValuesFieldUpdates.Container();
                            if (coalescedUpdates != null)
                            {
                                //System.out.println("    del coalesced");
                                delCount += (int)ApplyTermDeletes(coalescedUpdates.TermsIterable(), rld, reader);
                                delCount += (int)ApplyQueryDeletes(coalescedUpdates.QueriesIterable(), rld, reader);
                                ApplyDocValuesUpdates(coalescedUpdates.numericDVUpdates, rld, reader, dvUpdates);
                                ApplyDocValuesUpdates(coalescedUpdates.binaryDVUpdates, rld, reader, dvUpdates);
                            }
                            //System.out.println("    del exact");
                            // Don't delete by Term here; DocumentsWriterPerThread
                            // already did that on flush:
                            delCount += (int)ApplyQueryDeletes(packet.GetQueriesEnumerable(), rld, reader);
                            ApplyDocValuesUpdates(packet.numericDVUpdates, rld, reader, dvUpdates);
                            ApplyDocValuesUpdates(packet.binaryDVUpdates, rld, reader, dvUpdates);
                            if (dvUpdates.Any())
                            {
                                rld.WriteFieldUpdates(info.Info.Dir, dvUpdates);
                            }
                            int fullDelCount = rld.Info.DelCount + rld.PendingDeleteCount;
                            if (Debugging.AssertsEnabled)
                            {
                                Debugging.Assert(fullDelCount <= rld.Info.Info.DocCount);
                            }
                            segAllDeletes = fullDelCount == rld.Info.Info.DocCount;
                        }
                        finally
                        {
                            rld.Release(reader);
                            readerPool.Release(rld);
                        }
                        anyNewDeletes |= delCount > 0;

                        if (segAllDeletes)
                        {
                            if (allDeleted == null)
                            {
                                allDeleted = new JCG.List <SegmentCommitInfo>();
                            }
                            allDeleted.Add(info);
                        }

                        if (infoStream.IsEnabled("BD"))
                        {
                            infoStream.Message("BD", "seg=" + info + " segGen=" + segGen + " segDeletes=[" + packet + "]; coalesced deletes=[" + (coalescedUpdates == null ? "null" : coalescedUpdates.ToString()) + "] newDelCount=" + delCount + (segAllDeletes ? " 100% deleted" : ""));
                        }

                        if (coalescedUpdates == null)
                        {
                            coalescedUpdates = new CoalescedUpdates();
                        }

                        /*
                         * Since we are on a segment private del packet we must not
                         * update the coalescedDeletes here! We can simply advance to the
                         * next packet and seginfo.
                         */
                        delIDX--;
                        infosIDX--;
                        info.SetBufferedDeletesGen(gen);
                    }
                    else
                    {
                        //System.out.println("  gt");

                        if (coalescedUpdates != null)
                        {
                            // Lock order: IW -> BD -> RP
                            if (Debugging.AssertsEnabled)
                            {
                                Debugging.Assert(readerPool.InfoIsLive(info));
                            }
                            ReadersAndUpdates rld    = readerPool.Get(info, true);
                            SegmentReader     reader = rld.GetReader(IOContext.READ);
                            int  delCount            = 0;
                            bool segAllDeletes;
                            try
                            {
                                delCount += (int)ApplyTermDeletes(coalescedUpdates.TermsIterable(), rld, reader);
                                delCount += (int)ApplyQueryDeletes(coalescedUpdates.QueriesIterable(), rld, reader);
                                DocValuesFieldUpdates.Container dvUpdates = new DocValuesFieldUpdates.Container();
                                ApplyDocValuesUpdates(coalescedUpdates.numericDVUpdates, rld, reader, dvUpdates);
                                ApplyDocValuesUpdates(coalescedUpdates.binaryDVUpdates, rld, reader, dvUpdates);
                                if (dvUpdates.Any())
                                {
                                    rld.WriteFieldUpdates(info.Info.Dir, dvUpdates);
                                }
                                int fullDelCount = rld.Info.DelCount + rld.PendingDeleteCount;
                                if (Debugging.AssertsEnabled)
                                {
                                    Debugging.Assert(fullDelCount <= rld.Info.Info.DocCount);
                                }
                                segAllDeletes = fullDelCount == rld.Info.Info.DocCount;
                            }
                            finally
                            {
                                rld.Release(reader);
                                readerPool.Release(rld);
                            }
                            anyNewDeletes |= delCount > 0;

                            if (segAllDeletes)
                            {
                                if (allDeleted == null)
                                {
                                    allDeleted = new JCG.List <SegmentCommitInfo>();
                                }
                                allDeleted.Add(info);
                            }

                            if (infoStream.IsEnabled("BD"))
                            {
                                infoStream.Message("BD", "seg=" + info + " segGen=" + segGen + " coalesced deletes=[" + coalescedUpdates + "] newDelCount=" + delCount + (segAllDeletes ? " 100% deleted" : ""));
                            }
                        }
                        info.SetBufferedDeletesGen(gen);

                        infosIDX--;
                    }
                }

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(CheckDeleteStats());
                }
                if (infoStream.IsEnabled("BD"))
                {
                    infoStream.Message("BD", "applyDeletes took " + ((J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond) - t0) + " msec"); // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results
                }
                // assert infos != segmentInfos || !any() : "infos=" + infos + " segmentInfos=" + segmentInfos + " any=" + any;

                return(new ApplyDeletesResult(anyNewDeletes, gen, allDeleted));
            }
            finally
            {
                UninterruptableMonitor.Exit(this);
            }
        }
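
The core of the method is a backwards two-pointer merge of segments and delete packets, both ordered by delete generation. A stripped-down model of just that control flow, with plain longs standing in for SegmentCommitInfo and FrozenBufferedUpdates (illustrative only):

using System;

static class DelGenMergeSketch
{
    // Both inputs are assumed sorted ascending by generation, as in the real method.
    public static void Walk(long[] segGens, long[] packetGens)
    {
        int infosIdx = segGens.Length - 1;
        int delIdx = packetGens.Length - 1;
        while (infosIdx >= 0)
        {
            long segGen = segGens[infosIdx];
            if (delIdx >= 0 && segGen < packetGens[delIdx])
            {
                // packet is newer than this segment: fold it into the
                // coalesced set and keep scanning packets
                Console.WriteLine($"coalesce packet gen={packetGens[delIdx]}");
                delIdx--;
            }
            else if (delIdx >= 0 && segGen == packetGens[delIdx])
            {
                // equal gens: segment-private packet, apply it (plus the
                // coalesced set) and advance both pointers
                Console.WriteLine($"segment-private packet for segment gen={segGen}");
                delIdx--;
                infosIdx--;
            }
            else
            {
                // only the already-coalesced packets apply to this segment
                Console.WriteLine($"apply coalesced deletes to segment gen={segGen}");
                infosIdx--;
            }
        }
    }

    static void Main() => Walk(new long[] { 1, 3, 5 }, new long[] { 2, 3, 6 });
}
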
Example #4
        // algorithm: treat sentence snippets as miniature documents
        // we can intersect these with the postings lists via BreakIterator.preceding(offset),
        // score each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq))
        private Passage[] HighlightDoc(string field, BytesRef[] terms, int contentLength, BreakIterator bi, int doc,
                                       TermsEnum termsEnum, DocsAndPositionsEnum[] postings, int n)
        {
            PassageScorer scorer = GetScorer(field);

            if (scorer == null)
            {
                throw new NullReferenceException("PassageScorer cannot be null");
            }
            JCG.PriorityQueue <OffsetsEnum> pq = new JCG.PriorityQueue <OffsetsEnum>();
            float[] weights = new float[terms.Length];
            // initialize postings
            for (int i = 0; i < terms.Length; i++)
            {
                DocsAndPositionsEnum de = postings[i];
                int pDoc;
                if (de == EMPTY)
                {
                    continue;
                }
                else if (de == null)
                {
                    postings[i] = EMPTY; // initially
                    if (!termsEnum.SeekExact(terms[i]))
                    {
                        continue; // term not found
                    }
                    de = postings[i] = termsEnum.DocsAndPositions(null, null, DocsAndPositionsFlags.OFFSETS);
                    if (de == null)
                    {
                        // no positions available
                        throw new ArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
                    }
                    pDoc = de.Advance(doc);
                }
                else
                {
                    pDoc = de.DocID;
                    if (pDoc < doc)
                    {
                        pDoc = de.Advance(doc);
                    }
                }

                if (doc == pDoc)
                {
                    weights[i] = scorer.Weight(contentLength, de.Freq);
                    de.NextPosition();
                    pq.Add(new OffsetsEnum(de, i));
                }
            }

            pq.Add(new OffsetsEnum(EMPTY, int.MaxValue)); // a sentinel for termination

            JCG.PriorityQueue <Passage> passageQueue = new JCG.PriorityQueue <Passage>(n, Comparer <Passage> .Create((left, right) =>
            {
                if (left.score < right.score)
                {
                    return(-1);
                }
                else if (left.score > right.score)
                {
                    return(1);
                }
                else
                {
                    return(left.startOffset - right.startOffset);
                }
            }));
            Passage current = new Passage();

            while (pq.TryDequeue(out OffsetsEnum off))
            {
                DocsAndPositionsEnum dp = off.dp;
                int start = dp.StartOffset;
                if (start == -1)
                {
                    throw new ArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
                }
                int end = dp.EndOffset;
                // LUCENE-5166: this hit would span the content limit... however more valid
                // hits may exist (they are sorted by start). so we pretend like we never
                // saw this term, it won't cause a passage to be added to passageQueue or anything.
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(EMPTY.StartOffset == int.MaxValue);
                }
                if (start < contentLength && end > contentLength)
                {
                    continue;
                }
                if (start >= current.endOffset)
                {
                    if (current.startOffset >= 0)
                    {
                        // finalize current
                        current.score *= scorer.Norm(current.startOffset);
                        // new sentence: first add 'current' to queue
                        if (passageQueue.Count == n && current.score < passageQueue.Peek().score)
                        {
                            current.Reset(); // can't compete, just reset it
                        }
                        else
                        {
                            passageQueue.Enqueue(current);
                            if (passageQueue.Count > n)
                            {
                                current = passageQueue.Dequeue();
                                current.Reset();
                            }
                            else
                            {
                                current = new Passage();
                            }
                        }
                    }
                    // if we exceed limit, we are done
                    if (start >= contentLength)
                    {
                        Passage[] passages = passageQueue.ToArray();
                        foreach (Passage p in passages)
                        {
                            p.Sort();
                        }
                        // sort in ascending order
                        ArrayUtil.TimSort(passages, Comparer <Passage> .Create((left, right) => left.startOffset - right.startOffset));
                        return(passages);
                    }
                    // advance breakiterator
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(BreakIterator.Done < 0);
                    }
                    current.startOffset = Math.Max(bi.Preceding(start + 1), 0);
                    current.endOffset   = Math.Min(bi.Next(), contentLength);
                }
                int tf = 0;
                while (true)
                {
                    tf++;
                    BytesRef term = terms[off.id];
                    if (term == null)
                    {
                        // multitermquery match, pull from payload
                        term = off.dp.GetPayload();
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(term != null);
                        }
                    }
                    current.AddMatch(start, end, term);
                    if (off.pos == dp.Freq)
                    {
                        break; // removed from pq
                    }
                    else
                    {
                        off.pos++;
                        dp.NextPosition();
                        start = dp.StartOffset;
                        end   = dp.EndOffset;
                    }
                    if (start >= current.endOffset || end > contentLength)
                    {
                        pq.Enqueue(off);
                        break;
                    }
                }
                current.score += weights[off.id] * scorer.Tf(tf, current.endOffset - current.startOffset);
            }

            // Dead code but compiler disagrees:
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(false);
            }
            return(null);
        }
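
The header comment's scoring formula, norm(sentenceStartOffset) * sum(weight * tf(freq)), can be demonstrated with toy numbers. The Norm/Tf formulas below are placeholders to show the shape of the computation, not the actual PassageScorer implementation:

using System;

static class PassageScoreSketch
{
    // Toy stand-ins for the scorer's norm and tf; the real formulas differ.
    static float Norm(int passageStart) => 1f + 1f / (1f + (float)Math.Log(1 + passageStart));
    static float Tf(int freq, int passageLen) => (float)Math.Sqrt(freq) / (float)Math.Sqrt(passageLen);

    static void Main()
    {
        int passageStart = 0, passageLen = 120;
        // (term weight, term frequency inside the passage); invented values
        (float Weight, int Freq)[] matches = { (0.8f, 2), (0.5f, 1) };

        float score = 0;
        foreach (var (weight, freq) in matches)
        {
            score += weight * Tf(freq, passageLen);
        }
        score *= Norm(passageStart); // norm(sentenceStartOffset) * sum(weight * tf(freq))
        Console.WriteLine($"passage score = {score:F3}");
    }
}
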
Example #5
            public override Scorer GetScorer(AtomicReaderContext context, IBits acceptDocs)
            {
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(outerInstance.termArrays.Count > 0);
                }
                AtomicReader reader   = context.AtomicReader;
                IBits        liveDocs = acceptDocs;

                PhraseQuery.PostingsAndFreq[] postingsFreqs = new PhraseQuery.PostingsAndFreq[outerInstance.termArrays.Count];

                Terms fieldTerms = reader.GetTerms(outerInstance.field);

                if (fieldTerms is null)
                {
                    return(null);
                }

                // Reuse single TermsEnum below:
                TermsEnum termsEnum = fieldTerms.GetEnumerator();

                for (int pos = 0; pos < postingsFreqs.Length; pos++)
                {
                    Term[] terms = outerInstance.termArrays[pos];

                    DocsAndPositionsEnum postingsEnum;
                    int docFreq;

                    if (terms.Length > 1)
                    {
                        postingsEnum = new UnionDocsAndPositionsEnum(liveDocs, context, terms, termContexts, termsEnum);

                        // coarse -- this overcounts since a given doc can
                        // have more than one term:
                        docFreq = 0;
                        for (int termIdx = 0; termIdx < terms.Length; termIdx++)
                        {
                            Term      term      = terms[termIdx];
                            TermState termState = termContexts[term].Get(context.Ord);
                            if (termState is null)
                            {
                                // Term not in reader
                                continue;
                            }
                            termsEnum.SeekExact(term.Bytes, termState);
                            docFreq += termsEnum.DocFreq;
                        }

                        if (docFreq == 0)
                        {
                            // None of the terms are in this reader
                            return(null);
                        }
                    }
                    else
                    {
                        Term      term      = terms[0];
                        TermState termState = termContexts[term].Get(context.Ord);
                        if (termState is null)
                        {
                            // Term not in reader
                            return(null);
                        }
                        termsEnum.SeekExact(term.Bytes, termState);
                        postingsEnum = termsEnum.DocsAndPositions(liveDocs, null, DocsAndPositionsFlags.NONE);

                        if (postingsEnum is null)
                        {
                            // term does exist, but has no positions
                            if (Debugging.AssertsEnabled)
                            {
                                Debugging.Assert(termsEnum.Docs(liveDocs, null, DocsFlags.NONE) != null, "termstate found but no term exists in reader");
                            }
                            throw IllegalStateException.Create("field \"" + term.Field + "\" was indexed without position data; cannot run PhraseQuery (term=" + term.Text + ")");
                        }

                        docFreq = termsEnum.DocFreq;
                    }

                    postingsFreqs[pos] = new PhraseQuery.PostingsAndFreq(postingsEnum, docFreq, (int)outerInstance.positions[pos], terms);
                }

                // sort by increasing docFreq order
                if (outerInstance.slop == 0)
                {
                    ArrayUtil.TimSort(postingsFreqs);
                }

                if (outerInstance.slop == 0)
                {
                    ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity.GetSimScorer(stats, context));
                    if (s.noDocs)
                    {
                        return(null);
                    }
                    else
                    {
                        return(s);
                    }
                }
                else
                {
                    return(new SloppyPhraseScorer(this, postingsFreqs, outerInstance.slop, similarity.GetSimScorer(stats, context)));
                }
            }
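
The "coarse" docFreq comment above is worth unpacking: summing per-term docFreqs over a union of terms counts a document once per matching term, so it can only overestimate the true union docFreq. A small illustration with made-up postings:

using System;
using System.Collections.Generic;
using System.Linq;

static class UnionDocFreqSketch
{
    static void Main()
    {
        // Invented postings lists for two terms in one synonym position.
        var termDocs = new Dictionary<string, int[]>
        {
            ["quick"] = new[] { 1, 4, 7 },
            ["fast"]  = new[] { 4, 9 },
        };

        int coarse = termDocs.Values.Sum(docs => docs.Length);              // 5
        int exact  = termDocs.Values.SelectMany(d => d).Distinct().Count(); // 4 (doc 4 counted once)

        Console.WriteLine($"coarse={coarse} exact={exact}");
    }
}
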
Example #6
            internal virtual DocValuesConsumer GetInstance(FieldInfo field)
            {
                DocValuesFormat format = null;

                if (field.DocValuesGen != -1)
                {
                    string formatName = field.GetAttribute(PER_FIELD_FORMAT_KEY);
                    // a null formatName means the field never existed in that segment, yet has updates applied
                    if (formatName != null)
                    {
                        format = DocValuesFormat.ForName(formatName);
                    }
                }
                if (format == null)
                {
                    format = outerInstance.GetDocValuesFormatForField(field.Name);
                }
                if (format == null)
                {
                    throw new InvalidOperationException("invalid null DocValuesFormat for field=\"" + field.Name + "\"");
                }
                string formatName_ = format.Name;

                string previousValue = field.PutAttribute(PER_FIELD_FORMAT_KEY, formatName_);

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(field.DocValuesGen != -1 || previousValue == null, () => "formatName=" + formatName_ + " prevValue=" + previousValue);
                }

                int? suffix = null;

                ConsumerAndSuffix consumer;

                if (!formats.TryGetValue(format, out consumer) || consumer == null)
                {
                    // First time we are seeing this format; create a new instance

                    if (field.DocValuesGen != -1)
                    {
                        string suffixAtt = field.GetAttribute(PER_FIELD_SUFFIX_KEY);
                        // even when dvGen is != -1, it can still be a new field, that never
                        // existed in the segment, and therefore doesn't have the recorded
                        // attributes yet.
                        if (suffixAtt != null)
                        {
                            suffix = Convert.ToInt32(suffixAtt, CultureInfo.InvariantCulture);
                        }
                    }

                    if (suffix == null)
                    {
                        // bump the suffix
                        if (!suffixes.TryGetValue(formatName_, out suffix) || suffix == null)
                        {
                            suffix = 0;
                        }
                        else
                        {
                            suffix = suffix + 1;
                        }
                    }
                    suffixes[formatName_] = suffix;

                    string segmentSuffix = GetFullSegmentSuffix(segmentWriteState.SegmentSuffix, GetSuffix(formatName_, Convert.ToString(suffix, CultureInfo.InvariantCulture)));
                    consumer          = new ConsumerAndSuffix();
                    consumer.Consumer = format.FieldsConsumer(new SegmentWriteState(segmentWriteState, segmentSuffix));
                    consumer.Suffix   = suffix.Value; // LUCENENET NOTE: At this point suffix cannot be null
                    formats[format]   = consumer;
                }
                else
                {
                    // we've already seen this format, so just grab its suffix
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(suffixes.ContainsKey(formatName_));
                    }
                    suffix = consumer.Suffix;
                }

                previousValue = field.PutAttribute(PER_FIELD_SUFFIX_KEY, Convert.ToString(suffix, CultureInfo.InvariantCulture));
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(field.DocValuesGen != -1 || previousValue == null, () => "suffix=" + Convert.ToString(suffix, CultureInfo.InvariantCulture) + " prevValue=" + previousValue);
                }

                // TODO: we should only provide the "slice" of FIS
                // that this DVF actually sees ...
                return(consumer.Consumer);
            }
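
The suffix bookkeeping above exists so that two consumers of the same format within one segment write to distinct files: the first instance of a format name gets suffix 0, and each later instance bumps it. A minimal model of just that logic (illustrative, not the PerFieldDocValuesFormat API):

using System;
using System.Collections.Generic;

static class FormatSuffixSketch
{
    private static readonly Dictionary<string, int> suffixes = new Dictionary<string, int>();

    // First use of a format name yields 0; each subsequent use bumps the suffix.
    public static int NextSuffix(string formatName)
    {
        suffixes[formatName] = suffixes.TryGetValue(formatName, out int s) ? s + 1 : 0;
        return suffixes[formatName];
    }

    static void Main()
    {
        Console.WriteLine(NextSuffix("Lucene45")); // 0
        Console.WriteLine(NextSuffix("Lucene45")); // 1
        Console.WriteLine(NextSuffix("Memory"));   // 0
    }
}
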
Example #7
        public override void Write(Directory directory, string segmentName, string segmentSuffix, FieldInfos infos, IOContext context)
        {
            string      fileName = IndexFileNames.SegmentFileName(segmentName, "", Lucene40FieldInfosFormat.FIELD_INFOS_EXTENSION);
            IndexOutput output   = directory.CreateOutput(fileName, context);
            bool        success  = false;

            try
            {
                CodecUtil.WriteHeader(output, Lucene40FieldInfosFormat.CODEC_NAME, Lucene40FieldInfosFormat.FORMAT_CURRENT);
                output.WriteVInt32(infos.Count);
                foreach (FieldInfo fi in infos)
                {
                    IndexOptions indexOptions = fi.IndexOptions;
                    sbyte        bits         = 0x0;
                    if (fi.HasVectors)
                    {
                        bits |= Lucene40FieldInfosFormat.STORE_TERMVECTOR;
                    }
                    if (fi.OmitsNorms)
                    {
                        bits |= Lucene40FieldInfosFormat.OMIT_NORMS;
                    }
                    if (fi.HasPayloads)
                    {
                        bits |= Lucene40FieldInfosFormat.STORE_PAYLOADS;
                    }
                    if (fi.IsIndexed)
                    {
                        bits |= Lucene40FieldInfosFormat.IS_INDEXED;
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(indexOptions.CompareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 || !fi.HasPayloads);
                        }
                        if (indexOptions == IndexOptions.DOCS_ONLY)
                        {
                            bits |= Lucene40FieldInfosFormat.OMIT_TERM_FREQ_AND_POSITIONS;
                        }
                        else if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
                        {
                            bits |= Lucene40FieldInfosFormat.STORE_OFFSETS_IN_POSTINGS;
                        }
                        else if (indexOptions == IndexOptions.DOCS_AND_FREQS)
                        {
                            bits |= Lucene40FieldInfosFormat.OMIT_POSITIONS;
                        }
                    }
                    output.WriteString(fi.Name);
                    output.WriteVInt32(fi.Number);
                    output.WriteByte((byte)bits);

                    // pack the DV types in one byte
                    byte dv  = DocValuesByte(fi.DocValuesType, fi.GetAttribute(Lucene40FieldInfosReader.LEGACY_DV_TYPE_KEY));
                    byte nrm = DocValuesByte(fi.NormType, fi.GetAttribute(Lucene40FieldInfosReader.LEGACY_NORM_TYPE_KEY));
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert((dv & (~0xF)) == 0 && (nrm & (~0x0F)) == 0);
                    }
                    var val = (byte)(0xff & ((nrm << 4) | (byte)dv));
                    output.WriteByte(val);
                    output.WriteStringStringMap(fi.Attributes);
                }
                success = true;
            }
            finally
            {
                if (success)
                {
                    output.Dispose();
                }
                else
                {
                    IOUtils.DisposeWhileHandlingException(output);
                }
            }
        }
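
The "pack the DV types in one byte" step stores two 4-bit codes in a single byte: the norm type in the high nibble and the doc-values type in the low nibble. A self-contained sketch of the round trip, assuming 4-bit type codes:

using System;

static class DocValuesBytePackingSketch
{
    static void Main()
    {
        byte dv = 0x2, nrm = 0x1;                       // example 4-bit type codes
        var packed = (byte)(0xff & ((nrm << 4) | dv));  // same expression as the writer above

        // A reader recovers both codes by masking and shifting:
        byte dvOut  = (byte)(packed & 0x0F);
        byte nrmOut = (byte)((packed >> 4) & 0x0F);
        Console.WriteLine($"packed=0x{packed:X2} dv={dvOut} nrm={nrmOut}"); // packed=0x12 dv=2 nrm=1
    }
}
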
Example #8
            // Look for seek type 3 ("pop"): if the delta from
            // prev -> current was replacing an S with an E,
            // we must now seek to beyond that E.  this seek
            // "finishes" the dance at this character
            // position.
            private bool DoPop()
            {
                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("  try pop");
                }

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(newSuffixStart <= prevTerm.Length);
                    Debugging.Assert(newSuffixStart < scratchTerm.Length || newSuffixStart == 0);
                }

                if (prevTerm.Length > newSuffixStart && IsNonBMPChar(prevTerm.Bytes, newSuffixStart) && IsHighBMPChar(scratchTerm.Bytes, newSuffixStart))
                {
                    // Seek type 2 -- put 0xFF at this position:
                    scratchTerm.Bytes[newSuffixStart] = 0xff;
                    scratchTerm.Length = newSuffixStart + 1;

                    if (DEBUG_SURROGATES)
                    {
                        Console.WriteLine("    seek to term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString()) + " " + scratchTerm.ToString());
                    }

                    // TODO: more efficient seek?  can we simply swap
                    // the enums?
                    outerInstance.TermsDict.SeekEnum(termEnum, new Term(fieldInfo.Name, scratchTerm), true);

                    Term t2 = termEnum.Term();

                    // We could hit EOF or different field since this
                    // was a seek "forward":
                    if (t2 != null && t2.Field == internedFieldName)
                    {
                        if (DEBUG_SURROGATES)
                        {
                            Console.WriteLine("      got term=" + UnicodeUtil.ToHexString(t2.Text()) + " " + t2.Bytes);
                        }

                        BytesRef b2 = t2.Bytes;
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(b2.Offset == 0);
                        }

                        // Set newSuffixStart -- we can't use
                        // termEnum's since the above seek may have
                        // done no scanning (eg, term was precisely
                        // an index term, or was in the term seek
                        // cache):
                        scratchTerm.CopyBytes(b2);
                        SetNewSuffixStart(prevTerm, scratchTerm);

                        return(true);
                    }
                    else if (newSuffixStart != 0 || scratchTerm.Length != 0)
                    {
                        if (DEBUG_SURROGATES)
                        {
                            Console.WriteLine("      got term=null (or next field)");
                        }
                        newSuffixStart     = 0;
                        scratchTerm.Length = 0;
                        return(true);
                    }
                }

                return(false);
            }
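
The 0xFF trick above works because 0xFF can never appear in valid UTF-8, so a term truncated at the dance position with a 0xFF tail sorts after every real term that shares the shorter prefix. A tiny demonstration with made-up bytes:

using System;

static class PopSeekTargetSketch
{
    static void Main()
    {
        byte[] term = { 0x61, 0xee, 0x80, 0x80 }; // 'a' followed by a high-BMP char
        int pos = 1;

        // Truncate at pos and write 0xFF: the resulting seek target is
        // byte-wise greater than any term starting with the same prefix.
        byte[] target = new byte[pos + 1];
        Array.Copy(term, target, pos);
        target[pos] = 0xff;

        Console.WriteLine(BitConverter.ToString(target)); // 61-FF, sorts after 61-EE-80-80
    }
}
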
Example #9
            // Pre-flex indices store terms in UTF16 sort order, but
            // certain queries require Unicode codepoint order; this
            // method carefully seeks around surrogates to handle
            // this impedance mismatch

            private void SurrogateDance()
            {
                if (!unicodeSortOrder)
                {
                    return;
                }

                // We are invoked after TIS.next() (by UTF16 order) to
                // possibly seek to a different "next" (by unicode
                // order) term.

                // We scan only the "delta" from the last term to the
                // current term, in UTF8 bytes.  We look at 1) the bytes
                // stripped from the prior term, and then 2) the bytes
                // appended to that prior term's prefix.

                // We don't care about specific UTF8 sequences, just
                // the "category" of the UTF16 character.  Category S
                // is a high/low surrogate pair (it's non-BMP).
                // Category E is any BMP char > UNI_SUR_LOW_END (and <
                // U+FFFF). Category A is the rest (any unicode char
                // <= UNI_SUR_HIGH_START).

                // The core issue is that pre-flex indices sort the
                // characters as ASE, while flex must sort as AES.  So
                // when scanning, when we hit S, we must 1) seek
                // forward to E and enum the terms there, then 2) seek
                // back to S and enum all terms there, then 3) seek to
                // after E.  Three different seek points (1, 2, 3).

                // We can easily detect S in UTF8: if a byte has
                // prefix 11110 (0xf0), then that byte and the
                // following 3 bytes encode a single unicode codepoint
                // in S.  Similarly, we can detect E: if a byte has
                // prefix 1110111 (0xee), then that byte and the
                // following 2 bytes encode a single unicode codepoint
                // in E.

                // Note that this is really a recursive process --
                // maybe the char at pos 2 needs to dance, but any
                // point in its dance, suddenly pos 4 needs to dance
                // so you must finish pos 4 before returning to pos
                // 2.  But then during pos 4's dance maybe pos 7 needs
                // to dance, etc.  However, despite being recursive,
                // we don't need to hold any state because the state
                // can always be derived by looking at prior term &
                // current term.

                // TODO: can we avoid this copy?
                if (termEnum.Term() == null || termEnum.Term().Field != internedFieldName)
                {
                    scratchTerm.Length = 0;
                }
                else
                {
                    scratchTerm.CopyBytes(termEnum.Term().Bytes);
                }

                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("  dance");
                    Console.WriteLine("    prev=" + UnicodeUtil.ToHexString(prevTerm.Utf8ToString()));
                    Console.WriteLine("         " + prevTerm.ToString());
                    Console.WriteLine("    term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString()));
                    Console.WriteLine("         " + scratchTerm.ToString());
                }

                // this code assumes TermInfosReader/SegmentTermEnum
                // always use BytesRef.offset == 0
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(prevTerm.Offset == 0);
                    Debugging.Assert(scratchTerm.Offset == 0);
                }

                // Need to loop here because we may need to do multiple
                // pops, and possibly a continue in the end, ie:
                //
                //  cont
                //  pop, cont
                //  pop, pop, cont
                //  <nothing>
                //

                while (true)
                {
                    if (DoContinue())
                    {
                        break;
                    }
                    else
                    {
                        if (!DoPop())
                        {
                            break;
                        }
                    }
                }

                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("  finish bmp ends");
                }

                DoPushes();
            }
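
The comment's category tests map onto simple lead-byte checks. Below is a minimal sketch reimplementing the IsNonBMPChar/IsHighBMPChar helpers the surrounding code calls; the 0xF0/0xEE masks follow the comment, but the real implementations may differ in detail:

using System;

static class Utf8CategorySketch
{
    private const byte NonBmpLead  = 0xf0; // lead of a 4-byte UTF-8 sequence: category S
    private const byte HighBmpLead = 0xee; // 0xEE/0xEF lead: BMP char >= U+E000, category E

    static bool IsNonBMPChar(byte[] b, int idx)  => (b[idx] & NonBmpLead) == NonBmpLead;
    static bool IsHighBMPChar(byte[] b, int idx) => (b[idx] & HighBmpLead) == HighBmpLead;

    static void Main()
    {
        byte[] s = { 0xf0, 0x90, 0x80, 0x80 }; // U+10000 (surrogate pair in UTF-16)
        byte[] e = { 0xee, 0x80, 0x80 };       // U+E000 (high BMP)
        Console.WriteLine(IsNonBMPChar(s, 0));  // True
        Console.WriteLine(IsHighBMPChar(e, 0)); // True
    }
}
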
Example #10
            public override BytesRef Next()
            {
                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("TE.next()");
                }
                if (skipNext)
                {
                    if (DEBUG_SURROGATES)
                    {
                        Console.WriteLine("  skipNext=true");
                    }
                    skipNext = false;
                    if (termEnum.Term() == null)
                    {
                        return(null);
                    }
                    else if (termEnum.Term().Field != internedFieldName)
                    {
                        // PreFlex codec interns field names:
                        return(null);
                    }
                    else
                    {
                        return(current = termEnum.Term().Bytes);
                    }
                }

                // TODO: can we use STE's prevBuffer here?
                prevTerm.CopyBytes(termEnum.Term().Bytes);

                if (termEnum.Next() && termEnum.Term().Field == internedFieldName)
                {
                    newSuffixStart = termEnum.newSuffixStart;
                    if (DEBUG_SURROGATES)
                    {
                        Console.WriteLine("  newSuffixStart=" + newSuffixStart);
                    }
                    SurrogateDance();
                    Term t = termEnum.Term();
                    if (t == null || t.Field != internedFieldName)
                    {
                        // PreFlex codec interns field names; verify:
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(t == null || !t.Field.Equals(internedFieldName, StringComparison.Ordinal));
                        }
                        current = null;
                    }
                    else
                    {
                        current = t.Bytes;
                    }
                    return(current);
                }
                else
                {
                    // this field is exhausted, but we have to give
                    // surrogateDance a chance to seek back:
                    if (DEBUG_SURROGATES)
                    {
                        Console.WriteLine("  force cont");
                    }
                    //newSuffixStart = prevTerm.length;
                    newSuffixStart = 0;
                    SurrogateDance();

                    Term t = termEnum.Term();
                    if (t == null || t.Field != internedFieldName)
                    {
                        // PreFlex codec interns field names; verify:
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(t == null || !t.Field.Equals(internedFieldName, StringComparison.Ordinal));
                        }
                        return(null);
                    }
                    else
                    {
                        current = t.Bytes;
                        return(current);
                    }
                }
            }
Example #11
            // Swap in S, in place of E:
            private bool SeekToNonBMP(SegmentTermEnum te, BytesRef term, int pos)
            {
                int savLength = term.Length;

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(term.Offset == 0);
                }

                // The 3 bytes starting at pos make up 1
                // unicode character:
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(IsHighBMPChar(term.Bytes, pos));
                }

                // NOTE: we cannot make this assert, because
                // AutomatonQuery legitimately sends us malformed UTF8
                // (eg the UTF8 bytes with just 0xee)
                // assert term.length >= pos + 3: "term.length=" + term.length + " pos+3=" + (pos+3) + " byte=" + Integer.toHexString(term.bytes[pos]) + " term=" + term.toString();

                // Save the bytes && length, since we need to
                // restore this if seek "back" finds no matching
                // terms
                if (term.Bytes.Length < 4 + pos)
                {
                    term.Grow(4 + pos);
                }

                scratch[0] = (sbyte)term.Bytes[pos];
                scratch[1] = (sbyte)term.Bytes[pos + 1];
                scratch[2] = (sbyte)term.Bytes[pos + 2];

                term.Bytes[pos]     = 0xf0;
                term.Bytes[pos + 1] = 0x90;
                term.Bytes[pos + 2] = 0x80;
                term.Bytes[pos + 3] = 0x80;
                term.Length         = 4 + pos;

                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("      try seek term=" + UnicodeUtil.ToHexString(term.Utf8ToString()));
                }

                // Seek "back":
                outerInstance.TermsDict.SeekEnum(te, new Term(fieldInfo.Name, term), true);

                // Test if the term we seek'd to in fact found a
                // surrogate pair at the same position as the E:
                Term t2 = te.Term();

                // Cannot be null (or move to next field) because at
                // "worst" it'd seek to the same term we are on now,
                // unless we are being called from seek
                if (t2 == null || t2.Field != internedFieldName)
                {
                    return(false);
                }

                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("      got term=" + UnicodeUtil.ToHexString(t2.Text()));
                }

                // Now test if prefix is identical and we found
                // a non-BMP char at the same position:
                BytesRef b2 = t2.Bytes;

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(b2.Offset == 0);
                }

                bool matches;

                if (b2.Length >= term.Length && IsNonBMPChar(b2.Bytes, pos))
                {
                    matches = true;
                    for (int i = 0; i < pos; i++)
                    {
                        if (term.Bytes[i] != b2.Bytes[i])
                        {
                            matches = false;
                            break;
                        }
                    }
                }
                else
                {
                    matches = false;
                }

                // Restore term:
                term.Length         = savLength;
                term.Bytes[pos]     = (byte)scratch[0];
                term.Bytes[pos + 1] = (byte)scratch[1];
                term.Bytes[pos + 2] = (byte)scratch[2];

                return(matches);
            }
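
The four bytes written above, F0 90 80 80, are the UTF-8 encoding of U+10000, the smallest non-BMP code point, which is why the seek lands on the first surrogate-pair term at that position. A quick check:

using System;
using System.Text;

static class MinNonBmpSketch
{
    static void Main()
    {
        // F0 90 80 80 decodes to U+10000, the smallest code point that
        // needs a surrogate pair in UTF-16.
        byte[] bytes = { 0xf0, 0x90, 0x80, 0x80 };
        string s = Encoding.UTF8.GetString(bytes);
        Console.WriteLine($"U+{char.ConvertToUtf32(s, 0):X}"); // U+10000
    }
}
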
Example #12
            public override Scorer GetScorer(AtomicReaderContext context, IBits acceptDocs)
            {
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(outerInstance.terms.Count > 0);
                }
                AtomicReader reader   = context.AtomicReader;
                IBits        liveDocs = acceptDocs;

                PostingsAndFreq[] postingsFreqs = new PostingsAndFreq[outerInstance.terms.Count];

                Terms fieldTerms = reader.GetTerms(outerInstance.field);

                if (fieldTerms is null)
                {
                    return(null);
                }

                // Reuse single TermsEnum below:
                TermsEnum te = fieldTerms.GetEnumerator();

                for (int i = 0; i < outerInstance.terms.Count; i++)
                {
                    Term      t     = outerInstance.terms[i];
                    TermState state = states[i].Get(context.Ord);
                    if (state is null) // term doesn't exist in this segment
                    {
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(TermNotInReader(reader, t), "no termstate found but term exists in reader");
                        }
                        return(null);
                    }
                    te.SeekExact(t.Bytes, state);
                    DocsAndPositionsEnum postingsEnum = te.DocsAndPositions(liveDocs, null, DocsAndPositionsFlags.NONE);

                    // PhraseQuery on a field that did not index
                    // positions.
                    if (postingsEnum is null)
                    {
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(te.SeekExact(t.Bytes), "termstate found but no term exists in reader");
                        }
                        // term does exist, but has no positions
                        throw IllegalStateException.Create("field \"" + t.Field + "\" was indexed without position data; cannot run PhraseQuery (term=" + t.Text + ")");
                    }
                    postingsFreqs[i] = new PostingsAndFreq(postingsEnum, te.DocFreq, outerInstance.positions[i], t);
                }

                // sort by increasing docFreq order
                if (outerInstance.slop == 0)
                {
                    ArrayUtil.TimSort(postingsFreqs);
                }

                if (outerInstance.slop == 0) // optimize exact case
                {
                    ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity.GetSimScorer(stats, context));
                    if (s.noDocs)
                    {
                        return(null);
                    }
                    else
                    {
                        return(s);
                    }
                }
                else
                {
                    return(new SloppyPhraseScorer(this, postingsFreqs, outerInstance.slop, similarity.GetSimScorer(stats, context)));
                }
            }
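
A side note on why the slop == 0 path above sorts first: visiting terms in increasing docFreq order lets the exact-phrase intersection start from the rarest term, so the candidate document set shrinks as quickly as possible. Below is a minimal standalone sketch of that ordering idea using plain sorted arrays, not the Lucene.NET postings API; all names are illustrative.

using System;
using System.Linq;

static class PhraseIntersectSketch
{
    // Each int[] is a sorted list of doc IDs for one term.
    static int[] Intersect(int[][] postings)
    {
        // Sort by docFreq (list length), rarest first -- the same idea as
        // ArrayUtil.TimSort(postingsFreqs) in the scorer above.
        var ordered = postings.OrderBy(p => p.Length).ToArray();
        var result = ordered[0];
        for (int i = 1; i < ordered.Length; i++)
        {
            result = result.Where(d => Array.BinarySearch(ordered[i], d) >= 0).ToArray();
        }
        return result;
    }

    static void Main()
    {
        int[][] postings =
        {
            new[] { 1, 2, 3, 5, 8, 13, 21 }, // frequent term
            new[] { 2, 13 },                 // rare term -- drives the intersection
        };
        Console.WriteLine(string.Join(",", Intersect(postings))); // 2,13
    }
}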
Example #13
        /// <summary>
        /// Sole constructor. </summary>
        public CompressingStoredFieldsReader(Directory d, SegmentInfo si, string segmentSuffix, FieldInfos fn, IOContext context, string formatName, CompressionMode compressionMode)
        {
            this.compressionMode = compressionMode;
            string segment = si.Name;
            bool   success = false;

            fieldInfos = fn;
            numDocs    = si.DocCount;
            ChecksumIndexInput indexStream = null;

            try
            {
                string indexStreamFN  = IndexFileNames.SegmentFileName(segment, segmentSuffix, Lucene40StoredFieldsWriter.FIELDS_INDEX_EXTENSION);
                string fieldsStreamFN = IndexFileNames.SegmentFileName(segment, segmentSuffix, Lucene40StoredFieldsWriter.FIELDS_EXTENSION);
                // Load the index into memory
                indexStream = d.OpenChecksumInput(indexStreamFN, context);
                string codecNameIdx = formatName + CompressingStoredFieldsWriter.CODEC_SFX_IDX;
                version = CodecUtil.CheckHeader(indexStream, codecNameIdx, CompressingStoredFieldsWriter.VERSION_START, CompressingStoredFieldsWriter.VERSION_CURRENT);
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(CodecUtil.HeaderLength(codecNameIdx) == indexStream.GetFilePointer());
                }
                indexReader = new CompressingStoredFieldsIndexReader(indexStream, si);

                long maxPointer = -1;

                if (version >= CompressingStoredFieldsWriter.VERSION_CHECKSUM)
                {
                    maxPointer = indexStream.ReadVInt64();
                    CodecUtil.CheckFooter(indexStream);
                }
                else
                {
#pragma warning disable 612, 618
                    CodecUtil.CheckEOF(indexStream);
#pragma warning restore 612, 618
                }
                indexStream.Dispose();
                indexStream = null;

                // Open the data file and read metadata
                fieldsStream = d.OpenInput(fieldsStreamFN, context);
                if (version >= CompressingStoredFieldsWriter.VERSION_CHECKSUM)
                {
                    if (maxPointer + CodecUtil.FooterLength() != fieldsStream.Length)
                    {
                        throw new CorruptIndexException("Invalid fieldsStream maxPointer (file truncated?): maxPointer=" + maxPointer + ", length=" + fieldsStream.Length);
                    }
                }
                else
                {
                    maxPointer = fieldsStream.Length;
                }
                this.maxPointer = maxPointer;
                string codecNameDat  = formatName + CompressingStoredFieldsWriter.CODEC_SFX_DAT;
                int    fieldsVersion = CodecUtil.CheckHeader(fieldsStream, codecNameDat, CompressingStoredFieldsWriter.VERSION_START, CompressingStoredFieldsWriter.VERSION_CURRENT);
                if (version != fieldsVersion)
                {
                    throw new CorruptIndexException("Version mismatch between stored fields index and data: " + version + " != " + fieldsVersion);
                }
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(CodecUtil.HeaderLength(codecNameDat) == fieldsStream.GetFilePointer());
                }

                if (version >= CompressingStoredFieldsWriter.VERSION_BIG_CHUNKS)
                {
                    chunkSize = fieldsStream.ReadVInt32();
                }
                else
                {
                    chunkSize = -1;
                }
                packedIntsVersion = fieldsStream.ReadVInt32();
                decompressor      = compressionMode.NewDecompressor();
                this.bytes        = new BytesRef();

                success = true;
            }
            finally
            {
                if (!success)
                {
                    IOUtils.DisposeWhileHandlingException(this, indexStream);
                }
            }
        }
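
The header handshake the constructor performs (CheckHeader, then CheckFooter or CheckEOF) boils down to: a magic number, a codec name, and a version that must fall within [VERSION_START, VERSION_CURRENT]. Here is a self-contained sketch of that pattern over BinaryReader/BinaryWriter; the layout is illustrative, not the actual CodecUtil wire format (Lucene's magic constant is 0x3FD76C17, but treat everything below as a stand-in).

using System;
using System.IO;
using System.Text;

static class HeaderSketch
{
    const int Magic = 0x3fd76c17;

    static void WriteHeader(BinaryWriter w, string codec, int version)
    {
        w.Write(Magic);
        w.Write(codec);   // length-prefixed string
        w.Write(version);
    }

    static int CheckHeader(BinaryReader r, string expectedCodec, int minVersion, int maxVersion)
    {
        if (r.ReadInt32() != Magic) throw new InvalidDataException("not a codec file");
        string codec = r.ReadString();
        if (codec != expectedCodec) throw new InvalidDataException("codec mismatch: " + codec);
        int version = r.ReadInt32();
        if (version < minVersion || version > maxVersion)
            throw new InvalidDataException("unsupported version: " + version);
        return version; // caller branches on this, as the constructor above does
    }

    static void Main()
    {
        var ms = new MemoryStream();
        WriteHeader(new BinaryWriter(ms, Encoding.UTF8, true), "SketchIdx", 2);
        ms.Position = 0;
        Console.WriteLine(CheckHeader(new BinaryReader(ms), "SketchIdx", 0, 3)); // 2
    }
}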
Example #14
            /// <summary>
            /// Go to the chunk containing the provided <paramref name="doc"/> ID.
            /// </summary>
            internal void Next(int doc)
            {
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(doc >= this.docBase + this.chunkDocs, "{0} {1} {2}", doc, this.docBase, this.chunkDocs);
                }
                fieldsStream.Seek(outerInstance.indexReader.GetStartPointer(doc));

                int docBase   = fieldsStream.ReadVInt32();
                int chunkDocs = fieldsStream.ReadVInt32();

                if (docBase < this.docBase + this.chunkDocs || docBase + chunkDocs > outerInstance.numDocs)
                {
                    throw new CorruptIndexException($"Corrupted: current docBase={this.docBase}, current numDocs={this.chunkDocs}, new docBase={docBase}, new numDocs={chunkDocs} (resource={fieldsStream})");
                }
                this.docBase   = docBase;
                this.chunkDocs = chunkDocs;

                if (chunkDocs > numStoredFields.Length)
                {
                    int newLength = ArrayUtil.Oversize(chunkDocs, 4);
                    numStoredFields = new int[newLength];
                    lengths         = new int[newLength];
                }

                if (chunkDocs == 1)
                {
                    numStoredFields[0] = fieldsStream.ReadVInt32();
                    lengths[0]         = fieldsStream.ReadVInt32();
                }
                else
                {
                    int bitsPerStoredFields = fieldsStream.ReadVInt32();
                    if (bitsPerStoredFields == 0)
                    {
                        Arrays.Fill(numStoredFields, 0, chunkDocs, fieldsStream.ReadVInt32());
                    }
                    else if (bitsPerStoredFields > 31)
                    {
                        throw new CorruptIndexException("bitsPerStoredFields=" + bitsPerStoredFields + " (resource=" + fieldsStream + ")");
                    }
                    else
                    {
                        PackedInt32s.IReaderIterator it = PackedInt32s.GetReaderIteratorNoHeader(fieldsStream, PackedInt32s.Format.PACKED, outerInstance.packedIntsVersion, chunkDocs, bitsPerStoredFields, 1);
                        for (int i = 0; i < chunkDocs; ++i)
                        {
                            numStoredFields[i] = (int)it.Next();
                        }
                    }

                    int bitsPerLength = fieldsStream.ReadVInt32();
                    if (bitsPerLength == 0)
                    {
                        Arrays.Fill(lengths, 0, chunkDocs, fieldsStream.ReadVInt32());
                    }
                    else if (bitsPerLength > 31)
                    {
                        throw new CorruptIndexException($"bitsPerLength={bitsPerLength}");
                    }
                    else
                    {
                        PackedInt32s.IReaderIterator it = PackedInt32s.GetReaderIteratorNoHeader(fieldsStream, PackedInt32s.Format.PACKED, outerInstance.packedIntsVersion, chunkDocs, bitsPerLength, 1);
                        for (int i = 0; i < chunkDocs; ++i)
                        {
                            lengths[i] = (int)it.Next();
                        }
                    }
                }
            }
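
When bitsPerStoredFields or bitsPerLength is non-zero, the chunk stores one fixed-width value per document, packed back to back (which is why values wider than 31 bits are rejected above). A standalone sketch of that bit-unpacking, big-endian with the first bit in the high bit of the first byte; it mirrors the role PackedInt32s.GetReaderIteratorNoHeader plays here, but is not the actual PACKED on-disk format.

using System;

static class PackedSketch
{
    // Read `count` values of `bitsPerValue` bits each (bitsPerValue <= 31).
    static int[] Unpack(byte[] block, int count, int bitsPerValue)
    {
        var values = new int[count];
        int bitPos = 0;
        for (int i = 0; i < count; i++)
        {
            int v = 0;
            for (int b = 0; b < bitsPerValue; b++, bitPos++)
            {
                int bit = (block[bitPos >> 3] >> (7 - (bitPos & 7))) & 1;
                v = (v << 1) | bit;
            }
            values[i] = v;
        }
        return values;
    }

    static void Main()
    {
        byte[] block = { 0xAB, 0xC1, 0x23 }; // two 12-bit values: 0xABC, 0x123
        Console.WriteLine(string.Join(",", Unpack(block, 2, 12))); // 2748,291
    }
}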
Example #15
        private void TestOne(Random r, ByteRunAutomaton a, int startCode, int endCode, int iters)
        {
            // Verify correct ints are accepted
            int  nonSurrogateCount;
            bool ovSurStart;

            if (endCode < UnicodeUtil.UNI_SUR_HIGH_START || startCode > UnicodeUtil.UNI_SUR_LOW_END)
            {
                // no overlap w/ surrogates
                nonSurrogateCount = endCode - startCode + 1;
                ovSurStart        = false;
            }
            else if (IsSurrogate(startCode))
            {
                // start of range overlaps surrogates
                nonSurrogateCount = endCode - startCode + 1 - (UnicodeUtil.UNI_SUR_LOW_END - startCode + 1);
                ovSurStart        = false;
            }
            else if (IsSurrogate(endCode))
            {
                // end of range overlaps surrogates
                ovSurStart        = true;
                nonSurrogateCount = endCode - startCode + 1 - (endCode - UnicodeUtil.UNI_SUR_HIGH_START + 1);
            }
            else
            {
                // range completely subsumes surrogates
                ovSurStart        = true;
                nonSurrogateCount = endCode - startCode + 1 - (UnicodeUtil.UNI_SUR_LOW_END - UnicodeUtil.UNI_SUR_HIGH_START + 1);
            }

            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(nonSurrogateCount > 0);
            }

            for (int iter = 0; iter < iters; iter++)
            {
                // pick random code point in-range

                int code = startCode + r.Next(nonSurrogateCount);
                if (IsSurrogate(code))
                {
                    if (ovSurStart)
                    {
                        code = UnicodeUtil.UNI_SUR_LOW_END + 1 + (code - UnicodeUtil.UNI_SUR_HIGH_START);
                    }
                    else
                    {
                        code = UnicodeUtil.UNI_SUR_LOW_END + 1 + (code - startCode);
                    }
                }

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(code >= startCode && code <= endCode, () => "code=" + code + " start=" + startCode + " end=" + endCode);
                }
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(!IsSurrogate(code));
                }

                Assert.IsTrue(Matches(a, code), "DFA for range " + startCode + "-" + endCode + " failed to match code=" + code);
            }

            // Verify invalid ints are not accepted
            int invalidRange = MAX_UNICODE - (endCode - startCode + 1);

            if (invalidRange > 0)
            {
                for (int iter = 0; iter < iters; iter++)
                {
                    int x = TestUtil.NextInt32(r, 0, invalidRange - 1);
                    int code;
                    if (x >= startCode)
                    {
                        code = endCode + 1 + x - startCode;
                    }
                    else
                    {
                        code = x;
                    }
                    if ((code >= UnicodeUtil.UNI_SUR_HIGH_START && code <= UnicodeUtil.UNI_SUR_HIGH_END) || (code >= UnicodeUtil.UNI_SUR_LOW_START && code <= UnicodeUtil.UNI_SUR_LOW_END))
                    {
                        iter--;
                        continue;
                    }
                    Assert.IsFalse(Matches(a, code), "DFA for range " + startCode + "-" + endCode + " matched invalid code=" + code);
                }
            }
        }
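
A worked example of the nonSurrogateCount arithmetic above, with the UnicodeUtil constants inlined (the UTF-16 surrogate block is U+D800..U+DFFF); this is a standalone sketch, not part of the test class.

using System;

static class SurrogateCountSketch
{
    static void Main()
    {
        const int SurHighStart = 0xD800, SurLowEnd = 0xDFFF;
        int startCode = 0xD000, endCode = 0xE000;

        // This range completely subsumes the surrogates -> the last branch above:
        int nonSurrogateCount = endCode - startCode + 1 - (SurLowEnd - SurHighStart + 1);
        Console.WriteLine(nonSurrogateCount); // 0x801 = 2049 sampleable code points

        // Remapping: a sampled "code" landing in the surrogate block is shifted
        // past it, exactly as the ovSurStart branch does.
        int code = 0xD805; // would be a surrogate
        code = SurLowEnd + 1 + (code - SurHighStart);
        Console.WriteLine($"0x{code:X}"); // 0xE005, a valid non-surrogate
    }
}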
Example #16
            // Look for seek type 1 ("push"): if the newly added
            // suffix contains any S, we must try to seek to the
            // corresponding E.  If we find a match, we go there;
            // else we keep looking for additional S's in the new
            // suffix.  this "starts" the dance, at this character
            // position:
            private void DoPushes()
            {
                int upTo = newSuffixStart;

                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("  try push newSuffixStart=" + newSuffixStart + " scratchLen=" + scratchTerm.Length);
                }

                while (upTo < scratchTerm.Length)
                {
                    if (IsNonBMPChar(scratchTerm.Bytes, upTo) && (upTo > newSuffixStart || (upTo >= prevTerm.Length || (!IsNonBMPChar(prevTerm.Bytes, upTo) && !IsHighBMPChar(prevTerm.Bytes, upTo)))))
                    {
                        // A non-BMP char (4 bytes UTF8) starts here:
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(scratchTerm.Length >= upTo + 4);
                        }

                        int savLength = scratchTerm.Length;
                        scratch[0] = (sbyte)scratchTerm.Bytes[upTo];
                        scratch[1] = (sbyte)scratchTerm.Bytes[upTo + 1];
                        scratch[2] = (sbyte)scratchTerm.Bytes[upTo + 2];

                        scratchTerm.Bytes[upTo]     = (byte)UTF8_HIGH_BMP_LEAD;
                        scratchTerm.Bytes[upTo + 1] = 0x80;
                        scratchTerm.Bytes[upTo + 2] = 0x80;
                        scratchTerm.Length          = upTo + 3;

                        if (DEBUG_SURROGATES)
                        {
                            Console.WriteLine("    try seek 1 pos=" + upTo + " term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString()) + " " + scratchTerm.ToString() + " len=" + scratchTerm.Length);
                        }

                        // Seek "forward":
                        // TODO: more efficient seek?
                        outerInstance.TermsDict.SeekEnum(seekTermEnum, new Term(fieldInfo.Name, scratchTerm), true);

                        scratchTerm.Bytes[upTo]     = (byte)scratch[0];
                        scratchTerm.Bytes[upTo + 1] = (byte)scratch[1];
                        scratchTerm.Bytes[upTo + 2] = (byte)scratch[2];
                        scratchTerm.Length          = savLength;

                        // Did we find a match?
                        Term t2 = seekTermEnum.Term();

                        if (DEBUG_SURROGATES)
                        {
                            if (t2 == null)
                            {
                                Console.WriteLine("      hit term=null");
                            }
                            else
                            {
                                Console.WriteLine("      hit term=" + UnicodeUtil.ToHexString(t2.Text()) + " " + (t2 == null ? null : t2.Bytes));
                            }
                        }

                        // Since this was a seek "forward", we could hit
                        // EOF or a different field:
                        bool matches;

                        if (t2 != null && t2.Field == internedFieldName)
                        {
                            BytesRef b2 = t2.Bytes;
                            if (Debugging.AssertsEnabled)
                            {
                                Debugging.Assert(b2.Offset == 0);
                            }
                            if (b2.Length >= upTo + 3 && IsHighBMPChar(b2.Bytes, upTo))
                            {
                                matches = true;
                                for (int i = 0; i < upTo; i++)
                                {
                                    if (scratchTerm.Bytes[i] != b2.Bytes[i])
                                    {
                                        matches = false;
                                        break;
                                    }
                                }
                            }
                            else
                            {
                                matches = false;
                            }
                        }
                        else
                        {
                            matches = false;
                        }

                        if (matches)
                        {
                            if (DEBUG_SURROGATES)
                            {
                                Console.WriteLine("      matches!");
                            }

                            // OK seek "back"
                            // TODO: more efficient seek?
                            outerInstance.TermsDict.SeekEnum(termEnum, seekTermEnum.Term(), true);

                            scratchTerm.CopyBytes(seekTermEnum.Term().Bytes);

                            // +3 because we don't need to check the char
                            // at upTo: we know it's > BMP
                            upTo += 3;

                            // NOTE: we keep iterating, now, since this
                            // can easily "recurse".  Ie, after seeking
                            // forward at a certain char position, we may
                            // find another surrogate in our [new] suffix
                            // and must then do another seek (recurse)
                        }
                        else
                        {
                            upTo++;
                        }
                    }
                    else
                    {
                        upTo++;
                    }
                }
            }
Example #17
        internal override int Transition(int absState, int position, int vector)
        {
            // null absState should never be passed in
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(absState != -1);
            }

            // decode absState -> state, offset
            int state  = absState / (m_w + 1);
            int offset = absState % (m_w + 1);

            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(offset >= 0);
            }

            if (position == m_w)
            {
                if (state < 3)
                {
                    int loc = vector * 3 + state;
                    offset += Unpack(offsetIncrs0, loc, 1);
                    state   = Unpack(toStates0, loc, 2) - 1;
                }
            }
            else if (position == m_w - 1)
            {
                if (state < 5)
                {
                    int loc = vector * 5 + state;
                    offset += Unpack(offsetIncrs1, loc, 1);
                    state   = Unpack(toStates1, loc, 3) - 1;
                }
            }
            else if (position == m_w - 2)
            {
                if (state < 11)
                {
                    int loc = vector * 11 + state;
                    offset += Unpack(offsetIncrs2, loc, 2);
                    state   = Unpack(toStates2, loc, 4) - 1;
                }
            }
            else if (position == m_w - 3)
            {
                if (state < 21)
                {
                    int loc = vector * 21 + state;
                    offset += Unpack(offsetIncrs3, loc, 2);
                    state   = Unpack(toStates3, loc, 5) - 1;
                }
            }
            else if (position == m_w - 4)
            {
                if (state < 30)
                {
                    int loc = vector * 30 + state;
                    offset += Unpack(offsetIncrs4, loc, 3);
                    state   = Unpack(toStates4, loc, 5) - 1;
                }
            }
            else
            {
                if (state < 30)
                {
                    int loc = vector * 30 + state;
                    offset += Unpack(offsetIncrs5, loc, 3);
                    state   = Unpack(toStates5, loc, 5) - 1;
                }
            }

            if (state == -1)
            {
                // null state
                return(-1);
            }
            else
            {
                // translate back to abs
                return(state * (m_w + 1) + offset);
            }
        }
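
The absState packing decoded at the top of Transition is a plain mixed-radix encoding: absState = state * (m_w + 1) + offset, with offset ranging over 0..m_w. A tiny round-trip sketch of just that packing (names illustrative):

using System;

static class AbsStateSketch
{
    static int Encode(int state, int offset, int w) => state * (w + 1) + offset;

    static (int state, int offset) Decode(int absState, int w)
        => (absState / (w + 1), absState % (w + 1));

    static void Main()
    {
        int w = 7; // word length, so offsets range over 0..w
        int abs = Encode(state: 4, offset: 3, w: w);
        var (state, offset) = Decode(abs, w);
        Console.WriteLine($"{abs} -> state={state} offset={offset}"); // 35 -> state=4 offset=3
    }
}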
Example #18
            public override SeekStatus SeekCeil(BytesRef term)
            {
                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("TE.seek target=" + UnicodeUtil.ToHexString(term.Utf8ToString()));
                }
                skipNext = false;
                TermInfosReader tis = outerInstance.TermsDict;
                Term            t0  = new Term(fieldInfo.Name, term);

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(termEnum != null);
                }

                tis.SeekEnum(termEnum, t0, false);

                Term t = termEnum.Term();

                if (t != null && t.Field == internedFieldName && term.BytesEquals(t.Bytes))
                {
                    // If we found an exact match, no need to do the
                    // surrogate dance
                    if (DEBUG_SURROGATES)
                    {
                        Console.WriteLine("  seek exact match");
                    }
                    current = t.Bytes;
                    return(SeekStatus.FOUND);
                }
                else if (t == null || t.Field != internedFieldName)
                {
                    // TODO: maybe we can handle this like the next()
                    // into null?  set term as prevTerm then dance?

                    if (DEBUG_SURROGATES)
                    {
                        Console.WriteLine("  seek hit EOF");
                    }

                    // We hit EOF; try end-case surrogate dance: if we
                    // find an E, try swapping in S, backwards:
                    scratchTerm.CopyBytes(term);

                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(scratchTerm.Offset == 0);
                    }

                    for (int i = scratchTerm.Length - 1; i >= 0; i--)
                    {
                        if (IsHighBMPChar(scratchTerm.Bytes, i))
                        {
                            if (DEBUG_SURROGATES)
                            {
                                Console.WriteLine("    found E pos=" + i + "; try seek");
                            }

                            if (SeekToNonBMP(seekTermEnum, scratchTerm, i))
                            {
                                scratchTerm.CopyBytes(seekTermEnum.Term().Bytes);
                                outerInstance.TermsDict.SeekEnum(termEnum, seekTermEnum.Term(), false);

                                newSuffixStart = 1 + i;

                                DoPushes();

                                // Found a match
                                // TODO: faster seek?
                                current = termEnum.Term().Bytes;
                                return(SeekStatus.NOT_FOUND);
                            }
                        }
                    }

                    if (DEBUG_SURROGATES)
                    {
                        Console.WriteLine("  seek END");
                    }

                    current = null;
                    return(SeekStatus.END);
                }
                else
                {
                    // We found a non-exact but non-null term; this one
                    // is fun -- just treat it like next, by pretending
                    // requested term was prev:
                    prevTerm.CopyBytes(term);

                    if (DEBUG_SURROGATES)
                    {
                        Console.WriteLine("  seek hit non-exact term=" + UnicodeUtil.ToHexString(t.Text()));
                    }

                    BytesRef br = t.Bytes;
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(br.Offset == 0);
                    }

                    SetNewSuffixStart(term, br);

                    SurrogateDance();

                    Term t2 = termEnum.Term();
                    if (t2 == null || t2.Field != internedFieldName)
                    {
                        // PreFlex codec interns field names; verify:
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(t2 == null || !t2.Field.Equals(internedFieldName, StringComparison.Ordinal));
                        }
                        current = null;
                        return(SeekStatus.END);
                    }
                    else
                    {
                        current = t2.Bytes;
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(!unicodeSortOrder || term.CompareTo(current) < 0, () => "term=" + UnicodeUtil.ToHexString(term.Utf8ToString()) + " vs current=" + UnicodeUtil.ToHexString(current.Utf8ToString()));
                        }
                        return(SeekStatus.NOT_FOUND);
                    }
                }
            }
Example #19
        private void Build(State start, State end, UTF8Sequence startUTF8, UTF8Sequence endUTF8, int upto)
        {
            // Break into start, middle, end:
            if (startUTF8.ByteAt(upto) == endUTF8.ByteAt(upto))
            {
                // Degen case: lead with the same byte:
                if (upto == startUTF8.len - 1 && upto == endUTF8.len - 1)
                {
                    // Super degen: just single edge, one UTF8 byte:
                    start.AddTransition(new Transition(startUTF8.ByteAt(upto), endUTF8.ByteAt(upto), end));
                    return;
                }
                else
                {
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(startUTF8.len > upto + 1);
                        Debugging.Assert(endUTF8.len > upto + 1);
                    }
                    State n = NewUTF8State();

                    // Single value leading edge
                    start.AddTransition(new Transition(startUTF8.ByteAt(upto), n)); // type=single

                    // Recurse for the rest
                    Build(n, end, startUTF8, endUTF8, 1 + upto);
                }
            }
            else if (startUTF8.len == endUTF8.len)
            {
                if (upto == startUTF8.len - 1)
                {
                    start.AddTransition(new Transition(startUTF8.ByteAt(upto), endUTF8.ByteAt(upto), end)); // type=startend
                }
                else
                {
                    Start(start, end, startUTF8, upto, false);
                    if (endUTF8.ByteAt(upto) - startUTF8.ByteAt(upto) > 1)
                    {
                        // There is a middle
                        All(start, end, startUTF8.ByteAt(upto) + 1, endUTF8.ByteAt(upto) - 1, startUTF8.len - upto - 1);
                    }
                    End(start, end, endUTF8, upto, false);
                }
            }
            else
            {
                // start
                Start(start, end, startUTF8, upto, true);

                // possibly middle, spanning multiple num bytes
                int byteCount = 1 + startUTF8.len - upto;
                int limit     = endUTF8.len - upto;
                while (byteCount < limit)
                {
                    // wasteful: we only need the first byte, and we should
                    // statically encode this first byte:
                    tmpUTF8a.Set(startCodes[byteCount - 1]);
                    tmpUTF8b.Set(endCodes[byteCount - 1]);
                    All(start, end, tmpUTF8a.ByteAt(0), tmpUTF8b.ByteAt(0), tmpUTF8a.len - 1);
                    byteCount++;
                }

                // end
                End(start, end, endUTF8, upto, true);
            }
        }
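
Build splits a code point range on the boundaries where the UTF-8 byte length changes (0x80, 0x800, 0x10000), so any [start, end] range decomposes into at most four same-length subranges handled by Start/All/End. A standalone sketch of that boundary walk, under the assumption that only the length thresholds matter here:

using System;

static class Utf8LenSketch
{
    static int Utf8Len(int cp) =>
        cp < 0x80 ? 1 : cp < 0x800 ? 2 : cp < 0x10000 ? 3 : 4;

    static void Main()
    {
        int start = 0x7F0, end = 0x10010; // spans 2-, 3- and 4-byte encodings
        int lo = start;
        while (lo <= end)
        {
            int len = Utf8Len(lo);
            // hi = last code point that still encodes with `len` bytes
            int hi = Math.Min(end, len == 1 ? 0x7F : len == 2 ? 0x7FF : len == 3 ? 0xFFFF : 0x10FFFF);
            Console.WriteLine($"U+{lo:X4}..U+{hi:X4}: {len} bytes");
            lo = hi + 1;
        }
    }
}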
Example #20
        internal void MarkForFullFlush()
        {
            DocumentsWriterDeleteQueue flushingQueue;

            lock (this)
            {
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(!fullFlush, "called DWFC#markForFullFlush() while full flush is still running");
                    Debugging.Assert(fullFlushBuffer.Count == 0, () => "full flush buffer should be empty: " + fullFlushBuffer);
                }
                fullFlush     = true;
                flushingQueue = documentsWriter.deleteQueue;
                // Set a new delete queue - all subsequent DWPT will use this queue until
                // we do another full flush
                DocumentsWriterDeleteQueue newQueue = new DocumentsWriterDeleteQueue(flushingQueue.generation + 1);
                documentsWriter.deleteQueue = newQueue;
            }
            int limit = perThreadPool.NumThreadStatesActive;

            for (int i = 0; i < limit; i++)
            {
                ThreadState next = perThreadPool.GetThreadState(i);
                next.@Lock();
                try
                {
                    if (!next.IsInitialized)
                    {
                        if (closed && next.IsActive)
                        {
                            perThreadPool.DeactivateThreadState(next);
                        }
                        continue;
                    }
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(next.dwpt.deleteQueue == flushingQueue || next.dwpt.deleteQueue == documentsWriter.deleteQueue, () => " flushingQueue: " + flushingQueue + " currentqueue: " + documentsWriter.deleteQueue + " perThread queue: " + next.dwpt.deleteQueue + " numDocsInRam: " + next.dwpt.NumDocsInRAM);
                    }
                    if (next.dwpt.deleteQueue != flushingQueue)
                    {
                        // this one is already a new DWPT
                        continue;
                    }
                    AddFlushableState(next);
                }
                finally
                {
                    next.Unlock();
                }
            }
            lock (this)
            {
                /* Make sure all DWPTs that were concurrently marked as pending
                 * and moved to blocked are moved over to the flushQueue. There is
                 * a chance this happens since we mark DWPTs for full flush without
                 * blocking indexing. */
                PruneBlockedQueue(flushingQueue);
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(AssertBlockedFlushes(documentsWriter.deleteQueue));
                }
                //FlushQueue.AddAll(FullFlushBuffer);
                foreach (var dwpt in fullFlushBuffer)
                {
                    flushQueue.Enqueue(dwpt);
                }
                fullFlushBuffer.Clear();
                UpdateStallState();
            }
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(AssertActiveDeleteQueue(documentsWriter.deleteQueue));
            }
        }
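
The core of the handoff above: swap in a fresh delete queue while holding the lock, then drain the per-thread states against the old queue outside it, so indexing threads are never fully blocked. A minimal sketch of that pattern (hypothetical names, with strings standing in for buffered deletes):

using System;
using System.Collections.Generic;

class FullFlushSketch
{
    private readonly object syncLock = new object();
    private Queue<string> deleteQueue = new Queue<string>(); // live queue indexing threads append to

    public void Delete(string term) { lock (syncLock) deleteQueue.Enqueue(term); }

    public Queue<string> MarkForFullFlush()
    {
        Queue<string> flushingQueue;
        lock (syncLock)
        {
            // Everything buffered so far belongs to this flush; later
            // deletes land in the fresh queue, so indexing never blocks.
            flushingQueue = deleteQueue;
            deleteQueue = new Queue<string>();
        }
        return flushingQueue; // drained outside the lock, like the loop above
    }

    static void Main()
    {
        var w = new FullFlushSketch();
        w.Delete("a"); w.Delete("b");
        Console.WriteLine(w.MarkForFullFlush().Count); // 2
        w.Delete("c");
        Console.WriteLine(w.MarkForFullFlush().Count); // 1
    }
}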
Example #21
        private IDictionary <int, object> HighlightField(string field, string[] contents, BreakIterator bi, BytesRef[] terms, int[] docids, IList <AtomicReaderContext> leaves, int maxPassages, Query query)
        {
            IDictionary <int, object> highlights = new Dictionary <int, object>();

            PassageFormatter fieldFormatter = GetFormatter(field);

            if (fieldFormatter == null)
            {
                throw new NullReferenceException("PassageFormatter cannot be null");
            }

            // check if we should do any multiterm processing
            Analyzer analyzer = GetIndexAnalyzer(field);

            CharacterRunAutomaton[] automata = Arrays.Empty<CharacterRunAutomaton>();
            if (analyzer != null)
            {
                automata = MultiTermHighlighting.ExtractAutomata(query, field);
            }

            // resize 'terms', where the last term is the multiterm matcher
            if (automata.Length > 0)
            {
                BytesRef[] newTerms = new BytesRef[terms.Length + 1];
                System.Array.Copy(terms, 0, newTerms, 0, terms.Length);
                terms = newTerms;
            }

            // we are processing in increasing docid order, so we only need to reinitialize stuff on segment changes
            // otherwise, we will just advance() existing enums to the new document in the same segment.
            DocsAndPositionsEnum[] postings = null;
            TermsEnum termsEnum             = null;
            int       lastLeaf = -1;

            for (int i = 0; i < docids.Length; i++)
            {
                string content = contents[i];
                if (content.Length == 0)
                {
                    continue; // nothing to do
                }
                bi.SetText(content);
                int doc  = docids[i];
                int leaf = ReaderUtil.SubIndex(doc, leaves);
                AtomicReaderContext subContext = leaves[leaf];
                AtomicReader        r          = subContext.AtomicReader;

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(leaf >= lastLeaf);                           // increasing order
                }
                // if the segment has changed, we must initialize new enums.
                if (leaf != lastLeaf)
                {
                    Terms t = r.GetTerms(field);
                    if (t != null)
                    {
                        termsEnum = t.GetEnumerator();
                        postings  = new DocsAndPositionsEnum[terms.Length];
                    }
                }
                if (termsEnum == null)
                {
                    continue; // no terms for this field, nothing to do
                }

                // if there are multi-term matches, we have to initialize the "fake" enum for each document
                if (automata.Length > 0)
                {
                    DocsAndPositionsEnum dp = MultiTermHighlighting.GetDocsEnum(analyzer.GetTokenStream(field, content), automata);
                    dp.Advance(doc - subContext.DocBase);
                    postings[terms.Length - 1] = dp; // last term is the multiterm matcher
                }

                Passage[] passages = HighlightDoc(field, terms, content.Length, bi, doc - subContext.DocBase, termsEnum, postings, maxPassages);

                if (passages.Length == 0)
                {
                    // no passages were returned, so ask for a default summary
                    passages = GetEmptyHighlight(field, bi, maxPassages);
                }

                if (passages.Length > 0)
                {
                    highlights[doc] = fieldFormatter.Format(passages, content);
                }

                lastLeaf = leaf;
            }

            return(highlights);
        }
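
ReaderUtil.SubIndex(doc, leaves) maps a global doc ID to the leaf (segment) that owns it; conceptually it is a binary search over each leaf's docBase. A standalone sketch under that assumption:

using System;

static class SubIndexSketch
{
    static int SubIndex(int doc, int[] docBases)
    {
        int idx = Array.BinarySearch(docBases, doc);
        // Exact hit: doc is the first doc of that leaf. Otherwise BinarySearch
        // returns ~insertionPoint; the owning leaf is the one just before it.
        return idx >= 0 ? idx : ~idx - 1;
    }

    static void Main()
    {
        int[] docBases = { 0, 100, 250 }; // three leaves
        Console.WriteLine(SubIndex(99, docBases));  // 0
        Console.WriteLine(SubIndex(100, docBases)); // 1
        Console.WriteLine(SubIndex(300, docBases)); // 2
    }
}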
Example #22
            public override int NextPosition()
            {
                if (lazyProxPointer != -1)
                {
                    proxIn.Seek(lazyProxPointer);
                    lazyProxPointer = -1;
                }

                if (payloadPending && payloadLength > 0)
                {
                    // payload of last position was never retrieved -- skip it
                    proxIn.Seek(proxIn.Position + payloadLength); // LUCENENET specific: Renamed from getFilePointer() to match FileStream
                    payloadPending = false;
                }

                // scan over any docs that were iterated without their positions
                while (posPendingCount > freq)
                {
                    int code = proxIn.ReadVInt32();

                    if (storePayloads)
                    {
                        if ((code & 1) != 0)
                        {
                            // new payload length
                            payloadLength = proxIn.ReadVInt32();
                            if (Debugging.AssertsEnabled)
                            {
                                Debugging.Assert(payloadLength >= 0);
                            }
                        }
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(payloadLength != -1);
                        }
                    }

                    if (storeOffsets)
                    {
                        if ((proxIn.ReadVInt32() & 1) != 0)
                        {
                            // new offset length
                            offsetLength = proxIn.ReadVInt32();
                        }
                    }

                    if (storePayloads)
                    {
                        proxIn.Seek(proxIn.Position + payloadLength); // LUCENENET specific: Renamed from getFilePointer() to match FileStream
                    }

                    posPendingCount--;
                    position       = 0;
                    startOffset    = 0;
                    payloadPending = false;
                    //System.out.println("StandardR.D&PE skipPos");
                }

                // read next position
                if (payloadPending && payloadLength > 0)
                {
                    // payload wasn't retrieved for last position
                    proxIn.Seek(proxIn.Position + payloadLength); // LUCENENET specific: Renamed from getFilePointer() to match FileStream
                }

                int code_ = proxIn.ReadVInt32();

                if (storePayloads)
                {
                    if ((code_ & 1) != 0)
                    {
                        // new payload length
                        payloadLength = proxIn.ReadVInt32();
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(payloadLength >= 0);
                        }
                    }
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(payloadLength != -1);
                    }

                    payloadPending = true;
                    code_          = code_.TripleShift(1);
                }
                position += code_;

                if (storeOffsets)
                {
                    int offsetCode = proxIn.ReadVInt32();
                    if ((offsetCode & 1) != 0)
                    {
                        // new offset length
                        offsetLength = proxIn.ReadVInt32();
                    }
                    startOffset += offsetCode.TripleShift(1);
                }

                posPendingCount--;

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(posPendingCount >= 0, "NextPosition() was called too many times (more than Freq times) posPendingCount={0}", posPendingCount);
                }

                //System.out.println("StandardR.D&PE nextPos   return pos=" + position);
                return(position);
            }
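
The decode loop above relies on a low-bit-flag convention in the prox stream: each VInt is the position delta shifted left one bit, and a set low bit means a new payload length follows. A self-contained sketch of just that decoding, over an illustrative in-memory encoding rather than the on-disk .prx format:

using System;

static class ProxDecodeSketch
{
    static void Main()
    {
        // Encoded stream for positions 5, 9, 9: (delta << 1 | payloadFlag) codes,
        // with a payload length accompanying each code whose flag is set.
        var codes = new (int code, int newPayloadLen)[] { (5 << 1 | 1, 3), (4 << 1, 0), (0 << 1 | 1, 7) };

        int position = 0, payloadLength = 0;
        foreach (var (code, newPayloadLen) in codes)
        {
            if ((code & 1) != 0)
                payloadLength = newPayloadLen; // low bit set -> new payload length
            position += code >> 1;             // same as code.TripleShift(1) for non-negative codes
            Console.WriteLine($"pos={position} payloadLen={payloadLength}");
        }
        // prints: pos=5 payloadLen=3 / pos=9 payloadLen=3 / pos=9 payloadLen=7
    }
}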
Example #23
        /// <summary>
        /// Safe (but slowish) default method to write every
        /// vector field in the document.
        /// </summary>
        protected void AddAllDocVectors(Fields vectors, MergeState mergeState)
        {
            if (vectors == null)
            {
                StartDocument(0);
                FinishDocument();
                return;
            }

            int numFields = vectors.Count;

            if (numFields == -1)
            {
                // count manually! TODO: Maybe enforce that Fields.size() returns something valid?
                numFields = 0;
                //for (IEnumerator<string> it = vectors.Iterator(); it.hasNext();)
                foreach (string it in vectors)
                {
                    numFields++;
                }
            }
            StartDocument(numFields);

            string lastFieldName = null;

            TermsEnum            termsEnum            = null;
            DocsAndPositionsEnum docsAndPositionsEnum = null;

            int fieldCount = 0;

            foreach (string fieldName in vectors)
            {
                fieldCount++;
                FieldInfo fieldInfo = mergeState.FieldInfos.FieldInfo(fieldName);

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(lastFieldName == null || fieldName.CompareToOrdinal(lastFieldName) > 0, "lastFieldName={0} fieldName={1}", lastFieldName, fieldName);
                }
                lastFieldName = fieldName;

                Terms terms = vectors.GetTerms(fieldName);
                if (terms == null)
                {
                    // FieldsEnum shouldn't lie...
                    continue;
                }

                bool hasPositions = terms.HasPositions;
                bool hasOffsets   = terms.HasOffsets;
                bool hasPayloads  = terms.HasPayloads;
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(!hasPayloads || hasPositions);
                }

                int numTerms = (int)terms.Count;
                if (numTerms == -1)
                {
                    // count manually. It is stupid, but needed, as Terms.size() is not a mandatory statistics function
                    numTerms  = 0;
                    termsEnum = terms.GetEnumerator(termsEnum);
                    while (termsEnum.MoveNext())
                    {
                        numTerms++;
                    }
                }

                StartField(fieldInfo, numTerms, hasPositions, hasOffsets, hasPayloads);
                termsEnum = terms.GetEnumerator(termsEnum);

                int termCount = 0;
                while (termsEnum.MoveNext())
                {
                    termCount++;

                    int freq = (int)termsEnum.TotalTermFreq;

                    StartTerm(termsEnum.Term, freq);

                    if (hasPositions || hasOffsets)
                    {
                        docsAndPositionsEnum = termsEnum.DocsAndPositions(null, docsAndPositionsEnum);
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(docsAndPositionsEnum != null);
                        }

                        int docID = docsAndPositionsEnum.NextDoc();
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(docID != DocIdSetIterator.NO_MORE_DOCS);
                            Debugging.Assert(docsAndPositionsEnum.Freq == freq);
                        }

                        for (int posUpto = 0; posUpto < freq; posUpto++)
                        {
                            int pos         = docsAndPositionsEnum.NextPosition();
                            int startOffset = docsAndPositionsEnum.StartOffset;
                            int endOffset   = docsAndPositionsEnum.EndOffset;

                            BytesRef payload = docsAndPositionsEnum.GetPayload();

                            if (Debugging.AssertsEnabled)
                            {
                                Debugging.Assert(!hasPositions || pos >= 0);
                            }
                            AddPosition(pos, startOffset, endOffset, payload);
                        }
                    }
                    FinishTerm();
                }
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(termCount == numTerms);
                }
                FinishField();
            }
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(fieldCount == numFields);
            }
            FinishDocument();
        }
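
The merge loop above depends on a strict call-ordering contract on the writer: StartDocument, then per field StartField, per term StartTerm and AddPosition, each closed by its Finish* call. A toy writer that makes the nesting visible (hypothetical class, not the codec API):

using System;

class ConsoleVectorsWriter
{
    public void StartDocument(int numFields) => Console.WriteLine($"doc({numFields} fields)");
    public void StartField(string name, int numTerms) => Console.WriteLine($"  field {name} ({numTerms} terms)");
    public void StartTerm(string term, int freq) => Console.WriteLine($"    term {term} x{freq}");
    public void AddPosition(int pos) => Console.WriteLine($"      pos {pos}");
    public void FinishTerm() { }
    public void FinishField() { }
    public void FinishDocument() => Console.WriteLine("done");

    static void Main()
    {
        var w = new ConsoleVectorsWriter();
        w.StartDocument(1);
        w.StartField("body", 1);
        w.StartTerm("lucene", 2);
        w.AddPosition(3); w.AddPosition(17);
        w.FinishTerm(); w.FinishField(); w.FinishDocument();
    }
}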
Example #24
        public override void Warm(AtomicReader reader)
        {
            long startTime      = J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond; // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results
            int  indexedCount   = 0;
            int  docValuesCount = 0;
            int  normsCount     = 0;

            foreach (FieldInfo info in reader.FieldInfos)
            {
                if (info.IsIndexed)
                {
                    reader.GetTerms(info.Name);
                    indexedCount++;

                    if (info.HasNorms)
                    {
                        reader.GetNormValues(info.Name);
                        normsCount++;
                    }
                }

                if (info.HasDocValues)
                {
                    switch (info.DocValuesType)
                    {
                    case DocValuesType.NUMERIC:
                        reader.GetNumericDocValues(info.Name);
                        break;

                    case DocValuesType.BINARY:
                        reader.GetBinaryDocValues(info.Name);
                        break;

                    case DocValuesType.SORTED:
                        reader.GetSortedDocValues(info.Name);
                        break;

                    case DocValuesType.SORTED_SET:
                        reader.GetSortedSetDocValues(info.Name);
                        break;

                    default:
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(false);                               // unknown dv type
                        }
                        break;
                    }
                    docValuesCount++;
                }
            }

            reader.Document(0);
            reader.GetTermVectors(0);

            if (infoStream.IsEnabled("SMSW"))
            {
                infoStream.Message("SMSW",
                                   "Finished warming segment: " + reader +
                                   ", indexed=" + indexedCount +
                                   ", docValues=" + docValuesCount +
                                   ", norms=" + normsCount +
                                   ", time=" + ((J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond) - startTime)); // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results
            }
        }
Example #25
        // for debugging

        /*
         * private String toString(BytesRef b) {
         * try {
         *  return b.utf8ToString() + " " + b;
         * } catch (Throwable t) {
         *  return b.toString();
         * }
         * }
         */

        /// <summary>
        /// It's OK to add the same input twice in a row with
        /// different outputs, as long as the outputs type implements the merge
        /// method. Note that the input is fully consumed after this
        /// method returns (so the caller is free to reuse it), but the
        /// output is not.  So if your outputs are changeable (e.g.
        /// <see cref="ByteSequenceOutputs"/> or
        /// <see cref="Int32SequenceOutputs"/>) then you cannot reuse them across
        /// calls.
        /// </summary>
        public virtual void Add(Int32sRef input, T output)
        {
            /*
             * if (DEBUG) {
             * BytesRef b = new BytesRef(input.length);
             * for(int x=0;x<input.length;x++) {
             *  b.bytes[x] = (byte) input.ints[x];
             * }
             * b.length = input.length;
             * if (output == NO_OUTPUT) {
             *  System.out.println("\nFST ADD: input=" + toString(b) + " " + b);
             * } else {
             *  System.out.println("\nFST ADD: input=" + toString(b) + " " + b + " output=" + fst.outputs.outputToString(output));
             * }
             * }
             */

            // De-dup NO_OUTPUT since it must be a singleton:
            if (output.Equals(NO_OUTPUT))
            {
                output = NO_OUTPUT;
            }

            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(lastInput.Length == 0 || input.CompareTo(lastInput) >= 0, () => "inputs are added out of order lastInput=" + lastInput + " vs input=" + input);
                Debugging.Assert(ValidOutput(output));
            }

            //System.out.println("\nadd: " + input);
            if (input.Length == 0)
            {
                // empty input: only allowed as first input.  we have
                // to special case this because the packed FST
                // format cannot represent the empty input since
                // 'finalness' is stored on the incoming arc, not on
                // the node
                frontier[0].InputCount++;
                frontier[0].IsFinal = true;
                fst.EmptyOutput     = output;
                return;
            }

            // compare shared prefix length
            int pos1     = 0;
            int pos2     = input.Offset;
            int pos1Stop = Math.Min(lastInput.Length, input.Length);

            while (true)
            {
                frontier[pos1].InputCount++;
                //System.out.println("  incr " + pos1 + " ct=" + frontier[pos1].inputCount + " n=" + frontier[pos1]);
                if (pos1 >= pos1Stop || lastInput.Int32s[pos1] != input.Int32s[pos2])
                {
                    break;
                }
                pos1++;
                pos2++;
            }
            int prefixLenPlus1 = pos1 + 1;

            if (frontier.Length < input.Length + 1)
            {
                UnCompiledNode<T>[] next = new UnCompiledNode<T>[ArrayUtil.Oversize(input.Length + 1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
                Array.Copy(frontier, 0, next, 0, frontier.Length);
                for (int idx = frontier.Length; idx < next.Length; idx++)
                {
                    next[idx] = new UnCompiledNode<T>(this, idx);
                }
                frontier = next;
            }

            // minimize/compile states from previous input's
            // orphan'd suffix
            DoFreezeTail(prefixLenPlus1);

            // init tail states for current input
            for (int idx = prefixLenPlus1; idx <= input.Length; idx++)
            {
                frontier[idx - 1].AddArc(input.Int32s[input.Offset + idx - 1], frontier[idx]);
                frontier[idx].InputCount++;
            }

            UnCompiledNode<T> lastNode = frontier[input.Length];

            if (lastInput.Length != input.Length || prefixLenPlus1 != input.Length + 1)
            {
                lastNode.IsFinal = true;
                lastNode.Output  = NO_OUTPUT;
            }

            // push conflicting outputs forward, only as far as
            // needed
            for (int idx = 1; idx < prefixLenPlus1; idx++)
            {
                UnCompiledNode<T> node       = frontier[idx];
                UnCompiledNode<T> parentNode = frontier[idx - 1];

                T lastOutput = parentNode.GetLastOutput(input.Int32s[input.Offset + idx - 1]);
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(ValidOutput(lastOutput));
                }

                T commonOutputPrefix;
                T wordSuffix;

                if (!lastOutput.Equals(NO_OUTPUT))
                {
                    commonOutputPrefix = fst.Outputs.Common(output, lastOutput);
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(ValidOutput(commonOutputPrefix));
                    }
                    wordSuffix = fst.Outputs.Subtract(lastOutput, commonOutputPrefix);
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(ValidOutput(wordSuffix));
                    }
                    parentNode.SetLastOutput(input.Int32s[input.Offset + idx - 1], commonOutputPrefix);
                    node.PrependOutput(wordSuffix);
                }
                else
                {
                    commonOutputPrefix = /*wordSuffix =*/ NO_OUTPUT; // LUCENENET: Removed unnecessary assignment
                }

                output = fst.Outputs.Subtract(output, commonOutputPrefix);
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(ValidOutput(output));
                }
            }

            if (lastInput.Length == input.Length && prefixLenPlus1 == 1 + input.Length)
            {
                // same input more than 1 time in a row, mapping to
                // multiple outputs
                lastNode.Output = fst.Outputs.Merge(lastNode.Output, output);
            }
            else
            {
                // this new arc is private to this new input; set its
                // arc output to the leftover output:
                frontier[prefixLenPlus1 - 1].SetLastOutput(input.Int32s[input.Offset + prefixLenPlus1 - 1], output);
            }

            // save last input
            lastInput.CopyInt32s(input);

            //System.out.println("  count[0]=" + frontier[0].inputCount);
        }
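
The pos1/pos2 loop near the top of Add computes the length of the prefix shared with the previous input: only the suffix past that point needs new states, and the previous input's tail beyond it can be frozen. A standalone sketch of just the prefix step:

using System;

static class PrefixSketch
{
    static int CommonPrefixLen(int[] a, int[] b)
    {
        int stop = Math.Min(a.Length, b.Length), i = 0;
        while (i < stop && a[i] == b[i]) i++; // same loop shape as pos1/pos2 above
        return i;
    }

    static void Main()
    {
        int[] last = { 's', 't', 'o', 'p' };
        int[] next = { 's', 't', 'o', 'r', 'e' };
        int prefixLen = CommonPrefixLen(last, next);
        Console.WriteLine(prefixLen); // 3 -> freeze the "p" tail, then add "re"
    }
}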
Example #26
            public virtual void _run()
            {
                for (int iter = 0; iter < NUM_TEST_ITER; iter++)
                {
                    FieldData field     = fields[Random.Next(fields.Length)];
                    TermsEnum termsEnum = termsDict.GetTerms(field.fieldInfo.Name).GetEnumerator();
#pragma warning disable 612, 618
                    if (si.Codec is Lucene3xCodec)
#pragma warning restore 612, 618
                    {
                        // code below expects unicode sort order
                        continue;
                    }

                    int upto = 0;
                    // Test straight enum of the terms:
                    while (termsEnum.MoveNext())
                    {
                        BytesRef term     = termsEnum.Term;
                        BytesRef expected = new BytesRef(field.terms[upto++].text2);
                        Assert.IsTrue(expected.BytesEquals(term), "expected=" + expected + " vs actual " + term);
                    }
                    Assert.AreEqual(upto, field.terms.Length);

                    // Test random seek:
                    TermData             term2  = field.terms[Random.Next(field.terms.Length)];
                    TermsEnum.SeekStatus status = termsEnum.SeekCeil(new BytesRef(term2.text2));
                    Assert.AreEqual(status, TermsEnum.SeekStatus.FOUND);
                    Assert.AreEqual(term2.docs.Length, termsEnum.DocFreq);
                    if (field.omitTF)
                    {
                        this.VerifyDocs(term2.docs, term2.positions, TestUtil.Docs(Random, termsEnum, null, null, DocsFlags.NONE), false);
                    }
                    else
                    {
                        this.VerifyDocs(term2.docs, term2.positions, termsEnum.DocsAndPositions(null, null), true);
                    }

                    // Test random seek by ord:
                    int idx = Random.Next(field.terms.Length);
                    term2 = field.terms[idx];
                    bool success = false;
                    try
                    {
                        termsEnum.SeekExact(idx);
                        success = true;
                    }
                    catch (Exception uoe) when(uoe.IsUnsupportedOperationException())
                    {
                        // ok -- skip it
                    }
                    if (success)
                    {
                        Assert.AreEqual(status, TermsEnum.SeekStatus.FOUND);
                        Assert.IsTrue(termsEnum.Term.BytesEquals(new BytesRef(term2.text2)));
                        Assert.AreEqual(term2.docs.Length, termsEnum.DocFreq);
                        if (field.omitTF)
                        {
                            this.VerifyDocs(term2.docs, term2.positions, TestUtil.Docs(Random, termsEnum, null, null, DocsFlags.NONE), false);
                        }
                        else
                        {
                            this.VerifyDocs(term2.docs, term2.positions, termsEnum.DocsAndPositions(null, null), true);
                        }
                    }

                    // Test seek to non-existent terms:
                    if (Verbose)
                    {
                        Console.WriteLine("TEST: seek non-exist terms");
                    }
                    for (int i = 0; i < 100; i++)
                    {
                        string text2 = TestUtil.RandomUnicodeString(Random) + ".";
                        status = termsEnum.SeekCeil(new BytesRef(text2));
                        Assert.IsTrue(status == TermsEnum.SeekStatus.NOT_FOUND || status == TermsEnum.SeekStatus.END);
                    }

                    // Seek to each term, backwards:
                    if (Verbose)
                    {
                        Console.WriteLine("TEST: seek terms backwards");
                    }
                    for (int i = field.terms.Length - 1; i >= 0; i--)
                    {
                        Assert.AreEqual(TermsEnum.SeekStatus.FOUND, termsEnum.SeekCeil(new BytesRef(field.terms[i].text2)), Thread.CurrentThread.Name + ": field=" + field.fieldInfo.Name + " term=" + field.terms[i].text2);
                        Assert.AreEqual(field.terms[i].docs.Length, termsEnum.DocFreq);
                    }

                    // Seek to each term by ord, backwards
                    for (int i = field.terms.Length - 1; i >= 0; i--)
                    {
                        try
                        {
                            termsEnum.SeekExact(i);
                            Assert.AreEqual(field.terms[i].docs.Length, termsEnum.DocFreq);
                            Assert.IsTrue(termsEnum.Term.BytesEquals(new BytesRef(field.terms[i].text2)));
                        }
                        catch (Exception uoe) when (uoe.IsUnsupportedOperationException())
                        {
                        }
                    }

                    // Seek to non-existent empty-string term
                    status = termsEnum.SeekCeil(new BytesRef(""));
                    Assert.IsNotNull(status);
                    //Assert.AreEqual(TermsEnum.SeekStatus.NOT_FOUND, status);

                    // Make sure we're now pointing to first term
                    Assert.IsTrue(termsEnum.Term.BytesEquals(new BytesRef(field.terms[0].text2)));

                    // Test docs enum
                    termsEnum.SeekCeil(new BytesRef(""));
                    upto = 0;
                    do
                    {
                        term2 = field.terms[upto];
                        if (Random.Next(3) == 1)
                        {
                            DocsEnum             docs;
                            DocsEnum             docsAndFreqs;
                            DocsAndPositionsEnum postings;
                            if (!field.omitTF)
                            {
                                postings = termsEnum.DocsAndPositions(null, null);
                                if (postings != null)
                                {
                                    docs = docsAndFreqs = postings;
                                }
                                else
                                {
                                    docs = docsAndFreqs = TestUtil.Docs(Random, termsEnum, null, null, DocsFlags.FREQS);
                                }
                            }
                            else
                            {
                                postings     = null;
                                docsAndFreqs = null;
                                docs         = TestUtil.Docs(Random, termsEnum, null, null, DocsFlags.NONE);
                            }
                            Assert.IsNotNull(docs);
                            int  upto2 = -1;
                            bool ended = false;
                            while (upto2 < term2.docs.Length - 1)
                            {
                                // Maybe skip:
                                int left = term2.docs.Length - upto2;
                                int doc;
                                if (Random.Next(3) == 1 && left >= 1)
                                {
                                    int inc = 1 + Random.Next(left - 1);
                                    upto2 += inc;
                                    if (Random.Next(2) == 1)
                                    {
                                        doc = docs.Advance(term2.docs[upto2]);
                                        Assert.AreEqual(term2.docs[upto2], doc);
                                    }
                                    else
                                    {
                                        doc = docs.Advance(1 + term2.docs[upto2]);
                                        if (doc == DocIdSetIterator.NO_MORE_DOCS)
                                        {
                                            // skipped past last doc
                                            if (Debugging.AssertsEnabled)
                                            {
                                                Debugging.Assert(upto2 == term2.docs.Length - 1);
                                            }
                                            ended = true;
                                            break;
                                        }
                                        else
                                        {
                                            // skipped to next doc
                                            if (Debugging.AssertsEnabled)
                                            {
                                                Debugging.Assert(upto2 < term2.docs.Length - 1);
                                            }
                                            if (doc >= term2.docs[1 + upto2])
                                            {
                                                upto2++;
                                            }
                                        }
                                    }
                                }
                                else
                                {
                                    doc = docs.NextDoc();
                                    Assert.IsTrue(doc != -1);
                                    upto2++;
                                }
                                Assert.AreEqual(term2.docs[upto2], doc);
                                if (!field.omitTF)
                                {
                                    Assert.AreEqual(term2.positions[upto2].Length, postings.Freq);
                                    if (Random.Next(2) == 1)
                                    {
                                        this.VerifyPositions(term2.positions[upto2], postings);
                                    }
                                }
                            }

                            if (!ended)
                            {
                                Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, docs.NextDoc());
                            }
                        }
                        upto++;
                    } while (termsEnum.MoveNext());

                    Assert.AreEqual(upto, field.terms.Length);
                }
            }
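The test above exercises every TermsEnum access mode: straight enumeration, SeekCeil by term bytes, SeekExact by ordinal, and stepping through postings with NextDoc/Advance. As a minimal sketch of the same enumeration API outside a test harness (the index path and the "contents"-style field name are placeholder assumptions, not part of the example above):

using System;
using Lucene.Net.Index;
using Lucene.Net.Store;
using Lucene.Net.Util;

public static class TermsEnumDemo
{
    // indexPath and field are hypothetical; point them at a real index.
    public static void ListTerms(string indexPath, string field)
    {
        using Lucene.Net.Store.Directory dir = FSDirectory.Open(indexPath);
        using DirectoryReader reader = DirectoryReader.Open(dir);

        Terms terms = MultiFields.GetTerms(reader, field);
        if (terms is null)
        {
            return; // field has no postings
        }

        TermsEnum termsEnum = terms.GetEnumerator();

        // Straight enumeration, as in the test's first loop:
        while (termsEnum.MoveNext())
        {
            Console.WriteLine($"{termsEnum.Term.Utf8ToString()} docFreq={termsEnum.DocFreq}");
        }

        // SeekCeil lands on the target term, or on the first term after it:
        TermsEnum.SeekStatus status = termsEnum.SeekCeil(new BytesRef("lucene"));
        if (status != TermsEnum.SeekStatus.END)
        {
            Console.WriteLine("landed on: " + termsEnum.Term.Utf8ToString());
        }
    }
}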
Example #27
            public override void AddSortedSetField(FieldInfo field, IEnumerable<BytesRef> values, IEnumerable<long?> docToOrdCount, IEnumerable<long?> ords)
            {
                long     valueCount = 0;
                BytesRef lastValue  = null;

                foreach (BytesRef b in values)
                {
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(b != null);
                    }
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(b.IsValid());
                    }
                    if (valueCount > 0)
                    {
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(b.CompareTo(lastValue) > 0);
                        }
                    }
                    lastValue = BytesRef.DeepCopyOf(b);
                    valueCount++;
                }

                int         docCount = 0;
                long        ordCount = 0;
                Int64BitSet seenOrds = new Int64BitSet(valueCount);

                using IEnumerator<long?> ordIterator = ords.GetEnumerator();
                foreach (long? v in docToOrdCount)
                {
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(v != null);
                    }
                    int count = (int)v.Value;
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(count >= 0);
                    }
                    docCount++;
                    ordCount += count;

                    long lastOrd = -1;
                    for (int i = 0; i < count; i++)
                    {
                        ordIterator.MoveNext();
                        long? o = ordIterator.Current;
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(o != null);
                        }
                        long ord = o.Value;
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(ord >= 0 && ord < valueCount);
                        }
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(ord > lastOrd, "ord={0},lastOrd={1}", ord, lastOrd);
                        }
                        seenOrds.Set(ord);
                        lastOrd = ord;
                    }
                }
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(ordIterator.MoveNext() == false);
                }

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(docCount == maxDoc);
                }
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(seenOrds.Cardinality() == valueCount);
                }
                CheckIterator(values.GetEnumerator(), valueCount, false);
                CheckIterator(docToOrdCount.GetEnumerator(), maxDoc, false);
                CheckIterator(ords.GetEnumerator(), ordCount, false);
                @in.AddSortedSetField(field, values, docToOrdCount, ords);
            }
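AddSortedSetField receives three parallel streams: the deduplicated values in sorted order, one ordinal count per document, and the flattened ascending ordinals. The wrapper above only validates that contract before delegating to the inner consumer. A self-contained sketch of how per-document sets flatten into those streams (the sample data is hypothetical, and ordinal string sorting stands in for Lucene's unsigned-byte order):

using System;
using System.Collections.Generic;
using System.Linq;

public static class SortedSetEncodingDemo
{
    public static void Main()
    {
        // Hypothetical per-document value sets:
        string[][] docs =
        {
            new[] { "apple", "pear" },
            Array.Empty<string>(),
            new[] { "pear" },
        };

        // Stream 1: unique values in sorted order:
        string[] values = docs.SelectMany(d => d)
                              .Distinct()
                              .OrderBy(v => v, StringComparer.Ordinal)
                              .ToArray();
        Dictionary<string, long> ordOf = values
            .Select((v, i) => (v, i))
            .ToDictionary(p => p.v, p => (long)p.i);

        // Stream 2: one ord count per document; stream 3: that many ascending ords:
        List<long?> docToOrdCount = new List<long?>();
        List<long?> ords = new List<long?>();
        foreach (string[] doc in docs)
        {
            docToOrdCount.Add(doc.Length);
            foreach (string v in doc.OrderBy(x => x, StringComparer.Ordinal))
            {
                ords.Add(ordOf[v]);
            }
        }

        Console.WriteLine("values: " + string.Join(",", values));        // apple,pear
        Console.WriteLine("counts: " + string.Join(",", docToOrdCount)); // 2,0,1
        Console.WriteLine("ords:   " + string.Join(",", ords));          // 0,1,1
    }
}

On this input, every assertion in the wrapper holds: ords are ascending within each document, each ord falls in [0, valueCount), every ord is seen at least once, and docCount matches the number of documents.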
Example #28
        public override TopDocs Rescore(IndexSearcher searcher, TopDocs firstPassTopDocs, int topN)
        {
            ScoreDoc[] hits = (ScoreDoc[])firstPassTopDocs.ScoreDocs.Clone();
            Array.Sort(hits, Comparer<ScoreDoc>.Create((a, b) => a.Doc - b.Doc));

            IList<AtomicReaderContext> leaves = searcher.IndexReader.Leaves;

            Weight weight = searcher.CreateNormalizedWeight(query);

            // Now merge sort docIDs from hits, with reader's leaves:
            int    hitUpto    = 0;
            int    readerUpto = -1;
            int    endDoc     = 0;
            int    docBase    = 0;
            Scorer scorer     = null;

            while (hitUpto < hits.Length)
            {
                ScoreDoc            hit           = hits[hitUpto];
                int                 docID         = hit.Doc;
                AtomicReaderContext readerContext = null;
                while (docID >= endDoc)
                {
                    readerUpto++;
                    readerContext = leaves[readerUpto];
                    endDoc        = readerContext.DocBase + readerContext.Reader.MaxDoc;
                }

                if (readerContext != null)
                {
                    // We advanced to another segment:
                    docBase = readerContext.DocBase;
                    scorer  = weight.GetScorer(readerContext, null);
                }

                int targetDoc = docID - docBase;
                int actualDoc = scorer.DocID;
                if (actualDoc < targetDoc)
                {
                    actualDoc = scorer.Advance(targetDoc);
                }

                if (actualDoc == targetDoc)
                {
                    // Query did match this doc:
                    hit.Score = Combine(hit.Score, true, scorer.GetScore());
                }
                else
                {
                    // Query did not match this doc:
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(actualDoc > targetDoc);
                    }
                    hit.Score = Combine(hit.Score, false, 0.0f);
                }

                hitUpto++;
            }

            // TODO: we should do a partial sort (of only topN)
            // instead, but typically the number of hits is
            // smallish:
            Array.Sort(hits, Comparer<ScoreDoc>.Create((a, b) =>
            {
                // Sort by score descending, then docID ascending:
                if (a.Score > b.Score)
                {
                    return -1;
                }
                else if (a.Score < b.Score)
                {
                    return 1;
                }
                else
                {
                    // this subtraction can't overflow int
                    // because docIDs are >= 0:
                    return a.Doc - b.Doc;
                }
            }));

            if (topN < hits.Length)
            {
                ScoreDoc[] subset = new ScoreDoc[topN];
                Array.Copy(hits, 0, subset, 0, topN);
                hits = subset;
            }

            return new TopDocs(firstPassTopDocs.TotalHits, hits, hits[0].Score);
        }
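This Rescore implementation is the core of Lucene's QueryRescorer: it merge-walks the doc-sorted first-pass hits against the reader's leaves, keeps one scorer per segment, and lets Combine decide each hit's final score. A minimal sketch of hooking custom logic into it by subclassing (assuming the Lucene.NET 4.8 QueryRescorer API; the class name and weight value are illustrative):

using Lucene.Net.Search;

// A second-pass rescorer that adds a weighted bonus when the hit also
// matches the second-pass query:
public class WeightedSumRescorer : QueryRescorer
{
    private readonly float secondPassWeight;

    public WeightedSumRescorer(Query secondPassQuery, float secondPassWeight)
        : base(secondPassQuery)
    {
        this.secondPassWeight = secondPassWeight;
    }

    // Called once per first-pass hit, with the outcome of the second-pass match:
    protected override float Combine(float firstPassScore, bool secondPassMatches, float secondPassScore)
    {
        return secondPassMatches
            ? firstPassScore + secondPassWeight * secondPassScore
            : firstPassScore;
    }
}

// Usage (searcher, firstPassTopDocs, and phraseQuery are assumed to exist):
// TopDocs rescored = new WeightedSumRescorer(phraseQuery, 2.0f).Rescore(searcher, firstPassTopDocs, 10);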
Example #29
        // Delete by Term
        private long ApplyTermDeletes(IEnumerable<Term> termsIter, ReadersAndUpdates rld, SegmentReader reader)
        {
            UninterruptableMonitor.Enter(this);
            try
            {
                long   delCount = 0;
                Fields fields   = reader.Fields;
                if (fields == null)
                {
                    // this reader has no postings
                    return 0;
                }

                TermsEnum termsEnum = null;

                string   currentField = null;
                DocsEnum docs         = null;

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(CheckDeleteTerm(null));
                }

                bool any = false;

                //System.out.println(Thread.currentThread().getName() + " del terms reader=" + reader);
                foreach (Term term in termsIter)
                {
                    // Since we visit terms sorted, we gain performance
                    // by re-using the same TermsEnum and seeking only
                    // forwards
                    if (!string.Equals(term.Field, currentField, StringComparison.Ordinal))
                    {
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(currentField == null || currentField.CompareToOrdinal(term.Field) < 0);
                        }
                        currentField = term.Field;
                        Terms terms = fields.GetTerms(currentField);
                        if (terms != null)
                        {
                            termsEnum = terms.GetEnumerator(termsEnum);
                        }
                        else
                        {
                            termsEnum = null;
                        }
                    }

                    if (termsEnum == null)
                    {
                        continue;
                    }
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(CheckDeleteTerm(term));
                    }

                    // System.out.println("  term=" + term);

                    if (termsEnum.SeekExact(term.Bytes))
                    {
                        // we don't need term frequencies for this
                        DocsEnum docsEnum = termsEnum.Docs(rld.LiveDocs, docs, DocsFlags.NONE);
                        //System.out.println("BDS: got docsEnum=" + docsEnum);

                        if (docsEnum != null)
                        {
                            while (true)
                            {
                                int docID = docsEnum.NextDoc();
                                //System.out.println(Thread.currentThread().getName() + " del term=" + term + " doc=" + docID);
                                if (docID == DocIdSetIterator.NO_MORE_DOCS)
                                {
                                    break;
                                }
                                if (!any)
                                {
                                    rld.InitWritableLiveDocs();
                                    any = true;
                                }
                                // NOTE: there is no limit check on the docID
                                // when deleting by Term (unlike by Query)
                                // because on flush we apply all Term deletes to
                                // each segment.  So all Term deleting here is
                                // against prior segments:
                                if (rld.Delete(docID))
                                {
                                    delCount++;
                                }
                            }
                        }
                    }
                }

                return delCount;
            }
            finally
            {
                UninterruptableMonitor.Exit(this);
            }
        }
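ApplyTermDeletes is internal flush-time plumbing; applications reach it through IndexWriter's public delete API, which buffers Term deletes and later resolves them against each segment's postings exactly as the loop above does. A minimal usage sketch (the writer and the "id" field are assumptions for illustration):

using Lucene.Net.Index;

public static class DeleteByTermDemo
{
    public static void DeleteById(IndexWriter writer, string id)
    {
        // Buffers a Term delete; on flush it is applied to every segment,
        // marking matching docIDs in the writable live-docs bitset:
        writer.DeleteDocuments(new Term("id", id));
        writer.Commit();
    }
}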
Example #30
        /// <summary>
        /// Used by near real-time search </summary>
        internal static DirectoryReader Open(IndexWriter writer, SegmentInfos infos, bool applyAllDeletes)
        {
            // IndexWriter synchronizes externally before calling
            // us, which ensures infos will not change; so there's
            // no need to process segments in reverse order
            int numSegments = infos.Count;

            IList<SegmentReader> readers = new List<SegmentReader>();
            Directory            dir     = writer.Directory;

            SegmentInfos segmentInfos = (SegmentInfos)infos.Clone();
            int          infosUpto    = 0;
            bool         success      = false;

            try
            {
                for (int i = 0; i < numSegments; i++)
                {
                    // NOTE: important that we use infos not
                    // segmentInfos here, so that we are passing the
                    // actual instance of SegmentInfoPerCommit in
                    // IndexWriter's segmentInfos:
                    SegmentCommitInfo info = infos.Info(i);
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(info.Info.Dir == dir);
                    }
                    ReadersAndUpdates rld = writer.readerPool.Get(info, true);
                    try
                    {
                        SegmentReader reader = rld.GetReadOnlyClone(IOContext.READ);
                        if (reader.NumDocs > 0 || writer.KeepFullyDeletedSegments)
                        {
                            // Steal the ref:
                            readers.Add(reader);
                            infosUpto++;
                        }
                        else
                        {
                            reader.DecRef();
                            segmentInfos.Remove(infosUpto);
                        }
                    }
                    finally
                    {
                        writer.readerPool.Release(rld);
                    }
                }

                writer.IncRefDeleter(segmentInfos);

                StandardDirectoryReader result = new StandardDirectoryReader(dir, readers.ToArray(), writer, segmentInfos, writer.Config.ReaderTermsIndexDivisor, applyAllDeletes);
                success = true;
                return result;
            }
            finally
            {
                if (!success)
                {
                    foreach (SegmentReader r in readers)
                    {
                        try
                        {
                            r.DecRef();
                        }
#pragma warning disable 168
                        catch (Exception th)
#pragma warning restore 168
                        {
                            // ignore any exception that is thrown here to not mask any original
                            // exception.
                        }
                    }
                }
            }
        }
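This factory is what backs near-real-time search: it pulls per-segment readers straight from the writer's reader pool and skips fully deleted segments, so uncommitted changes become searchable without a commit. A minimal refresh-loop sketch over the public entry points, DirectoryReader.Open(writer, applyAllDeletes) and OpenIfChanged (writer is an assumed, already-configured IndexWriter):

using Lucene.Net.Index;

public static class NrtDemo
{
    // Open an NRT reader that sees the writer's uncommitted changes:
    public static DirectoryReader OpenNrt(IndexWriter writer)
    {
        return DirectoryReader.Open(writer, true); // true = apply all deletes
    }

    // Cheaply refresh: returns the same reader if nothing changed,
    // otherwise swaps in a new one and releases the old:
    public static DirectoryReader Refresh(IndexWriter writer, DirectoryReader current)
    {
        DirectoryReader newReader = DirectoryReader.OpenIfChanged(current, writer, true);
        if (newReader != null)
        {
            current.Dispose();
            return newReader;
        }
        return current;
    }
}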