internal void InitializeInstanceFields()
		  {
			  @in = new ByteArrayDataInput(buffer);
		  }
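Most of the examples below pair ByteArrayDataOutput writes with ByteArrayDataInput reads over the same byte[]. A minimal round-trip sketch (method names follow the Lucene.NET 4.8 DataOutput/DataInput API; the buffer size is arbitrary):

    byte[] buffer = new byte[16];
    var output = new ByteArrayDataOutput(buffer);
    output.WriteVInt32(12345); // variable-length int (2 bytes for this value)
    output.WriteInt64(42L);    // fixed-width 8-byte long

    var input = new ByteArrayDataInput(buffer, 0, output.Position);
    int v  = input.ReadVInt32(); // 12345
    long l = input.ReadInt64();  // 42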
Example #2
        public override void Build(IInputEnumerator enumerator)
        {
            // LUCENENET: Added guard clause for null
            if (enumerator is null)
            {
                throw new ArgumentNullException(nameof(enumerator));
            }

            if (enumerator.HasPayloads)
            {
                throw new ArgumentException("this suggester doesn't support payloads");
            }
            if (enumerator.HasContexts)
            {
                throw new ArgumentException("this suggester doesn't support contexts");
            }
            FileInfo tempInput  = FileSupport.CreateTempFile(typeof(FSTCompletionLookup).Name, ".input", OfflineSorter.DefaultTempDir());
            FileInfo tempSorted = FileSupport.CreateTempFile(typeof(FSTCompletionLookup).Name, ".sorted", OfflineSorter.DefaultTempDir());

            OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
            OfflineSorter.ByteSequencesReader reader = null;
            ExternalRefSorter sorter = null;

            // Push floats up front before sequences to sort them. For now, assume they are non-negative.
            // If negative floats are allowed some trickery needs to be done to find their byte order.
            bool success = false;

            count = 0;
            try
            {
                byte[] buffer = Arrays.Empty<byte>();
                ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
                BytesRef            spare;
                while (enumerator.MoveNext())
                {
                    spare = enumerator.Current;
                    if (spare.Length + 4 >= buffer.Length)
                    {
                        buffer = ArrayUtil.Grow(buffer, spare.Length + 4);
                    }

                    output.Reset(buffer);
                    output.WriteInt32(EncodeWeight(enumerator.Weight));
                    output.WriteBytes(spare.Bytes, spare.Offset, spare.Length);
                    writer.Write(buffer, 0, output.Position);
                }
                writer.Dispose();

                // We don't know the distribution of scores and we need to bucket them, so we'll sort
                // and divide into equal buckets.
                OfflineSorter.SortInfo info = (new OfflineSorter()).Sort(tempInput, tempSorted);
                tempInput.Delete();
                FSTCompletionBuilder builder = new FSTCompletionBuilder(buckets, sorter = new ExternalRefSorter(new OfflineSorter()), sharedTailLength);

                int inputLines = info.Lines;
                reader = new OfflineSorter.ByteSequencesReader(tempSorted);
                long line                = 0;
                int  previousBucket      = 0;
                int  previousScore       = 0;
                ByteArrayDataInput input = new ByteArrayDataInput();
                BytesRef           tmp1  = new BytesRef();
                BytesRef           tmp2  = new BytesRef();
                while (reader.Read(tmp1))
                {
                    input.Reset(tmp1.Bytes);
                    int currentScore = input.ReadInt32();

                    int bucket;
                    if (line > 0 && currentScore == previousScore)
                    {
                        bucket = previousBucket;
                    }
                    else
                    {
                        bucket = (int)(line * buckets / inputLines);
                    }
                    previousScore  = currentScore;
                    previousBucket = bucket;

                    // Only append the input, discard the weight.
                    tmp2.Bytes  = tmp1.Bytes;
                    tmp2.Offset = input.Position;
                    tmp2.Length = tmp1.Length - input.Position;
                    builder.Add(tmp2, bucket);

                    line++;
                    count++;
                }

                // The two FSTCompletions share the same automaton.
                this.higherWeightsCompletion = builder.Build();
                this.normalCompletion        = new FSTCompletion(higherWeightsCompletion.FST, false, exactMatchFirst);

                success = true;
            }
            finally
            {
                if (success)
                {
                    IOUtils.Dispose(reader, writer, sorter);
                }
                else
                {
                    IOUtils.DisposeWhileHandlingException(reader, writer, sorter);
                }

                tempInput.Delete();
                tempSorted.Delete();
            }
        }
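EncodeWeight is not shown above; in FSTCompletionLookup it simply narrows the 64-bit weight to an int, along these lines (a sketch, not the verbatim source):

    private static int EncodeWeight(long value)
    {
        if (value < int.MinValue || value > int.MaxValue)
            throw new NotSupportedException("cannot encode value: " + value);
        return (int)value;
    }

Because WriteInt32 emits the value most-significant-byte first, the offline byte sort orders entries by weight ascending, which is what the bucket assignment loop relies on.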
 internal virtual void Reset(int numTerms, int flags, int[] prefixLengths, int[] suffixLengths, int[] termFreqs, int[] positionIndex, int[] positions, int[] startOffsets, int[] lengths, int[] payloadIndex, BytesRef payloads, ByteArrayDataInput @in)
 {
     this.numTerms      = numTerms;
     this.prefixLengths = prefixLengths;
     this.suffixLengths = suffixLengths;
     this.termFreqs     = termFreqs;
     this.positionIndex = positionIndex;
     this.positions     = positions;
     this.startOffsets  = startOffsets;
     this.lengths       = lengths;
     this.payloadIndex  = payloadIndex;
     this.payloads      = payloads;
     this.@in           = @in;
     startPos           = @in.Position;
     Reset();
 }
                    internal void Load(BytesRef frameIndexData)
                    {
                        // if (DEBUG) System.out.println("    load fp=" + fp + " fpOrig=" + fpOrig + " frameIndexData=" + frameIndexData + " trans=" + (transitions.length != 0 ? transitions[0] : "n/a" + " state=" + state));

                        if (frameIndexData != null && Transitions.Length != 0)
                        {
                            // Floor frame
                            if (FloorData.Length < frameIndexData.Length)
                            {
                                this.FloorData = new byte[ArrayUtil.Oversize(frameIndexData.Length, 1)];
                            }
                            System.Buffer.BlockCopy(frameIndexData.Bytes, frameIndexData.Offset, FloorData, 0, frameIndexData.Length);
                            FloorDataReader.Reset(FloorData, 0, frameIndexData.Length);
                            // Skip first long -- has redundant fp, hasTerms
                            // flag, isFloor flag
                            long code = FloorDataReader.ReadVLong();
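                            // (Per BlockTreeTermsWriter's encoding: bit 0 is OUTPUT_FLAG_IS_FLOOR,
                            // bit 1 is OUTPUT_FLAG_HAS_TERMS, and code >>> 2 is the block file pointer.)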
                            if ((code & BlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR) != 0)
                            {
                                NumFollowFloorBlocks = FloorDataReader.ReadVInt();
                                NextFloorLabel = FloorDataReader.ReadByte() & 0xff;
                                // if (DEBUG) System.out.println("    numFollowFloorBlocks=" + numFollowFloorBlocks + " nextFloorLabel=" + nextFloorLabel);

                                // If current state is accept, we must process
                                // first block in case it has empty suffix:
                                if (OuterInstance.runAutomaton.IsAccept(state))
                                {
                                    // Maybe skip floor blocks:
                                    while (NumFollowFloorBlocks != 0 && NextFloorLabel <= Transitions[0].Min)
                                    {
                                        Fp = FpOrig + ((long)((ulong)FloorDataReader.ReadVLong() >> 1));
                                        NumFollowFloorBlocks--;
                                        // if (DEBUG) System.out.println("    skip floor block!  nextFloorLabel=" + (char) nextFloorLabel + " vs target=" + (char) transitions[0].getMin() + " newFP=" + fp + " numFollowFloorBlocks=" + numFollowFloorBlocks);
                                        if (NumFollowFloorBlocks != 0)
                                        {
                                            NextFloorLabel = FloorDataReader.ReadByte() & 0xff;
                                        }
                                        else
                                        {
                                            NextFloorLabel = 256;
                                        }
                                    }
                                }
                            }
                        }

                        OuterInstance.@in.Seek(Fp);
                        int code_ = OuterInstance.@in.ReadVInt();
                        EntCount = (int)((uint)code_ >> 1);
                        Debug.Assert(EntCount > 0);
                        IsLastInFloor = (code_ & 1) != 0;

                        // term suffixes:
                        code_ = OuterInstance.@in.ReadVInt();
                        IsLeafBlock = (code_ & 1) != 0;
                        int numBytes = (int)((uint)code_ >> 1);
                        // if (DEBUG) System.out.println("      entCount=" + entCount + " lastInFloor?=" + isLastInFloor + " leafBlock?=" + isLeafBlock + " numSuffixBytes=" + numBytes);
                        if (SuffixBytes.Length < numBytes)
                        {
                            SuffixBytes = new byte[ArrayUtil.Oversize(numBytes, 1)];
                        }
                        OuterInstance.@in.ReadBytes(SuffixBytes, 0, numBytes);
                        SuffixesReader.Reset(SuffixBytes, 0, numBytes);

                        // stats
                        numBytes = OuterInstance.@in.ReadVInt();
                        if (StatBytes.Length < numBytes)
                        {
                            StatBytes = new byte[ArrayUtil.Oversize(numBytes, 1)];
                        }
                        OuterInstance.@in.ReadBytes(StatBytes, 0, numBytes);
                        StatsReader.Reset(StatBytes, 0, numBytes);
                        MetaDataUpto = 0;

                        TermState.TermBlockOrd = 0;
                        NextEnt = 0;

                        // metadata
                        numBytes = OuterInstance.@in.ReadVInt();
                        if (Bytes == null)
                        {
                            Bytes = new byte[ArrayUtil.Oversize(numBytes, 1)];
                            BytesReader = new ByteArrayDataInput();
                        }
                        else if (Bytes.Length < numBytes)
                        {
                            Bytes = new byte[ArrayUtil.Oversize(numBytes, 1)];
                        }
                        OuterInstance.@in.ReadBytes(Bytes, 0, numBytes);
                        BytesReader.Reset(Bytes, 0, numBytes);

                        if (!IsLastInFloor)
                        {
                            // Sub-blocks of a single floor block are always
                            // written one after another -- tail recurse:
                            FpEnd = OuterInstance.@in.FilePointer;
                        }
                    }
Example #5
 public SortedSetDocValuesAnonymousInnerClassHelper(Lucene42DocValuesProducer outerInstance, FSTEntry entry, BinaryDocValues docToOrds, FST<long> fst, FST<long>.BytesReader @in, FST<long>.Arc<long> firstArc, FST<long>.Arc<long> scratchArc, IntsRef scratchInts, BytesRefFSTEnum<long> fstEnum, BytesRef @ref, ByteArrayDataInput input)
 {
     this.OuterInstance = outerInstance;
     this.Entry         = entry;
     this.DocToOrds     = docToOrds;
     this.Fst           = fst;
     this.@in           = @in;
     this.FirstArc      = firstArc;
     this.ScratchArc    = scratchArc;
     this.ScratchInts   = scratchInts;
     this.FstEnum       = fstEnum;
     this.@ref          = @ref;
     this.Input         = input;
 }
        public override void VisitDocument(int docID, StoredFieldVisitor visitor)
        {
            FieldsStream.Seek(IndexReader.GetStartPointer(docID));

            int docBase = FieldsStream.ReadVInt();
            int chunkDocs = FieldsStream.ReadVInt();
            if (docID < docBase || docID >= docBase + chunkDocs || docBase + chunkDocs > NumDocs)
            {
                throw new CorruptIndexException("Corrupted: docID=" + docID + ", docBase=" + docBase + ", chunkDocs=" + chunkDocs + ", numDocs=" + NumDocs + " (resource=" + FieldsStream + ")");
            }

            int numStoredFields, offset, length, totalLength;
            if (chunkDocs == 1)
            {
                numStoredFields = FieldsStream.ReadVInt();
                offset = 0;
                length = FieldsStream.ReadVInt();
                totalLength = length;
            }
            else
            {
                int bitsPerStoredFields = FieldsStream.ReadVInt();
                if (bitsPerStoredFields == 0)
                {
                    numStoredFields = FieldsStream.ReadVInt();
                }
                else if (bitsPerStoredFields > 31)
                {
                    throw new CorruptIndexException("bitsPerStoredFields=" + bitsPerStoredFields + " (resource=" + FieldsStream + ")");
                }
                else
                {
                    long filePointer = FieldsStream.FilePointer;
                    PackedInts.Reader reader = PackedInts.GetDirectReaderNoHeader(FieldsStream, PackedInts.Format.PACKED, PackedIntsVersion, chunkDocs, bitsPerStoredFields);
                    numStoredFields = (int)(reader.Get(docID - docBase));
                    FieldsStream.Seek(filePointer + PackedInts.Format.PACKED.ByteCount(PackedIntsVersion, chunkDocs, bitsPerStoredFields));
                }

                int bitsPerLength = FieldsStream.ReadVInt();
                if (bitsPerLength == 0)
                {
                    length = FieldsStream.ReadVInt();
                    offset = (docID - docBase) * length;
                    totalLength = chunkDocs * length;
                }
                else if (bitsPerLength > 31)
                {
                    throw new CorruptIndexException("bitsPerLength=" + bitsPerLength + " (resource=" + FieldsStream + ")");
                }
                else
                {
                    PackedInts.ReaderIterator it = PackedInts.GetReaderIteratorNoHeader(FieldsStream, PackedInts.Format.PACKED, PackedIntsVersion, chunkDocs, bitsPerLength, 1);
                    int off = 0;
                    for (int i = 0; i < docID - docBase; ++i)
                    {
                        off += (int)it.Next();
                    }
                    offset = off;
                    length = (int)it.Next();
                    off += length;
                    for (int i = docID - docBase + 1; i < chunkDocs; ++i)
                    {
                        off += (int)it.Next();
                    }
                    totalLength = off;
                }
            }

            if ((length == 0) != (numStoredFields == 0))
            {
                throw new CorruptIndexException("length=" + length + ", numStoredFields=" + numStoredFields + " (resource=" + FieldsStream + ")");
            }
            if (numStoredFields == 0)
            {
                // nothing to do
                return;
            }

            DataInput documentInput;
            if (Version_Renamed >= CompressingStoredFieldsWriter.VERSION_BIG_CHUNKS && totalLength >= 2 * ChunkSize_Renamed)
            {
                Debug.Assert(ChunkSize_Renamed > 0);
                Debug.Assert(offset < ChunkSize_Renamed);

                Decompressor.Decompress(FieldsStream, ChunkSize_Renamed, offset, Math.Min(length, ChunkSize_Renamed - offset), Bytes);
                documentInput = new DataInputAnonymousInnerClassHelper(this, offset, length);
            }
            else
            {
                BytesRef bytes = totalLength <= BUFFER_REUSE_THRESHOLD ? this.Bytes : new BytesRef();
                Decompressor.Decompress(FieldsStream, totalLength, offset, length, bytes);
                Debug.Assert(bytes.Length == length);
                documentInput = new ByteArrayDataInput((byte[])(Array)bytes.Bytes, bytes.Offset, bytes.Length);
            }

            for (int fieldIDX = 0; fieldIDX < numStoredFields; fieldIDX++)
            {
                long infoAndBits = documentInput.ReadVLong();
                int fieldNumber = (int)((long)((ulong)infoAndBits >> CompressingStoredFieldsWriter.TYPE_BITS));
                FieldInfo fieldInfo = FieldInfos.FieldInfo(fieldNumber);

                int bits = (int)(infoAndBits & CompressingStoredFieldsWriter.TYPE_MASK);
                Debug.Assert(bits <= CompressingStoredFieldsWriter.NUMERIC_DOUBLE, "bits=" + bits.ToString("x"));

                switch (visitor.NeedsField(fieldInfo))
                {
                    case StoredFieldVisitor.Status.YES:
                        ReadField(documentInput, visitor, fieldInfo, bits);
                        break;

                    case StoredFieldVisitor.Status.NO:
                        SkipField(documentInput, bits);
                        break;

                    case StoredFieldVisitor.Status.STOP:
                        return;
                }
            }
        }
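To make the packed-lengths branch concrete, a worked example with made-up numbers: suppose chunkDocs = 3 and the per-document lengths decode to 5, 7 and 4. For docID - docBase = 1, the first loop sums the lengths before the document (offset = 5), the next read gives length = 7, and the second loop adds the remaining lengths so totalLength = 16; those three values tell the decompressor exactly which window of the chunk to materialize.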
                    /* Does initial decode of next block of terms; this
                       doesn't actually decode the docFreq, totalTermFreq,
                       postings details (frq/prx offset, etc.) metadata;
                       it just loads them as byte[] blobs which are then
                       decoded on-demand if the metadata is ever requested
                       for any term in this block.  this enables terms-only
                       intensive consumes (eg certain MTQs, respelling) to
                       not pay the price of decoding metadata they won't
                       use. */

                    internal void LoadBlock()
                    {
                        // Clone the IndexInput lazily, so that consumers
                        // that just pull a TermsEnum to
                        // seekExact(TermState) don't pay this cost:
                        OuterInstance.InitIndexInput();

                        if (NextEnt != -1)
                        {
                            // Already loaded
                            return;
                        }
                        //System.out.println("blc=" + blockLoadCount);

                        OuterInstance.@in.Seek(Fp);
                        int code = OuterInstance.@in.ReadVInt();
                        EntCount = (int)((uint)code >> 1);
                        Debug.Assert(EntCount > 0);
                        IsLastInFloor = (code & 1) != 0;
                        Debug.Assert(Arc == null || (IsLastInFloor || IsFloor));

                        // TODO: if suffixes were stored in random-access
                        // array structure, then we could do binary search
                        // instead of linear scan to find target term; eg
                        // we could have simple array of offsets

                        // term suffixes:
                        code = OuterInstance.@in.ReadVInt();
                        IsLeafBlock = (code & 1) != 0;
                        int numBytes = (int)((uint)code >> 1);
                        if (SuffixBytes.Length < numBytes)
                        {
                            SuffixBytes = new byte[ArrayUtil.Oversize(numBytes, 1)];
                        }
                        OuterInstance.@in.ReadBytes(SuffixBytes, 0, numBytes);
                        SuffixesReader.Reset(SuffixBytes, 0, numBytes);

                        /*if (DEBUG) {
                          if (arc == null) {
                            System.out.println("    loadBlock (next) fp=" + fp + " entCount=" + entCount + " prefixLen=" + prefix + " isLastInFloor=" + isLastInFloor + " leaf?=" + isLeafBlock);
                          } else {
                            System.out.println("    loadBlock (seek) fp=" + fp + " entCount=" + entCount + " prefixLen=" + prefix + " hasTerms?=" + hasTerms + " isFloor?=" + isFloor + " isLastInFloor=" + isLastInFloor + " leaf?=" + isLeafBlock);
                          }
                          }*/

                        // stats
                        numBytes = OuterInstance.@in.ReadVInt();
                        if (StatBytes.Length < numBytes)
                        {
                            StatBytes = new byte[ArrayUtil.Oversize(numBytes, 1)];
                        }
                        OuterInstance.@in.ReadBytes(StatBytes, 0, numBytes);
                        StatsReader.Reset(StatBytes, 0, numBytes);
                        MetaDataUpto = 0;

                        State.TermBlockOrd = 0;
                        NextEnt = 0;
                        LastSubFP = -1;

                        // TODO: we could skip this if !hasTerms; but
                        // that's rare so won't help much
                        // metadata
                        numBytes = OuterInstance.@in.ReadVInt();
                        if (Bytes == null)
                        {
                            Bytes = new byte[ArrayUtil.Oversize(numBytes, 1)];
                            BytesReader = new ByteArrayDataInput();
                        }
                        else if (Bytes.Length < numBytes)
                        {
                            Bytes = new byte[ArrayUtil.Oversize(numBytes, 1)];
                        }
                        OuterInstance.@in.ReadBytes(Bytes, 0, numBytes);
                        BytesReader.Reset(Bytes, 0, numBytes);

                        // Sub-blocks of a single floor block are always
                        // written one after another -- tail recurse:
                        FpEnd = OuterInstance.@in.FilePointer;
                        // if (DEBUG) {
                        //   System.out.println("      fpEnd=" + fpEnd);
                        // }
                    }
Example #8
        public override void Build(IInputEnumerator enumerator)
        {
            // LUCENENET: Added guard clause for null
            if (enumerator is null)
            {
                throw new ArgumentNullException(nameof(enumerator));
            }

            if (enumerator.HasContexts)
            {
                throw new ArgumentException("this suggester doesn't support contexts");
            }
            string prefix     = this.GetType().Name;
            var    directory  = OfflineSorter.DefaultTempDir();
            var    tempInput  = FileSupport.CreateTempFile(prefix, ".input", directory);
            var    tempSorted = FileSupport.CreateTempFile(prefix, ".sorted", directory);

            hasPayloads = enumerator.HasPayloads;

            var writer = new OfflineSorter.ByteSequencesWriter(tempInput);

            OfflineSorter.ByteSequencesReader reader = null;
            var scratch = new BytesRef();

            TokenStreamToAutomaton ts2a = GetTokenStreamToAutomaton();

            bool success = false;

            count = 0;
            byte[] buffer = new byte[8];
            try
            {
                var      output = new ByteArrayDataOutput(buffer);
                BytesRef surfaceForm;

                while (enumerator.MoveNext())
                {
                    surfaceForm = enumerator.Current;
                    ISet<Int32sRef> paths = ToFiniteStrings(surfaceForm, ts2a);

                    maxAnalyzedPathsForOneInput = Math.Max(maxAnalyzedPathsForOneInput, paths.Count);

                    foreach (Int32sRef path in paths)
                    {
                        Util.Fst.Util.ToBytesRef(path, scratch);

                        // length of the analyzed text (FST input)
                        if (scratch.Length > ushort.MaxValue - 2)
                        {
                            throw new ArgumentException("cannot handle analyzed forms > " + (ushort.MaxValue - 2) +
                                                        " in length (got " + scratch.Length + ")");
                        }
                        ushort analyzedLength = (ushort)scratch.Length;

                        // compute the required length:
                        // analyzed sequence + weight (4) + surface + analyzedLength (short)
                        int requiredLength = analyzedLength + 4 + surfaceForm.Length + 2;

                        BytesRef payload;

                        if (hasPayloads)
                        {
                            if (surfaceForm.Length > (ushort.MaxValue - 2))
                            {
                                throw new ArgumentException("cannot handle surface form > " + (ushort.MaxValue - 2) +
                                                            " in length (got " + surfaceForm.Length + ")");
                            }
                            payload = enumerator.Payload;
                            // payload + surfaceLength (short)
                            requiredLength += payload.Length + 2;
                        }
                        else
                        {
                            payload = null;
                        }

                        buffer = ArrayUtil.Grow(buffer, requiredLength);

                        output.Reset(buffer);

                        output.WriteInt16((short)analyzedLength);

                        output.WriteBytes(scratch.Bytes, scratch.Offset, scratch.Length);

                        output.WriteInt32(EncodeWeight(enumerator.Weight));

                        if (hasPayloads)
                        {
                            for (int i = 0; i < surfaceForm.Length; i++)
                            {
                                if (surfaceForm.Bytes[i] == PAYLOAD_SEP)
                                {
                                    throw new ArgumentException(
                                              "surface form cannot contain unit separator character U+001F; this character is reserved");
                                }
                            }
                            output.WriteInt16((short)surfaceForm.Length);
                            output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
                            output.WriteBytes(payload.Bytes, payload.Offset, payload.Length);
                        }
                        else
                        {
                            output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
                        }

                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(output.Position == requiredLength, "{0} vs {1}", output.Position, requiredLength);
                        }

                        writer.Write(buffer, 0, output.Position);
                    }
                    count++;
                }
                writer.Dispose();

                // Sort all input/output pairs (required by FST.Builder):
                (new OfflineSorter(new AnalyzingComparer(hasPayloads))).Sort(tempInput, tempSorted);

                // Free disk space:
                tempInput.Delete();

                reader = new OfflineSorter.ByteSequencesReader(tempSorted);

                var outputs = new PairOutputs<Int64, BytesRef>(PositiveInt32Outputs.Singleton,
                                                               ByteSequenceOutputs.Singleton);
                var builder = new Builder<PairOutputs<Int64, BytesRef>.Pair>(FST.INPUT_TYPE.BYTE1, outputs);

                // Build FST:
                BytesRef  previousAnalyzed = null;
                BytesRef  analyzed         = new BytesRef();
                BytesRef  surface          = new BytesRef();
                Int32sRef scratchInts      = new Int32sRef();
                var       input            = new ByteArrayDataInput();

                // Used to remove duplicate surface forms (but we
                // still index the highest-weight one).  We clear
                // this when we see a new analyzed form, so it cannot
                // grow unbounded (at most 256 entries):
                var seenSurfaceForms = new JCG.HashSet<BytesRef>();

                var dedup = 0;
                while (reader.Read(scratch))
                {
                    input.Reset(scratch.Bytes, scratch.Offset, scratch.Length);
                    ushort analyzedLength = (ushort)input.ReadInt16();
                    analyzed.Grow(analyzedLength + 2);
                    input.ReadBytes(analyzed.Bytes, 0, analyzedLength);
                    analyzed.Length = analyzedLength;

                    long cost = input.ReadInt32();

                    surface.Bytes = scratch.Bytes;
                    if (hasPayloads)
                    {
                        surface.Length = (ushort)input.ReadInt16();
                        surface.Offset = input.Position;
                    }
                    else
                    {
                        surface.Offset = input.Position;
                        surface.Length = scratch.Length - surface.Offset;
                    }

                    if (previousAnalyzed is null)
                    {
                        previousAnalyzed = new BytesRef();
                        previousAnalyzed.CopyBytes(analyzed);
                        seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
                    }
                    else if (analyzed.Equals(previousAnalyzed))
                    {
                        dedup++;
                        if (dedup >= maxSurfaceFormsPerAnalyzedForm)
                        {
                            // More than maxSurfaceFormsPerAnalyzedForm
                            // dups: skip the rest:
                            continue;
                        }
                        if (seenSurfaceForms.Contains(surface))
                        {
                            continue;
                        }
                        seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
                    }
                    else
                    {
                        dedup = 0;
                        previousAnalyzed.CopyBytes(analyzed);
                        seenSurfaceForms.Clear();
                        seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
                    }

                    // TODO: I think we can avoid the extra 2 bytes when
                    // there is no dup (dedup==0), but we'd have to fix
                    // the exactFirst logic ... which would be sort of
                    // hairy because we'd need to special case the two
                    // (dup/not dup)...

                    // NOTE: must be byte 0 so we sort before whatever
                    // is next
                    analyzed.Bytes[analyzed.Offset + analyzed.Length]     = 0;
                    analyzed.Bytes[analyzed.Offset + analyzed.Length + 1] = (byte)dedup;
                    analyzed.Length += 2;

                    Util.Fst.Util.ToInt32sRef(analyzed, scratchInts);
                    //System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
                    if (!hasPayloads)
                    {
                        builder.Add(scratchInts, outputs.NewPair(cost, BytesRef.DeepCopyOf(surface)));
                    }
                    else
                    {
                        int      payloadOffset = input.Position + surface.Length;
                        int      payloadLength = scratch.Length - payloadOffset;
                        BytesRef br            = new BytesRef(surface.Length + 1 + payloadLength);
                        Array.Copy(surface.Bytes, surface.Offset, br.Bytes, 0, surface.Length);
                        br.Bytes[surface.Length] = PAYLOAD_SEP;
                        Array.Copy(scratch.Bytes, payloadOffset, br.Bytes, surface.Length + 1, payloadLength);
                        br.Length = br.Bytes.Length;
                        builder.Add(scratchInts, outputs.NewPair(cost, br));
                    }
                }
                fst = builder.Finish();

                //Util.dotToFile(fst, "/tmp/suggest.dot");

                success = true;
            }
            finally
            {
                if (success)
                {
                    IOUtils.Dispose(reader, writer);
                }
                else
                {
                    IOUtils.DisposeWhileHandlingException(reader, writer);
                }

                tempInput.Delete();
                tempSorted.Delete();
            }
        }
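For reference, each record the loop above hands to the sorter has the following layout, reconstructed from the Write* calls (bracketed fields exist only when hasPayloads is true):

    analyzedLength  : short (2 bytes)
    analyzed bytes  : byte[analyzedLength]
    weight          : int (4 bytes), EncodeWeight(enumerator.Weight)
    [surfaceLength] : short (2 bytes)
    surface bytes   : byte[surfaceForm.Length]
    [payload bytes] : byte[payload.Length]

The FST-building phase reads those fields back in the same order through ByteArrayDataInput (ReadInt16, ReadBytes, ReadInt32).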
Example #9
        public override void VisitDocument(int docID, StoredFieldVisitor visitor)
        {
            fieldsStream.Seek(indexReader.GetStartPointer(docID));

            int docBase   = fieldsStream.ReadVInt32();
            int chunkDocs = fieldsStream.ReadVInt32();

            if (docID < docBase || docID >= docBase + chunkDocs || docBase + chunkDocs > numDocs)
            {
                throw new CorruptIndexException("Corrupted: docID=" + docID + ", docBase=" + docBase + ", chunkDocs=" + chunkDocs + ", numDocs=" + numDocs + " (resource=" + fieldsStream + ")");
            }

            int numStoredFields, offset, length, totalLength;

            if (chunkDocs == 1)
            {
                numStoredFields = fieldsStream.ReadVInt32();
                offset          = 0;
                length          = fieldsStream.ReadVInt32();
                totalLength     = length;
            }
            else
            {
                int bitsPerStoredFields = fieldsStream.ReadVInt32();
                if (bitsPerStoredFields == 0)
                {
                    numStoredFields = fieldsStream.ReadVInt32();
                }
                else if (bitsPerStoredFields > 31)
                {
                    throw new CorruptIndexException("bitsPerStoredFields=" + bitsPerStoredFields + " (resource=" + fieldsStream + ")");
                }
                else
                {
                    long filePointer           = fieldsStream.GetFilePointer();
                    PackedInt32s.Reader reader = PackedInt32s.GetDirectReaderNoHeader(fieldsStream, PackedInt32s.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerStoredFields);
                    numStoredFields = (int)(reader.Get(docID - docBase));
                    fieldsStream.Seek(filePointer + PackedInt32s.Format.PACKED.ByteCount(packedIntsVersion, chunkDocs, bitsPerStoredFields));
                }

                int bitsPerLength = fieldsStream.ReadVInt32();
                if (bitsPerLength == 0)
                {
                    length      = fieldsStream.ReadVInt32();
                    offset      = (docID - docBase) * length;
                    totalLength = chunkDocs * length;
                }
                else if (bitsPerLength > 31)
                {
                    throw new CorruptIndexException("bitsPerLength=" + bitsPerLength + " (resource=" + fieldsStream + ")");
                }
                else
                {
                    PackedInt32s.IReaderIterator it = PackedInt32s.GetReaderIteratorNoHeader(fieldsStream, PackedInt32s.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerLength, 1);
                    int off = 0;
                    for (int i = 0; i < docID - docBase; ++i)
                    {
                        off += (int)it.Next();
                    }
                    offset = off;
                    length = (int)it.Next();
                    off   += length;
                    for (int i = docID - docBase + 1; i < chunkDocs; ++i)
                    {
                        off += (int)it.Next();
                    }
                    totalLength = off;
                }
            }

            if ((length == 0) != (numStoredFields == 0))
            {
                throw new CorruptIndexException("length=" + length + ", numStoredFields=" + numStoredFields + " (resource=" + fieldsStream + ")");
            }
            if (numStoredFields == 0)
            {
                // nothing to do
                return;
            }

            DataInput documentInput;

            if (version >= CompressingStoredFieldsWriter.VERSION_BIG_CHUNKS && totalLength >= 2 * chunkSize)
            {
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(chunkSize > 0);
                    Debugging.Assert(offset < chunkSize);
                }

                decompressor.Decompress(fieldsStream, chunkSize, offset, Math.Min(length, chunkSize - offset), bytes);
                documentInput = new DataInputAnonymousClass(this, length);
            }
            else
            {
                BytesRef bytes = totalLength <= BUFFER_REUSE_THRESHOLD ? this.bytes : new BytesRef();
                decompressor.Decompress(fieldsStream, totalLength, offset, length, bytes);
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(bytes.Length == length);
                }
                documentInput = new ByteArrayDataInput(bytes.Bytes, bytes.Offset, bytes.Length);
            }

            for (int fieldIDX = 0; fieldIDX < numStoredFields; fieldIDX++)
            {
                long      infoAndBits = documentInput.ReadVInt64();
                int       fieldNumber = (int)infoAndBits.TripleShift(CompressingStoredFieldsWriter.TYPE_BITS);
                FieldInfo fieldInfo   = fieldInfos.FieldInfo(fieldNumber);

                int bits = (int)(infoAndBits & CompressingStoredFieldsWriter.TYPE_MASK);
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(bits <= CompressingStoredFieldsWriter.NUMERIC_DOUBLE, "bits={0:x}", bits);
                }

                switch (visitor.NeedsField(fieldInfo))
                {
                case StoredFieldVisitor.Status.YES:
                    ReadField(documentInput, visitor, fieldInfo, bits);
                    break;

                case StoredFieldVisitor.Status.NO:
                    SkipField(documentInput, bits);
                    break;

                case StoredFieldVisitor.Status.STOP:
                    return;
                }
            }
        }
Example #10
 public SortedSetDocValuesAnonymousInnerClassHelper(FSTEntry entry, BinaryDocValues docToOrds, FST<long?> fst, FST.BytesReader @in, FST.Arc<long?> firstArc, FST.Arc<long?> scratchArc, Int32sRef scratchInts, BytesRefFSTEnum<long?> fstEnum, BytesRef @ref, ByteArrayDataInput input)
 {
     this.entry       = entry;
     this.docToOrds   = docToOrds;
     this.fst         = fst;
     this.@in         = @in;
     this.firstArc    = firstArc;
     this.scratchArc  = scratchArc;
     this.scratchInts = scratchInts;
     this.fstEnum     = fstEnum;
     this.@ref        = @ref;
     this.input       = input;
 }
Example #11
 /// <summary>
 /// Constructs a new Stemmer which will use the provided Dictionary to create its stems.
 /// </summary>
 /// <param name="dictionary"> Dictionary that will be used to create the stems </param>
 public Stemmer(Dictionary dictionary)
 {
     this.dictionary  = dictionary;
     this.affixReader = new ByteArrayDataInput(dictionary.affixData);
 }
Example #12
        public virtual void TestVariableBinary([ValueSource(typeof(ConcurrentMergeSchedulerFactories), "Values")] Func<IConcurrentMergeScheduler> newScheduler)
        {
            BaseDirectoryWrapper dir = NewFSDirectory(CreateTempDir("2BVariableBinary"));

            if (dir is MockDirectoryWrapper)
            {
                ((MockDirectoryWrapper)dir).Throttling = MockDirectoryWrapper.Throttling_e.NEVER;
            }

            var config = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random()))
                         .SetMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
                         .SetRAMBufferSizeMB(256.0)
                         .SetMergeScheduler(newScheduler())
                         .SetMergePolicy(NewLogMergePolicy(false, 10))
                         .SetOpenMode(OpenMode.CREATE);
            IndexWriter w = new IndexWriter(dir, config);

            Document             doc     = new Document();
            var                  bytes   = new byte[4];
            ByteArrayDataOutput  encoder = new ByteArrayDataOutput(bytes);
            BytesRef             data    = new BytesRef(bytes);
            BinaryDocValuesField dvField = new BinaryDocValuesField("dv", data);

            doc.Add(dvField);

            for (int i = 0; i < int.MaxValue; i++)
            {
                encoder.Reset(bytes);
                encoder.WriteVInt32(i % 65535); // 1, 2, or 3 bytes
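                // (vInt packs 7 bits per byte: 0..127 takes 1 byte, 128..16383 takes 2,
                // and 16384..65534 takes 3 -- hence "1, 2, or 3" for i % 65535.)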
                data.Length = encoder.Position;
                w.AddDocument(doc);
                if (i % 100000 == 0)
                {
                    Console.WriteLine("indexed: " + i);
                    Console.Out.Flush();
                }
            }

            w.ForceMerge(1);
            w.Dispose();

            Console.WriteLine("verifying...");
            Console.Out.Flush();

            DirectoryReader    r             = DirectoryReader.Open(dir);
            int                expectedValue = 0;
            ByteArrayDataInput input         = new ByteArrayDataInput();

            foreach (AtomicReaderContext context in r.Leaves)
            {
                AtomicReader    reader  = context.AtomicReader;
                BytesRef        scratch = new BytesRef(bytes);
                BinaryDocValues dv      = reader.GetBinaryDocValues("dv");
                for (int i = 0; i < reader.MaxDoc; i++)
                {
                    dv.Get(i, scratch);
                    input.Reset(scratch.Bytes, scratch.Offset, scratch.Length);
                    Assert.AreEqual(expectedValue % 65535, input.ReadVInt32());
                    Assert.IsTrue(input.Eof);
                    expectedValue++;
                }
            }

            r.Dispose();
            dir.Dispose();
        }
Example #13
                // Does initial decode of next block of terms; this
                // doesn't actually decode the docFreq, totalTermFreq,
                // postings details (frq/prx offset, etc.) metadata;
                // it just loads them as byte[] blobs which are then
                // decoded on-demand if the metadata is ever requested
                // for any term in this block.  This enables terms-only
                // intensive consumes (eg certain MTQs, respelling) to
                // not pay the price of decoding metadata they won't
                // use.

                private bool NextBlock()
                {
                    // TODO: we still lazy-decode the byte[] for each
                    // term (the suffix), but, if we decoded
                    // all N terms up front then seeking could do a fast
                    // bsearch w/in the block...

                    _state.BlockFilePointer = _input.FilePointer;
                    _blockTermCount         = _input.ReadVInt();

                    if (_blockTermCount == 0)
                    {
                        return false;
                    }

                    _termBlockPrefix = _input.ReadVInt();

                    // term suffixes:
                    int len = _input.ReadVInt();

                    if (_termSuffixes.Length < len)
                    {
                        _termSuffixes = new byte[ArrayUtil.Oversize(len, 1)];
                    }
                    //System.out.println("  termSuffixes len=" + len);
                    _input.ReadBytes(_termSuffixes, 0, len);

                    _termSuffixesReader.Reset(_termSuffixes, 0, len);

                    // docFreq, totalTermFreq
                    len = _input.ReadVInt();
                    if (_docFreqBytes.Length < len)
                    {
                        _docFreqBytes = new byte[ArrayUtil.Oversize(len, 1)];
                    }

                    _input.ReadBytes(_docFreqBytes, 0, len);
                    _freqReader.Reset(_docFreqBytes, 0, len);

                    // metadata
                    len = _input.ReadVInt();
                    if (_bytes == null)
                    {
                        _bytes       = new byte[ArrayUtil.Oversize(len, 1)];
                        _bytesReader = new ByteArrayDataInput();
                    }
                    else if (_bytes.Length < len)
                    {
                        _bytes = new byte[ArrayUtil.Oversize(len, 1)];
                    }

                    _input.ReadBytes(_bytes, 0, len);
                    _bytesReader.Reset(_bytes, 0, len);

                    _metaDataUpto       = 0;
                    _state.TermBlockOrd = 0;

                    _blocksSinceSeek++;
                    _indexIsCurrent = _indexIsCurrent && (_blocksSinceSeek < _blockTermsReader._indexReader.Divisor);

                    return true;
                }
Example #14
        public virtual void TestVariableBinary([ValueSource(typeof(ConcurrentMergeSchedulers), "Values")] IConcurrentMergeScheduler scheduler)
        {
            BaseDirectoryWrapper dir = NewFSDirectory(CreateTempDir("2BVariableBinary"));
            if (dir is MockDirectoryWrapper)
            {
                ((MockDirectoryWrapper)dir).Throttling = MockDirectoryWrapper.Throttling_e.NEVER;
            }

            var config = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random()))
                            .SetMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
                            .SetRAMBufferSizeMB(256.0)
                            .SetMergeScheduler(scheduler)
                            .SetMergePolicy(NewLogMergePolicy(false, 10))
                            .SetOpenMode(IndexWriterConfig.OpenMode_e.CREATE);
            IndexWriter w = new IndexWriter(dir, config);

            Document doc = new Document();
            var bytes = new byte[4];
            ByteArrayDataOutput encoder = new ByteArrayDataOutput(bytes);
            BytesRef data = new BytesRef(bytes);
            BinaryDocValuesField dvField = new BinaryDocValuesField("dv", data);
            doc.Add(dvField);

            for (int i = 0; i < int.MaxValue; i++)
            {
                encoder.Reset(bytes);
                encoder.WriteVInt(i % 65535); // 1, 2, or 3 bytes
                data.Length = encoder.Position;
                w.AddDocument(doc);
                if (i % 100000 == 0)
                {
                    Console.WriteLine("indexed: " + i);
                    Console.Out.Flush();
                }
            }

            w.ForceMerge(1);
            w.Dispose();

            Console.WriteLine("verifying...");
            Console.Out.Flush();

            DirectoryReader r = DirectoryReader.Open(dir);
            int expectedValue = 0;
            ByteArrayDataInput input = new ByteArrayDataInput();
            foreach (AtomicReaderContext context in r.Leaves)
            {
                AtomicReader reader = context.AtomicReader;
                BytesRef scratch = new BytesRef(bytes);
                BinaryDocValues dv = reader.GetBinaryDocValues("dv");
                for (int i = 0; i < reader.MaxDoc; i++)
                {
                    dv.Get(i, scratch);
                    input.Reset((byte[])(Array)scratch.Bytes, scratch.Offset, scratch.Length);
                    Assert.AreEqual(expectedValue % 65535, input.ReadVInt());
                    Assert.IsTrue(input.Eof());
                    expectedValue++;
                }
            }

            r.Dispose();
            dir.Dispose();
        }
Example #16
 public void SetFloorData(ByteArrayDataInput @in, BytesRef source)
 {
     int numBytes = source.Length - (@in.Position - source.Offset);
     if (numBytes > FloorData.Length)
     {
         FloorData = new byte[ArrayUtil.Oversize(numBytes, 1)];
     }
     System.Buffer.BlockCopy(source.Bytes, source.Offset + @in.Position, FloorData, 0, numBytes);
     FloorDataReader.Reset(FloorData, 0, numBytes);
     NumFollowFloorBlocks = FloorDataReader.ReadVInt();
     NextFloorLabel = FloorDataReader.ReadByte() & 0xff;
     //if (DEBUG) {
     //System.out.println("    setFloorData fpOrig=" + fpOrig + " bytes=" + new BytesRef(source.bytes, source.offset + in.getPosition(), numBytes) + " numFollowFloorBlocks=" + numFollowFloorBlocks + " nextFloorLabel=" + toHex(nextFloorLabel));
     //}
 }
Example #17
                // Does initial decode of next block of terms; this
                // doesn't actually decode the docFreq, totalTermFreq,
                // postings details (frq/prx offset, etc.) metadata;
                // it just loads them as byte[] blobs which are then      
                // decoded on-demand if the metadata is ever requested
                // for any term in this block.  This enables terms-only
                // intensive consumes (eg certain MTQs, respelling) to
                // not pay the price of decoding metadata they won't
                // use.

                private bool NextBlock()
                {
                    // TODO: we still lazy-decode the byte[] for each
                    // term (the suffix), but, if we decoded
                    // all N terms up front then seeking could do a fast
                    // bsearch w/in the block...

                    _state.BlockFilePointer = _input.FilePointer;
                    _blockTermCount = _input.ReadVInt();

                    if (_blockTermCount == 0)
                        return false;

                    _termBlockPrefix = _input.ReadVInt();

                    // term suffixes:
                    int len = _input.ReadVInt();
                    if (_termSuffixes.Length < len)
                    {
                        _termSuffixes = new byte[ArrayUtil.Oversize(len, 1)];
                    }
                    //System.out.println("  termSuffixes len=" + len);
                    _input.ReadBytes(_termSuffixes, 0, len);

                    _termSuffixesReader.Reset(_termSuffixes, 0, len);

                    // docFreq, totalTermFreq
                    len = _input.ReadVInt();
                    if (_docFreqBytes.Length < len)
                        _docFreqBytes = new byte[ArrayUtil.Oversize(len, 1)];

                    _input.ReadBytes(_docFreqBytes, 0, len);
                    _freqReader.Reset(_docFreqBytes, 0, len);

                    // metadata
                    len = _input.ReadVInt();
                    if (_bytes == null)
                    {
                        _bytes = new byte[ArrayUtil.Oversize(len, 1)];
                        _bytesReader = new ByteArrayDataInput();
                    }
                    else if (_bytes.Length < len)
                    {
                        _bytes = new byte[ArrayUtil.Oversize(len, 1)];
                    }

                    _input.ReadBytes(_bytes, 0, len);
                    _bytesReader.Reset(_bytes, 0, len);

                    _metaDataUpto = 0;
                    _state.TermBlockOrd = 0;

                    _blocksSinceSeek++;
                    _indexIsCurrent = _indexIsCurrent && (_blocksSinceSeek < _blockTermsReader._indexReader.Divisor);

                    return true;
                }
Example #18
                // Does initial decode of next block of terms; this
                // doesn't actually decode the docFreq, totalTermFreq,
                // postings details (frq/prx offset, etc.) metadata;
                // it just loads them as byte[] blobs which are then
                // decoded on-demand if the metadata is ever requested
                // for any term in this block.  This enables terms-only
                // intensive consumes (eg certain MTQs, respelling) to
                // not pay the price of decoding metadata they won't
                // use.

                private bool NextBlock()
                {
                    // TODO: we still lazy-decode the byte[] for each
                    // term (the suffix), but, if we decoded
                    // all N terms up front then seeking could do a fast
                    // bsearch w/in the block...

                    //System.out.println("BTR.nextBlock() fp=" + in.getFilePointer() + " this=" + this);
                    state.BlockFilePointer = input.GetFilePointer();
                    blockTermCount         = input.ReadVInt32();
                    //System.out.println("  blockTermCount=" + blockTermCount);
                    if (blockTermCount == 0)
                    {
                        return false;
                    }
                    termBlockPrefix = input.ReadVInt32();

                    // term suffixes:
                    int len = input.ReadVInt32();

                    if (termSuffixes.Length < len)
                    {
                        termSuffixes = new byte[ArrayUtil.Oversize(len, 1)];
                    }
                    //System.out.println("  termSuffixes len=" + len);
                    input.ReadBytes(termSuffixes, 0, len);
                    termSuffixesReader.Reset(termSuffixes, 0, len);

                    // docFreq, totalTermFreq
                    len = input.ReadVInt32();
                    if (docFreqBytes.Length < len)
                    {
                        docFreqBytes = new byte[ArrayUtil.Oversize(len, 1)];
                    }
                    //System.out.println("  freq bytes len=" + len);
                    input.ReadBytes(docFreqBytes, 0, len);
                    freqReader.Reset(docFreqBytes, 0, len);

                    // metadata
                    len = input.ReadVInt32();
                    if (bytes == null)
                    {
                        bytes       = new byte[ArrayUtil.Oversize(len, 1)];
                        bytesReader = new ByteArrayDataInput();
                    }
                    else if (bytes.Length < len)
                    {
                        bytes = new byte[ArrayUtil.Oversize(len, 1)];
                    }
                    input.ReadBytes(bytes, 0, len);
                    bytesReader.Reset(bytes, 0, len);

                    metaDataUpto       = 0;
                    state.TermBlockOrd = 0;

                    blocksSinceSeek++;
                    indexIsCurrent = indexIsCurrent && (blocksSinceSeek < outerInstance.outerInstance.indexReader.Divisor);
                    //System.out.println("  indexIsCurrent=" + indexIsCurrent);

                    return true;
                }
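The lazy half that all three NextBlock variants set up (metaDataUpto = 0, TermBlockOrd = 0) lives in DecodeMetaData, which only decodes stats and metadata up to the term actually requested. A simplified sketch of that loop, using this last variant's field names; the real method also distinguishes absolute from delta-coded entries and fields without frequencies:

    private void DecodeMetaData()
    {
        while (metaDataUpto < state.TermBlockOrd)
        {
            // stats blob: docFreq, then totalTermFreq delta-coded on top of it
            state.DocFreq = freqReader.ReadVInt32();
            state.TotalTermFreq = state.DocFreq + freqReader.ReadVInt64();
            // postings details (frq/prx offsets, ...) are pulled from bytesReader
            // by the postings reader at this point
            metaDataUpto++;
        }
    }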