예제 #1
0
        /// <summary>Advances the enumeration to the next term.</summary>
        /// <returns>
        /// <c>true</c> if another term was read; <c>false</c> when the position held on entry was
        /// already at or beyond <c>size - 1</c>, in which case the term buffer is reset.
        /// </returns>
        public override bool Next()
        {
            // The post-increment always runs, so position advances even on the call that returns false.
            bool exhausted = position++ >= size - 1;

            // Both outcomes start by saving the current term as the previous one.
            prevBuffer.Set(termBuffer);

            if (exhausted)
            {
                termBuffer.Reset();
                return false;
            }

            termBuffer.Read(input, fieldInfos);

            termInfo.docFreq = input.ReadVInt();         // doc freq is assigned directly
            termInfo.freqPointer += input.ReadVLong();   // freq pointer is accumulated
            termInfo.proxPointer += input.ReadVLong();   // prox pointer is accumulated

            bool hasSkipOffset;
            if (format == -1)
            {
                // In format -1 the skip offset is read only to move the file pointer forward;
                // the value is never used since skipTo is switched off. Index files do not
                // carry it at all.
                hasSkipOffset = !isIndex && termInfo.docFreq > formatM1SkipInterval;
            }
            else
            {
                hasSkipOffset = termInfo.docFreq >= skipInterval;
            }

            if (hasSkipOffset)
            {
                termInfo.skipOffset = input.ReadVInt();
            }

            if (isIndex)
            {
                indexPointer += input.ReadVLong();       // index pointer is accumulated
            }

            return true;
        }
예제 #2
0
        /// <summary>Looks up the term vector stored for a single field of a single document.</summary>
        /// <param name="docNum">The document number to retrieve the vector for.</param>
        /// <param name="field">The field within the document to retrieve.</param>
        /// <returns>
        /// The TermFreqVector for the document and field, or <c>null</c> when <c>tvx</c> is not
        /// available or the field is not listed among the document's term vector fields.
        /// </returns>
        /// <exception cref="System.IO.IOException">If there is an error reading the term vector files.</exception>
        public /*internal*/ virtual TermFreqVector Get(int docNum, System.String field)
        {
            // The field number is resolved first, before checking whether tvx exists.
            int wantedFieldNumber = fieldInfos.FieldNumber(field);

            if (tvx == null)
            {
                // No tvx file
                return null;
            }

            // Only this seek has to account for FORMAT_SIZE; every other seek uses a file
            // pointer that was itself written into another file.
            tvx.Seek((docNum * 8L) + TermVectorsWriter.FORMAT_SIZE);
            long tvdPointer = tvx.ReadLong();

            tvd.Seek(tvdPointer);
            int fieldCount = tvd.ReadVInt();

            // FORMAT_VERSION stores each field number directly; otherwise each value read is
            // added to the running number.
            bool absoluteNumbers = tvdFormat == TermVectorsWriter.FORMAT_VERSION;

            // There are only a few fields per document, so a full scan is used instead of
            // requiring them to be ordered. Every entry must be read anyway to reach the tvf
            // pointers that follow.
            int currentNumber = 0;
            int matchIndex = -1;
            for (int slot = 0; slot < fieldCount; slot++)
            {
                int raw = tvd.ReadVInt();
                currentNumber = absoluteNumbers ? raw : currentNumber + raw;

                if (currentNumber == wantedFieldNumber)
                {
                    matchIndex = slot;
                }
            }

            if (matchIndex == -1)
            {
                // The field, although valid in the segment, was not found in this document.
                return null;
            }

            // The tvf position is the sum of the pointer values up to and including the match.
            long tvfPointer = 0;
            for (int read = 0; read <= matchIndex; read++)
            {
                tvfPointer += tvd.ReadVLong();
            }

            return ReadTermVector(field, tvfPointer);
        }
예제 #3
0
        /// <summary>
        /// Locates the term vector of one field of one document and passes it to
        /// <paramref name="mapper"/>.
        /// </summary>
        /// <remarks>
        /// Nothing happens when <c>tvx</c> is not available or when the field is not listed among
        /// the document's term vector fields.
        /// </remarks>
        /// <param name="docNum">The document number to read.</param>
        /// <param name="field">The field within the document to read.</param>
        /// <param name="mapper">Receives the document number and then the term vector data.</param>
        public virtual void  Get(int docNum, System.String field, TermVectorMapper mapper)
        {
            if (tvx == null)
            {
                // No tvx file
                return;
            }

            int wantedFieldNumber = fieldInfos.FieldNumber(field);

            // NOTE(review): SeekTvx is not shown here; it is presumably what accounts for the
            // format header when positioning tvx on this document's entry - confirm.
            SeekTvx(docNum);
            long tvdPointer = tvx.ReadLong();

            tvd.Seek(tvdPointer);
            int fieldCount = tvd.ReadVInt();

            // From FORMAT_VERSION on each field number is stored directly; before that each
            // value read is added to the running number.
            bool absoluteNumbers = format >= FORMAT_VERSION;

            // There are only a few fields per document, so a full scan is used instead of
            // requiring them to be ordered. Every entry must be read anyway to reach the tvf
            // pointers that follow.
            int currentNumber = 0;
            int matchIndex = -1;
            for (int slot = 0; slot < fieldCount; slot++)
            {
                int raw = tvd.ReadVInt();
                currentNumber = absoluteNumbers ? raw : currentNumber + raw;

                if (currentNumber == wantedFieldNumber)
                {
                    matchIndex = slot;
                }
            }

            if (matchIndex == -1)
            {
                // The field, although valid in the segment, was not found in this document.
                return;
            }

            // The first tvf pointer comes from tvx for FORMAT_VERSION2 and later, from tvd
            // otherwise; the remaining values up to the matched field are added on top of it.
            long tvfPointer = format >= FORMAT_VERSION2 ? tvx.ReadLong() : tvd.ReadVLong();
            for (int read = 1; read <= matchIndex; read++)
            {
                tvfPointer += tvd.ReadVLong();
            }

            mapper.SetDocumentNumber(docNum);
            ReadTermVector(field, tvfPointer, mapper);
        }
        internal readonly PackedInts.Reader[] StartPointersDeltas; // delta from the avg

        #endregion Fields

        #region Constructors

        /// <summary>
        /// Loads the complete stored-fields index from <paramref name="fieldsIndexIn"/> into
        /// per-block arrays.
        /// </summary>
        /// <remarks>
        /// It is the responsibility of the caller to close <paramref name="fieldsIndexIn"/> after
        /// this constructor has been called.
        /// </remarks>
        /// <param name="fieldsIndexIn">Input to read the index from; it is read sequentially.</param>
        /// <param name="si">Segment info supplying the document count stored in <c>MaxDoc</c>.</param>
        /// <exception cref="CorruptIndexException">
        /// If a block declares more than 32 bits per doc base or more than 64 bits per start pointer.
        /// </exception>
        internal CompressingStoredFieldsIndexReader(IndexInput fieldsIndexIn, SegmentInfo si)
        {
            MaxDoc = si.DocCount;

            // Working arrays, grown on demand and trimmed to the exact block count at the end.
            int[] blockDocBases = new int[16];
            long[] blockStartPointers = new long[16];
            int[] blockAvgChunkDocs = new int[16];
            long[] blockAvgChunkSizes = new long[16];
            PackedInts.Reader[] blockDocBasesDeltas = new PackedInts.Reader[16];
            PackedInts.Reader[] blockStartPointersDeltas = new PackedInts.Reader[16];

            int packedVersion = fieldsIndexIn.ReadVInt();

            int blocksRead = 0;
            int chunkCount;

            // A chunk count of zero terminates the list of blocks.
            while ((chunkCount = fieldsIndexIn.ReadVInt()) != 0)
            {
                if (blocksRead == blockDocBases.Length)
                {
                    // All six arrays share one capacity, so they are grown together.
                    int grownSize = ArrayUtil.Oversize(blocksRead + 1, 8);
                    blockDocBases = Arrays.CopyOf(blockDocBases, grownSize);
                    blockStartPointers = Arrays.CopyOf(blockStartPointers, grownSize);
                    blockAvgChunkDocs = Arrays.CopyOf(blockAvgChunkDocs, grownSize);
                    blockAvgChunkSizes = Arrays.CopyOf(blockAvgChunkSizes, grownSize);
                    blockDocBasesDeltas = Arrays.CopyOf(blockDocBasesDeltas, grownSize);
                    blockStartPointersDeltas = Arrays.CopyOf(blockStartPointersDeltas, grownSize);
                }

                // Doc bases: base value, average docs per chunk, then the packed deltas.
                blockDocBases[blocksRead] = fieldsIndexIn.ReadVInt();
                blockAvgChunkDocs[blocksRead] = fieldsIndexIn.ReadVInt();
                int docBaseBits = fieldsIndexIn.ReadVInt();
                if (docBaseBits > 32)
                {
                    throw new CorruptIndexException("Corrupted bitsPerDocBase (resource=" + fieldsIndexIn + ")");
                }
                blockDocBasesDeltas[blocksRead] = PackedInts.GetReaderNoHeader(
                    fieldsIndexIn, PackedInts.Format.PACKED, packedVersion, chunkCount, docBaseBits);

                // Start pointers: base value, average chunk size, then the packed deltas.
                blockStartPointers[blocksRead] = fieldsIndexIn.ReadVLong();
                blockAvgChunkSizes[blocksRead] = fieldsIndexIn.ReadVLong();
                int startPointerBits = fieldsIndexIn.ReadVInt();
                if (startPointerBits > 64)
                {
                    throw new CorruptIndexException("Corrupted bitsPerStartPointer (resource=" + fieldsIndexIn + ")");
                }
                blockStartPointersDeltas[blocksRead] = PackedInts.GetReaderNoHeader(
                    fieldsIndexIn, PackedInts.Format.PACKED, packedVersion, chunkCount, startPointerBits);

                blocksRead++;
            }

            // Publish copies trimmed to the number of blocks actually read.
            this.DocBases = Arrays.CopyOf(blockDocBases, blocksRead);
            this.StartPointers = Arrays.CopyOf(blockStartPointers, blocksRead);
            this.AvgChunkDocs = Arrays.CopyOf(blockAvgChunkDocs, blocksRead);
            this.AvgChunkSizes = Arrays.CopyOf(blockAvgChunkSizes, blocksRead);
            this.DocBasesDeltas = Arrays.CopyOf(blockDocBasesDeltas, blocksRead);
            this.StartPointersDeltas = Arrays.CopyOf(blockStartPointersDeltas, blocksRead);
        }
        /// <summary>
        /// Reads one numeric doc-values entry from <paramref name="meta"/>.
        /// </summary>
        /// <remarks>
        /// The common fields (format, missing offset, packed-ints version, offset, count and block
        /// size) are read first, in that order, followed by whatever extra data the format
        /// requires: min value and GCD for GCD_COMPRESSED, the value table for TABLE_COMPRESSED,
        /// and nothing for DELTA_COMPRESSED.
        /// </remarks>
        /// <param name="meta">Metadata input to read the entry from.</param>
        /// <returns>The populated entry.</returns>
        /// <exception cref="CorruptIndexException">
        /// If the format is unknown, or a TABLE_COMPRESSED entry declares more than
        /// <see cref="int.MaxValue"/> values or more than 256 distinct values.
        /// </exception>
        internal static NumericEntry ReadNumericEntry(IndexInput meta)
        {
            NumericEntry entry = new NumericEntry();
            entry.Format = meta.ReadVInt();
            entry.MissingOffset = meta.ReadLong();
            entry.PackedIntsVersion = meta.ReadVInt();
            entry.Offset = meta.ReadLong();
            entry.Count = meta.ReadVLong();
            entry.BlockSize = meta.ReadVInt();

            switch (entry.Format)
            {
                case Lucene45DocValuesConsumer.GCD_COMPRESSED:
                    entry.MinValue = meta.ReadLong();
                    entry.Gcd = meta.ReadLong();
                    break;

                case Lucene45DocValuesConsumer.TABLE_COMPRESSED:
                    // Invalid metadata is reported as index corruption rather than as a bare
                    // System.Exception, so callers can tell it apart from unrelated failures.
                    if (entry.Count > int.MaxValue)
                    {
                        throw new CorruptIndexException("Cannot use TABLE_COMPRESSED with more than MAX_VALUE values, input=" + meta);
                    }
                    int uniqueValues = meta.ReadVInt();
                    if (uniqueValues > 256)
                    {
                        throw new CorruptIndexException("TABLE_COMPRESSED cannot have more than 256 distinct values, input=" + meta);
                    }
                    entry.Table = new long[uniqueValues];
                    for (int i = 0; i < uniqueValues; ++i)
                    {
                        entry.Table[i] = meta.ReadLong();
                    }
                    break;

                case Lucene45DocValuesConsumer.DELTA_COMPRESSED:
                    // No format-specific data follows the common fields.
                    break;

                default:
                    throw new CorruptIndexException("Unknown format: " + entry.Format + ", input=" + meta);
            }

            return entry;
        }
        /// <summary>
        /// Reads one binary doc-values entry from <paramref name="meta"/>.
        /// </summary>
        /// <remarks>
        /// The common fields (format, missing offset, min length, max length, count and offset)
        /// are read first, in that order, followed by whatever extra data the format requires:
        /// nothing for BINARY_FIXED_UNCOMPRESSED; address interval, addresses offset, packed-ints
        /// version and block size for BINARY_PREFIX_COMPRESSED; addresses offset, packed-ints
        /// version and block size for BINARY_VARIABLE_UNCOMPRESSED.
        /// </remarks>
        /// <param name="meta">Metadata input to read the entry from.</param>
        /// <returns>The populated entry.</returns>
        /// <exception cref="CorruptIndexException">If the format is unknown.</exception>
        internal static BinaryEntry ReadBinaryEntry(IndexInput meta)
        {
            BinaryEntry entry = new BinaryEntry();
            entry.Format = meta.ReadVInt();
            entry.MissingOffset = meta.ReadLong();
            entry.MinLength = meta.ReadVInt();
            entry.MaxLength = meta.ReadVInt();
            entry.Count = meta.ReadVLong();
            entry.Offset = meta.ReadLong();

            switch (entry.Format)
            {
                case Lucene45DocValuesConsumer.BINARY_FIXED_UNCOMPRESSED:
                    // No format-specific data follows the common fields.
                    break;

                case Lucene45DocValuesConsumer.BINARY_PREFIX_COMPRESSED:
                    entry.AddressInterval = meta.ReadVInt();
                    entry.AddressesOffset = meta.ReadLong();
                    entry.PackedIntsVersion = meta.ReadVInt();
                    entry.BlockSize = meta.ReadVInt();
                    break;

                case Lucene45DocValuesConsumer.BINARY_VARIABLE_UNCOMPRESSED:
                    entry.AddressesOffset = meta.ReadLong();
                    entry.PackedIntsVersion = meta.ReadVInt();
                    entry.BlockSize = meta.ReadVInt();
                    break;

                default:
                    // An unrecognized format is reported as index corruption rather than as a
                    // bare System.Exception, so callers can tell it apart from unrelated failures.
                    throw new CorruptIndexException("Unknown format: " + entry.Format + ", input=" + meta);
            }

            return entry;
        }