/// <summary> Return all term vectors stored for this document or null if they could not be read in.
        ///
        /// </summary>
        /// <param name="docNum">The document number to retrieve the vector for
        /// </param>
        /// <returns> All term frequency vectors
        /// </returns>
        /// <throws>  IOException if there is an error reading the term vector files  </throws>
        public /*internal*/ virtual TermFreqVector[] Get(int docNum)
        {
            TermFreqVector[] result = null;
            if (tvx != null)
            {
                // Each tvx entry is a single 8-byte pointer (long); offset by
                // docStoreOffset (shared doc store) plus the fixed-size header.
                tvx.Seek(((docNum + docStoreOffset) * 8L) + FORMAT_SIZE);
                long position = tvx.ReadLong();

                tvd.Seek(position);
                int fieldCount = tvd.ReadVInt();

                // No fields are vectorized for this document
                if (fieldCount != 0)
                {
                    int             number = 0;
                    System.String[] fields = new System.String[fieldCount];

                    for (int i = 0; i < fieldCount; i++)
                    {
                        if (tvdFormat == FORMAT_VERSION)
                        {
                            // FORMAT_VERSION stores absolute field numbers.
                            number = tvd.ReadVInt();
                        }
                        else
                        {
                            // Older formats store deltas from the previous number.
                            number += tvd.ReadVInt();
                        }

                        fields[i] = fieldInfos.FieldName(number);
                    }

                    // Compute position in the tvf file; the per-field pointers
                    // are delta-encoded, so accumulate them as they are read.
                    position = 0;
                    long[] tvfPointers = new long[fieldCount];
                    for (int i = 0; i < fieldCount; i++)
                    {
                        position      += tvd.ReadVLong();
                        tvfPointers[i] = position;
                    }

                    result = ReadTermVectors(docNum, fields, tvfPointers);
                }
            }
            else
            {
                // No tvx file: term vectors were not stored for this segment.
                //System.out.println("No tvx file");
            }
            return(result);
        }
Example #2
0
        /// <summary>Reads a term from <paramref name="input"/> into this buffer and
        /// invalidates the cached Term.  Two VInts give a start offset and a length;
        /// only the [start, start+length) portion of the buffered text is replaced
        /// (presumably prefix compression against the previous term — confirm
        /// against the term writer).</summary>
        public void  Read(IndexInput input, FieldInfos fieldInfos)
        {
            this.term = null;             // invalidate cache
            int start       = input.ReadVInt();
            int length      = input.ReadVInt();
            int totalLength = start + length;

            if (preUTF8Strings)
            {
                // Pre-UTF8 index format stored chars directly: read them in place.
                text.SetLength(totalLength);
                input.ReadChars(text.result, start, length);
            }
            else
            {
                if (dirty)
                {
                    // Fully convert all bytes since bytes is dirty
                    // (text was modified without keeping bytes in sync).
                    UnicodeUtil.UTF16toUTF8(text.result, 0, text.length, bytes);
                    bytes.SetLength(totalLength);
                    input.ReadBytes(bytes.result, start, length);
                    UnicodeUtil.UTF8toUTF16(bytes.result, 0, totalLength, text);
                    dirty = false;
                }
                else
                {
                    // Incrementally convert only the UTF8 bytes that are new:
                    bytes.SetLength(totalLength);
                    input.ReadBytes(bytes.result, start, length);
                    UnicodeUtil.UTF8toUTF16(bytes.result, start, length, text);
                }
            }
            this.field = fieldInfos.FieldName(input.ReadVInt());
        }
Example #3
0
        /// <summary>Identifies readers whose fieldName->number mapping matches the
        /// merged fieldInfos, enabling a bulk byte copy of their stored fields.</summary>
        private void SetMatchingSegmentReaders()
        {
            // Slot i is non-null when the i'th reader is a SegmentReader whose
            // field numbering is identical to the merged fieldInfos.
            matchingSegmentReaders = new SegmentReader[readers.Count];

            for (int idx = 0; idx < readers.Count; idx++)
            {
                SegmentReader candidate = readers[idx] as SegmentReader;
                if (candidate == null)
                {
                    continue;   // only SegmentReaders can qualify for bulk copy
                }

                FieldInfos candidateInfos = candidate.GetFieldInfos();
                bool       identical      = true;
                for (int f = 0; identical && f < candidateInfos.Size(); f++)
                {
                    identical = fieldInfos.FieldName(f).Equals(candidateInfos.FieldName(f));
                }
                if (identical)
                {
                    matchingSegmentReaders[idx] = candidate;
                }
            }

            // Scratch buffers used for bulk-reading raw stored-field bytes.
            rawDocLengths  = new int[MAX_RAW_MERGE_DOCS];
            rawDocLengths2 = new int[MAX_RAW_MERGE_DOCS];
        }
Example #4
0
        // Currently used only by assert statement.
        // Compares (fieldNumber, termText[start..start+length)) against the
        // previously written term; negative/zero/positive like a comparator.
        private int CompareToLastTerm(int fieldNumber, char[] termText, int start, int length)
        {
            if (lastFieldNumber != fieldNumber)
            {
                int fieldCmp = String.CompareOrdinal(fieldInfos.FieldName(lastFieldNumber), fieldInfos.FieldName(fieldNumber));
                // A result of 0 is tolerated only when lastFieldNumber is the
                // initial -1 sentinel; a field named "" (empty string) can
                // legitimately compare equal then.  Two *different* field
                // numbers mapping to the same name would not be OK.
                if (fieldCmp != 0 || lastFieldNumber != -1)
                {
                    return fieldCmp;
                }
            }

            // Compare the common prefix char-by-char.
            int limit = length < lastTermTextLength ? length : lastTermTextLength;
            for (int pos = 0; pos < limit; pos++)
            {
                char prev = lastTermText[pos];
                char cur  = termText[start + pos];
                if (prev != cur)
                {
                    return prev < cur ? -1 : 1;
                }
            }

            // Prefixes are equal: the longer term sorts after the shorter one.
            if (limit < lastTermTextLength)
            {
                return 1;    // last term was longer
            }
            if (limit < length)
            {
                return -1;   // last term was shorter
            }
            return 0;
        }
		/// <summary>Reads a term from <paramref name="input"/> into this buffer,
		/// dropping any cached Term instance.</summary>
		public void  Read(IndexInput input, FieldInfos fieldInfos)
		{
			this.term = null; // the buffered text changes below, so drop the cache
			int textOffset = input.ReadVInt();
			int charCount  = input.ReadVInt();
			SetTextLength(textOffset + charCount);
			input.ReadChars(this.text, textOffset, charCount);
			this.field = fieldInfos.FieldName(input.ReadVInt());
		}
Example #6
0
        /// <summary>Refills this buffer with the next term read from
        /// <paramref name="input"/>; invalidates the cached Term.</summary>
        public void  Read(IndexInput input, FieldInfos fieldInfos)
        {
            this.term = null;             // invalidate cache
            int offset = input.ReadVInt();
            int count  = input.ReadVInt();
            SetTextLength(offset + count);
            input.ReadChars(this.text, offset, count);
            this.field = fieldInfos.FieldName(input.ReadVInt());
        }
Example #7
0
        // Currently used only by assert statement.
        // Compares (fieldNumber, termBytes[0..termBytesLength)) against the
        // previously written term, decoding both to UTF-16 first.
        private int CompareToLastTerm(int fieldNumber, byte[] termBytes, int termBytesLength)
        {
            if (lastFieldNumber != fieldNumber)
            {
                int fieldCmp = String.CompareOrdinal(fieldInfos.FieldName(lastFieldNumber), fieldInfos.FieldName(fieldNumber));
                // A result of 0 is tolerated only when lastFieldNumber is the
                // initial -1 sentinel; a field named "" (empty string) can
                // legitimately compare equal then.  Two *different* field
                // numbers mapping to the same name would not be OK.
                if (fieldCmp != 0 || lastFieldNumber != -1)
                {
                    return fieldCmp;
                }
            }

            // Decode both byte sequences into the reusable UTF-16 scratch buffers.
            UnicodeUtil.UTF8toUTF16(lastTermBytes, 0, lastTermBytesLength, utf16Result1);
            UnicodeUtil.UTF8toUTF16(termBytes, 0, termBytesLength, utf16Result2);

            int limit = utf16Result1.length < utf16Result2.length
                ? utf16Result1.length
                : utf16Result2.length;

            // Compare the common prefix code unit by code unit.
            for (int i = 0; i < limit; i++)
            {
                char a = utf16Result1.result[i];
                char b = utf16Result2.result[i];
                if (a != b)
                {
                    return a - b;
                }
            }
            // Shared prefix: the shorter decoded text sorts first.
            return utf16Result1.length - utf16Result2.length;
        }
Example #8
0
        // Reads the String[] fields; you have to pre-seek tvd to
        // the right point
        private System.String[] ReadFields(int fieldCount)
        {
            int number = 0;

            System.String[] fields = new System.String[fieldCount];

            for (int i = 0; i < fieldCount; i++)
            {
                if (format >= FORMAT_VERSION)
                {
                    number = tvd.ReadVInt();
                }
                else
                {
                    number += tvd.ReadVInt();
                }

                fields[i] = fieldInfos.FieldName(number);
            }

            return(fields);
        }
Example #9
0
        /// <summary>Reads a term from <paramref name="input"/> into this buffer and
        /// invalidates the cached Term.  Two VInts give a start offset and a length;
        /// only the [start, start+length) portion of the buffered text is replaced
        /// (presumably prefix compression against the previous term — confirm
        /// against the term writer).</summary>
        public void Read(IndexInput input, FieldInfos fieldInfos)
        {
            this.term = null; // invalidate cache
            int start = input.ReadVInt();
            int length = input.ReadVInt();
            int totalLength = start + length;
            if (preUTF8Strings)
            {
                // Pre-UTF8 index format stored chars directly: read them in place.
                text.SetLength(totalLength);
                input.ReadChars(text.result, start, length);
            }
            else
            {

                if (dirty)
                {
                    // Fully convert all bytes since bytes is dirty
                    // (text was modified without keeping bytes in sync).
                    UnicodeUtil.UTF16toUTF8(text.result, 0, text.length, bytes);
                    bytes.SetLength(totalLength);
                    input.ReadBytes(bytes.result, start, length);
                    UnicodeUtil.UTF8toUTF16(bytes.result, 0, totalLength, text);
                    dirty = false;
                }
                else
                {
                    // Incrementally convert only the UTF8 bytes that are new:
                    bytes.SetLength(totalLength);
                    input.ReadBytes(bytes.result, start, length);
                    UnicodeUtil.UTF8toUTF16(bytes.result, start, length, text);
                }
            }
            this.field = fieldInfos.FieldName(input.ReadVInt());
        }
Example #10
0
        /// <summary>Merges the stored fields of all readers into the merged
        /// segment: builds the merged FieldInfos, writes the .fnm file, and
        /// (when mergeDocStores is set) copies the stored field data itself,
        /// skipping deleted documents.</summary>
        /// <returns> The number of documents in all of the readers
        /// </returns>
        /// <throws>  CorruptIndexException if the index is corrupt </throws>
        /// <throws>  IOException if there is a low-level IO error </throws>
        private int MergeFields()
        {
            if (!mergeDocStores)
            {
                // When we are not merging by doc stores, that means
                // all segments were written as part of a single
                // autoCommit=false IndexWriter session, so their field
                // name -> number mapping are the same.  So, we start
                // with the fieldInfos of the last segment in this
                // case, to keep that numbering.
                SegmentReader sr = (SegmentReader)readers[readers.Count - 1];
                fieldInfos = (FieldInfos)sr.fieldInfos.Clone();
            }
            else
            {
                fieldInfos = new FieldInfos();                 // merge field names
            }

            // Fold every reader's field definitions into the merged fieldInfos.
            for (int i = 0; i < readers.Count; i++)
            {
                IndexReader reader = (IndexReader)readers[i];
                if (reader is SegmentReader)
                {
                    // SegmentReaders expose their per-field flags directly.
                    SegmentReader segmentReader = (SegmentReader)reader;
                    for (int j = 0; j < segmentReader.GetFieldInfos().Size(); j++)
                    {
                        FieldInfo fi = segmentReader.GetFieldInfos().FieldInfo(j);
                        fieldInfos.Add(fi.name, fi.isIndexed, fi.storeTermVector, fi.storePositionWithTermVector, fi.storeOffsetWithTermVector, !reader.HasNorms(fi.name), fi.storePayloads);
                    }
                }
                else
                {
                    // Other reader types: reconstruct per-field flags by querying
                    // each field-option category, from most to least specific.
                    AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true, false);
                    AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION), true, true, false, false);
                    AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true, false);
                    AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR), true, false, false, false);
                    AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.STORES_PAYLOADS), false, false, false, true);
                    AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.INDEXED), false, false, false, false);
                    fieldInfos.Add(reader.GetFieldNames(IndexReader.FieldOption.UNINDEXED), false);
                }
            }
            // Persist the merged field infos alongside the new segment.
            fieldInfos.Write(directory, segment + ".fnm");

            int docCount = 0;

            if (mergeDocStores)
            {
                // If the i'th reader is a SegmentReader and has
                // identical fieldName -> number mapping, then this
                // array will be non-null at position i:
                SegmentReader[] matchingSegmentReaders = new SegmentReader[readers.Count];

                // If this reader is a SegmentReader, and all of its
                // field name -> number mappings match the "merged"
                // FieldInfos, then we can do a bulk copy of the
                // stored fields:
                for (int i = 0; i < readers.Count; i++)
                {
                    IndexReader reader = (IndexReader)readers[i];
                    if (reader is SegmentReader)
                    {
                        SegmentReader segmentReader     = (SegmentReader)reader;
                        bool          same              = true;
                        FieldInfos    segmentFieldInfos = segmentReader.GetFieldInfos();
                        for (int j = 0; same && j < segmentFieldInfos.Size(); j++)
                        {
                            same = fieldInfos.FieldName(j).Equals(segmentFieldInfos.FieldName(j));
                        }
                        if (same)
                        {
                            matchingSegmentReaders[i] = segmentReader;
                        }
                    }
                }

                // Used for bulk-reading raw bytes for stored fields
                int[] rawDocLengths = new int[MAX_RAW_MERGE_DOCS];

                // for merging we don't want to compress/uncompress the data, so to tell the FieldsReader that we're
                // in  merge mode, we use this FieldSelector
                FieldSelector fieldSelectorMerge = new AnonymousClassFieldSelector(this);

                // merge field values
                FieldsWriter fieldsWriter = new FieldsWriter(directory, segment, fieldInfos);

                try
                {
                    for (int i = 0; i < readers.Count; i++)
                    {
                        IndexReader   reader = (IndexReader)readers[i];
                        SegmentReader matchingSegmentReader = matchingSegmentReaders[i];
                        FieldsReader  matchingFieldsReader;
                        if (matchingSegmentReader != null)
                        {
                            matchingFieldsReader = matchingSegmentReader.GetFieldsReader();
                        }
                        else
                        {
                            matchingFieldsReader = null;
                        }
                        int maxDoc = reader.MaxDoc();
                        // Note: j advances inside the branches, not in the for header.
                        for (int j = 0; j < maxDoc;)
                        {
                            if (!reader.IsDeleted(j))
                            {
                                // skip deleted docs
                                if (matchingSegmentReader != null)
                                {
                                    // We can optimize this case (doing a bulk
                                    // byte copy) since the field numbers are
                                    // identical
                                    int start   = j;
                                    int numDocs = 0;
                                    // Extend the run across consecutive live docs,
                                    // capped at MAX_RAW_MERGE_DOCS per bulk copy.
                                    do
                                    {
                                        j++;
                                        numDocs++;
                                    }while (j < maxDoc && !matchingSegmentReader.IsDeleted(j) && numDocs < MAX_RAW_MERGE_DOCS);

                                    IndexInput stream = matchingFieldsReader.RawDocs(rawDocLengths, start, numDocs);
                                    fieldsWriter.AddRawDocuments(stream, rawDocLengths, numDocs);
                                    docCount += numDocs;
                                    if (checkAbort != null)
                                    {
                                        // 300 appears to be a per-doc work-unit
                                        // estimate for abort checking — TODO confirm.
                                        checkAbort.Work(300 * numDocs);
                                    }
                                }
                                else
                                {
                                    // Slow path: re-read and re-write each document.
                                    fieldsWriter.AddDocument(reader.Document(j, fieldSelectorMerge));
                                    j++;
                                    docCount++;
                                    if (checkAbort != null)
                                    {
                                        checkAbort.Work(300);
                                    }
                                }
                            }
                            else
                            {
                                j++;
                            }
                        }
                    }
                }
                finally
                {
                    fieldsWriter.Close();
                }
            }
            // If we are skipping the doc stores, that means there
            // are no deletions in any of these segments, so we
            // just sum numDocs() of each segment to get total docCount
            else
            {
                for (int i = 0; i < readers.Count; i++)
                {
                    docCount += ((IndexReader)readers[i]).NumDocs();
                }
            }

            return(docCount);
        }