/// <summary> Returns all term vectors stored for this document, or null if they could not be read in. </summary>
/// <param name="docNum">The document number to retrieve the vectors for</param>
/// <returns> All term frequency vectors for the document, or null when there is no
/// term-vector index (tvx) or the document has no vectorized fields </returns>
/// <throws> IOException if there is an error reading the term vector files </throws>
public /*internal*/ virtual TermFreqVector[] Get(int docNum)
{
    TermFreqVector[] result = null;
    if (tvx != null)
    {
        // Skip the format header (FORMAT_SIZE bytes), then index into the
        // 8-byte-per-document pointer table; docStoreOffset shifts docNum
        // into a shared doc store.
        tvx.Seek(((docNum + docStoreOffset) * 8L) + FORMAT_SIZE);
        long position = tvx.ReadLong();
        tvd.Seek(position);
        int fieldCount = tvd.ReadVInt();
        // fieldCount == 0 means no fields are vectorized for this document
        if (fieldCount != 0)
        {
            int number = 0;
            System.String[] fields = new System.String[fieldCount];
            for (int i = 0; i < fieldCount; i++)
            {
                // FORMAT_VERSION writes absolute field numbers; older formats
                // write deltas from the previous field number.
                // NOTE(review): ReadFields elsewhere in this file tests
                // `format >= FORMAT_VERSION` — confirm `==` is intended here.
                if (tvdFormat == FORMAT_VERSION)
                {
                    number = tvd.ReadVInt();
                }
                else
                {
                    number += tvd.ReadVInt();
                }
                fields[i] = fieldInfos.FieldName(number);
            }

            // Compute positions in the tvf file: pointers are stored as
            // deltas, so accumulate them into absolute offsets.
            position = 0;
            long[] tvfPointers = new long[fieldCount];
            for (int i = 0; i < fieldCount; i++)
            {
                position += tvd.ReadVLong();
                tvfPointers[i] = position;
            }

            result = ReadTermVectors(docNum, fields, tvfPointers);
        }
    }
    else
    {
        //System.out.println("No tvx file");
    }
    return(result);
}
/// <summary> Reads the next prefix-compressed term from <c>input</c> into this buffer:
/// <c>start</c> is the length of the prefix shared with the previous term and
/// <c>length</c> is the number of new units appended after it. The decoded field
/// name is resolved through <c>fieldInfos</c>.
/// </summary>
public void Read(IndexInput input, FieldInfos fieldInfos)
{
    this.term = null; // invalidate cache; it will be rebuilt lazily
    int start = input.ReadVInt();   // shared prefix length with the previous term
    int length = input.ReadVInt();  // number of new chars/bytes to read
    int totalLength = start + length;
    if (preUTF8Strings)
    {
        // Older index format: term text was written directly as chars.
        text.SetLength(totalLength);
        input.ReadChars(text.result, start, length);
    }
    else
    {
        if (dirty)
        {
            // Fully convert all bytes since bytes is dirty: re-encode the
            // current UTF-16 text back to UTF-8 so the byte prefix is valid
            // before appending the new suffix bytes.
            UnicodeUtil.UTF16toUTF8(text.result, 0, text.length, bytes);
            bytes.SetLength(totalLength);
            input.ReadBytes(bytes.result, start, length);
            UnicodeUtil.UTF8toUTF16(bytes.result, 0, totalLength, text);
            dirty = false;
        }
        else
        {
            // Incrementally convert only the UTF8 bytes that are new:
            bytes.SetLength(totalLength);
            input.ReadBytes(bytes.result, start, length);
            UnicodeUtil.UTF8toUTF16(bytes.result, start, length, text);
        }
    }
    this.field = fieldInfos.FieldName(input.ReadVInt());
}
/// <summary> Populates <c>matchingSegmentReaders</c>: position i is non-null when
/// the i'th reader is a SegmentReader whose fieldName->number mapping is
/// identical to the merged <c>fieldInfos</c>, which permits a bulk copy of its
/// stored fields. Also allocates the raw-doc-length scratch buffers.
/// </summary>
private void SetMatchingSegmentReaders()
{
    int readerCount = readers.Count;
    matchingSegmentReaders = new SegmentReader[readerCount];

    for (int i = 0; i < readerCount; i++)
    {
        IndexReader candidate = (IndexReader)readers[i];
        if (!(candidate is SegmentReader))
        {
            continue;
        }
        SegmentReader segReader = (SegmentReader)candidate;
        FieldInfos segFieldInfos = segReader.GetFieldInfos();
        int numFields = segFieldInfos.Size();

        // Every field number of the segment must resolve to the same name
        // in the merged fieldInfos; bail out of the scan on first mismatch.
        bool mappingsMatch = true;
        int j = 0;
        while (mappingsMatch && j < numFields)
        {
            mappingsMatch = fieldInfos.FieldName(j).Equals(segFieldInfos.FieldName(j));
            j++;
        }
        if (mappingsMatch)
        {
            matchingSegmentReaders[i] = segReader;
        }
    }

    // Scratch buffers used for bulk-reading raw stored-field bytes.
    rawDocLengths = new int[MAX_RAW_MERGE_DOCS];
    rawDocLengths2 = new int[MAX_RAW_MERGE_DOCS];
}
// Currently used only by assert statement.
// Compares (fieldNumber, termText[start..start+length)) against the previously
// recorded term; returns <0, 0, or >0 in the usual comparator sense.
private int CompareToLastTerm(int fieldNumber, char[] termText, int start, int length)
{
    if (lastFieldNumber != fieldNumber)
    {
        int cmp = String.CompareOrdinal(fieldInfos.FieldName(lastFieldNumber), fieldInfos.FieldName(fieldNumber));
        // A field named "" (empty string) can legitimately compare equal (0)
        // here, which is OK — but it is NOT OK if two different field numbers
        // map to the same name. The lastFieldNumber != -1 guard lets the
        // "no previous field" case fall through to the text comparison.
        if (cmp != 0 || lastFieldNumber != -1)
        {
            return(cmp);
        }
    }

    // Same field: compare term text char-by-char over the shared length.
    int limit = length < lastTermTextLength ? length : lastTermTextLength;
    for (int i = 0; i < limit; i++)
    {
        char prev = lastTermText[i];
        char cur = termText[start + i];
        if (prev != cur)
        {
            return prev < cur ? -1 : 1;
        }
    }

    // Shared prefix is equal: the longer term sorts after the shorter one.
    if (limit < lastTermTextLength)
    {
        // Last term was longer
        return(1);
    }
    if (limit < length)
    {
        // Last term was shorter
        return(-1);
    }
    return(0);
}
/// <summary> Reads a prefix-compressed term from <c>input</c> into this buffer and
/// resolves its field name via <c>fieldInfos</c>.
/// </summary>
public void Read(IndexInput input, FieldInfos fieldInfos)
{
    // Invalidate the cached Term object; text/field are about to change.
    this.term = null;
    int prefixLength = input.ReadVInt();  // chars shared with the previous term
    int suffixLength = input.ReadVInt();  // new chars to append
    int newLength = prefixLength + suffixLength;
    SetTextLength(newLength);
    input.ReadChars(this.text, prefixLength, suffixLength);
    this.field = fieldInfos.FieldName(input.ReadVInt());
}
/// <summary> Reads the next term from <c>input</c>. Terms are prefix-compressed:
/// only the suffix that differs from the previous term is stored.
/// </summary>
public void Read(IndexInput input, FieldInfos fieldInfos)
{
    this.term = null; // drop the cached term; it is rebuilt on demand
    int sharedPrefix = input.ReadVInt();
    int deltaLength = input.ReadVInt();
    SetTextLength(sharedPrefix + deltaLength);
    input.ReadChars(this.text, sharedPrefix, deltaLength);
    this.field = fieldInfos.FieldName(input.ReadVInt());
}
// Currently used only by assert statement.
// Compares (fieldNumber, UTF-8 bytes termBytes[0..termBytesLength)) against the
// previously recorded term; returns <0, 0, or >0 in the usual comparator sense.
private int CompareToLastTerm(int fieldNumber, byte[] termBytes, int termBytesLength)
{
    if (lastFieldNumber != fieldNumber)
    {
        int fieldCmp = String.CompareOrdinal(fieldInfos.FieldName(lastFieldNumber), fieldInfos.FieldName(fieldNumber));
        // A field named "" may compare equal (0) here, which is acceptable;
        // it is only a problem if two *different* field numbers share a name.
        // The lastFieldNumber != -1 check handles the "no previous field" case.
        if (fieldCmp != 0 || lastFieldNumber != -1)
        {
            return(fieldCmp);
        }
    }

    // Decode both UTF-8 term byte sequences to UTF-16 and compare char-wise.
    UnicodeUtil.UTF8toUTF16(lastTermBytes, 0, lastTermBytesLength, utf16Result1);
    UnicodeUtil.UTF8toUTF16(termBytes, 0, termBytesLength, utf16Result2);

    int shared = Math.Min(utf16Result1.length, utf16Result2.length);
    for (int i = 0; i < shared; i++)
    {
        char a = utf16Result1.result[i];
        char b = utf16Result2.result[i];
        if (a != b)
        {
            return(a - b);
        }
    }
    // Common prefix equal: the shorter term sorts first.
    return(utf16Result1.length - utf16Result2.length);
}
// Reads the String[] field names for one document; caller must have pre-seeked
// tvd to the right point before invoking.
private System.String[] ReadFields(int fieldCount)
{
    System.String[] names = new System.String[fieldCount];
    int fieldNumber = 0;
    for (int i = 0; i < fieldCount; i++)
    {
        // Newer formats (>= FORMAT_VERSION) store absolute field numbers;
        // older formats store deltas from the previous field number.
        int v = tvd.ReadVInt();
        if (format >= FORMAT_VERSION)
        {
            fieldNumber = v;
        }
        else
        {
            fieldNumber += v;
        }
        names[i] = fieldInfos.FieldName(fieldNumber);
    }
    return(names);
}
/// <summary> Reads the next prefix-compressed term from <c>input</c> into this buffer:
/// <c>start</c> is the length of the prefix shared with the previous term and
/// <c>length</c> is the number of new units appended after it. The decoded field
/// name is resolved through <c>fieldInfos</c>.
/// </summary>
public void Read(IndexInput input, FieldInfos fieldInfos)
{
    this.term = null; // invalidate cache; it will be rebuilt lazily
    int start = input.ReadVInt();   // shared prefix length with the previous term
    int length = input.ReadVInt();  // number of new chars/bytes to read
    int totalLength = start + length;
    if (preUTF8Strings)
    {
        // Older index format: term text was written directly as chars.
        text.SetLength(totalLength);
        input.ReadChars(text.result, start, length);
    }
    else
    {
        if (dirty)
        {
            // Fully convert all bytes since bytes is dirty: re-encode the
            // current UTF-16 text back to UTF-8 so the byte prefix is valid
            // before appending the new suffix bytes.
            UnicodeUtil.UTF16toUTF8(text.result, 0, text.length, bytes);
            bytes.SetLength(totalLength);
            input.ReadBytes(bytes.result, start, length);
            UnicodeUtil.UTF8toUTF16(bytes.result, 0, totalLength, text);
            dirty = false;
        }
        else
        {
            // Incrementally convert only the UTF8 bytes that are new:
            bytes.SetLength(totalLength);
            input.ReadBytes(bytes.result, start, length);
            UnicodeUtil.UTF8toUTF16(bytes.result, start, length, text);
        }
    }
    this.field = fieldInfos.FieldName(input.ReadVInt());
}
/// <summary> Merges field infos and (when <c>mergeDocStores</c> is set) the stored
/// fields of all readers into the target segment. Writes the merged field
/// infos to <c>segment + ".fnm"</c>. </summary>
/// <returns> The number of documents in all of the readers </returns>
/// <throws> CorruptIndexException if the index is corrupt </throws>
/// <throws> IOException if there is a low-level IO error </throws>
private int MergeFields()
{
    if (!mergeDocStores)
    {
        // When we are not merging by doc stores, that means
        // all segments were written as part of a single
        // autoCommit=false IndexWriter session, so their field
        // name -> number mapping are the same. So, we start
        // with the fieldInfos of the last segment in this
        // case, to keep that numbering.
        SegmentReader sr = (SegmentReader)readers[readers.Count - 1];
        fieldInfos = (FieldInfos)sr.fieldInfos.Clone();
    }
    else
    {
        fieldInfos = new FieldInfos(); // merge field names
    }

    for (int i = 0; i < readers.Count; i++)
    {
        IndexReader reader = (IndexReader)readers[i];
        if (reader is SegmentReader)
        {
            // SegmentReader: copy each FieldInfo directly, preserving its flags.
            SegmentReader segmentReader = (SegmentReader)reader;
            for (int j = 0; j < segmentReader.GetFieldInfos().Size(); j++)
            {
                FieldInfo fi = segmentReader.GetFieldInfos().FieldInfo(j);
                fieldInfos.Add(fi.name, fi.isIndexed, fi.storeTermVector, fi.storePositionWithTermVector, fi.storeOffsetWithTermVector, !reader.HasNorms(fi.name), fi.storePayloads);
            }
        }
        else
        {
            // Generic reader: reconstruct field flags from its field-option views.
            AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true, false);
            AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION), true, true, false, false);
            AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true, false);
            AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR), true, false, false, false);
            AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.STORES_PAYLOADS), false, false, false, true);
            AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.INDEXED), false, false, false, false);
            fieldInfos.Add(reader.GetFieldNames(IndexReader.FieldOption.UNINDEXED), false);
        }
    }
    fieldInfos.Write(directory, segment + ".fnm");

    int docCount = 0;
    if (mergeDocStores)
    {
        // If the i'th reader is a SegmentReader and has
        // identical fieldName -> number mapping, then this
        // array will be non-null at position i:
        SegmentReader[] matchingSegmentReaders = new SegmentReader[readers.Count];

        // If this reader is a SegmentReader, and all of its
        // field name -> number mappings match the "merged"
        // FieldInfos, then we can do a bulk copy of the
        // stored fields:
        for (int i = 0; i < readers.Count; i++)
        {
            IndexReader reader = (IndexReader)readers[i];
            if (reader is SegmentReader)
            {
                SegmentReader segmentReader = (SegmentReader)reader;
                bool same = true;
                FieldInfos segmentFieldInfos = segmentReader.GetFieldInfos();
                for (int j = 0; same && j < segmentFieldInfos.Size(); j++)
                {
                    same = fieldInfos.FieldName(j).Equals(segmentFieldInfos.FieldName(j));
                }
                if (same)
                {
                    matchingSegmentReaders[i] = segmentReader;
                }
            }
        }

        // Used for bulk-reading raw bytes for stored fields
        int[] rawDocLengths = new int[MAX_RAW_MERGE_DOCS];

        // for merging we don't want to compress/uncompress the data, so to tell the FieldsReader that we're
        // in merge mode, we use this FieldSelector
        FieldSelector fieldSelectorMerge = new AnonymousClassFieldSelector(this);

        // merge field values
        FieldsWriter fieldsWriter = new FieldsWriter(directory, segment, fieldInfos);

        try
        {
            for (int i = 0; i < readers.Count; i++)
            {
                IndexReader reader = (IndexReader)readers[i];
                SegmentReader matchingSegmentReader = matchingSegmentReaders[i];
                FieldsReader matchingFieldsReader;
                if (matchingSegmentReader != null)
                {
                    matchingFieldsReader = matchingSegmentReader.GetFieldsReader();
                }
                else
                {
                    matchingFieldsReader = null;
                }
                int maxDoc = reader.MaxDoc();
                // Note: j is advanced inside the loop body, not in the for header.
                for (int j = 0; j < maxDoc;)
                {
                    if (!reader.IsDeleted(j))
                    {
                        // skip deleted docs
                        if (matchingSegmentReader != null)
                        {
                            // We can optimize this case (doing a bulk
                            // byte copy) since the field numbers are
                            // identical: copy up to MAX_RAW_MERGE_DOCS
                            // consecutive non-deleted docs as raw bytes.
                            int start = j;
                            int numDocs = 0;
                            do
                            {
                                j++;
                                numDocs++;
                            }
                            while (j < maxDoc && !matchingSegmentReader.IsDeleted(j) && numDocs < MAX_RAW_MERGE_DOCS);

                            IndexInput stream = matchingFieldsReader.RawDocs(rawDocLengths, start, numDocs);
                            fieldsWriter.AddRawDocuments(stream, rawDocLengths, numDocs);
                            docCount += numDocs;
                            if (checkAbort != null)
                            {
                                checkAbort.Work(300 * numDocs);
                            }
                        }
                        else
                        {
                            // Slow path: re-read the document field by field and
                            // re-add it (fieldSelectorMerge avoids (de)compression).
                            fieldsWriter.AddDocument(reader.Document(j, fieldSelectorMerge));
                            j++;
                            docCount++;
                            if (checkAbort != null)
                            {
                                checkAbort.Work(300);
                            }
                        }
                    }
                    else
                    {
                        j++;
                    }
                }
            }
        }
        finally
        {
            fieldsWriter.Close();
        }
    }
    // If we are skipping the doc stores, that means there
    // are no deletions in any of these segments, so we
    // just sum numDocs() of each segment to get total docCount
    else
    {
        for (int i = 0; i < readers.Count; i++)
        {
            docCount += ((IndexReader)readers[i]).NumDocs();
        }
    }
    return(docCount);
}