public void Read(IndexInput input, FieldInfos fieldInfos)
{
    this.Term = null; // invalidate cache
    NewSuffixStart = input.ReadVInt();
    int length = input.ReadVInt();
    int totalLength = NewSuffixStart + length;
    Debug.Assert(totalLength <= ByteBlockPool.BYTE_BLOCK_SIZE - 2, "termLength=" + totalLength + ",resource=" + input);
    if (Bytes.Bytes.Length < totalLength)
    {
        Bytes.Grow(totalLength);
    }
    Bytes.Length = totalLength;
    input.ReadBytes(Bytes.Bytes, NewSuffixStart, length);
    int fieldNumber = input.ReadVInt();
    if (fieldNumber != CurrentFieldNumber)
    {
        CurrentFieldNumber = fieldNumber;
        // NOTE: too much sneakiness here, seriously this is a negative vint?!
        if (CurrentFieldNumber == -1)
        {
            Field = "";
        }
        else
        {
            Debug.Assert(fieldInfos.FieldInfo(CurrentFieldNumber) != null, CurrentFieldNumber.ToString());
            Field = String.Intern(fieldInfos.FieldInfo(CurrentFieldNumber).Name);
        }
    }
    else
    {
        Debug.Assert(Field.Equals(fieldInfos.FieldInfo(fieldNumber).Name),
            "currentFieldNumber=" + CurrentFieldNumber + " field=" + Field + " vs " +
            (fieldInfos.FieldInfo(fieldNumber) == null ? "null" : fieldInfos.FieldInfo(fieldNumber).Name));
    }
}
public override BytesRef Next()
{
    if (nextTerm >= numTerms)
    {
        return null;
    }
    term.CopyBytes(lastTerm);
    int start = tvf.ReadVInt32();
    int deltaLen = tvf.ReadVInt32();
    term.Length = start + deltaLen;
    term.Grow(term.Length);
    tvf.ReadBytes(term.Bytes, start, deltaLen);
    freq = tvf.ReadVInt32();

    if (storePayloads)
    {
        positions = new int[freq];
        payloadOffsets = new int[freq];
        int totalPayloadLength = 0;
        int pos = 0;
        for (int posUpto = 0; posUpto < freq; posUpto++)
        {
            int code = tvf.ReadVInt32();
            pos += (int)((uint)code >> 1);
            positions[posUpto] = pos;
            if ((code & 1) != 0)
            {
                // length change
                lastPayloadLength = tvf.ReadVInt32();
            }
            payloadOffsets[posUpto] = totalPayloadLength;
            totalPayloadLength += lastPayloadLength;
            Debug.Assert(totalPayloadLength >= 0);
        }
        payloadData = new byte[totalPayloadLength];
        tvf.ReadBytes(payloadData, 0, payloadData.Length);
    } // no payloads
    else if (storePositions)
    {
        // TODO: we could maybe reuse last array, if we can
        // somehow be careful about consumer never using two
        // D&PEnums at once...
        positions = new int[freq];
        int pos = 0;
        for (int posUpto = 0; posUpto < freq; posUpto++)
        {
            pos += tvf.ReadVInt32();
            positions[posUpto] = pos;
        }
    }

    if (storeOffsets)
    {
        startOffsets = new int[freq];
        endOffsets = new int[freq];
        int offset = 0;
        for (int posUpto = 0; posUpto < freq; posUpto++)
        {
            startOffsets[posUpto] = offset + tvf.ReadVInt32();
            offset = endOffsets[posUpto] = startOffsets[posUpto] + tvf.ReadVInt32();
        }
    }

    lastTerm.CopyBytes(term);
    nextTerm++;
    return term;
}
public override void ReadBytes(byte[] b, int offset, int len)
{
    _indexInput.ReadBytes(b, offset, len);
}
public override BytesRef Next()
{
    if (NextTerm >= NumTerms)
    {
        return null;
    }
    Term_Renamed.CopyBytes(LastTerm);
    int start = Tvf.ReadVInt();
    int deltaLen = Tvf.ReadVInt();
    Term_Renamed.Length = start + deltaLen;
    Term_Renamed.Grow(Term_Renamed.Length);
    Tvf.ReadBytes(Term_Renamed.Bytes, start, deltaLen);
    Freq = Tvf.ReadVInt();

    if (StorePayloads)
    {
        Positions = new int[Freq];
        PayloadOffsets = new int[Freq];
        int totalPayloadLength = 0;
        int pos = 0;
        for (int posUpto = 0; posUpto < Freq; posUpto++)
        {
            int code = Tvf.ReadVInt();
            pos += (int)((uint)code >> 1);
            Positions[posUpto] = pos;
            if ((code & 1) != 0)
            {
                // length change
                LastPayloadLength = Tvf.ReadVInt();
            }
            PayloadOffsets[posUpto] = totalPayloadLength;
            totalPayloadLength += LastPayloadLength;
            Debug.Assert(totalPayloadLength >= 0);
        }
        PayloadData = new sbyte[totalPayloadLength];
        Tvf.ReadBytes(PayloadData, 0, PayloadData.Length);
    } // no payloads
    else if (StorePositions)
    {
        // TODO: we could maybe reuse last array, if we can
        // somehow be careful about consumer never using two
        // D&PEnums at once...
        Positions = new int[Freq];
        int pos = 0;
        for (int posUpto = 0; posUpto < Freq; posUpto++)
        {
            pos += Tvf.ReadVInt();
            Positions[posUpto] = pos;
        }
    }

    if (StoreOffsets)
    {
        StartOffsets = new int[Freq];
        EndOffsets = new int[Freq];
        int offset = 0;
        for (int posUpto = 0; posUpto < Freq; posUpto++)
        {
            StartOffsets[posUpto] = offset + Tvf.ReadVInt();
            offset = EndOffsets[posUpto] = StartOffsets[posUpto] + Tvf.ReadVInt();
        }
    }

    LastTerm.CopyBytes(Term_Renamed);
    NextTerm++;
    return Term_Renamed;
}
// Does initial decode of next block of terms; this
// doesn't actually decode the docFreq, totalTermFreq,
// postings details (frq/prx offset, etc.) metadata;
// it just loads them as byte[] blobs which are then
// decoded on-demand if the metadata is ever requested
// for any term in this block. This enables terms-only
// intensive consumes (eg certain MTQs, respelling) to
// not pay the price of decoding metadata they won't
// use.
private bool NextBlock()
{
    // TODO: we still lazy-decode the byte[] for each
    // term (the suffix), but, if we decoded
    // all N terms up front then seeking could do a fast
    // bsearch w/in the block...

    //System.out.println("BTR.nextBlock() fp=" + in.getFilePointer() + " this=" + this);
    state.BlockFilePointer = input.GetFilePointer();
    blockTermCount = input.ReadVInt32();
    //System.out.println(" blockTermCount=" + blockTermCount);
    if (blockTermCount == 0)
    {
        return false;
    }
    termBlockPrefix = input.ReadVInt32();

    // term suffixes:
    int len = input.ReadVInt32();
    if (termSuffixes.Length < len)
    {
        termSuffixes = new byte[ArrayUtil.Oversize(len, 1)];
    }
    //System.out.println(" termSuffixes len=" + len);
    input.ReadBytes(termSuffixes, 0, len);
    termSuffixesReader.Reset(termSuffixes, 0, len);

    // docFreq, totalTermFreq
    len = input.ReadVInt32();
    if (docFreqBytes.Length < len)
    {
        docFreqBytes = new byte[ArrayUtil.Oversize(len, 1)];
    }
    //System.out.println(" freq bytes len=" + len);
    input.ReadBytes(docFreqBytes, 0, len);
    freqReader.Reset(docFreqBytes, 0, len);

    // metadata
    len = input.ReadVInt32();
    if (bytes == null)
    {
        bytes = new byte[ArrayUtil.Oversize(len, 1)];
        bytesReader = new ByteArrayDataInput();
    }
    else if (bytes.Length < len)
    {
        bytes = new byte[ArrayUtil.Oversize(len, 1)];
    }
    input.ReadBytes(bytes, 0, len);
    bytesReader.Reset(bytes, 0, len);

    metaDataUpto = 0;
    state.TermBlockOrd = 0;
    blocksSinceSeek++;
    indexIsCurrent = indexIsCurrent && (blocksSinceSeek < outerInstance.outerInstance.indexReader.Divisor);
    //System.out.println(" indexIsCurrent=" + indexIsCurrent);

    return true;
}
protected override void ReadInternal(byte[] b, int offset, int length)
{
    SimOutage();
    @delegate.Seek(GetFilePointer());
    @delegate.ReadBytes(b, offset, length);
}
private NumericDocValues LoadByteField(FieldInfo field, IndexInput input)
{
    CodecUtil.CheckHeader(input, Lucene40DocValuesFormat.INTS_CODEC_NAME, Lucene40DocValuesFormat.INTS_VERSION_START, Lucene40DocValuesFormat.INTS_VERSION_CURRENT);
    int valueSize = input.ReadInt();
    if (valueSize != 1)
    {
        throw new CorruptIndexException("invalid valueSize: " + valueSize);
    }
    int maxDoc = State.SegmentInfo.DocCount;
    var values = new byte[maxDoc];
    input.ReadBytes(values, 0, values.Length);
    RamBytesUsed_Renamed.AddAndGet(RamUsageEstimator.SizeOf(values));
    return new NumericDocValuesAnonymousInnerClassHelper3(this, values);
}
protected override void ReadInternal(byte[] b, int offset, int length)
{
    SimOutage();
    @delegate.Seek(Position); // LUCENENET specific: Renamed from getFilePointer() to match FileStream
    @delegate.ReadBytes(b, offset, length);
}
/// <summary>
/// Sole constructor. </summary>
public BlockTreeTermsReader(Directory dir, FieldInfos fieldInfos, SegmentInfo info, PostingsReaderBase postingsReader, IOContext ioContext, string segmentSuffix, int indexDivisor)
{
    if (!InstanceFieldsInitialized)
    {
        InitializeInstanceFields();
        InstanceFieldsInitialized = true;
    }
    this.PostingsReader = postingsReader;
    this.Segment = info.Name;
    @in = dir.OpenInput(IndexFileNames.SegmentFileName(Segment, segmentSuffix, BlockTreeTermsWriter.TERMS_EXTENSION), ioContext);

    bool success = false;
    IndexInput indexIn = null;

    try
    {
        Version = ReadHeader(@in);
        if (indexDivisor != -1)
        {
            indexIn = dir.OpenInput(IndexFileNames.SegmentFileName(Segment, segmentSuffix, BlockTreeTermsWriter.TERMS_INDEX_EXTENSION), ioContext);
            int indexVersion = ReadIndexHeader(indexIn);
            if (indexVersion != Version)
            {
                throw new CorruptIndexException("mixmatched version files: " + @in + "=" + Version + "," + indexIn + "=" + indexVersion);
            }
        }

        // verify
        if (indexIn != null && Version >= BlockTreeTermsWriter.VERSION_CHECKSUM)
        {
            CodecUtil.ChecksumEntireFile(indexIn);
        }

        // Have PostingsReader init itself
        postingsReader.Init(@in);

        // Read per-field details
        SeekDir(@in, DirOffset);
        if (indexDivisor != -1)
        {
            SeekDir(indexIn, IndexDirOffset);
        }

        int numFields = @in.ReadVInt();
        if (numFields < 0)
        {
            throw new CorruptIndexException("invalid numFields: " + numFields + " (resource=" + @in + ")");
        }

        for (int i = 0; i < numFields; i++)
        {
            int field = @in.ReadVInt();
            long numTerms = @in.ReadVLong();
            Debug.Assert(numTerms >= 0);
            int numBytes = @in.ReadVInt();
            BytesRef rootCode = new BytesRef(new byte[numBytes]);
            @in.ReadBytes(rootCode.Bytes, 0, numBytes);
            rootCode.Length = numBytes;
            FieldInfo fieldInfo = fieldInfos.FieldInfo(field);
            Debug.Assert(fieldInfo != null, "field=" + field);
            long sumTotalTermFreq = fieldInfo.FieldIndexOptions == FieldInfo.IndexOptions.DOCS_ONLY ? -1 : @in.ReadVLong();
            long sumDocFreq = @in.ReadVLong();
            int docCount = @in.ReadVInt();
            int longsSize = Version >= BlockTreeTermsWriter.VERSION_META_ARRAY ? @in.ReadVInt() : 0;
            if (docCount < 0 || docCount > info.DocCount) // #docs with field must be <= #docs
            {
                throw new CorruptIndexException("invalid docCount: " + docCount + " maxDoc: " + info.DocCount + " (resource=" + @in + ")");
            }
            if (sumDocFreq < docCount) // #postings must be >= #docs with field
            {
                throw new CorruptIndexException("invalid sumDocFreq: " + sumDocFreq + " docCount: " + docCount + " (resource=" + @in + ")");
            }
            if (sumTotalTermFreq != -1 && sumTotalTermFreq < sumDocFreq) // #positions must be >= #postings
            {
                throw new CorruptIndexException("invalid sumTotalTermFreq: " + sumTotalTermFreq + " sumDocFreq: " + sumDocFreq + " (resource=" + @in + ")");
            }
            long indexStartFP = indexDivisor != -1 ? indexIn.ReadVLong() : 0;
            if (Fields.ContainsKey(fieldInfo.Name))
            {
                throw new CorruptIndexException("duplicate field: " + fieldInfo.Name + " (resource=" + @in + ")");
            }
            else
            {
                Fields[fieldInfo.Name] = new FieldReader(this, fieldInfo, numTerms, rootCode, sumTotalTermFreq, sumDocFreq, docCount, indexStartFP, longsSize, indexIn);
            }
        }

        if (indexDivisor != -1)
        {
            indexIn.Dispose();
        }

        success = true;
    }
    finally
    {
        if (!success)
        {
            // this.close() will close in:
            IOUtils.CloseWhileHandlingException(indexIn, this);
        }
    }
}
// Does initial decode of next block of terms; this
// doesn't actually decode the docFreq, totalTermFreq,
// postings details (frq/prx offset, etc.) metadata;
// it just loads them as byte[] blobs which are then
// decoded on-demand if the metadata is ever requested
// for any term in this block. This enables terms-only
// intensive consumes (eg certain MTQs, respelling) to
// not pay the price of decoding metadata they won't
// use.
private bool NextBlock()
{
    // TODO: we still lazy-decode the byte[] for each
    // term (the suffix), but, if we decoded
    // all N terms up front then seeking could do a fast
    // bsearch w/in the block...

    _state.BlockFilePointer = _input.FilePointer;
    _blockTermCount = _input.ReadVInt();
    if (_blockTermCount == 0)
    {
        return false;
    }
    _termBlockPrefix = _input.ReadVInt();

    // term suffixes:
    int len = _input.ReadVInt();
    if (_termSuffixes.Length < len)
    {
        _termSuffixes = new byte[ArrayUtil.Oversize(len, 1)];
    }
    //System.out.println(" termSuffixes len=" + len);
    _input.ReadBytes(_termSuffixes, 0, len);
    _termSuffixesReader.Reset(_termSuffixes, 0, len);

    // docFreq, totalTermFreq
    len = _input.ReadVInt();
    if (_docFreqBytes.Length < len)
    {
        _docFreqBytes = new byte[ArrayUtil.Oversize(len, 1)];
    }
    _input.ReadBytes(_docFreqBytes, 0, len);
    _freqReader.Reset(_docFreqBytes, 0, len);

    // metadata
    len = _input.ReadVInt();
    if (_bytes == null)
    {
        _bytes = new byte[ArrayUtil.Oversize(len, 1)];
        _bytesReader = new ByteArrayDataInput();
    }
    else if (_bytes.Length < len)
    {
        _bytes = new byte[ArrayUtil.Oversize(len, 1)];
    }
    _input.ReadBytes(_bytes, 0, len);
    _bytesReader.Reset(_bytes, 0, len);

    _metaDataUpto = 0;
    _state.TermBlockOrd = 0;
    _blocksSinceSeek++;
    _indexIsCurrent = _indexIsCurrent && (_blocksSinceSeek < _blockTermsReader._indexReader.Divisor);

    return true;
}
public override void Close()
{
    _fileMutex.WaitOne();
    try
    {
        string fileName = _name;

        // make sure it's all written out
        _indexOutput.Flush();

        long originalLength = _indexOutput.Length();
        _indexOutput.Close();

        Stream blobStream;
#if COMPRESSBLOBS
        // optionally put a compressor around the blob stream
        if (_azureDirectory.ShouldCompressFile(_name))
        {
            // unfortunately, deflate stream doesn't allow seek, and we need a seekable stream
            // to pass to the blob storage stuff, so we compress into a memory stream
            MemoryStream compressedStream = new MemoryStream();
            try
            {
                IndexInput indexInput = CacheDirectory.OpenInput(fileName);
                using (DeflateStream compressor = new DeflateStream(compressedStream, CompressionMode.Compress, true))
                {
                    // compress to compressedOutputStream
                    byte[] bytes = new byte[indexInput.Length()];
                    indexInput.ReadBytes(bytes, 0, (int)bytes.Length);
                    compressor.Write(bytes, 0, (int)bytes.Length);
                }
                indexInput.Close();

                // seek back to beginning of compressed stream
                compressedStream.Seek(0, SeekOrigin.Begin);

                Debug.WriteLine(string.Format("COMPRESSED {0} -> {1} {2}% to {3}", originalLength, compressedStream.Length, ((float)compressedStream.Length / (float)originalLength) * 100, _name));
            }
            catch
            {
                // release the compressed stream resources if an error occurs
                compressedStream.Dispose();
                throw;
            }
            blobStream = compressedStream;
        }
        else
#endif
        {
            blobStream = new StreamInput(CacheDirectory.OpenInput(fileName));
        }

        try
        {
            // push the blobStream up to the cloud
            _blob.UploadFromStream(blobStream);

            // set the metadata with the original index file properties
            _blob.Metadata["CachedLength"] = originalLength.ToString();
            _blob.Metadata["CachedLastModified"] = CacheDirectory.FileModified(fileName).ToString();
            _blob.SetMetadata();

            Debug.WriteLine(string.Format("PUT {1} bytes to {0} in cloud", _name, blobStream.Length));
        }
        finally
        {
            blobStream.Dispose();
        }

#if FULLDEBUG
        Debug.WriteLine(string.Format("CLOSED WRITESTREAM {0}", _name));
#endif

        // clean up
        _indexOutput = null;
        _blobContainer = null;
        _blob = null;
        GC.SuppressFinalize(this);
    }
    finally
    {
        _fileMutex.ReleaseMutex();
    }
}
public override void ReadBytes(byte[] b, int offset, int len)
{
    // Read from the wrapped input, then fold the bytes into the running checksum.
    main.ReadBytes(b, offset, len);
    digest.Update(b, offset, len);
}
/// <summary>
/// Prints the filename and size of each file within a given compound file.
/// Add the -extract flag to extract files to the current working directory.
/// In order to make the extracted version of the index work, you have to copy
/// the segments file from the compound index into the directory where the extracted files are stored. </summary>
///// <param name="args"> Usage: org.apache.lucene.index.IndexReader [-extract] <cfsfile> </param>
public static void Main(string[] args)
{
    string filename = null;
    bool extract = false;
    string dirImpl = null;

    int j = 0;
    while (j < args.Length)
    {
        string arg = args[j];
        if ("-extract".Equals(arg, StringComparison.Ordinal))
        {
            extract = true;
        }
        else if ("-dir-impl".Equals(arg, StringComparison.Ordinal))
        {
            if (j == args.Length - 1)
            {
                // LUCENENET specific - our wrapper console shows the correct usage
                throw new ArgumentException("ERROR: missing value for --directory-type option");
                //Console.WriteLine("ERROR: missing value for -dir-impl option");
                //Environment.Exit(1);
            }
            j++;
            dirImpl = args[j];
        }
        else if (filename == null)
        {
            filename = arg;
        }
        j++;
    }

    if (filename == null)
    {
        // LUCENENET specific - our wrapper console shows the correct usage
        throw new ArgumentException("ERROR: CFS-FILE is required");
        //Console.WriteLine("Usage: org.apache.lucene.index.CompoundFileExtractor [-extract] [-dir-impl X] <cfsfile>");
        //return;
    }

    Store.Directory dir = null;
    CompoundFileDirectory cfr = null;
    IOContext context = IOContext.READ;

    try
    {
        FileInfo file = new FileInfo(filename);
        string dirname = file.DirectoryName;
        filename = file.Name;
        if (dirImpl == null)
        {
            dir = FSDirectory.Open(new DirectoryInfo(dirname));
        }
        else
        {
            dir = CommandLineUtil.NewFSDirectory(dirImpl, new DirectoryInfo(dirname));
        }

        cfr = new CompoundFileDirectory(dir, filename, IOContext.DEFAULT, false);

        string[] files = cfr.ListAll();
        ArrayUtil.TimSort(files); // sort the array of filename so that the output is more readable

        for (int i = 0; i < files.Length; ++i)
        {
            long len = cfr.FileLength(files[i]);

            if (extract)
            {
                Console.WriteLine("extract " + files[i] + " with " + len + " bytes to local directory...");
                using IndexInput ii = cfr.OpenInput(files[i], context);
                using FileStream f = new FileStream(files[i], FileMode.Open, FileAccess.ReadWrite);

                // read and write with a small buffer, which is more effective than reading byte by byte
                byte[] buffer = new byte[1024];
                int chunk = buffer.Length;
                while (len > 0)
                {
                    int bufLen = (int)Math.Min(chunk, len);
                    ii.ReadBytes(buffer, 0, bufLen);
                    f.Write(buffer, 0, bufLen);
                    len -= bufLen;
                }
            }
            else
            {
                Console.WriteLine(files[i] + ": " + len + " bytes");
            }
        }
    }
    catch (IOException ioe)
    {
        Console.WriteLine(ioe.ToString());
        //Console.Write(ioe.StackTrace);
    }
    finally
    {
        try
        {
            if (dir != null)
            {
                dir.Dispose();
            }
            if (cfr != null)
            {
                cfr.Dispose();
            }
        }
        catch (IOException ioe)
        {
            Console.WriteLine(ioe.ToString());
            //Console.Write(ioe.StackTrace);
        }
    }
}
/// <summary> </summary>
/// <param name="field">The field to read in
/// </param>
/// <param name="tvfPointer">The pointer within the tvf file where we should start reading
/// </param>
/// <param name="mapper">The mapper used to map the TermVector
/// </param>
/// <throws> IOException </throws>
private void ReadTermVector(System.String field, long tvfPointer, TermVectorMapper mapper)
{
    // Now read the data from the specified position.
    // We don't need to offset by the FORMAT here since the pointer already includes the offset.
    tvf.Seek(tvfPointer);

    int numTerms = tvf.ReadVInt();
    //System.out.println("Num Terms: " + numTerms);
    // If no terms - return a constant empty termvector. However, this should never occur!
    if (numTerms == 0)
    {
        return;
    }

    bool storePositions;
    bool storeOffsets;

    if (format >= FORMAT_VERSION)
    {
        byte bits = tvf.ReadByte();
        storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
        storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
    }
    else
    {
        tvf.ReadVInt();
        storePositions = false;
        storeOffsets = false;
    }
    mapper.SetExpectations(field, numTerms, storeOffsets, storePositions);

    int start = 0;
    int deltaLength = 0;
    int totalLength = 0;
    byte[] byteBuffer;
    char[] charBuffer;
    bool preUTF8 = format < FORMAT_UTF8_LENGTH_IN_BYTES;

    // init the buffers
    if (preUTF8)
    {
        charBuffer = new char[10];
        byteBuffer = null;
    }
    else
    {
        charBuffer = null;
        byteBuffer = new byte[20];
    }

    for (int i = 0; i < numTerms; i++)
    {
        start = tvf.ReadVInt();
        deltaLength = tvf.ReadVInt();
        totalLength = start + deltaLength;

        System.String term;

        if (preUTF8)
        {
            // Term stored as java chars
            if (charBuffer.Length < totalLength)
            {
                char[] newCharBuffer = new char[(int)(1.5 * totalLength)];
                Array.Copy(charBuffer, 0, newCharBuffer, 0, start);
                charBuffer = newCharBuffer;
            }
            tvf.ReadChars(charBuffer, start, deltaLength);
            term = new System.String(charBuffer, 0, totalLength);
        }
        else
        {
            // Term stored as utf8 bytes
            if (byteBuffer.Length < totalLength)
            {
                byte[] newByteBuffer = new byte[(int)(1.5 * totalLength)];
                Array.Copy(byteBuffer, 0, newByteBuffer, 0, start);
                byteBuffer = newByteBuffer;
            }
            tvf.ReadBytes(byteBuffer, start, deltaLength);
            term = System.Text.Encoding.UTF8.GetString(byteBuffer, 0, totalLength);
        }

        int freq = tvf.ReadVInt();
        int[] positions = null;
        if (storePositions)
        {
            // read in the positions
            // does the mapper even care about positions?
            if (mapper.IsIgnoringPositions == false)
            {
                positions = new int[freq];
                int prevPosition = 0;
                for (int j = 0; j < freq; j++)
                {
                    positions[j] = prevPosition + tvf.ReadVInt();
                    prevPosition = positions[j];
                }
            }
            else
            {
                // we need to skip over the positions. Since these are VInts, I don't believe
                // there is any way to know for sure how far to skip
                for (int j = 0; j < freq; j++)
                {
                    tvf.ReadVInt();
                }
            }
        }

        TermVectorOffsetInfo[] offsets = null;
        if (storeOffsets)
        {
            // does the mapper even care about offsets?
            if (mapper.IsIgnoringOffsets == false)
            {
                offsets = new TermVectorOffsetInfo[freq];
                int prevOffset = 0;
                for (int j = 0; j < freq; j++)
                {
                    int startOffset = prevOffset + tvf.ReadVInt();
                    int endOffset = startOffset + tvf.ReadVInt();
                    offsets[j] = new TermVectorOffsetInfo(startOffset, endOffset);
                    prevOffset = endOffset;
                }
            }
            else
            {
                for (int j = 0; j < freq; j++)
                {
                    tvf.ReadVInt();
                    tvf.ReadVInt();
                }
            }
        }
        mapper.Map(term, freq, offsets, positions);
    }
}
/// <summary>
/// Read as a bit set. </summary>
private void ReadBits(IndexInput input)
{
    count = input.ReadInt32(); // read count
    bits = new byte[GetNumBytes(size)]; // allocate bits
    input.ReadBytes(bits, 0, bits.Length);
}
/// <summary>Read as a bit set </summary>
private void ReadBits(IndexInput input)
{
    count = input.ReadInt(); // read count
    bits = new byte[(size >> 3) + 1]; // allocate bits
    input.ReadBytes(bits, 0, bits.Length);
}
/// <summary>
/// Read as a bit set </summary>
private void ReadBits(IndexInput input)
{
    Count_Renamed = input.ReadInt(); // read count
    Bits = new byte[GetNumBytes(Size_Renamed)]; // allocate bits
    input.ReadBytes(Bits, 0, Bits.Length);
}
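Most of the readers above follow the same length-prefixed pattern: read a variable-length integer, size a buffer to match, then fill it with a single ReadBytes call. The sketch below distills that pattern into a standalone helper; the class and method names are illustrative only (not part of Lucene.NET), and it assumes the ReadVInt32/ReadBytes API shape used in several of the examples above.

using Lucene.Net.Store;

// Hypothetical helper: reads a VInt32 length prefix followed by that many raw
// bytes, mirroring the length-prefixed reads shown in the examples above.
internal static class LengthPrefixedBlockReader
{
    public static byte[] ReadBlock(IndexInput input)
    {
        int len = input.ReadVInt32();    // length stored as a variable-length int
        var buffer = new byte[len];      // allocate exactly len bytes
        input.ReadBytes(buffer, 0, len); // bulk-read the payload in one call
        return buffer;
    }
}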