internal void InitializeInstanceFields() { @in = new ByteArrayDataInput(buffer); }
public override void Build(IInputEnumerator enumerator) { // LUCENENET: Added guard clause for null if (enumerator is null) { throw new ArgumentNullException(nameof(enumerator)); } if (enumerator.HasPayloads) { throw new ArgumentException("this suggester doesn't support payloads"); } if (enumerator.HasContexts) { throw new ArgumentException("this suggester doesn't support contexts"); } FileInfo tempInput = FileSupport.CreateTempFile(typeof(FSTCompletionLookup).Name, ".input", OfflineSorter.DefaultTempDir()); FileInfo tempSorted = FileSupport.CreateTempFile(typeof(FSTCompletionLookup).Name, ".sorted", OfflineSorter.DefaultTempDir()); OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput); OfflineSorter.ByteSequencesReader reader = null; ExternalRefSorter sorter = null; // Push floats up front before sequences to sort them. For now, assume they are non-negative. // If negative floats are allowed some trickery needs to be done to find their byte order. bool success = false; count = 0; try { byte[] buffer = Arrays.Empty <byte>(); ByteArrayDataOutput output = new ByteArrayDataOutput(buffer); BytesRef spare; while (enumerator.MoveNext()) { spare = enumerator.Current; if (spare.Length + 4 >= buffer.Length) { buffer = ArrayUtil.Grow(buffer, spare.Length + 4); } output.Reset(buffer); output.WriteInt32(EncodeWeight(enumerator.Weight)); output.WriteBytes(spare.Bytes, spare.Offset, spare.Length); writer.Write(buffer, 0, output.Position); } writer.Dispose(); // We don't know the distribution of scores and we need to bucket them, so we'll sort // and divide into equal buckets. OfflineSorter.SortInfo info = (new OfflineSorter()).Sort(tempInput, tempSorted); tempInput.Delete(); FSTCompletionBuilder builder = new FSTCompletionBuilder(buckets, sorter = new ExternalRefSorter(new OfflineSorter()), sharedTailLength); int inputLines = info.Lines; reader = new OfflineSorter.ByteSequencesReader(tempSorted); long line = 0; int previousBucket = 0; int previousScore = 0; ByteArrayDataInput input = new ByteArrayDataInput(); BytesRef tmp1 = new BytesRef(); BytesRef tmp2 = new BytesRef(); while (reader.Read(tmp1)) { input.Reset(tmp1.Bytes); int currentScore = input.ReadInt32(); int bucket; if (line > 0 && currentScore == previousScore) { bucket = previousBucket; } else { bucket = (int)(line * buckets / inputLines); } previousScore = currentScore; previousBucket = bucket; // Only append the input, discard the weight. tmp2.Bytes = tmp1.Bytes; tmp2.Offset = input.Position; tmp2.Length = tmp1.Length - input.Position; builder.Add(tmp2, bucket); line++; count++; } // The two FSTCompletions share the same automaton. this.higherWeightsCompletion = builder.Build(); this.normalCompletion = new FSTCompletion(higherWeightsCompletion.FST, false, exactMatchFirst); success = true; } finally { if (success) { IOUtils.Dispose(reader, writer, sorter); } else { IOUtils.DisposeWhileHandlingException(reader, writer, sorter); } tempInput.Delete(); tempSorted.Delete(); } }
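// The Build method above writes each entry as a 4-byte weight followed by the raw term bytes,
// and later reads that record back, discarding the weight prefix. A minimal round-trip sketch of
// that layout, assuming the Lucene.NET 4.8 naming (WriteInt32/ReadInt32); the class and method
// names here are illustrative and not part of FSTCompletionLookup.
using Lucene.Net.Store;
using Lucene.Net.Util;

internal static class WeightPrefixExample
{
    public static void RoundTrip()
    {
        BytesRef term = new BytesRef("lucene");
        var buffer = new byte[term.Length + 4];
        var output = new ByteArrayDataOutput(buffer);
        output.WriteInt32(42);                                     // sortable weight prefix
        output.WriteBytes(term.Bytes, term.Offset, term.Length);   // then the term itself

        var input = new ByteArrayDataInput(buffer, 0, output.Position);
        int weight = input.ReadInt32();                            // 42
        var tail = new BytesRef(buffer, input.Position,            // the term, weight stripped
                                output.Position - input.Position);
    }
}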
internal virtual void Reset(int numTerms, int flags, int[] prefixLengths, int[] suffixLengths, int[] termFreqs, int[] positionIndex, int[] positions, int[] startOffsets, int[] lengths, int[] payloadIndex, BytesRef payloads, ByteArrayDataInput @in) { this.numTerms = numTerms; this.prefixLengths = prefixLengths; this.suffixLengths = suffixLengths; this.termFreqs = termFreqs; this.positionIndex = positionIndex; this.positions = positions; this.startOffsets = startOffsets; this.lengths = lengths; this.payloadIndex = payloadIndex; this.payloads = payloads; this.@in = @in; startPos = @in.Position; Reset(); }
internal void Load(BytesRef frameIndexData) { // if (DEBUG) System.out.println(" load fp=" + fp + " fpOrig=" + fpOrig + " frameIndexData=" + frameIndexData + " trans=" + (transitions.length != 0 ? transitions[0] : "n/a" + " state=" + state)); if (frameIndexData != null && Transitions.Length != 0) { // Floor frame if (FloorData.Length < frameIndexData.Length) { this.FloorData = new byte[ArrayUtil.Oversize(frameIndexData.Length, 1)]; } System.Buffer.BlockCopy(frameIndexData.Bytes, frameIndexData.Offset, FloorData, 0, frameIndexData.Length); FloorDataReader.Reset(FloorData, 0, frameIndexData.Length); // Skip first long -- has redundant fp, hasTerms // flag, isFloor flag long code = FloorDataReader.ReadVLong(); if ((code & BlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR) != 0) { NumFollowFloorBlocks = FloorDataReader.ReadVInt(); NextFloorLabel = FloorDataReader.ReadByte() & 0xff; // if (DEBUG) System.out.println(" numFollowFloorBlocks=" + numFollowFloorBlocks + " nextFloorLabel=" + nextFloorLabel); // If current state is accept, we must process // first block in case it has empty suffix: if (OuterInstance.runAutomaton.IsAccept(state)) { // Maybe skip floor blocks: while (NumFollowFloorBlocks != 0 && NextFloorLabel <= Transitions[0].Min) { Fp = FpOrig + ((int)((uint)FloorDataReader.ReadVLong() >> 1)); NumFollowFloorBlocks--; // if (DEBUG) System.out.println(" skip floor block! nextFloorLabel=" + (char) nextFloorLabel + " vs target=" + (char) transitions[0].getMin() + " newFP=" + fp + " numFollowFloorBlocks=" + numFollowFloorBlocks); if (NumFollowFloorBlocks != 0) { NextFloorLabel = FloorDataReader.ReadByte() & 0xff; } else { NextFloorLabel = 256; } } } } } [email protected](Fp); int code_ = [email protected](); EntCount = (int)((uint)code_ >> 1); Debug.Assert(EntCount > 0); IsLastInFloor = (code_ & 1) != 0; // term suffixes: code_ = [email protected](); IsLeafBlock = (code_ & 1) != 0; int numBytes = (int)((uint)code_ >> 1); // if (DEBUG) System.out.println(" entCount=" + entCount + " lastInFloor?=" + isLastInFloor + " leafBlock?=" + isLeafBlock + " numSuffixBytes=" + numBytes); if (SuffixBytes.Length < numBytes) { SuffixBytes = new byte[ArrayUtil.Oversize(numBytes, 1)]; } [email protected](SuffixBytes, 0, numBytes); SuffixesReader.Reset(SuffixBytes, 0, numBytes); // stats numBytes = [email protected](); if (StatBytes.Length < numBytes) { StatBytes = new byte[ArrayUtil.Oversize(numBytes, 1)]; } [email protected](StatBytes, 0, numBytes); StatsReader.Reset(StatBytes, 0, numBytes); MetaDataUpto = 0; TermState.TermBlockOrd = 0; NextEnt = 0; // metadata numBytes = [email protected](); if (Bytes == null) { Bytes = new byte[ArrayUtil.Oversize(numBytes, 1)]; BytesReader = new ByteArrayDataInput(); } else if (Bytes.Length < numBytes) { Bytes = new byte[ArrayUtil.Oversize(numBytes, 1)]; } [email protected](Bytes, 0, numBytes); BytesReader.Reset(Bytes, 0, numBytes); if (!IsLastInFloor) { // Sub-blocks of a single floor block are always // written one after another -- tail recurse: FpEnd = [email protected]; } }
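// Both code words read above pack a flag into bit 0 of a VInt/VLong: the entry count is stored as
// (count << 1) | isLastInFloor, and the suffix header as (numBytes << 1) | isLeafBlock. A minimal
// encode/decode sketch of that convention, with illustrative names and the Lucene.NET 4.8
// ByteArrayData{Output,Input} API assumed:
using Lucene.Net.Store;

internal static class FlagInLowBit
{
    public static void RoundTrip()
    {
        var buffer = new byte[8];
        var output = new ByteArrayDataOutput(buffer);
        int entCount = 25;
        bool isLastInFloor = true;
        output.WriteVInt32((entCount << 1) | (isLastInFloor ? 1 : 0));   // flag lives in bit 0

        var input = new ByteArrayDataInput(buffer, 0, output.Position);
        int code = input.ReadVInt32();
        int decodedCount = (int)((uint)code >> 1);   // 25
        bool decodedFlag = (code & 1) != 0;          // true
    }
}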
public SortedSetDocValuesAnonymousInnerClassHelper(Lucene42DocValuesProducer outerInstance, FSTEntry entry, BinaryDocValues docToOrds, FST<long> fst, FST<long>.BytesReader @in, FST<long>.Arc<long> firstArc, FST<long>.Arc<long> scratchArc, IntsRef scratchInts, BytesRefFSTEnum<long> fstEnum, BytesRef @ref, ByteArrayDataInput input) { this.OuterInstance = outerInstance; this.Entry = entry; this.DocToOrds = docToOrds; this.Fst = fst; this.@in = @in; this.FirstArc = firstArc; this.ScratchArc = scratchArc; this.ScratchInts = scratchInts; this.FstEnum = fstEnum; this.@ref = @ref; this.Input = input; }
public override void VisitDocument(int docID, StoredFieldVisitor visitor) { FieldsStream.Seek(IndexReader.GetStartPointer(docID)); int docBase = FieldsStream.ReadVInt(); int chunkDocs = FieldsStream.ReadVInt(); if (docID < docBase || docID >= docBase + chunkDocs || docBase + chunkDocs > NumDocs) { throw new CorruptIndexException("Corrupted: docID=" + docID + ", docBase=" + docBase + ", chunkDocs=" + chunkDocs + ", numDocs=" + NumDocs + " (resource=" + FieldsStream + ")"); } int numStoredFields, offset, length, totalLength; if (chunkDocs == 1) { numStoredFields = FieldsStream.ReadVInt(); offset = 0; length = FieldsStream.ReadVInt(); totalLength = length; } else { int bitsPerStoredFields = FieldsStream.ReadVInt(); if (bitsPerStoredFields == 0) { numStoredFields = FieldsStream.ReadVInt(); } else if (bitsPerStoredFields > 31) { throw new CorruptIndexException("bitsPerStoredFields=" + bitsPerStoredFields + " (resource=" + FieldsStream + ")"); } else { long filePointer = FieldsStream.FilePointer; PackedInts.Reader reader = PackedInts.GetDirectReaderNoHeader(FieldsStream, PackedInts.Format.PACKED, PackedIntsVersion, chunkDocs, bitsPerStoredFields); numStoredFields = (int)(reader.Get(docID - docBase)); FieldsStream.Seek(filePointer + PackedInts.Format.PACKED.ByteCount(PackedIntsVersion, chunkDocs, bitsPerStoredFields)); } int bitsPerLength = FieldsStream.ReadVInt(); if (bitsPerLength == 0) { length = FieldsStream.ReadVInt(); offset = (docID - docBase) * length; totalLength = chunkDocs * length; } else if (bitsPerLength > 31) { throw new CorruptIndexException("bitsPerLength=" + bitsPerLength + " (resource=" + FieldsStream + ")"); } else { PackedInts.ReaderIterator it = PackedInts.GetReaderIteratorNoHeader(FieldsStream, PackedInts.Format.PACKED, PackedIntsVersion, chunkDocs, bitsPerLength, 1); int off = 0; for (int i = 0; i < docID - docBase; ++i) { off += (int)it.Next(); } offset = off; length = (int)it.Next(); off += length; for (int i = docID - docBase + 1; i < chunkDocs; ++i) { off += (int)it.Next(); } totalLength = off; } } if ((length == 0) != (numStoredFields == 0)) { throw new CorruptIndexException("length=" + length + ", numStoredFields=" + numStoredFields + " (resource=" + FieldsStream + ")"); } if (numStoredFields == 0) { // nothing to do return; } DataInput documentInput; if (Version_Renamed >= CompressingStoredFieldsWriter.VERSION_BIG_CHUNKS && totalLength >= 2 * ChunkSize_Renamed) { Debug.Assert(ChunkSize_Renamed > 0); Debug.Assert(offset < ChunkSize_Renamed); Decompressor.Decompress(FieldsStream, ChunkSize_Renamed, offset, Math.Min(length, ChunkSize_Renamed - offset), Bytes); documentInput = new DataInputAnonymousInnerClassHelper(this, offset, length); } else { BytesRef bytes = totalLength <= BUFFER_REUSE_THRESHOLD ? 
this.Bytes : new BytesRef(); Decompressor.Decompress(FieldsStream, totalLength, offset, length, bytes); Debug.Assert(bytes.Length == length); documentInput = new ByteArrayDataInput((byte[])(Array)bytes.Bytes, bytes.Offset, bytes.Length); } for (int fieldIDX = 0; fieldIDX < numStoredFields; fieldIDX++) { long infoAndBits = documentInput.ReadVLong(); int fieldNumber = (int)((long)((ulong)infoAndBits >> CompressingStoredFieldsWriter.TYPE_BITS)); FieldInfo fieldInfo = FieldInfos.FieldInfo(fieldNumber); int bits = (int)(infoAndBits & CompressingStoredFieldsWriter.TYPE_MASK); Debug.Assert(bits <= CompressingStoredFieldsWriter.NUMERIC_DOUBLE, "bits=" + bits.ToString("x")); switch (visitor.NeedsField(fieldInfo)) { case StoredFieldVisitor.Status.YES: ReadField(documentInput, visitor, fieldInfo, bits); break; case StoredFieldVisitor.Status.NO: SkipField(documentInput, bits); break; case StoredFieldVisitor.Status.STOP: return; } } }
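// The packed-lengths branch above is just a prefix-sum: the target document's offset inside the
// decompressed chunk is the sum of the lengths of the documents before it, and totalLength is the
// sum over the whole chunk. A plain-array sketch of that arithmetic, where lengths[] stands in for
// the values the PackedInts reader iterator yields (helper name is illustrative):
internal static class ChunkOffsets
{
    public static void Locate(int[] lengths, int docIndexInChunk, out int offset, out int length, out int totalLength)
    {
        int off = 0;
        for (int i = 0; i < docIndexInChunk; i++) { off += lengths[i]; }   // bytes before our doc
        offset = off;
        length = lengths[docIndexInChunk];                                  // our doc's bytes
        off += length;
        for (int i = docIndexInChunk + 1; i < lengths.Length; i++) { off += lengths[i]; }
        totalLength = off;                                                   // whole chunk
    }
}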
/* Does initial decode of next block of terms; this doesn't actually decode the docFreq, totalTermFreq, postings details (frq/prx offset, etc.) metadata; it just loads them as byte[] blobs which are then decoded on-demand if the metadata is ever requested for any term in this block. this enables terms-only intensive consumes (eg certain MTQs, respelling) to not pay the price of decoding metadata they won't use. */ internal void LoadBlock() { // Clone the IndexInput lazily, so that consumers // that just pull a TermsEnum to // seekExact(TermState) don't pay this cost: OuterInstance.InitIndexInput(); if (NextEnt != -1) { // Already loaded return; } //System.out.println("blc=" + blockLoadCount); [email protected](Fp); int code = [email protected](); EntCount = (int)((uint)code >> 1); Debug.Assert(EntCount > 0); IsLastInFloor = (code & 1) != 0; Debug.Assert(Arc == null || (IsLastInFloor || IsFloor)); // TODO: if suffixes were stored in random-access // array structure, then we could do binary search // instead of linear scan to find target term; eg // we could have simple array of offsets // term suffixes: code = [email protected](); IsLeafBlock = (code & 1) != 0; int numBytes = (int)((uint)code >> 1); if (SuffixBytes.Length < numBytes) { SuffixBytes = new byte[ArrayUtil.Oversize(numBytes, 1)]; } [email protected](SuffixBytes, 0, numBytes); SuffixesReader.Reset(SuffixBytes, 0, numBytes); /*if (DEBUG) { if (arc == null) { System.out.println(" loadBlock (next) fp=" + fp + " entCount=" + entCount + " prefixLen=" + prefix + " isLastInFloor=" + isLastInFloor + " leaf?=" + isLeafBlock); } else { System.out.println(" loadBlock (seek) fp=" + fp + " entCount=" + entCount + " prefixLen=" + prefix + " hasTerms?=" + hasTerms + " isFloor?=" + isFloor + " isLastInFloor=" + isLastInFloor + " leaf?=" + isLeafBlock); } }*/ // stats numBytes = [email protected](); if (StatBytes.Length < numBytes) { StatBytes = new byte[ArrayUtil.Oversize(numBytes, 1)]; } [email protected](StatBytes, 0, numBytes); StatsReader.Reset(StatBytes, 0, numBytes); MetaDataUpto = 0; State.TermBlockOrd = 0; NextEnt = 0; LastSubFP = -1; // TODO: we could skip this if !hasTerms; but // that's rare so won't help much // metadata numBytes = [email protected](); if (Bytes == null) { Bytes = new byte[ArrayUtil.Oversize(numBytes, 1)]; BytesReader = new ByteArrayDataInput(); } else if (Bytes.Length < numBytes) { Bytes = new byte[ArrayUtil.Oversize(numBytes, 1)]; } [email protected](Bytes, 0, numBytes); BytesReader.Reset(Bytes, 0, numBytes); // Sub-blocks of a single floor block are always // written one after another -- tail recurse: FpEnd = [email protected]; // if (DEBUG) { // System.out.println(" fpEnd=" + fpEnd); // } }
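// LoadBlock copies three length-prefixed blobs (suffixes, stats, metadata) out of the terms file
// and parks each one behind its own ByteArrayDataInput, so nothing is decoded until a caller asks
// for it. A minimal sketch of one such blob load against any Lucene.NET DataInput, assuming the
// 4.8 naming (ReadVInt32/ReadBytes); names are illustrative:
using Lucene.Net.Store;
using Lucene.Net.Util;

internal static class BlobLoader
{
    // Reads "VInt length + bytes" from 'source' into 'scratch' and points 'reader' at it.
    // The caller must keep the returned array, since it may have been reallocated.
    public static byte[] LoadBlob(DataInput source, byte[] scratch, ByteArrayDataInput reader)
    {
        int numBytes = source.ReadVInt32();
        if (scratch.Length < numBytes)
        {
            scratch = new byte[ArrayUtil.Oversize(numBytes, 1)];
        }
        source.ReadBytes(scratch, 0, numBytes);   // bulk copy, no per-term decode yet
        reader.Reset(scratch, 0, numBytes);       // decode later, on demand
        return scratch;
    }
}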
public override void Build(IInputEnumerator enumerator) { // LUCENENET: Added guard clause for null if (enumerator is null) { throw new ArgumentNullException(nameof(enumerator)); } if (enumerator.HasContexts) { throw new ArgumentException("this suggester doesn't support contexts"); } string prefix = this.GetType().Name; var directory = OfflineSorter.DefaultTempDir(); var tempInput = FileSupport.CreateTempFile(prefix, ".input", directory); var tempSorted = FileSupport.CreateTempFile(prefix, ".sorted", directory); hasPayloads = enumerator.HasPayloads; var writer = new OfflineSorter.ByteSequencesWriter(tempInput); OfflineSorter.ByteSequencesReader reader = null; var scratch = new BytesRef(); TokenStreamToAutomaton ts2a = GetTokenStreamToAutomaton(); bool success = false; count = 0; byte[] buffer = new byte[8]; try { var output = new ByteArrayDataOutput(buffer); BytesRef surfaceForm; while (enumerator.MoveNext()) { surfaceForm = enumerator.Current; ISet <Int32sRef> paths = ToFiniteStrings(surfaceForm, ts2a); maxAnalyzedPathsForOneInput = Math.Max(maxAnalyzedPathsForOneInput, paths.Count); foreach (Int32sRef path in paths) { Util.Fst.Util.ToBytesRef(path, scratch); // length of the analyzed text (FST input) if (scratch.Length > ushort.MaxValue - 2) { throw new ArgumentException("cannot handle analyzed forms > " + (ushort.MaxValue - 2) + " in length (got " + scratch.Length + ")"); } ushort analyzedLength = (ushort)scratch.Length; // compute the required length: // analyzed sequence + weight (4) + surface + analyzedLength (short) int requiredLength = analyzedLength + 4 + surfaceForm.Length + 2; BytesRef payload; if (hasPayloads) { if (surfaceForm.Length > (ushort.MaxValue - 2)) { throw new ArgumentException("cannot handle surface form > " + (ushort.MaxValue - 2) + " in length (got " + surfaceForm.Length + ")"); } payload = enumerator.Payload; // payload + surfaceLength (short) requiredLength += payload.Length + 2; } else { payload = null; } buffer = ArrayUtil.Grow(buffer, requiredLength); output.Reset(buffer); output.WriteInt16((short)analyzedLength); output.WriteBytes(scratch.Bytes, scratch.Offset, scratch.Length); output.WriteInt32(EncodeWeight(enumerator.Weight)); if (hasPayloads) { for (int i = 0; i < surfaceForm.Length; i++) { if (surfaceForm.Bytes[i] == PAYLOAD_SEP) { throw new ArgumentException( "surface form cannot contain unit separator character U+001F; this character is reserved"); } } output.WriteInt16((short)surfaceForm.Length); output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length); output.WriteBytes(payload.Bytes, payload.Offset, payload.Length); } else { output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length); } if (Debugging.AssertsEnabled) { Debugging.Assert(output.Position == requiredLength, "{0} vs {1}", output.Position, requiredLength); } writer.Write(buffer, 0, output.Position); } count++; } writer.Dispose(); // Sort all input/output pairs (required by FST.Builder): (new OfflineSorter(new AnalyzingComparer(hasPayloads))).Sort(tempInput, tempSorted); // Free disk space: tempInput.Delete(); reader = new OfflineSorter.ByteSequencesReader(tempSorted); var outputs = new PairOutputs <Int64, BytesRef>(PositiveInt32Outputs.Singleton, ByteSequenceOutputs.Singleton); var builder = new Builder <PairOutputs <Int64, BytesRef> .Pair>(FST.INPUT_TYPE.BYTE1, outputs); // Build FST: BytesRef previousAnalyzed = null; BytesRef analyzed = new BytesRef(); BytesRef surface = new BytesRef(); Int32sRef scratchInts = new Int32sRef(); var input = new 
ByteArrayDataInput(); // Used to remove duplicate surface forms (but we // still index the hightest-weight one). We clear // this when we see a new analyzed form, so it cannot // grow unbounded (at most 256 entries): var seenSurfaceForms = new JCG.HashSet <BytesRef>(); var dedup = 0; while (reader.Read(scratch)) { input.Reset(scratch.Bytes, scratch.Offset, scratch.Length); ushort analyzedLength = (ushort)input.ReadInt16(); analyzed.Grow(analyzedLength + 2); input.ReadBytes(analyzed.Bytes, 0, analyzedLength); analyzed.Length = analyzedLength; long cost = input.ReadInt32(); surface.Bytes = scratch.Bytes; if (hasPayloads) { surface.Length = (ushort)input.ReadInt16(); surface.Offset = input.Position; } else { surface.Offset = input.Position; surface.Length = scratch.Length - surface.Offset; } if (previousAnalyzed is null) { previousAnalyzed = new BytesRef(); previousAnalyzed.CopyBytes(analyzed); seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface)); } else if (analyzed.Equals(previousAnalyzed)) { dedup++; if (dedup >= maxSurfaceFormsPerAnalyzedForm) { // More than maxSurfaceFormsPerAnalyzedForm // dups: skip the rest: continue; } if (seenSurfaceForms.Contains(surface)) { continue; } seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface)); } else { dedup = 0; previousAnalyzed.CopyBytes(analyzed); seenSurfaceForms.Clear(); seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface)); } // TODO: I think we can avoid the extra 2 bytes when // there is no dup (dedup==0), but we'd have to fix // the exactFirst logic ... which would be sort of // hairy because we'd need to special case the two // (dup/not dup)... // NOTE: must be byte 0 so we sort before whatever // is next analyzed.Bytes[analyzed.Offset + analyzed.Length] = 0; analyzed.Bytes[analyzed.Offset + analyzed.Length + 1] = (byte)dedup; analyzed.Length += 2; Util.Fst.Util.ToInt32sRef(analyzed, scratchInts); //System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString()); if (!hasPayloads) { builder.Add(scratchInts, outputs.NewPair(cost, BytesRef.DeepCopyOf(surface))); } else { int payloadOffset = input.Position + surface.Length; int payloadLength = scratch.Length - payloadOffset; BytesRef br = new BytesRef(surface.Length + 1 + payloadLength); Array.Copy(surface.Bytes, surface.Offset, br.Bytes, 0, surface.Length); br.Bytes[surface.Length] = PAYLOAD_SEP; Array.Copy(scratch.Bytes, payloadOffset, br.Bytes, surface.Length + 1, payloadLength); br.Length = br.Bytes.Length; builder.Add(scratchInts, outputs.NewPair(cost, br)); } } fst = builder.Finish(); //Util.dotToFile(fst, "/tmp/suggest.dot"); success = true; } finally { if (success) { IOUtils.Dispose(reader, writer); } else { IOUtils.DisposeWhileHandlingException(reader, writer); } tempInput.Delete(); tempSorted.Delete(); } }
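// The analyzing suggester writes each pair as: Int16 analyzed-length, analyzed bytes, Int32 cost,
// then the surface form (plus a surface-length Int16 and the payload when payloads are enabled).
// A minimal no-payload round-trip of that layout; the names are illustrative and a plain int
// stands in for EncodeWeight:
using Lucene.Net.Store;
using Lucene.Net.Util;

internal static class SuggesterRecordExample
{
    public static void RoundTrip(BytesRef analyzed, BytesRef surface, int cost)
    {
        var buffer = new byte[2 + analyzed.Length + 4 + surface.Length];
        var output = new ByteArrayDataOutput(buffer);
        output.WriteInt16((short)analyzed.Length);
        output.WriteBytes(analyzed.Bytes, analyzed.Offset, analyzed.Length);
        output.WriteInt32(cost);
        output.WriteBytes(surface.Bytes, surface.Offset, surface.Length);

        var input = new ByteArrayDataInput(buffer, 0, output.Position);
        int analyzedLength = (ushort)input.ReadInt16();
        var analyzedScratch = new byte[analyzedLength];
        input.ReadBytes(analyzedScratch, 0, analyzedLength);        // skip past the analyzed form
        int decodedCost = input.ReadInt32();
        var decodedSurface = new BytesRef(buffer, input.Position,   // everything after the cost
                                          output.Position - input.Position);
    }
}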
public override void VisitDocument(int docID, StoredFieldVisitor visitor) { fieldsStream.Seek(indexReader.GetStartPointer(docID)); int docBase = fieldsStream.ReadVInt32(); int chunkDocs = fieldsStream.ReadVInt32(); if (docID < docBase || docID >= docBase + chunkDocs || docBase + chunkDocs > numDocs) { throw new CorruptIndexException("Corrupted: docID=" + docID + ", docBase=" + docBase + ", chunkDocs=" + chunkDocs + ", numDocs=" + numDocs + " (resource=" + fieldsStream + ")"); } int numStoredFields, offset, length, totalLength; if (chunkDocs == 1) { numStoredFields = fieldsStream.ReadVInt32(); offset = 0; length = fieldsStream.ReadVInt32(); totalLength = length; } else { int bitsPerStoredFields = fieldsStream.ReadVInt32(); if (bitsPerStoredFields == 0) { numStoredFields = fieldsStream.ReadVInt32(); } else if (bitsPerStoredFields > 31) { throw new CorruptIndexException("bitsPerStoredFields=" + bitsPerStoredFields + " (resource=" + fieldsStream + ")"); } else { long filePointer = fieldsStream.GetFilePointer(); PackedInt32s.Reader reader = PackedInt32s.GetDirectReaderNoHeader(fieldsStream, PackedInt32s.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerStoredFields); numStoredFields = (int)(reader.Get(docID - docBase)); fieldsStream.Seek(filePointer + PackedInt32s.Format.PACKED.ByteCount(packedIntsVersion, chunkDocs, bitsPerStoredFields)); } int bitsPerLength = fieldsStream.ReadVInt32(); if (bitsPerLength == 0) { length = fieldsStream.ReadVInt32(); offset = (docID - docBase) * length; totalLength = chunkDocs * length; } else if (bitsPerLength > 31) { throw new CorruptIndexException("bitsPerLength=" + bitsPerLength + " (resource=" + fieldsStream + ")"); } else { PackedInt32s.IReaderIterator it = PackedInt32s.GetReaderIteratorNoHeader(fieldsStream, PackedInt32s.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerLength, 1); int off = 0; for (int i = 0; i < docID - docBase; ++i) { off += (int)it.Next(); } offset = off; length = (int)it.Next(); off += length; for (int i = docID - docBase + 1; i < chunkDocs; ++i) { off += (int)it.Next(); } totalLength = off; } } if ((length == 0) != (numStoredFields == 0)) { throw new CorruptIndexException("length=" + length + ", numStoredFields=" + numStoredFields + " (resource=" + fieldsStream + ")"); } if (numStoredFields == 0) { // nothing to do return; } DataInput documentInput; if (version >= CompressingStoredFieldsWriter.VERSION_BIG_CHUNKS && totalLength >= 2 * chunkSize) { if (Debugging.AssertsEnabled) { Debugging.Assert(chunkSize > 0); Debugging.Assert(offset < chunkSize); } decompressor.Decompress(fieldsStream, chunkSize, offset, Math.Min(length, chunkSize - offset), bytes); documentInput = new DataInputAnonymousClass(this, length); } else { BytesRef bytes = totalLength <= BUFFER_REUSE_THRESHOLD ? 
this.bytes : new BytesRef(); decompressor.Decompress(fieldsStream, totalLength, offset, length, bytes); if (Debugging.AssertsEnabled) { Debugging.Assert(bytes.Length == length); } documentInput = new ByteArrayDataInput(bytes.Bytes, bytes.Offset, bytes.Length); } for (int fieldIDX = 0; fieldIDX < numStoredFields; fieldIDX++) { long infoAndBits = documentInput.ReadVInt64(); int fieldNumber = (int)infoAndBits.TripleShift(CompressingStoredFieldsWriter.TYPE_BITS); FieldInfo fieldInfo = fieldInfos.FieldInfo(fieldNumber); int bits = (int)(infoAndBits & CompressingStoredFieldsWriter.TYPE_MASK); if (Debugging.AssertsEnabled) { Debugging.Assert(bits <= CompressingStoredFieldsWriter.NUMERIC_DOUBLE, "bits={0:x}", bits); } switch (visitor.NeedsField(fieldInfo)) { case StoredFieldVisitor.Status.YES: ReadField(documentInput, visitor, fieldInfo, bits); break; case StoredFieldVisitor.Status.NO: SkipField(documentInput, bits); break; case StoredFieldVisitor.Status.STOP: return; } } }
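// Each stored field above is prefixed by a single VInt64 that packs the field number into the
// high bits and the value-type code into the low TYPE_BITS bits. A small sketch of that packing
// and unpacking; the TYPE_BITS/TYPE_MASK values come from CompressingStoredFieldsWriter, and the
// helper names here are illustrative:
internal static class FieldHeaderBits
{
    public static long Pack(int fieldNumber, int bits, int typeBits)
    {
        return ((long)fieldNumber << typeBits) | (uint)bits;
    }

    public static void Unpack(long infoAndBits, int typeBits, long typeMask, out int fieldNumber, out int bits)
    {
        fieldNumber = (int)((ulong)infoAndBits >> typeBits);   // high bits: the field number
        bits = (int)(infoAndBits & typeMask);                  // low bits: the value-type code
    }
}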
public SortedSetDocValuesAnonymousInnerClassHelper(FSTEntry entry, BinaryDocValues docToOrds, FST<long?> fst, FST.BytesReader @in, FST.Arc<long?> firstArc, FST.Arc<long?> scratchArc, Int32sRef scratchInts, BytesRefFSTEnum<long?> fstEnum, BytesRef @ref, ByteArrayDataInput input) { this.entry = entry; this.docToOrds = docToOrds; this.fst = fst; this.@in = @in; this.firstArc = firstArc; this.scratchArc = scratchArc; this.scratchInts = scratchInts; this.fstEnum = fstEnum; this.@ref = @ref; this.input = input; }
/// <summary> /// Constructs a new Stemmer which will use the provided Dictionary to create its stems. /// </summary> /// <param name="dictionary"> Dictionary that will be used to create the stems </param> public Stemmer(Dictionary dictionary) { this.dictionary = dictionary; this.affixReader = new ByteArrayDataInput(dictionary.affixData); }
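// The Stemmer keeps a single ByteArrayDataInput over the dictionary's shared affix byte[] and
// repositions it to decode one fixed-width affix record at a time. A minimal sketch of that
// repositioning pattern using Reset(bytes, offset, length); the record layout shown here (one
// Int16 per record) is purely illustrative, not the real Hunspell affix encoding:
using Lucene.Net.Store;

internal sealed class RecordReader
{
    private readonly byte[] data;
    private readonly ByteArrayDataInput reader = new ByteArrayDataInput();
    private const int RecordSize = 2;   // hypothetical fixed record width

    public RecordReader(byte[] data) { this.data = data; }

    public short ReadRecord(int ordinal)
    {
        reader.Reset(data, ordinal * RecordSize, RecordSize);   // jump to the ordinal'th record
        return reader.ReadInt16();
    }
}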
public virtual void TestVariableBinary([ValueSource(typeof(ConcurrentMergeSchedulerFactories), "Values")] Func <IConcurrentMergeScheduler> newScheduler) { BaseDirectoryWrapper dir = NewFSDirectory(CreateTempDir("2BVariableBinary")); if (dir is MockDirectoryWrapper) { ((MockDirectoryWrapper)dir).Throttling = MockDirectoryWrapper.Throttling_e.NEVER; } var config = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())) .SetMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH) .SetRAMBufferSizeMB(256.0) .SetMergeScheduler(newScheduler()) .SetMergePolicy(NewLogMergePolicy(false, 10)) .SetOpenMode(OpenMode.CREATE); IndexWriter w = new IndexWriter(dir, config); Document doc = new Document(); var bytes = new byte[4]; ByteArrayDataOutput encoder = new ByteArrayDataOutput(bytes); BytesRef data = new BytesRef(bytes); BinaryDocValuesField dvField = new BinaryDocValuesField("dv", data); doc.Add(dvField); for (int i = 0; i < int.MaxValue; i++) { encoder.Reset(bytes); encoder.WriteVInt32(i % 65535); // 1, 2, or 3 bytes data.Length = encoder.Position; w.AddDocument(doc); if (i % 100000 == 0) { Console.WriteLine("indexed: " + i); Console.Out.Flush(); } } w.ForceMerge(1); w.Dispose(); Console.WriteLine("verifying..."); Console.Out.Flush(); DirectoryReader r = DirectoryReader.Open(dir); int expectedValue = 0; ByteArrayDataInput input = new ByteArrayDataInput(); foreach (AtomicReaderContext context in r.Leaves) { AtomicReader reader = context.AtomicReader; BytesRef scratch = new BytesRef(bytes); BinaryDocValues dv = reader.GetBinaryDocValues("dv"); for (int i = 0; i < reader.MaxDoc; i++) { dv.Get(i, scratch); input.Reset(scratch.Bytes, scratch.Offset, scratch.Length); Assert.AreEqual(expectedValue % 65535, input.ReadVInt32()); Assert.IsTrue(input.Eof); expectedValue++; } } r.Dispose(); dir.Dispose(); }
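// The verification loop above is just a VInt round-trip per document: Reset the reusable
// ByteArrayDataInput over the doc's BytesRef, ReadVInt32, and confirm Eof. A standalone sketch of
// that round-trip, assuming the 4.8 naming (Eof is a property here, a method in the older test
// variant below):
using Lucene.Net.Store;

internal static class VIntRoundTrip
{
    public static bool Check(int value)
    {
        var bytes = new byte[5];                      // a VInt32 needs at most 5 bytes
        var encoder = new ByteArrayDataOutput(bytes);
        encoder.WriteVInt32(value);

        var decoder = new ByteArrayDataInput();
        decoder.Reset(bytes, 0, encoder.Position);
        return decoder.ReadVInt32() == value && decoder.Eof;
    }
}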
// Does initial decode of next block of terms; this // doesn't actually decode the docFreq, totalTermFreq, // postings details (frq/prx offset, etc.) metadata; // it just loads them as byte[] blobs which are then // decoded on-demand if the metadata is ever requested // for any term in this block. This enables terms-only // intensive consumes (eg certain MTQs, respelling) to // not pay the price of decoding metadata they won't // use. private bool NextBlock() { // TODO: we still lazy-decode the byte[] for each // term (the suffix), but, if we decoded // all N terms up front then seeking could do a fast // bsearch w/in the block... _state.BlockFilePointer = _input.FilePointer; _blockTermCount = _input.ReadVInt(); if (_blockTermCount == 0) { return(false); } _termBlockPrefix = _input.ReadVInt(); // term suffixes: int len = _input.ReadVInt(); if (_termSuffixes.Length < len) { _termSuffixes = new byte[ArrayUtil.Oversize(len, 1)]; } //System.out.println(" termSuffixes len=" + len); _input.ReadBytes(_termSuffixes, 0, len); _termSuffixesReader.Reset(_termSuffixes, 0, len); // docFreq, totalTermFreq len = _input.ReadVInt(); if (_docFreqBytes.Length < len) { _docFreqBytes = new byte[ArrayUtil.Oversize(len, 1)]; } _input.ReadBytes(_docFreqBytes, 0, len); _freqReader.Reset(_docFreqBytes, 0, len); // metadata len = _input.ReadVInt(); if (_bytes == null) { _bytes = new byte[ArrayUtil.Oversize(len, 1)]; _bytesReader = new ByteArrayDataInput(); } else if (_bytes.Length < len) { _bytes = new byte[ArrayUtil.Oversize(len, 1)]; } _input.ReadBytes(_bytes, 0, len); _bytesReader.Reset(_bytes, 0, len); _metaDataUpto = 0; _state.TermBlockOrd = 0; _blocksSinceSeek++; _indexIsCurrent = _indexIsCurrent && (_blocksSinceSeek < _blockTermsReader._indexReader.Divisor); return(true); }
public virtual void TestVariableBinary([ValueSource(typeof(ConcurrentMergeSchedulers), "Values")]IConcurrentMergeScheduler scheduler) { BaseDirectoryWrapper dir = NewFSDirectory(CreateTempDir("2BVariableBinary")); if (dir is MockDirectoryWrapper) { ((MockDirectoryWrapper)dir).Throttling = MockDirectoryWrapper.Throttling_e.NEVER; } var config = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())) .SetMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH) .SetRAMBufferSizeMB(256.0) .SetMergeScheduler(scheduler) .SetMergePolicy(NewLogMergePolicy(false, 10)) .SetOpenMode(IndexWriterConfig.OpenMode_e.CREATE); IndexWriter w = new IndexWriter(dir, config); Document doc = new Document(); var bytes = new byte[4]; ByteArrayDataOutput encoder = new ByteArrayDataOutput(bytes); BytesRef data = new BytesRef(bytes); BinaryDocValuesField dvField = new BinaryDocValuesField("dv", data); doc.Add(dvField); for (int i = 0; i < int.MaxValue; i++) { encoder.Reset(bytes); encoder.WriteVInt(i % 65535); // 1, 2, or 3 bytes data.Length = encoder.Position; w.AddDocument(doc); if (i % 100000 == 0) { Console.WriteLine("indexed: " + i); Console.Out.Flush(); } } w.ForceMerge(1); w.Dispose(); Console.WriteLine("verifying..."); Console.Out.Flush(); DirectoryReader r = DirectoryReader.Open(dir); int expectedValue = 0; ByteArrayDataInput input = new ByteArrayDataInput(); foreach (AtomicReaderContext context in r.Leaves) { AtomicReader reader = context.AtomicReader; BytesRef scratch = new BytesRef(bytes); BinaryDocValues dv = reader.GetBinaryDocValues("dv"); for (int i = 0; i < reader.MaxDoc; i++) { dv.Get(i, scratch); input.Reset((byte[])(Array)scratch.Bytes, scratch.Offset, scratch.Length); Assert.AreEqual(expectedValue % 65535, input.ReadVInt()); Assert.IsTrue(input.Eof()); expectedValue++; } } r.Dispose(); dir.Dispose(); }
public void SetFloorData(ByteArrayDataInput @in, BytesRef source) { int numBytes = source.Length - (@in.Position - source.Offset); if (numBytes > FloorData.Length) { FloorData = new byte[ArrayUtil.Oversize(numBytes, 1)]; } System.Buffer.BlockCopy(source.Bytes, source.Offset + @in.Position, FloorData, 0, numBytes); FloorDataReader.Reset(FloorData, 0, numBytes); NumFollowFloorBlocks = FloorDataReader.ReadVInt(); NextFloorLabel = FloorDataReader.ReadByte() & 0xff; //if (DEBUG) { //System.out.println(" setFloorData fpOrig=" + fpOrig + " bytes=" + new BytesRef(source.bytes, source.offset + in.getPosition(), numBytes) + " numFollowFloorBlocks=" + numFollowFloorBlocks + " nextFloorLabel=" + toHex(nextFloorLabel)); //} }
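// SetFloorData copies the bytes the reader has not consumed yet into the frame's own scratch
// array and then points FloorDataReader at that copy. A simplified sketch of the same idea, with
// the reader positioned at offset 0 so Position is directly the number of bytes consumed
// (illustrative names):
using Lucene.Net.Store;

internal static class UnreadTail
{
    // 'reader' was Reset(bytes, 0, length); returns a private copy of what is left unread.
    public static byte[] CopyUnread(ByteArrayDataInput reader, byte[] bytes, int length)
    {
        int numBytes = length - reader.Position;
        var copy = new byte[numBytes];
        System.Buffer.BlockCopy(bytes, reader.Position, copy, 0, numBytes);
        return copy;
    }
}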
// Does initial decode of next block of terms; this // doesn't actually decode the docFreq, totalTermFreq, // postings details (frq/prx offset, etc.) metadata; // it just loads them as byte[] blobs which are then // decoded on-demand if the metadata is ever requested // for any term in this block. This enables terms-only // intensive consumes (eg certain MTQs, respelling) to // not pay the price of decoding metadata they won't // use. private bool NextBlock() { // TODO: we still lazy-decode the byte[] for each // term (the suffix), but, if we decoded // all N terms up front then seeking could do a fast // bsearch w/in the block... _state.BlockFilePointer = _input.FilePointer; _blockTermCount = _input.ReadVInt(); if (_blockTermCount == 0) return false; _termBlockPrefix = _input.ReadVInt(); // term suffixes: int len = _input.ReadVInt(); if (_termSuffixes.Length < len) { _termSuffixes = new byte[ArrayUtil.Oversize(len, 1)]; } //System.out.println(" termSuffixes len=" + len); _input.ReadBytes(_termSuffixes, 0, len); _termSuffixesReader.Reset(_termSuffixes, 0, len); // docFreq, totalTermFreq len = _input.ReadVInt(); if (_docFreqBytes.Length < len) _docFreqBytes = new byte[ArrayUtil.Oversize(len, 1)]; _input.ReadBytes(_docFreqBytes, 0, len); _freqReader.Reset(_docFreqBytes, 0, len); // metadata len = _input.ReadVInt(); if (_bytes == null) { _bytes = new byte[ArrayUtil.Oversize(len, 1)]; _bytesReader = new ByteArrayDataInput(); } else if (_bytes.Length < len) { _bytes = new byte[ArrayUtil.Oversize(len, 1)]; } _input.ReadBytes(_bytes, 0, len); _bytesReader.Reset(_bytes, 0, len); _metaDataUpto = 0; _state.TermBlockOrd = 0; _blocksSinceSeek++; _indexIsCurrent = _indexIsCurrent && (_blocksSinceSeek < _blockTermsReader._indexReader.Divisor); return true; }
// Does initial decode of next block of terms; this // doesn't actually decode the docFreq, totalTermFreq, // postings details (frq/prx offset, etc.) metadata; // it just loads them as byte[] blobs which are then // decoded on-demand if the metadata is ever requested // for any term in this block. This enables terms-only // intensive consumes (eg certain MTQs, respelling) to // not pay the price of decoding metadata they won't // use. private bool NextBlock() { // TODO: we still lazy-decode the byte[] for each // term (the suffix), but, if we decoded // all N terms up front then seeking could do a fast // bsearch w/in the block... //System.out.println("BTR.nextBlock() fp=" + in.getFilePointer() + " this=" + this); state.BlockFilePointer = input.GetFilePointer(); blockTermCount = input.ReadVInt32(); //System.out.println(" blockTermCount=" + blockTermCount); if (blockTermCount == 0) { return(false); } termBlockPrefix = input.ReadVInt32(); // term suffixes: int len = input.ReadVInt32(); if (termSuffixes.Length < len) { termSuffixes = new byte[ArrayUtil.Oversize(len, 1)]; } //System.out.println(" termSuffixes len=" + len); input.ReadBytes(termSuffixes, 0, len); termSuffixesReader.Reset(termSuffixes, 0, len); // docFreq, totalTermFreq len = input.ReadVInt32(); if (docFreqBytes.Length < len) { docFreqBytes = new byte[ArrayUtil.Oversize(len, 1)]; } //System.out.println(" freq bytes len=" + len); input.ReadBytes(docFreqBytes, 0, len); freqReader.Reset(docFreqBytes, 0, len); // metadata len = input.ReadVInt32(); if (bytes == null) { bytes = new byte[ArrayUtil.Oversize(len, 1)]; bytesReader = new ByteArrayDataInput(); } else if (bytes.Length < len) { bytes = new byte[ArrayUtil.Oversize(len, 1)]; } input.ReadBytes(bytes, 0, len); bytesReader.Reset(bytes, 0, len); metaDataUpto = 0; state.TermBlockOrd = 0; blocksSinceSeek++; indexIsCurrent = indexIsCurrent && (blocksSinceSeek < outerInstance.outerInstance.indexReader.Divisor); //System.out.println(" indexIsCurrent=" + indexIsCurrent); return(true); }
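// All three NextBlock variants above grow their scratch arrays with ArrayUtil.Oversize and never
// shrink them, so steady-state block iteration allocates nothing. A minimal sketch of that
// grow-only buffer policy (illustrative helper, assuming Lucene.Net.Util.ArrayUtil):
using Lucene.Net.Util;

internal static class GrowOnly
{
    // Ensures 'buffer' can hold 'needed' bytes, reallocating with headroom only when required.
    public static byte[] Ensure(byte[] buffer, int needed)
    {
        if (buffer == null || buffer.Length < needed)
        {
            return new byte[ArrayUtil.Oversize(needed, 1)];
        }
        return buffer;
    }
}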