// Does initial decode of next block of terms; this
// doesn't actually decode the docFreq, totalTermFreq,
// postings details (frq/prx offset, etc.) metadata;
// it just loads them as byte[] blobs which are then
// decoded on-demand if the metadata is ever requested
// for any term in this block. This enables terms-only
// intensive consumes (eg certain MTQs, respelling) to
// not pay the price of decoding metadata they won't
// use.
private bool NextBlock()
{
    // TODO: we still lazy-decode the byte[] for each
    // term (the suffix), but, if we decoded
    // all N terms up front then seeking could do a fast
    // bsearch w/in the block...

    //System.out.println("BTR.nextBlock() fp=" + in.getFilePointer() + " this=" + this);
    state.BlockFilePointer = input.GetFilePointer();
    blockTermCount = input.ReadVInt32();
    //System.out.println("  blockTermCount=" + blockTermCount);
    if (blockTermCount == 0)
    {
        return false;
    }
    termBlockPrefix = input.ReadVInt32();

    // term suffixes:
    int len = input.ReadVInt32();
    if (termSuffixes.Length < len)
    {
        termSuffixes = new byte[ArrayUtil.Oversize(len, 1)];
    }
    //System.out.println("  termSuffixes len=" + len);
    input.ReadBytes(termSuffixes, 0, len);
    termSuffixesReader.Reset(termSuffixes, 0, len);

    // docFreq, totalTermFreq
    len = input.ReadVInt32();
    if (docFreqBytes.Length < len)
    {
        docFreqBytes = new byte[ArrayUtil.Oversize(len, 1)];
    }
    //System.out.println("  freq bytes len=" + len);
    input.ReadBytes(docFreqBytes, 0, len);
    freqReader.Reset(docFreqBytes, 0, len);

    // metadata
    len = input.ReadVInt32();
    if (bytes == null)
    {
        bytes = new byte[ArrayUtil.Oversize(len, 1)];
        bytesReader = new ByteArrayDataInput();
    }
    else if (bytes.Length < len)
    {
        bytes = new byte[ArrayUtil.Oversize(len, 1)];
    }
    input.ReadBytes(bytes, 0, len);
    bytesReader.Reset(bytes, 0, len);

    metaDataUpto = 0;
    state.TermBlockOrd = 0;
    blocksSinceSeek++;
    indexIsCurrent = indexIsCurrent && (blocksSinceSeek < outerInstance.outerInstance.indexReader.Divisor);
    //System.out.println("  indexIsCurrent=" + indexIsCurrent);

    return true;
}
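// Illustrative sketch (not part of Lucene.NET) of the load-now/decode-later pattern
// the method above relies on: copy the metadata into a reusable byte[] at block-load
// time, and only run a ByteArrayDataInput over it when a caller actually asks for a
// value. The class and member names here are hypothetical; the Lucene.Net.Store and
// Lucene.Net.Util calls are the same ones used above.
using Lucene.Net.Store;
using Lucene.Net.Util;

internal sealed class LazyBlockMetadata
{
    private byte[] blob = new byte[0];
    private int blobLength;
    private readonly ByteArrayDataInput reader = new ByteArrayDataInput();
    private int docFreq = -1; // -1 means "not decoded yet"

    // Called once per block: grab the raw bytes, no decoding.
    public void Load(DataInput input)
    {
        int len = input.ReadVInt32();
        if (blob.Length < len)
        {
            blob = new byte[ArrayUtil.Oversize(len, 1)];
        }
        input.ReadBytes(blob, 0, len);
        blobLength = len;
        docFreq = -1;
    }

    // Pay for decoding only when the metadata is actually requested.
    public int DocFreq
    {
        get
        {
            if (docFreq == -1)
            {
                reader.Reset(blob, 0, blobLength);
                docFreq = reader.ReadVInt32();
            }
            return docFreq;
        }
    }
}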
public override void SetDocument(int docID)
{
    docToOrds.Get(docID, @ref);
    input.Reset(@ref.Bytes, @ref.Offset, @ref.Length);
    currentOrd = 0;
}
public virtual void TestVariableBinary()
{
    BaseDirectoryWrapper dir = NewFSDirectory(CreateTempDir("2BVariableBinary"));
    if (dir is MockDirectoryWrapper)
    {
        ((MockDirectoryWrapper)dir).Throttling = Throttling.NEVER;
    }

    var config = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random))
        .SetMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
        .SetRAMBufferSizeMB(256.0)
        .SetMergeScheduler(new ConcurrentMergeScheduler())
        .SetMergePolicy(NewLogMergePolicy(false, 10))
        .SetOpenMode(OpenMode.CREATE);
    IndexWriter w = new IndexWriter(dir, config);

    Document doc = new Document();
    var bytes = new byte[4];
    ByteArrayDataOutput encoder = new ByteArrayDataOutput(bytes);
    BytesRef data = new BytesRef(bytes);
    BinaryDocValuesField dvField = new BinaryDocValuesField("dv", data);
    doc.Add(dvField);

    for (int i = 0; i < int.MaxValue; i++)
    {
        encoder.Reset(bytes);
        encoder.WriteVInt32(i % 65535); // 1, 2, or 3 bytes
        data.Length = encoder.Position;
        w.AddDocument(doc);
        if (i % 100000 == 0)
        {
            Console.WriteLine("indexed: " + i);
            Console.Out.Flush();
        }
    }

    w.ForceMerge(1);
    w.Dispose();

    Console.WriteLine("verifying...");
    Console.Out.Flush();

    DirectoryReader r = DirectoryReader.Open(dir);
    int expectedValue = 0;
    ByteArrayDataInput input = new ByteArrayDataInput();
    foreach (AtomicReaderContext context in r.Leaves)
    {
        AtomicReader reader = context.AtomicReader;
        BytesRef scratch = new BytesRef(bytes);
        BinaryDocValues dv = reader.GetBinaryDocValues("dv");
        for (int i = 0; i < reader.MaxDoc; i++)
        {
            dv.Get(i, scratch);
            input.Reset(scratch.Bytes, scratch.Offset, scratch.Length);
            Assert.AreEqual(expectedValue % 65535, input.ReadVInt32());
            Assert.IsTrue(input.Eof);
            expectedValue++;
        }
    }

    r.Dispose();
    dir.Dispose();
}
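// Minimal standalone sketch (hypothetical program, outside the test harness) of the
// VInt32 round-trip the test above depends on: any value below 65535 encodes to 1-3
// bytes, so the same byte[4] buffer can be reset and reused for every document.
using System;
using Lucene.Net.Store;

public static class VIntRoundTrip
{
    public static void Main()
    {
        var bytes = new byte[4];
        var encoder = new ByteArrayDataOutput(bytes);
        var decoder = new ByteArrayDataInput();

        foreach (int value in new[] { 1, 127, 128, 16383, 65534 })
        {
            encoder.Reset(bytes);
            encoder.WriteVInt32(value); // 1, 2, or 3 bytes for values < 65535
            decoder.Reset(bytes, 0, encoder.Position);
            Console.WriteLine(value + " -> " + encoder.Position + " byte(s), decoded " + decoder.ReadVInt32());
        }
    }
}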
public override void Build(IInputIterator iterator)
{
    if (iterator.HasPayloads)
    {
        throw new ArgumentException("this suggester doesn't support payloads");
    }
    if (iterator.HasContexts)
    {
        throw new ArgumentException("this suggester doesn't support contexts");
    }

    FileInfo tempInput = FileSupport.CreateTempFile(typeof(FSTCompletionLookup).Name, ".input", OfflineSorter.DefaultTempDir());
    FileInfo tempSorted = FileSupport.CreateTempFile(typeof(FSTCompletionLookup).Name, ".sorted", OfflineSorter.DefaultTempDir());

    OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
    OfflineSorter.ByteSequencesReader reader = null;
    ExternalRefSorter sorter = null;

    // Push floats up front before sequences to sort them. For now, assume they are non-negative.
    // If negative floats are allowed some trickery needs to be done to find their byte order.
    bool success = false;
    count = 0;
    try
    {
        byte[] buffer = Arrays.Empty<byte>();
        ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
        BytesRef spare;
        while ((spare = iterator.Next()) != null)
        {
            if (spare.Length + 4 >= buffer.Length)
            {
                buffer = ArrayUtil.Grow(buffer, spare.Length + 4);
            }
            output.Reset(buffer);
            output.WriteInt32(EncodeWeight(iterator.Weight));
            output.WriteBytes(spare.Bytes, spare.Offset, spare.Length);
            writer.Write(buffer, 0, output.Position);
        }
        writer.Dispose();

        // We don't know the distribution of scores and we need to bucket them, so we'll sort
        // and divide into equal buckets.
        OfflineSorter.SortInfo info = (new OfflineSorter()).Sort(tempInput, tempSorted);
        tempInput.Delete();

        FSTCompletionBuilder builder = new FSTCompletionBuilder(buckets, sorter = new ExternalRefSorter(new OfflineSorter()), sharedTailLength);

        int inputLines = info.Lines;
        reader = new OfflineSorter.ByteSequencesReader(tempSorted);
        long line = 0;
        int previousBucket = 0;
        int previousScore = 0;
        ByteArrayDataInput input = new ByteArrayDataInput();
        BytesRef tmp1 = new BytesRef();
        BytesRef tmp2 = new BytesRef();
        while (reader.Read(tmp1))
        {
            input.Reset(tmp1.Bytes);
            int currentScore = input.ReadInt32();

            int bucket;
            if (line > 0 && currentScore == previousScore)
            {
                bucket = previousBucket;
            }
            else
            {
                bucket = (int)(line * buckets / inputLines);
            }
            previousScore = currentScore;
            previousBucket = bucket;

            // Only append the input, discard the weight.
            tmp2.Bytes = tmp1.Bytes;
            tmp2.Offset = input.Position;
            tmp2.Length = tmp1.Length - input.Position;
            builder.Add(tmp2, bucket);

            line++;
            count++;
        }

        // The two FSTCompletions share the same automaton.
        this.higherWeightsCompletion = builder.Build();
        this.normalCompletion = new FSTCompletion(higherWeightsCompletion.FST, false, exactMatchFirst);

        success = true;
    }
    finally
    {
        if (success)
        {
            IOUtils.Dispose(reader, writer, sorter);
        }
        else
        {
            IOUtils.DisposeWhileHandlingException(reader, writer, sorter);
        }
        tempInput.Delete();
        tempSorted.Delete();
    }
}
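// Hedged sketch of the "weight first, then term" record layout written above, as a
// standalone helper (the class and method names are hypothetical). DataOutput.WriteInt32
// emits the most significant byte first, so an unsigned byte-by-byte sort of these
// records (what OfflineSorter performs) orders them by weight before term, which is
// what makes the bucketing pass possible. As the comment above notes, this assumes
// non-negative weights.
using Lucene.Net.Store;
using Lucene.Net.Util;

public static class WeightedRecord
{
    // Writes weight + term into 'buffer' (caller must have grown it to at least
    // term.Length + 4 bytes) and returns the number of bytes used.
    public static int Encode(ByteArrayDataOutput output, byte[] buffer, int weight, BytesRef term)
    {
        output.Reset(buffer);
        output.WriteInt32(weight);                               // sort key, most significant byte first
        output.WriteBytes(term.Bytes, term.Offset, term.Length); // payload: the term itself
        return output.Position;
    }
}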
// Interleaves all output tokens onto the futureOutputs:
private void AddOutput(BytesRef bytes, int matchInputLength, int matchEndOffset)
{
    bytesReader.Reset(bytes.Bytes, bytes.Offset, bytes.Length);

    int code = bytesReader.ReadVInt32();
    bool keepOrig = (code & 0x1) == 0;
    int count = code.TripleShift(1);
    //System.out.println("  addOutput count=" + count + " keepOrig=" + keepOrig);
    for (int outputIDX = 0; outputIDX < count; outputIDX++)
    {
        synonyms.Words.Get(bytesReader.ReadVInt32(), scratchBytes);
        //System.out.println("    outIDX=" + outputIDX + " bytes=" + scratchBytes.length);
        UnicodeUtil.UTF8toUTF16(scratchBytes, scratchChars);
        int lastStart = scratchChars.Offset;
        int chEnd = lastStart + scratchChars.Length;
        int outputUpto = nextRead;
        for (int chIDX = lastStart; chIDX <= chEnd; chIDX++)
        {
            if (chIDX == chEnd || scratchChars.Chars[chIDX] == SynonymMap.WORD_SEPARATOR)
            {
                int outputLen = chIDX - lastStart;
                // Caller is not allowed to have empty string in
                // the output:
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(outputLen > 0, "output contains empty string: {0}", scratchChars);
                }
                int endOffset;
                int posLen;
                if (chIDX == chEnd && lastStart == scratchChars.Offset)
                {
                    // This rule had a single output token, so, we set
                    // this output's endOffset to the current
                    // endOffset (ie, endOffset of the last input
                    // token it matched):
                    endOffset = matchEndOffset;
                    posLen = keepOrig ? matchInputLength : 1;
                }
                else
                {
                    // This rule has more than one output token; we
                    // can't pick any particular endOffset for this
                    // case, so, we inherit the endOffset for the
                    // input token which this output overlaps:
                    endOffset = -1;
                    posLen = 1;
                }
                futureOutputs[outputUpto].Add(scratchChars.Chars, lastStart, outputLen, endOffset, posLen);
                //System.out.println("      " + new String(scratchChars.chars, lastStart, outputLen) + " outputUpto=" + outputUpto);
                lastStart = 1 + chIDX;
                //System.out.println("  slot=" + outputUpto + " keepOrig=" + keepOrig);
                outputUpto = RollIncr(outputUpto);
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(futureOutputs[outputUpto].posIncr == 1, "outputUpto={0} vs nextWrite={1}", outputUpto, nextWrite);
                }
            }
        }
    }

    int upto = nextRead;
    for (int idx = 0; idx < matchInputLength; idx++)
    {
        futureInputs[upto].keepOrig |= keepOrig;
        futureInputs[upto].matched = true;
        upto = RollIncr(upto);
    }
}
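// Illustrative sketch of the word-separator scan performed above: a multi-token synonym
// output is stored as a single char sequence with SynonymMap.WORD_SEPARATOR between the
// words, and the inner loop recovers each word by splitting on that separator. This is a
// standalone helper with a hypothetical name, not part of the filter itself.
using System.Collections.Generic;
using Lucene.Net.Analysis.Synonym;
using Lucene.Net.Util;

public static class SynonymWords
{
    public static IList<string> Split(CharsRef chars)
    {
        var words = new List<string>();
        int lastStart = chars.Offset;
        int end = chars.Offset + chars.Length;
        for (int i = lastStart; i <= end; i++)
        {
            // A word ends at the separator char or at the end of the buffer.
            if (i == end || chars.Chars[i] == SynonymMap.WORD_SEPARATOR)
            {
                words.Add(new string(chars.Chars, lastStart, i - lastStart));
                lastStart = i + 1;
            }
        }
        return words;
    }
}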
public virtual void TestVariableBinary([ValueSource(typeof(ConcurrentMergeSchedulers), "Values")] IConcurrentMergeScheduler scheduler)
{
    BaseDirectoryWrapper dir = NewFSDirectory(CreateTempDir("2BVariableBinary"));
    if (dir is MockDirectoryWrapper)
    {
        ((MockDirectoryWrapper)dir).Throttling = MockDirectoryWrapper.Throttling_e.NEVER;
    }

    var config = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random()))
        .SetMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
        .SetRAMBufferSizeMB(256.0)
        .SetMergeScheduler(scheduler)
        .SetMergePolicy(NewLogMergePolicy(false, 10))
        .SetOpenMode(IndexWriterConfig.OpenMode_e.CREATE);
    IndexWriter w = new IndexWriter(dir, config);

    Document doc = new Document();
    var bytes = new byte[4];
    ByteArrayDataOutput encoder = new ByteArrayDataOutput(bytes);
    BytesRef data = new BytesRef(bytes);
    BinaryDocValuesField dvField = new BinaryDocValuesField("dv", data);
    doc.Add(dvField);

    for (int i = 0; i < int.MaxValue; i++)
    {
        encoder.Reset(bytes);
        encoder.WriteVInt(i % 65535); // 1, 2, or 3 bytes
        data.Length = encoder.Position;
        w.AddDocument(doc);
        if (i % 100000 == 0)
        {
            Console.WriteLine("indexed: " + i);
            Console.Out.Flush();
        }
    }

    w.ForceMerge(1);
    w.Dispose();

    Console.WriteLine("verifying...");
    Console.Out.Flush();

    DirectoryReader r = DirectoryReader.Open(dir);
    int expectedValue = 0;
    ByteArrayDataInput input = new ByteArrayDataInput();
    foreach (AtomicReaderContext context in r.Leaves)
    {
        AtomicReader reader = context.AtomicReader;
        BytesRef scratch = new BytesRef(bytes);
        BinaryDocValues dv = reader.GetBinaryDocValues("dv");
        for (int i = 0; i < reader.MaxDoc; i++)
        {
            dv.Get(i, scratch);
            input.Reset((byte[])(Array)scratch.Bytes, scratch.Offset, scratch.Length);
            Assert.AreEqual(expectedValue % 65535, input.ReadVInt());
            Assert.IsTrue(input.Eof());
            expectedValue++;
        }
    }

    r.Dispose();
    dir.Dispose();
}
public override void Build(InputIterator iterator)
{
    if (iterator.HasContexts)
    {
        throw new System.ArgumentException("this suggester doesn't support contexts");
    }
    string prefix = this.GetType().Name;
    var directory = OfflineSorter.DefaultTempDir();
    var tempInput = File.CreateTempFile(prefix, ".input", directory);
    var tempSorted = File.CreateTempFile(prefix, ".sorted", directory);

    hasPayloads = iterator.HasPayloads;

    var writer = new OfflineSorter.ByteSequencesWriter(tempInput);
    OfflineSorter.ByteSequencesReader reader = null;
    var scratch = new BytesRef();

    TokenStreamToAutomaton ts2a = TokenStreamToAutomaton;

    bool success = false;
    count = 0;
    sbyte[] buffer = new sbyte[8];
    try
    {
        var output = new ByteArrayDataOutput(buffer);
        BytesRef surfaceForm;

        while ((surfaceForm = iterator.Next()) != null)
        {
            HashSet<IntsRef> paths = ToFiniteStrings(surfaceForm, ts2a);

            maxAnalyzedPathsForOneInput = Math.Max(maxAnalyzedPathsForOneInput, paths.Count);

            foreach (IntsRef path in paths)
            {
                Util.Fst.Util.ToBytesRef(path, scratch);

                // length of the analyzed text (FST input)
                if (scratch.Length > short.MaxValue - 2)
                {
                    throw new System.ArgumentException("cannot handle analyzed forms > " + (short.MaxValue - 2) + " in length (got " + scratch.Length + ")");
                }
                short analyzedLength = (short)scratch.Length;

                // compute the required length:
                // analyzed sequence + weight (4) + surface + analyzedLength (short)
                int requiredLength = analyzedLength + 4 + surfaceForm.Length + 2;

                BytesRef payload;
                if (hasPayloads)
                {
                    if (surfaceForm.Length > (short.MaxValue - 2))
                    {
                        throw new ArgumentException("cannot handle surface form > " + (short.MaxValue - 2) + " in length (got " + surfaceForm.Length + ")");
                    }
                    payload = iterator.Payload;
                    // payload + surfaceLength (short)
                    requiredLength += payload.Length + 2;
                }
                else
                {
                    payload = null;
                }

                buffer = ArrayUtil.Grow(buffer, requiredLength);

                output.Reset(buffer);

                output.WriteShort(analyzedLength);
                output.WriteBytes(scratch.Bytes, scratch.Offset, scratch.Length);
                output.WriteInt(EncodeWeight(iterator.Weight));

                if (hasPayloads)
                {
                    for (int i = 0; i < surfaceForm.Length; i++)
                    {
                        if (surfaceForm.Bytes[i] == PAYLOAD_SEP)
                        {
                            throw new ArgumentException("surface form cannot contain unit separator character U+001F; this character is reserved");
                        }
                    }
                    output.WriteShort((short)surfaceForm.Length);
                    output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
                    output.WriteBytes(payload.Bytes, payload.Offset, payload.Length);
                }
                else
                {
                    output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
                }

                Debug.Assert(output.Position == requiredLength, output.Position + " vs " + requiredLength);

                writer.Write(buffer, 0, output.Position);
            }
            count++;
        }
        writer.Dispose();

        // Sort all input/output pairs (required by FST.Builder):
        (new OfflineSorter(new AnalyzingComparator(hasPayloads))).Sort(tempInput, tempSorted);

        // Free disk space:
        tempInput.Delete();

        reader = new OfflineSorter.ByteSequencesReader(tempSorted);

        var outputs = new PairOutputs<long?, BytesRef>(PositiveIntOutputs.Singleton, ByteSequenceOutputs.Singleton);
        var builder = new Builder<PairOutputs<long?, BytesRef>.Pair>(FST.INPUT_TYPE.BYTE1, outputs);

        // Build FST:
        BytesRef previousAnalyzed = null;
        BytesRef analyzed = new BytesRef();
        BytesRef surface = new BytesRef();
        IntsRef scratchInts = new IntsRef();
        var input = new ByteArrayDataInput();

        // Used to remove duplicate surface forms (but we
        // still index the highest-weight one).
        // We clear this when we see a new analyzed form, so it
        // cannot grow unbounded (at most 256 entries):
        var seenSurfaceForms = new HashSet<BytesRef>();

        var dedup = 0;
        while (reader.Read(scratch))
        {
            input.Reset(scratch.Bytes, scratch.Offset, scratch.Length);
            short analyzedLength = input.ReadShort();
            analyzed.Grow(analyzedLength + 2);
            input.ReadBytes(analyzed.Bytes, 0, analyzedLength);
            analyzed.Length = analyzedLength;

            long cost = input.ReadInt();

            surface.Bytes = scratch.Bytes;
            if (hasPayloads)
            {
                surface.Length = input.ReadShort();
                surface.Offset = input.Position;
            }
            else
            {
                surface.Offset = input.Position;
                surface.Length = scratch.Length - surface.Offset;
            }

            if (previousAnalyzed == null)
            {
                previousAnalyzed = new BytesRef();
                previousAnalyzed.CopyBytes(analyzed);
                seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
            }
            else if (analyzed.Equals(previousAnalyzed))
            {
                dedup++;
                if (dedup >= maxSurfaceFormsPerAnalyzedForm)
                {
                    // More than maxSurfaceFormsPerAnalyzedForm
                    // dups: skip the rest:
                    continue;
                }
                if (seenSurfaceForms.Contains(surface))
                {
                    continue;
                }
                seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
            }
            else
            {
                dedup = 0;
                previousAnalyzed.CopyBytes(analyzed);
                seenSurfaceForms.Clear();
                seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
            }

            // TODO: I think we can avoid the extra 2 bytes when
            // there is no dup (dedup==0), but we'd have to fix
            // the exactFirst logic ... which would be sort of
            // hairy because we'd need to special case the two
            // (dup/not dup)...

            // NOTE: must be byte 0 so we sort before whatever
            // is next
            analyzed.Bytes[analyzed.Offset + analyzed.Length] = 0;
            analyzed.Bytes[analyzed.Offset + analyzed.Length + 1] = (sbyte)dedup;
            analyzed.Length += 2;

            Util.Fst.Util.ToIntsRef(analyzed, scratchInts);
            //System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
            if (!hasPayloads)
            {
                builder.Add(scratchInts, outputs.NewPair(cost, BytesRef.DeepCopyOf(surface)));
            }
            else
            {
                int payloadOffset = input.Position + surface.Length;
                int payloadLength = scratch.Length - payloadOffset;
                BytesRef br = new BytesRef(surface.Length + 1 + payloadLength);
                Array.Copy(surface.Bytes, surface.Offset, br.Bytes, 0, surface.Length);
                br.Bytes[surface.Length] = PAYLOAD_SEP;
                Array.Copy(scratch.Bytes, payloadOffset, br.Bytes, surface.Length + 1, payloadLength);
                br.Length = br.Bytes.Length;
                builder.Add(scratchInts, outputs.NewPair(cost, br));
            }
        }
        fst = builder.Finish();

        //Util.dotToFile(fst, "/tmp/suggest.dot");
        success = true;
    }
    finally
    {
        if (success)
        {
            IOUtils.Close(reader, writer);
        }
        else
        {
            IOUtils.CloseWhileHandlingException(reader, writer);
        }

        tempInput.Delete();
        tempSorted.Delete();
    }
}
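// Hedged sketch of the sorted on-disk record the loop above parses back, written as a
// hypothetical standalone decoder that mirrors the reader calls used in the method: the
// layout is analyzedLength (short), analyzed bytes, encoded weight (int), then either the
// raw surface bytes to the end of the record, or surfaceLength (short) + surface + payload
// when payloads are enabled.
using Lucene.Net.Store;
using Lucene.Net.Util;

internal static class SuggestRecord
{
    // Fills 'analyzed' and 'surface' from one record in 'scratch' and returns the cost.
    public static int Decode(BytesRef scratch, bool hasPayloads, ByteArrayDataInput input,
                             BytesRef analyzed, BytesRef surface)
    {
        input.Reset(scratch.Bytes, scratch.Offset, scratch.Length);

        short analyzedLength = input.ReadShort();
        analyzed.Grow(analyzedLength);
        input.ReadBytes(analyzed.Bytes, 0, analyzedLength);
        analyzed.Length = analyzedLength;

        int cost = input.ReadInt(); // encoded weight

        surface.Bytes = scratch.Bytes;
        if (hasPayloads)
        {
            surface.Length = input.ReadShort();
            surface.Offset = input.Position; // payload follows at Offset + Length
        }
        else
        {
            surface.Offset = input.Position;
            surface.Length = scratch.Length - surface.Offset;
        }
        return cost;
    }
}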
// Does initial decode of next block of terms; this
// doesn't actually decode the docFreq, totalTermFreq,
// postings details (frq/prx offset, etc.) metadata;
// it just loads them as byte[] blobs which are then
// decoded on-demand if the metadata is ever requested
// for any term in this block. This enables terms-only
// intensive consumes (eg certain MTQs, respelling) to
// not pay the price of decoding metadata they won't
// use.
private bool NextBlock()
{
    // TODO: we still lazy-decode the byte[] for each
    // term (the suffix), but, if we decoded
    // all N terms up front then seeking could do a fast
    // bsearch w/in the block...

    _state.BlockFilePointer = _input.FilePointer;
    _blockTermCount = _input.ReadVInt();
    if (_blockTermCount == 0)
    {
        return false;
    }
    _termBlockPrefix = _input.ReadVInt();

    // term suffixes:
    int len = _input.ReadVInt();
    if (_termSuffixes.Length < len)
    {
        _termSuffixes = new byte[ArrayUtil.Oversize(len, 1)];
    }
    //System.out.println("  termSuffixes len=" + len);
    _input.ReadBytes(_termSuffixes, 0, len);
    _termSuffixesReader.Reset(_termSuffixes, 0, len);

    // docFreq, totalTermFreq
    len = _input.ReadVInt();
    if (_docFreqBytes.Length < len)
    {
        _docFreqBytes = new byte[ArrayUtil.Oversize(len, 1)];
    }
    _input.ReadBytes(_docFreqBytes, 0, len);
    _freqReader.Reset(_docFreqBytes, 0, len);

    // metadata
    len = _input.ReadVInt();
    if (_bytes == null)
    {
        _bytes = new byte[ArrayUtil.Oversize(len, 1)];
        _bytesReader = new ByteArrayDataInput();
    }
    else if (_bytes.Length < len)
    {
        _bytes = new byte[ArrayUtil.Oversize(len, 1)];
    }
    _input.ReadBytes(_bytes, 0, len);
    _bytesReader.Reset(_bytes, 0, len);

    _metaDataUpto = 0;
    _state.TermBlockOrd = 0;
    _blocksSinceSeek++;
    _indexIsCurrent = _indexIsCurrent && (_blocksSinceSeek < _blockTermsReader._indexReader.Divisor);

    return true;
}