internal IntersectTermsEnum(TermsReader outerInstance, CompiledAutomaton compiled, BytesRef startTerm)
    : base(outerInstance)
{
    //if (TEST) System.out.println("Enum init, startTerm=" + startTerm);
    this.fst = outerInstance.index;
    this.fstReader = fst.GetBytesReader();
    this.fstOutputs = outerInstance.index.Outputs;
    this.fsa = compiled.RunAutomaton;
    this.level = -1;
    this.stack = new Frame[16];
    for (int i = 0; i < stack.Length; i++)
    {
        this.stack[i] = new Frame();
    }

    Frame frame;
    /*frame = */ LoadVirtualFrame(NewFrame()); // LUCENENET: IDE0059: Remove unnecessary value assignment
    this.level++;
    frame = LoadFirstFrame(NewFrame());
    PushFrame(frame);

    this.decoded = false;
    this.pending = false;

    if (startTerm == null)
    {
        pending = IsAccept(TopFrame());
    }
    else
    {
        DoSeekCeil(startTerm);
        pending = !startTerm.Equals(term) && IsValid(TopFrame()) && IsAccept(TopFrame());
    }
}
public bool Equals(Term other)
{
    if (object.ReferenceEquals(null, other))
    {
        return false; // 'this' is never null, so a null argument can never be equal
    }
    if (object.ReferenceEquals(this, other))
    {
        return true;
    }
    if (this.GetType() != other.GetType())
    {
        return false;
    }
    if (string.Compare(this.Field_Renamed, other.Field_Renamed, StringComparison.Ordinal) != 0)
    {
        return false;
    }
    if (Bytes_Renamed == null)
    {
        if (other.Bytes_Renamed != null)
        {
            return false;
        }
    }
    else if (!Bytes_Renamed.Equals(other.Bytes_Renamed))
    {
        return false;
    }
    return true;
}
private IDictionary<string, long?> ReadFields(IndexInput @in)
{
    ChecksumIndexInput input = new BufferedChecksumIndexInput(@in);
    var scratch = new BytesRef(10);
    // LUCENENET specific: Use StringComparer.Ordinal to get the same ordering as Java
    var fields = new JCG.SortedDictionary<string, long?>(StringComparer.Ordinal);

    while (true)
    {
        SimpleTextUtil.ReadLine(input, scratch);
        if (scratch.Equals(SimpleTextFieldsWriter.END))
        {
            SimpleTextUtil.CheckFooter(input);
            return fields;
        }
        if (StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.FIELD))
        {
            var fieldName = Encoding.UTF8.GetString(scratch.Bytes,
                scratch.Offset + SimpleTextFieldsWriter.FIELD.Length,
                scratch.Length - SimpleTextFieldsWriter.FIELD.Length);
            fields[fieldName] = input.GetFilePointer();
        }
    }
}
public override SeekStatus SeekCeil(BytesRef text)
{
    //System.out.println("te.seek text=" + field.name + ":" + text.utf8ToString() + " this=" + this);
    current = fstEnum.SeekCeil(text);
    if (current == null)
    {
        return SeekStatus.END;
    }
    else
    {
        // System.out.println("  got term=" + current.input.utf8ToString());
        // for(int i=0;i<current.output.length;i++) {
        //   System.out.println("    " + Integer.toHexString(current.output.bytes[i]&0xFF));
        // }

        didDecode = false;

        if (text.Equals(current.Input))
        {
            //System.out.println("  found!");
            return SeekStatus.FOUND;
        }
        else
        {
            //System.out.println("  not found: " + current.input.utf8ToString());
            return SeekStatus.NOT_FOUND;
        }
    }
}
public override void seekExact(BytesRef target, TermState otherState)
{
    if (!target.Equals(term_Renamed))
    {
        state.copyFrom(otherState);
        term_Renamed = BytesRef.deepCopyOf(target);
        seekPending = true;
    }
}
public override void SeekExact(BytesRef target, TermState otherState)
{
    if (!target.Equals(term))
    {
        state.CopyFrom(otherState);
        term = BytesRef.DeepCopyOf(target);
        seekPending = true;
    }
}
public override IBits ReadLiveDocs(Directory dir, SegmentCommitInfo info, IOContext context)
{
    if (Debugging.AssertsEnabled) { Debugging.Assert(info.HasDeletions); }
    var scratch = new BytesRef();
    var scratchUtf16 = new CharsRef();

    var fileName = IndexFileNames.FileNameFromGeneration(info.Info.Name, LIVEDOCS_EXTENSION, info.DelGen);
    ChecksumIndexInput input = null;
    var success = false;

    try
    {
        input = dir.OpenChecksumInput(fileName, context);

        SimpleTextUtil.ReadLine(input, scratch);
        if (Debugging.AssertsEnabled) { Debugging.Assert(StringHelper.StartsWith(scratch, SIZE)); }
        var size = ParseInt32At(scratch, SIZE.Length, scratchUtf16);

        var bits = new BitSet(size);

        SimpleTextUtil.ReadLine(input, scratch);
        while (!scratch.Equals(END))
        {
            if (Debugging.AssertsEnabled) { Debugging.Assert(StringHelper.StartsWith(scratch, DOC)); }
            var docid = ParseInt32At(scratch, DOC.Length, scratchUtf16);
            bits.Set(docid);
            SimpleTextUtil.ReadLine(input, scratch);
        }

        SimpleTextUtil.CheckFooter(input);

        success = true;
        return new SimpleTextBits(bits, size);
    }
    finally
    {
        if (success)
        {
            IOUtils.Dispose(input);
        }
        else
        {
            IOUtils.DisposeWhileHandlingException(input);
        }
    }
}
/// <summary>
/// Checks if the list of <see cref="Lookup.LookupResult"/>s already has a
/// <paramref name="key"/>. If so, reorders that <see cref="Lookup.LookupResult"/>
/// to the first position.
/// </summary>
/// <returns>
/// Returns <c>true</c> if and only if <paramref name="list"/> contained
/// <paramref name="key"/>.
/// </returns>
private bool CheckExistingAndReorder(IList<Completion> list, BytesRef key)
{
    // We assume list does not have duplicates (because of how the FST is created).
    for (int i = list.Count; --i >= 0;)
    {
        if (key.Equals(list[i].Utf8))
        {
            // Key found. Unless already at i==0, remove it and push up front so
            // that the ordering remains identical with the exception of the exact match.
            var element = list[i];
            list.Remove(element);
            list.Insert(0, element);
            return true;
        }
    }
    return false;
}
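The move-to-front step above is independent of the Lucene types involved. A minimal standalone sketch of the same pattern over a plain list (the PromoteIfPresent helper is hypothetical, not part of the Lucene.NET API):

using System.Collections.Generic;

static class ListReorder
{
    // Hypothetical helper mirroring CheckExistingAndReorder: scan for an
    // equal key and, if found, move that element to index 0.
    public static bool PromoteIfPresent<T>(IList<T> list, T key)
    {
        for (int i = list.Count; --i >= 0;)
        {
            if (EqualityComparer<T>.Default.Equals(list[i], key))
            {
                T element = list[i];
                list.RemoveAt(i);        // RemoveAt avoids a second equality scan
                list.Insert(0, element); // promote the exact match to the front
                return true;
            }
        }
        return false;
    }
}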
/// <summary>
/// Returns grouped facet results that were computed over zero or more segments.
/// Grouped facet counts are merged from zero or more segment results.
/// </summary>
/// <param name="size">The total number of facets to include. This is typically offset + limit</param>
/// <param name="minCount">The minimum count a facet entry should have to be included in the grouped facet result</param>
/// <param name="orderByCount">
/// Whether to sort the facet entries by facet entry count. If <c>false</c> then the facets
/// are sorted lexicographically in ascending order.
/// </param>
/// <returns>grouped facet results</returns>
/// <exception cref="System.IO.IOException">If I/O related errors occur during merging segment grouped facet counts.</exception>
public virtual GroupedFacetResult MergeSegmentResults(int size, int minCount, bool orderByCount)
{
    if (m_segmentFacetCounts != null)
    {
        m_segmentResults.Add(CreateSegmentResult());
        m_segmentFacetCounts = null; // reset
    }

    int totalCount = 0;
    int missingCount = 0;
    SegmentResultPriorityQueue segments = new SegmentResultPriorityQueue(m_segmentResults.Count);
    foreach (AbstractSegmentResult segmentResult in m_segmentResults)
    {
        missingCount += segmentResult.m_missing;
        if (segmentResult.m_mergePos >= segmentResult.m_maxTermPos)
        {
            continue;
        }
        totalCount += segmentResult.m_total;
        segments.Add(segmentResult);
    }

    GroupedFacetResult facetResult = new GroupedFacetResult(size, minCount, orderByCount, totalCount, missingCount);
    while (segments.Count > 0)
    {
        AbstractSegmentResult segmentResult = segments.Top;
        BytesRef currentFacetValue = BytesRef.DeepCopyOf(segmentResult.m_mergeTerm);
        int count = 0;

        do
        {
            count += segmentResult.m_counts[segmentResult.m_mergePos++];
            if (segmentResult.m_mergePos < segmentResult.m_maxTermPos)
            {
                segmentResult.NextTerm();
                segmentResult = segments.UpdateTop();
            }
            else
            {
                segments.Pop();
                segmentResult = segments.Top;
                if (segmentResult == null)
                {
                    break;
                }
            }
        } while (currentFacetValue.Equals(segmentResult.m_mergeTerm));
        facetResult.AddFacetCount(currentFacetValue, count);
    }
    return facetResult;
}
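The merge loop above is a k-way merge over per-segment term lists that are each sorted: while the same facet value sits at the head of several segments, their counts are summed before the merged entry is emitted. A minimal sketch of that idea, assuming plain sorted streams of (term, count) tuples rather than Lucene's segment results and priority queue:

using System;
using System.Collections.Generic;
using System.Linq;

static class FacetMerge
{
    // Merge sorted (term, count) streams, summing counts for equal terms.
    // A simplified stand-in for the priority-queue merge in MergeSegmentResults.
    public static IEnumerable<(string Term, int Count)> Merge(
        params IEnumerable<(string Term, int Count)>[] segments)
    {
        var heads = segments.Select(s => s.GetEnumerator())
                            .Where(e => e.MoveNext())
                            .ToList();
        while (heads.Count > 0)
        {
            // Smallest term currently at the head of any segment.
            string current = heads.Select(e => e.Current.Term)
                                  .OrderBy(t => t, StringComparer.Ordinal)
                                  .First();
            int total = 0;
            // Consume every head that matches the current term.
            for (int i = heads.Count - 1; i >= 0; i--)
            {
                while (heads[i].Current.Term == current)
                {
                    total += heads[i].Current.Count;
                    if (!heads[i].MoveNext()) { heads.RemoveAt(i); break; }
                }
            }
            yield return (current, total);
        }
    }
}

For example, merging [("a", 2), ("b", 1)] with [("a", 3)] yields ("a", 5) then ("b", 1).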
public virtual void TestFromBytes()
{
    sbyte[] bytes = new sbyte[] { (sbyte)'a', (sbyte)'b', (sbyte)'c', (sbyte)'d' };
    BytesRef b = new BytesRef(bytes);
    Assert.AreEqual(bytes, b.Bytes);
    Assert.AreEqual(0, b.Offset);
    Assert.AreEqual(4, b.Length);

    BytesRef b2 = new BytesRef(bytes, 1, 3);
    Assert.AreEqual("bcd", b2.Utf8ToString());

    Assert.IsFalse(b.Equals(b2));
}
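As the test relies on, BytesRef.Equals compares the bytes of the referenced slice, not the identity of the backing array, so two refs over different arrays are equal exactly when their slices match. A minimal illustration, assuming the byte[]-based constructors of current Lucene.NET:

using Lucene.Net.Util;

class BytesRefEqualityDemo
{
    static void Main()
    {
        // Two different backing arrays whose selected slices hold the same bytes.
        var a = new BytesRef(new byte[] { (byte)'a', (byte)'b', (byte)'c' });
        var b = new BytesRef(new byte[] { (byte)'x', (byte)'a', (byte)'b', (byte)'c' }, 1, 3);

        System.Console.WriteLine(a.Equals(b));                    // True: same slice content
        System.Console.WriteLine(a.Bytes == b.Bytes);             // False: different arrays
        System.Console.WriteLine(new BytesRef("abc").Equals(a));  // True: UTF-8 bytes of "abc"
    }
}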
public override bool Equals(object obj)
{
    if (this == obj)
    {
        return true;
    }
    if (!base.Equals(obj))
    {
        return false;
    }
    if (this.GetType() != obj.GetType())
    {
        return false;
    }
    TermRangeQuery other = (TermRangeQuery)obj;
    if (IncludeLower != other.IncludeLower)
    {
        return false;
    }
    if (IncludeUpper != other.IncludeUpper)
    {
        return false;
    }
    if (LowerTerm_Renamed == null)
    {
        if (other.LowerTerm_Renamed != null)
        {
            return false;
        }
    }
    else if (!LowerTerm_Renamed.Equals(other.LowerTerm_Renamed))
    {
        return false;
    }
    if (UpperTerm_Renamed == null)
    {
        if (other.UpperTerm_Renamed != null)
        {
            return false;
        }
    }
    else if (!UpperTerm_Renamed.Equals(other.UpperTerm_Renamed))
    {
        return false;
    }
    return true;
}
public override bool Equals(object obj)
{
    if (this == obj)
    {
        return true;
    }
    if (!base.Equals(obj))
    {
        return false;
    }
    if (this.GetType() != obj.GetType())
    {
        return false;
    }
    TermRangeQuery other = (TermRangeQuery)obj;
    if (includeLower != other.includeLower)
    {
        return false;
    }
    if (includeUpper != other.includeUpper)
    {
        return false;
    }
    if (lowerTerm == null)
    {
        if (other.lowerTerm != null)
        {
            return false;
        }
    }
    else if (!lowerTerm.Equals(other.lowerTerm))
    {
        return false;
    }
    if (upperTerm == null)
    {
        if (other.upperTerm != null)
        {
            return false;
        }
    }
    else if (!upperTerm.Equals(other.upperTerm))
    {
        return false;
    }
    return true;
}
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: @Override public void checkIntegrity() throws java.io.IOException
public override void checkIntegrity()
{
    BytesRef scratch = new BytesRef();
    IndexInput clone = data.clone();
    clone.seek(0);
    ChecksumIndexInput input = new BufferedChecksumIndexInput(clone);
    while (true)
    {
        SimpleTextUtil.ReadLine(input, scratch);
        if (scratch.Equals(END))
        {
            SimpleTextUtil.CheckFooter(input);
            break;
        }
    }
}
/// <remarks>
/// we don't actually write a .fdx-like index, instead we read the
/// stored fields file in entirety up-front and save the offsets
/// so we can seek to the documents later.
/// </remarks>
private void ReadIndex(int size)
{
    ChecksumIndexInput input = new BufferedChecksumIndexInput(_input);
    _offsets = new long[size];
    var upto = 0;
    while (!_scratch.Equals(SimpleTextStoredFieldsWriter.END))
    {
        SimpleTextUtil.ReadLine(input, _scratch);
        if (StringHelper.StartsWith(_scratch, SimpleTextStoredFieldsWriter.DOC))
        {
            _offsets[upto] = input.GetFilePointer();
            upto++;
        }
    }
    SimpleTextUtil.CheckFooter(input);
    Debug.Assert(upto == _offsets.Length);
}
// we don't actually write a .fdx-like index, instead we read the
// stored fields file in entirety up-front and save the offsets
// so we can seek to the documents later.
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: private void readIndex(int size) throws java.io.IOException
private void readIndex(int size)
{
    ChecksumIndexInput input = new BufferedChecksumIndexInput(@in);
    offsets = new long[size];
    int upto = 0;
    while (!scratch.Equals(END))
    {
        SimpleTextUtil.ReadLine(input, scratch);
        if (StringHelper.StartsWith(scratch, DOC))
        {
            offsets[upto] = input.FilePointer;
            upto++;
        }
    }
    SimpleTextUtil.CheckFooter(input);
    Debug.Assert(upto == offsets.Length);
}
public override void FinishTerm(BytesRef text, TermStats stats)
{
    Debug.Assert(state == TermsConsumerState.START);
    state = TermsConsumerState.INITIAL;
    Debug.Assert(text.Equals(lastTerm));
    Debug.Assert(stats.DocFreq > 0); // otherwise, this method should not be called.
    Debug.Assert(stats.DocFreq == lastPostingsConsumer.docFreq);
    sumDocFreq += stats.DocFreq;
    if (fieldInfo.IndexOptions == IndexOptions.DOCS_ONLY)
    {
        Debug.Assert(stats.TotalTermFreq == -1);
    }
    else
    {
        Debug.Assert(stats.TotalTermFreq == lastPostingsConsumer.totalTermFreq);
        sumTotalTermFreq += stats.TotalTermFreq;
    }
    @in.FinishTerm(text, stats);
}
public override void CheckIntegrity()
{
    var iScratch = new BytesRef();
    var clone = (IndexInput)data.Clone();
    clone.Seek(0);

    ChecksumIndexInput input = new BufferedChecksumIndexInput(clone);

    while (true)
    {
        SimpleTextUtil.ReadLine(input, iScratch);
        if (!iScratch.Equals(SimpleTextDocValuesWriter.END))
        {
            continue;
        }
        SimpleTextUtil.CheckFooter(input);
        break;
    }
}
private SortedDictionary<string, long?> ReadFields(IndexInput @in)
{
    ChecksumIndexInput input = new BufferedChecksumIndexInput(@in);
    BytesRef scratch = new BytesRef(10);
    SortedDictionary<string, long?> fields = new SortedDictionary<string, long?>();

    while (true)
    {
        SimpleTextUtil.ReadLine(input, scratch);
        if (scratch.Equals(END))
        {
            SimpleTextUtil.CheckFooter(input);
            return fields;
        }
        else if (StringHelper.StartsWith(scratch, FIELD))
        {
            // Decode the field name as UTF-8 (the original Java used a
            // StandardCharsets.UTF_8 String constructor, which has no
            // direct equivalent in .NET).
            string fieldName = Encoding.UTF8.GetString(scratch.Bytes,
                scratch.Offset + FIELD.Length,
                scratch.Length - FIELD.Length);
            fields[fieldName] = input.FilePointer;
        }
    }
}
/// <remarks>
/// We don't actually write a .fdx-like index, instead we read the
/// stored fields file in entirety up-front and save the offsets
/// so we can seek to the documents later.
/// </remarks>
private void ReadIndex(int size)
{
    ChecksumIndexInput input = new BufferedChecksumIndexInput(_input);
    _offsets = new long[size];
    var upto = 0;
    while (!_scratch.Equals(SimpleTextStoredFieldsWriter.END))
    {
        SimpleTextUtil.ReadLine(input, _scratch);
        if (StringHelper.StartsWith(_scratch, SimpleTextStoredFieldsWriter.DOC))
        {
            _offsets[upto] = input.Position; // LUCENENET specific: Renamed from getFilePointer() to match FileStream
            upto++;
        }
    }
    SimpleTextUtil.CheckFooter(input);
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(upto == _offsets.Length);
    }
}
// we don't actually write a .tvx-like index, instead we read the
// vectors file in entirety up-front and save the offsets
// so we can seek to the data later.
private void ReadIndex(int maxDoc)
{
    ChecksumIndexInput input = new BufferedChecksumIndexInput(_input);
    _offsets = new long[maxDoc];
    int upto = 0;
    while (!_scratch.Equals(SimpleTextTermVectorsWriter.END))
    {
        SimpleTextUtil.ReadLine(input, _scratch);
        if (StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.DOC))
        {
            _offsets[upto] = input.GetFilePointer();
            upto++;
        }
    }
    SimpleTextUtil.CheckFooter(input);
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(upto == _offsets.Length);
    }
}
public override void Build(IInputIterator iterator)
{
    if (iterator.HasPayloads)
    {
        throw new ArgumentException("this suggester doesn't support payloads");
    }
    if (iterator.HasContexts)
    {
        throw new ArgumentException("this suggester doesn't support contexts");
    }
    count = 0;
    var scratch = new BytesRef();
    IInputIterator iter = new WFSTInputIterator(this, iterator);
    var scratchInts = new Int32sRef();
    BytesRef previous = null;
    var outputs = PositiveInt32Outputs.Singleton;
    var builder = new Builder<long?>(FST.INPUT_TYPE.BYTE1, outputs);
    while ((scratch = iter.Next()) != null)
    {
        long cost = iter.Weight;

        if (previous == null)
        {
            previous = new BytesRef();
        }
        else if (scratch.Equals(previous))
        {
            continue; // for duplicate suggestions, the best weight is actually added
        }
        Lucene.Net.Util.Fst.Util.ToInt32sRef(scratch, scratchInts);
        builder.Add(scratchInts, cost);
        previous.CopyBytes(scratch);
        count++;
    }
    fst = builder.Finish();
}
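Because the wrapped iterator returns suggestions in sorted order, duplicates arrive adjacently and the loop above keeps only the first of each run. A minimal standalone sketch of that duplicate-skipping pattern, assuming (as the code comment indicates) that the first occurrence in a run is the one to keep:

using System.Collections.Generic;

static class Dedup
{
    // Keep only the first element of each run of equal, adjacent elements.
    // Mirrors the scratch/previous comparison in the Build loop above.
    public static IEnumerable<string> FirstOfEachRun(IEnumerable<string> sorted)
    {
        string previous = null;
        foreach (var current in sorted)
        {
            if (current == previous) continue; // skip later duplicates
            yield return current;
            previous = current;
        }
    }
}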
public override bool Equals(object obj)
{
    if (this == obj)
    {
        return true;
    }
    if (obj == null)
    {
        return false;
    }
    if (this.GetType() != obj.GetType())
    {
        return false;
    }
    CompiledAutomaton other = (CompiledAutomaton)obj;
    if (Type != other.Type)
    {
        return false;
    }
    if (Type == AUTOMATON_TYPE.SINGLE || Type == AUTOMATON_TYPE.PREFIX)
    {
        if (!Term.Equals(other.Term))
        {
            return false;
        }
    }
    else if (Type == AUTOMATON_TYPE.NORMAL)
    {
        if (!RunAutomaton.Equals(other.RunAutomaton))
        {
            return false;
        }
    }
    return true;
}
internal IntersectTermsEnum(FSTTermsReader.TermsReader outerInstance, CompiledAutomaton compiled, BytesRef startTerm)
    : base(outerInstance)
{
    this.outerInstance = outerInstance;
    //if (TEST) System.out.println("Enum init, startTerm=" + startTerm);
    this.fst = outerInstance.dict;
    this.fstReader = fst.GetBytesReader();
    this.fstOutputs = outerInstance.dict.Outputs;
    this.fsa = compiled.RunAutomaton;
    this.level = -1;
    this.stack = new Frame[16];
    for (int i = 0; i < stack.Length; i++)
    {
        this.stack[i] = new Frame(this);
    }

    Frame frame;
    frame = LoadVirtualFrame(NewFrame());
    this.level++;
    frame = LoadFirstFrame(NewFrame());
    PushFrame(frame);

    this.meta = null;
    this.metaUpto = 1;
    this.decoded = false;
    this.pending = false;

    if (startTerm == null)
    {
        pending = IsAccept(TopFrame());
    }
    else
    {
        DoSeekCeil(startTerm);
        pending = !startTerm.Equals(term) && IsValid(TopFrame()) && IsAccept(TopFrame());
    }
}
protected override AcceptStatus Accept(BytesRef term)
{
    if (!this.includeLower && term.Equals(lowerBytesRef))
    {
        return AcceptStatus.NO;
    }

    // Use this field's default sort ordering
    if (upperBytesRef != null)
    {
        int cmp = termComp.Compare(upperBytesRef, term);
        /*
         * if beyond the upper term, or is exclusive and this is equal to
         * the upper term, break out
         */
        if ((cmp < 0) || (!includeUpper && cmp == 0))
        {
            return AcceptStatus.END;
        }
    }
    return AcceptStatus.YES;
}
public override bool Equals(object o)
{
    if (this == o)
    {
        return true;
    }
    if (o == null || GetType() != o.GetType())
    {
        return false;
    }

    FacetEntry that = (FacetEntry)o;

    if (count != that.count)
    {
        return false;
    }
    if (!value.Equals(that.value))
    {
        return false;
    }

    return true;
}
public override void FinishTerm(BytesRef text, TermStats stats)
{
    Debug.Assert(State == TermsConsumerState.START);
    State = TermsConsumerState.INITIAL;
    Debug.Assert(text.Equals(LastTerm));
    Debug.Assert(stats.DocFreq > 0); // otherwise, this method should not be called.
    Debug.Assert(stats.DocFreq == LastPostingsConsumer.DocFreq);
    SumDocFreq += stats.DocFreq;
    if (fieldInfo.FieldIndexOptions == FieldInfo.IndexOptions.DOCS_ONLY)
    {
        Debug.Assert(stats.TotalTermFreq == -1);
    }
    else
    {
        Debug.Assert(stats.TotalTermFreq == LastPostingsConsumer.TotalTermFreq);
        SumTotalTermFreq += stats.TotalTermFreq;
    }
    @in.FinishTerm(text, stats);
}
public override void Build(IInputEnumerator enumerator)
{
    if (enumerator.HasContexts)
    {
        throw new ArgumentException("this suggester doesn't support contexts");
    }
    string prefix = this.GetType().Name;
    var directory = OfflineSorter.DefaultTempDir();
    var tempInput = FileSupport.CreateTempFile(prefix, ".input", directory);
    var tempSorted = FileSupport.CreateTempFile(prefix, ".sorted", directory);

    hasPayloads = enumerator.HasPayloads;

    var writer = new OfflineSorter.ByteSequencesWriter(tempInput);
    OfflineSorter.ByteSequencesReader reader = null;
    var scratch = new BytesRef();

    TokenStreamToAutomaton ts2a = GetTokenStreamToAutomaton();

    bool success = false;
    count = 0;
    byte[] buffer = new byte[8];
    try
    {
        var output = new ByteArrayDataOutput(buffer);
        BytesRef surfaceForm;

        while (enumerator.MoveNext())
        {
            surfaceForm = enumerator.Current;
            ISet<Int32sRef> paths = ToFiniteStrings(surfaceForm, ts2a);

            maxAnalyzedPathsForOneInput = Math.Max(maxAnalyzedPathsForOneInput, paths.Count);

            foreach (Int32sRef path in paths)
            {
                Util.Fst.Util.ToBytesRef(path, scratch);

                // length of the analyzed text (FST input)
                if (scratch.Length > ushort.MaxValue - 2)
                {
                    throw new ArgumentException("cannot handle analyzed forms > " + (ushort.MaxValue - 2) +
                        " in length (got " + scratch.Length + ")");
                }
                ushort analyzedLength = (ushort)scratch.Length;

                // compute the required length:
                // analyzed sequence + weight (4) + surface + analyzedLength (short)
                int requiredLength = analyzedLength + 4 + surfaceForm.Length + 2;

                BytesRef payload;

                if (hasPayloads)
                {
                    if (surfaceForm.Length > (ushort.MaxValue - 2))
                    {
                        throw new ArgumentException("cannot handle surface form > " + (ushort.MaxValue - 2) +
                            " in length (got " + surfaceForm.Length + ")");
                    }
                    payload = enumerator.Payload;
                    // payload + surfaceLength (short)
                    requiredLength += payload.Length + 2;
                }
                else
                {
                    payload = null;
                }

                buffer = ArrayUtil.Grow(buffer, requiredLength);

                output.Reset(buffer);

                output.WriteInt16((short)analyzedLength);

                output.WriteBytes(scratch.Bytes, scratch.Offset, scratch.Length);

                output.WriteInt32(EncodeWeight(enumerator.Weight));

                if (hasPayloads)
                {
                    for (int i = 0; i < surfaceForm.Length; i++)
                    {
                        if (surfaceForm.Bytes[i] == PAYLOAD_SEP)
                        {
                            throw new ArgumentException(
                                "surface form cannot contain unit separator character U+001F; this character is reserved");
                        }
                    }
                    output.WriteInt16((short)surfaceForm.Length);
                    output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
                    output.WriteBytes(payload.Bytes, payload.Offset, payload.Length);
                }
                else
                {
                    output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
                }

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(output.Position == requiredLength, () => output.Position + " vs " + requiredLength);
                }

                writer.Write(buffer, 0, output.Position);
            }
            count++;
        }
        writer.Dispose();

        // Sort all input/output pairs (required by FST.Builder):
        (new OfflineSorter(new AnalyzingComparer(hasPayloads))).Sort(tempInput, tempSorted);

        // Free disk space:
        tempInput.Delete();

        reader = new OfflineSorter.ByteSequencesReader(tempSorted);

        var outputs = new PairOutputs<long?, BytesRef>(PositiveInt32Outputs.Singleton, ByteSequenceOutputs.Singleton);
        var builder = new Builder<PairOutputs<long?, BytesRef>.Pair>(FST.INPUT_TYPE.BYTE1, outputs);

        // Build FST:
        BytesRef previousAnalyzed = null;
        BytesRef analyzed = new BytesRef();
        BytesRef surface = new BytesRef();
        Int32sRef scratchInts = new Int32sRef();
        var input = new ByteArrayDataInput();

        // Used to remove duplicate surface forms (but we
        // still index the highest-weight one). We clear
        // this when we see a new analyzed form, so it cannot
        // grow unbounded (at most 256 entries):
        var seenSurfaceForms = new JCG.HashSet<BytesRef>();

        var dedup = 0;
        while (reader.Read(scratch))
        {
            input.Reset(scratch.Bytes, scratch.Offset, scratch.Length);
            ushort analyzedLength = (ushort)input.ReadInt16();
            analyzed.Grow(analyzedLength + 2);
            input.ReadBytes(analyzed.Bytes, 0, analyzedLength);
            analyzed.Length = analyzedLength;

            long cost = input.ReadInt32();

            surface.Bytes = scratch.Bytes;
            if (hasPayloads)
            {
                surface.Length = (ushort)input.ReadInt16();
                surface.Offset = input.Position;
            }
            else
            {
                surface.Offset = input.Position;
                surface.Length = scratch.Length - surface.Offset;
            }

            if (previousAnalyzed == null)
            {
                previousAnalyzed = new BytesRef();
                previousAnalyzed.CopyBytes(analyzed);
                seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
            }
            else if (analyzed.Equals(previousAnalyzed))
            {
                dedup++;
                if (dedup >= maxSurfaceFormsPerAnalyzedForm)
                {
                    // More than maxSurfaceFormsPerAnalyzedForm
                    // dups: skip the rest:
                    continue;
                }
                if (seenSurfaceForms.Contains(surface))
                {
                    continue;
                }
                seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
            }
            else
            {
                dedup = 0;
                previousAnalyzed.CopyBytes(analyzed);
                seenSurfaceForms.Clear();
                seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
            }

            // TODO: I think we can avoid the extra 2 bytes when
            // there is no dup (dedup==0), but we'd have to fix
            // the exactFirst logic ... which would be sort of
            // hairy because we'd need to special case the two
            // (dup/not dup)...

            // NOTE: must be byte 0 so we sort before whatever
            // is next
            analyzed.Bytes[analyzed.Offset + analyzed.Length] = 0;
            analyzed.Bytes[analyzed.Offset + analyzed.Length + 1] = (byte)dedup;
            analyzed.Length += 2;

            Util.Fst.Util.ToInt32sRef(analyzed, scratchInts);
            //System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
            if (!hasPayloads)
            {
                builder.Add(scratchInts, outputs.NewPair(cost, BytesRef.DeepCopyOf(surface)));
            }
            else
            {
                int payloadOffset = input.Position + surface.Length;
                int payloadLength = scratch.Length - payloadOffset;
                BytesRef br = new BytesRef(surface.Length + 1 + payloadLength);
                Array.Copy(surface.Bytes, surface.Offset, br.Bytes, 0, surface.Length);
                br.Bytes[surface.Length] = PAYLOAD_SEP;
                Array.Copy(scratch.Bytes, payloadOffset, br.Bytes, surface.Length + 1, payloadLength);
                br.Length = br.Bytes.Length;
                builder.Add(scratchInts, outputs.NewPair(cost, br));
            }
        }
        fst = builder.Finish();

        //Util.dotToFile(fst, "/tmp/suggest.dot");

        success = true;
    }
    finally
    {
        if (success)
        {
            IOUtils.Dispose(reader, writer);
        }
        else
        {
            IOUtils.DisposeWhileHandlingException(reader, writer);
        }
        tempInput.Delete();
        tempSorted.Delete();
    }
}
public override bool SeekExact(BytesRef term)
{
    queue.Clear();
    numTop = 0;

    bool seekOpt = false;
    if (lastSeek != null && termComp.Compare(lastSeek, term) <= 0)
    {
        seekOpt = true;
    }

    lastSeek = null;
    lastSeekExact = true;

    for (int i = 0; i < numSubs; i++)
    {
        bool status;
        // LUCENE-2130: if we had just seek'd already, prior
        // to this seek, and the new seek term is after the
        // previous one, don't try to re-seek this sub if its
        // current term is already beyond this new seek term.
        // Doing so is a waste because this sub will simply
        // seek to the same spot.
        if (seekOpt)
        {
            BytesRef curTerm = currentSubs[i].Current;
            if (curTerm != null)
            {
                int cmp = termComp.Compare(term, curTerm);
                if (cmp == 0)
                {
                    status = true;
                }
                else if (cmp < 0)
                {
                    status = false;
                }
                else
                {
                    status = currentSubs[i].Terms.SeekExact(term);
                }
            }
            else
            {
                status = false;
            }
        }
        else
        {
            status = currentSubs[i].Terms.SeekExact(term);
        }

        if (status)
        {
            top[numTop++] = currentSubs[i];
            current = currentSubs[i].Current = currentSubs[i].Terms.Term;
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(term.Equals(currentSubs[i].Current));
            }
        }
    }

    // if at least one sub had exact match to the requested
    // term then we found match
    return numTop > 0;
}
/// <summary>
/// returns true if term is within k edits of the query term </summary>
internal bool Matches(BytesRef term, int k)
{
    return k == 0 ? term.Equals(TermRef) : Matchers[k].Run(term.Bytes, term.Offset, term.Length);
}
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: @Override public SeekStatus seekCeil(util.BytesRef text) throws java.io.IOException
public override SeekStatus seekCeil(BytesRef text)
{
    //System.out.println("te.seek text=" + field.name + ":" + text.utf8ToString() + " this=" + this);
    current = fstEnum.seekCeil(text);
    if (current == null)
    {
        return SeekStatus.END;
    }
    else
    {
        // System.out.println("  got term=" + current.input.utf8ToString());
        // for(int i=0;i<current.output.length;i++) {
        //   System.out.println("    " + Integer.toHexString(current.output.bytes[i]&0xFF));
        // }

        didDecode = false;

        if (text.Equals(current.input))
        {
            //System.out.println("  found!");
            return SeekStatus.FOUND;
        }
        else
        {
            //System.out.println("  not found: " + current.input.utf8ToString());
            return SeekStatus.NOT_FOUND;
        }
    }
}
public DirectIntersectTermsEnum(DirectPostingsFormat.DirectField outerInstance, CompiledAutomaton compiled, BytesRef startTerm)
{
    this.outerInstance = outerInstance;
    runAutomaton = compiled.RunAutomaton;
    compiledAutomaton = compiled;
    termOrd = -1;
    states = new State[1];
    states[0] = new State(this);
    states[0].changeOrd = outerInstance.terms.Length;
    states[0].state = runAutomaton.InitialState;
    states[0].transitions = compiledAutomaton.SortedTransitions[states[0].state];
    states[0].transitionUpto = -1;
    states[0].transitionMax = -1;

    //System.out.println("IE.init startTerm=" + startTerm);

    if (startTerm != null)
    {
        int skipUpto = 0;
        if (startTerm.Length == 0)
        {
            if (outerInstance.terms.Length > 0 && outerInstance.termOffsets[1] == 0)
            {
                termOrd = 0;
            }
        }
        else
        {
            termOrd++;

            for (int i = 0; i < startTerm.Length; i++)
            {
                int label = startTerm.Bytes[startTerm.Offset + i] & 0xFF;

                while (label > states[i].transitionMax)
                {
                    states[i].transitionUpto++;
                    Debug.Assert(states[i].transitionUpto < states[i].transitions.Length);
                    states[i].transitionMin = states[i].transitions[states[i].transitionUpto].Min;
                    states[i].transitionMax = states[i].transitions[states[i].transitionUpto].Max;
                    Debug.Assert(states[i].transitionMin >= 0);
                    Debug.Assert(states[i].transitionMin <= 255);
                    Debug.Assert(states[i].transitionMax >= 0);
                    Debug.Assert(states[i].transitionMax <= 255);
                }

                // Skip forwards until we find a term matching
                // the label at this position:
                while (termOrd < outerInstance.terms.Length)
                {
                    int skipOffset = outerInstance.skipOffsets[termOrd];
                    int numSkips = outerInstance.skipOffsets[termOrd + 1] - skipOffset;
                    int termOffset_i = outerInstance.termOffsets[termOrd];
                    int termLength = outerInstance.termOffsets[1 + termOrd] - termOffset_i;

                    // if (DEBUG) {
                    //   System.out.println("  check termOrd=" + termOrd + " term=" + new BytesRef(termBytes, termOffset, termLength).utf8ToString() + " skips=" + Arrays.toString(skips) + " i=" + i);
                    // }

                    if (termOrd == states[stateUpto].changeOrd)
                    {
                        // if (DEBUG) {
                        //   System.out.println("  end push return");
                        // }
                        stateUpto--;
                        termOrd--;
                        return;
                    }

                    if (termLength == i)
                    {
                        termOrd++;
                        skipUpto = 0;
                        // if (DEBUG) {
                        //   System.out.println("    term too short; next term");
                        // }
                    }
                    else if (label < (outerInstance.termBytes[termOffset_i + i] & 0xFF))
                    {
                        termOrd--;
                        // if (DEBUG) {
                        //   System.out.println("  no match; already beyond; return termOrd=" + termOrd);
                        // }
                        stateUpto -= skipUpto;
                        Debug.Assert(stateUpto >= 0);
                        return;
                    }
                    else if (label == (outerInstance.termBytes[termOffset_i + i] & 0xFF))
                    {
                        // if (DEBUG) {
                        //   System.out.println("  label[" + i + "] matches");
                        // }
                        if (skipUpto < numSkips)
                        {
                            Grow();

                            int nextState = runAutomaton.Step(states[stateUpto].state, label);

                            // Automaton is required to accept startTerm:
                            Debug.Assert(nextState != -1);

                            stateUpto++;
                            states[stateUpto].changeOrd = outerInstance.skips[skipOffset + skipUpto++];
                            states[stateUpto].state = nextState;
                            states[stateUpto].transitions = compiledAutomaton.SortedTransitions[nextState];
                            states[stateUpto].transitionUpto = -1;
                            states[stateUpto].transitionMax = -1;
                            //System.out.println("  push " + states[stateUpto].transitions.length + " trans");

                            // if (DEBUG) {
                            //   System.out.println("  push skip; changeOrd=" + states[stateUpto].changeOrd);
                            // }

                            // Match next label at this same term:
                            goto nextLabelContinue;
                        }
                        else
                        {
                            // if (DEBUG) {
                            //   System.out.println("  linear scan");
                            // }

                            // Index exhausted: just scan now (the
                            // number of scans required will be less
                            // than the minSkipCount):
                            int startTermOrd = termOrd;
                            while (termOrd < outerInstance.terms.Length && outerInstance.Compare(termOrd, startTerm) <= 0)
                            {
                                Debug.Assert(termOrd == startTermOrd || outerInstance.skipOffsets[termOrd] == outerInstance.skipOffsets[termOrd + 1]);
                                termOrd++;
                            }
                            Debug.Assert(termOrd - startTermOrd < outerInstance.minSkipCount);
                            termOrd--;
                            stateUpto -= skipUpto;
                            // if (DEBUG) {
                            //   System.out.println("  end termOrd=" + termOrd);
                            // }
                            return;
                        }
                    }
                    else
                    {
                        if (skipUpto < numSkips)
                        {
                            termOrd = outerInstance.skips[skipOffset + skipUpto];
                            // if (DEBUG) {
                            //   System.out.println("  no match; skip to termOrd=" + termOrd);
                            // }
                        }
                        else
                        {
                            // if (DEBUG) {
                            //   System.out.println("  no match; next term");
                            // }
                            termOrd++;
                        }
                        skipUpto = 0;
                    }
                }

                // startTerm is >= last term so enum will not
                // return any terms:
                termOrd--;
                // if (DEBUG) {
                //   System.out.println("  beyond end; no terms will match");
                // }
                return;

                nextLabelContinue: ;
            }
            nextLabelBreak: ;
        }

        int termOffset = outerInstance.termOffsets[termOrd];
        int termLen = outerInstance.termOffsets[1 + termOrd] - termOffset;

        if (termOrd >= 0 && !startTerm.Equals(new BytesRef(outerInstance.termBytes, termOffset, termLen)))
        {
            stateUpto -= skipUpto;
            termOrd--;
        }

        // if (DEBUG) {
        //   System.out.println("  loop end; return termOrd=" + termOrd + " stateUpto=" + stateUpto);
        // }
    }
}
public override SeekStatus SeekCeil(BytesRef target)
{
    // already here
    if (term != null && term.Equals(target))
    {
        return SeekStatus.FOUND;
    }

    int startIdx = Array.BinarySearch(outerInstance.m_indexedTermsArray, target);

    if (startIdx >= 0)
    {
        // we hit the term exactly... lucky us!
        TermsEnum.SeekStatus seekStatus = termsEnum.SeekCeil(target);
        if (Debugging.AssertsEnabled) { Debugging.Assert(seekStatus == TermsEnum.SeekStatus.FOUND); }
        ord = startIdx << outerInstance.indexIntervalBits;
        SetTerm();
        if (Debugging.AssertsEnabled) { Debugging.Assert(term != null); }
        return SeekStatus.FOUND;
    }

    // we didn't hit the term exactly
    startIdx = -startIdx - 1;

    if (startIdx == 0)
    {
        // our target occurs *before* the first term
        TermsEnum.SeekStatus seekStatus = termsEnum.SeekCeil(target);
        if (Debugging.AssertsEnabled) { Debugging.Assert(seekStatus == TermsEnum.SeekStatus.NOT_FOUND); }
        ord = 0;
        SetTerm();
        if (Debugging.AssertsEnabled) { Debugging.Assert(term != null); }
        return SeekStatus.NOT_FOUND;
    }

    // back up to the start of the block
    startIdx--;

    if ((ord >> outerInstance.indexIntervalBits) == startIdx && term != null && term.CompareTo(target) <= 0)
    {
        // we are already in the right block and the current term is before the term we want,
        // so we don't need to seek.
    }
    else
    {
        // seek to the right block
        TermsEnum.SeekStatus seekStatus = termsEnum.SeekCeil(outerInstance.m_indexedTermsArray[startIdx]);
        if (Debugging.AssertsEnabled) { Debugging.Assert(seekStatus == TermsEnum.SeekStatus.FOUND); }
        ord = startIdx << outerInstance.indexIntervalBits;
        SetTerm();
        if (Debugging.AssertsEnabled) { Debugging.Assert(term != null); } // should be non-null since it's in the index
    }

    while (term != null && term.CompareTo(target) < 0)
    {
        Next();
    }

    if (term == null)
    {
        return SeekStatus.END;
    }
    else if (term.CompareTo(target) == 0)
    {
        return SeekStatus.FOUND;
    }
    else
    {
        return SeekStatus.NOT_FOUND;
    }
}
public override SeekStatus SeekCeil(BytesRef target)
{
    // already here
    if (Term_Renamed != null && Term_Renamed.Equals(target))
    {
        return SeekStatus.FOUND;
    }

    int startIdx = OuterInstance.IndexedTermsArray.ToList().BinarySearch(target);

    if (startIdx >= 0)
    {
        // we hit the term exactly... lucky us!
        TermsEnum.SeekStatus seekStatus = TermsEnum.SeekCeil(target);
        Debug.Assert(seekStatus == TermsEnum.SeekStatus.FOUND);
        Ord_Renamed = startIdx << OuterInstance.IndexIntervalBits;
        SetTerm();
        Debug.Assert(Term_Renamed != null);
        return SeekStatus.FOUND;
    }

    // we didn't hit the term exactly
    startIdx = -startIdx - 1;

    if (startIdx == 0)
    {
        // our target occurs *before* the first term
        TermsEnum.SeekStatus seekStatus = TermsEnum.SeekCeil(target);
        Debug.Assert(seekStatus == TermsEnum.SeekStatus.NOT_FOUND);
        Ord_Renamed = 0;
        SetTerm();
        Debug.Assert(Term_Renamed != null);
        return SeekStatus.NOT_FOUND;
    }

    // back up to the start of the block
    startIdx--;

    if ((Ord_Renamed >> OuterInstance.IndexIntervalBits) == startIdx && Term_Renamed != null && Term_Renamed.CompareTo(target) <= 0)
    {
        // we are already in the right block and the current term is before the term we want,
        // so we don't need to seek.
    }
    else
    {
        // seek to the right block
        TermsEnum.SeekStatus seekStatus = TermsEnum.SeekCeil(OuterInstance.IndexedTermsArray[startIdx]);
        Debug.Assert(seekStatus == TermsEnum.SeekStatus.FOUND);
        Ord_Renamed = startIdx << OuterInstance.IndexIntervalBits;
        SetTerm();
        Debug.Assert(Term_Renamed != null); // should be non-null since it's in the index
    }

    while (Term_Renamed != null && Term_Renamed.CompareTo(target) < 0)
    {
        Next();
    }

    if (Term_Renamed == null)
    {
        return SeekStatus.END;
    }
    else if (Term_Renamed.CompareTo(target) == 0)
    {
        return SeekStatus.FOUND;
    }
    else
    {
        return SeekStatus.NOT_FOUND;
    }
}
public override void SeekExact(BytesRef term, TermState state)
{
    termOrd = (int)((OrdTermState)state).Ord;
    SetTerm();
    Debug.Assert(term.Equals(scratch));
}
/// <summary>
/// Returns <c>true</c> if <paramref name="term"/> is within <paramref name="k"/> edits of the query term </summary>
internal bool Matches(BytesRef term, int k)
{
    return k == 0 ? term.Equals(termRef) : matchers[k].Run(term.Bytes, term.Offset, term.Length);
}
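For k > 0 the check above runs a precompiled Levenshtein automaton over the term's bytes, which is how Lucene avoids per-term dynamic programming. As a point of comparison only, a minimal sketch of the same "within k edits" predicate via the classic DP edit distance (not Lucene's implementation; WithinEdits is a hypothetical helper):

static class EditDistance
{
    // Returns true if the Levenshtein distance between a and b is <= k.
    // Plain two-row DP for illustration; Lucene precompiles an automaton instead.
    public static bool WithinEdits(string a, string b, int k)
    {
        if (System.Math.Abs(a.Length - b.Length) > k) return false;
        int[] prev = new int[b.Length + 1];
        int[] curr = new int[b.Length + 1];
        for (int j = 0; j <= b.Length; j++) prev[j] = j;
        for (int i = 1; i <= a.Length; i++)
        {
            curr[0] = i;
            int rowMin = curr[0];
            for (int j = 1; j <= b.Length; j++)
            {
                int cost = a[i - 1] == b[j - 1] ? 0 : 1;
                curr[j] = System.Math.Min(System.Math.Min(curr[j - 1] + 1, prev[j] + 1), prev[j - 1] + cost);
                rowMin = System.Math.Min(rowMin, curr[j]);
            }
            if (rowMin > k) return false; // early exit: the distance can only grow from here
            var tmp = prev; prev = curr; curr = tmp;
        }
        return prev[b.Length] <= k;
    }
}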
private void LoadTerms()
{
    var posIntOutputs = PositiveInt32Outputs.Singleton;
    var outputsInner = new PairOutputs<long?, long?>(posIntOutputs, posIntOutputs);
    var outputs = new PairOutputs<long?, PairOutputs<long?, long?>.Pair>(posIntOutputs, outputsInner);

    // honestly, wtf kind of generic mess is this.
    var b = new Builder<PairOutputs<long?, PairOutputs<long?, long?>.Pair>.Pair>(FST.INPUT_TYPE.BYTE1, outputs);

    var input = (IndexInput)_outerInstance._input.Clone();
    input.Seek(_termsStart);

    var lastTerm = new BytesRef(10);
    long lastDocsStart = -1;
    int docFreq = 0;
    long totalTermFreq = 0;
    var visitedDocs = new FixedBitSet(_maxDoc);

    var scratchIntsRef = new Int32sRef();
    while (true)
    {
        SimpleTextUtil.ReadLine(input, _scratch);
        if (_scratch.Equals(SimpleTextFieldsWriter.END) || StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.FIELD))
        {
            if (lastDocsStart != -1)
            {
                b.Add(Util.ToInt32sRef(lastTerm, scratchIntsRef),
                    outputs.NewPair(lastDocsStart, outputsInner.NewPair(docFreq, totalTermFreq)));
                _sumTotalTermFreq += totalTermFreq;
            }
            break;
        }

        if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.DOC))
        {
            docFreq++;
            _sumDocFreq++;
            UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.DOC.Length,
                _scratch.Length - SimpleTextFieldsWriter.DOC.Length, _scratchUtf16);
            int docId = ArrayUtil.ParseInt32(_scratchUtf16.Chars, 0, _scratchUtf16.Length);
            visitedDocs.Set(docId);
        }
        else if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.FREQ))
        {
            UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.FREQ.Length,
                _scratch.Length - SimpleTextFieldsWriter.FREQ.Length, _scratchUtf16);
            totalTermFreq += ArrayUtil.ParseInt32(_scratchUtf16.Chars, 0, _scratchUtf16.Length);
        }
        else if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.TERM))
        {
            if (lastDocsStart != -1)
            {
                b.Add(Util.ToInt32sRef(lastTerm, scratchIntsRef),
                    outputs.NewPair(lastDocsStart, outputsInner.NewPair(docFreq, totalTermFreq)));
            }
            lastDocsStart = input.GetFilePointer();
            int len = _scratch.Length - SimpleTextFieldsWriter.TERM.Length;
            if (len > lastTerm.Length)
            {
                lastTerm.Grow(len);
            }
            Array.Copy(_scratch.Bytes, SimpleTextFieldsWriter.TERM.Length, lastTerm.Bytes, 0, len);
            lastTerm.Length = len;
            docFreq = 0;
            _sumTotalTermFreq += totalTermFreq;
            totalTermFreq = 0;
            _termCount++;
        }
    }
    _docCount = visitedDocs.Cardinality();
    _fst = b.Finish();
}