public virtual void TestAppend()
{
    sbyte[] bytes = new sbyte[] { (sbyte)'a', (sbyte)'b', (sbyte)'c', (sbyte)'d' };
    BytesRef b = new BytesRef(bytes, 1, 3); // bcd
    b.Append(new BytesRef("e"));
    Assert.AreEqual("bcde", b.Utf8ToString());
}
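// A minimal sketch of the semantics the test above exercises (illustrative, not
// part of the test suite): a BytesRef constructed with an offset and length is a
// view over a slice of the backing array, and Append copies the argument's bytes
// onto the end of that view, growing the backing storage when necessary.
BytesRef view = new BytesRef(new sbyte[] { (sbyte)'x', (sbyte)'y', (sbyte)'z' }, 1, 2); // "yz"
view.Append(new BytesRef("!"));
// view.Utf8ToString() now returns "yz!"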
private void WritePosition(int delta, BytesRef payload)
{
    if (payloads)
    {
        int payloadLength = payload == null ? 0 : payload.Length;
        if (payloadLength != lastPayloadLength)
        {
            lastPayloadLength = payloadLength;
            tvf.WriteVInt32((delta << 1) | 1);
            tvf.WriteVInt32(payloadLength);
        }
        else
        {
            tvf.WriteVInt32(delta << 1);
        }
        if (payloadLength > 0)
        {
            if (payloadLength + payloadData.Length < 0)
            {
                // we overflowed the payload buffer, just throw UOE
                // having > System.Int32.MaxValue bytes of payload for a single term in a single doc is nuts.
                throw UnsupportedOperationException.Create("A term cannot have more than System.Int32.MaxValue bytes of payload data in a single document");
            }
            payloadData.Append(payload);
        }
    }
    else
    {
        tvf.WriteVInt32(delta);
    }
}
private void WritePosition(int delta, BytesRef payload)
{
    if (Payloads)
    {
        int payloadLength = payload == null ? 0 : payload.Length;
        if (payloadLength != LastPayloadLength)
        {
            LastPayloadLength = payloadLength;
            Tvf.WriteVInt((delta << 1) | 1);
            Tvf.WriteVInt(payloadLength);
        }
        else
        {
            Tvf.WriteVInt(delta << 1);
        }
        if (payloadLength > 0)
        {
            if (payloadLength + PayloadData.Length < 0)
            {
                // we overflowed the payload buffer, just throw UOE
                // having > Integer.MAX_VALUE bytes of payload for a single term in a single doc is nuts.
                throw new System.NotSupportedException("A term cannot have more than Integer.MAX_VALUE bytes of payload data in a single document");
            }
            PayloadData.Append(payload);
        }
    }
    else
    {
        Tvf.WriteVInt(delta);
    }
}
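// Hedged sketch of the bit trick both WritePosition versions above rely on: the
// position delta is shifted left one bit, and the low bit flags whether a new
// payload length follows in the stream. The helper name is illustrative, not
// from the source.
static int EncodePosition(int delta, bool payloadLengthChanged)
{
    return (delta << 1) | (payloadLengthChanged ? 1 : 0);
}
// A reader reverses this as: delta = code >> 1; payloadLengthChanged = (code & 1) != 0;
// (the shift back is safe because position deltas are non-negative).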
public override void SetNextReader(AtomicReaderContext context)
{
    if (m_segmentFacetCounts != null)
    {
        m_segmentResults.Add(CreateSegmentResult());
    }

    groupFieldTermsIndex = FieldCache.DEFAULT.GetTermsIndex(context.AtomicReader, m_groupField);
    facetFieldTermsIndex = FieldCache.DEFAULT.GetTermsIndex(context.AtomicReader, m_facetField);

    // 1+ to allow for the -1 "not set":
    m_segmentFacetCounts = new int[facetFieldTermsIndex.ValueCount + 1];
    m_segmentTotalCount = 0;

    segmentGroupedFacetHits.Clear();
    foreach (GroupedFacetHit groupedFacetHit in groupedFacetHits)
    {
        int facetOrd = groupedFacetHit.facetValue == null ? -1 : facetFieldTermsIndex.LookupTerm(groupedFacetHit.facetValue);
        if (groupedFacetHit.facetValue != null && facetOrd < 0)
        {
            continue;
        }

        int groupOrd = groupedFacetHit.groupValue == null ? -1 : groupFieldTermsIndex.LookupTerm(groupedFacetHit.groupValue);
        if (groupedFacetHit.groupValue != null && groupOrd < 0)
        {
            continue;
        }

        int segmentGroupedFacetsIndex = groupOrd * (facetFieldTermsIndex.ValueCount + 1) + facetOrd;
        segmentGroupedFacetHits.Put(segmentGroupedFacetsIndex);
    }

    if (m_facetPrefix != null)
    {
        m_startFacetOrd = facetFieldTermsIndex.LookupTerm(m_facetPrefix);
        if (m_startFacetOrd < 0)
        {
            // Points to the ord one higher than facetPrefix
            m_startFacetOrd = -m_startFacetOrd - 1;
        }
        BytesRef facetEndPrefix = BytesRef.DeepCopyOf(m_facetPrefix);
        facetEndPrefix.Append(UnicodeUtil.BIG_TERM);
        m_endFacetOrd = facetFieldTermsIndex.LookupTerm(facetEndPrefix);
        if (Debugging.AssertsEnabled) Debugging.Assert(m_endFacetOrd < 0);
        m_endFacetOrd = -m_endFacetOrd - 1; // Points to the ord one higher than facetEndPrefix
    }
    else
    {
        m_startFacetOrd = -1;
        m_endFacetOrd = facetFieldTermsIndex.ValueCount;
    }
}
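// Sketch of the index arithmetic above (illustrative helper, not in the source):
// each (groupOrd, facetOrd) pair is flattened into one int, reserving valueCount + 1
// slots per group so that facetOrd == -1 ("facet field not set") gets its own slot.
static int FlattenOrds(int groupOrd, int facetOrd, int facetValueCount)
{
    return groupOrd * (facetValueCount + 1) + facetOrd;
}
// e.g. with facetValueCount = 3, groupOrd = 2 maps facetOrd -1..2 to indexes 7..10.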
protected override bool AcceptResult(Int32sRef input, long? output)
{
    Util.Fst.Util.ToBytesRef(input, scratchBytes);
    finalLastToken.Grow(finalLastToken.Length + scratchBytes.Length);
    int lenSav = finalLastToken.Length;
    finalLastToken.Append(scratchBytes);
    //System.out.println("  accept? input='" + scratchBytes.utf8ToString() + "'; lastToken='" + finalLastToken.utf8ToString() + "'; return " + (seen.contains(finalLastToken) == false));
    bool ret = seen.Contains(finalLastToken) == false;
    finalLastToken.Length = lenSav;
    return ret;
}
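// AcceptResult above uses a save/append/restore idiom worth calling out: append to
// a reusable BytesRef, test set membership, then truncate Length back to "pop" the
// appended bytes without any reallocation. A hedged sketch; seenSet is an assumed
// ISet<BytesRef>, not from the source:
BytesRef buffer = new BytesRef("prefix");
int saved = buffer.Length;
buffer.Append(new BytesRef("-candidate"));
bool isNew = !seenSet.Contains(buffer); // membership test on the combined bytes
buffer.Length = saved;                  // buffer is "prefix" again, no copy made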
public override void Add(int doc, object value)
{
    // TODO: if the Sorter interface changes to take long indexes, we can remove that limitation
    if (Size == int.MaxValue)
    {
        throw new InvalidOperationException("cannot support more than Integer.MAX_VALUE doc/value entries");
    }

    BytesRef val = (BytesRef)value;
    if (val == null)
    {
        val = BinaryDocValuesUpdate.MISSING;
    }

    // grow the structures to have room for more elements
    if (Docs.Size() == Size)
    {
        Docs = Docs.Grow(Size + 1);
        Offsets = Offsets.Grow(Size + 1);
        Lengths = Lengths.Grow(Size + 1);
        DocsWithField = FixedBitSet.EnsureCapacity(DocsWithField, (int)Docs.Size());
    }

    if (val != BinaryDocValuesUpdate.MISSING)
    {
        // only mark the document as having a value in that field if the value wasn't set to null (MISSING)
        DocsWithField.Set(Size);
    }

    Docs.Set(Size, doc);
    Offsets.Set(Size, Values.Length);
    Lengths.Set(Size, val.Length);
    Values.Append(val);
    ++Size;
}
public override void Add(int doc, object value)
{
    // TODO: if the Sorter interface changes to take long indexes, we can remove that limitation
    if (size == int.MaxValue)
    {
        throw new InvalidOperationException("cannot support more than System.Int32.MaxValue doc/value entries");
    }

    BytesRef val = (BytesRef)value;
    if (val == null)
    {
        val = BinaryDocValuesUpdate.MISSING;
    }

    // grow the structures to have room for more elements
    if (docs.Count == size)
    {
        docs = docs.Grow(size + 1);
        offsets = offsets.Grow(size + 1);
        lengths = lengths.Grow(size + 1);
        docsWithField = FixedBitSet.EnsureCapacity(docsWithField, (int)docs.Count);
    }

    if (val != BinaryDocValuesUpdate.MISSING)
    {
        // only mark the document as having a value in that field if the value wasn't set to null (MISSING)
        docsWithField.Set(size);
    }

    docs.Set(size, doc);
    offsets.Set(size, values.Length);
    lengths.Set(size, val.Length);
    values.Append(val);
    ++size;
}
private void Add(int doc, BytesRef value) // LUCENENET specific: Marked private instead of public and changed the value parameter type
{
    // TODO: if the Sorter interface changes to take long indexes, we can remove that limitation
    if (size == int.MaxValue)
    {
        throw IllegalStateException.Create("cannot support more than System.Int32.MaxValue doc/value entries");
    }

    BytesRef val = value;
    if (val is null)
    {
        val = BinaryDocValuesUpdate.MISSING;
    }

    // grow the structures to have room for more elements
    if (docs.Count == size)
    {
        docs = docs.Grow(size + 1);
        offsets = offsets.Grow(size + 1);
        lengths = lengths.Grow(size + 1);
        docsWithField = FixedBitSet.EnsureCapacity(docsWithField, (int)docs.Count);
    }

    if (val != BinaryDocValuesUpdate.MISSING)
    {
        // only mark the document as having a value in that field if the value wasn't set to null (MISSING)
        docsWithField.Set(size);
    }

    docs.Set(size, doc);
    offsets.Set(size, values.Length);
    lengths.Set(size, val.Length);
    values.Append(val);
    ++size;
}
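// All three Add versions above share the same append layout: four parallel
// structures grow in lock-step, the value's bytes are appended to one shared
// buffer, and the offset/length arrays record where each entry's slice lives.
// A hedged sketch of how an entry would be read back (hypothetical member of the
// same class, not in the source):
BytesRef GetValue(int i)
{
    // slice into the shared values buffer; casts are safe while size <= int.MaxValue
    return new BytesRef(values.Bytes, (int)offsets.Get(i), (int)lengths.Get(i));
    // docsWithField.Get(i) tells whether the update was a real value or MISSING/null
}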
/// <summary>
/// Appends a single suggestion and its weight to the internal buffers.
/// </summary>
/// <param name="utf8">
/// The suggestion (utf8 representation) to be added. The content is
/// copied and the object can be reused. </param>
/// <param name="bucket">
/// The bucket to place this suggestion in. Must be non-negative and
/// smaller than the number of buckets passed in the constructor.
/// Higher numbers indicate suggestions that should be presented
/// before suggestions placed in smaller buckets. </param>
public virtual void Add(BytesRef utf8, int bucket)
{
    if (bucket < 0 || bucket >= buckets)
    {
        throw new ArgumentException("Bucket outside of the allowed range [0, " + buckets + "): " + bucket);
    }

    if (scratch.Bytes.Length < utf8.Length + 1)
    {
        scratch.Grow(utf8.Length + 10);
    }

    scratch.Length = 1;
    scratch.Bytes[0] = (byte)bucket;
    scratch.Append(utf8);
    sorter.Add(scratch);
}
/// <summary>
/// Appends a single suggestion and its weight to the internal buffers.
/// </summary>
/// <param name="utf8">
/// The suggestion (utf8 representation) to be added. The content is
/// copied and the object can be reused. </param>
/// <param name="bucket">
/// The bucket to place this suggestion in. Must be non-negative and
/// smaller than the number of buckets passed in the constructor.
/// Higher numbers indicate suggestions that should be presented
/// before suggestions placed in smaller buckets. </param>
public virtual void Add(BytesRef utf8, int bucket)
{
    // LUCENENET: Added guard clause for null
    if (utf8 is null)
    {
        throw new ArgumentNullException(nameof(utf8));
    }

    if (bucket < 0 || bucket >= buckets)
    {
        // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
        throw new ArgumentOutOfRangeException(nameof(buckets), "Bucket outside of the allowed range [0, " + buckets + "): " + bucket);
    }

    if (scratch.Bytes.Length < utf8.Length + 1)
    {
        scratch.Grow(utf8.Length + 10);
    }

    scratch.Length = 1;
    scratch.Bytes[0] = (byte)bucket;
    scratch.Append(utf8);
    sorter.Add(scratch);
}
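// Hedged sketch of the key layout both Add versions above build: one leading
// bucket byte followed by the suggestion's UTF-8 bytes, so a plain byte-wise sort
// orders entries by bucket first and then lexicographically by suggestion text.
// The helper name is illustrative, not from the source.
static byte[] EncodeBucketedKey(byte bucket, byte[] utf8)
{
    var key = new byte[utf8.Length + 1];
    key[0] = bucket; // the sort key's most significant byte
    System.Array.Copy(utf8, 0, key, 1, utf8.Length);
    return key;
}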
public override IList<LookupResult> DoLookup(string key, IEnumerable<BytesRef> contexts, bool onlyMorePopular, int num)
{
    if (contexts != null)
    {
        throw new ArgumentException("this suggester doesn't support contexts");
    }
    if (Debugging.AssertsEnabled) Debugging.Assert(num > 0);

    if (onlyMorePopular)
    {
        throw new ArgumentException("this suggester only works with onlyMorePopular=false");
    }
    if (fst == null)
    {
        return Collections.EmptyList<LookupResult>();
    }

    BytesRef scratch = new BytesRef(key);
    int prefixLength = scratch.Length;
    FST.Arc<long?> arc = new FST.Arc<long?>();

    // match the prefix portion exactly
    long? prefixOutput = null;
    try
    {
        prefixOutput = LookupPrefix(scratch, arc);
    }
    catch (IOException bogus)
    {
        throw new Exception(bogus.ToString(), bogus);
    }

    if (prefixOutput == null)
    {
        return Collections.EmptyList<LookupResult>();
    }

    List<LookupResult> results = new List<LookupResult>(num);
    CharsRef spare = new CharsRef();
    if (exactFirst && arc.IsFinal)
    {
        spare.Grow(scratch.Length);
        UnicodeUtil.UTF8toUTF16(scratch, spare);
        results.Add(new LookupResult(spare.ToString(), DecodeWeight(prefixOutput.GetValueOrDefault() + arc.NextFinalOutput.GetValueOrDefault())));
        if (--num == 0)
        {
            return results; // that was quick
        }
    }

    // complete top-N
    Util.Fst.Util.TopResults<long?> completions = null;
    try
    {
        completions = Lucene.Net.Util.Fst.Util.ShortestPaths(fst, arc, prefixOutput, weightComparer, num, !exactFirst);
        if (Debugging.AssertsEnabled) Debugging.Assert(completions.IsComplete);
    }
    catch (IOException bogus)
    {
        throw new Exception(bogus.ToString(), bogus);
    }

    BytesRef suffix = new BytesRef(8);
    foreach (Util.Fst.Util.Result<long?> completion in completions)
    {
        scratch.Length = prefixLength;
        // append suffix
        Lucene.Net.Util.Fst.Util.ToBytesRef(completion.Input, suffix);
        scratch.Append(suffix);
        spare.Grow(scratch.Length);
        UnicodeUtil.UTF8toUTF16(scratch, spare);
        results.Add(new LookupResult(spare.ToString(), DecodeWeight(completion.Output.GetValueOrDefault())));
    }
    return results;
}
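// Hedged usage sketch for the lookup above (the suggester instance, its built FST,
// and the sample prefix are all assumed, not from the source):
// IList<LookupResult> hits = suggester.DoLookup("ban", null, onlyMorePopular: false, num: 3);
// foreach (LookupResult hit in hits)
// {
//     Console.WriteLine(hit.Key + " => " + hit.Value); // completion text and decoded weight
// }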
/// <summary>
/// Retrieve suggestions.
/// </summary>
public virtual IList<LookupResult> DoLookup(string key, IEnumerable<BytesRef> contexts, int num)
{
    if (contexts != null)
    {
        throw new System.ArgumentException("this suggester doesn't support contexts");
    }

    TokenStream ts = queryAnalyzer.GetTokenStream("", key.ToString());
    try
    {
        ITermToBytesRefAttribute termBytesAtt = ts.AddAttribute<ITermToBytesRefAttribute>();
        IOffsetAttribute offsetAtt = ts.AddAttribute<IOffsetAttribute>();
        IPositionLengthAttribute posLenAtt = ts.AddAttribute<IPositionLengthAttribute>();
        IPositionIncrementAttribute posIncAtt = ts.AddAttribute<IPositionIncrementAttribute>();
        ts.Reset();

        var lastTokens = new BytesRef[grams];
        //System.out.println("lookup: key='" + key + "'");

        // Run full analysis, but save only the
        // last 1gram, last 2gram, etc.:
        BytesRef tokenBytes = termBytesAtt.BytesRef;
        int maxEndOffset = -1;
        bool sawRealToken = false;
        while (ts.IncrementToken())
        {
            termBytesAtt.FillBytesRef();
            sawRealToken |= tokenBytes.Length > 0;
            // TODO: this is somewhat iffy; today, ShingleFilter
            // sets posLen to the gram count; maybe we should make
            // a separate dedicated att for this?
            int gramCount = posLenAtt.PositionLength;
            Debug.Assert(gramCount <= grams);
            // Safety: make sure the recalculated count "agrees":
            if (CountGrams(tokenBytes) != gramCount)
            {
                throw new System.ArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + CountGrams(tokenBytes));
            }
            maxEndOffset = Math.Max(maxEndOffset, offsetAtt.EndOffset);
            lastTokens[gramCount - 1] = BytesRef.DeepCopyOf(tokenBytes);
        }
        ts.End();

        if (!sawRealToken)
        {
            throw new System.ArgumentException("no tokens produced by analyzer, or the only tokens were empty strings");
        }

        // Carefully fill last tokens with _ tokens;
        // ShingleFilter apparently won't emit "only hole"
        // tokens:
        int endPosInc = posIncAtt.PositionIncrement;

        // Note this will also be true if input is the empty
        // string (in which case we saw no tokens and
        // maxEndOffset is still -1), which in fact works out OK
        // because we fill the unigram with an empty BytesRef
        // below:
        bool lastTokenEnded = offsetAtt.EndOffset > maxEndOffset || endPosInc > 0;
        //System.out.println("maxEndOffset=" + maxEndOffset + " vs " + offsetAtt.EndOffset);

        if (lastTokenEnded)
        {
            //System.out.println("  lastTokenEnded");
            // If user hit space after the last token, then
            // "upgrade" all tokens. This way "foo " will suggest
            // all bigrams starting w/ foo, and not any unigrams
            // starting with "foo":
            for (int i = grams - 1; i > 0; i--)
            {
                BytesRef token = lastTokens[i - 1];
                if (token == null)
                {
                    continue;
                }
                token.Grow(token.Length + 1);
                token.Bytes[token.Length] = separator;
                token.Length++;
                lastTokens[i] = token;
            }
            lastTokens[0] = new BytesRef();
        }

        var arc = new FST.Arc<long?>();
        var bytesReader = fst.GetBytesReader();

        // Try highest order models first, and if they return
        // results, return that; else, fallback:
        double backoff = 1.0;

        List<LookupResult> results = new List<LookupResult>(num);

        // We only add a given suffix once, from the highest
        // order model that saw it; for subsequent lower order
        // models we skip it:
        var seen = new HashSet<BytesRef>();

        for (int gram = grams - 1; gram >= 0; gram--)
        {
            BytesRef token = lastTokens[gram];
            // Don't make unigram predictions from empty string:
            if (token == null || (token.Length == 0 && key.Length > 0))
            {
                // Input didn't have enough tokens:
                //System.out.println("  gram=" + gram + ": skip: not enough input");
                continue;
            }

            if (endPosInc > 0 && gram <= endPosInc)
            {
                // Skip hole-only predictions; in theory we
                // shouldn't have to do this, but we'd need to fix
                // ShingleFilter to produce only-hole tokens:
                //System.out.println("  break: only holes now");
                break;
            }

            //System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString());

            // TODO: we could add fuzziness here
            // match the prefix portion exactly
            //Pair<Long,BytesRef> prefixOutput = null;
            long? prefixOutput = null;
            try
            {
                prefixOutput = LookupPrefix(fst, bytesReader, token, arc);
            }
            catch (IOException bogus)
            {
                throw new Exception(bogus.ToString(), bogus);
            }
            //System.out.println("  prefixOutput=" + prefixOutput);

            if (prefixOutput == null)
            {
                // This model never saw this prefix, e.g. the
                // trigram model never saw context "purple mushroom"
                backoff *= ALPHA;
                continue;
            }

            // TODO: we could do this division at build time, and
            // bake it into the FST?

            // Denominator for computing scores from current
            // model's predictions:
            long contextCount = totTokens;

            BytesRef lastTokenFragment = null;

            for (int i = token.Length - 1; i >= 0; i--)
            {
                if (token.Bytes[token.Offset + i] == separator)
                {
                    BytesRef context = new BytesRef(token.Bytes, token.Offset, i);
                    long? output = Lucene.Net.Util.Fst.Util.Get(fst, Lucene.Net.Util.Fst.Util.ToInt32sRef(context, new Int32sRef()));
                    Debug.Assert(output != null);
                    contextCount = DecodeWeight(output);
                    lastTokenFragment = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                    break;
                }
            }

            BytesRef finalLastToken;
            if (lastTokenFragment == null)
            {
                finalLastToken = BytesRef.DeepCopyOf(token);
            }
            else
            {
                finalLastToken = BytesRef.DeepCopyOf(lastTokenFragment);
            }
            Debug.Assert(finalLastToken.Offset == 0);

            CharsRef spare = new CharsRef();

            // complete top-N
            Util.Fst.Util.TopResults<long?> completions = null;
            try
            {
                // Because we store multiple models in one FST
                // (1gram, 2gram, 3gram), we must restrict the
                // search so that it only considers the current
                // model. For highest order model, this is not
                // necessary since all completions in the FST
                // must be from this model, but for lower order
                // models we have to filter out the higher order
                // ones:

                // Must do num+seen.size() for queue depth because we may
                // reject up to seen.size() paths in acceptResult():
                Util.Fst.Util.TopNSearcher<long?> searcher = new TopNSearcherAnonymousInnerClassHelper(this, fst, num, num + seen.Count, weightComparer, seen, finalLastToken);

                // since this search is initialized with a single start node
                // it is okay to start with an empty input path here
                searcher.AddStartPaths(arc, prefixOutput, true, new Int32sRef());

                completions = searcher.Search();
                Debug.Assert(completions.IsComplete);
            }
            catch (IOException bogus)
            {
                throw new Exception(bogus.ToString(), bogus);
            }

            int prefixLength = token.Length;

            BytesRef suffix = new BytesRef(8);
            //System.out.println("    " + completions.length + " completions");

            foreach (Util.Fst.Util.Result<long?> completion in completions)
            {
                token.Length = prefixLength;
                // append suffix
                Util.Fst.Util.ToBytesRef(completion.Input, suffix);
                token.Append(suffix);

                //System.out.println("    completion " + token.utf8ToString());

                // Skip this path if a higher-order model already
                // saw/predicted its last token:
                BytesRef lastToken = token;
                for (int i = token.Length - 1; i >= 0; i--)
                {
                    if (token.Bytes[token.Offset + i] == separator)
                    {
                        Debug.Assert(token.Length - i - 1 > 0);
                        lastToken = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                        break;
                    }
                }
                if (seen.Contains(lastToken))
                {
                    //System.out.println("      skip dup " + lastToken.utf8ToString());
                    goto nextCompletionContinue;
                }
                seen.Add(BytesRef.DeepCopyOf(lastToken));
                spare.Grow(token.Length);
                UnicodeUtil.UTF8toUTF16(token, spare);
                LookupResult result = new LookupResult(spare.ToString(),
                    // LUCENENET NOTE: We need to calculate this as decimal because when using double it can sometimes
                    // return numbers that are greater than long.MaxValue, which results in a negative long number.
                    (long)(long.MaxValue * (decimal)backoff * ((decimal)DecodeWeight(completion.Output)) / contextCount));
                results.Add(result);
                Debug.Assert(results.Count == seen.Count);
                //System.out.println("  add result=" + result);
                nextCompletionContinue: ;
            }
            backoff *= ALPHA;
        }

        results.Sort(new ComparerAnonymousInnerClassHelper(this));

        if (results.Count > num)
        {
            results.SubList(num, results.Count).Clear();
        }

        return results;
    }
    finally
    {
        IOUtils.DisposeWhileHandlingException(ts);
    }
}
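// Hedged sketch of the "stupid backoff" scoring idea the method above implements:
// each time a higher-order model has no prediction for the context, the score is
// discounted by ALPHA before falling back to the next-lower-order model, roughly
// score = ALPHA^(ordersSkipped) * count(completion) / count(context).
// Illustrative helper, not from the source:
static double BackoffScore(double alpha, int ordersSkipped, long completionCount, long contextCount)
{
    return Math.Pow(alpha, ordersSkipped) * completionCount / (double)contextCount;
}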
public override void SetNextReader(AtomicReaderContext context)
{
    if (m_segmentFacetCounts != null)
    {
        m_segmentResults.Add(CreateSegmentResult());
    }

    groupFieldTermsIndex = FieldCache.DEFAULT.GetTermsIndex(context.AtomicReader, m_groupField);
    facetFieldDocTermOrds = FieldCache.DEFAULT.GetDocTermOrds(context.AtomicReader, m_facetField);
    facetFieldNumTerms = (int)facetFieldDocTermOrds.ValueCount;
    if (facetFieldNumTerms == 0)
    {
        facetOrdTermsEnum = null;
    }
    else
    {
        facetOrdTermsEnum = facetFieldDocTermOrds.GetTermsEnum();
    }
    // [facetFieldNumTerms() + 1] for all possible facet values and docs not containing facet field
    m_segmentFacetCounts = new int[facetFieldNumTerms + 1];
    m_segmentTotalCount = 0;

    segmentGroupedFacetHits.Clear();
    foreach (GroupedFacetHit groupedFacetHit in groupedFacetHits)
    {
        int groupOrd = groupedFacetHit.groupValue == null ? -1 : groupFieldTermsIndex.LookupTerm(groupedFacetHit.groupValue);
        if (groupedFacetHit.groupValue != null && groupOrd < 0)
        {
            continue;
        }

        int facetOrd;
        if (groupedFacetHit.facetValue != null)
        {
            if (facetOrdTermsEnum == null || !facetOrdTermsEnum.SeekExact(groupedFacetHit.facetValue))
            {
                continue;
            }
            facetOrd = (int)facetOrdTermsEnum.Ord;
        }
        else
        {
            facetOrd = facetFieldNumTerms;
        }

        // (facetFieldDocTermOrds.numTerms() + 1) for all possible facet values and docs not containing facet field
        int segmentGroupedFacetsIndex = groupOrd * (facetFieldNumTerms + 1) + facetOrd;
        segmentGroupedFacetHits.Put(segmentGroupedFacetsIndex);
    }

    if (m_facetPrefix != null)
    {
        TermsEnum.SeekStatus seekStatus;
        if (facetOrdTermsEnum != null)
        {
            seekStatus = facetOrdTermsEnum.SeekCeil(m_facetPrefix);
        }
        else
        {
            seekStatus = TermsEnum.SeekStatus.END;
        }

        if (seekStatus != TermsEnum.SeekStatus.END)
        {
            m_startFacetOrd = (int)facetOrdTermsEnum.Ord;
        }
        else
        {
            m_startFacetOrd = 0;
            m_endFacetOrd = 0;
            return;
        }

        BytesRef facetEndPrefix = BytesRef.DeepCopyOf(m_facetPrefix);
        facetEndPrefix.Append(UnicodeUtil.BIG_TERM);
        seekStatus = facetOrdTermsEnum.SeekCeil(facetEndPrefix);
        if (seekStatus != TermsEnum.SeekStatus.END)
        {
            m_endFacetOrd = (int)facetOrdTermsEnum.Ord;
        }
        else
        {
            m_endFacetOrd = facetFieldNumTerms; // Don't include null...
        }
    }
    else
    {
        m_startFacetOrd = 0;
        m_endFacetOrd = facetFieldNumTerms + 1;
    }
}
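// Both SetNextReader variants above bound the facet ordinal range for a prefix by
// looking up the prefix itself and then prefix + UnicodeUtil.BIG_TERM, a byte
// sequence that sorts after any valid UTF-8 continuation of the prefix. A hedged
// sketch; prefix is an assumed BytesRef, not from the source:
BytesRef end = BytesRef.DeepCopyOf(prefix);
end.Append(UnicodeUtil.BIG_TERM);
// every term t that starts with prefix now satisfies: prefix <= t < end (byte-wise order)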
public override IList<LookupResult> Lookup(string key, HashSet<BytesRef> contexts, bool onlyMorePopular, int num)
{
    if (contexts != null)
    {
        throw new System.ArgumentException("this suggester doesn't support contexts");
    }
    Debug.Assert(num > 0);

    if (onlyMorePopular)
    {
        throw new System.ArgumentException("this suggester only works with onlyMorePopular=false");
    }
    if (fst == null)
    {
        return Collections.EmptyList<LookupResult>();
    }

    BytesRef scratch = new BytesRef(key);
    int prefixLength = scratch.Length;
    Arc<long?> arc = new Arc<long?>();

    // match the prefix portion exactly
    long? prefixOutput = null;
    try
    {
        prefixOutput = LookupPrefix(scratch, arc);
    }
    catch (IOException bogus)
    {
        throw new Exception(bogus.ToString(), bogus);
    }

    if (prefixOutput == null)
    {
        return Collections.EmptyList<LookupResult>();
    }

    IList<LookupResult> results = new List<LookupResult>(num);
    CharsRef spare = new CharsRef();
    if (exactFirst && arc.IsFinal)
    {
        spare.Grow(scratch.Length);
        UnicodeUtil.UTF8toUTF16(scratch, spare);
        results.Add(new LookupResult(spare.ToString(), DecodeWeight(prefixOutput.GetValueOrDefault() + arc.NextFinalOutput.GetValueOrDefault())));
        if (--num == 0)
        {
            return results; // that was quick
        }
    }

    // complete top-N
    TopResults<long?> completions = null;
    try
    {
        completions = Util.ShortestPaths(fst, arc, prefixOutput, weightComparator, num, !exactFirst);
        Debug.Assert(completions.IsComplete);
    }
    catch (IOException bogus)
    {
        throw new Exception(bogus.ToString(), bogus);
    }

    BytesRef suffix = new BytesRef(8);
    foreach (Result<long?> completion in completions)
    {
        scratch.Length = prefixLength;
        // append suffix
        Util.ToBytesRef(completion.Input, suffix);
        scratch.Append(suffix);
        spare.Grow(scratch.Length);
        UnicodeUtil.UTF8toUTF16(scratch, spare);
        results.Add(new LookupResult(spare.ToString(), DecodeWeight(completion.Output.GetValueOrDefault())));
    }
    return results;
}