protected internal virtual BytesRef AnalyzeMultitermTerm(string field, string part, Analyzer analyzerIn)
{
    if (analyzerIn == null)
    {
        analyzerIn = Analyzer;
    }

    TokenStream source = null;
    try
    {
        source = analyzerIn.TokenStream(field, part);
        source.Reset();

        ITermToBytesRefAttribute termAtt = source.GetAttribute<ITermToBytesRefAttribute>();
        BytesRef bytes = termAtt.BytesRef;

        if (!source.IncrementToken())
        {
            throw new ArgumentException("analyzer returned no terms for multiTerm term: " + part);
        }
        termAtt.FillBytesRef();
        if (source.IncrementToken())
        {
            throw new ArgumentException("analyzer returned too many terms for multiTerm term: " + part);
        }
        source.End();
        return BytesRef.DeepCopyOf(bytes);
    }
    catch (IOException e)
    {
        throw new Exception("Error analyzing multiTerm term: " + part, e);
    }
    finally
    {
        IOUtils.CloseWhileHandlingException(source);
    }
}
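Every example on this page follows the same consumption contract: obtain ITermToBytesRefAttribute, cache the shared BytesRef it exposes, call Reset(), loop over IncrementToken() and FillBytesRef(), then End() and dispose the stream. Below is a minimal sketch of just that skeleton, assuming a Lucene.NET 4.8 build where FillBytesRef() is still present (as in all the examples here); the PrintTerms helper name and the field/text values are illustrative, not library API:

using System;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Util;

static void PrintTerms(Analyzer analyzer, string field, string text)
{
    TokenStream ts = analyzer.GetTokenStream(field, text);
    try
    {
        ITermToBytesRefAttribute termAtt = ts.AddAttribute<ITermToBytesRefAttribute>();
        BytesRef bytes = termAtt.BytesRef; // shared buffer, reused for every token
        ts.Reset();                        // mandatory before the first IncrementToken()
        while (ts.IncrementToken())
        {
            termAtt.FillBytesRef();        // populate the shared buffer with the current token
            // DeepCopyOf is required whenever the bytes outlive this iteration:
            Console.WriteLine(BytesRef.DeepCopyOf(bytes).Utf8ToString());
        }
        ts.End();                          // record end-of-stream state (final offset etc.)
    }
    finally
    {
        IOUtils.DisposeWhileHandlingException(ts);
    }
}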
public virtual void TestLongStream()
{
    using (NumericTokenStream stream = (new NumericTokenStream()).SetInt64Value(Lvalue))
    {
        // use GetAttribute to test if attributes really exist; if not, an ArgumentException will be thrown
        ITermToBytesRefAttribute bytesAtt = stream.GetAttribute<ITermToBytesRefAttribute>();
        ITypeAttribute typeAtt = stream.GetAttribute<ITypeAttribute>();
        NumericTokenStream.INumericTermAttribute numericAtt = stream.GetAttribute<NumericTokenStream.INumericTermAttribute>();
        BytesRef bytes = bytesAtt.BytesRef;
        stream.Reset();
        Assert.AreEqual(64, numericAtt.ValueSize);
        for (int shift = 0; shift < 64; shift += NumericUtils.PRECISION_STEP_DEFAULT)
        {
            Assert.IsTrue(stream.IncrementToken(), "New token is available");
            Assert.AreEqual(shift, numericAtt.Shift, "Shift value wrong");
            bytesAtt.FillBytesRef();
            Assert.AreEqual(Lvalue & ~((1L << shift) - 1L), NumericUtils.PrefixCodedToInt64(bytes), "Term is incorrectly encoded");
            Assert.AreEqual(Lvalue & ~((1L << shift) - 1L), numericAtt.RawValue, "Term raw value is incorrectly encoded");
            Assert.AreEqual((shift == 0) ? NumericTokenStream.TOKEN_TYPE_FULL_PREC : NumericTokenStream.TOKEN_TYPE_LOWER_PREC, typeAtt.Type, "Type incorrect");
        }
        Assert.IsFalse(stream.IncrementToken(), "More tokens available");
        stream.End();
    }
}
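The test above asserts the trie-encoded terms NumericTokenStream emits, one per precision step. Application code rarely drives NumericTokenStream by hand; a numeric field produces the same token sequence during indexing. A hedged sketch (field name and value are illustrative):

using Lucene.Net.Documents;

var doc = new Document();
// Int64Field tokenizes through a NumericTokenStream internally, emitting one
// prefix-coded term per precision step -- the same sequence asserted above.
doc.Add(new Int64Field("timestamp", 4200L, Field.Store.NO));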
public virtual Query GetQuery(XmlElement e)
{
    string fieldName = DOMUtils.GetAttributeWithInheritanceOrFail(e, "fieldName");
    string text = DOMUtils.GetNonBlankTextOrFail(e);

    BooleanQuery bq = new BooleanQuery(DOMUtils.GetAttribute(e, "disableCoord", false));
    bq.MinimumNumberShouldMatch = DOMUtils.GetAttribute(e, "minimumNumberShouldMatch", 0);
    TokenStream ts = null;
    try
    {
        ts = analyzer.GetTokenStream(fieldName, text);
        ITermToBytesRefAttribute termAtt = ts.AddAttribute<ITermToBytesRefAttribute>();
        Term term = null;
        BytesRef bytes = termAtt.BytesRef;
        ts.Reset();
        while (ts.IncrementToken())
        {
            termAtt.FillBytesRef();
            term = new Term(fieldName, BytesRef.DeepCopyOf(bytes));
            bq.Add(new BooleanClause(new TermQuery(term), Occur.SHOULD));
        }
        ts.End();
    }
    catch (Exception ioe) when (ioe.IsIOException())
    {
        throw RuntimeException.Create("Error constructing terms from index: " + ioe, ioe);
    }
    finally
    {
        IOUtils.DisposeWhileHandlingException(ts);
    }

    bq.Boost = DOMUtils.GetAttribute(e, "boost", 1.0f);
    return bq;
}
public override SpanQuery GetSpanQuery(XmlElement e)
{
    string fieldName = DOMUtils.GetAttributeWithInheritanceOrFail(e, "fieldName");
    string value = DOMUtils.GetNonBlankTextOrFail(e);

    List<SpanQuery> clausesList = new List<SpanQuery>();
    TokenStream ts = null;
    try
    {
        ts = analyzer.GetTokenStream(fieldName, value);
        ITermToBytesRefAttribute termAtt = ts.AddAttribute<ITermToBytesRefAttribute>();
        BytesRef bytes = termAtt.BytesRef;
        ts.Reset();
        while (ts.IncrementToken())
        {
            termAtt.FillBytesRef();
            SpanTermQuery stq = new SpanTermQuery(new Term(fieldName, BytesRef.DeepCopyOf(bytes)));
            clausesList.Add(stq);
        }
        ts.End();
        SpanOrQuery soq = new SpanOrQuery(clausesList.ToArray(/*new SpanQuery[clausesList.size()]*/));
        soq.Boost = DOMUtils.GetAttribute(e, "boost", 1.0f);
        return soq;
    }
#pragma warning disable 168
    catch (IOException ioe)
#pragma warning restore 168
    {
        throw new ParserException("IOException parsing value: " + value);
    }
    finally
    {
        IOUtils.DisposeWhileHandlingException(ts);
    }
}
public override void Run()
{
    try
    {
        foreach (var mapping in this.map)
        {
            string term = mapping.Key;
            BytesRef expected = mapping.Value;
            IOException priorException = null;
            TokenStream ts = this.analyzer.GetTokenStream("fake", new StringReader(term));
            try
            {
                ITermToBytesRefAttribute termAtt = ts.AddAttribute<ITermToBytesRefAttribute>();
                BytesRef bytes = termAtt.BytesRef;
                ts.Reset();
                Assert.IsTrue(ts.IncrementToken());
                termAtt.FillBytesRef();
                Assert.AreEqual(expected, bytes);
                Assert.IsFalse(ts.IncrementToken());
                ts.End();
            }
            catch (IOException e)
            {
                priorException = e;
            }
            finally
            {
                IOUtils.DisposeWhileHandlingException(priorException, ts);
            }
        }
    }
    catch (IOException e)
    {
        throw new Exception(e.ToString(), e);
    }
}
/// <summary>
/// Iterates over the given token stream and adds the resulting terms to the index;
/// Equivalent to adding a tokenized, indexed, termVectorStored, unstored,
/// Lucene <see cref="Documents.Field"/>.
/// Finally closes the token stream. Note that untokenized keywords can be added with this method via
/// <see cref="T:KeywordTokenStream{T}(ICollection{T})"/>, the Lucene <c>KeywordTokenizer</c> or similar utilities.
/// </summary>
/// <param name="fieldName"> a name to be associated with the text </param>
/// <param name="stream"> the token stream to retrieve tokens from. </param>
/// <param name="boost"> the boost factor for hits for this field </param>
/// <param name="positionIncrementGap"> the position increment gap if fields with the same name are added more than once </param>
/// <param name="offsetGap"> the offset gap if fields with the same name are added more than once </param>
/// <seealso cref="Documents.Field.Boost"/>
public virtual void AddField(string fieldName, TokenStream stream, float boost, int positionIncrementGap, int offsetGap)
{
    try
    {
        if (fieldName == null)
        {
            throw new ArgumentException("fieldName must not be null");
        }
        if (stream == null)
        {
            throw new ArgumentException("token stream must not be null");
        }
        if (boost <= 0.0f)
        {
            throw new ArgumentException("boost factor must be greater than 0.0");
        }
        int numTokens = 0;
        int numOverlapTokens = 0;
        int pos = -1;
        BytesRefHash terms;
        SliceByteStartArray sliceArray;
        long sumTotalTermFreq = 0;
        int offset = 0;
        if (fields.TryGetValue(fieldName, out Info info))
        {
            numTokens = info.numTokens;
            numOverlapTokens = info.numOverlapTokens;
            pos = info.lastPosition + positionIncrementGap;
            offset = info.lastOffset + offsetGap;
            terms = info.terms;
            boost *= info.boost;
            sliceArray = info.sliceArray;
            sumTotalTermFreq = info.sumTotalTermFreq;
        }
        else
        {
            sliceArray = new SliceByteStartArray(BytesRefHash.DEFAULT_CAPACITY);
            terms = new BytesRefHash(byteBlockPool, BytesRefHash.DEFAULT_CAPACITY, sliceArray);
        }

        if (!fieldInfos.ContainsKey(fieldName))
        {
            fieldInfos[fieldName] = new FieldInfo(fieldName, true, fieldInfos.Count, false, false, false,
                this.storeOffsets ? IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS : IndexOptions.DOCS_AND_FREQS_AND_POSITIONS,
                DocValuesType.NONE, DocValuesType.NONE, null);
        }
        ITermToBytesRefAttribute termAtt = stream.GetAttribute<ITermToBytesRefAttribute>();
        IPositionIncrementAttribute posIncrAttribute = stream.AddAttribute<IPositionIncrementAttribute>();
        IOffsetAttribute offsetAtt = stream.AddAttribute<IOffsetAttribute>();
        BytesRef @ref = termAtt.BytesRef;
        stream.Reset();
        while (stream.IncrementToken())
        {
            termAtt.FillBytesRef();
            //if (DEBUG) System.err.println("token='" + term + "'");
            numTokens++;
            int posIncr = posIncrAttribute.PositionIncrement;
            if (posIncr == 0)
            {
                numOverlapTokens++;
            }
            pos += posIncr;
            int ord = terms.Add(@ref);
            if (ord < 0)
            {
                ord = (-ord) - 1;
                postingsWriter.Reset(sliceArray.end[ord]);
            }
            else
            {
                sliceArray.start[ord] = postingsWriter.StartNewSlice();
            }
            sliceArray.freq[ord]++;
            sumTotalTermFreq++;
            if (!storeOffsets)
            {
                postingsWriter.WriteInt32(pos);
            }
            else
            {
                postingsWriter.WriteInt32(pos);
                postingsWriter.WriteInt32(offsetAtt.StartOffset + offset);
                postingsWriter.WriteInt32(offsetAtt.EndOffset + offset);
            }
            sliceArray.end[ord] = postingsWriter.CurrentOffset;
        }
        stream.End();

        // ensure infos.numTokens > 0 invariant; needed for correct operation of terms()
        if (numTokens > 0)
        {
            fields[fieldName] = new Info(terms, sliceArray, numTokens, numOverlapTokens, boost, pos, offsetAtt.EndOffset + offset, sumTotalTermFreq);
            sortedFields = null; // invalidate sorted view, if any
        }
    }
    catch (Exception e) // can never happen
    {
        throw new Exception(e.ToString(), e);
    }
    finally
    {
        try
        {
            if (stream != null)
            {
                stream.Dispose();
            }
        }
        catch (IOException e2)
        {
            throw new Exception(e2.ToString(), e2);
        }
    }
}
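For orientation, here is a brief usage sketch of the MemoryIndex class this AddField overload belongs to. It assumes the public string-based AddField overload, which analyzes the text and delegates to the TokenStream overload above; the analyzer, field name, and query text are illustrative:

using Lucene.Net.Analysis.Standard;
using Lucene.Net.Index.Memory;
using Lucene.Net.QueryParsers.Classic;
using Lucene.Net.Util;

var analyzer = new StandardAnalyzer(LuceneVersion.LUCENE_48);
var index = new MemoryIndex();

// Tokenizes via the analyzer and funnels into AddField(string, TokenStream, ...):
index.AddField("content", "Readings about Salmons and other select Alaska fishing Manuals", analyzer);

var parser = new QueryParser(LuceneVersion.LUCENE_48, "content", analyzer);
float score = index.Search(parser.Parse("+salmon +fishing")); // 0.0f means no match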
/// <summary>
/// Retrieve suggestions.
/// </summary>
public virtual IList<LookupResult> DoLookup(string key, IEnumerable<BytesRef> contexts, int num)
{
    if (contexts != null)
    {
        throw new System.ArgumentException("this suggester doesn't support contexts");
    }

    TokenStream ts = queryAnalyzer.GetTokenStream("", key.ToString());
    try
    {
        ITermToBytesRefAttribute termBytesAtt = ts.AddAttribute<ITermToBytesRefAttribute>();
        IOffsetAttribute offsetAtt = ts.AddAttribute<IOffsetAttribute>();
        IPositionLengthAttribute posLenAtt = ts.AddAttribute<IPositionLengthAttribute>();
        IPositionIncrementAttribute posIncAtt = ts.AddAttribute<IPositionIncrementAttribute>();
        ts.Reset();

        var lastTokens = new BytesRef[grams];
        //System.out.println("lookup: key='" + key + "'");

        // Run full analysis, but save only the
        // last 1gram, last 2gram, etc.:
        BytesRef tokenBytes = termBytesAtt.BytesRef;
        int maxEndOffset = -1;
        bool sawRealToken = false;
        while (ts.IncrementToken())
        {
            termBytesAtt.FillBytesRef();
            sawRealToken |= tokenBytes.Length > 0;
            // TODO: this is somewhat iffy; today, ShingleFilter
            // sets posLen to the gram count; maybe we should make
            // a separate dedicated att for this?
            int gramCount = posLenAtt.PositionLength;
            Debug.Assert(gramCount <= grams);

            // Safety: make sure the recalculated count "agrees":
            if (CountGrams(tokenBytes) != gramCount)
            {
                throw new System.ArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + CountGrams(tokenBytes));
            }
            maxEndOffset = Math.Max(maxEndOffset, offsetAtt.EndOffset);
            lastTokens[gramCount - 1] = BytesRef.DeepCopyOf(tokenBytes);
        }
        ts.End();

        if (!sawRealToken)
        {
            throw new System.ArgumentException("no tokens produced by analyzer, or the only tokens were empty strings");
        }

        // Carefully fill last tokens with _ tokens;
        // ShingleFilter apparently won't emit "only hole"
        // tokens:
        int endPosInc = posIncAtt.PositionIncrement;

        // Note this will also be true if input is the empty
        // string (in which case we saw no tokens and
        // maxEndOffset is still -1), which in fact works out OK
        // because we fill the unigram with an empty BytesRef
        // below:
        bool lastTokenEnded = offsetAtt.EndOffset > maxEndOffset || endPosInc > 0;
        //System.out.println("maxEndOffset=" + maxEndOffset + " vs " + offsetAtt.EndOffset);

        if (lastTokenEnded)
        {
            //System.out.println("  lastTokenEnded");
            // If user hit space after the last token, then
            // "upgrade" all tokens. This way "foo " will suggest
            // all bigrams starting w/ foo, and not any unigrams
            // starting with "foo":
            for (int i = grams - 1; i > 0; i--)
            {
                BytesRef token = lastTokens[i - 1];
                if (token == null)
                {
                    continue;
                }
                token.Grow(token.Length + 1);
                token.Bytes[token.Length] = separator;
                token.Length++;
                lastTokens[i] = token;
            }
            lastTokens[0] = new BytesRef();
        }

        var arc = new FST.Arc<long?>();
        var bytesReader = fst.GetBytesReader();

        // Try highest order models first, and if they return
        // results, return that; else, fallback:
        double backoff = 1.0;

        List<LookupResult> results = new List<LookupResult>(num);

        // We only add a given suffix once, from the highest
        // order model that saw it; for subsequent lower order
        // models we skip it:
        var seen = new HashSet<BytesRef>();

        for (int gram = grams - 1; gram >= 0; gram--)
        {
            BytesRef token = lastTokens[gram];
            // Don't make unigram predictions from empty string:
            if (token == null || (token.Length == 0 && key.Length > 0))
            {
                // Input didn't have enough tokens:
                //System.out.println("  gram=" + gram + ": skip: not enough input");
                continue;
            }

            if (endPosInc > 0 && gram <= endPosInc)
            {
                // Skip hole-only predictions; in theory we
                // shouldn't have to do this, but we'd need to fix
                // ShingleFilter to produce only-hole tokens:
                //System.out.println("  break: only holes now");
                break;
            }

            //System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString());

            // TODO: we could add fuzziness here

            // match the prefix portion exactly
            //Pair<Long,BytesRef> prefixOutput = null;
            long? prefixOutput = null;
            try
            {
                prefixOutput = LookupPrefix(fst, bytesReader, token, arc);
            }
            catch (IOException bogus)
            {
                throw new Exception(bogus.ToString(), bogus);
            }
            //System.out.println("  prefixOutput=" + prefixOutput);

            if (prefixOutput == null)
            {
                // This model never saw this prefix, e.g. the
                // trigram model never saw context "purple mushroom"
                backoff *= ALPHA;
                continue;
            }

            // TODO: we could do this division at build time, and
            // bake it into the FST?

            // Denominator for computing scores from current
            // model's predictions:
            long contextCount = totTokens;

            BytesRef lastTokenFragment = null;

            for (int i = token.Length - 1; i >= 0; i--)
            {
                if (token.Bytes[token.Offset + i] == separator)
                {
                    BytesRef context = new BytesRef(token.Bytes, token.Offset, i);
                    long? output = Lucene.Net.Util.Fst.Util.Get(fst, Lucene.Net.Util.Fst.Util.ToInt32sRef(context, new Int32sRef()));
                    Debug.Assert(output != null);
                    contextCount = DecodeWeight(output);
                    lastTokenFragment = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                    break;
                }
            }

            BytesRef finalLastToken;
            if (lastTokenFragment == null)
            {
                finalLastToken = BytesRef.DeepCopyOf(token);
            }
            else
            {
                finalLastToken = BytesRef.DeepCopyOf(lastTokenFragment);
            }
            Debug.Assert(finalLastToken.Offset == 0);

            CharsRef spare = new CharsRef();

            // complete top-N
            Util.Fst.Util.TopResults<long?> completions = null;
            try
            {
                // Because we store multiple models in one FST
                // (1gram, 2gram, 3gram), we must restrict the
                // search so that it only considers the current
                // model. For the highest order model, this is not
                // necessary since all completions in the FST
                // must be from this model, but for lower order
                // models we have to filter out the higher order
                // ones:

                // Must do num+seen.size() for queue depth because we may
                // reject up to seen.size() paths in acceptResult():
                Util.Fst.Util.TopNSearcher<long?> searcher = new TopNSearcherAnonymousInnerClassHelper(this, fst, num, num + seen.Count, weightComparer, seen, finalLastToken);

                // since this search is initialized with a single start node
                // it is okay to start with an empty input path here
                searcher.AddStartPaths(arc, prefixOutput, true, new Int32sRef());

                completions = searcher.Search();
                Debug.Assert(completions.IsComplete);
            }
            catch (IOException bogus)
            {
                throw new Exception(bogus.ToString(), bogus);
            }

            int prefixLength = token.Length;

            BytesRef suffix = new BytesRef(8);
            //System.out.println("  " + completions.length + " completions");

            foreach (Util.Fst.Util.Result<long?> completion in completions)
            {
                token.Length = prefixLength;
                // append suffix
                Util.Fst.Util.ToBytesRef(completion.Input, suffix);
                token.Append(suffix);

                //System.out.println("  completion " + token.utf8ToString());

                // Skip this path if a higher-order model already
                // saw/predicted its last token:
                BytesRef lastToken = token;
                for (int i = token.Length - 1; i >= 0; i--)
                {
                    if (token.Bytes[token.Offset + i] == separator)
                    {
                        Debug.Assert(token.Length - i - 1 > 0);
                        lastToken = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                        break;
                    }
                }
                if (seen.Contains(lastToken))
                {
                    //System.out.println("  skip dup " + lastToken.utf8ToString());
                    goto nextCompletionContinue;
                }
                seen.Add(BytesRef.DeepCopyOf(lastToken));
                spare.Grow(token.Length);
                UnicodeUtil.UTF8toUTF16(token, spare);
                LookupResult result = new LookupResult(spare.ToString(),
                    // LUCENENET NOTE: We need to calculate this as decimal because when using double it can sometimes
                    // return numbers that are greater than long.MaxValue, which results in a negative long number.
                    (long)(long.MaxValue * (decimal)backoff * ((decimal)DecodeWeight(completion.Output)) / contextCount));
                results.Add(result);
                Debug.Assert(results.Count == seen.Count);
                //System.out.println("  add result=" + result);
                nextCompletionContinue: ;
            }
            backoff *= ALPHA;
        }

        results.Sort(new ComparerAnonymousInnerClassHelper(this));

        if (results.Count > num)
        {
            results.SubList(num, results.Count).Clear();
        }

        return results;
    }
    finally
    {
        IOUtils.DisposeWhileHandlingException(ts);
    }
}
/// <summary>
/// Creates a query from the analysis chain.
/// <para/>
/// Expert: this is more useful for subclasses such as queryparsers.
/// If using this class directly, just use <see cref="CreateBooleanQuery(string, string)"/>
/// and <see cref="CreatePhraseQuery(string, string)"/>. </summary>
/// <param name="analyzer"> Analyzer used for this query. </param>
/// <param name="operator"> Default boolean operator used for this query. </param>
/// <param name="field"> Field to create queries against. </param>
/// <param name="queryText"> Text to be passed to the analysis chain. </param>
/// <param name="quoted"> <c>true</c> if phrases should be generated when terms occur at more than one position. </param>
/// <param name="phraseSlop"> Slop factor for phrase/multiphrase queries. </param>
protected Query CreateFieldQuery(Analyzer analyzer, Occur @operator, string field, string queryText, bool quoted, int phraseSlop)
{
    Debug.Assert(@operator == Occur.SHOULD || @operator == Occur.MUST);
    // Use the analyzer to get all the tokens, and then build a TermQuery,
    // PhraseQuery, or nothing based on the term count
    CachingTokenFilter buffer = null;
    ITermToBytesRefAttribute termAtt = null;
    IPositionIncrementAttribute posIncrAtt = null;
    int numTokens = 0;
    int positionCount = 0;
    bool severalTokensAtSamePosition = false;
    bool hasMoreTokens = false;

    TokenStream source = null;
    try
    {
        source = analyzer.GetTokenStream(field, new StringReader(queryText));
        source.Reset();
        buffer = new CachingTokenFilter(source);
        buffer.Reset();

        if (buffer.HasAttribute<ITermToBytesRefAttribute>())
        {
            termAtt = buffer.GetAttribute<ITermToBytesRefAttribute>();
        }
        if (buffer.HasAttribute<IPositionIncrementAttribute>())
        {
            posIncrAtt = buffer.GetAttribute<IPositionIncrementAttribute>();
        }

        if (termAtt != null)
        {
            try
            {
                hasMoreTokens = buffer.IncrementToken();
                while (hasMoreTokens)
                {
                    numTokens++;
                    int positionIncrement = (posIncrAtt != null) ? posIncrAtt.PositionIncrement : 1;
                    if (positionIncrement != 0)
                    {
                        positionCount += positionIncrement;
                    }
                    else
                    {
                        severalTokensAtSamePosition = true;
                    }
                    hasMoreTokens = buffer.IncrementToken();
                }
            }
            catch (System.IO.IOException)
            {
                // ignore
            }
        }
    }
    catch (System.IO.IOException e)
    {
        throw new Exception("Error analyzing query text", e);
    }
    finally
    {
        IOUtils.DisposeWhileHandlingException(source);
    }

    // rewind the buffer stream
    buffer.Reset();

    BytesRef bytes = termAtt == null ? null : termAtt.BytesRef;

    if (numTokens == 0)
    {
        return null;
    }
    else if (numTokens == 1)
    {
        try
        {
            bool hasNext = buffer.IncrementToken();
            Debug.Assert(hasNext == true);
            termAtt.FillBytesRef();
        }
        catch (System.IO.IOException)
        {
            // safe to ignore, because we know the number of tokens
        }
        return NewTermQuery(new Term(field, BytesRef.DeepCopyOf(bytes)));
    }
    else
    {
        if (severalTokensAtSamePosition || (!quoted))
        {
            if (positionCount == 1 || (!quoted))
            {
                // no phrase query:
                if (positionCount == 1)
                {
                    // simple case: only one position, with synonyms
                    BooleanQuery q = NewBooleanQuery(true);
                    for (int i = 0; i < numTokens; i++)
                    {
                        try
                        {
                            bool hasNext = buffer.IncrementToken();
                            Debug.Assert(hasNext == true);
                            termAtt.FillBytesRef();
                        }
                        catch (System.IO.IOException)
                        {
                            // safe to ignore, because we know the number of tokens
                        }
                        Query currentQuery = NewTermQuery(new Term(field, BytesRef.DeepCopyOf(bytes)));
                        q.Add(currentQuery, Occur.SHOULD);
                    }
                    return q;
                }
                else
                {
                    // multiple positions
                    BooleanQuery q = NewBooleanQuery(false);
                    Query currentQuery = null;
                    for (int i = 0; i < numTokens; i++)
                    {
                        try
                        {
                            bool hasNext = buffer.IncrementToken();
                            Debug.Assert(hasNext == true);
                            termAtt.FillBytesRef();
                        }
                        catch (System.IO.IOException)
                        {
                            // safe to ignore, because we know the number of tokens
                        }
                        if (posIncrAtt != null && posIncrAtt.PositionIncrement == 0)
                        {
                            if (!(currentQuery is BooleanQuery))
                            {
                                Query t = currentQuery;
                                currentQuery = NewBooleanQuery(true);
                                ((BooleanQuery)currentQuery).Add(t, Occur.SHOULD);
                            }
                            ((BooleanQuery)currentQuery).Add(NewTermQuery(new Term(field, BytesRef.DeepCopyOf(bytes))), Occur.SHOULD);
                        }
                        else
                        {
                            if (currentQuery != null)
                            {
                                q.Add(currentQuery, @operator);
                            }
                            currentQuery = NewTermQuery(new Term(field, BytesRef.DeepCopyOf(bytes)));
                        }
                    }
                    q.Add(currentQuery, @operator);
                    return q;
                }
            }
            else
            {
                // phrase query:
                MultiPhraseQuery mpq = NewMultiPhraseQuery();
                mpq.Slop = phraseSlop;
                IList<Term> multiTerms = new List<Term>();
                int position = -1;
                for (int i = 0; i < numTokens; i++)
                {
                    int positionIncrement = 1;
                    try
                    {
                        bool hasNext = buffer.IncrementToken();
                        Debug.Assert(hasNext == true);
                        termAtt.FillBytesRef();
                        if (posIncrAtt != null)
                        {
                            positionIncrement = posIncrAtt.PositionIncrement;
                        }
                    }
                    catch (System.IO.IOException)
                    {
                        // safe to ignore, because we know the number of tokens
                    }

                    if (positionIncrement > 0 && multiTerms.Count > 0)
                    {
                        if (enablePositionIncrements)
                        {
                            mpq.Add(multiTerms.ToArray(), position);
                        }
                        else
                        {
                            mpq.Add(multiTerms.ToArray());
                        }
                        multiTerms.Clear();
                    }
                    position += positionIncrement;
                    multiTerms.Add(new Term(field, BytesRef.DeepCopyOf(bytes)));
                }
                if (enablePositionIncrements)
                {
                    mpq.Add(multiTerms.ToArray(), position);
                }
                else
                {
                    mpq.Add(multiTerms.ToArray());
                }
                return mpq;
            }
        }
        else
        {
            PhraseQuery pq = NewPhraseQuery();
            pq.Slop = phraseSlop;
            int position = -1;

            for (int i = 0; i < numTokens; i++)
            {
                int positionIncrement = 1;

                try
                {
                    bool hasNext = buffer.IncrementToken();
                    Debug.Assert(hasNext == true);
                    termAtt.FillBytesRef();
                    if (posIncrAtt != null)
                    {
                        positionIncrement = posIncrAtt.PositionIncrement;
                    }
                }
                catch (System.IO.IOException)
                {
                    // safe to ignore, because we know the number of tokens
                }

                if (enablePositionIncrements)
                {
                    position += positionIncrement;
                    pq.Add(new Term(field, BytesRef.DeepCopyOf(bytes)), position);
                }
                else
                {
                    pq.Add(new Term(field, BytesRef.DeepCopyOf(bytes)));
                }
            }
            return pq;
        }
    }
}
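CreateFieldQuery is protected; as the doc comment above notes, direct users go through CreateBooleanQuery and CreatePhraseQuery, both of which funnel into it. A short sketch (field name and query text are illustrative):

using Lucene.Net.Analysis.Standard;
using Lucene.Net.Search;
using Lucene.Net.Util;

var builder = new QueryBuilder(new StandardAnalyzer(LuceneVersion.LUCENE_48));

// Both calls delegate to CreateFieldQuery above:
Query disjunction = builder.CreateBooleanQuery("body", "lucene net"); // body:lucene body:net
Query phrase = builder.CreatePhraseQuery("body", "lucene net", 1);    // body:"lucene net"~1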
internal override void Start(IndexableField f)
{
    TermAtt = FieldState.AttributeSource_Renamed.GetAttribute<ITermToBytesRefAttribute>();
    TermBytesRef = TermAtt.BytesRef;
    Consumer.Start(f);
    if (NextPerField != null)
    {
        NextPerField.Start(f);
    }
}