public FilteringTokenFilter(Lucene.Net.Util.LuceneVersion version, bool enablePositionIncrements, TokenStream input) : this(version, input) { posIncrAtt = AddAttribute<IPositionIncrementAttribute>(); CheckPositionIncrement(version, enablePositionIncrements); this.enablePositionIncrements = enablePositionIncrements; }
/// <summary> /// Create a new MockTokenFilter. /// </summary> /// <param name="input"> TokenStream to filter </param> /// <param name="filter"> DFA representing the terms that should be removed. </param> public MockTokenFilter(TokenStream input, CharacterRunAutomaton filter) : base(input) { this.Filter = filter; TermAtt = AddAttribute<ICharTermAttribute>(); PosIncrAtt = AddAttribute<IPositionIncrementAttribute>(); }
public MockHoleInjectingTokenFilter(Random random, TokenStream @in) : base(@in) { RandomSeed = random.Next(); PosIncAtt = AddAttribute<IPositionIncrementAttribute>(); PosLenAtt = AddAttribute<IPositionLengthAttribute>(); }
public CannedTokenizer(System.IO.TextReader reader, TokenAndPos[] tokens) : base(reader) { this.tokens = tokens; this.termAtt = AddAttribute<ICharTermAttribute>(); this.posIncrAtt = AddAttribute<IPositionIncrementAttribute>(); }
/// <summary> /// Create a new <seealso cref="ASCIIFoldingFilter"/>. /// </summary> /// <param name="input"> /// TokenStream to filter </param> /// <param name="preserveOriginal"> /// should the original tokens be kept on the input stream with a 0 position increment /// from the folded tokens? /// </param> public ASCIIFoldingFilter(TokenStream input, bool preserveOriginal) : base(input) { this.preserveOriginal = preserveOriginal; termAtt = AddAttribute<ICharTermAttribute>(); posIncAttr = AddAttribute<IPositionIncrementAttribute>(); }
/// <summary> /// Create a new <seealso cref="FilteringTokenFilter"/>. </summary> /// <param name="version"> the Lucene match version </param> /// <param name="in"> the <seealso cref="TokenStream"/> to consume </param> public FilteringTokenFilter(LuceneVersion version, TokenStream @in) : base(@in) { posIncrAtt = AddAttribute<IPositionIncrementAttribute>(); this.version = version; this.enablePositionIncrements = true; }
public CamelCaseFilter(TokenStream stream) : base(stream) { _termAttribute = AddAttribute<ITermAttribute>(); _offsetAttribute = AddAttribute<IOffsetAttribute>(); _positionIncrementAttribute = AddAttribute<IPositionIncrementAttribute>(); }
void Init() { InitPanGuSegment(); termAtt = AddAttribute<ITermAttribute>(); offsetAtt = AddAttribute<IOffsetAttribute>(); posIncrAtt = AddAttribute<IPositionIncrementAttribute>(); typeAtt = AddAttribute<ITypeAttribute>(); }
/// <summary> /// Creates a new SynonymFilter that injects synonyms supplied by the given <see cref="ISynonymEngine"/> into the token stream. /// </summary> /// <param name="input">the TokenStream to filter</param> /// <param name="synonymEngine">engine that supplies synonyms for each term</param> public SynonymFilter(TokenStream input, ISynonymEngine synonymEngine) : base(input) { synonymStack = new Stack<String>(); this.engine = synonymEngine; this.termAtt = AddAttribute<ITermAttribute>(); this.posIncrAtt = AddAttribute<IPositionIncrementAttribute>(); }
public GraphTokenizer(TextReader input) : base(input) { TermAtt = AddAttribute<ICharTermAttribute>(); OffsetAtt = AddAttribute<IOffsetAttribute>(); PosIncrAtt = AddAttribute<IPositionIncrementAttribute>(); PosLengthAtt = AddAttribute<IPositionLengthAttribute>(); }
/// <summary> /// Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using /// affix rules in the provided HunspellDictionary. /// </summary> /// <param name="input">TokenStream whose tokens will be stemmed.</param> /// <param name="dictionary">HunspellDictionary containing the affix rules and words that will be used to stem the tokens.</param> /// <param name="dedup">true if only unique terms should be output.</param> public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, Boolean dedup = true) : base(input) { _posIncAtt = AddAttribute<IPositionIncrementAttribute>(); _termAtt = AddAttribute<ITermAttribute>(); _dedup = dedup; _stemmer = new HunspellStemmer(dictionary); }
/// <summary> /// Sole constructor. </summary> public SuggestStopFilter(TokenStream input, CharArraySet stopWords) : base(input) { this.stopWords = stopWords; this.termAtt = AddAttribute<ICharTermAttribute>(); this.posIncAtt = AddAttribute<IPositionIncrementAttribute>(); this.keywordAtt = AddAttribute<IKeywordAttribute>(); this.offsetAtt = AddAttribute<IOffsetAttribute>(); }
public CannedBinaryTokenStream(params BinaryToken[] tokens) : base() { this.Tokens = tokens; TermAtt = AddAttribute<IBinaryTermAttribute>(); PosIncrAtt = AddAttribute<IPositionIncrementAttribute>(); PosLengthAtt = AddAttribute<IPositionLengthAttribute>(); OffsetAtt = AddAttribute<IOffsetAttribute>(); }
public MockPayloadFilter(TokenStream input, string fieldName) : base(input) { this.FieldName = fieldName; Pos = 0; i = 0; PosIncrAttr = input.AddAttribute<IPositionIncrementAttribute>(); PayloadAttr = input.AddAttribute<IPayloadAttribute>(); TermAttr = input.AddAttribute<ICharTermAttribute>(); }
/// <summary> /// Construct a token stream filtering the given input using a Set of common /// words to create bigrams. Outputs both unigrams with position increment and /// bigrams with position increment 0 type=gram where one or both of the words /// in a potential bigram are in the set of common words. /// </summary> /// <param name="matchVersion"> lucene compatibility version </param> /// <param name="input"> TokenStream input in filter chain </param> /// <param name="commonWords"> The set of common words. </param> public CommonGramsFilter(LuceneVersion matchVersion, TokenStream input, CharArraySet commonWords) : base(input) { termAttribute = AddAttribute<ICharTermAttribute>(); offsetAttribute = AddAttribute<IOffsetAttribute>(); typeAttribute = AddAttribute<ITypeAttribute>(); posIncAttribute = AddAttribute<IPositionIncrementAttribute>(); posLenAttribute = AddAttribute<IPositionLengthAttribute>(); this.commonWords = commonWords; }
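A short usage sketch for the filter above (a hypothetical setup assuming the Lucene.NET 4.8 CommonGrams/Core namespaces and the StopFilter.MakeStopSet helper; the sample text and the choice of "the" as the common word are illustrative): unigrams come through with their normal position increments, while the generated bigrams of type "gram" are stacked at position increment 0.

using System;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.CommonGrams;
using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Util;

public static class CommonGramsExample
{
    public static void Run()
    {
        // "the" is declared common, so "brown the fox" should yield the unigrams plus
        // the bigrams "brown_the" and "the_fox" at position increment 0.
        CharArraySet commonWords = StopFilter.MakeStopSet(LuceneVersion.LUCENE_48, "the");
        using (TokenStream ts = new CommonGramsFilter(LuceneVersion.LUCENE_48,
            new WhitespaceTokenizer(LuceneVersion.LUCENE_48, new StringReader("brown the fox")), commonWords))
        {
            ICharTermAttribute termAtt = ts.AddAttribute<ICharTermAttribute>();
            ITypeAttribute typeAtt = ts.AddAttribute<ITypeAttribute>();
            IPositionIncrementAttribute posIncAtt = ts.AddAttribute<IPositionIncrementAttribute>();
            ts.Reset();
            while (ts.IncrementToken())
            {
                Console.WriteLine("{0}/{1} (posIncr={2})", termAtt, typeAtt.Type, posIncAtt.PositionIncrement);
            }
            ts.End();
        }
    }
}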
public ExpandAcronymsFilter(TokenStream input, IAcronymExpansionProvider acronymExpansionProvider) : base(input) { _acronymExpansionProvider = acronymExpansionProvider; _termAttribute = AddAttribute<ITermAttribute>(); _positionIncrementAttribute = AddAttribute<IPositionIncrementAttribute>(); _tokenSet = new Queue<string>(); _recognizedTokens = new HashSet<string>(); }
/// <summary> /// Build a filter that limits the maximum position of tokens to emit. /// </summary> /// <param name="in"> the stream to wrap </param> /// <param name="maxTokenPosition"> max position of tokens to produce (1st token always has position 1) </param> /// <param name="consumeAllTokens"> whether all tokens from the wrapped input stream must be consumed /// even if maxTokenPosition is exceeded. </param> public LimitTokenPositionFilter(TokenStream @in, int maxTokenPosition, bool consumeAllTokens) : base(@in) { if (maxTokenPosition < 1) { throw new System.ArgumentException("maxTokenPosition must be greater than zero"); } this.maxTokenPosition = maxTokenPosition; this.consumeAllTokens = consumeAllTokens; posIncAtt = AddAttribute<IPositionIncrementAttribute>(); }
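A small hedged sketch of the constructor above in use (assumed Lucene.NET 4.8 namespaces; the input text is illustrative): with maxTokenPosition = 2 and consumeAllTokens = false, only the tokens at positions 1 and 2 are emitted and the rest of the input is left unread.

using System;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.Miscellaneous;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Util;

public static class LimitTokenPositionExample
{
    public static void Run()
    {
        using (TokenStream ts = new LimitTokenPositionFilter(
            new WhitespaceTokenizer(LuceneVersion.LUCENE_48, new StringReader("one two three four")),
            maxTokenPosition: 2, consumeAllTokens: false))
        {
            ICharTermAttribute termAtt = ts.AddAttribute<ICharTermAttribute>();
            ts.Reset();
            while (ts.IncrementToken())
            {
                Console.WriteLine(termAtt); // prints "one", then "two"
            }
            ts.End();
        }
    }
}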
public IterTokenStream(params Token[] tokens) : base() { this.tokens = tokens; this.termAtt = AddAttribute<ICharTermAttribute>(); this.offsetAtt = AddAttribute<IOffsetAttribute>(); this.posIncAtt = AddAttribute<IPositionIncrementAttribute>(); this.flagsAtt = AddAttribute<IFlagsAttribute>(); this.typeAtt = AddAttribute<ITypeAttribute>(); this.payloadAtt = AddAttribute<IPayloadAttribute>(); }
public ExpanderFilter(TokenStream input, [NotNull] Func<String, IEnumerable<Expansion>> expander, Boolean emitSource = true) : base(input) { if (expander == null) throw new ArgumentNullException("expander"); _expander = expander; _emitSource = emitSource; _termAttr = AddAttribute<ITermAttribute>(); _posAttr = AddAttribute<IPositionIncrementAttribute>(); _typeAttr = AddAttribute<ITypeAttribute>(); }
private bool hasIllegalOffsets = false; // only if the length changed before this filter /// <summary> /// Creates a new ThaiWordFilter with the specified match version. </summary> public ThaiWordFilter(LuceneVersion matchVersion, TokenStream input) : base(matchVersion.OnOrAfter(LuceneVersion.LUCENE_31) ? input : new LowerCaseFilter(matchVersion, input)) { if (!DBBI_AVAILABLE) { throw new System.NotSupportedException("This JRE does not have support for Thai segmentation"); } handlePosIncr = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31); termAtt = AddAttribute<ICharTermAttribute>(); offsetAtt = AddAttribute<IOffsetAttribute>(); posAtt = AddAttribute<IPositionIncrementAttribute>(); }
private void Init(System.IO.TextReader input, HebMorph.StreamLemmatizer _lemmatizer, HebMorph.LemmaFilters.LemmaFilterBase _lemmaFilter, bool AlwaysSaveMarkedOriginal) { termAtt = AddAttribute <ITermAttribute>(); offsetAtt = AddAttribute<IOffsetAttribute>(); posIncrAtt = AddAttribute<IPositionIncrementAttribute>(); typeAtt = AddAttribute <ITypeAttribute>(); //payAtt = (PayloadAttribute)AddAttribute(typeof(PayloadAttribute)); this.input = input; this._streamLemmatizer = _lemmatizer; this._streamLemmatizer.SetStream(input); this.alwaysSaveMarkedOriginal = AlwaysSaveMarkedOriginal; this.lemmaFilter = _lemmaFilter; }
/// <summary> Construct a token stream filtering the given input. /// If <c>stopWords</c> is an instance of <see cref="CharArraySet" /> (true if /// <c>makeStopSet()</c> was used to construct the set) it will be directly used /// and <c>ignoreCase</c> will be ignored since <c>CharArraySet</c> /// directly controls case sensitivity. /// <p/> /// If <c>stopWords</c> is not an instance of <see cref="CharArraySet" />, /// a new CharArraySet will be constructed and <c>ignoreCase</c> will be /// used to specify the case sensitivity of that set. /// </summary> /// <param name="enablePositionIncrements">true if token positions should record the removed stop words</param> /// <param name="input">Input TokenStream</param> /// <param name="stopWords">A Set of strings or char[] or any other ToString()-able set representing the stopwords</param> /// <param name="ignoreCase">if true, all words are lower cased first</param> public StopFilter(bool enablePositionIncrements, TokenStream input, ISet<string> stopWords, bool ignoreCase) : base(input) { if (stopWords is CharArraySet) { this.stopWords = (CharArraySet) stopWords; } else { this.stopWords = new CharArraySet(stopWords.Count, ignoreCase); this.stopWords.AddAll(stopWords); } this.enablePositionIncrements = enablePositionIncrements; termAtt = AddAttribute<ITermAttribute>(); posIncrAtt = AddAttribute<IPositionIncrementAttribute>(); }
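A hedged sketch against the older 3.x-style API this overload belongs to (the parameterless-version WhitespaceTokenizer constructor, the Tokenattributes namespace, and the use of System.Collections.Generic.ISet are assumptions; the sample text is illustrative): with enablePositionIncrements = true, the token following a removed stop word reports the gap through PositionIncrement.

using System;
using System.Collections.Generic;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Tokenattributes;

public static class StopFilterExample
{
    public static void Run()
    {
        ISet<string> stopWords = new HashSet<string> { "the", "of" };
        TokenStream ts = new StopFilter(true, new WhitespaceTokenizer(new StringReader("the end of days")), stopWords, true);
        ITermAttribute termAtt = ts.GetAttribute<ITermAttribute>();
        IPositionIncrementAttribute posIncrAtt = ts.GetAttribute<IPositionIncrementAttribute>();
        while (ts.IncrementToken())
        {
            // "end" and "days" each arrive with a position increment of 2,
            // accounting for the stop word removed just before them.
            Console.WriteLine(termAtt.Term + " +" + posIncrAtt.PositionIncrement);
        }
        ts.End();
    }
}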
/// <summary> /// If inputText is non-null, and the TokenStream has /// offsets, we include the surface form in each arc's /// label. /// </summary> public TokenStreamToDot(string inputText, TokenStream @in, TextWriter @out) { this.@in = @in; this.@out = @out; this.InputText = inputText; TermAtt = @in.AddAttribute<ICharTermAttribute>(); PosIncAtt = @in.AddAttribute<IPositionIncrementAttribute>(); PosLengthAtt = @in.AddAttribute<IPositionLengthAttribute>(); if (@in.HasAttribute<IOffsetAttribute>()) { OffsetAtt = @in.AddAttribute<IOffsetAttribute>(); } else { OffsetAtt = null; } }
public virtual void TestFilterTokens() { SnowballFilter filter = new SnowballFilter(new TestTokenStream(this), "English"); ICharTermAttribute termAtt = filter.GetAttribute <ICharTermAttribute>(); IOffsetAttribute offsetAtt = filter.GetAttribute <IOffsetAttribute>(); ITypeAttribute typeAtt = filter.GetAttribute <ITypeAttribute>(); IPayloadAttribute payloadAtt = filter.GetAttribute <IPayloadAttribute>(); IPositionIncrementAttribute posIncAtt = filter.GetAttribute <IPositionIncrementAttribute>(); IFlagsAttribute flagsAtt = filter.GetAttribute <IFlagsAttribute>(); filter.IncrementToken(); assertEquals("accent", termAtt.ToString()); assertEquals(2, offsetAtt.StartOffset); assertEquals(7, offsetAtt.EndOffset); assertEquals("wrd", typeAtt.Type); assertEquals(3, posIncAtt.PositionIncrement); assertEquals(77, flagsAtt.Flags); assertEquals(new BytesRef(new byte[] { 0, 1, 2, 3 }), payloadAtt.Payload); }
private void Init(LuceneVersion version, Side side, int minGram, int maxGram) { //if (version == null) //{ // throw new ArgumentException("version must not be null"); //} if (!Enum.IsDefined(typeof(Side), side)) { throw new ArgumentException("sideLabel must be either front or back"); } if (minGram < 1) { throw new ArgumentException("minGram must be greater than zero"); } if (minGram > maxGram) { throw new ArgumentException("minGram must not be greater than maxGram"); } if (version.OnOrAfter(LuceneVersion.LUCENE_44)) { if (side == Side.BACK) { throw new ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4"); } } else { maxGram = Math.Min(maxGram, 1024); } this.minGram = minGram; this.maxGram = maxGram; this.side = side; this.termAtt = AddAttribute <ICharTermAttribute>(); this.offsetAtt = AddAttribute <IOffsetAttribute>(); this.posIncrAtt = AddAttribute <IPositionIncrementAttribute>(); }
public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix) : base(suffix) { this.suffix = suffix; this.prefix = prefix; prefixExhausted = false; termAtt = AddAttribute<ICharTermAttribute>(); posIncrAtt = AddAttribute<IPositionIncrementAttribute>(); payloadAtt = AddAttribute<IPayloadAttribute>(); offsetAtt = AddAttribute<IOffsetAttribute>(); typeAtt = AddAttribute<ITypeAttribute>(); flagsAtt = AddAttribute<IFlagsAttribute>(); p_termAtt = prefix.AddAttribute<ICharTermAttribute>(); p_posIncrAtt = prefix.AddAttribute<IPositionIncrementAttribute>(); p_payloadAtt = prefix.AddAttribute<IPayloadAttribute>(); p_offsetAtt = prefix.AddAttribute<IOffsetAttribute>(); p_typeAtt = prefix.AddAttribute<ITypeAttribute>(); p_flagsAtt = prefix.AddAttribute<IFlagsAttribute>(); }
private void DoTestStopPositons(StopFilter stpf, bool enableIcrements) { log("---> test with enable-increments-" + (enableIcrements ? "enabled" : "disabled")); #pragma warning disable 612, 618 stpf.SetEnablePositionIncrements(enableIcrements); #pragma warning restore 612, 618 ICharTermAttribute termAtt = stpf.GetAttribute <ICharTermAttribute>(); IPositionIncrementAttribute posIncrAtt = stpf.GetAttribute <IPositionIncrementAttribute>(); stpf.Reset(); for (int i = 0; i < 20; i += 3) { assertTrue(stpf.IncrementToken()); log("Token " + i + ": " + stpf); string w = English.Int32ToEnglish(i).Trim(); assertEquals("expecting token " + i + " to be " + w, w, termAtt.ToString()); assertEquals("all but first token must have position increment of 3", enableIcrements ? (i == 0 ? 1 : 3) : 1, posIncrAtt.PositionIncrement); } assertFalse(stpf.IncrementToken()); stpf.End(); stpf.Dispose(); }
/// <summary> /// Creates a new WordDelimiterFilter /// </summary> /// <param name="in"> TokenStream to be filtered </param> /// <param name="charTypeTable"> table containing character types </param> /// <param name="configurationFlags"> Flags configuring the filter </param> /// <param name="protWords"> If not null is the set of tokens to protect from being delimited </param> public WordDelimiterFilter(LuceneVersion matchVersion, TokenStream @in, sbyte[] charTypeTable, int configurationFlags, CharArraySet protWords) : base(@in) { if (!InstanceFieldsInitialized) { InitializeInstanceFields(); InstanceFieldsInitialized = true; } if (!matchVersion.OnOrAfter(LuceneVersion.LUCENE_48)) { throw new System.ArgumentException("This class only works with Lucene 4.8+. To emulate the old (broken) behavior of WordDelimiterFilter, use Lucene47WordDelimiterFilter"); } this.flags = configurationFlags; this.protWords = protWords; this.iterator = new WordDelimiterIterator(charTypeTable, Has(SPLIT_ON_CASE_CHANGE), Has(SPLIT_ON_NUMERICS), Has(STEM_ENGLISH_POSSESSIVE)); this.termAttribute = AddAttribute<ICharTermAttribute>(); this.offsetAttribute = AddAttribute<IOffsetAttribute>(); this.posIncAttribute = AddAttribute<IPositionIncrementAttribute>(); this.typeAttribute = AddAttribute<ITypeAttribute>(); }
/// <summary> /// Creates a new WordDelimiterFilter /// </summary> /// <param name="matchVersion"> lucene compatibility version </param> /// <param name="in"> TokenStream to be filtered </param> /// <param name="charTypeTable"> table containing character types </param> /// <param name="configurationFlags"> Flags configuring the filter </param> /// <param name="protWords"> If not null is the set of tokens to protect from being delimited </param> public WordDelimiterFilter(LuceneVersion matchVersion, TokenStream @in, byte[] charTypeTable, WordDelimiterFlags configurationFlags, CharArraySet protWords) : base(@in) { this.termAttribute = AddAttribute <ICharTermAttribute>(); this.offsetAttribute = AddAttribute <IOffsetAttribute>(); this.posIncAttribute = AddAttribute <IPositionIncrementAttribute>(); this.typeAttribute = AddAttribute <ITypeAttribute>(); concat = new WordDelimiterConcatenation(this); concatAll = new WordDelimiterConcatenation(this); sorter = new OffsetSorter(this); if (!matchVersion.OnOrAfter(LuceneVersion.LUCENE_48)) { throw new ArgumentException("This class only works with Lucene 4.8+. To emulate the old (broken) behavior of WordDelimiterFilter, use Lucene47WordDelimiterFilter"); } this.flags = configurationFlags; this.protWords = protWords; this.iterator = new WordDelimiterIterator(charTypeTable, Has(WordDelimiterFlags.SPLIT_ON_CASE_CHANGE), Has(WordDelimiterFlags.SPLIT_ON_NUMERICS), Has(WordDelimiterFlags.STEM_ENGLISH_POSSESSIVE)); }
public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix) : base(suffix) { Suffix = suffix; Prefix = prefix; _prefixExhausted = false; // ReSharper disable DoNotCallOverridableMethodsInConstructor _termAtt = AddAttribute <ITermAttribute>(); _posIncrAtt = AddAttribute <IPositionIncrementAttribute>(); _payloadAtt = AddAttribute <IPayloadAttribute>(); _offsetAtt = AddAttribute <IOffsetAttribute>(); _typeAtt = AddAttribute <ITypeAttribute>(); _flagsAtt = AddAttribute <IFlagsAttribute>(); // ReSharper restore DoNotCallOverridableMethodsInConstructor _pTermAtt = prefix.AddAttribute <ITermAttribute>(); _pPosIncrAtt = prefix.AddAttribute <IPositionIncrementAttribute>(); _pPayloadAtt = prefix.AddAttribute <IPayloadAttribute>(); _pOffsetAtt = prefix.AddAttribute <IOffsetAttribute>(); _pTypeAtt = prefix.AddAttribute <ITypeAttribute>(); _pFlagsAtt = prefix.AddAttribute <IFlagsAttribute>(); }
public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, Side side, int minGram, int maxGram) : base(input) { //if (version == null) //{ // throw new ArgumentException("version must not be null"); //} if (version.OnOrAfter(LuceneVersion.LUCENE_44) && side == Side.BACK) { throw new ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4, use ReverseStringFilter up-front and afterward"); } if (!Enum.IsDefined(typeof(Side), side)) { throw new ArgumentException("sideLabel must be either front or back"); } if (minGram < 1) { throw new ArgumentException("minGram must be greater than zero"); } if (minGram > maxGram) { throw new ArgumentException("minGram must not be greater than maxGram"); } this.version = version; this.charUtils = version.OnOrAfter(LuceneVersion.LUCENE_44) ? CharacterUtils.GetInstance(version) : CharacterUtils.GetJava4Instance(version); this.minGram = minGram; this.maxGram = maxGram; this.side = side; this.termAtt = AddAttribute <ICharTermAttribute>(); this.offsetAtt = AddAttribute <IOffsetAttribute>(); this.posIncrAtt = AddAttribute <IPositionIncrementAttribute>(); this.posLenAtt = AddAttribute <IPositionLengthAttribute>(); }
private void Init(LuceneVersion version, Side side, int minGram, int maxGram) { // LUCENENET specific - version cannot be null because it is a value type. if (!side.IsDefined()) { throw new ArgumentOutOfRangeException(nameof(side), "sideLabel must be either front or back"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention) } if (minGram < 1) { throw new ArgumentOutOfRangeException(nameof(minGram), "minGram must be greater than zero"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention) } if (minGram > maxGram) { throw new ArgumentException("minGram must not be greater than maxGram"); } if (version.OnOrAfter(LuceneVersion.LUCENE_44)) { if (side == Side.BACK) { throw new ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4"); } } else { maxGram = Math.Min(maxGram, 1024); } this.minGram = minGram; this.maxGram = maxGram; this.side = side; this.termAtt = AddAttribute <ICharTermAttribute>(); this.offsetAtt = AddAttribute <IOffsetAttribute>(); this.posIncrAtt = AddAttribute <IPositionIncrementAttribute>(); }
public virtual void TestOutputHangsOffEnd() { b = new SynonymMap.Builder(true); const bool keepOrig = false; // b hangs off the end (no input token under it): Add("a", "a b", keepOrig); tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true); tokensIn.Reset(); assertTrue(tokensIn.IncrementToken()); assertFalse(tokensIn.IncrementToken()); tokensIn.End(); tokensIn.Dispose(); tokensOut = new SynonymFilter(tokensIn, b.Build(), true); termAtt = tokensOut.AddAttribute <ICharTermAttribute>(); posIncrAtt = tokensOut.AddAttribute <IPositionIncrementAttribute>(); offsetAtt = tokensOut.AddAttribute <IOffsetAttribute>(); posLenAtt = tokensOut.AddAttribute <IPositionLengthAttribute>(); // Make sure endOffset inherits from previous input token: Verify("a", "a b:1"); }
public virtual void TestStopList() { var stopWordsSet = Support.Compatibility.SetFactory.CreateHashSet <string>(); stopWordsSet.Add("good"); stopWordsSet.Add("test"); stopWordsSet.Add("analyzer"); StopAnalyzer newStop = new StopAnalyzer(Version.LUCENE_24, stopWordsSet); System.IO.StringReader reader = new System.IO.StringReader("This is a good test of the english stop analyzer"); TokenStream stream = newStop.TokenStream("test", reader); Assert.IsNotNull(stream); ITermAttribute termAtt = stream.GetAttribute <ITermAttribute>(); IPositionIncrementAttribute posIncrAtt = stream.AddAttribute <IPositionIncrementAttribute>(); while (stream.IncrementToken()) { System.String text = termAtt.Term; Assert.IsFalse(stopWordsSet.Contains(text)); Assert.AreEqual(1, posIncrAtt.PositionIncrement); // in 2.4 stop tokenizer does not apply increments. } }
public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, Side side, int minGram, int maxGram) : base(input) { // LUCENENET specific - version cannot be null because it is a value type. if (version.OnOrAfter(LuceneVersion.LUCENE_44) && side == Side.BACK) { throw new ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4, use ReverseStringFilter up-front and afterward"); } if (!side.IsDefined()) { throw new ArgumentOutOfRangeException(nameof(side), "sideLabel must be either front or back"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention) } if (minGram < 1) { throw new ArgumentOutOfRangeException(nameof(minGram), "minGram must be greater than zero"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention) } if (minGram > maxGram) { throw new ArgumentException("minGram must not be greater than maxGram"); } this.version = version; this.charUtils = version.OnOrAfter(LuceneVersion.LUCENE_44) ? CharacterUtils.GetInstance(version) : CharacterUtils.GetJava4Instance(version); this.minGram = minGram; this.maxGram = maxGram; this.side = side; this.termAtt = AddAttribute <ICharTermAttribute>(); this.offsetAtt = AddAttribute <IOffsetAttribute>(); this.posIncrAtt = AddAttribute <IPositionIncrementAttribute>(); this.posLenAtt = AddAttribute <IPositionLengthAttribute>(); }
public static IEnumerable <TokenAttributes> Tokenize(this TokenStream tokenStream) { var term = tokenStream.GetAttribute <ITermAttribute>(); var offset = tokenStream.GetAttribute <IOffsetAttribute>(); IPositionIncrementAttribute positionIncrement = null; if (tokenStream.HasAttribute <IPositionIncrementAttribute>()) { positionIncrement = tokenStream.GetAttribute <IPositionIncrementAttribute>(); } while (tokenStream.IncrementToken()) { var tokenAttributes = new TokenAttributes(term.Term, offset.StartOffset, offset.EndOffset); if (positionIncrement != null) { tokenAttributes.PositionIncrement = positionIncrement.PositionIncrement; } yield return(tokenAttributes); } }
/* * Need to worry about multiple scenarios: * - need to go for the longest match * a b => foo #shouldn't match if "a b" is followed by "c d" * a b c d => bar * - need to backtrack - retry matches for tokens already read * a b c d => foo * b c => bar * If the input stream is "a b c x", one will consume "a b c d" * trying to match the first rule... all but "a" should be * pushed back so a match may be made on "b c". * - don't try and match generated tokens (thus need separate queue) * matching is not recursive. * - handle optional generation of original tokens in all these cases, * merging token streams to preserve token positions. * - preserve original positionIncrement of first matched token */ public override bool IncrementToken() { while (true) { // if there are any generated tokens, return them... don't try any // matches against them, as we specifically don't want recursion. if (replacement != null && replacement.MoveNext()) { Copy(this, replacement.Current); return(true); } // common case fast-path of first token not matching anything AttributeSource firstTok = NextTok(); if (firstTok == null) { return(false); } var termAtt = firstTok.AddAttribute <ICharTermAttribute>(); SlowSynonymMap result = map.Submap != null?map.Submap.Get(termAtt.Buffer, 0, termAtt.Length) : null; if (result == null) { Copy(this, firstTok); return(true); } // fast-path failed, clone ourselves if needed if (firstTok == this) { firstTok = CloneAttributes(); } // OK, we matched a token, so find the longest match. matched = new LinkedList <AttributeSource>(); result = Match(result); if (result == null) { // no match, simply return the first token read. Copy(this, firstTok); return(true); } // reuse, or create new one each time? IList <AttributeSource> generated = new JCG.List <AttributeSource>(result.Synonyms.Length + matched.Count + 1); // // there was a match... let's generate the new tokens, merging // in the matched tokens (position increments need adjusting) // AttributeSource lastTok = matched.Count == 0 ? firstTok : matched.Last.Value; bool includeOrig = result.IncludeOrig; AttributeSource origTok = includeOrig ? firstTok : null; IPositionIncrementAttribute firstPosIncAtt = firstTok.AddAttribute <IPositionIncrementAttribute>(); int origPos = firstPosIncAtt.PositionIncrement; // position of origTok in the original stream int repPos = 0; // curr position in replacement token stream int pos = 0; // current position in merged token stream for (int i = 0; i < result.Synonyms.Length; i++) { Token repTok = result.Synonyms[i]; AttributeSource newTok = firstTok.CloneAttributes(); ICharTermAttribute newTermAtt = newTok.AddAttribute <ICharTermAttribute>(); IOffsetAttribute newOffsetAtt = newTok.AddAttribute <IOffsetAttribute>(); IPositionIncrementAttribute newPosIncAtt = newTok.AddAttribute <IPositionIncrementAttribute>(); IOffsetAttribute lastOffsetAtt = lastTok.AddAttribute <IOffsetAttribute>(); newOffsetAtt.SetOffset(newOffsetAtt.StartOffset, lastOffsetAtt.EndOffset); newTermAtt.CopyBuffer(repTok.Buffer, 0, repTok.Length); repPos += repTok.PositionIncrement; if (i == 0) // make position of first token equal to original { repPos = origPos; } // if necessary, insert original tokens and adjust position increment while (origTok != null && origPos <= repPos) { IPositionIncrementAttribute origPosInc = origTok.AddAttribute <IPositionIncrementAttribute>(); origPosInc.PositionIncrement = origPos - pos; generated.Add(origTok); pos += origPosInc.PositionIncrement; //origTok = matched.Count == 0 ? 
null : matched.RemoveFirst(); if (matched.Count == 0) { origTok = null; } else { origTok = matched.First.Value; matched.Remove(origTok); } if (origTok != null) { origPosInc = origTok.AddAttribute <IPositionIncrementAttribute>(); origPos += origPosInc.PositionIncrement; } } newPosIncAtt.PositionIncrement = repPos - pos; generated.Add(newTok); pos += newPosIncAtt.PositionIncrement; } // finish up any leftover original tokens while (origTok != null) { IPositionIncrementAttribute origPosInc = origTok.AddAttribute <IPositionIncrementAttribute>(); origPosInc.PositionIncrement = origPos - pos; generated.Add(origTok); pos += origPosInc.PositionIncrement; if (matched.Count == 0) { origTok = null; } else { origTok = matched.First.Value; matched.Remove(origTok); } if (origTok != null) { origPosInc = origTok.AddAttribute <IPositionIncrementAttribute>(); origPos += origPosInc.PositionIncrement; } } // what if we replaced a longer sequence with a shorter one? // a/0 b/5 => foo/0 // should I re-create the gap on the next buffered token? replacement = generated.GetEnumerator(); // Now return to the top of the loop to read and return the first // generated token.. The reason this is done is that we may have generated // nothing at all, and may need to continue with more matching logic. } }
/// <summary> /// Constructs a new CommonGramsQueryFilter based on the provided CommonGramsFilter /// </summary> /// <param name="input"> CommonGramsFilter the QueryFilter will use </param> public CommonGramsQueryFilter(CommonGramsFilter input) : base(input) { typeAttribute = AddAttribute<ITypeAttribute>(); posIncAttribute = AddAttribute<IPositionIncrementAttribute>(); }
/// <summary> /// Iterates over the given token stream and adds the resulting terms to the index; /// Equivalent to adding a tokenized, indexed, termVectorStored, unstored, /// Lucene <see cref="Documents.Field"/>. /// Finally closes the token stream. Note that untokenized keywords can be added with this method via /// <see cref="T:KeywordTokenStream{T}(ICollection{T})"/>, the Lucene <c>KeywordTokenizer</c> or similar utilities. /// </summary> /// <param name="fieldName"> a name to be associated with the text </param> /// <param name="stream"> the token stream to retrieve tokens from. </param> /// <param name="boost"> the boost factor for hits for this field </param> /// <param name="positionIncrementGap"> the position increment gap if fields with the same name are added more than once </param> /// <param name="offsetGap"> the offset gap if fields with the same name are added more than once </param> /// <seealso cref="Documents.Field.Boost"/> public virtual void AddField(string fieldName, TokenStream stream, float boost, int positionIncrementGap, int offsetGap) { try { if (fieldName == null) { throw new ArgumentException("fieldName must not be null"); } if (stream == null) { throw new ArgumentException("token stream must not be null"); } if (boost <= 0.0f) { throw new ArgumentException("boost factor must be greater than 0.0"); } int numTokens = 0; int numOverlapTokens = 0; int pos = -1; BytesRefHash terms; SliceByteStartArray sliceArray; long sumTotalTermFreq = 0; int offset = 0; if (fields.TryGetValue(fieldName, out Info info)) { numTokens = info.numTokens; numOverlapTokens = info.numOverlapTokens; pos = info.lastPosition + positionIncrementGap; offset = info.lastOffset + offsetGap; terms = info.terms; boost *= info.boost; sliceArray = info.sliceArray; sumTotalTermFreq = info.sumTotalTermFreq; } else { sliceArray = new SliceByteStartArray(BytesRefHash.DEFAULT_CAPACITY); terms = new BytesRefHash(byteBlockPool, BytesRefHash.DEFAULT_CAPACITY, sliceArray); } if (!fieldInfos.ContainsKey(fieldName)) { fieldInfos[fieldName] = new FieldInfo(fieldName, true, fieldInfos.Count, false, false, false, this.storeOffsets ? 
IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS : IndexOptions.DOCS_AND_FREQS_AND_POSITIONS, DocValuesType.NONE, DocValuesType.NONE, null); } ITermToBytesRefAttribute termAtt = stream.GetAttribute <ITermToBytesRefAttribute>(); IPositionIncrementAttribute posIncrAttribute = stream.AddAttribute <IPositionIncrementAttribute>(); IOffsetAttribute offsetAtt = stream.AddAttribute <IOffsetAttribute>(); BytesRef @ref = termAtt.BytesRef; stream.Reset(); while (stream.IncrementToken()) { termAtt.FillBytesRef(); // if (DEBUG) System.err.println("token='" + term + "'"); numTokens++; int posIncr = posIncrAttribute.PositionIncrement; if (posIncr == 0) { numOverlapTokens++; } pos += posIncr; int ord = terms.Add(@ref); if (ord < 0) { ord = (-ord) - 1; postingsWriter.Reset(sliceArray.end[ord]); } else { sliceArray.start[ord] = postingsWriter.StartNewSlice(); } sliceArray.freq[ord]++; sumTotalTermFreq++; if (!storeOffsets) { postingsWriter.WriteInt32(pos); } else { postingsWriter.WriteInt32(pos); postingsWriter.WriteInt32(offsetAtt.StartOffset + offset); postingsWriter.WriteInt32(offsetAtt.EndOffset + offset); } sliceArray.end[ord] = postingsWriter.CurrentOffset; } stream.End(); // ensure infos.numTokens > 0 invariant; needed for correct operation of terms() if (numTokens > 0) { fields[fieldName] = new Info(terms, sliceArray, numTokens, numOverlapTokens, boost, pos, offsetAtt.EndOffset + offset, sumTotalTermFreq); sortedFields = null; // invalidate sorted view, if any } } // can never happen catch (Exception e) { throw new Exception(e.ToString(), e); } finally { try { if (stream != null) { stream.Dispose(); } } catch (IOException e2) { throw new Exception(e2.ToString(), e2); } } }
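For orientation, a minimal sketch of how the AddField overload above is typically driven (assumed Lucene.NET 4.8 API shapes for MemoryIndex, StandardAnalyzer, Analyzer.GetTokenStream, and MemoryIndex.Search returning a float score; the field name and text are illustrative): the analyzer's token stream is handed to AddField and the in-memory index is then scored against a query.

using System;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Index;
using Lucene.Net.Index.Memory;
using Lucene.Net.Search;
using Lucene.Net.Util;

public static class MemoryIndexExample
{
    public static void Run()
    {
        var analyzer = new StandardAnalyzer(LuceneVersion.LUCENE_48);
        var index = new MemoryIndex();
        // boost 1.0, no extra position increment gap, offset gap of 1.
        index.AddField("body", analyzer.GetTokenStream("body", "the quick brown fox"), 1.0f, 0, 1);
        float score = index.Search(new TermQuery(new Term("body", "fox")));
        Console.WriteLine(score > 0 ? "match" : "no match");
    }
}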
public BugReproTokenStream() { termAtt = AddAttribute <ICharTermAttribute>(); offsetAtt = AddAttribute <IOffsetAttribute>(); posIncAtt = AddAttribute <IPositionIncrementAttribute>(); }
public RemoveATokens(TokenStream @in) : base(@in) { termAtt = AddAttribute <ICharTermAttribute>(); posIncAtt = AddAttribute <IPositionIncrementAttribute>(); }
/// <summary> Not an explicit test, just useful to print out some info on performance /// /// </summary> /// <throws> Exception </throws> public virtual void Performance() { int[] tokCount = new int[] { 100, 500, 1000, 2000, 5000, 10000 }; int[] modCounts = new int[] { 1, 2, 5, 10, 20, 50, 100, 200, 500 }; for (int k = 0; k < tokCount.Length; k++) { System.Text.StringBuilder buffer = new System.Text.StringBuilder(); System.Console.Out.WriteLine("-----Tokens: " + tokCount[k] + "-----"); for (int i = 0; i < tokCount[k]; i++) { buffer.Append(English.IntToEnglish(i).ToUpper()).Append(' '); } //make sure we produce the same tokens TeeSinkTokenFilter teeStream = new TeeSinkTokenFilter(new StandardFilter(new StandardTokenizer(Version.LUCENE_CURRENT, new System.IO.StringReader(buffer.ToString())))); TokenStream sink = teeStream.NewSinkTokenStream(new ModuloSinkFilter(this, 100)); teeStream.ConsumeAllTokens(); TokenStream stream = new ModuloTokenFilter(this, new StandardFilter(new StandardTokenizer(Version.LUCENE_CURRENT, new System.IO.StringReader(buffer.ToString()))), 100); ITermAttribute tfTok = stream.AddAttribute <ITermAttribute>(); ITermAttribute sinkTok = sink.AddAttribute <ITermAttribute>(); for (int i = 0; stream.IncrementToken(); i++) { Assert.IsTrue(sink.IncrementToken()); Assert.IsTrue(tfTok.Equals(sinkTok) == true, tfTok + " is not equal to " + sinkTok + " at token: " + i); } //simulate two fields, each being analyzed once, for 20 documents for (int j = 0; j < modCounts.Length; j++) { int tfPos = 0; long start = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond); for (int i = 0; i < 20; i++) { stream = new StandardFilter(new StandardTokenizer(Version.LUCENE_CURRENT, new System.IO.StringReader(buffer.ToString()))); IPositionIncrementAttribute posIncrAtt = stream.GetAttribute <IPositionIncrementAttribute>(); while (stream.IncrementToken()) { tfPos += posIncrAtt.PositionIncrement; } stream = new ModuloTokenFilter(this, new StandardFilter(new StandardTokenizer(Version.LUCENE_CURRENT, new System.IO.StringReader(buffer.ToString()))), modCounts[j]); posIncrAtt = stream.GetAttribute <IPositionIncrementAttribute>(); while (stream.IncrementToken()) { tfPos += posIncrAtt.PositionIncrement; } } long finish = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond); System.Console.Out.WriteLine("ModCount: " + modCounts[j] + " Two fields took " + (finish - start) + " ms"); int sinkPos = 0; //simulate one field with one sink start = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond); for (int i = 0; i < 20; i++) { teeStream = new TeeSinkTokenFilter(new StandardFilter(new StandardTokenizer(Version.LUCENE_CURRENT, new System.IO.StringReader(buffer.ToString())))); sink = teeStream.NewSinkTokenStream(new ModuloSinkFilter(this, modCounts[j])); IPositionIncrementAttribute posIncrAtt = teeStream.GetAttribute <IPositionIncrementAttribute>(); while (teeStream.IncrementToken()) { sinkPos += posIncrAtt.PositionIncrement; } //System.out.println("Modulo--------"); posIncrAtt = sink.GetAttribute <IPositionIncrementAttribute>(); while (sink.IncrementToken()) { sinkPos += posIncrAtt.PositionIncrement; } } finish = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond); System.Console.Out.WriteLine("ModCount: " + modCounts[j] + " Tee fields took " + (finish - start) + " ms"); Assert.IsTrue(sinkPos == tfPos, sinkPos + " does not equal: " + tfPos); } System.Console.Out.WriteLine("- End Tokens: " + tokCount[k] + "-----"); } }
public MockCJKSynonymFilter(TokenStream input) : base(input) { termAtt = AddAttribute <ICharTermAttribute>(); posIncAtt = AddAttribute <IPositionIncrementAttribute>(); }
internal LargePosIncTokenFilter(TestWordDelimiterFilter outerInstance, TokenStream input) : base(input) { this.outerInstance = outerInstance; this.termAtt = AddAttribute <ICharTermAttribute>(); this.posIncAtt = AddAttribute <IPositionIncrementAttribute>(); }
public TestPosIncrementFilter(TokenStream @in) : base(@in) { termAtt = AddAttribute<ICharTermAttribute>(); posIncrAtt = AddAttribute<IPositionIncrementAttribute>(); }
public SingleCharTokenizer(TokenStream input) : base(input) { _input = input; _termAttribute = AddAttribute<ITermAttribute>(); _offsetAttribute = AddAttribute<IOffsetAttribute>(); _positionIncrementAttribute = AddAttribute<IPositionIncrementAttribute>(); }
protected internal RandomTokenStream(BaseTermVectorsFormatTestCase baseTermVectorsFormatTestCase, int len, string[] sampleTerms, BytesRef[] sampleTermBytes, bool offsetsGoBackwards) { terms = new string[len]; termBytes = new BytesRef[len]; positionsIncrements = new int[len]; positions = new int[len]; startOffsets = new int[len]; endOffsets = new int[len]; payloads = new BytesRef[len]; for (int i = 0; i < len; ++i) { int o = Random.Next(sampleTerms.Length); terms[i] = sampleTerms[o]; termBytes[i] = sampleTermBytes[o]; positionsIncrements[i] = TestUtil.NextInt32(Random, i == 0 ? 1 : 0, 10); if (offsetsGoBackwards) { startOffsets[i] = Random.Next(); endOffsets[i] = Random.Next(); } else { if (i == 0) { startOffsets[i] = TestUtil.NextInt32(Random, 0, 1 << 16); } else { startOffsets[i] = startOffsets[i - 1] + TestUtil.NextInt32(Random, 0, Rarely() ? 1 << 16 : 20); } endOffsets[i] = startOffsets[i] + TestUtil.NextInt32(Random, 0, Rarely() ? 1 << 10 : 20); } } for (int i = 0; i < len; ++i) { if (i == 0) { positions[i] = positionsIncrements[i] - 1; } else { positions[i] = positions[i - 1] + positionsIncrements[i]; } } if (Rarely()) { Arrays.Fill(payloads, baseTermVectorsFormatTestCase.RandomPayload()); } else { for (int i = 0; i < len; ++i) { payloads[i] = baseTermVectorsFormatTestCase.RandomPayload(); } } positionToTerms = new Dictionary <int?, ISet <int?> >(len); startOffsetToTerms = new Dictionary <int?, ISet <int?> >(len); for (int i = 0; i < len; ++i) { if (!positionToTerms.TryGetValue(positions[i], out ISet <int?> positionTerms)) { positionToTerms[positions[i]] = positionTerms = new JCG.HashSet <int?>(1); } positionTerms.Add(i); if (!startOffsetToTerms.TryGetValue(startOffsets[i], out ISet <int?> startOffsetTerms)) { startOffsetToTerms[startOffsets[i]] = startOffsetTerms = new JCG.HashSet <int?>(1); } startOffsetTerms.Add(i); } freqs = new Dictionary <string, int?>(); foreach (string term in terms) { if (freqs.TryGetValue(term, out int?freq)) { freqs[term] = freq + 1; } else { freqs[term] = 1; } } AddAttributeImpl(new PermissiveOffsetAttribute()); termAtt = AddAttribute <ICharTermAttribute>(); piAtt = AddAttribute <IPositionIncrementAttribute>(); oAtt = AddAttribute <IOffsetAttribute>(); pAtt = AddAttribute <IPayloadAttribute>(); }
public virtual void TestRandom() { int alphabetSize = TestUtil.NextInt32(Random, 2, 7); int docLen = AtLeast(3000); //final int docLen = 50; string document = GetRandomString('a', alphabetSize, docLen); if (Verbose) { Console.WriteLine("TEST: doc=" + document); } int numSyn = AtLeast(5); //final int numSyn = 2; IDictionary <string, OneSyn> synMap = new Dictionary <string, OneSyn>(); IList <OneSyn> syns = new JCG.List <OneSyn>(); bool dedup = Random.nextBoolean(); if (Verbose) { Console.WriteLine(" dedup=" + dedup); } b = new SynonymMap.Builder(dedup); for (int synIDX = 0; synIDX < numSyn; synIDX++) { string synIn = GetRandomString('a', alphabetSize, TestUtil.NextInt32(Random, 1, 5)).Trim(); if (!synMap.TryGetValue(synIn, out OneSyn s) || s is null) { s = new OneSyn(); s.@in = synIn; syns.Add(s); s.@out = new JCG.List <string>(); synMap[synIn] = s; s.keepOrig = Random.nextBoolean(); } string synOut = GetRandomString('0', 10, TestUtil.NextInt32(Random, 1, 5)).Trim(); [email protected](synOut); Add(synIn, synOut, s.keepOrig); if (Verbose) { Console.WriteLine(" syns[" + synIDX + "] = " + s.@in + " -> " + s.@out + " keepOrig=" + s.keepOrig); } } tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true); tokensIn.Reset(); assertTrue(tokensIn.IncrementToken()); assertFalse(tokensIn.IncrementToken()); tokensIn.End(); tokensIn.Dispose(); tokensOut = new SynonymFilter(tokensIn, b.Build(), true); termAtt = tokensOut.AddAttribute <ICharTermAttribute>(); posIncrAtt = tokensOut.AddAttribute <IPositionIncrementAttribute>(); posLenAtt = tokensOut.AddAttribute <IPositionLengthAttribute>(); offsetAtt = tokensOut.AddAttribute <IOffsetAttribute>(); if (dedup) { PruneDups(syns); } string expected = SlowSynMatcher(document, syns, 5); if (Verbose) { Console.WriteLine("TEST: expected=" + expected); } Verify(document, expected); }
/// <summary> /// Retrieve suggestions. /// </summary> public virtual IList <LookupResult> DoLookup(string key, IEnumerable <BytesRef> contexts, int num) { if (contexts != null) { throw new System.ArgumentException("this suggester doesn't support contexts"); } TokenStream ts = queryAnalyzer.GetTokenStream("", key.ToString()); try { ITermToBytesRefAttribute termBytesAtt = ts.AddAttribute <ITermToBytesRefAttribute>(); IOffsetAttribute offsetAtt = ts.AddAttribute <IOffsetAttribute>(); IPositionLengthAttribute posLenAtt = ts.AddAttribute <IPositionLengthAttribute>(); IPositionIncrementAttribute posIncAtt = ts.AddAttribute <IPositionIncrementAttribute>(); ts.Reset(); var lastTokens = new BytesRef[grams]; //System.out.println("lookup: key='" + key + "'"); // Run full analysis, but save only the // last 1gram, last 2gram, etc.: BytesRef tokenBytes = termBytesAtt.BytesRef; int maxEndOffset = -1; bool sawRealToken = false; while (ts.IncrementToken()) { termBytesAtt.FillBytesRef(); sawRealToken |= tokenBytes.Length > 0; // TODO: this is somewhat iffy; today, ShingleFilter // sets posLen to the gram count; maybe we should make // a separate dedicated att for this? int gramCount = posLenAtt.PositionLength; Debug.Assert(gramCount <= grams); // Safety: make sure the recalculated count "agrees": if (CountGrams(tokenBytes) != gramCount) { throw new System.ArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + CountGrams(tokenBytes)); } maxEndOffset = Math.Max(maxEndOffset, offsetAtt.EndOffset); lastTokens[gramCount - 1] = BytesRef.DeepCopyOf(tokenBytes); } ts.End(); if (!sawRealToken) { throw new System.ArgumentException("no tokens produced by analyzer, or the only tokens were empty strings"); } // Carefully fill last tokens with _ tokens; // ShingleFilter appraently won't emit "only hole" // tokens: int endPosInc = posIncAtt.PositionIncrement; // Note this will also be true if input is the empty // string (in which case we saw no tokens and // maxEndOffset is still -1), which in fact works out OK // because we fill the unigram with an empty BytesRef // below: bool lastTokenEnded = offsetAtt.EndOffset > maxEndOffset || endPosInc > 0; //System.out.println("maxEndOffset=" + maxEndOffset + " vs " + offsetAtt.EndOffset); if (lastTokenEnded) { //System.out.println(" lastTokenEnded"); // If user hit space after the last token, then // "upgrade" all tokens. 
This way "foo " will suggest // all bigrams starting w/ foo, and not any unigrams // starting with "foo": for (int i = grams - 1; i > 0; i--) { BytesRef token = lastTokens[i - 1]; if (token == null) { continue; } token.Grow(token.Length + 1); token.Bytes[token.Length] = separator; token.Length++; lastTokens[i] = token; } lastTokens[0] = new BytesRef(); } var arc = new FST.Arc <long?>(); var bytesReader = fst.GetBytesReader(); // Try highest order models first, and if they return // results, return that; else, fallback: double backoff = 1.0; List <LookupResult> results = new List <LookupResult>(num); // We only add a given suffix once, from the highest // order model that saw it; for subsequent lower order // models we skip it: var seen = new HashSet <BytesRef>(); for (int gram = grams - 1; gram >= 0; gram--) { BytesRef token = lastTokens[gram]; // Don't make unigram predictions from empty string: if (token == null || (token.Length == 0 && key.Length > 0)) { // Input didn't have enough tokens: //System.out.println(" gram=" + gram + ": skip: not enough input"); continue; } if (endPosInc > 0 && gram <= endPosInc) { // Skip hole-only predictions; in theory we // shouldn't have to do this, but we'd need to fix // ShingleFilter to produce only-hole tokens: //System.out.println(" break: only holes now"); break; } //System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString()); // TODO: we could add fuzziness here // match the prefix portion exactly //Pair<Long,BytesRef> prefixOutput = null; long?prefixOutput = null; try { prefixOutput = LookupPrefix(fst, bytesReader, token, arc); } catch (IOException bogus) { throw new Exception(bogus.ToString(), bogus); } //System.out.println(" prefixOutput=" + prefixOutput); if (prefixOutput == null) { // This model never saw this prefix, e.g. the // trigram model never saw context "purple mushroom" backoff *= ALPHA; continue; } // TODO: we could do this division at build time, and // bake it into the FST? // Denominator for computing scores from current // model's predictions: long contextCount = totTokens; BytesRef lastTokenFragment = null; for (int i = token.Length - 1; i >= 0; i--) { if (token.Bytes[token.Offset + i] == separator) { BytesRef context = new BytesRef(token.Bytes, token.Offset, i); long? output = Lucene.Net.Util.Fst.Util.Get(fst, Lucene.Net.Util.Fst.Util.ToInt32sRef(context, new Int32sRef())); Debug.Assert(output != null); contextCount = DecodeWeight(output); lastTokenFragment = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1); break; } } BytesRef finalLastToken; if (lastTokenFragment == null) { finalLastToken = BytesRef.DeepCopyOf(token); } else { finalLastToken = BytesRef.DeepCopyOf(lastTokenFragment); } Debug.Assert(finalLastToken.Offset == 0); CharsRef spare = new CharsRef(); // complete top-N Util.Fst.Util.TopResults <long?> completions = null; try { // Because we store multiple models in one FST // (1gram, 2gram, 3gram), we must restrict the // search so that it only considers the current // model. 
For highest order model, this is not // necessary since all completions in the FST // must be from this model, but for lower order // models we have to filter out the higher order // ones: // Must do num+seen.size() for queue depth because we may // reject up to seen.size() paths in acceptResult(): Util.Fst.Util.TopNSearcher <long?> searcher = new TopNSearcherAnonymousInnerClassHelper(this, fst, num, num + seen.Count, weightComparer, seen, finalLastToken); // since this search is initialized with a single start node // it is okay to start with an empty input path here searcher.AddStartPaths(arc, prefixOutput, true, new Int32sRef()); completions = searcher.Search(); Debug.Assert(completions.IsComplete); } catch (IOException bogus) { throw new Exception(bogus.ToString(), bogus); } int prefixLength = token.Length; BytesRef suffix = new BytesRef(8); //System.out.println(" " + completions.length + " completions"); foreach (Util.Fst.Util.Result <long?> completion in completions) { token.Length = prefixLength; // append suffix Util.Fst.Util.ToBytesRef(completion.Input, suffix); token.Append(suffix); //System.out.println(" completion " + token.utf8ToString()); // Skip this path if a higher-order model already // saw/predicted its last token: BytesRef lastToken = token; for (int i = token.Length - 1; i >= 0; i--) { if (token.Bytes[token.Offset + i] == separator) { Debug.Assert(token.Length - i - 1 > 0); lastToken = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1); break; } } if (seen.Contains(lastToken)) { //System.out.println(" skip dup " + lastToken.utf8ToString()); goto nextCompletionContinue; } seen.Add(BytesRef.DeepCopyOf(lastToken)); spare.Grow(token.Length); UnicodeUtil.UTF8toUTF16(token, spare); LookupResult result = new LookupResult(spare.ToString(), // LUCENENET NOTE: We need to calculate this as decimal because when using double it can sometimes // return numbers that are greater than long.MaxValue, which results in a negative long number. (long)(long.MaxValue * (decimal)backoff * ((decimal)DecodeWeight(completion.Output)) / contextCount)); results.Add(result); Debug.Assert(results.Count == seen.Count); //System.out.println(" add result=" + result); nextCompletionContinue :; } backoff *= ALPHA; } results.Sort(new ComparerAnonymousInnerClassHelper(this)); if (results.Count > num) { results.SubList(num, results.Count).Clear(); } return(results); } finally { IOUtils.DisposeWhileHandlingException(ts); } }
internal TestTokenStream() { termAtt = AddAttribute<ITermAttribute>(); offsetAtt = AddAttribute<IOffsetAttribute>(); typeAtt = AddAttribute<ITypeAttribute>(); payloadAtt = AddAttribute<IPayloadAttribute>(); posIncAtt = AddAttribute<IPositionIncrementAttribute>(); flagsAtt = AddAttribute<IFlagsAttribute>(); }
protected internal RandomTokenStream(BaseTermVectorsFormatTestCase outerInstance, int len, string[] sampleTerms, BytesRef[] sampleTermBytes, bool offsetsGoBackwards) { this.OuterInstance = outerInstance; Terms = new string[len]; TermBytes = new BytesRef[len]; PositionsIncrements = new int[len]; Positions = new int[len]; StartOffsets = new int[len]; EndOffsets = new int[len]; Payloads = new BytesRef[len]; for (int i = 0; i < len; ++i) { int o = Random().Next(sampleTerms.Length); Terms[i] = sampleTerms[o]; TermBytes[i] = sampleTermBytes[o]; PositionsIncrements[i] = TestUtil.NextInt(Random(), i == 0 ? 1 : 0, 10); if (offsetsGoBackwards) { StartOffsets[i] = Random().Next(); EndOffsets[i] = Random().Next(); } else { if (i == 0) { StartOffsets[i] = TestUtil.NextInt(Random(), 0, 1 << 16); } else { StartOffsets[i] = StartOffsets[i - 1] + TestUtil.NextInt(Random(), 0, Rarely() ? 1 << 16 : 20); } EndOffsets[i] = StartOffsets[i] + TestUtil.NextInt(Random(), 0, Rarely() ? 1 << 10 : 20); } } for (int i = 0; i < len; ++i) { if (i == 0) { Positions[i] = PositionsIncrements[i] - 1; } else { Positions[i] = Positions[i - 1] + PositionsIncrements[i]; } } if (Rarely()) { Arrays.Fill(Payloads, outerInstance.RandomPayload()); } else { for (int i = 0; i < len; ++i) { Payloads[i] = outerInstance.RandomPayload(); } } PositionToTerms = new Dictionary <int?, ISet <int?> >(len); StartOffsetToTerms = new Dictionary <int?, ISet <int?> >(len); for (int i = 0; i < len; ++i) { if (!PositionToTerms.ContainsKey(Positions[i])) { PositionToTerms[Positions[i]] = new HashSet <int?>();//size1 } PositionToTerms[Positions[i]].Add(i); if (!StartOffsetToTerms.ContainsKey(StartOffsets[i])) { StartOffsetToTerms[StartOffsets[i]] = new HashSet <int?>();//size1 } StartOffsetToTerms[StartOffsets[i]].Add(i); } Freqs = new Dictionary <string, int?>(); foreach (string term in Terms) { if (Freqs.ContainsKey(term)) { Freqs[term] = Freqs[term] + 1; } else { Freqs[term] = 1; } } AddAttributeImpl(new PermissiveOffsetAttributeImpl()); TermAtt = AddAttribute <ICharTermAttribute>(); PiAtt = AddAttribute <IPositionIncrementAttribute>(); OAtt = AddAttribute <IOffsetAttribute>(); PAtt = AddAttribute <IPayloadAttribute>(); }
protected override IQueryNode PostProcessNode(IQueryNode node) { if (node is ITextableQueryNode && !(node is WildcardQueryNode) && !(node is FuzzyQueryNode) && !(node is RegexpQueryNode) && !(node.Parent is IRangeQueryNode)) { FieldQueryNode fieldNode = ((FieldQueryNode)node); string text = fieldNode.GetTextAsString(); string field = fieldNode.GetFieldAsString(); CachingTokenFilter buffer = null; IPositionIncrementAttribute posIncrAtt = null; int numTokens = 0; int positionCount = 0; bool severalTokensAtSamePosition = false; TokenStream source = null; try { source = this.analyzer.GetTokenStream(field, text); source.Reset(); buffer = new CachingTokenFilter(source); if (buffer.HasAttribute <IPositionIncrementAttribute>()) { posIncrAtt = buffer.GetAttribute <IPositionIncrementAttribute>(); } try { while (buffer.IncrementToken()) { numTokens++; int positionIncrement = (posIncrAtt != null) ? posIncrAtt .PositionIncrement : 1; if (positionIncrement != 0) { positionCount += positionIncrement; } else { severalTokensAtSamePosition = true; } } } #pragma warning disable 168 catch (IOException e) #pragma warning restore 168 { // ignore } } catch (IOException e) { throw new Exception(e.ToString(), e); } finally { IOUtils.DisposeWhileHandlingException(source); } // rewind the buffer stream buffer.Reset(); if (!buffer.HasAttribute <ICharTermAttribute>()) { return(new NoTokenFoundQueryNode()); } ICharTermAttribute termAtt = buffer.GetAttribute <ICharTermAttribute>(); if (numTokens == 0) { return(new NoTokenFoundQueryNode()); } else if (numTokens == 1) { string term = null; try { bool hasNext; hasNext = buffer.IncrementToken(); if (Debugging.AssertsEnabled) { Debugging.Assert(hasNext == true); } term = termAtt.ToString(); } catch (IOException) // LUCENENET: IDE0059: Remove unnecessary value assignment { // safe to ignore, because we know the number of tokens } fieldNode.Text = term.AsCharSequence(); return(fieldNode); } else if (severalTokensAtSamePosition || !(node is QuotedFieldQueryNode)) { if (positionCount == 1 || !(node is QuotedFieldQueryNode)) { // no phrase query: if (positionCount == 1) { // simple case: only one position, with synonyms List <IQueryNode> children = new List <IQueryNode>(); for (int i = 0; i < numTokens; i++) { string term = null; try { bool hasNext = buffer.IncrementToken(); if (Debugging.AssertsEnabled) { Debugging.Assert(hasNext == true); } term = termAtt.ToString(); } catch (IOException) // LUCENENET: IDE0059: Remove unnecessary value assignment { // safe to ignore, because we know the number of tokens } children.Add(new FieldQueryNode(field, term, -1, -1)); } return(new GroupQueryNode( new StandardBooleanQueryNode(children, positionCount == 1))); } else { // multiple positions IQueryNode q = new StandardBooleanQueryNode(Collections.EmptyList <IQueryNode>(), false); IQueryNode currentQuery = null; for (int i = 0; i < numTokens; i++) { string term = null; try { bool hasNext = buffer.IncrementToken(); if (Debugging.AssertsEnabled) { Debugging.Assert(hasNext == true); } term = termAtt.ToString(); } catch (IOException) // LUCENENET: IDE0059: Remove unnecessary value assignment { // safe to ignore, because we know the number of tokens } if (posIncrAtt != null && posIncrAtt.PositionIncrement == 0) { if (!(currentQuery is BooleanQueryNode)) { IQueryNode t = currentQuery; currentQuery = new StandardBooleanQueryNode(Collections.EmptyList <IQueryNode>(), true); ((BooleanQueryNode)currentQuery).Add(t); } ((BooleanQueryNode)currentQuery).Add(new FieldQueryNode(field, term, -1, -1)); } 
else { if (currentQuery != null) { if (this.defaultOperator == Operator.OR) { q.Add(currentQuery); } else { q.Add(new ModifierQueryNode(currentQuery, Modifier.MOD_REQ)); } } currentQuery = new FieldQueryNode(field, term, -1, -1); } } if (this.defaultOperator == Operator.OR) { q.Add(currentQuery); } else { q.Add(new ModifierQueryNode(currentQuery, Modifier.MOD_REQ)); } if (q is BooleanQueryNode) { q = new GroupQueryNode(q); } return(q); } } else { // phrase query: MultiPhraseQueryNode mpq = new MultiPhraseQueryNode(); List <FieldQueryNode> multiTerms = new List <FieldQueryNode>(); int position = -1; int i = 0; int termGroupCount = 0; for (; i < numTokens; i++) { string term = null; int positionIncrement = 1; try { bool hasNext = buffer.IncrementToken(); if (Debugging.AssertsEnabled) { Debugging.Assert(hasNext == true); } term = termAtt.ToString(); if (posIncrAtt != null) { positionIncrement = posIncrAtt.PositionIncrement; } } catch (IOException) // LUCENENET: IDE0059: Remove unnecessary value assignment { // safe to ignore, because we know the number of tokens } if (positionIncrement > 0 && multiTerms.Count > 0) { foreach (FieldQueryNode termNode in multiTerms) { if (this.positionIncrementsEnabled) { termNode.PositionIncrement = position; } else { termNode.PositionIncrement = termGroupCount; } mpq.Add(termNode); } // Only increment once for each "group" of // terms that were in the same position: termGroupCount++; multiTerms.Clear(); } position += positionIncrement; multiTerms.Add(new FieldQueryNode(field, term, -1, -1)); } foreach (FieldQueryNode termNode in multiTerms) { if (this.positionIncrementsEnabled) { termNode.PositionIncrement = position; } else { termNode.PositionIncrement = termGroupCount; } mpq.Add(termNode); } return(mpq); } } else { TokenizedPhraseQueryNode pq = new TokenizedPhraseQueryNode(); int position = -1; for (int i = 0; i < numTokens; i++) { string term = null; int positionIncrement = 1; try { bool hasNext = buffer.IncrementToken(); if (Debugging.AssertsEnabled) { Debugging.Assert(hasNext == true); } term = termAtt.ToString(); if (posIncrAtt != null) { positionIncrement = posIncrAtt.PositionIncrement; } } catch (IOException) // LUCENENET: IDE0059: Remove unnecessary value assignment { // safe to ignore, because we know the number of tokens } FieldQueryNode newFieldNode = new FieldQueryNode(field, term, -1, -1); if (this.positionIncrementsEnabled) { position += positionIncrement; newFieldNode.PositionIncrement = position; } else { newFieldNode.PositionIncrement = i; } pq.Add(newFieldNode); } return(pq); } } return(node); }
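The branching above turns entirely on IPositionIncrementAttribute: an increment of 0 marks a token stacked on the previous position (typically a synonym), which is what sets severalTokensAtSamePosition and steers the processor toward a boolean or multi-phrase node instead of a plain term or phrase query. A minimal sketch of that inspection outside the processor, using a hypothetical helper name and making no claim about which analyzer is actually configured:

using System;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.TokenAttributes;

public static class PositionIncrementDump
{
    // Hypothetical helper: prints each term with the position it would occupy,
    // reading the same attributes the processor above reads.
    public static void Dump(Analyzer analyzer, string field, string text)
    {
        using (TokenStream ts = analyzer.GetTokenStream(field, text))
        {
            ICharTermAttribute termAtt = ts.AddAttribute<ICharTermAttribute>();
            IPositionIncrementAttribute posIncAtt = ts.AddAttribute<IPositionIncrementAttribute>();
            ts.Reset();
            int position = -1;
            while (ts.IncrementToken())
            {
                // An increment of 0 keeps the token at the previous position --
                // the "several tokens at same position" case handled above.
                position += posIncAtt.PositionIncrement;
                Console.WriteLine("{0} @ {1}", termAtt, position);
            }
            ts.End();
        }
    }
}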
public MyTokenStream() { TermAtt = AddAttribute<ICharTermAttribute>(); PosIncAtt = AddAttribute<IPositionIncrementAttribute>(); }
public OverlappingTokenStream() { termAttribute = AddAttribute<ICharTermAttribute>(); offsetAttribute = AddAttribute<IOffsetAttribute>(); positionIncrementAttribute = AddAttribute<IPositionIncrementAttribute>(); }
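Constructors like the two above only wire up the attributes; the behaviour lives in IncrementToken. As a rough sketch of how a stream that emits overlapping tokens typically drives those attributes (the class name and token data below are illustrative, not the actual test fixture):

using Lucene.Net.Analysis;
using Lucene.Net.Analysis.TokenAttributes;

// Illustrative only: emits "wifi" and "wi-fi" stacked at the same position.
public sealed class StackedTokenStream : TokenStream
{
    private static readonly string[] TermData = { "wifi", "wi-fi" };
    private static readonly int[] IncrementData = { 1, 0 }; // 0 = same position as the previous token

    private int index;
    private readonly ICharTermAttribute termAtt;
    private readonly IPositionIncrementAttribute posIncAtt;

    public StackedTokenStream()
    {
        termAtt = AddAttribute<ICharTermAttribute>();
        posIncAtt = AddAttribute<IPositionIncrementAttribute>();
    }

    public override bool IncrementToken()
    {
        if (index >= TermData.Length) return false;
        ClearAttributes();
        termAtt.SetEmpty().Append(TermData[index]);
        posIncAtt.PositionIncrement = IncrementData[index];
        index++;
        return true;
    }

    public override void Reset()
    {
        base.Reset();
        index = 0;
    }
}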
/// <summary> /// Not an explicit test, just useful to print out some info on performance /// </summary> public virtual void Performance() { int[] tokCount = new int[] { 100, 500, 1000, 2000, 5000, 10000 }; int[] modCounts = new int[] { 1, 2, 5, 10, 20, 50, 100, 200, 500 }; for (int k = 0; k < tokCount.Length; k++) { StringBuilder buffer = new StringBuilder(); Console.WriteLine("-----Tokens: " + tokCount[k] + "-----"); for (int i = 0; i < tokCount[k]; i++) { //buffer.Append(English.intToEnglish(i).toUpperCase(Locale.ROOT)).Append(' '); buffer.Append(i.ToString(CultureInfo.InvariantCulture)).Append(' '); } //make sure we produce the same tokens TeeSinkTokenFilter teeStream = new TeeSinkTokenFilter(new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString())))); TokenStream sink = teeStream.NewSinkTokenStream(new ModuloSinkFilter(this, 100)); teeStream.ConsumeAllTokens(); TokenStream stream = new ModuloTokenFilter(this, new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString()))), 100); ICharTermAttribute tfTok = stream.AddAttribute <ICharTermAttribute>(); ICharTermAttribute sinkTok = sink.AddAttribute <ICharTermAttribute>(); for (int i = 0; stream.IncrementToken(); i++) { assertTrue(sink.IncrementToken()); assertTrue(tfTok + " is not equal to " + sinkTok + " at token: " + i, tfTok.Equals(sinkTok) == true); } //simulate two fields, each being analyzed once, for 20 documents for (int j = 0; j < modCounts.Length; j++) { int tfPos = 0; //long start = DateTimeHelperClass.CurrentUnixTimeMillis(); long start = Environment.TickCount; for (int i = 0; i < 20; i++) { stream = new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString()))); IPositionIncrementAttribute posIncrAtt = stream.GetAttribute <IPositionIncrementAttribute>(); while (stream.IncrementToken()) { tfPos += posIncrAtt.PositionIncrement; } stream = new ModuloTokenFilter(this, new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString()))), modCounts[j]); posIncrAtt = stream.GetAttribute <IPositionIncrementAttribute>(); while (stream.IncrementToken()) { tfPos += posIncrAtt.PositionIncrement; } } //long finish = DateTimeHelperClass.CurrentUnixTimeMillis(); long finish = Environment.TickCount; Console.WriteLine("ModCount: " + modCounts[j] + " Two fields took " + (finish - start) + " ms"); int sinkPos = 0; //simulate one field with one sink //start = DateTimeHelperClass.CurrentUnixTimeMillis(); start = Environment.TickCount; for (int i = 0; i < 20; i++) { teeStream = new TeeSinkTokenFilter(new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString())))); sink = teeStream.NewSinkTokenStream(new ModuloSinkFilter(this, modCounts[j])); IPositionIncrementAttribute posIncrAtt = teeStream.GetAttribute <IPositionIncrementAttribute>(); while (teeStream.IncrementToken()) { sinkPos += posIncrAtt.PositionIncrement; } //System.out.println("Modulo--------"); posIncrAtt = sink.GetAttribute <IPositionIncrementAttribute>(); while (sink.IncrementToken()) { sinkPos += posIncrAtt.PositionIncrement; } } //finish = DateTimeHelperClass.CurrentUnixTimeMillis(); finish = Environment.TickCount; Console.WriteLine("ModCount: " + modCounts[j] + " Tee fields took " + (finish - start) + " ms"); assertTrue(sinkPos + " does not equal: " + tfPos, sinkPos == tfPos); } 
Console.WriteLine("- End Tokens: " + tokCount[k] + "-----"); } }
public void TestPayloads() { Directory dir = NewDirectory(); RandomIndexWriter writer = new RandomIndexWriter( #if FEATURE_INSTANCE_TESTDATA_INITIALIZATION this, #endif Random, dir); FieldType myFieldType = new FieldType(TextField.TYPE_NOT_STORED); myFieldType.StoreTermVectors = (true); myFieldType.StoreTermVectorOffsets = (true); myFieldType.StoreTermVectorPositions = (true); myFieldType.StoreTermVectorPayloads = (true); curOffset = 0; Token[] tokens = new Token[] { getToken("foxes"), getToken("can"), getToken("jump"), getToken("high") }; Document doc = new Document(); doc.Add(new Field("field", new CannedTokenStream(tokens), myFieldType)); writer.AddDocument(doc); IndexReader reader = writer.GetReader(); writer.Dispose(); assertEquals(1, reader.NumDocs); for (int i = 0; i < 2; i++) { // Do this twice, once passing true and then passing // false: they are entirely different code paths // under-the-hood: TokenStream ts = TokenSources.GetTokenStream(reader.GetTermVectors(0).GetTerms("field"), i == 0); ICharTermAttribute termAtt = ts.GetAttribute <ICharTermAttribute>(); IPositionIncrementAttribute posIncAtt = ts.GetAttribute <IPositionIncrementAttribute>(); IOffsetAttribute offsetAtt = ts.GetAttribute <IOffsetAttribute>(); IPayloadAttribute payloadAtt = ts.GetAttribute <IPayloadAttribute>(); foreach (Token token in tokens) { assertTrue(ts.IncrementToken()); assertEquals(token.toString(), termAtt.toString()); assertEquals(token.PositionIncrement, posIncAtt.PositionIncrement); assertEquals(token.Payload, payloadAtt.Payload); assertEquals(token.StartOffset, offsetAtt.StartOffset); assertEquals(token.EndOffset, offsetAtt.EndOffset); } assertFalse(ts.IncrementToken()); } reader.Dispose(); dir.Dispose(); }
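The getToken helper is referenced but not shown in this snippet; one plausible shape, assuming it builds payload-carrying tokens with running offsets tracked by the curOffset field (the payload contents and offset arithmetic here are assumptions, not the actual helper):

// Assumed shape of the helper above (Lucene.Net.Analysis.Token, Lucene.Net.Util.BytesRef):
// each token gets sequential offsets and a payload derived from its text.
private Token getToken(string text)
{
    var token = new Token(text, curOffset, curOffset + text.Length);
    token.Payload = new BytesRef(text);
    curOffset += text.Length + 1; // assume a single separating space
    return token;
}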
protected internal RandomTokenStream(BaseTermVectorsFormatTestCase outerInstance, int len, string[] sampleTerms, BytesRef[] sampleTermBytes, bool offsetsGoBackwards) { this.OuterInstance = outerInstance; Terms = new string[len]; TermBytes = new BytesRef[len]; PositionsIncrements = new int[len]; Positions = new int[len]; StartOffsets = new int[len]; EndOffsets = new int[len]; Payloads = new BytesRef[len]; for (int i = 0; i < len; ++i) { int o = Random().Next(sampleTerms.Length); Terms[i] = sampleTerms[o]; TermBytes[i] = sampleTermBytes[o]; PositionsIncrements[i] = TestUtil.NextInt(Random(), i == 0 ? 1 : 0, 10); if (offsetsGoBackwards) { StartOffsets[i] = Random().Next(); EndOffsets[i] = Random().Next(); } else { if (i == 0) { StartOffsets[i] = TestUtil.NextInt(Random(), 0, 1 << 16); } else { StartOffsets[i] = StartOffsets[i - 1] + TestUtil.NextInt(Random(), 0, Rarely() ? 1 << 16 : 20); } EndOffsets[i] = StartOffsets[i] + TestUtil.NextInt(Random(), 0, Rarely() ? 1 << 10 : 20); } } for (int i = 0; i < len; ++i) { if (i == 0) { Positions[i] = PositionsIncrements[i] - 1; } else { Positions[i] = Positions[i - 1] + PositionsIncrements[i]; } } if (Rarely()) { Arrays.Fill(Payloads, outerInstance.RandomPayload()); } else { for (int i = 0; i < len; ++i) { Payloads[i] = outerInstance.RandomPayload(); } } PositionToTerms = new Dictionary<int?, ISet<int?>>(len); StartOffsetToTerms = new Dictionary<int?, ISet<int?>>(len); for (int i = 0; i < len; ++i) { if (!PositionToTerms.ContainsKey(Positions[i])) { PositionToTerms[Positions[i]] = new HashSet<int?>();//size1 } PositionToTerms[Positions[i]].Add(i); if (!StartOffsetToTerms.ContainsKey(StartOffsets[i])) { StartOffsetToTerms[StartOffsets[i]] = new HashSet<int?>();//size1 } StartOffsetToTerms[StartOffsets[i]].Add(i); } Freqs = new Dictionary<string, int?>(); foreach (string term in Terms) { if (Freqs.ContainsKey(term)) { Freqs[term] = Freqs[term] + 1; } else { Freqs[term] = 1; } } AddAttributeImpl(new PermissiveOffsetAttributeImpl()); TermAtt = AddAttribute<ICharTermAttribute>(); PiAtt = AddAttribute<IPositionIncrementAttribute>(); OAtt = AddAttribute<IOffsetAttribute>(); PAtt = AddAttribute<IPayloadAttribute>(); }
public SinglePositionTokenStream(string word) { termAtt = AddAttribute<ICharTermAttribute>(); posIncrAtt = AddAttribute<IPositionIncrementAttribute>(); this.word = word; returned = true; }
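The fields wired up above imply the rest of the stream: returned starts as true, so something must arm the stream with a word and a position before each use. A sketch of that completion -- the Set method and pos field are assumptions; only word, returned, and the two attributes appear in the snippet above:

// Assumed completion of SinglePositionTokenStream: emit the single word once,
// with its position encoded as the position increment.
private int pos; // assumed field

public void Set(string word, int pos) // assumed re-arming method
{
    this.word = word;
    this.pos = pos;
    returned = false;
}

public override bool IncrementToken()
{
    if (returned)
    {
        return false; // the single token has already been emitted
    }
    ClearAttributes();
    termAtt.SetEmpty().Append(word);
    posIncrAtt.PositionIncrement = pos;
    returned = true;
    return true;
}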