예제 #1
0
 public FilteringTokenFilter(Lucene.Net.Util.LuceneVersion version, bool enablePositionIncrements, TokenStream input)
     : this(version, input)
 {
     posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
     CheckPositionIncrement(version, enablePositionIncrements);
     this.enablePositionIncrements = enablePositionIncrements;
 }
예제 #2
0
 /// <summary>
 /// Create a new MockTokenFilter.
 /// </summary>
 /// <param name="input"> TokenStream to filter </param>
 /// <param name="filter"> DFA representing the terms that should be removed. </param>
 public MockTokenFilter(TokenStream input, CharacterRunAutomaton filter)
     : base(input)
 {
     this.Filter = filter;
     TermAtt = AddAttribute<ICharTermAttribute>();
     PosIncrAtt = AddAttribute<IPositionIncrementAttribute>();
 }
 public MockHoleInjectingTokenFilter(Random random, TokenStream @in)
     : base(@in)
 {
     RandomSeed = random.Next();
     PosIncAtt = AddAttribute<IPositionIncrementAttribute>();
     PosLenAtt = AddAttribute<IPositionLengthAttribute>();
 }
 public CannedTokenizer(System.IO.TextReader reader, TokenAndPos[] tokens)
     : base(reader)
 {
     this.tokens = tokens;
     this.termAtt = AddAttribute<ICharTermAttribute>();
     this.posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
 }
 /// <summary>
 /// Create a new <seealso cref="ASCIIFoldingFilter"/>.
 /// </summary>
 /// <param name="input">
 ///          TokenStream to filter </param>
 /// <param name="preserveOriginal">
 ///          should the original tokens be kept on the input stream with a 0 position increment
 ///          from the folded tokens?
 ///  </param>
 public ASCIIFoldingFilter(TokenStream input, bool preserveOriginal)
     : base(input)
 {
     this.preserveOriginal = preserveOriginal;
     termAtt = AddAttribute<ICharTermAttribute>();
     posIncAttr = AddAttribute<IPositionIncrementAttribute>();
 }
 /// <summary>
 /// Create a new <seealso cref="FilteringTokenFilter"/>. </summary>
 /// <param name="version"> the Lucene match version </param>
 /// <param name="in">      the <seealso cref="TokenStream"/> to consume </param>
 public FilteringTokenFilter(LuceneVersion version, TokenStream @in)
     : base(@in)
 {
     posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
     this.version = version;
     this.enablePositionIncrements = true;
 }
예제 #7
0
 public CamelCaseFilter(TokenStream stream)
     : base(stream)
 {
     _termAttribute = AddAttribute<ITermAttribute>();
     _offsetAttribute = AddAttribute<IOffsetAttribute>();
     _positionIncrementAttribute = AddAttribute<IPositionIncrementAttribute>();
 }
 void Init()
 {
     InitPanGuSegment();
     termAtt = AddAttribute<ITermAttribute>();
     offsetAtt = AddAttribute<IOffsetAttribute>();
     posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
     typeAtt = AddAttribute<ITypeAttribute>();
 }
예제 #9
0
 /// <summary>
 /// 
 /// </summary>
 /// <remarks></remarks>
 /// <seealso cref=""/>
 /// <param name="input"></param>
 /// <param name="synonymEngine"></param>
 /// <return></return>
 public SynonymFilter(TokenStream input, ISynonymEngine synonymEngine)
     : base(input)
 {
     synonymStack = new Stack<String>();
     this.engine = synonymEngine;
     this.termAtt = AddAttribute<ITermAttribute>();
     this.posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
 }
예제 #10
0
 public GraphTokenizer(TextReader input)
     : base(input)
 {
     TermAtt = AddAttribute<ICharTermAttribute>();
     OffsetAtt = AddAttribute<IOffsetAttribute>();
     PosIncrAtt = AddAttribute<IPositionIncrementAttribute>();
     PosLengthAtt = AddAttribute<IPositionLengthAttribute>();
 }
예제 #11
0
        /// <summary>
        ///   Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using
        ///   affix rules in the provided HunspellDictionary.
        /// </summary>
        /// <param name="input">TokenStream whose tokens will be stemmed.</param>
        /// <param name="dictionary">HunspellDictionary containing the affix rules and words that will be used to stem the tokens.</param>
        /// <param name="dedup">true if only unique terms should be output.</param>
        public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, Boolean dedup = true)
            : base(input)
        {
            _posIncAtt = AddAttribute<IPositionIncrementAttribute>();
            _termAtt = AddAttribute<ITermAttribute>();

            _dedup = dedup;
            _stemmer = new HunspellStemmer(dictionary);
        }
예제 #12
0
 /// <summary>
 /// Sole constructor. </summary>
 public SuggestStopFilter(TokenStream input, CharArraySet stopWords)
     : base(input)
 {
     this.stopWords = stopWords;
     this.termAtt = AddAttribute<ICharTermAttribute>();
     this.posIncAtt = AddAttribute<IPositionIncrementAttribute>();
     this.keywordAtt = AddAttribute<IKeywordAttribute>();
     this.offsetAtt = AddAttribute<IOffsetAttribute>();
 }
예제 #13
0
 public CannedBinaryTokenStream(params BinaryToken[] tokens)
     : base()
 {
     this.Tokens = tokens;
     TermAtt = AddAttribute<IBinaryTermAttribute>();
     PosIncrAtt = AddAttribute<IPositionIncrementAttribute>();
     PosLengthAtt = AddAttribute<IPositionLengthAttribute>();
     OffsetAtt = AddAttribute<IOffsetAttribute>();
 }
예제 #14
0
 public MockPayloadFilter(TokenStream input, string fieldName)
     : base(input)
 {
     this.FieldName = fieldName;
     Pos = 0;
     i = 0;
     PosIncrAttr = input.AddAttribute<IPositionIncrementAttribute>();
     PayloadAttr = input.AddAttribute<IPayloadAttribute>();
     TermAttr = input.AddAttribute<ICharTermAttribute>();
 }
예제 #15
0
 /// <summary>
 /// Construct a token stream filtering the given input using a Set of common
 /// words to create bigrams. Outputs both unigrams with position increment and
 /// bigrams with position increment 0 type=gram where one or both of the words
 /// in a potential bigram are in the set of common words .
 /// </summary>
 /// <param name="input"> TokenStream input in filter chain </param>
 /// <param name="commonWords"> The set of common words. </param>
 public CommonGramsFilter(LuceneVersion matchVersion, TokenStream input, CharArraySet commonWords)
     : base(input)
 {
     termAttribute = AddAttribute<ICharTermAttribute>();
     offsetAttribute = AddAttribute<IOffsetAttribute>();
     typeAttribute = AddAttribute<ITypeAttribute>();
     posIncAttribute = AddAttribute<IPositionIncrementAttribute>();
     posLenAttribute = AddAttribute<IPositionLengthAttribute>();
     this.commonWords = commonWords;
 }
        public ExpandAcronymsFilter(TokenStream input, IAcronymExpansionProvider acronymExpansionProvider)
            : base(input)
        {
            _acronymExpansionProvider = acronymExpansionProvider;

            _termAttribute = AddAttribute<ITermAttribute>();
            _positionIncrementAttribute = AddAttribute<IPositionIncrementAttribute>();
            _tokenSet = new Queue<string>();
            _recognizedTokens = new HashSet<string>();
        }
 /// <summary>
 /// Build a filter that limits the maximum position of tokens to emit.
 /// </summary>
 /// <param name="in"> the stream to wrap </param>
 /// <param name="maxTokenPosition"> max position of tokens to produce (1st token always has position 1) </param>
 /// <param name="consumeAllTokens"> whether all tokens from the wrapped input stream must be consumed
 ///                         even if maxTokenPosition is exceeded. </param>
 public LimitTokenPositionFilter(TokenStream @in, int maxTokenPosition, bool consumeAllTokens)
     : base(@in)
 {
     if (maxTokenPosition < 1)
     {
         throw new System.ArgumentException("maxTokenPosition must be greater than zero");
     }
     this.maxTokenPosition = maxTokenPosition;
     this.consumeAllTokens = consumeAllTokens;
     posIncAtt = AddAttribute<IPositionIncrementAttribute>();
 }
예제 #18
0
 public IterTokenStream(params Token[] tokens)
         : base()
 {
     this.tokens = tokens;
     this.termAtt = AddAttribute<ICharTermAttribute>();
     this.offsetAtt = AddAttribute<IOffsetAttribute>();
     this.posIncAtt = AddAttribute<IPositionIncrementAttribute>();
     this.flagsAtt = AddAttribute<IFlagsAttribute>();
     this.typeAtt = AddAttribute<ITypeAttribute>();
     this.payloadAtt = AddAttribute<IPayloadAttribute>();
 }
예제 #19
0
        public ExpanderFilter(TokenStream input, [NotNull] Func<String, IEnumerable<Expansion>> expander, Boolean emitSource = true)
            : base(input)
        {
            if (expander == null)
                throw new ArgumentNullException("expander");

            _expander = expander;
            _emitSource = emitSource;
            _termAttr = AddAttribute<ITermAttribute>();
            _posAttr = AddAttribute<IPositionIncrementAttribute>();
            _typeAttr = AddAttribute<ITypeAttribute>();
        }
예제 #20
0
        private bool hasIllegalOffsets = false; // only if the length changed before this filter

        /// <summary>
        /// Creates a new ThaiWordFilter with the specified match version. </summary>
        public ThaiWordFilter(LuceneVersion matchVersion, TokenStream input)
              : base(matchVersion.OnOrAfter(LuceneVersion.LUCENE_31) ? input : new LowerCaseFilter(matchVersion, input))
        {
            if (!DBBI_AVAILABLE)
            {
                throw new System.NotSupportedException("This JRE does not have support for Thai segmentation");
            }
            handlePosIncr = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
            termAtt = AddAttribute<ICharTermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
            posAtt = AddAttribute<IPositionIncrementAttribute>();
        }
예제 #21
0
        private void Init(System.IO.TextReader input, HebMorph.StreamLemmatizer _lemmatizer,
            HebMorph.LemmaFilters.LemmaFilterBase _lemmaFilter, bool AlwaysSaveMarkedOriginal)
        {
			termAtt = AddAttribute <ITermAttribute>();
	        offsetAtt = AddAttribute<IOffsetAttribute>();
	        posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
			typeAtt = AddAttribute <ITypeAttribute>();
            //payAtt = (PayloadAttribute)AddAttribute(typeof(PayloadAttribute));

        	this.input = input;
            this._streamLemmatizer = _lemmatizer;
            this._streamLemmatizer.SetStream(input);
            this.alwaysSaveMarkedOriginal = AlwaysSaveMarkedOriginal;
            this.lemmaFilter = _lemmaFilter;
        }
		/// <summary> Construct a token stream filtering the given input.
		/// If <c>stopWords</c> is an instance of <see cref="CharArraySet" /> (true if
		/// <c>makeStopSet()</c> was used to construct the set) it will be directly used
		/// and <c>ignoreCase</c> will be ignored since <c>CharArraySet</c>
		/// directly controls case sensitivity.
		/// <p/>
		/// If <c>stopWords</c> is not an instance of <see cref="CharArraySet" />,
		/// a new CharArraySet will be constructed and <c>ignoreCase</c> will be
		/// used to specify the case sensitivity of that set.
		/// </summary>
		/// <param name="enablePositionIncrements">true if token positions should record the removed stop words</param>
		/// <param name="input">Input TokenStream</param>
		/// <param name="stopWords">A Set of strings or strings or char[] or any other ToString()-able set representing the stopwords</param>
        /// <param name="ignoreCase">if true, all words are lower cased first</param>
        public StopFilter(bool enablePositionIncrements, TokenStream input, ISet<string> stopWords, bool ignoreCase)
            : base(input)
		{
		    if (stopWords is CharArraySet)
		    {
		        this.stopWords = (CharArraySet) stopWords;
		    }
		    else
		    {
		        this.stopWords = new CharArraySet(stopWords.Count, ignoreCase);
		        this.stopWords.AddAll(stopWords);
		    }
		    this.enablePositionIncrements = enablePositionIncrements;
		    termAtt = AddAttribute<ITermAttribute>();
            posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
		}
예제 #23
0
 /// <summary>
 /// If inputText is non-null, and the TokenStream has
 ///  offsets, we include the surface form in each arc's
 ///  label.
 /// </summary>
 public TokenStreamToDot(string inputText, TokenStream @in, TextWriter @out)
 {
     this.@in = @in;
     this.@out = @out;
     this.InputText = inputText;
     TermAtt = @in.AddAttribute<ICharTermAttribute>();
     PosIncAtt = @in.AddAttribute<IPositionIncrementAttribute>();
     PosLengthAtt = @in.AddAttribute<IPositionLengthAttribute>();
     if (@in.HasAttribute<IOffsetAttribute>())
     {
         OffsetAtt = @in.AddAttribute<IOffsetAttribute>();
     }
     else
     {
         OffsetAtt = null;
     }
 }
예제 #24
0
        public virtual void TestFilterTokens()
        {
            SnowballFilter              filter     = new SnowballFilter(new TestTokenStream(this), "English");
            ICharTermAttribute          termAtt    = filter.GetAttribute <ICharTermAttribute>();
            IOffsetAttribute            offsetAtt  = filter.GetAttribute <IOffsetAttribute>();
            ITypeAttribute              typeAtt    = filter.GetAttribute <ITypeAttribute>();
            IPayloadAttribute           payloadAtt = filter.GetAttribute <IPayloadAttribute>();
            IPositionIncrementAttribute posIncAtt  = filter.GetAttribute <IPositionIncrementAttribute>();
            IFlagsAttribute             flagsAtt   = filter.GetAttribute <IFlagsAttribute>();

            filter.IncrementToken();

            assertEquals("accent", termAtt.ToString());
            assertEquals(2, offsetAtt.StartOffset);
            assertEquals(7, offsetAtt.EndOffset);
            assertEquals("wrd", typeAtt.Type);
            assertEquals(3, posIncAtt.PositionIncrement);
            assertEquals(77, flagsAtt.Flags);
            assertEquals(new BytesRef(new byte[] { 0, 1, 2, 3 }), payloadAtt.Payload);
        }
예제 #25
0
        private void Init(LuceneVersion version, Side side, int minGram, int maxGram)
        {
            //if (version == null)
            //{
            //    throw new ArgumentException("version must not be null");
            //}

            if (!Enum.IsDefined(typeof(Side), side))
            {
                throw new ArgumentException("sideLabel must be either front or back");
            }

            if (minGram < 1)
            {
                throw new ArgumentException("minGram must be greater than zero");
            }

            if (minGram > maxGram)
            {
                throw new ArgumentException("minGram must not be greater than maxGram");
            }

            if (version.OnOrAfter(LuceneVersion.LUCENE_44))
            {
                if (side == Side.BACK)
                {
                    throw new ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4");
                }
            }
            else
            {
                maxGram = Math.Min(maxGram, 1024);
            }

            this.minGram    = minGram;
            this.maxGram    = maxGram;
            this.side       = side;
            this.termAtt    = AddAttribute <ICharTermAttribute>();
            this.offsetAtt  = AddAttribute <IOffsetAttribute>();
            this.posIncrAtt = AddAttribute <IPositionIncrementAttribute>();
        }
예제 #26
0
        public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix)
            : base(suffix)
        {
            this.suffix = suffix;
            this.prefix = prefix;
            prefixExhausted = false;

            termAtt = AddAttribute<ICharTermAttribute>();
            posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            payloadAtt = AddAttribute<IPayloadAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
            typeAtt = AddAttribute<ITypeAttribute>();
            flagsAtt = AddAttribute<IFlagsAttribute>();

            p_termAtt = prefix.AddAttribute<ICharTermAttribute>();
            p_posIncrAtt = prefix.AddAttribute<IPositionIncrementAttribute>();
            p_payloadAtt = prefix.AddAttribute<IPayloadAttribute>();
            p_offsetAtt = prefix.AddAttribute<IOffsetAttribute>();
            p_typeAtt = prefix.AddAttribute<ITypeAttribute>();
            p_flagsAtt = prefix.AddAttribute<IFlagsAttribute>();
        }
예제 #27
0
        public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix)
            : base(suffix)
        {
            this.suffix     = suffix;
            this.prefix     = prefix;
            prefixExhausted = false;

            termAtt    = AddAttribute <ICharTermAttribute>();
            posIncrAtt = AddAttribute <IPositionIncrementAttribute>();
            payloadAtt = AddAttribute <IPayloadAttribute>();
            offsetAtt  = AddAttribute <IOffsetAttribute>();
            typeAtt    = AddAttribute <ITypeAttribute>();
            flagsAtt   = AddAttribute <IFlagsAttribute>();

            p_termAtt    = prefix.AddAttribute <ICharTermAttribute>();
            p_posIncrAtt = prefix.AddAttribute <IPositionIncrementAttribute>();
            p_payloadAtt = prefix.AddAttribute <IPayloadAttribute>();
            p_offsetAtt  = prefix.AddAttribute <IOffsetAttribute>();
            p_typeAtt    = prefix.AddAttribute <ITypeAttribute>();
            p_flagsAtt   = prefix.AddAttribute <IFlagsAttribute>();
        }
예제 #28
0
        private void DoTestStopPositons(StopFilter stpf, bool enableIcrements)
        {
            log("---> test with enable-increments-" + (enableIcrements ? "enabled" : "disabled"));
#pragma warning disable 612, 618
            stpf.SetEnablePositionIncrements(enableIcrements);
#pragma warning restore 612, 618
            ICharTermAttribute          termAtt    = stpf.GetAttribute <ICharTermAttribute>();
            IPositionIncrementAttribute posIncrAtt = stpf.GetAttribute <IPositionIncrementAttribute>();
            stpf.Reset();
            for (int i = 0; i < 20; i += 3)
            {
                assertTrue(stpf.IncrementToken());
                log("Token " + i + ": " + stpf);
                string w = English.Int32ToEnglish(i).Trim();
                assertEquals("expecting token " + i + " to be " + w, w, termAtt.ToString());
                assertEquals("all but first token must have position increment of 3", enableIcrements ? (i == 0 ? 1 : 3) : 1, posIncrAtt.PositionIncrement);
            }
            assertFalse(stpf.IncrementToken());
            stpf.End();
            stpf.Dispose();
        }
예제 #29
0
        /// <summary>
        /// Creates a new WordDelimiterFilter
        /// </summary>
        /// <param name="in"> TokenStream to be filtered </param>
        /// <param name="charTypeTable"> table containing character types </param>
        /// <param name="configurationFlags"> Flags configuring the filter </param>
        /// <param name="protWords"> If not null is the set of tokens to protect from being delimited </param>
        public WordDelimiterFilter(LuceneVersion matchVersion, TokenStream @in, sbyte[] charTypeTable, int configurationFlags, CharArraySet protWords)
              : base(@in)
        {
            if (!InstanceFieldsInitialized)
            {
                InitializeInstanceFields();
                InstanceFieldsInitialized = true;
            }
            if (!matchVersion.OnOrAfter(LuceneVersion.LUCENE_48))
            {
                throw new System.ArgumentException("This class only works with Lucene 4.8+. To emulate the old (broken) behavior of WordDelimiterFilter, use Lucene47WordDelimiterFilter");
            }
            this.flags = configurationFlags;
            this.protWords = protWords;
            this.iterator = new WordDelimiterIterator(charTypeTable, Has(SPLIT_ON_CASE_CHANGE), Has(SPLIT_ON_NUMERICS), Has(STEM_ENGLISH_POSSESSIVE));

            this.termAttribute = AddAttribute<ICharTermAttribute>();
            this.offsetAttribute = AddAttribute<IOffsetAttribute>();
            this.posIncAttribute = AddAttribute<IPositionIncrementAttribute>();
            this.typeAttribute = AddAttribute<ITypeAttribute>();
        }
예제 #30
0
        /// <summary>
        /// Creates a new WordDelimiterFilter
        /// </summary>
        /// <param name="matchVersion"> lucene compatibility version </param>
        /// <param name="in"> TokenStream to be filtered </param>
        /// <param name="charTypeTable"> table containing character types </param>
        /// <param name="configurationFlags"> Flags configuring the filter </param>
        /// <param name="protWords"> If not null is the set of tokens to protect from being delimited </param>
        public WordDelimiterFilter(LuceneVersion matchVersion, TokenStream @in, byte[] charTypeTable, WordDelimiterFlags configurationFlags, CharArraySet protWords)
            : base(@in)
        {
            this.termAttribute   = AddAttribute <ICharTermAttribute>();
            this.offsetAttribute = AddAttribute <IOffsetAttribute>();
            this.posIncAttribute = AddAttribute <IPositionIncrementAttribute>();
            this.typeAttribute   = AddAttribute <ITypeAttribute>();
            concat    = new WordDelimiterConcatenation(this);
            concatAll = new WordDelimiterConcatenation(this);
            sorter    = new OffsetSorter(this);

            if (!matchVersion.OnOrAfter(LuceneVersion.LUCENE_48))
            {
                throw new ArgumentException("This class only works with Lucene 4.8+. To emulate the old (broken) behavior of WordDelimiterFilter, use Lucene47WordDelimiterFilter");
            }
            this.flags     = configurationFlags;
            this.protWords = protWords;
            this.iterator  = new WordDelimiterIterator(charTypeTable,
                                                       Has(WordDelimiterFlags.SPLIT_ON_CASE_CHANGE),
                                                       Has(WordDelimiterFlags.SPLIT_ON_NUMERICS),
                                                       Has(WordDelimiterFlags.STEM_ENGLISH_POSSESSIVE));
        }
예제 #31
0
        public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix) : base(suffix)
        {
            Suffix           = suffix;
            Prefix           = prefix;
            _prefixExhausted = false;

            // ReSharper disable DoNotCallOverridableMethodsInConstructor
            _termAtt    = AddAttribute <ITermAttribute>();
            _posIncrAtt = AddAttribute <IPositionIncrementAttribute>();
            _payloadAtt = AddAttribute <IPayloadAttribute>();
            _offsetAtt  = AddAttribute <IOffsetAttribute>();
            _typeAtt    = AddAttribute <ITypeAttribute>();
            _flagsAtt   = AddAttribute <IFlagsAttribute>();
            // ReSharper restore DoNotCallOverridableMethodsInConstructor

            _pTermAtt    = prefix.AddAttribute <ITermAttribute>();
            _pPosIncrAtt = prefix.AddAttribute <IPositionIncrementAttribute>();
            _pPayloadAtt = prefix.AddAttribute <IPayloadAttribute>();
            _pOffsetAtt  = prefix.AddAttribute <IOffsetAttribute>();
            _pTypeAtt    = prefix.AddAttribute <ITypeAttribute>();
            _pFlagsAtt   = prefix.AddAttribute <IFlagsAttribute>();
        }
예제 #32
0
        public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, Side side, int minGram, int maxGram)
            : base(input)
        {
            //if (version == null)
            //{
            //    throw new ArgumentException("version must not be null");
            //}

            if (version.OnOrAfter(LuceneVersion.LUCENE_44) && side == Side.BACK)
            {
                throw new ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4, use ReverseStringFilter up-front and afterward");
            }

            if (!Enum.IsDefined(typeof(Side), side))
            {
                throw new ArgumentException("sideLabel must be either front or back");
            }

            if (minGram < 1)
            {
                throw new ArgumentException("minGram must be greater than zero");
            }

            if (minGram > maxGram)
            {
                throw new ArgumentException("minGram must not be greater than maxGram");
            }

            this.version   = version;
            this.charUtils = version.OnOrAfter(LuceneVersion.LUCENE_44) ? CharacterUtils.GetInstance(version) : CharacterUtils.GetJava4Instance(version);
            this.minGram   = minGram;
            this.maxGram   = maxGram;
            this.side      = side;

            this.termAtt    = AddAttribute <ICharTermAttribute>();
            this.offsetAtt  = AddAttribute <IOffsetAttribute>();
            this.posIncrAtt = AddAttribute <IPositionIncrementAttribute>();
            this.posLenAtt  = AddAttribute <IPositionLengthAttribute>();
        }
        public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix) : base(suffix)
        {
            Suffix = suffix;
            Prefix = prefix;
            _prefixExhausted = false;

            // ReSharper disable DoNotCallOverridableMethodsInConstructor
            _termAtt = AddAttribute<ITermAttribute>();
            _posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            _payloadAtt = AddAttribute<IPayloadAttribute>();
            _offsetAtt = AddAttribute<IOffsetAttribute>();
            _typeAtt = AddAttribute<ITypeAttribute>();
            _flagsAtt = AddAttribute<IFlagsAttribute>();
            // ReSharper restore DoNotCallOverridableMethodsInConstructor

            _pTermAtt = prefix.AddAttribute<ITermAttribute>();
            _pPosIncrAtt = prefix.AddAttribute<IPositionIncrementAttribute>();
            _pPayloadAtt = prefix.AddAttribute<IPayloadAttribute>();
            _pOffsetAtt = prefix.AddAttribute<IOffsetAttribute>();
            _pTypeAtt = prefix.AddAttribute<ITypeAttribute>();
            _pFlagsAtt = prefix.AddAttribute<IFlagsAttribute>();
        }
예제 #34
0
        private void Init(LuceneVersion version, Side side, int minGram, int maxGram)
        {
            // LUCENENET specific - version cannot be null because it is a value type.

            if (!side.IsDefined())
            {
                throw new ArgumentOutOfRangeException(nameof(side), "sideLabel must be either front or back"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
            }

            if (minGram < 1)
            {
                throw new ArgumentOutOfRangeException(nameof(minGram), "minGram must be greater than zero"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
            }

            if (minGram > maxGram)
            {
                throw new ArgumentException("minGram must not be greater than maxGram");
            }

            if (version.OnOrAfter(LuceneVersion.LUCENE_44))
            {
                if (side == Side.BACK)
                {
                    throw new ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4");
                }
            }
            else
            {
                maxGram = Math.Min(maxGram, 1024);
            }

            this.minGram    = minGram;
            this.maxGram    = maxGram;
            this.side       = side;
            this.termAtt    = AddAttribute <ICharTermAttribute>();
            this.offsetAtt  = AddAttribute <IOffsetAttribute>();
            this.posIncrAtt = AddAttribute <IPositionIncrementAttribute>();
        }
예제 #35
0
        public virtual void TestOutputHangsOffEnd()
        {
            b = new SynonymMap.Builder(true);
            const bool keepOrig = false;

            // b hangs off the end (no input token under it):
            Add("a", "a b", keepOrig);
            tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
            tokensIn.Reset();
            assertTrue(tokensIn.IncrementToken());
            assertFalse(tokensIn.IncrementToken());
            tokensIn.End();
            tokensIn.Dispose();

            tokensOut  = new SynonymFilter(tokensIn, b.Build(), true);
            termAtt    = tokensOut.AddAttribute <ICharTermAttribute>();
            posIncrAtt = tokensOut.AddAttribute <IPositionIncrementAttribute>();
            offsetAtt  = tokensOut.AddAttribute <IOffsetAttribute>();
            posLenAtt  = tokensOut.AddAttribute <IPositionLengthAttribute>();

            // Make sure endOffset inherits from previous input token:
            Verify("a", "a b:1");
        }
예제 #36
0
        public virtual void  TestStopList()
        {
            var stopWordsSet = Support.Compatibility.SetFactory.CreateHashSet <string>();

            stopWordsSet.Add("good");
            stopWordsSet.Add("test");
            stopWordsSet.Add("analyzer");
            StopAnalyzer newStop = new StopAnalyzer(Version.LUCENE_24, stopWordsSet);

            System.IO.StringReader reader = new System.IO.StringReader("This is a good test of the english stop analyzer");
            TokenStream            stream = newStop.TokenStream("test", reader);

            Assert.IsNotNull(stream);
            ITermAttribute termAtt = stream.GetAttribute <ITermAttribute>();
            IPositionIncrementAttribute posIncrAtt = stream.AddAttribute <IPositionIncrementAttribute>();

            while (stream.IncrementToken())
            {
                System.String text = termAtt.Term;
                Assert.IsFalse(stopWordsSet.Contains(text));
                Assert.AreEqual(1, posIncrAtt.PositionIncrement); // in 2.4 stop tokenizer does not apply increments.
            }
        }
예제 #37
0
        public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, Side side, int minGram, int maxGram)
            : base(input)
        {
            // LUCENENET specific - version cannot be null because it is a value type.

            if (version.OnOrAfter(LuceneVersion.LUCENE_44) && side == Side.BACK)
            {
                throw new ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4, use ReverseStringFilter up-front and afterward");
            }

            if (!side.IsDefined())
            {
                throw new ArgumentOutOfRangeException(nameof(side), "sideLabel must be either front or back"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
            }

            if (minGram < 1)
            {
                throw new ArgumentOutOfRangeException(nameof(minGram), "minGram must be greater than zero"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
            }

            if (minGram > maxGram)
            {
                throw new ArgumentException("minGram must not be greater than maxGram");
            }

            this.version   = version;
            this.charUtils = version.OnOrAfter(LuceneVersion.LUCENE_44) ? CharacterUtils.GetInstance(version) : CharacterUtils.GetJava4Instance(version);
            this.minGram   = minGram;
            this.maxGram   = maxGram;
            this.side      = side;

            this.termAtt    = AddAttribute <ICharTermAttribute>();
            this.offsetAtt  = AddAttribute <IOffsetAttribute>();
            this.posIncrAtt = AddAttribute <IPositionIncrementAttribute>();
            this.posLenAtt  = AddAttribute <IPositionLengthAttribute>();
        }
예제 #38
0
        public static IEnumerable <TokenAttributes> Tokenize(this TokenStream tokenStream)
        {
            var term   = tokenStream.GetAttribute <ITermAttribute>();
            var offset = tokenStream.GetAttribute <IOffsetAttribute>();

            IPositionIncrementAttribute positionIncrement = null;

            if (tokenStream.HasAttribute <IPositionIncrementAttribute>())
            {
                positionIncrement = tokenStream.GetAttribute <IPositionIncrementAttribute>();
            }

            while (tokenStream.IncrementToken())
            {
                var tokenAttributes = new TokenAttributes(term.Term, offset.StartOffset, offset.EndOffset);

                if (positionIncrement != null)
                {
                    tokenAttributes.PositionIncrement = positionIncrement.PositionIncrement;
                }

                yield return(tokenAttributes);
            }
        }
예제 #39
0
        /*
         * Need to worry about multiple scenarios:
         *  - need to go for the longest match
         *    a b => foo      #shouldn't match if "a b" is followed by "c d"
         *    a b c d => bar
         *  - need to backtrack - retry matches for tokens already read
         *     a b c d => foo
         *       b c => bar
         *     If the input stream is "a b c x", one will consume "a b c d"
         *     trying to match the first rule... all but "a" should be
         *     pushed back so a match may be made on "b c".
         *  - don't try and match generated tokens (thus need separate queue)
         *    matching is not recursive.
         *  - handle optional generation of original tokens in all these cases,
         *    merging token streams to preserve token positions.
         *  - preserve original positionIncrement of first matched token
         */
        public override bool IncrementToken()
        {
            while (true)
            {
                // if there are any generated tokens, return them... don't try any
                // matches against them, as we specifically don't want recursion.
                if (replacement != null && replacement.MoveNext())
                {
                    Copy(this, replacement.Current);
                    return(true);
                }

                // common case fast-path of first token not matching anything
                AttributeSource firstTok = NextTok();
                if (firstTok == null)
                {
                    return(false);
                }
                var            termAtt = firstTok.AddAttribute <ICharTermAttribute>();
                SlowSynonymMap result  = map.Submap != null?map.Submap.Get(termAtt.Buffer, 0, termAtt.Length) : null;

                if (result == null)
                {
                    Copy(this, firstTok);
                    return(true);
                }

                // fast-path failed, clone ourselves if needed
                if (firstTok == this)
                {
                    firstTok = CloneAttributes();
                }
                // OK, we matched a token, so find the longest match.

                matched = new LinkedList <AttributeSource>();

                result = Match(result);

                if (result == null)
                {
                    // no match, simply return the first token read.
                    Copy(this, firstTok);
                    return(true);
                }

                // reuse, or create new one each time?
                IList <AttributeSource> generated = new JCG.List <AttributeSource>(result.Synonyms.Length + matched.Count + 1);

                //
                // there was a match... let's generate the new tokens, merging
                // in the matched tokens (position increments need adjusting)
                //
                AttributeSource lastTok     = matched.Count == 0 ? firstTok : matched.Last.Value;
                bool            includeOrig = result.IncludeOrig;

                AttributeSource             origTok        = includeOrig ? firstTok : null;
                IPositionIncrementAttribute firstPosIncAtt = firstTok.AddAttribute <IPositionIncrementAttribute>();
                int origPos = firstPosIncAtt.PositionIncrement; // position of origTok in the original stream
                int repPos  = 0;                                // curr position in replacement token stream
                int pos     = 0;                                // current position in merged token stream

                for (int i = 0; i < result.Synonyms.Length; i++)
                {
                    Token                       repTok       = result.Synonyms[i];
                    AttributeSource             newTok       = firstTok.CloneAttributes();
                    ICharTermAttribute          newTermAtt   = newTok.AddAttribute <ICharTermAttribute>();
                    IOffsetAttribute            newOffsetAtt = newTok.AddAttribute <IOffsetAttribute>();
                    IPositionIncrementAttribute newPosIncAtt = newTok.AddAttribute <IPositionIncrementAttribute>();

                    IOffsetAttribute lastOffsetAtt = lastTok.AddAttribute <IOffsetAttribute>();

                    newOffsetAtt.SetOffset(newOffsetAtt.StartOffset, lastOffsetAtt.EndOffset);
                    newTermAtt.CopyBuffer(repTok.Buffer, 0, repTok.Length);
                    repPos += repTok.PositionIncrement;
                    if (i == 0) // make position of first token equal to original
                    {
                        repPos = origPos;
                    }

                    // if necessary, insert original tokens and adjust position increment
                    while (origTok != null && origPos <= repPos)
                    {
                        IPositionIncrementAttribute origPosInc = origTok.AddAttribute <IPositionIncrementAttribute>();
                        origPosInc.PositionIncrement = origPos - pos;
                        generated.Add(origTok);
                        pos += origPosInc.PositionIncrement;
                        //origTok = matched.Count == 0 ? null : matched.RemoveFirst();
                        if (matched.Count == 0)
                        {
                            origTok = null;
                        }
                        else
                        {
                            origTok = matched.First.Value;
                            matched.Remove(origTok);
                        }
                        if (origTok != null)
                        {
                            origPosInc = origTok.AddAttribute <IPositionIncrementAttribute>();
                            origPos   += origPosInc.PositionIncrement;
                        }
                    }

                    newPosIncAtt.PositionIncrement = repPos - pos;
                    generated.Add(newTok);
                    pos += newPosIncAtt.PositionIncrement;
                }

                // finish up any leftover original tokens
                while (origTok != null)
                {
                    IPositionIncrementAttribute origPosInc = origTok.AddAttribute <IPositionIncrementAttribute>();
                    origPosInc.PositionIncrement = origPos - pos;
                    generated.Add(origTok);
                    pos += origPosInc.PositionIncrement;
                    if (matched.Count == 0)
                    {
                        origTok = null;
                    }
                    else
                    {
                        origTok = matched.First.Value;
                        matched.Remove(origTok);
                    }
                    if (origTok != null)
                    {
                        origPosInc = origTok.AddAttribute <IPositionIncrementAttribute>();
                        origPos   += origPosInc.PositionIncrement;
                    }
                }

                // what if we replaced a longer sequence with a shorter one?
                // a/0 b/5 =>  foo/0
                // should I re-create the gap on the next buffered token?

                replacement = generated.GetEnumerator();
                // Now return to the top of the loop to read and return the first
                // generated token.. The reason this is done is that we may have generated
                // nothing at all, and may need to continue with more matching logic.
            }
        }
예제 #40
0
 /// <summary>
 /// Constructs a new CommonGramsQueryFilter based on the provided CommomGramsFilter
 /// </summary>
 /// <param name="input"> CommonGramsFilter the QueryFilter will use </param>
 public CommonGramsQueryFilter(CommonGramsFilter input)
     : base(input)
 {
     typeAttribute   = AddAttribute <ITypeAttribute>();
     posIncAttribute = AddAttribute <IPositionIncrementAttribute>();
 }
예제 #41
0
        /// <summary>
        /// Iterates over the given token stream and adds the resulting terms to the index;
        /// Equivalent to adding a tokenized, indexed, termVectorStored, unstored,
        /// Lucene <see cref="Documents.Field"/>.
        /// Finally closes the token stream. Note that untokenized keywords can be added with this method via
        /// <see cref="T:KeywordTokenStream{T}(ICollection{T}"/>)"/>, the Lucene <c>KeywordTokenizer</c> or similar utilities.
        ///
        /// </summary>
        /// <param name="fieldName"> a name to be associated with the text </param>
        /// <param name="stream"> the token stream to retrieve tokens from. </param>
        /// <param name="boost"> the boost factor for hits for this field </param>
        /// <param name="positionIncrementGap"> the position increment gap if fields with the same name are added more than once </param>
        /// <param name="offsetGap"> the offset gap if fields with the same name are added more than once </param>
        /// <seealso cref="Documents.Field.Boost"/>
        public virtual void AddField(string fieldName, TokenStream stream, float boost, int positionIncrementGap, int offsetGap)
        {
            try
            {
                if (fieldName == null)
                {
                    throw new ArgumentException("fieldName must not be null");
                }
                if (stream == null)
                {
                    throw new ArgumentException("token stream must not be null");
                }
                if (boost <= 0.0f)
                {
                    throw new ArgumentException("boost factor must be greater than 0.0");
                }
                int                 numTokens        = 0;
                int                 numOverlapTokens = 0;
                int                 pos = -1;
                BytesRefHash        terms;
                SliceByteStartArray sliceArray;
                long                sumTotalTermFreq = 0;
                int                 offset           = 0;
                if (fields.TryGetValue(fieldName, out Info info))
                {
                    numTokens        = info.numTokens;
                    numOverlapTokens = info.numOverlapTokens;
                    pos              = info.lastPosition + positionIncrementGap;
                    offset           = info.lastOffset + offsetGap;
                    terms            = info.terms;
                    boost           *= info.boost;
                    sliceArray       = info.sliceArray;
                    sumTotalTermFreq = info.sumTotalTermFreq;
                }
                else
                {
                    sliceArray = new SliceByteStartArray(BytesRefHash.DEFAULT_CAPACITY);
                    terms      = new BytesRefHash(byteBlockPool, BytesRefHash.DEFAULT_CAPACITY, sliceArray);
                }

                if (!fieldInfos.ContainsKey(fieldName))
                {
                    fieldInfos[fieldName] = new FieldInfo(fieldName,
                                                          true,
                                                          fieldInfos.Count,
                                                          false,
                                                          false,
                                                          false,
                                                          this.storeOffsets ? IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS : IndexOptions.DOCS_AND_FREQS_AND_POSITIONS,
                                                          DocValuesType.NONE,
                                                          DocValuesType.NONE,
                                                          null);
                }
                ITermToBytesRefAttribute    termAtt          = stream.GetAttribute <ITermToBytesRefAttribute>();
                IPositionIncrementAttribute posIncrAttribute = stream.AddAttribute <IPositionIncrementAttribute>();
                IOffsetAttribute            offsetAtt        = stream.AddAttribute <IOffsetAttribute>();
                BytesRef @ref = termAtt.BytesRef;
                stream.Reset();

                while (stream.IncrementToken())
                {
                    termAtt.FillBytesRef();
                    //        if (DEBUG) System.err.println("token='" + term + "'");
                    numTokens++;
                    int posIncr = posIncrAttribute.PositionIncrement;
                    if (posIncr == 0)
                    {
                        numOverlapTokens++;
                    }
                    pos += posIncr;
                    int ord = terms.Add(@ref);
                    if (ord < 0)
                    {
                        ord = (-ord) - 1;
                        postingsWriter.Reset(sliceArray.end[ord]);
                    }
                    else
                    {
                        sliceArray.start[ord] = postingsWriter.StartNewSlice();
                    }
                    sliceArray.freq[ord]++;
                    sumTotalTermFreq++;
                    if (!storeOffsets)
                    {
                        postingsWriter.WriteInt32(pos);
                    }
                    else
                    {
                        postingsWriter.WriteInt32(pos);
                        postingsWriter.WriteInt32(offsetAtt.StartOffset + offset);
                        postingsWriter.WriteInt32(offsetAtt.EndOffset + offset);
                    }
                    sliceArray.end[ord] = postingsWriter.CurrentOffset;
                }
                stream.End();

                // ensure infos.numTokens > 0 invariant; needed for correct operation of terms()
                if (numTokens > 0)
                {
                    fields[fieldName] = new Info(terms, sliceArray, numTokens, numOverlapTokens, boost, pos, offsetAtt.EndOffset + offset, sumTotalTermFreq);
                    sortedFields      = null; // invalidate sorted view, if any
                }
            } // can never happen
            catch (Exception e)
            {
                throw new Exception(e.ToString(), e);
            }
            finally
            {
                try
                {
                    if (stream != null)
                    {
                        stream.Dispose();
                    }
                }
                catch (IOException e2)
                {
                    throw new Exception(e2.ToString(), e2);
                }
            }
        }
예제 #42
0
 public BugReproTokenStream()
 {
     termAtt   = AddAttribute <ICharTermAttribute>();
     offsetAtt = AddAttribute <IOffsetAttribute>();
     posIncAtt = AddAttribute <IPositionIncrementAttribute>();
 }
예제 #43
0
 public RemoveATokens(TokenStream @in)
     : base(@in)
 {
     termAtt   = AddAttribute <ICharTermAttribute>();
     posIncAtt = AddAttribute <IPositionIncrementAttribute>();
 }
예제 #44
0
        /// <summary> Not an explicit test, just useful to print out some info on performance
        ///
        /// </summary>
        /// <throws>  Exception </throws>
        public virtual void Performance()
        {
            int[] tokCount  = new int[] { 100, 500, 1000, 2000, 5000, 10000 };
            int[] modCounts = new int[] { 1, 2, 5, 10, 20, 50, 100, 200, 500 };
            for (int k = 0; k < tokCount.Length; k++)
            {
                System.Text.StringBuilder buffer = new System.Text.StringBuilder();
                System.Console.Out.WriteLine("-----Tokens: " + tokCount[k] + "-----");
                for (int i = 0; i < tokCount[k]; i++)
                {
                    buffer.Append(English.IntToEnglish(i).ToUpper()).Append(' ');
                }
                //make sure we produce the same tokens
                TeeSinkTokenFilter teeStream = new TeeSinkTokenFilter(new StandardFilter(new StandardTokenizer(Version.LUCENE_CURRENT, new System.IO.StringReader(buffer.ToString()))));
                TokenStream        sink      = teeStream.NewSinkTokenStream(new ModuloSinkFilter(this, 100));
                teeStream.ConsumeAllTokens();
                TokenStream    stream  = new ModuloTokenFilter(this, new StandardFilter(new StandardTokenizer(Version.LUCENE_CURRENT, new System.IO.StringReader(buffer.ToString()))), 100);
                ITermAttribute tfTok   = stream.AddAttribute <ITermAttribute>();
                ITermAttribute sinkTok = sink.AddAttribute <ITermAttribute>();
                for (int i = 0; stream.IncrementToken(); i++)
                {
                    Assert.IsTrue(sink.IncrementToken());
                    Assert.IsTrue(tfTok.Equals(sinkTok) == true, tfTok + " is not equal to " + sinkTok + " at token: " + i);
                }

                //simulate two fields, each being analyzed once, for 20 documents
                for (int j = 0; j < modCounts.Length; j++)
                {
                    int  tfPos = 0;
                    long start = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
                    for (int i = 0; i < 20; i++)
                    {
                        stream = new StandardFilter(new StandardTokenizer(Version.LUCENE_CURRENT, new System.IO.StringReader(buffer.ToString())));
                        IPositionIncrementAttribute posIncrAtt = stream.GetAttribute <IPositionIncrementAttribute>();
                        while (stream.IncrementToken())
                        {
                            tfPos += posIncrAtt.PositionIncrement;
                        }
                        stream     = new ModuloTokenFilter(this, new StandardFilter(new StandardTokenizer(Version.LUCENE_CURRENT, new System.IO.StringReader(buffer.ToString()))), modCounts[j]);
                        posIncrAtt = stream.GetAttribute <IPositionIncrementAttribute>();
                        while (stream.IncrementToken())
                        {
                            tfPos += posIncrAtt.PositionIncrement;
                        }
                    }
                    long finish = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
                    System.Console.Out.WriteLine("ModCount: " + modCounts[j] + " Two fields took " + (finish - start) + " ms");
                    int sinkPos = 0;
                    //simulate one field with one sink
                    start = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
                    for (int i = 0; i < 20; i++)
                    {
                        teeStream = new TeeSinkTokenFilter(new StandardFilter(new StandardTokenizer(Version.LUCENE_CURRENT, new System.IO.StringReader(buffer.ToString()))));
                        sink      = teeStream.NewSinkTokenStream(new ModuloSinkFilter(this, modCounts[j]));
                        IPositionIncrementAttribute posIncrAtt = teeStream.GetAttribute <IPositionIncrementAttribute>();
                        while (teeStream.IncrementToken())
                        {
                            sinkPos += posIncrAtt.PositionIncrement;
                        }
                        //System.out.println("Modulo--------");
                        posIncrAtt = sink.GetAttribute <IPositionIncrementAttribute>();
                        while (sink.IncrementToken())
                        {
                            sinkPos += posIncrAtt.PositionIncrement;
                        }
                    }
                    finish = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
                    System.Console.Out.WriteLine("ModCount: " + modCounts[j] + " Tee fields took " + (finish - start) + " ms");
                    Assert.IsTrue(sinkPos == tfPos, sinkPos + " does not equal: " + tfPos);
                }
                System.Console.Out.WriteLine("- End Tokens: " + tokCount[k] + "-----");
            }
        }
예제 #45
0
 public MockCJKSynonymFilter(TokenStream input)
     : base(input)
 {
     termAtt   = AddAttribute <ICharTermAttribute>();
     posIncAtt = AddAttribute <IPositionIncrementAttribute>();
 }
예제 #46
0
 internal LargePosIncTokenFilter(TestWordDelimiterFilter outerInstance, TokenStream input) : base(input)
 {
     this.outerInstance = outerInstance;
     this.termAtt       = AddAttribute <ICharTermAttribute>();
     this.posIncAtt     = AddAttribute <IPositionIncrementAttribute>();
 }
예제 #47
0
 public TestPosIncrementFilter(TokenStream @in)
     : base(@in)
 {
     termAtt = AddAttribute<ICharTermAttribute>();
     posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
 }
예제 #48
0
 public SingleCharTokenizer(TokenStream input)
     : base(input)
 {
     _input = input;
     _termAttribute = AddAttribute<ITermAttribute>();
     _offsetAttribute = AddAttribute<IOffsetAttribute>();
     _positionIncrementAttribute = AddAttribute<IPositionIncrementAttribute>();
 }
            protected internal RandomTokenStream(BaseTermVectorsFormatTestCase baseTermVectorsFormatTestCase, int len, string[] sampleTerms, BytesRef[] sampleTermBytes, bool offsetsGoBackwards)
            {
                terms               = new string[len];
                termBytes           = new BytesRef[len];
                positionsIncrements = new int[len];
                positions           = new int[len];
                startOffsets        = new int[len];
                endOffsets          = new int[len];
                payloads            = new BytesRef[len];
                for (int i = 0; i < len; ++i)
                {
                    int o = Random.Next(sampleTerms.Length);
                    terms[i]               = sampleTerms[o];
                    termBytes[i]           = sampleTermBytes[o];
                    positionsIncrements[i] = TestUtil.NextInt32(Random, i == 0 ? 1 : 0, 10);
                    if (offsetsGoBackwards)
                    {
                        startOffsets[i] = Random.Next();
                        endOffsets[i]   = Random.Next();
                    }
                    else
                    {
                        if (i == 0)
                        {
                            startOffsets[i] = TestUtil.NextInt32(Random, 0, 1 << 16);
                        }
                        else
                        {
                            startOffsets[i] = startOffsets[i - 1] + TestUtil.NextInt32(Random, 0, Rarely() ? 1 << 16 : 20);
                        }
                        endOffsets[i] = startOffsets[i] + TestUtil.NextInt32(Random, 0, Rarely() ? 1 << 10 : 20);
                    }
                }

                for (int i = 0; i < len; ++i)
                {
                    if (i == 0)
                    {
                        positions[i] = positionsIncrements[i] - 1;
                    }
                    else
                    {
                        positions[i] = positions[i - 1] + positionsIncrements[i];
                    }
                }
                if (Rarely())
                {
                    Arrays.Fill(payloads, baseTermVectorsFormatTestCase.RandomPayload());
                }
                else
                {
                    for (int i = 0; i < len; ++i)
                    {
                        payloads[i] = baseTermVectorsFormatTestCase.RandomPayload();
                    }
                }

                positionToTerms    = new Dictionary <int?, ISet <int?> >(len);
                startOffsetToTerms = new Dictionary <int?, ISet <int?> >(len);
                for (int i = 0; i < len; ++i)
                {
                    if (!positionToTerms.TryGetValue(positions[i], out ISet <int?> positionTerms))
                    {
                        positionToTerms[positions[i]] = positionTerms = new JCG.HashSet <int?>(1);
                    }
                    positionTerms.Add(i);
                    if (!startOffsetToTerms.TryGetValue(startOffsets[i], out ISet <int?> startOffsetTerms))
                    {
                        startOffsetToTerms[startOffsets[i]] = startOffsetTerms = new JCG.HashSet <int?>(1);
                    }
                    startOffsetTerms.Add(i);
                }

                freqs = new Dictionary <string, int?>();
                foreach (string term in terms)
                {
                    if (freqs.TryGetValue(term, out int?freq))
                    {
                        freqs[term] = freq + 1;
                    }
                    else
                    {
                        freqs[term] = 1;
                    }
                }

                AddAttributeImpl(new PermissiveOffsetAttribute());

                termAtt = AddAttribute <ICharTermAttribute>();
                piAtt   = AddAttribute <IPositionIncrementAttribute>();
                oAtt    = AddAttribute <IOffsetAttribute>();
                pAtt    = AddAttribute <IPayloadAttribute>();
            }
예제 #50
0
        public virtual void TestRandom()
        {
            int alphabetSize = TestUtil.NextInt32(Random, 2, 7);

            int docLen = AtLeast(3000);
            //final int docLen = 50;

            string document = GetRandomString('a', alphabetSize, docLen);

            if (Verbose)
            {
                Console.WriteLine("TEST: doc=" + document);
            }

            int numSyn = AtLeast(5);
            //final int numSyn = 2;

            IDictionary <string, OneSyn> synMap = new Dictionary <string, OneSyn>();
            IList <OneSyn> syns  = new JCG.List <OneSyn>();
            bool           dedup = Random.nextBoolean();

            if (Verbose)
            {
                Console.WriteLine("  dedup=" + dedup);
            }
            b = new SynonymMap.Builder(dedup);
            for (int synIDX = 0; synIDX < numSyn; synIDX++)
            {
                string synIn = GetRandomString('a', alphabetSize, TestUtil.NextInt32(Random, 1, 5)).Trim();
                if (!synMap.TryGetValue(synIn, out OneSyn s) || s is null)
                {
                    s     = new OneSyn();
                    s.@in = synIn;
                    syns.Add(s);
                    s.@out        = new JCG.List <string>();
                    synMap[synIn] = s;
                    s.keepOrig    = Random.nextBoolean();
                }
                string synOut = GetRandomString('0', 10, TestUtil.NextInt32(Random, 1, 5)).Trim();
                [email protected](synOut);
                Add(synIn, synOut, s.keepOrig);
                if (Verbose)
                {
                    Console.WriteLine("  syns[" + synIDX + "] = " + s.@in + " -> " + s.@out + " keepOrig=" + s.keepOrig);
                }
            }

            tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
            tokensIn.Reset();
            assertTrue(tokensIn.IncrementToken());
            assertFalse(tokensIn.IncrementToken());
            tokensIn.End();
            tokensIn.Dispose();

            tokensOut  = new SynonymFilter(tokensIn, b.Build(), true);
            termAtt    = tokensOut.AddAttribute <ICharTermAttribute>();
            posIncrAtt = tokensOut.AddAttribute <IPositionIncrementAttribute>();
            posLenAtt  = tokensOut.AddAttribute <IPositionLengthAttribute>();
            offsetAtt  = tokensOut.AddAttribute <IOffsetAttribute>();

            if (dedup)
            {
                PruneDups(syns);
            }

            string expected = SlowSynMatcher(document, syns, 5);

            if (Verbose)
            {
                Console.WriteLine("TEST: expected=" + expected);
            }

            Verify(document, expected);
        }
예제 #51
0
        /// <summary>
        /// Retrieve suggestions.
        /// </summary>
        public virtual IList <LookupResult> DoLookup(string key, IEnumerable <BytesRef> contexts, int num)
        {
            if (contexts != null)
            {
                throw new System.ArgumentException("this suggester doesn't support contexts");
            }

            TokenStream ts = queryAnalyzer.GetTokenStream("", key.ToString());

            try
            {
                ITermToBytesRefAttribute    termBytesAtt = ts.AddAttribute <ITermToBytesRefAttribute>();
                IOffsetAttribute            offsetAtt    = ts.AddAttribute <IOffsetAttribute>();
                IPositionLengthAttribute    posLenAtt    = ts.AddAttribute <IPositionLengthAttribute>();
                IPositionIncrementAttribute posIncAtt    = ts.AddAttribute <IPositionIncrementAttribute>();
                ts.Reset();

                var lastTokens = new BytesRef[grams];
                //System.out.println("lookup: key='" + key + "'");

                // Run full analysis, but save only the
                // last 1gram, last 2gram, etc.:
                BytesRef tokenBytes   = termBytesAtt.BytesRef;
                int      maxEndOffset = -1;
                bool     sawRealToken = false;
                while (ts.IncrementToken())
                {
                    termBytesAtt.FillBytesRef();
                    sawRealToken |= tokenBytes.Length > 0;
                    // TODO: this is somewhat iffy; today, ShingleFilter
                    // sets posLen to the gram count; maybe we should make
                    // a separate dedicated att for this?
                    int gramCount = posLenAtt.PositionLength;

                    Debug.Assert(gramCount <= grams);

                    // Safety: make sure the recalculated count "agrees":
                    if (CountGrams(tokenBytes) != gramCount)
                    {
                        throw new System.ArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + CountGrams(tokenBytes));
                    }
                    maxEndOffset = Math.Max(maxEndOffset, offsetAtt.EndOffset);
                    lastTokens[gramCount - 1] = BytesRef.DeepCopyOf(tokenBytes);
                }
                ts.End();

                if (!sawRealToken)
                {
                    throw new System.ArgumentException("no tokens produced by analyzer, or the only tokens were empty strings");
                }

                // Carefully fill last tokens with _ tokens;
                // ShingleFilter appraently won't emit "only hole"
                // tokens:
                int endPosInc = posIncAtt.PositionIncrement;

                // Note this will also be true if input is the empty
                // string (in which case we saw no tokens and
                // maxEndOffset is still -1), which in fact works out OK
                // because we fill the unigram with an empty BytesRef
                // below:
                bool lastTokenEnded = offsetAtt.EndOffset > maxEndOffset || endPosInc > 0;
                //System.out.println("maxEndOffset=" + maxEndOffset + " vs " + offsetAtt.EndOffset);

                if (lastTokenEnded)
                {
                    //System.out.println("  lastTokenEnded");
                    // If user hit space after the last token, then
                    // "upgrade" all tokens.  This way "foo " will suggest
                    // all bigrams starting w/ foo, and not any unigrams
                    // starting with "foo":
                    for (int i = grams - 1; i > 0; i--)
                    {
                        BytesRef token = lastTokens[i - 1];
                        if (token == null)
                        {
                            continue;
                        }
                        token.Grow(token.Length + 1);
                        token.Bytes[token.Length] = separator;
                        token.Length++;
                        lastTokens[i] = token;
                    }
                    lastTokens[0] = new BytesRef();
                }

                var arc = new FST.Arc <long?>();

                var bytesReader = fst.GetBytesReader();

                // Try highest order models first, and if they return
                // results, return that; else, fallback:
                double backoff = 1.0;

                List <LookupResult> results = new List <LookupResult>(num);

                // We only add a given suffix once, from the highest
                // order model that saw it; for subsequent lower order
                // models we skip it:
                var seen = new HashSet <BytesRef>();

                for (int gram = grams - 1; gram >= 0; gram--)
                {
                    BytesRef token = lastTokens[gram];
                    // Don't make unigram predictions from empty string:
                    if (token == null || (token.Length == 0 && key.Length > 0))
                    {
                        // Input didn't have enough tokens:
                        //System.out.println("  gram=" + gram + ": skip: not enough input");
                        continue;
                    }

                    if (endPosInc > 0 && gram <= endPosInc)
                    {
                        // Skip hole-only predictions; in theory we
                        // shouldn't have to do this, but we'd need to fix
                        // ShingleFilter to produce only-hole tokens:
                        //System.out.println("  break: only holes now");
                        break;
                    }

                    //System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString());

                    // TODO: we could add fuzziness here
                    // match the prefix portion exactly
                    //Pair<Long,BytesRef> prefixOutput = null;
                    long?prefixOutput = null;
                    try
                    {
                        prefixOutput = LookupPrefix(fst, bytesReader, token, arc);
                    }
                    catch (IOException bogus)
                    {
                        throw new Exception(bogus.ToString(), bogus);
                    }
                    //System.out.println("  prefixOutput=" + prefixOutput);

                    if (prefixOutput == null)
                    {
                        // This model never saw this prefix, e.g. the
                        // trigram model never saw context "purple mushroom"
                        backoff *= ALPHA;
                        continue;
                    }

                    // TODO: we could do this division at build time, and
                    // bake it into the FST?

                    // Denominator for computing scores from current
                    // model's predictions:
                    long contextCount = totTokens;

                    BytesRef lastTokenFragment = null;

                    for (int i = token.Length - 1; i >= 0; i--)
                    {
                        if (token.Bytes[token.Offset + i] == separator)
                        {
                            BytesRef context = new BytesRef(token.Bytes, token.Offset, i);
                            long?    output  = Lucene.Net.Util.Fst.Util.Get(fst, Lucene.Net.Util.Fst.Util.ToInt32sRef(context, new Int32sRef()));
                            Debug.Assert(output != null);
                            contextCount      = DecodeWeight(output);
                            lastTokenFragment = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                            break;
                        }
                    }

                    BytesRef finalLastToken;

                    if (lastTokenFragment == null)
                    {
                        finalLastToken = BytesRef.DeepCopyOf(token);
                    }
                    else
                    {
                        finalLastToken = BytesRef.DeepCopyOf(lastTokenFragment);
                    }
                    Debug.Assert(finalLastToken.Offset == 0);

                    CharsRef spare = new CharsRef();

                    // complete top-N
                    Util.Fst.Util.TopResults <long?> completions = null;
                    try
                    {
                        // Because we store multiple models in one FST
                        // (1gram, 2gram, 3gram), we must restrict the
                        // search so that it only considers the current
                        // model.  For highest order model, this is not
                        // necessary since all completions in the FST
                        // must be from this model, but for lower order
                        // models we have to filter out the higher order
                        // ones:

                        // Must do num+seen.size() for queue depth because we may
                        // reject up to seen.size() paths in acceptResult():
                        Util.Fst.Util.TopNSearcher <long?> searcher = new TopNSearcherAnonymousInnerClassHelper(this, fst, num, num + seen.Count, weightComparer, seen, finalLastToken);

                        // since this search is initialized with a single start node
                        // it is okay to start with an empty input path here
                        searcher.AddStartPaths(arc, prefixOutput, true, new Int32sRef());

                        completions = searcher.Search();
                        Debug.Assert(completions.IsComplete);
                    }
                    catch (IOException bogus)
                    {
                        throw new Exception(bogus.ToString(), bogus);
                    }

                    int prefixLength = token.Length;

                    BytesRef suffix = new BytesRef(8);
                    //System.out.println("    " + completions.length + " completions");

                    foreach (Util.Fst.Util.Result <long?> completion in completions)
                    {
                        token.Length = prefixLength;
                        // append suffix
                        Util.Fst.Util.ToBytesRef(completion.Input, suffix);
                        token.Append(suffix);

                        //System.out.println("    completion " + token.utf8ToString());

                        // Skip this path if a higher-order model already
                        // saw/predicted its last token:
                        BytesRef lastToken = token;
                        for (int i = token.Length - 1; i >= 0; i--)
                        {
                            if (token.Bytes[token.Offset + i] == separator)
                            {
                                Debug.Assert(token.Length - i - 1 > 0);
                                lastToken = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                                break;
                            }
                        }
                        if (seen.Contains(lastToken))
                        {
                            //System.out.println("      skip dup " + lastToken.utf8ToString());
                            goto nextCompletionContinue;
                        }
                        seen.Add(BytesRef.DeepCopyOf(lastToken));
                        spare.Grow(token.Length);
                        UnicodeUtil.UTF8toUTF16(token, spare);
                        LookupResult result = new LookupResult(spare.ToString(),
                                                               // LUCENENET NOTE: We need to calculate this as decimal because when using double it can sometimes
                                                               // return numbers that are greater than long.MaxValue, which results in a negative long number.
                                                               (long)(long.MaxValue * (decimal)backoff * ((decimal)DecodeWeight(completion.Output)) / contextCount));
                        results.Add(result);
                        Debug.Assert(results.Count == seen.Count);
                        //System.out.println("  add result=" + result);
                        nextCompletionContinue :;
                    }
                    backoff *= ALPHA;
                }

                results.Sort(new ComparerAnonymousInnerClassHelper(this));

                if (results.Count > num)
                {
                    results.SubList(num, results.Count).Clear();
                }

                return(results);
            }
            finally
            {
                IOUtils.DisposeWhileHandlingException(ts);
            }
        }
예제 #52
0
 internal TestTokenStream()
 {
     termAtt = AddAttribute<ITermAttribute>();
     offsetAtt = AddAttribute<IOffsetAttribute>();
     typeAtt = AddAttribute<ITypeAttribute>();
     payloadAtt = AddAttribute<IPayloadAttribute>();
     posIncAtt = AddAttribute<IPositionIncrementAttribute>();
     flagsAtt = AddAttribute<IFlagsAttribute>();
 }
            protected internal RandomTokenStream(BaseTermVectorsFormatTestCase outerInstance, int len, string[] sampleTerms, BytesRef[] sampleTermBytes, bool offsetsGoBackwards)
            {
                this.OuterInstance = outerInstance;
                Terms               = new string[len];
                TermBytes           = new BytesRef[len];
                PositionsIncrements = new int[len];
                Positions           = new int[len];
                StartOffsets        = new int[len];
                EndOffsets          = new int[len];
                Payloads            = new BytesRef[len];
                for (int i = 0; i < len; ++i)
                {
                    int o = Random().Next(sampleTerms.Length);
                    Terms[i]               = sampleTerms[o];
                    TermBytes[i]           = sampleTermBytes[o];
                    PositionsIncrements[i] = TestUtil.NextInt(Random(), i == 0 ? 1 : 0, 10);
                    if (offsetsGoBackwards)
                    {
                        StartOffsets[i] = Random().Next();
                        EndOffsets[i]   = Random().Next();
                    }
                    else
                    {
                        if (i == 0)
                        {
                            StartOffsets[i] = TestUtil.NextInt(Random(), 0, 1 << 16);
                        }
                        else
                        {
                            StartOffsets[i] = StartOffsets[i - 1] + TestUtil.NextInt(Random(), 0, Rarely() ? 1 << 16 : 20);
                        }
                        EndOffsets[i] = StartOffsets[i] + TestUtil.NextInt(Random(), 0, Rarely() ? 1 << 10 : 20);
                    }
                }

                for (int i = 0; i < len; ++i)
                {
                    if (i == 0)
                    {
                        Positions[i] = PositionsIncrements[i] - 1;
                    }
                    else
                    {
                        Positions[i] = Positions[i - 1] + PositionsIncrements[i];
                    }
                }
                if (Rarely())
                {
                    Arrays.Fill(Payloads, outerInstance.RandomPayload());
                }
                else
                {
                    for (int i = 0; i < len; ++i)
                    {
                        Payloads[i] = outerInstance.RandomPayload();
                    }
                }

                PositionToTerms    = new Dictionary <int?, ISet <int?> >(len);
                StartOffsetToTerms = new Dictionary <int?, ISet <int?> >(len);
                for (int i = 0; i < len; ++i)
                {
                    if (!PositionToTerms.ContainsKey(Positions[i]))
                    {
                        PositionToTerms[Positions[i]] = new HashSet <int?>();//size1
                    }
                    PositionToTerms[Positions[i]].Add(i);
                    if (!StartOffsetToTerms.ContainsKey(StartOffsets[i]))
                    {
                        StartOffsetToTerms[StartOffsets[i]] = new HashSet <int?>();//size1
                    }
                    StartOffsetToTerms[StartOffsets[i]].Add(i);
                }

                Freqs = new Dictionary <string, int?>();
                foreach (string term in Terms)
                {
                    if (Freqs.ContainsKey(term))
                    {
                        Freqs[term] = Freqs[term] + 1;
                    }
                    else
                    {
                        Freqs[term] = 1;
                    }
                }

                AddAttributeImpl(new PermissiveOffsetAttributeImpl());

                TermAtt = AddAttribute <ICharTermAttribute>();
                PiAtt   = AddAttribute <IPositionIncrementAttribute>();
                OAtt    = AddAttribute <IOffsetAttribute>();
                PAtt    = AddAttribute <IPayloadAttribute>();
            }
예제 #54
0
        protected override IQueryNode PostProcessNode(IQueryNode node)
        {
            if (node is ITextableQueryNode &&
                !(node is WildcardQueryNode) &&
                !(node is FuzzyQueryNode) &&
                !(node is RegexpQueryNode) &&
                !(node.Parent is IRangeQueryNode))
            {
                FieldQueryNode fieldNode = ((FieldQueryNode)node);
                string         text      = fieldNode.GetTextAsString();
                string         field     = fieldNode.GetFieldAsString();

                CachingTokenFilter          buffer     = null;
                IPositionIncrementAttribute posIncrAtt = null;
                int  numTokens     = 0;
                int  positionCount = 0;
                bool severalTokensAtSamePosition = false;

                TokenStream source = null;
                try
                {
                    source = this.analyzer.GetTokenStream(field, text);
                    source.Reset();
                    buffer = new CachingTokenFilter(source);

                    if (buffer.HasAttribute <IPositionIncrementAttribute>())
                    {
                        posIncrAtt = buffer.GetAttribute <IPositionIncrementAttribute>();
                    }

                    try
                    {
                        while (buffer.IncrementToken())
                        {
                            numTokens++;
                            int positionIncrement = (posIncrAtt != null) ? posIncrAtt
                                                    .PositionIncrement : 1;
                            if (positionIncrement != 0)
                            {
                                positionCount += positionIncrement;
                            }
                            else
                            {
                                severalTokensAtSamePosition = true;
                            }
                        }
                    }
#pragma warning disable 168
                    catch (IOException e)
#pragma warning restore 168
                    {
                        // ignore
                    }
                }
                catch (IOException e)
                {
                    throw new Exception(e.ToString(), e);
                }
                finally
                {
                    IOUtils.DisposeWhileHandlingException(source);
                }

                // rewind the buffer stream
                buffer.Reset();

                if (!buffer.HasAttribute <ICharTermAttribute>())
                {
                    return(new NoTokenFoundQueryNode());
                }

                ICharTermAttribute termAtt = buffer.GetAttribute <ICharTermAttribute>();

                if (numTokens == 0)
                {
                    return(new NoTokenFoundQueryNode());
                }
                else if (numTokens == 1)
                {
                    string term = null;
                    try
                    {
                        bool hasNext;
                        hasNext = buffer.IncrementToken();
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(hasNext == true);
                        }
                        term = termAtt.ToString();
                    }
                    catch (IOException) // LUCENENET: IDE0059: Remove unnecessary value assignment
                    {
                        // safe to ignore, because we know the number of tokens
                    }

                    fieldNode.Text = term.AsCharSequence();

                    return(fieldNode);
                }
                else if (severalTokensAtSamePosition || !(node is QuotedFieldQueryNode))
                {
                    if (positionCount == 1 || !(node is QuotedFieldQueryNode))
                    {
                        // no phrase query:

                        if (positionCount == 1)
                        {
                            // simple case: only one position, with synonyms
                            List <IQueryNode> children = new List <IQueryNode>();

                            for (int i = 0; i < numTokens; i++)
                            {
                                string term = null;
                                try
                                {
                                    bool hasNext = buffer.IncrementToken();
                                    if (Debugging.AssertsEnabled)
                                    {
                                        Debugging.Assert(hasNext == true);
                                    }
                                    term = termAtt.ToString();
                                }
                                catch (IOException) // LUCENENET: IDE0059: Remove unnecessary value assignment
                                {
                                    // safe to ignore, because we know the number of tokens
                                }

                                children.Add(new FieldQueryNode(field, term, -1, -1));
                            }
                            return(new GroupQueryNode(
                                       new StandardBooleanQueryNode(children, positionCount == 1)));
                        }
                        else
                        {
                            // multiple positions
                            IQueryNode q            = new StandardBooleanQueryNode(Collections.EmptyList <IQueryNode>(), false);
                            IQueryNode currentQuery = null;
                            for (int i = 0; i < numTokens; i++)
                            {
                                string term = null;
                                try
                                {
                                    bool hasNext = buffer.IncrementToken();
                                    if (Debugging.AssertsEnabled)
                                    {
                                        Debugging.Assert(hasNext == true);
                                    }
                                    term = termAtt.ToString();
                                }
                                catch (IOException) // LUCENENET: IDE0059: Remove unnecessary value assignment
                                {
                                    // safe to ignore, because we know the number of tokens
                                }
                                if (posIncrAtt != null && posIncrAtt.PositionIncrement == 0)
                                {
                                    if (!(currentQuery is BooleanQueryNode))
                                    {
                                        IQueryNode t = currentQuery;
                                        currentQuery = new StandardBooleanQueryNode(Collections.EmptyList <IQueryNode>(), true);
                                        ((BooleanQueryNode)currentQuery).Add(t);
                                    }
                                    ((BooleanQueryNode)currentQuery).Add(new FieldQueryNode(field, term, -1, -1));
                                }
                                else
                                {
                                    if (currentQuery != null)
                                    {
                                        if (this.defaultOperator == Operator.OR)
                                        {
                                            q.Add(currentQuery);
                                        }
                                        else
                                        {
                                            q.Add(new ModifierQueryNode(currentQuery, Modifier.MOD_REQ));
                                        }
                                    }
                                    currentQuery = new FieldQueryNode(field, term, -1, -1);
                                }
                            }
                            if (this.defaultOperator == Operator.OR)
                            {
                                q.Add(currentQuery);
                            }
                            else
                            {
                                q.Add(new ModifierQueryNode(currentQuery, Modifier.MOD_REQ));
                            }

                            if (q is BooleanQueryNode)
                            {
                                q = new GroupQueryNode(q);
                            }
                            return(q);
                        }
                    }
                    else
                    {
                        // phrase query:
                        MultiPhraseQueryNode mpq = new MultiPhraseQueryNode();

                        List <FieldQueryNode> multiTerms = new List <FieldQueryNode>();
                        int position       = -1;
                        int i              = 0;
                        int termGroupCount = 0;
                        for (; i < numTokens; i++)
                        {
                            string term = null;
                            int    positionIncrement = 1;
                            try
                            {
                                bool hasNext = buffer.IncrementToken();
                                if (Debugging.AssertsEnabled)
                                {
                                    Debugging.Assert(hasNext == true);
                                }
                                term = termAtt.ToString();
                                if (posIncrAtt != null)
                                {
                                    positionIncrement = posIncrAtt.PositionIncrement;
                                }
                            }
                            catch (IOException) // LUCENENET: IDE0059: Remove unnecessary value assignment
                            {
                                // safe to ignore, because we know the number of tokens
                            }

                            if (positionIncrement > 0 && multiTerms.Count > 0)
                            {
                                foreach (FieldQueryNode termNode in multiTerms)
                                {
                                    if (this.positionIncrementsEnabled)
                                    {
                                        termNode.PositionIncrement = position;
                                    }
                                    else
                                    {
                                        termNode.PositionIncrement = termGroupCount;
                                    }

                                    mpq.Add(termNode);
                                }

                                // Only increment once for each "group" of
                                // terms that were in the same position:
                                termGroupCount++;

                                multiTerms.Clear();
                            }

                            position += positionIncrement;
                            multiTerms.Add(new FieldQueryNode(field, term, -1, -1));
                        }

                        foreach (FieldQueryNode termNode in multiTerms)
                        {
                            if (this.positionIncrementsEnabled)
                            {
                                termNode.PositionIncrement = position;
                            }
                            else
                            {
                                termNode.PositionIncrement = termGroupCount;
                            }

                            mpq.Add(termNode);
                        }

                        return(mpq);
                    }
                }
                else
                {
                    TokenizedPhraseQueryNode pq = new TokenizedPhraseQueryNode();

                    int position = -1;

                    for (int i = 0; i < numTokens; i++)
                    {
                        string term = null;
                        int    positionIncrement = 1;

                        try
                        {
                            bool hasNext = buffer.IncrementToken();
                            if (Debugging.AssertsEnabled)
                            {
                                Debugging.Assert(hasNext == true);
                            }
                            term = termAtt.ToString();

                            if (posIncrAtt != null)
                            {
                                positionIncrement = posIncrAtt.PositionIncrement;
                            }
                        }
                        catch (IOException) // LUCENENET: IDE0059: Remove unnecessary value assignment
                        {
                            // safe to ignore, because we know the number of tokens
                        }

                        FieldQueryNode newFieldNode = new FieldQueryNode(field, term, -1, -1);

                        if (this.positionIncrementsEnabled)
                        {
                            position += positionIncrement;
                            newFieldNode.PositionIncrement = position;
                        }
                        else
                        {
                            newFieldNode.PositionIncrement = i;
                        }

                        pq.Add(newFieldNode);
                    }

                    return(pq);
                }
            }

            return(node);
        }
예제 #55
0
 public MyTokenStream()
 {
     TermAtt   = AddAttribute <ICharTermAttribute>();
     PosIncAtt = AddAttribute <IPositionIncrementAttribute>();
 }
예제 #56
0
 public OverlappingTokenStream()
 {
     termAttribute              = AddAttribute <ICharTermAttribute>();
     offsetAttribute            = AddAttribute <IOffsetAttribute>();
     positionIncrementAttribute = AddAttribute <IPositionIncrementAttribute>();
 }
예제 #57
0
        /// <summary>
        /// Not an explicit test, just useful to print out some info on performance
        /// </summary>
        public virtual void Performance()
        {
            int[] tokCount  = new int[] { 100, 500, 1000, 2000, 5000, 10000 };
            int[] modCounts = new int[] { 1, 2, 5, 10, 20, 50, 100, 200, 500 };
            for (int k = 0; k < tokCount.Length; k++)
            {
                StringBuilder buffer = new StringBuilder();
                Console.WriteLine("-----Tokens: " + tokCount[k] + "-----");
                for (int i = 0; i < tokCount[k]; i++)
                {
                    //buffer.Append(English.intToEnglish(i).toUpperCase(Locale.ROOT)).Append(' ');
                    buffer.Append(i.ToString(CultureInfo.InvariantCulture)).Append(' ');
                }
                //make sure we produce the same tokens
                TeeSinkTokenFilter teeStream = new TeeSinkTokenFilter(new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString()))));
                TokenStream        sink      = teeStream.NewSinkTokenStream(new ModuloSinkFilter(this, 100));
                teeStream.ConsumeAllTokens();
                TokenStream        stream  = new ModuloTokenFilter(this, new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString()))), 100);
                ICharTermAttribute tfTok   = stream.AddAttribute <ICharTermAttribute>();
                ICharTermAttribute sinkTok = sink.AddAttribute <ICharTermAttribute>();
                for (int i = 0; stream.IncrementToken(); i++)
                {
                    assertTrue(sink.IncrementToken());
                    assertTrue(tfTok + " is not equal to " + sinkTok + " at token: " + i, tfTok.Equals(sinkTok) == true);
                }

                //simulate two fields, each being analyzed once, for 20 documents
                for (int j = 0; j < modCounts.Length; j++)
                {
                    int tfPos = 0;
                    //long start = DateTimeHelperClass.CurrentUnixTimeMillis();
                    long start = Environment.TickCount;
                    for (int i = 0; i < 20; i++)
                    {
                        stream = new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString())));
                        IPositionIncrementAttribute posIncrAtt = stream.GetAttribute <IPositionIncrementAttribute>();
                        while (stream.IncrementToken())
                        {
                            tfPos += posIncrAtt.PositionIncrement;
                        }
                        stream     = new ModuloTokenFilter(this, new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString()))), modCounts[j]);
                        posIncrAtt = stream.GetAttribute <IPositionIncrementAttribute>();
                        while (stream.IncrementToken())
                        {
                            tfPos += posIncrAtt.PositionIncrement;
                        }
                    }
                    //long finish = DateTimeHelperClass.CurrentUnixTimeMillis();
                    long finish = Environment.TickCount;
                    Console.WriteLine("ModCount: " + modCounts[j] + " Two fields took " + (finish - start) + " ms");
                    int sinkPos = 0;
                    //simulate one field with one sink
                    //start = DateTimeHelperClass.CurrentUnixTimeMillis();
                    start = Environment.TickCount;
                    for (int i = 0; i < 20; i++)
                    {
                        teeStream = new TeeSinkTokenFilter(new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString()))));
                        sink      = teeStream.NewSinkTokenStream(new ModuloSinkFilter(this, modCounts[j]));
                        IPositionIncrementAttribute posIncrAtt = teeStream.GetAttribute <IPositionIncrementAttribute>();
                        while (teeStream.IncrementToken())
                        {
                            sinkPos += posIncrAtt.PositionIncrement;
                        }
                        //System.out.println("Modulo--------");
                        posIncrAtt = sink.GetAttribute <IPositionIncrementAttribute>();
                        while (sink.IncrementToken())
                        {
                            sinkPos += posIncrAtt.PositionIncrement;
                        }
                    }
                    //finish = DateTimeHelperClass.CurrentUnixTimeMillis();
                    finish = Environment.TickCount;
                    Console.WriteLine("ModCount: " + modCounts[j] + " Tee fields took " + (finish - start) + " ms");
                    assertTrue(sinkPos + " does not equal: " + tfPos, sinkPos == tfPos);
                }
                Console.WriteLine("- End Tokens: " + tokCount[k] + "-----");
            }
        }
예제 #58
0
        public void TestPayloads()
        {
            Directory         dir    = NewDirectory();
            RandomIndexWriter writer = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
                this,
#endif
                Random, dir);
            FieldType myFieldType = new FieldType(TextField.TYPE_NOT_STORED);

            myFieldType.StoreTermVectors         = (true);
            myFieldType.StoreTermVectorOffsets   = (true);
            myFieldType.StoreTermVectorPositions = (true);
            myFieldType.StoreTermVectorPayloads  = (true);

            curOffset = 0;

            Token[] tokens = new Token[] {
                getToken("foxes"),
                getToken("can"),
                getToken("jump"),
                getToken("high")
            };

            Document doc = new Document();

            doc.Add(new Field("field", new CannedTokenStream(tokens), myFieldType));
            writer.AddDocument(doc);

            IndexReader reader = writer.GetReader();

            writer.Dispose();
            assertEquals(1, reader.NumDocs);

            for (int i = 0; i < 2; i++)
            {
                // Do this twice, once passing true and then passing
                // false: they are entirely different code paths
                // under-the-hood:
                TokenStream ts = TokenSources.GetTokenStream(reader.GetTermVectors(0).GetTerms("field"), i == 0);

                ICharTermAttribute          termAtt    = ts.GetAttribute <ICharTermAttribute>();
                IPositionIncrementAttribute posIncAtt  = ts.GetAttribute <IPositionIncrementAttribute>();
                IOffsetAttribute            offsetAtt  = ts.GetAttribute <IOffsetAttribute>();
                IPayloadAttribute           payloadAtt = ts.GetAttribute <IPayloadAttribute>();

                foreach (Token token in tokens)
                {
                    assertTrue(ts.IncrementToken());
                    assertEquals(token.toString(), termAtt.toString());
                    assertEquals(token.PositionIncrement, posIncAtt.PositionIncrement);
                    assertEquals(token.Payload, payloadAtt.Payload);
                    assertEquals(token.StartOffset, offsetAtt.StartOffset);
                    assertEquals(token.EndOffset, offsetAtt.EndOffset);
                }

                assertFalse(ts.IncrementToken());
            }

            reader.Dispose();
            dir.Dispose();
        }
            protected internal RandomTokenStream(BaseTermVectorsFormatTestCase outerInstance, int len, string[] sampleTerms, BytesRef[] sampleTermBytes, bool offsetsGoBackwards)
            {
                this.OuterInstance = outerInstance;
                Terms = new string[len];
                TermBytes = new BytesRef[len];
                PositionsIncrements = new int[len];
                Positions = new int[len];
                StartOffsets = new int[len];
                EndOffsets = new int[len];
                Payloads = new BytesRef[len];
                for (int i = 0; i < len; ++i)
                {
                    int o = Random().Next(sampleTerms.Length);
                    Terms[i] = sampleTerms[o];
                    TermBytes[i] = sampleTermBytes[o];
                    PositionsIncrements[i] = TestUtil.NextInt(Random(), i == 0 ? 1 : 0, 10);
                    if (offsetsGoBackwards)
                    {
                        StartOffsets[i] = Random().Next();
                        EndOffsets[i] = Random().Next();
                    }
                    else
                    {
                        if (i == 0)
                        {
                            StartOffsets[i] = TestUtil.NextInt(Random(), 0, 1 << 16);
                        }
                        else
                        {
                            StartOffsets[i] = StartOffsets[i - 1] + TestUtil.NextInt(Random(), 0, Rarely() ? 1 << 16 : 20);
                        }
                        EndOffsets[i] = StartOffsets[i] + TestUtil.NextInt(Random(), 0, Rarely() ? 1 << 10 : 20);
                    }
                }

                for (int i = 0; i < len; ++i)
                {
                    if (i == 0)
                    {
                        Positions[i] = PositionsIncrements[i] - 1;
                    }
                    else
                    {
                        Positions[i] = Positions[i - 1] + PositionsIncrements[i];
                    }
                }
                if (Rarely())
                {
                    Arrays.Fill(Payloads, outerInstance.RandomPayload());
                }
                else
                {
                    for (int i = 0; i < len; ++i)
                    {
                        Payloads[i] = outerInstance.RandomPayload();
                    }
                }

                PositionToTerms = new Dictionary<int?, ISet<int?>>(len);
                StartOffsetToTerms = new Dictionary<int?, ISet<int?>>(len);
                for (int i = 0; i < len; ++i)
                {
                    if (!PositionToTerms.ContainsKey(Positions[i]))
                    {
                        PositionToTerms[Positions[i]] = new HashSet<int?>();//size1
                    }
                    PositionToTerms[Positions[i]].Add(i);
                    if (!StartOffsetToTerms.ContainsKey(StartOffsets[i]))
                    {
                        StartOffsetToTerms[StartOffsets[i]] = new HashSet<int?>();//size1
                    }
                    StartOffsetToTerms[StartOffsets[i]].Add(i);
                }

                Freqs = new Dictionary<string, int?>();
                foreach (string term in Terms)
                {
                    if (Freqs.ContainsKey(term))
                    {
                        Freqs[term] = Freqs[term] + 1;
                    }
                    else
                    {
                        Freqs[term] = 1;
                    }
                }

                AddAttributeImpl(new PermissiveOffsetAttributeImpl());

                TermAtt = AddAttribute<ICharTermAttribute>();
                PiAtt = AddAttribute<IPositionIncrementAttribute>();
                OAtt = AddAttribute<IOffsetAttribute>();
                PAtt = AddAttribute<IPayloadAttribute>();
            }
 public SinglePositionTokenStream(string word)
 {
     termAtt = AddAttribute<ICharTermAttribute>();
     posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
     this.word = word;
     returned = true;
 }