/// <summary>
        /// Create a new <seealso cref="PatternKeywordMarkerFilter"/>, that marks the current
        /// token as a keyword if the tokens term buffer matches the provided
        /// <seealso cref="Pattern"/> via the <seealso cref="KeywordAttribute"/>.
        /// </summary>
        /// <param name="in">
        ///          TokenStream to filter </param>
        /// <param name="pattern">
        ///          the pattern to apply to the incoming term buffer
        ///  </param>
        public PatternKeywordMarkerFilter(TokenStream @in, Pattern pattern)
            : base(@in)
        {
            termAtt = AddAttribute<ICharTermAttribute>();

            this.matcher = pattern.matcher("");
        }
Esempio n. 2
0
 public SimplePayloadFilter(TokenStream input)
     : base(input)
 {
     Pos = 0;
     PayloadAttr = input.AddAttribute<IPayloadAttribute>();
     TermAttr = input.AddAttribute<ICharTermAttribute>();
 }
Esempio n. 3
0
 /// <summary>
 /// Create a new MockTokenFilter.
 /// </summary>
 /// <param name="input"> TokenStream to filter </param>
 /// <param name="filter"> DFA representing the terms that should be removed. </param>
 public MockTokenFilter(TokenStream input, CharacterRunAutomaton filter)
     : base(input)
 {
     this.Filter = filter;
     TermAtt = AddAttribute<ICharTermAttribute>();
     PosIncrAtt = AddAttribute<IPositionIncrementAttribute>();
 }
        protected CompoundWordTokenFilterBase(LuceneVersion matchVersion, TokenStream input, CharArraySet dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, bool onlyLongestMatch)
            : base(input)
        {
            termAtt = AddAttribute<ICharTermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
            posIncAtt = AddAttribute<IPositionIncrementAttribute>();

            this.matchVersion = matchVersion;
            this.tokens = new LinkedList<CompoundToken>();
            if (minWordSize < 0)
            {
                throw new System.ArgumentException("minWordSize cannot be negative");
            }
            this.minWordSize = minWordSize;
            if (minSubwordSize < 0)
            {
                throw new System.ArgumentException("minSubwordSize cannot be negative");
            }
            this.minSubwordSize = minSubwordSize;
            if (maxSubwordSize < 0)
            {
                throw new System.ArgumentException("maxSubwordSize cannot be negative");
            }
            this.maxSubwordSize = maxSubwordSize;
            this.onlyLongestMatch = onlyLongestMatch;
            this.dictionary = dictionary;
        }
Esempio n. 5
0
 /// <summary>
 /// Create a new UpperCaseFilter, that normalizes token text to upper case.
 /// </summary>
 /// <param name="matchVersion"> See <a href="#version">above</a> </param>
 /// <param name="in"> TokenStream to filter </param>
 public UpperCaseFilter(LuceneVersion matchVersion, TokenStream @in)
     : base(@in)
 {
     termAtt = AddAttribute<ICharTermAttribute>();
     termAtt = AddAttribute<ICharTermAttribute>();
     charUtils = CharacterUtils.GetInstance(matchVersion);
 }
Esempio n. 6
0
        public ChineseFilter(TokenStream @in)
            : base(@in)
        {

            stopTable = new CharArraySet(LuceneVersion.LUCENE_CURRENT, Arrays.AsList(STOP_WORDS), false);
            termAtt = AddAttribute<ICharTermAttribute>();
        }
 /// <summary>
 /// Creates a new NorwegianLightStemFilter </summary>
 /// <param name="flags"> set to <seealso cref="NorwegianLightStemmer#BOKMAAL"/>, 
 ///                     <seealso cref="NorwegianLightStemmer#NYNORSK"/>, or both. </param>
 public NorwegianMinimalStemFilter(TokenStream input, int flags)
       : base(input)
 {
     this.stemmer = new NorwegianMinimalStemmer(flags);
     termAtt = AddAttribute<ICharTermAttribute>();
     keywordAttr = AddAttribute<IKeywordAttribute>();
 }
 /// <summary>
 /// Create a new <seealso cref="CodepointCountFilter"/>. This will filter out tokens whose
 /// <seealso cref="CharTermAttribute"/> is either too short (<seealso cref="Character#CodePointCount(char[], int, int)"/>
 /// &lt; min) or too long (<seealso cref="Character#codePointCount(char[], int, int)"/> &gt; max). </summary>
 /// <param name="version"> the Lucene match version </param>
 /// <param name="in">      the <seealso cref="TokenStream"/> to consume </param>
 /// <param name="min">     the minimum length </param>
 /// <param name="max">     the maximum length </param>
 public CodepointCountFilter(LuceneVersion version, TokenStream @in, int min, int max)
     : base(version, @in)
 {
     this.min = min;
     this.max = max;
     termAtt = AddAttribute<ICharTermAttribute>();
 }
 /// <summary>
 /// Create a new <seealso cref="ASCIIFoldingFilter"/>.
 /// </summary>
 /// <param name="input">
 ///          TokenStream to filter </param>
 /// <param name="preserveOriginal">
 ///          should the original tokens be kept on the input stream with a 0 position increment
 ///          from the folded tokens?
 ///  </param>
 public ASCIIFoldingFilter(TokenStream input, bool preserveOriginal)
     : base(input)
 {
     this.preserveOriginal = preserveOriginal;
     termAtt = AddAttribute<ICharTermAttribute>();
     posIncAttr = AddAttribute<IPositionIncrementAttribute>();
 }
 /// <summary>
 /// Create a new IndonesianStemFilter.
 /// <para>
 /// If <code>stemDerivational</code> is false, 
 /// only inflectional suffixes (particles and possessive pronouns) are stemmed.
 /// </para>
 /// </summary>
 public IndonesianStemFilter(TokenStream input, bool stemDerivational)
       : base(input)
 {
     this.stemDerivational = stemDerivational;
     termAtt = AddAttribute<ICharTermAttribute>();
     keywordAtt = AddAttribute<IKeywordAttribute>();
 }
Esempio n. 11
0
 public SnowballFilter(TokenStream input, SnowballProgram stemmer)
       : base(input)
 {
     this.stemmer = stemmer;
     this.termAtt = AddAttribute<ICharTermAttribute>();
     this.keywordAttr = AddAttribute<IKeywordAttribute>();
 }
 public CannedTokenizer(System.IO.TextReader reader, TokenAndPos[] tokens)
     : base(reader)
 {
     this.tokens = tokens;
     this.termAtt = AddAttribute<ICharTermAttribute>();
     this.posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
 }
Esempio n. 13
0
 /// <summary>
 /// Create filter using the supplied stemming table.
 /// </summary>
 /// <param name="in">input token stream</param>
 /// <param name="stemmer">stemmer</param>
 /// <param name="minLength">For performance reasons words shorter than minLength 
 /// characters are not processed, but simply returned.</param>
 public StempelFilter(TokenStream @in, StempelStemmer stemmer, int minLength)
     : base(@in)
 {
     this.stemmer = stemmer;
     this.minLength = minLength;
     this.termAtt = AddAttribute<ICharTermAttribute>();
     this.keywordAtt = AddAttribute<IKeywordAttribute>();
 }
 /// <summary>
 /// Create a new StemmerOverrideFilter, performing dictionary-based stemming
 /// with the provided <code>dictionary</code>.
 /// <para>
 /// Any dictionary-stemmed terms will be marked with <seealso cref="KeywordAttribute"/>
 /// so that they will not be stemmed with stemmers down the chain.
 /// </para>
 /// </summary>
 public StemmerOverrideFilter(TokenStream input, StemmerOverrideMap stemmerOverrideMap)
       : base(input)
 {
     this.stemmerOverrideMap = stemmerOverrideMap;
     fstReader = stemmerOverrideMap.BytesReader;
     termAtt = AddAttribute<ICharTermAttribute>();
     keywordAtt = AddAttribute<IKeywordAttribute>();
 }
 /// <summary>
 /// Constructs an instance to replace either the first, or all occurances
 /// </summary>
 /// <param name="in"> the TokenStream to process </param>
 /// <param name="pattern"> the pattern (a <seealso cref="Regex"/> object) to apply to each Token </param>
 /// <param name="replacement"> the "replacement string" to substitute, if null a
 ///        blank string will be used. Note that this is not the literal
 ///        string that will be used, '$' and '\' have special meaning. </param>
 /// <param name="all"> if true, all matches will be replaced otherwise just the first match. </param>
 /// <seealso cref= Matcher#quoteReplacement </seealso>
 public PatternReplaceFilter(TokenStream @in, Regex pattern, string replacement, bool all)
       : base(@in)
 {
     this.replacement = (null == replacement) ? "" : replacement;
     this.all = all;
     this.pattern = pattern;
     termAtt = AddAttribute<ICharTermAttribute>();
 }
Esempio n. 16
0
 public GraphTokenizer(TextReader input)
     : base(input)
 {
     TermAtt = AddAttribute<ICharTermAttribute>();
     OffsetAtt = AddAttribute<IOffsetAttribute>();
     PosIncrAtt = AddAttribute<IPositionIncrementAttribute>();
     PosLengthAtt = AddAttribute<IPositionLengthAttribute>();
 }
 public DelimitedPayloadTokenFilter(TokenStream input, char delimiter, IPayloadEncoder encoder)
     : base(input)
 {
     this.delimiter = delimiter;
     this.encoder = encoder;
     termAtt = AddAttribute<ICharTermAttribute>();
     payAtt = AddAttribute<IPayloadAttribute>();
 }
Esempio n. 18
0
 public RepeatingTokenizer(TextReader reader, string val, Random random, float percentDocs, int maxTF)
     : base(reader)
 {
     this.Value = val;
     this.Random = random;
     this.PercentDocs = percentDocs;
     this.MaxTF = maxTF;
     this.TermAtt = AddAttribute<ICharTermAttribute>();
 }
 /// <summary>
 /// Sole constructor. </summary>
 public SuggestStopFilter(TokenStream input, CharArraySet stopWords)
     : base(input)
 {
     this.stopWords = stopWords;
     this.termAtt = AddAttribute<ICharTermAttribute>();
     this.posIncAtt = AddAttribute<IPositionIncrementAttribute>();
     this.keywordAtt = AddAttribute<IKeywordAttribute>();
     this.offsetAtt = AddAttribute<IOffsetAttribute>();
 }
 public TruncateTokenFilter(TokenStream input, int length) : base(input)
 {
     if (length < 1)
     {
         throw new System.ArgumentOutOfRangeException("length parameter must be a positive number: " + length);
     }
     this.length = length;
     this.termAttribute = AddAttribute<ICharTermAttribute>();
     this.keywordAttr = AddAttribute<IKeywordAttribute>();
 }
 /// <summary>
 /// Pop one input token's worth of tokens off the filter and verify that they are as expected.
 /// </summary>
 internal virtual void assertNextTerms(string expectedUnfolded, string expectedFolded, ASCIIFoldingFilter filter, ICharTermAttribute termAtt)
 {
     assertTrue(filter.IncrementToken());
     assertEquals(expectedFolded, termAtt.ToString());
     if (filter.PreserveOriginal && !expectedUnfolded.Equals(expectedFolded))
     {
         assertTrue(filter.IncrementToken());
         assertEquals(expectedUnfolded, termAtt.ToString());
     }
 }
Esempio n. 22
0
 /// <summary>
 /// Construct a token stream filtering the given input using a Set of common
 /// words to create bigrams. Outputs both unigrams with position increment and
 /// bigrams with position increment 0 type=gram where one or both of the words
 /// in a potential bigram are in the set of common words .
 /// </summary>
 /// <param name="input"> TokenStream input in filter chain </param>
 /// <param name="commonWords"> The set of common words. </param>
 public CommonGramsFilter(LuceneVersion matchVersion, TokenStream input, CharArraySet commonWords)
     : base(input)
 {
     termAttribute = AddAttribute<ICharTermAttribute>();
     offsetAttribute = AddAttribute<IOffsetAttribute>();
     typeAttribute = AddAttribute<ITypeAttribute>();
     posIncAttribute = AddAttribute<IPositionIncrementAttribute>();
     posLenAttribute = AddAttribute<IPositionLengthAttribute>();
     this.commonWords = commonWords;
 }
Esempio n. 23
0
 public MockPayloadFilter(TokenStream input, string fieldName)
     : base(input)
 {
     this.FieldName = fieldName;
     Pos = 0;
     i = 0;
     PosIncrAttr = input.AddAttribute<IPositionIncrementAttribute>();
     PayloadAttr = input.AddAttribute<IPayloadAttribute>();
     TermAttr = input.AddAttribute<ICharTermAttribute>();
 }
Esempio n. 24
0
 /// <summary>
 /// Creates a new ThaiTokenizer, supplying the AttributeFactory </summary>
 public ThaiTokenizer(AttributeFactory factory, TextReader reader)
       : base(factory, reader, BreakIterator.CreateSentenceInstance(Locale.GetUS()))
 {
     if (!DBBI_AVAILABLE)
     {
         throw new System.NotSupportedException("This JRE does not have support for Thai segmentation");
     }
     wordBreaker = new ThaiWordBreaker(BreakIterator.CreateWordInstance(Locale.GetUS()));
     termAtt = AddAttribute<ICharTermAttribute>();
     offsetAtt = AddAttribute<IOffsetAttribute>();
 }
Esempio n. 25
0
 public IterTokenStream(params Token[] tokens)
         : base()
 {
     this.tokens = tokens;
     this.termAtt = AddAttribute<ICharTermAttribute>();
     this.offsetAtt = AddAttribute<IOffsetAttribute>();
     this.posIncAtt = AddAttribute<IPositionIncrementAttribute>();
     this.flagsAtt = AddAttribute<IFlagsAttribute>();
     this.typeAtt = AddAttribute<ITypeAttribute>();
     this.payloadAtt = AddAttribute<IPayloadAttribute>();
 }
Esempio n. 26
0
 public TrimFilter(LuceneVersion version, TokenStream @in, bool updateOffsets)
     : base(@in)
 {
     if (updateOffsets && version.OnOrAfter(LuceneVersion.LUCENE_44))
     {
         throw new System.ArgumentException("updateOffsets=true is not supported anymore as of Lucene 4.4");
     }
     termAtt = AddAttribute<ICharTermAttribute>();
     offsetAtt = AddAttribute<IOffsetAttribute>();
     this.updateOffsets = updateOffsets;
 }
Esempio n. 27
0
        private bool hasIllegalOffsets = false; // only if the length changed before this filter

        /// <summary>
        /// Creates a new ThaiWordFilter with the specified match version. </summary>
        public ThaiWordFilter(LuceneVersion matchVersion, TokenStream input)
              : base(matchVersion.OnOrAfter(LuceneVersion.LUCENE_31) ? input : new LowerCaseFilter(matchVersion, input))
        {
            if (!DBBI_AVAILABLE)
            {
                throw new System.NotSupportedException("This JRE does not have support for Thai segmentation");
            }
            handlePosIncr = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
            termAtt = AddAttribute<ICharTermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
            posAtt = AddAttribute<IPositionIncrementAttribute>();
        }
        public KeywordTokenizer(AttributeSource.AttributeFactory factory, Reader input, int bufferSize)
            : base(factory, input)
        {
            termAtt = AddAttribute<ICharTermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();

            if (bufferSize <= 0)
            {
                throw new System.ArgumentException("bufferSize must be > 0");
            }
            termAtt.ResizeBuffer(bufferSize);
        }
Esempio n. 29
0
 /// <summary>
 /// Create a new <seealso cref="LengthFilter"/>. This will filter out tokens whose
 /// <seealso cref="CharTermAttribute"/> is either too short (<seealso cref="CharTermAttribute#length()"/>
 /// &lt; min) or too long (<seealso cref="CharTermAttribute#length()"/> &gt; max). </summary>
 /// <param name="version"> the Lucene match version </param>
 /// <param name="in">      the <seealso cref="TokenStream"/> to consume </param>
 /// <param name="min">     the minimum length </param>
 /// <param name="max">     the maximum length </param>
 public LengthFilter(LuceneVersion version, TokenStream @in, int min, int max)
     : base(version, @in)
 {
     if (min < 0)
     {
         throw new ArgumentOutOfRangeException("minimum length must be greater than or equal to zero");
     }
     if (min > max)
     {
         throw new ArgumentOutOfRangeException("maximum length must not be greater than minimum length");
     }
     this.min = min;
     this.max = max;
     this.termAtt = AddAttribute<ICharTermAttribute>();
 }
Esempio n. 30
0
 /// <summary>
 /// If inputText is non-null, and the TokenStream has
 ///  offsets, we include the surface form in each arc's
 ///  label.
 /// </summary>
 public TokenStreamToDot(string inputText, TokenStream @in, TextWriter @out)
 {
     this.@in = @in;
     this.@out = @out;
     this.InputText = inputText;
     TermAtt = @in.AddAttribute<ICharTermAttribute>();
     PosIncAtt = @in.AddAttribute<IPositionIncrementAttribute>();
     PosLengthAtt = @in.AddAttribute<IPositionLengthAttribute>();
     if (@in.HasAttribute<IOffsetAttribute>())
     {
         OffsetAtt = @in.AddAttribute<IOffsetAttribute>();
     }
     else
     {
         OffsetAtt = null;
     }
 }
 internal WordTokenFilter(TypeAsPayloadTokenFilterTest outerInstance, TokenStream input) : base(input)
 {
     this.outerInstance = outerInstance;
     termAtt            = AddAttribute <ICharTermAttribute>();
     typeAtt            = AddAttribute <ITypeAttribute>();
 }
Esempio n. 32
0
 /// <summary>
 /// Pop one input token's worth of tokens off the filter and verify that they are as expected.
 /// </summary>
 internal virtual void assertNextTerms(string expectedUnfolded, string expectedFolded, ASCIIFoldingFilter filter, ICharTermAttribute termAtt)
 {
     assertTrue(filter.IncrementToken());
     assertEquals(expectedFolded, termAtt.ToString());
     if (filter.PreserveOriginal && !expectedUnfolded.Equals(expectedFolded, StringComparison.Ordinal))
     {
         assertTrue(filter.IncrementToken());
         assertEquals(expectedUnfolded, termAtt.ToString());
     }
 }
            protected internal RandomTokenStream(BaseTermVectorsFormatTestCase baseTermVectorsFormatTestCase, int len, string[] sampleTerms, BytesRef[] sampleTermBytes, bool offsetsGoBackwards)
            {
                terms               = new string[len];
                termBytes           = new BytesRef[len];
                positionsIncrements = new int[len];
                positions           = new int[len];
                startOffsets        = new int[len];
                endOffsets          = new int[len];
                payloads            = new BytesRef[len];
                for (int i = 0; i < len; ++i)
                {
                    int o = Random.Next(sampleTerms.Length);
                    terms[i]               = sampleTerms[o];
                    termBytes[i]           = sampleTermBytes[o];
                    positionsIncrements[i] = TestUtil.NextInt32(Random, i == 0 ? 1 : 0, 10);
                    if (offsetsGoBackwards)
                    {
                        startOffsets[i] = Random.Next();
                        endOffsets[i]   = Random.Next();
                    }
                    else
                    {
                        if (i == 0)
                        {
                            startOffsets[i] = TestUtil.NextInt32(Random, 0, 1 << 16);
                        }
                        else
                        {
                            startOffsets[i] = startOffsets[i - 1] + TestUtil.NextInt32(Random, 0, Rarely() ? 1 << 16 : 20);
                        }
                        endOffsets[i] = startOffsets[i] + TestUtil.NextInt32(Random, 0, Rarely() ? 1 << 10 : 20);
                    }
                }

                for (int i = 0; i < len; ++i)
                {
                    if (i == 0)
                    {
                        positions[i] = positionsIncrements[i] - 1;
                    }
                    else
                    {
                        positions[i] = positions[i - 1] + positionsIncrements[i];
                    }
                }
                if (Rarely())
                {
                    Arrays.Fill(payloads, baseTermVectorsFormatTestCase.RandomPayload());
                }
                else
                {
                    for (int i = 0; i < len; ++i)
                    {
                        payloads[i] = baseTermVectorsFormatTestCase.RandomPayload();
                    }
                }

                positionToTerms    = new Dictionary <int?, ISet <int?> >(len);
                startOffsetToTerms = new Dictionary <int?, ISet <int?> >(len);
                for (int i = 0; i < len; ++i)
                {
                    if (!positionToTerms.TryGetValue(positions[i], out ISet <int?> positionTerms))
                    {
                        positionToTerms[positions[i]] = positionTerms = new JCG.HashSet <int?>(1);
                    }
                    positionTerms.Add(i);
                    if (!startOffsetToTerms.TryGetValue(startOffsets[i], out ISet <int?> startOffsetTerms))
                    {
                        startOffsetToTerms[startOffsets[i]] = startOffsetTerms = new JCG.HashSet <int?>(1);
                    }
                    startOffsetTerms.Add(i);
                }

                freqs = new Dictionary <string, int?>();
                foreach (string term in terms)
                {
                    if (freqs.TryGetValue(term, out int?freq))
                    {
                        freqs[term] = freq + 1;
                    }
                    else
                    {
                        freqs[term] = 1;
                    }
                }

                AddAttributeImpl(new PermissiveOffsetAttribute());

                termAtt = AddAttribute <ICharTermAttribute>();
                piAtt   = AddAttribute <IPositionIncrementAttribute>();
                oAtt    = AddAttribute <IOffsetAttribute>();
                pAtt    = AddAttribute <IPayloadAttribute>();
            }
Esempio n. 34
0
        public virtual void TestCaptureState()
        {
            // init a first instance
            AttributeSource    src     = new AttributeSource();
            ICharTermAttribute termAtt = src.AddAttribute <ICharTermAttribute>();
            ITypeAttribute     typeAtt = src.AddAttribute <ITypeAttribute>();

            termAtt.Append("TestTerm");
            typeAtt.Type = "TestType";
            int hashCode = src.GetHashCode();

            AttributeSource.State state = src.CaptureState();

            // modify the attributes
            termAtt.SetEmpty().Append("AnotherTestTerm");
            typeAtt.Type = "AnotherTestType";
            Assert.IsTrue(hashCode != src.GetHashCode(), "Hash code should be different");

            src.RestoreState(state);
            Assert.AreEqual(termAtt.ToString(), "TestTerm");
            Assert.AreEqual(typeAtt.Type, "TestType");
            Assert.AreEqual(hashCode, src.GetHashCode(), "Hash code should be equal after restore");

            // restore into an exact configured copy
            AttributeSource copy = new AttributeSource();

            copy.AddAttribute <ICharTermAttribute>();
            copy.AddAttribute <ITypeAttribute>();
            copy.RestoreState(state);
            Assert.AreEqual(src.GetHashCode(), copy.GetHashCode(), "Both AttributeSources should have same hashCode after restore");
            Assert.AreEqual(src, copy, "Both AttributeSources should be equal after restore");

            // init a second instance (with attributes in different order and one additional attribute)
            AttributeSource src2 = new AttributeSource();

            typeAtt = src2.AddAttribute <ITypeAttribute>();
            IFlagsAttribute flagsAtt = src2.AddAttribute <IFlagsAttribute>();

            termAtt        = src2.AddAttribute <ICharTermAttribute>();
            flagsAtt.Flags = 12345;

            src2.RestoreState(state);
            Assert.AreEqual(termAtt.ToString(), "TestTerm");
            Assert.AreEqual(typeAtt.Type, "TestType");
            Assert.AreEqual(12345, flagsAtt.Flags, "FlagsAttribute should not be touched");

            // init a third instance missing one Attribute
            AttributeSource src3 = new AttributeSource();

            termAtt = src3.AddAttribute <ICharTermAttribute>();
            try
            {
                src3.RestoreState(state);
                Assert.Fail("The third instance is missing the TypeAttribute, so restoreState() should throw IllegalArgumentException");
            }
#pragma warning disable 168
            catch (System.ArgumentException iae)
#pragma warning restore 168
            {
                // pass
            }
        }
Esempio n. 35
0
 public BugReproTokenStream()
 {
     termAtt   = AddAttribute <ICharTermAttribute>();
     offsetAtt = AddAttribute <IOffsetAttribute>();
     posIncAtt = AddAttribute <IPositionIncrementAttribute>();
 }
Esempio n. 36
0
 public WholeSentenceTokenizer(TextReader reader)
     : base(reader, BreakIterator.CreateSentenceInstance(Locale.GetUS()))
 {
     termAtt   = AddAttribute <ICharTermAttribute>();
     offsetAtt = AddAttribute <IOffsetAttribute>();
 }
Esempio n. 37
0
 public InputWindowToken(AttributeSource attSource)
 {
     this.attSource = attSource;
     this.termAtt   = attSource.GetAttribute <ICharTermAttribute>();
     this.offsetAtt = attSource.GetAttribute <IOffsetAttribute>();
 }
Esempio n. 38
0
 /// <summary>
 /// Create a new <see cref="KeepWordFilter"/>.
 /// <para><c>NOTE</c>: The words set passed to this constructor will be directly
 /// used by this filter and should not be modified.
 /// </para>
 /// </summary>
 /// <param name="version"> the Lucene match version </param>
 /// <param name="in">      the <see cref="TokenStream"/> to consume </param>
 /// <param name="words">   the words to keep </param>
 public KeepWordFilter(LuceneVersion version, TokenStream @in, CharArraySet words)
     : base(version, @in)
 {
     this.words = words;
     termAtt    = AddAttribute <ICharTermAttribute>();
 }
Esempio n. 39
0
 public KeepWordFilter(LuceneVersion version, bool enablePositionIncrements, TokenStream @in, CharArraySet words)
     : base(version, enablePositionIncrements, @in)
 {
     this.words = words;
     termAtt    = AddAttribute <ICharTermAttribute>();
 }
Esempio n. 40
0
 public GreekStemFilter(TokenStream input)
     : base(input)
 {
     termAtt     = AddAttribute <ICharTermAttribute>();
     keywordAttr = AddAttribute <IKeywordAttribute>();
 }
Esempio n. 41
0
 /// <summary>
 /// Creates a new <see cref="HyphenatedWordsFilter"/>
 /// </summary>
 /// <param name="in"> <see cref="TokenStream"/> that will be filtered </param>
 public HyphenatedWordsFilter(TokenStream @in)
     : base(@in)
 {
     termAttribute   = AddAttribute <ICharTermAttribute>();
     offsetAttribute = AddAttribute <IOffsetAttribute>();
 }
Esempio n. 42
0
 public ChineseFilter(TokenStream @in)
     : base(@in)
 {
     stopTable = new CharArraySet(LuceneVersion.LUCENE_CURRENT, Arrays.AsList(STOP_WORDS), false);
     termAtt   = AddAttribute <ICharTermAttribute>();
 }
Esempio n. 43
0
 public MyTokenStream()
 {
     TermAtt = AddAttribute <ICharTermAttribute>();
 }
Esempio n. 44
0
 public MockGraphTokenFilter(Random random, TokenStream input)
     : base(input)
 {
     seed    = random.NextInt64();
     termAtt = AddAttribute <ICharTermAttribute>();
 }
Esempio n. 45
0
        public virtual void TestRandom()
        {
            int alphabetSize = TestUtil.NextInt32(Random, 2, 7);

            int docLen = AtLeast(3000);
            //final int docLen = 50;

            string document = GetRandomString('a', alphabetSize, docLen);

            if (Verbose)
            {
                Console.WriteLine("TEST: doc=" + document);
            }

            int numSyn = AtLeast(5);
            //final int numSyn = 2;

            IDictionary <string, OneSyn> synMap = new Dictionary <string, OneSyn>();
            IList <OneSyn> syns  = new JCG.List <OneSyn>();
            bool           dedup = Random.nextBoolean();

            if (Verbose)
            {
                Console.WriteLine("  dedup=" + dedup);
            }
            b = new SynonymMap.Builder(dedup);
            for (int synIDX = 0; synIDX < numSyn; synIDX++)
            {
                string synIn = GetRandomString('a', alphabetSize, TestUtil.NextInt32(Random, 1, 5)).Trim();
                if (!synMap.TryGetValue(synIn, out OneSyn s) || s is null)
                {
                    s     = new OneSyn();
                    s.@in = synIn;
                    syns.Add(s);
                    s.@out        = new JCG.List <string>();
                    synMap[synIn] = s;
                    s.keepOrig    = Random.nextBoolean();
                }
                string synOut = GetRandomString('0', 10, TestUtil.NextInt32(Random, 1, 5)).Trim();
                [email protected](synOut);
                Add(synIn, synOut, s.keepOrig);
                if (Verbose)
                {
                    Console.WriteLine("  syns[" + synIDX + "] = " + s.@in + " -> " + s.@out + " keepOrig=" + s.keepOrig);
                }
            }

            tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
            tokensIn.Reset();
            assertTrue(tokensIn.IncrementToken());
            assertFalse(tokensIn.IncrementToken());
            tokensIn.End();
            tokensIn.Dispose();

            tokensOut  = new SynonymFilter(tokensIn, b.Build(), true);
            termAtt    = tokensOut.AddAttribute <ICharTermAttribute>();
            posIncrAtt = tokensOut.AddAttribute <IPositionIncrementAttribute>();
            posLenAtt  = tokensOut.AddAttribute <IPositionLengthAttribute>();
            offsetAtt  = tokensOut.AddAttribute <IOffsetAttribute>();

            if (dedup)
            {
                PruneDups(syns);
            }

            string expected = SlowSynMatcher(document, syns, 5);

            if (Verbose)
            {
                Console.WriteLine("TEST: expected=" + expected);
            }

            Verify(document, expected);
        }
 public PortugueseMinimalStemFilter(TokenStream input)
     : base(input)
 {
     termAtt     = AddAttribute <ICharTermAttribute>();
     keywordAttr = AddAttribute <IKeywordAttribute>();
 }
 internal LargePosIncTokenFilter(TestWordDelimiterFilter outerInstance, TokenStream input) : base(input)
 {
     this.outerInstance = outerInstance;
     this.termAtt       = AddAttribute <ICharTermAttribute>();
     this.posIncAtt     = AddAttribute <IPositionIncrementAttribute>();
 }
Esempio n. 48
0
 public CrazyTokenFilter(TokenStream input)
     : base(input)
 {
     termAtt = AddAttribute <ICharTermAttribute>();
 }
Esempio n. 49
0
        public virtual void TestLatin1Accents()
        {
            TokenStream        stream = new MockTokenizer(new StringReader("Des mot clés À LA CHAÎNE À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï IJ Ð Ñ" + " Ò Ó Ô Õ Ö Ø Œ Þ Ù Ú Û Ü Ý Ÿ à á â ã ä å æ ç è é ê ë ì í î ï ij" + " ð ñ ò ó ô õ ö ø œ ß þ ù ú û ü ý ÿ fi fl"), MockTokenizer.WHITESPACE, false);
            ASCIIFoldingFilter filter = new ASCIIFoldingFilter(stream, Random.nextBoolean());

            ICharTermAttribute termAtt = filter.GetAttribute <ICharTermAttribute>();

            filter.Reset();
            assertNextTerms("Des", "Des", filter, termAtt);
            assertNextTerms("mot", "mot", filter, termAtt);
            assertNextTerms("clés", "cles", filter, termAtt);
            assertNextTerms("À", "A", filter, termAtt);
            assertNextTerms("LA", "LA", filter, termAtt);
            assertNextTerms("CHAÎNE", "CHAINE", filter, termAtt);
            assertNextTerms("À", "A", filter, termAtt);
            assertNextTerms("Á", "A", filter, termAtt);
            assertNextTerms("Â", "A", filter, termAtt);
            assertNextTerms("Ã", "A", filter, termAtt);
            assertNextTerms("Ä", "A", filter, termAtt);
            assertNextTerms("Å", "A", filter, termAtt);
            assertNextTerms("Æ", "AE", filter, termAtt);
            assertNextTerms("Ç", "C", filter, termAtt);
            assertNextTerms("È", "E", filter, termAtt);
            assertNextTerms("É", "E", filter, termAtt);
            assertNextTerms("Ê", "E", filter, termAtt);
            assertNextTerms("Ë", "E", filter, termAtt);
            assertNextTerms("Ì", "I", filter, termAtt);
            assertNextTerms("Í", "I", filter, termAtt);
            assertNextTerms("Î", "I", filter, termAtt);
            assertNextTerms("Ï", "I", filter, termAtt);
            assertNextTerms("IJ", "IJ", filter, termAtt);
            assertNextTerms("Ð", "D", filter, termAtt);
            assertNextTerms("Ñ", "N", filter, termAtt);
            assertNextTerms("Ò", "O", filter, termAtt);
            assertNextTerms("Ó", "O", filter, termAtt);
            assertNextTerms("Ô", "O", filter, termAtt);
            assertNextTerms("Õ", "O", filter, termAtt);
            assertNextTerms("Ö", "O", filter, termAtt);
            assertNextTerms("Ø", "O", filter, termAtt);
            assertNextTerms("Œ", "OE", filter, termAtt);
            assertNextTerms("Þ", "TH", filter, termAtt);
            assertNextTerms("Ù", "U", filter, termAtt);
            assertNextTerms("Ú", "U", filter, termAtt);
            assertNextTerms("Û", "U", filter, termAtt);
            assertNextTerms("Ü", "U", filter, termAtt);
            assertNextTerms("Ý", "Y", filter, termAtt);
            assertNextTerms("Ÿ", "Y", filter, termAtt);
            assertNextTerms("à", "a", filter, termAtt);
            assertNextTerms("á", "a", filter, termAtt);
            assertNextTerms("â", "a", filter, termAtt);
            assertNextTerms("ã", "a", filter, termAtt);
            assertNextTerms("ä", "a", filter, termAtt);
            assertNextTerms("å", "a", filter, termAtt);
            assertNextTerms("æ", "ae", filter, termAtt);
            assertNextTerms("ç", "c", filter, termAtt);
            assertNextTerms("è", "e", filter, termAtt);
            assertNextTerms("é", "e", filter, termAtt);
            assertNextTerms("ê", "e", filter, termAtt);
            assertNextTerms("ë", "e", filter, termAtt);
            assertNextTerms("ì", "i", filter, termAtt);
            assertNextTerms("í", "i", filter, termAtt);
            assertNextTerms("î", "i", filter, termAtt);
            assertNextTerms("ï", "i", filter, termAtt);
            assertNextTerms("ij", "ij", filter, termAtt);
            assertNextTerms("ð", "d", filter, termAtt);
            assertNextTerms("ñ", "n", filter, termAtt);
            assertNextTerms("ò", "o", filter, termAtt);
            assertNextTerms("ó", "o", filter, termAtt);
            assertNextTerms("ô", "o", filter, termAtt);
            assertNextTerms("õ", "o", filter, termAtt);
            assertNextTerms("ö", "o", filter, termAtt);
            assertNextTerms("ø", "o", filter, termAtt);
            assertNextTerms("œ", "oe", filter, termAtt);
            assertNextTerms("ß", "ss", filter, termAtt);
            assertNextTerms("þ", "th", filter, termAtt);
            assertNextTerms("ù", "u", filter, termAtt);
            assertNextTerms("ú", "u", filter, termAtt);
            assertNextTerms("û", "u", filter, termAtt);
            assertNextTerms("ü", "u", filter, termAtt);
            assertNextTerms("ý", "y", filter, termAtt);
            assertNextTerms("ÿ", "y", filter, termAtt);
            assertNextTerms("fi", "fi", filter, termAtt);
            assertNextTerms("fl", "fl", filter, termAtt);
            assertFalse(filter.IncrementToken());
        }
 public HungarianLightStemFilter(TokenStream input)
     : base(input)
 {
     termAtt     = AddAttribute <ICharTermAttribute>();
     keywordAttr = AddAttribute <IKeywordAttribute>();
 }
Esempio n. 51
0
        protected override IQueryNode PostProcessNode(IQueryNode node)
        {
            if (node is ITextableQueryNode &&
                !(node is WildcardQueryNode) &&
                !(node is FuzzyQueryNode) &&
                !(node is RegexpQueryNode) &&
                !(node.Parent is IRangeQueryNode))
            {
                FieldQueryNode fieldNode = ((FieldQueryNode)node);
                string         text      = fieldNode.GetTextAsString();
                string         field     = fieldNode.GetFieldAsString();

                CachingTokenFilter          buffer     = null;
                IPositionIncrementAttribute posIncrAtt = null;
                int  numTokens     = 0;
                int  positionCount = 0;
                bool severalTokensAtSamePosition = false;

                TokenStream source = null;
                try
                {
                    source = this.analyzer.GetTokenStream(field, text);
                    source.Reset();
                    buffer = new CachingTokenFilter(source);

                    if (buffer.HasAttribute <IPositionIncrementAttribute>())
                    {
                        posIncrAtt = buffer.GetAttribute <IPositionIncrementAttribute>();
                    }

                    try
                    {
                        while (buffer.IncrementToken())
                        {
                            numTokens++;
                            int positionIncrement = (posIncrAtt != null) ? posIncrAtt
                                                    .PositionIncrement : 1;
                            if (positionIncrement != 0)
                            {
                                positionCount += positionIncrement;
                            }
                            else
                            {
                                severalTokensAtSamePosition = true;
                            }
                        }
                    }
#pragma warning disable 168
                    catch (IOException e)
#pragma warning restore 168
                    {
                        // ignore
                    }
                }
                catch (IOException e)
                {
                    throw new Exception(e.ToString(), e);
                }
                finally
                {
                    IOUtils.DisposeWhileHandlingException(source);
                }

                // rewind the buffer stream
                buffer.Reset();

                if (!buffer.HasAttribute <ICharTermAttribute>())
                {
                    return(new NoTokenFoundQueryNode());
                }

                ICharTermAttribute termAtt = buffer.GetAttribute <ICharTermAttribute>();

                if (numTokens == 0)
                {
                    return(new NoTokenFoundQueryNode());
                }
                else if (numTokens == 1)
                {
                    string term = null;
                    try
                    {
                        bool hasNext;
                        hasNext = buffer.IncrementToken();
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(hasNext == true);
                        }
                        term = termAtt.ToString();
                    }
                    catch (IOException) // LUCENENET: IDE0059: Remove unnecessary value assignment
                    {
                        // safe to ignore, because we know the number of tokens
                    }

                    fieldNode.Text = term.AsCharSequence();

                    return(fieldNode);
                }
                else if (severalTokensAtSamePosition || !(node is QuotedFieldQueryNode))
                {
                    if (positionCount == 1 || !(node is QuotedFieldQueryNode))
                    {
                        // no phrase query:

                        if (positionCount == 1)
                        {
                            // simple case: only one position, with synonyms
                            List <IQueryNode> children = new List <IQueryNode>();

                            for (int i = 0; i < numTokens; i++)
                            {
                                string term = null;
                                try
                                {
                                    bool hasNext = buffer.IncrementToken();
                                    if (Debugging.AssertsEnabled)
                                    {
                                        Debugging.Assert(hasNext == true);
                                    }
                                    term = termAtt.ToString();
                                }
                                catch (IOException) // LUCENENET: IDE0059: Remove unnecessary value assignment
                                {
                                    // safe to ignore, because we know the number of tokens
                                }

                                children.Add(new FieldQueryNode(field, term, -1, -1));
                            }
                            return(new GroupQueryNode(
                                       new StandardBooleanQueryNode(children, positionCount == 1)));
                        }
                        else
                        {
                            // multiple positions
                            IQueryNode q            = new StandardBooleanQueryNode(Collections.EmptyList <IQueryNode>(), false);
                            IQueryNode currentQuery = null;
                            for (int i = 0; i < numTokens; i++)
                            {
                                string term = null;
                                try
                                {
                                    bool hasNext = buffer.IncrementToken();
                                    if (Debugging.AssertsEnabled)
                                    {
                                        Debugging.Assert(hasNext == true);
                                    }
                                    term = termAtt.ToString();
                                }
                                catch (IOException) // LUCENENET: IDE0059: Remove unnecessary value assignment
                                {
                                    // safe to ignore, because we know the number of tokens
                                }
                                if (posIncrAtt != null && posIncrAtt.PositionIncrement == 0)
                                {
                                    if (!(currentQuery is BooleanQueryNode))
                                    {
                                        IQueryNode t = currentQuery;
                                        currentQuery = new StandardBooleanQueryNode(Collections.EmptyList <IQueryNode>(), true);
                                        ((BooleanQueryNode)currentQuery).Add(t);
                                    }
                                    ((BooleanQueryNode)currentQuery).Add(new FieldQueryNode(field, term, -1, -1));
                                }
                                else
                                {
                                    if (currentQuery != null)
                                    {
                                        if (this.defaultOperator == Operator.OR)
                                        {
                                            q.Add(currentQuery);
                                        }
                                        else
                                        {
                                            q.Add(new ModifierQueryNode(currentQuery, Modifier.MOD_REQ));
                                        }
                                    }
                                    currentQuery = new FieldQueryNode(field, term, -1, -1);
                                }
                            }
                            if (this.defaultOperator == Operator.OR)
                            {
                                q.Add(currentQuery);
                            }
                            else
                            {
                                q.Add(new ModifierQueryNode(currentQuery, Modifier.MOD_REQ));
                            }

                            if (q is BooleanQueryNode)
                            {
                                q = new GroupQueryNode(q);
                            }
                            return(q);
                        }
                    }
                    else
                    {
                        // phrase query:
                        MultiPhraseQueryNode mpq = new MultiPhraseQueryNode();

                        List <FieldQueryNode> multiTerms = new List <FieldQueryNode>();
                        int position       = -1;
                        int i              = 0;
                        int termGroupCount = 0;
                        for (; i < numTokens; i++)
                        {
                            string term = null;
                            int    positionIncrement = 1;
                            try
                            {
                                bool hasNext = buffer.IncrementToken();
                                if (Debugging.AssertsEnabled)
                                {
                                    Debugging.Assert(hasNext == true);
                                }
                                term = termAtt.ToString();
                                if (posIncrAtt != null)
                                {
                                    positionIncrement = posIncrAtt.PositionIncrement;
                                }
                            }
                            catch (IOException) // LUCENENET: IDE0059: Remove unnecessary value assignment
                            {
                                // safe to ignore, because we know the number of tokens
                            }

                            if (positionIncrement > 0 && multiTerms.Count > 0)
                            {
                                foreach (FieldQueryNode termNode in multiTerms)
                                {
                                    if (this.positionIncrementsEnabled)
                                    {
                                        termNode.PositionIncrement = position;
                                    }
                                    else
                                    {
                                        termNode.PositionIncrement = termGroupCount;
                                    }

                                    mpq.Add(termNode);
                                }

                                // Only increment once for each "group" of
                                // terms that were in the same position:
                                termGroupCount++;

                                multiTerms.Clear();
                            }

                            position += positionIncrement;
                            multiTerms.Add(new FieldQueryNode(field, term, -1, -1));
                        }

                        foreach (FieldQueryNode termNode in multiTerms)
                        {
                            if (this.positionIncrementsEnabled)
                            {
                                termNode.PositionIncrement = position;
                            }
                            else
                            {
                                termNode.PositionIncrement = termGroupCount;
                            }

                            mpq.Add(termNode);
                        }

                        return(mpq);
                    }
                }
                else
                {
                    TokenizedPhraseQueryNode pq = new TokenizedPhraseQueryNode();

                    int position = -1;

                    for (int i = 0; i < numTokens; i++)
                    {
                        string term = null;
                        int    positionIncrement = 1;

                        try
                        {
                            bool hasNext = buffer.IncrementToken();
                            if (Debugging.AssertsEnabled)
                            {
                                Debugging.Assert(hasNext == true);
                            }
                            term = termAtt.ToString();

                            if (posIncrAtt != null)
                            {
                                positionIncrement = posIncrAtt.PositionIncrement;
                            }
                        }
                        catch (IOException) // LUCENENET: IDE0059: Remove unnecessary value assignment
                        {
                            // safe to ignore, because we know the number of tokens
                        }

                        FieldQueryNode newFieldNode = new FieldQueryNode(field, term, -1, -1);

                        if (this.positionIncrementsEnabled)
                        {
                            position += positionIncrement;
                            newFieldNode.PositionIncrement = position;
                        }
                        else
                        {
                            newFieldNode.PositionIncrement = i;
                        }

                        pq.Add(newFieldNode);
                    }

                    return(pq);
                }
            }

            return(node);
        }
Esempio n. 52
0
        /*
         * Need to worry about multiple scenarios:
         *  - need to go for the longest match
         *    a b => foo      #shouldn't match if "a b" is followed by "c d"
         *    a b c d => bar
         *  - need to backtrack - retry matches for tokens already read
         *     a b c d => foo
         *       b c => bar
         *     If the input stream is "a b c x", one will consume "a b c d"
         *     trying to match the first rule... all but "a" should be
         *     pushed back so a match may be made on "b c".
         *  - don't try and match generated tokens (thus need separate queue)
         *    matching is not recursive.
         *  - handle optional generation of original tokens in all these cases,
         *    merging token streams to preserve token positions.
         *  - preserve original positionIncrement of first matched token
         */
        public override bool IncrementToken()
        {
            while (true)
            {
                // if there are any generated tokens, return them... don't try any
                // matches against them, as we specifically don't want recursion.
                if (replacement != null && replacement.MoveNext())
                {
                    Copy(this, replacement.Current);
                    return(true);
                }

                // common case fast-path of first token not matching anything
                AttributeSource firstTok = NextTok();
                if (firstTok == null)
                {
                    return(false);
                }
                var            termAtt = firstTok.AddAttribute <ICharTermAttribute>();
                SlowSynonymMap result  = map.Submap != null?map.Submap.Get(termAtt.Buffer, 0, termAtt.Length) : null;

                if (result == null)
                {
                    Copy(this, firstTok);
                    return(true);
                }

                // fast-path failed, clone ourselves if needed
                if (firstTok == this)
                {
                    firstTok = CloneAttributes();
                }
                // OK, we matched a token, so find the longest match.

                matched = new LinkedList <AttributeSource>();

                result = Match(result);

                if (result == null)
                {
                    // no match, simply return the first token read.
                    Copy(this, firstTok);
                    return(true);
                }

                // reuse, or create new one each time?
                List <AttributeSource> generated = new List <AttributeSource>(result.Synonyms.Length + matched.Count + 1);

                //
                // there was a match... let's generate the new tokens, merging
                // in the matched tokens (position increments need adjusting)
                //
                AttributeSource lastTok     = matched.Count == 0 ? firstTok : matched.Last.Value;
                bool            includeOrig = result.IncludeOrig;

                AttributeSource             origTok        = includeOrig ? firstTok : null;
                IPositionIncrementAttribute firstPosIncAtt = firstTok.AddAttribute <IPositionIncrementAttribute>();
                int origPos = firstPosIncAtt.PositionIncrement; // position of origTok in the original stream
                int repPos  = 0;                                // curr position in replacement token stream
                int pos     = 0;                                // current position in merged token stream

                for (int i = 0; i < result.Synonyms.Length; i++)
                {
                    Token                       repTok       = result.Synonyms[i];
                    AttributeSource             newTok       = firstTok.CloneAttributes();
                    ICharTermAttribute          newTermAtt   = newTok.AddAttribute <ICharTermAttribute>();
                    IOffsetAttribute            newOffsetAtt = newTok.AddAttribute <IOffsetAttribute>();
                    IPositionIncrementAttribute newPosIncAtt = newTok.AddAttribute <IPositionIncrementAttribute>();

                    IOffsetAttribute lastOffsetAtt = lastTok.AddAttribute <IOffsetAttribute>();

                    newOffsetAtt.SetOffset(newOffsetAtt.StartOffset, lastOffsetAtt.EndOffset);
                    newTermAtt.CopyBuffer(repTok.Buffer, 0, repTok.Length);
                    repPos += repTok.PositionIncrement;
                    if (i == 0) // make position of first token equal to original
                    {
                        repPos = origPos;
                    }

                    // if necessary, insert original tokens and adjust position increment
                    while (origTok != null && origPos <= repPos)
                    {
                        IPositionIncrementAttribute origPosInc = origTok.AddAttribute <IPositionIncrementAttribute>();
                        origPosInc.PositionIncrement = origPos - pos;
                        generated.Add(origTok);
                        pos += origPosInc.PositionIncrement;
                        //origTok = matched.Count == 0 ? null : matched.RemoveFirst();
                        if (matched.Count == 0)
                        {
                            origTok = null;
                        }
                        else
                        {
                            origTok = matched.First.Value;
                            matched.Remove(origTok);
                        }
                        if (origTok != null)
                        {
                            origPosInc = origTok.AddAttribute <IPositionIncrementAttribute>();
                            origPos   += origPosInc.PositionIncrement;
                        }
                    }

                    newPosIncAtt.PositionIncrement = repPos - pos;
                    generated.Add(newTok);
                    pos += newPosIncAtt.PositionIncrement;
                }

                // finish up any leftover original tokens
                while (origTok != null)
                {
                    IPositionIncrementAttribute origPosInc = origTok.AddAttribute <IPositionIncrementAttribute>();
                    origPosInc.PositionIncrement = origPos - pos;
                    generated.Add(origTok);
                    pos += origPosInc.PositionIncrement;
                    if (matched.Count == 0)
                    {
                        origTok = null;
                    }
                    else
                    {
                        origTok = matched.First.Value;
                        matched.Remove(origTok);
                    }
                    if (origTok != null)
                    {
                        origPosInc = origTok.AddAttribute <IPositionIncrementAttribute>();
                        origPos   += origPosInc.PositionIncrement;
                    }
                }

                // what if we replaced a longer sequence with a shorter one?
                // a/0 b/5 =>  foo/0
                // should I re-create the gap on the next buffered token?

                replacement = generated.GetEnumerator();
                // Now return to the top of the loop to read and return the first
                // generated token.. The reason this is done is that we may have generated
                // nothing at all, and may need to continue with more matching logic.
            }
        }
Esempio n. 53
0
 /// <summary>
 /// Create a new <see cref="SetKeywordMarkerFilter"/>, that marks the current token as a
 /// keyword if the tokens term buffer is contained in the given set via the
 /// <see cref="KeywordAttribute"/>.
 /// </summary>
 /// <param name="in">
 ///          <see cref="TokenStream"/> to filter </param>
 /// <param name="keywordSet">
 ///          the keywords set to lookup the current termbuffer </param>
 public SetKeywordMarkerFilter(TokenStream @in, CharArraySet keywordSet)
     : base(@in)
 {
     this.keywordSet = keywordSet;
     termAtt         = AddAttribute <ICharTermAttribute>();
 }
Esempio n. 54
0
        /// <summary>
        /// (non-Javadoc)
        /// @see org.apache.lucene.xmlparser.QueryObjectBuilder#process(org.w3c.dom.Element)
        /// </summary>
        public virtual Query GetQuery(XmlElement e)
        {
            string fieldsList = e.GetAttribute("fieldNames"); //a comma-delimited list of fields

            string[] fields = defaultFieldNames;
            if ((fieldsList != null) && (fieldsList.Trim().Length > 0))
            {
                fields = fieldsList.Trim().Split(',').TrimEnd();
                //trim the fieldnames
                for (int i = 0; i < fields.Length; i++)
                {
                    fields[i] = fields[i].Trim();
                }
            }

            //Parse any "stopWords" attribute
            //TODO MoreLikeThis needs to ideally have per-field stopWords lists - until then
            //I use all analyzers/fields to generate multi-field compatible stop list
            string        stopWords    = e.GetAttribute("stopWords");
            ISet <string> stopWordsSet = null;

            if ((stopWords != null) && (fields != null))
            {
                stopWordsSet = new JCG.HashSet <string>();
                foreach (string field in fields)
                {
                    TokenStream ts = null;
                    try
                    {
                        ts = analyzer.GetTokenStream(field, stopWords);
                        ICharTermAttribute termAtt = ts.AddAttribute <ICharTermAttribute>();
                        ts.Reset();
                        while (ts.IncrementToken())
                        {
                            stopWordsSet.Add(termAtt.ToString());
                        }
                        ts.End();
                    }
                    catch (IOException ioe)
                    {
                        throw new ParserException("IoException parsing stop words list in "
                                                  + GetType().Name + ":" + ioe.Message);
                    }
                    finally
                    {
                        IOUtils.DisposeWhileHandlingException(ts);
                    }
                }
            }

            MoreLikeThisQuery mlt = new MoreLikeThisQuery(DOMUtils.GetText(e), fields, analyzer, fields[0]);

            mlt.MaxQueryTerms       = DOMUtils.GetAttribute(e, "maxQueryTerms", DEFAULT_MAX_QUERY_TERMS);
            mlt.MinTermFrequency    = DOMUtils.GetAttribute(e, "minTermFrequency", DEFAULT_MIN_TERM_FREQUENCY);
            mlt.PercentTermsToMatch = DOMUtils.GetAttribute(e, "percentTermsToMatch", DEFAULT_PERCENT_TERMS_TO_MATCH) / 100;
            mlt.StopWords           = stopWordsSet;
            int minDocFreq = DOMUtils.GetAttribute(e, "minDocFreq", -1);

            if (minDocFreq >= 0)
            {
                mlt.MinDocFreq = minDocFreq;
            }

            mlt.Boost = DOMUtils.GetAttribute(e, "boost", 1.0f);

            return(mlt);
        }
Esempio n. 55
0
 public PorterStemFilter(TokenStream @in) : base(@in)
 {
     termAtt     = AddAttribute <ICharTermAttribute>();
     keywordAttr = AddAttribute <IKeywordAttribute>();
 }
Esempio n. 56
0
 /// <param name="input"> Source token stream </param>
 /// <param name="collator"> CollationKey generator </param>
 public CollationKeyFilter(TokenStream input, Collator collator)
     : base(input)
 {
     this.collator = collator;
     this.termAtt  = this.AddAttribute <ICharTermAttribute>();
 }
Esempio n. 57
0
 /// <summary>
 /// Fills ICharTermAttribute with the current token text.
 /// </summary>
 public void GetText(ICharTermAttribute t)
 {
     t.CopyBuffer(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead);
 }
Esempio n. 58
0
 /// <summary>
 /// <seealso cref="IScorer.Init(TokenStream)"/>
 /// </summary>
 public virtual TokenStream Init(TokenStream tokenStream)
 {
     termAtt = tokenStream.AddAttribute <ICharTermAttribute>();
     return(null);
 }
Esempio n. 59
0
 private void Init()
 {
     termAtt   = AddAttribute <ICharTermAttribute>();
     offsetAtt = AddAttribute <IOffsetAttribute>();
     typeAtt   = AddAttribute <ITypeAttribute>();
 }
Esempio n. 60
0
 /// <summary>
 /// Constructs a filter which removes words from the input <see cref="TokenStream"/> that are
 /// named in the <see cref="CharArraySet"/>.
 /// </summary>
 /// <param name="matchVersion">
 ///          Lucene version to enable correct Unicode 4.0 behavior in the stop
 ///          set if Version > 3.0.  See <see cref="LuceneVersion"/>> for details. </param>
 /// <param name="in">
 ///          Input <see cref="TokenStream"/> </param>
 /// <param name="stopWords">
 ///          A <see cref="CharArraySet"/> representing the stopwords. </param>
 /// <seealso cref="MakeStopSet(LuceneVersion, string[])"/>
 public StopFilter(LuceneVersion matchVersion, TokenStream @in, CharArraySet stopWords)
     : base(matchVersion, @in)
 {
     termAtt        = AddAttribute <ICharTermAttribute>();
     this.stopWords = stopWords;
 }