Example No. 1
        private void Init(int bufferSize)
        {
            this.done = false;
            termAtt = AddAttribute<ITermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
            termAtt.ResizeTermBuffer(bufferSize);
        }
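All of the examples in this listing register attribute views (term, offset, position increment, type, ...) once, in a constructor or Init method, so that each later call to IncrementToken() only has to refresh those shared attribute instances. Below is a minimal, hypothetical sketch of consuming such a stream; the analyzer, field name, and version constant are assumptions for illustration, not taken from any example here.

    // Hypothetical consumer sketch (Lucene.NET 4.8-style API assumed).
    // using System; using Lucene.Net.Analysis; using Lucene.Net.Analysis.Core;
    // using Lucene.Net.Analysis.TokenAttributes; using Lucene.Net.Util;
    var analyzer = new WhitespaceAnalyzer(LuceneVersion.LUCENE_48);
    using (TokenStream stream = analyzer.GetTokenStream("body", "some input text"))
    {
        var termAtt = stream.AddAttribute<ICharTermAttribute>();
        var offsetAtt = stream.AddAttribute<IOffsetAttribute>();
        stream.Reset();
        while (stream.IncrementToken())
        {
            // StartOffset/EndOffset are character positions in the original text.
            Console.WriteLine($"{termAtt} [{offsetAtt.StartOffset}, {offsetAtt.EndOffset})");
        }
        stream.End();
    }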
Example No. 2
        /// <summary>
        /// Lucene Tokenizer adapter class constructor
        /// </summary>
        /// <param name="isMaxWordLength">when true, the tokenizer performs maximum word-length segmentation; when false, it uses the finest-grained segmentation</param>
        public IKTokenizer(TextReader inreader, bool isMaxWordLength)
            : base(inreader)
        {
            offsetAtt = AddAttribute<IOffsetAttribute>();
            termAtt = AddAttribute<ITermAttribute>();
            _IKImplement = new IKSegmentation(inreader, isMaxWordLength);
        }
Example No. 3
 public CamelCaseFilter(TokenStream stream)
     : base(stream)
 {
     _termAttribute = AddAttribute<ITermAttribute>();
     _offsetAttribute = AddAttribute<IOffsetAttribute>();
     _positionIncrementAttribute = AddAttribute<IPositionIncrementAttribute>();
 }
 void Init()
 {
     InitPanGuSegment();
     termAtt = AddAttribute<ITermAttribute>();
     offsetAtt = AddAttribute<IOffsetAttribute>();
     posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
     typeAtt = AddAttribute<ITypeAttribute>();
 }
 /*
  * The call to base(input) is skipped here, because it would advance input's position
  * by zh
  */
 public MMSegTokenizer(Seg seg, TextReader input)
     : base(input)
 {
     mmSeg = new MMSeg(input, seg);
     termAtt = AddAttribute<ITermAttribute>();
     offsetAtt = AddAttribute<IOffsetAttribute>();
     typeAtt = AddAttribute<ITypeAttribute>();
 }
 public CutLeterDigitFilter(TokenStream input)
     : base(input)
 {
     reusableToken = new Token();
     termAtt = AddAttribute<ITermAttribute>();
     offsetAtt = AddAttribute<IOffsetAttribute>();
     typeAtt = AddAttribute<ITypeAttribute>();
 }
        void Init(string content)
        {
            _enumerationPositions = _aufzaehlungDetector.FindAufzaehlungsspans(content).ToArray();
            _offsetAttribute = AddAttribute<IOffsetAttribute>();
            _flagsAttribute = AddAttribute<IFlagsAttribute>();

            _isInitialized = true;
        }
Example No. 8
 public GraphTokenizer(TextReader input)
     : base(input)
 {
     TermAtt = AddAttribute<ICharTermAttribute>();
     OffsetAtt = AddAttribute<IOffsetAttribute>();
     PosIncrAtt = AddAttribute<IPositionIncrementAttribute>();
     PosLengthAtt = AddAttribute<IPositionLengthAttribute>();
 }
 /// <summary>
 /// Sole constructor. </summary>
 public SuggestStopFilter(TokenStream input, CharArraySet stopWords)
     : base(input)
 {
     this.stopWords = stopWords;
     this.termAtt = AddAttribute<ICharTermAttribute>();
     this.posIncAtt = AddAttribute<IPositionIncrementAttribute>();
     this.keywordAtt = AddAttribute<IKeywordAttribute>();
     this.offsetAtt = AddAttribute<IOffsetAttribute>();
 }
 public CannedBinaryTokenStream(params BinaryToken[] tokens)
     : base()
 {
     this.Tokens = tokens;
     TermAtt = AddAttribute<IBinaryTermAttribute>();
     PosIncrAtt = AddAttribute<IPositionIncrementAttribute>();
     PosLengthAtt = AddAttribute<IPositionLengthAttribute>();
     OffsetAtt = AddAttribute<IOffsetAttribute>();
 }
Example No. 11
        private void Init(System.IO.TextReader _input, HebMorph.DataStructures.DictRadix<int> _prefixesTree)
        {
            termAtt = AddAttribute<ITermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
            //posIncrAtt = (PositionIncrementAttribute)AddAttribute(typeof(PositionIncrementAttribute));
            typeAtt = AddAttribute<ITypeAttribute>();
            input = _input;
            hebMorphTokenizer = new HebMorph.Tokenizer(_input);
            prefixesTree = _prefixesTree;
        }
Example No. 12
        public JiebaTokenizer(JiebaSegmenter seg, string input)
        {
            segmenter = seg;
            termAtt = AddAttribute<ITermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
            typeAtt = AddAttribute<ITypeAttribute>();

            var text = input;
            tokens = segmenter.Tokenize(text, TokenizerMode.Search).ToList();
        }
Example No. 13
 /// <summary>
 /// Construct a token stream filtering the given input using a Set of common
 /// words to create bigrams. Outputs both unigrams with position increment and
 /// bigrams with position increment 0 type=gram where one or both of the words
 /// in a potential bigram are in the set of common words.
 /// </summary>
 /// <param name="input"> TokenStream input in filter chain </param>
 /// <param name="commonWords"> The set of common words. </param>
 public CommonGramsFilter(LuceneVersion matchVersion, TokenStream input, CharArraySet commonWords)
     : base(input)
 {
     termAttribute = AddAttribute<ICharTermAttribute>();
     offsetAttribute = AddAttribute<IOffsetAttribute>();
     typeAttribute = AddAttribute<ITypeAttribute>();
     posIncAttribute = AddAttribute<IPositionIncrementAttribute>();
     posLenAttribute = AddAttribute<IPositionLengthAttribute>();
     this.commonWords = commonWords;
 }
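A hedged usage sketch of the filter constructed above; the tokenizer, version constant, and the common-word list are assumptions chosen for illustration.

    // Hypothetical sketch: "the" and "of" yield both unigrams and "gram"-typed bigrams.
    var version = LuceneVersion.LUCENE_48;
    var commonWords = new CharArraySet(version, new[] { "the", "of" }, true); // ignoreCase = true
    TokenStream ts = new WhitespaceTokenizer(version, new StringReader("the quality of mercy"));
    ts = new CommonGramsFilter(version, ts, commonWords);
    var termAtt = ts.AddAttribute<ICharTermAttribute>();
    var typeAtt = ts.AddAttribute<ITypeAttribute>();
    ts.Reset();
    while (ts.IncrementToken())
    {
        // Expect unigrams plus bigrams such as "the_quality" with type "gram".
        Console.WriteLine(termAtt + " / " + typeAtt.Type);
    }
    ts.End();
    ts.Dispose();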
Example No. 14
 public TrimFilter(LuceneVersion version, TokenStream @in, bool updateOffsets)
     : base(@in)
 {
     if (updateOffsets && version.OnOrAfter(LuceneVersion.LUCENE_44))
     {
         throw new System.ArgumentException("updateOffsets=true is not supported anymore as of Lucene 4.4");
     }
     termAtt = AddAttribute<ICharTermAttribute>();
     offsetAtt = AddAttribute<IOffsetAttribute>();
     this.updateOffsets = updateOffsets;
 }
Example No. 15
 public IterTokenStream(params Token[] tokens)
         : base()
 {
     this.tokens = tokens;
     this.termAtt = AddAttribute<ICharTermAttribute>();
     this.offsetAtt = AddAttribute<IOffsetAttribute>();
     this.posIncAtt = AddAttribute<IPositionIncrementAttribute>();
     this.flagsAtt = AddAttribute<IFlagsAttribute>();
     this.typeAtt = AddAttribute<ITypeAttribute>();
     this.payloadAtt = AddAttribute<IPayloadAttribute>();
 }
Example No. 16
 /// <summary>
 /// Creates a new ThaiTokenizer, supplying the AttributeFactory </summary>
 public ThaiTokenizer(AttributeFactory factory, TextReader reader)
       : base(factory, reader, BreakIterator.CreateSentenceInstance(Locale.GetUS()))
 {
     if (!DBBI_AVAILABLE)
     {
         throw new System.NotSupportedException("This JRE does not have support for Thai segmentation");
     }
     wordBreaker = new ThaiWordBreaker(BreakIterator.CreateWordInstance(Locale.GetUS()));
     termAtt = AddAttribute<ICharTermAttribute>();
     offsetAtt = AddAttribute<IOffsetAttribute>();
 }
        public KeywordTokenizer(AttributeSource.AttributeFactory factory, Reader input, int bufferSize)
            : base(factory, input)
        {
            termAtt = AddAttribute<ICharTermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();

            if (bufferSize <= 0)
            {
                throw new System.ArgumentException("bufferSize must be > 0");
            }
            termAtt.ResizeBuffer(bufferSize);
        }
Example No. 18
        private bool hasIllegalOffsets = false; // only if the length changed before this filter

        /// <summary>
        /// Creates a new ThaiWordFilter with the specified match version. </summary>
        public ThaiWordFilter(LuceneVersion matchVersion, TokenStream input)
              : base(matchVersion.OnOrAfter(LuceneVersion.LUCENE_31) ? input : new LowerCaseFilter(matchVersion, input))
        {
            if (!DBBI_AVAILABLE)
            {
                throw new System.NotSupportedException("This JRE does not have support for Thai segmentation");
            }
            handlePosIncr = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
            termAtt = AddAttribute<ICharTermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
            posAtt = AddAttribute<IPositionIncrementAttribute>();
        }
Example No. 19
        private void Init(System.IO.TextReader input, HebMorph.StreamLemmatizer _lemmatizer,
            HebMorph.LemmaFilters.LemmaFilterBase _lemmaFilter, bool AlwaysSaveMarkedOriginal)
        {
            termAtt = AddAttribute<ITermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
            posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            typeAtt = AddAttribute<ITypeAttribute>();
            //payAtt = (PayloadAttribute)AddAttribute(typeof(PayloadAttribute));

            this.input = input;
            this._streamLemmatizer = _lemmatizer;
            this._streamLemmatizer.SetStream(input);
            this.alwaysSaveMarkedOriginal = AlwaysSaveMarkedOriginal;
            this.lemmaFilter = _lemmaFilter;
        }
Example No. 20
        /*
         * Creates NGramTokenFilter with given min and max n-grams.
         * <param name="input"><see cref="TokenStream"/> holding the input to be tokenized</param>
         * <param name="minGram">the smallest n-gram to generate</param>
         * <param name="maxGram">the largest n-gram to generate</param>
         */
        public NGramTokenFilter(TokenStream input, int minGram, int maxGram)
            : base(input)
        {
            if (minGram < 1)
            {
                throw new System.ArgumentException("minGram must be greater than zero");
            }
            if (minGram > maxGram)
            {
                throw new System.ArgumentException("minGram must not be greater than maxGram");
            }
            this.minGram = minGram;
            this.maxGram = maxGram;

            this.termAtt = AddAttribute<ITermAttribute>();
            this.offsetAtt = AddAttribute<IOffsetAttribute>();
        }
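A hedged sketch of calling the constructor above (the tokenizer and sample text are assumptions): splitting a single keyword token into 1- to 3-character n-grams.

    // Hypothetical sketch, mirroring the (input, minGram, maxGram) constructor shown above.
    TokenStream ts = new KeywordTokenizer(new StringReader("abcd"));
    ts = new NGramTokenFilter(ts, 1, 3);
    var termAtt = ts.AddAttribute<ITermAttribute>();
    ts.Reset();
    while (ts.IncrementToken())
    {
        // Roughly: a, b, c, d, ab, bc, cd, abc, bcd
        Console.WriteLine(termAtt.Term);
    }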
Example No. 21
 /// <summary>
 /// If inputText is non-null, and the TokenStream has
 ///  offsets, we include the surface form in each arc's
 ///  label.
 /// </summary>
 public TokenStreamToDot(string inputText, TokenStream @in, TextWriter @out)
 {
     this.@in = @in;
     this.@out = @out;
     this.InputText = inputText;
     TermAtt = @in.AddAttribute<ICharTermAttribute>();
     PosIncAtt = @in.AddAttribute<IPositionIncrementAttribute>();
     PosLengthAtt = @in.AddAttribute<IPositionLengthAttribute>();
     if (@in.HasAttribute<IOffsetAttribute>())
     {
         OffsetAtt = @in.AddAttribute<IOffsetAttribute>();
     }
     else
     {
         OffsetAtt = null;
     }
 }
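The null check above relies on the difference between HasAttribute and AddAttribute: AddAttribute<T>() creates the attribute if it is missing, while HasAttribute<T>()/GetAttribute<T>() only inspect what the stream already declares. A small hedged sketch of that distinction (the stream variable is an assumption):

    // Only read offsets if the producer actually tracks them; AddAttribute would
    // silently create a default OffsetAttribute instead.
    IOffsetAttribute offsetAtt = null;
    if (stream.HasAttribute<IOffsetAttribute>())
    {
        offsetAtt = stream.GetAttribute<IOffsetAttribute>();
    }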
        /// <summary>
        /// creates a new PatternTokenizer returning tokens from group (-1 for split functionality) </summary>
        public PatternTokenizer(AttributeFactory factory, TextReader input, Regex pattern, int group)
              : base(factory, input)
        {
            this.termAtt = AddAttribute<ICharTermAttribute>();
            this.offsetAtt = AddAttribute<IOffsetAttribute>();
            this.group = group;

            // Use "" instead of str so don't consume chars
            // (fillBuffer) from the input on throwing IAE below:
            this.matcher = pattern.Match("");
            this.pattern = pattern;

            // confusingly group count depends ENTIRELY on the pattern but is only accessible via matcher
            var groupCount = pattern.GetGroupNumbers().Length;
            if (group >= 0 && group > groupCount)
            {
                throw new System.ArgumentException("invalid group specified: pattern only has: " + groupCount + " capturing groups");
            }

        }
        public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix)
            : base(suffix)
        {
            this.suffix = suffix;
            this.prefix = prefix;
            prefixExhausted = false;

            termAtt = AddAttribute<ICharTermAttribute>();
            posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            payloadAtt = AddAttribute<IPayloadAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
            typeAtt = AddAttribute<ITypeAttribute>();
            flagsAtt = AddAttribute<IFlagsAttribute>();

            p_termAtt = prefix.AddAttribute<ICharTermAttribute>();
            p_posIncrAtt = prefix.AddAttribute<IPositionIncrementAttribute>();
            p_payloadAtt = prefix.AddAttribute<IPayloadAttribute>();
            p_offsetAtt = prefix.AddAttribute<IOffsetAttribute>();
            p_typeAtt = prefix.AddAttribute<ITypeAttribute>();
            p_flagsAtt = prefix.AddAttribute<IFlagsAttribute>();
        }
        public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix) : base(suffix)
        {
            Suffix = suffix;
            Prefix = prefix;
            _prefixExhausted = false;

            // ReSharper disable DoNotCallOverridableMethodsInConstructor
            _termAtt = AddAttribute<ITermAttribute>();
            _posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            _payloadAtt = AddAttribute<IPayloadAttribute>();
            _offsetAtt = AddAttribute<IOffsetAttribute>();
            _typeAtt = AddAttribute<ITypeAttribute>();
            _flagsAtt = AddAttribute<IFlagsAttribute>();
            // ReSharper restore DoNotCallOverridableMethodsInConstructor

            _pTermAtt = prefix.AddAttribute<ITermAttribute>();
            _pPosIncrAtt = prefix.AddAttribute<IPositionIncrementAttribute>();
            _pPayloadAtt = prefix.AddAttribute<IPayloadAttribute>();
            _pOffsetAtt = prefix.AddAttribute<IOffsetAttribute>();
            _pTypeAtt = prefix.AddAttribute<ITypeAttribute>();
            _pFlagsAtt = prefix.AddAttribute<IFlagsAttribute>();
        }
Example No. 25
        public PanGuTokenizer(TextReader input)
            : base(input)
        {
            termAttribute = AddAttribute<ITermAttribute>();
            offsetAttribute = AddAttribute<IOffsetAttribute>();

            inputText = base.input.ReadToEnd();

            if (string.IsNullOrEmpty(inputText)) {
                char[] readBuf = new char[1024];

                int relCount = base.input.Read(readBuf, 0, readBuf.Length);

                StringBuilder inputStr = new StringBuilder(readBuf.Length);

                while (relCount > 0) {
                    inputStr.Append(readBuf, 0, relCount);

                    relCount = input.Read(readBuf, 0, readBuf.Length);
                }

                if (inputStr.Length > 0) {
                    inputText = inputStr.ToString();
                }
            }

            if (string.IsNullOrEmpty(inputText)) {
                words = new WordInfo[0];
            }
            else {
                global::PanGu.Segment segment = new Segment();
                ICollection<WordInfo> wordInfos = segment.DoSegment(inputText);
                words = new WordInfo[wordInfos.Count];
                wordInfos.CopyTo(words, 0);
            }
        }
Example No. 26
        public static void VerifyEquals(Fields d1, Fields d2)
        {
            if (d1 == null)
            {
                Assert.IsTrue(d2 == null || d2.Count == 0);
                return;
            }
            Assert.IsTrue(d2 != null);

            IEnumerator <string> fieldsEnum2 = d2.GetEnumerator();

            foreach (string field1 in d1)
            {
                fieldsEnum2.MoveNext();
                string field2 = fieldsEnum2.Current;
                Assert.AreEqual(field1, field2);

                Terms terms1 = d1.GetTerms(field1);
                Assert.IsNotNull(terms1);
                TermsEnum termsEnum1 = terms1.GetIterator(null);

                Terms terms2 = d2.GetTerms(field2);
                Assert.IsNotNull(terms2);
                TermsEnum termsEnum2 = terms2.GetIterator(null);

                DocsAndPositionsEnum dpEnum1 = null;
                DocsAndPositionsEnum dpEnum2 = null;
                DocsEnum             dEnum1  = null;
                DocsEnum             dEnum2  = null;

                BytesRef term1;
                while ((term1 = termsEnum1.Next()) != null)
                {
                    BytesRef term2 = termsEnum2.Next();
                    Assert.AreEqual(term1, term2);
                    Assert.AreEqual(termsEnum1.TotalTermFreq, termsEnum2.TotalTermFreq);

                    dpEnum1 = termsEnum1.DocsAndPositions(null, dpEnum1);
                    dpEnum2 = termsEnum2.DocsAndPositions(null, dpEnum2);
                    if (dpEnum1 != null)
                    {
                        Assert.IsNotNull(dpEnum2);
                        int docID1 = dpEnum1.NextDoc();
                        dpEnum2.NextDoc();
                        // docIDs are not supposed to be equal
                        //int docID2 = dpEnum2.NextDoc();
                        //Assert.AreEqual(docID1, docID2);
                        Assert.IsTrue(docID1 != DocIdSetIterator.NO_MORE_DOCS);

                        int freq1 = dpEnum1.Freq;
                        int freq2 = dpEnum2.Freq;
                        Assert.AreEqual(freq1, freq2);
                        IOffsetAttribute offsetAtt1 = dpEnum1.Attributes.HasAttribute <IOffsetAttribute>() ? dpEnum1.Attributes.GetAttribute <IOffsetAttribute>() : null;
                        IOffsetAttribute offsetAtt2 = dpEnum2.Attributes.HasAttribute <IOffsetAttribute>() ? dpEnum2.Attributes.GetAttribute <IOffsetAttribute>() : null;

                        if (offsetAtt1 != null)
                        {
                            Assert.IsNotNull(offsetAtt2);
                        }
                        else
                        {
                            Assert.IsNull(offsetAtt2);
                        }

                        for (int posUpto = 0; posUpto < freq1; posUpto++)
                        {
                            int pos1 = dpEnum1.NextPosition();
                            int pos2 = dpEnum2.NextPosition();
                            Assert.AreEqual(pos1, pos2);
                            if (offsetAtt1 != null)
                            {
                                Assert.AreEqual(offsetAtt1.StartOffset, offsetAtt2.StartOffset);
                                Assert.AreEqual(offsetAtt1.EndOffset, offsetAtt2.EndOffset);
                            }
                        }
                        Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum1.NextDoc());
                        Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum2.NextDoc());
                    }
                    else
                    {
                        dEnum1 = TestUtil.Docs(Random(), termsEnum1, null, dEnum1, DocsFlags.FREQS);
                        dEnum2 = TestUtil.Docs(Random(), termsEnum2, null, dEnum2, DocsFlags.FREQS);
                        Assert.IsNotNull(dEnum1);
                        Assert.IsNotNull(dEnum2);
                        int docID1 = dEnum1.NextDoc();
                        dEnum2.NextDoc();
                        // docIDs are not supposed to be equal
                        //int docID2 = dEnum2.NextDoc();
                        //Assert.AreEqual(docID1, docID2);
                        Assert.IsTrue(docID1 != DocIdSetIterator.NO_MORE_DOCS);
                        int freq1 = dEnum1.Freq;
                        int freq2 = dEnum2.Freq;
                        Assert.AreEqual(freq1, freq2);
                        Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dEnum1.NextDoc());
                        Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dEnum2.NextDoc());
                    }
                }

                Assert.IsNull(termsEnum2.Next());
            }
            Assert.IsFalse(fieldsEnum2.MoveNext());
        }
Example No. 27
 public InputWindowToken(AttributeSource attSource)
 {
     this.attSource = attSource;
     this.termAtt   = attSource.GetAttribute <ICharTermAttribute>();
     this.offsetAtt = attSource.GetAttribute <IOffsetAttribute>();
 }
 /// <summary>
 /// Construct a new SegmenterBase, also supplying the <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory"/>
 /// </summary>
 public SegmentingTokenizerBase(AttributeFactory factory, TextReader reader, BreakIterator iterator)
     : base(factory, reader)
 {
     offsetAtt     = AddAttribute <IOffsetAttribute>();
     this.iterator = iterator;
 }
Example No. 29
            protected internal RandomTokenStream(BaseTermVectorsFormatTestCase baseTermVectorsFormatTestCase, int len, string[] sampleTerms, BytesRef[] sampleTermBytes, bool offsetsGoBackwards)
            {
                terms               = new string[len];
                termBytes           = new BytesRef[len];
                positionsIncrements = new int[len];
                positions           = new int[len];
                startOffsets        = new int[len];
                endOffsets          = new int[len];
                payloads            = new BytesRef[len];
                for (int i = 0; i < len; ++i)
                {
                    int o = Random.Next(sampleTerms.Length);
                    terms[i]               = sampleTerms[o];
                    termBytes[i]           = sampleTermBytes[o];
                    positionsIncrements[i] = TestUtil.NextInt32(Random, i == 0 ? 1 : 0, 10);
                    if (offsetsGoBackwards)
                    {
                        startOffsets[i] = Random.Next();
                        endOffsets[i]   = Random.Next();
                    }
                    else
                    {
                        if (i == 0)
                        {
                            startOffsets[i] = TestUtil.NextInt32(Random, 0, 1 << 16);
                        }
                        else
                        {
                            startOffsets[i] = startOffsets[i - 1] + TestUtil.NextInt32(Random, 0, Rarely() ? 1 << 16 : 20);
                        }
                        endOffsets[i] = startOffsets[i] + TestUtil.NextInt32(Random, 0, Rarely() ? 1 << 10 : 20);
                    }
                }

                for (int i = 0; i < len; ++i)
                {
                    if (i == 0)
                    {
                        positions[i] = positionsIncrements[i] - 1;
                    }
                    else
                    {
                        positions[i] = positions[i - 1] + positionsIncrements[i];
                    }
                }
                if (Rarely())
                {
                    Arrays.Fill(payloads, baseTermVectorsFormatTestCase.RandomPayload());
                }
                else
                {
                    for (int i = 0; i < len; ++i)
                    {
                        payloads[i] = baseTermVectorsFormatTestCase.RandomPayload();
                    }
                }

                positionToTerms    = new Dictionary <int, ISet <int> >(len);
                startOffsetToTerms = new Dictionary <int, ISet <int> >(len);
                for (int i = 0; i < len; ++i)
                {
                    if (!positionToTerms.TryGetValue(positions[i], out ISet <int> positionTerms))
                    {
                        positionToTerms[positions[i]] = positionTerms = new JCG.HashSet <int>(1);
                    }
                    positionTerms.Add(i);
                    if (!startOffsetToTerms.TryGetValue(startOffsets[i], out ISet <int> startOffsetTerms))
                    {
                        startOffsetToTerms[startOffsets[i]] = startOffsetTerms = new JCG.HashSet <int>(1);
                    }
                    startOffsetTerms.Add(i);
                }

                freqs = new Dictionary <string, int>();
                foreach (string term in terms)
                {
                    if (freqs.TryGetValue(term, out int freq))
                    {
                        freqs[term] = freq + 1;
                    }
                    else
                    {
                        freqs[term] = 1;
                    }
                }

                AddAttributeImpl(new PermissiveOffsetAttribute());

                termAtt = AddAttribute <ICharTermAttribute>();
                piAtt   = AddAttribute <IPositionIncrementAttribute>();
                oAtt    = AddAttribute <IOffsetAttribute>();
                pAtt    = AddAttribute <IPayloadAttribute>();
            }
 /// <summary>
 /// Creates a new HyphenatedWordsFilter
 /// </summary>
 /// <param name="in"> TokenStream that will be filtered </param>
 public HyphenatedWordsFilter(TokenStream @in)
     : base(@in)
 {
     termAttribute = AddAttribute<ICharTermAttribute>();
     offsetAttribute = AddAttribute<IOffsetAttribute>();
 }
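A hedged usage sketch for the filter above (the tokenizer, version constant, and sample text are assumptions): a word hyphenated across a line break is re-joined.

    // Hypothetical sketch: "ecologi-" + "cal" come back as a single token "ecological".
    var version = LuceneVersion.LUCENE_48;
    TokenStream ts = new WhitespaceTokenizer(version, new StringReader("ecologi-\ncal advances"));
    ts = new HyphenatedWordsFilter(ts);
    var termAtt = ts.AddAttribute<ICharTermAttribute>();
    ts.Reset();
    while (ts.IncrementToken())
    {
        Console.WriteLine(termAtt); // ecological, advances
    }
    ts.End();
    ts.Dispose();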
Example No. 31
 public PositionsTokenStream()
 {
     term    = AddAttribute <ICharTermAttribute>();
     payload = AddAttribute <IPayloadAttribute>();
     offset  = AddAttribute <IOffsetAttribute>();
 }
Example No. 32
 public RegexTokenizer(String str, Regex regex, bool toLowerCase)
 {
     this.str = str;
     this.matcher = regex.Match(str);
     this.toLowerCase = toLowerCase;
     this.termAtt = AddAttribute<ITermAttribute>();
     this.offsetAtt = AddAttribute<IOffsetAttribute>();
 }
 public TestFilter(TokenStream @in)
     : base(@in)
 {
     termAtt = AddAttribute<ICharTermAttribute>();
     posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
     offsetAtt = AddAttribute<IOffsetAttribute>();
     typeAtt = AddAttribute<ITypeAttribute>();
 }
Example No. 34
 public IdentifierTokenizer(TextReader input)
     : base(input)
 {
     _offsetAtt = AddAttribute <IOffsetAttribute>();
     _termAtt   = AddAttribute <ITermAttribute>();
 }
 public BugReproTokenStream()
 {
     TermAtt   = AddAttribute <ICharTermAttribute>();
     OffsetAtt = AddAttribute <IOffsetAttribute>();
     PosIncAtt = AddAttribute <IPositionIncrementAttribute>();
 }
Example No. 36
        public override bool IncrementToken()
        {
            if (hasMoreTokensInClone)
            {
                int start = breaker.Current;
                int end   = breaker.Next();
                if (end != BreakIterator.Done)
                {
                    clonedToken.CopyTo(this);
                    termAtt.CopyBuffer(clonedTermAtt.Buffer, start, end - start);
                    if (hasIllegalOffsets)
                    {
                        offsetAtt.SetOffset(clonedOffsetAtt.StartOffset, clonedOffsetAtt.EndOffset);
                    }
                    else
                    {
                        offsetAtt.SetOffset(clonedOffsetAtt.StartOffset + start, clonedOffsetAtt.StartOffset + end);
                    }
                    if (handlePosIncr)
                    {
                        posAtt.PositionIncrement = 1;
                    }
                    return(true);
                }
                hasMoreTokensInClone = false;
            }

            if (!m_input.IncrementToken())
            {
                return(false);
            }

            if (termAtt.Length == 0 || !thaiPattern.IsMatch(string.Empty + termAtt[0]))
            {
                return(true);
            }

            hasMoreTokensInClone = true;

            // if length by start + end offsets doesn't match the term text then assume
            // this is a synonym and don't adjust the offsets.
            hasIllegalOffsets = offsetAtt.EndOffset - offsetAtt.StartOffset != termAtt.Length;

            // we lazy init the cloned token, as in ctor not all attributes may be added
            if (clonedToken == null)
            {
                clonedToken     = CloneAttributes();
                clonedTermAtt   = clonedToken.GetAttribute <ICharTermAttribute>();
                clonedOffsetAtt = clonedToken.GetAttribute <IOffsetAttribute>();
            }
            else
            {
                this.CopyTo(clonedToken);
            }

            // reinit CharacterIterator
            charIterator.SetText(clonedTermAtt.Buffer, 0, clonedTermAtt.Length);
            breaker.SetText(new string(charIterator.Text, charIterator.Start, charIterator.Length));
            int end2 = breaker.Next();

            if (end2 != BreakIterator.Done)
            {
                termAtt.Length = end2;
                if (hasIllegalOffsets)
                {
                    offsetAtt.SetOffset(clonedOffsetAtt.StartOffset, clonedOffsetAtt.EndOffset);
                }
                else
                {
                    offsetAtt.SetOffset(clonedOffsetAtt.StartOffset, clonedOffsetAtt.StartOffset + end2);
                }
                // position increment keeps as it is for first token
                return(true);
            }
            return(false);
        }
Example No. 37
        /// <summary>
        /// Iterates over the given token stream and adds the resulting terms to the index;
        /// Equivalent to adding a tokenized, indexed, termVectorStored, unstored,
        /// Lucene <see cref="Documents.Field"/>.
        /// Finally closes the token stream. Note that untokenized keywords can be added with this method via
        /// <see cref="T:KeywordTokenStream{T}(ICollection{T}"/>)"/>, the Lucene <c>KeywordTokenizer</c> or similar utilities.
        ///
        /// </summary>
        /// <param name="fieldName"> a name to be associated with the text </param>
        /// <param name="stream"> the token stream to retrieve tokens from. </param>
        /// <param name="boost"> the boost factor for hits for this field </param>
        /// <param name="positionIncrementGap"> the position increment gap if fields with the same name are added more than once </param>
        /// <param name="offsetGap"> the offset gap if fields with the same name are added more than once </param>
        /// <seealso cref="Documents.Field.Boost"/>
        public virtual void AddField(string fieldName, TokenStream stream, float boost, int positionIncrementGap, int offsetGap)
        {
            try
            {
                if (fieldName == null)
                {
                    throw new ArgumentException("fieldName must not be null");
                }
                if (stream == null)
                {
                    throw new ArgumentException("token stream must not be null");
                }
                if (boost <= 0.0f)
                {
                    throw new ArgumentException("boost factor must be greater than 0.0");
                }
                int                 numTokens        = 0;
                int                 numOverlapTokens = 0;
                int                 pos = -1;
                BytesRefHash        terms;
                SliceByteStartArray sliceArray;
                Info                info             = null;
                long                sumTotalTermFreq = 0;
                int                 offset           = 0;
                if (fields.TryGetValue(fieldName, out info))
                {
                    numTokens        = info.numTokens;
                    numOverlapTokens = info.numOverlapTokens;
                    pos              = info.lastPosition + positionIncrementGap;
                    offset           = info.lastOffset + offsetGap;
                    terms            = info.terms;
                    boost           *= info.boost;
                    sliceArray       = info.sliceArray;
                    sumTotalTermFreq = info.sumTotalTermFreq;
                }
                else
                {
                    sliceArray = new SliceByteStartArray(BytesRefHash.DEFAULT_CAPACITY);
                    terms      = new BytesRefHash(byteBlockPool, BytesRefHash.DEFAULT_CAPACITY, sliceArray);
                }

                if (!fieldInfos.ContainsKey(fieldName))
                {
                    fieldInfos[fieldName] = new FieldInfo(fieldName,
                                                          true,
                                                          fieldInfos.Count,
                                                          false,
                                                          false,
                                                          false,
                                                          this.storeOffsets ? IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS : IndexOptions.DOCS_AND_FREQS_AND_POSITIONS,
                                                          DocValuesType.NONE,
                                                          DocValuesType.NONE,
                                                          null);
                }
                ITermToBytesRefAttribute    termAtt          = stream.GetAttribute <ITermToBytesRefAttribute>();
                IPositionIncrementAttribute posIncrAttribute = stream.AddAttribute <IPositionIncrementAttribute>();
                IOffsetAttribute            offsetAtt        = stream.AddAttribute <IOffsetAttribute>();
                BytesRef @ref = termAtt.BytesRef;
                stream.Reset();

                while (stream.IncrementToken())
                {
                    termAtt.FillBytesRef();
                    //        if (DEBUG) System.err.println("token='" + term + "'");
                    numTokens++;
                    int posIncr = posIncrAttribute.PositionIncrement;
                    if (posIncr == 0)
                    {
                        numOverlapTokens++;
                    }
                    pos += posIncr;
                    int ord = terms.Add(@ref);
                    if (ord < 0)
                    {
                        ord = (-ord) - 1;
                        postingsWriter.Reset(sliceArray.end[ord]);
                    }
                    else
                    {
                        sliceArray.start[ord] = postingsWriter.StartNewSlice();
                    }
                    sliceArray.freq[ord]++;
                    sumTotalTermFreq++;
                    if (!storeOffsets)
                    {
                        postingsWriter.WriteInt32(pos);
                    }
                    else
                    {
                        postingsWriter.WriteInt32(pos);
                        postingsWriter.WriteInt32(offsetAtt.StartOffset + offset);
                        postingsWriter.WriteInt32(offsetAtt.EndOffset + offset);
                    }
                    sliceArray.end[ord] = postingsWriter.CurrentOffset;
                }
                stream.End();

                // ensure infos.numTokens > 0 invariant; needed for correct operation of terms()
                if (numTokens > 0)
                {
                    fields[fieldName] = new Info(terms, sliceArray, numTokens, numOverlapTokens, boost, pos, offsetAtt.EndOffset + offset, sumTotalTermFreq);
                    sortedFields      = null; // invalidate sorted view, if any
                }
            } // can never happen
            catch (Exception e)
            {
                throw new Exception(e.ToString(), e);
            }
            finally
            {
                try
                {
                    if (stream != null)
                    {
                        stream.Dispose();
                    }
                }
                catch (IOException e2)
                {
                    throw new Exception(e2.ToString(), e2);
                }
            }
        }
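For context, a hedged sketch of calling AddField on a MemoryIndex; the analyzer, field name, query, and sample text are assumptions for illustration.

    // Hypothetical sketch (Lucene.NET 4.8-style API assumed).
    // using Lucene.Net.Analysis.Standard; using Lucene.Net.Index;
    // using Lucene.Net.Index.Memory; using Lucene.Net.Search; using Lucene.Net.Util;
    var analyzer = new StandardAnalyzer(LuceneVersion.LUCENE_48);
    var index = new MemoryIndex();
    index.AddField("content", analyzer.GetTokenStream("content", "the quick brown fox"), 1.0f, 0, 1);
    float score = index.Search(new TermQuery(new Term("content", "fox")));
    Console.WriteLine(score); // > 0 when the term matches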
            protected internal RandomTokenStream(BaseTermVectorsFormatTestCase outerInstance, int len, string[] sampleTerms, BytesRef[] sampleTermBytes, bool offsetsGoBackwards)
            {
                this.OuterInstance = outerInstance;
                Terms               = new string[len];
                TermBytes           = new BytesRef[len];
                PositionsIncrements = new int[len];
                Positions           = new int[len];
                StartOffsets        = new int[len];
                EndOffsets          = new int[len];
                Payloads            = new BytesRef[len];
                for (int i = 0; i < len; ++i)
                {
                    int o = Random().Next(sampleTerms.Length);
                    Terms[i]               = sampleTerms[o];
                    TermBytes[i]           = sampleTermBytes[o];
                    PositionsIncrements[i] = TestUtil.NextInt(Random(), i == 0 ? 1 : 0, 10);
                    if (offsetsGoBackwards)
                    {
                        StartOffsets[i] = Random().Next();
                        EndOffsets[i]   = Random().Next();
                    }
                    else
                    {
                        if (i == 0)
                        {
                            StartOffsets[i] = TestUtil.NextInt(Random(), 0, 1 << 16);
                        }
                        else
                        {
                            StartOffsets[i] = StartOffsets[i - 1] + TestUtil.NextInt(Random(), 0, Rarely() ? 1 << 16 : 20);
                        }
                        EndOffsets[i] = StartOffsets[i] + TestUtil.NextInt(Random(), 0, Rarely() ? 1 << 10 : 20);
                    }
                }

                for (int i = 0; i < len; ++i)
                {
                    if (i == 0)
                    {
                        Positions[i] = PositionsIncrements[i] - 1;
                    }
                    else
                    {
                        Positions[i] = Positions[i - 1] + PositionsIncrements[i];
                    }
                }
                if (Rarely())
                {
                    Arrays.Fill(Payloads, outerInstance.RandomPayload());
                }
                else
                {
                    for (int i = 0; i < len; ++i)
                    {
                        Payloads[i] = outerInstance.RandomPayload();
                    }
                }

                PositionToTerms    = new Dictionary <int?, ISet <int?> >(len);
                StartOffsetToTerms = new Dictionary <int?, ISet <int?> >(len);
                for (int i = 0; i < len; ++i)
                {
                    if (!PositionToTerms.ContainsKey(Positions[i]))
                    {
                        PositionToTerms[Positions[i]] = new HashSet <int?>();//size1
                    }
                    PositionToTerms[Positions[i]].Add(i);
                    if (!StartOffsetToTerms.ContainsKey(StartOffsets[i]))
                    {
                        StartOffsetToTerms[StartOffsets[i]] = new HashSet <int?>();//size1
                    }
                    StartOffsetToTerms[StartOffsets[i]].Add(i);
                }

                Freqs = new Dictionary <string, int?>();
                foreach (string term in Terms)
                {
                    if (Freqs.ContainsKey(term))
                    {
                        Freqs[term] = Freqs[term] + 1;
                    }
                    else
                    {
                        Freqs[term] = 1;
                    }
                }

                AddAttributeImpl(new PermissiveOffsetAttributeImpl());

                TermAtt = AddAttribute <ICharTermAttribute>();
                PiAtt   = AddAttribute <IPositionIncrementAttribute>();
                OAtt    = AddAttribute <IOffsetAttribute>();
                PAtt    = AddAttribute <IPayloadAttribute>();
            }
Example No. 39
 public IdentifierTokenizer(AttributeFactory factory, TextReader input)
     : base(factory, input)
 {
     _offsetAtt = AddAttribute <IOffsetAttribute>();
     _termAtt   = AddAttribute <ITermAttribute>();
 }
Example No. 40
        /// <summary>
        /// <para>Get the next token from the input stream.
        /// </para>
        /// <para>If the next token has <c>positionIncrement > 1</c>,
        /// <c>positionIncrement - 1</c> <see cref="fillerToken"/>s are
        /// inserted first.
        /// </para>
        /// </summary>
        /// <param name="target"> Where to put the new token; if null, a new instance is created. </param>
        /// <returns> On success, the populated token; null otherwise </returns>
        /// <exception cref="IOException"> if the input stream has a problem </exception>
        private InputWindowToken GetNextToken(InputWindowToken target)
        {
            InputWindowToken newTarget = target;

            if (numFillerTokensToInsert > 0)
            {
                if (null == target)
                {
                    newTarget = new InputWindowToken(nextInputStreamToken.CloneAttributes());
                }
                else
                {
                    nextInputStreamToken.CopyTo(target.attSource);
                }
                // A filler token occupies no space
                newTarget.offsetAtt.SetOffset(newTarget.offsetAtt.StartOffset, newTarget.offsetAtt.StartOffset);
                newTarget.termAtt.CopyBuffer(fillerToken, 0, fillerToken.Length);
                newTarget.isFiller = true;
                --numFillerTokensToInsert;
            }
            else if (isNextInputStreamToken)
            {
                if (null == target)
                {
                    newTarget = new InputWindowToken(nextInputStreamToken.CloneAttributes());
                }
                else
                {
                    nextInputStreamToken.CopyTo(target.attSource);
                }
                isNextInputStreamToken = false;
                newTarget.isFiller     = false;
            }
            else if (!exhausted)
            {
                if (m_input.IncrementToken())
                {
                    if (null == target)
                    {
                        newTarget = new InputWindowToken(CloneAttributes());
                    }
                    else
                    {
                        this.CopyTo(target.attSource);
                    }
                    if (posIncrAtt.PositionIncrement > 1)
                    {
                        // Each output shingle must contain at least one input token,
                        // so no more than (maxShingleSize - 1) filler tokens will be inserted.
                        numFillerTokensToInsert = Math.Min(posIncrAtt.PositionIncrement - 1, maxShingleSize - 1);
                        // Save the current token as the next input stream token
                        if (null == nextInputStreamToken)
                        {
                            nextInputStreamToken = CloneAttributes();
                        }
                        else
                        {
                            this.CopyTo(nextInputStreamToken);
                        }
                        isNextInputStreamToken = true;
                        // A filler token occupies no space
                        newTarget.offsetAtt.SetOffset(offsetAtt.StartOffset, offsetAtt.StartOffset);
                        newTarget.termAtt.CopyBuffer(fillerToken, 0, fillerToken.Length);
                        newTarget.isFiller = true;
                        --numFillerTokensToInsert;
                    }
                    else
                    {
                        newTarget.isFiller = false;
                    }
                }
                else
                {
                    exhausted = true;
                    m_input.End();
                    endState = CaptureState();
                    numFillerTokensToInsert = Math.Min(posIncrAtt.PositionIncrement, maxShingleSize - 1);
                    if (numFillerTokensToInsert > 0)
                    {
                        nextInputStreamToken = new AttributeSource(this.GetAttributeFactory());
                        nextInputStreamToken.AddAttribute <ICharTermAttribute>();
                        IOffsetAttribute newOffsetAtt = nextInputStreamToken.AddAttribute <IOffsetAttribute>();
                        newOffsetAtt.SetOffset(offsetAtt.EndOffset, offsetAtt.EndOffset);
                        // Recurse/loop just once:
                        return(GetNextToken(target));
                    }
                    else
                    {
                        newTarget = null;
                    }
                }
            }
            else
            {
                newTarget = null;
            }
            return(newTarget);
        }
Example No. 41
 public IdentifierTokenizer(AttributeSource source, TextReader input)
     : base(source, input)
 {
     _offsetAtt = AddAttribute <IOffsetAttribute>();
     _termAtt   = AddAttribute <ITermAttribute>();
 }
Example No. 42
        public override void ProcessFields(IIndexableField[] fields, int count)
        {
            fieldState.Reset();

            bool doInvert = consumer.Start(fields, count);

            for (int i = 0; i < count; i++)
            {
                IIndexableField     field     = fields[i];
                IIndexableFieldType fieldType = field.IndexableFieldType;

                // TODO FI: this should be "genericized" to querying
                // consumer if it wants to see this particular field
                // tokenized.
                if (fieldType.IsIndexed && doInvert)
                {
                    bool analyzed = fieldType.IsTokenized && docState.analyzer != null;

                    // if the field omits norms, the boost cannot be indexed.
                    if (fieldType.OmitNorms && field.Boost != 1.0f)
                    {
                        throw new NotSupportedException("You cannot set an index-time boost: norms are omitted for field '" + field.Name + "'");
                    }

                    // only bother checking offsets if something will consume them.
                    // TODO: after we fix analyzers, also check if termVectorOffsets will be indexed.
                    bool checkOffsets    = fieldType.IndexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
                    int  lastStartOffset = 0;

                    if (i > 0)
                    {
                        fieldState.Position += analyzed ? docState.analyzer.GetPositionIncrementGap(fieldInfo.Name) : 0;
                    }

                    /*
                     * To assist people in tracking down problems in analysis components, we wish to write the field name to the infostream
                     * when we fail. We expect some caller to eventually deal with the real exception, so we don't want any 'catch' clauses,
                     * but rather a finally that takes note of the problem.
                     */

                    bool succeededInProcessingField = false;

                    TokenStream stream = field.GetTokenStream(docState.analyzer);
                    // reset the TokenStream to the first token
                    stream.Reset();

                    try
                    {
                        bool hasMoreTokens = stream.IncrementToken();

                        fieldState.AttributeSource = stream;

                        IOffsetAttribute            offsetAttribute  = fieldState.AttributeSource.AddAttribute <IOffsetAttribute>();
                        IPositionIncrementAttribute posIncrAttribute = fieldState.AttributeSource.AddAttribute <IPositionIncrementAttribute>();

                        if (hasMoreTokens)
                        {
                            consumer.Start(field);

                            do
                            {
                                // If we hit an exception in stream.next below
                                // (which is fairly common, eg if analyzer
                                // chokes on a given document), then it's
                                // non-aborting and (above) this one document
                                // will be marked as deleted, but still
                                // consume a docID

                                int posIncr = posIncrAttribute.PositionIncrement;
                                if (posIncr < 0)
                                {
                                    throw new ArgumentException("position increment must be >=0 (got " + posIncr + ") for field '" + field.Name + "'");
                                }
                                if (fieldState.Position == 0 && posIncr == 0)
                                {
                                    throw new ArgumentException("first position increment must be > 0 (got 0) for field '" + field.Name + "'");
                                }
                                int position = fieldState.Position + posIncr;
                                if (position > 0)
                                {
                                    // NOTE: confusing: this "mirrors" the
                                    // position++ we do below
                                    position--;
                                }
                                else if (position < 0)
                                {
                                    throw new ArgumentException("position overflow for field '" + field.Name + "'");
                                }

                                // position is legal, we can safely place it in fieldState now.
                                // not sure if anything will use fieldState after non-aborting exc...
                                fieldState.Position = position;

                                if (posIncr == 0)
                                {
                                    fieldState.NumOverlap++;
                                }

                                if (checkOffsets)
                                {
                                    int startOffset = fieldState.Offset + offsetAttribute.StartOffset;
                                    int endOffset   = fieldState.Offset + offsetAttribute.EndOffset;
                                    if (startOffset < 0 || endOffset < startOffset)
                                    {
                                        throw new ArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, " + "startOffset=" + startOffset + ",endOffset=" + endOffset + " for field '" + field.Name + "'");
                                    }
                                    if (startOffset < lastStartOffset)
                                    {
                                        throw new ArgumentException("offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset + " for field '" + field.Name + "'");
                                    }
                                    lastStartOffset = startOffset;
                                }

                                bool success = false;
                                try
                                {
                                    // If we hit an exception in here, we abort
                                    // all buffered documents since the last
                                    // flush, on the likelihood that the
                                    // internal state of the consumer is now
                                    // corrupt and should not be flushed to a
                                    // new segment:
                                    consumer.Add();
                                    success = true;
                                }
                                finally
                                {
                                    if (!success)
                                    {
                                        docState.docWriter.SetAborting();
                                    }
                                }
                                fieldState.Length++;
                                fieldState.Position++;
                            } while (stream.IncrementToken());
                        }
                        // trigger streams to perform end-of-stream operations
                        stream.End();
                        // TODO: maybe add some safety? then again, its already checked
                        // when we come back around to the field...
                        fieldState.Position += posIncrAttribute.PositionIncrement;
                        fieldState.Offset   += offsetAttribute.EndOffset;

                        if (docState.maxTermPrefix != null)
                        {
                            string msg = "Document contains at least one immense term in field=\"" + fieldInfo.Name + "\" (whose UTF8 encoding is longer than the max length " + DocumentsWriterPerThread.MAX_TERM_LENGTH_UTF8 + "), all of which were skipped.  Please correct the analyzer to not produce such terms.  The prefix of the first immense term is: '" + docState.maxTermPrefix + "...'";
                            if (docState.infoStream.IsEnabled("IW"))
                            {
                                docState.infoStream.Message("IW", "ERROR: " + msg);
                            }
                            docState.maxTermPrefix = null;
                            throw new ArgumentException(msg);
                        }

                        /* if success was false above there is an exception coming through and we won't get here.*/
                        succeededInProcessingField = true;
                    }
                    finally
                    {
                        if (!succeededInProcessingField)
                        {
                            IOUtils.DisposeWhileHandlingException(stream);
                        }
                        else
                        {
                            stream.Dispose();
                        }
                        if (!succeededInProcessingField && docState.infoStream.IsEnabled("DW"))
                        {
                            docState.infoStream.Message("DW", "An exception was thrown while processing field " + fieldInfo.Name);
                        }
                    }

                    fieldState.Offset += analyzed ? docState.analyzer.GetOffsetGap(fieldInfo.Name) : 0;
                    fieldState.Boost  *= field.Boost;
                }

                // LUCENE-2387: don't hang onto the field, so GC can
                // reclaim
                fields[i] = null;
            }

            consumer.Finish();
            endConsumer.Finish();
        }
Example No. 43
 public FastStringTokenizer(String str, bool isLetter, bool toLowerCase, ISet<string> stopWords)
 {
     this.str = str;
     this.isLetter = isLetter;
     this.toLowerCase = toLowerCase;
     this.stopWords = stopWords;
     this.termAtt = AddAttribute<ITermAttribute>();
     this.offsetAtt = AddAttribute<IOffsetAttribute>();
 }
Example No. 44
        public override void CopyTo(Attribute target)
        {
            IOffsetAttribute t = (IOffsetAttribute)target;

            t.SetOffset(startOffset, endOffset);
        }
            internal SingleTokenAttributeSource()
            {
                termAttribute = AddAttribute<ITermAttribute>();
                offsetAttribute = AddAttribute<IOffsetAttribute>();
            }
Example No. 46
 private void Init()
 {
     termAtt   = AddAttribute <ICharTermAttribute>();
     offsetAtt = AddAttribute <IOffsetAttribute>();
     typeAtt   = AddAttribute <ITypeAttribute>();
 }
Example No. 47
 /// <summary>
 /// Creates a new <see cref="HyphenatedWordsFilter"/>
 /// </summary>
 /// <param name="in"> <see cref="TokenStream"/> that will be filtered </param>
 public HyphenatedWordsFilter(TokenStream @in)
     : base(@in)
 {
     termAttribute   = AddAttribute <ICharTermAttribute>();
     offsetAttribute = AddAttribute <IOffsetAttribute>();
 }
Example No. 48
            public DocsAndPositionsEnumAnonymousClass(
                TokenStream ts, CharacterRunAutomaton[] matchers, ICharTermAttribute charTermAtt, IOffsetAttribute offsetAtt)
            {
                this.matchers    = matchers;
                this.charTermAtt = charTermAtt;
                this.offsetAtt   = offsetAtt;

                stream            = ts;
                matchDescriptions = new BytesRef[matchers.Length];
            }