/// <summary>
/// Clears the <c>done</c> flag and registers the term and offset attributes,
/// caching the returned instances in <c>termAtt</c> and <c>offsetAtt</c>.
/// </summary>
/// <param name="bufferSize">
/// Requested buffer size. NOTE(review): not referenced by the statements visible
/// in this listing - confirm against the full source.
/// </param>
private void Init(int bufferSize)
{
    this.done = false;
    termAtt = AddAttribute<ITermAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();
}
Example #2
        /// <summary>
        /// Constructor for the Lucene tokenizer adapter class.
        /// </summary>
        /// <param name="inreader">Reader supplying the text; handed to both the base class and the IK segmenter.</param>
        /// <param name="isMaxWordLength">When true, the segmenter performs maximum-word-length segmentation; when false, it uses the finest-grained segmentation.</param>
        public IKTokenizer(TextReader inreader, bool isMaxWordLength)
            : base(inreader)
        {
            offsetAtt = AddAttribute<IOffsetAttribute>();
            termAtt = AddAttribute<ITermAttribute>();
            _IKImplement = new IKSegmentation(inreader, isMaxWordLength);
        }
Example #3
 /// <summary>
 /// Creates the filter over <paramref name="stream"/> and registers the term,
 /// offset and position-increment attributes.
 /// </summary>
 /// <param name="stream">Token stream passed to the base class.</param>
 public CamelCaseFilter(TokenStream stream)
     : base(stream)
 {
     _termAttribute = AddAttribute<ITermAttribute>();
     _offsetAttribute = AddAttribute<IOffsetAttribute>();
     _positionIncrementAttribute = AddAttribute<IPositionIncrementAttribute>();
 }
 /// <summary>
 /// Registers the term, offset, position-increment and type attributes and
 /// caches the returned instances in the corresponding fields.
 /// </summary>
 void Init()
 {
     termAtt = AddAttribute<ITermAttribute>();
     offsetAtt = AddAttribute<IOffsetAttribute>();
     posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
     typeAtt = AddAttribute<ITypeAttribute>();
 }
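  // The call to base(input) is deliberately skipped here, because calling it would move the position of input.
  // (note by zh)
  // NOTE(review): the constructor that follows does chain to base(input), which contradicts this note - confirm which is intended.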
 public MMSegTokenizer(Seg seg, TextReader input)
     : base(input)
 {
     // Build the segmenter over the same reader that was handed to the base class.
     mmSeg = new MMSeg(input, seg);

     // Register the attributes this tokenizer works with and cache the instances.
     termAtt = AddAttribute<ITermAttribute>();
     offsetAtt = AddAttribute<IOffsetAttribute>();
     typeAtt = AddAttribute<ITypeAttribute>();
 }
 /// <summary>
 /// Creates the filter over <paramref name="input"/>, allocates the reusable
 /// token and registers the term, offset and type attributes.
 /// </summary>
 /// <param name="input">Token stream passed to the base class.</param>
 public CutLeterDigitFilter(TokenStream input)
     : base(input)
 {
     reusableToken = new Token();
     termAtt = AddAttribute<ITermAttribute>();
     offsetAtt = AddAttribute<IOffsetAttribute>();
     typeAtt = AddAttribute<ITypeAttribute>();
 }
        /// <summary>
        /// Stores the spans returned by the detector's <c>FindAufzaehlungsspans</c> for
        /// <paramref name="content"/>, registers the offset and flags attributes, and
        /// sets <c>_isInitialized</c>.
        /// </summary>
        /// <param name="content">Text handed to <c>FindAufzaehlungsspans</c>.</param>
        void Init(string content)
        {
            _enumerationPositions = _aufzaehlungDetector.FindAufzaehlungsspans(content).ToArray();
            _offsetAttribute = AddAttribute<IOffsetAttribute>();
            _flagsAttribute = AddAttribute<IFlagsAttribute>();

            _isInitialized = true;
        }
 /// <summary>
 /// Creates the tokenizer over <paramref name="input"/> and registers the term,
 /// offset, position-increment and position-length attributes.
 /// </summary>
 /// <param name="input">Reader passed to the base class.</param>
 public GraphTokenizer(TextReader input)
     : base(input)
 {
     TermAtt = AddAttribute<ICharTermAttribute>();
     OffsetAtt = AddAttribute<IOffsetAttribute>();
     PosIncrAtt = AddAttribute<IPositionIncrementAttribute>();
     PosLengthAtt = AddAttribute<IPositionLengthAttribute>();
 }
 /// <summary>
 /// Sole constructor. </summary>
 /// <param name="input">Token stream passed to the base class.</param>
 /// <param name="stopWords">Stop-word set stored in the <c>stopWords</c> field.</param>
 public SuggestStopFilter(TokenStream input, CharArraySet stopWords)
     : base(input)
 {
     this.stopWords = stopWords;
     this.termAtt = AddAttribute<ICharTermAttribute>();
     this.posIncAtt = AddAttribute<IPositionIncrementAttribute>();
     this.keywordAtt = AddAttribute<IKeywordAttribute>();
     this.offsetAtt = AddAttribute<IOffsetAttribute>();
 }
 /// <summary>
 /// Stores the supplied tokens and registers the binary-term,
 /// position-increment, position-length and offset attributes.
 /// </summary>
 /// <param name="tokens">Tokens stored in <c>Tokens</c>.</param>
 public CannedBinaryTokenStream(params BinaryToken[] tokens)
     : base()
 {
     this.Tokens = tokens;
     TermAtt = AddAttribute<IBinaryTermAttribute>();
     PosIncrAtt = AddAttribute<IPositionIncrementAttribute>();
     PosLengthAtt = AddAttribute<IPositionLengthAttribute>();
     OffsetAtt = AddAttribute<IOffsetAttribute>();
 }
Example #11
        /// <summary>
        /// Registers the term, offset and type attributes, stores the reader and the
        /// prefixes tree, and creates the underlying <c>HebMorph.Tokenizer</c>.
        /// </summary>
        /// <param name="_input">Reader stored in <c>input</c> and handed to the HebMorph tokenizer.</param>
        /// <param name="_prefixesTree">Prefix dictionary stored in <c>prefixesTree</c>.</param>
        private void Init(System.IO.TextReader _input, HebMorph.DataStructures.DictRadix<int> _prefixesTree)
        {
            termAtt = AddAttribute<ITermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
            //posIncrAtt = (PositionIncrementAttribute)AddAttribute(typeof(PositionIncrementAttribute));
            typeAtt = AddAttribute<ITypeAttribute>();
            input = _input;
            hebMorphTokenizer = new HebMorph.Tokenizer(_input);
            prefixesTree = _prefixesTree;
        }
Example #12
        /// <summary>
        /// Stores the segmenter, registers the term, offset and type attributes, and
        /// tokenizes <paramref name="input"/> up front in <c>TokenizerMode.Search</c> mode.
        /// </summary>
        /// <param name="seg">Segmenter stored in <c>segmenter</c> and used to tokenize the text.</param>
        /// <param name="input">Text to tokenize.</param>
        public JiebaTokenizer(JiebaSegmenter seg, string input)
        {
            segmenter = seg;
            termAtt = AddAttribute<ITermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
            typeAtt = AddAttribute<ITypeAttribute>();

            // The whole input is segmented here and the result materialized into a list.
            tokens = segmenter.Tokenize(input, TokenizerMode.Search).ToList();
        }
Example #13
 /// <summary>
 /// Construct a token stream filtering the given input using a Set of common
 /// words to create bigrams. Outputs both unigrams with position increment and
 /// bigrams with position increment 0 type=gram where one or both of the words
 /// in a potential bigram are in the set of common words.
 /// </summary>
 /// <param name="matchVersion"> Lucene compatibility version; not referenced by the statements visible in this constructor. </param>
 /// <param name="input"> TokenStream input in filter chain </param>
 /// <param name="commonWords"> The set of common words. </param>
 public CommonGramsFilter(LuceneVersion matchVersion, TokenStream input, CharArraySet commonWords)
     : base(input)
 {
     termAttribute = AddAttribute<ICharTermAttribute>();
     offsetAttribute = AddAttribute<IOffsetAttribute>();
     typeAttribute = AddAttribute<ITypeAttribute>();
     posIncAttribute = AddAttribute<IPositionIncrementAttribute>();
     posLenAttribute = AddAttribute<IPositionLengthAttribute>();
     this.commonWords = commonWords;
 }
Example #14
 /// <summary>
 /// Creates the filter over <paramref name="in"/> and registers the term and offset attributes.
 /// </summary>
 /// <param name="version">Compatibility version checked against <c>LUCENE_44</c>.</param>
 /// <param name="in">Token stream passed to the base class.</param>
 /// <param name="updateOffsets">Stored in <c>updateOffsets</c>; rejected when true for versions on or after <c>LUCENE_44</c>.</param>
 /// <exception cref="System.ArgumentException">
 /// <paramref name="updateOffsets"/> is true and <paramref name="version"/> is on or after <c>LUCENE_44</c>.
 /// </exception>
 public TrimFilter(LuceneVersion version, TokenStream @in, bool updateOffsets)
     : base(@in)
 {
     if (updateOffsets && version.OnOrAfter(LuceneVersion.LUCENE_44))
     {
         throw new System.ArgumentException("updateOffsets=true is not supported anymore as of Lucene 4.4");
     }

     termAtt = AddAttribute<ICharTermAttribute>();
     offsetAtt = AddAttribute<IOffsetAttribute>();
     this.updateOffsets = updateOffsets;
 }
 /// <summary>
 /// Stores the supplied tokens and registers the term, offset, position-increment,
 /// flags, type and payload attributes.
 /// </summary>
 /// <param name="tokens">Tokens stored in the <c>tokens</c> field.</param>
 public IterTokenStream(params Token[] tokens)
     : base()
 {
     this.tokens = tokens;
     this.termAtt = AddAttribute<ICharTermAttribute>();
     this.offsetAtt = AddAttribute<IOffsetAttribute>();
     this.posIncAtt = AddAttribute<IPositionIncrementAttribute>();
     this.flagsAtt = AddAttribute<IFlagsAttribute>();
     this.typeAtt = AddAttribute<ITypeAttribute>();
     this.payloadAtt = AddAttribute<IPayloadAttribute>();
 }
 /// <summary>
 /// Creates a new ThaiTokenizer, supplying the AttributeFactory </summary>
 /// <param name="factory">Attribute factory forwarded to the base constructor.</param>
 /// <param name="reader">Reader forwarded to the base constructor.</param>
 public ThaiTokenizer(AttributeFactory factory, TextReader reader)
       // The base class receives a sentence-level BreakIterator created for the US locale.
       : base(factory, reader, BreakIterator.CreateSentenceInstance(Locale.GetUS()))
         // NOTE(review): as listed, this throw is unconditional and nothing after it could run;
         // the guarding condition (and the braces) appear to be missing from this listing - confirm against the full source.
         throw new System.NotSupportedException("This JRE does not have support for Thai segmentation");
     // Word-level breaking is delegated to a ThaiWordBreaker wrapping a word BreakIterator for the US locale.
     wordBreaker = new ThaiWordBreaker(BreakIterator.CreateWordInstance(Locale.GetUS()));
     termAtt = AddAttribute<ICharTermAttribute>();
     offsetAtt = AddAttribute<IOffsetAttribute>();
        /// <summary>
        /// Creates the tokenizer with the given attribute factory and input, registers
        /// the term and offset attributes, and validates <paramref name="bufferSize"/>.
        /// </summary>
        /// <param name="factory">Attribute factory forwarded to the base constructor.</param>
        /// <param name="input">Reader forwarded to the base constructor.</param>
        /// <param name="bufferSize">Must be greater than zero.</param>
        /// <exception cref="System.ArgumentException"><paramref name="bufferSize"/> is zero or negative.</exception>
        public KeywordTokenizer(AttributeSource.AttributeFactory factory, Reader input, int bufferSize)
            : base(factory, input)
        {
            termAtt = AddAttribute<ICharTermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();

            if (bufferSize <= 0)
            {
                throw new System.ArgumentException("bufferSize must be > 0");
            }
        }
        private bool hasIllegalOffsets = false; // only if the length changed before this filter

        /// <summary>
        /// Creates a new ThaiWordFilter with the specified match version. </summary>
        /// <param name="matchVersion">
        /// Compatibility version. On or after <c>LUCENE_31</c> the input is used as-is and
        /// <c>handlePosIncr</c> is set; for earlier versions the input is wrapped in a <c>LowerCaseFilter</c>.
        /// </param>
        /// <param name="input">Token stream to filter.</param>
        /// <exception cref="System.NotSupportedException"><c>DBBI_AVAILABLE</c> is false.</exception>
        public ThaiWordFilter(LuceneVersion matchVersion, TokenStream input)
              : base(matchVersion.OnOrAfter(LuceneVersion.LUCENE_31) ? input : new LowerCaseFilter(matchVersion, input))
        {
            if (!DBBI_AVAILABLE)
            {
                throw new System.NotSupportedException("This JRE does not have support for Thai segmentation");
            }

            handlePosIncr = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
            termAtt = AddAttribute<ICharTermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
            posAtt = AddAttribute<IPositionIncrementAttribute>();
        }
        /// <summary>
        /// Registers the term, offset, position-increment and type attributes and stores
        /// the reader, lemmatizer, lemma filter and the "save marked original" flag.
        /// </summary>
        /// <param name="input">Reader stored in <c>this.input</c>.</param>
        /// <param name="_lemmatizer">Stored in <c>_streamLemmatizer</c>.</param>
        /// <param name="_lemmaFilter">Stored in <c>lemmaFilter</c>.</param>
        /// <param name="AlwaysSaveMarkedOriginal">Stored in <c>alwaysSaveMarkedOriginal</c>.</param>
        private void Init(System.IO.TextReader input, HebMorph.StreamLemmatizer _lemmatizer,
            HebMorph.LemmaFilters.LemmaFilterBase _lemmaFilter, bool AlwaysSaveMarkedOriginal)
        {
            termAtt = AddAttribute<ITermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
            posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            typeAtt = AddAttribute<ITypeAttribute>();
            //payAtt = (PayloadAttribute)AddAttribute(typeof(PayloadAttribute));

            this.input = input;
            this._streamLemmatizer = _lemmatizer;
            this.alwaysSaveMarkedOriginal = AlwaysSaveMarkedOriginal;
            this.lemmaFilter = _lemmaFilter;
        }
Example #20
        /// <summary>
        /// Creates NGramTokenFilter with given min and max n-grams.
        /// </summary>
        /// <param name="input"><see cref="TokenStream"/> holding the input to be tokenized</param>
        /// <param name="minGram">the smallest n-gram to generate</param>
        /// <param name="maxGram">the largest n-gram to generate</param>
        public NGramTokenFilter(TokenStream input, int minGram, int maxGram)
            : base(input)
        {
            // Reject gram sizes below one and inverted ranges before storing them.
            if (minGram < 1)
            {
                throw new System.ArgumentException("minGram must be greater than zero");
            }

            if (minGram > maxGram)
            {
                throw new System.ArgumentException("minGram must not be greater than maxGram");
            }

            this.minGram = minGram;
            this.maxGram = maxGram;

            this.termAtt = AddAttribute<ITermAttribute>();
            this.offsetAtt = AddAttribute<IOffsetAttribute>();
        }
 /// <summary>
 /// If inputText is non-null, and the TokenStream has
 ///  offsets, we include the surface form in each arc's
 ///  label.
 /// </summary>
 /// <param name="inputText">Original text, stored in <c>InputText</c>; may be null.</param>
 /// <param name="in">Token stream whose attributes are read.</param>
 /// <param name="out">Writer stored for the output.</param>
 public TokenStreamToDot(string inputText, TokenStream @in, TextWriter @out)
 {
     this.@in = @in;
     this.@out = @out;
     this.InputText = inputText;
     TermAtt = @in.AddAttribute<ICharTermAttribute>();
     PosIncAtt = @in.AddAttribute<IPositionIncrementAttribute>();
     PosLengthAtt = @in.AddAttribute<IPositionLengthAttribute>();

     // Bind the offset attribute only when the stream already has one; otherwise
     // leave it null. Without the else branch the null assignment would always win.
     OffsetAtt = @in.HasAttribute<IOffsetAttribute>()
         ? @in.AddAttribute<IOffsetAttribute>()
         : null;
 }
        /// <summary>
        /// creates a new PatternTokenizer returning tokens from group (-1 for split functionality) </summary>
        /// <param name="factory">Attribute factory forwarded to the base constructor.</param>
        /// <param name="input">Reader forwarded to the base constructor.</param>
        /// <param name="pattern">Regular expression used for matching.</param>
        /// <param name="group">Capturing group to emit as tokens; a negative value skips the group-count check.</param>
        /// <exception cref="System.ArgumentException">
        /// <paramref name="group"/> is non-negative and larger than the number of capturing groups in <paramref name="pattern"/>.
        /// </exception>
        public PatternTokenizer(AttributeFactory factory, TextReader input, Regex pattern, int group)
              : base(factory, input)
        {
            this.termAtt = AddAttribute<ICharTermAttribute>();
            this.offsetAtt = AddAttribute<IOffsetAttribute>();
            // NOTE(review): the left-hand side of this assignment was garbled in the listing;
            // restored as the field matching the parameter name - confirm.
            this.group = group;

            // Use "" instead of str so don't consume chars
            // (fillBuffer) from the input on throwing IAE below:
            this.matcher = pattern.Match("");
            this.pattern = pattern;

            // GetGroupNumbers() also lists group 0 (the whole match), so it is excluded
            // to get the number of capturing groups used by the check and the message.
            int groupCount = pattern.GetGroupNumbers().Length - 1;
            if (group >= 0 && group > groupCount)
            {
                throw new System.ArgumentException("invalid group specified: pattern only has: " + groupCount + " capturing groups");
            }
        }

        /// <summary>
        /// Creates a filter over <paramref name="suffix"/> that also tracks a separate
        /// <paramref name="prefix"/> stream, registering the same six attributes on
        /// this filter and on the prefix stream.
        /// </summary>
        /// <param name="prefix">Prefix stream; its attributes are cached in the <c>p_*</c> fields.</param>
        /// <param name="suffix">Suffix stream; passed to the base class.</param>
        public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix)
            : base(suffix)
        {
            this.suffix = suffix;
            this.prefix = prefix;
            prefixExhausted = false;

            // Attributes of this filter.
            termAtt = AddAttribute<ICharTermAttribute>();
            posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            payloadAtt = AddAttribute<IPayloadAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
            typeAtt = AddAttribute<ITypeAttribute>();
            flagsAtt = AddAttribute<IFlagsAttribute>();

            // Matching attributes of the prefix stream.
            p_termAtt = prefix.AddAttribute<ICharTermAttribute>();
            p_posIncrAtt = prefix.AddAttribute<IPositionIncrementAttribute>();
            p_payloadAtt = prefix.AddAttribute<IPayloadAttribute>();
            p_offsetAtt = prefix.AddAttribute<IOffsetAttribute>();
            p_typeAtt = prefix.AddAttribute<ITypeAttribute>();
            p_flagsAtt = prefix.AddAttribute<IFlagsAttribute>();
        }
        /// <summary>
        /// Creates a filter over <paramref name="suffix"/> that also tracks a separate
        /// <paramref name="prefix"/> stream, registering the same six attributes on
        /// this filter and on the prefix stream.
        /// </summary>
        /// <param name="prefix">Prefix stream; its attributes are cached in the <c>_p*</c> fields.</param>
        /// <param name="suffix">Suffix stream; passed to the base class.</param>
        public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix) : base(suffix)
        {
            Suffix = suffix;
            Prefix = prefix;
            _prefixExhausted = false;

            // Attributes of this filter.
            // ReSharper disable DoNotCallOverridableMethodsInConstructor
            _termAtt = AddAttribute<ITermAttribute>();
            _posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            _payloadAtt = AddAttribute<IPayloadAttribute>();
            _offsetAtt = AddAttribute<IOffsetAttribute>();
            _typeAtt = AddAttribute<ITypeAttribute>();
            _flagsAtt = AddAttribute<IFlagsAttribute>();
            // ReSharper restore DoNotCallOverridableMethodsInConstructor

            // Matching attributes of the prefix stream.
            _pTermAtt = prefix.AddAttribute<ITermAttribute>();
            _pPosIncrAtt = prefix.AddAttribute<IPositionIncrementAttribute>();
            _pPayloadAtt = prefix.AddAttribute<IPayloadAttribute>();
            _pOffsetAtt = prefix.AddAttribute<IOffsetAttribute>();
            _pTypeAtt = prefix.AddAttribute<ITypeAttribute>();
            _pFlagsAtt = prefix.AddAttribute<IFlagsAttribute>();
        }
Example #25
        /// <summary>
        /// Creates the tokenizer, reads the entire input into memory and segments it
        /// with PanGu, storing the resulting words in <c>words</c>.
        /// </summary>
        /// <param name="input">Reader passed to the base class and read to the end here.</param>
        public PanGuTokenizer(TextReader input)
            : base(input)
        {
            termAttribute = AddAttribute<ITermAttribute>();
            offsetAttribute = AddAttribute<IOffsetAttribute>();

            inputText = base.input.ReadToEnd();

            // If ReadToEnd produced nothing, try reading in 1024-character chunks instead.
            if (string.IsNullOrEmpty(inputText)) {
                char[] readBuf = new char[1024];

                int relCount = base.input.Read(readBuf, 0, readBuf.Length);

                StringBuilder inputStr = new StringBuilder(readBuf.Length);

                while (relCount > 0) {
                    inputStr.Append(readBuf, 0, relCount);

                    relCount = input.Read(readBuf, 0, readBuf.Length);
                }

                if (inputStr.Length > 0) {
                    inputText = inputStr.ToString();
                }
            }

            // No text yields an empty word array; otherwise segment and copy the results.
            if (string.IsNullOrEmpty(inputText)) {
                words = new WordInfo[0];
            }
            else {
                global::PanGu.Segment segment = new Segment();
                ICollection<WordInfo> wordInfos = segment.DoSegment(inputText);
                words = new WordInfo[wordInfos.Count];
                wordInfos.CopyTo(words, 0);
            }
        }
Example #26
        /// <summary>
        /// Asserts that two <see cref="Fields"/> instances agree on field names, terms,
        /// total term frequencies, per-document frequencies, positions and (when an
        /// offset attribute is present) start/end offsets.
        /// </summary>
        /// <remarks>
        /// NOTE(review): this listing appears to have lost its braces and some statements
        /// (for example <c>else</c> branches); the comments below describe only what is visible.
        /// </remarks>
        /// <param name="d1">First fields instance; may be null.</param>
        /// <param name="d2">Second fields instance.</param>
        public static void VerifyEquals(Fields d1, Fields d2)
            // When d1 is null, d2 must be null or have no fields.
            if (d1 == null)
                Assert.IsTrue(d2 == null || d2.Count == 0);
            Assert.IsTrue(d2 != null);

            IEnumerator <string> fieldsEnum2 = d2.GetEnumerator();

            // Compare each field of d1 with the current field of d2's enumerator.
            // NOTE(review): no call advancing fieldsEnum2 is visible in this listing - confirm against the full source.
            foreach (string field1 in d1)
                string field2 = fieldsEnum2.Current;
                Assert.AreEqual(field1, field2);

                Terms terms1 = d1.GetTerms(field1);
                TermsEnum termsEnum1 = terms1.GetIterator(null);

                Terms terms2 = d2.GetTerms(field2);
                TermsEnum termsEnum2 = terms2.GetIterator(null);

                // Enum instances are passed back into DocsAndPositions/Docs on every term.
                DocsAndPositionsEnum dpEnum1 = null;
                DocsAndPositionsEnum dpEnum2 = null;
                DocsEnum             dEnum1  = null;
                DocsEnum             dEnum2  = null;

                // Step both term enumerations together; each pair of terms must be equal.
                BytesRef term1;
                while ((term1 = termsEnum1.Next()) != null)
                    BytesRef term2 = termsEnum2.Next();
                    Assert.AreEqual(term1, term2);
                    Assert.AreEqual(termsEnum1.TotalTermFreq, termsEnum2.TotalTermFreq);

                    dpEnum1 = termsEnum1.DocsAndPositions(null, dpEnum1);
                    dpEnum2 = termsEnum2.DocsAndPositions(null, dpEnum2);
                    // Positions path, taken when a positions enum is available for the first side.
                    if (dpEnum1 != null)
                        int docID1 = dpEnum1.NextDoc();
                        // docIDs are not supposed to be equal
                        //int docID2 = dpEnum2.NextDoc();
                        //Assert.AreEqual(docID1, docID2);
                        Assert.IsTrue(docID1 != DocIdSetIterator.NO_MORE_DOCS);

                        int freq1 = dpEnum1.Freq;
                        int freq2 = dpEnum2.Freq;
                        Assert.AreEqual(freq1, freq2);
                        // Offset attributes are optional; null means the enum does not expose one.
                        IOffsetAttribute offsetAtt1 = dpEnum1.Attributes.HasAttribute <IOffsetAttribute>() ? dpEnum1.Attributes.GetAttribute <IOffsetAttribute>() : null;
                        IOffsetAttribute offsetAtt2 = dpEnum2.Attributes.HasAttribute <IOffsetAttribute>() ? dpEnum2.Attributes.GetAttribute <IOffsetAttribute>() : null;

                        // NOTE(review): the body of this check is not visible in the listing.
                        if (offsetAtt1 != null)

                        // Compare every position and, when offsets exist, the start/end offsets.
                        for (int posUpto = 0; posUpto < freq1; posUpto++)
                            int pos1 = dpEnum1.NextPosition();
                            int pos2 = dpEnum2.NextPosition();
                            Assert.AreEqual(pos1, pos2);
                            if (offsetAtt1 != null)
                                Assert.AreEqual(offsetAtt1.StartOffset, offsetAtt2.StartOffset);
                                Assert.AreEqual(offsetAtt1.EndOffset, offsetAtt2.EndOffset);
                        // Both enums are expected to be exhausted after the first document.
                        Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum1.NextDoc());
                        Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum2.NextDoc());
                        // NOTE(review): the statements below redeclare docID1/freq1/freq2, so they presumably
                        // belong to a separate (else) branch that compares frequencies only - confirm.
                        dEnum1 = TestUtil.Docs(Random(), termsEnum1, null, dEnum1, DocsFlags.FREQS);
                        dEnum2 = TestUtil.Docs(Random(), termsEnum2, null, dEnum2, DocsFlags.FREQS);
                        int docID1 = dEnum1.NextDoc();
                        // docIDs are not supposed to be equal
                        //int docID2 = dEnum2.NextDoc();
                        //Assert.AreEqual(docID1, docID2);
                        Assert.IsTrue(docID1 != DocIdSetIterator.NO_MORE_DOCS);
                        int freq1 = dEnum1.Freq;
                        int freq2 = dEnum2.Freq;
                        Assert.AreEqual(freq1, freq2);
                        Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dEnum1.NextDoc());
                        Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dEnum2.NextDoc());

Example #27
 /// <summary>
 /// Wraps an attribute source and caches its term and offset attributes.
 /// </summary>
 /// <param name="attSource">Attribute source stored in <c>attSource</c>; its attributes are fetched with <c>GetAttribute</c>.</param>
 public InputWindowToken(AttributeSource attSource)
 {
     this.attSource = attSource;
     this.termAtt = attSource.GetAttribute<ICharTermAttribute>();
     this.offsetAtt = attSource.GetAttribute<IOffsetAttribute>();
 }
 /// <summary>
 /// Construct a new SegmentingTokenizerBase, also supplying the <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory"/>
 /// </summary>
 /// <param name="factory">Attribute factory forwarded to the base constructor.</param>
 /// <param name="reader">Reader forwarded to the base constructor.</param>
 /// <param name="iterator">Break iterator stored in <c>iterator</c>.</param>
 public SegmentingTokenizerBase(AttributeFactory factory, TextReader reader, BreakIterator iterator)
     : base(factory, reader)
 {
     offsetAtt = AddAttribute<IOffsetAttribute>();
     this.iterator = iterator;
 }
Example #29
            /// <summary>
            /// Builds a token stream of <paramref name="len"/> randomly chosen terms with random
            /// position increments, offsets and payloads, and precomputes lookup tables and term frequencies.
            /// </summary>
            /// <remarks>
            /// NOTE(review): this listing appears to have lost its braces and some statements
            /// (for example <c>else</c> branches); the comments below describe only what is visible.
            /// </remarks>
            /// <param name="baseTermVectorsFormatTestCase">Test case used to obtain random payloads.</param>
            /// <param name="len">Number of tokens to generate.</param>
            /// <param name="sampleTerms">Pool of term strings to pick from.</param>
            /// <param name="sampleTermBytes">Term bytes indexed in parallel with <paramref name="sampleTerms"/>.</param>
            /// <param name="offsetsGoBackwards">Selects the branch that assigns unconstrained random start/end offsets.</param>
            protected internal RandomTokenStream(BaseTermVectorsFormatTestCase baseTermVectorsFormatTestCase, int len, string[] sampleTerms, BytesRef[] sampleTermBytes, bool offsetsGoBackwards)
                terms               = new string[len];
                termBytes           = new BytesRef[len];
                positionsIncrements = new int[len];
                positions           = new int[len];
                startOffsets        = new int[len];
                endOffsets          = new int[len];
                payloads            = new BytesRef[len];
                // Pick a random sample term per token; the first increment is at least 1, later ones may be 0.
                for (int i = 0; i < len; ++i)
                    int o = Random.Next(sampleTerms.Length);
                    terms[i]               = sampleTerms[o];
                    termBytes[i]           = sampleTermBytes[o];
                    positionsIncrements[i] = TestUtil.NextInt32(Random, i == 0 ? 1 : 0, 10);
                    // NOTE(review): the assignments below look like alternatives (random offsets vs. offsets
                    // derived from the previous token); the separating else lines are not visible - confirm.
                    if (offsetsGoBackwards)
                        startOffsets[i] = Random.Next();
                        endOffsets[i]   = Random.Next();
                        if (i == 0)
                            startOffsets[i] = TestUtil.NextInt32(Random, 0, 1 << 16);
                            startOffsets[i] = startOffsets[i - 1] + TestUtil.NextInt32(Random, 0, Rarely() ? 1 << 16 : 20);
                        endOffsets[i] = startOffsets[i] + TestUtil.NextInt32(Random, 0, Rarely() ? 1 << 10 : 20);

                // Derive positions from the increments: increment - 1 for the first token,
                // previous position + increment for the rest (presumably via an else - confirm).
                for (int i = 0; i < len; ++i)
                    if (i == 0)
                        positions[i] = positionsIncrements[i] - 1;
                        positions[i] = positions[i - 1] + positionsIncrements[i];
                // Payloads are either one random payload filled into every slot or a fresh random payload per token.
                if (Rarely())
                    Arrays.Fill(payloads, baseTermVectorsFormatTestCase.RandomPayload());
                    for (int i = 0; i < len; ++i)
                        payloads[i] = baseTermVectorsFormatTestCase.RandomPayload();

                // Lookup tables keyed by position and by start offset; a set is created on first use of a key.
                // NOTE(review): no statement adding i to the sets is visible in this listing - confirm.
                positionToTerms    = new Dictionary <int, ISet <int> >(len);
                startOffsetToTerms = new Dictionary <int, ISet <int> >(len);
                for (int i = 0; i < len; ++i)
                    if (!positionToTerms.TryGetValue(positions[i], out ISet <int> positionTerms))
                        positionToTerms[positions[i]] = positionTerms = new JCG.HashSet <int>(1);
                    if (!startOffsetToTerms.TryGetValue(startOffsets[i], out ISet <int> startOffsetTerms))
                        startOffsetToTerms[startOffsets[i]] = startOffsetTerms = new JCG.HashSet <int>(1);

                // Count occurrences of each term.
                freqs = new Dictionary <string, int>();
                foreach (string term in terms)
                    if (freqs.TryGetValue(term, out int freq))
                        freqs[term] = freq + 1;
                        freqs[term] = 1;

                // The PermissiveOffsetAttribute implementation is added before the attributes are requested.
                AddAttributeImpl(new PermissiveOffsetAttribute());

                termAtt = AddAttribute <ICharTermAttribute>();
                piAtt   = AddAttribute <IPositionIncrementAttribute>();
                oAtt    = AddAttribute <IOffsetAttribute>();
                pAtt    = AddAttribute <IPayloadAttribute>();
 /// <summary>
 /// Creates a new HyphenatedWordsFilter
 /// </summary>
 /// <param name="in"> TokenStream that will be filtered </param>
 public HyphenatedWordsFilter(TokenStream @in)
     : base(@in)
 {
     termAttribute = AddAttribute<ICharTermAttribute>();
     offsetAttribute = AddAttribute<IOffsetAttribute>();
 }
Example #31
 /// <summary>
 /// Registers the term, payload and offset attributes and caches them in fields.
 /// </summary>
 public PositionsTokenStream()
 {
     term = AddAttribute<ICharTermAttribute>();
     payload = AddAttribute<IPayloadAttribute>();
     offset = AddAttribute<IOffsetAttribute>();
 }
Example #32
 /// <summary>
 /// Stores the text and lower-casing flag, starts matching <paramref name="regex"/>
 /// against the text, and registers the term and offset attributes.
 /// </summary>
 /// <param name="str">Text to tokenize.</param>
 /// <param name="regex">Pattern whose first match against <paramref name="str"/> is stored in <c>matcher</c>.</param>
 /// <param name="toLowerCase">Stored in <c>toLowerCase</c>.</param>
 public RegexTokenizer(String str, Regex regex, bool toLowerCase)
 {
     this.str = str;
     this.matcher = regex.Match(str);
     this.toLowerCase = toLowerCase;
     this.termAtt = AddAttribute<ITermAttribute>();
     this.offsetAtt = AddAttribute<IOffsetAttribute>();
 }
 /// <summary>
 /// Creates the filter over <paramref name="in"/> and registers the term,
 /// position-increment, offset and type attributes.
 /// </summary>
 /// <param name="in">Token stream passed to the base class.</param>
 public TestFilter(TokenStream @in)
     : base(@in)
 {
     termAtt = AddAttribute<ICharTermAttribute>();
     posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
     offsetAtt = AddAttribute<IOffsetAttribute>();
     typeAtt = AddAttribute<ITypeAttribute>();
 }
Example #34
 /// <summary>
 /// Creates the tokenizer over <paramref name="input"/> and registers the offset and term attributes.
 /// </summary>
 /// <param name="input">Reader passed to the base class.</param>
 public IdentifierTokenizer(TextReader input)
     : base(input)
 {
     _offsetAtt = AddAttribute<IOffsetAttribute>();
     _termAtt = AddAttribute<ITermAttribute>();
 }
 /// <summary>
 /// Registers the term, offset and position-increment attributes and caches them.
 /// </summary>
 public BugReproTokenStream()
 {
     TermAtt = AddAttribute<ICharTermAttribute>();
     OffsetAtt = AddAttribute<IOffsetAttribute>();
     PosIncAtt = AddAttribute<IPositionIncrementAttribute>();
 }
Example #36
        /// <summary>
        /// Advances to the next token. While a cloned input token still has unconsumed
        /// break positions, the next slice of it is emitted; otherwise a new token is
        /// pulled from the input and, if it matches <c>thaiPattern</c>, its first slice is emitted.
        /// </summary>
        /// <remarks>
        /// NOTE(review): no <c>return</c> statements or <c>else</c> branches are visible in this
        /// listing; the comments below describe only the visible statements.
        /// </remarks>
        public override bool IncrementToken()
            // Continue slicing the previously cloned token using the break iterator.
            if (hasMoreTokensInClone)
                int start = breaker.Current;
                int end   = breaker.Next();
                if (end != BreakIterator.Done)
                    // Copy the [start, end) slice of the cloned term into the current term.
                    termAtt.CopyBuffer(clonedTermAtt.Buffer, start, end - start);
                    // NOTE(review): the two SetOffset calls look like alternatives (keep the original
                    // offsets vs. offsets adjusted by the slice); the else line is not visible - confirm.
                    if (hasIllegalOffsets)
                        offsetAtt.SetOffset(clonedOffsetAtt.StartOffset, clonedOffsetAtt.EndOffset);
                        offsetAtt.SetOffset(clonedOffsetAtt.StartOffset + start, clonedOffsetAtt.StartOffset + end);
                    if (handlePosIncr)
                        posAtt.PositionIncrement = 1;
                hasMoreTokensInClone = false;

            // Pull the next token from the wrapped input.
            if (!m_input.IncrementToken())

            // Tokens that are empty or whose first character does not match thaiPattern are tested here.
            if (termAtt.Length == 0 || !thaiPattern.IsMatch(string.Empty + termAtt[0]))

            hasMoreTokensInClone = true;

            // if length by start + end offsets doesn't match the term text then assume
            // this is a synonym and don't adjust the offsets.
            hasIllegalOffsets = offsetAtt.EndOffset - offsetAtt.StartOffset != termAtt.Length;

            // we lazy init the cloned token, as in ctor not all attributes may be added
            if (clonedToken == null)
                clonedToken     = CloneAttributes();
                clonedTermAtt   = clonedToken.GetAttribute <ICharTermAttribute>();
                clonedOffsetAtt = clonedToken.GetAttribute <IOffsetAttribute>();

            // reinit CharacterIterator
            charIterator.SetText(clonedTermAtt.Buffer, 0, clonedTermAtt.Length);
            breaker.SetText(new string(charIterator.Text, charIterator.Start, charIterator.Length));
            int end2 = breaker.Next();

            // Truncate the current term to the first break position and set its offsets.
            if (end2 != BreakIterator.Done)
                termAtt.Length = end2;
                // NOTE(review): as above, these two SetOffset calls look like if/else alternatives - confirm.
                if (hasIllegalOffsets)
                    offsetAtt.SetOffset(clonedOffsetAtt.StartOffset, clonedOffsetAtt.EndOffset);
                    offsetAtt.SetOffset(clonedOffsetAtt.StartOffset, clonedOffsetAtt.StartOffset + end2);
                // position increment keeps as it is for first token
Example #37
        /// <summary>
        /// Iterates over the given token stream and adds the resulting terms to the index;
        /// Equivalent to adding a tokenized, indexed, termVectorStored, unstored,
        /// Lucene <see cref="Documents.Field"/>.
        /// Finally closes the token stream. Note that untokenized keywords can be added with this method via
        /// <see cref="KeywordTokenStream{T}(ICollection{T})"/>, the Lucene <c>KeywordTokenizer</c> or similar utilities.
        /// </summary>
        /// <remarks>
        /// NOTE(review): this listing appears to have lost its braces, its try/finally skeleton and
        /// some statements; the inline comments describe only what is visible.
        /// </remarks>
        /// <param name="fieldName"> a name to be associated with the text </param>
        /// <param name="stream"> the token stream to retrieve tokens from. </param>
        /// <param name="boost"> the boost factor for hits for this field </param>
        /// <param name="positionIncrementGap"> the position increment gap if fields with the same name are added more than once </param>
        /// <param name="offsetGap"> the offset gap if fields with the same name are added more than once </param>
        /// <exception cref="ArgumentException"> if <paramref name="fieldName"/> or <paramref name="stream"/> is null, or <paramref name="boost"/> is not greater than 0.0 </exception>
        /// <seealso cref="Documents.Field.Boost"/>
        public virtual void AddField(string fieldName, TokenStream stream, float boost, int positionIncrementGap, int offsetGap)
                // Argument validation.
                if (fieldName == null)
                    throw new ArgumentException("fieldName must not be null");
                if (stream == null)
                    throw new ArgumentException("token stream must not be null");
                if (boost <= 0.0f)
                    throw new ArgumentException("boost factor must be greater than 0.0");
                int                 numTokens        = 0;
                int                 numOverlapTokens = 0;
                int                 pos = -1;
                BytesRefHash        terms;
                SliceByteStartArray sliceArray;
                Info                info             = null;
                long                sumTotalTermFreq = 0;
                int                 offset           = 0;
                // When the field was added before, continue from its recorded state, applying the
                // position/offset gaps and multiplying the boosts.
                if (fields.TryGetValue(fieldName, out info))
                    numTokens        = info.numTokens;
                    numOverlapTokens = info.numOverlapTokens;
                    pos              = info.lastPosition + positionIncrementGap;
                    offset           = info.lastOffset + offsetGap;
                    terms            = info.terms;
                    boost           *= info.boost;
                    sliceArray       = info.sliceArray;
                    sumTotalTermFreq = info.sumTotalTermFreq;
                    // NOTE(review): these two allocations presumably belong to the else branch
                    // (field not seen before); the else line is not visible - confirm.
                    sliceArray = new SliceByteStartArray(BytesRefHash.DEFAULT_CAPACITY);
                    terms      = new BytesRefHash(byteBlockPool, BytesRefHash.DEFAULT_CAPACITY, sliceArray);

                // Register a FieldInfo for a new field name; offsets are indexed only when storeOffsets is set.
                // NOTE(review): the remaining constructor arguments are not visible in this listing.
                if (!fieldInfos.ContainsKey(fieldName))
                    fieldInfos[fieldName] = new FieldInfo(fieldName,
                                                          this.storeOffsets ? IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS : IndexOptions.DOCS_AND_FREQS_AND_POSITIONS,
                ITermToBytesRefAttribute    termAtt          = stream.GetAttribute <ITermToBytesRefAttribute>();
                IPositionIncrementAttribute posIncrAttribute = stream.AddAttribute <IPositionIncrementAttribute>();
                IOffsetAttribute            offsetAtt        = stream.AddAttribute <IOffsetAttribute>();
                BytesRef @ref = termAtt.BytesRef;

                // Consume the stream token by token.
                while (stream.IncrementToken())
                    //        if (DEBUG) System.err.println("token='" + term + "'");
                    int posIncr = posIncrAttribute.PositionIncrement;
                    // NOTE(review): the body of this check is not visible in the listing.
                    if (posIncr == 0)
                    pos += posIncr;
                    // Add returns a negative value for a term that is already present; it is converted
                    // back to the ordinal here.
                    int ord = terms.Add(@ref);
                    if (ord < 0)
                        ord = (-ord) - 1;
                        // NOTE(review): starting a new slice presumably applies to new terms only (else branch) - confirm.
                        sliceArray.start[ord] = postingsWriter.StartNewSlice();
                    // NOTE(review): only the offset writes are visible; they presumably belong to the
                    // branch where offsets are stored - confirm.
                    if (!storeOffsets)
                        postingsWriter.WriteInt32(offsetAtt.StartOffset + offset);
                        postingsWriter.WriteInt32(offsetAtt.EndOffset + offset);
                    sliceArray.end[ord] = postingsWriter.CurrentOffset;

                // ensure infos.numTokens > 0 invariant; needed for correct operation of terms()
                if (numTokens > 0)
                    fields[fieldName] = new Info(terms, sliceArray, numTokens, numOverlapTokens, boost, pos, offsetAtt.EndOffset + offset, sumTotalTermFreq);
                    sortedFields      = null; // invalidate sorted view, if any
            } // can never happen
            // Any exception is wrapped in a new Exception carrying the original as inner exception.
            catch (Exception e)
                throw new Exception(e.ToString(), e);
                    // NOTE(review): the stream clean-up statement guarded by this check is not visible in the listing.
                    if (stream != null)
                catch (IOException e2)
                    throw new Exception(e2.ToString(), e2);
            /// <summary>
            /// Builds a token stream of <paramref name="len"/> randomly chosen terms with random
            /// position increments, offsets and payloads, and precomputes lookup tables and term frequencies.
            /// </summary>
            /// <remarks>
            /// NOTE(review): this listing appears to have lost its braces and some statements
            /// (for example <c>else</c> branches); the comments below describe only what is visible.
            /// </remarks>
            /// <param name="outerInstance">Enclosing test case; stored and used to obtain random payloads.</param>
            /// <param name="len">Number of tokens to generate.</param>
            /// <param name="sampleTerms">Pool of term strings to pick from.</param>
            /// <param name="sampleTermBytes">Term bytes indexed in parallel with <paramref name="sampleTerms"/>.</param>
            /// <param name="offsetsGoBackwards">Selects the branch that assigns unconstrained random start/end offsets.</param>
            protected internal RandomTokenStream(BaseTermVectorsFormatTestCase outerInstance, int len, string[] sampleTerms, BytesRef[] sampleTermBytes, bool offsetsGoBackwards)
                this.OuterInstance = outerInstance;
                Terms               = new string[len];
                TermBytes           = new BytesRef[len];
                PositionsIncrements = new int[len];
                Positions           = new int[len];
                StartOffsets        = new int[len];
                EndOffsets          = new int[len];
                Payloads            = new BytesRef[len];
                // Pick a random sample term per token; the first increment is at least 1, later ones may be 0.
                for (int i = 0; i < len; ++i)
                    int o = Random().Next(sampleTerms.Length);
                    Terms[i]               = sampleTerms[o];
                    TermBytes[i]           = sampleTermBytes[o];
                    PositionsIncrements[i] = TestUtil.NextInt(Random(), i == 0 ? 1 : 0, 10);
                    // NOTE(review): the assignments below look like alternatives (random offsets vs. offsets
                    // derived from the previous token); the separating else lines are not visible - confirm.
                    if (offsetsGoBackwards)
                        StartOffsets[i] = Random().Next();
                        EndOffsets[i]   = Random().Next();
                        if (i == 0)
                            StartOffsets[i] = TestUtil.NextInt(Random(), 0, 1 << 16);
                            StartOffsets[i] = StartOffsets[i - 1] + TestUtil.NextInt(Random(), 0, Rarely() ? 1 << 16 : 20);
                        EndOffsets[i] = StartOffsets[i] + TestUtil.NextInt(Random(), 0, Rarely() ? 1 << 10 : 20);

                // Derive positions from the increments: increment - 1 for the first token,
                // previous position + increment for the rest (presumably via an else - confirm).
                for (int i = 0; i < len; ++i)
                    if (i == 0)
                        Positions[i] = PositionsIncrements[i] - 1;
                        Positions[i] = Positions[i - 1] + PositionsIncrements[i];
                // Payloads are either one random payload filled into every slot or a fresh random payload per token.
                if (Rarely())
                    Arrays.Fill(Payloads, outerInstance.RandomPayload());
                    for (int i = 0; i < len; ++i)
                        Payloads[i] = outerInstance.RandomPayload();

                // Lookup tables keyed by position and by start offset; a set is created on first use of a key.
                // NOTE(review): no statement adding i to the sets is visible in this listing - confirm.
                PositionToTerms    = new Dictionary <int?, ISet <int?> >(len);
                StartOffsetToTerms = new Dictionary <int?, ISet <int?> >(len);
                for (int i = 0; i < len; ++i)
                    if (!PositionToTerms.ContainsKey(Positions[i]))
                        PositionToTerms[Positions[i]] = new HashSet <int?>();//size1
                    if (!StartOffsetToTerms.ContainsKey(StartOffsets[i]))
                        StartOffsetToTerms[StartOffsets[i]] = new HashSet <int?>();//size1

                // Count occurrences of each term.
                Freqs = new Dictionary <string, int?>();
                foreach (string term in Terms)
                    if (Freqs.ContainsKey(term))
                        Freqs[term] = Freqs[term] + 1;
                        Freqs[term] = 1;

                // The PermissiveOffsetAttributeImpl implementation is added before the attributes are requested.
                AddAttributeImpl(new PermissiveOffsetAttributeImpl());

                TermAtt = AddAttribute <ICharTermAttribute>();
                PiAtt   = AddAttribute <IPositionIncrementAttribute>();
                OAtt    = AddAttribute <IOffsetAttribute>();
                PAtt    = AddAttribute <IPayloadAttribute>();
Example #39
 /// <summary>
 /// Creates the tokenizer, forwarding <paramref name="factory"/> and
 /// <paramref name="input"/> to the base-class constructor.
 /// </summary>
 /// <param name="factory"> attribute factory passed through to the base class </param>
 /// <param name="input"> reader passed through to the base class </param>
 public IdentifierTokenizer(AttributeFactory factory, TextReader input)
     : base(factory, input)
     // Obtain the offset and term attributes via AddAttribute and cache them in fields.
     _offsetAtt = AddAttribute <IOffsetAttribute>();
     _termAtt   = AddAttribute <ITermAttribute>();
Example #40
        /// <summary>
        /// <para>Get the next token from the input stream.
        /// </para>
        /// <para>If the next token has <c>positionIncrement > 1</c>,
        /// <c>positionIncrement - 1</c> <see cref="fillerToken"/>s are
        /// inserted first.
        /// </para>
        /// </summary>
        /// <remarks>
        /// NOTE(review): this listing has no brace lines and appears to be missing
        /// several <c>else</c> lines (and a final return statement), so the nesting
        /// described by the inline comments below is inferred — confirm against the
        /// original source before relying on it.
        /// </remarks>
        /// <param name="target"> Where to put the new token; if null, a new instance is created. </param>
        /// <returns> On success, the populated token; null otherwise </returns>
        /// <exception cref="IOException"> if the input stream has a problem </exception>
        private InputWindowToken GetNextToken(InputWindowToken target)
            InputWindowToken newTarget = target;

            // Case 1: filler tokens are still pending, so a filler is produced.
            if (numFillerTokensToInsert > 0)
                if (null == target)
                    newTarget = new InputWindowToken(nextInputStreamToken.CloneAttributes());
                // A filler token occupies no space
                newTarget.offsetAtt.SetOffset(newTarget.offsetAtt.StartOffset, newTarget.offsetAtt.StartOffset);
                newTarget.termAtt.CopyBuffer(fillerToken, 0, fillerToken.Length);
                newTarget.isFiller = true;
            // Case 2: a real input token was saved earlier (see where isNextInputStreamToken is set to true below).
            else if (isNextInputStreamToken)
                if (null == target)
                    newTarget = new InputWindowToken(nextInputStreamToken.CloneAttributes());
                isNextInputStreamToken = false;
                newTarget.isFiller     = false;
            // Case 3: pull from the wrapped stream, unless it has already been exhausted.
            else if (!exhausted)
                if (m_input.IncrementToken())
                    if (null == target)
                        newTarget = new InputWindowToken(CloneAttributes());
                    if (posIncrAtt.PositionIncrement > 1)
                        // Each output shingle must contain at least one input token,
                        // so no more than (maxShingleSize - 1) filler tokens will be inserted.
                        numFillerTokensToInsert = Math.Min(posIncrAtt.PositionIncrement - 1, maxShingleSize - 1);
                        // Save the current token as the next input stream token
                        if (null == nextInputStreamToken)
                            nextInputStreamToken = CloneAttributes();
                        isNextInputStreamToken = true;
                        // A filler token occupies no space
                        newTarget.offsetAtt.SetOffset(offsetAtt.StartOffset, offsetAtt.StartOffset);
                        newTarget.termAtt.CopyBuffer(fillerToken, 0, fillerToken.Length);
                        newTarget.isFiller = true;
                        // NOTE(review): the opposite assignment directly below presumably sits in a missing 'else' branch — confirm.
                        newTarget.isFiller = false;
                    // NOTE(review): the statements from here on presumably belong to the branch taken when
                    // IncrementToken() returned false (the 'else' line is not visible) — confirm.
                    exhausted = true;
                    endState = CaptureState();
                    // The position increment read here is capped at (maxShingleSize - 1) trailing fillers.
                    numFillerTokensToInsert = Math.Min(posIncrAtt.PositionIncrement, maxShingleSize - 1);
                    if (numFillerTokensToInsert > 0)
                        // Build a fresh attribute source whose offsets both sit at the current end offset.
                        nextInputStreamToken = new AttributeSource(this.GetAttributeFactory());
                        nextInputStreamToken.AddAttribute <ICharTermAttribute>();
                        IOffsetAttribute newOffsetAtt = nextInputStreamToken.AddAttribute <IOffsetAttribute>();
                        newOffsetAtt.SetOffset(offsetAtt.EndOffset, offsetAtt.EndOffset);
                        // Recurse/loop just once:
                        newTarget = null;
                newTarget = null;
Example #41
 /// <summary>
 /// Creates the tokenizer, forwarding <paramref name="source"/> and
 /// <paramref name="input"/> to the base-class constructor.
 /// </summary>
 /// <param name="source"> attribute source passed through to the base class </param>
 /// <param name="input"> reader passed through to the base class </param>
 public IdentifierTokenizer(AttributeSource source, TextReader input)
     : base(source, input)
     // Obtain the offset and term attributes via AddAttribute and cache them in fields.
     _offsetAtt = AddAttribute <IOffsetAttribute>();
     _termAtt   = AddAttribute <ITermAttribute>();
Example #42
        /// <summary>
        /// Processes the first <paramref name="count"/> entries of <paramref name="fields"/>.
        /// For each indexed field (when the consumer's <c>Start</c> call returned true) the
        /// token stream is walked, position increments and — if offsets are indexed — offsets
        /// are validated, and <c>fieldState</c> position, offset and boost are advanced.
        /// Each processed array slot is set to <c>null</c> afterwards.
        /// </summary>
        /// <remarks>
        /// NOTE(review): this listing has no brace lines and appears to be missing some
        /// statements and keywords (for example, the <c>do</c> matching the visible
        /// <c>} while</c>), so the nesting suggested by the indentation is inferred —
        /// confirm against the original source.
        /// </remarks>
        /// <param name="fields"> fields to process; processed slots are cleared to <c>null</c> </param>
        /// <param name="count"> number of leading entries of <paramref name="fields"/> to process </param>
        /// <exception cref="NotSupportedException"> if a field omits norms but carries a boost other than 1.0 </exception>
        /// <exception cref="ArgumentException"> if a token has an illegal position increment or offset, or if an immense term was recorded in <c>docState.maxTermPrefix</c> </exception>
        public override void ProcessFields(IIndexableField[] fields, int count)

            // The consumer's answer gates whether any field is inverted below.
            bool doInvert = consumer.Start(fields, count);

            for (int i = 0; i < count; i++)
                IIndexableField     field     = fields[i];
                IIndexableFieldType fieldType = field.IndexableFieldType;

                // TODO FI: this should be "genericized" to querying
                // consumer if it wants to see this particular field
                // tokenized.
                if (fieldType.IsIndexed && doInvert)
                    // A field counts as analyzed only if it is tokenized and an analyzer is available.
                    bool analyzed = fieldType.IsTokenized && docState.analyzer != null;

                    // if the field omits norms, the boost cannot be indexed.
                    if (fieldType.OmitNorms && field.Boost != 1.0f)
                        throw new NotSupportedException("You cannot set an index-time boost: norms are omitted for field '" + field.Name + "'");

                    // only bother checking offsets if something will consume them.
                    // TODO: after we fix analyzers, also check if termVectorOffsets will be indexed.
                    bool checkOffsets    = fieldType.IndexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
                    // Start offset of the previous token, used for the "offsets must not go backwards" check.
                    int  lastStartOffset = 0;

                    // Entries after the first get the analyzer's position increment gap (analyzed fields only).
                    if (i > 0)
                        fieldState.Position += analyzed ? docState.analyzer.GetPositionIncrementGap(fieldInfo.Name) : 0;

                    // NOTE(review): the three lines below read as the interior of a block comment
                    // whose opening and closing delimiters are not present in this listing.
                     * To assist people in tracking down problems in analysis components, we wish to write the field name to the infostream
                     * when we fail. We expect some caller to eventually deal with the real exception, so we don't want any 'catch' clauses,
                     * but rather a finally that takes note of the problem.

                    bool succeededInProcessingField = false;

                    TokenStream stream = field.GetTokenStream(docState.analyzer);
                    // reset the TokenStream to the first token

                        bool hasMoreTokens = stream.IncrementToken();

                        fieldState.AttributeSource = stream;

                        // Attributes read for every token in the loop below.
                        IOffsetAttribute            offsetAttribute  = fieldState.AttributeSource.AddAttribute <IOffsetAttribute>();
                        IPositionIncrementAttribute posIncrAttribute = fieldState.AttributeSource.AddAttribute <IPositionIncrementAttribute>();

                        if (hasMoreTokens)

                                // If we hit an exception in below
                                // (which is fairly common, eg if analyzer
                                // chokes on a given document), then it's
                                // non-aborting and (above) this one document
                                // will be marked as deleted, but still
                                // consume a docID

                                // Validate the position increment before committing the new position.
                                int posIncr = posIncrAttribute.PositionIncrement;
                                if (posIncr < 0)
                                    throw new ArgumentException("position increment must be >=0 (got " + posIncr + ") for field '" + field.Name + "'");
                                if (fieldState.Position == 0 && posIncr == 0)
                                    throw new ArgumentException("first position increment must be > 0 (got 0) for field '" + field.Name + "'");
                                int position = fieldState.Position + posIncr;
                                if (position > 0)
                                    // NOTE: confusing: this "mirrors" the
                                    // position++ we do below
                                // A negative sum means the int addition wrapped around.
                                else if (position < 0)
                                    throw new ArgumentException("position overflow for field '" + field.Name + "'");

                                // position is legal, we can safely place it in fieldState now.
                                // not sure if anything will use fieldState after non-aborting exc...
                                fieldState.Position = position;

                                if (posIncr == 0)

                                if (checkOffsets)
                                    // Token offsets are relative to this field instance; shift them by the accumulated field offset.
                                    int startOffset = fieldState.Offset + offsetAttribute.StartOffset;
                                    int endOffset   = fieldState.Offset + offsetAttribute.EndOffset;
                                    if (startOffset < 0 || endOffset < startOffset)
                                        throw new ArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, " + "startOffset=" + startOffset + ",endOffset=" + endOffset + " for field '" + field.Name + "'");
                                    if (startOffset < lastStartOffset)
                                        throw new ArgumentException("offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset + " for field '" + field.Name + "'");
                                    lastStartOffset = startOffset;

                                bool success = false;
                                    // If we hit an exception in here, we abort
                                    // all buffered documents since the last
                                    // flush, on the likelihood that the
                                    // internal state of the consumer is now
                                    // corrupt and should not be flushed to a
                                    // new segment:
                                    success = true;
                                    if (!success)
                            } while (stream.IncrementToken());
                        // trigger streams to perform end-of-stream operations
                        // TODO: maybe add some safety? then again, its already checked
                        // when we come back around to the field...
                        fieldState.Position += posIncrAttribute.PositionIncrement;
                        fieldState.Offset   += offsetAttribute.EndOffset;

                        // An immense term was recorded: report it on the "IW" info stream (if enabled),
                        // clear the marker, and fail the field.
                        if (docState.maxTermPrefix != null)
                            string msg = "Document contains at least one immense term in field=\"" + fieldInfo.Name + "\" (whose UTF8 encoding is longer than the max length " + DocumentsWriterPerThread.MAX_TERM_LENGTH_UTF8 + "), all of which were skipped.  Please correct the analyzer to not produce such terms.  The prefix of the first immense term is: '" + docState.maxTermPrefix + "...'";
                            if (docState.infoStream.IsEnabled("IW"))
                                docState.infoStream.Message("IW", "ERROR: " + msg);
                            docState.maxTermPrefix = null;
                            throw new ArgumentException(msg);

                        /* if success was false above there is an exception coming through and we won't get here.*/
                        succeededInProcessingField = true;
                        if (!succeededInProcessingField)
                        // On failure, name the offending field on the "DW" info stream when it is enabled.
                        if (!succeededInProcessingField && docState.infoStream.IsEnabled("DW"))
                            docState.infoStream.Message("DW", "An exception was thrown while processing field " + fieldInfo.Name);

                    // Add the analyzer's offset gap (analyzed fields only) and fold this field's boost into the running boost.
                    fieldState.Offset += analyzed ? docState.analyzer.GetOffsetGap(fieldInfo.Name) : 0;
                    fieldState.Boost  *= field.Boost;

                // LUCENE-2387: don't hang onto the field, so GC can
                // reclaim
                fields[i] = null;

Example #43
 /// <summary>
 /// Creates the tokenizer, storing the given text and options in fields and
 /// obtaining the term and offset attributes.
 /// </summary>
 /// <param name="str"> value stored in the <c>str</c> field </param>
 /// <param name="isLetter"> value stored in the <c>isLetter</c> field </param>
 /// <param name="toLowerCase"> value stored in the <c>toLowerCase</c> field </param>
 /// <param name="stopWords"> value stored in the <c>stopWords</c> field </param>
 public FastStringTokenizer(String str, bool isLetter, bool toLowerCase, ISet<string> stopWords)
     this.str = str;
     this.isLetter = isLetter;
     this.toLowerCase = toLowerCase;
     this.stopWords = stopWords;
     // Obtain the term and offset attributes via AddAttribute and cache them in fields.
     this.termAtt = AddAttribute<ITermAttribute>();
     this.offsetAtt = AddAttribute<IOffsetAttribute>();
Example #44
        /// <summary>
        /// Copies this instance's start and end offsets into <paramref name="target"/>.
        /// </summary>
        /// <param name="target"> destination attribute; it is cast to <see cref="IOffsetAttribute"/>, so it must implement that interface </param>
        public override void  CopyTo(Attribute target)
            IOffsetAttribute t = (IOffsetAttribute)target;

            // Transfer both offsets in a single call.
            t.SetOffset(startOffset, endOffset);
            /// <summary>
            /// Creates the attribute source and obtains its term and offset attributes,
            /// caching them in the <c>termAttribute</c> and <c>offsetAttribute</c> fields.
            /// </summary>
            internal SingleTokenAttributeSource()
                termAttribute = AddAttribute<ITermAttribute>();
				offsetAttribute = AddAttribute<IOffsetAttribute>();
Example #46
 /// <summary>
 /// Obtains the term, offset and type attributes via <c>AddAttribute</c> and
 /// caches them in the <c>termAtt</c>, <c>offsetAtt</c> and <c>typeAtt</c> fields.
 /// </summary>
 private void Init()
     termAtt   = AddAttribute <ICharTermAttribute>();
     offsetAtt = AddAttribute <IOffsetAttribute>();
     typeAtt   = AddAttribute <ITypeAttribute>();
Example #47
 /// <summary>
 /// Creates a new <see cref="HyphenatedWordsFilter"/> over the given stream,
 /// which is forwarded to the base-class constructor.
 /// </summary>
 /// <param name="in"> <see cref="TokenStream"/> that will be filtered </param>
 public HyphenatedWordsFilter(TokenStream @in)
     : base(@in)
     // Obtain the term and offset attributes via AddAttribute and cache them in fields.
     termAttribute   = AddAttribute <ICharTermAttribute>();
     offsetAttribute = AddAttribute <IOffsetAttribute>();
Example #48
            /// <summary>
            /// Stores the token stream, matchers and attribute references in fields and
            /// allocates <c>matchDescriptions</c> with one slot per matcher.
            /// </summary>
            /// <param name="ts"> token stream stored in the <c>stream</c> field </param>
            /// <param name="matchers"> automata stored in the <c>matchers</c> field; their count sizes <c>matchDescriptions</c> </param>
            /// <param name="charTermAtt"> term attribute stored in the <c>charTermAtt</c> field </param>
            /// <param name="offsetAtt"> offset attribute stored in the <c>offsetAtt</c> field </param>
            public DocsAndPositionsEnumAnonymousClass(
                TokenStream ts, CharacterRunAutomaton[] matchers, ICharTermAttribute charTermAtt, IOffsetAttribute offsetAtt)
                this.matchers    = matchers;
                this.charTermAtt = charTermAtt;
                this.offsetAtt   = offsetAtt;

                stream            = ts;
                // One (initially null) description slot per matcher.
                matchDescriptions = new BytesRef[matchers.Length];