Example #1
0
 /// <summary>
 /// Builds the tokenizer on top of <paramref name="input"/>, keeps a reference to that
 /// stream and caches the term, offset and position-increment attributes returned by
 /// AddAttribute.
 /// </summary>
 /// <param name="input">Stream passed to the base constructor and stored in <c>_input</c>.</param>
 public SingleCharTokenizer(TokenStream input)
     : base(input)
 {
     this._input = input;

     // Attribute handles used by this tokenizer.
     this._termAttribute = (TermAttribute) AddAttribute(typeof (TermAttribute));
     this._offsetAttribute = (OffsetAttribute) AddAttribute(typeof (OffsetAttribute));
     this._positionIncrementAttribute =
         (PositionIncrementAttribute) AddAttribute(typeof (PositionIncrementAttribute));
 }
        /// <summary>
        /// Creates a new <see cref="OffsetAttribute"/> whose start and end offsets equal
        /// those of this instance.
        /// </summary>
        /// <returns>The newly created attribute, typed as <c>object</c>.</returns>
        public override object Clone()
        {
            OffsetAttribute copy = new OffsetAttribute();
            copy.startOffset = this.startOffset;
            copy.endOffset = this.endOffset;

            return copy;
        }
 /// <summary>
 /// Wraps <paramref name="input"/> and remembers the hyphenator together with the
 /// term, type and offset attributes returned by AddAttribute.
 /// </summary>
 /// <param name="input">Upstream token stream, passed to the base constructor.</param>
 /// <param name="hyphenator">Hyphenator stored in <c>_hyphenator</c>.</param>
 public HyphenationTokenFilter(TokenStream input, Hyphenator hyphenator) : base(input)
 {
     this._hyphenator = hyphenator;

     // Attribute handles used by this filter.
     this._termAtt = (TermAttribute) AddAttribute(typeof (TermAttribute));
     this._typeAtt = (TypeAttribute) AddAttribute(typeof (TypeAttribute));
     this._ofsAtt = (OffsetAttribute) AddAttribute(typeof (OffsetAttribute));
 }
Example #4
0
 /// <summary>
 /// Builds the tokenizer over <paramref name="input"/>, storing the analysis object,
 /// the filter set and the stemming flag, and caching the term, offset and
 /// position-increment attributes.
 /// </summary>
 /// <param name="analysis">Analysis object stored in <c>_analysis</c>.</param>
 /// <param name="input">Reader passed to the base constructor.</param>
 /// <param name="filter">String set stored in <c>_filter</c>.</param>
 /// <param name="pstemming">Flag stored in <c>_pstemming</c>.</param>
 public AnsjTokenizer(
     AbstractAnalysis analysis,
     TextReader input,
     HashSet<string> filter,
     bool pstemming) : base(input)
 {
     this._analysis = analysis;

     // Attribute handles obtained through the generic AddAttribute overload.
     this._termAtt = AddAttribute<TermAttribute>();
     this._offsetAtt = AddAttribute<OffsetAttribute>();
     this._positionAttr = AddAttribute<PositionIncrementAttribute>();

     this._filter = filter;
     this._pstemming = pstemming;
 }
        /// <summary>
        /// Stores the token text and registers freshly constructed term, offset and
        /// payload attribute instances with the base class.
        /// </summary>
        /// <param name="tokenText">Text stored in <c>_tokenText</c>.</param>
        public IntMetaDataTokenStream(string tokenText)
        {
            this._tokenText = tokenText;

            // NOTE: AddAttribute<T> failed when called from here, so the attribute
            // instances are created by hand and handed to AddAttributeImpl instead.
            this._termAttribute = new TermAttribute();
            this._offsetAttribute = new OffsetAttribute();
            this._payloadAtt = new PayloadAttribute();

            base.AddAttributeImpl(this._termAttribute);
            base.AddAttributeImpl(this._offsetAttribute);
            base.AddAttributeImpl(this._payloadAtt);
        }
        /// <summary>
        /// Compares this attribute with <paramref name="other"/>.
        /// </summary>
        /// <param name="other">Object to compare against; may be null.</param>
        /// <returns>
        /// <c>true</c> when <paramref name="other"/> is this very instance, or is an
        /// <see cref="OffsetAttribute"/> with the same start and end offsets;
        /// <c>false</c> otherwise.
        /// </returns>
        public override bool Equals(object other)
        {
            // Identity shortcut.
            if (object.ReferenceEquals(other, this))
            {
                return true;
            }

            // Anything that is not an OffsetAttribute (including null) is unequal.
            OffsetAttribute that = other as OffsetAttribute;
            if (object.ReferenceEquals(that, null))
            {
                return false;
            }

            return that.startOffset == startOffset && that.endOffset == endOffset;
        }
Example #7
0
			/// <summary>
			/// Creates NGramTokenFilter with given min and max n-grams.
			/// </summary>
			/// <param name="input"><see cref="TokenStream"/> holding the input to be tokenized</param>
			/// <param name="minGram">the smallest n-gram to generate</param>
			/// <param name="maxGram">the largest n-gram to generate</param>
			/// <exception cref="System.ArgumentException">
			/// <paramref name="minGram"/> is less than one, or greater than <paramref name="maxGram"/>.
			/// </exception>
			public NGramTokenFilter(TokenStream input, int minGram, int maxGram) : base(input)
			{
				// The lower bound must be a positive size.
				if (minGram <= 0)
				{
					throw new System.ArgumentException("minGram must be greater than zero");
				}

				// The range must not be inverted.
				if (maxGram < minGram)
				{
					throw new System.ArgumentException("minGram must not be greater than maxGram");
				}

				this.minGram = minGram;
				this.maxGram = maxGram;

				termAtt = (TermAttribute) AddAttribute(typeof (TermAttribute));
				offsetAtt = (OffsetAttribute) AddAttribute(typeof (OffsetAttribute));
			}
        /// <summary>
        /// Sets up a filter over two streams. <paramref name="suffix"/> is passed to the
        /// base constructor; the constructor then caches six attributes of this filter
        /// and the same six attributes of <paramref name="prefix"/>.
        /// </summary>
        /// <param name="prefix">Stream assigned to <c>Prefix</c>; its attributes are cached in the <c>_p*</c> fields.</param>
        /// <param name="suffix">Stream assigned to <c>Suffix</c> and used as the base input.</param>
        public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix)
            : base(suffix)
        {
            Suffix = suffix;
            Prefix = prefix;
            _prefixExhausted = false;

            // Attributes of this filter.
            // ReSharper disable DoNotCallOverridableMethodsInConstructor
            _termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
            _posIncrAtt = (PositionIncrementAttribute)AddAttribute(typeof(PositionIncrementAttribute));
            _payloadAtt = (PayloadAttribute)AddAttribute(typeof(PayloadAttribute));
            _offsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
            _typeAtt = (TypeAttribute)AddAttribute(typeof(TypeAttribute));
            _flagsAtt = (FlagsAttribute)AddAttribute(typeof(FlagsAttribute));
            // ReSharper restore DoNotCallOverridableMethodsInConstructor

            // Matching attributes taken from the prefix stream.
            _pTermAtt = (TermAttribute)prefix.AddAttribute(typeof(TermAttribute));
            _pPosIncrAtt = (PositionIncrementAttribute)prefix.AddAttribute(typeof(PositionIncrementAttribute));
            _pPayloadAtt = (PayloadAttribute)prefix.AddAttribute(typeof(PayloadAttribute));
            _pOffsetAtt = (OffsetAttribute)prefix.AddAttribute(typeof(OffsetAttribute));
            _pTypeAtt = (TypeAttribute)prefix.AddAttribute(typeof(TypeAttribute));
            _pFlagsAtt = (FlagsAttribute)prefix.AddAttribute(typeof(FlagsAttribute));
        }
        /// <summary>
        /// Checks OffsetAttribute's initial offsets, SetOffset, the offsets seen on the
        /// objects returned by AssertCloneIsEqual / AssertCopyIsEqual, and Clear.
        /// </summary>
        public virtual void TestOffsetAttribute()
        {
            // A new attribute is expected to report zero for both offsets.
            OffsetAttribute original = new OffsetAttribute();
            Assert.AreEqual(0, original.StartOffset);
            Assert.AreEqual(0, original.EndOffset);

            original.SetOffset(12, 34);
            // no string test here, because order unknown

            // The clone must expose the offsets that were just set.
            OffsetAttribute cloned = (OffsetAttribute) AssertCloneIsEqual(original);
            Assert.AreEqual(12, cloned.StartOffset);
            Assert.AreEqual(34, cloned.EndOffset);

            // So must the copy.
            OffsetAttribute copied = (OffsetAttribute) AssertCopyIsEqual(original);
            Assert.AreEqual(12, copied.StartOffset);
            Assert.AreEqual(34, copied.EndOffset);

            // After Clear() both offsets are expected to be zero again.
            original.Clear();
            Assert.AreEqual(0, original.StartOffset);
            Assert.AreEqual(0, original.EndOffset);
        }
        /// <summary>
        /// Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range.
        /// </summary>
        /// <param name="input"><see cref="TokenStream"/> holding the input to be tokenized</param>
        /// <param name="side">the <see cref="Side"/> from which to chop off an n-gram</param>
        /// <param name="minGram">the smallest n-gram to generate</param>
        /// <param name="maxGram">the largest n-gram to generate</param>
        /// <exception cref="System.ArgumentException">
        /// <paramref name="side"/> is null, <paramref name="minGram"/> is less than one,
        /// or <paramref name="minGram"/> is greater than <paramref name="maxGram"/>.
        /// </exception>
        public EdgeNGramTokenFilter(TokenStream input, Side side, int minGram, int maxGram) : base(input)
        {
            // Argument checks, in the order: side, lower bound, range.
            if (side == null)
            {
                throw new System.ArgumentException("sideLabel must be either front or back");
            }
            if (minGram <= 0)
            {
                throw new System.ArgumentException("minGram must be greater than zero");
            }
            if (maxGram < minGram)
            {
                throw new System.ArgumentException("minGram must not be greater than maxGram");
            }

            this.minGram = minGram;
            this.maxGram = maxGram;
            this.side = side;

            termAtt = (TermAttribute) AddAttribute(typeof (TermAttribute));
            offsetAtt = (OffsetAttribute) AddAttribute(typeof (OffsetAttribute));
        }
 /// <summary>
 /// Stores the token collection and caches the term, position-increment, payload,
 /// offset, type and flags attributes returned by AddAttribute.
 /// </summary>
 /// <param name="tokens">Collection stored in <c>_tokens</c>.</param>
 public TokenListStream(ICollection<Token> tokens)
 {
     this._tokens = tokens;

     // Attribute handles used by this stream.
     this._termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
     this._posIncrAtt = (PositionIncrementAttribute)AddAttribute(typeof(PositionIncrementAttribute));
     this._payloadAtt = (PayloadAttribute)AddAttribute(typeof(PayloadAttribute));
     this._offsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
     this._typeAtt = (TypeAttribute)AddAttribute(typeof(TypeAttribute));
     this._flagsAtt = (FlagsAttribute)AddAttribute(typeof(FlagsAttribute));
 }
		/// <summary>
		/// Verifies the offsets reported by OffsetAttribute when new, after SetOffset,
		/// on the results of AssertCloneIsEqual / AssertCopyIsEqual, and after Clear.
		/// </summary>
		public virtual void TestOffsetAttribute()
		{
			// Initial state: both offsets are expected to be zero.
			OffsetAttribute source = new OffsetAttribute();
			Assert.AreEqual(0, source.StartOffset);
			Assert.AreEqual(0, source.EndOffset);

			source.SetOffset(12, 34);
			// no string test here, because order unknown

			// Offsets as seen through the clone.
			OffsetAttribute viaClone = (OffsetAttribute)AssertCloneIsEqual(source);
			Assert.AreEqual(12, viaClone.StartOffset);
			Assert.AreEqual(34, viaClone.EndOffset);

			// Offsets as seen through the copy.
			OffsetAttribute viaCopy = (OffsetAttribute)AssertCopyIsEqual(source);
			Assert.AreEqual(12, viaCopy.StartOffset);
			Assert.AreEqual(34, viaCopy.EndOffset);

			// Clear() is expected to reset both offsets to zero.
			source.Clear();
			Assert.AreEqual(0, source.StartOffset);
			Assert.AreEqual(0, source.EndOffset);
		}
Example #13
0
        /// <summary>
        /// Writes this attribute's start and end offsets into <paramref name="target"/>
        /// through its SetOffset method.
        /// </summary>
        /// <param name="target">
        /// Destination attribute; it is cast to <see cref="OffsetAttribute"/>, so any
        /// other type makes the cast fail.
        /// </param>
        public override void CopyTo(AttributeImpl target)
        {
            ((OffsetAttribute) target).SetOffset(startOffset, endOffset);
        }
        /// <summary>
        /// Validates the arguments, stores them in the corresponding fields and caches
        /// the term and offset attributes.
        /// </summary>
        /// <param name="side">Side value; must not be null.</param>
        /// <param name="minGram">Smallest n-gram size; must be at least one.</param>
        /// <param name="maxGram">Largest n-gram size; must not be below <paramref name="minGram"/>.</param>
        /// <exception cref="System.ArgumentException">Any of the checks above fails.</exception>
        private void init(Side side, int minGram, int maxGram)
        {
            // Argument checks, in the order: side, lower bound, range.
            if (side == null)
            {
                throw new System.ArgumentException("sideLabel must be either front or back");
            }
            if (minGram <= 0)
            {
                throw new System.ArgumentException("minGram must be greater than zero");
            }
            if (maxGram < minGram)
            {
                throw new System.ArgumentException("minGram must not be greater than maxGram");
            }

            this.minGram = minGram;
            this.maxGram = maxGram;
            this.side = side;

            termAtt = (TermAttribute) AddAttribute(typeof (TermAttribute));
            offsetAtt = (OffsetAttribute) AddAttribute(typeof (OffsetAttribute));
        }
 /// <summary>
 /// Protected constructor that only wraps <paramref name="input"/> and caches the term
 /// and offset attributes; it sets no gram sizes or side.
 /// </summary>
 /// <param name="input">Upstream token stream, passed to the base constructor.</param>
 protected EdgeNGramTokenFilter(TokenStream input) : base(input)
 {
     termAtt = (TermAttribute) AddAttribute(typeof (TermAttribute));
     offsetAtt = (OffsetAttribute) AddAttribute(typeof (OffsetAttribute));
 }
Example #16
0
        /// <summary>
        /// Shared initialisation: caches the term, offset, position-increment and type
        /// attributes, stores the lemmatizer and points it at <paramref name="input"/>,
        /// then stores the remaining settings.
        /// </summary>
        /// <param name="input">Reader handed to the lemmatizer's SetStream.</param>
        /// <param name="_lemmatizer">Lemmatizer stored in <c>_streamLemmatizer</c>.</param>
        /// <param name="_lemmaFilter">Filter stored in <c>lemmaFilter</c>.</param>
        /// <param name="AlwaysSaveMarkedOriginal">Flag stored in <c>alwaysSaveMarkedOriginal</c>.</param>
        private void Init(
            System.IO.TextReader input,
            HebMorph.StreamLemmatizer _lemmatizer,
            HebMorph.LemmaFilters.LemmaFilterBase _lemmaFilter,
            bool AlwaysSaveMarkedOriginal)
        {
            // Attribute handles used by this stream.
            this.termAtt = (TermAttribute) AddAttribute(typeof (TermAttribute));
            this.offsetAtt = (OffsetAttribute) AddAttribute(typeof (OffsetAttribute));
            this.posIncrAtt = (PositionIncrementAttribute) AddAttribute(typeof (PositionIncrementAttribute));
            this.typeAtt = (TypeAttribute) AddAttribute(typeof (TypeAttribute));
            // The payload attribute is currently not registered:
            //payAtt = (PayloadAttribute)AddAttribute(typeof(PayloadAttribute));

            // Keep the lemmatizer, then attach the reader to that same instance.
            this._streamLemmatizer = _lemmatizer;
            this._streamLemmatizer.SetStream(input);

            this.alwaysSaveMarkedOriginal = AlwaysSaveMarkedOriginal;
            this.lemmaFilter = _lemmaFilter;
        }
        /// <summary>
        /// Constructs a ShingleFilter over the given TokenStream, applying the requested
        /// maximum shingle size through SetMaxShingleSize.
        /// </summary>
        /// <param name="input">input token stream</param>
        /// <param name="maxShingleSize">maximum shingle size produced by the filter.</param>
        public ShingleFilter(TokenStream input, int maxShingleSize)
            : base(input)
        {
            SetMaxShingleSize(maxShingleSize);

            // Attribute handles used by this filter.
            // ReSharper disable DoNotCallOverridableMethodsInConstructor
            _termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
            _offsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
            _posIncrAtt = (PositionIncrementAttribute)AddAttribute(typeof(PositionIncrementAttribute));
            _typeAtt = (TypeAttribute)AddAttribute(typeof(TypeAttribute));
            // ReSharper restore DoNotCallOverridableMethodsInConstructor
        }
        /// <summary>
        /// Validates the gram range, stores it in the corresponding fields and caches
        /// the term and offset attributes.
        /// </summary>
        /// <param name="minGram">Smallest n-gram size; must be at least one.</param>
        /// <param name="maxGram">Largest n-gram size; must not be below <paramref name="minGram"/>.</param>
        /// <exception cref="System.ArgumentException">Either check above fails.</exception>
        private void init(int minGram, int maxGram)
        {
            // The lower bound must be a positive size.
            if (minGram <= 0)
            {
                throw new System.ArgumentException("minGram must be greater than zero");
            }

            // The range must not be inverted.
            if (maxGram < minGram)
            {
                throw new System.ArgumentException("minGram must not be greater than maxGram");
            }

            this.minGram = minGram;
            this.maxGram = maxGram;

            termAtt = (TermAttribute) AddAttribute(typeof (TermAttribute));
            offsetAtt = (OffsetAttribute) AddAttribute(typeof (OffsetAttribute));
        }
        /// <summary>
        /// Creates a shingle filter based on a user-defined matrix.
        ///
        /// The filter /will/ delete columns from the input matrix! You will not be able to reset the filter if you used this constructor.
        /// todo: don't touch the matrix! use a bool, set the input stream to null or something, and keep track of where in the matrix we are at.
        ///
        /// The input stream is set to a new EmptyTokenStream, since the data is already in the matrix.
        /// </summary>
        /// <param name="matrix">the input base for creating shingles. Does not need to contain any information until ShingleMatrixFilter.Next(Token) is called the first time.</param>
        /// <param name="minimumShingleSize">minimum number of tokens in any shingle.</param>
        /// <param name="maximumShingleSize">maximum number of tokens in any shingle.</param>
        /// <param name="spacerCharacter">character to use between texts of the token parts in a shingle. NOTE(review): earlier documentation said "null for none", but this overload declares a plain Char - confirm how "no spacer" is expressed here.</param>
        /// <param name="ignoringSinglePrefixOrSuffixShingle">if true, shingles that only contain permutations of the first or the last column will not be produced as shingles. Useful when adding boundary marker tokens such as '^' and '$'.</param>
        /// <param name="settingsCodec">codec used to read input token weight and matrix positioning.</param>
        public ShingleMatrixFilter(
            Matrix.Matrix matrix,
            int minimumShingleSize,
            int maximumShingleSize,
            Char spacerCharacter,
            bool ignoringSinglePrefixOrSuffixShingle,
            TokenSettingsCodec settingsCodec)
        {
            Matrix = matrix;
            MinimumShingleSize = minimumShingleSize;
            MaximumShingleSize = maximumShingleSize;
            SpacerCharacter = spacerCharacter;
            IsIgnoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
            _settingsCodec = settingsCodec;

            // Attributes of this filter.
            // ReSharper disable DoNotCallOverridableMethodsInConstructor
            _termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
            _posIncrAtt = (PositionIncrementAttribute)AddAttribute(typeof(PositionIncrementAttribute));
            _payloadAtt = (PayloadAttribute)AddAttribute(typeof(PayloadAttribute));
            _offsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
            _typeAtt = (TypeAttribute)AddAttribute(typeof(TypeAttribute));
            _flagsAtt = (FlagsAttribute)AddAttribute(typeof(FlagsAttribute));
            // ReSharper restore DoNotCallOverridableMethodsInConstructor

            // set the input to be an empty token stream, we already have the data.
            _input = new EmptyTokenStream();

            // Matching attributes taken from the (empty) input stream.
            _inTermAtt = (TermAttribute)_input.AddAttribute(typeof(TermAttribute));
            _inPosIncrAtt = (PositionIncrementAttribute)_input.AddAttribute(typeof(PositionIncrementAttribute));
            _inPayloadAtt = (PayloadAttribute)_input.AddAttribute(typeof(PayloadAttribute));
            _inOffsetAtt = (OffsetAttribute)_input.AddAttribute(typeof(OffsetAttribute));
            _inTypeAtt = (TypeAttribute)_input.AddAttribute(typeof(TypeAttribute));
            _inFlagsAtt = (FlagsAttribute)_input.AddAttribute(typeof(FlagsAttribute));
        }
        /// <summary>
        /// Creates a shingle filter with ad hoc parameter settings, reading from the
        /// supplied token stream.
        /// </summary>
        /// <param name="input">stream from which to construct the matrix</param>
        /// <param name="minimumShingleSize">minimum number of tokens in any shingle.</param>
        /// <param name="maximumShingleSize">maximum number of tokens in any shingle.</param>
        /// <param name="spacerCharacter">character to use between texts of the token parts in a shingle. null for none.</param>
        /// <param name="ignoringSinglePrefixOrSuffixShingle">if true, shingles that only contain permutations of the first or the last column will not be produced as shingles. Useful when adding boundary marker tokens such as '^' and '$'.</param>
        /// <param name="settingsCodec">codec used to read input token weight and matrix positioning.</param>
        public ShingleMatrixFilter(
            TokenStream input,
            int minimumShingleSize,
            int maximumShingleSize,
            Char? spacerCharacter,
            bool ignoringSinglePrefixOrSuffixShingle,
            TokenSettingsCodec settingsCodec)
        {
            _input = input;
            MinimumShingleSize = minimumShingleSize;
            MaximumShingleSize = maximumShingleSize;
            SpacerCharacter = spacerCharacter;
            IsIgnoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
            _settingsCodec = settingsCodec;

            // Attributes of this filter.
            // ReSharper disable DoNotCallOverridableMethodsInConstructor
            _termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
            _posIncrAtt = (PositionIncrementAttribute)AddAttribute(typeof(PositionIncrementAttribute));
            _payloadAtt = (PayloadAttribute)AddAttribute(typeof(PayloadAttribute));
            _offsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
            _typeAtt = (TypeAttribute)AddAttribute(typeof(TypeAttribute));
            _flagsAtt = (FlagsAttribute)AddAttribute(typeof(FlagsAttribute));
            // ReSharper restore DoNotCallOverridableMethodsInConstructor

            // Matching attributes taken from the input stream.
            _inTermAtt = (TermAttribute)input.AddAttribute(typeof(TermAttribute));
            _inPosIncrAtt = (PositionIncrementAttribute)input.AddAttribute(typeof(PositionIncrementAttribute));
            _inPayloadAtt = (PayloadAttribute)input.AddAttribute(typeof(PayloadAttribute));
            _inOffsetAtt = (OffsetAttribute)input.AddAttribute(typeof(OffsetAttribute));
            _inTypeAtt = (TypeAttribute)input.AddAttribute(typeof(TypeAttribute));
            _inFlagsAtt = (FlagsAttribute)input.AddAttribute(typeof(FlagsAttribute));
        }
		/// <summary>
		/// Returns a new <see cref="OffsetAttribute"/> holding the same start and end
		/// offsets as this instance.
		/// </summary>
		/// <returns>The new attribute instance.</returns>
		public override System.Object Clone()
		{
			OffsetAttribute duplicate = new OffsetAttribute();

			// Carry both offsets over to the new instance.
			duplicate.startOffset = this.startOffset;
			duplicate.endOffset = this.endOffset;

			return duplicate;
		}