/// <summary>
/// Builds a tokenizer on top of <paramref name="input"/> and obtains the term,
/// offset and position-increment attributes used by this instance.
/// </summary>
/// <param name="input">Upstream token stream; also passed to the base constructor.</param>
public SingleCharTokenizer(TokenStream input)
    : base(input)
{
    _input = input;

    // Attribute lookups are kept in their original order.
    _termAttribute =
        (TermAttribute)AddAttribute(typeof(TermAttribute));
    _offsetAttribute =
        (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
    _positionIncrementAttribute =
        (PositionIncrementAttribute)AddAttribute(typeof(PositionIncrementAttribute));
}
/// <summary>
/// Produces a new <see cref="OffsetAttribute"/> carrying the same start and end
/// offsets as this instance.
/// </summary>
/// <returns>The newly created copy.</returns>
public override object Clone()
{
    return new OffsetAttribute
    {
        endOffset = this.endOffset,
        startOffset = this.startOffset
    };
}
/// <summary>
/// Creates a filter over <paramref name="input"/> that keeps a reference to the
/// supplied hyphenator and obtains the term, type and offset attributes.
/// </summary>
/// <param name="input">Token stream passed to the base constructor.</param>
/// <param name="hyphenator">Hyphenator stored on this instance.</param>
public HyphenationTokenFilter(TokenStream input, Hyphenator hyphenator)
    : base(input)
{
    _hyphenator = hyphenator;

    _termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
    _typeAtt = (TypeAttribute)AddAttribute(typeof(TypeAttribute));
    _ofsAtt  = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
}
/// <summary>
/// Creates a tokenizer reading from <paramref name="input"/>, storing the given
/// analysis object, filter set and stemming flag, and obtaining the term, offset
/// and position-increment attributes.
/// </summary>
/// <param name="analysis">Analysis object stored on this instance.</param>
/// <param name="input">Reader passed to the base constructor.</param>
/// <param name="filter">Set of strings stored on this instance (presumably terms to filter out; confirm against usage).</param>
/// <param name="pstemming">Flag stored on this instance (presumably toggles stemming; confirm against usage).</param>
public AnsjTokenizer(
    AbstractAnalysis analysis,
    TextReader input,
    HashSet<string> filter,
    bool pstemming)
    : base(input)
{
    _analysis = analysis;

    _termAtt      = AddAttribute<TermAttribute>();
    _offsetAtt    = AddAttribute<OffsetAttribute>();
    _positionAttr = AddAttribute<PositionIncrementAttribute>();

    _filter    = filter;
    _pstemming = pstemming;
}
/// <summary>
/// Creates a token stream for the given text, with term, offset and payload
/// attribute instances constructed here and registered through
/// <c>AddAttributeImpl</c>.
/// </summary>
/// <param name="tokenText">Text stored on this instance.</param>
public IntMetaDataTokenStream(string tokenText)
{
    _tokenText = tokenText;

    // NOTE: Calling the AddAttribute<T> method failed, so the attribute
    // instances are created explicitly and registered via AddAttributeImpl.
    _termAttribute   = new TermAttribute();
    _offsetAttribute = new OffsetAttribute();
    _payloadAtt      = new PayloadAttribute();

    base.AddAttributeImpl(_termAttribute);
    base.AddAttributeImpl(_offsetAttribute);
    base.AddAttributeImpl(_payloadAtt);
}
/// <summary>
/// Compares this attribute with another object. Two offset attributes are equal
/// when both their start and end offsets match.
/// </summary>
/// <param name="other">Object to compare against.</param>
/// <returns>
/// <c>true</c> for the same reference or for an <see cref="OffsetAttribute"/>
/// with identical offsets; otherwise <c>false</c>.
/// </returns>
public override bool Equals(object other)
{
    // Same instance: nothing further to compare.
    if (object.ReferenceEquals(other, this))
    {
        return true;
    }

    // Anything that is not an OffsetAttribute (including null) is unequal.
    if (!(other is OffsetAttribute))
    {
        return false;
    }

    OffsetAttribute that = (OffsetAttribute)other;
    return that.startOffset == startOffset
        && that.endOffset == endOffset;
}
/// <summary>
/// Creates NGramTokenFilter with given min and max n-grams.
/// </summary>
/// <param name="input"><see cref="TokenStream"/> holding the input to be tokenized</param>
/// <param name="minGram">the smallest n-gram to generate</param>
/// <param name="maxGram">the largest n-gram to generate</param>
/// <exception cref="System.ArgumentException">
/// <paramref name="minGram"/> is less than one, or greater than <paramref name="maxGram"/>.
/// </exception>
public NGramTokenFilter(TokenStream input, int minGram, int maxGram)
    : base(input)
{
    // Validate the range before touching any state.
    if (minGram <= 0)
    {
        throw new System.ArgumentException("minGram must be greater than zero");
    }

    if (maxGram < minGram)
    {
        throw new System.ArgumentException("minGram must not be greater than maxGram");
    }

    this.minGram = minGram;
    this.maxGram = maxGram;

    termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
    offsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
}
/// <summary>
/// Creates a filter over two streams. <paramref name="suffix"/> is passed to the
/// base constructor, and the same six attributes (term, position increment,
/// payload, offset, type, flags) are obtained both from this filter and from
/// <paramref name="prefix"/>.
/// </summary>
/// <param name="prefix">Stream assigned to <c>Prefix</c>; its attributes are held in the <c>_p*</c> fields.</param>
/// <param name="suffix">Stream assigned to <c>Suffix</c> and used as the base input.</param>
public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix)
    : base(suffix)
{
    Suffix = suffix;
    Prefix = prefix;
    _prefixExhausted = false;

    // Attributes of this filter.
    // ReSharper disable DoNotCallOverridableMethodsInConstructor
    _termAtt    = (TermAttribute)AddAttribute(typeof(TermAttribute));
    _posIncrAtt = (PositionIncrementAttribute)AddAttribute(typeof(PositionIncrementAttribute));
    _payloadAtt = (PayloadAttribute)AddAttribute(typeof(PayloadAttribute));
    _offsetAtt  = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
    _typeAtt    = (TypeAttribute)AddAttribute(typeof(TypeAttribute));
    _flagsAtt   = (FlagsAttribute)AddAttribute(typeof(FlagsAttribute));
    // ReSharper restore DoNotCallOverridableMethodsInConstructor

    // Matching attributes obtained from the prefix stream.
    _pTermAtt    = (TermAttribute)prefix.AddAttribute(typeof(TermAttribute));
    _pPosIncrAtt = (PositionIncrementAttribute)prefix.AddAttribute(typeof(PositionIncrementAttribute));
    _pPayloadAtt = (PayloadAttribute)prefix.AddAttribute(typeof(PayloadAttribute));
    _pOffsetAtt  = (OffsetAttribute)prefix.AddAttribute(typeof(OffsetAttribute));
    _pTypeAtt    = (TypeAttribute)prefix.AddAttribute(typeof(TypeAttribute));
    _pFlagsAtt   = (FlagsAttribute)prefix.AddAttribute(typeof(FlagsAttribute));
}
/// <summary>
/// Exercises <c>OffsetAttribute</c>: initial offsets, <c>SetOffset</c>, the
/// clone and copy helpers, and <c>Clear</c>.
/// </summary>
public virtual void TestOffsetAttribute()
{
    OffsetAttribute original = new OffsetAttribute();

    // A freshly constructed attribute is expected to report zero offsets.
    Assert.AreEqual(0, original.StartOffset);
    Assert.AreEqual(0, original.EndOffset);

    original.SetOffset(12, 34);
    // no string test here, because order unknown

    // The clone must carry the same offsets.
    OffsetAttribute duplicate = (OffsetAttribute)AssertCloneIsEqual(original);
    Assert.AreEqual(12, duplicate.StartOffset);
    Assert.AreEqual(34, duplicate.EndOffset);

    // So must the copy.
    duplicate = (OffsetAttribute)AssertCopyIsEqual(original);
    Assert.AreEqual(12, duplicate.StartOffset);
    Assert.AreEqual(34, duplicate.EndOffset);

    // Clearing resets both offsets to zero.
    original.Clear();
    Assert.AreEqual(0, original.StartOffset);
    Assert.AreEqual(0, original.EndOffset);
}
/// <summary>
/// Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range.
/// </summary>
/// <param name="input"><see cref="TokenStream"/> holding the input to be tokenized</param>
/// <param name="side">the <see cref="Side"/> from which to chop off an n-gram</param>
/// <param name="minGram">the smallest n-gram to generate</param>
/// <param name="maxGram">the largest n-gram to generate</param>
/// <exception cref="System.ArgumentException">
/// <paramref name="side"/> is null, <paramref name="minGram"/> is less than one,
/// or <paramref name="minGram"/> is greater than <paramref name="maxGram"/>.
/// </exception>
public EdgeNGramTokenFilter(TokenStream input, Side side, int minGram, int maxGram)
    : base(input)
{
    // Argument checks, in the same order as before.
    if (side == null)
    {
        throw new System.ArgumentException("sideLabel must be either front or back");
    }

    if (minGram <= 0)
    {
        throw new System.ArgumentException("minGram must be greater than zero");
    }

    if (maxGram < minGram)
    {
        throw new System.ArgumentException("minGram must not be greater than maxGram");
    }

    this.minGram = minGram;
    this.maxGram = maxGram;
    this.side = side;

    termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
    offsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
}
/// <summary>
/// Creates a stream backed by the given token collection and obtains the term,
/// position-increment, payload, offset, type and flags attributes.
/// </summary>
/// <param name="tokens">Collection of tokens stored on this instance.</param>
public TokenListStream(ICollection<Token> tokens)
{
    _tokens = tokens;

    _termAtt    = (TermAttribute)AddAttribute(typeof(TermAttribute));
    _posIncrAtt = (PositionIncrementAttribute)AddAttribute(typeof(PositionIncrementAttribute));
    _payloadAtt = (PayloadAttribute)AddAttribute(typeof(PayloadAttribute));
    _offsetAtt  = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
    _typeAtt    = (TypeAttribute)AddAttribute(typeof(TypeAttribute));
    _flagsAtt   = (FlagsAttribute)AddAttribute(typeof(FlagsAttribute));
}
/// <summary>
/// Checks the offset attribute's default state, that offsets set through
/// <c>SetOffset</c> are carried over by the clone and copy helpers, and that
/// <c>Clear</c> resets them.
/// </summary>
public virtual void TestOffsetAttribute()
{
    OffsetAttribute source = new OffsetAttribute();

    // Initial state.
    Assert.AreEqual(0, source.StartOffset);
    Assert.AreEqual(0, source.EndOffset);

    source.SetOffset(12, 34);
    // no string test here, because order unknown

    // Clone round trip.
    OffsetAttribute result = (OffsetAttribute)AssertCloneIsEqual(source);
    Assert.AreEqual(12, result.StartOffset);
    Assert.AreEqual(34, result.EndOffset);

    // Copy round trip.
    result = (OffsetAttribute)AssertCopyIsEqual(source);
    Assert.AreEqual(12, result.StartOffset);
    Assert.AreEqual(34, result.EndOffset);

    // State after Clear.
    source.Clear();
    Assert.AreEqual(0, source.StartOffset);
    Assert.AreEqual(0, source.EndOffset);
}
/// <summary>
/// Writes this attribute's start and end offsets into <paramref name="target"/>.
/// </summary>
/// <param name="target">
/// Destination attribute; it is cast to <see cref="OffsetAttribute"/>, so any
/// other type causes the cast to fail.
/// </param>
public override void CopyTo(AttributeImpl target)
{
    ((OffsetAttribute)target).SetOffset(startOffset, endOffset);
}
/// <summary>
/// Validates the arguments, stores them on this instance and obtains the term
/// and offset attributes.
/// </summary>
/// <param name="side">Side value to store; must not be null.</param>
/// <param name="minGram">Smallest gram size; must be at least one.</param>
/// <param name="maxGram">Largest gram size; must not be smaller than <paramref name="minGram"/>.</param>
/// <exception cref="System.ArgumentException">Any of the constraints above is violated.</exception>
private void init(Side side, int minGram, int maxGram)
{
    if (side == null)
    {
        throw new System.ArgumentException("sideLabel must be either front or back");
    }

    if (minGram <= 0)
    {
        throw new System.ArgumentException("minGram must be greater than zero");
    }

    if (maxGram < minGram)
    {
        throw new System.ArgumentException("minGram must not be greater than maxGram");
    }

    // All checks passed: record the configuration.
    this.minGram = minGram;
    this.maxGram = maxGram;
    this.side = side;

    termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
    offsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
}
/// <summary>
/// Protected constructor that only wires up the input stream and obtains the
/// term and offset attributes; gram sizes and side are not set here.
/// </summary>
/// <param name="input">Token stream passed to the base constructor.</param>
protected EdgeNGramTokenFilter(TokenStream input)
    : base(input)
{
    termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
    offsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
}
/// <summary>
/// Shared initialisation: obtains the term, offset, position-increment and type
/// attributes, stores the lemmatizer and points it at <paramref name="input"/>,
/// then stores the remaining settings.
/// </summary>
/// <param name="input">Reader handed to the lemmatizer via <c>SetStream</c>.</param>
/// <param name="_lemmatizer">Stream lemmatizer stored on this instance.</param>
/// <param name="_lemmaFilter">Lemma filter stored on this instance.</param>
/// <param name="AlwaysSaveMarkedOriginal">Flag stored in <c>alwaysSaveMarkedOriginal</c>.</param>
private void Init(
    System.IO.TextReader input,
    HebMorph.StreamLemmatizer _lemmatizer,
    HebMorph.LemmaFilters.LemmaFilterBase _lemmaFilter,
    bool AlwaysSaveMarkedOriginal)
{
    this.termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
    this.offsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
    this.posIncrAtt = (PositionIncrementAttribute)AddAttribute(typeof(PositionIncrementAttribute));
    this.typeAtt = (TypeAttribute)AddAttribute(typeof(TypeAttribute));
    // Payload attribute registration is currently disabled:
    //payAtt = (PayloadAttribute)AddAttribute(typeof(PayloadAttribute));

    // Store the lemmatizer first, then attach the input reader to it.
    _streamLemmatizer = _lemmatizer;
    _streamLemmatizer.SetStream(input);

    alwaysSaveMarkedOriginal = AlwaysSaveMarkedOriginal;
    lemmaFilter = _lemmaFilter;
}
/// <summary>
/// Builds a ShingleFilter over a TokenStream using the given maximum shingle size.
/// </summary>
/// <param name="input">input token stream</param>
/// <param name="maxShingleSize">maximum shingle size produced by the filter.</param>
public ShingleFilter(TokenStream input, int maxShingleSize)
    : base(input)
{
    // The size is applied through SetMaxShingleSize before any attribute is obtained.
    SetMaxShingleSize(maxShingleSize);

    // ReSharper disable DoNotCallOverridableMethodsInConstructor
    _termAtt    = (TermAttribute)AddAttribute(typeof(TermAttribute));
    _offsetAtt  = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
    _posIncrAtt = (PositionIncrementAttribute)AddAttribute(typeof(PositionIncrementAttribute));
    _typeAtt    = (TypeAttribute)AddAttribute(typeof(TypeAttribute));
    // ReSharper restore DoNotCallOverridableMethodsInConstructor
}
/// <summary>
/// Validates the gram range, stores it on this instance and obtains the term
/// and offset attributes.
/// </summary>
/// <param name="minGram">Smallest gram size; must be at least one.</param>
/// <param name="maxGram">Largest gram size; must not be smaller than <paramref name="minGram"/>.</param>
/// <exception cref="System.ArgumentException">The range is invalid.</exception>
private void init(int minGram, int maxGram)
{
    if (minGram <= 0)
    {
        throw new System.ArgumentException("minGram must be greater than zero");
    }

    if (maxGram < minGram)
    {
        throw new System.ArgumentException("minGram must not be greater than maxGram");
    }

    // Range is valid: record it.
    this.minGram = minGram;
    this.maxGram = maxGram;

    termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
    offsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
}
/// <summary>
/// Creates a shingle filter based on a user defined matrix.
///
/// The filter /will/ delete columns from the input matrix! You will not be able to
/// reset the filter if you used this constructor.
/// todo: don't touch the matrix! use a bool, set the input stream to null or
/// something, and keep track of where in the matrix we are at.
/// </summary>
/// <param name="matrix">the input base for creating shingles. Does not need to contain any information until ShingleMatrixFilter.Next(Token) is called the first time.</param>
/// <param name="minimumShingleSize">minimum number of tokens in any shingle.</param>
/// <param name="maximumShingleSize">maximum number of tokens in any shingle.</param>
/// <param name="spacerCharacter">character to use between texts of the token parts in a shingle. This overload takes a non-nullable <c>Char</c>.</param>
/// <param name="ignoringSinglePrefixOrSuffixShingle">if true, shingles that only contain permutations of the first or the last column will not be produced as shingles. Useful when adding boundary marker tokens such as '^' and '$'.</param>
/// <param name="settingsCodec">codec used to read input token weight and matrix positioning.</param>
public ShingleMatrixFilter(
    Matrix.Matrix matrix,
    int minimumShingleSize,
    int maximumShingleSize,
    Char spacerCharacter,
    bool ignoringSinglePrefixOrSuffixShingle,
    TokenSettingsCodec settingsCodec)
{
    Matrix = matrix;
    MinimumShingleSize = minimumShingleSize;
    MaximumShingleSize = maximumShingleSize;
    SpacerCharacter = spacerCharacter;
    IsIgnoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
    _settingsCodec = settingsCodec;

    // Attributes of this filter.
    // ReSharper disable DoNotCallOverridableMethodsInConstructor
    _termAtt    = (TermAttribute)AddAttribute(typeof(TermAttribute));
    _posIncrAtt = (PositionIncrementAttribute)AddAttribute(typeof(PositionIncrementAttribute));
    _payloadAtt = (PayloadAttribute)AddAttribute(typeof(PayloadAttribute));
    _offsetAtt  = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
    _typeAtt    = (TypeAttribute)AddAttribute(typeof(TypeAttribute));
    _flagsAtt   = (FlagsAttribute)AddAttribute(typeof(FlagsAttribute));
    // ReSharper restore DoNotCallOverridableMethodsInConstructor

    // set the input to be an empty token stream, we already have the data.
    _input = new EmptyTokenStream();

    // Matching attributes obtained from the (empty) input stream.
    _inTermAtt    = (TermAttribute)_input.AddAttribute(typeof(TermAttribute));
    _inPosIncrAtt = (PositionIncrementAttribute)_input.AddAttribute(typeof(PositionIncrementAttribute));
    _inPayloadAtt = (PayloadAttribute)_input.AddAttribute(typeof(PayloadAttribute));
    _inOffsetAtt  = (OffsetAttribute)_input.AddAttribute(typeof(OffsetAttribute));
    _inTypeAtt    = (TypeAttribute)_input.AddAttribute(typeof(TypeAttribute));
    _inFlagsAtt   = (FlagsAttribute)_input.AddAttribute(typeof(FlagsAttribute));
}
/// <summary>
/// Creates a shingle filter with ad hoc parameter settings.
/// </summary>
/// <param name="input">stream from which to construct the matrix</param>
/// <param name="minimumShingleSize">minimum number of tokens in any shingle.</param>
/// <param name="maximumShingleSize">maximum number of tokens in any shingle.</param>
/// <param name="spacerCharacter">character to use between texts of the token parts in a shingle. null for none.</param>
/// <param name="ignoringSinglePrefixOrSuffixShingle">if true, shingles that only contain permutations of the first or the last column will not be produced as shingles. Useful when adding boundary marker tokens such as '^' and '$'.</param>
/// <param name="settingsCodec">codec used to read input token weight and matrix positioning.</param>
public ShingleMatrixFilter(
    TokenStream input,
    int minimumShingleSize,
    int maximumShingleSize,
    Char? spacerCharacter,
    bool ignoringSinglePrefixOrSuffixShingle,
    TokenSettingsCodec settingsCodec)
{
    _input = input;
    MinimumShingleSize = minimumShingleSize;
    MaximumShingleSize = maximumShingleSize;
    SpacerCharacter = spacerCharacter;
    IsIgnoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
    _settingsCodec = settingsCodec;

    // Attributes of this filter.
    // ReSharper disable DoNotCallOverridableMethodsInConstructor
    _termAtt    = (TermAttribute)AddAttribute(typeof(TermAttribute));
    _posIncrAtt = (PositionIncrementAttribute)AddAttribute(typeof(PositionIncrementAttribute));
    _payloadAtt = (PayloadAttribute)AddAttribute(typeof(PayloadAttribute));
    _offsetAtt  = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
    _typeAtt    = (TypeAttribute)AddAttribute(typeof(TypeAttribute));
    _flagsAtt   = (FlagsAttribute)AddAttribute(typeof(FlagsAttribute));
    // ReSharper restore DoNotCallOverridableMethodsInConstructor

    // Matching attributes obtained from the supplied input stream.
    _inTermAtt    = (TermAttribute)input.AddAttribute(typeof(TermAttribute));
    _inPosIncrAtt = (PositionIncrementAttribute)input.AddAttribute(typeof(PositionIncrementAttribute));
    _inPayloadAtt = (PayloadAttribute)input.AddAttribute(typeof(PayloadAttribute));
    _inOffsetAtt  = (OffsetAttribute)input.AddAttribute(typeof(OffsetAttribute));
    _inTypeAtt    = (TypeAttribute)input.AddAttribute(typeof(TypeAttribute));
    _inFlagsAtt   = (FlagsAttribute)input.AddAttribute(typeof(FlagsAttribute));
}
/// <summary>
/// Returns a fresh <see cref="OffsetAttribute"/> whose start and end offsets
/// equal those of this instance.
/// </summary>
/// <returns>The copied attribute.</returns>
public override System.Object Clone()
{
    OffsetAttribute copy = new OffsetAttribute
    {
        endOffset = this.endOffset,
        startOffset = this.startOffset
    };

    return copy;
}