/// <summary>
/// Creates a shingle filter based on a user defined matrix.
///
/// The filter /will/ delete columns from the input matrix! You will not be able to
/// reset the filter if you used this constructor.
/// todo: don't touch the matrix! use a bool, set the input stream to null or something,
/// and keep track of where in the matrix we are at.
/// </summary>
/// <param name="matrix">the input based for creating shingles. Does not need to contain any information until ShingleMatrixFilter.IncrementToken() is called the first time.</param>
/// <param name="minimumShingleSize">minimum number of tokens in any shingle.</param>
/// <param name="maximumShingleSize">maximum number of tokens in any shingle.</param>
/// <param name="spacerCharacter">character to use between texts of the token parts in a shingle. null for none.</param>
/// <param name="ignoringSinglePrefixOrSuffixShingle">if true, shingles that only contains permutation of the first of the last column will not be produced as shingles.
/// Useful when adding boundary marker tokens such as '^' and '$'.</param>
/// <param name="settingsCodec">codec used to read input token weight and matrix positioning.</param>
public ShingleMatrixFilter(Matrix.Matrix matrix, int minimumShingleSize, int maximumShingleSize, Char? spacerCharacter, bool ignoringSinglePrefixOrSuffixShingle, TokenSettingsCodec settingsCodec)
{
    // NOTE: spacerCharacter is nullable ("null for none", per the doc above) to match
    // the TokenStream-based overload of this constructor; callers passing a plain char
    // still compile via the implicit char -> char? conversion.
    Matrix = matrix;
    MinimumShingleSize = minimumShingleSize;
    MaximumShingleSize = maximumShingleSize;
    SpacerCharacter = spacerCharacter;
    IsIgnoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
    _settingsCodec = settingsCodec;

    // AddAttribute is virtual; the registration order below is kept stable on purpose.
    // ReSharper disable DoNotCallOverridableMethodsInConstructor
    _termAtt = AddAttribute<ITermAttribute>();
    _posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
    _payloadAtt = AddAttribute<IPayloadAttribute>();
    _offsetAtt = AddAttribute<IOffsetAttribute>();
    _typeAtt = AddAttribute<ITypeAttribute>();
    _flagsAtt = AddAttribute<IFlagsAttribute>();
    // ReSharper restore DoNotCallOverridableMethodsInConstructor

    // set the input to be an empty token stream, we already have the data.
    _input = new EmptyTokenStream();

    _inTermAtt = _input.AddAttribute<ITermAttribute>();
    _inPosIncrAtt = _input.AddAttribute<IPositionIncrementAttribute>();
    _inPayloadAtt = _input.AddAttribute<IPayloadAttribute>();
    _inOffsetAtt = _input.AddAttribute<IOffsetAttribute>();
    _inTypeAtt = _input.AddAttribute<ITypeAttribute>();
    _inFlagsAtt = _input.AddAttribute<IFlagsAttribute>();
}
/// <summary>
/// Initializes the compound-word filter base: stores the size limits, wraps the
/// dictionary in a lowercased <c>CharArraySet</c> when necessary, and registers
/// the token attributes this filter reads and writes.
/// </summary>
protected CompoundWordTokenFilterBase(TokenStream input, ISet<string> dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, bool onlyLongestMatch)
    : base(input)
{
    this.tokens = new LinkedList<Token>();
    this.minWordSize = minWordSize;
    this.minSubwordSize = minSubwordSize;
    this.maxSubwordSize = maxSubwordSize;
    this.onlyLongestMatch = onlyLongestMatch;

    // Reuse the set directly when it is already a CharArraySet; otherwise copy
    // its entries into a new case-insensitive (lowercased) CharArraySet.
    if (dictionary is CharArraySet charArraySet)
    {
        this.dictionary = charArraySet;
    }
    else
    {
        this.dictionary = new CharArraySet(dictionary.Count, false);
        AddAllLowerCase(this.dictionary, dictionary);
    }

    termAtt = AddAttribute<ITermAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();
    flagsAtt = AddAttribute<IFlagsAttribute>();
    posIncAtt = AddAttribute<IPositionIncrementAttribute>();
    typeAtt = AddAttribute<ITypeAttribute>();
    payloadAtt = AddAttribute<IPayloadAttribute>();
}
/// <summary>
/// Creates a shingle filter with ad hoc parameter settings.
/// </summary>
/// <param name="input">stream from which to construct the matrix</param>
/// <param name="minimumShingleSize">minimum number of tokens in any shingle.</param>
/// <param name="maximumShingleSize">maximum number of tokens in any shingle.</param>
/// <param name="spacerCharacter">character to use between texts of the token parts in a shingle. null for none.</param>
/// <param name="ignoringSinglePrefixOrSuffixShingle">if true, shingles that only contains permutation of the first of the last column will not be produced as shingles. Useful when adding boundary marker tokens such as '^' and '$'.</param>
/// <param name="settingsCodec">codec used to read input token weight and matrix positioning.</param>
public ShingleMatrixFilter(TokenStream input, int minimumShingleSize, int maximumShingleSize, Char? spacerCharacter, bool ignoringSinglePrefixOrSuffixShingle, TokenSettingsCodec settingsCodec)
{
    _input = input;
    MinimumShingleSize = minimumShingleSize;
    MaximumShingleSize = maximumShingleSize;
    SpacerCharacter = spacerCharacter;
    IsIgnoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
    _settingsCodec = settingsCodec;

    // AddAttribute is virtual; the registration order below is kept stable on purpose.
    // ReSharper disable DoNotCallOverridableMethodsInConstructor
    _termAtt = AddAttribute<ITermAttribute>();
    _posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
    _payloadAtt = AddAttribute<IPayloadAttribute>();
    _offsetAtt = AddAttribute<IOffsetAttribute>();
    _typeAtt = AddAttribute<ITypeAttribute>();
    _flagsAtt = AddAttribute<IFlagsAttribute>();
    // ReSharper restore DoNotCallOverridableMethodsInConstructor

    // Matching attribute views on the input stream.
    _inTermAtt = input.AddAttribute<ITermAttribute>();
    _inPosIncrAtt = input.AddAttribute<IPositionIncrementAttribute>();
    _inPayloadAtt = input.AddAttribute<IPayloadAttribute>();
    _inOffsetAtt = input.AddAttribute<IOffsetAttribute>();
    _inTypeAtt = input.AddAttribute<ITypeAttribute>();
    _inFlagsAtt = input.AddAttribute<IFlagsAttribute>();
}
// Verifies WikipediaTokenizer in BOTH mode: untokenized category/italics spans are
// emitted as whole tokens in addition to their individual words, and the whole
// tokens carry UNTOKENIZED_TOKEN_FLAG.
public virtual void TestBoth()
{
    ISet<string> untoks = new HashSet<string>();
    untoks.Add(WikipediaTokenizer.CATEGORY);
    untoks.Add(WikipediaTokenizer.ITALICS);
    string test = "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h i j]]";
    //should output all the individual tokens plus the untokenized tokens as well. Untokenized tokens
    WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test), WikipediaTokenizer.BOTH, untoks);
    AssertTokenStreamContents(tf,
        new string[] { "a b c d", "a", "b", "c", "d", "e f g", "e", "f", "g", "link", "here", "link", "there", "italics here", "italics", "here", "something", "more italics", "more", "italics", "h i j", "h", "i", "j" },
        // start offsets
        new int[] { 11, 11, 13, 15, 17, 32, 32, 34, 36, 42, 47, 56, 61, 71, 71, 79, 86, 98, 98, 103, 124, 124, 128, 132 },
        // end offsets
        new int[] { 18, 12, 14, 16, 18, 37, 33, 35, 37, 46, 51, 60, 66, 83, 78, 83, 95, 110, 102, 110, 133, 125, 129, 133 },
        // position increments (0 where a word overlaps its untokenized span)
        new int[] { 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1 });
    // now check the flags, TODO: add way to check flags from BaseTokenStreamTestCase?
    tf = new WikipediaTokenizer(new StringReader(test), WikipediaTokenizer.BOTH, untoks);
    int[] expectedFlags = new int[] { WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, 0, 0, 0, 0, WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, 0, 0, 0, 0, 0, 0, 0, WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, 0, 0, 0, WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, 0, 0, WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, 0, 0, 0 };
    IFlagsAttribute flagsAtt = tf.AddAttribute<IFlagsAttribute>();
    tf.Reset();
    for (int i = 0; i < expectedFlags.Length; i++)
    {
        assertTrue(tf.IncrementToken());
        assertEquals("flags " + i, expectedFlags[i], flagsAtt.Flags);
    }
    // Stream must be exhausted after the expected tokens.
    assertFalse(tf.IncrementToken());
    tf.Dispose();
}
// Verifies AttributeSource.CaptureState/RestoreState: a captured state round-trips
// into the same source, into an identically-configured copy, into a source with a
// different attribute order plus an extra attribute (which is left untouched), and
// throws when the target is missing one of the captured attributes.
public virtual void TestCaptureState()
{
    // init a first instance
    AttributeSource src = new AttributeSource();
    ICharTermAttribute termAtt = src.AddAttribute<ICharTermAttribute>();
    ITypeAttribute typeAtt = src.AddAttribute<ITypeAttribute>();
    termAtt.Append("TestTerm");
    typeAtt.Type = "TestType";
    int hashCode = src.GetHashCode();
    AttributeSource.State state = src.CaptureState();

    // modify the attributes
    termAtt.SetEmpty().Append("AnotherTestTerm");
    typeAtt.Type = "AnotherTestType";
    Assert.IsTrue(hashCode != src.GetHashCode(), "Hash code should be different");

    src.RestoreState(state);
    Assert.AreEqual(termAtt.ToString(), "TestTerm");
    Assert.AreEqual(typeAtt.Type, "TestType");
    Assert.AreEqual(hashCode, src.GetHashCode(), "Hash code should be equal after restore");

    // restore into an exact configured copy
    AttributeSource copy = new AttributeSource();
    copy.AddAttribute<ICharTermAttribute>();
    copy.AddAttribute<ITypeAttribute>();
    copy.RestoreState(state);
    Assert.AreEqual(src.GetHashCode(), copy.GetHashCode(), "Both AttributeSources should have same hashCode after restore");
    Assert.AreEqual(src, copy, "Both AttributeSources should be equal after restore");

    // init a second instance (with attributes in different order and one additional attribute)
    AttributeSource src2 = new AttributeSource();
    typeAtt = src2.AddAttribute<ITypeAttribute>();
    IFlagsAttribute flagsAtt = src2.AddAttribute<IFlagsAttribute>();
    termAtt = src2.AddAttribute<ICharTermAttribute>();
    flagsAtt.Flags = 12345;
    src2.RestoreState(state);
    Assert.AreEqual(termAtt.ToString(), "TestTerm");
    Assert.AreEqual(typeAtt.Type, "TestType");
    // FlagsAttribute was not part of the captured state, so restore must leave it alone.
    Assert.AreEqual(12345, flagsAtt.Flags, "FlagsAttribute should not be touched");

    // init a third instance missing one Attribute
    AttributeSource src3 = new AttributeSource();
    termAtt = src3.AddAttribute<ICharTermAttribute>();
    try
    {
        src3.RestoreState(state);
        Assert.Fail("The third instance is missing the TypeAttribute, so restoreState() should throw IllegalArgumentException");
    }
    catch (Exception iae) when (iae.IsIllegalArgumentException())
    {
        // pass
    }
}
/// <summary>
/// Wraps <paramref name="input"/> with the given chunker operation and registers
/// the type, flags and term attributes this filter works with.
/// </summary>
public OpenNLPChunkerFilter(TokenStream input, NLPChunkerOp chunkerOp)
    : base(input)
{
    this.chunkerOp = chunkerOp;

    // Attribute registration order is preserved from the original implementation.
    this.typeAtt = AddAttribute<ITypeAttribute>();
    this.flagsAtt = AddAttribute<IFlagsAttribute>();
    this.termAtt = AddAttribute<ICharTermAttribute>();
}
/// <summary>
/// Prepares the instance for <paramref name="content"/>: pre-computes all
/// enumeration spans, registers the offset and flags attributes, and marks
/// the instance as initialized.
/// </summary>
void Init(string content)
{
    _enumerationPositions = _aufzaehlungDetector.FindAufzaehlungsspans(content).ToArray();
    _offsetAttribute = AddAttribute<IOffsetAttribute>();
    _flagsAttribute = AddAttribute<IFlagsAttribute>();
    _isInitialized = true;
}
/// <summary>
/// Wraps <paramref name="input"/> with the given POS tagger operation and registers
/// the type, flags and term attributes this filter works with.
/// </summary>
public OpenNLPPOSFilter(TokenStream input, NLPPOSTaggerOp posTaggerOp)
    : base(input)
{
    this.posTaggerOp = posTaggerOp;

    // Attribute registration order is preserved from the original implementation.
    this.typeAtt = AddAttribute<ITypeAttribute>();
    this.flagsAtt = AddAttribute<IFlagsAttribute>();
    this.termAtt = AddAttribute<ICharTermAttribute>();
}
/// <summary>
/// Registers every attribute the test stream will populate
/// (term, offset, type, payload, position increment, flags).
/// </summary>
internal TestTokenStream()
    : base()
{
    this.termAtt = AddAttribute<ICharTermAttribute>();
    this.offsetAtt = AddAttribute<IOffsetAttribute>();
    this.typeAtt = AddAttribute<ITypeAttribute>();
    this.payloadAtt = AddAttribute<IPayloadAttribute>();
    this.posIncAtt = AddAttribute<IPositionIncrementAttribute>();
    this.flagsAtt = AddAttribute<IFlagsAttribute>();
}
// Counts lemmas emitted so far; int fields default to 0, so no explicit initializer needed.
private int lemmaNum;

/// <summary>
/// Wraps <paramref name="input"/> with the given lemmatizer operation and registers
/// the term, type, keyword and flags attributes this filter works with.
/// </summary>
public OpenNLPLemmatizerFilter(TokenStream input, NLPLemmatizerOp lemmatizerOp)
    : base(input)
{
    this.lemmatizerOp = lemmatizerOp;

    // Attribute registration order is preserved from the original implementation.
    this.termAtt = AddAttribute<ICharTermAttribute>();
    this.typeAtt = AddAttribute<ITypeAttribute>();
    this.keywordAtt = AddAttribute<IKeywordAttribute>();
    this.flagsAtt = AddAttribute<IFlagsAttribute>();
}
/// <summary>
/// Registers every attribute the test stream will populate
/// (term, offset, type, payload, position increment, flags).
/// </summary>
internal TestTokenStream()
{
    termAtt = AddAttribute<ITermAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();
    typeAtt = AddAttribute<ITypeAttribute>();
    payloadAtt = AddAttribute<IPayloadAttribute>();
    posIncAtt = AddAttribute<IPositionIncrementAttribute>();
    flagsAtt = AddAttribute<IFlagsAttribute>();
}
/// <summary>
/// Creates a stream that replays the supplied tokens, registering all
/// attributes needed to expose each token's data.
/// </summary>
public IterTokenStream(params Token[] tokens)
    : base()
{
    this.tokens = tokens;
    termAtt = AddAttribute<ICharTermAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();
    posIncAtt = AddAttribute<IPositionIncrementAttribute>();
    flagsAtt = AddAttribute<IFlagsAttribute>();
    typeAtt = AddAttribute<ITypeAttribute>();
    payloadAtt = AddAttribute<IPayloadAttribute>();
}
/// <summary>
/// Creates a stream backed by the supplied token collection, registering all
/// attributes needed to expose each token's data.
/// </summary>
public TokenListStream(ICollection<Token> tokens)
{
    _tokens = tokens;
    _termAtt = AddAttribute<ITermAttribute>();
    _posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
    _payloadAtt = AddAttribute<IPayloadAttribute>();
    _offsetAtt = AddAttribute<IOffsetAttribute>();
    _typeAtt = AddAttribute<ITypeAttribute>();
    _flagsAtt = AddAttribute<IFlagsAttribute>();
}
/// <summary>
/// Creates a stream that replays the supplied tokens, registering all
/// attributes needed to expose each token's data.
/// </summary>
public IterTokenStream(params Token[] tokens)
    : base()
{
    this.tokens = tokens;
    this.termAtt = AddAttribute<ICharTermAttribute>();
    this.offsetAtt = AddAttribute<IOffsetAttribute>();
    this.posIncAtt = AddAttribute<IPositionIncrementAttribute>();
    this.flagsAtt = AddAttribute<IFlagsAttribute>();
    this.typeAtt = AddAttribute<ITypeAttribute>();
    this.payloadAtt = AddAttribute<IPayloadAttribute>();
}
/// <summary>
/// Creates a tokenizer backed by an OpenNLP sentence detector and tokenizer.
/// </summary>
/// <exception cref="ArgumentNullException">if <paramref name="sentenceOp"/> or <paramref name="tokenizerOp"/> is null.</exception>
public OpenNLPTokenizer(AttributeFactory factory, TextReader reader, NLPSentenceDetectorOp sentenceOp, NLPTokenizerOp tokenizerOp) // LUCENENET: Added reader param for compatibility with 4.8 - remove when upgrading
    : base(factory, reader, new OpenNLPSentenceBreakIterator(sentenceOp))
{
    // Throw ArgumentNullException (which derives from ArgumentException, so existing
    // catch handlers keep working) and identify the offending argument via nameof.
    if (sentenceOp is null)
    {
        throw new ArgumentNullException(nameof(sentenceOp), "OpenNLPTokenizer: both a Sentence Detector and a Tokenizer are required");
    }
    if (tokenizerOp is null)
    {
        throw new ArgumentNullException(nameof(tokenizerOp), "OpenNLPTokenizer: both a Sentence Detector and a Tokenizer are required");
    }
    this.sentenceOp = sentenceOp;
    this.tokenizerOp = tokenizerOp;

    // Attribute registration order is preserved from the original implementation.
    this.termAtt = AddAttribute<ICharTermAttribute>();
    this.flagsAtt = AddAttribute<IFlagsAttribute>();
    this.offsetAtt = AddAttribute<IOffsetAttribute>();
}
/// <summary>
/// Validates and stores the token-output mode and the set of untokenized types,
/// then registers the attributes this tokenizer populates.
/// </summary>
/// <param name="tokenOutput">must be TOKENS_ONLY, UNTOKENIZED_ONLY or BOTH.</param>
/// <param name="untokenizedTypes">token types that should be emitted untokenized.</param>
/// <exception cref="ArgumentException">if <paramref name="tokenOutput"/> is not one of the three supported modes.</exception>
private void Init(int tokenOutput, IEnumerable<string> untokenizedTypes)
{
    // TODO: cutover to enum
    if (tokenOutput != TOKENS_ONLY && tokenOutput != UNTOKENIZED_ONLY && tokenOutput != BOTH)
    {
        // Include the parameter name so callers can tell which argument was invalid.
        throw new ArgumentException("tokenOutput must be TOKENS_ONLY, UNTOKENIZED_ONLY or BOTH", nameof(tokenOutput));
    }
    this.tokenOutput = tokenOutput;
    this.untokenizedTypes = untokenizedTypes;

    // Attribute registration order is preserved from the original implementation.
    offsetAtt = AddAttribute<IOffsetAttribute>();
    typeAtt = AddAttribute<ITypeAttribute>();
    posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
    termAtt = AddAttribute<ICharTermAttribute>();
    flagsAtt = AddAttribute<IFlagsAttribute>();
}
/// <summary>
/// Validates and stores the token-output mode and the set of untokenized types,
/// then registers the attributes this tokenizer populates.
/// </summary>
/// <param name="tokenOutput">must be TOKENS_ONLY, UNTOKENIZED_ONLY or BOTH.</param>
/// <param name="untokenizedTypes">token types that should be emitted untokenized.</param>
/// <exception cref="ArgumentOutOfRangeException">if <paramref name="tokenOutput"/> is not one of the three supported modes.</exception>
private void Init(int tokenOutput, ICollection<string> untokenizedTypes)
{
    // TODO: cutover to enum
    if (tokenOutput != TOKENS_ONLY && tokenOutput != UNTOKENIZED_ONLY && tokenOutput != BOTH)
    {
        throw new ArgumentOutOfRangeException(nameof(tokenOutput), "tokenOutput must be TOKENS_ONLY, UNTOKENIZED_ONLY or BOTH"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
    }
    this.tokenOutput = tokenOutput;
    this.untokenizedTypes = untokenizedTypes;

    // Attribute registration order is preserved from the original implementation.
    offsetAtt = AddAttribute<IOffsetAttribute>();
    typeAtt = AddAttribute<ITypeAttribute>();
    posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
    termAtt = AddAttribute<ICharTermAttribute>();
    flagsAtt = AddAttribute<IFlagsAttribute>();
}
/// <summary>
/// Creates a tokenizer backed by an OpenNLP sentence detector and tokenizer.
/// </summary>
/// <exception cref="ArgumentNullException">if <paramref name="sentenceOp"/> or <paramref name="tokenizerOp"/> is null.</exception>
public OpenNLPTokenizer(AttributeFactory factory, TextReader reader, NLPSentenceDetectorOp sentenceOp, NLPTokenizerOp tokenizerOp) // LUCENENET: Added reader param for compatibility with 4.8 - remove when upgrading
    : base(factory, reader, new OpenNLPSentenceBreakIterator(sentenceOp))
{
    // LUCENENET specific - changed from IllegalArgumentException to ArgumentNullException (.NET convention) and refactored to throw on each one separately
    if (sentenceOp is null)
    {
        throw new ArgumentNullException(nameof(sentenceOp), "OpenNLPTokenizer: both a Sentence Detector and a Tokenizer are required");
    }
    if (tokenizerOp is null)
    {
        throw new ArgumentNullException(nameof(tokenizerOp), "OpenNLPTokenizer: both a Sentence Detector and a Tokenizer are required");
    }
    //this.sentenceOp = sentenceOp; // LUCENENET: Never read
    this.tokenizerOp = tokenizerOp;

    // Attribute registration order is preserved from the original implementation.
    this.termAtt = AddAttribute<ICharTermAttribute>();
    this.flagsAtt = AddAttribute<IFlagsAttribute>();
    this.offsetAtt = AddAttribute<IOffsetAttribute>();
}
// Runs one token through SnowballFilter("English") and checks that the term is
// stemmed to "accent" while offsets, type, position increment, flags and payload
// all pass through the filter unchanged.
public void TestFilterTokens()
{
    SnowballFilter filter = new SnowballFilter(new TestTokenStream(), "English");
    ITermAttribute termAtt = filter.GetAttribute<ITermAttribute>();
    IOffsetAttribute offsetAtt = filter.GetAttribute<IOffsetAttribute>();
    ITypeAttribute typeAtt = filter.GetAttribute<ITypeAttribute>();
    IPayloadAttribute payloadAtt = filter.GetAttribute<IPayloadAttribute>();
    IPositionIncrementAttribute posIncAtt = filter.GetAttribute<IPositionIncrementAttribute>();
    IFlagsAttribute flagsAtt = filter.GetAttribute<IFlagsAttribute>();

    filter.IncrementToken();

    Assert.AreEqual("accent", termAtt.Term);
    Assert.AreEqual(2, offsetAtt.StartOffset);
    Assert.AreEqual(7, offsetAtt.EndOffset);
    Assert.AreEqual("wrd", typeAtt.Type);
    Assert.AreEqual(3, posIncAtt.PositionIncrement);
    Assert.AreEqual(77, flagsAtt.Flags);
    Assert.AreEqual(new Payload(new byte[] { 0, 1, 2, 3 }), payloadAtt.Payload);
}
// Runs one token through SnowballFilter("English") and checks that the term is
// stemmed to "accent" while offsets, type, position increment, flags and payload
// all pass through the filter unchanged.
public virtual void TestFilterTokens()
{
    SnowballFilter filter = new SnowballFilter(new TestTokenStream(this), "English");
    ICharTermAttribute termAtt = filter.GetAttribute<ICharTermAttribute>();
    IOffsetAttribute offsetAtt = filter.GetAttribute<IOffsetAttribute>();
    ITypeAttribute typeAtt = filter.GetAttribute<ITypeAttribute>();
    IPayloadAttribute payloadAtt = filter.GetAttribute<IPayloadAttribute>();
    IPositionIncrementAttribute posIncAtt = filter.GetAttribute<IPositionIncrementAttribute>();
    IFlagsAttribute flagsAtt = filter.GetAttribute<IFlagsAttribute>();

    filter.IncrementToken();

    assertEquals("accent", termAtt.ToString());
    assertEquals(2, offsetAtt.StartOffset());
    assertEquals(7, offsetAtt.EndOffset());
    assertEquals("wrd", typeAtt.Type);
    assertEquals(3, posIncAtt.PositionIncrement);
    assertEquals(77, flagsAtt.Flags);
    assertEquals(new BytesRef(new byte[] { 0, 1, 2, 3 }), payloadAtt.Payload);
}
/// <summary>
/// Joins a prefix stream ahead of a suffix stream; the base class is bound to the
/// suffix, and a parallel set of attributes is registered on the prefix stream.
/// </summary>
public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix)
    : base(suffix)
{
    this.suffix = suffix;
    this.prefix = prefix;
    prefixExhausted = false;

    // Attributes exposed by this filter (registration order preserved).
    termAtt = AddAttribute<ICharTermAttribute>();
    posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
    payloadAtt = AddAttribute<IPayloadAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();
    typeAtt = AddAttribute<ITypeAttribute>();
    flagsAtt = AddAttribute<IFlagsAttribute>();

    // Matching attribute views on the prefix stream.
    p_termAtt = prefix.AddAttribute<ICharTermAttribute>();
    p_posIncrAtt = prefix.AddAttribute<IPositionIncrementAttribute>();
    p_payloadAtt = prefix.AddAttribute<IPayloadAttribute>();
    p_offsetAtt = prefix.AddAttribute<IOffsetAttribute>();
    p_typeAtt = prefix.AddAttribute<ITypeAttribute>();
    p_flagsAtt = prefix.AddAttribute<IFlagsAttribute>();
}
/// <summary>
/// Joins a prefix stream ahead of a suffix stream; the base class is bound to the
/// suffix, and a parallel set of attributes is registered on the prefix stream.
/// </summary>
public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix)
    : base(suffix)
{
    this.suffix = suffix;
    this.prefix = prefix;
    prefixExhausted = false;

    // Attributes exposed by this filter (registration order preserved).
    termAtt = AddAttribute<ICharTermAttribute>();
    posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
    payloadAtt = AddAttribute<IPayloadAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();
    typeAtt = AddAttribute<ITypeAttribute>();
    flagsAtt = AddAttribute<IFlagsAttribute>();

    // Matching attribute views on the prefix stream.
    p_termAtt = prefix.AddAttribute<ICharTermAttribute>();
    p_posIncrAtt = prefix.AddAttribute<IPositionIncrementAttribute>();
    p_payloadAtt = prefix.AddAttribute<IPayloadAttribute>();
    p_offsetAtt = prefix.AddAttribute<IOffsetAttribute>();
    p_typeAtt = prefix.AddAttribute<ITypeAttribute>();
    p_flagsAtt = prefix.AddAttribute<IFlagsAttribute>();
}
/// <summary>
/// Joins a prefix stream ahead of a suffix stream; the base class is bound to the
/// suffix, and a parallel set of attributes is registered on the prefix stream.
/// </summary>
public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix)
    : base(suffix)
{
    Suffix = suffix;
    Prefix = prefix;
    _prefixExhausted = false;

    // AddAttribute is virtual; registration order below is preserved on purpose.
    // ReSharper disable DoNotCallOverridableMethodsInConstructor
    _termAtt = AddAttribute<ITermAttribute>();
    _posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
    _payloadAtt = AddAttribute<IPayloadAttribute>();
    _offsetAtt = AddAttribute<IOffsetAttribute>();
    _typeAtt = AddAttribute<ITypeAttribute>();
    _flagsAtt = AddAttribute<IFlagsAttribute>();
    // ReSharper restore DoNotCallOverridableMethodsInConstructor

    // Matching attribute views on the prefix stream.
    _pTermAtt = prefix.AddAttribute<ITermAttribute>();
    _pPosIncrAtt = prefix.AddAttribute<IPositionIncrementAttribute>();
    _pPayloadAtt = prefix.AddAttribute<IPayloadAttribute>();
    _pOffsetAtt = prefix.AddAttribute<IOffsetAttribute>();
    _pTypeAtt = prefix.AddAttribute<ITypeAttribute>();
    _pFlagsAtt = prefix.AddAttribute<IFlagsAttribute>();
}
/// <summary>
/// Joins a prefix stream ahead of a suffix stream; the base class is bound to the
/// suffix, and a parallel set of attributes is registered on the prefix stream.
/// </summary>
public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix)
    : base(suffix)
{
    Suffix = suffix;
    Prefix = prefix;
    _prefixExhausted = false;

    // AddAttribute is virtual; registration order below is preserved on purpose.
    // ReSharper disable DoNotCallOverridableMethodsInConstructor
    _termAtt = AddAttribute<ITermAttribute>();
    _posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
    _payloadAtt = AddAttribute<IPayloadAttribute>();
    _offsetAtt = AddAttribute<IOffsetAttribute>();
    _typeAtt = AddAttribute<ITypeAttribute>();
    _flagsAtt = AddAttribute<IFlagsAttribute>();
    // ReSharper restore DoNotCallOverridableMethodsInConstructor

    // Matching attribute views on the prefix stream.
    _pTermAtt = prefix.AddAttribute<ITermAttribute>();
    _pPosIncrAtt = prefix.AddAttribute<IPositionIncrementAttribute>();
    _pPayloadAtt = prefix.AddAttribute<IPayloadAttribute>();
    _pOffsetAtt = prefix.AddAttribute<IOffsetAttribute>();
    _pTypeAtt = prefix.AddAttribute<ITypeAttribute>();
    _pFlagsAtt = prefix.AddAttribute<IFlagsAttribute>();
}
// Verifies AttributeSource.CloneAttributes: the clone preserves attribute
// registration order, holds distinct-but-equal attribute instances, and CopyTo
// propagates modified values back to the original source.
public virtual void TestCloneAttributes()
{
    AttributeSource src = new AttributeSource();
    IFlagsAttribute flagsAtt = src.AddAttribute<IFlagsAttribute>();
    ITypeAttribute typeAtt = src.AddAttribute<ITypeAttribute>();
    flagsAtt.Flags = 1234;
    typeAtt.Type = "TestType";

    AttributeSource clone = src.CloneAttributes();
    // Registration order must survive cloning.
    IEnumerator<Type> it = clone.GetAttributeClassesEnumerator();
    it.MoveNext();
    Assert.AreEqual(typeof(IFlagsAttribute), it.Current, "FlagsAttribute must be the first attribute");
    it.MoveNext();
    Assert.AreEqual(typeof(ITypeAttribute), it.Current, "TypeAttribute must be the second attribute");
    Assert.IsFalse(it.MoveNext(), "No more attributes");

    IFlagsAttribute flagsAtt2 = clone.GetAttribute<IFlagsAttribute>();
    ITypeAttribute typeAtt2 = clone.GetAttribute<ITypeAttribute>();
    Assert.AreNotSame(flagsAtt2, flagsAtt, "FlagsAttribute of original and clone must be different instances");
    Assert.AreNotSame(typeAtt2, typeAtt, "TypeAttribute of original and clone must be different instances");
    Assert.AreEqual(flagsAtt2, flagsAtt, "FlagsAttribute of original and clone must be equal");
    Assert.AreEqual(typeAtt2, typeAtt, "TypeAttribute of original and clone must be equal");

    // test copy back
    flagsAtt2.Flags = 4711;
    typeAtt2.Type = "OtherType";
    clone.CopyTo(src);
    Assert.AreEqual(4711, flagsAtt.Flags, "FlagsAttribute of original must now contain updated term");
    Assert.AreEqual(typeAtt.Type, "OtherType", "TypeAttribute of original must now contain updated type");

    // verify again:
    Assert.AreNotSame(flagsAtt2, flagsAtt, "FlagsAttribute of original and clone must be different instances");
    Assert.AreNotSame(typeAtt2, typeAtt, "TypeAttribute of original and clone must be different instances");
    Assert.AreEqual(flagsAtt2, flagsAtt, "FlagsAttribute of original and clone must be equal");
    Assert.AreEqual(typeAtt2, typeAtt, "TypeAttribute of original and clone must be equal");
}
/// <summary>
/// Creates a shingle filter with ad hoc parameter settings.
/// </summary>
/// <param name="input">stream from which to construct the matrix</param>
/// <param name="minimumShingleSize">minimum number of tokens in any shingle.</param>
/// <param name="maximumShingleSize">maximum number of tokens in any shingle.</param>
/// <param name="spacerCharacter">character to use between texts of the token parts in a shingle. null for none.</param>
/// <param name="ignoringSinglePrefixOrSuffixShingle">if true, shingles that only contains permutation of the first of the last column will not be produced as shingles. Useful when adding boundary marker tokens such as '^' and '$'.</param>
/// <param name="settingsCodec">codec used to read input token weight and matrix positioning.</param>
public ShingleMatrixFilter(TokenStream input, int minimumShingleSize, int maximumShingleSize, Char? spacerCharacter, bool ignoringSinglePrefixOrSuffixShingle, TokenSettingsCodec settingsCodec)
{
    _input = input;
    MinimumShingleSize = minimumShingleSize;
    MaximumShingleSize = maximumShingleSize;
    SpacerCharacter = spacerCharacter;
    IsIgnoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
    _settingsCodec = settingsCodec;

    // AddAttribute is virtual; registration order below is preserved on purpose.
    // ReSharper disable DoNotCallOverridableMethodsInConstructor
    _termAtt = AddAttribute<ITermAttribute>();
    _posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
    _payloadAtt = AddAttribute<IPayloadAttribute>();
    _offsetAtt = AddAttribute<IOffsetAttribute>();
    _typeAtt = AddAttribute<ITypeAttribute>();
    _flagsAtt = AddAttribute<IFlagsAttribute>();
    // ReSharper restore DoNotCallOverridableMethodsInConstructor

    // Matching attribute views on the input stream.
    _inTermAtt = input.AddAttribute<ITermAttribute>();
    _inPosIncrAtt = input.AddAttribute<IPositionIncrementAttribute>();
    _inPayloadAtt = input.AddAttribute<IPayloadAttribute>();
    _inOffsetAtt = input.AddAttribute<IOffsetAttribute>();
    _inTypeAtt = input.AddAttribute<ITypeAttribute>();
    _inFlagsAtt = input.AddAttribute<IFlagsAttribute>();
}
/// <summary>
/// Creates a stream backed by the supplied token collection, registering all
/// attributes needed to expose each token's data.
/// </summary>
public TokenListStream(ICollection<Token> tokens)
{
    _tokens = tokens;
    _termAtt = AddAttribute<ITermAttribute>();
    _posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
    _payloadAtt = AddAttribute<IPayloadAttribute>();
    _offsetAtt = AddAttribute<IOffsetAttribute>();
    _typeAtt = AddAttribute<ITypeAttribute>();
    _flagsAtt = AddAttribute<IFlagsAttribute>();
}
/// <summary>
/// Registers every attribute the test stream will populate
/// (term, offset, type, payload, position increment, flags).
/// </summary>
internal TestTokenStream()
{
    termAtt = AddAttribute<ITermAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();
    typeAtt = AddAttribute<ITypeAttribute>();
    payloadAtt = AddAttribute<IPayloadAttribute>();
    posIncAtt = AddAttribute<IPositionIncrementAttribute>();
    flagsAtt = AddAttribute<IFlagsAttribute>();
}
/// <summary>
/// Initializes the compound-word filter base: stores the size limits, wraps the
/// dictionary in a lowercased <c>CharArraySet</c> when necessary, and registers
/// the token attributes this filter reads and writes.
/// </summary>
protected CompoundWordTokenFilterBase(TokenStream input, ISet<string> dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, bool onlyLongestMatch)
    : base(input)
{
    this.tokens = new LinkedList<Token>();
    this.minWordSize = minWordSize;
    this.minSubwordSize = minSubwordSize;
    this.maxSubwordSize = maxSubwordSize;
    this.onlyLongestMatch = onlyLongestMatch;

    // Reuse the set directly when it is already a CharArraySet; otherwise copy
    // its entries into a new case-insensitive (lowercased) CharArraySet.
    if (dictionary is CharArraySet charArraySet)
    {
        this.dictionary = charArraySet;
    }
    else
    {
        this.dictionary = new CharArraySet(dictionary.Count, false);
        AddAllLowerCase(this.dictionary, dictionary);
    }

    termAtt = AddAttribute<ITermAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();
    flagsAtt = AddAttribute<IFlagsAttribute>();
    posIncAtt = AddAttribute<IPositionIncrementAttribute>();
    typeAtt = AddAttribute<ITypeAttribute>();
    payloadAtt = AddAttribute<IPayloadAttribute>();
}
/// <summary>
/// Validates and stores the token-output mode and the set of untokenized types,
/// then registers the attributes this tokenizer populates.
/// </summary>
/// <param name="tokenOutput">must be TOKENS_ONLY, UNTOKENIZED_ONLY or BOTH.</param>
/// <param name="untokenizedTypes">token types that should be emitted untokenized.</param>
private void Init(int tokenOutput, IEnumerable<string> untokenizedTypes)
{
    // TODO: cutover to enum
    if (tokenOutput != TOKENS_ONLY && tokenOutput != UNTOKENIZED_ONLY && tokenOutput != BOTH)
    {
        throw new System.ArgumentException("tokenOutput must be TOKENS_ONLY, UNTOKENIZED_ONLY or BOTH");
    }
    this.tokenOutput = tokenOutput;
    this.untokenizedTypes = untokenizedTypes;

    // Attribute registration order is preserved from the original implementation.
    offsetAtt = AddAttribute<IOffsetAttribute>();
    typeAtt = AddAttribute<ITypeAttribute>();
    posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
    termAtt = AddAttribute<ICharTermAttribute>();
    flagsAtt = AddAttribute<IFlagsAttribute>();
}
/// <summary>
/// Creates a shingle filter based on a user defined matrix.
///
/// The filter /will/ delete columns from the input matrix! You will not be able to
/// reset the filter if you used this constructor.
/// todo: don't touch the matrix! use a bool, set the input stream to null or something,
/// and keep track of where in the matrix we are at.
/// </summary>
/// <param name="matrix">the input based for creating shingles. Does not need to contain any information until ShingleMatrixFilter.IncrementToken() is called the first time.</param>
/// <param name="minimumShingleSize">minimum number of tokens in any shingle.</param>
/// <param name="maximumShingleSize">maximum number of tokens in any shingle.</param>
/// <param name="spacerCharacter">character to use between texts of the token parts in a shingle. null for none.</param>
/// <param name="ignoringSinglePrefixOrSuffixShingle">if true, shingles that only contains permutation of the first of the last column will not be produced as shingles.
/// Useful when adding boundary marker tokens such as '^' and '$'.</param>
/// <param name="settingsCodec">codec used to read input token weight and matrix positioning.</param>
public ShingleMatrixFilter(Matrix.Matrix matrix, int minimumShingleSize, int maximumShingleSize, Char? spacerCharacter, bool ignoringSinglePrefixOrSuffixShingle, TokenSettingsCodec settingsCodec)
{
    // NOTE: spacerCharacter is nullable ("null for none", per the doc above) to match
    // the TokenStream-based overload of this constructor; callers passing a plain char
    // still compile via the implicit char -> char? conversion.
    Matrix = matrix;
    MinimumShingleSize = minimumShingleSize;
    MaximumShingleSize = maximumShingleSize;
    SpacerCharacter = spacerCharacter;
    IsIgnoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
    _settingsCodec = settingsCodec;

    // AddAttribute is virtual; the registration order below is kept stable on purpose.
    // ReSharper disable DoNotCallOverridableMethodsInConstructor
    _termAtt = AddAttribute<ITermAttribute>();
    _posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
    _payloadAtt = AddAttribute<IPayloadAttribute>();
    _offsetAtt = AddAttribute<IOffsetAttribute>();
    _typeAtt = AddAttribute<ITypeAttribute>();
    _flagsAtt = AddAttribute<IFlagsAttribute>();
    // ReSharper restore DoNotCallOverridableMethodsInConstructor

    // set the input to be an empty token stream, we already have the data.
    _input = new EmptyTokenStream();

    _inTermAtt = _input.AddAttribute<ITermAttribute>();
    _inPosIncrAtt = _input.AddAttribute<IPositionIncrementAttribute>();
    _inPayloadAtt = _input.AddAttribute<IPayloadAttribute>();
    _inOffsetAtt = _input.AddAttribute<IOffsetAttribute>();
    _inTypeAtt = _input.AddAttribute<ITypeAttribute>();
    _inFlagsAtt = _input.AddAttribute<IFlagsAttribute>();
}
/// <summary>
/// Copies this attribute's flags bits onto <paramref name="target"/>, which must
/// implement <see cref="IFlagsAttribute"/>.
/// </summary>
public override void CopyTo(Attribute target)
{
    ((IFlagsAttribute)target).Flags = flags;
}
/// <summary>
/// Wraps <paramref name="input"/> and registers the flags attribute, remembering
/// the flag bits this filter will apply.
/// </summary>
public FlagFilter(int flags, TokenStream input)
    : base(input)
{
    _flagsAttribute = AddAttribute<IFlagsAttribute>();
    _flags = flags;
}
/// <summary>
/// Binds the test stream to its outer fixture and registers every attribute it
/// will populate (term, offset, type, payload, position increment, flags).
/// </summary>
internal TestTokenStream(TestSnowball outerInstance)
    : base()
{
    this.outerInstance = outerInstance;
    this.termAtt = AddAttribute<ICharTermAttribute>();
    this.offsetAtt = AddAttribute<IOffsetAttribute>();
    this.typeAtt = AddAttribute<ITypeAttribute>();
    this.payloadAtt = AddAttribute<IPayloadAttribute>();
    this.posIncAtt = AddAttribute<IPositionIncrementAttribute>();
    this.flagsAtt = AddAttribute<IFlagsAttribute>();
}