Example #1
        /// <summary>
        /// Creates a shingle filter based on a user defined matrix.
        ///
        /// The filter /will/ delete columns from the input matrix! You will not be able to reset the filter if you used this constructor.
        /// todo: don't touch the matrix! Use a bool, set the input stream to null or something, and keep track of where we are in the matrix.
        ///
        /// </summary>
        /// <param name="matrix">the input based for creating shingles. Does not need to contain any information until ShingleMatrixFilter.IncrementToken() is called the first time.</param>
        /// <param name="minimumShingleSize">minimum number of tokens in any shingle.</param>
        /// <param name="maximumShingleSize">maximum number of tokens in any shingle.</param>
        /// <param name="spacerCharacter">character to use between texts of the token parts in a shingle. null for none.</param>
        /// <param name="ignoringSinglePrefixOrSuffixShingle">if true, shingles that only contains permutation of the first of the last column will not be produced as shingles. Useful when adding boundary marker tokens such as '^' and '$'.</param>
        /// <param name="settingsCodec">codec used to read input token weight and matrix positioning.</param>
        public ShingleMatrixFilter(Matrix.Matrix matrix, int minimumShingleSize, int maximumShingleSize, Char spacerCharacter, bool ignoringSinglePrefixOrSuffixShingle, TokenSettingsCodec settingsCodec)
        {
            Matrix             = matrix;
            MinimumShingleSize = minimumShingleSize;
            MaximumShingleSize = maximumShingleSize;
            SpacerCharacter    = spacerCharacter;
            IsIgnoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
            _settingsCodec = settingsCodec;

            // ReSharper disable DoNotCallOverridableMethodsInConstructor
            _termAtt    = AddAttribute <ITermAttribute>();
            _posIncrAtt = AddAttribute <IPositionIncrementAttribute>();
            _payloadAtt = AddAttribute <IPayloadAttribute>();
            _offsetAtt  = AddAttribute <IOffsetAttribute>();
            _typeAtt    = AddAttribute <ITypeAttribute>();
            _flagsAtt   = AddAttribute <IFlagsAttribute>();
            // ReSharper restore DoNotCallOverridableMethodsInConstructor

            // Set the input to an empty token stream; we already have the data.
            _input = new EmptyTokenStream();

            _inTermAtt    = _input.AddAttribute <ITermAttribute>();
            _inPosIncrAtt = _input.AddAttribute <IPositionIncrementAttribute>();
            _inPayloadAtt = _input.AddAttribute <IPayloadAttribute>();
            _inOffsetAtt  = _input.AddAttribute <IOffsetAttribute>();
            _inTypeAtt    = _input.AddAttribute <ITypeAttribute>();
            _inFlagsAtt   = _input.AddAttribute <IFlagsAttribute>();
        }
Example #2
        protected CompoundWordTokenFilterBase(TokenStream input, ISet <string> dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, bool onlyLongestMatch)
            : base(input)
        {
            this.tokens           = new LinkedList <Token>();
            this.minWordSize      = minWordSize;
            this.minSubwordSize   = minSubwordSize;
            this.maxSubwordSize   = maxSubwordSize;
            this.onlyLongestMatch = onlyLongestMatch;

            if (dictionary is CharArraySet)
            {
                this.dictionary = (CharArraySet)dictionary;
            }
            else
            {
                this.dictionary = new CharArraySet(dictionary.Count, false);
                AddAllLowerCase(this.dictionary, dictionary);
            }

            termAtt    = AddAttribute <ITermAttribute>();
            offsetAtt  = AddAttribute <IOffsetAttribute>();
            flagsAtt   = AddAttribute <IFlagsAttribute>();
            posIncAtt  = AddAttribute <IPositionIncrementAttribute>();
            typeAtt    = AddAttribute <ITypeAttribute>();
            payloadAtt = AddAttribute <IPayloadAttribute>();
        }
Example #3
        /// <summary>
        /// Creates a shingle filter with ad hoc parameter settings.
        /// </summary>
        /// <param name="input">stream from which to construct the matrix</param>
        /// <param name="minimumShingleSize">minimum number of tokens in any shingle.</param>
        /// <param name="maximumShingleSize">maximum number of tokens in any shingle.</param>
        /// <param name="spacerCharacter">character to use between texts of the token parts in a shingle. null for none.</param>
        /// <param name="ignoringSinglePrefixOrSuffixShingle">if true, shingles that only contains permutation of the first of the last column will not be produced as shingles. Useful when adding boundary marker tokens such as '^' and '$'.</param>
        /// <param name="settingsCodec">codec used to read input token weight and matrix positioning.</param>
        public ShingleMatrixFilter(TokenStream input, int minimumShingleSize, int maximumShingleSize, Char? spacerCharacter, bool ignoringSinglePrefixOrSuffixShingle, TokenSettingsCodec settingsCodec)
        {
            _input             = input;
            MinimumShingleSize = minimumShingleSize;
            MaximumShingleSize = maximumShingleSize;
            SpacerCharacter    = spacerCharacter;
            IsIgnoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
            _settingsCodec = settingsCodec;

            // ReSharper disable DoNotCallOverridableMethodsInConstructor
            _termAtt    = AddAttribute <ITermAttribute>();
            _posIncrAtt = AddAttribute <IPositionIncrementAttribute>();
            _payloadAtt = AddAttribute <IPayloadAttribute>();
            _offsetAtt  = AddAttribute <IOffsetAttribute>();
            _typeAtt    = AddAttribute <ITypeAttribute>();
            _flagsAtt   = AddAttribute <IFlagsAttribute>();
            // ReSharper restore DoNotCallOverridableMethodsInConstructor

            _inTermAtt    = input.AddAttribute <ITermAttribute>();
            _inPosIncrAtt = input.AddAttribute <IPositionIncrementAttribute>();
            _inPayloadAtt = input.AddAttribute <IPayloadAttribute>();
            _inOffsetAtt  = input.AddAttribute <IOffsetAttribute>();
            _inTypeAtt    = input.AddAttribute <ITypeAttribute>();
            _inFlagsAtt   = input.AddAttribute <IFlagsAttribute>();
        }
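A minimal usage sketch for this constructor. The helper method name, the sample text, and the choice of the contrib OneDimensionalNonWeightedTokenSettingsCodec are illustrative assumptions rather than part of the example above; only the ShingleMatrixFilter signature is taken from it.

        public void BuildShingles()
        {
            // Whitespace-tokenize a sentence, then emit 2..3-token shingles joined with '_'.
            TokenStream source = new WhitespaceTokenizer(new StringReader("please divide this sentence"));
            TokenStream shingles = new ShingleMatrixFilter(source, 2, 3, '_', false,
                                                           new OneDimensionalNonWeightedTokenSettingsCodec());

            ITermAttribute term = shingles.AddAttribute<ITermAttribute>();
            while (shingles.IncrementToken())
            {
                Console.WriteLine(term.Term); // e.g. "please_divide", "please_divide_this", ...
            }
        }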
Example #4
        public virtual void TestBoth()
        {
            ISet <string> untoks = new HashSet <string>();

            untoks.Add(WikipediaTokenizer.CATEGORY);
            untoks.Add(WikipediaTokenizer.ITALICS);
            string test = "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h   i   j]]";
            // should output all the individual tokens plus the untokenized tokens as well.
            WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test), WikipediaTokenizer.BOTH, untoks);

            AssertTokenStreamContents(tf,
                new string[] { "a b c d", "a", "b", "c", "d", "e f g", "e", "f", "g", "link", "here", "link", "there", "italics here", "italics", "here", "something", "more italics", "more", "italics", "h   i   j", "h", "i", "j" },
                new int[] { 11, 11, 13, 15, 17, 32, 32, 34, 36, 42, 47, 56, 61, 71, 71, 79, 86, 98, 98, 103, 124, 124, 128, 132 },
                new int[] { 18, 12, 14, 16, 18, 37, 33, 35, 37, 46, 51, 60, 66, 83, 78, 83, 95, 110, 102, 110, 133, 125, 129, 133 },
                new int[] { 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1 });

            // now check the flags, TODO: add way to check flags from BaseTokenStreamTestCase?
            tf = new WikipediaTokenizer(new StringReader(test), WikipediaTokenizer.BOTH, untoks);
            int[]           expectedFlags = new int[] { WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, 0, 0, 0, 0, WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, 0, 0, 0, 0, 0, 0, 0, WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, 0, 0, 0, WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, 0, 0, WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, 0, 0, 0 };
            IFlagsAttribute flagsAtt      = tf.AddAttribute <IFlagsAttribute>();

            tf.Reset();
            for (int i = 0; i < expectedFlags.Length; i++)
            {
                assertTrue(tf.IncrementToken());
                assertEquals("flags " + i, expectedFlags[i], flagsAtt.Flags);
            }
            assertFalse(tf.IncrementToken());
            tf.Dispose();
        }
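Outside a test, the same UNTOKENIZED_TOKEN_FLAG can be inspected with an ordinary consumer loop. The method name and sample text below are illustrative; the tokenizer constructor, constants, and attribute interfaces are the ones used in the test above (a Lucene.NET 4.8-style Reset/End/Dispose lifecycle is assumed).

        public virtual void PrintUntokenizedFlags()
        {
            ISet<string> untoks = new HashSet<string> { WikipediaTokenizer.CATEGORY, WikipediaTokenizer.ITALICS };
            WikipediaTokenizer tok = new WikipediaTokenizer(new StringReader("[[Category:a b]] plain text ''italics''"), WikipediaTokenizer.BOTH, untoks);
            ICharTermAttribute termAtt  = tok.AddAttribute <ICharTermAttribute>();
            IFlagsAttribute    flagsAtt = tok.AddAttribute <IFlagsAttribute>();

            tok.Reset();
            while (tok.IncrementToken())
            {
                // A set UNTOKENIZED_TOKEN_FLAG bit marks the whole-span tokens ("a b", "italics", ...).
                bool untokenized = (flagsAtt.Flags & WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG) != 0;
                Console.WriteLine($"{termAtt} untokenized={untokenized}");
            }
            tok.End();
            tok.Dispose();
        }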
Example #5
        public virtual void TestCaptureState()
        {
            // init a first instance
            AttributeSource    src     = new AttributeSource();
            ICharTermAttribute termAtt = src.AddAttribute <ICharTermAttribute>();
            ITypeAttribute     typeAtt = src.AddAttribute <ITypeAttribute>();

            termAtt.Append("TestTerm");
            typeAtt.Type = "TestType";
            int hashCode = src.GetHashCode();

            AttributeSource.State state = src.CaptureState();

            // modify the attributes
            termAtt.SetEmpty().Append("AnotherTestTerm");
            typeAtt.Type = "AnotherTestType";
            Assert.IsTrue(hashCode != src.GetHashCode(), "Hash code should be different");

            src.RestoreState(state);
            Assert.AreEqual(termAtt.ToString(), "TestTerm");
            Assert.AreEqual(typeAtt.Type, "TestType");
            Assert.AreEqual(hashCode, src.GetHashCode(), "Hash code should be equal after restore");

            // restore into an exact configured copy
            AttributeSource copy = new AttributeSource();

            copy.AddAttribute <ICharTermAttribute>();
            copy.AddAttribute <ITypeAttribute>();
            copy.RestoreState(state);
            Assert.AreEqual(src.GetHashCode(), copy.GetHashCode(), "Both AttributeSources should have same hashCode after restore");
            Assert.AreEqual(src, copy, "Both AttributeSources should be equal after restore");

            // init a second instance (with attributes in different order and one additional attribute)
            AttributeSource src2 = new AttributeSource();

            typeAtt = src2.AddAttribute <ITypeAttribute>();
            IFlagsAttribute flagsAtt = src2.AddAttribute <IFlagsAttribute>();

            termAtt        = src2.AddAttribute <ICharTermAttribute>();
            flagsAtt.Flags = 12345;

            src2.RestoreState(state);
            Assert.AreEqual(termAtt.ToString(), "TestTerm");
            Assert.AreEqual(typeAtt.Type, "TestType");
            Assert.AreEqual(12345, flagsAtt.Flags, "FlagsAttribute should not be touched");

            // init a third instance missing one Attribute
            AttributeSource src3 = new AttributeSource();

            termAtt = src3.AddAttribute <ICharTermAttribute>();
            try
            {
                src3.RestoreState(state);
                Assert.Fail("The third instance is missing the TypeAttribute, so restoreState() should throw IllegalArgumentException");
            }
            catch (Exception iae) when(iae.IsIllegalArgumentException())
            {
                // pass
            }
        }
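The capture/restore pattern exercised by this test is what token filters typically use to snapshot a token's attribute values and replay them later. A minimal sketch of that pattern follows; the filter class and its replay policy are illustrative, assuming Lucene.NET 4.8's TokenFilter with its protected m_input field.

        internal sealed class RepeatFirstTokenFilter : TokenFilter
        {
            private AttributeSource.State first; // snapshot of the first token's attribute values
            private bool replayed;

            public RepeatFirstTokenFilter(TokenStream input) : base(input) { }

            public override bool IncrementToken()
            {
                if (m_input.IncrementToken())
                {
                    if (first == null)
                    {
                        first = CaptureState(); // copy all current attribute values aside
                    }
                    return true;
                }
                if (first != null && !replayed)
                {
                    RestoreState(first); // write the snapshot back into the shared attributes
                    replayed = true;
                    return true;
                }
                return false;
            }

            public override void Reset()
            {
                base.Reset();
                first = null;
                replayed = false;
            }
        }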
Example #6
 public OpenNLPChunkerFilter(TokenStream input, NLPChunkerOp chunkerOp)
     : base(input)
 {
     this.chunkerOp = chunkerOp;
     this.typeAtt   = AddAttribute <ITypeAttribute>();
     this.flagsAtt  = AddAttribute <IFlagsAttribute>();
     this.termAtt   = AddAttribute <ICharTermAttribute>();
 }
Example #7
        void Init(string content)
        {
            _enumerationPositions = _aufzaehlungDetector.FindAufzaehlungsspans(content).ToArray();
            _offsetAttribute = AddAttribute<IOffsetAttribute>();
            _flagsAttribute = AddAttribute<IFlagsAttribute>();

            _isInitialized = true;
        }
Example #8
 public OpenNLPPOSFilter(TokenStream input, NLPPOSTaggerOp posTaggerOp)
     : base(input)
 {
     this.posTaggerOp = posTaggerOp;
     this.typeAtt     = AddAttribute <ITypeAttribute>();
     this.flagsAtt    = AddAttribute <IFlagsAttribute>();
     this.termAtt     = AddAttribute <ICharTermAttribute>();
 }
Example #9
 internal TestTokenStream() : base()
 {
     this.termAtt    = AddAttribute <ICharTermAttribute>();
     this.offsetAtt  = AddAttribute <IOffsetAttribute>();
     this.typeAtt    = AddAttribute <ITypeAttribute>();
     this.payloadAtt = AddAttribute <IPayloadAttribute>();
     this.posIncAtt  = AddAttribute <IPositionIncrementAttribute>();
     this.flagsAtt   = AddAttribute <IFlagsAttribute>();
 }
Example #10
        private int lemmaNum = 0;                   // lemma counter

        public OpenNLPLemmatizerFilter(TokenStream input, NLPLemmatizerOp lemmatizerOp)
            : base(input)
        {
            this.lemmatizerOp = lemmatizerOp;
            this.termAtt      = AddAttribute <ICharTermAttribute>();
            this.typeAtt      = AddAttribute <ITypeAttribute>();
            this.keywordAtt   = AddAttribute <IKeywordAttribute>();
            this.flagsAtt     = AddAttribute <IFlagsAttribute>();
        }
Example #11
 internal TestTokenStream()
 {
     termAtt    = AddAttribute <ITermAttribute>();
     offsetAtt  = AddAttribute <IOffsetAttribute>();
     typeAtt    = AddAttribute <ITypeAttribute>();
     payloadAtt = AddAttribute <IPayloadAttribute>();
     posIncAtt  = AddAttribute <IPositionIncrementAttribute>();
     flagsAtt   = AddAttribute <IFlagsAttribute>();
 }
Example #12
 public IterTokenStream(params Token[] tokens) : base()
 {
     this.tokens = tokens;
     termAtt     = AddAttribute <ICharTermAttribute>();
     offsetAtt   = AddAttribute <IOffsetAttribute>();
     posIncAtt   = AddAttribute <IPositionIncrementAttribute>();
     flagsAtt    = AddAttribute <IFlagsAttribute>();
     typeAtt     = AddAttribute <ITypeAttribute>();
     payloadAtt  = AddAttribute <IPayloadAttribute>();
 }
Example #13
 public TokenListStream(ICollection <Token> tokens)
 {
     _tokens     = tokens;
     _termAtt    = AddAttribute <ITermAttribute>();
     _posIncrAtt = AddAttribute <IPositionIncrementAttribute>();
     _payloadAtt = AddAttribute <IPayloadAttribute>();
     _offsetAtt  = AddAttribute <IOffsetAttribute>();
     _typeAtt    = AddAttribute <ITypeAttribute>();
     _flagsAtt   = AddAttribute <IFlagsAttribute>();
 }
Example #14
 public IterTokenStream(params Token[] tokens)
         : base()
 {
     this.tokens = tokens;
     this.termAtt = AddAttribute<ICharTermAttribute>();
     this.offsetAtt = AddAttribute<IOffsetAttribute>();
     this.posIncAtt = AddAttribute<IPositionIncrementAttribute>();
     this.flagsAtt = AddAttribute<IFlagsAttribute>();
     this.typeAtt = AddAttribute<ITypeAttribute>();
     this.payloadAtt = AddAttribute<IPayloadAttribute>();
 }
Example #15
 public OpenNLPTokenizer(AttributeFactory factory, TextReader reader, NLPSentenceDetectorOp sentenceOp, NLPTokenizerOp tokenizerOp) // LUCENENET: Added reader param for compatibility with 4.8 - remove when upgrading
     : base(factory, reader, new OpenNLPSentenceBreakIterator(sentenceOp))
 {
     if (sentenceOp == null || tokenizerOp == null)
     {
         throw new ArgumentException("OpenNLPTokenizer: both a Sentence Detector and a Tokenizer are required");
     }
     this.sentenceOp  = sentenceOp;
     this.tokenizerOp = tokenizerOp;
     this.termAtt     = AddAttribute <ICharTermAttribute>();
     this.flagsAtt    = AddAttribute <IFlagsAttribute>();
     this.offsetAtt   = AddAttribute <IOffsetAttribute>();
 }
Example #16
 private void Init(int tokenOutput, IEnumerable <string> untokenizedTypes)
 {
     // TODO: cutover to enum
     if (tokenOutput != TOKENS_ONLY && tokenOutput != UNTOKENIZED_ONLY && tokenOutput != BOTH)
     {
         throw new System.ArgumentException("tokenOutput must be TOKENS_ONLY, UNTOKENIZED_ONLY or BOTH");
     }
     this.tokenOutput      = tokenOutput;
     this.untokenizedTypes = untokenizedTypes;
     offsetAtt             = AddAttribute <IOffsetAttribute>();
     typeAtt    = AddAttribute <ITypeAttribute>();
     posIncrAtt = AddAttribute <IPositionIncrementAttribute>();
     termAtt    = AddAttribute <ICharTermAttribute>();
     flagsAtt   = AddAttribute <IFlagsAttribute>();
 }
Example #17
 private void Init(int tokenOutput, ICollection <string> untokenizedTypes)
 {
     // TODO: cutover to enum
     if (tokenOutput != TOKENS_ONLY && tokenOutput != UNTOKENIZED_ONLY && tokenOutput != BOTH)
     {
         throw new ArgumentOutOfRangeException(nameof(tokenOutput), "tokenOutput must be TOKENS_ONLY, UNTOKENIZED_ONLY or BOTH"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
     }
     this.tokenOutput      = tokenOutput;
     this.untokenizedTypes = untokenizedTypes;
     offsetAtt             = AddAttribute <IOffsetAttribute>();
     typeAtt    = AddAttribute <ITypeAttribute>();
     posIncrAtt = AddAttribute <IPositionIncrementAttribute>();
     termAtt    = AddAttribute <ICharTermAttribute>();
     flagsAtt   = AddAttribute <IFlagsAttribute>();
 }
Example #18
 public OpenNLPTokenizer(AttributeFactory factory, TextReader reader, NLPSentenceDetectorOp sentenceOp, NLPTokenizerOp tokenizerOp) // LUCENENET: Added reader param for compatibility with 4.8 - remove when upgrading
     : base(factory, reader, new OpenNLPSentenceBreakIterator(sentenceOp))
 {
     // LUCENENET specific - changed from IllegalArgumentException to ArgumentNullException (.NET convention) and refactored to throw on each one separately
     if (sentenceOp is null)
     {
         throw new ArgumentNullException(nameof(sentenceOp), "OpenNLPTokenizer: both a Sentence Detector and a Tokenizer are required");
     }
     if (tokenizerOp is null)
     {
         throw new ArgumentNullException(nameof(tokenizerOp), "OpenNLPTokenizer: both a Sentence Detector and a Tokenizer are required");
     }
     //this.sentenceOp = sentenceOp; // LUCENENET: Never read
     this.tokenizerOp = tokenizerOp;
     this.termAtt     = AddAttribute <ICharTermAttribute>();
     this.flagsAtt    = AddAttribute <IFlagsAttribute>();
     this.offsetAtt   = AddAttribute <IOffsetAttribute>();
 }
Example #19
        public void TestFilterTokens()
        {
            SnowballFilter              filter     = new SnowballFilter(new TestTokenStream(), "English");
            ITermAttribute              termAtt    = filter.GetAttribute <ITermAttribute>();
            IOffsetAttribute            offsetAtt  = filter.GetAttribute <IOffsetAttribute>();
            ITypeAttribute              typeAtt    = filter.GetAttribute <ITypeAttribute>();
            IPayloadAttribute           payloadAtt = filter.GetAttribute <IPayloadAttribute>();
            IPositionIncrementAttribute posIncAtt  = filter.GetAttribute <IPositionIncrementAttribute>();
            IFlagsAttribute             flagsAtt   = filter.GetAttribute <IFlagsAttribute>();

            filter.IncrementToken();

            Assert.AreEqual("accent", termAtt.Term);
            Assert.AreEqual(2, offsetAtt.StartOffset);
            Assert.AreEqual(7, offsetAtt.EndOffset);
            Assert.AreEqual("wrd", typeAtt.Type);
            Assert.AreEqual(3, posIncAtt.PositionIncrement);
            Assert.AreEqual(77, flagsAtt.Flags);
            Assert.AreEqual(new Payload(new byte[] { 0, 1, 2, 3 }), payloadAtt.Payload);
        }
Example #20
        public virtual void TestFilterTokens()
        {
            SnowballFilter              filter     = new SnowballFilter(new TestTokenStream(this), "English");
            ICharTermAttribute          termAtt    = filter.GetAttribute <ICharTermAttribute>();
            IOffsetAttribute            offsetAtt  = filter.GetAttribute <IOffsetAttribute>();
            ITypeAttribute              typeAtt    = filter.GetAttribute <ITypeAttribute>();
            IPayloadAttribute           payloadAtt = filter.GetAttribute <IPayloadAttribute>();
            IPositionIncrementAttribute posIncAtt  = filter.GetAttribute <IPositionIncrementAttribute>();
            IFlagsAttribute             flagsAtt   = filter.GetAttribute <IFlagsAttribute>();

            filter.IncrementToken();

            assertEquals("accent", termAtt.ToString());
            assertEquals(2, offsetAtt.StartOffset());
            assertEquals(7, offsetAtt.EndOffset());
            assertEquals("wrd", typeAtt.Type);
            assertEquals(3, posIncAtt.PositionIncrement);
            assertEquals(77, flagsAtt.Flags);
            assertEquals(new BytesRef(new byte[] { 0, 1, 2, 3 }), payloadAtt.Payload);
        }
Example #21
        public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix)
            : base(suffix)
        {
            this.suffix = suffix;
            this.prefix = prefix;
            prefixExhausted = false;

            termAtt = AddAttribute<ICharTermAttribute>();
            posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            payloadAtt = AddAttribute<IPayloadAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
            typeAtt = AddAttribute<ITypeAttribute>();
            flagsAtt = AddAttribute<IFlagsAttribute>();

            p_termAtt = prefix.AddAttribute<ICharTermAttribute>();
            p_posIncrAtt = prefix.AddAttribute<IPositionIncrementAttribute>();
            p_payloadAtt = prefix.AddAttribute<IPayloadAttribute>();
            p_offsetAtt = prefix.AddAttribute<IOffsetAttribute>();
            p_typeAtt = prefix.AddAttribute<ITypeAttribute>();
            p_flagsAtt = prefix.AddAttribute<IFlagsAttribute>();
        }
Example #22
        public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix)
            : base(suffix)
        {
            this.suffix     = suffix;
            this.prefix     = prefix;
            prefixExhausted = false;

            termAtt    = AddAttribute <ICharTermAttribute>();
            posIncrAtt = AddAttribute <IPositionIncrementAttribute>();
            payloadAtt = AddAttribute <IPayloadAttribute>();
            offsetAtt  = AddAttribute <IOffsetAttribute>();
            typeAtt    = AddAttribute <ITypeAttribute>();
            flagsAtt   = AddAttribute <IFlagsAttribute>();

            p_termAtt    = prefix.AddAttribute <ICharTermAttribute>();
            p_posIncrAtt = prefix.AddAttribute <IPositionIncrementAttribute>();
            p_payloadAtt = prefix.AddAttribute <IPayloadAttribute>();
            p_offsetAtt  = prefix.AddAttribute <IOffsetAttribute>();
            p_typeAtt    = prefix.AddAttribute <ITypeAttribute>();
            p_flagsAtt   = prefix.AddAttribute <IFlagsAttribute>();
        }
Example #23
        public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix) : base(suffix)
        {
            Suffix = suffix;
            Prefix = prefix;
            _prefixExhausted = false;

            // ReSharper disable DoNotCallOverridableMethodsInConstructor
            _termAtt = AddAttribute<ITermAttribute>();
            _posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            _payloadAtt = AddAttribute<IPayloadAttribute>();
            _offsetAtt = AddAttribute<IOffsetAttribute>();
            _typeAtt = AddAttribute<ITypeAttribute>();
            _flagsAtt = AddAttribute<IFlagsAttribute>();
            // ReSharper restore DoNotCallOverridableMethodsInConstructor

            _pTermAtt = prefix.AddAttribute<ITermAttribute>();
            _pPosIncrAtt = prefix.AddAttribute<IPositionIncrementAttribute>();
            _pPayloadAtt = prefix.AddAttribute<IPayloadAttribute>();
            _pOffsetAtt = prefix.AddAttribute<IOffsetAttribute>();
            _pTypeAtt = prefix.AddAttribute<ITypeAttribute>();
            _pFlagsAtt = prefix.AddAttribute<IFlagsAttribute>();
        }
Example #24
        public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix) : base(suffix)
        {
            Suffix           = suffix;
            Prefix           = prefix;
            _prefixExhausted = false;

            // ReSharper disable DoNotCallOverridableMethodsInConstructor
            _termAtt    = AddAttribute <ITermAttribute>();
            _posIncrAtt = AddAttribute <IPositionIncrementAttribute>();
            _payloadAtt = AddAttribute <IPayloadAttribute>();
            _offsetAtt  = AddAttribute <IOffsetAttribute>();
            _typeAtt    = AddAttribute <ITypeAttribute>();
            _flagsAtt   = AddAttribute <IFlagsAttribute>();
            // ReSharper restore DoNotCallOverridableMethodsInConstructor

            _pTermAtt    = prefix.AddAttribute <ITermAttribute>();
            _pPosIncrAtt = prefix.AddAttribute <IPositionIncrementAttribute>();
            _pPayloadAtt = prefix.AddAttribute <IPayloadAttribute>();
            _pOffsetAtt  = prefix.AddAttribute <IOffsetAttribute>();
            _pTypeAtt    = prefix.AddAttribute <ITypeAttribute>();
            _pFlagsAtt   = prefix.AddAttribute <IFlagsAttribute>();
        }
Example #25
        public virtual void TestCloneAttributes()
        {
            AttributeSource src      = new AttributeSource();
            IFlagsAttribute flagsAtt = src.AddAttribute <IFlagsAttribute>();
            ITypeAttribute  typeAtt  = src.AddAttribute <ITypeAttribute>();

            flagsAtt.Flags = 1234;
            typeAtt.Type   = "TestType";

            AttributeSource    clone = src.CloneAttributes();
            IEnumerator <Type> it    = clone.GetAttributeClassesEnumerator();

            it.MoveNext();
            Assert.AreEqual(typeof(IFlagsAttribute), it.Current, "FlagsAttribute must be the first attribute");
            it.MoveNext();
            Assert.AreEqual(typeof(ITypeAttribute), it.Current, "TypeAttribute must be the second attribute");
            Assert.IsFalse(it.MoveNext(), "No more attributes");

            IFlagsAttribute flagsAtt2 = clone.GetAttribute <IFlagsAttribute>();
            ITypeAttribute  typeAtt2  = clone.GetAttribute <ITypeAttribute>();

            Assert.AreNotSame(flagsAtt2, flagsAtt, "FlagsAttribute of original and clone must be different instances");
            Assert.AreNotSame(typeAtt2, typeAtt, "TypeAttribute of original and clone must be different instances");
            Assert.AreEqual(flagsAtt2, flagsAtt, "FlagsAttribute of original and clone must be equal");
            Assert.AreEqual(typeAtt2, typeAtt, "TypeAttribute of original and clone must be equal");

            // test copy back
            flagsAtt2.Flags = 4711;
            typeAtt2.Type   = "OtherType";
            clone.CopyTo(src);
            Assert.AreEqual(4711, flagsAtt.Flags, "FlagsAttribute of original must now contain updated term");
            Assert.AreEqual(typeAtt.Type, "OtherType", "TypeAttribute of original must now contain updated type");
            // verify again:
            Assert.AreNotSame(flagsAtt2, flagsAtt, "FlagsAttribute of original and clone must be different instances");
            Assert.AreNotSame(typeAtt2, typeAtt, "TypeAttribute of original and clone must be different instances");
            Assert.AreEqual(flagsAtt2, flagsAtt, "FlagsAttribute of original and clone must be equal");
            Assert.AreEqual(typeAtt2, typeAtt, "TypeAttribute of original and clone must be equal");
        }
Example #26
        /// <summary>
        /// Creates a shingle filter with ad hoc parameter settings.
        /// </summary>
        /// <param name="input">stream from which to construct the matrix</param>
        /// <param name="minimumShingleSize">minimum number of tokens in any shingle.</param>
        /// <param name="maximumShingleSize">maximum number of tokens in any shingle.</param>
        /// <param name="spacerCharacter">character to use between texts of the token parts in a shingle. null for none.</param>
        /// <param name="ignoringSinglePrefixOrSuffixShingle">if true, shingles that only contains permutation of the first of the last column will not be produced as shingles. Useful when adding boundary marker tokens such as '^' and '$'.</param>
        /// <param name="settingsCodec">codec used to read input token weight and matrix positioning.</param>
        public ShingleMatrixFilter(TokenStream input, int minimumShingleSize, int maximumShingleSize, Char? spacerCharacter, bool ignoringSinglePrefixOrSuffixShingle, TokenSettingsCodec settingsCodec)
        {
            _input = input;
            MinimumShingleSize = minimumShingleSize;
            MaximumShingleSize = maximumShingleSize;
            SpacerCharacter = spacerCharacter;
            IsIgnoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
            _settingsCodec = settingsCodec;

            // ReSharper disable DoNotCallOverridableMethodsInConstructor
            _termAtt = AddAttribute<ITermAttribute>();
            _posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            _payloadAtt = AddAttribute<IPayloadAttribute>();
            _offsetAtt = AddAttribute<IOffsetAttribute>();
            _typeAtt = AddAttribute<ITypeAttribute>();
            _flagsAtt = AddAttribute<IFlagsAttribute>();
            // ReSharper restore DoNotCallOverridableMethodsInConstructor

            _inTermAtt = input.AddAttribute<ITermAttribute>();
            _inPosIncrAtt = input.AddAttribute<IPositionIncrementAttribute>();
            _inPayloadAtt = input.AddAttribute<IPayloadAttribute>();
            _inOffsetAtt = input.AddAttribute<IOffsetAttribute>();
            _inTypeAtt = input.AddAttribute<ITypeAttribute>();
            _inFlagsAtt = input.AddAttribute<IFlagsAttribute>();
        }
Example #27
 public TokenListStream(ICollection<Token> tokens)
 {
     _tokens = tokens;
     _termAtt = AddAttribute<ITermAttribute>();
     _posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
     _payloadAtt = AddAttribute<IPayloadAttribute>();
     _offsetAtt = AddAttribute<IOffsetAttribute>();
     _typeAtt = AddAttribute<ITypeAttribute>();
     _flagsAtt = AddAttribute<IFlagsAttribute>();
 }
Example #28
 internal TestTokenStream()
 {
     termAtt = AddAttribute<ITermAttribute>();
     offsetAtt = AddAttribute<IOffsetAttribute>();
     typeAtt = AddAttribute<ITypeAttribute>();
     payloadAtt = AddAttribute<IPayloadAttribute>();
     posIncAtt = AddAttribute<IPositionIncrementAttribute>();
     flagsAtt = AddAttribute<IFlagsAttribute>();
 }
Example #29
        protected CompoundWordTokenFilterBase(TokenStream input, ISet<string> dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, bool onlyLongestMatch)
            : base(input)
        {
            this.tokens = new LinkedList<Token>();
            this.minWordSize = minWordSize;
            this.minSubwordSize = minSubwordSize;
            this.maxSubwordSize = maxSubwordSize;
            this.onlyLongestMatch = onlyLongestMatch;

            if (dictionary is CharArraySet)
            {
                this.dictionary = (CharArraySet)dictionary;
            }
            else
            {
                this.dictionary = new CharArraySet(dictionary.Count, false);
                AddAllLowerCase(this.dictionary, dictionary);
            }

            termAtt = AddAttribute<ITermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
            flagsAtt = AddAttribute<IFlagsAttribute>();
            posIncAtt = AddAttribute<IPositionIncrementAttribute>();
            typeAtt = AddAttribute<ITypeAttribute>();
            payloadAtt = AddAttribute<IPayloadAttribute>();
        }
Example #30
 private void Init(int tokenOutput, IEnumerable<string> untokenizedTypes)
 {
     // TODO: cutover to enum
     if (tokenOutput != TOKENS_ONLY && tokenOutput != UNTOKENIZED_ONLY && tokenOutput != BOTH)
     {
         throw new System.ArgumentException("tokenOutput must be TOKENS_ONLY, UNTOKENIZED_ONLY or BOTH");
     }
     this.tokenOutput = tokenOutput;
     this.untokenizedTypes = untokenizedTypes;
     offsetAtt = AddAttribute<IOffsetAttribute>();
     typeAtt = AddAttribute<ITypeAttribute>();
     posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
     termAtt = AddAttribute<ICharTermAttribute>();
     flagsAtt = AddAttribute<IFlagsAttribute>();
 }
Example #31
        /// <summary>
        /// Creates a shingle filter based on a user defined matrix.
        /// 
        /// The filter /will/ delete columns from the input matrix! You will not be able to reset the filter if you used this constructor.
        /// todo: don't touch the matrix! Use a bool, set the input stream to null or something, and keep track of where we are in the matrix.
        /// 
        /// </summary>
        /// <param name="matrix">the input based for creating shingles. Does not need to contain any information until ShingleMatrixFilter.IncrementToken() is called the first time.</param>
        /// <param name="minimumShingleSize">minimum number of tokens in any shingle.</param>
        /// <param name="maximumShingleSize">maximum number of tokens in any shingle.</param>
        /// <param name="spacerCharacter">character to use between texts of the token parts in a shingle. null for none.</param>
        /// <param name="ignoringSinglePrefixOrSuffixShingle">if true, shingles that only contains permutation of the first of the last column will not be produced as shingles. Useful when adding boundary marker tokens such as '^' and '$'.</param>
        /// <param name="settingsCodec">codec used to read input token weight and matrix positioning.</param>
        public ShingleMatrixFilter(Matrix.Matrix matrix, int minimumShingleSize, int maximumShingleSize, Char spacerCharacter, bool ignoringSinglePrefixOrSuffixShingle, TokenSettingsCodec settingsCodec)
        {
            Matrix = matrix;
            MinimumShingleSize = minimumShingleSize;
            MaximumShingleSize = maximumShingleSize;
            SpacerCharacter = spacerCharacter;
            IsIgnoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
            _settingsCodec = settingsCodec;

            // ReSharper disable DoNotCallOverridableMethodsInConstructor
            _termAtt = AddAttribute<ITermAttribute>();
            _posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            _payloadAtt = AddAttribute<IPayloadAttribute>();
            _offsetAtt = AddAttribute<IOffsetAttribute>();
            _typeAtt = AddAttribute<ITypeAttribute>();
            _flagsAtt = AddAttribute<IFlagsAttribute>();
            // ReSharper restore DoNotCallOverridableMethodsInConstructor

            // Set the input to an empty token stream; we already have the data.
            _input = new EmptyTokenStream();

            _inTermAtt = _input.AddAttribute<ITermAttribute>();
            _inPosIncrAtt = _input.AddAttribute<IPositionIncrementAttribute>();
            _inPayloadAtt = _input.AddAttribute<IPayloadAttribute>();
            _inOffsetAtt = _input.AddAttribute<IOffsetAttribute>();
            _inTypeAtt = _input.AddAttribute<ITypeAttribute>();
            _inFlagsAtt = _input.AddAttribute<IFlagsAttribute>();
        }
Example #32
        public override void CopyTo(Attribute target)
        {
            IFlagsAttribute t = (IFlagsAttribute)target;

            t.Flags = flags;
        }
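This CopyTo override is the last piece of a FlagsAttribute implementation. The sketch below shows where it fits in the surrounding class, assuming the Lucene.NET 3.x attribute base type; treat it as an illustration rather than the shipped source.

        // "Attribute" here is Lucene.Net.Util.Attribute (the attribute implementation base), not System.Attribute.
        public class FlagsAttribute : Attribute, IFlagsAttribute
        {
            private int flags = 0;

            // Arbitrary per-token bit set that downstream filters can inspect via IFlagsAttribute.
            public virtual int Flags
            {
                get { return flags; }
                set { flags = value; }
            }

            public override void Clear()
            {
                flags = 0;
            }

            public override bool Equals(object other)
            {
                if (this == other) return true;
                return other is FlagsAttribute && ((FlagsAttribute)other).flags == flags;
            }

            public override int GetHashCode()
            {
                return flags;
            }

            public override void CopyTo(Attribute target)
            {
                IFlagsAttribute t = (IFlagsAttribute)target;
                t.Flags = flags;
            }
        }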
Example #33
 public FlagFilter(int flags, TokenStream input)
     : base(input)
 {
     _flagsAttribute = AddAttribute<IFlagsAttribute>();
     _flags = flags;
 }
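The constructor above only wires up the attribute; a plausible IncrementToken for such a filter ORs the configured bits onto every token it passes through. The body below is an assumption about the filter's intent, not its original implementation, and it presumes a Lucene.NET version that exposes the wrapped stream as m_input (older versions name the field input).

 public override bool IncrementToken()
 {
     if (!m_input.IncrementToken())
     {
         return false;
     }
     // Merge the configured flag bits into whatever flags the current token already carries.
     _flagsAttribute.Flags |= _flags;
     return true;
 }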
Example #34
 internal TestTokenStream(TestSnowball outerInstance) : base()
 {
     this.outerInstance = outerInstance;
     this.termAtt = AddAttribute<ICharTermAttribute>();
     this.offsetAtt = AddAttribute<IOffsetAttribute>();
     this.typeAtt = AddAttribute<ITypeAttribute>();
     this.payloadAtt = AddAttribute<IPayloadAttribute>();
     this.posIncAtt = AddAttribute<IPositionIncrementAttribute>();
     this.flagsAtt = AddAttribute<IFlagsAttribute>();
 }