Example #1
 /// <summary> Build a filter that removes words that are too long or too
 /// short from the text.
 /// </summary>
 public LengthFilter(TokenStream in_Renamed, int min, int max)
     : base(in_Renamed)
 {
     this.min = min;
     this.max = max;
     termAtt  = AddAttribute <ITermAttribute>();
 }
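For context, a minimal sketch of how a caller might drive this filter; the WhitespaceTokenizer and sample text are illustrative, not part of the example above:

 // Assumes using Lucene.Net.Analysis and Lucene.Net.Analysis.Tokenattributes.
 // Keep only whitespace-delimited tokens between 3 and 10 characters long.
 TokenStream ts = new LengthFilter(
     new WhitespaceTokenizer(new StringReader("a quick brownish fox")), 3, 10);
 ITermAttribute termAtt = ts.GetAttribute<ITermAttribute>();
 while (ts.IncrementToken())
 {
     Console.WriteLine(termAtt.Term); // quick, brownish, fox
 }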
Example #2
 private void InitBlock(AnonymousClassAnalyzer1 enclosingInstance)
 {
     this.enclosingInstance = enclosingInstance;
     termAtt = AddAttribute<ITermAttribute>();
     payloadAtt = AddAttribute<IPayloadAttribute>();
     posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
 }
Example #3
 /// <summary>
 /// Builds a GermanStemFilter that uses an exclusiontable.
 /// </summary>
 /// <param name="_in"></param>
 /// <param name="exclusiontable"></param>
 /// <param name="normalizeDin2">Specifies if the DIN-2007-2 style stemmer should be used in addition to DIN1.  This
 /// will cause words with 'ae', 'ue', or 'oe' in them (expanded umlauts) to be first converted to 'a', 'u', and 'o'
 /// respectively, before the DIN1 stemmer is invoked.</param>
 public GermanStemFilter(TokenStream _in, ISet <string> exclusiontable, bool normalizeDin2)
     : base(_in)
 {
     exclusionSet = exclusiontable;
     stemmer      = normalizeDin2 ? new GermanDIN2Stemmer() : new GermanStemmer();
     termAtt      = AddAttribute <ITermAttribute>();
 }
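Given the normalizeDin2 behaviour described above, a hedged usage sketch (the input word and the empty exclusion set are illustrative):

 // With normalizeDin2 = true, "haeuser" is first folded to "hauser"
 // before the DIN1 stemmer runs.
 ISet<string> exclusions = new HashSet<string>();
 TokenStream ts = new GermanStemFilter(
     new WhitespaceTokenizer(new StringReader("haeuser")), exclusions, true);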
Example #4
        /// <summary>
        /// Stems the given word; stop words are supported.
        /// </summary>
        /// <param name="word"></param>
        /// <param name="language"></param>
        /// <returns></returns>
        public static string SnowballWord(string word, string language)
        {
            string result  = null;
            string stemmer = SnowballDict.GetStemmer(language);

            if (stemmer == null)
            {
                result = word;
            }
            else
            {
                using (SnowballAnalyzer snowball = new SnowballAnalyzer(Lucene.Net.Util.Version.LUCENE_30, stemmer, StopWord.StopWordList))
                {
                    using (TokenStream ts = snowball.ReusableTokenStream("", new StringReader(word))) // only the token text is needed, so no field name is used
                    {
                        while (ts.IncrementToken())
                        {
                            ITermAttribute attribute = ts.GetAttribute <ITermAttribute>();
                            result = attribute.Term;
                        }
                    }
                }
            }
            return(result);
        }
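A hypothetical call; whether a stemmer is found depends on what the project's SnowballDict maps for the language key, so the output shown is an assumption:

            // If SnowballDict.GetStemmer("English") resolves, "jumping" stems to "jump";
            // otherwise SnowballWord returns the word unchanged.
            string stem = SnowballWord("jumping", "English");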
Example #5
 private void  Init(int bufferSize)
 {
     this.done = false;
     termAtt   = AddAttribute <ITermAttribute>();
     offsetAtt = AddAttribute <IOffsetAttribute>();
     termAtt.ResizeTermBuffer(bufferSize);
 }
Example #6
        /// <summary> Adds term frequencies found by tokenizing text from reader into the Map words</summary>
        /// <param name="r">a source of text to be tokenized
        /// </param>
        /// <param name="termFreqMap">a Map of terms and their frequencies
        /// </param>
        /// <param name="fieldName">Used by analyzer for any special per-field analysis
        /// </param>
        protected void AddTermFrequencies(System.IO.TextReader r, IDictionary <string, Int> termFreqMap, System.String fieldName)
        {
            TokenStream ts         = analyzer.TokenStream(fieldName, r);
            int         tokenCount = 0;
            // for every token
            ITermAttribute termAtt = ts.AddAttribute <ITermAttribute>();

            while (ts.IncrementToken())
            {
                string word = termAtt.Term;
                tokenCount++;
                if (tokenCount > maxNumTokensParsed)
                {
                    break;
                }
                if (IsNoiseWord(word))
                {
                    continue;
                }

                // increment frequency; the indexer would throw on a missing key,
                // so probe with TryGetValue instead
                Int cnt;
                if (!termFreqMap.TryGetValue(word, out cnt))
                {
                    termFreqMap[word] = new Int();
                }
                else
                {
                    cnt.x++;
                }
            }
        }
Example #7
 public AddSuffixFilter(TokenStream input, Dictionary <string, char[]> _suffixByTokenType)
     : base(input)
 {
     termAtt = AddAttribute <ITermAttribute>();
     typeAtt = AddAttribute <ITypeAttribute>();
     this.suffixByTokenType = _suffixByTokenType;
 }
Example #8
        public virtual void  TestStopListPositions()
        {
            var stopWordsSet = Support.Compatibility.SetFactory.CreateHashSet <string>();

            stopWordsSet.Add("good");
            stopWordsSet.Add("test");
            stopWordsSet.Add("analyzer");
            var newStop = new StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet);
            var reader  = new System.IO.StringReader("This is a good test of the english stop analyzer with positions");

            int[]       expectedIncr = { 1, 1, 1, 3, 1, 1, 1, 2, 1 };
            TokenStream stream       = newStop.TokenStream("test", reader);

            Assert.NotNull(stream);
            int            i       = 0;
            ITermAttribute termAtt = stream.GetAttribute <ITermAttribute>();
            IPositionIncrementAttribute posIncrAtt = stream.AddAttribute <IPositionIncrementAttribute>();

            while (stream.IncrementToken())
            {
                string text = termAtt.Term;
                Assert.IsFalse(stopWordsSet.Contains(text));
                Assert.AreEqual(expectedIncr[i++], posIncrAtt.PositionIncrement);
            }
        }
Example #9
        public virtual void TestCloneAttributes()
        {
            AttributeSource src     = new AttributeSource();
            ITermAttribute  termAtt = src.AddAttribute <ITermAttribute>();
            ITypeAttribute  typeAtt = src.AddAttribute <ITypeAttribute>();

            termAtt.SetTermBuffer("TestTerm");
            typeAtt.Type = "TestType";

            AttributeSource clone = src.CloneAttributes();

            System.Collections.Generic.IEnumerator <Type> it = clone.GetAttributeTypesIterator().GetEnumerator();
            Assert.IsTrue(it.MoveNext());
            Assert.AreEqual(typeof(ITermAttribute), it.Current, "TermAttribute must be the first attribute");
            Assert.IsTrue(it.MoveNext());
            Assert.AreEqual(typeof(ITypeAttribute), it.Current, "TypeAttribute must be the second attribute");
            Assert.IsFalse(it.MoveNext(), "No more attributes");

            ITermAttribute termAtt2 = clone.GetAttribute <ITermAttribute>();
            ITypeAttribute typeAtt2 = clone.GetAttribute <ITypeAttribute>();

            Assert.IsFalse(ReferenceEquals(termAtt2, termAtt), "TermAttribute of original and clone must be different instances");
            Assert.IsFalse(ReferenceEquals(typeAtt2, typeAtt), "TypeAttribute of original and clone must be different instances");
            Assert.AreEqual(termAtt2, termAtt, "TermAttribute of original and clone must be equal");
            Assert.AreEqual(typeAtt2, typeAtt, "TypeAttribute of original and clone must be equal");
        }
Example #10
        /// <summary>
        /// Creates a shingle filter with ad hoc parameter settings.
        /// </summary>
        /// <param name="input">stream from which to construct the matrix</param>
        /// <param name="minimumShingleSize">minimum number of tokens in any shingle.</param>
        /// <param name="maximumShingleSize">maximum number of tokens in any shingle.</param>
        /// <param name="spacerCharacter">character to use between texts of the token parts in a shingle. null for none.</param>
        /// <param name="ignoringSinglePrefixOrSuffixShingle">if true, shingles that only contains permutation of the first of the last column will not be produced as shingles. Useful when adding boundary marker tokens such as '^' and '$'.</param>
        /// <param name="settingsCodec">codec used to read input token weight and matrix positioning.</param>
        public ShingleMatrixFilter(TokenStream input, int minimumShingleSize, int maximumShingleSize, Char?spacerCharacter, bool ignoringSinglePrefixOrSuffixShingle, TokenSettingsCodec settingsCodec)
        {
            _input             = input;
            MinimumShingleSize = minimumShingleSize;
            MaximumShingleSize = maximumShingleSize;
            SpacerCharacter    = spacerCharacter;
            IsIgnoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
            _settingsCodec = settingsCodec;

            // ReSharper disable DoNotCallOverridableMethodsInConstructor
            _termAtt    = AddAttribute <ITermAttribute>();
            _posIncrAtt = AddAttribute <IPositionIncrementAttribute>();
            _payloadAtt = AddAttribute <IPayloadAttribute>();
            _offsetAtt  = AddAttribute <IOffsetAttribute>();
            _typeAtt    = AddAttribute <ITypeAttribute>();
            _flagsAtt   = AddAttribute <IFlagsAttribute>();
            // ReSharper restore DoNotCallOverridableMethodsInConstructor

            _inTermAtt    = input.AddAttribute <ITermAttribute>();
            _inPosIncrAtt = input.AddAttribute <IPositionIncrementAttribute>();
            _inPayloadAtt = input.AddAttribute <IPayloadAttribute>();
            _inOffsetAtt  = input.AddAttribute <IOffsetAttribute>();
            _inTypeAtt    = input.AddAttribute <ITypeAttribute>();
            _inFlagsAtt   = input.AddAttribute <IFlagsAttribute>();
        }
Example #11
 /// <summary>
 /// Initialization constructor.
 /// </summary>
 /// <param name="tokenizer">CodeXCavator ITokenizer instance.</param>
 /// <param name="reader">Text reader, providing access to the text, from which tokens should be extracted.</param>
 /// <param name="caseMode">Defines, how the token stream should treat character case.</param>
 internal ITokenizerTokenStream(ITokenizer tokenizer, System.IO.TextReader reader, Case caseMode = Case.Sensitive) : base(reader)
 {
     mTermAttribute   = AddAttribute <ITermAttribute>();
     mOffsetAttribute = AddAttribute <IOffsetAttribute>();
     mTokenizer       = tokenizer;
     mCaseMode        = caseMode;
 }
Example #12
        /// <summary>
        /// Constructor for the Lucene Tokenizer adapter class.
        /// </summary>
        /// <param name="isMaxWordLength">When true, the tokenizer splits at maximum word length; when false, it uses the finest-grained segmentation.</param>
        public IKTokenizer(TextReader inreader, bool isMaxWordLength)
            : base(inreader)
        {
			offsetAtt = AddAttribute<IOffsetAttribute>();
            termAtt = AddAttribute<ITermAttribute>();
            _IKImplement = new IKSegmentation(inreader, isMaxWordLength);
        }
Example #13
        public void TestUnicode()
        {
            RussianAnalyzer ra = new RussianAnalyzer(Version.LUCENE_CURRENT);

            using (inWords = new StreamReader(@"ru\testUTF8.txt", Encoding.UTF8))
                using (sampleUnicode = new StreamReader(@"ru\resUTF8.txt", Encoding.UTF8))
                {
                    TokenStream _in = ra.TokenStream("all", inWords);

                    RussianLetterTokenizer sample =
                        new RussianLetterTokenizer(
                            sampleUnicode);

                    ITermAttribute text       = _in.GetAttribute <ITermAttribute>();
                    ITermAttribute sampleText = sample.GetAttribute <ITermAttribute>();

                    for (; ;)
                    {
                        if (_in.IncrementToken() == false)
                        {
                            break;
                        }

                        bool nextSampleToken = sample.IncrementToken();
                        Assert.AreEqual(text.Term, nextSampleToken == false ? null : sampleText.Term, "Unicode");
                    }
                }
        }
Example #14
        public override void  CopyTo(Attribute target)
        {
            InitTermBuffer();
            ITermAttribute t = (ITermAttribute)target;

            t.SetTermBuffer(termBuffer, 0, termLength);
        }
Example #15
        /// <summary>
        /// Shows complete tokenization information for the given content.
        /// </summary>
        /// <param name="content"></param>
        /// <param name="analzyer"></param>
        /// <returns></returns>
        public static List <TermInfo> TestTermAll(string content, Analyzer analzyer)
        {
            List <TermInfo> list = new List <TermInfo>();

            using (TokenStream tokenStream = analzyer.ReusableTokenStream("", new StringReader(content)))
            {
                //tokenStream.AddAttribute<ITermAttribute>();
                while (tokenStream.IncrementToken())
                {
                    ITermAttribute termAttribute = tokenStream.GetAttribute <ITermAttribute>();
                    IPositionIncrementAttribute positionIncrementAttribute = tokenStream.GetAttribute <IPositionIncrementAttribute>();
                    ITypeAttribute   typeAttribute   = tokenStream.GetAttribute <ITypeAttribute>();
                    IOffsetAttribute offsetAttribute = tokenStream.GetAttribute <IOffsetAttribute>();
                    TermInfo         obj             = new TermInfo();
                    //obj.FlagsAttribute = tokenStream.GetAttribute<IFlagsAttribute>().Flags.ToString();
                    //obj.PayloadAttribute = tokenStream.GetAttribute<IPayloadAttribute>().Payload.Length.ToString();
                    obj.TermAttribute              = termAttribute.Term;
                    obj.OffsetAttribute            = offsetAttribute.StartOffset.ToString() + "---" + offsetAttribute.EndOffset.ToString();
                    obj.PositionIncrementAttribute = positionIncrementAttribute.PositionIncrement.ToString();
                    obj.TypeAttribute              = typeAttribute.Type;
                    obj.TokenStream = tokenStream;
                    list.Add(obj);
                }
            }
            return(list);
        }
Example #16
        public override bool IncrementToken()
        {
            if (!input.IncrementToken())
            {
                return(false);
            }

            ITermAttribute   termAttribute   = this.GetAttribute <ITermAttribute>();
            ISourceAttribute sourceAttribute = GetAttribute <ISourceAttribute>();
            ISpellAttribute  spellAttribute  = GetAttribute <ISpellAttribute>();
            IStemAttribute   stemAttribute   = GetAttribute <IStemAttribute>();

            sourceAttribute.Term = termAttribute.Term;
            spellAttribute.Term  = termAttribute.Term;
            stemAttribute.Term   = termAttribute.Term;

            if (!SpellChecker.Exist(spellAttribute.Term))
            {
                var res = SpellChecker.SuggestSimilar(spellAttribute.Term, 100);
                if (res.Length != 0)
                {
                    spellAttribute.Term = res[0];
                }
            }

            termAttribute.SetTermBuffer(spellAttribute.Term);

            return(true);
        }
Example #17
 public SingleCharTokenizer(TokenStream input) : base(input)
 {
     _input                      = input;
     _termAttribute              = AddAttribute <ITermAttribute>();
     _offsetAttribute            = AddAttribute <IOffsetAttribute>();
     _positionIncrementAttribute = AddAttribute <IPositionIncrementAttribute>();
 }
Example #18
        public virtual void TestToStringAndMultiAttributeImplementations()
        {
            AttributeSource src     = new AttributeSource();
            ITermAttribute  termAtt = src.AddAttribute <ITermAttribute>();
            ITypeAttribute  typeAtt = src.AddAttribute <ITypeAttribute>();

            termAtt.SetTermBuffer("TestTerm");
            typeAtt.Type = "TestType";
            Assert.AreEqual("(" + typeAtt.ToString() + "," + termAtt.ToString() + ")", src.ToString(), "Attributes should appear in original order");
            System.Collections.Generic.IEnumerator <Attribute> it = src.GetAttributeImplsIterator().GetEnumerator();
            Assert.IsTrue(it.MoveNext(), "Iterator should have 2 attributes left");
            Assert.AreSame(typeAtt, it.Current, "First AttributeImpl from iterator should be typeAtt");
            Assert.IsTrue(it.MoveNext(), "Iterator should have 1 attributes left");
            Assert.AreSame(termAtt, it.Current, "Second AttributeImpl from iterator should be termAtt");
            Assert.IsFalse(it.MoveNext(), "Iterator should have 0 attributes left");

            src = new AttributeSource();
            src.AddAttributeImpl(new Token());
            // this should not add a new attribute as Token implements TermAttribute, too
            termAtt = src.AddAttribute <ITermAttribute>();
            Assert.IsTrue(termAtt is Token, "TermAttribute should be implemented by Token");
            // get the Token attribute and check, that it is the only one
            it = src.GetAttributeImplsIterator().GetEnumerator();
            Assert.IsTrue(it.MoveNext());
            Token tok = (Token)it.Current;

            Assert.IsFalse(it.MoveNext(), "There should be only one attribute implementation instance");

            termAtt.SetTermBuffer("TestTerm");
            Assert.AreEqual("(" + tok.ToString() + ")", src.ToString(), "Token should only printed once");
        }
Example #19
        public void Test()
        {
            String test = "The quick red fox jumped over the lazy brown dogs";

            NumericPayloadTokenFilter nptf = new NumericPayloadTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(new StringReader(test))), 3, "D");
            bool              seenDogs     = false;
            ITermAttribute    termAtt      = nptf.GetAttribute <ITermAttribute>();
            ITypeAttribute    typeAtt      = nptf.GetAttribute <ITypeAttribute>();
            IPayloadAttribute payloadAtt   = nptf.GetAttribute <IPayloadAttribute>();

            while (nptf.IncrementToken())
            {
                if (termAtt.Term.Equals("dogs"))
                {
                    seenDogs = true;
                    Assert.True(typeAtt.Type.Equals("D") == true, typeAtt.Type + " is not equal to " + "D");
                    Assert.True(payloadAtt.Payload != null, "payloadAtt.GetPayload() is null and it shouldn't be");
                    byte[] bytes = payloadAtt.Payload.GetData();//safe here to just use the bytes, otherwise we should use offset, length
                    Assert.True(bytes.Length == payloadAtt.Payload.Length, bytes.Length + " does not equal: " + payloadAtt.Payload.Length);
                    Assert.True(payloadAtt.Payload.Offset == 0, payloadAtt.Payload.Offset + " does not equal: " + 0);
                    float pay = PayloadHelper.DecodeFloat(bytes);
                    Assert.True(pay == 3, pay + " does not equal: " + 3);
                }
                else
                {
                    Assert.True(typeAtt.Type.Equals("word"), typeAtt.Type + " is not null and it should be");
                }
            }
            Assert.True(seenDogs == true, seenDogs + " does not equal: " + true);
        }
Example #20
        public override bool IncrementToken()
        {
            bool wasGoodWord = false;

            while (!wasGoodWord)
            {
                if (!input.IncrementToken())
                {
                    return(false);
                }

                ITermAttribute termAttribute = GetAttribute <ITermAttribute>();
                var            termArr       = termAttribute.Term.ToCharArray();

                if (termArr.Length < 3)
                {
                    continue;
                }

                wasGoodWord = true;

                for (int i = 0; i < termArr.Length; ++i)
                {
                    if (!_goodChars.Contains(termArr[i]))
                    {
                        wasGoodWord = false;
                    }
                }
            }

            return(true);
        }
Example #21
        //public virtual string GetView(TokenStream tokenStream, out int numberOfTokens)
        //{
        //    StringBuilder sb = new StringBuilder();

        //    Token token = tokenStream.Next();

        //    numberOfTokens = 0;

        //    while (token != null)
        //    {
        //        numberOfTokens++;
        //        sb.Append(GetTokenView(token));
        //        token = tokenStream.Next();
        //    }

        //    return sb.ToString();
        //}

        public virtual string GetView(TokenStream tokenStream, out int numberOfAttributes)
        {
            StringBuilder sb = new StringBuilder();
            //instead of CharTermAttribute, use
            //ITermAttribute termAttr = tokenStream.GetAttribute<ITermAttribute>(); //try addattribute
            //Attribute termAttr = tokenStream.GetAttribute<Attribute>();
            //Error here. Resolved ITermAttribute is a must!

            ITermAttribute termAttr = tokenStream.GetAttribute <ITermAttribute>(); //try addattribute here

            //Error here

            //http://stackoverflow.com/questions/16274779/get-termattribute-in-tokenstream-lucene-net
            numberOfAttributes = 0;
            tokenStream.Reset(); //try out without
            while (tokenStream.IncrementToken())
            {
                numberOfAttributes++;
                sb.Append(GetAttributeView(termAttr));
                termAttr = tokenStream.GetAttribute <ITermAttribute>();
                //string term = termAttr.Term;
            }
            tokenStream.End();
            tokenStream.Dispose();
            //http://stackoverflow.com/questions/2638200/how-to-get-a-token-from-a-lucene-tokenstream
            return(sb.ToString());
        }
Example #22
        public QueryTermVector(string queryString, Analyzer analyzer)
        {
            if (analyzer != null)
            {
                TokenStream stream = analyzer.TokenStream("", new System.IO.StringReader(queryString));
                if (stream != null)
                {
                    IList <string> terms = new List <string>();
                    try
                    {
                        bool hasMoreTokens = false;

                        stream.Reset();
                        ITermAttribute termAtt = stream.AddAttribute <ITermAttribute>();

                        hasMoreTokens = stream.IncrementToken();
                        while (hasMoreTokens)
                        {
                            terms.Add(termAtt.Term);
                            hasMoreTokens = stream.IncrementToken();
                        }
                        ProcessTerms(terms.ToArray());
                    }
                    catch (System.IO.IOException)
                    {
                    }
                }
            }
        }
Example #23
        /// <summary>
        /// Tokenizes the keyword and returns the resulting terms.
        /// </summary>
        public static IEnumerable <string> SplitWords(string keyword)
        {
            IList <string> list     = new List <string>();
            Analyzer       analyzer = new PanGuAnalyzer();
            TokenStream    stream   = analyzer.TokenStream(keyword, new StringReader(keyword));
            ITermAttribute ita      = null;
            bool           hasNext  = stream.IncrementToken();

            while (hasNext)
            {
                ita = stream.GetAttribute <ITermAttribute>();
                list.Add(ita.Term);
                hasNext = stream.IncrementToken();
            }
            return(list);

            //IList<string> list = new List<string>();
            //Analyzer analyzer = new PanGuAnalyzer();
            //TokenStream tokenStream = analyzer.TokenStream("", new StringReader(keyword));
            //Token token = null;
            //while ((token = tokenStream.IncrementToken()) != null)
            //{
            //    // token.TermText()为当前分的词
            //    string word = token.TermText();
            //    list.Add(word);
            //}

            //return list;
        }
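A hypothetical call; PanGuAnalyzer needs its dictionary files deployed alongside the application, so this sketch assumes a working PanGu setup:

            foreach (string term in SplitWords("lucene 全文检索"))
            {
                Console.WriteLine(term);
            }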
Example #24
 public CamelCaseFilter(TokenStream stream)
     : base(stream)
 {
     _termAttribute = AddAttribute<ITermAttribute>();
     _offsetAttribute = AddAttribute<IOffsetAttribute>();
     _positionIncrementAttribute = AddAttribute<IPositionIncrementAttribute>();
 }
Example #25
        protected CompoundWordTokenFilterBase(TokenStream input, ISet <string> dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, bool onlyLongestMatch)
            : base(input)
        {
            this.tokens           = new LinkedList <Token>();
            this.minWordSize      = minWordSize;
            this.minSubwordSize   = minSubwordSize;
            this.maxSubwordSize   = maxSubwordSize;
            this.onlyLongestMatch = onlyLongestMatch;

            if (dictionary is CharArraySet)
            {
                this.dictionary = (CharArraySet)dictionary;
            }
            else
            {
                this.dictionary = new CharArraySet(dictionary.Count, false);
                AddAllLowerCase(this.dictionary, dictionary);
            }

            termAtt    = AddAttribute <ITermAttribute>();
            offsetAtt  = AddAttribute <IOffsetAttribute>();
            flagsAtt   = AddAttribute <IFlagsAttribute>();
            posIncAtt  = AddAttribute <IPositionIncrementAttribute>();
            typeAtt    = AddAttribute <ITypeAttribute>();
            payloadAtt = AddAttribute <IPayloadAttribute>();
        }
Example #28
        public FrenchStemFilter(TokenStream _in)
            : base(_in)
        {

            stemmer = new FrenchStemmer();
            termAtt = AddAttribute<ITermAttribute>();
        }
Example #29
 public MyTokenStream(TestTermVectorsReader enclosingInstance)
 {
     InitBlock(enclosingInstance);
     termAtt    = AddAttribute <ITermAttribute>();
     posIncrAtt = AddAttribute <IPositionIncrementAttribute>();
     offsetAtt  = AddAttribute <IOffsetAttribute>();
 }
Example #31
        /// <summary>
        /// Creates a shingle filter based on a user defined matrix.
        ///
        /// The filter /will/ delete columns from the input matrix! You will not be able to reset the filter if you used this constructor.
        /// todo: don't touch the matrix! use a bool, set the input stream to null or something, and keep track of where in the matrix we are at.
        ///
        /// </summary>
        /// <param name="matrix">the input based for creating shingles. Does not need to contain any information until ShingleMatrixFilter.IncrementToken() is called the first time.</param>
        /// <param name="minimumShingleSize">minimum number of tokens in any shingle.</param>
        /// <param name="maximumShingleSize">maximum number of tokens in any shingle.</param>
        /// <param name="spacerCharacter">character to use between texts of the token parts in a shingle. null for none.</param>
        /// <param name="ignoringSinglePrefixOrSuffixShingle">if true, shingles that only contains permutation of the first of the last column will not be produced as shingles. Useful when adding boundary marker tokens such as '^' and '$'.</param>
        /// <param name="settingsCodec">codec used to read input token weight and matrix positioning.</param>
        public ShingleMatrixFilter(Matrix.Matrix matrix, int minimumShingleSize, int maximumShingleSize, Char spacerCharacter, bool ignoringSinglePrefixOrSuffixShingle, TokenSettingsCodec settingsCodec)
        {
            Matrix             = matrix;
            MinimumShingleSize = minimumShingleSize;
            MaximumShingleSize = maximumShingleSize;
            SpacerCharacter    = spacerCharacter;
            IsIgnoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
            _settingsCodec = settingsCodec;

            // ReSharper disable DoNotCallOverridableMethodsInConstructor
            _termAtt    = AddAttribute <ITermAttribute>();
            _posIncrAtt = AddAttribute <IPositionIncrementAttribute>();
            _payloadAtt = AddAttribute <IPayloadAttribute>();
            _offsetAtt  = AddAttribute <IOffsetAttribute>();
            _typeAtt    = AddAttribute <ITypeAttribute>();
            _flagsAtt   = AddAttribute <IFlagsAttribute>();
            // ReSharper restore DoNotCallOverridableMethodsInConstructor

            // set the input to be an empty token stream, we already have the data.
            _input = new EmptyTokenStream();

            _inTermAtt    = _input.AddAttribute <ITermAttribute>();
            _inPosIncrAtt = _input.AddAttribute <IPositionIncrementAttribute>();
            _inPayloadAtt = _input.AddAttribute <IPayloadAttribute>();
            _inOffsetAtt  = _input.AddAttribute <IOffsetAttribute>();
            _inTypeAtt    = _input.AddAttribute <ITypeAttribute>();
            _inFlagsAtt   = _input.AddAttribute <IFlagsAttribute>();
        }
Example #32
        public virtual void TestCloneAttributes()
        {
            AttributeSource src     = new AttributeSource();
            ITermAttribute  termAtt = src.AddAttribute <ITermAttribute>();
            ITypeAttribute  typeAtt = src.AddAttribute <ITypeAttribute>();

            termAtt.SetTermBuffer("TestTerm");
            typeAtt.Type = "TestType";

            var                types = new List <Type>();
            AttributeSource    clone = src.CloneAttributes();
            IEnumerator <Type> it    = clone.GetAttributeTypesIterator().GetEnumerator();

            Assert.IsTrue(it.MoveNext());
            types.Add(it.Current);
            Assert.IsTrue(it.MoveNext());
            types.Add(it.Current);
            Assert.IsFalse(it.MoveNext(), "No more attributes");

            Assert.Contains(typeof(ITypeAttribute), types, "TypeAttribute must be present in attributes");
            Assert.Contains(typeof(ITermAttribute), types, "TermAttribute must be present in attributes");

            ITermAttribute termAtt2 = clone.GetAttribute <ITermAttribute>();
            ITypeAttribute typeAtt2 = clone.GetAttribute <ITypeAttribute>();

            Assert.IsFalse(ReferenceEquals(termAtt2, termAtt), "TermAttribute of original and clone must be different instances");
            Assert.IsFalse(ReferenceEquals(typeAtt2, typeAtt), "TypeAttribute of original and clone must be different instances");
            Assert.AreEqual(termAtt2, termAtt, "TermAttribute of original and clone must be equal");
            Assert.AreEqual(typeAtt2, typeAtt, "TypeAttribute of original and clone must be equal");
        }
Example #34
 /*
  * base(input) is deliberately not called here, because calling it would move input's position.
  * by zh
  */
 public MMSegTokenizer(Seg seg, TextReader input)
     : base(input)
 {
     mmSeg = new MMSeg(input, seg);
     termAtt = AddAttribute<ITermAttribute>();
     offsetAtt = AddAttribute<IOffsetAttribute>();
     typeAtt = AddAttribute<ITypeAttribute>();
 }
Example #35
 /// <summary>
 ///
 /// </summary>
 /// <remarks></remarks>
 /// <seealso cref=""/>
 /// <param name="input"></param>
 /// <param name="synonymEngine"></param>
 /// <return></return>
 public SynonymFilter(TokenStream input, ISynonymEngine synonymEngine)
     : base(input)
 {
     synonymStack    = new Stack <String>();
     this.engine     = synonymEngine;
     this.termAtt    = AddAttribute <ITermAttribute>();
     this.posIncrAtt = AddAttribute <IPositionIncrementAttribute>();
 }
Example #36
 public CutLeterDigitFilter(TokenStream input)
     : base(input)
 {
     reusableToken = new Token();
     termAtt = AddAttribute<ITermAttribute>();
     offsetAtt = AddAttribute<IOffsetAttribute>();
     typeAtt = AddAttribute<ITypeAttribute>();
 }
Example #37
 public DelimitedPayloadTokenFilter(TokenStream input, char delimiter, PayloadEncoder encoder)
     : base(input)
 {
     termAtt = AddAttribute<ITermAttribute>();
     payAtt = AddAttribute<IPayloadAttribute>();
     this.delimiter = delimiter;
     this.encoder = encoder;
 }
Example #39
 void Init()
 {
     InitPanGuSegment();
     termAtt = AddAttribute<ITermAttribute>();
     offsetAtt = AddAttribute<IOffsetAttribute>();
     posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
     typeAtt = AddAttribute<ITypeAttribute>();
 }
Example #40
 public SynonymsFilter(TokenStream input, List <Dictionary <string, string[]> > synonymsDictList)
     : base(input)
 {
     this._SynonymsQueue             = new Queue <string>();
     this._TermAttribute             = base.AddAttribute <ITermAttribute>();
     this._PostionIncrementAttribute = base.AddAttribute <IPositionIncrementAttribute>();
     this._synonymsDictList          = synonymsDictList;
 }
Example #41
 public SynonymsFilter(string language, TokenStream input)
     : base(input)
 {
     this._Language                  = language;
     this._SynonymsQueue             = new Queue <string>();
     this._TermAttribute             = base.AddAttribute <ITermAttribute>();
     this._PostionIncrementAttribute = base.AddAttribute <IPositionIncrementAttribute>();
 }
Example #42
        /// <summary>
        ///   Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using
        ///   affix rules in the provided HunspellDictionary.
        /// </summary>
        /// <param name="input">TokenStream whose tokens will be stemmed.</param>
        /// <param name="dictionary">HunspellDictionary containing the affix rules and words that will be used to stem the tokens.</param>
        /// <param name="dedup">true if only unique terms should be output.</param>
        public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, Boolean dedup = true)
            : base(input)
        {
            _posIncAtt = AddAttribute<IPositionIncrementAttribute>();
            _termAtt = AddAttribute<ITermAttribute>();

            _dedup = dedup;
            _stemmer = new HunspellStemmer(dictionary);
        }
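A minimal sketch of placing the filter in a chain, assuming the contrib HunspellDictionary can be constructed from OpenOffice-style .aff/.dic streams (the file names are illustrative):

            using (var aff = File.OpenRead("en_US.aff"))
            using (var dic = File.OpenRead("en_US.dic"))
            {
                var dictionary = new HunspellDictionary(aff, dic);
                TokenStream ts = new HunspellStemFilter(
                    new WhitespaceTokenizer(new StringReader("walking walked")), dictionary);
                ITermAttribute term = ts.GetAttribute<ITermAttribute>();
                while (ts.IncrementToken())
                {
                    Console.WriteLine(term.Term);
                }
            }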
Example #43
        private void Init(System.IO.TextReader _input, HebMorph.DataStructures.DictRadix<int> _prefixesTree)
        {
			termAtt = AddAttribute <ITermAttribute>();
			offsetAtt = AddAttribute <IOffsetAttribute>();
            //posIncrAtt = (PositionIncrementAttribute)AddAttribute(typeof(PositionIncrementAttribute));
			typeAtt = AddAttribute <ITypeAttribute>();
        	input = _input;
            hebMorphTokenizer = new HebMorph.Tokenizer(_input);
            prefixesTree = _prefixesTree;
        }
Example #44
        public ExpandAcronymsFilter(TokenStream input, IAcronymExpansionProvider acronymExpansionProvider)
            : base(input)
        {
            _acronymExpansionProvider = acronymExpansionProvider;

            _termAttribute = AddAttribute<ITermAttribute>();
            _positionIncrementAttribute = AddAttribute<IPositionIncrementAttribute>();
            _tokenSet = new Queue<string>();
            _recognizedTokens = new HashSet<string>();
        }
Example #45
        public JiebaTokenizer(JiebaSegmenter seg, string input)
        {
            segmenter = seg;
            termAtt = AddAttribute<ITermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
            typeAtt = AddAttribute<ITypeAttribute>();

            var text = input;
            tokens = segmenter.Tokenize(text, TokenizerMode.Search).ToList();
        }
Example #46
        public ExpanderFilter(TokenStream input, [NotNull] Func<String, IEnumerable<Expansion>> expander, Boolean emitSource = true)
            : base(input)
        {
            if (expander == null)
                throw new ArgumentNullException("expander");

            _expander = expander;
            _emitSource = emitSource;
            _termAttr = AddAttribute<ITermAttribute>();
            _posAttr = AddAttribute<IPositionIncrementAttribute>();
            _typeAttr = AddAttribute<ITypeAttribute>();
        }
Example #47
		/// <summary>Construct the named stemming filter.
		/// 
		/// </summary>
        /// <param name="input">the input tokens to stem
		/// </param>
		/// <param name="name">the name of a stemmer
		/// </param>
		public SnowballFilter(TokenStream input, System.String name) : base(input)
		{
			try
			{
				System.Type stemClass = System.Type.GetType("SF.Snowball.Ext." + name + "Stemmer");
				stemmer = (SnowballProgram) System.Activator.CreateInstance(stemClass);
			}
			catch (System.Exception e)
			{
				throw new System.SystemException(e.ToString());
			}
		    termAtt = AddAttribute<ITermAttribute>();
		}
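A short usage sketch; the name must match one of the SF.Snowball.Ext stemmer classes, e.g. "English" for EnglishStemmer:

		TokenStream ts = new SnowballFilter(
		    new WhitespaceTokenizer(new StringReader("running runners")), "English");
		ITermAttribute term = ts.GetAttribute<ITermAttribute>();
		while (ts.IncrementToken())
		{
		    Console.WriteLine(term.Term); // run, runner
		}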
Example #48
        private void Init(System.IO.TextReader input, HebMorph.StreamLemmatizer _lemmatizer,
            HebMorph.LemmaFilters.LemmaFilterBase _lemmaFilter, bool AlwaysSaveMarkedOriginal)
        {
			termAtt = AddAttribute <ITermAttribute>();
	        offsetAtt = AddAttribute<IOffsetAttribute>();
	        posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
			typeAtt = AddAttribute <ITypeAttribute>();
            //payAtt = (PayloadAttribute)AddAttribute(typeof(PayloadAttribute));

        	this.input = input;
            this._streamLemmatizer = _lemmatizer;
            this._streamLemmatizer.SetStream(input);
            this.alwaysSaveMarkedOriginal = AlwaysSaveMarkedOriginal;
            this.lemmaFilter = _lemmaFilter;
        }
Example #49
		/// <summary> Construct a token stream filtering the given input.
		/// If <c>stopWords</c> is an instance of <see cref="CharArraySet" /> (true if
		/// <c>makeStopSet()</c> was used to construct the set) it will be directly used
		/// and <c>ignoreCase</c> will be ignored since <c>CharArraySet</c>
		/// directly controls case sensitivity.
		/// <p/>
		/// If <c>stopWords</c> is not an instance of <see cref="CharArraySet" />,
		/// a new CharArraySet will be constructed and <c>ignoreCase</c> will be
		/// used to specify the case sensitivity of that set.
		/// </summary>
		/// <param name="enablePositionIncrements">true if token positions should record the removed stop words</param>
		/// <param name="input">Input TokenStream</param>
		/// <param name="stopWords">A Set of strings or strings or char[] or any other ToString()-able set representing the stopwords</param>
        /// <param name="ignoreCase">if true, all words are lower cased first</param>
        public StopFilter(bool enablePositionIncrements, TokenStream input, ISet<string> stopWords, bool ignoreCase)
            : base(input)
		{
		    if (stopWords is CharArraySet)
		    {
		        this.stopWords = (CharArraySet) stopWords;
		    }
		    else
		    {
		        this.stopWords = new CharArraySet(stopWords.Count, ignoreCase);
		        this.stopWords.AddAll(stopWords);
		    }
		    this.enablePositionIncrements = enablePositionIncrements;
		    termAtt = AddAttribute<ITermAttribute>();
            posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
		}
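A minimal sketch of the position-increment behaviour, with an illustrative stop set and input:

		ISet<string> stops = new HashSet<string> { "the", "a" };
		TokenStream ts = new StopFilter(true,
		    new WhitespaceTokenizer(new StringReader("the quick fox")), stops, true);
		ITermAttribute term = ts.GetAttribute<ITermAttribute>();
		IPositionIncrementAttribute posIncr = ts.GetAttribute<IPositionIncrementAttribute>();
		while (ts.IncrementToken())
		{
		    // "quick" arrives with PositionIncrement == 2: the removed "the" left a gap.
		    Console.WriteLine(term.Term + " +" + posIncr.PositionIncrement);
		}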
Example #50
        /*
         * Creates NGramTokenFilter with given min and max n-grams.
         * <param name="input"><see cref="TokenStream"/> holding the input to be tokenized</param>
         * <param name="minGram">the smallest n-gram to generate</param>
         * <param name="maxGram">the largest n-gram to generate</param>
         */
        public NGramTokenFilter(TokenStream input, int minGram, int maxGram)
            : base(input)
        {
            if (minGram < 1)
            {
                throw new System.ArgumentException("minGram must be greater than zero");
            }
            if (minGram > maxGram)
            {
                throw new System.ArgumentException("minGram must not be greater than maxGram");
            }
            this.minGram = minGram;
            this.maxGram = maxGram;

            this.termAtt = AddAttribute<ITermAttribute>();
            this.offsetAtt = AddAttribute<IOffsetAttribute>();
        }
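A quick sketch of the emitted grams for an illustrative input; this version of the filter emits all grams of one size before moving to the next:

            TokenStream ts = new NGramTokenFilter(
                new KeywordTokenizer(new StringReader("fox")), 1, 2);
            ITermAttribute term = ts.GetAttribute<ITermAttribute>();
            while (ts.IncrementToken())
            {
                Console.WriteLine(term.Term); // f, o, x, fo, ox
            }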
Example #51
        public override bool Accept(AttributeSource source)
        {
            if (termAtt == null)
            {
                termAtt = source.AddAttribute<ITermAttribute>();
            }
            try
            {
                // We only care that the term parses as a date, not about the value itself.
                DateTime.Parse(termAtt.Term, dateFormat);
                return true;
            }
            catch (FormatException)
            {

            }

            return false;
        }
Example #52
        public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix) : base(suffix)
        {
            Suffix = suffix;
            Prefix = prefix;
            _prefixExhausted = false;

            // ReSharper disable DoNotCallOverridableMethodsInConstructor
            _termAtt = AddAttribute<ITermAttribute>();
            _posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            _payloadAtt = AddAttribute<IPayloadAttribute>();
            _offsetAtt = AddAttribute<IOffsetAttribute>();
            _typeAtt = AddAttribute<ITypeAttribute>();
            _flagsAtt = AddAttribute<IFlagsAttribute>();
            // ReSharper restore DoNotCallOverridableMethodsInConstructor

            _pTermAtt = prefix.AddAttribute<ITermAttribute>();
            _pPosIncrAtt = prefix.AddAttribute<IPositionIncrementAttribute>();
            _pPayloadAtt = prefix.AddAttribute<IPayloadAttribute>();
            _pOffsetAtt = prefix.AddAttribute<IOffsetAttribute>();
            _pTypeAtt = prefix.AddAttribute<ITypeAttribute>();
            _pFlagsAtt = prefix.AddAttribute<IFlagsAttribute>();
        }
Example #53
        public PanGuTokenizer(TextReader input)
            : base(input)
        {
            termAttribute = AddAttribute<ITermAttribute>();
            offsetAttribute = AddAttribute<IOffsetAttribute>();

            inputText = base.input.ReadToEnd();

            if (string.IsNullOrEmpty(inputText)) {
                char[] readBuf = new char[1024];

                int relCount = base.input.Read(readBuf, 0, readBuf.Length);

                StringBuilder inputStr = new StringBuilder(readBuf.Length);

                while (relCount > 0) {
                    inputStr.Append(readBuf, 0, relCount);

                    relCount = input.Read(readBuf, 0, readBuf.Length);
                }

                if (inputStr.Length > 0) {
                    inputText = inputStr.ToString();
                }
            }

            if (string.IsNullOrEmpty(inputText)) {
                words = new WordInfo[0];
            }
            else {
                global::PanGu.Segment segment = new Segment();
                ICollection<WordInfo> wordInfos = segment.DoSegment(inputText);
                words = new WordInfo[wordInfos.Count];
                wordInfos.CopyTo(words, 0);
            }
        }
Example #54
 //private System.Reflection.MethodInfo stemMethod;
 public SnowballFilter(TokenStream input, SnowballProgram stemmer)
     : base(input)
 {
     this.stemmer = stemmer;
     termAtt = AddAttribute<ITermAttribute>();
 }
Example #55
            internal SingleTokenAttributeSource()
			{
                termAttribute = AddAttribute<ITermAttribute>();
				offsetAttribute = AddAttribute<IOffsetAttribute>();
			}
Example #56
 public FastStringTokenizer(String str, bool isLetter, bool toLowerCase, ISet<string> stopWords)
 {
     this.str = str;
     this.isLetter = isLetter;
     this.toLowerCase = toLowerCase;
     this.stopWords = stopWords;
     this.termAtt = AddAttribute<ITermAttribute>();
     this.offsetAtt = AddAttribute<IOffsetAttribute>();
 }
Example #57
 public RegexTokenizer(String str, Regex regex, bool toLowerCase)
 {
     this.str = str;
     this.matcher = regex.Match(str);
     this.toLowerCase = toLowerCase;
     this.termAtt = AddAttribute<ITermAttribute>();
     this.offsetAtt = AddAttribute<IOffsetAttribute>();
 }
Example #58
 public RepeatingTokenStream(System.String val)
 {
     this.value_Renamed = val;
     this.termAtt =  AddAttribute<ITermAttribute>();
 }
Example #59
 internal WordTokenFilter(TokenStream input)
     : base(input)
 {
     termAtt = AddAttribute<ITermAttribute>();
     typeAtt = AddAttribute<ITypeAttribute>();
 }
Example #60
 public PorterStemFilter(TokenStream in_Renamed)
     : base(in_Renamed)
 {
     stemmer = new PorterStemmer();
     termAtt = AddAttribute<ITermAttribute>();
 }