Example #1
        protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            var tokenizer  = new AdrivaTokenizer(this.LuceneVersion, reader);
            var components = new TokenStreamComponents(tokenizer);

            return components;
        }
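
For context, a minimal sketch of the kind of Analyzer class a CreateComponents override such as Example #1 sits in (illustrative only; it swaps the project-specific AdrivaTokenizer for the stock StandardTokenizer and assumes Lucene.NET 4.8 with the System.IO, Lucene.Net.Analysis, Lucene.Net.Analysis.Core, Lucene.Net.Analysis.Standard and Lucene.Net.Util namespaces):

        public sealed class SimpleLowercaseAnalyzer : Analyzer
        {
            private readonly LuceneVersion matchVersion;

            public SimpleLowercaseAnalyzer(LuceneVersion matchVersion)
            {
                this.matchVersion = matchVersion;
            }

            protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                // the Tokenizer consumes the reader; the outermost filter is exposed as the TokenStream
                var tokenizer = new StandardTokenizer(matchVersion, reader);
                var stream    = new LowerCaseFilter(matchVersion, tokenizer);

                return new TokenStreamComponents(tokenizer, stream);
            }
        }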
Example #2
        protected override TokenStreamComponents WrapComponents(string fieldName, TokenStreamComponents components)
        {
            if (!stopWordsPerField.TryGetValue(fieldName, out ISet<string> stopWords) || stopWords is null)
            {
                return components;
            }
            var stopFilter = new StopFilter(matchVersion, components.TokenStream, new CharArraySet(matchVersion, stopWords, false));

            return new TokenStreamComponents(components.Tokenizer, stopFilter);
        }
Example #3
        public void Main()
        {
            const string s = "関西国際空港";

            Console.WriteLine($"対象の文字列:{s}");

            using var reader = new StringReader(s);

            Tokenizer tokenizer = new JapaneseTokenizer(reader, ReadDict(), false, JapaneseTokenizerMode.NORMAL);

            var tokenStreamComponents = new TokenStreamComponents(tokenizer, tokenizer);

            using var tokenStream = tokenStreamComponents.TokenStream;

            // note: Reset() must be called before consuming the stream
            tokenStream.Reset();

            while (tokenStream.IncrementToken())
            {
                Console.WriteLine("---");
                Console.WriteLine(
                    $"ICharTermAttribute=>{tokenStream.GetAttribute<ICharTermAttribute>().ToString()}");

                Console.WriteLine(
                    $"ITermToBytesRefAttribute#BytesRef=>{tokenStream.GetAttribute<ITermToBytesRefAttribute>().BytesRef}");

                Console.WriteLine(
                    $"IOffsetAttribute#StartOffset=>{tokenStream.GetAttribute<IOffsetAttribute>().StartOffset}");
                Console.WriteLine(
                    $"IOffsetAttribute#EndOffset=>{tokenStream.GetAttribute<IOffsetAttribute>().EndOffset}");

                Console.WriteLine(
                    $"IPositionIncrementAttribute=>{tokenStream.GetAttribute<IPositionIncrementAttribute>().PositionIncrement}");
                Console.WriteLine(
                    $"IPositionLengthAttribute=>{tokenStream.GetAttribute<IPositionLengthAttribute>().PositionLength}");

                Console.WriteLine(
                    $"IBaseFormAttribute#GetBaseForm=>{tokenStream.GetAttribute<IBaseFormAttribute>().GetBaseForm()}");

                Console.WriteLine(
                    $"IPartOfSpeechAttribute#GetPartOfSpeech=>{tokenStream.GetAttribute<IPartOfSpeechAttribute>().GetPartOfSpeech()}");

                Console.WriteLine(
                    $"IReadingAttribute#GetReading=>{tokenStream.GetAttribute<IReadingAttribute>().GetReading()}");
                Console.WriteLine(
                    $"IReadingAttribute#GetPronunciation=>{tokenStream.GetAttribute<IReadingAttribute>().GetPronunciation()}");

                Console.WriteLine(
                    $"IInflectionAttribute#GetInflectionForm=>{tokenStream.GetAttribute<IInflectionAttribute>().GetInflectionForm()}");
                Console.WriteLine(
                    $"IInflectionAttribute#GetInflectionType=>{tokenStream.GetAttribute<IInflectionAttribute>().GetInflectionType()}");

                Console.WriteLine("---");
            }

            // End() finalizes end-of-stream state (e.g. the final offset) after the last token
            tokenStream.End();
        }
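
In regular indexing or query code the components are not driven by hand as in Example #3; Analyzer.GetTokenStream builds and reuses them internally. A short sketch of the same tokenization through that path (illustrative; assumes Lucene.NET 4.8, the Lucene.Net.Analysis.Ja package with its bundled dictionary, and an arbitrary field name "body"):

        public void TokenizeViaAnalyzer()
        {
            using var analyzer    = new JapaneseAnalyzer(LuceneVersion.LUCENE_48);
            using var tokenStream = analyzer.GetTokenStream("body", "関西国際空港");

            tokenStream.Reset();

            while (tokenStream.IncrementToken())
            {
                Console.WriteLine(tokenStream.GetAttribute<ICharTermAttribute>());
            }

            tokenStream.End();
        }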
Example #4
        protected override TokenStreamComponents WrapComponents(string fieldName, TokenStreamComponents components)
        {
            var stopWords = stopWordsPerField.ContainsKey(fieldName) ? stopWordsPerField[fieldName] : null;

            if (stopWords == null)
            {
                return components;
            }
            var stopFilter = new StopFilter(matchVersion, components.TokenStream, new CharArraySet(matchVersion, stopWords, false));

            return new TokenStreamComponents(components.Tokenizer, stopFilter);
        }
Example #5
        protected override TokenStreamComponents WrapComponents(string fieldName, TokenStreamComponents components)
        {
            HashSet<string> stopWords = stopWordsPerField[fieldName];

            if (stopWords == null)
            {
                return components;
            }
            StopFilter stopFilter = new StopFilter(matchVersion, components.TokenStream, new CharArraySet(matchVersion, stopWords, false));

            return new TokenStreamComponents(components.Tokenizer, stopFilter);
        }
Example #6
        protected override TokenStreamComponents WrapComponents(string fieldName, TokenStreamComponents components)
        {
            ShingleFilter filter = new ShingleFilter(components.TokenStream, minShingleSize, maxShingleSize);

            filter.SetMinShingleSize(minShingleSize);
            filter.SetMaxShingleSize(maxShingleSize);
            filter.SetTokenSeparator(tokenSeparator);
            filter.SetOutputUnigrams(outputUnigrams);
            filter.SetOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
            filter.SetFillerToken(fillerToken);
            return new TokenStreamComponents(components.Tokenizer, filter);
        }
Example #7
        protected override TokenStreamComponents WrapComponents(string fieldName, TokenStreamComponents components)
        {
            ShingleFilter filter = new ShingleFilter(components.TokenStream, minShingleSize, maxShingleSize);

            filter.MinShingleSize             = minShingleSize;
            filter.MaxShingleSize             = maxShingleSize;
            filter.TokenSeparator             = tokenSeparator;
            filter.OutputUnigrams             = outputUnigrams;
            filter.OutputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
            filter.FillerToken = fillerToken;
            return new TokenStreamComponents(components.Tokenizer, filter);
        }
Example #8
        protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            TokenStreamComponents tokenStreamComponents = null;
            Tokenizer             tokenizer             = new StandardTokenizer(matchVersion, reader);
            TokenStream           stream = new LowerCaseFilter(matchVersion, tokenizer);

            stream = new CyrllicToLatinFilter(stream);
            stream = new StopFilter(matchVersion, stream, StopFilter.MakeStopSet(matchVersion, STOP_WORDS));
            stream = new SnowballFilter(stream, new SimpleSerbianStemmer());
            stream = new ASCIIFoldingFilter(stream);

            tokenStreamComponents = new TokenStreamComponents(tokenizer, stream);

            return tokenStreamComponents;
        }
Example #9
        protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            TokenStreamComponents tokenStreamComponents = null;
            Tokenizer             tokenizer             = new StandardTokenizer(matchVersion, reader);
            TokenStream           stream = new StandardFilter(matchVersion, tokenizer);

            stream = new LowerCaseFilter(matchVersion, stream);
            stream = new StopFilter(matchVersion, stream, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
            stream = new PorterStemFilter(stream);
            stream = new SnowballFilter(stream, new EnglishStemmer());

            tokenStreamComponents = new TokenStreamComponents(tokenizer, stream);

            return tokenStreamComponents;
        }
Example #10
 public override void SetReusableComponents(Analyzer analyzer, string fieldName, TokenStreamComponents components)
 {
     SetStoredValue(analyzer, components);
 }
Example #11
 protected override TokenStreamComponents WrapComponents(string fieldName, TokenStreamComponents components)
 {
     if (fieldName.Equals("textgrams") && outerInstance.minPrefixChars > 0)
     {
         return new TokenStreamComponents(components.Tokenizer,
             new EdgeNGramTokenFilter(
                 outerInstance.matchVersion,
                 components.TokenStream,
                 1,
                 outerInstance.minPrefixChars));
     }
     else
     {
         return components;
     }
 }
Example #12
 /// <summary>
 /// Wraps / alters the given TokenStreamComponents, taken from the wrapped
 /// Analyzer, to form new components. It is through this method that new
 /// TokenFilters can be added by AnalyzerWrappers. By default, the given
 /// components are returned.
 /// </summary>
 /// <param name="fieldName">
 ///          Name of the field which is to be analyzed </param>
 /// <param name="components">
 ///          TokenStreamComponents taken from the wrapped Analyzer </param>
 /// <returns> Wrapped / altered TokenStreamComponents. </returns>
 protected virtual TokenStreamComponents WrapComponents(string fieldName, TokenStreamComponents components)
 {
     return components;
 }
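
Because the snippets here show only the WrapComponents override itself, the following is a minimal sketch of a complete AnalyzerWrapper built on this extension point (illustrative class; assumes Lucene.NET 4.8 and ASCIIFoldingFilter from Lucene.Net.Analysis.Miscellaneous):

 public sealed class AsciiFoldingWrapper : AnalyzerWrapper
 {
     private readonly Analyzer inner;

     public AsciiFoldingWrapper(Analyzer inner)
         : base(inner.Strategy)   // reuse the wrapped analyzer's ReuseStrategy
     {
         this.inner = inner;
     }

     protected override Analyzer GetWrappedAnalyzer(string fieldName)
     {
         return inner;
     }

     protected override TokenStreamComponents WrapComponents(string fieldName, TokenStreamComponents components)
     {
         // keep the original Tokenizer, extend the filter chain by one more TokenFilter
         return new TokenStreamComponents(components.Tokenizer,
             new ASCIIFoldingFilter(components.TokenStream));
     }
 }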
Example #13
 protected override TokenStreamComponents WrapComponents(string fieldName, TokenStreamComponents components)
 {
     return new TokenStreamComponents(components.Tokenizer, new LimitTokenCountFilter(components.TokenStream, maxTokenCount, consumeAllTokens));
 }
Example #14
 protected override TokenStreamComponents WrapComponents(string fieldName, TokenStreamComponents components)
 {
     return new TokenStreamComponents(components.Tokenizer, new LimitTokenCountFilter(components.TokenStream, maxTokenCount, consumeAllTokens));
 }
Example #15
            protected override TokenStreamComponents WrapComponents(string fieldName, TokenStreamComponents components)
            {
                ShingleFilter shingles = new ShingleFilter(components.TokenStream, 2, outerInstance.grams);

                shingles.SetTokenSeparator(char.ToString((char)outerInstance.separator));
                return new TokenStreamComponents(components.Tokenizer, shingles);
            }
Example #16
 protected override TokenStreamComponents WrapComponents(string fieldName, TokenStreamComponents components)
 {
     if (fieldName.Equals("textgrams") && outerInstance.minPrefixChars > 0)
     {
         return new TokenStreamComponents(components.Tokenizer,
                                          new EdgeNGramTokenFilter(
                                              outerInstance.matchVersion,
                                              components.TokenStream,
                                              1,
                                              outerInstance.minPrefixChars));
     }
     else
     {
         return components;
     }
 }
Example #17
 protected override TokenStreamComponents WrapComponents(string fieldName, TokenStreamComponents components)
 {
     var stopWords = stopWordsPerField[fieldName];
     if (stopWords == null)
     {
         return components;
     }
     var stopFilter = new StopFilter(matchVersion, components.TokenStream, new CharArraySet(matchVersion, stopWords, false));
     return new TokenStreamComponents(components.Tokenizer, stopFilter);
 }
Example #18
 protected override TokenStreamComponents WrapComponents(string fieldName, TokenStreamComponents components)
 {
     ShingleFilter filter = new ShingleFilter(components.TokenStream, minShingleSize, maxShingleSize);
     filter.MinShingleSize = minShingleSize;
     filter.MaxShingleSize = maxShingleSize;
     filter.TokenSeparator = tokenSeparator;
     filter.OutputUnigrams = outputUnigrams;
     filter.OutputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
     filter.FillerToken = fillerToken;
     return new TokenStreamComponents(components.Tokenizer, filter);
 }
Example #19
 public override void SetReusableComponents(Analyzer analyzer, string fieldName, TokenStreamComponents components)
 {
     var componentsPerField = (IDictionary<string, TokenStreamComponents>)GetStoredValue(analyzer);
     if (componentsPerField == null)
     {
         componentsPerField = new Dictionary<string, TokenStreamComponents>();
         SetStoredValue(analyzer, componentsPerField);
     }
     componentsPerField[fieldName] = components;
 }
Example #20
 protected override TokenStreamComponents WrapComponents(string fieldName, TokenStreamComponents components)
 {
     return components;
 }
Example #21
 /// <summary>
 /// Stores the given TokenStreamComponents as the reusable components for the
 /// field with the given name.
 /// </summary>
 /// <param name="fieldName"> Name of the field whose TokenStreamComponents are being set </param>
 /// <param name="components"> TokenStreamComponents which are to be reused for the field </param>
 public abstract void SetReusableComponents(Analyzer analyzer, string fieldName, TokenStreamComponents components);
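
Example #21 is the abstract half of the ReuseStrategy contract; GetReusableComponents is its counterpart for reading cached components back. Below is a sketch of a complete custom strategy that mirrors the per-field dictionary pattern of Example #19 (illustrative class name only; Lucene.NET ships its own per-field reuse strategy):

 public sealed class DictionaryPerFieldReuseStrategy : ReuseStrategy
 {
     public override TokenStreamComponents GetReusableComponents(Analyzer analyzer, string fieldName)
     {
         // the stored value is the per-field dictionary written by SetReusableComponents below
         var componentsPerField = (IDictionary<string, TokenStreamComponents>)GetStoredValue(analyzer);

         if (componentsPerField != null && componentsPerField.TryGetValue(fieldName, out TokenStreamComponents components))
         {
             return components;
         }
         return null;
     }

     public override void SetReusableComponents(Analyzer analyzer, string fieldName, TokenStreamComponents components)
     {
         var componentsPerField = (IDictionary<string, TokenStreamComponents>)GetStoredValue(analyzer);

         if (componentsPerField == null)
         {
             componentsPerField = new Dictionary<string, TokenStreamComponents>();
             SetStoredValue(analyzer, componentsPerField);
         }
         componentsPerField[fieldName] = components;
     }
 }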