/// <summary>
/// Verifies the tokenizer copes with a document whose first tokens appear
/// after a very large run (4094 chars) of leading whitespace.
/// Original Java signature declared `throws java.io.IOException`.
/// </summary>
public virtual void testHugeDoc()
{
    // new string(char, count) replaces the Java StringBuilder + Arrays.fill
    // idiom; `Arrays.fill` (lowercase) is not a .NET API.
    string input = new string(' ', 4094) + "testing 1234";
    StandardTokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
    BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new string[] { "testing", "1234" });
}
// Demo entry point: loads a DAWG dictionary (path from the first command-line
// argument, else a default GitHub raw URL) and prints each token of a sample
// Chinese sentence as "text/type".
static void Main(string[] args)
{
    // NOTE(review): the fallback value is an HTTP URL — this assumes the
    // StandardTokenizer(string) constructor can open URLs as well as local
    // file paths; confirm, otherwise the default will fail to load.
    var dawgFile = args.Length > 0 ? args[0] : "https://raw.githubusercontent.com/yamool/CWSharp/master/data/cwsharp.dawg";
    //BuildDawgFile(dawgFile);
    var tokenizer = new StandardTokenizer(dawgFile);
    foreach (var token in tokenizer.Traverse("研究生命起源"))
    {
        Console.Write(token.Text + "/" + token.Type);
        Console.Write(" ");
    }
    // Keep the console window open until the user presses Enter.
    Console.ReadLine();
}
// Demo entry point: loads a DAWG dictionary from a hard-coded local path and
// prints each token of a sample Chinese sentence as "text/type".
static void Main()
{
    var dawgFile = @"d:\dict.dawg"; // machine-specific path; adjust as needed
    //BuildDawgFile(dawgFile);
    var tokenizer = new StandardTokenizer(dawgFile);
    foreach (var token in tokenizer.Traverse("研究生命起源"))
    {
        Console.Write(token.Text + "/" + token.Type);
        Console.Write(" ");
    }
    // Keep the console window open until the user presses Enter.
    Console.ReadLine();
}
// Reads a DAWG dictionary file whose path is given as the first command-line
// argument and prints each token of a sample Chinese sentence as "text/type".
static void Main(string[] args)
{
    // Guard: the original indexed args[0] unconditionally, which throws
    // IndexOutOfRangeException when run without arguments.
    if (args.Length == 0)
    {
        Console.WriteLine("usage: <program> <path-to-dawg-file>");
        return;
    }
    var dawgFile = args[0];
    // Fixed message typo: "draw" -> "dawg".
    Console.WriteLine("reading dawg file: " + dawgFile);
    using (var stream = new FileStream(dawgFile, FileMode.Open, FileAccess.Read))
    {
        var tokenizer = new StandardTokenizer(stream);
        foreach (var token in tokenizer.Traverse("研究生命起源"))
        {
            Console.Write(token.Text + "/" + token.Type);
            Console.Write(" ");
        }
    }
}
// Wires up the GitHub search index: the GitHub API client, a per-field
// analyzer (HTML-stripping default plus custom chains for "owner" and
// "name"), a multi-field query parser, the index writer, and a searcher
// manager over that writer.
public GitHubIndex(Directory indexDirectory, Credentials credentials)
{
    github = new GitHubClient(new ProductHeaderValue("LuceneNetDemo"))
    {
        Credentials = credentials
    };

    analyzer = new PerFieldAnalyzerWrapper(
        // Example of a pre-built custom analyzer
        defaultAnalyzer: new HtmlStripAnalyzer(GitHubIndex.MatchVersion),
        // Example of inline anonymous analyzers
        fieldAnalyzers: new Dictionary<string, Analyzer>
        {
            // Field analyzer for owner: whole value as one keyword token,
            // ASCII-folded then lower-cased.
            {
                "owner",
                Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
                {
                    var source = new KeywordTokenizer(reader);
                    TokenStream result = new ASCIIFoldingFilter(source);
                    result = new LowerCaseFilter(GitHubIndex.MatchVersion, result);
                    return (new TokenStreamComponents(source, result));
                })
            },
            // Field analyzer for name: standard tokens, word-delimiter
            // splitting with English-possessive stemming masked out of the
            // flag set, ASCII folding, lower-casing.
            {
                "name",
                Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
                {
                    var source = new StandardTokenizer(GitHubIndex.MatchVersion, reader);
                    TokenStream result = new WordDelimiterFilter(GitHubIndex.MatchVersion, source, ~WordDelimiterFlags.STEM_ENGLISH_POSSESSIVE, CharArraySet.EMPTY_SET);
                    result = new ASCIIFoldingFilter(result);
                    result = new LowerCaseFilter(GitHubIndex.MatchVersion, result);
                    return (new TokenStreamComponents(source, result));
                })
            }
        });

    // Queries search name, description, and readme fields by default.
    queryParser = new MultiFieldQueryParser(GitHubIndex.MatchVersion,
        new[] { "name", "description", "readme" }, analyzer);

    indexWriter = new IndexWriter(indexDirectory, new IndexWriterConfig(GitHubIndex.MatchVersion, analyzer));
    searcherManager = new SearcherManager(indexWriter, true, null);
}
/// <summary>
/// Creates a <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
/// which tokenizes all the text in the provided <seealso cref="Reader"/>.
/// </summary>
/// <returns> A <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
/// built from an <seealso cref="StandardTokenizer"/> filtered with
/// <seealso cref="StandardFilter"/>, <seealso cref="SoraniNormalizationFilter"/>,
/// <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>,
/// <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
/// provided and <seealso cref="SoraniStemFilter"/>. </returns>
protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
{
    Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    result = new SoraniNormalizationFilter(result);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    // Mark stem-exclusion words as keywords so the stemmer leaves them alone.
    if (!stemExclusionSet.Empty)
    {
        result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    }
    result = new SoraniStemFilter(result);
    return (new TokenStreamComponents(source, result));
}
/// <summary>
/// Builds the analysis chain for a field: standard tokenization, standard
/// filtering, lower-casing, English stop-word removal, and finally n-gram
/// expansion between _minGram and _maxGram.
/// </summary>
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    // Splits words at punctuation characters, removing punctuation.
    // Splits words at hyphens, unless there's a number in the token...
    // Recognizes email addresses and internet hostnames as one token.
    var source = new StandardTokenizer(_version, reader);
    TokenStream chain = new StandardFilter(source);

    // Normalizes token text to lower case.
    chain = new LowerCaseFilter(chain);

    // Removes stop words from the token stream.
    chain = new StopFilter(true, chain, StopAnalyzer.ENGLISH_STOP_WORDS_SET);

    return new NGramTokenFilter(chain, _minGram, _maxGram);
}
/// <summary>
/// Creates <see cref="TokenStreamComponents"/>
/// used to tokenize all the text in the provided <see cref="TextReader"/>.
/// </summary>
/// <returns> <see cref="TokenStreamComponents"/>
/// built from a <see cref="StandardTokenizer"/> filtered with
/// <see cref="GreekLowerCaseFilter"/>, <see cref="StandardFilter"/>,
/// <see cref="StopFilter"/>, and <see cref="GreekStemFilter"/> </returns>
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new StandardTokenizer(m_matchVersion, reader);
    // Greek-specific lower-casing runs before StandardFilter in this chain.
    TokenStream result = new GreekLowerCaseFilter(m_matchVersion, source);
#pragma warning disable 612, 618
    if (m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
    {
        result = new StandardFilter(m_matchVersion, result);
    }
    result = new StopFilter(m_matchVersion, result, m_stopwords);
    if (m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
#pragma warning restore 612, 618
    {
        // Stemming only for 3.1+, preserving pre-3.1 index compatibility.
        result = new GreekStemFilter(result);
    }
    return (new TokenStreamComponents(source, result));
}
/// <summary>
/// Creates a <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
/// which tokenizes all the text in the provided <seealso cref="Reader"/>.
/// </summary>
/// <returns> A <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
/// built from an <seealso cref="StandardTokenizer"/> filtered with
/// <seealso cref="StandardFilter"/>, <seealso cref="ElisionFilter"/>, <seealso cref="LowerCaseFilter"/>,
/// <seealso cref="StopFilter"/>, <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
/// provided and <seealso cref="SnowballFilter"/>. </returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    // Elision handling (articles like l', d') only for 3.6+.
    if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
    {
        result = new ElisionFilter(result, DEFAULT_ARTICLES);
    }
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    if (stemExclusionSet.Count > 0)
    {
        result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    }
    result = new SnowballFilter(result, new CatalanStemmer());
    return (new TokenStreamComponents(source, result));
}
/// <summary>
/// Creates <see cref="TokenStreamComponents"/>
/// used to tokenize all the text in the provided <see cref="TextReader"/>.
/// </summary>
/// <returns> <see cref="TokenStreamComponents"/>
/// built from a <see cref="StandardTokenizer"/> filtered with
/// <see cref="GreekLowerCaseFilter"/>, <see cref="StandardFilter"/>,
/// <see cref="StopFilter"/>, <see cref="GreekStemFilter"/>, and the
/// project-specific <see cref="GreekPhoneticFilter"/> and
/// <see cref="GreekAccentFilter"/> </returns>
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new StandardTokenizer(m_matchVersion, reader);
    // Greek-specific lower-casing runs before StandardFilter in this chain.
    TokenStream result = new GreekLowerCaseFilter(m_matchVersion, source);
    if (m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_48))
    {
        result = new StandardFilter(m_matchVersion, result);
    }
    result = new StopFilter(m_matchVersion, result, m_stopwords);
    if (m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_48))
    {
        result = new GreekStemFilter(result);
    }
    // Phonetic and accent normalization are applied unconditionally.
    result = new GreekPhoneticFilter(result);
    result = new GreekAccentFilter(result);
    return (new TokenStreamComponents(source, result));
}
/// <summary>
/// Creates <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
/// used to tokenize all the text in the provided <seealso cref="Reader"/>.
/// </summary>
/// <returns> <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
/// built from a <seealso cref="StandardTokenizer"/> filtered with
/// <seealso cref="GreekLowerCaseFilter"/>, <seealso cref="StandardFilter"/>,
/// <seealso cref="StopFilter"/>, and <seealso cref="GreekStemFilter"/> </returns>
protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
{
    Tokenizer source = new StandardTokenizer(matchVersion, reader);
    // Greek-specific lower-casing runs before StandardFilter in this chain.
    TokenStream result = new GreekLowerCaseFilter(matchVersion, source);
    if (matchVersion.onOrAfter(Version.LUCENE_31))
    {
        result = new StandardFilter(matchVersion, result);
    }
    result = new StopFilter(matchVersion, result, stopwords);
    if (matchVersion.onOrAfter(Version.LUCENE_31))
    {
        // Stemming only for 3.1+, preserving pre-3.1 index compatibility.
        result = new GreekStemFilter(result);
    }
    return (new TokenStreamComponents(source, result));
}
/// <summary>
/// Creates a <see cref="TokenStreamComponents"/>
/// which tokenizes all the text in the provided <see cref="TextReader"/>.
/// </summary>
/// <returns> A <see cref="TokenStreamComponents"/>
/// built from an <see cref="StandardTokenizer"/> filtered with
/// <see cref="StandardFilter"/>, <see cref="TurkishLowerCaseFilter"/>,
/// <see cref="StopFilter"/>, <see cref="SetKeywordMarkerFilter"/> if a stem
/// exclusion set is provided and <see cref="SnowballFilter"/>. </returns>
protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new StandardTokenizer(m_matchVersion, reader);
    TokenStream result = new StandardFilter(m_matchVersion, source);
    // Apostrophe stripping only for 4.8+.
    if (m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_48))
    {
        result = new ApostropheFilter(result);
    }
    // Turkish-specific lower-casing rather than the generic LowerCaseFilter.
    result = new TurkishLowerCaseFilter(result);
    result = new StopFilter(m_matchVersion, result, m_stopwords);
    if (stemExclusionSet.Any())
    {
        result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    }
    result = new SnowballFilter(result, new TurkishStemmer());
    return (new TokenStreamComponents(source, result));
}
/// <summary>
/// Tokenizes the reader with a StandardTokenizer, then normalizes, lower-cases,
/// strips the default English stop words, and injects synonyms from SynonymEngine.
/// </summary>
public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
{
    TokenStream stream = new StandardTokenizer(reader);          // create the tokenizer
    stream = new StandardFilter(stream);                         // normalize the StandardTokenizer output
    stream = new LowerCaseFilter(stream);                        // make sure everything is lower case
    stream = new StopFilter(stream, StopAnalyzer.ENGLISH_STOP_WORDS); // default StopAnalyzer stop-word list
    stream = new SynonymFilter(stream, SynonymEngine);           // inject the synonyms
    return stream;                                               // the fully built token stream
}
// Sets up two analyzers: an exact-match StandardAnalyzer and a fuzzy analyzer
// that feeds standard tokens through a Beider-Morse phonetic filter.
public DynamicListRecognizer([CallerFilePath] string callerPath = "", [CallerLineNumber] int callerLine = 0)
    : base(callerPath, callerLine)
{
    this.exactAnalyzer = new StandardAnalyzer(Utils.LuceneVersion.LUCENE_48);
    this.fuzzyAnalyzer = Analyzer.NewAnonymous((field, textReader) =>
    {
        Tokenizer tokenizer = new StandardTokenizer(Utils.LuceneVersion.LUCENE_48, textReader);
        // TokenStream stream = new DoubleMetaphoneFilter(tokenizer, 6, false);
        // Beider-Morse configuration: generic name rules, approximate match
        // rule type, automatic language detection.
        var factory = new BeiderMorseFilterFactory(new Dictionary<string, string>()
        {
            { "nameType", NameType.GENERIC.ToString() },
            { "ruleType", RuleType.APPROX.ToString() },
            { "languageSet", "auto" }
        });
        TokenStream stream = factory.Create(tokenizer);
        return (new TokenStreamComponents(tokenizer, stream));
    });
}
/// <summary>
/// Builds a shingle (word n-gram) analysis chain: standard tokenization,
/// shingling between minGramSize and maxGramSize, lower-casing, then English
/// stop-word removal.
/// </summary>
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    var tokenizer = new StandardTokenizer(version, reader);
    var shingler = new ShingleFilter(tokenizer, minGramSize, maxGramSize);
    // Collapsed the original redundant if/else — both branches just forwarded
    // the flag, so pass it straight through.
    shingler.SetOutputUnigrams(this.ShowUnigrams);
    var filter = new StopFilter(version, new LowerCaseFilter(version, shingler), StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    return new TokenStreamComponents(tokenizer, filter);
}
/// <summary>
/// Converts a list of sentences into a document: each sentence is segmented
/// with StandardTokenizer and filtered against the core stop-word dictionary.
/// (Original comment: "将句子列表转化为文档" — convert a sentence list to a document.)
/// </summary>
/// <param name="sentenceList">sentences to segment</param>
/// <returns>one word list per input sentence, in order</returns>
private static List<List<String>> convertSentenceListToDocument(List<String> sentenceList)
{
    var document = new List<List<String>>(sentenceList.Count);
    foreach (String line in sentenceList)
    {
        var words = new List<String>();
        foreach (Term t in StandardTokenizer.segment(line.ToCharArray()))
        {
            // Skip stop words; keep only terms the dictionary admits.
            if (!CoreStopWordDictionary.shouldInclude(t))
            {
                continue;
            }
            words.Add(t.word);
        }
        document.Add(words);
    }
    return document;
}
/// <summary>Constructs a <see cref="StandardTokenizer" /> filtered by a <see cref="StandardFilter" />,
/// a <see cref="LowerCaseFilter" /> and a <see cref="StopFilter" />, followed by
/// best-bets word-form expansion and duplicate-token removal.
/// </summary>
public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
{
    StandardTokenizer tokenStream = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(tokenStream);
    result = new LowerCaseFilter(result);
    // Stop-word removal is optional; skipped when no stop set was configured.
    if (stopSet != null)
    {
        result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), result, stopSet);
    }
    //Now, our Stemming filter goes here
    result = new BestBetsWordFormsFilter(result);
    //This will remove duplicate keywords - bad for best bets/term count matching
    result = new RemoveDuplicatesTokenFilter(result);
    return (result);
}
/// <summary>
/// Creates <see cref="TokenStreamComponents"/>
/// used to tokenize all the text in the provided <see cref="TextReader"/>.
/// </summary>
/// <returns> <see cref="TokenStreamComponents"/>
/// built from a <see cref="StandardTokenizer"/> filtered with
/// <see cref="StandardFilter"/>, <see cref="ElisionFilter"/>,
/// <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
/// <see cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
/// provided, and <see cref="FrenchLightStemFilter"/> </returns>
protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
#pragma warning disable 612, 618
    if (m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
#pragma warning restore 612, 618
    {
        // Modern (3.1+) chain: elision and lower-casing before stemming.
        Tokenizer source = new StandardTokenizer(m_matchVersion, reader);
        TokenStream result = new StandardFilter(m_matchVersion, source);
        result = new ElisionFilter(result, DEFAULT_ARTICLES);
        result = new LowerCaseFilter(m_matchVersion, result);
        result = new StopFilter(m_matchVersion, result, m_stopwords);
        if (excltable.Count > 0)
        {
            result = new SetKeywordMarkerFilter(result, excltable);
        }
#pragma warning disable 612, 618
        if (m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
#pragma warning restore 612, 618
        {
            result = new FrenchLightStemFilter(result);
        }
        else
        {
            result = new SnowballFilter(result, new Tartarus.Snowball.Ext.FrenchStemmer());
        }
        return (new TokenStreamComponents(source, result));
    }
    else
    {
        // Legacy (pre-3.1) chain: no elision, deprecated FrenchStemFilter,
        // and lower-casing applied only after stemming.
        Tokenizer source = new StandardTokenizer(m_matchVersion, reader);
        TokenStream result = new StandardFilter(m_matchVersion, source);
        result = new StopFilter(m_matchVersion, result, m_stopwords);
        if (excltable.Count > 0)
        {
            result = new SetKeywordMarkerFilter(result, excltable);
        }
#pragma warning disable 612, 618
        result = new FrenchStemFilter(result);
#pragma warning restore 612, 618
        // Convert to lowercase after stemming!
        return (new TokenStreamComponents(source, new LowerCaseFilter(m_matchVersion, result)));
    }
}
/// <summary>
/// Creates <see cref="TokenStreamComponents"/>
/// used to tokenize all the text in the provided <see cref="TextReader"/>.
/// </summary>
/// <returns> <see cref="TokenStreamComponents"/>
/// built from a <see cref="StandardTokenizer"/> filtered with
/// <see cref="StandardFilter"/>, <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
/// and <see cref="CzechStemFilter"/> (only if version is >= LUCENE_31). If
/// a version is >= LUCENE_31 and a stem exclusion set is provided via
/// <see cref="CzechAnalyzer(LuceneVersion, CharArraySet, CharArraySet)"/> a
/// <see cref="SetKeywordMarkerFilter"/> is added before
/// <see cref="CzechStemFilter"/>. </returns>
protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new StandardTokenizer(m_matchVersion, reader);
    TokenStream result = new StandardFilter(m_matchVersion, source);
    result = new LowerCaseFilter(m_matchVersion, result);
    result = new StopFilter(m_matchVersion, result, m_stopwords);
#pragma warning disable 612, 618
    if (m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
#pragma warning restore 612, 618
    {
        // Keyword-mark exclusions first so the Czech stemmer skips them.
        if (this.stemExclusionTable.Count > 0)
        {
            result = new SetKeywordMarkerFilter(result, stemExclusionTable);
        }
        result = new CzechStemFilter(result);
    }
    return (new TokenStreamComponents(source, result));
}
// Verifies CJKBigramFilter offsets when a MappingCharFilter expands single
// input chars ("a" -> 一二, "b" -> 二三) before tokenization.
public virtual void TestChangedOffsets()
{
    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    builder.Add("a", "一二");
    builder.Add("b", "二三");
    NormalizeCharMap norm = builder.Build();

    Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
    {
        Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
        return (new TokenStreamComponents(tokenizer, new CJKBigramFilter(tokenizer)));
    }, initReader: (fieldName, reader) => new MappingCharFilter(norm, reader));

    AssertAnalyzesTo(analyzer, "ab",
        new string[] { "一二", "二二", "二三" },
        new int[] { 0, 0, 1 },
        new int[] { 1, 1, 2 });

    // note: offsets are strange since this is how the charfilter maps them...
    // before bigramming, the 4 tokens look like:
    // { 0, 0, 1, 1 },
    // { 0, 1, 1, 2 }
}
/// <summary>
/// Creates a <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
/// which tokenizes all the text in the provided <seealso cref="Reader"/>.
/// </summary>
/// <returns> A <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
/// built from an <seealso cref="StandardTokenizer"/> filtered with
/// <seealso cref="StandardFilter"/>, <seealso cref="EnglishPossessiveFilter"/>,
/// <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>,
/// <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
/// provided and <seealso cref="PorterStemFilter"/>. </returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    // prior to this we get the classic behavior, standardfilter does it for us.
    if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
    {
        result = new EnglishPossessiveFilter(matchVersion, result);
    }
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    if (stemExclusionSet.Any())
    {
        result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    }
    result = new PorterStemFilter(result);
    return (new TokenStreamComponents(source, result));
}
/// <summary>
/// Creates a <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
/// which tokenizes all the text in the provided <seealso cref="Reader"/>.
/// </summary>
/// <returns> A <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
/// built from an <seealso cref="StandardTokenizer"/> filtered with
/// <seealso cref="StandardFilter"/>, <seealso cref="TurkishLowerCaseFilter"/>,
/// <seealso cref="StopFilter"/>, <seealso cref="SetKeywordMarkerFilter"/> if a stem
/// exclusion set is provided and <seealso cref="SnowballFilter"/>. </returns>
protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
{
    Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    // Apostrophe stripping only for 4.8+.
    if (matchVersion.onOrAfter(Version.LUCENE_48))
    {
        result = new ApostropheFilter(result);
    }
    // Turkish-specific lower-casing rather than the generic LowerCaseFilter.
    result = new TurkishLowerCaseFilter(result);
    result = new StopFilter(matchVersion, result, stopwords);
    if (!stemExclusionSet.Empty)
    {
        result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    }
    result = new SnowballFilter(result, new TurkishStemmer());
    return (new TokenStreamComponents(source, result));
}
/// <summary>
/// Creates a TokenStream which tokenizes all the text in the provided Reader.
/// </summary>
/// <returns>
/// A TokenStream built from a StandardTokenizer filtered with
/// StandardFilter, StopFilter, FrenchStemFilter and LowerCaseFilter
/// </returns>
/// <exception cref="ArgumentNullException">if fieldName or reader is null</exception>
public override TokenStream TokenStream(String fieldName, TextReader reader)
{
    // ArgumentNullException is the specific type for null arguments; it
    // derives from ArgumentException, so existing catch blocks still work.
    if (fieldName == null)
    {
        throw new ArgumentNullException(nameof(fieldName), "fieldName must not be null");
    }
    if (reader == null)
    {
        // Also fixes the original message's missing space ("readermust").
        throw new ArgumentNullException(nameof(reader), "reader must not be null");
    }
    TokenStream result = new StandardTokenizer(reader);
    result = new StandardFilter(result);
    result = new StopFilter(result, stoptable);
    result = new FrenchStemFilter(result, excltable);
    // Convert to lowercase after stemming!
    result = new LowerCaseFilter(result);
    return (result);
}
// Spanish analysis chain: standard tokenization (Lucene 2.9 rules),
// lower-casing, optional stop-word removal, ASCII folding, then Snowball
// stemming.
public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
{
    TokenStream result = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_29, reader);
    //result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    if (STOP_WORDS != null)
    {
        result = new StopFilter(false, result, STOP_WORDS);
    }
    // Fold accented characters to their ASCII equivalents before stemming.
    result = new ASCIIFoldingFilter(result);
    // we are using a distinct version of the Spanish stemmer, called Spanish2
    // Please check if this class can be found in the Snowball library, the relative path
    // should be: Snowball\SF\Snowball\Ext\
    // just in case, I would leave a copy of this class in this project
    // NOTE(review): the comment above says "Spanish2" but the name passed is
    // "Spanish" — confirm which stemmer is actually intended here.
    result = new SnowballFilter(result, "Spanish");
    return (result);
}
/// <summary>
/// Creates <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
/// used to tokenize all the text in the provided <seealso cref="Reader"/>.
/// </summary>
/// <returns> <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
/// built from a <seealso cref="StandardTokenizer"/> filtered with
/// <seealso cref="StandardFilter"/>, <seealso cref="ElisionFilter"/>,
/// <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>,
/// <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
/// provided, and <seealso cref="FrenchLightStemFilter"/> </returns>
protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
{
    if (matchVersion.onOrAfter(Version.LUCENE_31))
    {
        // Modern (3.1+) chain: elision and lower-casing before stemming.
        Tokenizer source = new StandardTokenizer(matchVersion, reader);
        TokenStream result = new StandardFilter(matchVersion, source);
        result = new ElisionFilter(result, DEFAULT_ARTICLES);
        result = new LowerCaseFilter(matchVersion, result);
        result = new StopFilter(matchVersion, result, stopwords);
        if (!excltable.Empty)
        {
            result = new SetKeywordMarkerFilter(result, excltable);
        }
        if (matchVersion.onOrAfter(Version.LUCENE_36))
        {
            result = new FrenchLightStemFilter(result);
        }
        else
        {
            result = new SnowballFilter(result, new org.tartarus.snowball.ext.FrenchStemmer());
        }
        return (new TokenStreamComponents(source, result));
    }
    else
    {
        // Legacy (pre-3.1) chain: no elision, deprecated FrenchStemFilter,
        // and lower-casing applied only after stemming.
        Tokenizer source = new StandardTokenizer(matchVersion, reader);
        TokenStream result = new StandardFilter(matchVersion, source);
        result = new StopFilter(matchVersion, result, stopwords);
        if (!excltable.Empty)
        {
            result = new SetKeywordMarkerFilter(result, excltable);
        }
        result = new FrenchStemFilter(result);
        // Convert to lowercase after stemming!
        return (new TokenStreamComponents(source, new LowerCaseFilter(matchVersion, result)));
    }
}
// Analysis chain with user-configurable stop-word removal and Porter stemming,
// both toggled via SettingsViewModel flags.
override public TokenStream TokenStream(string fieldName, TextReader reader)
{
    StandardTokenizer tokenStream = new StandardTokenizer(VERSION, reader);
    // Cap token length to avoid pathological inputs producing huge tokens.
    tokenStream.MaxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
    TokenStream result = new StandardFilter(tokenStream);
    result = new LowerCaseFilter(result);
    if (SettingsViewModel.Instance.StopWords == true)
    {
        result = new StopFilter(enableSPI, result, StopSet);
    }
    if (SettingsViewModel.Instance.Stemming == true)
    {
        result = new PorterStemFilter(result);
    }
    return (result);
}
/// <summary>
/// Creates a <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
/// which tokenizes all the text in the provided <seealso cref="Reader"/>.
/// </summary>
/// <returns> A <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
/// built from an <seealso cref="StandardTokenizer"/> filtered with
/// <seealso cref="StandardFilter"/>, <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>,
/// <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
/// provided and <seealso cref="PortugueseLightStemFilter"/>. </returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    if (stemExclusionSet.Count > 0)
    {
        result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    }
    // Light stemmer for 3.6+, Snowball stemmer for earlier versions.
    if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
    {
        result = new PortugueseLightStemFilter(result);
    }
    else
    {
        result = new SnowballFilter(result, new Tartarus.Snowball.Ext.PortugueseStemmer());
    }
    return (new TokenStreamComponents(source, result));
}
/// <summary>
/// Creates <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
/// used to tokenize all the text in the provided <seealso cref="Reader"/>.
/// </summary>
/// <returns> <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
/// built from a <seealso cref="StandardTokenizer"/> filtered with
/// <seealso cref="StandardFilter"/>, <seealso cref="LowerCaseFilter"/>, <seealso cref="ThaiWordFilter"/>, and
/// <seealso cref="StopFilter"/> </returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_48))
    {
        // 4.8+ path: dedicated ThaiTokenizer, no ThaiWordFilter needed.
        Tokenizer source = new ThaiTokenizer(reader);
        TokenStream result = new LowerCaseFilter(matchVersion, source);
        result = new StopFilter(matchVersion, result, stopwords);
        return (new TokenStreamComponents(source, result));
    }
    else
    {
        // Legacy path: StandardTokenizer plus ThaiWordFilter for word breaks.
        Tokenizer source = new StandardTokenizer(matchVersion, reader);
        TokenStream result = new StandardFilter(matchVersion, source);
        if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
        {
            result = new LowerCaseFilter(matchVersion, result);
        }
        result = new ThaiWordFilter(matchVersion, result);
        return (new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords)));
    }
}
// CJK analysis: for 3.6+ uses StandardTokenizer with width normalization,
// lower-casing, and bigramming; older versions fall back to CJKTokenizer.
protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
{
    if (matchVersion.onOrAfter(Version.LUCENE_36))
    {
        Tokenizer source = new StandardTokenizer(matchVersion, reader);
        // run the widthfilter first before bigramming, it sometimes combines characters.
        TokenStream result = new CJKWidthFilter(source);
        result = new LowerCaseFilter(matchVersion, result);
        result = new CJKBigramFilter(result);
        return (new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords)));
    }
    else
    {
        // Legacy path: CJKTokenizer with stop words applied directly.
        Tokenizer source = new CJKTokenizer(reader);
        return (new TokenStreamComponents(source, new StopFilter(matchVersion, source, stopwords)));
    }
}
/// <summary>
/// Creates a <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
/// which tokenizes all the text in the provided <seealso cref="Reader"/>.
/// </summary>
/// <returns> A <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
/// built from an <seealso cref="StandardTokenizer"/> filtered with
/// <seealso cref="StandardFilter"/>, a hyphenation <seealso cref="StopFilter"/>,
/// <seealso cref="ElisionFilter"/>, <seealso cref="IrishLowerCaseFilter"/>, <seealso cref="StopFilter"/>,
/// <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
/// provided and <seealso cref="SnowballFilter"/>. </returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    // Strip hyphenation tokens; pre-4.4 behavior suppressed position increments.
    StopFilter s = new StopFilter(matchVersion, result, HYPHENATIONS);
    if (!matchVersion.OnOrAfter(LuceneVersion.LUCENE_44))
    {
        s.EnablePositionIncrements = false;
    }
    result = s;
    result = new ElisionFilter(result, DEFAULT_ARTICLES);
    result = new IrishLowerCaseFilter(result);
    result = new StopFilter(matchVersion, result, stopwords);
    if (stemExclusionSet.Count > 0)
    {
        result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    }
    result = new SnowballFilter(result, new IrishStemmer());
    return (new TokenStreamComponents(source, result));
}
/// <summary>
/// Creates a <see cref="TokenStreamComponents"/>
/// which tokenizes all the text in the provided <see cref="TextReader"/>.
/// </summary>
/// <returns> A <see cref="TokenStreamComponents"/>
/// built from an <see cref="StandardTokenizer"/> filtered with
/// <see cref="StandardFilter"/>, <see cref="EnglishPossessiveFilter"/>,
/// <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
/// <see cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
/// provided and <see cref="PorterStemFilter"/>. </returns>
protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new StandardTokenizer(m_matchVersion, reader);
    TokenStream result = new StandardFilter(m_matchVersion, source);
    // prior to this we get the classic behavior, standardfilter does it for us.
#pragma warning disable 612, 618
    if (m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
#pragma warning restore 612, 618
    {
        result = new EnglishPossessiveFilter(m_matchVersion, result);
    }
    result = new LowerCaseFilter(m_matchVersion, result);
    result = new StopFilter(m_matchVersion, result, m_stopwords);
    if (stemExclusionSet.Count > 0)
    {
        result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    }
    result = new PorterStemFilter(result);
    return (new TokenStreamComponents(source, result));
}
// CJK analysis: for 3.6+ uses StandardTokenizer with width normalization,
// lower-casing, and bigramming; older versions fall back to the deprecated
// CJKTokenizer.
protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
#pragma warning disable 612, 618
    if (m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
#pragma warning restore 612, 618
    {
        Tokenizer source = new StandardTokenizer(m_matchVersion, reader);
        // run the widthfilter first before bigramming, it sometimes combines characters.
        TokenStream result = new CJKWidthFilter(source);
        result = new LowerCaseFilter(m_matchVersion, result);
        result = new CJKBigramFilter(result);
        return (new TokenStreamComponents(source, new StopFilter(m_matchVersion, result, m_stopwords)));
    }
    else
    {
#pragma warning disable 612, 618
        Tokenizer source = new CJKTokenizer(reader);
#pragma warning restore 612, 618
        return (new TokenStreamComponents(source, new StopFilter(m_matchVersion, source, m_stopwords)));
    }
}
/// <summary>
/// Tokenizes <paramref name="content"/> with the Lucene standard chain
/// (StandardTokenizer → StandardFilter → LowerCaseFilter → StopFilter) and returns the
/// surviving terms in order.
/// </summary>
/// <param name="content">Raw text to tokenize.</param>
/// <param name="config">Supplies the stop-word list.</param>
/// <returns>The lower-cased, stop-filtered terms.</returns>
static List<string> TokenizeStandard(string content, TokenizeConfig config)
{
    StringReader reader = new StringReader(content);
    TokenStream result = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_24, reader);
    var stophash = StopFilter.MakeStopSet(config.StopWords);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    result = new StopFilter(true, result, stophash, true);

    List<string> words = new List<string>();
    try
    {
        // Standard consumer workflow: Reset, IncrementToken until exhausted, then End.
        result.Reset();
        TermAttribute termattr = (TermAttribute)result.GetAttribute(typeof(TermAttribute));
        while (result.IncrementToken())
        {
            words.Add(termattr.Term());
        }
        result.End();
    }
    finally
    {
        // The original leaked the stream (and its underlying reader); always close it.
        result.Close();
    }
    return words;
}
/// <summary>
/// Applies the Spanish stemmer to every token in <paramref name="tokenStream"/> and returns a
/// new <see cref="StandardTokenizer"/> over the space-joined stemmed terms.
/// </summary>
/// <remarks>
/// NOTE(review): each token's text is recovered by parsing <c>Token.ToString()</c> — presumably
/// of the form "(text,start,end)" — which is fragile; verify against the Lucene version in use.
/// </remarks>
public TokenStream SpanishSteammer(TokenStream tokenStream)
{
    IStemmer stemmer = new SpanishStemmer();
    string stemmed = string.Empty;

    Lucene.Net.Analysis.Token token;
    while ((token = tokenStream.Next()) != null)
    {
        string raw = token.ToString();
        string[] parts = raw.Split(',');
        // Drop the leading '(' and keep only the text before the first comma.
        string text = raw.Substring(1, parts[0].Length - 1);
        stemmed = stemmed + " " + stemmer.Stem(text);
    }

    return new StandardTokenizer(new System.IO.StringReader(stemmed));
}
/// <summary>
/// Creates the <see cref="TokenStreamComponents"/> used to tokenize all the text in the
/// provided <see cref="Reader"/>.
/// </summary>
/// <returns>
/// Components built from a <see cref="StandardTokenizer"/> filtered with
/// <see cref="LowerCaseFilter"/>, <see cref="StandardFilter"/>, <see cref="StopFilter"/>,
/// <see cref="SetKeywordMarkerFilter"/> when an exclusion table is set, and
/// <see cref="BrazilianStemFilter"/>.
/// </returns>
protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    TokenStream stream = new LowerCaseFilter(matchVersion, tokenizer);
    stream = new StandardFilter(matchVersion, stream);
    stream = new StopFilter(matchVersion, stream, stopwords);
    if (excltable != null && !excltable.Empty)
    {
        // Shield terms in the exclusion table from stemming.
        stream = new SetKeywordMarkerFilter(stream, excltable);
    }
    return new TokenStreamComponents(tokenizer, new BrazilianStemFilter(stream));
}
// Test analyzer: a bare StandardTokenizer with no downstream filters.
protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
{
    return new TokenStreamComponents(new StandardTokenizer(TEST_VERSION_CURRENT, reader));
}
/// <summary>
/// Builds the CJK analysis chain: for 3.6+ match versions, StandardTokenizer → CJKWidthFilter →
/// LowerCaseFilter → CJKBigramFilter → StopFilter; earlier versions use the legacy
/// <see cref="CJKTokenizer"/>.
/// </summary>
protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
{
    if (!matchVersion.onOrAfter(Version.LUCENE_36))
    {
        // Legacy path: CJKTokenizer emits bigrams itself.
        Tokenizer legacy = new CJKTokenizer(reader);
        return new TokenStreamComponents(legacy, new StopFilter(matchVersion, legacy, stopwords));
    }

    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    // Width-normalize before bigramming; the width filter sometimes combines characters.
    TokenStream stream = new CJKWidthFilter(tokenizer);
    stream = new LowerCaseFilter(matchVersion, stream);
    stream = new CJKBigramFilter(stream);
    return new TokenStreamComponents(tokenizer, new StopFilter(matchVersion, stream, stopwords));
}
/// <summary>
/// Creates a <see cref="TokenStreamComponents"/> which tokenizes all the text in the provided
/// <see cref="Reader"/>.
/// </summary>
/// <returns>
/// Components built from a <see cref="StandardTokenizer"/> filtered with
/// <see cref="StandardFilter"/>, <see cref="SoraniNormalizationFilter"/>,
/// <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>, <see cref="SetKeywordMarkerFilter"/>
/// when a stem exclusion set is provided, and <see cref="SoraniStemFilter"/>.
/// </returns>
protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    TokenStream stream = new StandardFilter(matchVersion, tokenizer);
    stream = new SoraniNormalizationFilter(stream);
    stream = new LowerCaseFilter(matchVersion, stream);
    stream = new StopFilter(matchVersion, stream, stopwords);
    if (!stemExclusionSet.Empty)
    {
        // Shield excluded terms from the stemmer below.
        stream = new SetKeywordMarkerFilter(stream, stemExclusionSet);
    }
    return new TokenStreamComponents(tokenizer, new SoraniStemFilter(stream));
}
/// <summary>
/// Constructs a <see cref="StandardTokenizer"/> filtered by a <see cref="StandardFilter"/>,
/// a lower-case filter (Turkish-aware when required), an optional <see cref="StopFilter"/>,
/// and a <see cref="SnowballFilter"/> for the configured stemmer name.
/// </summary>
public override TokenStreamComponents createComponents(string fieldName, Reader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    TokenStream stream = new StandardFilter(matchVersion, tokenizer);

    bool modern = matchVersion.onOrAfter(Version.LUCENE_31);
    // Remove the possessive 's before the English-family stemmers.
    if (modern && (name.Equals("English") || name.Equals("Porter") || name.Equals("Lovins")))
    {
        stream = new EnglishPossessiveFilter(stream);
    }

    if (modern && name.Equals("Turkish"))
    {
        // Turkish needs its own lowercasing; the stemmer expects it.
        stream = new TurkishLowerCaseFilter(stream);
    }
    else
    {
        stream = new LowerCaseFilter(matchVersion, stream);
    }

    if (stopSet != null)
    {
        stream = new StopFilter(matchVersion, stream, stopSet);
    }
    return new TokenStreamComponents(tokenizer, new SnowballFilter(stream, name));
}
// Test analyzer: a StandardTokenizer feeding a MockGraphTokenFilter seeded from random().
protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
{
    Tokenizer source = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
    return new TokenStreamComponents(source, new MockGraphTokenFilter(random(), source));
}
/// <summary>
/// Creates the <see cref="TokenStreamComponents"/> used to tokenize all the text in the
/// provided <see cref="Reader"/>.
/// </summary>
/// <returns>
/// Components built from a <see cref="StandardTokenizer"/> filtered with
/// <see cref="GreekLowerCaseFilter"/>, <see cref="StandardFilter"/> (3.1+ only),
/// <see cref="StopFilter"/>, and <see cref="GreekStemFilter"/> (3.1+ only).
/// </returns>
protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    TokenStream stream = new GreekLowerCaseFilter(matchVersion, tokenizer);
    bool modern = matchVersion.onOrAfter(Version.LUCENE_31);
    if (modern)
    {
        stream = new StandardFilter(matchVersion, stream);
    }
    stream = new StopFilter(matchVersion, stream, stopwords);
    if (modern)
    {
        stream = new GreekStemFilter(stream);
    }
    return new TokenStreamComponents(tokenizer, stream);
}
/// <summary>
/// Creates a <see cref="TokenStreamComponents"/> which tokenizes all the text in the provided
/// <see cref="Reader"/>.
/// </summary>
/// <returns>
/// Components built from a <see cref="StandardTokenizer"/> filtered with
/// <see cref="StandardFilter"/>, <see cref="ElisionFilter"/> (3.6+ only),
/// <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>, <see cref="SetKeywordMarkerFilter"/>
/// when a stem exclusion set is provided, and a Catalan <see cref="SnowballFilter"/>.
/// </returns>
protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    TokenStream stream = new StandardFilter(matchVersion, tokenizer);
    if (matchVersion.onOrAfter(Version.LUCENE_36))
    {
        stream = new ElisionFilter(stream, DEFAULT_ARTICLES);
    }
    stream = new LowerCaseFilter(matchVersion, stream);
    stream = new StopFilter(matchVersion, stream, stopwords);
    if (!stemExclusionSet.Empty)
    {
        // Shield excluded terms from the stemmer below.
        stream = new SetKeywordMarkerFilter(stream, stemExclusionSet);
    }
    stream = new SnowballFilter(stream, new CatalanStemmer());
    return new TokenStreamComponents(tokenizer, stream);
}
// Test analyzer pinned to Lucene 4.0: a bare StandardTokenizer with no filters.
protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
{
    return new TokenStreamComponents(new StandardTokenizer(Version.LUCENE_40, reader));
}
/// <summary>
/// Creates the <see cref="TokenStreamComponents"/> used to tokenize all the text in the
/// provided <see cref="Reader"/>.
/// </summary>
/// <returns>
/// For 3.1+ match versions: StandardTokenizer → StandardFilter → ElisionFilter →
/// LowerCaseFilter → StopFilter → optional SetKeywordMarkerFilter →
/// FrenchLightStemFilter (3.6+) or French SnowballFilter. Earlier versions use the legacy
/// FrenchStemFilter chain, which lowercases after stemming.
/// </returns>
protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
{
    if (matchVersion.onOrAfter(Version.LUCENE_31))
    {
        Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
        TokenStream stream = new StandardFilter(matchVersion, tokenizer);
        stream = new ElisionFilter(stream, DEFAULT_ARTICLES);
        stream = new LowerCaseFilter(matchVersion, stream);
        stream = new StopFilter(matchVersion, stream, stopwords);
        if (!excltable.Empty)
        {
            // Shield excluded terms from the stemmer below.
            stream = new SetKeywordMarkerFilter(stream, excltable);
        }
        if (matchVersion.onOrAfter(Version.LUCENE_36))
        {
            stream = new FrenchLightStemFilter(stream);
        }
        else
        {
            stream = new SnowballFilter(stream, new org.tartarus.snowball.ext.FrenchStemmer());
        }
        return new TokenStreamComponents(tokenizer, stream);
    }
    else
    {
        Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
        TokenStream stream = new StandardFilter(matchVersion, tokenizer);
        stream = new StopFilter(matchVersion, stream, stopwords);
        if (!excltable.Empty)
        {
            stream = new SetKeywordMarkerFilter(stream, excltable);
        }
        stream = new FrenchStemFilter(stream);
        // Convert to lowercase after stemming!
        return new TokenStreamComponents(tokenizer, new LowerCaseFilter(matchVersion, stream));
    }
}