/// <summary>
/// Builds the analysis chain for a field: lower-case letter tokenization,
/// standard filtering, then Porter stemming.
/// </summary>
public override TokenStream TokenStream(String fieldName, TextReader reader)
{
    var tokenizer = new LowerCaseTokenizer(reader);
    TokenStream chain = new StandardFilter(tokenizer);
    chain = new PorterStemFilter(chain);
    return chain;
}
/// <summary>
/// An empty override map must leave stemming untouched: "book" passes
/// through the Porter stemmer unchanged.
/// </summary>
public virtual void TestNoOverrides()
{
    var overrideMap = new StemmerOverrideFilter.Builder(true).Build();
    var tokenizer = new KeywordTokenizer(new StringReader("book"));
    TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter(tokenizer, overrideMap));
    AssertTokenStreamContents(stream, new string[] { "book" });
}
/// <summary>
/// Builds the per-field analysis chain, applying in order:
/// whitespace tokenization, ASCII folding (e.g. é becomes e),
/// lower-casing, stop-word removal, and Porter stemming
/// (reduces words to a common stem).
/// </summary>
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    var source = new WhitespaceTokenizer(LeoLuceneVersion.Version, reader);
    TokenStream chain = new ASCIIFoldingFilter(source);
    chain = new LowerCaseFilter(LeoLuceneVersion.Version, chain);
    chain = new StopFilter(LeoLuceneVersion.Version, chain, _words);
    chain = new PorterStemFilter(chain);
    return new TokenStreamComponents(source, chain);
}
/// <summary>
/// Builds the analysis chain: lower-case letter tokenization, stop-word
/// removal, then Porter stemming.
/// </summary>
/// <remarks>
/// Fix: the stop filter must run BEFORE the stemmer. The original chain
/// stemmed first, so stop words altered by the Porter stemmer (e.g.
/// "this" → "thi", "was" → "wa") no longer matched
/// <see cref="EnglishAnalyzer.DefaultStopSet"/> and leaked into the index.
/// </remarks>
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer lowerCaseTokenizer = new LowerCaseTokenizer(LuceneVersion.LUCENE_48, reader);
    TokenStream stopFilter = new StopFilter(LuceneVersion.LUCENE_48, lowerCaseTokenizer, EnglishAnalyzer.DefaultStopSet);
    TokenStream porterStemFilter = new PorterStemFilter(stopFilter);
    return new TokenStreamComponents(lowerCaseTokenizer, porterStemFilter);
}
/// <summary>
/// Case-insensitive overrides: input "BooKeD" matches the "boOkEd" → "books"
/// rule, and the KeywordAttribute set by the override filter stops the
/// Porter stemmer from re-stemming the replacement.
/// </summary>
public virtual void TestIgnoreCase()
{
    var builder = new StemmerOverrideFilter.Builder(true);
    builder.Add("boOkEd", "books");
    var tokenizer = new KeywordTokenizer(new StringReader("BooKeD"));
    TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter(tokenizer, builder.Build()));
    AssertTokenStreamContents(stream, new string[] { "books" });
}
/// <summary>
/// English analysis chain: standard tokenization and filtering, possessive
/// ('s) stripping, ASCII folding, lower-casing, stop-word removal, and
/// finally Porter stemming.
/// </summary>
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(m_matchVersion, reader);
    TokenStream chain = new StandardFilter(m_matchVersion, tokenizer);
    chain = new EnglishPossessiveFilter(m_matchVersion, chain);
    chain = new ASCIIFoldingFilter(chain);
    chain = new LowerCaseFilter(m_matchVersion, chain);
    chain = new StopFilter(m_matchVersion, chain, m_stopwords);
    chain = new PorterStemFilter(chain);
    return new TokenStreamComponents(tokenizer, chain);
}
// Text-processing entry point for query analysis.
/// <summary>
/// Analyzes a query string: tokenizes, lower-cases, removes stop words for
/// the requested language ("Español" or "Ingles"; anything else removes both
/// Spanish and English stop words without stemming), applies the matching
/// stemmer, and returns the surviving terms joined by commas.
/// </summary>
/// <param name="consulta">Raw query text.</param>
/// <param name="tipoAnalizador">Language selector: "Español", "Ingles", or other.</param>
/// <returns>Comma-separated analyzed terms, or an empty string when nothing survives.</returns>
public string AnalizarConsulta(string consulta, string tipoAnalizador)
{
    TokenStream tokenStream = new StandardTokenizer(Version.LUCENE_30, new System.IO.StringReader(consulta));
    tokenStream = new StandardFilter(tokenStream);  // removes punctuation artifacts
    tokenStream = new LowerCaseFilter(tokenStream); // normalizes content to lower case
    if (tipoAnalizador == "Español")
    {
        // Spanish stop words, then fold characters above ASCII 127, then stem.
        tokenStream = new StopFilter(true, tokenStream, StopFilter.MakeStopSet(SpanishAnalyzer.SPANISH_STOP_WORDS));
        tokenStream = new ASCIIFoldingFilter(tokenStream);
        tokenStream = SpanishSteammer(tokenStream);
    }
    else if (tipoAnalizador == "Ingles")
    {
        // English stop words, then Porter stemming.
        tokenStream = new StopFilter(true, tokenStream, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
        tokenStream = new PorterStemFilter(tokenStream);
    }
    else
    {
        // No language specified: only remove stop words, for both languages.
        tokenStream = new StopFilter(true, tokenStream, StopFilter.MakeStopSet(SpanishAnalyzer.SPANISH_STOP_WORDS));
        tokenStream = new StopFilter(true, tokenStream, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    }
    // Collect surviving terms as "a,b,c".
    // Fixed: removed the unused ArrayList local (ListStemsList) and the
    // index-based first-token bookkeeping; an empty-accumulator check is enough.
    string term = string.Empty;
    var termAttr = tokenStream.GetAttribute<ITermAttribute>();
    while (tokenStream.IncrementToken())
    {
        term = term.Length == 0 ? termAttr.Term : term + "," + termAttr.Term;
    }
    return string.IsNullOrEmpty(term) ? string.Empty : term.Trim();
}
/// <summary>
/// Tokenizes this.input with the standard tokenizer, drops standard English
/// stop words, Porter-stems what remains, and appends each resulting term
/// to the tokens collection.
/// </summary>
public void tokenize()
{
    TokenStream stream = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_30, new StringReader(this.input));
    stream = new StopFilter(false, stream, StandardAnalyzer.STOP_WORDS_SET);
    stream = new PorterStemFilter(stream);
    var termAttribute = stream.GetAttribute<Lucene.Net.Analysis.Tokenattributes.ITermAttribute>();
    while (stream.IncrementToken())
    {
        tokens.Add(termAttribute.Term);
    }
}
/// <summary>
/// Builds the chain: sentence tokenization, word segmentation, Porter
/// stemming, and (when stop words are configured) stop-word removal.
/// </summary>
public override TokenStream TokenStream(String fieldName, TextReader reader)
{
    TokenStream chain = new SentenceTokenizer(reader);
    chain = new WordTokenizer(chain, wordSegment);
    // A LowerCaseFilter is unnecessary here: SegTokenFilter already converts
    // all Latin characters to lower case.
    // Porter stemming is deliberately aggressive — per the original author,
    // "this is not a bug, it's a feature" :)
    chain = new PorterStemFilter(chain);
    if (stopWords != null)
    {
        chain = new StopFilter(true, chain, StopFilter.MakeStopSet(stopWords), false);
    }
    return chain;
}
/// <summary>
/// Analysis chain: standard tokenization and filtering, lower-casing,
/// English stop-word removal, then two stemming passes (Porter followed by
/// Snowball English).
/// </summary>
/// <remarks>
/// NOTE(review): PorterStemFilter followed by SnowballFilter(EnglishStemmer)
/// stems every token twice; the Snowball English stemmer is itself a Porter
/// variant, so one of the two looks redundant — confirm before changing,
/// since removing either alters the indexed terms.
/// </remarks>
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    TokenStream chain = new StandardFilter(matchVersion, tokenizer);
    chain = new LowerCaseFilter(matchVersion, chain);
    chain = new StopFilter(matchVersion, chain, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    chain = new PorterStemFilter(chain);
    chain = new SnowballFilter(chain, new EnglishStemmer());
    return new TokenStreamComponents(tokenizer, chain);
}
/// <summary>
/// Randomized test: builds an override map from random realistic Unicode
/// strings (whitespace code points stripped so each key survives whitespace
/// tokenization), feeds a random subset of the keys through a whitespace
/// tokenizer + StemmerOverrideFilter + PorterStemFilter, and checks that
/// each included key is emitted as its mapped replacement value.
/// </summary>
public virtual void TestRandomRealisticWhiteSpace()
{
    IDictionary<string, string> map = new Dictionary<string, string>();
    int numTerms = AtLeast(50);
    for (int i = 0; i < numTerms; i++)
    {
        string randomRealisticUnicodeString = TestUtil.RandomRealisticUnicodeString(Random);
        char[] charArray = randomRealisticUnicodeString.ToCharArray();
        StringBuilder sb = new StringBuilder();
        // Walk the string by code point, dropping whitespace so the key
        // cannot be split by the whitespace tokenizer below.
        for (int j = 0; j < charArray.Length;)
        {
            int cp = Character.CodePointAt(charArray, j, charArray.Length);
            if (!char.IsWhiteSpace((char)cp))
            {
                sb.AppendCodePoint(cp);
            }
            j += Character.CharCount(cp);
        }
        if (sb.Length > 0)
        {
            // Map the cleaned key to a random non-empty simple string.
            string value = TestUtil.RandomSimpleString(Random);
            map[sb.ToString()] = value.Length == 0 ? "a" : value;
        }
    }
    if (map.Count == 0)
    {
        // Guarantee at least one entry so the test body is never vacuous.
        map["booked"] = "books";
    }
    StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(Random.nextBoolean());
    IDictionary<string, string> entrySet = map;
    StringBuilder input = new StringBuilder();
    IList<string> output = new List<string>();
    foreach (KeyValuePair<string, string> entry in entrySet)
    {
        builder.Add(entry.Key, entry.Value);
        // Randomly include each entry in the tokenized input; the
        // output.Count check guarantees at least one is included.
        if (Random.nextBoolean() || output.Count == 0)
        {
            input.Append(entry.Key).Append(" ");
            output.Add(entry.Value);
        }
    }
    Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input.ToString()));
    TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter(tokenizer, builder.Build()));
    AssertTokenStreamContents(stream, output.ToArray());
}
/// <summary>
/// Builds the chain: whitespace tokenization, lower-casing, ASCII folding,
/// optional EN/NL/FR stop-word removal, then Porter stemming.
/// </summary>
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    TokenStream chain = new LowerCaseFilter(new CDRWhitespaceTokenizer(reader));
    chain = new ASCIIFoldingFilter(chain);
    if (useStopWords)
    {
        ISet<string> stopWords = StopFilter.MakeStopSet(new string[] {
            // EN
            "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it",
            "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they",
            "this", "to", "was", "will", "with",
            // NL
            "van", "aan", "dat", "de", "den", "der", "des", "deze", "die", "dit", "door", "een", "het", "ik",
            "is", "je", "na",
            // FR
            "au", "aux", "la", "le", "les" });
        chain = new StopFilter(false, chain, stopWords);
    }
    return new PorterStemFilter(chain);
}
/// <summary>
/// Standard tokenization (tokens capped at DEFAULT_MAX_TOKEN_LENGTH),
/// standard filtering, lower-casing, then optional stop-word removal and
/// optional Porter stemming, both driven by user settings.
/// </summary>
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    var tokenizer = new StandardTokenizer(VERSION, reader);
    tokenizer.MaxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
    TokenStream chain = new StandardFilter(tokenizer);
    chain = new LowerCaseFilter(chain);
    // '== true' kept as-is: the settings properties may be nullable booleans.
    if (SettingsViewModel.Instance.StopWords == true)
    {
        chain = new StopFilter(enableSPI, chain, StopSet);
    }
    if (SettingsViewModel.Instance.Stemming == true)
    {
        chain = new PorterStemFilter(chain);
    }
    return chain;
}
/// <summary>
/// English analysis chain: standard tokenization and filtering, possessive
/// 's stripping, ASCII folding (é → e, © → (c), etc.), lower-casing,
/// stop-word removal, Porter stemming, and — when enabled — edge n-gram
/// expansion of each token.
/// </summary>
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(m_matchVersion, reader);
    TokenStream chain = new StandardFilter(m_matchVersion, tokenizer);
    chain = new EnglishPossessiveFilter(m_matchVersion, chain); // strips 's from words
    chain = new ASCIIFoldingFilter(chain);                      // folds accented chars to ASCII
    chain = new LowerCaseFilter(m_matchVersion, chain);
    chain = new StopFilter(m_matchVersion, chain, EnglishAnalyzer.DefaultStopSet);
    chain = new PorterStemFilter(chain);                        // chops common suffixes, e.g. "stemming" → "stem"
    if (_userNGram)
    {
        // Emit edge n-grams of each token, between _ngramMin and _ngramMax chars.
        chain = new EdgeNGramTokenFilter(m_matchVersion, chain, _ngramMin, _ngramMax);
    }
    return new TokenStreamComponents(tokenizer, chain);
}
// NOTE: machine-converted from Java — hence the lower-camel method names
// (atLeast, random, builder.add/build) and the SetOfKeyValuePairs() helper.
// The Java 'throws IOException' clause has no .NET equivalent and was dropped.
/// <summary>
/// Randomized test: builds an override map from random realistic Unicode
/// strings, then for a random subset of the entries verifies that a
/// KeywordTokenizer + StemmerOverrideFilter + PorterStemFilter emits exactly
/// the mapped replacement value for each key.
/// </summary>
public virtual void testRandomRealisticKeyword()
{
    IDictionary<string, string> map = new Dictionary<string, string>();
    int numTerms = atLeast(50);
    for (int i = 0; i < numTerms; i++)
    {
        string randomRealisticUnicodeString = TestUtil.randomRealisticUnicodeString(random());
        if (randomRealisticUnicodeString.Length > 0)
        {
            // Map the random key to a random non-empty simple string.
            string value = TestUtil.randomSimpleString(random());
            map[randomRealisticUnicodeString] = value.Length == 0 ? "a" : value;
        }
    }
    if (map.Count == 0)
    {
        // Guarantee at least one entry so the test body is never vacuous.
        map["booked"] = "books";
    }
    StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(random().nextBoolean());
    ISet<KeyValuePair<string, string>> entrySet = map.SetOfKeyValuePairs();
    foreach (KeyValuePair<string, string> entry in entrySet)
    {
        builder.add(entry.Key, entry.Value);
    }
    StemmerOverrideMap build = builder.build();
    foreach (KeyValuePair<string, string> entry in entrySet)
    {
        // Check a random subset of the registered overrides.
        if (random().nextBoolean())
        {
            Tokenizer tokenizer = new KeywordTokenizer(new StringReader(entry.Key));
            TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter(tokenizer, build));
            assertTokenStreamContents(stream, new string[] { entry.Value });
        }
    }
}
/// <summary>
/// Minimal analysis chain: lower-case letter tokenization followed by
/// Porter stemming.
/// </summary>
public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
{
    var tokenizer = new LowerCaseTokenizer(reader);
    return new PorterStemFilter(tokenizer);
}
/// <summary>
/// Creates the analyzer components: a regex-separator tokenizer, optionally
/// lower-cased and optionally Porter-stemmed.
/// </summary>
/// <param name="fieldName">Name of the field; must not be null or blank.</param>
/// <returns>The tokenizer/stream component pair.</returns>
/// <exception cref="ArgumentException">When <paramref name="fieldName"/> is null or blank.</exception>
protected override AnalyzerTokenStreamComponents CreateComponents(string fieldName)
{
    if (String.IsNullOrWhiteSpace(fieldName))
    {
        throw new ArgumentException($"{nameof(fieldName)} cannot be null or blank");
    }

    // Split on the configured separator pattern; -1 means no group limit.
    var pattern = Pattern.compile(_separatorChars);
    var tokenizer = new PatternTokenizer(pattern, -1);

    TokenStream stream = _ignoreCase
        ? new LowerCaseFilter(tokenizer) as TokenStream
        : tokenizer as TokenStream;
    if (_enableStemming)
    {
        stream = new PorterStemFilter(stream);
    }
    return new AnalyzerTokenStreamComponents(tokenizer, stream);
}