protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new MockTokenizer(reader);
    CharArraySet stopSet = StopFilter.MakeStopSet(TEST_VERSION_CURRENT, "of");
    return new TokenStreamComponents(tokenizer, new StopFilter(TEST_VERSION_CURRENT, tokenizer, stopSet));
}
// Text preprocessing
internal string DeleteInvalidData(string result, string tipoAnalizador)
{
    TokenStream tokenStream = new StandardTokenizer(Version.LUCENE_30, new System.IO.StringReader(result));
    tokenStream = new StandardFilter(tokenStream);   // removes punctuation
    tokenStream = new LowerCaseFilter(tokenStream);  // lowercases the content
    if (tipoAnalizador == "Español")
    {
        // Filter the content against the Spanish stop-word list
        tokenStream = new StopFilter(true, tokenStream, StopFilter.MakeStopSet(SpanishAnalyzer.SPANISH_STOP_WORDS));
        // Fold characters above 127 in the ASCII table
        tokenStream = new ASCIIFoldingFilter(tokenStream);
        // Stem the words
        tokenStream = SpanishSteammer(tokenStream);
    }
    else
    {
        // Filter the content against the English stop-word list
        tokenStream = new StopFilter(true, tokenStream, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
        // Stem the words
        tokenStream = new PorterStemFilter(tokenStream);
    }
    return GetDataTokens(tokenStream);
}
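GetDataTokens is defined elsewhere in that class. A minimal sketch of what such a helper might look like, assuming comma-separated output and mirroring the ITermAttribute iteration pattern used by AnalizarConsulta further down this page:

// Hypothetical helper: drains a TokenStream into a comma-separated string
// (Lucene.Net 3.0 attribute API).
private static string GetDataTokens(TokenStream tokenStream)
{
    var sb = new System.Text.StringBuilder();
    var termAttr = tokenStream.GetAttribute<ITermAttribute>();
    while (tokenStream.IncrementToken())
    {
        if (sb.Length > 0) sb.Append(',');
        sb.Append(termAttr.Term);
    }
    return sb.ToString();
}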
protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    CharArraySet stopSet = StopFilter.MakeStopSet(TEST_VERSION_CURRENT, "into");
    Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
    TokenFilter filter = new StopFilter(TEST_VERSION_CURRENT, tokenizer, stopSet);
    return new TokenStreamComponents(tokenizer, filter);
}
/// <summary>
/// Builds an analyzer.
/// </summary>
public DutchAnalyzer()
{
    stoptable = StopFilter.MakeStopSet(DUTCH_STOP_WORDS);
    _stemdict.Add("fiets", "fiets");         // otherwise stems to "fiet"
    _stemdict.Add("bromfiets", "bromfiets"); // otherwise stems to "bromfiet"
    _stemdict.Add("ei", "eier");
    _stemdict.Add("kind", "kinder");
}
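A minimal driver sketch for an analyzer like this one (hypothetical code, using the 3.x attribute API; the field name and sample text are illustrative):

// Hypothetical usage sketch: the stem-dictionary entries above override the
// stemmer, so "fiets" survives intact instead of collapsing to "fiet".
var analyzer = new DutchAnalyzer();
TokenStream ts = analyzer.TokenStream("content", new StringReader("de fiets van het kind"));
var termAttr = ts.GetAttribute<ITermAttribute>();
while (ts.IncrementToken())
{
    Console.WriteLine(termAttr.Term); // stop words like "de", "van", "het" are dropped
}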
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new StandardTokenizer(Version.LUCENE_48, reader);
    TokenStream result = new CyrilicToLatinFilter(source);
    result = new LowerCaseFilter(Version.LUCENE_48, result);
    result = new StopFilter(Version.LUCENE_48, result, StopFilter.MakeStopSet(Version.LUCENE_48, STOP_WORDS));
    return new TokenStreamComponents(source, result);
}
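A usage sketch for a CreateComponents-style analyzer like the one above (hypothetical driver code on the Lucene.Net 4.8 attribute API; the sample text is illustrative):

// Hypothetical usage sketch (Lucene.Net 4.8 API).
TokenStream ts = analyzer.GetTokenStream("content", new StringReader("пример текста"));
var termAtt = ts.AddAttribute<ICharTermAttribute>();
ts.Reset();
while (ts.IncrementToken())
{
    // Cyrillic folded to Latin, lowercased, stop words removed
    Console.WriteLine(termAtt.ToString());
}
ts.End();
ts.Dispose();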
public override TokenStream TokenStream(String FieldName, TextReader reader)
{
    TokenStream result = new StandardTokenizer(reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, StopFilter.MakeStopSet(SPANISH_STOP_WORDS), true);
    //result = new PorterStemFilter(result);
    result = SpanishSteammer(result);
    return result;
}
static List<string> TokenizeICTCLAS(string content, TokenizeConfig config)
{
    if (!IsICTCLASInitialized)
    {
        // Pass the path of the Data directory; adjust it to your actual setup.
        if (!NLPIR_Init(datapath, 0, ""))
        {
            throw new Exception("Init ICTCLAS failed!");
        }
        //System.Console.WriteLine("Init ICTCLAS success!");
        IsICTCLASInitialized = true;
    }

    // Add user dictionary entries: "word part-of-speech", e.g. "点击下载 vyou"
    if (config.UserDict != null && config.UserDict.Count != 0)
    {
        foreach (var kvp in config.UserDict)
        {
            NLPIR_AddUserWord(kvp.Key + " " + kvp.Value);
        }
    }

    // Tokenize
    var intPtr = NLPIR_ParagraphProcess(content.ToLower(), 1);
    var str = Marshal.PtrToStringAnsi(intPtr);
    var tokens = str.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
    List<string> words = new List<string>();
    foreach (var token in tokens)
    {
        var index = token.IndexOf('/');
        if (index > 0)
        {
            words.Add(token.Substring(0, index));
        }
    }

    // Filter stop words ("Regex" is presumably a compiled
    // System.Text.RegularExpressions.Regex field defined elsewhere in the class)
    var words2 = new List<string>();
    var stophash = StopFilter.MakeStopSet(config.StopWords);
    foreach (var word in words)
    {
        if (!stophash.Contains(word) && Regex.Match(word).Success)
        {
            words2.Add(word);
        }
    }
    return words2;
}
// Query preprocessing
public string AnalizarConsulta(string consulta, string tipoAnalizador)
{
    ArrayList ListStemsList = new ArrayList();
    TokenStream tokenStream = new StandardTokenizer(Version.LUCENE_30, new System.IO.StringReader(consulta));
    tokenStream = new StandardFilter(tokenStream);   // removes punctuation
    tokenStream = new LowerCaseFilter(tokenStream);  // lowercases the content
    if (tipoAnalizador == "Español")
    {
        // Filter the content against the Spanish stop-word list
        tokenStream = new StopFilter(true, tokenStream, StopFilter.MakeStopSet(SpanishAnalyzer.SPANISH_STOP_WORDS));
        // Fold characters above 127 in the ASCII table
        tokenStream = new ASCIIFoldingFilter(tokenStream);
        // Stem the words
        tokenStream = SpanishSteammer(tokenStream);
    }
    else if (tipoAnalizador == "Ingles")
    {
        // Filter the content against the English stop-word list
        tokenStream = new StopFilter(true, tokenStream, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
        // Stem the words
        tokenStream = new PorterStemFilter(tokenStream);
    }
    else // No language given: just remove stop words in both languages
    {
        tokenStream = new StopFilter(true, tokenStream, StopFilter.MakeStopSet(SpanishAnalyzer.SPANISH_STOP_WORDS));
        tokenStream = new StopFilter(true, tokenStream, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    }

    string term = string.Empty;
    var termAttr = tokenStream.GetAttribute<ITermAttribute>();
    int i = 0;
    while (tokenStream.IncrementToken())
    {
        if (i == 0)
        {
            term = termAttr.Term;
            i++;
        }
        else
        {
            term = term + "," + termAttr.Term;
        }
    }
    return (string.IsNullOrEmpty(term)) ? string.Empty : term.Trim();
}
public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
{
    TokenStream result = new StandardTokenizer(kLuceneVersion, reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    result = new ASCIIFoldingFilter(result);
    result = new StopFilter(false, result, StopFilter.MakeStopSet(kEnglishStopWords));
    // DEFAULT_SIDE = Side.FRONT
    result = new EdgeNGramTokenFilter(result, Lucene.Net.Analysis.NGram.EdgeNGramTokenFilter.DEFAULT_SIDE, 1, 20);
    return result;
}
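The edge n-gram step is what makes this analyzer suitable for search-as-you-type fields: with the FRONT side and sizes 1 to 20, each surviving token is expanded into its front-anchored prefixes, so a token like "school" is indexed as "s", "sc", "sch", "scho", "schoo", "school", and a partially typed query matches the indexed terms directly.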
public override TokenStream TokenStream(String fieldName, TextReader reader)
{
    TokenStream result = new SentenceTokenizer(reader);
    result = new WordTokenizer(result, wordSegment);
    // result = new LowerCaseFilter(result);
    // LowerCaseFilter is no longer needed: SegTokenFilter already lowercases all English characters.
    // Stemming is strict here; this is not a bug, it's a feature :)
    result = new PorterStemFilter(result);
    if (stopWords != null)
    {
        result = new StopFilter(true, result, StopFilter.MakeStopSet(stopWords), false);
    }
    return result;
}
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    TokenStream stream = new LowerCaseFilter(matchVersion, tokenizer);
    stream = new CyrllicToLatinFilter(stream);
    stream = new StopFilter(matchVersion, stream, StopFilter.MakeStopSet(matchVersion, STOP_WORDS));
    stream = new SnowballFilter(stream, new SimpleSerbianStemmer());
    stream = new ASCIIFoldingFilter(stream);
    return new TokenStreamComponents(tokenizer, stream);
}
public void TestEndNotStopWord()
{
    CharArraySet stopWords = StopFilter.MakeStopSet(TEST_VERSION_CURRENT, "to");
    TokenStream stream = new MockTokenizer(new StringReader("go to"));
    TokenStream filter = new SuggestStopFilter(stream, stopWords);
    // The trailing "to" is kept (its keyword flag is true): the user may still
    // be typing a longer word that merely begins with a stop word.
    AssertTokenStreamContents(filter,
        new string[] { "go", "to" },
        new int[] { 0, 3 },
        new int[] { 2, 5 },
        null,
        new int[] { 1, 1 },
        null,
        5,
        new bool[] { false, true },
        true);
}
public void TestMultipleStopWords()
{
    CharArraySet stopWords = StopFilter.MakeStopSet(TEST_VERSION_CURRENT, "to", "the", "a");
    TokenStream stream = new MockTokenizer(new StringReader("go to a the school"));
    TokenStream filter = new SuggestStopFilter(stream, stopWords);
    AssertTokenStreamContents(filter,
        new String[] { "go", "school" },
        new int[] { 0, 12 },
        new int[] { 2, 18 },
        null,
        new int[] { 1, 4 },
        null,
        18,
        new bool[] { false, false },
        true);
}
public void TestSuggestStopFilter()
{
    CharArraySet stopWords = StopFilter.MakeStopSet(TEST_VERSION_CURRENT, "a");
    Analyzer indexAnalyzer = new TestSuggestStopFilterAnalyzer1(this, stopWords);
    Analyzer queryAnalyzer = new TestSuggestStopFilterAnalyzer2(this, stopWords);
    AnalyzingInfixSuggester suggester = new AnalyzingInfixSuggester(TEST_VERSION_CURRENT, NewDirectory(), indexAnalyzer, queryAnalyzer, 3);
    Input[] keys = new Input[]
    {
        new Input("a bob for apples", 10, new BytesRef("foobaz")),
    };
    suggester.Build(new InputArrayIterator(keys));
    IList<Lookup.LookupResult> results = suggester.DoLookup(TestUtil.StringToCharSequence("a", Random()).ToString(), 10, true, true);
    assertEquals(1, results.size());
    assertEquals("a bob for <b>a</b>pples", results[0].key);
    suggester.Dispose();
}
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    TokenStream result = null;
    if (useStopWords)
    {
        ISet<string> stopWords = StopFilter.MakeStopSet(new string[]
        {
            // EN
            "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into",
            "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then",
            "there", "these", "they", "this", "to", "was", "will", "with",
            // NL
            "van", "aan", "dat", "de", "den", "der", "des", "deze", "die", "dit", "door",
            "een", "het", "ik", "is", "je", "na",
            // FR
            "au", "aux", "la", "le", "les"
        });
        result = new PorterStemFilter(new StopFilter(false, new ASCIIFoldingFilter(new LowerCaseFilter(new CDRWhitespaceTokenizer(reader))), stopWords));
    }
    else
    {
        result = new PorterStemFilter(new ASCIIFoldingFilter(new LowerCaseFilter(new CDRWhitespaceTokenizer(reader))));
    }
    return result;
}
/// <summary>
/// Returns a <seealso cref="CharArraySet"/> from wordFiles, which
/// can be a comma-separated list of filenames.
/// </summary>
protected internal CharArraySet GetWordSet(IResourceLoader loader, string wordFiles, bool ignoreCase)
{
    AssureMatchVersion();
    IEnumerable<string> files = SplitFileNames(wordFiles);
    CharArraySet words = null;
    if (files.Count() > 0)
    {
        // The default stopword list has 35 or so words, but maybe don't make it
        // that big to start
        words = new CharArraySet(luceneMatchVersion, files.Count() * 10, ignoreCase);
        foreach (string file in files)
        {
            var wlist = GetLines(loader, file.Trim());
            words.UnionWith(StopFilter.MakeStopSet(luceneMatchVersion, wlist, ignoreCase));
        }
    }
    return words;
}
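A hedged sketch of how a filter factory might call GetWordSet from its resource-loading hook. The factory name, the "words" argument key, and the exact base-class member names are assumptions for illustration:

// Hypothetical factory sketch: loads stop words from the files named in the
// factory's "words" argument, then applies them in Create().
public class MyStopFilterFactory : TokenFilterFactory, IResourceLoaderAware
{
    private readonly string wordFiles;
    private CharArraySet stopWords;

    public MyStopFilterFactory(IDictionary<string, string> args) : base(args)
    {
        wordFiles = Get(args, "words");
    }

    public virtual void Inform(IResourceLoader loader)
    {
        stopWords = GetWordSet(loader, wordFiles, ignoreCase: true);
    }

    public override TokenStream Create(TokenStream input)
    {
        return new StopFilter(luceneMatchVersion, input, stopWords);
    }
}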
public virtual void TestAltFillerToken()
{
    Analyzer @delegate = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
    {
        CharArraySet stopSet = StopFilter.MakeStopSet(TEST_VERSION_CURRENT, "into");
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenFilter filter = new StopFilter(TEST_VERSION_CURRENT, tokenizer, stopSet);
        return new TokenStreamComponents(tokenizer, filter);
    });

    ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(@delegate,
        ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
        ShingleFilter.DEFAULT_TOKEN_SEPARATOR, true, false, "--");
    AssertAnalyzesTo(analyzer, "please divide into shingles",
        new string[] { "please", "please divide", "divide", "divide --", "-- shingles", "shingles" },
        new int[] { 0, 0, 7, 7, 19, 19 },
        new int[] { 6, 13, 13, 19, 27, 27 },
        new int[] { 1, 0, 1, 0, 1, 1 });

    analyzer = new ShingleAnalyzerWrapper(@delegate,
        ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
        ShingleFilter.DEFAULT_TOKEN_SEPARATOR, false, false, null);
    AssertAnalyzesTo(analyzer, "please divide into shingles",
        new string[] { "please divide", "divide ", " shingles" },
        new int[] { 0, 7, 19 },
        new int[] { 13, 19, 27 },
        new int[] { 1, 1, 1 });

    analyzer = new ShingleAnalyzerWrapper(@delegate,
        ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
        ShingleFilter.DEFAULT_TOKEN_SEPARATOR, false, false, "");
    AssertAnalyzesTo(analyzer, "please divide into shingles",
        new string[] { "please divide", "divide ", " shingles" },
        new int[] { 0, 7, 19 },
        new int[] { 13, 19, 27 },
        new int[] { 1, 1, 1 });
}
public static List<UrlDocument> BuscarEnIndiceSemantico(String original, Boolean usarEspañol)
{
    // Documents returned by the search
    List<Document> DocumenResult = new List<Document>();
    List<UrlDocument> UrlResult = new List<UrlDocument>();

    // Run the search with the language-appropriate stop-word set and analyzer
    if (usarEspañol)
    {
        DocumenResult = moreLikeThisAnalyzer(original, StopFilter.MakeStopSet(SpanishAnalyzer.SPANISH_STOP_WORDS), new SpanishAnalyzer());
    }
    else
    {
        DocumenResult = moreLikeThisAnalyzer(original, Lucene.Net.Analysis.StopAnalyzer.ENGLISH_STOP_WORDS_SET, new StandardAnalyzer(Version.LUCENE_30));
    }

    // Convert to UrlDocument for web processing
    foreach (Document doc in DocumenResult)
    {
        UrlDocument UrlDoc = new UrlDocument();
        UrlDoc.Id = doc.GetField("Id").StringValue;
        UrlDoc.Tittle = doc.GetField("Title").StringValue;
        UrlDoc.URL = doc.GetField("feed").StringValue;
        UrlDoc.Resume = doc.GetField("Descripcion").StringValue;
        UrlDoc.Tags = doc.GetField("Tags").StringValue;
        UrlDoc.Localizacion_name = doc.GetField("Location").StringValue;
        UrlDoc.Domain = doc.GetField("Domain").StringValue;
        UrlDoc.Datastreams_feed = doc.GetField("DataStreams").StringValue;
        UrlDoc.Website = doc.GetField("Website").StringValue;
        UrlDoc.Elevacion = doc.GetField("Elevacion").StringValue;
        UrlDoc.Latitud = doc.GetField("Latitud").StringValue;
        UrlDoc.Longitud = doc.GetField("Longitud").StringValue;
        // Fields specific to the indexing
        string listaconconceptos = doc.GetField("Conceptos").StringValue;
        UrlDoc.Conceptos = ConvertirenLista(listaconconceptos);
        UrlResult.Add(UrlDoc);
    }
    return UrlResult;
}
static List<string> TokenizeStandard(string content, TokenizeConfig config)
{
    StringReader reader = new StringReader(content);
    TokenStream result = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_24, reader);
    var stophash = StopFilter.MakeStopSet(config.StopWords);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    result = new StopFilter(true, result, stophash, true);

    // Set up lexicon/invert-lexicon, feature vectors, and word appearance counts
    result.Reset();
    TermAttribute termattr = (TermAttribute)result.GetAttribute(typeof(TermAttribute));
    List<string> words = new List<string>();
    while (result.IncrementToken())
    {
        words.Add(termattr.Term());
    }
    return words;
}
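A hypothetical call site for TokenizeStandard. TokenizeConfig is project-specific; this sketch assumes its StopWords member is an assignable string collection, as its use in MakeStopSet suggests:

// Hypothetical usage sketch (assumes TokenizeConfig.StopWords is settable).
var config = new TokenizeConfig();
config.StopWords = new List<string> { "the", "a", "of" };
List<string> words = TokenizeStandard("The Department of Computer Science", config);
// words -> ["department", "computer", "science"]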
public ArrayList getKeywords(string result, string tipoAnalizador)
{
    ArrayList ListStemsList = new ArrayList();
    TokenStream tokenStream = new StandardTokenizer(new System.IO.StringReader(result));
    tokenStream = new StandardFilter(tokenStream);   // removes punctuation
    tokenStream = new LowerCaseFilter(tokenStream);  // lowercases the content
    if (tipoAnalizador == "Español")
    {
        // Filter the content against the Spanish stop-word list
        tokenStream = new StopFilter(tokenStream, StopFilter.MakeStopSet(SpanishAnalyzer.SPANISH_STOP_WORDS), true);
        // Stemming step (disabled)
        //SpanishAnalyzer ansp = new SpanishAnalyzer();
        //tokenStream = ansp.SpanishSteammer(tokenStream);
    }
    else
    {
        // Filter the content against the English stop-word list
        tokenStream = new StopFilter(tokenStream, StopAnalyzer.ENGLISH_STOP_WORDS, true);
        // Stemming step (disabled)
        //tokenStream = new PorterStemFilter(tokenStream);
    }

    string cadena = "";
    string[] token;
    Lucene.Net.Analysis.Token current;
    while ((current = tokenStream.Next()) != null)
    {
        cadena = current.ToString();
        token = cadena.Split(',');
        cadena = cadena.Substring(1, token[0].Length - 1);
        ListStemsList.Add(cadena);
    }
    return ListStemsList;
}
private static List<string> TokenizeCWB(string content, TokenizeConfig config)
{
    if (_chineseWordBreaker == null)
    {
        _chineseWordBreaker = new ChineseWordBreaker(@"Utils\Lib\WordBreaker\");
    }

    // Tokenize
    var words = _chineseWordBreaker.Tokenize(content);

    // Filter stop words ("Regex" is presumably a compiled
    // System.Text.RegularExpressions.Regex field defined elsewhere in the class)
    var words2 = new List<string>();
    var stophash = StopFilter.MakeStopSet(config.StopWords);
    foreach (var word in words)
    {
        if (!stophash.Contains(word) && Regex.Match(word).Success)
        {
            words2.Add(word);
        }
    }
    return words2;
}
/**
 * Builds an exclusion list from an array of Strings.
 */
public void SetStemExclusionTable(string[] exclusionlist)
{
    excltable = StopFilter.MakeStopSet(exclusionlist);
}
/**
 * Builds an analyzer with the given stop words.
 */
public LithuanianAnalyzer(string[] stopwords)
{
    stoptable = StopFilter.MakeStopSet(stopwords);
}
/**
 * Builds an analyzer with the default stop words ({@link #STOP_WORDS}).
 */
public LithuanianAnalyzer()
{
    stoptable = StopFilter.MakeStopSet(STOP_WORDS);
}
/// <summary> /// Builds an analyzer which removes words in the provided array. /// </summary> /// <param name="stopWords">stop word array</param> public CJKAnalyzer(Version matchVersion, params string[] stopWords) { stopTable = StopFilter.MakeStopSet(stopWords); this.matchVersion = matchVersion; }
public void SetStemExclusionTable(String[] exclusionlist)
{
    exclusionSet = StopFilter.MakeStopSet(exclusionlist);
    PreviousTokenStream = null; // force a new token stream to pick up the new exclusion set
}
public GermanAnalyzer(Version matchVersion, params string[] stopwords)
    : this(matchVersion, StopFilter.MakeStopSet(stopwords))
{
}
/// <summary> /// Builds an analyzer which removes words in the provided array. /// </summary> /// <param name="stopWords">stop word array</param> public CJKAnalyzer(String[] stopWords) { stopTable = StopFilter.MakeStopSet(stopWords); }
public StandardAnalyzer(System.String[] stopwords, bool replaceInvalidAcronym)
    : this(Version.LUCENE_24, StopFilter.MakeStopSet(stopwords))
{
    this.replaceInvalidAcronym = replaceInvalidAcronym;
}
public StandardAnalyzer(System.String[] stopWords)
    : this(Version.LUCENE_24, StopFilter.MakeStopSet(stopWords))
{
}
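A minimal indexing sketch using these string[] overloads (hypothetical driver code on the Lucene.Net 2.x/3.x API; the stop words and field values are illustrative):

// Hypothetical usage sketch: custom stop words are routed through
// StopFilter.MakeStopSet by the constructor above.
var analyzer = new StandardAnalyzer(new[] { "foo", "bar" });
var dir = new RAMDirectory();
var writer = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
var doc = new Document();
doc.Add(new Field("content", "foo likes bar and baz", Field.Store.YES, Field.Index.ANALYZED));
writer.AddDocument(doc); // "foo" and "bar" are dropped; "likes", "and", "baz" are indexed
writer.Dispose();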