Example #1
0
            /// <summary>
            /// Builds the analysis chain for a field: a <c>MockTokenizer</c> over
            /// <paramref name="reader"/>, wrapped in a <c>StopFilter</c> whose stop set
            /// contains the single word "of".
            /// </summary>
            /// <param name="fieldName">Field being analyzed; not used by this implementation.</param>
            /// <param name="reader">Source text.</param>
            /// <returns>Components pairing the tokenizer with the stop-filtered stream.</returns>
            protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                Tokenizer   source   = new MockTokenizer(reader);
                TokenStream filtered = new StopFilter(
                    TEST_VERSION_CURRENT,
                    source,
                    StopFilter.MakeStopSet(TEST_VERSION_CURRENT, "of"));

                return new TokenStreamComponents(source, filtered);
            }
        /// <summary>
        /// Text-processing helper: runs <paramref name="result"/> through a language-specific
        /// analysis chain and passes the resulting token stream to <c>GetDataTokens</c>.
        /// </summary>
        /// <param name="result">Raw text to process.</param>
        /// <param name="tipoAnalizador">
        /// "Español" selects the Spanish chain; any other value selects the English chain.
        /// </param>
        /// <returns>The value produced by <c>GetDataTokens</c> for the filtered stream.</returns>
        internal string DeleteInvalidData(string result, string tipoAnalizador)
        {
            var source = new System.IO.StringReader(result);

            // Shared prefix for both languages: tokenize, apply the standard filter, lower-case.
            TokenStream chain = new LowerCaseFilter(
                new StandardFilter(
                    new StandardTokenizer(Version.LUCENE_30, source)));

            if (tipoAnalizador != "Español")
            {
                // English: drop English stop words, then stem with the Porter stemmer.
                chain = new StopFilter(true, chain, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
                chain = new PorterStemFilter(chain);

                return GetDataTokens(chain);
            }

            // Spanish: drop Spanish stop words first ...
            chain = new StopFilter(true, chain, StopFilter.MakeStopSet(SpanishAnalyzer.SPANISH_STOP_WORDS));
            // ... then fold characters above code point 127 to their ASCII equivalents ...
            chain = new ASCIIFoldingFilter(chain);
            // ... and finally apply the Spanish stemming step.
            chain = SpanishSteammer(chain);

            return GetDataTokens(chain);
        }
            /// <summary>
            /// Builds a whitespace <c>MockTokenizer</c> (lower-casing flag set to false) over
            /// <paramref name="reader"/> and removes the stop word "into" from its output.
            /// </summary>
            /// <param name="fieldName">Field being analyzed; not used by this implementation.</param>
            /// <param name="reader">Source text.</param>
            /// <returns>Components pairing the tokenizer with the stop-filtered stream.</returns>
            protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                Tokenizer    source  = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
                CharArraySet ignored = StopFilter.MakeStopSet(TEST_VERSION_CURRENT, "into");

                return new TokenStreamComponents(source, new StopFilter(TEST_VERSION_CURRENT, source, ignored));
            }
 /// <summary>
 /// Builds an analyzer whose stop set is created from <c>DUTCH_STOP_WORDS</c> and whose
 /// stem dictionary is seeded with a fixed list of word-to-stem entries.
 /// </summary>
 public DutchAnalyzer()
 {
     stoptable = StopFilter.MakeStopSet(DUTCH_STOP_WORDS);

     // { word, stem } pairs; they are added in exactly this order.
     string[][] stemEntries =
     {
         new[] { "fiets", "fiets" },         // would otherwise be stemmed to "fiet"
         new[] { "bromfiets", "bromfiets" }, // would otherwise be stemmed to "bromfiet"
         new[] { "ei", "eier" },
         new[] { "kind", "kinder" },
     };

     foreach (string[] entry in stemEntries)
     {
         _stemdict.Add(entry[0], entry[1]);
     }
 }
Example #5
0
        /// <summary>
        /// Builds the analysis chain: <c>StandardTokenizer</c>, then
        /// <c>CyrilicToLatinFilter</c>, then lower-casing, then removal of <c>STOP_WORDS</c>.
        /// </summary>
        /// <param name="fieldName">Field being analyzed; not used by this implementation.</param>
        /// <param name="reader">Source text.</param>
        /// <returns>Components pairing the tokenizer with the end of the filter chain.</returns>
        protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer source = new StandardTokenizer(Version.LUCENE_48, reader);

            // Each stage wraps the previous one; the order matches the summary above.
            TokenStream converted = new CyrilicToLatinFilter(source);
            TokenStream lowered   = new LowerCaseFilter(Version.LUCENE_48, converted);
            TokenStream sink      = new StopFilter(
                Version.LUCENE_48,
                lowered,
                StopFilter.MakeStopSet(Version.LUCENE_48, STOP_WORDS));

            return new TokenStreamComponents(source, sink);
        }
Example #6
0
        /// <summary>
        /// Creates the Spanish token stream for <paramref name="reader"/>: standard
        /// tokenization and filtering, lower-casing, removal of <c>SPANISH_STOP_WORDS</c>,
        /// and finally the <c>SpanishSteammer</c> stemming step.
        /// </summary>
        /// <param name="FieldName">Field being analyzed; not used by this implementation.</param>
        /// <param name="reader">Source text.</param>
        /// <returns>The stream returned by <c>SpanishSteammer</c>.</returns>
        public override TokenStream TokenStream(String FieldName, TextReader reader)
        {
            TokenStream normalized = new LowerCaseFilter(
                new StandardFilter(
                    new StandardTokenizer(reader)));

            TokenStream withoutStopWords = new StopFilter(
                normalized,
                StopFilter.MakeStopSet(SPANISH_STOP_WORDS),
                true);

            // The Spanish stemmer is used here in place of a Porter stem filter.
            return SpanishSteammer(withoutStopWords);
        }
Example #7
0
        /// <summary>
        /// Segments <paramref name="content"/> with the native ICTCLAS/NLPIR library and
        /// returns the words that are not stop words and that match the <c>Regex</c> filter.
        /// </summary>
        /// <param name="content">Text to segment; it is lower-cased before segmentation.</param>
        /// <param name="config">
        /// Supplies the optional user dictionary (<c>UserDict</c>) and the stop words
        /// (<c>StopWords</c>).
        /// </param>
        /// <returns>
        /// The accepted words in segmentation order; an empty list when the segmenter
        /// returns no result.
        /// </returns>
        /// <exception cref="Exception">Thrown when <c>NLPIR_Init</c> reports failure.</exception>
        static List <string> TokenizeICTCLAS(string content, TokenizeConfig config)
        {
            if (!IsICTCLASInitialized)
            {
                // datapath must point at the directory containing the ICTCLAS "Data" files;
                // adjust it to the actual deployment.
                if (!NLPIR_Init(datapath, 0, ""))
                {
                    throw new Exception("Init ICTCLAS failed!");
                }

                IsICTCLASInitialized = true;
            }

            // Register user dictionary entries as "<word> <part-of-speech>", e.g. "点击下载 vyou".
            if (config.UserDict != null && config.UserDict.Count != 0)
            {
                foreach (var kvp in config.UserDict)
                {
                    NLPIR_AddUserWord(kvp.Key + " " + kvp.Value);
                }
            }

            // Tokenize. The result is a space-separated list of "word/tag" items.
            var intPtr = NLPIR_ParagraphProcess(content.ToLower(), 1);
            var str    = Marshal.PtrToStringAnsi(intPtr);
            var words  = new List <string>();

            // PtrToStringAnsi yields null for a zero pointer; treat that as "no tokens"
            // instead of failing with a NullReferenceException on Split.
            if (str == null)
            {
                return(words);
            }

            var stophash = StopFilter.MakeStopSet(config.StopWords);

            foreach (var token in str.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries))
            {
                // Keep only the text before the '/' tag separator; items without a
                // separator, or with nothing before it, are skipped.
                var index = token.IndexOf('/');
                if (index <= 0)
                {
                    continue;
                }

                var word = token.Substring(0, index);
                if (!stophash.Contains(word) && Regex.Match(word).Success)
                {
                    words.Add(word);
                }
            }

            return(words);
        }
        /// <summary>
        /// Text-processing helper: analyzes a query string and returns the surviving terms
        /// joined by commas.
        /// </summary>
        /// <param name="consulta">Query text to analyze.</param>
        /// <param name="tipoAnalizador">
        /// "Español" selects the Spanish chain (stop words, ASCII folding, Spanish stemming);
        /// "Ingles" selects the English chain (stop words, Porter stemming); any other value
        /// only removes the stop words of both languages.
        /// </param>
        /// <returns>
        /// The terms separated by "," with surrounding whitespace trimmed, or an empty
        /// string when no term survives.
        /// </returns>
        public string AnalizarConsulta(string consulta, string tipoAnalizador)
        {
            TokenStream tokenStream = new StandardTokenizer(Version.LUCENE_30, new System.IO.StringReader(consulta));

            tokenStream = new StandardFilter(tokenStream);
            tokenStream = new LowerCaseFilter(tokenStream); // lower-case every token
            if (tipoAnalizador == "Español")
            {
                // Remove Spanish stop words.
                tokenStream = new StopFilter(true, tokenStream, StopFilter.MakeStopSet(SpanishAnalyzer.SPANISH_STOP_WORDS));
                // Fold characters above code point 127 to their ASCII equivalents.
                tokenStream = new ASCIIFoldingFilter(tokenStream);
                // Spanish stemming step.
                tokenStream = SpanishSteammer(tokenStream);
            }
            else if (tipoAnalizador == "Ingles")
            {
                // Remove English stop words.
                tokenStream = new StopFilter(true, tokenStream, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
                // English stemming step.
                tokenStream = new PorterStemFilter(tokenStream);
            }
            else
            {
                // No language given: only remove the stop words of both languages, no stemming.
                tokenStream = new StopFilter(true, tokenStream, StopFilter.MakeStopSet(SpanishAnalyzer.SPANISH_STOP_WORDS));
                tokenStream = new StopFilter(true, tokenStream, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
            }

            var termAttr = tokenStream.GetAttribute <ITermAttribute>();

            // Accumulate in a builder rather than re-allocating the whole string per token.
            var  joined  = new System.Text.StringBuilder();
            bool isFirst = true;

            while (tokenStream.IncrementToken())
            {
                // The separator goes before every term except the first one, even when
                // that first term is empty.
                if (isFirst)
                {
                    isFirst = false;
                }
                else
                {
                    joined.Append(',');
                }

                joined.Append(termAttr.Term);
            }

            return(joined.ToString().Trim());
        }
Example #9
0
            /// <summary>
            /// Creates the token stream for <paramref name="reader"/>: standard tokenization
            /// and filtering, lower-casing, ASCII folding, removal of
            /// <c>kEnglishStopWords</c>, and edge n-grams of length 1 to 20.
            /// </summary>
            /// <param name="fieldName">Field being analyzed; not used by this implementation.</param>
            /// <param name="reader">Source text.</param>
            /// <returns>The edge n-gram filter at the end of the chain.</returns>
            public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
            {
                TokenStream folded = new ASCIIFoldingFilter(
                    new LowerCaseFilter(
                        new StandardFilter(
                            new StandardTokenizer(kLuceneVersion, reader))));

                TokenStream withoutStopWords = new StopFilter(
                    false,
                    folded,
                    StopFilter.MakeStopSet(kEnglishStopWords));

                // Per the original author's note, DEFAULT_SIDE is Side.FRONT.
                return new EdgeNGramTokenFilter(
                    withoutStopWords,
                    Lucene.Net.Analysis.NGram.EdgeNGramTokenFilter.DEFAULT_SIDE,
                    1,
                    20);
            }
Example #10
0
        /// <summary>
        /// Creates the token stream for <paramref name="reader"/>: sentence tokenization,
        /// word segmentation with <c>wordSegment</c>, Porter stemming, and, when
        /// <c>stopWords</c> is not null, stop-word removal.
        /// </summary>
        /// <param name="fieldName">Field being analyzed; not used by this implementation.</param>
        /// <param name="reader">Source text.</param>
        /// <returns>The stemmed stream, wrapped in a stop filter when stop words are configured.</returns>
        public override TokenStream TokenStream(String fieldName, TextReader reader)
        {
            TokenStream sentences = new SentenceTokenizer(reader);
            TokenStream segmented = new WordTokenizer(sentences, wordSegment);

            // No LowerCaseFilter here: per the original author's note, SegTokenFilter has
            // already converted all English characters to lower case.
            // The stemmer is known to be too strict; that is intentional, not a bug.
            TokenStream stemmed = new PorterStemFilter(segmented);

            if (stopWords == null)
            {
                return stemmed;
            }

            return new StopFilter(true, stemmed, StopFilter.MakeStopSet(stopWords), false);
        }
Example #11
0
        /// <summary>
        /// Builds the analysis chain: <c>StandardTokenizer</c>, lower-casing,
        /// <c>CyrllicToLatinFilter</c>, removal of <c>STOP_WORDS</c>, Snowball stemming with
        /// <c>SimpleSerbianStemmer</c>, and ASCII folding.
        /// </summary>
        /// <param name="fieldName">Field being analyzed; not used by this implementation.</param>
        /// <param name="reader">Source text.</param>
        /// <returns>Components pairing the tokenizer with the end of the filter chain.</returns>
        protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer source = new StandardTokenizer(matchVersion, reader);

            // One named stage per filter, applied in the order listed in the summary.
            TokenStream lowered      = new LowerCaseFilter(matchVersion, source);
            TokenStream converted    = new CyrllicToLatinFilter(lowered);
            TokenStream withoutStops = new StopFilter(matchVersion, converted, StopFilter.MakeStopSet(matchVersion, STOP_WORDS));
            TokenStream stemmed      = new SnowballFilter(withoutStops, new SimpleSerbianStemmer());
            TokenStream folded       = new ASCIIFoldingFilter(stemmed);

            return new TokenStreamComponents(source, folded);
        }
        /// <summary>
        /// Checks <c>SuggestStopFilter</c> on the input "go to" with "to" as the only stop
        /// word: both "go" and "to" are expected in the output.
        /// </summary>
        public void TestEndNotStopWord()
        {
            CharArraySet stopWords = StopFilter.MakeStopSet(TEST_VERSION_CURRENT, "to");
            TokenStream  tokens    = new MockTokenizer(new StringReader("go to"));
            TokenStream  filter    = new SuggestStopFilter(tokens, stopWords);

            // Expected values, in the positional order AssertTokenStreamContents takes them.
            string[] expectedTerms         = { "go", "to" };
            int[]    expectedStartOffsets  = { 0, 3 };
            int[]    expectedEndOffsets    = { 2, 5 };
            int[]    expectedPosIncrements = { 1, 1 };
            bool[]   expectedKeywordFlags  = { false, true };

            AssertTokenStreamContents(
                filter,
                expectedTerms,
                expectedStartOffsets,
                expectedEndOffsets,
                null,
                expectedPosIncrements,
                null,
                5,
                expectedKeywordFlags,
                true);
        }
        /// <summary>
        /// Checks <c>SuggestStopFilter</c> on the input "go to a the school" with the stop
        /// words "to", "the" and "a": only "go" and "school" are expected, with a position
        /// increment of 4 on "school" for the three removed words.
        /// </summary>
        public void TestMultipleStopWords()
        {
            CharArraySet stopWords = StopFilter.MakeStopSet(TEST_VERSION_CURRENT, "to", "the", "a");
            TokenStream  stream    = new MockTokenizer(new StringReader("go to a the school"));

            // The filter is built once; a second identical filter over the same stream
            // would only discard the first.
            TokenStream  filter    = new SuggestStopFilter(stream, stopWords);

            AssertTokenStreamContents(filter,
                                      new string[] { "go", "school" },
                                      new int[] { 0, 12 },
                                      new int[] { 2, 18 },
                                      null,
                                      new int[] { 1, 4 },
                                      null,
                                      18,
                                      new bool[] { false, false },
                                      true);
        }
Example #14
0
        /// <summary>
        /// Builds an <c>AnalyzingInfixSuggester</c> with separate index and query analyzers
        /// that share the stop set { "a" }, indexes "a bob for apples", and checks that a
        /// lookup for "a" returns that single entry with the match highlighted in "apples".
        /// </summary>
        public void TestSuggestStopFilter()
        {
            CharArraySet stopWords     = StopFilter.MakeStopSet(TEST_VERSION_CURRENT, "a");
            Analyzer     indexAnalyzer = new TestSuggestStopFilterAnalyzer1(this, stopWords);
            Analyzer     queryAnalyzer = new TestSuggestStopFilterAnalyzer2(this, stopWords);

            AnalyzingInfixSuggester suggester = new AnalyzingInfixSuggester(TEST_VERSION_CURRENT, NewDirectory(), indexAnalyzer, queryAnalyzer, 3);

            // Dispose in a finally block so the suggester is released even when the build,
            // the lookup, or an assertion throws.
            try
            {
                Input[] keys = new Input[] {
                    new Input("a bob for apples", 10, new BytesRef("foobaz")),
                };

                suggester.Build(new InputArrayIterator(keys));
                IList <Lookup.LookupResult> results = suggester.DoLookup(TestUtil.StringToCharSequence("a", Random()).ToString(), 10, true, true);

                assertEquals(1, results.size());
                assertEquals("a bob for <b>a</b>pples", results[0].key);
            }
            finally
            {
                suggester.Dispose();
            }
        }
Example #15
0
        /// <summary>
        /// Creates the token stream for <paramref name="reader"/>:
        /// <c>CDRWhitespaceTokenizer</c>, lower-casing, ASCII folding, optional removal of a
        /// fixed English/Dutch/French stop-word list (when <c>useStopWords</c> is set), and
        /// Porter stemming.
        /// </summary>
        /// <param name="fieldName">Field being analyzed; not used by this implementation.</param>
        /// <param name="reader">Source text.</param>
        /// <returns>The Porter stem filter at the end of the chain.</returns>
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            // Stages shared by both configurations.
            TokenStream chain = new ASCIIFoldingFilter(new LowerCaseFilter(new CDRWhitespaceTokenizer(reader)));

            if (useStopWords)
            {
                ISet <string> stopWords = StopFilter.MakeStopSet(new string[]
                {
                    // EN
                    "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not",
                    "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was",
                    "will", "with",
                    // NL
                    "van", "aan", "dat", "de", "den", "der", "des", "deze", "die", "dit", "door", "een", "het", "ik", "is",
                    "je", "na",
                    // FR
                    "au", "aux", "la", "le", "les"
                });

                chain = new StopFilter(false, chain, stopWords);
            }

            return new PorterStemFilter(chain);
        }
Example #16
0
        /// <summary>
        /// Returns a <see cref="CharArraySet"/> built from <paramref name="wordFiles"/>, which
        /// can be a comma-separated list of filenames.
        /// </summary>
        /// <param name="loader">Resource loader used to read the lines of each file.</param>
        /// <param name="wordFiles">File name list; split with <c>SplitFileNames</c>.</param>
        /// <param name="ignoreCase">Passed to the set and to <c>StopFilter.MakeStopSet</c>.</param>
        /// <returns>
        /// The union of the words of all files, or <c>null</c> when
        /// <paramref name="wordFiles"/> yields no file names.
        /// </returns>
        protected internal CharArraySet GetWordSet(IResourceLoader loader, string wordFiles, bool ignoreCase)
        {
            AssureMatchVersion();

            // Materialize once: the sequence is counted and then iterated, and a deferred
            // sequence would otherwise be re-evaluated each time.
            IList <string> files = SplitFileNames(wordFiles).ToList();

            if (files.Count == 0)
            {
                return(null);
            }

            // The default stop-word list has 35 or so words, but start with a smaller
            // initial size than that.
            CharArraySet words = new CharArraySet(luceneMatchVersion, files.Count * 10, ignoreCase);

            foreach (string file in files)
            {
                var wlist = GetLines(loader, file.Trim());
                words.UnionWith(StopFilter.MakeStopSet(luceneMatchVersion, wlist, ignoreCase));
            }

            return(words);
        }
        /// <summary>
        /// Checks how <c>ShingleAnalyzerWrapper</c> renders the gap left by a removed stop
        /// word ("into") for three filler settings: "--", <c>null</c>, and the empty string.
        /// </summary>
        public virtual void TestAltFillerToken()
        {
            const string text = "please divide into shingles";

            int    minSize   = ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE;
            int    maxSize   = ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE;
            string separator = ShingleFilter.DEFAULT_TOKEN_SEPARATOR;

            // Whitespace tokenizer followed by a stop filter that removes "into".
            Analyzer stopping = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
            {
                Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
                CharArraySet ignored = StopFilter.MakeStopSet(TEST_VERSION_CURRENT, "into");
                return new TokenStreamComponents(source, new StopFilter(TEST_VERSION_CURRENT, source, ignored));
            });

            // Filler "--": the gap shows up as "--" inside the shingles.
            ShingleAnalyzerWrapper dashFiller = new ShingleAnalyzerWrapper(stopping, minSize, maxSize, separator, true, false, "--");
            AssertAnalyzesTo(
                dashFiller,
                text,
                new string[] { "please", "please divide", "divide", "divide --", "-- shingles", "shingles" },
                new int[] { 0, 0, 7, 7, 19, 19 },
                new int[] { 6, 13, 13, 19, 27, 27 },
                new int[] { 1, 0, 1, 0, 1, 1 });

            // Filler null: the gap contributes no text, only the separator remains.
            ShingleAnalyzerWrapper nullFiller = new ShingleAnalyzerWrapper(stopping, minSize, maxSize, separator, false, false, null);
            AssertAnalyzesTo(
                nullFiller,
                text,
                new string[] { "please divide", "divide ", " shingles" },
                new int[] { 0, 7, 19 },
                new int[] { 13, 19, 27 },
                new int[] { 1, 1, 1 });

            // Filler "": same expectations as the null filler.
            ShingleAnalyzerWrapper emptyFiller = new ShingleAnalyzerWrapper(stopping, minSize, maxSize, separator, false, false, "");
            AssertAnalyzesTo(
                emptyFiller,
                text,
                new string[] { "please divide", "divide ", " shingles" },
                new int[] { 0, 7, 19 },
                new int[] { 13, 19, 27 },
                new int[] { 1, 1, 1 });
        }
        /// <summary>
        /// Searches the semantic index for documents similar to <paramref name="original"/>
        /// and converts each hit to a <c>UrlDocument</c> for web processing.
        /// </summary>
        /// <param name="original">Text to search for.</param>
        /// <param name="usarEspañol">
        /// When true, the search uses the Spanish stop words and <c>SpanishAnalyzer</c>;
        /// otherwise the English stop words and a <c>StandardAnalyzer</c>.
        /// </param>
        /// <returns>One <c>UrlDocument</c> per matching document, in result order.</returns>
        public static List <UrlDocument> BuscarEnIndiceSemantico(String original, Boolean usarEspañol)
        {
            // Run the search with the analyzer and stop words of the selected language.
            List <Document> matches;

            if (usarEspañol)
            {
                matches = moreLikeThisAnalyzer(original, StopFilter.MakeStopSet(SpanishAnalyzer.SPANISH_STOP_WORDS), new SpanishAnalyzer());
            }
            else
            {
                matches = moreLikeThisAnalyzer(original, Lucene.Net.Analysis.StopAnalyzer.ENGLISH_STOP_WORDS_SET, new StandardAnalyzer(Version.LUCENE_30));
            }

            // Copy the stored fields of every hit into a UrlDocument.
            List <UrlDocument> converted = new List <UrlDocument>();

            foreach (Document hit in matches)
            {
                converted.Add(new UrlDocument
                {
                    Id                = hit.GetField("Id").StringValue,
                    Tittle            = hit.GetField("Title").StringValue,
                    URL               = hit.GetField("feed").StringValue,
                    Resume            = hit.GetField("Descripcion").StringValue,
                    Tags              = hit.GetField("Tags").StringValue,
                    Localizacion_name = hit.GetField("Location").StringValue,
                    Domain            = hit.GetField("Domain").StringValue,
                    Datastreams_feed  = hit.GetField("DataStreams").StringValue,
                    Website           = hit.GetField("Website").StringValue,
                    Elevacion         = hit.GetField("Elevacion").StringValue,
                    Latitud           = hit.GetField("Latitud").StringValue,
                    Longitud          = hit.GetField("Longitud").StringValue,

                    // Indexing-specific field: the stored concept string is expanded to a list.
                    Conceptos         = ConvertirenLista(hit.GetField("Conceptos").StringValue)
                });
            }

            return converted;
        }
Example #19
0
        /// <summary>
        /// Tokenizes <paramref name="content"/> with <c>StandardTokenizer</c>, applies the
        /// standard filter, lower-cases the tokens, removes the stop words from
        /// <paramref name="config"/>, and returns the remaining terms.
        /// </summary>
        /// <param name="content">Text to tokenize.</param>
        /// <param name="config">Supplies the stop words (<c>StopWords</c>).</param>
        /// <returns>The surviving terms in stream order.</returns>
        static List <string> TokenizeStandard(string content, TokenizeConfig config)
        {
            TokenStream stream = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_24, new StringReader(content));

            stream = new LowerCaseFilter(new StandardFilter(stream));
            stream = new StopFilter(true, stream, StopFilter.MakeStopSet(config.StopWords), true);

            // Reset the chain, then read the term text of every token it produces.
            stream.Reset();

            var termAttribute = (TermAttribute)stream.GetAttribute(typeof(TermAttribute));
            var collected     = new List <string>();

            while (stream.IncrementToken())
            {
                collected.Add(termAttribute.Term());
            }

            return collected;
        }
        /// <summary>
        /// Extracts keywords from <paramref name="result"/>: standard tokenization and
        /// filtering, lower-casing, and removal of the stop words of the selected language.
        /// No stemming is applied.
        /// </summary>
        /// <param name="result">Text to analyze.</param>
        /// <param name="tipoAnalizador">
        /// "Español" selects the Spanish stop words; any other value selects the English ones.
        /// </param>
        /// <returns>A list with one string per surviving token.</returns>
        public ArrayList getKeywords(string result, string tipoAnalizador)
        {
            TokenStream stream = new LowerCaseFilter(
                new StandardFilter(
                    new StandardTokenizer(new System.IO.StringReader(result))));

            if (tipoAnalizador != "Español")
            {
                // English stop words.
                stream = new StopFilter(stream, StopAnalyzer.ENGLISH_STOP_WORDS, true);
            }
            else
            {
                // Spanish stop words.
                stream = new StopFilter(stream, StopFilter.MakeStopSet(SpanishAnalyzer.SPANISH_STOP_WORDS), true);
            }

            ArrayList keywords = new ArrayList();

            for (Lucene.Net.Analysis.Token tok = stream.Next(); tok != null; tok = stream.Next())
            {
                // The keyword is cut out of the token's string form: everything after the
                // first character, up to (not including) the first comma.
                string printed    = tok.ToString();
                int    headLength = printed.Split(',')[0].Length;

                keywords.Add(printed.Substring(1, headLength - 1));
            }

            return keywords;
        }
Example #21
0
        /// <summary>
        /// Tokenizes <paramref name="content"/> with the shared <c>ChineseWordBreaker</c>
        /// (created on first use) and returns the words that are not stop words and that
        /// match the <c>Regex</c> filter.
        /// </summary>
        /// <param name="content">Text to tokenize.</param>
        /// <param name="config">Supplies the stop words (<c>StopWords</c>).</param>
        /// <returns>The accepted words in tokenization order.</returns>
        private static List <string> TokenizeCWB(string content, TokenizeConfig config)
        {
            // Create the word breaker lazily.
            if (_chineseWordBreaker == null)
            {
                _chineseWordBreaker = new ChineseWordBreaker(@"Utils\Lib\WordBreaker\");
            }

            var candidates = _chineseWordBreaker.Tokenize(content);
            var accepted   = new List <string>();
            var stophash   = StopFilter.MakeStopSet(config.StopWords);

            foreach (var candidate in candidates)
            {
                // Skip stop words, and skip anything the regex does not match.
                if (stophash.Contains(candidate) || !Regex.Match(candidate).Success)
                {
                    continue;
                }

                accepted.Add(candidate);
            }

            return accepted;
        }
Example #22
0
 /// <summary>
 /// Builds the stem exclusion table from an array of strings by passing it to
 /// <c>StopFilter.MakeStopSet</c> and storing the result in <c>excltable</c>.
 /// </summary>
 /// <param name="exclusionlist">Words to put into the exclusion table.</param>
 public void SetStemExclusionTable(string[] exclusionlist)
 {
     excltable = StopFilter.MakeStopSet(exclusionlist);
 }
Example #23
0
 /// <summary>
 /// Builds an analyzer with the given stop words.
 /// </summary>
 /// <param name="stopwords">Stop words; converted to a set with <c>StopFilter.MakeStopSet</c>.</param>
 public LithuanianAnalyzer(string[] stopwords)
 {
     stoptable = StopFilter.MakeStopSet(stopwords);
 }
Example #24
0
 /// <summary>
 /// Builds an analyzer with the default stop words (<c>STOP_WORDS</c>).
 /// </summary>
 public LithuanianAnalyzer()
 {
     stoptable = StopFilter.MakeStopSet(STOP_WORDS);
 }
Example #25
0
 /// <summary>
 /// Builds an analyzer which removes words in the provided array.
 /// </summary>
 /// <param name="matchVersion">Version value stored in <c>this.matchVersion</c>.</param>
 /// <param name="stopWords">Stop word array; converted to a set with <c>StopFilter.MakeStopSet</c>.</param>
 public CJKAnalyzer(Version matchVersion, params string[] stopWords)
 {
     stopTable         = StopFilter.MakeStopSet(stopWords);
     this.matchVersion = matchVersion;
 }
Example #26
0
 /// <summary>
 /// Replaces the stem exclusion set with one built from <paramref name="exclusionlist"/>
 /// and sets <c>PreviousTokenStream</c> to null.
 /// </summary>
 /// <param name="exclusionlist">Words to put into the exclusion set.</param>
 public void SetStemExclusionTable(String[] exclusionlist)
 {
     exclusionSet        = StopFilter.MakeStopSet(exclusionlist);
     // NOTE(review): presumably cleared so a stream built with the old set is not reused; confirm.
     PreviousTokenStream = null;
 }
Example #27
0
 /// <summary>
 /// Builds an analyzer with the given stop words by converting them to a set with
 /// <c>StopFilter.MakeStopSet</c> and delegating to the constructor that takes a version
 /// and a stop set.
 /// </summary>
 /// <param name="matchVersion">Version value forwarded to the delegated constructor.</param>
 /// <param name="stopwords">Stop word array.</param>
 public GermanAnalyzer(Version matchVersion, params string[] stopwords)
     : this(matchVersion, StopFilter.MakeStopSet(stopwords))
 {
 }
Example #28
0
 /// <summary>
 /// Builds an analyzer which removes words in the provided array.
 /// </summary>
 /// <param name="stopWords">Stop word array; converted to a set with <c>StopFilter.MakeStopSet</c>.</param>
 public CJKAnalyzer(String[] stopWords)
 {
     stopTable = StopFilter.MakeStopSet(stopWords);
 }
Example #29
0
 /// <summary>
 /// Builds an analyzer with the given stop words. The words are converted to a set with
 /// <c>StopFilter.MakeStopSet</c> and passed, together with the fixed version
 /// <c>Version.LUCENE_24</c>, to the constructor that takes a version and a stop set.
 /// </summary>
 /// <param name="stopwords">Stop word array.</param>
 /// <param name="replaceInvalidAcronym">Value stored in <c>this.replaceInvalidAcronym</c> after delegation.</param>
 public StandardAnalyzer(System.String[] stopwords, bool replaceInvalidAcronym) : this(Version.LUCENE_24, StopFilter.MakeStopSet(stopwords))
 {
     this.replaceInvalidAcronym = replaceInvalidAcronym;
 }
Example #30
0
 /// <summary>
 /// Builds an analyzer with the given stop words. The words are converted to a set with
 /// <c>StopFilter.MakeStopSet</c> and passed, together with the fixed version
 /// <c>Version.LUCENE_24</c>, to the constructor that takes a version and a stop set.
 /// </summary>
 /// <param name="stopWords">Stop word array.</param>
 public StandardAnalyzer(System.String[] stopWords) : this(Version.LUCENE_24, StopFilter.MakeStopSet(stopWords))
 {
 }