Analyzer for the Russian language. Supports an external list of stopwords (words that will not be indexed at all). A default set of stopwords is used unless an alternative list is specified.
Inheritance: Lucene.Net.Analysis.Analyzer
 /// <summary>
 /// Checks that a word placed in the set passed as the analyzer's third constructor
 /// argument is emitted unchanged, while the other words of the sentence are stemmed.
 /// </summary>
 public virtual void TestWithStemExclusionSet()
 {
     // The set holds the single word expected to pass through unmodified.
     CharArraySet stemExclusions = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
     stemExclusions.add("представление");

     const string input = "Вместе с тем о силе электромагнитной энергии имели представление еще";
     string[] expectedTerms = { "вмест", "сил", "электромагнитн", "энерг", "имел", "представление" };

     Analyzer analyzer = new RussianAnalyzer(TEST_VERSION_CURRENT, RussianAnalyzer.DefaultStopSet, stemExclusions);
     AssertAnalyzesTo(analyzer, input, expectedTerms);
 }
        /// <summary>
        /// Runs the analyzer over the UTF-8 input file and compares every token it
        /// produces with the corresponding token read from the reference file.
        /// </summary>
        public void TestUnicode()
        {
            RussianAnalyzer ra = new RussianAnalyzer(Version.LUCENE_CURRENT);

            using (inWords = new StreamReader(@"ru\testUTF8.txt", Encoding.UTF8))
            using (sampleUnicode = new StreamReader(@"ru\resUTF8.txt", Encoding.UTF8))
            {
                TokenStream _in = ra.TokenStream("all", inWords);
                RussianLetterTokenizer sample = new RussianLetterTokenizer(sampleUnicode);

                ITermAttribute text = _in.GetAttribute<ITermAttribute>();
                ITermAttribute sampleText = sample.GetAttribute<ITermAttribute>();

                // The loop is driven by the analyzer's output; it ends with its last token.
                while (_in.IncrementToken())
                {
                    // A null expectation marks the reference stream running out
                    // before the analyzer's output does.
                    string expected = sample.IncrementToken() ? sampleText.Term : null;

                    // Expected value first, actual value second, so that a failure
                    // message labels the two sides correctly.
                    Assert.AreEqual(expected, text.Term, "Unicode");
                }
            }
        }
 /// <summary>
 /// Deletes from the Lucene index the documents matching a term query on the
 /// "Id" field for the given record id.
 /// </summary>
 /// <param name="record_id">Identifier of the record whose index entry is removed.</param>
 public static void ClearLuceneIndexRecord(int record_id)
 {
     using (var russianAnalyzer = new RussianAnalyzer(Version.LUCENE_30))
     using (var indexWriter = new IndexWriter(LuceneConfig.Directory, russianAnalyzer, IndexWriter.MaxFieldLength.UNLIMITED))
     {
         // Build the exact term to match in the "Id" field, then delete by it.
         var idTerm = new Term("Id", record_id.ToString());
         indexWriter.DeleteDocuments(new TermQuery(idTerm));
     }
 }
 /// <summary>
 /// Opens an index writer and hands each supplied criterion to
 /// <c>_addToLuceneIndex</c> together with that writer.
 /// </summary>
 /// <param name="cardCriteria">Criteria to add to the search index.</param>
 public static void AddUpdateLuceneIndex(IEnumerable<CardCriterion> cardCriteria)
 {
     // Alternative noted by the original author: SnowballAnalyzer(Version.LUCENE_30, "Russian").
     // Open question there: does it include stop words? If not, a GetStopWordslist() method is needed.
     using (var russianAnalyzer = new RussianAnalyzer(Version.LUCENE_30))
     using (var indexWriter = new IndexWriter(LuceneConfig.Directory, russianAnalyzer, IndexWriter.MaxFieldLength.UNLIMITED))
     {
         // Per the original note, adding an entry replaces an older one if present
         // (that logic lives in _addToLuceneIndex, not shown here).
         foreach (var item in cardCriteria)
         {
             _addToLuceneIndex(item, indexWriter);
         }
     }
 }
 /// <summary>
 /// Removes all documents from the Lucene index.
 /// </summary>
 /// <returns>
 /// <c>true</c> when the index was cleared and the writer closed without error;
 /// <c>false</c> when any exception was thrown along the way.
 /// </returns>
 public static bool ClearLuceneIndex()
 {
     try
     {
         using (var russianAnalyzer = new RussianAnalyzer(Version.LUCENE_30))
         using (var indexWriter = new IndexWriter(
             LuceneConfig.Directory,
             russianAnalyzer,
             true,
             IndexWriter.MaxFieldLength.UNLIMITED))
         {
             indexWriter.DeleteAll();
         }

         // Reached only after both disposables were released without throwing.
         return true;
     }
     catch (Exception)
     {
         // Failure is reported through the return value rather than propagated.
         return false;
     }
 }
        /// <summary>
        /// Checks that the analyzer's token stream yields both the word and the
        /// number from "text 1000", and nothing after them.
        /// </summary>
        public void TestDigitsInRussianCharset()
        {
            TextReader reader = new StringReader("text 1000");
            RussianAnalyzer ra = new RussianAnalyzer(Version.LUCENE_CURRENT);
            TokenStream stream = ra.TokenStream("", reader);

            ITermAttribute termText = stream.GetAttribute<ITermAttribute>();

            try
            {
                Assert.True(stream.IncrementToken());
                Assert.AreEqual("text", termText.Term);
                Assert.True(stream.IncrementToken());
                Assert.AreEqual("1000", termText.Term, "RussianAnalyzer's tokenizer skips numbers from input text");
                Assert.False(stream.IncrementToken());
            }
            catch (IOException e)
            {
                // Include the exception so the failure report shows what actually went wrong.
                Assert.Fail("unexpected IOException: " + e);
            }
        }
Example #7
0
        /// <summary>
        /// Checks that analyzing "text 1000" yields the word and the number as two terms.
        /// </summary>
        public virtual void TestDigitsInRussianCharset()
        {
            string[] expectedTerms = { "text", "1000" };
            var analyzer = new RussianAnalyzer(TEST_VERSION_CURRENT);

            AssertAnalyzesTo(analyzer, "text 1000", expectedTerms);
        }
Example #8
0
		/// <summary>
		/// Searches the previously built index and returns the "mid" field of every hit.
		/// </summary>
		/// <param name="forumID">Value matched against the "gid" field; -1 disables this restriction.</param>
		/// <param name="searchText">
		/// Text to look for. When it starts with the query-language signature, the rest is passed
		/// to the query parser as written; otherwise special characters are escaped and the text
		/// is split into words on spaces. Text that contains no words adds no text criterion.
		/// </param>
		/// <param name="searchInText">Parse the text against the "message" field.</param>
		/// <param name="searchInSubject">Parse the text against the "subject" field.</param>
		/// <param name="searchAuthor">Parse the text against the "usernick" field.</param>
		/// <param name="searchInMyMessages">Require the "uid" field to equal <c>Config.Instance.SelfId</c>.</param>
		/// <param name="searchAnyWords">
		/// Simple mode only: <c>true</c> joins the words with spaces, <c>false</c> prefixes every word with '+'.
		/// </param>
		/// <param name="from">Lower bound (inclusive) of the "dte" range.</param>
		/// <param name="to">Upper bound (inclusive) of the "dte" range; the range is applied unless both bounds have zero ticks.</param>
		/// <returns>The "mid" values of the top hits, limited to <c>_maxSearchReults</c> entries.</returns>
		private static ICollection<string> Search(
			int forumID,
			string searchText,
			bool searchInText,
			bool searchInSubject,
			bool searchAuthor,
			bool searchInMyMessages,
			bool searchAnyWords,
			DateTime from,
			DateTime to)
		{
			var result = new List<string>();
			var query = new BooleanQuery();
			var analyzer = new RussianAnalyzer(Version.LUCENE_30);
			var indexPath = GetIndexDir();
			var searchTextExists = !string.IsNullOrEmpty(searchText);

			#region Search text preprocessing
			// The query-language signature is "**"
			if (searchTextExists)
			{
				// Ordinal comparison: the signature is a fixed marker, not linguistic text
				if (searchText.StartsWith(_signature, StringComparison.Ordinal))
				{
					// Query language requested: cut the signature off and treat the rest as a query expression
					searchText = searchText.Substring(_signature.Length);
				}
				else
				{
					// Simple search: escape special characters, split into tokens (space is the separator),
					// honour the searchAnyWords flag (AND/OR)
					// Order matters, the backslash must come first
					var specChars = new[] {"\\", "+", "-", "&", "|", "!", "(", ")", "{", "}", "[", "]", "^", "\"", "~", "*", "?", ":"};
					searchText =
						specChars
							.Aggregate(
								searchText,
								(current, specChar) => current.Replace(specChar, "\\" + specChar));
					var tokens = searchText.Split(new[] {' '}, StringSplitOptions.RemoveEmptyEntries);

					if (tokens.Length == 0)
						// Nothing but separators: avoid building a bare "+" expression
						searchText = string.Empty;
					else if (searchAnyWords)
						searchText = string.Join(" ", tokens);
					else
						searchText = "+" + string.Join(" +", tokens);
				}

				// An expression without any term is not handed to the query parser;
				// the search then runs on the remaining criteria only
				searchTextExists = searchText.Trim().Length != 0;
			}
			#endregion

			if (forumID != -1)
				query.Add(
					new TermQuery(new Term("gid", forumID.ToString())),
					Occur.MUST);

			if (searchInMyMessages)
				query.Add(
					new TermQuery(new Term("uid", Config.Instance.SelfId.ToString())),
					Occur.MUST);

			//if (searchInQuestions)
			//  bq.Add(new TermQuery(new Term("tid", "0")), true, false);

			if (from.Ticks != 0 || to.Ticks != 0)
			{
				var rq = new TermRangeQuery("dte", FormatDate(from), FormatDate(to), true, true);
				query.Add(rq, Occur.MUST);
			}

			if (searchTextExists)
			{
				// At least one of the selected fields has to match the text
				var searchTextQuery = new BooleanQuery();
				if (searchInText)
					searchTextQuery.Add(
						new QueryParser(Version.LUCENE_29, "message", analyzer).Parse(searchText),
						Occur.SHOULD);
				if (searchInSubject)
					searchTextQuery.Add(
						new QueryParser(Version.LUCENE_29, "subject", analyzer).Parse(searchText),
						Occur.SHOULD);
				if (searchAuthor)
					searchTextQuery.Add(
						new QueryParser(Version.LUCENE_29, "usernick", analyzer).Parse(searchText),
						Occur.SHOULD);
				query.Add(searchTextQuery, Occur.MUST);
			}

			var searcher = new IndexSearcher(indexPath, true);
			try
			{
				var topDocs = searcher.Search(query, _maxSearchReults);
				result
					.AddRange(
						topDocs
							.ScoreDocs
							.Select(scored => searcher.Doc(scored.Doc).Get("mid")));
			}
			finally
			{
				// Release the searcher even when the search itself throws
				searcher.Close();
			}

			return result;
		}
        /// <summary>
        /// Main search method: builds a query from the given criteria and maps the hits to
        /// <c>CardCriterion</c> items.
        /// </summary>
        /// <param name="searchCriterion">Key/value pairs that <c>QueryMaker</c> turns into a query.</param>
        /// <returns>
        /// The mapped hits, or an empty list when any criterion value is empty once its
        /// '?' characters are removed.
        /// </returns>
        private IEnumerable<CardCriterion> _search(List<KeyValuePair<string, object>> searchCriterion)
        {
            // Collect the criteria whose value has nothing left after stripping '?'.
            // NOTE(review): the original comment also mentions '*', but only '?' is removed here.
            var blankCriteria = searchCriterion
                .Where(kvp => string.IsNullOrEmpty(kvp.Value.toString().Replace("?", "")))
                .ToList();
            if (blankCriteria.Count > 0)
            {
                return new List<CardCriterion>();
            }

            using (var indexSearcher = new IndexSearcher(LuceneConfig.Directory, false))
            using (var russianAnalyzer = new RussianAnalyzer(Version.LUCENE_30)) // per the original note, already contains a Russian stop-word list
            {
                Query luceneQuery = new QueryMaker(russianAnalyzer, this, searchCriterion).MakeQuery();
                ScoreDoc[] scoreDocs = indexSearcher.Search(luceneQuery, LuceneConfig.HitsLimit).ScoreDocs;

                // NOTE(review): if _mapLuceneToDataList is lazily evaluated, it would be enumerated
                // after the searcher is disposed - confirm it materialises its result.
                return _mapLuceneToDataList(scoreDocs, indexSearcher);
            }
        }
 /// <summary>
 /// Analyzes two sentences in a row with one analyzer created for
 /// <c>LuceneVersion.LUCENE_30</c> and checks the terms produced for each.
 /// </summary>
 public virtual void TestReusableTokenStream30()
 {
     const string firstSentence = "Вместе с тем о силе электромагнитной энергии имели представление еще";
     const string secondSentence = "Но знание это хранилось в тайне";
     string[] firstExpected = { "вмест", "сил", "электромагнитн", "энерг", "имел", "представлен" };
     string[] secondExpected = { "знан", "хран", "тайн" };

     // The same analyzer instance serves both calls.
     Analyzer analyzer = new RussianAnalyzer(LuceneVersion.LUCENE_30);
     AssertAnalyzesTo(analyzer, firstSentence, firstExpected);
     AssertAnalyzesTo(analyzer, secondSentence, secondExpected);
 }
 /// <summary>
 /// Analyzes two sentences in a row with one analyzer created for
 /// <c>TEST_VERSION_CURRENT</c> and checks the terms produced for each.
 /// </summary>
 public virtual void TestReusableTokenStream()
 {
     const string firstSentence = "Вместе с тем о силе электромагнитной энергии имели представление еще";
     const string secondSentence = "Но знание это хранилось в тайне";
     string[] firstExpected = { "вмест", "сил", "электромагнитн", "энерг", "имел", "представлен" };
     string[] secondExpected = { "знан", "эт", "хран", "тайн" };

     // The same analyzer instance serves both calls.
     Analyzer analyzer = new RussianAnalyzer(TEST_VERSION_CURRENT);
     AssertAnalyzesTo(analyzer, firstSentence, firstExpected);
     AssertAnalyzesTo(analyzer, secondSentence, secondExpected);
 }
 /// <summary>
 /// Checks that analyzing "text 1000" yields the word and the number as two terms.
 /// </summary>
 public virtual void TestDigitsInRussianCharset()
 {
     string[] expectedTerms = { "text", "1000" };
     var analyzer = new RussianAnalyzer(TEST_VERSION_CURRENT);

     AssertAnalyzesTo(analyzer, "text 1000", expectedTerms);
 }
 /// <summary>
 /// Opens an index writer on the configured directory and calls <c>Optimize</c> on it.
 /// </summary>
 public static void Optimize()
 {
     using (var russianAnalyzer = new RussianAnalyzer(Version.LUCENE_30))
     using (var indexWriter = new IndexWriter(LuceneConfig.Directory, russianAnalyzer, IndexWriter.MaxFieldLength.UNLIMITED))
     {
         indexWriter.Optimize();
     }
 }