예제 #1
0
 /// <summary>
 /// Writes every supplied criterion into the Lucene index through a single writer.
 /// </summary>
 /// <param name="cardCriteria">Criteria handed one by one to <c>_addToLuceneIndex</c>.</param>
 public static void AddUpdateLuceneIndex(IEnumerable <CardCriterion> cardCriteria)
 {
     using (var russianAnalyzer = new RussianAnalyzer(Version.LUCENE_30))
     {
         using (var indexWriter = new IndexWriter(LuceneConfig.Directory, russianAnalyzer, IndexWriter.MaxFieldLength.UNLIMITED))
         {
             // Per-item indexing is delegated to the helper; the writer is shared by all items.
             foreach (var entry in cardCriteria)
             {
                 _addToLuceneIndex(entry, indexWriter);
             }
         }
     }
 }
예제 #2
0
 /// <summary>
 /// Adds the given sample records to the Lucene index and commits the writer.
 /// </summary>
 /// <param name="sampleDatas">Records passed one at a time to <c>AddToLuceneIndex</c>.</param>
 public static void AddUpdateLuceneIndex(IEnumerable <SampleData> sampleDatas)
 {
     using (var directory = LuceneDirectory)
     {
         using (var russianAnalyzer = new RussianAnalyzer(Version))
         {
             using (var indexWriter = new IndexWriter(directory, russianAnalyzer, IndexWriter.MaxFieldLength.UNLIMITED))
             {
                 // RAM buffer size (in MB) is set on the writer before any item is indexed.
                 indexWriter.SetRAMBufferSizeMB(10);

                 foreach (var item in sampleDatas)
                 {
                     AddToLuceneIndex(item, indexWriter);
                 }

                 // Explicit commit once every item has been handed to the writer.
                 indexWriter.Commit();
             }
         }
     }
 }
예제 #3
0
 /// <summary>
 /// Deletes every document from the index stored in <c>_directory</c>.
 /// </summary>
 /// <returns><c>true</c> when the index was cleared; <c>false</c> when any exception was thrown.</returns>
 public static bool ClearIndex()
 {
     try
     {
         // Both resources are owned by using blocks: the analyzer is released even when the
         // IndexWriter constructor or DeleteAll throws, and the writer is disposed exactly once.
         using (var analyzer = new RussianAnalyzer(Version.LUCENE_30))
         {
             using (var writer = new IndexWriter(_directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED))
             {
                 writer.DeleteAll();
             }
         }
     }
     catch (Exception)
     {
         // Callers learn about failure only through the return value.
         return(false);
     }
     return(true);
 }
예제 #4
0
 /// <summary>
 /// Removes all entries from the index in <c>LuceneConfig.Directory</c>.
 /// </summary>
 /// <returns><c>true</c> on success; <c>false</c> when any exception was thrown.</returns>
 public static bool ClearLuceneIndex()
 {
     bool cleared;

     try
     {
         using (var russianAnalyzer = new RussianAnalyzer(Version.LUCENE_30))
         {
             using (var indexWriter = new IndexWriter(
                        LuceneConfig.Directory,
                        russianAnalyzer,
                        true,
                        IndexWriter.MaxFieldLength.UNLIMITED))
             {
                 // Drop every existing index entry.
                 indexWriter.DeleteAll();
             }
         }

         cleared = true;
     }
     catch (Exception)
     {
         cleared = false;
     }

     return(cleared);
 }
        /// <summary>
        /// Checks that the token stream produced by RussianAnalyzer for "text 1000" yields
        /// the terms "text" and "1000" and then ends.
        /// </summary>
        public void TestDigitsInRussianCharset()
        {
            // The reader is disposable; the using block releases it even when an assertion fails.
            using (TextReader reader = new StringReader("text 1000"))
            {
                RussianAnalyzer ra     = new RussianAnalyzer(Version.LUCENE_CURRENT);
                TokenStream     stream = ra.TokenStream("", reader);

                ITermAttribute termText = stream.GetAttribute <ITermAttribute>();

                try
                {
                    Assert.True(stream.IncrementToken());
                    Assert.AreEqual("text", termText.Term);
                    Assert.True(stream.IncrementToken());
                    Assert.AreEqual("1000", termText.Term, "RussianAnalyzer's tokenizer skips numbers from input text");
                    Assert.False(stream.IncrementToken());
                }
                catch (IOException)
                {
                    // The exception object itself is not needed, so no variable is declared.
                    Assert.Fail("unexpected IOException");
                }
            }
        }
예제 #6
0
        /// <summary>
        /// Tokenizes testKOI8.txt with a KOI8 RussianAnalyzer and asserts that each token's text
        /// equals the text of the next token read from resKOI8.htm by a RussianLetterTokenizer.
        /// The comparison stops when the analyzer's stream is exhausted.
        /// </summary>
        public virtual void  TestKOI8()
        {
            RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.KOI8);

            // The readers are still stored in the fields, but the using statements guarantee they
            // are closed even when an assertion fails or the tokenizer throws.
            using (inWordsKOI8 = new System.IO.StreamReader(
                new System.IO.FileStream(
                    new System.IO.FileInfo(
                        dataDir.FullName + @"Analysis\RU\testKOI8.txt").FullName,
                    System.IO.FileMode.Open,
                    System.IO.FileAccess.Read),
                System.Text.Encoding.GetEncoding("iso-8859-1")))
            using (sampleKOI8 = new System.IO.StreamReader(
                new System.IO.FileStream(
                    new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\resKOI8.htm").FullName,
                    System.IO.FileMode.Open,
                    System.IO.FileAccess.Read),
                System.Text.Encoding.GetEncoding("iso-8859-1")))
            {
                TokenStream            in_Renamed = ra.TokenStream("all", inWordsKOI8);
                RussianLetterTokenizer sample     = new RussianLetterTokenizer(sampleKOI8, RussianCharsets.KOI8);

                for (; ;)
                {
                    Token token = in_Renamed.Next();

                    if (token == null)
                    {
                        break;
                    }

                    // A missing sample token is compared as null so the assertion reports the mismatch.
                    Token sampleToken = sample.Next();
                    Assert.AreEqual(token.TermText(), sampleToken == null ? null : sampleToken.TermText(), "KOI8");
                }
            }
        }
예제 #7
0
        /// <summary>
        /// Tokenizes testUnicode.txt with a UnicodeRussian RussianAnalyzer and asserts that each
        /// token's text equals the text of the next token read from resUnicode.htm by a
        /// RussianLetterTokenizer. The comparison stops when the analyzer's stream is exhausted.
        /// </summary>
        public virtual void  TestUnicode()
        {
            RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.UnicodeRussian);

            // The readers are still stored in the fields, but the using statements guarantee they
            // are closed even when an assertion fails or the tokenizer throws.
            using (inWords = new System.IO.StreamReader(
                new System.IO.FileStream(
                    new System.IO.FileInfo(
                        dataDir.FullName + @"Analysis\RU\testUnicode.txt").FullName,
                    System.IO.FileMode.Open,
                    System.IO.FileAccess.Read),
                System.Text.Encoding.GetEncoding("Unicode")))
            using (sampleUnicode = new System.IO.StreamReader(
                new System.IO.FileStream(
                    new System.IO.FileInfo(
                        dataDir.FullName + @"Analysis\RU\resUnicode.htm").FullName,
                    System.IO.FileMode.Open,
                    System.IO.FileAccess.Read),
                System.Text.Encoding.GetEncoding("Unicode")))
            {
                TokenStream in_Renamed = ra.TokenStream("all", inWords);

                RussianLetterTokenizer sample = new RussianLetterTokenizer(sampleUnicode, RussianCharsets.UnicodeRussian);

                for (; ;)
                {
                    Token token = in_Renamed.Next();

                    if (token == null)
                    {
                        break;
                    }

                    // A missing sample token is compared as null so the assertion reports the mismatch.
                    Token sampleToken = sample.Next();
                    Assert.AreEqual(token.TermText(), sampleToken == null ? null : sampleToken.TermText(), "Unicode");
                }
            }
        }
예제 #8
0
		/// <summary>
		/// Tokenizes testUnicode.txt with a UnicodeRussian RussianAnalyzer and asserts that each
		/// token's text equals the text of the next token read from resUnicode.htm by a
		/// RussianLetterTokenizer. The comparison stops when the analyzer's stream is exhausted.
		/// </summary>
		public virtual void  TestUnicode()
		{
			RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.UnicodeRussian);

			// The readers are still stored in the fields, but the using statements guarantee they
			// are closed even when an assertion fails or the tokenizer throws.
			using (inWords = new System.IO.StreamReader(
				new System.IO.FileStream(
					new System.IO.FileInfo(
						dataDir.FullName + @"Analysis\RU\testUnicode.txt").FullName,
					System.IO.FileMode.Open,
					System.IO.FileAccess.Read),
				System.Text.Encoding.GetEncoding("Unicode")))
			using (sampleUnicode = new System.IO.StreamReader(
				new System.IO.FileStream(
					new System.IO.FileInfo(
						dataDir.FullName + @"Analysis\RU\resUnicode.htm").FullName,
					System.IO.FileMode.Open,
					System.IO.FileAccess.Read),
				System.Text.Encoding.GetEncoding("Unicode")))
			{
				TokenStream in_Renamed = ra.TokenStream("all", inWords);

				RussianLetterTokenizer sample = new RussianLetterTokenizer(sampleUnicode, RussianCharsets.UnicodeRussian);

				for (; ; )
				{
					Token token = in_Renamed.Next();

					if (token == null)
					{
						break;
					}

					// A missing sample token is compared as null so the assertion reports the mismatch.
					Token sampleToken = sample.Next();
					Assert.AreEqual(token.TermText(), sampleToken == null ? null : sampleToken.TermText(), "Unicode");
				}
			}
		}
예제 #9
0
        /// <summary>
        /// Tokenizes test1251.txt with a CP1251 RussianAnalyzer and asserts that each token's text
        /// equals the text of the next token read from res1251.htm by a RussianLetterTokenizer.
        /// The comparison stops when the analyzer's stream is exhausted.
        /// </summary>
        public virtual void  Test1251()
        {
            // 1251
            // The readers are still stored in the fields, but the using statements guarantee they
            // are closed even when an assertion fails or the tokenizer throws.
            using (inWords1251 = new System.IO.StreamReader(
                new System.IO.FileStream(
                    new System.IO.FileInfo(
                        dataDir.FullName + @"Analysis\RU\test1251.txt").FullName,
                    System.IO.FileMode.Open,
                    System.IO.FileAccess.Read),
                System.Text.Encoding.GetEncoding("iso-8859-1")))
            using (sample1251 = new System.IO.StreamReader(
                new System.IO.FileStream(
                    new System.IO.FileInfo(
                        dataDir.FullName + @"Analysis\RU\res1251.htm").FullName,
                    System.IO.FileMode.Open,
                    System.IO.FileAccess.Read),
                System.Text.Encoding.GetEncoding("iso-8859-1")))
            {
                RussianAnalyzer        ra         = new RussianAnalyzer(RussianCharsets.CP1251);
                TokenStream            in_Renamed = ra.TokenStream("", inWords1251);
                RussianLetterTokenizer sample     = new RussianLetterTokenizer(sample1251, RussianCharsets.CP1251);

                for (; ;)
                {
                    Token token = in_Renamed.Next();

                    if (token == null)
                    {
                        break;
                    }

                    // A missing sample token is compared as null so the assertion reports the mismatch.
                    Token sampleToken = sample.Next();
                    Assert.AreEqual(token.TermText(), sampleToken == null ? null : sampleToken.TermText(), "1251");
                }
            }
        }
예제 #10
0
        /// <summary>
        /// Main search method: builds a query from the criteria via <c>QueryMaker</c>, runs it against
        /// the index in <c>LuceneConfig.Directory</c> and maps the hits with <c>_mapLuceneToDataList</c>.
        /// </summary>
        /// <param name="searchCriterion">Key/value pairs handed to <c>QueryMaker</c>.</param>
        /// <returns>
        /// The mapped hits (at most <c>LuceneConfig.HitsLimit</c> are requested), or an empty list when any
        /// criterion value is null, empty, or consists only of '?' characters.
        /// </returns>
        private IEnumerable <CardCriterion> _search(List <KeyValuePair <string, object> > searchCriterion)
        {
            // A criterion whose value is blank once '?' is removed cannot be searched for.
            // NOTE(review): the earlier comment here also mentioned '*', but only '?' is stripped - confirm intent.
            if (searchCriterion.Any(kvp => kvp.Value == null || string.IsNullOrEmpty(kvp.Value.ToString().Replace("?", ""))))
            {
                return(new List <CardCriterion>());
            }

            // set up lucene searcher
            using (var searcher = new IndexSearcher(LuceneConfig.Directory, false))
            {
                using (var analyzer = new RussianAnalyzer(Version.LUCENE_30))
                {
                    Query      query = new QueryMaker(analyzer, this, searchCriterion).MakeQuery();
                    ScoreDoc[] hits  = searcher.Search(query, LuceneConfig.HitsLimit).ScoreDocs;

                    // NOTE(review): the searcher is disposed on return, so this relies on
                    // _mapLuceneToDataList reading the documents eagerly - confirm.
                    return(_mapLuceneToDataList(hits, searcher));
                }
            }
        }
예제 #11
0
		/// <summary>
		/// Tokenizes testKOI8.txt with a KOI8 RussianAnalyzer and asserts that each token's text
		/// equals the text of the next token read from resKOI8.htm by a RussianLetterTokenizer.
		/// The comparison stops when the analyzer's stream is exhausted.
		/// </summary>
		public virtual void  TestKOI8()
		{
			RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.KOI8);

			// KOI8
			// The readers are still stored in the fields, but the using statements guarantee they
			// are closed even when an assertion fails or the tokenizer throws.
			using (inWordsKOI8 = new System.IO.StreamReader(
				new System.IO.FileStream(
					new System.IO.FileInfo(
						dataDir.FullName + @"Analysis\RU\testKOI8.txt").FullName,
					System.IO.FileMode.Open,
					System.IO.FileAccess.Read),
				System.Text.Encoding.GetEncoding("iso-8859-1")))
			using (sampleKOI8 = new System.IO.StreamReader(
				new System.IO.FileStream(
					new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\resKOI8.htm").FullName,
					System.IO.FileMode.Open,
					System.IO.FileAccess.Read),
				System.Text.Encoding.GetEncoding("iso-8859-1")))
			{
				TokenStream in_Renamed = ra.TokenStream("all", inWordsKOI8);
				RussianLetterTokenizer sample = new RussianLetterTokenizer(sampleKOI8, RussianCharsets.KOI8);

				for (; ; )
				{
					Token token = in_Renamed.Next();

					if (token == null)
					{
						break;
					}

					// A missing sample token is compared as null so the assertion reports the mismatch.
					Token sampleToken = sample.Next();
					Assert.AreEqual(token.TermText(), sampleToken == null ? null : sampleToken.TermText(), "KOI8");
				}
			}
		}
예제 #12
0
		/// <summary>
		/// Tokenizes test1251.txt with a CP1251 RussianAnalyzer and asserts that each token's text
		/// equals the text of the next token read from res1251.htm by a RussianLetterTokenizer.
		/// The comparison stops when the analyzer's stream is exhausted.
		/// </summary>
		public virtual void  Test1251()
		{
			// 1251
			// The readers are still stored in the fields, but the using statements guarantee they
			// are closed even when an assertion fails or the tokenizer throws.
			using (inWords1251 = new System.IO.StreamReader(
				new System.IO.FileStream(
					new System.IO.FileInfo(
						dataDir.FullName + @"Analysis\RU\test1251.txt").FullName,
					System.IO.FileMode.Open,
					System.IO.FileAccess.Read),
				System.Text.Encoding.GetEncoding("iso-8859-1")))
			using (sample1251 = new System.IO.StreamReader(
				new System.IO.FileStream(
					new System.IO.FileInfo(
						dataDir.FullName + @"Analysis\RU\res1251.htm").FullName,
					System.IO.FileMode.Open,
					System.IO.FileAccess.Read),
				System.Text.Encoding.GetEncoding("iso-8859-1")))
			{
				RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.CP1251);
				TokenStream in_Renamed = ra.TokenStream("", inWords1251);
				RussianLetterTokenizer sample = new RussianLetterTokenizer(sample1251, RussianCharsets.CP1251);

				for (; ; )
				{
					Token token = in_Renamed.Next();

					if (token == null)
					{
						break;
					}

					// A missing sample token is compared as null so the assertion reports the mismatch.
					Token sampleToken = sample.Next();
					Assert.AreEqual(token.TermText(), sampleToken == null ? null : sampleToken.TermText(), "1251");
				}
			}
		}
예제 #13
0
        /// <summary>
        /// Searches the previously built index.
        /// </summary>
        /// <param name="forumID">Forum id matched against the "gid" field; -1 disables this filter.</param>
        /// <param name="searchText">
        /// Text to look for. When it starts with <c>_signature</c> the remainder is handed to the query
        /// parser as written; otherwise special characters are escaped and the text is split on spaces.
        /// </param>
        /// <param name="searchInText">Search the "message" field.</param>
        /// <param name="searchInSubject">Search the "subject" field.</param>
        /// <param name="searchAuthor">Search the "usernick" field.</param>
        /// <param name="searchInMyMessages">Restrict to documents whose "uid" equals <c>Config.Instance.SelfId</c>.</param>
        /// <param name="searchAnyWords">Simple search only: when <c>false</c>, every word is prefixed with '+'.</param>
        /// <param name="from">Inclusive lower bound for the "dte" field.</param>
        /// <param name="to">Inclusive upper bound for the "dte" field; the range is applied when either bound has non-zero ticks.</param>
        /// <returns>The "mid" field of each document among the top <c>_maxSearchReults</c> hits.</returns>
        private static ICollection <string> Search(
            int forumID,
            string searchText,
            bool searchInText,
            bool searchInSubject,
            bool searchAuthor,
            bool searchInMyMessages,
            bool searchAnyWords,
            DateTime from,
            DateTime to)
        {
            var result           = new List <string>();
            var query            = new BooleanQuery();
            var indexPath        = GetIndexDir();
            var searchTextExists = !string.IsNullOrEmpty(searchText);

            #region Search string processing
            // The search-language signature is **
            if (searchTextExists)
            {
                // Ordinal comparison: the signature is a technical marker, not linguistic text.
                if (searchText.StartsWith(_signature, StringComparison.Ordinal))
                {
                    // The query language is requested: cut off the signature and treat the rest
                    // of the string as written in the search language.
                    searchText = searchText.Substring(_signature.Length);
                }
                else
                {
                    // Simple search: escape the special characters, split into tokens (space is the
                    // separator) and honour the searchAnyWords flag (AND/OR).
                    // Order matters: the backslash must come first so added escapes are not escaped again.
                    var specChars = new[] { "\\", "+", "-", "&", "|", "!", "(", ")", "{", "}", "[", "]", "^", "\"", "~", "*", "?", ":" };
                    searchText =
                        specChars
                        .Aggregate(
                            searchText,
                            (current, specChar) => current.Replace(specChar, "\\" + specChar));
                    var token = searchText.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

                    if (searchAnyWords)
                    {
                        searchText = string.Join(" ", token);
                    }
                    else
                    {
                        searchText = "+" + string.Join(" +", token);
                    }
                }
            }
            #endregion

            if (forumID != -1)
            {
                query.Add(
                    new TermQuery(new Term("gid", forumID.ToString())),
                    Occur.MUST);
            }

            if (searchInMyMessages)
            {
                query.Add(
                    new TermQuery(new Term("uid", Config.Instance.SelfId.ToString())),
                    Occur.MUST);
            }

            if (from.Ticks != 0 || to.Ticks != 0)
            {
                var rq = new TermRangeQuery("dte", FormatDate(from), FormatDate(to), true, true);
                query.Add(rq, Occur.MUST);
            }

            if (searchTextExists)
            {
                var searchTextQuery = new BooleanQuery();

                // The analyzer is only needed while the text is parsed, so it is created here and
                // disposed as soon as parsing is done (also when Parse throws).
                using (var analyzer = new RussianAnalyzer(Version.LUCENE_30))
                {
                    if (searchInText)
                    {
                        searchTextQuery.Add(
                            new QueryParser(Version.LUCENE_29, "message", analyzer).Parse(searchText),
                            Occur.SHOULD);
                    }
                    if (searchInSubject)
                    {
                        searchTextQuery.Add(
                            new QueryParser(Version.LUCENE_29, "subject", analyzer).Parse(searchText),
                            Occur.SHOULD);
                    }
                    if (searchAuthor)
                    {
                        searchTextQuery.Add(
                            new QueryParser(Version.LUCENE_29, "usernick", analyzer).Parse(searchText),
                            Occur.SHOULD);
                    }
                }
                query.Add(searchTextQuery, Occur.MUST);
            }

            var searcher = new IndexSearcher(indexPath, true);
            try
            {
                var topDocs = searcher.Search(query, _maxSearchReults);

                // AddRange enumerates the projection immediately, while the searcher is still open.
                result
                .AddRange(
                    topDocs
                    .ScoreDocs
                    .Select(scored => searcher.Doc(scored.Doc).Get("mid")));
            }
            finally
            {
                searcher.Close();
            }

            return(result);
        }
예제 #14
0
 /// <summary>
 /// Opens a writer on <c>LuceneConfig.Directory</c> and calls <c>Optimize()</c> on it.
 /// </summary>
 public static void Optimize()
 {
     using (var russianAnalyzer = new RussianAnalyzer(Version.LUCENE_30))
     {
         using (var indexWriter = new IndexWriter(LuceneConfig.Directory, russianAnalyzer, IndexWriter.MaxFieldLength.UNLIMITED))
         {
             indexWriter.Optimize();
         }
     }
 }