/// <summary>
/// Analyzes a Russian sentence with an analyzer that was given an extra set
/// containing "представление" (the stem exclusion set, per the test name) and
/// expects that word to come out unchanged while the other terms are stemmed.
/// </summary>
public virtual void TestWithStemExclusionSet()
{
    CharArraySet stemExclusions = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
    stemExclusions.add("представление");

    Analyzer analyzer = new RussianAnalyzer(
        TEST_VERSION_CURRENT,
        RussianAnalyzer.DefaultStopSet,
        stemExclusions);

    string[] expectedTerms =
    {
        "вмест", "сил", "электромагнитн", "энерг", "имел", "представление"
    };

    AssertAnalyzesTo(
        analyzer,
        "Вместе с тем о силе электромагнитной энергии имели представление еще",
        expectedTerms);
}
/// <summary>
/// Runs the analyzer over <c>ru\testUTF8.txt</c> and compares every produced
/// term with the corresponding token read from <c>ru\resUTF8.txt</c>.
/// </summary>
public void TestUnicode()
{
    RussianAnalyzer analyzer = new RussianAnalyzer(Version.LUCENE_CURRENT);

    using (inWords = new StreamReader(@"ru\testUTF8.txt", Encoding.UTF8))
    using (sampleUnicode = new StreamReader(@"ru\resUTF8.txt", Encoding.UTF8))
    {
        TokenStream analyzed = analyzer.TokenStream("all", inWords);
        RussianLetterTokenizer reference = new RussianLetterTokenizer(sampleUnicode);

        ITermAttribute analyzedTerm = analyzed.GetAttribute<ITermAttribute>();
        ITermAttribute referenceTerm = reference.GetAttribute<ITermAttribute>();

        // Walk the analyzed stream to its end; the reference stream is advanced in lock-step.
        while (analyzed.IncrementToken())
        {
            bool hasReferenceToken = reference.IncrementToken();

            // When the reference stream runs out first, the comparison is made against null.
            Assert.AreEqual(
                analyzedTerm.Term,
                hasReferenceToken ? referenceTerm.Term : null,
                "Unicode");
        }
    }
}
/// <summary>
/// Deletes from the index in <c>LuceneConfig.Directory</c> the documents
/// matched by a term query on the "Id" field for the given record id.
/// </summary>
/// <param name="record_id">Record id whose string form is looked up in the "Id" field.</param>
public static void ClearLuceneIndexRecord(int record_id)
{
    using (var russianAnalyzer = new RussianAnalyzer(Version.LUCENE_30))
    {
        using (var indexWriter = new IndexWriter(
            LuceneConfig.Directory,
            russianAnalyzer,
            IndexWriter.MaxFieldLength.UNLIMITED))
        {
            // Drop the existing index entry for this record.
            var idTerm = new Term("Id", record_id.ToString());
            indexWriter.DeleteDocuments(new TermQuery(idTerm));
        }
    }
}
/// <summary>
/// Opens an index writer on <c>LuceneConfig.Directory</c> and passes every
/// criterion to <c>_addToLuceneIndex</c>.
/// </summary>
/// <param name="cardCriteria">Criteria to write to the search index.</param>
public static void AddUpdateLuceneIndex(IEnumerable<CardCriterion> cardCriteria)
{
    // NOTE: a SnowballAnalyzer("Russian") was considered as an alternative; whether it
    // ships stop words (or needs a custom stop-word list) was left as an open question.
    using (var russianAnalyzer = new RussianAnalyzer(Version.LUCENE_30))
    {
        using (var indexWriter = new IndexWriter(
            LuceneConfig.Directory,
            russianAnalyzer,
            IndexWriter.MaxFieldLength.UNLIMITED))
        {
            // Presumably _addToLuceneIndex replaces an older entry for the same item — confirm there.
            foreach (var item in cardCriteria)
            {
                _addToLuceneIndex(item, indexWriter);
            }
        }
    }
}
/// <summary>
/// Removes every document from the index in <c>LuceneConfig.Directory</c>.
/// </summary>
/// <returns><c>true</c> when the operation completed; <c>false</c> when any exception was thrown.</returns>
public static bool ClearLuceneIndex()
{
    bool cleared;

    try
    {
        using (var russianAnalyzer = new RussianAnalyzer(Version.LUCENE_30))
        {
            // The writer is opened with the "create" flag set to true.
            using (var indexWriter = new IndexWriter(
                LuceneConfig.Directory,
                russianAnalyzer,
                true,
                IndexWriter.MaxFieldLength.UNLIMITED))
            {
                indexWriter.DeleteAll();
            }
        }

        cleared = true;
    }
    catch (Exception)
    {
        // Failure is reported through the return value only.
        cleared = false;
    }

    return cleared;
}
/// <summary>
/// Verifies that the analyzer's tokenizer keeps numeric tokens: "text 1000"
/// must produce exactly the two terms "text" and "1000".
/// </summary>
public void TestDigitsInRussianCharset()
{
    TextReader reader = new StringReader("text 1000");
    RussianAnalyzer ra = new RussianAnalyzer(Version.LUCENE_CURRENT);
    TokenStream stream = ra.TokenStream("", reader);
    ITermAttribute termText = stream.GetAttribute<ITermAttribute>();

    try
    {
        Assert.True(stream.IncrementToken());
        Assert.AreEqual("text", termText.Term);

        Assert.True(stream.IncrementToken());
        Assert.AreEqual("1000", termText.Term, "RussianAnalyzer's tokenizer skips numbers from input text");

        // No further tokens are expected.
        Assert.False(stream.IncrementToken());
    }
    catch (IOException e)
    {
        // Include the exception so the failure report shows what actually went wrong.
        Assert.Fail("unexpected IOException: " + e);
    }
}
/// <summary>
/// Checks that analyzing "text 1000" yields the terms "text" and "1000",
/// i.e. the numeric token is not dropped.
/// </summary>
public virtual void TestDigitsInRussianCharset()
{
    string[] expectedTerms = { "text", "1000" };
    RussianAnalyzer analyzer = new RussianAnalyzer(TEST_VERSION_CURRENT);

    AssertAnalyzesTo(analyzer, "text 1000", expectedTerms);
}
/// <summary>
/// Searches the previously built index.
/// </summary>
/// <param name="forumID">Value required in the "gid" field; -1 disables this filter.</param>
/// <param name="searchText">
/// User input. When it starts with <c>_signature</c> the remainder is handed to the
/// query parser as-is; otherwise it is escaped and split into space-separated words.
/// Null, empty or blank input disables the text filter.
/// </param>
/// <param name="searchInText">Match the text against the "message" field.</param>
/// <param name="searchInSubject">Match the text against the "subject" field.</param>
/// <param name="searchAuthor">Match the text against the "usernick" field.</param>
/// <param name="searchInMyMessages">Require the "uid" field to equal <c>Config.Instance.SelfId</c>.</param>
/// <param name="searchAnyWords">
/// Simple search only: <c>true</c> leaves the words optional (OR), <c>false</c> prefixes
/// each word with "+" (AND).
/// </param>
/// <param name="from">Lower bound for the "dte" range filter.</param>
/// <param name="to">Upper bound for the "dte" range filter.</param>
/// <returns>The "mid" field of each hit, limited to <c>_maxSearchReults</c> hits.</returns>
private static ICollection<string> Search(
    int forumID,
    string searchText,
    bool searchInText,
    bool searchInSubject,
    bool searchAuthor,
    bool searchInMyMessages,
    bool searchAnyWords,
    DateTime from,
    DateTime to)
{
    var result = new List<string>();
    var query = new BooleanQuery();
    var indexPath = GetIndexDir();

    // null means "no text filter": the query parser must never be given empty input.
    var queryText = PrepareQueryText(searchText, searchAnyWords);

    if (forumID != -1)
        query.Add(
            new TermQuery(new Term("gid", forumID.ToString())),
            Occur.MUST);

    if (searchInMyMessages)
        query.Add(
            new TermQuery(new Term("uid", Config.Instance.SelfId.ToString())),
            Occur.MUST);

    // The date filter applies as soon as either bound is set (inclusive on both ends).
    if (from.Ticks != 0 || to.Ticks != 0)
    {
        var rq = new TermRangeQuery("dte", FormatDate(from), FormatDate(to), true, true);
        query.Add(rq, Occur.MUST);
    }

    if (queryText != null)
    {
        var fields = new List<string>();
        if (searchInText)
            fields.Add("message");
        if (searchInSubject)
            fields.Add("subject");
        if (searchAuthor)
            fields.Add("usernick");

        // A hit in any selected field is enough (SHOULD), but at least one is required (MUST).
        var searchTextQuery = new BooleanQuery();
        using (var analyzer = new RussianAnalyzer(Version.LUCENE_30))
        {
            foreach (var field in fields)
                searchTextQuery.Add(
                    new QueryParser(Version.LUCENE_29, field, analyzer).Parse(queryText),
                    Occur.SHOULD);
        }

        query.Add(searchTextQuery, Occur.MUST);
    }

    var searcher = new IndexSearcher(indexPath, true);
    try
    {
        var topDocs = searcher.Search(query, _maxSearchReults);
        result
            .AddRange(
                topDocs
                    .ScoreDocs
                    .Select(scored => searcher.Doc(scored.Doc).Get("mid")));
    }
    finally
    {
        searcher.Close();
    }

    return result;
}

/// <summary>
/// Converts raw user input into the string handed to the query parser, or returns
/// <c>null</c> when nothing searchable remains.
/// </summary>
private static string PrepareQueryText(string searchText, bool searchAnyWords)
{
    if (string.IsNullOrEmpty(searchText))
        return null;

    // Query-language signature (given as "**" in the original comment): the caller wants
    // the query language, so strip the signature and treat the rest as written in it.
    // Ordinal comparison keeps the prefix test in step with the Substring offset below.
    if (searchText.StartsWith(_signature, StringComparison.Ordinal))
    {
        var rawQuery = searchText.Substring(_signature.Length);
        return rawQuery.Trim().Length == 0 ? null : rawQuery;
    }

    // Simple search: escape special characters, split into tokens (space is the
    // separator) and honour the searchAnyWords flag (AND/OR).
    // Order matters: the backslash must be escaped first.
    var specChars = new[]
        {"\\", "+", "-", "&", "|", "!", "(", ")", "{", "}", "[", "]", "^", "\"", "~", "*", "?", ":"};
    var escaped = specChars
        .Aggregate(
            searchText,
            (current, specChar) => current.Replace(specChar, "\\" + specChar));

    var tokens = escaped.Split(new[] {' '}, StringSplitOptions.RemoveEmptyEntries);
    if (tokens.Length == 0)
        return null;

    return searchAnyWords
        ? string.Join(" ", tokens)
        : "+" + string.Join(" +", tokens);
}
/// <summary>
/// Main search method: builds a query from the given criteria, runs it against the
/// index in <c>LuceneConfig.Directory</c> and maps the hits to <c>CardCriterion</c> items.
/// </summary>
/// <param name="searchCriterion">Key/value pairs handed to <c>QueryMaker</c>.</param>
/// <returns>
/// The mapped hits (at most <c>LuceneConfig.HitsLimit</c> are requested), or an empty list
/// when any criterion value is null or consists only of the wildcard characters * and ?.
/// </returns>
private IEnumerable<CardCriterion> _search(List<KeyValuePair<string, object>> searchCriterion)
{
    // Strip the wildcard characters; if nothing is left of a value there is nothing to search for.
    bool hasBlankCriterion = searchCriterion.Any(kvp =>
        kvp.Value == null ||
        string.IsNullOrEmpty(kvp.Value.ToString().Replace("*", "").Replace("?", "")));
    if (hasBlankCriterion)
        return new List<CardCriterion>();

    // set up lucene searcher
    using (var searcher = new IndexSearcher(LuceneConfig.Directory, false))
    using (var analyzer = new RussianAnalyzer(Version.LUCENE_30))
    {
        Query query = new QueryMaker(analyzer, this, searchCriterion).MakeQuery();
        ScoreDoc[] hits = searcher.Search(query, LuceneConfig.HitsLimit).ScoreDocs;

        // NOTE(review): the mapped results outlive the searcher; presumably
        // _mapLuceneToDataList materializes them before returning — confirm.
        return _mapLuceneToDataList(hits, searcher);
    }
}
/// <summary>
/// Analyzes two sentences in a row with one analyzer created for
/// <c>LuceneVersion.LUCENE_30</c> and checks the terms produced for each.
/// </summary>
public virtual void TestReusableTokenStream30()
{
    Analyzer analyzer = new RussianAnalyzer(LuceneVersion.LUCENE_30);

    string[] firstExpected =
    {
        "вмест", "сил", "электромагнитн", "энерг", "имел", "представлен"
    };
    AssertAnalyzesTo(
        analyzer,
        "Вместе с тем о силе электромагнитной энергии имели представление еще",
        firstExpected);

    // Second pass through the same analyzer instance.
    string[] secondExpected = { "знан", "хран", "тайн" };
    AssertAnalyzesTo(analyzer, "Но знание это хранилось в тайне", secondExpected);
}
/// <summary>
/// Analyzes two sentences in a row with one analyzer created for
/// <c>TEST_VERSION_CURRENT</c> and checks the terms produced for each.
/// </summary>
public virtual void TestReusableTokenStream()
{
    Analyzer analyzer = new RussianAnalyzer(TEST_VERSION_CURRENT);

    string[] firstExpected =
    {
        "вмест", "сил", "электромагнитн", "энерг", "имел", "представлен"
    };
    AssertAnalyzesTo(
        analyzer,
        "Вместе с тем о силе электромагнитной энергии имели представление еще",
        firstExpected);

    // Second pass through the same analyzer instance.
    string[] secondExpected = { "знан", "эт", "хран", "тайн" };
    AssertAnalyzesTo(analyzer, "Но знание это хранилось в тайне", secondExpected);
}
/// <summary>
/// Opens an index writer on <c>LuceneConfig.Directory</c> and calls
/// <c>Optimize()</c> on it.
/// </summary>
public static void Optimize()
{
    using (var russianAnalyzer = new RussianAnalyzer(Version.LUCENE_30))
    {
        using (var indexWriter = new IndexWriter(
            LuceneConfig.Directory,
            russianAnalyzer,
            IndexWriter.MaxFieldLength.UNLIMITED))
        {
            indexWriter.Optimize();
        }
    }
}