/// <summary>
/// Adds the given card criteria to the Lucene index, replacing any older entries.
/// </summary>
/// <param name="cardCriteria">Criteria to (re)index.</param>
public static void AddUpdateLuceneIndex(IEnumerable<CardCriterion> cardCriteria)
{
    // Disposing the writer flushes and commits all pending changes.
    using (var analyzer = new RussianAnalyzer(Version.LUCENE_30))
    using (var indexWriter = new IndexWriter(LuceneConfig.Directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED))
    {
        foreach (var item in cardCriteria)
        {
            _addToLuceneIndex(item, indexWriter);
        }
    }
}
/// <summary>
/// Adds the given sample data items to the Lucene index, replacing any older entries.
/// </summary>
/// <param name="sampleDatas">Items to (re)index.</param>
public static void AddUpdateLuceneIndex(IEnumerable<SampleData> sampleDatas)
{
    using (var luceneDirectory = LuceneDirectory)
    using (var analyzer = new RussianAnalyzer(Version))
    using (var writer = new IndexWriter(luceneDirectory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED))
    {
        // Cap the in-memory buffer before segments are flushed to disk.
        writer.SetRAMBufferSizeMB(10);

        foreach (var data in sampleDatas)
        {
            AddToLuceneIndex(data, writer);
        }

        // Make the new entries visible to index readers.
        writer.Commit();
    }
}
/// <summary>
/// Deletes every document from the Lucene index.
/// </summary>
/// <returns><c>true</c> on success; <c>false</c> if any exception occurred.</returns>
public static bool ClearIndex()
{
    try
    {
        // FIX: the original created the analyzer outside any using block
        // (leaked on exception) and then called writer.Dispose() manually
        // inside the writer's own using block, disposing it twice. The
        // using declarations below dispose both resources exactly once.
        using (var analyzer = new RussianAnalyzer(Version.LUCENE_30))
        using (var writer = new IndexWriter(_directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED))
        {
            writer.DeleteAll();
        }
    }
    catch (Exception)
    {
        // Best-effort: any failure (index lock held, IO error) reports false.
        return false;
    }

    return true;
}
/// <summary>
/// Recreates the Lucene index from scratch, discarding all existing entries.
/// </summary>
/// <returns><c>true</c> when the index was cleared; <c>false</c> on any error.</returns>
public static bool ClearLuceneIndex()
{
    try
    {
        using (var analyzer = new RussianAnalyzer(Version.LUCENE_30))
        {
            // create: true truncates the index on open; DeleteAll removes
            // anything that might remain.
            using (var writer = new IndexWriter(LuceneConfig.Directory, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED))
            {
                writer.DeleteAll();
            }
        }
    }
    catch (Exception)
    {
        return false;
    }

    return true;
}
/// <summary>
/// Verifies that the Russian analyzer keeps digit-only tokens ("1000")
/// instead of dropping them from the token stream.
/// </summary>
public void TestDigitsInRussianCharset()
{
    TextReader reader = new StringReader("text 1000");
    RussianAnalyzer ra = new RussianAnalyzer(Version.LUCENE_CURRENT);
    TokenStream stream = ra.TokenStream("", reader);

    ITermAttribute termText = stream.GetAttribute<ITermAttribute>();
    try
    {
        Assert.True(stream.IncrementToken());
        Assert.AreEqual("text", termText.Term);
        Assert.True(stream.IncrementToken());
        Assert.AreEqual("1000", termText.Term, "RussianAnalyzer's tokenizer skips numbers from input text");
        Assert.False(stream.IncrementToken());
    }
    catch (IOException)
    {
        // FIX: the original bound the exception to an unused local `e`
        // (compiler warning CS0168); the variable is not needed here.
        Assert.Fail("unexpected IOException");
    }
}
/// <summary>
/// Compares the analyzer's output for KOI8-encoded Russian text against the
/// expected token stream produced by a RussianLetterTokenizer over the
/// reference file.
/// </summary>
public virtual void TestKOI8()
{
    RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.KOI8);

    // Both files hold KOI8 bytes; an 8-bit codec maps bytes 1:1 to chars.
    var byteEncoding = System.Text.Encoding.GetEncoding("iso-8859-1");
    string wordsPath = new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\testKOI8.txt").FullName;
    string samplePath = new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\resKOI8.htm").FullName;

    inWordsKOI8 = new System.IO.StreamReader(
        new System.IO.FileStream(wordsPath, System.IO.FileMode.Open, System.IO.FileAccess.Read),
        byteEncoding);
    sampleKOI8 = new System.IO.StreamReader(
        new System.IO.FileStream(samplePath, System.IO.FileMode.Open, System.IO.FileAccess.Read),
        byteEncoding);

    TokenStream analyzed = ra.TokenStream("all", inWordsKOI8);
    RussianLetterTokenizer expected = new RussianLetterTokenizer(sampleKOI8, RussianCharsets.KOI8);

    // Walk both streams in lock step until the analyzer runs out of tokens.
    Token token;
    while ((token = analyzed.Next()) != null)
    {
        Token expectedToken = expected.Next();
        Assert.AreEqual(token.TermText(), expectedToken == null ? null : expectedToken.TermText(), "KOI8");
    }

    inWordsKOI8.Close();
    sampleKOI8.Close();
}
/// <summary>
/// Compares the analyzer's output for Unicode-encoded Russian text against
/// the expected token stream from a RussianLetterTokenizer over the
/// reference file.
/// </summary>
public virtual void TestUnicode()
{
    RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.UnicodeRussian);

    var unicodeEncoding = System.Text.Encoding.GetEncoding("Unicode");
    string wordsPath = new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\testUnicode.txt").FullName;
    string samplePath = new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\resUnicode.htm").FullName;

    inWords = new System.IO.StreamReader(
        new System.IO.FileStream(wordsPath, System.IO.FileMode.Open, System.IO.FileAccess.Read),
        unicodeEncoding);
    sampleUnicode = new System.IO.StreamReader(
        new System.IO.FileStream(samplePath, System.IO.FileMode.Open, System.IO.FileAccess.Read),
        unicodeEncoding);

    TokenStream analyzed = ra.TokenStream("all", inWords);
    RussianLetterTokenizer expected = new RussianLetterTokenizer(sampleUnicode, RussianCharsets.UnicodeRussian);

    // Compare token-by-token until the analyzer stream is exhausted.
    Token token;
    while ((token = analyzed.Next()) != null)
    {
        Token expectedToken = expected.Next();
        Assert.AreEqual(token.TermText(), expectedToken == null ? null : expectedToken.TermText(), "Unicode");
    }

    inWords.Close();
    sampleUnicode.Close();
}
/// <summary>
/// Checks that analyzing Unicode Russian text yields the same tokens as the
/// reference RussianLetterTokenizer over the expected-results file.
/// </summary>
public virtual void TestUnicode()
{
    RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.UnicodeRussian);

    var enc = System.Text.Encoding.GetEncoding("Unicode");

    inWords = new System.IO.StreamReader(
        new System.IO.FileStream(
            new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\testUnicode.txt").FullName,
            System.IO.FileMode.Open,
            System.IO.FileAccess.Read),
        enc);

    sampleUnicode = new System.IO.StreamReader(
        new System.IO.FileStream(
            new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\resUnicode.htm").FullName,
            System.IO.FileMode.Open,
            System.IO.FileAccess.Read),
        enc);

    TokenStream actualStream = ra.TokenStream("all", inWords);
    RussianLetterTokenizer referenceStream = new RussianLetterTokenizer(sampleUnicode, RussianCharsets.UnicodeRussian);

    while (true)
    {
        Token actual = actualStream.Next();
        if (actual == null)
        {
            break;
        }

        Token reference = referenceStream.Next();
        Assert.AreEqual(actual.TermText(), reference == null ? null : reference.TermText(), "Unicode");
    }

    inWords.Close();
    sampleUnicode.Close();
}
/// <summary>
/// Compares the analyzer's output for CP1251-encoded Russian text against
/// the expected token stream from a RussianLetterTokenizer over the
/// reference file.
/// </summary>
public virtual void Test1251()
{
    // Both files hold CP1251 bytes; an 8-bit codec maps bytes 1:1 to chars.
    var byteEncoding = System.Text.Encoding.GetEncoding("iso-8859-1");
    string wordsPath = new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\test1251.txt").FullName;
    string samplePath = new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\res1251.htm").FullName;

    inWords1251 = new System.IO.StreamReader(
        new System.IO.FileStream(wordsPath, System.IO.FileMode.Open, System.IO.FileAccess.Read),
        byteEncoding);
    sample1251 = new System.IO.StreamReader(
        new System.IO.FileStream(samplePath, System.IO.FileMode.Open, System.IO.FileAccess.Read),
        byteEncoding);

    RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.CP1251);
    TokenStream analyzed = ra.TokenStream("", inWords1251);
    RussianLetterTokenizer expected = new RussianLetterTokenizer(sample1251, RussianCharsets.CP1251);

    // Walk both streams in lock step until the analyzer runs out of tokens.
    Token token;
    while ((token = analyzed.Next()) != null)
    {
        Token expectedToken = expected.Next();
        Assert.AreEqual(token.TermText(), expectedToken == null ? null : expectedToken.TermText(), "1251");
    }

    inWords1251.Close();
    sample1251.Close();
}
/// <summary>
/// Main search method: builds a Lucene query from the criteria and maps the
/// hits back to card criteria.
/// </summary>
/// <param name="searchCriterion">Field/value pairs to search on.</param>
/// <returns>Matching cards; empty when any value consists only of wildcards.</returns>
private IEnumerable<CardCriterion> _search(List<KeyValuePair<string, object>> searchCriterion)
{
    // A value made up solely of wildcard characters (* or ?) would match
    // everything, so bail out early with no results.
    // FIX: the original called kvp.Value.toString() (no such C# member —
    // must be ToString()) and, despite the comment, only stripped '?'
    // and never '*'. Also replaced Where(...).ToList().Any() with Any(pred).
    if (searchCriterion.Any(kvp => string.IsNullOrEmpty(
            kvp.Value.ToString().Replace("?", "").Replace("*", ""))))
    {
        return new List<CardCriterion>();
    }

    // Set up the Lucene searcher in read-only mode.
    using (var searcher = new IndexSearcher(LuceneConfig.Directory, false))
    {
        IEnumerable<CardCriterion> resultList;
        using (var analyzer = new RussianAnalyzer(Version.LUCENE_30)) // ships with a Russian stopword list
        {
            Query query = new QueryMaker(analyzer, this, searchCriterion).MakeQuery();
            ScoreDoc[] hits = searcher.Search(query, LuceneConfig.HitsLimit).ScoreDocs;
            resultList = _mapLuceneToDataList(hits, searcher);
        }

        return resultList;
    }
}
/// <summary>
/// Checks that analyzing KOI8 Russian text yields the same tokens as the
/// reference RussianLetterTokenizer over the expected-results file.
/// </summary>
public virtual void TestKOI8()
{
    RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.KOI8);

    var enc = System.Text.Encoding.GetEncoding("iso-8859-1");

    inWordsKOI8 = new System.IO.StreamReader(
        new System.IO.FileStream(
            new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\testKOI8.txt").FullName,
            System.IO.FileMode.Open,
            System.IO.FileAccess.Read),
        enc);

    sampleKOI8 = new System.IO.StreamReader(
        new System.IO.FileStream(
            new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\resKOI8.htm").FullName,
            System.IO.FileMode.Open,
            System.IO.FileAccess.Read),
        enc);

    TokenStream actualStream = ra.TokenStream("all", inWordsKOI8);
    RussianLetterTokenizer referenceStream = new RussianLetterTokenizer(sampleKOI8, RussianCharsets.KOI8);

    while (true)
    {
        Token actual = actualStream.Next();
        if (actual == null)
        {
            break;
        }

        Token reference = referenceStream.Next();
        Assert.AreEqual(actual.TermText(), reference == null ? null : reference.TermText(), "KOI8");
    }

    inWordsKOI8.Close();
    sampleKOI8.Close();
}
/// <summary>
/// Checks that analyzing CP1251 Russian text yields the same tokens as the
/// reference RussianLetterTokenizer over the expected-results file.
/// </summary>
public virtual void Test1251()
{
    var enc = System.Text.Encoding.GetEncoding("iso-8859-1");

    inWords1251 = new System.IO.StreamReader(
        new System.IO.FileStream(
            new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\test1251.txt").FullName,
            System.IO.FileMode.Open,
            System.IO.FileAccess.Read),
        enc);

    sample1251 = new System.IO.StreamReader(
        new System.IO.FileStream(
            new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\res1251.htm").FullName,
            System.IO.FileMode.Open,
            System.IO.FileAccess.Read),
        enc);

    RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.CP1251);
    TokenStream actualStream = ra.TokenStream("", inWords1251);
    RussianLetterTokenizer referenceStream = new RussianLetterTokenizer(sample1251, RussianCharsets.CP1251);

    while (true)
    {
        Token actual = actualStream.Next();
        if (actual == null)
        {
            break;
        }

        Token reference = referenceStream.Next();
        Assert.AreEqual(actual.TermText(), reference == null ? null : reference.TermText(), "1251");
    }

    inWords1251.Close();
    sample1251.Close();
}
/// <summary>
/// Searches the previously built index and returns the IDs of matching messages.
/// </summary>
/// <param name="forumID">Forum to restrict the search to, or -1 for all forums.</param>
/// <param name="searchText">Query text; prefix with the signature (**) for raw Lucene syntax.</param>
/// <param name="searchInText">Search the message body.</param>
/// <param name="searchInSubject">Search the subject field.</param>
/// <param name="searchAuthor">Search the author nickname field.</param>
/// <param name="searchInMyMessages">Restrict to the current user's own messages.</param>
/// <param name="searchAnyWords">When true, match any word (OR); otherwise require all words (AND).</param>
/// <param name="from">Lower date bound; ignored when both bounds have zero ticks.</param>
/// <param name="to">Upper date bound; ignored when both bounds have zero ticks.</param>
/// <returns>Message IDs ("mid" field) of the hits, up to the configured maximum.</returns>
private static ICollection<string> Search(
    int forumID,
    string searchText,
    bool searchInText,
    bool searchInSubject,
    bool searchAuthor,
    bool searchInMyMessages,
    bool searchAnyWords,
    DateTime from,
    DateTime to)
{
    var result = new List<string>();
    var query = new BooleanQuery();
    var indexPath = GetIndexDir();
    var searchTextExists = !string.IsNullOrEmpty(searchText);

    #region Query-string preprocessing
    // The search-language signature is ** at the start of the text.
    if (searchTextExists)
    {
        if (searchText.StartsWith(_signature))
        {
            // The user opted into the raw query language: strip the signature
            // and treat the remainder as Lucene query syntax.
            searchText = searchText.Substring(_signature.Length);
        }
        else
        {
            // Simple search: escape Lucene special characters, tokenize on
            // spaces, and honour searchAnyWords (OR vs. AND semantics).
            // Order matters: "\\" must be escaped first or it would
            // double-escape the backslashes added for the other characters.
            var specChars = new[]
            {
                "\\", "+", "-", "&", "|", "!", "(", ")", "{", "}", "[", "]",
                "^", "\"", "~", "*", "?", ":"
            };

            searchText = specChars
                .Aggregate(
                    searchText,
                    (current, specChar) => current.Replace(specChar, "\\" + specChar));

            var token = searchText.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

            searchText = searchAnyWords
                ? string.Join(" ", token)
                : "+" + string.Join(" +", token); // '+' makes every term required
        }
    }
    #endregion

    if (forumID != -1)
    {
        query.Add(
            new TermQuery(new Term("gid", forumID.ToString())),
            Occur.MUST);
    }

    if (searchInMyMessages)
    {
        query.Add(
            new TermQuery(new Term("uid", Config.Instance.SelfId.ToString())),
            Occur.MUST);
    }

    // Both bounds at zero ticks means "no date filter".
    if (from.Ticks != 0 || to.Ticks != 0)
    {
        var rq = new TermRangeQuery("dte", FormatDate(from), FormatDate(to), true, true);
        query.Add(rq, Occur.MUST);
    }

    if (searchTextExists)
    {
        // FIX: the QueryParsers were created with Version.LUCENE_29 while the
        // analyzer uses LUCENE_30 — use one version consistently. The analyzer
        // is also disposed now (it was previously leaked).
        using (var analyzer = new RussianAnalyzer(Version.LUCENE_30))
        {
            var searchTextQuery = new BooleanQuery();

            if (searchInText)
            {
                searchTextQuery.Add(
                    new QueryParser(Version.LUCENE_30, "message", analyzer).Parse(searchText),
                    Occur.SHOULD);
            }

            if (searchInSubject)
            {
                searchTextQuery.Add(
                    new QueryParser(Version.LUCENE_30, "subject", analyzer).Parse(searchText),
                    Occur.SHOULD);
            }

            if (searchAuthor)
            {
                searchTextQuery.Add(
                    new QueryParser(Version.LUCENE_30, "usernick", analyzer).Parse(searchText),
                    Occur.SHOULD);
            }

            query.Add(searchTextQuery, Occur.MUST);
        }
    }

    var searcher = new IndexSearcher(indexPath, true);
    try
    {
        var topDocs = searcher.Search(query, _maxSearchReults);
        result.AddRange(
            topDocs
                .ScoreDocs
                .Select(scored => searcher.Doc(scored.Doc).Get("mid")));
    }
    finally
    {
        searcher.Close();
    }

    return result;
}
/// <summary>
/// Merges index segments to speed up subsequent searches.
/// </summary>
public static void Optimize()
{
    using (var analyzer = new RussianAnalyzer(Version.LUCENE_30))
    {
        using (var writer = new IndexWriter(LuceneConfig.Directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED))
        {
            writer.Optimize();
        }
    }
}