protected override Analyzer GetAnalyzer(Net.Util.Version version)
{
    // Wrap the base analyzer so exact-match fields bypass tokenization.
    var wrapper = new PerFieldAnalyzerWrapper(base.GetAnalyzer(version));
    wrapper.AddAnalyzer("Version", new KeywordAnalyzer());
    wrapper.AddAnalyzer("Flag", new KeywordAnalyzer());
    return wrapper;
}
protected override Analyzer GetAnalyzer(Net.Util.Version version)
{
    // Per-field overrides: "Path" matches verbatim but case-insensitively,
    // "Key" matches verbatim with case preserved.
    var wrapper = new PerFieldAnalyzerWrapper(base.GetAnalyzer(version));
    wrapper.AddAnalyzer("Path", new CaseInsensitiveKeywordAnalyzer());
    wrapper.AddAnalyzer("Key", new KeywordAnalyzer());
    return wrapper;
}
// Builds the per-field analyzer: a default analyzer instance (StandardAnalyzer
// when no type is given) plus one analyzer per field that declares its own.
void SetAnalyzerType(Type defaultType, IEnumerable<FieldDetails> fields)
{
    _defaultAnalyzer = Activator.CreateInstance(defaultType ?? typeof(StandardAnalyzer)) as Analyzer;
    if (_defaultAnalyzer == null)
    {
        throw new ArgumentException("defaultType is not an Analyzer type");
    }

    var wrapper = new PerFieldAnalyzerWrapper(_defaultAnalyzer);
    if (fields != null)
    {
        foreach (var fd in fields)
        {
            if (fd.Field.Analyzer == null)
            {
                continue; // field uses the default analyzer
            }
            var fieldAnalyzer = CreateAnalyzerFromType(fd.Field.Analyzer);
            if (fieldAnalyzer != null)
            {
                wrapper.AddAnalyzer(fd.Name, fieldAnalyzer);
            }
        }
    }
    Analyzer = wrapper;
}
/// <summary>
/// Builds the per-field analyzer used by the EPiServer indexing service and
/// injects it into the settings type's private static "_analyzer" field.
/// </summary>
/// <param name="indexingServiceSettingsType">Type holding the static "_analyzer" field.</param>
/// <param name="defaultAnalyzer">Analyzer for fields with no explicit mapping.</param>
/// <param name="textAnalyzer">Analyzer for free-text fields (title, display text, authors, default).</param>
private void InitSearchServiceAnalyzer(Type indexingServiceSettingsType, Analyzer defaultAnalyzer, Analyzer textAnalyzer)
{
    var wrapper = new PerFieldAnalyzerWrapper(defaultAnalyzer);

    // Identifier fields: indexed verbatim as a single token.
    foreach (var field in new[] { "EPISERVER_SEARCH_ID", "EPISERVER_SEARCH_CULTURE", "EPISERVER_SEARCH_REFERENCEID", "EPISERVER_SEARCH_AUTHORSTORAGE" })
    {
        wrapper.AddAnalyzer(field, new KeywordAnalyzer());
    }

    // Multi-value / date-like fields: split on whitespace only.
    foreach (var field in new[] { "EPISERVER_SEARCH_CATEGORIES", "EPISERVER_SEARCH_ACL", "EPISERVER_SEARCH_VIRTUALPATH", "EPISERVER_SEARCH_TYPE", "EPISERVER_SEARCH_CREATED", "EPISERVER_SEARCH_MODIFIED", "EPISERVER_SEARCH_PUBLICATIONEND", "EPISERVER_SEARCH_PUBLICATIONSTART", "EPISERVER_SEARCH_ITEMSTATUS" })
    {
        wrapper.AddAnalyzer(field, new WhitespaceAnalyzer());
    }

    // Free-text fields: use the caller-supplied text analyzer.
    foreach (var field in new[] { "EPISERVER_SEARCH_TITLE", "EPISERVER_SEARCH_DISPLAYTEXT", "EPISERVER_SEARCH_AUTHORS", "EPISERVER_SEARCH_DEFAULT" })
    {
        wrapper.AddAnalyzer(field, textAnalyzer);
    }

    // Fail with a descriptive exception (instead of a NullReferenceException)
    // if the expected private static field does not exist on the type.
    var analyzerField = indexingServiceSettingsType.GetField("_analyzer", BindingFlags.Static | BindingFlags.NonPublic);
    if (analyzerField == null)
    {
        throw new MissingFieldException(indexingServiceSettingsType.FullName, "_analyzer");
    }
    analyzerField.SetValue(null, wrapper);
}
/// <summary>
/// Detects untokenized fields and sets as NotAnalyzed in analyzer.
/// For each "[[...]]"-wrapped match, registers the keyword analyzer for the
/// matched field (so its terms are not tokenized) and strips the enclosing
/// "[[" / "]]" markers from the query text.
/// </summary>
/// <param name="analyzer">Wrapper to register field-specific keyword analysis on.</param>
/// <param name="query">The raw query text.</param>
/// <param name="keywordAnalyzer">Analyzer registered for each untokenized field.</param>
/// <returns>The query with the "[[" / "]]" markers removed.</returns>
private static string PreProcessUntokenizedTerms(PerFieldAnalyzerWrapper analyzer, string query, Analyzer keywordAnalyzer)
{
    var untokenizedMatches = untokenizedQuery.Matches(query);
    if (untokenizedMatches.Count < 1)
    {
        return query;
    }

    var sb = new StringBuilder(query);
    // Process in reverse so earlier match indexes stay valid while we edit.
    for (var i = untokenizedMatches.Count - 1; i >= 0; i--)
    {
        var match = untokenizedMatches[i];
        // KeywordAnalyzer will not tokenize the values of this field.
        analyzer.AddAnalyzer(match.Groups[1].Value, keywordAnalyzer);

        var term = match.Groups[2];
        // Remove enclosing "[[" "]]" from the term value (trailing pair first
        // to keep indexes valid).
        sb.Remove(term.Index + term.Length - 2, 2);
        sb.Remove(term.Index, 2);
    }
    return sb.ToString();
}
protected override Analyzer GetAnalyzer(Net.Util.Version version)
{
    // Id matches verbatim; Key matches verbatim but case-insensitively.
    analyzer = new PerFieldAnalyzerWrapper(base.GetAnalyzer(version));
    analyzer.AddAnalyzer<SampleDocument>(doc => doc.Id, new KeywordAnalyzer());
    analyzer.AddAnalyzer<SampleDocument>(doc => doc.Key, new CaseInsensitiveKeywordAnalyzer());
    return analyzer;
}
// Search button handler: runs a fixed BooleanQuery against the in-memory index
// and lists the matching para/surat/verse identifiers.
private void btnSearch_Click(object sender, EventArgs e)
{
    lstResults.Items.Clear();
    // Load the on-disk index into RAM for searching.
    searcher = new IndexSearcher(new RAMDirectory(_indexTarget));

    // Arabic text field gets diacritic-aware analysis; everything else standard.
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer());
    analyzer.AddAnalyzer("ayat_arabic", new DiacriticAnalyzer(FilterData.stopWords));

    //MyQueryParser parser = new MyQueryParser(new string[] { "ayat_desc", "ayat_urdu", "ayat_arabic" }, analyzer);
    //parser.SetDefaultOperator(QueryParser.Operator.AND);
    //Query q = parser.Parse(txtSearch.Text);
    //Query q = new TermQuery(new Term("ayatno", NumberTools.LongToString(long.Parse(txtSearch.Text))));

    // NOTE(review): the query below is hard-coded (sid=1 AND ayatno=1) — the
    // txtSearch text is only used by the commented-out parser experiments above.
    BooleanQuery q = new BooleanQuery();
    long l1 = 1;   // only used by the commented-out range query below
    long l2 = 500; // only used by the commented-out range query below
    long l3 = 1;
    long l4 = 1;
    //RangeQuery rq = new RangeQuery(new Term("ayatno", l1.ToString("00000")), new Term("ayatno", l2.ToString("00000")), true);
    //q.Add(rq, true, false);
    // Numbers are zero-padded to 5 digits so lexicographic term order matches numeric order.
    q.Add(new TermQuery(new Term("sid", l3.ToString("00000"))), true, false);
    q.Add(new TermQuery(new Term("ayatno", l4.ToString("00000"))), true, false);
    MessageBox.Show(q.ToString());

    // Sort results by para, then surat, then verse number.
    Sort sort = new Sort(new string[] { "pid", "sid", "ayatno" });
    hits = searcher.Search(q, sort);
    lblHits.Text = hits.Length() + " hit(s).";
    Application.DoEvents();

    for (int i = 0; i < hits.Length(); i++)
    {
        StringBuilder sb = new StringBuilder();
        sb.Append("Para: ").Append(hits.Doc(i).Get("pid"));
        sb.Append(", Surat: ").Append(hits.Doc(i).Get("sid"));
        sb.Append(", Verse: ").Append(hits.Doc(i).Get("ayatno"));
        lstResults.Items.Add(sb.ToString());
    }
}
public InstancePerFieldAnalyzerWrapper()
{
    // Synonym-aware analysis by default; "cota" is matched verbatim.
    var wrapper = new Lucene.Net.Analysis.PerFieldAnalyzerWrapper(
        new GISAServer.Search.Synonyms.SynonymAnalyzer(new GISAServer.Search.Synonyms.XmlSynonymEngine()));
    wrapper.AddAnalyzer("cota", new Lucene.Net.Analysis.KeywordAnalyzer());
    instancePerFieldAnalyzerWrapper = wrapper;
}
/// <summary>
/// Parses a raw query string into a Lucene query, applying the untokenized-term,
/// search-term and date-term pre-processing passes first.
/// </summary>
/// <param name="query">Raw query text.</param>
/// <param name="indexQuery">Supplies the default field and default operator.</param>
/// <param name="analyzer">Per-field analyzer handed to the parser.</param>
/// <exception cref="ParseException">Wraps parser failures, including the (possibly rewritten) query text.</exception>
public static Query BuildQuery(string query, IndexQuery indexQuery, PerFieldAnalyzerWrapper analyzer)
{
    // Keep the caller's original text for the error message, since the
    // pre-processing passes below rewrite `query` in place.
    var originalQuery = query;
    try
    {
        var queryParser = new RangeQueryParser(Version.LUCENE_29, indexQuery.DefaultField ?? string.Empty, analyzer)
        {
            DefaultOperator = indexQuery.DefaultOperator == QueryOperator.Or
                ? QueryParser.Operator.OR
                : QueryParser.Operator.AND,
            AllowLeadingWildcard = true
        };
        query = PreProcessUntokenizedTerms(query, queryParser);
        query = PreProcessSearchTerms(query);
        query = PreProcessDateTerms(query, queryParser);
        var generatedQuery = queryParser.Parse(query);
        // NOTE: the unused KeywordAnalyzer that was previously allocated and
        // closed here has been removed — it was never passed to anything.
        return HandleMethods(generatedQuery);
    }
    catch (ParseException pe)
    {
        if (originalQuery == query)
            throw new ParseException("Could not parse: '" + query + "'", pe);
        throw new ParseException("Could not parse modified query: '" + query + "' original was: '" + originalQuery + "'", pe);
    }
}
public InstancePerFieldAnalyzerWrapper()
{
    // Synonym-aware analysis by default; "cota" and "codigo" are matched verbatim.
    var wrapper = new Lucene.Net.Analysis.PerFieldAnalyzerWrapper(
        new Synonyms.SynonymAnalyzer(new Synonyms.XmlSynonymEngine()));
    foreach (var field in new[] { "cota", "codigo" })
    {
        wrapper.AddAnalyzer(field, new Lucene.Net.Analysis.KeywordAnalyzer());
    }
    instancePerFieldAnalyzerWrapper = wrapper;
}
// Downloads every message from the POP3 mailbox and indexes subject/from/to/body.
// Subject/from/to are indexed verbatim (keyword analysis); only the body is
// tokenized with stop-word filtering.
public void StartEmailIndexing()
{
    // Ensure the index directory exists before opening a writer on it.
    if (!Directory.Exists(GlobalData.EmailIndexPath))
        Directory.CreateDirectory(GlobalData.EmailIndexPath);

    IndexWriter index;
    PerFieldAnalyzerWrapper pfaw = new PerFieldAnalyzerWrapper(new KeywordAnalyzer());
    pfaw.AddAnalyzer("body", new StopAnalyzer());
    try
    {
        // Try to append to an existing index first...
        index = new IndexWriter(GlobalData.EmailIndexPath, pfaw, false);
    }
    catch
    {
        // ...and fall back to creating a fresh one if that fails.
        index = new IndexWriter(GlobalData.EmailIndexPath, pfaw, true);
    }

    const string PopServer = "pop.google.in";
    const int PopPort = 995;
    const string User = "******";
    const string Pass = "******";

    using (Pop3Client client = new Pop3Client(PopServer, PopPort, true, User, Pass))
    {
        client.Trace += new Action<string>(Console.WriteLine);
        //connects to Pop3 Server, Executes POP3 USER and PASS
        client.Authenticate();
        client.Stat();
        foreach (Pop3ListItem item in client.List())
        {
            Document doc = new Document();
            MailMessageEx message = client.RetrMailMessageEx(item);
            doc.Add(new Field("subject", message.Subject.ToLower(), Field.Store.YES, Field.Index.NO_NORMS));
            doc.Add(new Field("from", message.From.ToString().ToLower(), Field.Store.YES, Field.Index.NO_NORMS));
            doc.Add(new Field("to", message.To.ToString().ToLower(), Field.Store.YES, Field.Index.NO_NORMS));
            //doc.Add(new Field("date", message.DeliveryDate.ToLower(), Field.Store.YES, Field.Index.NO_NORMS));

            // Strip the HTML body down to plain text before indexing.
            string code = message.Body;
            code = Regex.Replace(code, @"<\s*head\s*>(.|\n|\r)*?<\s*/\s*head\s*>", " ", RegexOptions.Compiled); //replace <head> section with single whitespace
            code = Regex.Replace(code, @"<\s*script (.|\n|\r)*?<\s*/\s*script\s*>", " ", RegexOptions.Compiled); //replace remaining <script> tags from body with single whitespace
            code = Regex.Replace(code, @"<!--(.|\n|\r)*?-->", " ", RegexOptions.Compiled); //replace comments
            code = Regex.Replace(code, @"<(.|\n|\r)*?>", " ", RegexOptions.Compiled); //replace all tags with single whitespace
            code = Regex.Replace(code, @"&.*?;", " ", RegexOptions.Compiled); //replace HTML entities (&gt; etc.)
            code = Regex.Replace(code, @"\s+", " ", RegexOptions.Compiled); //replace multiple whitespace characters by single whitespace
            code = Regex.Replace(code, @"\ufffd", " ", RegexOptions.Compiled); // drop Unicode replacement characters
            doc.Add(new Field("body", code.ToLower(), Field.Store.YES, Field.Index.NO_NORMS));
            index.AddDocument(doc);
        }
        client.Noop();
        client.Rset();
        client.Quit();
        index.Optimize();
        index.Close();
    }
}
public void TestPerFieldAnalyzer()
{
    // "partnum" is routed to a keyword analyzer so part numbers survive untokenized.
    var wrapper = new PerFieldAnalyzerWrapper(new SimpleAnalyzer());
    wrapper.AddAnalyzer("partnum", new KeywordAnalyzer());

    var parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_29, "description", wrapper);
    var query = parser.Parse("partnum:Q36 AND SPACE");

    Assert.AreEqual("+partnum:Q36 +space", query.ToString("description"), "Q36 kept as-is");
    Assert.AreEqual(1, searcher.Search(query, searcher.MaxDoc()).ScoreDocs.Length, "docs found!!!");
}
}//contructor which is used to initialize the objects

// Creates the index writer over an FS directory, using a per-field analyzer
// wrapper whose default is the standard analyzer.
public void CreateIndex(string indexPath)
{
    luceneIndexDirectory = Lucene.Net.Store.FSDirectory.Open(indexPath);
    analyzerstandard = new Lucene.Net.Analysis.Standard.StandardAnalyzer(VERSION);
    analyzerkeyword = new Lucene.Net.Analysis.KeywordAnalyzer();

    var maxFieldLength = new IndexWriter.MaxFieldLength(IndexWriter.DEFAULT_MAX_FIELD_LENGTH);
    analysor = new PerFieldAnalyzerWrapper(analyzerstandard);
    writer = new Lucene.Net.Index.IndexWriter(luceneIndexDirectory, analysor, true, maxFieldLength);
    writer.SetSimilarity(customSimilarity); //for task 6
}
public virtual void TestPerFieldAnalyzer()
{
    // Part numbers must be kept as single, untokenized terms.
    PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new SimpleAnalyzer());
    wrapper.AddAnalyzer("partnum", new KeywordAnalyzer());

    var parser = new Lucene.Net.QueryParsers.QueryParser("description", wrapper);
    Query query = parser.Parse("partnum:Q36 AND SPACE");
    Hits hits = searcher.Search(query);

    Assert.AreEqual("+partnum:Q36 +space", query.ToString("description"), "Q36 kept as-is");
    Assert.AreEqual(1, hits.Length(), "doc found!");
}
public virtual void TestPerFieldAnalyzer()
{
    var wrapper = new PerFieldAnalyzerWrapper(new SimpleAnalyzer());
    // Keyword analysis keeps "Q36" intact instead of splitting on the digit.
    wrapper.AddAnalyzer("partnum", new KeywordAnalyzer());

    var parser = new QueryParser("description", wrapper);
    var query = parser.Parse("partnum:Q36 AND SPACE");
    var hits = searcher.Search(query, null, 1000).scoreDocs;

    Assert.AreEqual("+partnum:Q36 +space", query.ToString("description"), "Q36 kept as-is");
    Assert.AreEqual(1, hits.Length, "doc found!");
}
public virtual void TestPerFieldAnalyzer()
{
    // Route "partnum" through a keyword analyzer so the part number is one token.
    var wrapper = new PerFieldAnalyzerWrapper(new SimpleAnalyzer());
    wrapper.AddAnalyzer("partnum", new KeywordAnalyzer());

    var parser = new QueryParser(Version.LUCENE_CURRENT, "description", wrapper);
    var query = parser.Parse("partnum:Q36 AND SPACE");
    var hits = searcher.Search(query, null, 1000, null).ScoreDocs;

    Assert.AreEqual("+partnum:Q36 +space", query.ToString("description"), "Q36 kept as-is");
    Assert.AreEqual(1, hits.Length, "doc found!");
}
public virtual void TestPerField()
{
    var text = "Qwerty";
    var wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer());
    wrapper.AddAnalyzer("special", new SimpleAnalyzer());

    // Default field: whitespace tokenization keeps the original casing.
    var stream = wrapper.TokenStream("field", new System.IO.StringReader(text));
    var token = stream.Next();
    Assert.AreEqual("Qwerty", token.TermText(), "WhitespaceAnalyzer does not lowercase");

    // "special" field: SimpleAnalyzer lowercases its tokens.
    stream = wrapper.TokenStream("special", new System.IO.StringReader(text));
    token = stream.Next();
    Assert.AreEqual("qwerty", token.TermText(), "SimpleAnalyzer lowercases");
}
// Opens (or creates) an index at the given path. The caller's analyzer is the
// default; the "content" field uses the stop-word analyzer instead.
public EDSIndexer(string desIndexPath, Analyzer analyser, bool overwriteIndexDir)
{
    keywordAnalyzer = analyser;
    pfaw = new PerFieldAnalyzerWrapper(analyser);
    pfaw.AddAnalyzer("content", stopAnalyzer); //generally for content v use stop analyser
    try
    {
        index = new IndexWriter(desIndexPath, pfaw, overwriteIndexDir);
    }
    catch
    {
        // Fallback: if opening in the requested mode fails (e.g. no existing
        // index to append to), retry by creating a fresh index.
        // NOTE(review): the bare catch also hides lock/permission errors — verify intended.
        index = new IndexWriter(desIndexPath, pfaw, true);
    }
}
// Walkthrough of common Lucene.Net building blocks: analyzers, a per-field
// wrapper, an NRT reader/searcher, a boosted multi-field query, and writer ops.
public void Code()
{
    // One instance of each common analyzer kind (only some are used below).
    Analyzer _keywordanalyzer = new KeywordAnalyzer();
    Analyzer _simpleanalyzer = new Lucene.Net.Analysis.SimpleAnalyzer();
    Analyzer _stopanalyzer = new Lucene.Net.Analysis.StopAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
    Analyzer _whitespaceanalyzer = new Lucene.Net.Analysis.WhitespaceAnalyzer();
    Analyzer _standardanalyzer = new Lucene.Net.Analysis.Standard.StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);

    // Names are matched verbatim; all other fields use the standard analyzer.
    var _perfieldanalyzer = new Lucene.Net.Analysis.PerFieldAnalyzerWrapper(_standardanalyzer);
    _perfieldanalyzer.AddAnalyzer("firstname", _keywordanalyzer);
    _perfieldanalyzer.AddAnalyzer("lastname", _keywordanalyzer);

    IndexWriter _writer = new IndexWriter(_directory, _perfieldanalyzer, IndexWriter.MaxFieldLength.UNLIMITED);
    // Near-real-time reader/searcher obtained straight from the writer.
    IndexReader _reader = _writer.GetReader();
    IndexSearcher _searcher = new IndexSearcher(_reader);

    //QueryParser parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, "title", _standardanalyzer);
    // Multi-field query with per-field boosts ("text" weighted above "title").
    string[] fields = new[] { "text", "title", "author" };
    var boosts = new Dictionary<string, float>();
    boosts.Add("text", 2.0f);
    boosts.Add("title", 1.5f);
    QueryParser parser = new MultiFieldQueryParser(Lucene.Net.Util.Version.LUCENE_30, fields, _standardanalyzer, boosts);
    Query query = parser.Parse("lucene is great");

    TopDocs hits = _searcher.Search(query, 1000);
    IEnumerable<Document> docs = hits.ScoreDocs.Select(hit => _searcher.Doc(hit.Doc));
    // Materialize stored fields into Book objects.
    // NOTE(review): Int32.Parse throws if a doc lacks a "length" field — confirm all docs store it.
    var books = docs.Select(doc => new Book()
    {
        Text = doc.Get("text"),
        Title = doc.Get("title"),
        Author = doc.Get("author"),
        Length = Int32.Parse(doc.Get("length"))
    });

    _writer.Optimize();
    _writer.Commit();
    _writer.DeleteAll();
}
/// <summary>
/// Parses a raw query string, first rewriting "[[...]]" untokenized terms and
/// registering keyword analysis for their fields on <paramref name="analyzer"/>.
/// </summary>
/// <param name="query">Raw query text.</param>
/// <param name="analyzer">Per-field analyzer handed to the parser.</param>
public static Query BuildQuery(string query, PerFieldAnalyzerWrapper analyzer)
{
    var keywordAnalyzer = new KeywordAnalyzer();
    try
    {
        query = PreProcessUntokenizedTerms(analyzer, query, keywordAnalyzer);
        var queryParser = new RangeQueryParser(Version.LUCENE_29, "", analyzer);
        queryParser.SetAllowLeadingWildcard(true);
        // Fixed: stray double semicolon removed from the return statement.
        return queryParser.Parse(query);
    }
    finally
    {
        keywordAnalyzer.Close();
    }
}
// Parses a raw query string. The untokenized-term pass may allocate a
// KeywordAnalyzer (returned via ref) which is closed when parsing completes.
public static Query BuildQuery(string query, PerFieldAnalyzerWrapper analyzer)
{
    Analyzer keywordAnalyzer = null;
    try
    {
        query = PreProcessUntokenizedTerms(analyzer, query, ref keywordAnalyzer);

        var parser = new RangeQueryParser(Version.LUCENE_29, string.Empty, analyzer);
        // Not the recommended approach; should rather use ReverseFilter.
        parser.SetAllowLeadingWildcard(true);
        return parser.Parse(query);
    }
    finally
    {
        if (keywordAnalyzer != null)
        {
            keywordAnalyzer.Close();
        }
    }
}
public virtual void TestPerField()
{
    System.String input = "Qwerty";
    PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer());
    wrapper.AddAnalyzer("special", new SimpleAnalyzer());

    // An unmapped field name ("Field") falls back to the whitespace analyzer.
    TokenStream stream = wrapper.TokenStream("Field", new System.IO.StringReader(input));
    Token tok = stream.Next();
    Assert.AreEqual("Qwerty", tok.TermText(), "WhitespaceAnalyzer does not lowercase");

    // The mapped "special" field uses SimpleAnalyzer, which lowercases.
    stream = wrapper.TokenStream("special", new System.IO.StringReader(input));
    tok = stream.Next();
    Assert.AreEqual("qwerty", tok.TermText(), "SimpleAnalyzer lowercases");
}
public LuceneApplication()
{
    // Lucene handles start out null; they are created later (e.g. in CreateIndex).
    luceneIndexDirectory = null;
    analyzerstandard = null;
    analyzerkeyword = null;
    writer = null;
    analysor = null;
    searcher = null;
    parser = null;

    customSimilarity = new CustomSimilarity(); //for task 6

    // Counters and collections for evaluation bookkeeping.
    tokenCount = new Dictionary<string, int>();
    numofdoc = 0;
    numofrelevant = 0;
    option = new List<string>();
    infneed = new Dictionary<string, string>();
} //constructor which is used to initialize the objects
public virtual void TestPerField()
{
    var text = "Qwerty";
    var wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer());
    wrapper.AddAnalyzer("special", new SimpleAnalyzer());

    // Default (whitespace) analysis keeps the original casing.
    var stream = wrapper.TokenStream("field", new System.IO.StringReader(text));
    var termAttribute = stream.GetAttribute<ITermAttribute>();
    Assert.IsTrue(stream.IncrementToken());
    Assert.AreEqual("Qwerty", termAttribute.Term, "WhitespaceAnalyzer does not lowercase");

    // The "special" field is routed to SimpleAnalyzer, which lowercases.
    stream = wrapper.TokenStream("special", new System.IO.StringReader(text));
    termAttribute = stream.GetAttribute<ITermAttribute>();
    Assert.IsTrue(stream.IncrementToken());
    Assert.AreEqual("qwerty", termAttribute.Term, "SimpleAnalyzer lowercases");
}
// Parses a raw query string against the given default field, applying the
// untokenized-term, search-term and date-term rewrite passes first.
public static Query BuildQuery(string query, string defaultField, PerFieldAnalyzerWrapper analyzer)
{
    Analyzer keywordAnalyzer = new KeywordAnalyzer();
    try
    {
        var parser = new RangeQueryParser(Version.LUCENE_29, defaultField ?? string.Empty, analyzer);

        query = PreProcessUntokenizedTerms(query, parser);
        query = PreProcessSearchTerms(query);
        query = PreProcessDateTerms(query, parser);

        // Not the recommended approach; should rather use ReverseFilter.
        parser.SetAllowLeadingWildcard(true);
        return parser.Parse(query);
    }
    finally
    {
        keywordAnalyzer.Close();
    }
}
public void CompareHtmlTokenization()
{
    const string str = @"test1 <a href=""foo"">testlink</a> test2 test3";

    // Same HTML input indexed twice: standard analysis vs. morphological analysis.
    var wrapper = new PerFieldAnalyzerWrapper(new HtmlStandardAnalyzer());
    wrapper.AddAnalyzer("Morph", new HtmlMorphAnalyzer(HspellDict));

    Directory indexDirectory = new RAMDirectory();
    var writer = new IndexWriter(indexDirectory, wrapper, true, IndexWriter.MaxFieldLength.UNLIMITED);

    var doc = new Document();
    doc.Add(new Field("Simple", str, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
    doc.Add(new Field("Morph", str, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
    writer.AddDocument(doc);
    writer.Close();

    CompareTermData(indexDirectory, str);
}
// Sets up a writer over the directory plus two background workers: one that
// keeps near-real-time readers fresh, and one that commits/optimizes on a schedule.
public SearcherContext(Directory dir, Analyzer defaultAnalyzer, TimeSpan targetMinStale, TimeSpan targetMaxStale, TimeSpan commitInterval, TimeSpan optimizeInterval)
{
    // Per-field wrapper so field-specific analyzers can be registered later.
    Analyzer = new PerFieldAnalyzerWrapper(defaultAnalyzer);
    _writer = new IndexWriter(dir, Analyzer, IndexWriter.MaxFieldLength.UNLIMITED);
    Manager = new NrtManager(_writer);
    _reopener = new NrtManagerReopener(Manager, targetMaxStale, targetMinStale);
    _committer = new Committer(_writer, commitInterval, optimizeInterval);
    // Track the worker threads so they can be joined/stopped elsewhere.
    _threads.AddRange(new[] { new Thread(_reopener.Start), new Thread(_committer.Start) });
    foreach (var t in _threads)
    {
        t.Start();
    }
}
public virtual void TestPerField()
{
    System.String input = "Qwerty";
    PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer());
    wrapper.AddAnalyzer("special", new SimpleAnalyzer());

    // Unmapped field -> whitespace analyzer, which preserves casing.
    TokenStream ts = wrapper.TokenStream("field", new System.IO.StringReader(input));
    ITermAttribute attr = ts.GetAttribute<ITermAttribute>();
    Assert.IsTrue(ts.IncrementToken());
    Assert.AreEqual("Qwerty", attr.Term, "WhitespaceAnalyzer does not lowercase");

    // Mapped field -> SimpleAnalyzer, which lowercases.
    ts = wrapper.TokenStream("special", new System.IO.StringReader(input));
    attr = ts.GetAttribute<ITermAttribute>();
    Assert.IsTrue(ts.IncrementToken());
    Assert.AreEqual("qwerty", attr.Term, "SimpleAnalyzer lowercases");
}
// Upserts a batch of catalog items into the Lucene index: any existing doc for
// the same (Id, Version) pair is deleted before the new doc is added, then a
// single commit records the batch's timestamp and a trace id as commit user data.
protected override Task<bool> OnProcessBatch(CollectorHttpClient client, IEnumerable<JToken> items, JToken context, DateTime commitTimeStamp, CancellationToken cancellationToken)
{
    // "Id" must be searchable as a single identifier token; other fields use the standard analyzer.
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30));
    analyzer.AddAnalyzer("Id", new IdentifierKeywordAnalyzer());

    int i = 0;
    using (IndexWriter writer = new IndexWriter(_directory, analyzer, false, IndexWriter.MaxFieldLength.UNLIMITED))
    {
        foreach (JObject item in items)
        {
            i++;
            string id = item["nuget:id"].ToString();
            string version = item["nuget:version"].ToString();

            // Delete any existing document for this (Id, Version) pair.
            // Id terms are indexed lowercase (analyzed), hence ToLowerInvariant here.
            BooleanQuery query = new BooleanQuery();
            query.Add(new BooleanClause(new TermQuery(new Term("Id", id.ToLowerInvariant())), Occur.MUST));
            query.Add(new BooleanClause(new TermQuery(new Term("Version", version)), Occur.MUST));
            writer.DeleteDocuments(query);

            Document doc = new Document();
            doc.Add(new Field("Id", item["nuget:id"].ToString(), Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("Version", item["nuget:version"].ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
            writer.AddDocument(doc);
        }

        // Commit with user data so the commit point records when and why it happened.
        string trace = Guid.NewGuid().ToString();
        writer.Commit(new Dictionary<string, string> { { "commitTimeStamp", commitTimeStamp.ToString("O") }, { "trace", trace } });
        Trace.TraceInformation("COMMIT {0} documents, index contains {1} documents, commitTimeStamp {2}, trace: {3}", i, writer.NumDocs(), commitTimeStamp.ToString("O"), trace);
    }
    return Task.FromResult(true);
}
// Runs a multi-field search over FileName/Author/Content and reads back the
// stored fields of each hit. Reader and searcher are disposed in all cases.
public void Search(string keyword)
{
    IndexReader reader = null;
    IndexSearcher searcher = null;
    try
    {
        reader = IndexReader.Open(FSDirectory.Open(new DirectoryInfo(indexDirectory)), true);
        searcher = new IndexSearcher(reader);

        // Build the query.
        // NOTE(review): all three per-field registrations use the same analyzer
        // as the wrapper's default, so the wrapper currently adds nothing over
        // passing `analyzer` directly — confirm whether distinct analyzers were intended.
        PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(analyzer);
        wrapper.AddAnalyzer("FileName", analyzer);
        wrapper.AddAnalyzer("Author", analyzer);
        wrapper.AddAnalyzer("Content", analyzer);
        string[] fields = { "FileName", "Author", "Content" };
        QueryParser parser = new MultiFieldQueryParser(Lucene.Net.Util.Version.LUCENE_30, fields, wrapper);
        Query query = parser.Parse(keyword);

        TopScoreDocCollector collector = TopScoreDocCollector.Create(NumberHits, true);
        searcher.Search(query, collector);
        var hits = collector.TopDocs().ScoreDocs;
        int numTotalHits = collector.TotalHits;

        // The collected hits can be processed from here on.
        for (int i = 0; i < hits.Count(); i++)
        {
            var hit = hits[i];
            Document doc = searcher.Doc(hit.Doc);
            // Stored fields are fetched but not used further yet.
            Field fileNameField = doc.GetField("FileName");
            Field authorField = doc.GetField("Author");
            Field pathField = doc.GetField("Path");
        }
    }
    finally
    {
        if (searcher != null)
            searcher.Dispose();
        if (reader != null)
            reader.Dispose();
    }
}
/// <summary>
/// Lazily builds and returns the shared per-field analyzer for card search.
/// Fields are grouped by analyzer kind to avoid the previous wall of
/// repetitive AddAnalyzer calls; Chinese field names are registered too
/// because advanced search queries use them directly.
/// </summary>
public static Lucene.Net.Analysis.Analyzer GetAnalyzer()
{
    //return new StandardAnalyzer(new string[] {"的", "之" });
    // NOTE(review): this lazy init is not synchronized — confirm single-threaded use.
    if (analyzerWrapper == null)
    {
        var wrapper = new Lucene.Net.Analysis.PerFieldAnalyzerWrapper(new StandardAnalyzer(MyLucene.GetLuceneVersion()));

        // Name-like fields (English and Chinese): tokenized with the name stop-word list.
        foreach (var field in new[] { "name", "japName", "oldName", "shortName", "中文名", "日文名", "旧卡名", "曾用名", "简称", "俗称", "缩写" })
            wrapper.AddAnalyzer(field, new MyAnalyzer(stopWords));

        // Effect/adjust text: tokenized with the effect stop-word list.
        foreach (var field in new[] { "effect", "adjust", "效果", "效果说明", "调整" })
            wrapper.AddAnalyzer(field, new MyAnalyzer(stopWords2));

        // Exact-match fields.
        foreach (var field in new[] { "tribe", "cheatcode", "cardCamp", "种族" })
            wrapper.AddAnalyzer(field, new Lucene.Net.Analysis.KeywordAnalyzer());

        // Punctuation-separated list fields.
        foreach (var field in new[] { "aliasList", "package", "卡包" })
            wrapper.AddAnalyzer(field, new PunctuationAnalyzer());

        wrapper.AddAnalyzer("enName", new LetterDigitAnalyzer());

        // Pinyin / effect-type fields: letter-run tokenization.
        foreach (var field in new[] { "pyname", "pyshortName", "pyoldName", "effectType" })
            wrapper.AddAnalyzer(field, new SimpleAnalyzer());

        // Publish only the fully built wrapper.
        analyzerWrapper = wrapper;
    }
    return analyzerWrapper;
}
// Rewrites method-call queries ("@"-prefixed fields) and, inside boolean
// queries, merges method-query clauses that target the same field into one.
private static Query HandleMethods(Query query, PerFieldAnalyzerWrapper analyzer)
{
    // "@"-prefixed term fields encode method calls; rewrite them directly.
    var termQuery = query as TermQuery;
    if (termQuery != null && termQuery.Term.Field.StartsWith("@"))
    {
        return HandleMethodsForQueryAndTerm(query, termQuery.Term, analyzer);
    }
    var wildcardQuery = query as WildcardQuery;
    if (wildcardQuery != null)
    {
        return HandleMethodsForQueryAndTerm(query, wildcardQuery.Term, analyzer);
    }
    var booleanQuery = query as BooleanQuery;
    if (booleanQuery != null)
    {
        // Recurse into every clause first.
        foreach (var c in booleanQuery.Clauses)
        {
            c.Query = HandleMethods(c.Query, analyzer);
        }
        if (booleanQuery.Clauses.Count == 0)
            return booleanQuery;
        // Group method-query clauses by target field; each group collapses to one clause.
        var mergeGroups = booleanQuery.Clauses.Select(x => x.Query).OfType<IRavenLuceneMethodQuery>().GroupBy(x => x.Field).ToArray();
        if (mergeGroups.Length == 0)
            return booleanQuery;
        foreach (var mergeGroup in mergeGroups)
        {
            var clauses = mergeGroup.ToArray();
            var first = clauses[0];
            // Remove all but the first clause of the group from the boolean query...
            foreach (var mergedClause in clauses.Skip(1))
            {
                booleanQuery.Clauses.RemoveAll(x => ReferenceEquals(x.Query, mergedClause));
            }
            // ...then fold the removed clauses' conditions into the surviving clause.
            var ravenLuceneMethodQuery = clauses.Skip(1).Aggregate(first, (methodQuery, clause) => methodQuery.Merge(clause));
            booleanQuery.Clauses.First(x => ReferenceEquals(x.Query, first)).Query = (Query)ravenLuceneMethodQuery;
        }
        // A single surviving clause can stand on its own without the boolean wrapper.
        if (booleanQuery.Clauses.Count == 1)
            return booleanQuery.Clauses[0].Query;
        return booleanQuery;
    }
    return query;
}
/// <summary>
/// Detects untokenized fields and sets as NotAnalyzed in analyzer.
/// Each "[[...]]"-wrapped match gets keyword analysis for its field, is quoted
/// if not already quoted, and has its "[[" / "]]" markers stripped.
/// The KeywordAnalyzer is allocated on demand and handed back via ref so the
/// caller can close it after parsing.
/// </summary>
private static string PreProcessUntokenizedTerms(PerFieldAnalyzerWrapper analyzer, string query, ref Analyzer keywordAnalyzer)
{
    var untokenizedMatches = untokenizedQuery.Matches(query);
    if (untokenizedMatches.Count < 1)
        return query;

    var sb = new StringBuilder(query);

    // Initialize a KeywordAnalyzer
    // KeywordAnalyzer will not tokenize the values
    keywordAnalyzer = new KeywordAnalyzer();

    // process in reverse order to leverage match string indexes
    for (var i = untokenizedMatches.Count; i > 0; i--)
    {
        var match = untokenizedMatches[i - 1];
        // specify that term for this field should not be tokenized
        analyzer.AddAnalyzer(match.Groups[1].Value, keywordAnalyzer);
        var term = match.Groups[2];

        // introduce " " around the term
        var startIndex = term.Index;
        var length = term.Length - 2;
        // Closing quote: inserted just before the trailing "]]" unless already present.
        if (sb[startIndex + length - 1] != '"')
        {
            sb.Insert(startIndex + length, '"');
            length += 1;
        }
        // Opening quote: inserted just after the leading "[[" unless already present.
        if (sb[startIndex + 2] != '"')
        {
            sb.Insert(startIndex + 2, '"');
            length += 1;
        }

        // remove enclosing "[[" "]]" from term value (again in reverse order)
        sb.Remove(startIndex + length, 2);
        sb.Remove(startIndex, 2);
    }
    return sb.ToString();
}
/// <summary>
/// Lazily builds and returns the shared per-field analyzer for card search.
/// Fields are grouped by analyzer kind to avoid repetitive AddAnalyzer calls.
/// Chinese field names need no registration here: they are translated to
/// English field names before searching.
/// </summary>
public static Lucene.Net.Analysis.Analyzer GetAnalyzer()
{
    //return new StandardAnalyzer(new string[] {"的", "之" });
    // NOTE(review): this lazy init is not synchronized — confirm single-threaded use.
    if (analyzerWrapper == null)
    {
        var wrapper = new Lucene.Net.Analysis.PerFieldAnalyzerWrapper(new StandardAnalyzer());

        // Name-like fields: tokenized with the name stop-word list.
        foreach (var field in new[] { "name", "japName", "oldName", "shortName" })
            wrapper.AddAnalyzer(field, new MyAnalyzer(stopWords));

        // Effect/adjust text: tokenized with the effect stop-word list.
        foreach (var field in new[] { "effect", "adjust" })
            wrapper.AddAnalyzer(field, new MyAnalyzer(stopWords2));

        // Exact-match fields.
        foreach (var field in new[] { "tribe", "cheatcode", "cardCamp" })
            wrapper.AddAnalyzer(field, new Lucene.Net.Analysis.KeywordAnalyzer());

        wrapper.AddAnalyzer("enName", new LetterDigitAnalyzer());

        // Pinyin / effect-type fields: letter-run tokenization.
        foreach (var field in new[] { "pyname", "pyshortName", "pyoldName", "effectType" })
            wrapper.AddAnalyzer(field, new SimpleAnalyzer());

        // Publish only the fully built wrapper.
        analyzerWrapper = wrapper;
    }
    return analyzerWrapper;
}
public static Analyzer GetAnalyzer()
{
    // Default: snowball stemming with word splitting for free text.
    var wrapper = new PerFieldAnalyzerWrapper(new SnowballAndWordSplittingAnalyzer("English"));

    // Metadata fields must match exactly, so each is indexed as one keyword token.
    var keywordFields = new SandoField[]
    {
        SandoField.ClassId, SandoField.Source, SandoField.AccessLevel,
        SandoField.ProgramElementType, SandoField.DefinitionLineNumber,
        SandoField.FileExtension, SandoField.FullFilePath, SandoField.Id,
        SandoField.IsConstructor, SandoField.Modifiers, SandoField.DefinitionColumnNumber
    };
    foreach (var field in keywordFields)
    {
        wrapper.AddAnalyzer(field.ToString(), new KeywordAnalyzer());
    }
    return wrapper;
}
// Builds the master analyzer: keyword (verbatim) analysis by default, the
// configured analyzer type for each field the search engine declares, and
// standard full-text analysis for the AllText field.
internal static Analyzer GetAnalyzer()
{
    // Field                     Analyzer
    // -----------------------------------------------------------------
    // Name                      Lucene.Net.Analysis.KeywordAnalyzer
    // Path                      Lucene.Net.Analysis.KeywordAnalyzer
    // Keywords                  Lucene.Net.Analysis.StopAnalyzer
    // _Text                     Lucene.Net.Analysis.Standard.StandardAnalyzer
    // -----------------------------------------------------------------
    // Default                   Lucene.Net.Analysis.WhitespaceAnalyzer
    var masterAnalyzer = new PerFieldAnalyzerWrapper(new KeywordAnalyzer());
    foreach (var pair in SenseNet.ContentRepository.Storage.StorageContext.Search.SearchEngine.GetAnalyzers())
    {
        masterAnalyzer.AddAnalyzer(pair.Key, (Analyzer)Activator.CreateInstance(pair.Value));
    }
    masterAnalyzer.AddAnalyzer(LucObject.FieldName.AllText, new StandardAnalyzer());
    return masterAnalyzer;
}
// Convenience overload: builds a query using default IndexQuery settings.
public static Query BuildQuery(string query, PerFieldAnalyzerWrapper analyzer)
{
    var defaultIndexQuery = new IndexQuery();
    return BuildQuery(query, defaultIndexQuery, analyzer);
}
/// <summary>
/// This method will construct a three folder structure inside <paramref name="targetDirectory"/> containing: Html, Index, and Source
/// </summary>
/// <param name="sourceDirectory">Directory containing ldoc files</param>
/// <param name="targetDirectory">Output directory</param>
/// <exception cref="InvalidOperationException">
/// Thrown when <paramref name="targetDirectory"/> exists and already contains entries.
/// </exception>
public void Build(string sourceDirectory, string targetDirectory)
{
    // This method assumes it owns the output tree: refuse a non-empty target.
    if (Directory.Exists(targetDirectory) && Directory.EnumerateFileSystemEntries(targetDirectory).Any())
        throw new InvalidOperationException("Target path is not empty.");

    this.OnStateChanged(State.Preparing);

    // Lay out the three output sub-folders: Html, Index, Source.
    string htmlRoot = Path.Combine(targetDirectory, "Html");
    string indexRoot = Path.Combine(targetDirectory, "Index");
    string sourceRoot = Path.Combine(targetDirectory, "Source");

    DirectoryInfo htmlDir = Directory.CreateDirectory(htmlRoot);
    DirectoryInfo indexDir = Directory.CreateDirectory(indexRoot);
    DirectoryInfo sourceDir = Directory.CreateDirectory(sourceRoot);

    // Only top-level *.ldoc files are picked up; subdirectories are ignored.
    var sourceFiles = Directory.EnumerateFiles(sourceDirectory, "*.ldoc", SearchOption.TopDirectoryOnly);

    // copy all source files to output directory and add to bundle
    Bundle bundle = new Bundle(this.IgnoreVersionComponent);
    foreach (var sourceFile in sourceFiles)
    {
        string targetFile = Path.Combine(sourceDir.FullName, Path.GetFileName(sourceFile));
        File.Copy(sourceFile, targetFile);
        bundle.Add(XDocument.Load(targetFile));
    }

    // merge ldoc files
    this.OnStateChanged(State.Merging);
    AssetRedirectCollection assetRedirects;
    var mergedDoc = bundle.Merge(out assetRedirects);

    // generate output (HTML goes directly into the Html sub-folder)
    var templateData = new TemplateData
                           {
                               AssetRedirects = assetRedirects,
                               Document = mergedDoc,
                               IgnoredVersionComponent = this.IgnoreVersionComponent,
                               TargetDirectory = htmlDir.FullName
                           };

    this.OnStateChanged(State.Templating);
    TemplateOutput templateOutput = this.Template.Generate(templateData);

    this.OnStateChanged(State.Indexing);

    // one stop-word per line
    StringReader stopWordsReader = new StringReader(@"missing");

    // index output
    using (var directory = FSDirectory.Open(indexDir))
    using (stopWordsReader)
    {
        // StandardAnalyzer (with the stop-word list above) is the default for
        // all fields; "title" is overridden with the dedicated TitleAnalyzer.
        Analyzer analyzer = new StandardAnalyzer(global::Lucene.Net.Util.Version.LUCENE_29, stopWordsReader);
        Analyzer titleAnalyzer = new TitleAnalyzer();
        IDictionary fieldAnalyzers = new Dictionary<string, Analyzer>
                                         {
                                             { "title", titleAnalyzer }
                                         };

        PerFieldAnalyzerWrapper analyzerWrapper = new PerFieldAnalyzerWrapper(analyzer, fieldAnalyzers);

        using (var writer = new IndexWriter(directory, analyzerWrapper, IndexWriter.MaxFieldLength.UNLIMITED))
        {
            // NOTE(review): the entire indexing body below is commented out, so
            // this loop currently adds no documents — the index is written empty.
            // Confirm whether this is an intentional (temporary) disablement.
            foreach (WorkUnitResult result in templateOutput.Results)
            {
                //string absPath = Path.Combine(htmlDir.FullName, result.SavedAs);
                //HtmlDocument htmlDoc = new HtmlDocument();
                //htmlDoc.Load(absPath);

                //string htmlTitle = string.Empty;
                //var titleNode = htmlDoc.DocumentNode.SelectSingleNode("/html/head/title");
                //if (titleNode != null)
                //    htmlTitle = HtmlEntity.DeEntitize(titleNode.InnerText);
                //        //.Replace('.', ' ')
                //        //.Replace('<', ' ')
                //        //.Replace('>', ' ')
                //        //.Replace('[', ' ')
                //        //.Replace(']', ' ')
                //        //.Replace('(', ' ')
                //        //.Replace(')', ' ');

                //HtmlNode contentNode = htmlDoc.GetElementbyId("content");
                //HtmlNode summaryNode = contentNode.SelectSingleNode(".//p[@class='summary']");
                //string summary = string.Empty;
                //if (summaryNode != null && summaryNode.SelectSingleNode("span[@class='error']") == null)
                //    summary = HtmlEntity.DeEntitize(summaryNode.InnerText);

                //string body = HtmlEntity.DeEntitize(contentNode.InnerText);

                //var doc = new Document();
                //doc.Add(new Field("uri", new Uri(result.SavedAs, UriKind.Relative).ToString(), Field.Store.YES, Field.Index.NO));
                //doc.Add(new Field("aid", result.Asset, Field.Store.YES, Field.Index.NOT_ANALYZED));
                //foreach (AssetIdentifier aid in result.Aliases)
                //    doc.Add(new Field("alias", aid, Field.Store.NO, Field.Index.NOT_ANALYZED));

                //foreach (var section in result.Sections)
                //{
                //    doc.Add(new Field("section", section.AssetIdentifier,
                //                      Field.Store.NO,
                //                      Field.Index.NOT_ANALYZED));
                //}

                //doc.Add(new Field("title", htmlTitle, Field.Store.YES, Field.Index.ANALYZED));
                //doc.Add(new Field("summary", summary, Field.Store.YES, Field.Index.ANALYZED));
                //doc.Add(new Field("content", body, Field.Store.YES, Field.Index.ANALYZED));

                //TraceSources.ContentBuilderSource.TraceVerbose("Indexing document: {0}", doc.ToString());
                //writer.AddDocument(doc);
            }

            writer.Optimize();
            writer.Commit();
            // NOTE(review): Close() duplicates the using-block's dispose —
            // presumably tolerated by IndexWriter, but worth confirming.
            writer.Close();
        }

        analyzerWrapper.Close();
        analyzer.Close();
        // NOTE(review): directory is also disposed by the enclosing using;
        // confirm FSDirectory accepts this double close.
        directory.Close();
    }

    this.OnStateChanged(State.Finalizing);

    // Emit an info.xml manifest listing everything that was generated,
    // stamped with the (UTC) creation time.
    var infoDoc = new XDocument(
        new XElement("content",
                     new XAttribute("created",
                                    XmlConvert.ToString(DateTime.UtcNow, XmlDateTimeSerializationMode.Utc)),
                     templateOutput.Results.Select(ConvertToXml)));

    infoDoc.Save(Path.Combine(targetDirectory, "info.xml"));

    this.OnStateChanged(State.Idle);
}
/// <summary>
/// Closes the per-field analyzer (when one was created), invokes every queued
/// clean-up callback in registration order, and then empties the queue.
/// </summary>
private static void DisposeAnalyzerAndFriends(List<Action> toDispose, PerFieldAnalyzerWrapper analyzer)
{
    if (analyzer != null)
    {
        analyzer.Close();
    }

    foreach (var cleanup in toDispose)
    {
        cleanup();
    }

    toDispose.Clear();
}