protected override Analyzer GetAnalyzer(Net.Util.Version version)
{
    var a = new PerFieldAnalyzerWrapper(base.GetAnalyzer(version));
    a.AddAnalyzer("Version", new KeywordAnalyzer());
    a.AddAnalyzer("Flag", new KeywordAnalyzer());
    return a;
}
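// Illustrative sketch (not from the snippet above): PerFieldAnalyzerWrapper routes
// each registered field to its own analyzer and falls back to the wrapped default
// for every other field. Field names here are hypothetical.
var wrapper = new PerFieldAnalyzerWrapper(new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30));
wrapper.AddAnalyzer("Version", new KeywordAnalyzer()); // e.g. "1.2.0-beta" stays a single token
// any field other than "Version" is analyzed by the StandardAnalyzer default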
public InstancePerFieldAnalyzerWrapper()
{
    var analyzer = new Lucene.Net.Analysis.PerFieldAnalyzerWrapper(new Synonyms.SynonymAnalyzer(new Synonyms.XmlSynonymEngine()));
    analyzer.AddAnalyzer("cota", new Lucene.Net.Analysis.KeywordAnalyzer());
    analyzer.AddAnalyzer("codigo", new Lucene.Net.Analysis.KeywordAnalyzer());
    instancePerFieldAnalyzerWrapper = analyzer;
}
protected override Analyzer GetAnalyzer(Net.Util.Version version)
{
    var analyzer = new PerFieldAnalyzerWrapper(base.GetAnalyzer(version));
    analyzer.AddAnalyzer("Path", new CaseInsensitiveKeywordAnalyzer());
    analyzer.AddAnalyzer("Key", new KeywordAnalyzer());
    return analyzer;
}
protected override Analyzer GetAnalyzer(Net.Util.Version version)
{
    analyzer = new PerFieldAnalyzerWrapper(base.GetAnalyzer(version));
    analyzer.AddAnalyzer<SampleDocument>(t => t.Id, new KeywordAnalyzer());
    analyzer.AddAnalyzer<SampleDocument>(t => t.Key, new CaseInsensitiveKeywordAnalyzer());
    return analyzer;
}
public void Code()
{
    Analyzer _keywordanalyzer = new KeywordAnalyzer();
    Analyzer _simpleanalyzer = new Lucene.Net.Analysis.SimpleAnalyzer();
    Analyzer _stopanalyzer = new Lucene.Net.Analysis.StopAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
    Analyzer _whitespaceanalyzer = new Lucene.Net.Analysis.WhitespaceAnalyzer();
    Analyzer _standardanalyzer = new Lucene.Net.Analysis.Standard.StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);

    var _perfieldanalyzer = new Lucene.Net.Analysis.PerFieldAnalyzerWrapper(_standardanalyzer);
    _perfieldanalyzer.AddAnalyzer("firstname", _keywordanalyzer);
    _perfieldanalyzer.AddAnalyzer("lastname", _keywordanalyzer);

    IndexWriter _writer = new IndexWriter(_directory, _perfieldanalyzer, IndexWriter.MaxFieldLength.UNLIMITED);
    IndexReader _reader = _writer.GetReader();
    IndexSearcher _searcher = new IndexSearcher(_reader);

    //QueryParser parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, "title", _standardanalyzer);
    string[] fields = new[] { "text", "title", "author" };
    var boosts = new Dictionary<string, float>();
    boosts.Add("text", 2.0f);
    boosts.Add("title", 1.5f);
    QueryParser parser = new MultiFieldQueryParser(Lucene.Net.Util.Version.LUCENE_30, fields, _standardanalyzer, boosts);
    Query query = parser.Parse("lucene is great");

    TopDocs hits = _searcher.Search(query, 1000);
    IEnumerable<Document> docs = hits.ScoreDocs.Select(hit => _searcher.Doc(hit.Doc));
    var books = docs.Select(doc => new Book()
    {
        Text = doc.Get("text"),
        Title = doc.Get("title"),
        Author = doc.Get("author"),
        Length = Int32.Parse(doc.Get("length"))
    });

    _writer.Optimize();
    _writer.Commit();
    _writer.DeleteAll();
}
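// Hypothetical supporting declarations assumed by the walkthrough above (the
// original source does not show them): an in-memory index directory and the
// Book POCO the search results are projected into.
private readonly Lucene.Net.Store.Directory _directory = new RAMDirectory();

public class Book
{
    public string Text { get; set; }
    public string Title { get; set; }
    public string Author { get; set; }
    public int Length { get; set; }
}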
void SetAnalyzerType(Type defaultType, IEnumerable<FieldDetails> fields)
{
    if (defaultType == null)
    {
        defaultType = typeof(StandardAnalyzer);
    }

    // create default analyzer
    _defaultAnalyzer = Activator.CreateInstance(defaultType) as Analyzer;
    if (_defaultAnalyzer == null)
    {
        throw new ArgumentException("defaultType is not an Analyzer type");
    }

    var wrapper = new PerFieldAnalyzerWrapper(_defaultAnalyzer);
    if (fields != null)
    {
        foreach (var fd in fields)
        {
            if (fd.Field.Analyzer != null)
            {
                var fieldAnalyzer = CreateAnalyzerFromType(fd.Field.Analyzer);
                if (fieldAnalyzer != null)
                {
                    wrapper.AddAnalyzer(fd.Name, fieldAnalyzer);
                }
            }
        }
    }
    Analyzer = wrapper;
}
/// <summary>
/// Detects untokenized fields and sets them as NotAnalyzed in the analyzer
/// </summary>
private static string PreProcessUntokenizedTerms(PerFieldAnalyzerWrapper analyzer, string query, Analyzer keywordAnalyzer)
{
    var untokenizedMatches = untokenizedQuery.Matches(query);
    if (untokenizedMatches.Count < 1)
    {
        return query;
    }

    var sb = new StringBuilder(query);

    // KeywordAnalyzer will not tokenize the values
    // process in reverse order to leverage match string indexes
    for (int i = untokenizedMatches.Count; i > 0; i--)
    {
        Match match = untokenizedMatches[i - 1];

        // specify that the term for this field should not be tokenized
        analyzer.AddAnalyzer(match.Groups[1].Value, keywordAnalyzer);

        Group term = match.Groups[2];

        // remove enclosing "[[" "]]" from the term value (again in reverse order)
        sb.Remove(term.Index + term.Length - 2, 2);
        sb.Remove(term.Index, 2);
    }

    return sb.ToString();
}
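// The method above relies on an untokenizedQuery regex defined elsewhere in its
// source. A plausible sketch, assuming the "[[...]]" escape syntax implied by the
// string manipulation (the exact pattern in the original code may differ; requires
// System.Text.RegularExpressions):
private static readonly Regex untokenizedQuery =
    new Regex(@"([\w\d_]+?):\s*(\[\[.+?\]\])", RegexOptions.Compiled);
// Groups[1] = the field name, Groups[2] = the [[...]]-wrapped term.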
private void btnSearch_Click(object sender, EventArgs e)
{
    lstResults.Items.Clear();

    searcher = new IndexSearcher(new RAMDirectory(_indexTarget));
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer());
    analyzer.AddAnalyzer("ayat_arabic", new DiacriticAnalyzer(FilterData.stopWords));

    //MyQueryParser parser = new MyQueryParser(new string[] { "ayat_desc", "ayat_urdu", "ayat_arabic" }, analyzer);
    //parser.SetDefaultOperator(QueryParser.Operator.AND);
    //Query q = parser.Parse(txtSearch.Text);
    //Query q = new TermQuery(new Term("ayatno", NumberTools.LongToString(long.Parse(txtSearch.Text))));

    BooleanQuery q = new BooleanQuery();
    long l1 = 1;
    long l2 = 500;
    long l3 = 1;
    long l4 = 1;
    //RangeQuery rq = new RangeQuery(new Term("ayatno", l1.ToString("00000")), new Term("ayatno", l2.ToString("00000")), true);
    //q.Add(rq, true, false);
    q.Add(new TermQuery(new Term("sid", l3.ToString("00000"))), true, false);
    q.Add(new TermQuery(new Term("ayatno", l4.ToString("00000"))), true, false);
    MessageBox.Show(q.ToString());

    Sort sort = new Sort(new string[] { "pid", "sid", "ayatno" });
    hits = searcher.Search(q, sort);
    lblHits.Text = hits.Length() + " hit(s).";
    Application.DoEvents();

    for (int i = 0; i < hits.Length(); i++)
    {
        StringBuilder sb = new StringBuilder();
        sb.Append("Para: ").Append(hits.Doc(i).Get("pid"));
        sb.Append(", Surat: ").Append(hits.Doc(i).Get("sid"));
        sb.Append(", Verse: ").Append(hits.Doc(i).Get("ayatno"));
        lstResults.Items.Add(sb.ToString());
    }
}
public void StartEmailIndexing()
{
    if (!Directory.Exists(GlobalData.EmailIndexPath))
        Directory.CreateDirectory(GlobalData.EmailIndexPath);

    IndexWriter index;
    PerFieldAnalyzerWrapper pfaw = new PerFieldAnalyzerWrapper(new KeywordAnalyzer());
    pfaw.AddAnalyzer("body", new StopAnalyzer());
    try
    {
        index = new IndexWriter(GlobalData.EmailIndexPath, pfaw, false);
    }
    catch
    {
        index = new IndexWriter(GlobalData.EmailIndexPath, pfaw, true);
    }

    const string PopServer = "pop.google.in";
    const int PopPort = 995;
    const string User = "******";
    const string Pass = "******";

    using (Pop3Client client = new Pop3Client(PopServer, PopPort, true, User, Pass))
    {
        client.Trace += new Action<string>(Console.WriteLine);

        // connects to the POP3 server, executes POP3 USER and PASS
        client.Authenticate();
        client.Stat();

        foreach (Pop3ListItem item in client.List())
        {
            Document doc = new Document();
            MailMessageEx message = client.RetrMailMessageEx(item);
            doc.Add(new Field("subject", message.Subject.ToLower(), Field.Store.YES, Field.Index.NO_NORMS));
            doc.Add(new Field("from", message.From.ToString().ToLower(), Field.Store.YES, Field.Index.NO_NORMS));
            doc.Add(new Field("to", message.To.ToString().ToLower(), Field.Store.YES, Field.Index.NO_NORMS));
            //doc.Add(new Field("date", message.DeliveryDate.ToLower(), Field.Store.YES, Field.Index.NO_NORMS));

            string code = message.Body;
            code = Regex.Replace(code, @"<\s*head\s*>(.|\n|\r)*?<\s*/\s*head\s*>", " ", RegexOptions.Compiled);  // replace the <head> section with a single whitespace
            code = Regex.Replace(code, @"<\s*script (.|\n|\r)*?<\s*/\s*script\s*>", " ", RegexOptions.Compiled); // replace remaining <script> tags in the body with a single whitespace
            code = Regex.Replace(code, @"<!--(.|\n|\r)*?-->", " ", RegexOptions.Compiled);                       // replace comments
            code = Regex.Replace(code, @"<(.|\n|\r)*?>", " ", RegexOptions.Compiled);                            // replace all remaining tags with a single whitespace
            code = Regex.Replace(code, @"&.*?;", " ", RegexOptions.Compiled);                                    // replace HTML entities such as &gt;
            code = Regex.Replace(code, @"\s+", " ", RegexOptions.Compiled);                                      // collapse runs of whitespace into a single whitespace
            code = Regex.Replace(code, @"\ufffd", " ", RegexOptions.Compiled);

            doc.Add(new Field("body", code.ToLower(), Field.Store.YES, Field.Index.NO_NORMS));
            index.AddDocument(doc);
        }

        client.Noop();
        client.Rset();
        client.Quit();
        index.Optimize();
        index.Close();
    }
}
public void Search(string keyword)
{
    IndexReader reader = null;
    IndexSearcher searcher = null;
    try
    {
        reader = IndexReader.Open(FSDirectory.Open(new DirectoryInfo(indexDirectory)), true);
        searcher = new IndexSearcher(reader);

        // build the query
        PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(analyzer);
        wrapper.AddAnalyzer("FileName", analyzer);
        wrapper.AddAnalyzer("Author", analyzer);
        wrapper.AddAnalyzer("Content", analyzer);

        string[] fields = { "FileName", "Author", "Content" };
        QueryParser parser = new MultiFieldQueryParser(Lucene.Net.Util.Version.LUCENE_30, fields, wrapper);
        Query query = parser.Parse(keyword);

        TopScoreDocCollector collector = TopScoreDocCollector.Create(NumberHits, true);
        searcher.Search(query, collector);
        var hits = collector.TopDocs().ScoreDocs;
        int numTotalHits = collector.TotalHits;

        // the collected hits can now be processed
        for (int i = 0; i < hits.Count(); i++)
        {
            var hit = hits[i];
            Document doc = searcher.Doc(hit.Doc);
            Field fileNameField = doc.GetField("FileName");
            Field authorField = doc.GetField("Author");
            Field pathField = doc.GetField("Path");
        }
    }
    finally
    {
        if (searcher != null)
            searcher.Dispose();
        if (reader != null)
            reader.Dispose();
    }
}
public void TestPerFieldAnalyzer()
{
    var analyzer = new PerFieldAnalyzerWrapper(new SimpleAnalyzer());
    analyzer.AddAnalyzer("partnum", new KeywordAnalyzer());

    var query = new QueryParser(Lucene.Net.Util.Version.LUCENE_29, "description", analyzer)
        .Parse("partnum:Q36 AND SPACE");

    Assert.AreEqual("+partnum:Q36 +space", query.ToString("description"), "Q36 kept as-is");
    Assert.AreEqual(1, searcher.Search(query, searcher.MaxDoc()).ScoreDocs.Length, "docs found!!!");
}
public virtual void TestPerFieldAnalyzer()
{
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new SimpleAnalyzer());
    analyzer.AddAnalyzer("partnum", new KeywordAnalyzer());

    Lucene.Net.QueryParsers.QueryParser queryParser = new Lucene.Net.QueryParsers.QueryParser("description", analyzer);
    Query query = queryParser.Parse("partnum:Q36 AND SPACE");

    Hits hits = searcher.Search(query);
    Assert.AreEqual("+partnum:Q36 +space", query.ToString("description"), "Q36 kept as-is");
    Assert.AreEqual(1, hits.Length(), "doc found!");
}
public virtual void TestPerFieldAnalyzer()
{
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new SimpleAnalyzer());
    analyzer.AddAnalyzer("partnum", new KeywordAnalyzer());

    QueryParser queryParser = new QueryParser("description", analyzer);
    Query query = queryParser.Parse("partnum:Q36 AND SPACE");

    ScoreDoc[] hits = searcher.Search(query, null, 1000).scoreDocs;
    Assert.AreEqual("+partnum:Q36 +space", query.ToString("description"), "Q36 kept as-is");
    Assert.AreEqual(1, hits.Length, "doc found!");
}
public virtual void TestPerFieldAnalyzer()
{
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new SimpleAnalyzer());
    analyzer.AddAnalyzer("partnum", new KeywordAnalyzer());

    QueryParser queryParser = new QueryParser(Version.LUCENE_CURRENT, "description", analyzer);
    Query query = queryParser.Parse("partnum:Q36 AND SPACE");

    ScoreDoc[] hits = searcher.Search(query, null, 1000, null).ScoreDocs;
    Assert.AreEqual("+partnum:Q36 +space", query.ToString("description"), "Q36 kept as-is");
    Assert.AreEqual(1, hits.Length, "doc found!");
}
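// The four test variants above assume a searcher over an index that already holds a
// matching document. A hypothetical fixture sketch, assuming Lucene.Net 3.x APIs and
// NUnit (field values are borrowed from the assertions; the real setup is not shown
// in these sources):
private IndexSearcher searcher;

[SetUp]
public void SetUp()
{
    var dir = new RAMDirectory();
    var analyzer = new PerFieldAnalyzerWrapper(new SimpleAnalyzer());
    analyzer.AddAnalyzer("partnum", new KeywordAnalyzer());
    using (var writer = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED))
    {
        var doc = new Document();
        // KeywordAnalyzer keeps "Q36" as a single, case-preserved token
        doc.Add(new Field("partnum", "Q36", Field.Store.YES, Field.Index.ANALYZED));
        doc.Add(new Field("description", "Illidium Space Modulator", Field.Store.YES, Field.Index.ANALYZED));
        writer.AddDocument(doc);
    }
    searcher = new IndexSearcher(dir, true);
}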
public virtual void TestPerField()
{
    System.String text = "Qwerty";
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer());
    analyzer.AddAnalyzer("special", new SimpleAnalyzer());

    TokenStream tokenStream = analyzer.TokenStream("field", new System.IO.StringReader(text));
    Token token = tokenStream.Next();
    Assert.AreEqual("Qwerty", token.TermText(), "WhitespaceAnalyzer does not lowercase");

    tokenStream = analyzer.TokenStream("special", new System.IO.StringReader(text));
    token = tokenStream.Next();
    Assert.AreEqual("qwerty", token.TermText(), "SimpleAnalyzer lowercases");
}
public EDSIndexer(string desIndexPath, Analyzer analyser, bool overwriteIndexDir)
{
    keywordAnalyzer = analyser;
    pfaw = new PerFieldAnalyzerWrapper(analyser);
    pfaw.AddAnalyzer("content", stopAnalyzer); // we generally use a stop analyzer for content
    try
    {
        index = new IndexWriter(desIndexPath, pfaw, overwriteIndexDir);
    }
    catch
    {
        index = new IndexWriter(desIndexPath, pfaw, true);
    }
}
internal static Analyzer GetAnalyzer()
{
    //var masterAnalyzer = new PerFieldAnalyzerWrapper(new KeywordAnalyzer());
    ////TODO: Lucene_FullText2 is failed with new WhitespaceAnalyzer
    ////masterAnalyzer.AddAnalyzer(LucObject.FieldName.AllText, new WhitespaceAnalyzer());
    //masterAnalyzer.AddAnalyzer(LucObject.FieldName.AllText, new StandardAnalyzer());
    //return masterAnalyzer;

    // Field       Analyzer
    // -----------------------------------------------------------------
    // Name        Lucene.Net.Analysis.KeywordAnalyzer
    // Path        Lucene.Net.Analysis.KeywordAnalyzer
    // Keywords    Lucene.Net.Analysis.StopAnalyzer
    // _Text       Lucene.Net.Analysis.Standard.StandardAnalyzer
    // -----------------------------------------------------------------
    // Default     Lucene.Net.Analysis.WhitespaceAnalyzer

    var masterAnalyzer = new PerFieldAnalyzerWrapper(new KeywordAnalyzer());
    foreach (var item in SenseNet.ContentRepository.Storage.StorageContext.Search.SearchEngine.GetAnalyzers())
        masterAnalyzer.AddAnalyzer(item.Key, (Analyzer)Activator.CreateInstance(item.Value));
    masterAnalyzer.AddAnalyzer(LucObject.FieldName.AllText, new StandardAnalyzer());
    //masterAnalyzer.AddAnalyzer(LucObject.FieldName.AllText, new StandardAnalyzer(SenseNet.Search.Indexing.LuceneManager.LuceneVersion));
    return masterAnalyzer;
}
public virtual void TestPerField()
{
    System.String text = "Qwerty";
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer());
    analyzer.AddAnalyzer("special", new SimpleAnalyzer());

    TokenStream tokenStream = analyzer.TokenStream("Field", new System.IO.StringReader(text));
    Token token = tokenStream.Next();
    Assert.AreEqual("Qwerty", token.TermText(), "WhitespaceAnalyzer does not lowercase");

    tokenStream = analyzer.TokenStream("special", new System.IO.StringReader(text));
    token = tokenStream.Next();
    Assert.AreEqual("qwerty", token.TermText(), "SimpleAnalyzer lowercases");
}
public void CompareHtmlTokenization()
{
    const string str = @"test1 <a href=""foo"">testlink</a> test2 test3";

    PerFieldAnalyzerWrapper pfaw = new PerFieldAnalyzerWrapper(new HtmlStandardAnalyzer());
    pfaw.AddAnalyzer("Morph", new HtmlMorphAnalyzer(HspellDict));

    Directory indexDirectory = new RAMDirectory();
    IndexWriter writer = new IndexWriter(indexDirectory, pfaw, true, IndexWriter.MaxFieldLength.UNLIMITED);

    Document doc = new Document();
    doc.Add(new Field("Simple", str, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
    doc.Add(new Field("Morph", str, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
    writer.AddDocument(doc);
    writer.Close();

    CompareTermData(indexDirectory, str);
}
public virtual void TestPerField()
{
    System.String text = "Qwerty";
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer());
    analyzer.AddAnalyzer("special", new SimpleAnalyzer());

    TokenStream tokenStream = analyzer.TokenStream("field", new System.IO.StringReader(text));
    ITermAttribute termAtt = tokenStream.GetAttribute<ITermAttribute>();
    Assert.IsTrue(tokenStream.IncrementToken());
    Assert.AreEqual("Qwerty", termAtt.Term, "WhitespaceAnalyzer does not lowercase");

    tokenStream = analyzer.TokenStream("special", new System.IO.StringReader(text));
    termAtt = tokenStream.GetAttribute<ITermAttribute>();
    Assert.IsTrue(tokenStream.IncrementToken());
    Assert.AreEqual("qwerty", termAtt.Term, "SimpleAnalyzer lowercases");
}
protected override Task<bool> OnProcessBatch(CollectorHttpClient client, IEnumerable<JToken> items, JToken context, DateTime commitTimeStamp, CancellationToken cancellationToken)
{
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30));
    analyzer.AddAnalyzer("Id", new IdentifierKeywordAnalyzer());

    int i = 0;
    using (IndexWriter writer = new IndexWriter(_directory, analyzer, false, IndexWriter.MaxFieldLength.UNLIMITED))
    {
        foreach (JObject item in items)
        {
            i++;

            string id = item["nuget:id"].ToString();
            string version = item["nuget:version"].ToString();

            BooleanQuery query = new BooleanQuery();
            query.Add(new BooleanClause(new TermQuery(new Term("Id", id.ToLowerInvariant())), Occur.MUST));
            query.Add(new BooleanClause(new TermQuery(new Term("Version", version)), Occur.MUST));

            writer.DeleteDocuments(query);

            Document doc = new Document();
            doc.Add(new Field("Id", item["nuget:id"].ToString(), Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("Version", item["nuget:version"].ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));

            writer.AddDocument(doc);
        }

        string trace = Guid.NewGuid().ToString();
        writer.Commit(new Dictionary<string, string>
        {
            { "commitTimeStamp", commitTimeStamp.ToString("O") },
            { "trace", trace }
        });

        Trace.TraceInformation("COMMIT {0} documents, index contains {1} documents, commitTimeStamp {2}, trace: {3}",
            i, writer.NumDocs(), commitTimeStamp.ToString("O"), trace);
    }

    return Task.FromResult(true);
}
/// <summary>
/// Detects untokenized fields and sets them as NotAnalyzed in the analyzer
/// </summary>
private static string PreProcessUntokenizedTerms(PerFieldAnalyzerWrapper analyzer, string query, ref Analyzer keywordAnalyzer)
{
    var untokenizedMatches = untokenizedQuery.Matches(query);
    if (untokenizedMatches.Count < 1)
        return query;

    var sb = new StringBuilder(query);

    // Initialize a KeywordAnalyzer
    // KeywordAnalyzer will not tokenize the values
    keywordAnalyzer = new KeywordAnalyzer();

    // process in reverse order to leverage match string indexes
    for (var i = untokenizedMatches.Count; i > 0; i--)
    {
        var match = untokenizedMatches[i - 1];

        // specify that the term for this field should not be tokenized
        analyzer.AddAnalyzer(match.Groups[1].Value, keywordAnalyzer);

        var term = match.Groups[2];

        // wrap the term value in double quotes if it is not quoted already
        var startIndex = term.Index;
        var length = term.Length - 2;
        if (sb[startIndex + length - 1] != '"')
        {
            sb.Insert(startIndex + length, '"');
            length += 1;
        }
        if (sb[startIndex + 2] != '"')
        {
            sb.Insert(startIndex + 2, '"');
            length += 1;
        }

        // remove enclosing "[[" "]]" from the term value (again in reverse order)
        sb.Remove(startIndex + length, 2);
        sb.Remove(startIndex, 2);
    }

    return sb.ToString();
}
public static Analyzer GetAnalyzer()
{
    var snowball = new SnowballAndWordSplittingAnalyzer("English");
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(snowball);
    SandoField[] fields = new SandoField[]
    {
        SandoField.ClassId, SandoField.Source, SandoField.AccessLevel,
        SandoField.ProgramElementType, SandoField.DefinitionLineNumber,
        SandoField.FileExtension, SandoField.FullFilePath, SandoField.Id,
        SandoField.IsConstructor, SandoField.Modifiers, SandoField.DefinitionColumnNumber
    };
    foreach (var field in fields)
        analyzer.AddAnalyzer(field.ToString(), new KeywordAnalyzer());
    return analyzer;
}
// create an index; you need to indicate where the source files are
public void IndexText(string filepath, bool titleboost, bool authorboost, string titlevalue, string authorvalue)
{
    StreamReader file = new StreamReader(filepath);
    string content = file.ReadToEnd();
    string[] delimiter = { ".I", ".T", ".A", ".B", ".W" };
    string[] words = content.Split(delimiter, StringSplitOptions.RemoveEmptyEntries);
    string length = "";
    int countfordoc = 0;
    countfordoc++;

    // because there are five parts in the source file, they need to be separated
    if (words.Length > 5)
    {
        length = words.Length.ToString();
    }

    string[] wordprocessed = new string[words.Length];
    int i = 0;

    // get rid of some symbols because the string contains unwanted ones;
    // delete the title from the abstract because it would be seen as an error
    if (words[4].Contains(words[1]))
    {
        words[4] = words[4].Replace(words[1], string.Empty);
    }

    // get rid of the character responsible for starting a new line
    foreach (string w in words)
    {
        wordprocessed[i] = w.Replace("\n", string.Empty);
        i++;
    }

    // define 5 fields for the index
    Lucene.Net.Documents.Field docid = new Field(DocID, wordprocessed[0], Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
    Lucene.Net.Documents.Field title = new Field(TITLE, wordprocessed[1], Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
    Lucene.Net.Documents.Field author = new Field(AUTHOR, wordprocessed[2], Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
    Lucene.Net.Documents.Field bibliinformation = new Field(BIBLiINFO, wordprocessed[3], Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
    Lucene.Net.Documents.Field abstracts = new Field(ABSTRACT, wordprocessed[4], Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);

    // for task 7: optional title and author boosts
    if (titleboost == true)
    {
        title.Boost = int.Parse(titlevalue);
    }
    else
    {
        title.Boost = 1;
    }
    if (authorboost == true)
    {
        author.Boost = int.Parse(authorvalue);
    }
    else
    {
        author.Boost = 1;
    }

    analysor.AddAnalyzer(DocID, analyzerkeyword);     // index the ID with the keyword analyzer
    analysor.AddAnalyzer(AUTHOR, analyzerkeyword);    // index the author with the keyword analyzer; in my opinion it can't be split
    analysor.AddAnalyzer(BIBLiINFO, analyzerkeyword); // index the bibliography with the keyword analyzer

    Lucene.Net.Documents.Document doc = new Document();
    doc.Add(docid);
    doc.Add(title);
    doc.Add(abstracts);
    doc.Add(author);
    doc.Add(bibliinformation);

    // the writer is bound to the analyzer; here the analyzer mixes two analyzer types
    writer.AddDocument(doc);
    file.Close();
}
private PerFieldAnalyzerWrapper CreateAnalyzer(ICollection<Action> toDispose)
{
    var standardAnalyzer = new StandardAnalyzer(Version.LUCENE_29);
    toDispose.Add(standardAnalyzer.Close);
    var perFieldAnalyzerWrapper = new PerFieldAnalyzerWrapper(standardAnalyzer);
    foreach (var analyzer in indexDefinition.Analyzers)
    {
        var analyzerInstance = indexDefinition.CreateAnalyzerInstance(analyzer.Key, analyzer.Value);
        if (analyzerInstance == null)
            continue;
        toDispose.Add(analyzerInstance.Close);
        perFieldAnalyzerWrapper.AddAnalyzer(analyzer.Key, analyzerInstance);
    }
    KeywordAnalyzer keywordAnalyzer = null;
    foreach (var fieldIndexing in indexDefinition.Indexes)
    {
        switch (fieldIndexing.Value)
        {
            case FieldIndexing.NotAnalyzedNoNorms:
            case FieldIndexing.NotAnalyzed:
                if (keywordAnalyzer == null)
                {
                    keywordAnalyzer = new KeywordAnalyzer();
                    toDispose.Add(keywordAnalyzer.Close);
                }
                perFieldAnalyzerWrapper.AddAnalyzer(fieldIndexing.Key, keywordAnalyzer);
                break;
        }
    }
    return perFieldAnalyzerWrapper;
}
public static Lucene.Net.Analysis.Analyzer GetAnalyzer()
{
    //return new StandardAnalyzer(new string[] { "的", "之" });
    if (analyzerWrapper == null)
    {
        analyzerWrapper = new Lucene.Net.Analysis.PerFieldAnalyzerWrapper(new StandardAnalyzer(MyLucene.GetLuceneVersion()));
        analyzerWrapper.AddAnalyzer("name", new MyAnalyzer(stopWords));
        analyzerWrapper.AddAnalyzer("japName", new MyAnalyzer(stopWords));
        analyzerWrapper.AddAnalyzer("oldName", new MyAnalyzer(stopWords));
        analyzerWrapper.AddAnalyzer("shortName", new MyAnalyzer(stopWords));
        analyzerWrapper.AddAnalyzer("effect", new MyAnalyzer(stopWords2));
        analyzerWrapper.AddAnalyzer("adjust", new MyAnalyzer(stopWords2));
        analyzerWrapper.AddAnalyzer("tribe", new Lucene.Net.Analysis.KeywordAnalyzer());
        analyzerWrapper.AddAnalyzer("cheatcode", new KeywordAnalyzer());
        analyzerWrapper.AddAnalyzer("aliasList", new PunctuationAnalyzer());
        analyzerWrapper.AddAnalyzer("cardCamp", new Lucene.Net.Analysis.KeywordAnalyzer());
        analyzerWrapper.AddAnalyzer("enName", new LetterDigitAnalyzer());
        analyzerWrapper.AddAnalyzer("pyname", new SimpleAnalyzer());
        analyzerWrapper.AddAnalyzer("pyshortName", new SimpleAnalyzer());
        analyzerWrapper.AddAnalyzer("pyoldName", new SimpleAnalyzer());
        analyzerWrapper.AddAnalyzer("effectType", new SimpleAnalyzer());
        analyzerWrapper.AddAnalyzer("package", new PunctuationAnalyzer());

        // because of the advanced-search feature, the Chinese field names also need to be analyzed
        analyzerWrapper.AddAnalyzer("中文名", new MyAnalyzer(stopWords));
        analyzerWrapper.AddAnalyzer("日文名", new MyAnalyzer(stopWords));
        analyzerWrapper.AddAnalyzer("旧卡名", new MyAnalyzer(stopWords));
        analyzerWrapper.AddAnalyzer("曾用名", new MyAnalyzer(stopWords));
        analyzerWrapper.AddAnalyzer("简称", new MyAnalyzer(stopWords));
        analyzerWrapper.AddAnalyzer("俗称", new MyAnalyzer(stopWords));
        analyzerWrapper.AddAnalyzer("缩写", new MyAnalyzer(stopWords));
        analyzerWrapper.AddAnalyzer("效果", new MyAnalyzer(stopWords2));
        analyzerWrapper.AddAnalyzer("效果说明", new MyAnalyzer(stopWords2));
        analyzerWrapper.AddAnalyzer("调整", new MyAnalyzer(stopWords2));
        analyzerWrapper.AddAnalyzer("种族", new Lucene.Net.Analysis.KeywordAnalyzer());
        analyzerWrapper.AddAnalyzer("卡包", new PunctuationAnalyzer());
    }
    return analyzerWrapper;
}
public static Lucene.Net.Analysis.Analyzer GetAnalyzer()
{
    //return new StandardAnalyzer(new string[] { "的", "之" });
    if (analyzerWrapper == null)
    {
        analyzerWrapper = new Lucene.Net.Analysis.PerFieldAnalyzerWrapper(new StandardAnalyzer());
        analyzerWrapper.AddAnalyzer("name", new MyAnalyzer(stopWords));
        analyzerWrapper.AddAnalyzer("japName", new MyAnalyzer(stopWords));
        analyzerWrapper.AddAnalyzer("oldName", new MyAnalyzer(stopWords));
        analyzerWrapper.AddAnalyzer("shortName", new MyAnalyzer(stopWords));
        analyzerWrapper.AddAnalyzer("effect", new MyAnalyzer(stopWords2));
        analyzerWrapper.AddAnalyzer("adjust", new MyAnalyzer(stopWords2));
        analyzerWrapper.AddAnalyzer("tribe", new Lucene.Net.Analysis.KeywordAnalyzer());
        analyzerWrapper.AddAnalyzer("cheatcode", new Lucene.Net.Analysis.KeywordAnalyzer());
        analyzerWrapper.AddAnalyzer("cardCamp", new Lucene.Net.Analysis.KeywordAnalyzer());
        analyzerWrapper.AddAnalyzer("enName", new LetterDigitAnalyzer());
        analyzerWrapper.AddAnalyzer("pyname", new SimpleAnalyzer());
        analyzerWrapper.AddAnalyzer("pyshortName", new SimpleAnalyzer());
        analyzerWrapper.AddAnalyzer("pyoldName", new SimpleAnalyzer());

        // the Chinese field names are all converted to English field names before
        // searching, so there is no need to register analyzers for them
        /*
         * analyzerWrapper.AddAnalyzer("中文名", new MyAnalyzer(stopWords));
         * analyzerWrapper.AddAnalyzer("日文名", new MyAnalyzer(stopWords));
         * analyzerWrapper.AddAnalyzer("旧卡名", new MyAnalyzer(stopWords));
         * analyzerWrapper.AddAnalyzer("曾用名", new MyAnalyzer(stopWords));
         * analyzerWrapper.AddAnalyzer("简称", new MyAnalyzer(stopWords));
         * analyzerWrapper.AddAnalyzer("俗称", new MyAnalyzer(stopWords));
         * analyzerWrapper.AddAnalyzer("缩写", new MyAnalyzer(stopWords));
         * analyzerWrapper.AddAnalyzer("效果", new MyAnalyzer(stopWords2));
         * analyzerWrapper.AddAnalyzer("效果说明", new MyAnalyzer(stopWords2));
         * analyzerWrapper.AddAnalyzer("调整", new MyAnalyzer(stopWords2));
         * analyzerWrapper.AddAnalyzer("种族", new Lucene.Net.Analysis.KeywordAnalyzer());
         */
    }
    return analyzerWrapper;
}
public PerFieldAnalyzerWrapper CreateAnalyzer(Analyzer defaultAnalyzer, ICollection<Action> toDispose)
{
    toDispose.Add(defaultAnalyzer.Close);
    var perFieldAnalyzerWrapper = new PerFieldAnalyzerWrapper(defaultAnalyzer);
    foreach (var analyzer in indexDefinition.Analyzers)
    {
        Analyzer analyzerInstance = IndexingExtensions.CreateAnalyzerInstance(analyzer.Key, analyzer.Value);
        if (analyzerInstance == null)
            continue;
        toDispose.Add(analyzerInstance.Close);
        perFieldAnalyzerWrapper.AddAnalyzer(analyzer.Key, analyzerInstance);
    }
    StandardAnalyzer standardAnalyzer = null;
    KeywordAnalyzer keywordAnalyzer = null;
    foreach (var fieldIndexing in indexDefinition.Indexes)
    {
        switch (fieldIndexing.Value)
        {
            case FieldIndexing.NotAnalyzed:
                if (keywordAnalyzer == null)
                {
                    keywordAnalyzer = new KeywordAnalyzer();
                    toDispose.Add(keywordAnalyzer.Close);
                }
                perFieldAnalyzerWrapper.AddAnalyzer(fieldIndexing.Key, keywordAnalyzer);
                break;
            case FieldIndexing.Analyzed:
                if (indexDefinition.Analyzers.ContainsKey(fieldIndexing.Key))
                    continue;
                if (standardAnalyzer == null)
                {
                    standardAnalyzer = new StandardAnalyzer(Version.LUCENE_29);
                    toDispose.Add(standardAnalyzer.Close);
                }
                perFieldAnalyzerWrapper.AddAnalyzer(fieldIndexing.Key, standardAnalyzer);
                break;
        }
    }
    return perFieldAnalyzerWrapper;
}
protected void Initialize(ILuceneIndex index, bool close)
{
    Assert.ArgumentNotNull(index, "index");
    PerFieldAnalyzerWrapper aw = new PerFieldAnalyzerWrapper(index.Analyzer);
    aw.AddAnalyzer("_language", new KeywordAnalyzer());
    this._analyzer = aw;
    Assert.IsNotNull(this._analyzer, "Failed to request analyzer from the index");
}
public ScoreDoc[] Search(string keyword)
{
    int num = 10;
    IndexReader reader = null;
    var analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
    PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(analyzer);
    wrapper.AddAnalyzer("Name", analyzer);
    string[] fields = { "Name" };

    //try
    //{
    //    reader = IndexReader.Open(FSDirectory.Open(new DirectoryInfo(_indexDirectory)), true);
    //    searcher = new IndexSearcher(reader);
    QueryParser parser = new MultiFieldQueryParser(Lucene.Net.Util.Version.LUCENE_30, fields, wrapper);
    Query query = parser.Parse(keyword);
    TopScoreDocCollector collector = TopScoreDocCollector.Create(num, true);
    _indexSearcher.Search(query, collector);
    var hits = collector.TopDocs().ScoreDocs;
    return hits;
    //}
}
internal static void OptimizeAllIndexes()
{
    PerFieldAnalyzerWrapper pfaw = new PerFieldAnalyzerWrapper(new KeywordAnalyzer());
    pfaw.AddAnalyzer("content", new StopAnalyzer());
    foreach (string dir in Directory.GetDirectories(GlobalData.IndexRootPath))
    {
        IndexWriter writer = new IndexWriter(dir, pfaw, false);
        writer.Optimize();
        writer.Close();
    }
}
private void configureBexisIndexing(bool recreateIndex)
{
    configXML = new XmlDocument();
    configXML.Load(FileHelper.ConfigFilePath);
    LoadBeforeIndexing();

    Lucene.Net.Store.Directory pathIndex = FSDirectory.Open(new DirectoryInfo(luceneIndexPath));
    Lucene.Net.Store.Directory autoCompleteIndex = FSDirectory.Open(new DirectoryInfo(autoCompleteIndexPath));

    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new BexisAnalyzer());
    indexWriter = new IndexWriter(pathIndex, analyzer, recreateIndex, IndexWriter.MaxFieldLength.UNLIMITED);
    autoCompleteIndexWriter = new IndexWriter(autoCompleteIndex, new NGramAnalyzer(), true, IndexWriter.MaxFieldLength.UNLIMITED);

    foreach (XmlNode a in categoryXmlNodeList)
    {
        analyzer.AddAnalyzer("ng_" + a.Attributes.GetNamedItem("lucene_name").Value, new NGramAnalyzer());
    }
    analyzer.AddAnalyzer("ng_all", new NGramAnalyzer());
    isIndexConfigured = true;
}
public PerFieldAnalyzerWrapper CreateAnalyzer(Analyzer defaultAnalyzer, ICollection<Action> toDispose, bool forQuerying = false)
{
    toDispose.Add(defaultAnalyzer.Close);
    var perFieldAnalyzerWrapper = new PerFieldAnalyzerWrapper(defaultAnalyzer);
    foreach (var analyzer in indexDefinition.Analyzers)
    {
        Analyzer analyzerInstance = IndexingExtensions.CreateAnalyzerInstance(analyzer.Key, analyzer.Value);
        if (analyzerInstance == null)
            continue;
        toDispose.Add(analyzerInstance.Close);

        if (forQuerying)
        {
            var customAttributes = analyzerInstance.GetType().GetCustomAttributes(typeof(NotForQueryingAttribute), false);
            if (customAttributes.Length > 0)
                continue;
        }

        perFieldAnalyzerWrapper.AddAnalyzer(analyzer.Key, analyzerInstance);
    }
    StandardAnalyzer standardAnalyzer = null;
    KeywordAnalyzer keywordAnalyzer = null;
    foreach (var fieldIndexing in indexDefinition.Indexes)
    {
        switch (fieldIndexing.Value)
        {
            case FieldIndexing.NotAnalyzed:
                if (keywordAnalyzer == null)
                {
                    keywordAnalyzer = new KeywordAnalyzer();
                    toDispose.Add(keywordAnalyzer.Close);
                }
                perFieldAnalyzerWrapper.AddAnalyzer(fieldIndexing.Key, keywordAnalyzer);
                break;
            case FieldIndexing.Analyzed:
                if (indexDefinition.Analyzers.ContainsKey(fieldIndexing.Key))
                    continue;
                if (standardAnalyzer == null)
                {
                    standardAnalyzer = new StandardAnalyzer(Version.LUCENE_29);
                    toDispose.Add(standardAnalyzer.Close);
                }
                perFieldAnalyzerWrapper.AddAnalyzer(fieldIndexing.Key, standardAnalyzer);
                break;
        }
    }
    return perFieldAnalyzerWrapper;
}
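// The CreateAnalyzer variants above hand each analyzer's Close delegate back to the
// caller through toDispose. A hypothetical caller sketch showing the intended cleanup
// contract (names assumed, not taken from the original sources):
var toDispose = new List<Action>();
PerFieldAnalyzerWrapper analyzer = CreateAnalyzer(new StandardAnalyzer(Version.LUCENE_29), toDispose);
try
{
    // ... use the analyzer with an IndexWriter or a query parser ...
}
finally
{
    // close every analyzer that CreateAnalyzer registered
    foreach (var dispose in toDispose)
        dispose();
}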
/// <summary>
/// Loads the data into the Lucene index
/// </summary>
/// <param name="directory">
/// Directory where the index is located.
/// </param>
private void LoadLuceneIndex(SimpleFSDirectory directory)
{
    // Create an analyzer that uses UpperCaseLetterOrDigitAnalyzer for all fields, but UpperCaseKeywordAnalyzer
    // for ProductCode (because we want to regard product codes as one word).
    var analyzer = new PerFieldAnalyzerWrapper(new UpperCaseLetterOrDigitAnalyzer());
    analyzer.AddAnalyzer("ProductCode", new UpperCaseKeywordAnalyzer());

    // -----------
    // Store products into Lucene.
    // This will create a new index. Other requests will still be able to read the existing index.

    // Create a writer that will overwrite the existing index
    using (var writer = new IndexWriter(directory, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED))
    {
        IEnumerable<ProductSearchResult> results = _productRepository.GetAllProductSearchResults();

        foreach (var result in results)
        {
            var doc = new Document();
            doc.Add(new Field("ProductId", result.ProductId.ToString(CultureInfo.InvariantCulture), Field.Store.YES, Field.Index.NO));

            // ProductCode is analyzed, but UpperCaseKeywordAnalyzer keeps the whole value as a single token.
            var productCodeField = new Field("ProductCode", result.ProductCode, Field.Store.YES, Field.Index.ANALYZED);
            doc.Add(productCodeField);

            doc.Add(new Field("ProductDescription", result.ProductDescription, Field.Store.YES, Field.Index.ANALYZED));
            writer.AddDocument(doc);
        }
    }
}
public static IndexWriter Create(AbstractConnection connection, Process process, Entity entity)
{
    // the directory must outlive the returned writer, so it is not wrapped in a using block
    var dir = LuceneDirectoryFactory.Create(connection, entity);
    Analyzer defaultAnalyzer = new KeywordAnalyzer();
    if (process.SearchTypes.ContainsKey("default"))
    {
        defaultAnalyzer = LuceneAnalyzerFactory.Create(process.SearchTypes["default"].Analyzer, connection.Version);
    }
    var analyzer = new PerFieldAnalyzerWrapper(defaultAnalyzer);
    foreach (var field in GetFields(entity, connection.Version, connection.Logger))
    {
        analyzer.AddAnalyzer(field.Key, field.Value);
    }
    return new IndexWriter(dir, analyzer, IndexWriter.MaxFieldLength.UNLIMITED);
}
public static IndexWriter Create(AbstractConnection connection, Entity entity)
{
    var dir = LuceneDirectoryFactory.Create(connection, entity);
    Analyzer defaultAnalyzer = new KeywordAnalyzer();
    var analyzer = new PerFieldAnalyzerWrapper(defaultAnalyzer);
    foreach (var field in GetFields(entity, connection.Version, connection.Logger))
    {
        analyzer.AddAnalyzer(field.Key, field.Value);
    }
    return new IndexWriter(dir, analyzer, IndexWriter.MaxFieldLength.UNLIMITED);
}