/// <summary>
/// Returns an Analyzer for the given AnalyzerType
/// </summary>
/// <param name="oAnalyzerType">Enumeration value</param>
/// <returns>Analyzer</returns>
public static Analyzer GetAnalyzer(AnalyzerType oAnalyzerType)
{
    Analyzer oAnalyzer = null;
    switch (oAnalyzerType)
    {
        case AnalyzerType.SimpleAnalyzer:
            oAnalyzer = new SimpleAnalyzer();
            break;
        case AnalyzerType.StopAnalyzer:
            oAnalyzer = new StopAnalyzer();
            break;
        case AnalyzerType.WhitespaceAnalyzer:
            oAnalyzer = new WhitespaceAnalyzer();
            break;
        case AnalyzerType.StandardAnalyzer:
        default:
            oAnalyzer = new StandardAnalyzer();
            break;
    }
    return oAnalyzer;
}
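// A minimal usage sketch for GetAnalyzer, assuming the Lucene.Net 3.x attribute
// API; the field name and the sample text are illustrative.
Analyzer analyzer = GetAnalyzer(AnalyzerType.WhitespaceAnalyzer);
TokenStream ts = analyzer.TokenStream("content", new StringReader("Foo Bar-Baz 42"));
ITermAttribute termAtt = ts.GetAttribute<ITermAttribute>();
while (ts.IncrementToken())
{
    Console.WriteLine(termAtt.Term); // Foo, Bar-Baz, 42 -- split on whitespace only, no lowercasing
}
ts.Close();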
/// <summary>
/// Helper to create a new index writer
/// </summary>
/// <returns>An IndexWriter over the test directory</returns>
private static IndexWriter CreateIndexWriter()
{
    var luceneDirectory = FSDirectory.Open(System.IO.Directory.CreateDirectory(TestHelper.DirectoryPath));
    var analyzer = new WhitespaceAnalyzer();
    var indexWriter = new IndexWriter(luceneDirectory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED);
    return indexWriter;
}
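// A sketch of how CreateIndexWriter might be used, assuming the Lucene.Net 3.x
// field API; the field names and values are illustrative.
var writer = CreateIndexWriter();
try
{
    var doc = new Document();
    doc.Add(new Field("id", "1", Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.Add(new Field("body", "hello world", Field.Store.NO, Field.Index.ANALYZED));
    writer.AddDocument(doc);
    writer.Commit();
}
finally
{
    writer.Close(); // flushes pending changes and releases the write lock
}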
public IndexResult IndexFiles(IndexerConfig config, IndexerCallback callback = null)
{
    var totalWatch = new Stopwatch();
    var directory = FSDirectory.Open(config.IndexPath);
    var analyzer = new WhitespaceAnalyzer();
    var files = TextExtractor.GetFilesFromFolder(config.FolderPath, config.Extensions, config.IsRecusive);
    var result = new IndexResult { TotalFile = files.Count };
    totalWatch.Start();
    foreach (var file in files)
    {
        result.CurrentFile = file.FullName;
        callback?.Invoke(result);
        // Note: a new IndexWriter is opened (and disposed) for every file.
        using (var writer = new IndexWriter(directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED))
        {
            var stopwatch = new Stopwatch();
            var document = new Document();
            var txtInfo = TextExtractor.ReadText(file.FullName);
            var fields = GetFields(txtInfo, config.Excludes);
            foreach (var field in fields)
            {
                document.Add(field);
            }
            stopwatch.Start();
            writer.AddDocument(document, analyzer);
            writer.Optimize();
            stopwatch.Stop();
            result.Time = stopwatch.Elapsed;
            result.ReadedFiles = TextExtractor.ReadedFile;
            result.Size = txtInfo.FileSize;
            callback?.Invoke(result);
        }
    }
    totalWatch.Stop();
    result.Time = totalWatch.Elapsed;
    result.ReadedFiles = TextExtractor.ReadedFile;
    result.Size = TextExtractor.TotalSize;
    return result;
}
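// Opening a fresh IndexWriter and calling Optimize() for every file forces a
// segment merge and a write-lock handshake per document. A sketch of the
// hoisted alternative, reusing the directory, analyzer, files, and config
// variables from the method above:
using (var writer = new IndexWriter(directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED))
{
    foreach (var file in files)
    {
        var document = new Document();
        foreach (var field in GetFields(TextExtractor.ReadText(file.FullName), config.Excludes))
        {
            document.Add(field);
        }
        writer.AddDocument(document);
    }
    writer.Optimize(); // merge segments once, after all documents are added
}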
public void TestFuzzyLikeThisQueryEquals()
{
    Analyzer analyzer = new WhitespaceAnalyzer();
    FuzzyLikeThisQuery fltq1 = new FuzzyLikeThisQuery(10, analyzer);
    fltq1.AddTerms("javi", "subject", 0.5f, 2);
    FuzzyLikeThisQuery fltq2 = new FuzzyLikeThisQuery(10, analyzer);
    fltq2.AddTerms("javi", "subject", 0.5f, 2);
    Assert.AreEqual(fltq1, fltq2, "FuzzyLikeThisQuery with same attributes is not equal");
}
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: public void testNull() throws Exception
public virtual void testNull()
{
    Analyzer a = new WhitespaceAnalyzer(TEST_VERSION_CURRENT);
    assertAnalyzesTo(a, "foo bar FOO BAR", new string[] { "foo", "bar", "FOO", "BAR" });
    assertAnalyzesTo(a, "foo bar . FOO <> BAR", new string[] { "foo", "bar", ".", "FOO", "<>", "BAR" });
    assertAnalyzesTo(a, "foo.bar.FOO.BAR", new string[] { "foo.bar.FOO.BAR" });
    assertAnalyzesTo(a, "U.S.A.", new string[] { "U.S.A." });
    assertAnalyzesTo(a, "C++", new string[] { "C++" });
    assertAnalyzesTo(a, "B2B", new string[] { "B2B" });
    assertAnalyzesTo(a, "2B", new string[] { "2B" });
    assertAnalyzesTo(a, "\"QUOTED\" word", new string[] { "\"QUOTED\"", "word" });
}
private Query GetQuery(string field, string term)
{
    var analyzer = new WhitespaceAnalyzer(LuceneVersion.LUCENE_48);
    var parser = new QueryParser(LuceneVersion.LUCENE_48, field, analyzer);
    var query = parser.Parse(term);
    return query;

    /*
    var phrase = new MultiPhraseQuery();
    phrase.Add(new Term(field, term));
    return phrase;
    */
}
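// A quick sketch of what GetQuery produces, assuming Lucene.Net 4.8 and an
// illustrative "body" field: whitespace-separated terms become a BooleanQuery
// joined with the parser's default OR operator, and WhitespaceAnalyzer leaves
// the terms' casing untouched.
var q = GetQuery("body", "quick brown");
Console.WriteLine(q); // prints: body:quick body:brown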
public void LongPointTest()
{
    var analyzer = new WhitespaceAnalyzer();
    var iwc = new IndexWriterConfig(analyzer);
    iwc.SetOpenMode(IndexWriterConfigOpenMode.CREATE);

    // Build the index ---------------------------------------------
    DateTime baseDate = DateTime.Parse("2020/07/16 08:00:00");
    var ram = new RAMDirectory();
    var writer = new IndexWriter(ram, iwc);
    try
    {
        for (int i = 0; i < 10; i++)
        {
            var doc = new Document();
            doc.Add(new TextField("text", "hoge foo", FieldStore.YES));
            DateTime tmp = baseDate.AddDays(i);
            long l = long.Parse(tmp.ToString("yyyyMMddHHmmss"));
            doc.Add(new LongPoint("date", l));
            doc.Add(new StoredField("date", l));
            writer.AddDocument(doc);
        }
    }
    finally
    {
        writer.Close();
    }

    // Search ------------------------------------------------------
    TermQuery tq = new TermQuery(new Term("text", "foo"));
    Query rq = LongPoint.NewRangeQuery("date", 20200717000000, 20200719000000);
    BooleanQueryBuilder b = new BooleanQueryBuilder();
    b.Add(tq, BooleanClauseOccur.MUST);   // AND condition
    b.Add(rq, BooleanClauseOccur.FILTER); // AND condition (does not contribute to scoring)
    Query q = b.Build();
    DirectoryReader dr = DirectoryReader.Open(ram);
    IndexSearcher searcher = new IndexSearcher(dr);
    ScoreDoc[] hits = searcher.Search(q, 100).ScoreDocs;
    for (int i = 0; i < hits.Length; i++)
    {
        var doc = searcher.Doc(hits[i].Doc);
        Debug.WriteLine(DateTime.ParseExact(doc.Get("date"), "yyyyMMddHHmmss", null));
    }
    Assert.AreEqual(2, hits.Length); // only the 2020/07/17 and 2020/07/18 documents fall in the range
}
static void Main(string[] args)
{
    Lucene.Net.Util.Version version = Lucene.Net.Util.Version.LUCENE_29;
    var values = new List<string>() { "ab", "a b", "a-b", "a_b", "a/b", "a.b", };
    var util = new Util();

    Analyzer analyzer = new StandardAnalyzer(version);
    using (var tester = new AnalyzerTester(version, analyzer, values))
    {
        PrintTestName("StandardAnalyzer");
        //util.PrintTerms(tester.GetIndexReader(), AnalyzerTester.FieldName);
        foreach (var value in values)
        {
            SearchAndPrintResult(tester.Search, analyzer, value);
        }
        SearchAndPrintResult(tester.Search, analyzer, "a*");
        SearchAndPrintResult(tester.Search, analyzer, "a*b");
        SearchAndPrintResult(tester.Search, analyzer, "a?b");
    }

    analyzer = new WhitespaceAnalyzer();
    using (var tester = new AnalyzerTester(version, analyzer, values))
    {
        PrintTestName("WhitespaceAnalyzer");
        //util.PrintTerms(tester.GetIndexReader(), AnalyzerTester.FieldName);
        //var x = util.GetDocumentFieldValues(tester.GetIndexReader(), AnalyzerTester.FieldName);
        foreach (var value in values)
        {
            SearchAndPrintResult(tester.Search, analyzer, value);
        }
        SearchAndPrintResult(tester.Search, analyzer, "a*");
        SearchAndPrintResult(tester.Search, analyzer, "a*b");
        SearchAndPrintResult(tester.Search, analyzer, "a?b");
    }
}
public virtual void TestLUCENENET615()
{
    var english = new EnglishAnalyzer(Lucene.Net.Util.LuceneVersion.LUCENE_48);
    var whitespace = new WhitespaceAnalyzer(Lucene.Net.Util.LuceneVersion.LUCENE_48);
    var pf = new PerFieldAnalyzerWrapper(english, new JCG.Dictionary<string, Analyzer>() { { "foo", whitespace } });
    var test1 = english.GetTokenStream(null, "test"); // Does not throw
    var test2 = pf.GetTokenStream("", "test");        // works
    Assert.DoesNotThrow(() => pf.GetTokenStream(null, "test"), "GetTokenStream should not throw NullReferenceException with a null key");
}
/// <summary>
/// Writes the given documents into the test index, then optimizes and commits.
/// </summary>
/// <param name="documents">Documents to index</param>
internal static void IndexDocuments(IEnumerable<Document> documents)
{
    var luceneDirectory = FSDirectory.Open(System.IO.Directory.CreateDirectory(TestHelper.DirectoryPath));
    var analyzer = new WhitespaceAnalyzer();
    var indexWriter = new IndexWriter(luceneDirectory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED);
    foreach (var document in documents)
    {
        indexWriter.AddDocument(document);
    }
    indexWriter.Optimize();
    indexWriter.Commit();
    indexWriter.Close();
}
public static void testWhitespace(String testString)
{
    Analyzer analyzer = new WhitespaceAnalyzer();
    StringReader r = new StringReader(testString);
    TokenStream ts = analyzer.TokenStream("", r);
    Console.WriteLine("=====Whitespace analyzer====");
    ITermAttribute termAtt = ts.GetAttribute<ITermAttribute>();
    while (ts.IncrementToken()) // ts.Next() from older versions no longer exists
    {
        string iterm = termAtt.Term;
        Console.WriteLine("[" + iterm + "]");
    }
}
public static void Main(String[] args)
{
    Analyzer analyzer = new WhitespaceAnalyzer();
    QueryParser parser = new QueryParser("f", analyzer);
    Query query = parser.Parse("a x:b");
    FieldQuery fieldQuery = new FieldQuery(query, true, false);

    Directory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir, analyzer, IndexWriter.MaxFieldLength.LIMITED);
    Document doc = new Document();
    doc.Add(new Field("f", "a a a b b c a b b c d e f", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
    doc.Add(new Field("f", "b a b a f", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
    writer.AddDocument(doc);
    writer.Close();

    IndexReader reader = IndexReader.Open(dir, true);
    FieldTermStack ftl = new FieldTermStack(reader, 0, "f", fieldQuery);
    reader.Close();
}
static void BuildIndex(Directory directory)
{
    string[] paths = new[]
    {
        @"C:\Users\vj\folder1\lucene\",
        @"C:\Users\vj\folder1\lucene\folder1\folder2",
        @"C:\Users\vj\folder2\lucene2\folder1\lucene\"
    };
    Analyzer analyzer = new WhitespaceAnalyzer();
    using (var writer = new IndexWriter(directory, analyzer, true, IndexWriter.MaxFieldLength.LIMITED))
    {
        foreach (string path in paths)
        {
            Document doc = new Document();
            var field = new Field("Path", path, Field.Store.YES, Field.Index.NOT_ANALYZED);
            doc.Add(field);
            writer.AddDocument(doc);
        }
    }
}
private static void QueryIndex(Directory directory)
{
    string userQueryString = @"folder1\lucene\";
    Analyzer analyzer = new WhitespaceAnalyzer();
    var queryParser = new QueryParser(Lucene.Net.Util.Version.LUCENE_29, "Path", analyzer);
    queryParser.AllowLeadingWildcard = true;
    // Backslashes are QueryParser escape characters, so each one in the
    // user input must itself be escaped before parsing.
    string queryString = "*" + userQueryString.Replace(@"\", @"\\");
    var query = queryParser.Parse(queryString);

    IndexSearcher searcher = new IndexSearcher(directory);
    IndexReader reader = searcher.IndexReader;
    TopDocs topDocs = searcher.Search(query, 100);
    foreach (ScoreDoc doc in topDocs.ScoreDocs)
    {
        string path = reader.Document(doc.Doc).Get("Path");
        Console.WriteLine(path);
    }
    Console.ReadKey();
}
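// A minimal check of the escaping step above, assuming NUnit and the
// BuildIndex/QueryIndex pair from this file; the expected count follows the
// three hard-coded paths, everything else is illustrative.
[Test]
public void EscapedWildcardMatchesTrailingPathSegment()
{
    var directory = new RAMDirectory();
    BuildIndex(directory);
    var parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_29, "Path", new WhitespaceAnalyzer())
    {
        AllowLeadingWildcard = true
    };
    // Each backslash is doubled for the parser; the wildcard term searched is *folder1\lucene\
    var query = parser.Parse("*" + @"folder1\lucene\".Replace(@"\", @"\\"));
    var searcher = new IndexSearcher(directory);
    // Two of the three indexed paths end in folder1\lucene\
    Assert.AreEqual(2, searcher.Search(query, 100).ScoreDocs.Length);
    searcher.Close();
}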
internal static PerFieldAnalyzerWrapper BuildAnalyzer(Lucene.Net.Util.Version version, Dictionary<string, AyatColumnAttribute> fieldInfo)
{
    var analyzer = new StandardAnalyzer(version);
    var analyzer2 = new ArabicAnalyzer(version);
    var whitespaceAnalyzer = new WhitespaceAnalyzer();
    var perFieldAnalyzerWrapper = new PerFieldAnalyzerWrapper(whitespaceAnalyzer);
    foreach (KeyValuePair<string, AyatColumnAttribute> current in fieldInfo)
    {
        switch (current.Value.Language)
        {
            case Language.Arabic:
            case Language.ArabicNoAraab:
                perFieldAnalyzerWrapper.AddAnalyzer(current.Key, analyzer2);
                continue;
            case Language.English:
                perFieldAnalyzerWrapper.AddAnalyzer(current.Key, analyzer);
                continue;
        }
        perFieldAnalyzerWrapper.AddAnalyzer(current.Key, whitespaceAnalyzer);
    }
    return perFieldAnalyzerWrapper;
}
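// A sketch of plugging the wrapper into indexing, assuming Lucene.Net 3.x; the
// index path and the fieldInfo dictionary are illustrative. Fields registered
// above are analyzed per language; any other field falls back to the
// WhitespaceAnalyzer default.
var wrapper = BuildAnalyzer(Lucene.Net.Util.Version.LUCENE_30, fieldInfo);
using (var writer = new IndexWriter(FSDirectory.Open(new System.IO.DirectoryInfo("index")), wrapper, true, IndexWriter.MaxFieldLength.UNLIMITED))
{
    // writer.AddDocument(doc); // each field is routed to its registered analyzer
}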
static void Main(string[] args)
{
    Action<Analyzer, String> displayAction = DisplayTokens;
    var version = Lucene.Net.Util.Version.LUCENE_30;
    var text = "Høje Taastrup Århus René";
    Console.WriteLine("Original string: {0}", text);
    Console.WriteLine();

    Analyzer analyzer = new KeywordAnalyzer();
    displayAction(analyzer, text);

    analyzer = new WhitespaceAnalyzer();
    displayAction(analyzer, text);

    analyzer = new SimpleAnalyzer();
    displayAction(analyzer, text);

    analyzer = new StopAnalyzer(version);
    displayAction(analyzer, text);

    analyzer = new StandardAnalyzer(version);
    displayAction(analyzer, text);

    analyzer = new SnowballAnalyzer(version, "Danish"); // http://snowball.tartarus.org/
    displayAction(analyzer, text);

    analyzer = new TestAnalyzer(version);
    displayAction(analyzer, text);

    //analyzer = new LowerCaseKeywordAnalyzer();
    //displayAction(analyzer, text);

    //analyzer = new EdgeNGramAnalyzer(version);
    //displayAction(analyzer, text);

    //analyzer = new ReverseAnalyzer(version);
    //displayAction(analyzer, text);

    //new PerFieldAnalyzerWrapper()
    //Different fields require different analyzers
}
/// <summary>
/// Given a clause which has a Lucene based phrase, creates a KQL query.
/// </summary>
/// <param name="queryStringClause">The given clause.</param>
/// <returns>A KQL query string.</returns>
private string CreateKqlFromLucenePhrase(QueryStringClause queryStringClause)
{
    // we need to parse the phrase
    using var analyzer = new WhitespaceAnalyzer();
    var queryParser = new QueryParser(
        Lucene.Net.Util.Version.LUCENE_30,
        queryStringClause.Default,
        analyzer)
    {
        AllowLeadingWildcard = queryStringClause.Wildcard,
        LowercaseExpandedTerms = false,
    };

    // Escape special characters in the phrase before parsing.
    // We would call the QueryParser.Escape() method, but it escapes all special
    // characters, and in our case we only have to worry about the backslash.
    // Implementation is based on: https://github.com/apache/lucenenet/blob/0eaf76540b8de326d1aa9ca24f4b5d6425a9ae38/src/Lucene.Net.QueryParser/Classic/QueryParserBase.cs
    var escapedPhrase = queryStringClause.Phrase.Replace(@"\", @"\\\", StringComparison.OrdinalIgnoreCase);

    // we parse and get the Lucene.Net query model
    var query = queryParser.Parse(escapedPhrase);

    // We make our own 'visitable' Lucene.Net query model
    var luceneQuery = VisitableLuceneQueryFactory.Make(query);

    // Visit
    var luceneVisitor = new LuceneVisitor();
    luceneQuery.Accept(luceneVisitor);

    dynamic esQuery = luceneQuery.ESQuery;
    esQuery.Accept(this);
    return esQuery.KustoQL;
}
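// A small sketch of the backslash escaping in isolation, with an illustrative
// phrase; each input backslash is tripled so that a literal backslash survives
// the parser's own escape handling.
var phrase = @"dir\subdir";
var escaped = phrase.Replace(@"\", @"\\\", StringComparison.OrdinalIgnoreCase);
Console.WriteLine(escaped); // prints: dir\\\subdir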
public LuceneIndexContext(Directory directory)
{
    Analyzer = new WhitespaceAnalyzer();
    Writer = new IndexWriter(directory, Analyzer, IndexWriter.MaxFieldLength.UNLIMITED);
    Manager = new NrtManager(Writer);
}
/*
 * Word-segmentation function
 * @srcdata: the text to be segmented
 * Returns: the segmentation result as a string in the format defined by the
 * previous maintainer, i.e. {<token1>}{<token2>}...{<tokenN>}
 */
// This function is the core of the module: it takes the text to segment and
// returns the segmentation result, one token per {<word>} group. The format was
// defined by the previous maintainer and is kept so the established interface
// is not broken. The work is done through the Lucene.Net.Analysis and
// Lucene.China APIs. Setting them up is simple: 1. add the DLLs as references;
// 2. place a "data" folder in the executable's directory containing two files,
// sDict and sNoise, which hold the dictionary and the noise words.
/*private bool isChineseWord(string word)
{
    if (word == null)
    {
        return false;
    }
    for (int i = 0; i < word.Length; i++)
    {
        char chr = word[i];
        if (!(chr >= 0x4E00 && chr <= 0x9FFF))
        {
            return false;
        }
    }
    return true;
}*/
/*private string word_seg(string srcdata)
{
    StringBuilder sb = new StringBuilder();
    string t1 = "";
    ChineseAnalyzer analyzer = new Lucene.China.ChineseAnalyzer();
    StringReader sr = new StringReader(srcdata);
    TokenStream stream = analyzer.TokenStream("", sr);
    Lucene.Net.Analysis.Token t = stream.Next();
    while (t != null)
    {
        t1 = t.ToString(); // rendered as (term,start,end), so strip the decoration
        t1 = t1.Replace("(", "");
        char[] separator = { ',' };
        t1 = t1.Split(separator)[0];
        if (isChineseWord(t1))
        {
            sb.Append("{<" + t1 + ">}");
        }
        t = stream.Next();
    }
    return sb.ToString();
}*/
// This function was the external interface of the previous maintainer's code;
// the interface is kept, but the segmentation method used is no longer
// Naive Bayes.
/*public string DoWordSegment(string strIn)
{
    return word_seg(strIn);
}*/
public List<string> cutwords(string words, string analyzer = "Lucene.China.ChineseAnalyzer")
{
    var results = new List<string>();
    Analyzer analyzerInstance;
    switch (analyzer)
    {
        case "Lucene.Net.Analysis.SimpleAnalyzer":
            analyzerInstance = new SimpleAnalyzer();
            break;
        case "Lucene.Net.Analysis.KeywordAnalyzer":
            analyzerInstance = new KeywordAnalyzer();
            break;
        case "Lucene.Net.Analysis.StopAnalyzer":
            analyzerInstance = new StopAnalyzer();
            break;
        case "Lucene.Net.Analysis.WhitespaceAnalyzer":
            analyzerInstance = new WhitespaceAnalyzer();
            break;
        case "Lucene.Net.Analysis.PanGu.PanGuAnalyzer":
            PanGu.Segment.Init(@"G:\CProjects\Pipeline\pipeline\Pipeline\bin\Release\PanGu.xml");
            analyzerInstance = new PanGuAnalyzer();
            break;
        case "Lucene.Net.Analysis.Standard.StandardAnalyzer":
            analyzerInstance = new StandardAnalyzer();
            break;
        case "Lucene.China.ChineseAnalyzer":
        default:
            analyzerInstance = new ChineseAnalyzer();
            break;
    }
    // Tokenize with the selected analyzer (legacy Lucene.Net 2.x Token API).
    TokenStream ts = analyzerInstance.TokenStream("", new StringReader(words));
    Lucene.Net.Analysis.Token token;
    while ((token = ts.Next()) != null)
    {
        results.Add(token.TermText());
    }
    ts.Close();
    analyzerInstance.Close();
    return results;
}
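// A minimal usage sketch for cutwords; the WordSegmenter host class name and
// the sample input are hypothetical, and the default Lucene.China.ChineseAnalyzer
// additionally needs the data folder (sDict/sNoise) described above.
var segmenter = new WordSegmenter();
List<string> tokens = segmenter.cutwords("hello world", "Lucene.Net.Analysis.WhitespaceAnalyzer");
// tokens => ["hello", "world"]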
private Analyzer GetAnalyzer()
{
    var analyzer = new WhitespaceAnalyzer(LuceneVersion.LUCENE_48);
    return analyzer;
}