/// <summary>
/// Gets all segmented tokens for a string.
/// </summary>
/// <param name="strKey">Input string</param>
/// <returns>Array of tokens</returns>
public static string[] ReadWordbase(string strKey)
{
    List<string> list = new ChineseAnalyzer().GetTokenList(strKey);
    return list.ToArray();
}
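GetTokenList is not part of the stock ChineseAnalyzer surface, so a definition must exist elsewhere in this codebase. A minimal sketch of such a helper, assuming the legacy Lucene.Net 2.x token API (TokenStream.Next() / Token.TermText()); the extension-method shape is an assumption, not the project's actual implementation:

// Hypothetical GetTokenList helper -- a sketch against the legacy Lucene.Net 2.x token API.
using System.Collections.Generic;
using System.IO;
using Lucene.Net.Analysis;

public static class AnalyzerExtensions
{
    public static List<string> GetTokenList(this Analyzer analyzer, string text)
    {
        var tokens = new List<string>();
        TokenStream stream = analyzer.TokenStream("", new StringReader(text));
        Lucene.Net.Analysis.Token token;
        while ((token = stream.Next()) != null)   // Next() yields null once input is exhausted
        {
            tokens.Add(token.TermText());
        }
        stream.Close();
        return tokens;
    }
}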
protected void Page_Load(object sender, EventArgs e)
{
    // Guard against a missing session key instead of dereferencing it blindly.
    if (Session["KeyWords"] == null)
    {
        Response.Redirect("Search.aspx");
        return;
    }

    string text = Session["KeyWords"].ToString();
    ChineseAnalyzer analyzer = new ChineseAnalyzer();

    // Show how the analyzer tokenizes the search text.
    TokenStream ts = analyzer.TokenStream("ItemName", new System.IO.StringReader(text));
    Lucene.Net.Analysis.Token token;
    try
    {
        int n = 0;
        while ((token = ts.Next()) != null)
        {
            this.lbMsg.Text += (n++) + "->" + token.TermText() + " " + token.StartOffset()
                + " " + token.EndOffset() + " " + token.Type() + "<br>";
        }
    }
    catch
    {
        this.lbMsg.Text = "tokenization failed";
    }

    // Search the index with the same analyzer.
    Directory directory = FSDirectory.GetDirectory(Server.MapPath("/indexFile/"), false);
    IndexSearcher isearcher = new IndexSearcher(directory);
    Query query = QueryParser.Parse(text, "ItemName", analyzer);
    Hits hits = isearcher.Search(query);
    this.lbMsg.Text += "<font color=red>共找到" + hits.Length() + "条记录</font><br>";   // "found N records"
    for (int i = 0; i < hits.Length(); i++)
    {
        Document hitDoc = hits.Doc(i);
        this.lbMsg.Text += "编号:" + hitDoc.Get("ItemID") + "<br>"            // ID
            + "分类:" + hitDoc.Get("CategoryName") + "<br>"                   // category
            + "专题:" + hitDoc.Get("ProductName") + "<br>"                    // topic
            + "标题:<a href=" + hitDoc.Get("visiturl") + ">"                  // title, linked
            + hitDoc.Get("ItemName") + "</a><br>";
    }
    isearcher.Close();
    directory.Close();
}
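The static QueryParser.Parse(text, field, analyzer) call above is the early (Lucene.Net 1.9/2.0-era) API; later releases make QueryParser an instance type and replace Hits with TopDocs. A sketch of the same search under Lucene.Net 2.9/3.0 — the index path is an assumption for illustration, and the ChineseAnalyzer using depends on which package (Lucene.Net.Analysis.Cn contrib or Lucene.China) the project references:

// Same search under the later Lucene.Net API -- a sketch, not a drop-in replacement.
using System;
using System.IO;
using Lucene.Net.Analysis.Cn;   // or Lucene.China, depending on the package in use
using Lucene.Net.Documents;
using Lucene.Net.QueryParsers;
using Lucene.Net.Search;
using Lucene.Net.Store;

public static class SearchDemo
{
    public static void Run(string keywords)
    {
        var directory = FSDirectory.Open(new DirectoryInfo(@"C:\indexFile"));   // hypothetical path
        using (var searcher = new IndexSearcher(directory, true))               // read-only searcher
        {
            var parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, "ItemName", new ChineseAnalyzer());
            Query query = parser.Parse(keywords);
            TopDocs top = searcher.Search(query, 20);                           // Hits is gone; use TopDocs
            foreach (ScoreDoc sd in top.ScoreDocs)
            {
                Document doc = searcher.Doc(sd.Doc);
                Console.WriteLine(doc.Get("ItemName"));
            }
        }
    }
}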
public void TestNumerics()
{
    Analyzer justTokenizer = new JustChineseTokenizerAnalyzer();
    AssertAnalyzesTo(justTokenizer, "中1234", new String[] { "中", "1234" });

    // In this case the ChineseAnalyzer (which applies ChineseFilter) removes the numeric token.
    Analyzer a = new ChineseAnalyzer();
    AssertAnalyzesTo(a, "中1234", new String[] { "中" });
}
public void TestReusableTokenStream()
{
    Analyzer a = new ChineseAnalyzer();
    AssertAnalyzesToReuse(a, "中华人民共和国",
        new String[] { "中", "华", "人", "民", "共", "和", "国" },
        new int[] { 0, 1, 2, 3, 4, 5, 6 },
        new int[] { 1, 2, 3, 4, 5, 6, 7 });
    AssertAnalyzesToReuse(a, "北京市",
        new String[] { "北", "京", "市" },
        new int[] { 0, 1, 2 },
        new int[] { 1, 2, 3 });
}
public void TestEnglish()
{
    Analyzer chinese = new ChineseAnalyzer();
    AssertAnalyzesTo(chinese, "This is a Test. b c d", new String[] { "test" });

    Analyzer justTokenizer = new JustChineseTokenizerAnalyzer();
    AssertAnalyzesTo(justTokenizer, "This is a Test. b c d",
        new String[] { "this", "is", "a", "test", "b", "c", "d" });

    Analyzer justFilter = new JustChineseFilterAnalyzer();
    AssertAnalyzesTo(justFilter, "This is a Test. b c d", new String[] { "This", "Test." });
}
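These tests rely on two helper analyzers that are not shown here. In Lucene's own test suite they are thin wrappers that expose a single stage of the ChineseAnalyzer pipeline; plausible definitions, mirroring the Java tests (an assumption about this codebase, not a quote from it):

// Plausible definitions of the helper analyzers used by the tests above.
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Cn;

// Runs only the ChineseTokenizer: splits CJK text into single ideographs and
// lowercases Latin words, with no filtering.
internal class JustChineseTokenizerAnalyzer : Analyzer
{
    public override TokenStream TokenStream(string fieldName, TextReader reader)
    {
        return new ChineseTokenizer(reader);
    }
}

// Runs only the ChineseFilter over whitespace tokens: drops English stop words,
// single Latin letters, and (per TestNumerics) purely numeric tokens.
internal class JustChineseFilterAnalyzer : Analyzer
{
    public override TokenStream TokenStream(string fieldName, TextReader reader)
    {
        return new ChineseFilter(new WhitespaceTokenizer(reader));
    }
}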
/// <summary>
/// Gets a single token from the segmented string.
/// </summary>
/// <param name="strKey">Input string</param>
/// <param name="readNumber">Zero-based token index</param>
/// <returns>The token at the given index, or an empty string if the index is out of range</returns>
public static string ReadWordbase(string strKey, int readNumber)
{
    string readWord = string.Empty;
    List<string> list = new ChineseAnalyzer().GetTokenList(strKey);
    int i = 0;
    foreach (var item in list)
    {
        if (i == readNumber)
        {
            readWord = item;   // take the token at the requested index, then stop
            break;
        }
        i++;
    }
    return readWord;
}
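A short usage sketch of the two overloads. The single-ideograph output assumes the unigram ChineseAnalyzer seen in the tests above; the dictionary-based Lucene.China analyzer would emit word-level tokens instead.

// Usage sketch; the input string is illustrative.
string[] all = ReadWordbase("中华人民共和国");    // ["中", "华", "人", "民", "共", "和", "国"]
string third = ReadWordbase("中华人民共和国", 2); // "人" -- the token at zero-based index 2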
public void TestQuery()
{
    // Query the index.
    //Analyzer objCA = new StandardAnalyzer();
    Analyzer objCA = new ChineseAnalyzer();
    String words = "北京";
    IndexSearcher searcher = new IndexSearcher(@"F:\Index");
    Query query = new QueryParser("Text", objCA).Parse(words);
    Hits hits = searcher.Search(query);
    String msg = "";
    for (int i = 0; i < hits.Length(); i++)
    {
        msg += hits.Doc(i).GetField("Text").StringValue() + "\n";
    }
    MessageBox.Show(msg);
    searcher.Close();
}
public void TestBuildIndex()
{
    // Build the index.
    //Analyzer objCA = new StandardAnalyzer();
    Analyzer objCA = new ChineseAnalyzer();   // the analyzer (tokenizer)
    IndexWriter writer = new IndexWriter(@"F:\Index", objCA, true);

    string[] texts =
    {
        "哦耶,美丽的姑娘。",
        "北京矿业大学",
        "北京矿业大学东门",
        "北京矿业大学图书馆",
        "北京大学",
        "麦当劳",
        "清华大学",
        "沙县小吃",
        "穆图科技有限公司"
    };
    foreach (string text in texts)
    {
        Document doc = new Document();
        doc.Add(new Field("Text", text, Field.Store.YES, Field.Index.TOKENIZED));
        writer.AddDocument(doc);
    }
    writer.Close();
}
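TestBuildIndex must run before TestQuery. It is worth spelling out what the query above should return against this corpus — a hypothesis about QueryParser's default handling of multi-token terms, not a quote from the test suite:

// Expected-behavior sketch: with a unigram analyzer, "北京" parses to the phrase
// Text:"北 京" (with a dictionary-based analyzer, to the single term 北京); either way
// it should match exactly the four documents containing the contiguous pair 北京
// (北京矿业大学, 北京矿业大学东门, 北京矿业大学图书馆, 北京大学) and none of the others.
Query q = new QueryParser("Text", new ChineseAnalyzer()).Parse("北京");
Hits h = new IndexSearcher(@"F:\Index").Search(q);
System.Diagnostics.Debug.Assert(h.Length() == 4);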
public void BuildIndex()
{
    var indexFolder = FSDirectory.Open(new DirectoryInfo(GeneralConstants.Paths.BBCIndex));
    Analyzer analyzer = new ChineseAnalyzer();
    using (IndexWriter indexWriter = new IndexWriter(indexFolder, analyzer, IndexWriter.MaxFieldLength.UNLIMITED))
    {
        var outdoors = GetOutDoors(null);
        outdoors.ForEach(item => indexWriter.AddDocument(
            // Analysed (tokenized) text fields, not stored:
            x => x.AddAnalysedField(BBCFields.Title, item.Title, false),
            x => x.AddAnalysedField(BBCFields.Description, item.Description, false),
            x => x.AddAnalysedField(BBCFields.AreaAtt, item.AreaAtt, false),
            x => x.AddAnalysedField(BBCFields.CityName, item.CityName, false),
            x => x.AddAnalysedField(BBCFields.ProvinceName, item.ProvinceName, false),
            x => x.AddAnalysedField(BBCFields.MediaCateName, item.MediaCateName, false),
            x => x.AddAnalysedField(BBCFields.PMediaCateName, item.PMediaCateName, false),
            x => x.AddAnalysedField(BBCFields.FormatName, item.FormatName, false),
            x => x.AddAnalysedField(BBCFields.OwnerCateName, item.OwnerCateName, false),
            // Non-analysed (keyword) fields, stored for retrieval and filtering:
            x => x.AddNonAnalysedField(BBCFields.Price, item.Price.ToString(), true),
            x => x.AddNonAnalysedField(BBCFields.Province, item.Province.ToString(), true),
            x => x.AddNonAnalysedField(BBCFields.City, item.City.ToString(), true),
            x => x.AddNonAnalysedField(BBCFields.MediaCode, item.MediaCode.ToString(), true),
            x => x.AddNonAnalysedField(BBCFields.PMediaCode, item.PMediaCode.ToString(), true),
            x => x.AddNonAnalysedField(BBCFields.FormatCode, item.FormatCode.ToString(), true),
            x => x.AddNonAnalysedField(BBCFields.PeriodCode, item.PeriodCode.ToString(), true),
            x => x.AddNonAnalysedField(BBCFields.OwnerCode, item.OwnerCode.ToString(), true),
            x => x.AddNonAnalysedField(BBCFields.Status, item.Status.ToString(), true),
            x => x.AddNonAnalysedField(BBCFields.MediaID, item.MediaID.ToString(), true),
            x => x.AddNonAnalysedField(BBCFields.Published, item.Published.ToString(), true),
            x => x.AddNonAnalysedField(BBCFields.Hit, item.Hit.ToString(), true)));

        indexWriter.Optimize();   // the using block disposes (closes) the writer
    }
}
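AddAnalysedField, AddNonAnalysedField, and the lambda-based AddDocument overload are helper extensions from this codebase, not Lucene.Net itself. A plausible shape for them, using Lucene.Net 3.0-era Field flags (an assumption for illustration):

// Plausible shape of the indexing helpers used above -- not the project's actual code.
using System;
using Lucene.Net.Documents;
using Lucene.Net.Index;

public static class IndexWriterExtensions
{
    public static void AddDocument(this IndexWriter writer, params Action<Document>[] fieldSetters)
    {
        var doc = new Document();
        foreach (var set in fieldSetters)
        {
            set(doc);   // each lambda adds one field to the document
        }
        writer.AddDocument(doc);
    }

    public static void AddAnalysedField(this Document doc, string name, string value, bool store)
    {
        doc.Add(new Field(name, value ?? string.Empty,
                          store ? Field.Store.YES : Field.Store.NO, Field.Index.ANALYZED));
    }

    public static void AddNonAnalysedField(this Document doc, string name, string value, bool store)
    {
        doc.Add(new Field(name, value ?? string.Empty,
                          store ? Field.Store.YES : Field.Store.NO, Field.Index.NOT_ANALYZED));
    }
}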
private static Query ParseQuery(SearchFilter searchFilter)
{
    if (String.IsNullOrWhiteSpace(searchFilter.SearchTerm))
    {
        return new MatchAllDocsQuery();
    }

    var fields = new[] { "Title", "Description", "MediaCateName", "CityName" };
    var analyzer = new ChineseAnalyzer();
    //var analyzer = new StandardAnalyzer(LuceneCommon.LuceneVersion);
    var queryParser = new MultiFieldQueryParser(LuceneCommon.LuceneVersion, fields, analyzer);
    // Parse once up front so malformed input fails fast (the result itself is unused).
    var query = queryParser.Parse(searchFilter.SearchTerm);

    // All terms in the multi-term query appear in at least one of the fields.
    var conjunctionQuery = new BooleanQuery();
    conjunctionQuery.Boost = 2.0f;

    // Some terms in the multi-term query appear in at least one of the fields.
    var disjunctionQuery = new BooleanQuery();
    disjunctionQuery.Boost = 0.1f;

    // Suffix wildcard search, e.g. jquer*
    var wildCardQuery = new BooleanQuery();
    wildCardQuery.Boost = 0.5f;

    // Escape the entire term we use for exact searches.
    var escapedSearchTerm = Escape(searchFilter.SearchTerm);
    var exactIdQuery = new TermQuery(new Term("Title", escapedSearchTerm));
    exactIdQuery.Boost = 2.5f;
    var wildCardIdQuery = new WildcardQuery(new Term("Title", "*" + escapedSearchTerm + "*"));

    foreach (var term in GetSearchTerms(searchFilter.SearchTerm))
    {
        var termQuery = queryParser.Parse(term);
        conjunctionQuery.Add(termQuery, Occur.MUST);
        disjunctionQuery.Add(termQuery, Occur.SHOULD);
        foreach (var field in fields)
        {
            var wildCardTermQuery = new WildcardQuery(new Term(field, term + "*"));
            wildCardTermQuery.Boost = 0.7f;
            wildCardQuery.Add(wildCardTermQuery, Occur.SHOULD);
        }
    }

    // OR all of the sub-queries together.
    var combinedQuery = conjunctionQuery.Combine(
        new Query[] { exactIdQuery, wildCardIdQuery, conjunctionQuery, disjunctionQuery, wildCardQuery });

    if (searchFilter.SortProperty == SortProperty.Hit)
    {
        // When sorting by relevance, boost scores by hit count.
        var downloadCountBooster = new FieldScoreQuery("Hit", FieldScoreQuery.Type.INT);
        return new CustomScoreQuery(combinedQuery, downloadCountBooster);
    }
    return combinedQuery;
}
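A usage sketch for ParseQuery. The search term and page size are illustrative; SearchFilter and LuceneCommon come from the surrounding code, and since ParseQuery is private the demo would have to live in the same class:

// Usage sketch -- executes the combined, boosted query against the index.
using System;
using System.IO;
using Lucene.Net.Search;
using Lucene.Net.Store;

private static void ParseQueryDemo()
{
    var filter = new SearchFilter { SearchTerm = "北京 地铁" };   // assumed settable property
    Query query = ParseQuery(filter);
    using (var searcher = new IndexSearcher(
        FSDirectory.Open(new DirectoryInfo(LuceneCommon.IndexDirectory)), true))
    {
        TopDocs top = searcher.Search(query, 30);
        foreach (ScoreDoc sd in top.ScoreDocs)
        {
            var doc = searcher.Doc(sd.Doc);
            Console.WriteLine("{0:F2}  {1}", sd.Score, doc.Get("Title"));
        }
    }
}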
private static IEnumerable<string> GetSearchTerms(string searchTerm)
{
    var result = new List<string>();
    var analyzer = new ChineseAnalyzer();
    var sr = new StringReader(searchTerm);
    TokenStream stream = analyzer.TokenStream(null, sr);
    // Fetch the term attribute once; IncrementToken() updates it in place.
    ITermAttribute termAttribute = stream.GetAttribute<ITermAttribute>();
    while (stream.IncrementToken())
    {
        result.Add(termAttribute.Term);
    }
    stream.Close();
    sr.Close();
    analyzer.Dispose();

    // Include the raw search term alongside the analyzed tokens,
    // e.g. "北京 subway" -> 北, 京, subway, plus "北京 subway" itself.
    return result
        .Concat(new[] { searchTerm })
        .Distinct(StringComparer.OrdinalIgnoreCase)
        .Select(Escape);
}
private Analyzer GuessAnalyzer(string filePath)
{
    // Pick an analyzer from the two-letter language prefix of the file name.
    switch (Path.GetFileName(filePath).Substring(0, 2).ToLowerInvariant())
    {
        case "zh": return new ChineseAnalyzer();
        case "cs": return new CzechAnalyzer();
        case "da": return new SnowballAnalyzer("Danish");
        case "nl": return new SnowballAnalyzer("Dutch");
        case "en": return new SnowballAnalyzer("English");
        case "fi": return new SnowballAnalyzer("Finnish");
        case "fr": return new SnowballAnalyzer("French");
        case "de": return new SnowballAnalyzer("German");
        case "it": return new SnowballAnalyzer("Italian");
        case "ja":
        case "ko": return new CJKAnalyzer();
        case "no": return new SnowballAnalyzer("Norwegian");
        case "pt": return new SnowballAnalyzer("Portuguese");
        case "ru": return new SnowballAnalyzer("Russian");
        case "es": return new SnowballAnalyzer("Spanish");
        case "se": return new SnowballAnalyzer("Swedish");
        default:   return new StandardAnalyzer();
    }
}
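The same prefix-to-analyzer mapping can be made table-driven, which keeps additions to one line each. A behavior-preserving sketch, assuming the same Lucene.Net contrib packages (and parameterless StandardAnalyzer) as the switch above — adjust the usings to the version in use:

// Table-driven variant of the same mapping -- a sketch, not the project's code.
using System;
using System.Collections.Generic;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.CJK;
using Lucene.Net.Analysis.Cn;
using Lucene.Net.Analysis.Cz;
using Lucene.Net.Analysis.Snowball;
using Lucene.Net.Analysis.Standard;

internal static class AnalyzerMap
{
    private static readonly Dictionary<string, Func<Analyzer>> Map =
        new Dictionary<string, Func<Analyzer>>(StringComparer.OrdinalIgnoreCase)
        {
            ["zh"] = () => new ChineseAnalyzer(),
            ["cs"] = () => new CzechAnalyzer(),
            ["da"] = () => new SnowballAnalyzer("Danish"),
            ["nl"] = () => new SnowballAnalyzer("Dutch"),
            ["en"] = () => new SnowballAnalyzer("English"),
            ["fi"] = () => new SnowballAnalyzer("Finnish"),
            ["fr"] = () => new SnowballAnalyzer("French"),
            ["de"] = () => new SnowballAnalyzer("German"),
            ["it"] = () => new SnowballAnalyzer("Italian"),
            ["ja"] = () => new CJKAnalyzer(),
            ["ko"] = () => new CJKAnalyzer(),
            ["no"] = () => new SnowballAnalyzer("Norwegian"),
            ["pt"] = () => new SnowballAnalyzer("Portuguese"),
            ["ru"] = () => new SnowballAnalyzer("Russian"),
            ["es"] = () => new SnowballAnalyzer("Spanish"),
            ["se"] = () => new SnowballAnalyzer("Swedish"),   // "se" kept from the original switch
        };

    public static Analyzer ForFile(string filePath)
    {
        string prefix = Path.GetFileName(filePath).Substring(0, 2);
        Func<Analyzer> factory;
        return Map.TryGetValue(prefix, out factory) ? factory() : new StandardAnalyzer();
    }
}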
/*
 * Segmentation function
 * @srcdata: the text to segment
 * Returns: the segmentation result as a string in the format defined by the previous
 * maintainer, i.e. {<token1>}{<token2>}...{<tokenN>}
 */
// This function is the core of the module.
// Input: the text to segment; output: the segmentation result.
// The {<word>} output format was defined by the previous maintainer; we keep the existing
// interface unchanged rather than break it.
// The implementation relies on the Lucene.Net.Analysis and Lucene.China APIs. Setup is
// simple: 1. reference the DLLs; 2. put a "data" folder next to the executable holding
// two files, sDict and sNoise, which contain the dictionary and the noise (stopword) list.

/* Legacy helper: true only if every character is a CJK ideograph (U+4E00-U+9FFF).
private bool isChineseWord(string word)
{
    if (word == null)
    {
        return false;
    }
    for (int i = 0; i < word.Length; i++)
    {
        char chr = word[i];
        if (!(chr >= 0x4E00 && chr <= 0x9FFF))
        {
            return false;
        }
    }
    return true;
}
*/

/* Legacy segmentation routine:
private string word_seg(string srcdata)
{
    StringBuilder sb = new StringBuilder();
    string t1 = "";
    ChineseAnalyzer analyzer = new Lucene.China.ChineseAnalyzer();
    StringReader sr = new StringReader(srcdata);
    TokenStream stream = analyzer.TokenStream("", sr);
    Lucene.Net.Analysis.Token t = stream.Next();
    while (t != null)
    {
        t1 = t.ToString();            // formatted as (term,start,end); strip it down to the term
        t1 = t1.Replace("(", "");
        t1 = t1.Split(',')[0];
        if (isChineseWord(t1))
        {
            sb.Append("{<" + t1 + ">}");
        }
        t = stream.Next();
    }
    return sb.ToString();
}
*/

// This was the external interface of the previous implementation; the interface is kept,
// but the segmentation method behind it is no longer Naive Bayes.
/*
public string DoWordSegment(string strIn)
{
    return word_seg(strIn);
}
*/

public List<string> cutwords(string words, string analyzer = "Lucene.China.ChineseAnalyzer")
{
    // Pick the analyzer by its fully-qualified name; Lucene.China's dictionary-based
    // ChineseAnalyzer is the default.
    Analyzer analyzerInstance;
    switch (analyzer)
    {
        case "Lucene.Net.Analysis.SimpleAnalyzer":
            analyzerInstance = new SimpleAnalyzer();
            break;
        case "Lucene.Net.Analysis.KeywordAnalyzer":
            analyzerInstance = new KeywordAnalyzer();
            break;
        case "Lucene.Net.Analysis.StopAnalyzer":
            analyzerInstance = new StopAnalyzer();
            break;
        case "Lucene.Net.Analysis.WhitespaceAnalyzer":
            analyzerInstance = new WhitespaceAnalyzer();
            break;
        case "Lucene.Net.Analysis.PanGu.PanGuAnalyzer":
            PanGu.Segment.Init(@"G:\CProjects\Pipeline\pipeline\Pipeline\bin\Release\PanGu.xml");
            analyzerInstance = new PanGuAnalyzer();
            break;
        case "Lucene.Net.Analysis.Standard.StandardAnalyzer":
            analyzerInstance = new StandardAnalyzer();
            break;
        case "Lucene.China.ChineseAnalyzer":
        default:
            analyzerInstance = new ChineseAnalyzer();
            break;
    }

    // One shared tokenization loop serves every analyzer.
    List<string> results = new List<string>();
    TokenStream ts = analyzerInstance.TokenStream("", new StringReader(words));
    Lucene.Net.Analysis.Token token;
    while ((token = ts.Next()) != null)
    {
        results.Add(token.TermText());
    }
    ts.Close();
    analyzerInstance.Close();
    return results;
}
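A short usage sketch of cutwords. The host class name is hypothetical, and the word-level output shown assumes Lucene.China's dictionary-based analyzer and the contents of its sDict file:

// Usage sketch; "Segmenter" is a hypothetical name for the class hosting cutwords.
var seg = new Segmenter();
List<string> zh = seg.cutwords("北京矿业大学");   // default: Lucene.China.ChineseAnalyzer
List<string> ws = seg.cutwords("hello world", "Lucene.Net.Analysis.WhitespaceAnalyzer");
Console.WriteLine(string.Join("/", zh));          // e.g. 北京/矿业/大学 -- depends on the sDict dictionary
Console.WriteLine(string.Join("/", ws));          // hello/world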
private static void EnsureIndexWriterCore(bool creatingIndex)
{
    if (!Directory.Exists(LuceneCommon.IndexDirectory))
    {
        Directory.CreateDirectory(LuceneCommon.IndexDirectory);
    }

    var analyzer = new ChineseAnalyzer();
    //var analyzer = new StandardAnalyzer(LuceneCommon.LuceneVersion);
    var directoryInfo = new DirectoryInfo(LuceneCommon.IndexDirectory);
    var directory = new SimpleFSDirectory(directoryInfo);
    _indexWriter = new IndexWriter(directory, analyzer, create: creatingIndex, mfl: IndexWriter.MaxFieldLength.UNLIMITED);
}
private Analyzer GuessAnalyzer(string filePath, out bool isRTL)
{
    isRTL = false;
    // Pick an analyzer from the two-letter language prefix of the file name.
    switch (Path.GetFileName(filePath).Substring(0, 2).ToLowerInvariant())
    {
        case "zh": return new ChineseAnalyzer();
        case "cs": return new CzechAnalyzer();
        case "da": return new SnowballAnalyzer("Danish");
        case "nl": return new SnowballAnalyzer("Dutch");
        case "en": return new SnowballAnalyzer("English");
        case "fi": return new SnowballAnalyzer("Finnish");
        case "fr": return new SnowballAnalyzer("French");
        case "de": return new SnowballAnalyzer("German");
        case "it": return new SnowballAnalyzer("Italian");
        case "ja":
        case "ko": return new CJKAnalyzer();
        case "no": return new SnowballAnalyzer("Norwegian");
        case "pt": return new SnowballAnalyzer("Portuguese");
        case "ru": return new SnowballAnalyzer("Russian");
        case "es": return new SnowballAnalyzer("Spanish");
        case "se": return new SnowballAnalyzer("Swedish");
        case "ar":
            isRTL = true;
            // TODO: Lucene 2.9 has a light stemmer for Arabic providing good search results
            return new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29);
        case "he":
            isRTL = true;
            string hspellPath = System.Configuration.ConfigurationManager.AppSettings["hspellPath"];
            if (!string.IsNullOrEmpty(hspellPath) && Directory.Exists(hspellPath))
            {
                try
                {
                    return new Lucene.Net.Analysis.Hebrew.MorphAnalyzer(hspellPath);
                }
                catch
                {
                    // Fall back to the simple Hebrew analyzer if the hspell data fails to load.
                }
            }
            return new Lucene.Net.Analysis.Hebrew.SimpleAnalyzer();
        default:
            return new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29);
    }
}