/// <summary>
/// Splits <paramref name="msg"/> into tokens using the PanGu analyzer.
/// </summary>
/// <param name="msg">Text to tokenize.</param>
/// <returns>The token terms, in stream order.</returns>
public static List<string> PanGuSplitWord(string msg)
{
    List<string> list = new List<string>();
    Analyzer analyzer = new PanGuAnalyzer();
    TokenStream tokenStream = analyzer.TokenStream("", new StringReader(msg));
    // BUG FIX: the original read the term attribute BEFORE the first
    // IncrementToken() call, which prepended a bogus empty entry to the
    // result list. Advance the stream first, then read the term.
    // The attribute instance is reused; IncrementToken() refreshes it.
    ITermAttribute termAttr = tokenStream.GetAttribute<ITermAttribute>();
    while (tokenStream.IncrementToken())
    {
        list.Add(termAttr.Term);
    }
    analyzer.Close();
    return list;
}
/// <summary>
/// Tokenizes <paramref name="searchTerm"/> with the PanGu analyzer and returns
/// the distinct, escaped terms plus the original search term itself.
/// </summary>
/// <param name="searchTerm">Raw user search input.</param>
/// <returns>Distinct escaped terms (case-insensitive), including the full input.</returns>
private static IEnumerable<string> GetSearchTerms(string searchTerm)
{
    List<string> result = new List<string>();
    var analyzer = new PanGuAnalyzer();
    StringReader sr = new StringReader(searchTerm);
    TokenStream stream = analyzer.TokenStream(null, sr);
    // The term attribute instance is reused; IncrementToken() refreshes it.
    ITermAttribute ita = stream.GetAttribute<ITermAttribute>();
    while (stream.IncrementToken())
    {
        result.Add(ita.Term);
    }
    // Removed dead code: an unused DateTime stopwatch and a CloneAttributes()
    // call whose return value was discarded.
    sr.Close();
    analyzer.Dispose();
    var resultString = string.Join(" ", result);
    return resultString.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries)
        .Concat(new[] { searchTerm })
        .Distinct(StringComparer.OrdinalIgnoreCase)
        .Select(Escape);
}
/// <summary>
/// Tokenizes <paramref name="keyword"/> and returns the segmentation result.
/// </summary>
/// <param name="keyword">Text to segment.</param>
/// <returns>The segmented terms, in stream order.</returns>
public static IEnumerable<string> SplitWords(string keyword)
{
    IList<string> list = new List<string>();
    Analyzer analyzer = new PanGuAnalyzer();
    TokenStream stream = analyzer.TokenStream(keyword, new StringReader(keyword));
    // IncrementToken() advances the stream; the term attribute holds the
    // current term. (Removed a large block of commented-out obsolete code
    // using the deprecated Token/Next() API.)
    while (stream.IncrementToken())
    {
        list.Add(stream.GetAttribute<ITermAttribute>().Term);
    }
    return list;
}
/// <summary>
/// Searches the "body" field of the Lucene index for the given search words,
/// builds highlighted result models, and records the query in the search log.
/// </summary>
/// <param name="searchWords">Raw user search input.</param>
/// <returns>Matching books with highlighted content snippets.</returns>
private List <BookSearchModel> SearchBookContent(string searchWords) {
    List <BookSearchModel> bookSearchModelList = new List <BookSearchModel>();
    // 1. Tokenize the search condition with the PanGu analyzer.
    Analyzer analyzer = new PanGuAnalyzer();
    TokenStream tokenStream = analyzer.TokenStream("", new StringReader(searchWords));
    Lucene.Net.Analysis.Token token = null;
    string indexPath = @"D:\lucenedir"; // NOTE(review): hard-coded index location — consider moving to configuration
    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NoLockFactory());
    IndexReader reader = IndexReader.Open(directory, true);
    IndexSearcher searcher = new IndexSearcher(reader);
    // Build the query: one term per PanGu token, all against the "body" field.
    // Terms are AND-ed together; their order does not matter.
    PhraseQuery query = new PhraseQuery();
    while ((token = tokenStream.Next()) != null) {
        query.Add(new Term("body", token.TermText()));
    }
    // Maximum distance allowed between the phrase terms — words that appear
    // too far apart in a document are not a meaningful match.
    query.SetSlop(100);
    // TopScoreDocCollector is the container that receives the query results.
    TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
    searcher.Search(query, null, collector); // run the query; results go into the collector
    // GetTotalHits() is the total hit count; TopDocs(start, count) could be
    // used here to implement paging (e.g. TopDocs(300, 20) for items 300-320).
    ScoreDoc[] docs = collector.TopDocs(0, collector.GetTotalHits()).scoreDocs;
    for (int i = 0; i < docs.Length; i++) {
        // ScoreDoc[] only carries document ids, so the full Document is loaded
        // lazily via searcher.Doc — this keeps memory pressure low.
        int docId = docs[i].doc; // document id assigned internally by Lucene
        Document doc = searcher.Doc(docId); // load the full document details by id
        BookSearchModel searchModel = new BookSearchModel();
        searchModel.Id = int.Parse(doc.Get("ID"));
        searchModel.Title = doc.Get("title");
        searchModel.ContenDescription = SearchWordHighlight.CreateHightLight(searchWords, doc.Get("body"));
        bookSearchModelList.Add(searchModel);
    }
    // Record this search in the search-details store (search-word statistics).
    SearchDetails entity = new SearchDetails() { Id = Guid.NewGuid(), KeyWords = searchWords, SearchDateTime = DateTime.Now };
    SearchDetailsService.AddEntity(entity);
    return(bookSearchModelList);
}
/// <summary>
/// Runs a Lucene search over the "content" field using the user's keywords
/// and renders the Index view with highlighted results.
/// </summary>
/// <returns>The Index view; results and the keyword are passed via ViewBag.</returns>
public ActionResult Search() {
    string kw = Request["kw"]; // user-entered search text
    string indexPath = Server.MapPath("~/lucenedir"); // where to search
    // Split the user input into tokens.
    List <string> kws = new List <string>(); // collects the segmented words
    Analyzer analyzer = new PanGuAnalyzer();
    TokenStream tokenStream = analyzer.TokenStream("", new StringReader(kw.ToString()));
    Lucene.Net.Analysis.Token token = null;
    while ((token = tokenStream.Next()) != null) {
        kws.Add(token.TermText());
    }
    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NoLockFactory());
    IndexReader reader = IndexReader.Open(directory, true);
    IndexSearcher searcher = new IndexSearcher(reader);
    // NOTE: PhraseQuery searches a single field only; a different query type
    // is needed for multi-field search.
    PhraseQuery query = new PhraseQuery();
    foreach (var word in kws) {
        query.Add(new Term("content", word)); // search against the "content" field
    }
    // Maximum distance allowed between the phrase terms — words that appear
    // too far apart in a document are not a meaningful match.
    query.SetSlop(100);
    // TopScoreDocCollector is the container that receives the query results.
    TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
    searcher.Search(query, null, collector); // run the query; results go into the collector
    // GetTotalHits() is the total hit count; TopDocs(start, count) could be
    // used here to implement paging.
    ScoreDoc[] docs = collector.TopDocs(0, collector.GetTotalHits()).scoreDocs;
    // Collect the search hits into view models.
    List <BookVieModel> bookList = new List <BookVieModel>();
    for (int i = 0; i < docs.Length; i++) {
        // ScoreDoc[] only carries document ids; the full Document is loaded
        // lazily via searcher.Doc to keep memory pressure low.
        int docId = docs[i].doc; // document id assigned internally by Lucene
        Document doc = searcher.Doc(docId); // load the full document details by id
        BookVieModel model = new BookVieModel();
        // Field names below must match those used when the index was built.
        model.Id = Convert.ToInt32(doc.Get("Id"));
        model.Title = CreateHightLight(kw, doc.Get("title")); // highlight the search words in the title
        model.Content = CreateHightLight(kw, doc.Get("content")); // highlight the search words in the content
        bookList.Add(model);
    }
    ViewBag.books = bookList;
    ViewBag.kw = kw;
    return(View("Index"));
}
/// <summary>
/// Search button handler: tokenizes the textbox input with PanGu, runs a
/// PhraseQuery against the "body" field of the index, and lists the hits.
/// </summary>
private void button3_Click(object sender, EventArgs e) {
    string indexPath = @"C:\Users\杨ShineLon\Desktop\lucenedir"; // where to search — NOTE(review): hard-coded path
    string kw = textBox1.Text; // user-entered search condition
    // Split the user input into tokens.
    List <string> kws = new List <string>(); // collects the segmented words
    Analyzer analyzer = new PanGuAnalyzer();
    TokenStream tokenStream = analyzer.TokenStream("", new StringReader(kw.ToString()));
    Net.Analysis.Token token = null;
    while ((token = tokenStream.Next()) != null) {
        kws.Add(token.TermText());
    }
    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NoLockFactory());
    IndexReader reader = IndexReader.Open(directory, true);
    IndexSearcher searcher = new IndexSearcher(reader);
    // Build the query: one term per token, all against the "body" field.
    // Terms are AND-ed together; their order does not matter.
    PhraseQuery query = new PhraseQuery();
    foreach (var word in kws) {
        query.Add(new Term("body", word));
    }
    // Maximum distance allowed between the phrase terms — words that appear
    // too far apart in a document are not a meaningful match.
    query.SetSlop(100);
    // TopScoreDocCollector is the container that receives the query results.
    TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
    searcher.Search(query, null, collector); // run the query; results go into the collector
    // GetTotalHits() is the total hit count; TopDocs(start, count) could be
    // used here to implement paging.
    ScoreDoc[] docs = collector.TopDocs(0, collector.GetTotalHits()).scoreDocs;
    this.listBox1.Items.Clear();
    for (int i = 0; i < docs.Length; i++) {
        // ScoreDoc[] only carries document ids; the full Document is loaded
        // lazily via searcher.Doc to keep memory pressure low.
        int docId = docs[i].doc; // document id assigned internally by Lucene
        Document doc = searcher.Doc(docId); // load the full document details by id
        this.listBox1.Items.Add(doc.Get("number") + "\n"); // stored field value
        this.listBox1.Items.Add(doc.Get("body") + "\n");
        this.listBox1.Items.Add("-----------------------\n");
    }
}
/// <summary>
/// Demo handler: tokenizes the text in textBox2 with PanGu and shows each
/// term in its own message box.
/// </summary>
private void button6_Click(object sender, EventArgs e)
{
    Analyzer analyzer = new PanGuAnalyzer();
    TokenStream stream = analyzer.TokenStream("", new StringReader(textBox2.Text));
    // Next() yields one token at a time until the stream is exhausted.
    for (Token t = stream.Next(); t != null; t = stream.Next())
    {
        MessageBox.Show(t.TermText());
    }
}
/// <summary>
/// Demo handler: prints each PanGu token of a fixed sentence to the console.
/// </summary>
private void button3_Click(object sender, EventArgs e)
{
    Analyzer analyzer = new PanGuAnalyzer();
    TokenStream stream = analyzer.TokenStream("", new StringReader("面向世界,面向现代化"));
    // Next() yields one token at a time until the stream is exhausted.
    for (Lucene.Net.Analysis.Token t = stream.Next(); t != null; t = stream.Next())
    {
        Console.WriteLine(t.TermText());
    }
}
/// <summary>
/// Demo handler: prints each PanGu token of a fixed sentence using the
/// IncrementToken/attribute API.
/// </summary>
private void button3_Click(object sender, EventArgs e)
{
    Analyzer analyzer = new PanGuAnalyzer();
    TokenStream stream = analyzer.TokenStream("", new StringReader("面向对象编程,没有对象哈哈哈"));
    while (stream.IncrementToken())
    {
        // The term attribute reflects the token the stream currently points at.
        ITermAttribute attr = stream.GetAttribute<ITermAttribute>();
        Console.WriteLine(attr.Term);
    }
}
/// <summary>
/// PanGu word segmentation.
/// </summary>
/// <param name="msg">String to split.</param>
/// <returns>The segmentation result.</returns>
public static List<string> PanguSplitWords(string msg)
{
    var terms = new List<string>();
    Analyzer analyzer = new PanGuAnalyzer();
    TokenStream stream = analyzer.TokenStream("", new StringReader(msg));
    // Next() yields one token at a time until the stream is exhausted.
    for (Lucene.Net.Analysis.Token t = stream.Next(); t != null; t = stream.Next())
    {
        terms.Add(t.TermText());
    }
    return terms;
}
/// <summary>
/// Splits <paramref name="content"/> into terms using the PanGu analyzer.
/// </summary>
/// <param name="content">Text to segment.</param>
/// <returns>The segmented terms as an array.</returns>
private string[] SplitWords(string content)
{
    var terms = new List<string>();
    Analyzer analyzer = new PanGuAnalyzer(); // PanGu segmentation algorithm
    TokenStream stream = analyzer.TokenStream("", new StringReader(content));
    // Next() yields one token at a time until the stream is exhausted.
    for (Lucene.Net.Analysis.Token t = stream.Next(); t != null; t = stream.Next())
    {
        terms.Add(t.TermText());
    }
    return terms.ToArray();
}
/// <summary>
/// Tokenizes the user's search condition with the PanGu analyzer.
/// </summary>
/// <param name="str">Text to segment.</param>
/// <returns>The segmented terms as an array.</returns>
private static string[] SplitWord(string str)
{
    var words = new List<string>();
    Analyzer pangu = new PanGuAnalyzer(); // PanGu segmentation
    TokenStream ts = pangu.TokenStream("", new StringReader(str));
    // Next() yields one token at a time until the stream is exhausted.
    for (Lucene.Net.Analysis.Token tok = ts.Next(); tok != null; tok = ts.Next())
    {
        words.Add(tok.TermText());
    }
    return words.ToArray();
}
/// <summary>
/// Returns the PanGu tokens of <paramref name="str"/> as an array.
/// </summary>
public static string[] GetKeyWords(string str)
{
    Analyzer analyzer = new PanGuAnalyzer();
    TokenStream stream = analyzer.TokenStream("", new StringReader(str));
    var keywords = new List<string>();
    // Next() yields one token at a time until the stream is exhausted.
    for (Token t = stream.Next(); t != null; t = stream.Next())
    {
        keywords.Add(t.TermText());
    }
    return keywords.ToArray();
}
/// <summary>
/// Splits a string into PanGu tokens.
/// </summary>
/// <param name="str">Text to segment.</param>
/// <returns>The segmented terms, in stream order.</returns>
public static List<string> GetPanGuWord(string str)
{
    var result = new List<string>();
    Analyzer analyzer = new PanGuAnalyzer();
    TokenStream stream = analyzer.TokenStream("", new StringReader(str));
    Lucene.Net.Analysis.Token current;
    // Next() returns null once the stream is exhausted.
    while ((current = stream.Next()) != null)
    {
        result.Add(current.TermText());
    }
    return result;
}
/// <summary>
/// Splits <paramref name="keyWord"/> into terms using the PanGu analyzer
/// (IncrementToken/attribute API).
/// </summary>
/// <param name="keyWord">Text to segment.</param>
/// <returns>The segmented terms as an array.</returns>
public static string[] SplitWords(string keyWord)
{
    var result = new List<string>();
    Analyzer analyzer = new PanGuAnalyzer(); // PanGu segmentation algorithm
    TokenStream stream = analyzer.TokenStream("", new StringReader(keyWord));
    // IncrementToken() advances the stream; the term attribute holds the current term.
    while (stream.IncrementToken())
    {
        result.Add(stream.GetAttribute<ITermAttribute>().Term);
    }
    return result.ToArray();
}
/// <summary>
/// Demo handler: tokenizes a fixed mixed Chinese/English sentence with PanGu
/// and prints each term to the console.
/// </summary>
private void button2_Click(object sender, EventArgs e)
{
    Analyzer analyzer = new PanGuAnalyzer();
    using (TokenStream stream = analyzer.TokenStream("", new StringReader("北京,Hi欢迎你们大家")))
    {
        // IncrementToken() advances the stream; the attribute reflects the current token.
        while (stream.IncrementToken())
        {
            Console.WriteLine(stream.GetAttribute<ITermAttribute>().Term);
        }
    }
}
/// <summary>
/// Splits <paramref name="content"/> into terms via the PanGu analyzer
/// (IncrementToken/attribute API).
/// </summary>
/// <param name="content">Text to segment.</param>
/// <returns>The segmented terms as an array.</returns>
private string[] SplitWords(string content)
{
    Analyzer analyzer = new PanGuAnalyzer(); // PanGu segmentation algorithm
    TokenStream stream = analyzer.TokenStream("", new StringReader(content));
    var terms = new List<string>();
    while (stream.IncrementToken())
    {
        // The term attribute reflects the token the stream currently points at.
        ITermAttribute attr = stream.GetAttribute<ITermAttribute>();
        terms.Add(attr.Term);
    }
    return terms.ToArray();
}
/// <summary>
/// Segments a keyword using the PanGu analyzer.
/// </summary>
/// <param name="keyword">Text to segment.</param>
/// <returns>The segmented terms as an array.</returns>
public static string[] WordSegmentation(string keyword)
{
    Analyzer analyzer = new PanGuAnalyzer();
    TokenStream stream = analyzer.TokenStream("", new StringReader(keyword));
    var segments = new List<string>();
    // Next() yields one token at a time until the stream is exhausted.
    for (Lucene.Net.Analysis.Token t = stream.Next(); t != null; t = stream.Next())
    {
        segments.Add(t.TermText());
    }
    return segments.ToArray();
}
/// <summary>
/// Search button handler: tokenizes the textbox input with PanGu, queries the
/// "name" field of the index at c:/index, and shows each hit in a message box.
/// </summary>
private void button1_Click_1(object sender, EventArgs e) {
    string indexPath = "c:/index"; // NOTE(review): hard-coded index location
    string kw = textBox1.Text; // NOTE(review): kw is unused; textBox1.Text is read again below
    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NoLockFactory());
    IndexReader reader = IndexReader.Open(directory, true);
    IndexSearcher searcher = new IndexSearcher(reader);
    PhraseQuery query = new PhraseQuery();
    // Split the user's keywords with PanGu (replaces the old per-character split).
    List <String> list = new List <string>();
    Analyzer analyzer = new PanGuAnalyzer();
    TokenStream tokenStream = analyzer.TokenStream("", new StringReader(textBox1.Text));
    Token token = null;
    while ((token = tokenStream.Next()) != null) {
        list.Add(token.TermText());
    }
    // One query term per token, all against the "name" field.
    for (int i = 0; i < list.Count; i++) {
        query.Add(new Term("name", list[i].ToString()));
    }
    // Maximum distance allowed between the phrase terms.
    query.SetSlop(100);
    // TopScoreDocCollector is the container that receives the query results.
    TopScoreDocCollector collector = TopScoreDocCollector.create(100, true);
    searcher.Search(query, null, collector);
    ScoreDoc[] docs = collector.TopDocs(0, collector.GetTotalHits()).scoreDocs;
    for (int i = 0; i < docs.Length; i++) {
        int docId = docs[i].doc; // document id (primary key) assigned by Lucene.Net
        // Results only carry document ids; fetch the Document by id on demand
        // to reduce memory use.
        Document doc = searcher.Doc(docId); // load the Document by id
        string code = doc.Get("code");
        string name = doc.Get("name");
        MessageBox.Show("code:" + code + "name:" + name);
    }
}
/// <summary>
/// PanGu segmentation: joins the tokens of <paramref name="words"/> with " | "
/// (a trailing separator is kept, matching the original output format).
/// </summary>
/// <param name="words">Text to segment.</param>
/// <returns>The joined token string.</returns>
public static object PanGu(string words)
{
    Analyzer analyzer = new PanGuAnalyzer();
    TokenStream tokenStream = analyzer.TokenStream("", new StringReader(words));
    // Use StringBuilder instead of repeated string concatenation, which was
    // O(n^2) in the number of tokens. Output is byte-identical.
    StringBuilder sb = new StringBuilder();
    Lucene.Net.Analysis.Token token;
    while ((token = tokenStream.Next()) != null)
    {
        sb.Append(token.TermText()).Append(" | ");
    }
    return sb.ToString();
}
/// <summary>
/// Splits <paramref name="msg"/> into PanGu terms.
/// </summary>
/// <param name="msg">Text to segment.</param>
/// <returns>The segmented terms, in stream order.</returns>
public static List<string> PanGuSplitWord(string msg)
{
    var words = new List<string>();
    Analyzer analyzer = new PanGuAnalyzer();
    TokenStream stream = analyzer.TokenStream("", new StringReader(msg));
    // The attribute instance is reused; each IncrementToken() refreshes Term.
    ITermAttribute termAttr = stream.GetAttribute<ITermAttribute>();
    while (stream.IncrementToken())
    {
        words.Add(termAttr.Term);
    }
    return words;
}
/// <summary>
/// Demo handler: prints each PanGu token of a fixed sentence, wrapped in
/// brackets, to the console.
/// </summary>
private void button2_Click(object sender, EventArgs e)
{
    Analyzer analyzer = new PanGuAnalyzer();
    TokenStream ts = analyzer.TokenStream("", new StringReader("南通大学,欢迎您"));
    Console.WriteLine("=====PanGu analyzer=======");
    // The attribute instance is reused; each IncrementToken() refreshes Term.
    ITermAttribute termAtt = ts.GetAttribute<ITermAttribute>();
    while (ts.IncrementToken())
    {
        Console.WriteLine("[" + termAtt.Term + "]");
    }
}
/// <summary>
/// Splits <paramref name="key"/> into PanGu tokens.
/// </summary>
/// <param name="key">Text to segment.</param>
/// <returns>The segmented terms as an array.</returns>
public static string[] PanGuSplit(string key)
{
    var tokens = new List<string>();
    Analyzer analyzer = new PanGuAnalyzer();
    TokenStream stream = analyzer.TokenStream("", new StringReader(key));
    // Next() yields one token at a time until the stream is exhausted.
    for (Lucene.Net.Analysis.Token t = stream.Next(); t != null; t = stream.Next())
    {
        tokens.Add(t.TermText());
    }
    return tokens.ToArray();
}
/// <summary>
/// PanGu segmentation of <paramref name="msg"/>.
/// </summary>
/// <param name="msg">Text to segment.</param>
/// <returns>The segmented terms, in stream order.</returns>
public static List<string> PanGuSplitWord(string msg)
{
    Analyzer analyzer = new PanGuAnalyzer();
    TokenStream tokenStream = analyzer.TokenStream("", new StringReader(msg));
    List<string> list = new List<string>();
    Lucene.Net.Analysis.Tokenattributes.ITermAttribute ita;
    while (tokenStream.IncrementToken())
    {
        ita = tokenStream.GetAttribute<Lucene.Net.Analysis.Tokenattributes.ITermAttribute>();
        // BUG FIX: the original called list.Append(...) — LINQ's
        // Enumerable.Append — and discarded the result, so the returned
        // list was always empty. List<T>.Add mutates the list in place.
        list.Add(ita.Term);
    }
    return list;
}
/// <summary>
/// Tokenizes <paramref name="keyword"/> and returns the segmentation result.
/// </summary>
/// <param name="keyword">Text to segment.</param>
/// <returns>The segmented terms, in stream order.</returns>
public static IEnumerable<string> SplitWords(string keyword)
{
    IList<string> words = new List<string>();
    Analyzer analyzer = new PanGuAnalyzer();
    TokenStream stream = analyzer.TokenStream("", new StringReader(keyword));
    // Next() returns the next token or null when exhausted; TermText() is
    // the text of the current token.
    for (Token t = stream.Next(); t != null; t = stream.Next())
    {
        words.Add(t.TermText());
    }
    return words;
}
/// <summary>
/// Tokenizes <paramref name="str"/> for indexing with the PanGu analyzer.
/// </summary>
/// <param name="str">Text to segment.</param>
/// <returns>The segmented terms as an array.</returns>
public static string[] SqlitIndexWord(string str)
{
    List<string> list = new List<string>();
    Analyzer analyzer = new PanGuAnalyzer();
    TokenStream tokenStream = analyzer.TokenStream("", new StringReader(str));
    Lucene.Net.Analysis.Token token;
    while ((token = tokenStream.Next()) != null)
    {
        // Removed a leftover Console.WriteLine debug call that printed every
        // token from this library method.
        list.Add(token.TermText());
    }
    return list.ToArray();
}
/// <summary>
/// Tokenizes the user's search input with the PanGu analyzer.
/// </summary>
/// <param name="str">Text to segment.</param>
/// <returns>The segmented terms, in stream order.</returns>
public static List<string> GetPanGuWord(string str)
{
    Analyzer analyzer = new PanGuAnalyzer();
    TokenStream stream = analyzer.TokenStream("", new StringReader(str));
    var words = new List<string>();
    // The attribute instance is reused across IncrementToken() calls.
    ITermAttribute attr = stream.GetAttribute<ITermAttribute>();
    while (stream.IncrementToken())
    {
        words.Add(attr.Term);
    }
    return words;
}
/// <summary>
/// Splits <paramref name="kw"/> into PanGu terms, disposing the token stream
/// when done.
/// </summary>
/// <param name="kw">Text to segment.</param>
/// <returns>The segmented terms, in stream order.</returns>
public static List<string> PanGuSplitWord(string kw)
{
    var result = new List<string>();
    Analyzer analyzer = new PanGuAnalyzer();
    using (TokenStream stream = analyzer.TokenStream("", new StringReader(kw)))
    {
        // IncrementToken() advances the stream; the attribute holds the current term.
        while (stream.IncrementToken())
        {
            result.Add(stream.GetAttribute<ITermAttribute>().Term);
        }
    }
    return result;
}
/// <summary>
/// Tokenizes <paramref name="keywords"/> with PanGu and returns the terms
/// joined by single spaces (with a trailing space, as before).
/// </summary>
/// <param name="keywords">Text to segment.</param>
/// <returns>The space-separated terms.</returns>
public static string GetKeyWordSplid(string keywords)
{
    var sb = new StringBuilder();
    Analyzer analyzer = new PanGuAnalyzer();
    TokenStream stream = analyzer.TokenStream(keywords, new StringReader(keywords));
    // IncrementToken() advances the stream; the attribute holds the current term.
    while (stream.IncrementToken())
    {
        sb.Append(stream.GetAttribute<ITermAttribute>().Term).Append(' ');
    }
    return sb.ToString();
}
/// <summary>
/// Tokenizes <paramref name="msg"/> with the PanGu analyzer.
/// </summary>
/// <param name="msg">Text to segment.</param>
/// <returns>The segmented terms, in stream order.</returns>
public static IEnumerable<string> SplitWords(string msg)
{
    var words = new List<string>();
    Analyzer analyzer = new PanGuAnalyzer();
    TokenStream stream = analyzer.TokenStream("", new StringReader(msg));
    // Next() fetches the next segmented token until the stream is exhausted.
    for (Lucene.Net.Analysis.Token t = stream.Next(); t != null; t = stream.Next())
    {
        words.Add(t.TermText());
    }
    return words;
}