/// <summary>
/// Returns a 32-bit float decoded from the token's payload, or 1f if the payload is null.
/// </summary>
/// <param name="token"></param>
/// <returns></returns>
public override float GetWeight(Token token)
{
    if (token.GetPayload() == null || token.GetPayload().GetData() == null)
        return 1f;

    return PayloadHelper.DecodeFloat(token.GetPayload().GetData());
}
/// <summary>
/// Gets the string query.
/// </summary>
/// <param name="searchCriteria">The search criteria.</param>
/// <param name="matchCondition">The match condition.</param>
private void GetStringQuery(Criteria searchCriteria, MatchCondition matchCondition)
{
    try
    {
        Lucene.Net.Analysis.Token token = null;
        Lucene.Net.Analysis.Token token2 = null;
        TokenStream stream = _analyzer.TokenStream(searchCriteria.Field.ToString(), new StringReader(searchCriteria.Value));
        do
        {
            token2 = token;
            token = stream.Next();
            if (token2 != null)
            {
                string stoken = token2.TermText();
                BooleanQuery outputQuery = new BooleanQuery();
                this.TokenToQuery(searchCriteria.Field.ToString(), stoken, searchCriteria.Condition.ToString(), ref outputQuery);
                if (matchCondition == MatchCondition.MatchAll)
                {
                    _tempQuery.Add(outputQuery, BooleanClause.Occur.MUST);
                }
                else
                {
                    _tempQuery.Add(outputQuery, BooleanClause.Occur.SHOULD);
                }
            }
        } while (token != null);
    }
    catch (Exception ex)
    {
        // Parenthesize the conditional: without the parentheses the string concatenation binds first,
        // so the null check never fires and a null InnerException would itself throw.
        _logger.Error("Error while creating String Query :" + (ex.InnerException == null ? ex.Message : ex.InnerException.Message));
    }
}
public override TokenPositioner GetTokenPositioner(Token token) { return token.GetPositionIncrement() == 0 ? TokenPositioner.NewRow : TokenPositioner.NewColumn; }
public virtual void TestCtor() { Token t = new Token(); char[] content = "hello".ToCharArray(); t.SetTermBuffer(content, 0, content.Length); char[] buf = t.TermBuffer(); Assert.AreNotEqual(t.TermBuffer(), content); Assert.AreEqual("hello", t.Term); Assert.AreEqual("word", t.Type); Assert.AreEqual(0, t.Flags); t = new Token(6, 22); t.SetTermBuffer(content, 0, content.Length); Assert.AreEqual("hello", t.Term); Assert.AreEqual("(hello,6,22)", t.ToString()); Assert.AreEqual("word", t.Type); Assert.AreEqual(0, t.Flags); t = new Token(6, 22, 7); t.SetTermBuffer(content, 0, content.Length); Assert.AreEqual("hello", t.Term); Assert.AreEqual("(hello,6,22)", t.ToString()); Assert.AreEqual(7, t.Flags); t = new Token(6, 22, "junk"); t.SetTermBuffer(content, 0, content.Length); Assert.AreEqual("hello", t.Term); Assert.AreEqual("(hello,6,22,type=junk)", t.ToString()); Assert.AreEqual(0, t.Flags); }
public override Token Next(/* in */ Token reusableToken)
{
    System.Diagnostics.Debug.Assert(reusableToken != null);
    Token nextToken = input.Next(reusableToken);
    if (nextToken != null)
    {
        char[] buffer = nextToken.TermBuffer();
        int length = nextToken.TermLength();

        // If no characters actually require rewriting then we
        // just return the token as-is:
        for (int i = 0; i < length; i++)
        {
            char c = buffer[i];
            if (c >= '\u00c0' && c <= '\ufb06')
            {
                RemoveAccents(buffer, length);
                nextToken.SetTermBuffer(output, 0, outputPos);
                break;
            }
        }
        return nextToken;
    }
    else
        return null;
}
private void ProcessEmailToken(Lucene.Net.Analysis.Token token)
{
    token_type = tokentype_email;

    string email = token.TermText();
    parts = email.Split(replace_array);
    if (parts.Length == 1) // safety check
    {
        return;
    }

    int index_at = email.IndexOf('@');

    // store username part as a large token
    // and also remove the final tld part
    Array.Copy(parts, 0, parts, 1, parts.Length - 1);
    parts[0] = email.Substring(0, index_at);

#if ENABLE_RDF_ADAPTER
    if (link_call_back != null)
    {
        link_call_back("mailto://" + email, true);
    }
#endif
}
public override void Add(Token t) { if (t != null && t.Term().ToUpper().Equals("The".ToUpper())) { base.Add(t); } }
public override Token Next(Token reusableToken) { System.Diagnostics.Debug.Assert(reusableToken != null); Token nextToken = input.Next(reusableToken); sink.Add(nextToken); return nextToken; }
public static Token NextToken(TokenStream input, Token reusableToken) { if (input == null) return null; if (!input.IncrementToken()) return null; ITermAttribute termAtt = input.GetAttribute<ITermAttribute>(); IOffsetAttribute offsetAtt = input.GetAttribute<IOffsetAttribute>(); ITypeAttribute typeAtt = input.GetAttribute<ITypeAttribute>(); if (reusableToken == null) { reusableToken = new Token(); } reusableToken.Clear(); if (termAtt != null) reusableToken.SetTermBuffer(termAtt.TermBuffer(), 0, termAtt.TermLength()); if (offsetAtt != null) { reusableToken.StartOffset = offsetAtt.StartOffset; reusableToken.EndOffset = offsetAtt.EndOffset; } if (typeAtt != null) reusableToken.Type = typeAtt.Type; return reusableToken; }
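// A minimal usage sketch (not part of the original source) showing how the NextToken helper
// above can drain an attribute-based TokenStream into a reusable Token. The field name
// "contents" and the sample text are illustrative assumptions; this assumes a Lucene.Net 3.x
// API where Token exposes Term/StartOffset/EndOffset as properties.
TokenStream stream = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30)
    .TokenStream("contents", new StringReader("hello token world"));
Token reusable = new Token();
for (Token t = NextToken(stream, reusable); t != null; t = NextToken(stream, reusable))
{
    Console.WriteLine("{0} [{1},{2}]", t.Term, t.StartOffset, t.EndOffset);
}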
/// <summary>Returns the next token in the stream, or null at EOS.
/// <p>Removes <tt>'s</tt> from the end of words.
/// <p>Removes dots from acronyms.
/// </summary>
public override Lucene.Net.Analysis.Token Next()
{
    Lucene.Net.Analysis.Token t = input.Next();
    if (t == null)
    {
        return null;
    }

    System.String text = t.TermText();
    System.String type = t.Type();

    if (type == APOSTROPHE_TYPE && (text.EndsWith("'s") || text.EndsWith("'S")))
    {
        return new Lucene.Net.Analysis.Token(text.Substring(0, text.Length - 2), t.StartOffset(), t.EndOffset(), type);
    }
    else if (type == ACRONYM_TYPE)
    {
        // remove dots
        System.Text.StringBuilder trimmed = new System.Text.StringBuilder();
        for (int i = 0; i < text.Length; i++)
        {
            char c = text[i];
            if (c != '.')
            {
                trimmed.Append(c);
            }
        }
        return new Lucene.Net.Analysis.Token(trimmed.ToString(), t.StartOffset(), t.EndOffset(), type);
    }
    else
    {
        return t;
    }
}
public void SnowballAnalyzer()
{
    // The algorithm is language-specific and uses stemming. Stemming algorithms attempt to reduce a word to a common root form.
    string text = "building build builds builded";
    string output = "Analyzing '" + text + "', generated the tokens: ";
    Dictionary<string, string> tokensFound = new Dictionary<string, string>();

    // Do the analysis
    Analyzer analyzer = new SnowballAnalyzer("English", StandardAnalyzer.STOP_WORDS);
    TokenStream stream = analyzer.TokenStream("contents", new StringReader(text));
    while (true)
    {
        Token token = stream.Next();
        if (token == null)
        {
            break;
        }

        // Append only unique tokens
        if (!tokensFound.ContainsKey(token.TermText()))
        {
            tokensFound[token.TermText()] = token.TermText();
            output += "[" + token.TermText() + "] ";
        }
    }

    log.Debug(output);
    Assert.AreEqual(1, tokensFound.Count);
}
private Query GetParsedQuerywc(string text) { BooleanQuery query = new BooleanQuery(); BooleanQuery.SetMaxClauseCount(0x2710); if (text.Length > 0) { BooleanQuery query2 = new BooleanQuery(); QueryParser parser = new QueryParser("UserType", analyzer); query2.Add(parser.Parse("Users"), BooleanClause.Occur.SHOULD); query.Add(query2, BooleanClause.Occur.MUST); } Lucene.Net.Analysis.Token token = null; Lucene.Net.Analysis.Token token2 = null; TokenStream stream = analyzer.TokenStream("UserType", new StringReader(text)); do { token2 = token; token = stream.Next(); if (token2 != null) { string stoken = token2.TermText(); BooleanQuery outputQuery = new BooleanQuery(); this.TokenToQuery("Name", stoken, ref outputQuery); query.Add(outputQuery, BooleanClause.Occur.MUST); } }while (token != null); return(query); }
/// <summary>
/// Stores a 32-bit float in the payload, or sets the payload to null if the weight is 1f.
/// </summary>
/// <param name="token"></param>
/// <param name="weight"></param>
public override void SetWeight(Token token, float weight)
{
    token.SetPayload(
        weight == 1f ? null : new Payload(PayloadHelper.EncodeFloat(weight))
    );
}
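// A minimal round-trip sketch (not from the original source) showing how the SetWeight/GetWeight
// pair above encodes a weight into a token payload and reads it back. "codec" is a hypothetical
// instance of the class declaring these overrides; Payload, PayloadHelper, SetPayload and
// GetPayload are used exactly as in the methods above.
Token token = new Token();
codec.SetWeight(token, 0.25f);                 // payload now holds EncodeFloat(0.25f)
float w = codec.GetWeight(token);              // 0.25f
codec.SetWeight(token, 1f);                    // a weight of 1f is stored as a null payload
float defaultWeight = codec.GetWeight(token);  // 1f, because the payload is null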
public CutLeterDigitFilter(TokenStream input) : base(input) { reusableToken = new Token(); termAtt = AddAttribute<ITermAttribute>(); offsetAtt = AddAttribute<IOffsetAttribute>(); typeAtt = AddAttribute<ITypeAttribute>(); }
private List<BookSearchModel> SearchBookContent(string searchWords)
{
    List<BookSearchModel> bookSearchModelList = new List<BookSearchModel>();

    // 1. Tokenize the search terms
    Analyzer analyzer = new PanGuAnalyzer();
    TokenStream tokenStream = analyzer.TokenStream("", new StringReader(searchWords));
    Lucene.Net.Analysis.Token token = null;

    string indexPath = @"D:\lucenedir";
    //string kw = "面向对象"; // split the search terms entered by the user
    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NoLockFactory());
    IndexReader reader = IndexReader.Open(directory, true);
    IndexSearcher searcher = new IndexSearcher(reader);

    // Search conditions
    PhraseQuery query = new PhraseQuery();
    //foreach (string word in kw.Split(' ')) // let the user split on spaces; each space-separated piece is a term, e.g. "计算机 专业"
    //{
    //    query.Add(new Term("body", word));
    //}
    //query.Add(new Term("body", "语言")); // query terms can be added like this; they combine via Add and order does not matter
    //query.Add(new Term("body", "大学生"));
    while ((token = tokenStream.Next()) != null)
    {
        query.Add(new Term("body", token.TermText()));
    }
    //query.Add(new Term("body", kw)); // articles whose body contains kw
    query.SetSlop(100); // maximum distance allowed between the query terms; if they are too far apart in the text the match is meaningless (e.g. if "大学生" and "简历" are separated by too many words the hit is not useful)

    // TopScoreDocCollector is the container that holds the query results
    TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
    searcher.Search(query, null, collector); // run the query and put the results into the collector
    ScoreDoc[] docs = collector.TopDocs(0, collector.GetTotalHits()).scoreDocs; // get the documents for every hit; GetTotalHits() is the total count. TopDocs(300, 20) would return documents 300 to 320, which can be used to implement paging.

    for (int i = 0; i < docs.Length; i++)
    {
        // ScoreDoc[] only exposes document ids, so the matching Documents are not all loaded into memory at once.
        // This keeps memory pressure low; when the full content is needed, searcher.Doc loads the Document by its id.
        int docId = docs[i].doc;              // id of the result document (assigned internally by Lucene)
        Document doc = searcher.Doc(docId);   // load the full document for this id
        BookSearchModel searchModel = new BookSearchModel();
        searchModel.Id = int.Parse(doc.Get("ID"));
        searchModel.Title = doc.Get("title");
        searchModel.ContenDescription = SearchWordHighlight.CreateHightLight(searchWords, doc.Get("body"));
        //this.listBox1.Items.Add(doc.Get("number") + "\n"); // read back the value stored in the field
        //this.listBox1.Items.Add(doc.Get("body") + "\n");
        //this.listBox1.Items.Add("-----------------------\n");
        bookSearchModelList.Add(searchModel);
    }

    // Record this search in the search-history store
    SearchDetails entity = new SearchDetails() { Id = Guid.NewGuid(), KeyWords = searchWords, SearchDateTime = DateTime.Now };
    SearchDetailsService.AddEntity(entity);

    return bookSearchModelList;
}
/// <summary>
/// Perform the search
/// </summary>
/// <returns></returns>
public ActionResult Search()
{
    string kw = Request["kw"];                          // search text entered by the user
    string indexPath = Server.MapPath("~/lucenedir");   // where to search (the index directory)

    // Split the user's input into terms
    List<string> kws = new List<string>();              // collection that stores the resulting terms
    Analyzer analyzer = new PanGuAnalyzer();
    TokenStream tokenStream = analyzer.TokenStream("", new StringReader(kw.ToString()));
    Lucene.Net.Analysis.Token token = null;
    while ((token = tokenStream.Next()) != null)
    {
        kws.Add(token.TermText());
    }

    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NoLockFactory());
    IndexReader reader = IndexReader.Open(directory, true);
    IndexSearcher searcher = new IndexSearcher(reader);

    // Search conditions
    // Note: this class can only search a single field; to search on multiple conditions another class is needed
    PhraseQuery query = new PhraseQuery();
    foreach (var word in kws)
    {
        query.Add(new Term("content", word));   // search against the "content" field
    }
    query.SetSlop(100); // maximum distance allowed between the query terms; if they are too far apart in the text the match is meaningless (e.g. if "大学生" and "简历" are separated by too many words the hit is not useful)

    // TopScoreDocCollector is the container that holds the query results
    TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
    searcher.Search(query, null, collector); // run the query and put the results into the collector
    ScoreDoc[] docs = collector.TopDocs(0, collector.GetTotalHits()).scoreDocs; // get the documents for every hit; GetTotalHits() is the total count. TopDocs(300, 20) would return documents 300 to 320.

    // List that stores the search results
    List<BookVieModel> bookList = new List<BookVieModel>();
    for (int i = 0; i < docs.Length; i++)
    {
        // ScoreDoc[] only exposes document ids, so the matching Documents are not all loaded into memory at once.
        // This keeps memory pressure low; when the full content is needed, searcher.Doc loads the Document by its id.
        int docId = docs[i].doc;              // id of the result document (assigned internally by Lucene)
        Document doc = searcher.Doc(docId);   // load the full document for this id
        BookVieModel model = new BookVieModel();
        model.Id = Convert.ToInt32(doc.Get("Id"));                 // Note: these field names must match the ones used when the index was built
        model.Title = CreateHightLight(kw, doc.Get("title"));      // highlight the search terms in the result
        model.Content = CreateHightLight(kw, doc.Get("content"));  // Note: these field names must match the ones used when the index was built
        bookList.Add(model);
    }

    ViewBag.books = bookList;
    ViewBag.kw = kw;
    return View("Index");
}
void AddToken(Token oriToken, int termBufferOffset, int termBufferLength, byte type) { Token token = new Token(oriToken.TermBuffer(), termBufferOffset, termBufferLength, oriToken.StartOffset + termBufferOffset, oriToken.StartOffset + termBufferOffset + termBufferLength); if (type == (byte)UnicodeCategory.DecimalDigitNumber) token.Type = Word.TYPE_DIGIT; else token.Type = Word.TYPE_LETTER; tokenQueue.Enqueue(token); }
public override Token Next() { Token t = this.input.Next(); if (t != null) { t = new Token(_replaceDiacritics(t.TermText()), t.StartOffset(), t.EndOffset()/*, "DiacriticFiltered"*/); } return t; }
public virtual void TestToString() { char[] b = new char[]{'a', 'l', 'o', 'h', 'a'}; Token t = new Token("", 0, 5); t.SetTermBuffer(b, 0, 5); Assert.AreEqual("(aloha,0,5)", t.ToString()); t.SetTermText("hi there"); Assert.AreEqual("(hi there,0,5)", t.ToString()); }
private void button3_Click(object sender, EventArgs e) { Analyzer analyzer = new PanGuAnalyzer(); TokenStream tokenStream = analyzer.TokenStream("", new StringReader("面向世界,面向现代化")); Lucene.Net.Analysis.Token token = null; while ((token = tokenStream.Next()) != null) { Console.WriteLine(token.TermText()); } }
/// <summary>Returns the next token in the stream, or null at EOS.
/// This variant passes tokens through unchanged.
/// </summary>
public override Lucene.Net.Analysis.Token Next()
{
    Lucene.Net.Analysis.Token t = input.Next();
    if (t == null)
    {
        return null;
    }
    return t;
}
/// <summary>
/// Unigram (single-character) segmentation
/// </summary>
/// <param name="sender"></param>
/// <param name="e"></param>
private void button1_Click(object sender, EventArgs e)
{
    Analyzer analyzer = new StandardAnalyzer();
    TokenStream tokenStream = analyzer.TokenStream("", new StringReader("北京,Hi欢迎你们大家"));
    Lucene.Net.Analysis.Token token = null;
    while ((token = tokenStream.Next()) != null)
    {
        Console.WriteLine(token.TermText());
    }
}
public override Token Next(Token result) { result = input.Next(result); if (result != null) { if (stemmer.Stem(result.TermBuffer(), 0, result.termLength)) result.SetTermBuffer(stemmer.GetResultBuffer(), 0, stemmer.GetResultLength()); return result; } else return null; }
public virtual void TestResize() { Token t = new Token(); char[] content = "hello".ToCharArray(); t.SetTermBuffer(content, 0, content.Length); for (int i = 0; i < 2000; i++) { t.ResizeTermBuffer(i); Assert.IsTrue(i <= t.TermBuffer().Length); Assert.AreEqual("hello", t.Term); } }
public override Token Next(/* in */ Token reusableToken) { System.Diagnostics.Debug.Assert(reusableToken != null); Token nextToken = input.Next(reusableToken); if (nextToken == null) return null; if (stemmer.Stem(nextToken.TermBuffer(), 0, nextToken.TermLength())) nextToken.SetTermBuffer(stemmer.GetResultBuffer(), 0, stemmer.GetResultLength()); return nextToken; }
public override Token Next(/* in */ Token reusableToken)
{
    System.Diagnostics.Debug.Assert(reusableToken != null);
    reusableToken.Clear();
    int length = 0;
    int start = bufferIndex;
    char[] buffer = reusableToken.TermBuffer();
    while (true)
    {
        if (bufferIndex >= dataLen)
        {
            offset += dataLen;
            dataLen = input is Lucene.Net.Index.ReusableStringReader
                ? ((Lucene.Net.Index.ReusableStringReader) input).Read(ioBuffer)
                : input.Read((System.Char[]) ioBuffer, 0, ioBuffer.Length);
            if (dataLen <= 0)
            {
                if (length > 0)
                    break;
                else
                    return null;
            }
            bufferIndex = 0;
        }

        char c = ioBuffer[bufferIndex++];

        if (IsTokenChar(c))                  // if it's a token char
        {
            if (length == 0)                 // start of token
                start = offset + bufferIndex - 1;
            else if (length == buffer.Length)
                buffer = reusableToken.ResizeTermBuffer(1 + length);

            buffer[length++] = Normalize(c); // buffer it, normalized

            if (length == MAX_WORD_LEN)      // buffer overflow!
                break;
        }
        else if (length > 0)                 // at non-Letter w/ chars
            break;                           // return 'em
    }

    reusableToken.SetTermLength(length);
    reusableToken.SetStartOffset(start);
    reusableToken.SetEndOffset(start + length);
    return reusableToken;
}
private string[] SplitWords(string content)
{
    List<string> strList = new List<string>();
    Analyzer analyzer = new PanGuAnalyzer(); // use the PanGu (盘古) segmentation algorithm
    TokenStream tokenStream = analyzer.TokenStream("", new StringReader(content));
    Lucene.Net.Analysis.Token token = null;
    while ((token = tokenStream.Next()) != null)
    {
        // Next() keeps segmenting until it returns null
        strList.Add(token.TermText()); // collect the segmented terms
    }
    return strList.ToArray();
}
public SingleTokenTokenStream(Token token)
{
    Debug.Assert(token != null, "Token was null!");
    _singleToken = (Token) token.Clone();

    // ReSharper disable DoNotCallOverridableMethodsInConstructor
    _tokenAtt = (AttributeImpl) AddAttribute(typeof (TermAttribute));
    // ReSharper restore DoNotCallOverridableMethodsInConstructor

    Debug.Assert(_tokenAtt is Token || _tokenAtt.GetType().Name.Equals(typeof (TokenWrapper).Name),
                 "Token Attribute is the wrong type! Type was: " + _tokenAtt.GetType().Name +
                 " but expected " + typeof (TokenWrapper).Name);
}
/// <summary>
/// PanGu word segmentation
/// </summary>
/// <param name="msg">the string to split</param>
/// <returns>the segmentation result</returns>
public static List<string> PanguSplitWords(string msg)
{
    List<string> list = new List<string>();
    Analyzer analyzer = new PanGuAnalyzer();
    TokenStream tokenStream = analyzer.TokenStream("", new StringReader(msg));
    Lucene.Net.Analysis.Token token = null;
    while ((token = tokenStream.Next()) != null)
    {
        list.Add(token.TermText());
    }
    return list;
}
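// A small usage sketch (not from the original source). The exact terms returned depend on the
// PanGu dictionary deployed with the application, so the terms in the comment are only indicative.
List<string> terms = PanguSplitWords("面向世界,面向现代化");
foreach (string term in terms)
{
    Console.WriteLine(term); // e.g. "面向", "世界", "面向", "现代化"
}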
/// <summary>
/// Tokenize the search terms entered by the user
/// </summary>
/// <param name="str"></param>
/// <returns></returns>
private static string[] SplitWord(string str)
{
    List<string> list = new List<string>();
    Analyzer analyzer = new PanGuAnalyzer(); // use PanGu segmentation
    TokenStream tokenStream = analyzer.TokenStream("", new StringReader(str));
    Lucene.Net.Analysis.Token token = null;
    while ((token = tokenStream.Next()) != null)
    {
        list.Add(token.TermText());
    }
    return list.ToArray();
}
public override Token Next(Token reusableToken)
{
    System.Diagnostics.Debug.Assert(reusableToken != null);
    if (iter == null)
        iter = lst.GetEnumerator();

    // Since this TokenStream can be reset we have to maintain the tokens as immutable
    if (iter.MoveNext())
    {
        Token nextToken = iter.Current;
        return (Token) nextToken.Clone();
    }
    return null;
}
/// <summary>
/// </summary>
/// <param name="token"></param>
/// <returns>the token's flags value interpreted as a TokenPositioner</returns>
public override TokenPositioner GetTokenPositioner(Token token)
{
    switch (token.GetFlags())
    {
        case 0:
            return TokenPositioner.NewColumn;
        case 1:
            return TokenPositioner.NewRow;
        case 2:
            return TokenPositioner.SameRow;
    }
    throw new IOException("Unknown matrix positioning of token " + token);
}
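// A tiny illustrative sketch (not from the original source): the positioner above reads the
// token's flags, so a producing filter marks a token for a new row like this. SetFlags is the
// flags setter that pairs with the GetFlags call used above; the value 1 maps to NewRow in the switch.
Token shingle = new Token(6, 22);
shingle.SetFlags(1); // 1 => TokenPositioner.NewRow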
public override Token Next(Token token)
{
    token.Clear();
    int length = 0;
    int start = bufferIndex;
    char[] buffer = token.TermBuffer();
    while (true)
    {
        if (bufferIndex >= dataLen)
        {
            offset += dataLen;
            dataLen = input is Lucene.Net.Index.DocumentsWriter.ReusableStringReader
                ? ((Lucene.Net.Index.DocumentsWriter.ReusableStringReader) input).Read(ioBuffer)
                : input.Read((System.Char[]) ioBuffer, 0, ioBuffer.Length);
            if (dataLen <= 0)
            {
                if (length > 0)
                    break;
                else
                    return null;
            }
            bufferIndex = 0;
        }

        char c = ioBuffer[bufferIndex++];

        if (IsTokenChar(c))                  // if it's a token char
        {
            if (length == 0)                 // start of token
                start = offset + bufferIndex - 1;
            else if (length == buffer.Length)
                buffer = token.ResizeTermBuffer(1 + length);

            buffer[length++] = Normalize(c); // buffer it, normalized

            if (length == MAX_WORD_LEN)      // buffer overflow!
                break;
        }
        else if (length > 0)                 // at non-Letter w/ chars
            break;                           // return 'em
    }

    token.termLength = length;
    token.startOffset = start;
    token.endOffset = start + length;
    return token;
}
public static void Analyze(TextReader reader) { Lucene.Net.Analysis.Token lastToken = null; Analyzer indexing_analyzer = new LuceneCommon.BeagleAnalyzer(true); TokenStream stream = indexing_analyzer.TokenStream("Text", reader); int position = 1; for (Lucene.Net.Analysis.Token t = stream.Next(); t != null; t = stream.Next()) { position += (t.GetPositionIncrement() - 1); Console.WriteLine(t); } }
/// <summary>
/// Tokenize a string
/// </summary>
/// <param name="str"></param>
/// <returns></returns>
public static List<string> GetPanGuWord(string str)
{
    Analyzer analyzer = new PanGuAnalyzer();
    TokenStream tokenStream = analyzer.TokenStream("", new StringReader(str));
    Lucene.Net.Analysis.Token token = null;
    List<string> list = new List<string>();
    while ((token = tokenStream.Next()) != null)
    {
        list.Add(token.TermText());
    }
    return list;
}
private string AnalyzerResult(string txtBody, Analyzer analyzer)
{
    TokenStream tokenStream = analyzer.TokenStream("", new StringReader(txtBody));
    Lucene.Net.Analysis.Token token = null;
    StringBuilder sb = new StringBuilder();

    // In newer versions (3.x+) .Next() has been deprecated
    while ((token = tokenStream.Next()) != null)
    {
        sb.Append(token.TermText() + "\r\n");
    }
    return sb.ToString();
}
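// A hedged sketch (not part of the original source) of the same loop written against the
// attribute-based API that replaces Next() in Lucene.Net 3.x; AddAttribute<ITermAttribute>()
// and IncrementToken() are used as in the other 3.x snippets in this collection. The method
// name AnalyzerResultV3 is illustrative.
private string AnalyzerResultV3(string txtBody, Analyzer analyzer)
{
    TokenStream tokenStream = analyzer.TokenStream("", new StringReader(txtBody));
    ITermAttribute termAtt = tokenStream.AddAttribute<ITermAttribute>();
    StringBuilder sb = new StringBuilder();
    while (tokenStream.IncrementToken())
    {
        sb.Append(termAtt.Term + "\r\n");
    }
    return sb.ToString();
}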
internal virtual void VerifyPayload(TokenStream ts)
{
    Token t = new Token();
    for (byte b = 1; ; b++)
    {
        t.Clear();
        t = ts.Next(t);
        if (t == null)
            break;
        // System.out.println("id="+System.identityHashCode(t) + " " + t);
        // System.out.println("payload=" + (int)t.getPayload().toByteArray()[0]);
        Assert.AreEqual(b, t.GetPayload().ToByteArray()[0]);
    }
}
/// <summary>
/// Use PanGu segmentation to split a keyword
/// </summary>
/// <param name="keyword"></param>
/// <returns></returns>
public static string[] WordSegmentation(string keyword)
{
    List<string> list = new List<string>();
    Analyzer analyzer = new PanGuAnalyzer();
    //Analyzer analyzer = new StandardAnalyzer();
    TokenStream tokenStream = analyzer.TokenStream("", new StringReader(keyword));
    Lucene.Net.Analysis.Token token = null;
    while ((token = tokenStream.Next()) != null)
    {
        list.Add(token.TermText());
    }
    return list.ToArray();
}
public override Token Next() { if (words == null) { words = MCSegment.Segment.Seg(this.input_text); } if (words == null || index >= words.Length) return null; string word = words[index++]; Token token = new Token(word, offset, offset + word.Length); offset += word.Length; return token; }
public override Lucene.Net.Analysis.Token Next()
{
    if (parts != null)
    {
        if (++parts_index < parts.Length)
        {
            string part = parts[parts_index];
            Lucene.Net.Analysis.Token part_token;

            // FIXME: Searching for google.com will not match www.google.com.
            // If we decide to allow google-style "abcd.1234" which means
            // "abcd 1234" as a consecutive phrase, then adjusting
            // the startOffset and endOffset would enable matching
            // google.com to www.google.com
            int start_offset = (parts_index == 0 && token_type == tokentype_email
                                ? 0
                                : last_end_offset + 1); // assuming only one separator
            int end_offset = start_offset + part.Length;
            part_token = new Lucene.Net.Analysis.Token(part, start_offset, end_offset, token_type);
            part_token.SetPositionIncrement(0);
            last_end_offset = (parts_index == 0 && token_type == tokentype_email
                               ? -1
                               : end_offset); // assuming only one separator
            return part_token;
        }
        else
        {
            // clear the array
            parts = null;
            parts_index = -1;
            last_end_offset = -1;
            token_type = null;
        }
    }

    Token token;
    while ((token = token_stream.Next()) != null)
    {
        //Console.WriteLine ("Found token: [{0}]", token.TermText ());
        if (ProcessToken(ref token))
        {
            return token;
        }
    }
    return null;
}
/// <summary> Returns the next input Token whose termText() is the right len</summary>
public override Token Next(Token result)
{
    // return the first token of acceptable length
    for (Token token = input.Next(result); token != null; token = input.Next(result))
    {
        int len = token.TermText().Length;
        if (len >= min && len <= max)
        {
            return token;
        }
        // note: else we ignore it but should we index each part of it?
    }
    // reached EOS -- return null
    return null;
}
/// <summary>
/// PanGu word segmentation
/// </summary>
/// <param name="words"></param>
/// <returns></returns>
public static object PanGu(string words)
{
    Analyzer analyzer = new PanGuAnalyzer();
    TokenStream tokenStream = analyzer.TokenStream("", new StringReader(words));
    Lucene.Net.Analysis.Token token = null;
    var str = "";
    while ((token = tokenStream.Next()) != null)
    {
        string word = token.TermText(); // TermText() returns the current term
        str += word + " | ";
    }
    return str;
}
public static string[] PanGuSplit(string key)
{
    Analyzer analyzer = new PanGuAnalyzer();
    TokenStream tokenStream = analyzer.TokenStream("", new StringReader(key));
    Lucene.Net.Analysis.Token token = null;
    List<string> list = new List<string>();
    while ((token = tokenStream.Next()) != null)
    {
        //Console.WriteLine(token.TermText());
        list.Add(token.TermText());
    }
    return list.ToArray();
}
public virtual bool IsNewFragment(Token token) { char kar1 = this.text[token.StartOffset() - 2]; char kar2 = this.text[token.StartOffset() - 3]; char kar3 = this.text[token.StartOffset() - 4]; bool isNewFrag= ((token.EndOffset()>=(fragmentSize*(currentNumFrags - 1) + (fragmentSize/2))&& (isCriticalChar(kar1) || isCriticalChar(kar2) || isCriticalChar(kar3))) || (token.EndOffset()>=(fragmentSize*currentNumFrags))); if(isNewFrag) { currentNumFrags++; } return isNewFrag; }
public override bool IncrementToken()
{
    if (tokenList != null)
    {
        index++;
        if (index < tokenList.Count)
        {
            termAtt.SetTermBuffer(tokenList[index].TermBuffer(), 0, tokenList[index].TermLength());
            termOff.SetOffset(tokenList[index].StartOffset, tokenList[index].EndOffset);
            return true;
        }
        tokenList = null;
        return false;
    }

    tokenList = new List<Token>();

    // First cache result
    while (input.IncrementToken())
    {
        Token newToken = new Token(termAtt.Term, termOff.StartOffset, termOff.EndOffset);
        foreach (Token token in tokenList)
        {
            if (token.StartOffset == newToken.StartOffset && token.Term == newToken.Term)
            {
                token.SetOffset(newToken.StartOffset, newToken.EndOffset);
                newToken = null;
                break;
            }
        } // foreach

        if (newToken != null)
        {
            tokenList.Add(newToken);
        }
    } // while

    // now output the tokens!
    if (tokenList.Count > 0)
    {
        index = 0;
        termAtt.SetTermBuffer(tokenList[index].TermBuffer(), 0, tokenList[index].TermLength());
        termOff.SetOffset(tokenList[index].StartOffset, tokenList[index].EndOffset);
        return true;
    }
    return false;
}
public override Token Next(Token reusableToken) { Token nextToken = input.Next(reusableToken); if (nextToken != null) { char[] buffer = nextToken.TermBuffer(); int length = nextToken.TermLength(); for (int i = 0; i < length; i++) buffer[i] = System.Char.ToLower(buffer[i]); return nextToken; } else return null; }
/// <summary>Returns the next token in the stream, or null at EOS.
/// @deprecated The returned Token is a "full private copy" (not
/// re-used across calls to next()) but will be slower
/// than calling {@link #Next(Token)} instead.
/// </summary>
public virtual Token Next()
{
    Token reusableToken = new Token();
    Token nextToken = Next(reusableToken);
    if (nextToken != null)
    {
        Payload p = nextToken.GetPayload();
        if (p != null)
        {
            nextToken.SetPayload((Payload) p.Clone());
        }
    }
    return nextToken;
}
public override Token Next(Token result) { result = input.Next(result); if (result != null) { char[] buffer = result.TermBuffer(); int length = result.termLength; for (int i = 0; i < length; i++) buffer[i] = System.Char.ToLower(buffer[i]); return result; } else return null; }
/// <summary> Returns the next input Token whose term() is the right len</summary>
public override Token Next(/* in */ Token reusableToken)
{
    System.Diagnostics.Debug.Assert(reusableToken != null);
    // return the first token of acceptable length
    for (Token nextToken = input.Next(reusableToken); nextToken != null; nextToken = input.Next(reusableToken))
    {
        int len = nextToken.TermLength();
        if (len >= min && len <= max)
        {
            return nextToken;
        }
        // note: else we ignore it but should we index each part of it?
    }
    // reached EOS -- return null
    return null;
}
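// A minimal sketch (not from the original source) of how a length filter like the one above is
// typically wired into an analysis chain. LengthFilter, LowerCaseFilter and WhitespaceTokenizer
// are the stock Lucene.Net classes; the min/max values 3 and 10 and the sample text are illustrative.
TokenStream chain = new LengthFilter(
    new LowerCaseFilter(
        new WhitespaceTokenizer(new StringReader("a tiny token stream example"))),
    3, 10);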
/// <summary>
/// Tokenize text for the index
/// </summary>
/// <param name="str"></param>
/// <returns></returns>
public static string[] SqlitIndexWord(string str)
{
    // PanGu segmentation
    // tokenize the search text that was passed in
    List<string> list = new List<string>();
    Analyzer analyzer = new PanGuAnalyzer();
    TokenStream tokenStream = analyzer.TokenStream("", new StringReader(str));
    Lucene.Net.Analysis.Token token = null;
    while ((token = tokenStream.Next()) != null)
    {
        Console.WriteLine(token.TermText());
        list.Add(token.TermText());
    }
    return list.ToArray();
}
public sealed override Boolean IncrementToken()
{
    ClearAttributes();
    Lucene.Net.Analysis.Token word = Next();
    if (word != null)
    {
        // use the term text; Token.ToString() would include the offsets, e.g. "(term,0,5)"
        var buffer = word.Term;
        termAtt.SetEmpty().Append(buffer);
        offsetAtt.SetOffset(CorrectOffset(word.StartOffset), CorrectOffset(word.EndOffset));
        typeAtt.Type = word.Type;
        return true;
    }
    End();
    this.Dispose();
    return false;
}
/// <summary>
/// Tokenization test
/// </summary>
/// <param name="keyword"></param>
/// <returns></returns>
public string Token(string keyword)
{
    string ret = "";
    System.IO.StringReader reader = new System.IO.StringReader(keyword);
    Lucene.Net.Analysis.TokenStream ts = analyzer.TokenStream(keyword, reader);
    Lucene.Net.Analysis.Token token = ts.Next();
    while (token != null)
    {
        ret += " " + token.TermText();
        token = ts.Next();
    }
    ts.CloneAttributes();
    reader.Close();
    analyzer.Close();
    return ret;
}
/// <summary>
/// Tokenize the given msg
/// </summary>
/// <param name="msg"></param>
/// <returns></returns>
public static IEnumerable<string> SplitWords(string msg)
{
    List<string> list = new List<string>();
    Analyzer analyzer = new PanGuAnalyzer();
    TokenStream tokenStream = analyzer.TokenStream("", new StringReader(msg));
    Lucene.Net.Analysis.Token token = null;
    // Next() returns the next segmented term
    while ((token = tokenStream.Next()) != null)
    {
        string word = token.TermText(); // the segmented term
        list.Add(word);
    }
    return list;
}
/// <summary>
/// </summary>
/// <returns>Returns the next token in the stream, or null at EOS</returns>
public override Token Next()
{
    if ((token = input.Next()) == null)
    {
        return null;
    }
    else
    {
        String s = stemmer.Stem(token.TermText());
        if (!s.Equals(token.TermText()))
        {
            return new Token(s, token.StartOffset(), token.EndOffset(), token.Type());
        }
        return token;
    }
}
public void Analyzers()
{
    string[] strings = new string[]
    {
        "The quick brown fox jumped over the lazy dogs",
        "XY&Z Corporation - [email protected]"
    };

    Analyzer[] analyzers = new Analyzer[]
    {
        new WhitespaceAnalyzer(),
        new SimpleAnalyzer(),
        new StopAnalyzer(),
        new StandardAnalyzer(),
        new SnowballAnalyzer("English", StopAnalyzer.ENGLISH_STOP_WORDS), // Same as EnglishAnalyzer
        new KeywordAnalyzer()
    };

    foreach (string text in strings)
    {
        log.Debug("Analyzing \"" + text + "\"");

        // Make each analyzer analyze the current string.
        foreach (Analyzer analyzer in analyzers)
        {
            StringBuilder analysisText = new StringBuilder();
            analysisText.AppendLine("\t" + analyzer.GetType().Name + ":");
            analysisText.Append("\t\t");

            TokenStream stream = analyzer.TokenStream("contents", new StringReader(text));
            while (true)
            {
                Token token = stream.Next();
                if (token == null)
                {
                    break;
                }
                analysisText.Append("[" + token.TermText() + "] ");
            }
            log.Debug(analysisText.ToString());
        }
    }
}
public override Token Next(Token token) { token.Clear(); if (start == 0) { length = input.Read((System.Char[])ioBuffer, 0, ioBuffer.Length); if (length <= 0) return null; } if (start == length) return null; token.SetTermBuffer(ioBuffer, start, 1); start++; token.termBuffer[0] = System.Char.ToLower(token.termBuffer[0]); return token; }
/// <summary>
/// Compare the tokenization produced by different Analyzers
/// </summary>
/// <param name="listAnalyzer"></param>
/// <param name="input"></param>
public static void TestAnalyzer(IList<Analyzer> listAnalyzer, string input)
{
    foreach (Analyzer analyzer in listAnalyzer)
    {
        Console.WriteLine(string.Format("{0}:", analyzer.ToString()));
        using (TextReader reader = new StringReader(input))
        {
            TokenStream stream = analyzer.ReusableTokenStream(string.Empty, reader);
            Lucene.Net.Analysis.Token token = null;
            while ((token = stream.Next()) != null)
            {
                Console.WriteLine(token.TermText());
            }
        }
        Console.WriteLine();
    }
}
public Lucene.Net.Analysis.Token Next()
{
    int length = 0;
    bool res = iter.MoveNext();
    Lucene.Net.Analysis.Token token;
    if (res)
    {
        JiebaNet.Segmenter.Token word = iter.Current;
        token = new Lucene.Net.Analysis.Token(word.Word, word.StartIndex, word.EndIndex);
        // Console.WriteLine("segment: " + word.Word + " start: " + word.StartIndex + " end: " + word.EndIndex);
        start += length;
        return token;
    }
    else
    {
        return null;
    }
}
private void ProcessURLToken(Lucene.Net.Analysis.Token token)
{
    token_type = tokentype_host;

    string hostname = token.TermText();
    parts = hostname.Split('.');

    if (parts[0] != "www")
    {
        return;
    }

    // remove initial www
    Array.Copy(parts, 1, parts, 0, parts.Length - 1);
    Array.Resize(ref parts, parts.Length - 1);

    // FIXME: Remove final tld
    // Any string of form "<alnum> '.')+<alnum>" has type HOST
    // Removing the last token might remove important words from a non-host
    // string of that form. To fix that, we need to match against the
    // huge list of TLDs.
}
public override Token Next(Token token) { token.Clear(); if (start == 0) { length = input.Read((System.Char[])ioBuffer, 0, ioBuffer.Length); if (length <= 0) { return(null); } } if (start == length) { return(null); } token.SetTermBuffer(ioBuffer, start, 1); start++; token.termBuffer[0] = System.Char.ToLower(token.termBuffer[0]); return(token); }