/// <summary>
        /// Reads the token weight stored in its payload as a 32 bit float.
        /// Yields 1f if the token has no payload or the payload carries no data.
        /// </summary>
        /// <param name="token">Token whose payload is inspected.</param>
        /// <returns>The decoded weight, or 1f if nothing is stored.</returns>
        public override float GetWeight(Token token)
        {
            if (token.GetPayload() != null && token.GetPayload().GetData() != null)
            {
                return PayloadHelper.DecodeFloat(token.GetPayload().GetData());
            }

            // No payload data present: fall back to the neutral weight.
            return 1f;
        }
示例#2
0
 /// <summary>
 /// Tokenizes the criteria value with the configured analyzer and adds one
 /// sub-query per token to the temporary query.
 /// </summary>
 /// <param name="searchCriteria">The search criteria (field, value and condition).</param>
 /// <param name="matchCondition">
 /// The match condition: <c>MatchAll</c> adds every token query as MUST,
 /// any other value adds it as SHOULD.
 /// </param>
 /// <remarks>Any exception is logged and swallowed; nothing is rethrown.</remarks>
 private void GetStringQuery(Criteria searchCriteria, MatchCondition matchCondition)
 {
     try
     {
         string fieldName = searchCriteria.Field.ToString();
         TokenStream stream = _analyzer.TokenStream(fieldName, new StringReader(searchCriteria.Value));

         BooleanClause.Occur occur = matchCondition == MatchCondition.MatchAll
             ? BooleanClause.Occur.MUST
             : BooleanClause.Occur.SHOULD;

         Lucene.Net.Analysis.Token token;
         while ((token = stream.Next()) != null)
         {
             BooleanQuery outputQuery = new BooleanQuery();
             this.TokenToQuery(fieldName, token.TermText(), searchCriteria.Condition.ToString(), ref outputQuery);
             _tempQuery.Add(outputQuery, occur);
         }
     }
     catch (Exception ex)
     {
         // Pick the message first: '+' binds tighter than '==', so writing the
         // conditional inline after the concatenation compares the concatenated
         // string with null and always takes the InnerException branch.
         string detail = ex.InnerException == null ? ex.Message : ex.InnerException.Message;
         _logger.Error("Error while creating String Query :" + detail);
     }
 }
 /// <summary>
 /// Chooses the positioner for a token from its position increment.
 /// </summary>
 /// <param name="token">Token to inspect.</param>
 /// <returns>
 /// <c>TokenPositioner.NewRow</c> when the position increment is 0,
 /// otherwise <c>TokenPositioner.NewColumn</c>.
 /// </returns>
 public override TokenPositioner GetTokenPositioner(Token token)
 {
     if (token.GetPositionIncrement() == 0)
     {
         return TokenPositioner.NewRow;
     }

     return TokenPositioner.NewColumn;
 }
示例#4
0
 /// <summary>
 /// Checks the term, type, flags and string form of tokens built with the
 /// default, (start, end), (start, end, flags) and (start, end, type) constructors.
 /// </summary>
 public virtual void  TestCtor()
 {
     Token t = new Token();
     char[] content = "hello".ToCharArray();
     t.SetTermBuffer(content, 0, content.Length);
     // The token must not hand back the caller's array: check reference
     // identity, not element-wise content.
     Assert.AreNotSame(content, t.TermBuffer());
     Assert.AreEqual("hello", t.Term);
     Assert.AreEqual("word", t.Type);
     Assert.AreEqual(0, t.Flags);
     
     t = new Token(6, 22);
     t.SetTermBuffer(content, 0, content.Length);
     Assert.AreEqual("hello", t.Term);
     Assert.AreEqual("(hello,6,22)", t.ToString());
     Assert.AreEqual("word", t.Type);
     Assert.AreEqual(0, t.Flags);
     
     t = new Token(6, 22, 7);
     t.SetTermBuffer(content, 0, content.Length);
     Assert.AreEqual("hello", t.Term);
     Assert.AreEqual("(hello,6,22)", t.ToString());
     Assert.AreEqual(7, t.Flags);
     
     t = new Token(6, 22, "junk");
     t.SetTermBuffer(content, 0, content.Length);
     Assert.AreEqual("hello", t.Term);
     Assert.AreEqual("(hello,6,22,type=junk)", t.ToString());
     Assert.AreEqual(0, t.Flags);
 }
 /// <summary>
 /// Pulls the next token from the wrapped stream and, if its term contains a
 /// character in the range U+00C0..U+FB06, replaces the term with the
 /// accent-stripped output buffer.
 /// </summary>
 /// <param name="reusableToken">Token instance handed to the wrapped stream; must not be null.</param>
 /// <returns>The (possibly rewritten) token, or null at end of stream.</returns>
 public override Token Next(/* in */ Token reusableToken)
 {
     System.Diagnostics.Debug.Assert(reusableToken != null);
     Token current = input.Next(reusableToken);
     if (current == null)
     {
         return null;
     }

     char[] termChars = current.TermBuffer();
     int termLength = current.TermLength();

     // Scan for the first character that may need rewriting; if none is
     // found the token is returned untouched.
     int index = 0;
     while (index < termLength)
     {
         char candidate = termChars[index];
         if (candidate >= '\u00c0' && candidate <= '\ufb06')
         {
             RemoveAccents(termChars, termLength);
             current.SetTermBuffer(output, 0, outputPos);
             break;
         }
         index++;
     }

     return current;
 }
示例#6
0
        /// <summary>
        /// Marks the current token as an email token and splits its text into parts.
        /// </summary>
        /// <param name="token">Token whose term text is treated as an email address.</param>
        private void ProcessEmailToken(Lucene.Net.Analysis.Token token)
        {
            token_type = tokentype_email;

            string address = token.TermText();
            parts = address.Split(replace_array);

            // safety check: the text was not split at all
            if (parts.Length == 1)
            {
                return;
            }

            // Shift every part one slot to the right (this drops the last entry,
            // the final tld part) and store the text before '@' as one large
            // token in the first slot.
            Array.Copy(parts, 0, parts, 1, parts.Length - 1);
            parts[0] = address.Substring(0, address.IndexOf('@'));
#if ENABLE_RDF_ADAPTER
            if (link_call_back != null)
            {
                link_call_back("mailto://" + address, true);
            }
#endif
        }
示例#7
0
			/// <summary>
			/// Adds the token to the sink only when its term is "the", ignoring case.
			/// </summary>
			/// <param name="t">Candidate token; null is ignored.</param>
			public override void  Add(Token t)
			{
				// Ordinal, case-insensitive comparison: no upper-cased copies per token
				// and no dependence on the current culture's casing rules.
				if (t != null && string.Equals(t.Term(), "the", System.StringComparison.OrdinalIgnoreCase))
				{
					base.Add(t);
				}
			}
		/// <summary>
		/// Reads the next token from the wrapped stream and hands it to the sink
		/// before returning it.
		/// </summary>
		/// <param name="reusableToken">Token instance handed to the wrapped stream; must not be null.</param>
		/// <returns>Whatever the wrapped stream returned.</returns>
		public override Token Next(Token reusableToken)
		{
			System.Diagnostics.Debug.Assert(reusableToken != null);

			Token produced = input.Next(reusableToken);
			// The sink receives every result, including a null one.
			sink.Add(produced);

			return produced;
		}
        /// <summary>
        /// Advances the stream by one token and copies its term, offset and type
        /// attributes into a <c>Token</c>.
        /// </summary>
        /// <param name="input">Stream to advance; may be null.</param>
        /// <param name="reusableToken">Token to fill; a new one is created when null.</param>
        /// <returns>The filled token, or null when the stream is null or exhausted.</returns>
        public static Token NextToken(TokenStream input, Token reusableToken)
        {
            if (input == null || !input.IncrementToken())
            {
                return null;
            }

            // Look all attributes up before touching the target token.
            ITermAttribute term = input.GetAttribute<ITermAttribute>();
            IOffsetAttribute offsets = input.GetAttribute<IOffsetAttribute>();
            ITypeAttribute kind = input.GetAttribute<ITypeAttribute>();

            Token target = reusableToken ?? new Token();
            target.Clear();

            if (term != null)
            {
                target.SetTermBuffer(term.TermBuffer(), 0, term.TermLength());
            }

            if (offsets != null)
            {
                target.StartOffset = offsets.StartOffset;
                target.EndOffset = offsets.EndOffset;
            }

            if (kind != null)
            {
                target.Type = kind.Type;
            }

            return target;
        }
示例#10
0
        /// <summary>
        /// Returns the next token in the stream, or null at EOS.
        /// <para>Tokens of the apostrophe type lose a trailing <c>'s</c> / <c>'S</c>.</para>
        /// <para>Tokens of the acronym type lose their dots.</para>
        /// <para>Rewritten tokens are new instances that keep the original offsets and type.</para>
        /// </summary>
        public override Lucene.Net.Analysis.Token Next()
        {
            Lucene.Net.Analysis.Token current = input.Next();
            if (current == null)
            {
                return null;
            }

            System.String termText = current.TermText();
            System.String tokenType = current.Type();

            if (tokenType == APOSTROPHE_TYPE && (termText.EndsWith("'s") || termText.EndsWith("'S")))
            {
                // cut the two trailing characters
                System.String stem = termText.Substring(0, termText.Length - 2);
                return new Lucene.Net.Analysis.Token(stem, current.StartOffset(), current.EndOffset(), tokenType);
            }

            if (tokenType == ACRONYM_TYPE)
            {
                // remove dots
                System.String undotted = termText.Replace(".", "");
                return new Lucene.Net.Analysis.Token(undotted, current.StartOffset(), current.EndOffset(), tokenType);
            }

            return current;
        }
示例#11
0
        /// <summary>
        /// Analyzes four inflections of "build" with the English Snowball analyzer,
        /// logs the distinct terms produced and asserts that there is exactly one.
        /// </summary>
        public void SnowballAnalyzer()
        {
            // Stemming is language specific: it tries to reduce a word to a common root form.
            string text   = "building build builds builded";
            string output = "Analyzing '" + text + "', generated the tokens: ";
            Dictionary <string, string> tokensFound = new Dictionary <string, string>();

            // Run the analysis
            Analyzer    analyzer = new SnowballAnalyzer("English", StandardAnalyzer.STOP_WORDS);
            TokenStream stream   = analyzer.TokenStream("contents", new StringReader(text));

            for (Token token = stream.Next(); token != null; token = stream.Next())
            {
                string term = token.TermText();

                // Only terms not seen before are recorded and reported
                if (tokensFound.ContainsKey(term))
                {
                    continue;
                }

                tokensFound.Add(term, term);
                output += "[" + term + "] ";
            }

            log.Debug(output);

            Assert.AreEqual(1, tokensFound.Count);
        }
示例#12
0
        /// <summary>
        /// Builds a boolean query from the text: a required "UserType" clause
        /// (only for non-empty text) plus one required "Name" sub-query per
        /// analyzed token.
        /// </summary>
        /// <param name="text">Raw search text.</param>
        /// <returns>The combined query.</returns>
        private Query GetParsedQuerywc(string text)
        {
            BooleanQuery result = new BooleanQuery();

            BooleanQuery.SetMaxClauseCount(10000);
            if (text.Length > 0)
            {
                BooleanQuery userTypeClause = new BooleanQuery();
                QueryParser  userTypeParser = new QueryParser("UserType", analyzer);
                userTypeClause.Add(userTypeParser.Parse("Users"), BooleanClause.Occur.SHOULD);
                result.Add(userTypeClause, BooleanClause.Occur.MUST);
            }

            TokenStream stream = analyzer.TokenStream("UserType", new StringReader(text));

            // Each token is turned into a query only after the following token
            // has been fetched from the stream.
            Lucene.Net.Analysis.Token pending = stream.Next();
            while (pending != null)
            {
                Lucene.Net.Analysis.Token upcoming = stream.Next();

                BooleanQuery nameQuery = new BooleanQuery();
                this.TokenToQuery("Name", pending.TermText(), ref nameQuery);
                result.Add(nameQuery, BooleanClause.Occur.MUST);

                pending = upcoming;
            }

            return result;
        }
 /// <summary>
 /// Writes the weight into the token payload as a 32 bit float.
 /// A weight of exactly 1f is represented by a null payload.
 /// </summary>
 /// <param name="token">Token whose payload is set.</param>
 /// <param name="weight">Weight to store.</param>
 public override void SetWeight(Token token, float weight)
 {
     Payload payload = null;
     if (weight != 1f)
     {
         payload = new Payload(PayloadHelper.EncodeFloat(weight));
     }

     token.SetPayload(payload);
 }
 /// <summary>
 /// Creates the filter on top of the given stream, allocates the reusable
 /// token and registers the term, offset and type attributes.
 /// </summary>
 /// <param name="input">Upstream token stream, passed to the base constructor.</param>
 public CutLeterDigitFilter(TokenStream input)
     : base(input)
 {
     reusableToken = new Token();
     termAtt = AddAttribute<ITermAttribute>();
     offsetAtt = AddAttribute<IOffsetAttribute>();
     typeAtt = AddAttribute<ITypeAttribute>();
 }
示例#15
0
        /// <summary>
        /// Searches the "body" field of the book index for the given words and
        /// returns the matching books with a highlighted content description.
        /// The search is also recorded through <c>SearchDetailsService.AddEntity</c>.
        /// </summary>
        /// <param name="searchWords">Raw search text entered by the user.</param>
        /// <returns>One model per hit (at most 1000 hits are collected).</returns>
        private List <BookSearchModel> SearchBookContent(string searchWords)
        {
            List <BookSearchModel> bookSearchModelList = new List <BookSearchModel>();
            string indexPath = @"D:\lucenedir";

            // 1. Tokenize the search text; every token becomes one term of the phrase query.
            Analyzer    analyzer    = new PanGuAnalyzer();
            TokenStream tokenStream = analyzer.TokenStream("", new StringReader(searchWords));
            PhraseQuery query       = new PhraseQuery();

            Lucene.Net.Analysis.Token token = null;
            while ((token = tokenStream.Next()) != null)
            {
                query.Add(new Term("body", token.TermText()));
            }
            // Maximum distance allowed between the query terms; terms that are
            // too far apart in a document are not a meaningful match.
            query.SetSlop(100);

            // 2. Run the query. Searcher, reader and directory hold index files
            //    open, so they are closed even when the search throws.
            FSDirectory   directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NoLockFactory());
            IndexReader   reader    = null;
            IndexSearcher searcher  = null;
            try
            {
                reader   = IndexReader.Open(directory, true);
                searcher = new IndexSearcher(reader);

                // TopScoreDocCollector is the container that receives the hits.
                TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
                searcher.Search(query, null, collector);

                // TopDocs(start, count) can be used for paging, e.g. TopDocs(300, 20)
                // yields hits 300..320; here every collected hit is requested.
                ScoreDoc[] docs = collector.TopDocs(0, collector.GetTotalHits()).scoreDocs;
                for (int i = 0; i < docs.Length; i++)
                {
                    // ScoreDoc only carries the Lucene-internal document id, so the
                    // documents are not all loaded at once; the stored fields are
                    // fetched per hit through searcher.Doc.
                    int             docId       = docs[i].doc;
                    Document        doc         = searcher.Doc(docId);
                    BookSearchModel searchModel = new BookSearchModel();
                    searchModel.Id                = int.Parse(doc.Get("ID"));
                    searchModel.Title             = doc.Get("title");
                    searchModel.ContenDescription = SearchWordHighlight.CreateHightLight(searchWords, doc.Get("body"));
                    bookSearchModelList.Add(searchModel);
                }
            }
            finally
            {
                if (searcher != null)
                {
                    searcher.Close();
                }
                if (reader != null)
                {
                    reader.Close();
                }
                directory.Close();
            }

            // 3. Store the searched words in the search-details store.
            SearchDetails entity = new SearchDetails()
            {
                Id = Guid.NewGuid(), KeyWords = searchWords, SearchDateTime = DateTime.Now
            };

            SearchDetailsService.AddEntity(entity);
            return(bookSearchModelList);
        }
示例#16
0
        /// <summary>
        /// Runs a phrase search over the "content" field of the Lucene index for
        /// the keyword supplied in the "kw" request parameter.
        /// </summary>
        /// <returns>
        /// The "Index" view; the hits are exposed as <c>ViewBag.books</c> and the
        /// keyword as <c>ViewBag.kw</c>.
        /// </returns>
        public ActionResult Search()
        {
            // A missing "kw" parameter is treated as an empty search instead of
            // failing with a NullReferenceException during tokenization.
            string kw        = Request["kw"] ?? string.Empty;
            string indexPath = Server.MapPath("~/lucenedir"); // location of the index

            // Split the user input into terms.
            List <string> kws         = new List <string>();
            Analyzer      analyzer    = new PanGuAnalyzer();
            TokenStream   tokenStream = analyzer.TokenStream("", new StringReader(kw));

            Lucene.Net.Analysis.Token token = null;
            while ((token = tokenStream.Next()) != null)
            {
                kws.Add(token.TermText());
            }

            // Note: PhraseQuery searches a single field only; searching several
            // fields requires a different query class.
            PhraseQuery query = new PhraseQuery();
            foreach (var word in kws)
            {
                query.Add(new Term("content", word)); // search the "content" field
            }
            // Maximum distance allowed between the query terms; terms that are
            // too far apart in a document are not a meaningful match.
            query.SetSlop(100);

            // Collected search results.
            List <BookVieModel> bookList = new List <BookVieModel>();

            // Searcher, reader and directory hold index files open, so they are
            // closed even when the search throws.
            FSDirectory   directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NoLockFactory());
            IndexReader   reader    = null;
            IndexSearcher searcher  = null;
            try
            {
                reader   = IndexReader.Open(directory, true);
                searcher = new IndexSearcher(reader);

                // TopScoreDocCollector is the container that receives the hits.
                TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
                searcher.Search(query, null, collector);

                // TopDocs(start, count) can be used for paging, e.g. TopDocs(300, 20)
                // yields hits 300..320; here every collected hit is requested.
                ScoreDoc[] docs = collector.TopDocs(0, collector.GetTotalHits()).scoreDocs;

                for (int i = 0; i < docs.Length; i++)
                {
                    // ScoreDoc only carries the Lucene-internal document id, so the
                    // documents are not all loaded at once; the stored fields are
                    // fetched per hit through searcher.Doc.
                    int      docId = docs[i].doc;
                    Document doc   = searcher.Doc(docId);

                    // Note: the field names must match the ones used when the index was built.
                    BookVieModel model = new BookVieModel();
                    model.Id    = Convert.ToInt32(doc.Get("Id"));
                    model.Title = CreateHightLight(kw, doc.Get("title"));
                    // Highlight the search terms inside the matched content.
                    model.Content = CreateHightLight(kw, doc.Get("content"));

                    bookList.Add(model);
                }
            }
            finally
            {
                if (searcher != null)
                {
                    searcher.Close();
                }
                if (reader != null)
                {
                    reader.Close();
                }
                directory.Close();
            }

            ViewBag.books = bookList;
            ViewBag.kw    = kw;
            return(View("Index"));
        }
 /// <summary>
 /// Queues a new token covering a slice of the original token's term buffer.
 /// </summary>
 /// <param name="oriToken">Token the slice is taken from.</param>
 /// <param name="termBufferOffset">Start of the slice inside the term buffer.</param>
 /// <param name="termBufferLength">Length of the slice.</param>
 /// <param name="type">
 /// Character category of the slice; <c>DecimalDigitNumber</c> yields the
 /// digit type, everything else the letter type.
 /// </param>
 void AddToken(Token oriToken, int termBufferOffset, int termBufferLength, byte type)
 {
     // Offsets of the slice relative to the original token's start offset.
     int sliceStart = oriToken.StartOffset + termBufferOffset;
     int sliceEnd = sliceStart + termBufferLength;

     Token slice = new Token(oriToken.TermBuffer(), termBufferOffset, termBufferLength, sliceStart, sliceEnd);
     slice.Type = type == (byte)UnicodeCategory.DecimalDigitNumber
         ? Word.TYPE_DIGIT
         : Word.TYPE_LETTER;

     tokenQueue.Enqueue(slice);
 }
示例#18
0
        /// <summary>
        /// Returns the next token with its term text passed through
        /// <c>_replaceDiacritics</c>, or null at end of stream.
        /// </summary>
        /// <returns>A new token with the same start and end offsets, or null.</returns>
        public override Token Next()
        {
            Token source = this.input.Next();
            if (source == null)
            {
                return null;
            }

            // Only text and offsets are carried over to the new token.
            string replaced = _replaceDiacritics(source.TermText());
            return new Token(replaced, source.StartOffset(), source.EndOffset());
        }
示例#19
0
		/// <summary>
		/// Checks that ToString reflects the term set through the buffer and
		/// through the term text, while the offsets stay at 0 and 5.
		/// </summary>
		public virtual void  TestToString()
		{
			char[] letters = "aloha".ToCharArray();
			Token token = new Token("", 0, 5);

			token.SetTermBuffer(letters, 0, 5);
			Assert.AreEqual("(aloha,0,5)", token.ToString());

			token.SetTermText("hi there");
			Assert.AreEqual("(hi there,0,5)", token.ToString());
		}
示例#20
0
        /// <summary>
        /// Tokenizes a fixed sample sentence with the PanGu analyzer and writes
        /// each term to the console.
        /// </summary>
        /// <param name="sender">Event source.</param>
        /// <param name="e">Event data.</param>
        private void button3_Click(object sender, EventArgs e)
        {
            Analyzer    analyzer    = new PanGuAnalyzer();
            TokenStream tokenStream = analyzer.TokenStream("", new StringReader("面向世界,面向现代化"));

            for (Lucene.Net.Analysis.Token current = tokenStream.Next(); current != null; current = tokenStream.Next())
            {
                Console.WriteLine(current.TermText());
            }
        }
示例#21
0
        /// <summary>
        /// Returns the next token of the wrapped stream unchanged, or null at EOS.
        /// </summary>
        public override Lucene.Net.Analysis.Token Next()
        {
            // Pure pass-through: a null result (end of stream) is forwarded as is.
            return input.Next();
        }
示例#22
0
        /// <summary>
        /// Unigram tokenization: runs a fixed sample sentence through the
        /// StandardAnalyzer and writes each term to the console.
        /// </summary>
        /// <param name="sender">Event source.</param>
        /// <param name="e">Event data.</param>
        private void button1_Click(object sender, EventArgs e)
        {
            Analyzer    analyzer    = new StandardAnalyzer();
            TokenStream tokenStream = analyzer.TokenStream("", new StringReader("北京,Hi欢迎你们大家"));

            Lucene.Net.Analysis.Token current = tokenStream.Next();
            while (current != null)
            {
                Console.WriteLine(current.TermText());
                current = tokenStream.Next();
            }
        }
 /// <summary>
 /// Returns the next token with its term replaced by the stemmer's result
 /// whenever the stemmer reports a change.
 /// </summary>
 /// <param name="result">Token instance handed to the wrapped stream.</param>
 /// <returns>The (possibly stemmed) token, or null at end of stream.</returns>
 public override Token Next(Token result)
 {
     result = input.Next(result);
     if (result == null)
     {
         return null;
     }

     bool stemmed = stemmer.Stem(result.TermBuffer(), 0, result.termLength);
     if (stemmed)
     {
         result.SetTermBuffer(stemmer.GetResultBuffer(), 0, stemmer.GetResultLength());
     }

     return result;
 }
示例#24
0
 /// <summary>
 /// Resizes the term buffer to every size from 0 to 1999 and checks that
 /// the buffer is at least that large and the term is still "hello".
 /// </summary>
 public virtual void  TestResize()
 {
     char[] content = "hello".ToCharArray();
     Token token = new Token();
     token.SetTermBuffer(content, 0, content.Length);

     int requested = 0;
     while (requested < 2000)
     {
         token.ResizeTermBuffer(requested);
         Assert.IsTrue(requested <= token.TermBuffer().Length);
         Assert.AreEqual("hello", token.Term);
         requested++;
     }
 }
        /// <summary>
        /// Returns the next token with its term replaced by the stemmer's result
        /// whenever the stemmer reports a change.
        /// </summary>
        /// <param name="reusableToken">Token instance handed to the wrapped stream; must not be null.</param>
        /// <returns>The (possibly stemmed) token, or null at end of stream.</returns>
        public override Token Next(/* in */ Token reusableToken)
        {
            System.Diagnostics.Debug.Assert(reusableToken != null);

            Token current = input.Next(reusableToken);
            if (current != null)
            {
                bool stemmed = stemmer.Stem(current.TermBuffer(), 0, current.TermLength());
                if (stemmed)
                {
                    current.SetTermBuffer(stemmer.GetResultBuffer(), 0, stemmer.GetResultLength());
                }
            }

            // Null here means the wrapped stream is exhausted.
            return current;
        }
示例#26
0
        /// <summary>
        /// Builds the next token from consecutive characters accepted by
        /// <c>IsTokenChar</c>, each passed through <c>Normalize</c>. Input is read
        /// into <c>ioBuffer</c> one chunk at a time.
        /// </summary>
        /// <param name="reusableToken">Token that is cleared and filled; must not be null.</param>
        /// <returns>
        /// <paramref name="reusableToken"/> with term length and offsets set, or
        /// null when the input is exhausted and no characters were collected.
        /// </returns>
        public override Token Next(/* in */ Token reusableToken)
        {
            System.Diagnostics.Debug.Assert(reusableToken != null);
            reusableToken.Clear();
            int length = 0;
            int start = bufferIndex;
            char[] buffer = reusableToken.TermBuffer();
            while (true)
            {

                // The current chunk is used up: read the next one.
                if (bufferIndex >= dataLen)
                {
                    offset += dataLen;
                    dataLen = input is Lucene.Net.Index.ReusableStringReader ? ((Lucene.Net.Index.ReusableStringReader) input).Read(ioBuffer) : input.Read((System.Char[]) ioBuffer, 0, ioBuffer.Length);
                    if (dataLen <= 0)
                    {
                        // Nothing more to read: emit what was collected, if anything.
                        if (length > 0)
                            break;
                        else
                            return null;
                    }
                    bufferIndex = 0;
                }

                char c = ioBuffer[bufferIndex++];

                if (IsTokenChar(c))
                {
                    // if it's a token char

                    if (length == 0)
                        // start of token
                        start = offset + bufferIndex - 1;
                    else if (length == buffer.Length)
                        // term buffer is full: ask the token for a larger one
                        buffer = reusableToken.ResizeTermBuffer(1 + length);

                    buffer[length++] = Normalize(c); // buffer it, normalized

                    if (length == MAX_WORD_LEN)
                        // buffer overflow!
                        break;
                }
                else if (length > 0)
                    // at non-Letter w/ chars
                    break; // return 'em
            }

            reusableToken.SetTermLength(length);
            reusableToken.SetStartOffset(start);
            reusableToken.SetEndOffset(start + length);
            return reusableToken;
        }
示例#27
0
        /// <summary>
        /// Splits the content into terms using the PanGu analyzer.
        /// </summary>
        /// <param name="content">Text to segment.</param>
        /// <returns>The term texts in stream order.</returns>
        private string[] SplitWords(string content)
        {
            Analyzer      analyzer    = new PanGuAnalyzer(); // use the PanGu segmentation algorithm
            TokenStream   tokenStream = analyzer.TokenStream("", new StringReader(content));
            List <string> terms       = new List <string>();

            // Keep pulling tokens until Next() returns null.
            for (Lucene.Net.Analysis.Token current = tokenStream.Next(); current != null; current = tokenStream.Next())
            {
                terms.Add(current.TermText());
            }

            return terms.ToArray();
        }
        /// <summary>
        /// Creates a stream around a private clone of the given token and
        /// registers the term attribute.
        /// </summary>
        /// <param name="token">Token to clone; asserted to be non-null in debug builds.</param>
        public SingleTokenTokenStream(Token token)
        {
            Debug.Assert(token != null, "Token was null!");
            // Clone so this stream keeps its own copy of the token.
            _singleToken = (Token) token.Clone();

            // ReSharper disable DoNotCallOverridableMethodsInConstructor
            _tokenAtt = (AttributeImpl) AddAttribute(typeof (TermAttribute));
            // ReSharper restore DoNotCallOverridableMethodsInConstructor

            // Debug-only check: the attribute must be a Token, or a type whose
            // simple name matches TokenWrapper.
            Debug.Assert(_tokenAtt is Token || _tokenAtt.GetType().Name.Equals(typeof (TokenWrapper).Name),
                         "Token Attribute is the wrong type! Type was: " + _tokenAtt.GetType().Name + " but expected " +
                         typeof (TokenWrapper).Name);
        }
示例#29
0
        /// <summary>
        /// Segments a string into terms with the PanGu analyzer.
        /// </summary>
        /// <param name="msg">The string to split.</param>
        /// <returns>The term texts in stream order.</returns>
        public static List <string> PanguSplitWords(string msg)
        {
            Analyzer      analyzer    = new PanGuAnalyzer();
            TokenStream   tokenStream = analyzer.TokenStream("", new StringReader(msg));
            List <string> terms       = new List <string>();

            while (true)
            {
                Lucene.Net.Analysis.Token current = tokenStream.Next();
                if (current == null)
                {
                    break;
                }
                terms.Add(current.TermText());
            }

            return terms;
        }
示例#30
0
        /// <summary>
        /// Segments the search text entered by the user into terms.
        /// </summary>
        /// <param name="str">Text to segment.</param>
        /// <returns>The term texts in stream order.</returns>
        private static string[] SplitWord(string str)
        {
            // PanGu is the segmentation analyzer used here.
            Analyzer    analyzer    = new PanGuAnalyzer();
            TokenStream tokenStream = analyzer.TokenStream("", new StringReader(str));

            List <string> terms = new List <string>();
            Lucene.Net.Analysis.Token current = tokenStream.Next();
            while (current != null)
            {
                terms.Add(current.TermText());
                current = tokenStream.Next();
            }

            return terms.ToArray();
        }
示例#31
0
		/// <summary>
		/// Returns a clone of the next token in the list, or null when the
		/// list has been fully enumerated.
		/// </summary>
		/// <param name="reusableToken">Must not be null; it is not otherwise used.</param>
		public override Token Next(Token reusableToken)
		{
			System.Diagnostics.Debug.Assert(reusableToken != null);

			// The enumerator is created on first use.
			if (iter == null)
			{
				iter = lst.GetEnumerator();
			}

			if (!iter.MoveNext())
			{
				return null;
			}

			// Since this TokenStream can be reset the stored tokens must stay
			// immutable, so callers only ever receive a copy.
			return (Token) iter.Current.Clone();
		}
 /// <summary>
 /// Maps the token's flags value to a <c>TokenPositioner</c>.
 /// </summary>
 /// <param name="token">Token whose flags are read.</param>
 /// <returns>NewColumn for 0, NewRow for 1, SameRow for 2.</returns>
 /// <exception cref="IOException">The flags value is none of 0, 1 or 2.</exception>
 public override TokenPositioner GetTokenPositioner(Token token)
 {
     int flags = token.GetFlags();

     if (flags == 0)
     {
         return TokenPositioner.NewColumn;
     }
     if (flags == 1)
     {
         return TokenPositioner.NewRow;
     }
     if (flags == 2)
     {
         return TokenPositioner.SameRow;
     }

     throw new IOException("Unknown matrix positioning of token " + token);
 }
示例#33
0
        /// <summary>
        /// Builds the next token from consecutive characters accepted by
        /// <c>IsTokenChar</c>, each passed through <c>Normalize</c>. Input is read
        /// into <c>ioBuffer</c> one chunk at a time.
        /// </summary>
        /// <param name="token">Token that is cleared and filled.</param>
        /// <returns>
        /// <paramref name="token"/> with term length and offsets set, or null
        /// when the input is exhausted and no characters were collected.
        /// </returns>
        public override Token Next(Token token)
        {
            token.Clear();
            int length = 0;
            int start = bufferIndex;
            char[] buffer = token.TermBuffer();
            while (true)
            {

                // The current chunk is used up: read the next one.
                if (bufferIndex >= dataLen)
                {
                    offset += dataLen;
                    dataLen = input is Lucene.Net.Index.DocumentsWriter.ReusableStringReader ? ((Lucene.Net.Index.DocumentsWriter.ReusableStringReader) input).Read(ioBuffer) : input.Read((System.Char[]) ioBuffer, 0, ioBuffer.Length);
                    if (dataLen <= 0)
                    {
                        // Nothing more to read: emit what was collected, if anything.
                        if (length > 0)
                            break;
                        else
                            return null;
                    }
                    bufferIndex = 0;
                }

                char c = ioBuffer[bufferIndex++];

                if (IsTokenChar(c))
                {
                    // if it's a token char

                    if (length == 0)
                        // start of token
                        start = offset + bufferIndex - 1;
                    else if (length == buffer.Length)
                        // term buffer is full: ask the token for a larger one
                        buffer = token.ResizeTermBuffer(1 + length);

                    buffer[length++] = Normalize(c); // buffer it, normalized

                    if (length == MAX_WORD_LEN)
                        // buffer overflow!
                        break;
                }
                else if (length > 0)
                    // at non-Letter w/ chars
                    break; // return 'em
            }

            // Term length and offsets are written straight to the token's fields.
            token.termLength = length;
            token.startOffset = start;
            token.endOffset = start + length;
            return token;
        }
示例#34
0
        /// <summary>
        /// Runs the reader's text through the indexing analyzer (field "Text")
        /// and writes every produced token to the console.
        /// </summary>
        /// <param name="reader">Source of the text to analyze.</param>
        public static void Analyze(TextReader reader)
        {
            Analyzer    indexing_analyzer = new LuceneCommon.BeagleAnalyzer(true);
            TokenStream stream            = indexing_analyzer.TokenStream("Text", reader);

            Lucene.Net.Analysis.Token t;
            while ((t = stream.Next()) != null)
            {
                Console.WriteLine(t);
            }
        }
示例#35
0
        /// <summary>
        /// Segments a string into terms with the PanGu analyzer.
        /// </summary>
        /// <param name="str">Text to segment.</param>
        /// <returns>The term texts in stream order.</returns>
        public static List <string> GetPanGuWord(string str)
        {
            Analyzer    analyzer    = new PanGuAnalyzer();
            TokenStream tokenStream = analyzer.TokenStream("", new StringReader(str));

            List <string> terms = new List <string>();
            for (Lucene.Net.Analysis.Token current = tokenStream.Next(); current != null; current = tokenStream.Next())
            {
                terms.Add(current.TermText());
            }

            return terms;
        }
示例#36
0
        /// <summary>
        /// Tokenizes the text with the given analyzer and returns the terms,
        /// each followed by a CR LF pair.
        /// </summary>
        /// <param name="txtBody">Text to analyze.</param>
        /// <param name="analyzer">Analyzer that produces the token stream.</param>
        /// <returns>All term texts joined into one string.</returns>
        private string AnalyzerResult(string txtBody, Analyzer analyzer)
        {
            TokenStream   tokenStream = analyzer.TokenStream("", new StringReader(txtBody));
            StringBuilder lines       = new StringBuilder();

            // Note: Next() is deprecated from version 3 onwards.
            Lucene.Net.Analysis.Token current = tokenStream.Next();
            while (current != null)
            {
                lines.Append(current.TermText()).Append("\r\n");
                current = tokenStream.Next();
            }

            return lines.ToString();
        }
示例#37
0
		/// <summary>
		/// Walks the stream and asserts that the first payload byte of the
		/// n-th token equals n, counting from 1.
		/// </summary>
		/// <param name="ts">Stream whose tokens are verified.</param>
		internal virtual void  VerifyPayload(TokenStream ts)
		{
			Token t = new Token();
			byte expected = 1;
			while (true)
			{
				t.Clear();
				t = ts.Next(t);
				if (t == null)
				{
					break;
				}

				Assert.AreEqual(expected, t.GetPayload().ToByteArray()[0]);
				expected++;
			}
		}
示例#38
0
        /// <summary>
        /// Segments a keyword string into terms using the PanGu analyzer.
        /// </summary>
        /// <param name="keyword">Text to segment.</param>
        /// <returns>The term texts in stream order.</returns>
        public static string[] WordSegmentation(string keyword)
        {
            // A StandardAnalyzer could be substituted here instead of PanGu.
            Analyzer    analyzer    = new PanGuAnalyzer();
            TokenStream tokenStream = analyzer.TokenStream("", new StringReader(keyword));

            List <string> terms = new List <string>();
            while (true)
            {
                Lucene.Net.Analysis.Token current = tokenStream.Next();
                if (current == null)
                {
                    break;
                }
                terms.Add(current.TermText());
            }

            return terms.ToArray();
        }
示例#39
0
        /// <summary>
        /// Returns the next segmented word as a token, or null when all words
        /// have been returned. The input text is segmented on first use.
        /// </summary>
        /// <returns>
        /// A token whose offsets are derived from the running sum of the word
        /// lengths returned so far, or null.
        /// </returns>
        public override Token Next()
        {
            if (words == null)
            {
                words = MCSegment.Segment.Seg(this.input_text);
            }

            if (words == null || index >= words.Length)
            {
                return null;
            }

            string current = words[index];
            index++;

            // The token spans [begin, begin + length); the next word starts where this one ends.
            int begin = offset;
            int end = begin + current.Length;
            Token token = new Token(current, begin, end);
            offset = end;
            return token;
        }
示例#40
0
        /// <summary>
        /// Returns the next token. While a split token's parts are pending they
        /// are returned one at a time; after that, tokens are pulled from
        /// <c>token_stream</c> until <c>ProcessToken</c> accepts one.
        /// </summary>
        /// <returns>The next token, or null when <c>token_stream</c> is exhausted.</returns>
        public override Lucene.Net.Analysis.Token Next()
        {
            // Parts of a previously split token are emitted first.
            if (parts != null)
            {
                if (++parts_index < parts.Length)
                {
                    string part = parts [parts_index];
                    Lucene.Net.Analysis.Token part_token;
                    // FIXME: Searching for google.com will not match www.google.com.
                    // If we decide to allow google-style "abcd.1234" which means
                    // "abcd 1234" as a consecutive phrase, then adjusting
                    // the startOffset and endOffset would enable matching
                    // google.com to www.google.com
                    int start_offset = (parts_index == 0 && token_type == tokentype_email ?
                                        0 :
                                        last_end_offset + 1);         // assuming only one separator
                    int end_offset = start_offset + part.Length;
                    part_token = new Lucene.Net.Analysis.Token(part,
                                                               start_offset,
                                                               end_offset,
                                                               token_type);
                    // Position increment 0 for every part token.
                    part_token.SetPositionIncrement(0);
                    last_end_offset = (parts_index == 0 && token_type == tokentype_email ?
                                       -1 :
                                       end_offset);          // assuming only one separator
                    return(part_token);
                }
                else
                {
                    // clear the array
                    parts           = null;
                    parts_index     = -1;
                    last_end_offset = -1;
                    token_type      = null;
                }
            }

            Token token;

            // Skip tokens until ProcessToken accepts one (it may modify the token via ref).
            while ((token = token_stream.Next()) != null)
            {
                //Console.WriteLine ("Found token: [{0}]", token.TermText ());
                if (ProcessToken(ref token))
                {
                    return(token);
                }
            }
            return(null);
        }
示例#41
0
 /// <summary>
 /// Returns the next input token whose term text length lies between min and max (inclusive),
 /// skipping every token outside that range.
 /// </summary>
 /// <param name="result">Token instance passed on to the wrapped input stream.</param>
 /// <returns>The first token of acceptable length, or null at end of stream.</returns>
 public override Token Next(Token result)
 {
     Token candidate;
     while ((candidate = input.Next(result)) != null)
     {
         int termLength = candidate.TermText().Length;
         bool tooShort = termLength < min;
         bool tooLong = termLength > max;
         if (!tooShort && !tooLong)
         {
             return candidate;
         }
         // note: rejected tokens are dropped entirely; their parts are not indexed.
     }

     // The input stream has no more tokens.
     return null;
 }
示例#42
0
        /// <summary>
        /// Segments the given text with the PanGu analyzer and joins the resulting terms
        /// into a single display string.
        /// </summary>
        /// <param name="words">Text to segment.</param>
        /// <returns>
        /// A string (returned as object) in which every term is followed by the separator "   |  ".
        /// </returns>
        public static object PanGu(string words)
        {
            Analyzer    analyzer    = new PanGuAnalyzer();
            TokenStream tokenStream = analyzer.TokenStream("", new StringReader(words));

            // A StringBuilder avoids re-copying the whole accumulated string for every term,
            // which repeated string concatenation in a loop would do.
            var joined = new System.Text.StringBuilder();

            Lucene.Net.Analysis.Token token;
            while ((token = tokenStream.Next()) != null)
            {
                // token.TermText() yields the text of the current term.
                joined.Append(token.TermText()).Append("   |  ");
            }
            return(joined.ToString());
        }
示例#43
0
        /// <summary>
        /// Splits the given text into terms using the PanGu analyzer.
        /// </summary>
        /// <param name="key">Text to segment.</param>
        /// <returns>The terms in the order the analyzer produced them.</returns>
        public static string[] PanGuSplit(string key)
        {
            Analyzer    segmenter = new PanGuAnalyzer();
            TokenStream stream    = segmenter.TokenStream("", new StringReader(key));

            List <string> terms = new List <string>();
            for (Lucene.Net.Analysis.Token current = stream.Next(); current != null; current = stream.Next())
            {
                terms.Add(current.TermText());
            }

            return(terms.ToArray());
        }
示例#44
0
        /// <summary>
        /// Decides whether the given token starts a new fragment and, when it does,
        /// increments the running fragment count.
        /// </summary>
        /// <remarks>
        /// A new fragment starts when either
        /// (a) the token ends at or beyond the half-way point of the current fragment and one of the
        /// characters 2, 3 or 4 positions before the token start is accepted by isCriticalChar, or
        /// (b) the token ends at or beyond the end of the current fragment (fragmentSize * currentNumFrags).
        /// Look-behind positions that fall outside the text are skipped; indexing them
        /// unconditionally threw for any token starting within the first four characters.
        /// </remarks>
        /// <param name="token">The token being examined.</param>
        /// <returns>true when the token begins a new fragment; otherwise false.</returns>
        public virtual bool IsNewFragment(Token token)
        {
            int startOffset = token.StartOffset();
            int endOffset = token.EndOffset();

            bool pastHalfway = endOffset >= (fragmentSize * (currentNumFrags - 1) + (fragmentSize / 2));

            bool isNewFrag = false;
            if (pastHalfway)
            {
                // Inspect the characters 2, 3 and 4 positions before the token start,
                // ignoring positions that do not exist in the text.
                for (int back = 2; back <= 4 && !isNewFrag; back++)
                {
                    int pos = startOffset - back;
                    if (pos >= 0 && pos < this.text.Length)
                    {
                        isNewFrag = isCriticalChar(this.text[pos]);
                    }
                }
            }

            if (!isNewFrag)
            {
                // Hard limit: the token reaches past the end of the current fragment.
                isNewFrag = endOffset >= (fragmentSize * currentNumFrags);
            }

            if (isNewFrag)
            {
                currentNumFrags++;
            }
            return isNewFrag;
        }
示例#45
0
        /// <summary>
        /// Advances to the next token. The first call drains the whole input stream into
        /// <c>tokenList</c>, merging tokens that share the same start offset and term; that call
        /// and the following ones then replay the cached tokens one at a time through
        /// <c>termAtt</c> and <c>termOff</c>.
        /// </summary>
        /// <returns>true when a token was written to the attributes; false when none is left.</returns>
        public override bool IncrementToken()
        {
            // Replay phase: the cache already exists, so emit the next cached token.
            if (tokenList != null)
            {
                index++;
                if (index < tokenList.Count)
                {
                    termAtt.SetTermBuffer(tokenList[index].TermBuffer(), 0, tokenList[index].TermLength());
                    termOff.SetOffset(tokenList[index].StartOffset, tokenList[index].EndOffset);
                    return(true);
                }

                // Cache exhausted: drop it. A later call would therefore take the caching path below again.
                tokenList = null;
                return(false);
            }

            tokenList = new List <Token>();
            // First cache result
            while (input.IncrementToken())
            {
                Token newToken = new Token(termAtt.Term, termOff.StartOffset, termOff.EndOffset);
                // Linear scan for an already cached token with the same start offset and term.
                foreach (Token token in tokenList)
                {
                    if (token.StartOffset == newToken.StartOffset && token.Term == newToken.Term)
                    {
                        // Duplicate: keep the cached token but take over the newer token's offsets.
                        token.SetOffset(newToken.StartOffset, newToken.EndOffset);
                        // null marks the new token as merged so it is not added below.
                        newToken = null;
                        break;
                    }
                } //foreach

                if (newToken != null)
                {
                    tokenList.Add(newToken);
                }
            } // while;

            // now output the tokens!
            if (tokenList.Count > 0)
            {
                index = 0;
                termAtt.SetTermBuffer(tokenList[index].TermBuffer(), 0, tokenList[index].TermLength());
                termOff.SetOffset(tokenList[index].StartOffset, tokenList[index].EndOffset);
                return(true);
            }

            // The input produced no tokens; tokenList stays as an empty (non-null) list.
            return(false);
        }
        /// <summary>
        /// Fetches the next token from the wrapped stream and lower-cases its term buffer in place.
        /// </summary>
        /// <param name="reusableToken">Token instance passed on to the wrapped stream.</param>
        /// <returns>The lower-cased token, or null when the wrapped stream has no more tokens.</returns>
        public override Token Next(Token reusableToken)
        {
            Token current = input.Next(reusableToken);
            if (current == null)
            {
                return null;
            }

            // Only the first TermLength() characters of the buffer belong to the term.
            char[] chars = current.TermBuffer();
            int termLength = current.TermLength();
            int pos = 0;
            while (pos < termLength)
            {
                // Note: System.Char.ToLower(char) applies the casing rules of the current culture.
                chars[pos] = System.Char.ToLower(chars[pos]);
                pos++;
            }

            return current;
        }
示例#47
0
        /// <summary>
        /// Returns the next token in the stream, or null at end of stream.
        /// </summary>
        /// <remarks>
        /// Deprecated in favor of <see cref="Next(Token)"/>. The token returned here is a
        /// full private copy that is not reused across calls, which makes this overload
        /// slower than the reusable-token overload.
        /// </remarks>
        /// <returns>A freshly allocated token, or null at end of stream.</returns>
        public virtual Token Next()
        {
            // A brand-new token is handed to the reusable overload on every call.
            Token produced = Next(new Token());
            if (produced == null)
            {
                return null;
            }

            // Clone the payload so the returned token carries its own payload instance.
            Payload attached = produced.GetPayload();
            if (attached != null)
            {
                produced.SetPayload((Payload) attached.Clone());
            }

            return produced;
        }
        /// <summary>
        /// Fetches the next token from the wrapped stream and lower-cases its term buffer in place.
        /// </summary>
        /// <param name="result">Token instance passed on to the wrapped stream.</param>
        /// <returns>The lower-cased token, or null when the wrapped stream has no more tokens.</returns>
        public override Token Next(Token result)
        {
            Token current = input.Next(result);
            if (current == null)
            {
                return null;
            }

            // Only the first termLength characters of the buffer belong to the term.
            char[] chars = current.TermBuffer();
            int count = current.termLength;
            for (int pos = 0; pos != count && pos < count; ++pos)
            {
                // Note: System.Char.ToLower(char) applies the casing rules of the current culture.
                chars[pos] = System.Char.ToLower(chars[pos]);
            }

            return current;
        }
示例#49
0
 /// <summary>
 /// Returns the next input token whose term length lies between min and max (inclusive),
 /// skipping every token outside that range.
 /// </summary>
 /// <param name="reusableToken">Token instance passed on to the wrapped input stream; must not be null.</param>
 /// <returns>The first token of acceptable length, or null at end of stream.</returns>
 public override Token Next(/* in */ Token reusableToken)
 {
     System.Diagnostics.Debug.Assert(reusableToken != null);

     Token candidate;
     while ((candidate = input.Next(reusableToken)) != null)
     {
         int termLength = candidate.TermLength();
         bool withinBounds = !(termLength < min) && !(termLength > max);
         if (withinBounds)
         {
             return candidate;
         }
         // note: rejected tokens are dropped entirely; their parts are not indexed.
     }

     // The input stream has no more tokens.
     return null;
 }
示例#50
0
        /// <summary>
        /// Segments the given text into terms with the PanGu analyzer, echoing each term to the console.
        /// </summary>
        /// <param name="str">Text to segment (for example the search condition entered by the user).</param>
        /// <returns>The terms in the order the analyzer produced them.</returns>
        public static string[] SqlitIndexWord(string str)
        {
            List <string> terms = new List <string>();

            // PanGu segmentation of the input text.
            Analyzer    segmenter = new PanGuAnalyzer();
            TokenStream stream    = segmenter.TokenStream("", new StringReader(str));

            for (Lucene.Net.Analysis.Token current = stream.Next(); current != null; current = stream.Next())
            {
                string term = current.TermText();
                Console.WriteLine(term);
                terms.Add(term);
            }

            return(terms.ToArray());
        }
示例#51
0
        /// <summary>
        /// Advances to the next word returned by Next() and copies its text, offsets and type
        /// into the term, offset and type attributes.
        /// </summary>
        /// <returns>
        /// true when a word was written to the attributes; false when Next() returned null,
        /// in which case End() and Dispose() are invoked before returning.
        /// </returns>
        public sealed override Boolean IncrementToken()
        {
            ClearAttributes();

            Lucene.Net.Analysis.Token next = Next();
            if (next == null)
            {
                // No more words: finish the stream and release it.
                End();
                this.Dispose();
                return(false);
            }

            string text = next.ToString();
            termAtt.SetEmpty().Append(text);

            int correctedStart = CorrectOffset(next.StartOffset);
            int correctedEnd = CorrectOffset(next.EndOffset);
            offsetAtt.SetOffset(correctedStart, correctedEnd);

            typeAtt.Type = next.Type;
            return(true);
        }
示例#52
0
        /// <summary>
        /// Segmentation test helper: runs the keyword through the analyzer and returns the
        /// resulting terms, each one preceded by a single space.
        /// </summary>
        /// <param name="keyword">Text to analyze. It is also passed to the analyzer as the field name.</param>
        /// <returns>The concatenated terms, or an empty string when no term was produced.</returns>
        public string Token(string keyword)
        {
            // StringBuilder instead of repeated string concatenation inside the loop.
            var ret = new System.Text.StringBuilder();

            System.IO.StringReader reader = new System.IO.StringReader(keyword);
            try
            {
                Lucene.Net.Analysis.TokenStream ts = analyzer.TokenStream(keyword, reader);
                try
                {
                    for (Lucene.Net.Analysis.Token token = ts.Next(); token != null; token = ts.Next())
                    {
                        ret.Append(" ").Append(token.TermText());
                    }
                }
                finally
                {
                    // The original called ts.CloneAttributes() here and discarded the result,
                    // which released nothing; the stream is closed instead.
                    ts.Close();
                }
            }
            finally
            {
                // Released even when analysis throws.
                reader.Close();
                // NOTE(review): this closes the analyzer field after every call, as the original did;
                // confirm the analyzer is not expected to stay open for other callers.
                analyzer.Close();
            }
            return(ret.ToString());
        }
示例#53
0
        /// <summary>
        /// Segments the given message into words using the PanGu analyzer.
        /// </summary>
        /// <param name="msg">Text to segment.</param>
        /// <returns>The words in the order the analyzer produced them.</returns>
        public static IEnumerable <string> SplitWords(string msg)
        {
            List <string> words = new List <string>();

            Analyzer    segmenter = new PanGuAnalyzer();
            StringReader source   = new StringReader(msg);
            TokenStream stream    = segmenter.TokenStream("", source);

            // Next() yields the next segmented word, or null when there are none left.
            for (Lucene.Net.Analysis.Token current = stream.Next(); current != null; current = stream.Next())
            {
                words.Add(current.TermText());
            }

            return(words);
        }
示例#54
0
		/// <summary>
		/// Returns the next token from the input with its term replaced by the stem computed
		/// by the stemmer. When stemming leaves the term unchanged, the input token itself is returned.
		/// </summary>
		/// <returns>Returns the next token in the stream, or null at EOS</returns>
		public override Token Next() 
		{
			if ((token = input.Next()) == null)
			{
				return null;
			}

			String s = stemmer.Stem(token.TermText());
			if (s.Equals(token.TermText()))
			{
				return token;
			}

			Token stemmed = new Token(s, token.StartOffset(), token.EndOffset(),
				token.Type());
			// Carry over the position increment: a freshly constructed token would otherwise
			// lose any increment set on the source token by an earlier stage of the chain.
			stemmed.SetPositionIncrement(token.GetPositionIncrement());
			return stemmed;
		}
示例#55
0
        /// <summary>
        /// Runs two sample strings through a fixed set of analyzers and logs, for each
        /// analyzer, the terms it produced.
        /// </summary>
        public void Analyzers()
        {
            string[] samples =
            {
                "The quick brown fox jumped over the lazy dogs",
                "XY&Z Corporation - [email protected]"
            };

            Analyzer[] candidates =
            {
                new WhitespaceAnalyzer(),
                new SimpleAnalyzer(),
                new StopAnalyzer(),
                new StandardAnalyzer(),
                new SnowballAnalyzer("English", StopAnalyzer.ENGLISH_STOP_WORDS),   // Same as EnglishAnalyzer
                new KeywordAnalyzer()
            };

            foreach (string sample in samples)
            {
                log.Debug("Analyzing \"" + sample + "\"");

                // Every analyzer gets to analyze the current sample.
                foreach (Analyzer candidate in candidates)
                {
                    // Header line with the analyzer type name, then an indented line of terms.
                    StringBuilder report = new StringBuilder();
                    report.Append("\t").Append(candidate.GetType().Name).AppendLine(":");
                    report.Append("\t\t");

                    TokenStream stream = candidate.TokenStream("contents", new StringReader(sample));
                    for (Token term = stream.Next(); term != null; term = stream.Next())
                    {
                        report.Append("[").Append(term.TermText()).Append("] ");
                    }

                    log.Debug(report.ToString());
                }
            }
        }
示例#56
0
        /// <summary>
        /// Emits the buffered input one character per token, lower-casing each character.
        /// </summary>
        /// <param name="token">Token to clear and fill with the next character.</param>
        /// <returns>The filled token, or null when no character is left.</returns>
        public override Token Next(Token token)
        {
            token.Clear();

            // The reader is consulted only while nothing has been consumed yet (start == 0).
            bool nothingConsumed = start == 0;
            if (nothingConsumed)
            {
                length = input.Read((System.Char[])ioBuffer, 0, ioBuffer.Length);
                if (!(length > 0))
                    return null;
            }

            if (start != length)
            {
                // A single character at the current position becomes the whole term.
                token.SetTermBuffer(ioBuffer, start, 1);
                start++;

                char first = token.termBuffer[0];
                token.termBuffer[0] = System.Char.ToLower(first);
                return token;
            }

            // Every buffered character has been emitted.
            return null;
        }
示例#57
0
        /// <summary>
        /// Prints, for each analyzer, the terms it produces for the given input so that the
        /// segmentation results of different analyzers can be compared.
        /// </summary>
        /// <param name="listAnalyzer">Analyzers to try.</param>
        /// <param name="input">Text to analyze.</param>
        public static void TestAnalyzer(IList <Analyzer> listAnalyzer, string input)
        {
            foreach (Analyzer current in listAnalyzer)
            {
                // Heading: the analyzer's string representation followed by a colon.
                Console.WriteLine("{0}:", current.ToString());

                using (TextReader source = new StringReader(input))
                {
                    TokenStream stream = current.ReusableTokenStream(string.Empty, source);
                    for (Lucene.Net.Analysis.Token term = stream.Next(); term != null; term = stream.Next())
                    {
                        Console.WriteLine(term.TermText());
                    }
                }

                // Blank line between analyzers.
                Console.WriteLine();
            }
        }
示例#58
0
        /// <summary>
        /// Converts the next Jieba segment from the iterator into a Lucene token.
        /// </summary>
        /// <returns>
        /// A token carrying the segment's word, start index and end index, or null when the
        /// iterator has no further segment.
        /// </returns>
        public Lucene.Net.Analysis.Token Next()
        {
            if (!iter.MoveNext())
            {
                return(null);
            }

            JiebaNet.Segmenter.Token segment = iter.Current;
            Lucene.Net.Analysis.Token produced =
                new Lucene.Net.Analysis.Token(segment.Word, segment.StartIndex, segment.EndIndex);

            // NOTE(review): length is never changed from 0, so this addition leaves start as it was.
            int length = 0;
            start += length;

            return(produced);
        }
示例#59
0
        /// <summary>
        /// Marks the pending token type as host and splits the token's text on '.' into
        /// <c>parts</c>, dropping a leading "www" component when present.
        /// </summary>
        /// <param name="token">Token whose term text is the host name to split.</param>
        private void ProcessURLToken(Lucene.Net.Analysis.Token token)
        {
            token_type = tokentype_host;
            parts      = token.TermText().Split('.');

            if (parts [0] == "www")
            {
                // remove initial www: copy everything after the first component into a shorter array
                string[] without_www = new string [parts.Length - 1];
                Array.Copy(parts, 1, without_www, 0, without_www.Length);
                parts = without_www;
            }

            // FIXME: Remove final tld
            // Any string of form "<alnum> '.')+<alnum>" has type HOST
            // Removing last token might remove important words from non-host
            // string of that form. To fix that, we need to match against the
            // huge list of TLDs.
        }
示例#60
0
        /// <summary>
        /// Emits the buffered input one character per token, lower-casing each character.
        /// </summary>
        /// <param name="token">Token to clear and fill with the next character.</param>
        /// <returns>The filled token, or null when no character is left.</returns>
        public override Token Next(Token token)
        {
            token.Clear();

            // The buffer is filled from the reader only while start is still 0.
            if (start == 0)
            {
                int charsRead = input.Read((System.Char[])ioBuffer, 0, ioBuffer.Length);
                length = charsRead;
                if (charsRead <= 0)
                {
                    return(null);
                }
            }

            bool allEmitted = start == length;
            if (allEmitted)
            {
                return(null);
            }

            // A single character at the current position becomes the whole term.
            token.SetTermBuffer(ioBuffer, start, 1);
            start += 1;

            char lowered = System.Char.ToLower(token.termBuffer[0]);
            token.termBuffer[0] = lowered;
            return(token);
        }