/// <summary>
/// Verifies default-mode tokenization of a company name: the expected token
/// count, and that the last token's EndIndex equals the input length —
/// both without and with an embedded space.
/// </summary>
public void TestTokenizeWithSpace()
{
    var segmenter = new JiebaSegmenter();

    var input = "永和服装饰品有限公司";
    var result = segmenter.Tokenize(input).ToList();
    Assert.That(result.Count, Is.EqualTo(4));
    Assert.That(result.Last().EndIndex, Is.EqualTo(input.Length));

    // A space splits the text into one extra token; EndIndex still covers the full string.
    input = "永和服装饰品 有限公司";
    result = segmenter.Tokenize(input).ToList();
    Assert.That(result.Count, Is.EqualTo(5));
    Assert.That(result.Last().EndIndex, Is.EqualTo(input.Length));
}
/// <summary>
/// Prints the tokens of a sample sentence in default mode, a blank line,
/// then the tokens of the same sentence in search mode.
/// </summary>
public void TestTokenize()
{
    const string sentence = "小明最近在学习机器学习、自然语言处理、云计算和大数据";
    var segmenter = new JiebaSegmenter();

    foreach (var token in segmenter.Tokenize(sentence))
    {
        Console.WriteLine(token);
    }

    Console.WriteLine();

    foreach (var token in segmenter.Tokenize(sentence, TokenizerMode.Search))
    {
        Console.WriteLine(token);
    }
}
/// <summary>
/// Segments <paramref name="segmenterString"/> in search mode and returns
/// the word of each resulting token, in order.
/// </summary>
/// <param name="segmenterString">The text to segment.</param>
/// <returns>The segmented words (lazily evaluated via LINQ).</returns>
public static IEnumerable<string> SegmentFor(string segmenterString)
{
    // Fixed modifier order (`static public` -> `public static`) and removed
    // the non-idiomatic `return(...)` call-style parentheses.
    var jiebaSegmenter = new JiebaSegmenter();
    return jiebaSegmenter.Tokenize(segmenterString, TokenizerMode.Search)
                         .Select(x => x.Word);
}
/// <summary>
/// Creates a tokenizer over <paramref name="input"/>: stores the segmenter,
/// registers the Lucene per-token attributes, and materializes all tokens
/// up front in search mode.
/// </summary>
public JiebaTokenizer(JiebaSegmenter seg, string input)
{
    segmenter = seg;

    // Lucene attributes exposed for each emitted token.
    termAtt = AddAttribute<ITermAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();
    typeAtt = AddAttribute<ITypeAttribute>();

    // Tokenize the whole input once, eagerly.
    tokens = seg.Tokenize(input, TokenizerMode.Search).ToList();
}
/// <summary>
/// Resets the token stream: re-reads the underlying input, re-tokenizes it,
/// strips stop words, and restarts the word enumerator.
/// </summary>
public override void Reset()
{
    base.Reset();

    _inputText = ReadToEnd(base.m_input);
    var segmented = _segmenter.Tokenize(_inputText, _mode);
    RemoveStopWords(segmented);
    _iter = _wordList.GetEnumerator();
}
/// <summary>
/// Demo: tokenizes a sample company name in search mode and prints each
/// token's word together with its start and end character offsets.
/// </summary>
public void TokenizeSearchDemo()
{
    const string text = "永和服装饰品有限公司";
    var segmenter = new JiebaSegmenter();

    foreach (var t in segmenter.Tokenize(text, TokenizerMode.Search))
    {
        Console.WriteLine("word {0,-12} start: {1,-3} end: {2,-3}", t.Word, t.StartIndex, t.EndIndex);
    }
}
/// <summary>
/// Resets the token stream: re-reads the underlying input, re-tokenizes it,
/// strips stop words, rewinds the position counter, and restarts the word
/// enumerator.
/// </summary>
public override void Reset()
{
    base.Reset();

    _InputText = ReadToEnd(base.m_input);
    var segmented = segmenter.Tokenize(_InputText, mode);
    RemoveStopWords(segmented);

    start = 0;
    iter = _WordList.GetEnumerator();
}
/// <summary>
/// Search-mode button handler: tokenizes the contents of the input text box
/// in search mode and writes each token's word and character offsets to the
/// response, one per line.
/// </summary>
/// <param name="sender">Event source.</param>
/// <param name="e">Event data.</param>
protected void btnSearch_Click(object sender, EventArgs e)
{
    var seg = new JiebaSegmenter();

    foreach (var token in seg.Tokenize(txtInput.Text, TokenizerMode.Search))
    {
        // Interpolated form of the original string.Format call; the alignment
        // specifiers ({,-12}/{,-3}) produce byte-identical output.
        Response.Write($"word {token.Word,-12} start: {token.StartIndex,-3} end: {token.EndIndex,-3}<br>");
    }
}
/// <summary>
/// Resets the token stream: re-reads the underlying input, tokenizes it with
/// the segmenter's default mode, removes stop words, rewinds the position
/// counter, and restarts the word enumerator.
/// </summary>
public override void Reset()
{
    base.Reset();

    _InputText = ReadToEnd(base.m_input);

    // Renamed the misspelled local (`jiabeToken`) and deleted the block of
    // commented-out dead code that was left in the original body.
    var jiebaTokens = segmenter.Tokenize(_InputText);
    RemoveStopWords(jiebaTokens);

    start = 0;
    iter = _WordList.GetEnumerator();
}
/// <summary>
/// Tokenizes <paramref name="text"/> by delegating to the underlying segmenter.
/// </summary>
/// <param name="text">The text to tokenize.</param>
/// <param name="mode">Tokenizer mode; defaults to search mode.</param>
/// <returns>The tokens produced by the segmenter.</returns>
public IEnumerable<Segmenter.Token> Tokenize(string text, TokenizerMode mode = TokenizerMode.Search)
    => segmenter.Tokenize(text, mode);
/// <summary>
/// Registers two custom dictionary words, then prints the tokens of a sample
/// sentence in default mode followed by search mode.
/// </summary>
public void TestTokenize()
{
    const string sentence = "小明最近在学习机器学习、自然语言处理、云计算和大数据";

    var segmenter = new JiebaSegmenter();
    segmenter.AddWord("机器学习");
    segmenter.AddWord("自然语言处理");

    foreach (var token in segmenter.Tokenize(sentence))
    {
        Console.WriteLine(token);
    }

    foreach (var token in segmenter.Tokenize(sentence, TokenizerMode.Search))
    {
        Console.WriteLine(token);
    }
}
/// <summary>
/// Segmentation hook: tokenizes <paramref name="segmenterString"/> in search
/// mode. Override in a derived class to plug in a different segmentation
/// strategy.
/// </summary>
/// <param name="segmenterString">The text to segment.</param>
/// <returns>One <see cref="SegmenterToken"/> (word and start index) per token.</returns>
protected virtual IEnumerable<SegmenterToken> Segmenter(string segmenterString)
{
    // Fixed modifier order (`virtual protected` -> `protected virtual`) and
    // removed the non-idiomatic `return(...)` call-style parentheses.
    return jiebaSegmenter.Tokenize(segmenterString, TokenizerMode.Search)
                         .Select(x => new SegmenterToken(x.Word, x.StartIndex));
}