/// <summary>
/// Builds the analysis chain for a field: Jieba tokenization, then
/// lower-casing, then stop-word removal against <c>StopWords</c>.
/// </summary>
/// <param name="fieldName">Field being analyzed (unused by this chain).</param>
/// <param name="reader">Source text to tokenize.</param>
/// <returns>The fully-filtered token stream.</returns>
public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
{
    var tokenizer = new JiebaTokenizer(new JiebaSegmenter(), reader);
    var lowered = new LowerCaseFilter(tokenizer);
    return new StopFilter(true, lowered, StopWords);
}
/// <summary>
/// Produces the token stream for a field: Jieba segmentation followed by
/// lower-casing and stop-word filtering.
/// </summary>
/// <param name="fieldName">Field being analyzed (not consulted here).</param>
/// <param name="reader">Source text to tokenize.</param>
/// <returns>The filtered token stream.</returns>
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    var segmenter = new JiebaSegmenter();
    TokenStream stream = new JiebaTokenizer(segmenter, reader);
    // The query parser lower-cases terms, so the index side must do the same.
    stream = new LowerCaseFilter(stream);
    stream = new StopFilter(true, stream, StopWords);
    return stream;
}
/// <summary>
/// Creates a tokenizer over a string: registers the term/offset/type
/// attributes and eagerly tokenizes the whole input in search mode.
/// </summary>
/// <param name="seg">Segmenter used to produce tokens.</param>
/// <param name="input">Full text to tokenize up front.</param>
public JiebaTokenizer(JiebaSegmenter seg, string input)
{
    segmenter = seg;
    termAtt = AddAttribute<ITermAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();
    typeAtt = AddAttribute<ITypeAttribute>();
    // Tokenize everything now; iteration happens later via the token list.
    tokens = segmenter.Tokenize(input, TokenizerMode.Search).ToList();
}
/// <summary>
/// Initializes the TextRank keyword extractor: co-occurrence window of 5,
/// fresh segmenters, and stop words loaded from the configured file with a
/// fallback to the built-in defaults.
/// </summary>
public TextRankExtractor()
{
    Span = 5;
    Segmenter = new JiebaSegmenter();
    PosSegmenter = new PosSegmenter(Segmenter);

    SetStopWords(ConfigManager.StopWordsFile);
    // If the stop-words file produced nothing, fall back to the defaults.
    if (StopWords.IsEmpty())
    {
        StopWords.UnionWith(DefaultStopWords);
    }
}
/// <summary>
/// Builds the training set: allocates the word/character count tables,
/// loads the dictionary, then loads the training data and writes the
/// resulting dictionary out, logging timestamps to the console.
/// </summary>
public TrainingSet()
{
    countWordGroup = new Dictionary<string, Dictionary<string, int>>();
    countCharacterGroup = new Dictionary<string, Dictionary<string, int>>();
    segmenter = new JiebaSegmenter();
    trie = new WordDictionary();

    Console.WriteLine(System.DateTime.Now.ToString() + "词库加载完毕,开始加载训练集...");
    LoadTrainingSet();
    OutputDictionary();
    Console.WriteLine(System.DateTime.Now.ToString() + "训练集加载完毕...");
}
/// <summary>
/// Rebuilds the Lucene index from all articles and logs the elapsed time,
/// then redirects to the blog search test page.
/// </summary>
/// <returns>Redirect to /Test/BlogSearchTest.</returns>
public ActionResult CreateIndex()
{
    var segmenter = new JiebaSegmenter();
    // NOTE(review): "Bolg" looks like a typo for "Blog" — confirm before
    // changing; the literal feeds the segmenter's user dictionary.
    segmenter.AddWord("Bolg");

    BlogSearcher.ClearLuceneIndex();

    var timer = Stopwatch.StartNew();
    var articles = bllSession.IArticleBLL.GetList("");
    BlogSearcher.UpdateLuceneIndex(articles);
    timer.Stop();

    System.Diagnostics.Debug.WriteLine("执行时间:" + timer.ElapsedMilliseconds);
    return Redirect("/Test/BlogSearchTest");
}
/// <summary>
/// Initializes the TF-IDF keyword extractor: fresh segmenters, stop words
/// from the configured file (defaults as fallback), and IDF frequencies
/// loaded from the default IDF file.
/// </summary>
public TfidfExtractor()
{
    Segmenter = new JiebaSegmenter();
    PosSegmenter = new PosSegmenter(Segmenter);

    SetStopWords(ConfigManager.StopWordsFile);
    // Empty stop-word file → use the built-in defaults instead.
    if (StopWords.IsEmpty())
    {
        StopWords.UnionWith(DefaultStopWords);
    }

    Loader = new IdfLoader(DefaultIdfFile);
    IdfFreq = Loader.IdfFreq;
    MedianIdf = Loader.MedianIdf;
}
/// <summary>
/// Smoke test: registers a custom word, rebuilds the news index from the
/// repository, runs a one-character query, and prints every hit.
/// </summary>
private static void TestNewsData()
{
    var segmenter = new JiebaSegmenter();
    segmenter.AddWord("机器学习");

    NewsSearcher.ClearLuceneIndex();
    NewsSearcher.UpdateLuceneIndex(NewsRepository.GetAll());

    foreach (var hit in NewsSearcher.Search("进"))
    {
        Console.WriteLine(hit);
    }
}
/// <summary>
/// Interactive demo loop: reads a line from stdin and prints the result of
/// each Jieba cut mode (full, precise, new-word/HMM, search-engine,
/// disambiguation) until end of input.
/// </summary>
public static void Run()
{
    // Create the segmenter once: construction loads the dictionary, which
    // is far too expensive to repeat on every loop iteration.
    var segmenter = new JiebaSegmenter();

    while (true)
    {
        var str = Console.ReadLine();
        // ReadLine returns null at end of input; stop instead of passing
        // null into the segmenter.
        if (str == null)
        {
            break;
        }

        var segments = segmenter.Cut(str, cutAll: true);
        Console.WriteLine("【全模式】:{0}", string.Join("/ ", segments));

        segments = segmenter.Cut(str); // precise mode (the default)
        Console.WriteLine("【精确模式】:{0}", string.Join("/ ", segments));

        segments = segmenter.Cut(str); // precise mode, HMM enabled for new words
        Console.WriteLine("【新词识别】:{0}", string.Join("/ ", segments));

        segments = segmenter.CutForSearch(str); // search-engine mode
        Console.WriteLine("【搜索引擎模式】:{0}", string.Join("/ ", segments));

        segments = segmenter.Cut(str);
        Console.WriteLine("【歧义消除】:{0}", string.Join("/ ", segments));
    }
}
/// <summary>
/// Convenience overload: drains the reader with <c>ReadToEnd</c> and
/// delegates to the string-based constructor. The reader is consumed
/// eagerly here and is not disposed by this constructor.
/// </summary>
/// <param name="seg">Segmenter used to produce tokens.</param>
/// <param name="input">Reader whose entire remaining content is tokenized.</param>
public JiebaTokenizer(JiebaSegmenter seg, TextReader input) : this(seg, input.ReadToEnd()) { }
/// <summary>
/// Rebuilds the news index from the repository (after registering a custom
/// word) and renders the results of a fixed test query.
/// </summary>
/// <returns>View populated with the search hits for "方法研究".</returns>
public ActionResult TestSearch()
{
    var segmenter = new JiebaSegmenter();
    segmenter.AddWord("机器学习");

    NewsSearcher.ClearLuceneIndex();
    NewsSearcher.UpdateLuceneIndex(NewsRepository.GetAll());

    return View(NewsSearcher.Search("方法研究"));
}