Example #1
        public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
        {
            var seg = new JiebaSegmenter();
            TokenStream result = new JiebaTokenizer(seg, reader);

            result = new LowerCaseFilter(result);
            result = new StopFilter(true, result, StopWords);
            return result;
        }
Example #2
 public override TokenStream TokenStream(string fieldName, TextReader reader)
 {
     var seg = new JiebaSegmenter();
     TokenStream result = new JiebaTokenizer(seg, reader);
     // This filter is necessary, because the parser converts the queries to lower case.
     result = new LowerCaseFilter(result);
     result = new StopFilter(true, result, StopWords);
     return result;
 }
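Examples #1 and #2 show only the TokenStream override. Below is a minimal sketch of the enclosing analyzer, assuming Lucene.Net 3.0.3 and jieba.NET; the class name JiebaAnalyzer and the stop-word set are illustrative assumptions, not taken from the snippets above, and JiebaTokenizer is the custom tokenizer shown in Examples #3 and #10.

    using System.Collections.Generic;
    using System.IO;
    using JiebaNet.Segmenter;
    using Lucene.Net.Analysis;

    // Sketch of a Jieba-based analyzer; the stop-word list is a placeholder.
    public class JiebaAnalyzer : Analyzer
    {
        protected static readonly ISet<string> StopWords =
            new HashSet<string> { "的", "了", "和" };

        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            var seg = new JiebaSegmenter();
            TokenStream result = new JiebaTokenizer(seg, reader);
            // Lower-case first so that queries parsed in lower case still match.
            result = new LowerCaseFilter(result);
            result = new StopFilter(true, result, StopWords);
            return result;
        }
    }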
Example #3
        public JiebaTokenizer(JiebaSegmenter seg, string input)
        {
            segmenter = seg;
            termAtt = AddAttribute<ITermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
            typeAtt = AddAttribute<ITypeAttribute>();

            var text = input;
            tokens = segmenter.Tokenize(text, TokenizerMode.Search).ToList();
        }
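The constructor above only buffers the segmentation result; the matching IncrementToken then walks that list. The following is a sketch assuming Lucene.Net 3.0.3 attribute APIs and jieba.NET's Token (Word, StartIndex, EndIndex); the position field is a hypothetical index into the tokens list.

    // Sketch: emit the buffered tokens one at a time.
    // "position" is an assumed private int field starting at 0.
    public override bool IncrementToken()
    {
        ClearAttributes();
        if (position < tokens.Count)
        {
            var token = tokens[position++];
            termAtt.SetTermBuffer(token.Word);
            offsetAtt.SetOffset(token.StartIndex, token.EndIndex);
            typeAtt.Type = "Jieba";
            return true;
        }
        return false;
    }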
Example #4
        public TextRankExtractor()
        {
            Span = 5;

            Segmenter = new JiebaSegmenter();
            PosSegmenter = new PosSegmenter(Segmenter);
            SetStopWords(ConfigManager.StopWordsFile);
            if (StopWords.IsEmpty())
            {
                StopWords.UnionWith(DefaultStopWords);
            }
        }
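A short usage sketch for the extractor constructed above, assuming jieba.NET's ExtractTags(text, count) overload; the sample text and count are illustrative.

    var extractor = new TextRankExtractor();
    // Top 5 keywords ranked by TextRank.
    var keywords = extractor.ExtractTags("机器学习是人工智能的一个重要分支", count: 5);
    Console.WriteLine(string.Join("/ ", keywords));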
Example #5
 public TrainingSet()
 {
     countWordGroup = new Dictionary<string, Dictionary<string, int>>();
     countCharacterGroup = new Dictionary<string, Dictionary<string, int>>();
     //countHead = new Dictionary<string, int>();
     segmenter = new JiebaSegmenter();
     trie = new WordDictionary();
     Console.WriteLine(System.DateTime.Now.ToString() + " Dictionary loaded; loading the training set...");
     LoadTrainingSet();
     OutputDictionary();
     Console.WriteLine(System.DateTime.Now.ToString() + " Training set loaded...");
 }
Example #6
        /// <summary>
        /// Creates the Lucene index.
        /// </summary>
        /// <returns></returns>
        public ActionResult CreateIndex()
        {
            var seg = new JiebaSegmenter();
            seg.AddWord("Bolg");

            BlogSearcher.ClearLuceneIndex();
            Stopwatch st = new Stopwatch(); // create a stopwatch
            st.Start(); // start timing
            var data = bllSession.IArticleBLL.GetList("");
            BlogSearcher.UpdateLuceneIndex(data);
            st.Stop(); // stop timing
            System.Diagnostics.Debug.WriteLine("Execution time: " + st.ElapsedMilliseconds);
            return Redirect("/Test/BlogSearchTest");
        }
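BlogSearcher itself is not shown in this example. The following is a hypothetical sketch of what UpdateLuceneIndex could look like with Lucene.Net 3.0.3 and a Jieba-based analyzer such as the JiebaAnalyzer sketched after Example #2; the Article type, field names, and index path are all assumptions.

    using System.Collections.Generic;
    using System.IO;
    using Lucene.Net.Documents;
    using Lucene.Net.Index;
    using Lucene.Net.Store;

    // Hypothetical indexing helper; not the real BlogSearcher code.
    public static void UpdateLuceneIndex(IEnumerable<Article> articles)
    {
        var directory = FSDirectory.Open(new DirectoryInfo("lucene_index"));
        var analyzer = new JiebaAnalyzer();
        using (var writer = new IndexWriter(directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED))
        {
            foreach (var article in articles)
            {
                var doc = new Document();
                doc.Add(new Field("Title", article.Title, Field.Store.YES, Field.Index.ANALYZED));
                doc.Add(new Field("Content", article.Content, Field.Store.YES, Field.Index.ANALYZED));
                writer.AddDocument(doc);
            }
        }
    }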
Example #7
        public TfidfExtractor()
        {
            Segmenter = new JiebaSegmenter();
            PosSegmenter = new PosSegmenter(Segmenter);
            SetStopWords(ConfigManager.StopWordsFile);
            if (StopWords.IsEmpty())
            {
                StopWords.UnionWith(DefaultStopWords);
            }

            Loader = new IdfLoader(DefaultIdfFile);

            IdfFreq = Loader.IdfFreq;
            MedianIdf = Loader.MedianIdf;
        }
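Like the TextRank extractor, TfidfExtractor is normally used through ExtractTags once constructed; a brief illustrative call (the text and count are arbitrary):

    var extractor = new TfidfExtractor();
    // Top 10 keywords ranked by TF-IDF.
    var keywords = extractor.ExtractTags("小明硕士毕业于中国科学院计算所,后在日本京都大学深造", count: 10);
    Console.WriteLine(string.Join("/ ", keywords));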
Example #8
        private static void TestNewsData()
        {
            var seg = new JiebaSegmenter();
            seg.AddWord("机器学习");

            NewsSearcher.ClearLuceneIndex();

            var data = NewsRepository.GetAll();
            NewsSearcher.UpdateLuceneIndex(data);

            var results = NewsSearcher.Search("进");
            foreach (var result in results)
            {
                Console.WriteLine(result);
            }
        }
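NewsSearcher.Search is also not shown; on the query side it would typically parse the keywords with the same Jieba-based analyzer used at index time. A hypothetical sketch assuming Lucene.Net 3.0.3 and the JiebaAnalyzer sketched earlier; the index path, field names, and result mapping are assumptions.

    using System.Collections.Generic;
    using System.IO;
    using System.Linq;
    using Lucene.Net.QueryParsers;
    using Lucene.Net.Search;
    using Lucene.Net.Store;

    // Hypothetical query helper; not the real NewsSearcher code.
    public static IEnumerable<string> Search(string keywords)
    {
        var directory = FSDirectory.Open(new DirectoryInfo("lucene_index"));
        using (var searcher = new IndexSearcher(directory, true)) // read-only
        {
            var parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, "Content", new JiebaAnalyzer());
            var query = parser.Parse(keywords);
            var hits = searcher.Search(query, 20).ScoreDocs;
            return hits.Select(hit => searcher.Doc(hit.Doc).Get("Title")).ToList();
        }
    }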
Example #9
        public static void Run() {
            while (true) {

                var str = Console.ReadLine();
                if (string.IsNullOrEmpty(str)) break; // stop on empty input or EOF
                var segmenter = new JiebaSegmenter();
                var segments = segmenter.Cut(str, cutAll: true);
                Console.WriteLine("【全模式】:{0}", string.Join("/ ", segments));

                segments = segmenter.Cut(str); // 默认为精确模式
                Console.WriteLine("【精确模式】:{0}", string.Join("/ ", segments));

                segments = segmenter.Cut(str); // 默认为精确模式,同时也使用HMM模型
                Console.WriteLine("【新词识别】:{0}", string.Join("/ ", segments));

                segments = segmenter.CutForSearch(str); // 搜索引擎模式
                Console.WriteLine("【搜索引擎模式】:{0}", string.Join("/ ", segments));

                segments = segmenter.Cut(str);
                Console.WriteLine("【歧义消除】:{0}", string.Join("/ ", segments));
            }
        }
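The loop above exercises the segmentation modes through their defaults. Written out with explicit arguments (jieba.NET's Cut takes cutAll and hmm flags, and CutForSearch re-splits long words; the sample sentence is illustrative):

    var segmenter = new JiebaSegmenter();
    var text = "结过婚的和尚未结过婚的";

    // Full mode: list every dictionary word found in the text.
    var full = segmenter.Cut(text, cutAll: true);

    // Precise mode (the default): cutAll off, HMM on, which is also what
    // enables the new-word recognition shown above.
    var precise = segmenter.Cut(text, cutAll: false, hmm: true);

    // Search-engine mode: additionally re-splits long words for better recall.
    var forSearch = segmenter.CutForSearch(text);

    Console.WriteLine(string.Join("/ ", full));
    Console.WriteLine(string.Join("/ ", precise));
    Console.WriteLine(string.Join("/ ", forSearch));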
Example #10
 public JiebaTokenizer(JiebaSegmenter seg, TextReader input)
     : this(seg, input.ReadToEnd())
 {
 }
Example #11
        public ActionResult TestSearch()
        {
            var seg = new JiebaSegmenter();
            seg.AddWord("机器学习");

            NewsSearcher.ClearLuceneIndex();

            var data = NewsRepository.GetAll();
            NewsSearcher.UpdateLuceneIndex(data);

            var results = NewsSearcher.Search("方法研究");

            return View(results);
        }
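The AddWord call above is what keeps "机器学习" in one piece when documents are segmented for indexing. A small sketch of the effect; DeleteWord is the counterpart in jieba.NET's API, and the sentence is illustrative.

    var seg = new JiebaSegmenter();

    // Without the custom entry, "机器学习" may be split into "机器" and "学习".
    Console.WriteLine(string.Join("/ ", seg.Cut("机器学习方法研究")));

    // Register the phrase so it is emitted as a single token.
    seg.AddWord("机器学习");
    Console.WriteLine(string.Join("/ ", seg.Cut("机器学习方法研究")));

    // Remove it again if the default behaviour is preferred.
    seg.DeleteWord("机器学习");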