예제 #1
0
        private readonly string SESSION_NAME = "ListResultHrefs"; // Session key under which the crawled href list is cached
        /// <summary>
        /// On the first (non-postback) request: crawls hrefs, segments every title into
        /// keywords, writes the frequently occurring words to "tieba.json" and caches the
        /// crawled hrefs in Session under <c>SESSION_NAME</c>.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        protected void Page_Load(object sender, EventArgs e)
        {
            if (IsPostBack)
            {
                return;
            }

            Session.RemoveAll();                               // clear existing cache entries to avoid conflicts
            List<ExampleMyHref> resultHrefs = HrefCrawler(15); // crawl the first 15 pages

            // Segment each title and remember its distinct keywords (first-seen order).
            var segmenter = new JiebaSegmenter();
            var titles    = new List<string>();
            foreach (ExampleMyHref href in resultHrefs)
            {
                titles.Add(href.HrefTitle + ";");

                // The set gives an O(1) duplicate check instead of List.Contains per word.
                var seenWords = new HashSet<string>();
                href.KeywordList = new List<string>();
                foreach (var word in segmenter.CutForSearch(href.HrefTitle))
                {
                    if (seenWords.Add(word))
                    {
                        href.KeywordList.Add(word);
                    }
                }
            }

            // Segment all titles together; words with a frequency of at least 20 go into the JSON.
            var allWords    = segmenter.CutForSearch(string.Concat(titles));
            var countedKeys = new HashSet<string>();
            var jsonEntries = new List<string>();
            foreach (var item in allWords.Distinct())
            {
                if (item.Length < 2 || item.Length > 5)
                {
                    continue;
                }

                // De-duplicate on the trimmed form, which is the form that gets emitted.
                // Checking the raw token but storing the trimmed one could throw on a duplicate key.
                string name = item.Trim();
                if (!countedKeys.Add(name))
                {
                    continue;
                }

                int f = GetFrequency(allWords, item); // word frequency
                if (f >= 20)
                {
                    // Escape the word so quotes/backslashes cannot break the JSON document.
                    jsonEntries.Add("{\"name\":\"" + System.Web.HttpUtility.JavaScriptStringEncode(name)
                                    + "\",\"value\":" + f + "}");
                }
            }
            WriteData("tieba.json", "[" + string.Join(",", jsonEntries) + "]");

            Session[SESSION_NAME] = resultHrefs; // cache the crawled data in Session
        }
예제 #2
0
        /// <summary>
        /// Console demo: loads "./stopwords.txt" and prints jieba segmentation results for
        /// several sample sentences, then waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            List<string> stopWords = new List<string>();
            string       stopUrl   = "./stopwords.txt";

            // Dispose the reader deterministically; previously the file handle was never released.
            using (StreamReader rd = File.OpenText(stopUrl))
            {
                string line;
                while ((line = rd.ReadLine()) != null)
                {
                    stopWords.Add(line);
                }
            }
            // NOTE(review): stopWords is loaded but not used by the demo below.

            var segmenter = new JiebaSegmenter();
            var segments  = segmenter.Cut("我来到北京清华大学", cutAll: true);

            Console.WriteLine("【全模式】:{0}", string.Join("/ ", segments));

            segments = segmenter.Cut("我来到北京清华大学");  // accurate mode is the default
            Console.WriteLine("【精确模式】:{0}", string.Join("/ ", segments));

            segments = segmenter.Cut("他来到了网易杭研大厦");  // accurate mode by default, also using the HMM model
            Console.WriteLine("【新词识别】:{0}", string.Join("/ ", segments));

            segments = segmenter.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造"); // search-engine mode
            Console.WriteLine("【搜索引擎模式】:{0}", string.Join("/ ", segments));

            segments = segmenter.Cut("结过婚的和尚未结过婚的");
            Console.WriteLine("【歧义消除】:{0}", string.Join("/ ", segments));
            Console.ReadKey();
        }
예제 #3
0
        /// <summary>
        /// Prints jieba segmentation results for a series of sample sentences to the console,
        /// ending with a sample cut after deleting and re-adding the word "湖南".
        /// </summary>
        public void CutDemo()
        {
            var segmenter = new JiebaSegmenter();

            // Full mode (cutAll: true).
            Console.WriteLine("【全模式】:{0}",
                              string.Join("/ ", segmenter.Cut("我来到北京清华大学", cutAll: true)));

            // Accurate mode is the default.
            Console.WriteLine("【精确模式】:{0}",
                              string.Join("/ ", segmenter.Cut("我来到北京清华大学")));

            // Accurate mode by default, also using the HMM model.
            Console.WriteLine("【新词识别】:{0}",
                              string.Join("/ ", segmenter.Cut("他来到了网易杭研大厦")));

            // Search-engine mode.
            Console.WriteLine("【搜索引擎模式】:{0}",
                              string.Join("/ ", segmenter.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造")));

            Console.WriteLine("【歧义消除】:{0}",
                              string.Join("/ ", segmenter.Cut("结过婚的和尚未结过婚的")));

            Console.WriteLine("【歧义消除】:{0}",
                              string.Join("/ ", segmenter.Cut("北京大学生喝进口红酒")));

            Console.WriteLine("【歧义消除】:{0}",
                              string.Join("/ ", segmenter.Cut("在北京大学生活区喝进口红酒")));

            Console.WriteLine("【精确模式】:{0}",
                              string.Join("/ ", segmenter.Cut("腾讯视频致力于打造中国最大的在线视频媒体平台,以丰富的内容、极致的观看体验")));

            // Remove and re-register "湖南" before the final cut ("长沙市" is intentionally not added).
            segmenter.DeleteWord("湖南");
            segmenter.AddWord("湖南");
            Console.WriteLine("【精确模式】:{0}",
                              string.Join("/ ", segmenter.Cut("湖南长沙市天心区")));
        }
예제 #4
0
 /// <summary>
 /// Replaces every entry of <paramref name="text"/> in place with its search-mode
 /// segmentation, joined by single spaces, after dropping tokens that equal an entry of
 /// the <c>list</c> collection defined elsewhere in this class.
 /// </summary>
 /// <param name="text">Entries to segment; each element is overwritten with the joined tokens.</param>
 static void ApartWords(ref ArrayList text)
 {
     // One segmenter serves every entry; constructing it inside the loop was loop-invariant work.
     var segmenter = new JiebaSegmenter();

     for (int i = 0; i < text.Count; i++)
     {
         // Replace already strips leading/trailing spaces, so the former Trim(' ') was redundant.
         string str  = text[i].ToString().Replace(" ", "");
         var    kept = new List<string>();

         foreach (string token in segmenter.CutForSearch(str))
         {
             bool excluded = false;
             foreach (var k in list)
             {
                 if (token == k.ToString())
                 {
                     excluded = true;
                     break;
                 }
             }
             if (!excluded)
             {
                 kept.Add(token);
             }
         }
         text[i] = string.Join(" ", kept);
     }
 }
예제 #5
0
        /// <summary>
        /// Builds a full-text-search vector from HTML content: converts the HTML to text,
        /// segments it in search mode, keeps tokens longer than one character (upper-cased)
        /// and parses the space-joined tokens into an <c>NpgsqlTsVector</c>.
        /// </summary>
        /// <param name="content">HTML content to index.</param>
        /// <returns>The parsed vector, or null when the content is blank or processing fails.</returns>
        public NpgsqlTsVector GetNpgsqlTsVector(string content)
        {
            // Guard first so no helper objects are created for blank input.
            if (string.IsNullOrWhiteSpace(content))
            {
                return null;
            }

            try
            {
                string plainText = new HtmlToTextHelper().Convert(content);
                var    cutList   = new List<string>();
                foreach (var item in new JiebaSegmenter().CutForSearch(plainText, hmm: true))
                {
                    if (item.Length > 1)
                    {
                        // Invariant casing keeps the indexed tokens independent of the server culture.
                        cutList.Add(item.ToUpperInvariant());
                    }
                }
                return NpgsqlTsVector.Parse(string.Join(" ", cutList));
            }
            catch (Exception ex)
            {
                // Best effort: callers receive null on failure, but the cause is no longer lost silently.
                System.Diagnostics.Debug.WriteLine(ex);
                return null;
            }
        }
예제 #6
0
        /// <summary>
        /// Console walkthrough of jieba segmentation: full mode, accurate mode, search-engine
        /// mode, several ambiguity samples, and a cut after deleting and re-adding "湖南".
        /// </summary>
        public void CutDemo()
        {
            var segmenter = new JiebaSegmenter();

            // Full mode (cutAll: true).
            Console.WriteLine("【全模式】:{0}",
                              string.Join("/ ", segmenter.Cut("我来到北京清华大学", cutAll: true)));

            // Accurate mode is the default.
            Console.WriteLine("【精确模式】:{0}",
                              string.Join("/ ", segmenter.Cut("我来到北京清华大学")));

            // Accurate mode by default, also using the HMM model.
            Console.WriteLine("【新词识别】:{0}",
                              string.Join("/ ", segmenter.Cut("他来到了网易杭研大厦")));

            // Search-engine mode.
            Console.WriteLine("【搜索引擎模式】:{0}",
                              string.Join("/ ", segmenter.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造")));

            Console.WriteLine("【歧义消除】:{0}",
                              string.Join("/ ", segmenter.Cut("结过婚的和尚未结过婚的")));

            Console.WriteLine("【歧义消除】:{0}",
                              string.Join("/ ", segmenter.Cut("北京大学生喝进口红酒")));

            Console.WriteLine("【歧义消除】:{0}",
                              string.Join("/ ", segmenter.Cut("在北京大学生活区喝进口红酒")));

            Console.WriteLine("【精确模式】:{0}",
                              string.Join("/ ", segmenter.Cut("腾讯视频致力于打造中国最大的在线视频媒体平台,以丰富的内容、极致的观看体验")));

            // Remove and re-register "湖南" before the final cut ("长沙市" is intentionally not added).
            segmenter.DeleteWord("湖南");
            segmenter.AddWord("湖南");
            Console.WriteLine("【精确模式】:{0}",
                              string.Join("/ ", segmenter.Cut("湖南长沙市天心区")));
        }
        /// <summary>
        /// Splits a search keyword into candidate terms: the keyword itself, the regex
        /// matches and their capture groups, and jieba search-mode tokens. Terms shorter than
        /// two characters or starting with punctuation are dropped; the rest are returned
        /// longest first. Results are cached per keyword for one hour.
        /// </summary>
        /// <param name="keyword">The raw search keyword.</param>
        /// <returns>Candidate terms ordered by descending length.</returns>
        public List<string> CutKeywords(string keyword)
        {
            if (_memoryCache.TryGetValue(keyword, out List<string> cached))
            {
                return cached;
            }

            var candidates = new HashSet<string> { keyword };

            foreach (Match match in Regex.Matches(keyword, @"(([A-Z]*[a-z]*)[\d]*)([\u4E00-\u9FA5]+)*((?!\p{P}).)*"))
            {
                candidates.Add(match.Value);
                foreach (Group grp in match.Groups)
                {
                    candidates.Add(grp.Value);
                }
            }

            // Add the search-mode tokens in the order the segmenter yields them.
            candidates.UnionWith(new JiebaSegmenter().CutForSearch(keyword));

            // Drop terms that are too short or begin with a punctuation character.
            candidates.RemoveWhere(s => s.Length < 2 || Regex.IsMatch(s, @"^\p{P}.*"));

            List<string> ordered = candidates.OrderByDescending(s => s.Length).ToList();
            _memoryCache.Set(keyword, ordered, TimeSpan.FromHours(1));
            return ordered;
        }
예제 #8
0
        /// <summary>
        /// Prints jieba segmentation results (full, accurate, search-engine modes and
        /// ambiguity samples) for fixed sample sentences to the console.
        /// </summary>
        public void CutDemo()
        {
            var segmenter = new JiebaSegmenter();

            // Full mode (cutAll: true).
            Console.WriteLine("【全模式】:{0}",
                              string.Join("/ ", segmenter.Cut("我来到北京清华大学", cutAll: true)));

            // Accurate mode is the default.
            Console.WriteLine("【精确模式】:{0}",
                              string.Join("/ ", segmenter.Cut("我来到北京清华大学")));

            // Accurate mode by default, also using the HMM model.
            Console.WriteLine("【新词识别】:{0}",
                              string.Join("/ ", segmenter.Cut("他来到了网易杭研大厦")));

            // Search-engine mode.
            Console.WriteLine("【搜索引擎模式】:{0}",
                              string.Join("/ ", segmenter.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造")));

            Console.WriteLine("【歧义消除】:{0}",
                              string.Join("/ ", segmenter.Cut("结过婚的和尚未结过婚的")));

            Console.WriteLine("【歧义消除】:{0}",
                              string.Join("/ ", segmenter.Cut("北京大学生喝进口红酒")));

            Console.WriteLine("【歧义消除】:{0}",
                              string.Join("/ ", segmenter.Cut("在北京大学生活区喝进口红酒")));
        }
예제 #9
0
파일: Keywords.cs 프로젝트: bsed/Buy
        /// <summary>
        /// Segments <paramref name="text"/> in search mode, counts each token longer than one
        /// character, adds those counts to <c>CouponNameCount</c> of keywords already stored,
        /// inserts the remaining tokens as new keywords, and saves the changes.
        /// </summary>
        /// <param name="text">Text whose words should be counted.</param>
        public static void Add(string text)
        {
            var segmenter  = new JiebaSegmenter();
            var wordCounts = (from token in segmenter.CutForSearch(text)
                              group token by token into occurrences
                              where occurrences.Key.Length > 1
                              select new { Key = occurrences.Key, Count = occurrences.Count() }).ToList();

            // The db context is not created here; it comes from the enclosing scope.
            var candidateWords = wordCounts.Select(w => w.Key).ToList();
            var existing       = db.Keywords.Where(k => candidateWords.Contains(k.Word)).ToList();
            var existingWords  = existing.Select(k => k.Word).ToList();

            // Bump the counter of every keyword that is already stored.
            foreach (var keyword in existing)
            {
                var match = wordCounts.FirstOrDefault(w => w.Key == keyword.Word);
                keyword.CouponNameCount += match?.Count ?? 0;
            }

            // Everything not stored yet becomes a new keyword row.
            var newKeywords = new List<Keyword>();
            foreach (var entry in wordCounts)
            {
                if (!existingWords.Contains(entry.Key))
                {
                    newKeywords.Add(new Keyword
                    {
                        CouponNameCount = entry.Count,
                        Word            = entry.Key
                    });
                }
            }

            db.Keywords.AddRange(newKeywords);
            db.SaveChanges();
        }
예제 #10
0
        /// <summary>
        /// Splits <paramref name="strWords"/> using the mode selected in <c>comboBoxCutMode</c>
        /// (0: Cut, 1: CutForSearch, 2: TF-IDF tag extraction, otherwise TextRank tag
        /// extraction) and returns the pieces that are not contained in <c>stopwordsList</c>.
        /// </summary>
        /// <param name="strWords">Text to split.</param>
        /// <returns>The space-separated pieces of the result, minus stop words.</returns>
        private List<string> WordSplitResult(string strWords)
        {
            IEnumerable<string> tokens;
            int mode = comboBoxCutMode.SelectedIndex;

            if (mode == 0)
            {
                tokens = segmenter.Cut(strWords);
            }
            else if (mode == 1)
            {
                tokens = segmenter.CutForSearch(strWords);
            }
            else if (mode == 2)
            {
                tokens = new TfidfExtractor().ExtractTags(strWords, 20, Constants.NounAndVerbPos);
            }
            else
            {
                tokens = new TextRankExtractor().ExtractTags(strWords, 20, Constants.NounAndVerbPos);
            }

            // Join and re-split on spaces, so a token containing a space yields several pieces.
            string[] pieces = string.Join(" ", tokens).Split(' ');

            var kept = new List<string>();
            foreach (string piece in pieces)
            {
                if (stopwordsList.Contains(piece))
                {
                    continue;
                }
                kept.Add(piece);
            }
            return kept;
        }
예제 #11
0
        /// <summary>
        /// Segments the text of <c>richTextBox1</c> in search-engine mode and passes the
        /// formatted result to <c>MsgDtlHandle</c>.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void button1_Click(object sender, EventArgs e)
        {
            var    segmenter = new JiebaSegmenter();
            string joined    = string.Join("/ ", segmenter.CutForSearch(this.richTextBox1.Text));

            MsgDtlHandle(string.Format("【搜索引擎模式】:{0}", joined));
        }
예제 #12
0
        /// <summary>
        /// Console demo: loads "userdict.txt" as a user dictionary, prints segmentation
        /// results for several sample sentences and waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            var segmenter = new JiebaSegmenter();
            segmenter.LoadUserDict("userdict.txt");

            // Full mode (cutAll: true).
            Console.WriteLine("【全模式】:{0}",
                              string.Join("/ ", segmenter.Cut("我来到北京清华大学", cutAll: true)));

            // Accurate mode is the default.
            Console.WriteLine("【精确模式】:{0}",
                              string.Join("/ ", segmenter.Cut("我来到北京清华大学")));

            // Accurate mode by default, also using the HMM model.
            Console.WriteLine("【新词识别】:{0}",
                              string.Join("/ ", segmenter.Cut("他来到了网易杭研大厦")));

            // Search-engine mode.
            Console.WriteLine("【搜索引擎模式】:{0}",
                              string.Join("/ ", segmenter.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造")));

            Console.WriteLine("【歧义消除】:{0}",
                              string.Join("/ ", segmenter.Cut("结过婚的和尚未结过婚的")));

            // Sample labelled as exercising the user dictionary.
            Console.WriteLine("【用户字典】:{0}",
                              string.Join("/ ", segmenter.Cut("linezerodemo机器学习学习机器")));

            Console.ReadKey();
        }
        /// <summary>
        /// Segments a string with jieba.NET in search mode (the finer-grained mode intended
        /// for retrieval) and joins the tokens with single spaces.
        /// </summary>
        /// <param name="str">Input string.</param>
        /// <returns>The tokens joined by spaces.</returns>
        public string JiebaSeg(string str)
        {
            return string.Join(" ", new JiebaSegmenter().CutForSearch(str));
        }
예제 #14
0
        /// <summary>
        /// Console demo: loads "userdict.txt", prints segmentation results for several
        /// sample sentences, then prints the five most common tokens of a sample paragraph
        /// and waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            var segmenter = new JiebaSegmenter();
            segmenter.LoadUserDict("userdict.txt");

            // Full mode (cutAll: true).
            Console.WriteLine("【全模式】:{0}",
                              string.Join("/ ", segmenter.Cut("我来到北京清华大学", cutAll: true)));

            // Accurate mode is the default.
            Console.WriteLine("【精确模式】:{0}",
                              string.Join("/ ", segmenter.Cut("我来到北京清华大学")));

            // Accurate mode by default, also using the HMM model.
            Console.WriteLine("【新词识别】:{0}",
                              string.Join("/ ", segmenter.Cut("他来到了网易杭研大厦")));

            // Search-engine mode.
            Console.WriteLine("【搜索引擎模式】:{0}",
                              string.Join("/ ", segmenter.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造")));

            Console.WriteLine("【歧义消除】:{0}",
                              string.Join("/ ", segmenter.Cut("结过婚的和尚未结过婚的")));

            // Sample labelled as exercising the user dictionary.
            Console.WriteLine("【用户字典】:{0}",
                              string.Join("/ ", segmenter.Cut("linezerodemo机器学习学习机器")));

            // Word-frequency statistics over a sample paragraph.
            const string paragraph = "此领域探讨如何处理及运用自然语言。自然语言生成系统把计算机数据转化为自然语言。自然语言理解系统把自然语言转化为计算机程序更易于处理的形式。";
            foreach (var entry in new Counter<string>(segmenter.Cut(paragraph)).MostCommon(5))
            {
                Console.WriteLine($"{entry.Key}: {entry.Value}");
            }

            Console.ReadKey();
        }
예제 #15
0
        /// <summary>
        /// Segments <paramref name="keyword"/> in search mode (HMM enabled) and returns the
        /// tokens joined by single spaces.
        /// </summary>
        /// <param name="keyword">Text to segment.</param>
        /// <returns>The space-separated tokens.</returns>
        public string GetSearchKeyWorlds(string keyword)
        {
            return string.Join(" ", new JiebaSegmenter().CutForSearch(keyword, hmm: true));
        }
예제 #16
0
        /// <summary>
        /// Segments the resource file named in <c>TextBox1</c>, counts the frequency of every
        /// distinct 2–4 character token, passes the frequent ones (at least 20 occurrences) as
        /// JSON to <c>GetJson</c> under the name from <c>TextBox2</c>, and writes the ranked
        /// list plus the elapsed time to the response.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        protected void Button1_Click(object sender, EventArgs e)
        {
            // NOTE(review): TextBox1.Text and TextBox2.Text come from the user and flow into
            // a file path / output name unchecked — consider restricting them to plain file names.
            string    text      = TextBox1.Text;
            var       segmenter = new JiebaSegmenter();
            string    aimFile   = string.Format(@"./Resources/{0}.txt", text);
            string    content   = GetContent(aimFile);
            Stopwatch sw        = new Stopwatch();

            sw.Start();
            var wordsforSearch = segmenter.CutForSearch(content);
            var seenKeys       = new HashSet<string>();
            var ranked         = new List<KeyValuePair<string, int>>(); // insertion order is kept for stable ranking
            var jsonEntries    = new List<string>();

            foreach (string item in wordsforSearch.Distinct())
            {
                if (item.Length < 2 || item.Length > 4)
                {
                    continue;
                }

                // De-duplicate on the trimmed form that is stored and emitted. Checking the raw
                // token but adding the trimmed one could throw on a duplicate dictionary key.
                string key = item.Trim();
                if (!seenKeys.Add(key))
                {
                    continue;
                }

                int f = GetFrequence(wordsforSearch, item);
                ranked.Add(new KeyValuePair<string, int>(key, f));

                // NOTE(review): 2406 is an unexplained magic frequency excluded from the JSON.
                if (f >= 20 && f != 2406)
                {
                    // Escape the word so quotes/backslashes cannot break the JSON document.
                    jsonEntries.Add("{\"name\":\"" + System.Web.HttpUtility.JavaScriptStringEncode(key)
                                    + "\",\"value\":" + f + "}");
                }
            }

            string jsonstr = "[" + string.Join(",", jsonEntries) + "]";
            string name    = TextBox2.Text;

            GetJson(name, jsonstr);

            // Rank directly with a stable sort instead of round-tripping through a Dictionary,
            // whose enumeration order is not guaranteed.
            var result = new System.Text.StringBuilder();
            foreach (var person in ranked.OrderByDescending(p => p.Value))
            {
                if (person.Value >= 20)
                {
                    // Tokens come from file content, so encode them before writing HTML.
                    result.Append("<br>")
                          .Append(System.Web.HttpUtility.HtmlEncode(person.Key))
                          .Append("-")
                          .Append(person.Value.ToString());
                }
            }

            Response.Write(result.ToString());
            sw.Stop();
            TimeSpan ts2 = sw.Elapsed;
            Response.Write(string.Format("</br>Stopwatch 总共花费{0}ms.", ts2.TotalMilliseconds.ToString()));
            if (!(content == ""))
            {
                // The name is user input: encode it before echoing it into the page.
                Response.Write(string.Format("</br>" + "结果已输出至{0}.json" + "</br>",
                                             System.Web.HttpUtility.HtmlEncode(name)));
            }
        }
예제 #17
0
        /// <summary>
        /// Segments the text of "./Resources/三国演义.txt" in search-engine mode and writes the
        /// tokens, joined by "/ ", to the response.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        protected void btnSearch_Click(object sender, EventArgs e)
        {
            var    segmenter      = new JiebaSegmenter();
            string aimFile        = @"./Resources/三国演义.txt";
            string content        = ReadData(aimFile);
            var    wordsforSearch = segmenter.CutForSearch(content);

            // The "{0}" placeholder was concatenated rather than formatted, so a literal
            // "{0}" was emitted in front of the tokens; format it properly instead.
            Response.Write(string.Format("</br>【搜索引擎模式】:{0}", string.Join("/ ", wordsforSearch)));
        }
예제 #18
0
파일: AnalyzerUtils.cs 프로젝트: dhtweb/dht
        /// <summary>
        /// Extension method: segments a string with jieba in search mode.
        /// </summary>
        /// <param name="str">Text to segment.</param>
        /// <returns>The tokens, or an empty array when the text is null, empty or whitespace.</returns>
        public static IEnumerable<string> CutForSearch(this string str)
        {
            return string.IsNullOrWhiteSpace(str)
                ? new string[0]
                : new JiebaSegmenter().CutForSearch(str);
        }
예제 #19
0
        /// <summary>
        /// Cleans <paramref name="content"/> with <c>RemoveBadSymbols</c>, segments it in
        /// search mode and returns the tokens, excluding empty strings and single spaces.
        /// </summary>
        /// <param name="content">Raw text to segment.</param>
        /// <returns>The remaining tokens in segmentation order.</returns>
        public static List<string> GetWords(string content)
        {
            var    segmenter = new JiebaSegmenter();
            string cleaned   = RemoveBadSymbols(content);

            return segmenter.CutForSearch(cleaned)
                            .Where(token => !token.Equals("") && !token.Equals(" "))
                            .ToList();
        }
예제 #20
0
        /// <summary>
        /// Extension method: segments <paramref name="key"/> in search mode using the music
        /// user dictionary, drops blank tokens, and removes the entries of the music stop
        /// dictionary via a set difference (which also collapses duplicate tokens).
        /// </summary>
        /// <param name="key">Text to segment.</param>
        /// <returns>The distinct non-stop tokens.</returns>
        public static string[] ToSeparateByJieba(this string key)
        {
            var segmenter = new JiebaSegmenter();
            segmenter.LoadUserDict(GlobalConstants.MusicDictionaryPath);

            var stopWords = LoadStopDict(GlobalConstants.MusicStopDictionaryPath);

            return segmenter.CutForSearch(key)
                            .Where(token => !string.IsNullOrWhiteSpace(token))
                            .Except(stopWords)
                            .ToArray();
        }
예제 #21
0
        /// <summary>
        /// Prints the full-mode, default-mode and search-mode segmentation of a fixed sample
        /// sentence, each joined by '/'.
        /// </summary>
        public void Get_Word6_Test()
        {
            const string sample = "今年累计发货多少车";
            var segmenter = new JiebaSegmenter();

            // Full mode (cutAll: true).
            Console.WriteLine(string.Join('/', segmenter.Cut(sample, cutAll: true)));

            // Default mode.
            Console.WriteLine(string.Join('/', segmenter.Cut(sample)));

            // Search-engine mode.
            Console.WriteLine(string.Join('/', segmenter.CutForSearch(sample)));
        }
예제 #22
0
        /// <summary>
        /// Writes jieba segmentation results for several sample sentences to the response,
        /// one labelled line per mode.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        protected void Page_Load(object sender, EventArgs e)
        {
            var segmenter = new JiebaSegmenter();

            // Full mode (cutAll: true).
            Response.Write(string.Format("[全模式]:{0}</br>",
                                         string.Join("/", segmenter.Cut("我来自华东师范大学", cutAll: true))));

            // Accurate mode is the default; the HMM model is used as well.
            Response.Write(string.Format("[精确模式]:{0}</br>",
                                         string.Join("/", segmenter.Cut("我来自华东师范大学"))));

            Response.Write(string.Format("[新词识别]:{0}</br>",
                                         string.Join("/", segmenter.Cut("他来到了华东师范大学群贤堂"))));

            // Search-engine mode.
            Response.Write(string.Format("[搜索引擎模式]:{0}</br>",
                                         string.Join("/", segmenter.CutForSearch("李白硕士毕业于东方大学计算所,后在日本京都大学深造"))));

            Response.Write(string.Format("[歧义消除]:{0}</br>",
                                         string.Join("/", segmenter.Cut("结过婚的和尚结过婚的"))));
        }
        /// <summary>
        /// Writes jieba segmentation results for several sample sentences to the response,
        /// one labelled line per mode.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        protected void Page_Load(object sender, EventArgs e)
        {
            var segmenter = new JiebaSegmenter();

            // Full mode (cutAll: true).
            Response.Write(string.Format("【全模式】:{0}</br>",
                                         string.Join("/ ", segmenter.Cut("我来自华东师范大学", cutAll: true))));

            // Accurate mode is the default.
            Response.Write(string.Format("【精确模式】:{0}</br>",
                                         string.Join("/ ", segmenter.Cut("我来自华东师范大学"))));

            // Accurate mode by default, also using the HMM model.
            Response.Write(string.Format("【新词识别】:{0}</br>",
                                         string.Join("/ ", segmenter.Cut("他来到了华东师范大学群贤堂"))));

            // Search-engine mode.
            Response.Write(string.Format("【搜索引擎模式】:{0}</br>",
                                         string.Join("/ ", segmenter.CutForSearch("李白硕士毕业于东方大学计算所,后在日本京都大学深造"))));

            Response.Write(string.Format("【歧义消除】:{0}</br>",
                                         string.Join("/ ", segmenter.Cut("结过婚的和尚未结过婚的"))));
        }
예제 #24
0
        /// <summary>
        /// Splits a search keyword into candidate terms: the keyword itself, the regex
        /// matches and their capture groups, keywords returned by the remote
        /// "/api/customsearch/keywords" endpoint (only for keywords of six or more
        /// characters, best effort), and jieba search-mode tokens. Terms shorter than two
        /// characters or starting with punctuation are dropped; the rest are returned longest
        /// first. Results are cached per keyword for one hour.
        /// </summary>
        /// <param name="keyword">The raw search keyword.</param>
        /// <returns>Candidate terms ordered by descending length.</returns>
        public List<string> CutKeywords(string keyword)
        {
            if (_memoryCache.TryGetValue(keyword, out List<string> list))
            {
                return list;
            }

            var set = new HashSet<string>
            {
                keyword
            };
            var mc = Regex.Matches(keyword, @"(([A-Z]*[a-z]*)[\d]*)([\u4E00-\u9FA5]+)*((?!\p{P}).)*");

            foreach (Match m in mc)
            {
                set.Add(m.Value);
                foreach (Group g in m.Groups)
                {
                    set.Add(g.Value);
                }
            }

            if (keyword.Length >= 6)
            {
                try
                {
                    // Escape the keyword: characters such as '&', '#', '+' or '%' would otherwise
                    // corrupt the query string and send a different title to the service.
                    string url = "/api/customsearch/keywords?title=" + Uri.EscapeDataString(keyword);

                    // The method is synchronous, so the calls are blocked on; the response is
                    // disposed to release the connection promptly.
                    using (var res = HttpClient.GetAsync(url).Result)
                    {
                        if (res.StatusCode == HttpStatusCode.OK)
                        {
                            string json = res.Content.ReadAsStringAsync().Result;
                            BaiduAnalysisModel model = JsonConvert.DeserializeObject<BaiduAnalysisModel>(json);
                            model.Result.Res.KeywordList?.ForEach(s =>
                            {
                                // Skip null/empty entries: a null would make the length filter below throw.
                                if (!string.IsNullOrEmpty(s))
                                {
                                    set.Add(s);
                                }
                            });
                        }
                    }
                }
                catch
                {
                    // Deliberately ignored: the remote keywords are an optional enrichment.
                }
            }

            var segmenter = new JiebaSegmenter();

            foreach (string word in segmenter.CutForSearch(keyword))
            {
                set.Add(word);
            }

            // Drop terms that are too short or begin with a punctuation character.
            set.RemoveWhere(s => s.Length < 2 || Regex.IsMatch(s, @"^\p{P}.*"));
            list = set.OrderByDescending(s => s.Length).ToList();
            _memoryCache.Set(keyword, list, TimeSpan.FromHours(1));
            return list;
        }
예제 #25
0
        /// <summary>
        /// Loads a user dictionary from disk and from an embedded resource, then checks the
        /// segmentation of several sample sentences against expected token lists via
        /// <c>Compared</c>.
        /// </summary>
        public void TestCut()
        {
            var segmenter = new JiebaSegmenter();

            segmenter.LoadUserDict(@"D:\lucene\dict.txt");
            segmenter.LoadUserDictForEmbedded(Assembly.GetCallingAssembly(), "dict.txt");

            // Full mode (cutAll: true).
            Compared(segmenter.Cut("我来到北京清华大学", cutAll: true),
                     new List<string> { "我", "来到", "北京", "清华", "清华大学", "华大", "大学" });

            // Default mode.
            Compared(segmenter.Cut("我来到北京清华大学"),
                     new List<string> { "我", "来到", "北京", "清华大学" });

            // Accurate mode by default, also using the HMM model.
            Compared(segmenter.Cut("他来到了网易杭研大厦"),
                     new List<string> { "他", "来到", "了", "网易", "杭研", "大厦" });

            // Search-engine mode.
            Compared(segmenter.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造"),
                     new List<string>
                     {
                         "小明", "硕士", "毕业", "于", "中国", "科学", "学院", "科学院", "中国科学院", "计算", "计算所", ",", "后",
                         "在", "日本", "京都", "大学", "日本京都大学", "深造"
                     });

            Compared(segmenter.Cut("结过婚的和尚未结过婚的"),
                     new List<string> { "结过婚", "的", "和", "尚未", "结过婚", "的" });

            // Second and third arguments (cutAll, hmm) both false.
            Compared(segmenter.Cut("快奔三", false, false),
                     new List<string> { "快", "奔三" });
        }
예제 #26
0
        /// <summary>
        /// Prints jieba segmentation results (full, accurate, search-engine modes and one
        /// ambiguity sample) for fixed sample sentences to the console.
        /// </summary>
        public void CutDemo()
        {
            var segmenter = new JiebaSegmenter();

            // Full mode (cutAll: true).
            Console.WriteLine("【全模式】:{0}",
                              string.Join("/ ", segmenter.Cut("我来到北京清华大学", cutAll: true)));

            // Accurate mode is the default.
            Console.WriteLine("【精确模式】:{0}",
                              string.Join("/ ", segmenter.Cut("我来到北京清华大学")));

            // Accurate mode by default, also using the HMM model.
            Console.WriteLine("【新词识别】:{0}",
                              string.Join("/ ", segmenter.Cut("他来到了网易杭研大厦")));

            // Search-engine mode.
            Console.WriteLine("【搜索引擎模式】:{0}",
                              string.Join("/ ", segmenter.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造")));

            Console.WriteLine("【歧义消除】:{0}",
                              string.Join("/ ", segmenter.Cut("结过婚的和尚未结过婚的")));
        }
예제 #27
0
        /// <summary>
        /// Returns the segmented tokens of a string using the requested jieba mode.
        /// </summary>
        /// <param name="objStr">Text to segment.</param>
        /// <param name="type">
        /// Segmentation mode: Default (accurate, with HMM), CutAll (full mode) or
        /// CutForSearch (search-engine mode); any other value uses accurate mode without HMM.
        /// </param>
        /// <returns>The tokens produced by the selected mode.</returns>
        public static IEnumerable<string> GetSplitWords(string objStr, JiebaTypeEnum type = JiebaTypeEnum.Default)
        {
            var jieba = new JiebaSegmenter();

            if (type == JiebaTypeEnum.Default)
            {
                return jieba.Cut(objStr);                 // accurate mode, with HMM
            }
            if (type == JiebaTypeEnum.CutAll)
            {
                return jieba.Cut(objStr, cutAll: true);   // full mode
            }
            if (type == JiebaTypeEnum.CutForSearch)
            {
                return jieba.CutForSearch(objStr);        // search-engine mode
            }

            return jieba.Cut(objStr, false, false);       // accurate mode, without HMM
        }
예제 #28
0
파일: Keywords.cs 프로젝트: bsed/Buy
        /// <summary>
        /// Segments <paramref name="text"/> in search mode, counts each token longer than one
        /// character and adds those counts to <c>SearchCount</c> of the keywords already
        /// stored in the database. Unknown words are not inserted.
        /// </summary>
        /// <param name="text">The searched text.</param>
        public static void UpdateSearchCount(string text)
        {
            var segmenter = new JiebaSegmenter();
            var result    = segmenter.CutForSearch(text)
                            .GroupBy(s => s)
                            .Where(s => s.Key.Length > 1)
                            .Select(s => new { Key = s.Key, Count = s.Count() })
                            .ToList();

            using (ApplicationDbContext db = new ApplicationDbContext())
            {
                var temp = result.Select(s => s.Key).ToList();
                var keys = db.Keywords.Where(s => temp.Contains(s.Word)).ToList();
                foreach (var item in keys)
                {
                    // The database comparison may be looser than the ordinal comparison used
                    // here (e.g. a case-insensitive collation), so a returned row may have no
                    // exact in-memory match; treat that as zero instead of dereferencing null.
                    item.SearchCount += result.FirstOrDefault(s => s.Key == item.Word)?.Count ?? 0;
                }
                db.SaveChanges();
            }
        }
예제 #29
0
        /// <summary>
        /// Prints the segmentation of the sample word "多线程" in each jieba mode to the console.
        /// </summary>
        public void JiebaSegmenterTest()
        {
            const string message = "多线程";
            var segmenter = new JiebaSegmenter();

            // Full mode (cutAll: true).
            Console.WriteLine("【全模式】:{0}",
                              string.Join("/ ", segmenter.Cut(message, cutAll: true)));

            // Accurate mode is the default.
            Console.WriteLine("【精确模式】:{0}",
                              string.Join("/ ", segmenter.Cut(message)));

            // Accurate mode with the HMM model requested explicitly.
            Console.WriteLine("【新词识别】:{0}",
                              string.Join("/ ", segmenter.Cut(message, hmm: true)));

            // Search-engine mode.
            Console.WriteLine("【搜索引擎模式】:{0}",
                              string.Join("/ ", segmenter.CutForSearch(message)));

            Console.WriteLine("【歧义消除】:{0}",
                              string.Join("/ ", segmenter.Cut(message)));
        }
예제 #30
0
        /// <summary>
        /// Console demo: prints the segmentation of "WebApi 教程" / "webapi 教程" in each jieba
        /// mode and waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            var segmenter = new JiebaSegmenter();

            // Full mode (cutAll: true).
            Console.WriteLine("【全模式】:{0}",
                              string.Join("/ ", segmenter.Cut("WebApi 教程", cutAll: true)));

            // Accurate mode is the default.
            Console.WriteLine("【精确模式】:{0}",
                              string.Join("/ ", segmenter.Cut("WebApi 教程")));

            // Accurate mode by default, also using the HMM model.
            Console.WriteLine("【新词识别】:{0}",
                              string.Join("/ ", segmenter.Cut("WebApi 教程")));

            // Search-engine mode (lower-case input).
            Console.WriteLine("【搜索引擎模式】:{0}",
                              string.Join("/ ", segmenter.CutForSearch("webapi 教程")));

            Console.WriteLine("【歧义消除】:{0}",
                              string.Join("/ ", segmenter.Cut("webapi 教程")));

            Console.ReadKey();
        }
예제 #31
0
        /// <summary>
        /// Segmentation smoke test: runs jieba over fixed sample sentences in each mode and
        /// returns the formatted result of the last one (the ambiguity sample).
        /// </summary>
        /// <param name="keyword">Not used by the current implementation.</param>
        /// <returns>The formatted segmentation of the final sample sentence.</returns>
        public static string Token(string keyword)
        {
            var segmenter = new JiebaSegmenter();

            // NOTE(review): every assignment below overwrites the previous one, so only the
            // final result is returned — confirm whether the results were meant to be combined.

            // Full mode (cutAll: true).
            string ret = string.Format("【全模式】:{0}",
                                       string.Join("/ ", segmenter.Cut("我来到北京清华大学", cutAll: true)));

            // Accurate mode is the default.
            ret = string.Format("【精确模式】:{0}",
                                string.Join("/ ", segmenter.Cut("我来到北京清华大学")));

            // Accurate mode by default, also using the HMM model.
            ret = string.Format("【新词识别】:{0}",
                                string.Join("/ ", segmenter.Cut("他来到了网易杭研大厦")));

            // Search-engine mode.
            ret = string.Format("【搜索引擎模式】:{0}",
                                string.Join("/ ", segmenter.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造")));

            ret = string.Format("【歧义消除】:{0}",
                                string.Join("/ ", segmenter.Cut("结过婚的和尚未结过婚的")));

            return ret;
        }
예제 #32
0
        /// <summary>
        /// Segments the search text into keywords, selects every cached href (one per URL)
        /// whose keyword list shares at least one keyword with the query, and writes the
        /// matches as links to the response, or a "nothing found" message when there are none.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        protected void BtnSearch_OnClick(object sender, EventArgs e)
        {
            string strSearch = TxtKeyword.Text;
            var    segmenter = new JiebaSegmenter();

            // Segment the query too so that as many hrefs as possible match; a set makes
            // each membership test O(1) instead of re-scanning the token sequence.
            var searchKeywords = new HashSet<string>(segmenter.CutForSearch(strSearch));
            var searchResults  = new List<ExampleMyHref>();
            var seenUrls       = new HashSet<string>(); // used to de-duplicate by URL

            // Read the cache. It can be missing (e.g. the session expired), in which case
            // the search simply yields no results instead of throwing.
            var resultHrefs = Session[SESSION_NAME] as List<ExampleMyHref>;
            if (resultHrefs != null)
            {
                foreach (var href in resultHrefs)
                {
                    if (seenUrls.Contains(href.HrefSrc))
                    {
                        continue;
                    }

                    foreach (var hrefKeyword in href.KeywordList)
                    {
                        if (searchKeywords.Contains(hrefKeyword))
                        {
                            seenUrls.Add(href.HrefSrc);
                            searchResults.Add(href);
                            break;
                        }
                    }
                }
            }

            if (searchResults.IsNotEmpty())
            {
                foreach (var item in searchResults)
                {
                    // Titles and URLs are crawled content: encode them before emitting HTML.
                    // NOTE(review): assumes the stored values are raw text, not already HTML-encoded — confirm.
                    string resulttag = "<a href=\"" + System.Web.HttpUtility.HtmlAttributeEncode(item.HrefSrc) + "\">"
                                       + System.Web.HttpUtility.HtmlEncode(item.HrefTitle) + "</a><br />";
                    Response.Write(resulttag);
                }
            }
            else
            {
                Response.Write("没有找到相关内容<br />");
            }
        }