Beispiel #1
0
        /// <summary>
        /// Times repeated segmentation of Resources\围城.txt in accurate and full mode
        /// and prints the average whole milliseconds per run for each mode.
        /// </summary>
        public void TestCutLargeFile()
        {
            // Number of timed repetitions per mode.
            const int rounds = 20;

            var novel = File.ReadAllText(@"Resources\围城.txt");
            var segmenter = new JiebaSegmenter();

            // One untimed call before the measurements start.
            segmenter.Cut("热身");

            Console.WriteLine("Start to cut");

            // Accurate mode
            var timer = Stopwatch.StartNew();
            for (var round = 0; round < rounds; round++)
            {
                segmenter.Cut(novel);
            }
            timer.Stop();
            Console.WriteLine("Accurate mode: {0} ms", timer.ElapsedMilliseconds / rounds);

            // Full mode
            timer.Restart();
            for (var round = 0; round < rounds; round++)
            {
                segmenter.Cut(novel, true);
            }
            timer.Stop();
            Console.WriteLine("Full mode: {0} ms", timer.ElapsedMilliseconds / rounds);
        }
Beispiel #2
0
        /// <summary>
        /// Prints the segmentation of several sample sentences using Cut (full and default mode)
        /// and CutForSearch, then re-registers a word and segments an address.
        /// </summary>
        public void CutDemo()
        {
            const string separator = "/ ";
            var jieba = new JiebaSegmenter();

            // Full mode (cutAll switched on).
            var words = jieba.Cut("我来到北京清华大学", cutAll: true);
            Console.WriteLine("【全模式】:{0}", string.Join(separator, words));

            // Accurate mode is the default.
            words = jieba.Cut("我来到北京清华大学");
            Console.WriteLine("【精确模式】:{0}", string.Join(separator, words));

            // Default accurate mode, which also uses the HMM model.
            words = jieba.Cut("他来到了网易杭研大厦");
            Console.WriteLine("【新词识别】:{0}", string.Join(separator, words));

            // Search-engine mode.
            words = jieba.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造");
            Console.WriteLine("【搜索引擎模式】:{0}", string.Join(separator, words));

            // Ambiguity samples.
            words = jieba.Cut("结过婚的和尚未结过婚的");
            Console.WriteLine("【歧义消除】:{0}", string.Join(separator, words));

            words = jieba.Cut("北京大学生喝进口红酒");
            Console.WriteLine("【歧义消除】:{0}", string.Join(separator, words));

            words = jieba.Cut("在北京大学生活区喝进口红酒");
            Console.WriteLine("【歧义消除】:{0}", string.Join(separator, words));

            words = jieba.Cut("腾讯视频致力于打造中国最大的在线视频媒体平台,以丰富的内容、极致的观看体验");
            Console.WriteLine("【精确模式】:{0}", string.Join(separator, words));

            // Remove and re-add a dictionary word before segmenting the address.
            jieba.DeleteWord("湖南");
            jieba.AddWord("湖南");
            //jieba.AddWord("长沙市");
            words = jieba.Cut("湖南长沙市天心区");
            Console.WriteLine("【精确模式】:{0}", string.Join(separator, words));
        }
Beispiel #3
0
        /// <summary>
        /// Prints the segmentation of five sample sentences: full mode, default (accurate) mode,
        /// new-word recognition, search-engine mode and an ambiguity sample.
        /// </summary>
        public void CutDemo()
        {
            var jieba = new JiebaSegmenter();

            // Full mode.
            Console.WriteLine("【全模式】:{0}",
                              string.Join("/ ", jieba.Cut("我来到北京清华大学", cutAll: true)));

            // Accurate mode is the default.
            Console.WriteLine("【精确模式】:{0}",
                              string.Join("/ ", jieba.Cut("我来到北京清华大学")));

            // Default accurate mode, which also uses the HMM model.
            Console.WriteLine("【新词识别】:{0}",
                              string.Join("/ ", jieba.Cut("他来到了网易杭研大厦")));

            // Search-engine mode.
            Console.WriteLine("【搜索引擎模式】:{0}",
                              string.Join("/ ", jieba.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造")));

            // Ambiguity sample.
            Console.WriteLine("【歧义消除】:{0}",
                              string.Join("/ ", jieba.Cut("结过婚的和尚未结过婚的")));
        }
        /// <summary>
        /// Times repeated segmentation of Resources\围城.txt in accurate and full mode.
        /// For each mode it prints the average time per run in milliseconds and the
        /// throughput as file length (from <see cref="FileInfo.Length"/>) per second.
        /// </summary>
        public void TestCutLargeFile()
        {
            var fileName = @"Resources\围城.txt";
            var weiCheng = File.ReadAllText(fileName);
            var fileSize = (new FileInfo(fileName)).Length;

            var seg = new JiebaSegmenter();

            // One untimed call before the measurements start.
            seg.Cut("热身一下");

            Console.WriteLine("Start to cut");
            const int n = 20;
            var stopWatch = new Stopwatch();

            // Accurate mode
            stopWatch.Start();

            for (var i = 0; i < n; i++)
            {
                seg.Cut(weiCheng);
            }

            stopWatch.Stop();

            // Average per run in milliseconds, so the value matches the "ms" label
            // (the previous code printed seconds under that label).
            var averageMs = (double)stopWatch.ElapsedMilliseconds / n;
            Console.WriteLine("Accurate mode: {0} ms, average: {1} / second",
                                averageMs, fileSize / (averageMs / 1000));

            // Full mode
            stopWatch.Reset();
            stopWatch.Start();

            for (var i = 0; i < n; i++)
            {
                seg.Cut(weiCheng, true);
            }

            stopWatch.Stop();

            averageMs = (double)stopWatch.ElapsedMilliseconds / n;
            Console.WriteLine("Full mode: {0} ms, average: {1} / second",
                                averageMs, fileSize / (averageMs / 1000));
        }
Beispiel #5
0
        /// <summary>
        /// Console demo: loads userdict.txt, prints the segmentation of several sample sentences,
        /// then prints the entries returned by <c>MostCommon(5)</c> for a sample paragraph.
        /// </summary>
        static void Main(string[] args)
        {
            const string separator = "/ ";

            var jieba = new JiebaSegmenter();
            jieba.LoadUserDict("userdict.txt");

            // Full mode.
            var words = jieba.Cut("我来到北京清华大学", cutAll: true);
            Console.WriteLine("【全模式】:{0}", string.Join(separator, words));

            // Accurate mode is the default.
            words = jieba.Cut("我来到北京清华大学");
            Console.WriteLine("【精确模式】:{0}", string.Join(separator, words));

            // Default accurate mode, which also uses the HMM model.
            words = jieba.Cut("他来到了网易杭研大厦");
            Console.WriteLine("【新词识别】:{0}", string.Join(separator, words));

            // Search-engine mode.
            words = jieba.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造");
            Console.WriteLine("【搜索引擎模式】:{0}", string.Join(separator, words));

            words = jieba.Cut("结过婚的和尚未结过婚的");
            Console.WriteLine("【歧义消除】:{0}", string.Join(separator, words));

            // Sample for the user dictionary loaded above.
            words = jieba.Cut("linezerodemo机器学习学习机器");
            Console.WriteLine("【用户字典】:{0}", string.Join(separator, words));

            // Word-frequency statistics.
            var paragraph = "此领域探讨如何处理及运用自然语言。自然语言生成系统把计算机数据转化为自然语言。自然语言理解系统把自然语言转化为计算机程序更易于处理的形式。";
            var counter = new Counter<string>(jieba.Cut(paragraph));
            foreach (var entry in counter.MostCommon(5))
            {
                Console.WriteLine($"{entry.Key}: {entry.Value}");
            }

            Console.ReadKey();
        }
Beispiel #6
0
        /// <summary>
        /// Registers words containing dots and spaces, then prints the default and
        /// full-mode segmentation of three sample sentences.
        /// </summary>
        public void TestCutAllSpecialWords()
        {
            // TODO: Enable this test case after confirming with jieba py.
            var seg = new JiebaSegmenter();

            foreach (var customWord in new[] { ".NET", "U.S.A.", "Steve Jobs", "Mac OS X" })
            {
                seg.AddWord(customWord);
            }

            var samples = new[]
            {
                ".NET平台是微软推出的, U.S.A.是美国的简写",
                "Steve Jobs重新定义了手机",
                "我们所熟悉的一个版本是Mac OS X 10.11 EI Capitan,在2015年推出。"
            };

            // For each sample: default mode first, then full mode.
            foreach (var sample in samples)
            {
                Console.WriteLine("Cut: {0}", string.Join("/ ", seg.Cut(sample)));
                Console.WriteLine("Cut All: {0}", string.Join("/ ", seg.Cut(sample, cutAll: true)));
            }
        }
Beispiel #7
0
        /// <summary>
        /// Verifies that user-added words containing a middle dot or a hyphen
        /// come back from Cut as single segments.
        /// </summary>
        public void TestChineseDot()
        {
            const string nameWithDot = "艾尔肯·吐尼亚孜";
            const string termWithHyphen = "短P-R间期";

            var jieba = new JiebaSegmenter();
            jieba.AddWord(nameWithDot);
            jieba.AddWord(termWithHyphen);

            var words = jieba.Cut("艾尔肯·吐尼亚孜新疆阿克苏人。 在短P-R间期。").ToList();

            Assert.That(words, Contains.Item(nameWithDot));
            Assert.That(words, Contains.Item(termWithHyphen));
        }
Beispiel #8
0
        /// <summary>
        /// Segments <paramref name="html"/> and builds an HTML fragment with one
        /// "key:count" line for each entry returned by <c>MostCommon(100)</c>.
        /// </summary>
        /// <param name="html">Text to segment and count.</param>
        /// <returns>A heading followed by the counted entries, each ending in a line break tag.</returns>
        static string GetMostCommon(string html)
        {
            var segmenter = new JiebaSegmenter();
            var counter = new Counter<string>(segmenter.Cut(html));

            var markup = "<h2>词频统计</h2>";
            foreach (var entry in counter.MostCommon(100))
            {
                markup += $"{entry.Key}:{entry.Value} <br/>";
            }

            return markup;
        }
Beispiel #9
0
        /// <summary>
        /// Fetches the hard-coded page with <c>WebGetter</c>, passes the response to
        /// <c>WebDecoder.htmlDecode</c> with the configured rule, and echoes every returned
        /// string (prefixed with its index) to the console and to test.txt.
        /// Each string is also run through the segmenter.
        /// </summary>
        static void Main(string[] args)
        {
            WebGetter wg = new WebGetter(@"https://tw.news.yahoo.com/most-popular");

            wg.setMethod("GET");
            string html = wg.webReader();

            Console.WriteLine(html);
            WebDecoder wd = new WebDecoder();

            wd.setRule(@"//ul[@id='stream-container-scroll-template']/li/div/div/div/div/div/img");
            List<string> list = wd.htmlDecode(html);

            var segmenter = new JiebaSegmenter();
            segmenter.AddWord("韓國瑜", 3, "nr");
            segmenter.AddWord("台灣人", 3, "n");

            // The using block closes test.txt even when the loop throws.
            using (StreamWriter sw = new StreamWriter("test.txt"))
            {
                int i = 0;
                foreach (var tmp in list)
                {
                    Console.WriteLine(i + ": " + tmp);
                    sw.WriteLine(i + ": " + tmp);

                    // The segmentation result is not consumed yet; the call is kept so
                    // the segmenter is still exercised on every extracted string.
                    segmenter.Cut(tmp);
                    i++;
                }
            }

            Console.ReadKey(true);
        }
Beispiel #10
0
        /// <summary>
        /// Registers two words containing dots and prints, one segment per line,
        /// the default segmentation of two sample sentences.
        /// </summary>
        public void TestSpecialWords()
        {
            var jieba = new JiebaSegmenter();
            jieba.AddWord(".NET");
            jieba.AddWord("U.S.A.");

            var samples = new[]
            {
                ".NET平台是微软推出的, U.S.A.是美国的简写",
                "Steve Jobs重新定义了手机"
            };

            foreach (var sample in samples)
            {
                foreach (var word in jieba.Cut(sample))
                {
                    Console.WriteLine(word);
                }
            }
        }
Beispiel #11
0
        /// <summary>
        /// Segments <paramref name="sentence"/> and wraps every segment in a <c>Token</c>,
        /// then lets <c>CorrectTokenPosition</c> process the tokens against the sentence.
        /// </summary>
        /// <param name="sentence">Text to tokenize.</param>
        /// <param name="options">Not read by this method.</param>
        /// <returns>The list of tokens, one per segment.</returns>
        public List<Token> Tokenize(string sentence, TokenizationOptions options)
        {
            Init();

            var result = (from word in segmenter.Cut(sentence)
                          select new Token { Text = word }).ToList();

            CorrectTokenPosition(sentence, result);
            return result;
        }
Beispiel #12
0
        /// <summary>
        /// Builds a 20000-line text, segments it in one call and then line by line through PLINQ,
        /// printing the segment count and elapsed time of each approach.
        /// </summary>
        public void TestCut()
        {
            const string sampleLine = "PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍";

            // The first measurement also covers building the text and creating the segmenter.
            var timer = Stopwatch.StartNew();

            var builder = new StringBuilder();
            for (var count = 0; count < 20000; count++)
            {
                builder.AppendLine(sampleLine);
            }

            var text = builder.ToString();
            var lines = Regex.Split(text, "\r?\n");

            var jieba = new JiebaSegmenter();
            jieba.Cut("热身");

            var whole = jieba.Cut(text);
            Console.WriteLine(whole.Count());

            timer.Stop();
            Console.WriteLine(timer.Elapsed);

            timer.Restart();

            // Same segmentation, one line at a time, in parallel with source order preserved.
            var perLine = lines.AsParallel()
                               .AsOrdered()
                               .Select(line => jieba.Cut(line))
                               .SelectMany(words => words);
            Console.WriteLine(perLine.Count());

            timer.Stop();
            Console.WriteLine(timer.Elapsed);
        }
Beispiel #13
0
        /// <summary>
        /// Reads ./in/src.txt and accumulates its ASCII letters, digits and characters in the
        /// range U+4E00..U+9FA5. Whenever any other character is met and at least 80 characters
        /// have been accumulated, the buffer is segmented and written to ./out/无题{n}.txt
        /// with three segments per line, and the buffer is reset.
        /// Processing stops once the file counter reaches 1000.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            // Directory.CreateDirectory("./in/");
            // Directory.CreateDirectory("./out/");
            var segmenter = new JiebaSegmenter();

            string txt;
            using (var reader = new StreamReader(File.OpenRead("./in/src.txt")))
            {
                txt = reader.ReadToEnd();
            }

            int cnt = 1;
            var sb  = new StringBuilder();

            foreach (var chr in txt)
            {
                bool keep = (chr >= 'A' && chr <= 'Z')
                            || (chr >= 'a' && chr <= 'z')
                            || (chr >= '0' && chr <= '9')
                            || (chr >= 0x4e00 && chr <= 0x9fa5);

                if (keep)
                {
                    sb.Append(chr);
                }
                else if (sb.Length >= 80)
                {
                    // append: false truncates an existing file; File.OpenWrite left the
                    // tail of a longer previous file in place.
                    using (var writer = new StreamWriter($"./out/无题{cnt}.txt", false))
                    {
                        writer.WriteLine("  无题  ");
                        writer.WriteLine("作者:佚名");
                        writer.WriteLine();

                        // Materialize once so the three-word chunks can be taken by index.
                        var words = segmenter.Cut(sb.ToString()).ToList();
                        for (int start = 0; start < words.Count; start += 3)
                        {
                            int length = Math.Min(3, words.Count - start);
                            writer.WriteLine(string.Join("", words.GetRange(start, length)));
                        }
                    }

                    cnt++;
                    sb.Clear();
                }

                if (cnt >= 1000)
                {
                    break;
                }
            }
        }
Beispiel #14
0
        /// <summary>
        /// Registers the word "學系" and prints the space-separated segmentation
        /// of five department names, then waits for a key press.
        /// </summary>
        static void Main(string[] args)
        {
            var jieba = new JiebaSegmenter();
            jieba.AddWord("學系");

            string[] departments =
            {
                "資訊工程學系", "資訊管理學系", "應用化學學系", "土木工程學系", "外國語言學系"
            };

            for (var index = 0; index < departments.Length; index++)
            {
                var words = jieba.Cut(departments[index]);
                Console.WriteLine(string.Join(" ", words));
            }

            Console.ReadKey();
        }
Beispiel #15
0
        /// <summary>
        /// Registers the code-pages encoding provider, prints the segmentation of five
        /// sample sentences and waits for a key press.
        /// </summary>
        static void Main(string[] args)
        {
            Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
            //TestDemo test = new TestDemo();
            //test.CutDemo();
            //test.TokenizeDemo();
            var jieba = new JiebaSegmenter();

            // Full mode.
            Console.WriteLine("【全模式】:{0}",
                              string.Join("/ ", jieba.Cut("我来到北京清华大学", cutAll: true)));

            // Accurate mode is the default.
            Console.WriteLine("【精确模式】:{0}",
                              string.Join("/ ", jieba.Cut("我来到北京清华大学")));

            // Default accurate mode, which also uses the HMM model.
            Console.WriteLine("【新词识别】:{0}",
                              string.Join("/ ", jieba.Cut("他来到了网易杭研大厦")));

            // Search-engine mode.
            Console.WriteLine("【搜索引擎模式】:{0}",
                              string.Join("/ ", jieba.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造")));

            Console.WriteLine("【歧义消除】:{0}",
                              string.Join("/ ", jieba.Cut("结过婚的和尚未结过婚的")));

            Console.ReadKey();
        }
Beispiel #16
0
    // ====程序代码====

    /// <summary>
    /// Loads the document with the given id from Data/Input{id}.txt.
    /// Line 1 selects one of the three business-status flags, line 2 sets the GPS flag,
    /// line 3 must be empty, and all remaining lines are concatenated (without line breaks)
    /// into the remark text, which is then segmented; segments whose first character is
    /// punctuation are dropped.
    /// </summary>
    /// <param name="id">Document id, also used to build the input file name.</param>
    /// <exception cref="FormatException">
    /// Line 1 or 2 is not one of the expected sentences, or line 3 is missing or not empty.
    /// </exception>
    public Document(int id)
    {
        ID = id;
        using (var file = new StreamReader($"Data/Input{id}.txt"))
        {
            // Line 1: business status. A missing line (null) falls through to default.
            switch (file.ReadLine())
            {
            case "大众点评上能否查询到该企业?能查到正在营业":
                能查到正在营业 = true; break;

            case "大众点评上能否查询到该企业?能查到曾经营业":
                能查到曾经营业 = true; break;

            case "大众点评上能否查询到该企业?无信息":
                无营业信息 = true; break;

            default:
                throw new FormatException($"请检查 Input{id}.txt 的格式");
            }

            // Line 2: GPS flag.
            switch (file.ReadLine())
            {
            case "队员是否有成功的GPS定位?有":
                GPS定位 = true; break;

            case "队员是否有成功的GPS定位?没有":
                GPS定位 = false; break;

            default:
                throw new FormatException($"请检查 Input{id}.txt 的格式");
            }

            // Line 3: must exist and be empty. ReadLine returns null at end of file, which
            // used to surface as a NullReferenceException instead of a format error.
            var separator = file.ReadLine();
            if (separator == null || separator.Length != 0)
            {
                throw new FormatException($"请检查 Input{id}.txt 的格式");
            }

            // Remaining lines: the remark text, joined without separators.
            var comment = new StringBuilder();
            string line;
            while ((line = file.ReadLine()) != null)
            {
                comment.Append(line);
            }

            备注文本 = comment.ToString();
            备注词汇 = Segmenter.Cut(备注文本)
                   .Where(k => !char.IsPunctuation(k[0])).ToArray();
        }
    }
Beispiel #17
0
        /// <summary>
        /// Segments <paramref name="text"/> with a new JiebaSegmenter and returns the segments
        /// wrapped in a result object. Logs the current method when stack tracing is enabled.
        /// </summary>
        /// <param name="text">Text to segment; null or empty yields the Empty result.</param>
        /// <param name="encoding">Not read by this method.</param>
        public WordAnalyResult<List<string>> WordAnalyJieba(string text, Encoding encoding = null)
        {
            if (ConfigBase.Default.IsTraceStack)
            {
                var currentMethod = new StackTrace().GetFrame(0).GetMethod();
                LogService.AnyLog("Stack", currentMethod.ToString());
            }

            if (string.IsNullOrEmpty(text))
            {
                return WordAnalyResult<List<string>>.Empty;
            }

            var jieba = new JiebaSegmenter();
            var result = new WordAnalyResult<List<string>>();
            return result.SetData(jieba.Cut(text).ToList());
            //var posSeg = new PosSegmenter(segmenter);
            //return posSeg.Cut(text, hmm).Select(token => string.Format("{0}/{1}", token.Word, token.Flag));
        }
        /// <summary>
        /// Verifies that a user-added word containing a hyphen ("cet-4") comes back from Cut
        /// as a single segment, and prints the segments for inspection.
        /// </summary>
        public void TestHyphen()
        {
            var seg = new JiebaSegmenter();

            seg.AddWord("cet-4");

            var s        = "你一定也考过cet-4了。";
            var segments = seg.Cut(s).ToList();

            Assert.That(segments, Contains.Item("cet-4"));

            // Print the joined segments; passing the list itself to WriteLine only
            // printed the List type name.
            Console.WriteLine(string.Join("/ ", segments));
            foreach (var sm in segments)
            {
                Console.WriteLine(sm);
            }
        }
Beispiel #19
0
        /// <summary>
        /// Segments <paramref name="src"/> with the shared segmenter.
        /// </summary>
        /// <param name="src">Text to segment.</param>
        /// <returns>The segments as an array.</returns>
        public string[] Cut(string src)
        {
            return segmenter.Cut(src).ToArray();
        }
        /// <summary>
        /// Segments <c>doc</c> and returns the segments that are not in <c>stopwords</c>.
        /// </summary>
        /// <returns>The remaining segments, in segmentation order.</returns>
        public List<string> JiebaCut()
        {
            var segmenter = new JiebaSegmenter();
            var kept = new List<string>();

            foreach (var word in segmenter.Cut(doc))
            {
                // Stop words take no part in the later computation.
                if (!stopwords.Contains(word))
                {
                    kept.Add(word);
                }
            }

            return kept;
        }
        /// <summary>
        /// Segments <c>doc</c> and returns the segments that are not in <c>stopwords</c>.
        /// </summary>
        /// <returns>The remaining segments, in segmentation order.</returns>
        public List<string> JiebaCut()
        {
            var segmenter = new JiebaSegmenter();
            var words = segmenter.Cut(doc);
            var kept = new List<string>();

            foreach (var word in words)
            {
                if (stopwords.Contains(word))
                {
                    // Skip stop words.
                    continue;
                }

                kept.Add(word);
            }

            return kept;
        }
Beispiel #22
0
    /// <summary>
    /// Generates the segmentation file for <paramref name="path"/>: reads the file as UTF-8,
    /// segments it in full mode with the user dictionary Files/dict.txt loaded, and writes
    /// the space-separated segments to "{path}.jieba".
    /// </summary>
    /// <param name="path">Path of the file to segment.</param>
    /// <returns>Path of the generated segmentation file.</returns>
    public string Build(string path)
    {
        var content = System.IO.File.ReadAllText(path, System.Text.Encoding.UTF8);

        var jieba = new JiebaSegmenter();
        jieba.LoadUserDict("Files/dict.txt");

        // Full mode (cutAll).
        var joined = string.Join(" ", jieba.Cut(content, cutAll: true));

        var outputPath = $"{path}.jieba";
        System.IO.File.WriteAllText(outputPath, joined, System.Text.Encoding.UTF8);

        return outputPath;
    }
Beispiel #23
0
        /// <summary>
        /// Segments <paramref name="text"/> and pairs every non-blank segment with the start and
        /// end offsets found for it in the text by an ordinal search.
        /// </summary>
        /// <param name="segmenter">Segmenter to use.</param>
        /// <param name="text">Text to segment.</param>
        /// <param name="cutAll">Passed through to <c>Cut</c>.</param>
        /// <returns>One token per non-blank segment, in segmentation order.</returns>
        public static IEnumerable<Token> CutToToken(this JiebaSegmenter segmenter, string text, bool cutAll = true)
        {
            var segments = segmenter.Cut(text, cutAll)
                                    .Where(s => !string.IsNullOrWhiteSpace(s))
                                    .ToArray();

            // Start offset of the most recent occurrence located for each distinct segment.
            var lastStartBySegment = new Dictionary<string, int>();
            var tokens = new Token[segments.Length];

            // Start offset of the previously located segment.
            var cursor = 0;

            for (var index = 0; index < segments.Length; index++)
            {
                var segment = segments[index];

                // A repeated segment is searched for after its previous occurrence;
                // a new one is searched for from the previous segment's start.
                int previousStart;
                var searchFrom = lastStartBySegment.TryGetValue(segment, out previousStart)
                    ? previousStart + 1
                    : cursor;

                // NOTE(review): a -1 result from IndexOf is not handled here.
                cursor = text.IndexOf(segment, searchFrom, StringComparison.Ordinal);
                tokens[index] = new Token(segment, cursor, cursor + segment.Length);
                lastStartBySegment[segment] = cursor;
            }

            return tokens;
        }
Beispiel #24
0
        /// <summary>
        /// Segments <paramref name="input"/> and returns the segments that pass the stop-word filter.
        /// </summary>
        /// <param name="input">Text to segment.</param>
        /// <returns>The remaining segments, in segmentation order.</returns>
        public IEnumerable<string> Analytical(string input)
        {
            var jieba = new JiebaSegmenter();
            var stopWords = GetStopWords();

            // A segment is dropped when any stop-word entry Contains it.
            // NOTE(review): if the entries are strings this is a substring match, not equality — confirm intent.
            return jieba.Cut(input)
                        .Where(word => !stopWords.Any(entry => entry.Contains(word)))
                        .ToList();
        }
Beispiel #25
0
        /// <summary>
        /// Chinese word segmentation. On first use the shared segmenter is created and every
        /// entry of TechnologyStack, ProgramLanguage and Job is registered as a word.
        /// </summary>
        /// <param name="content">Text to segment.</param>
        /// <returns>The segments as a list.</returns>
        public static List<string> ChineseSegmenter(string content)
        {
            if (segmenter == null)
            {
                segmenter = new JiebaSegmenter();

                // Collect all custom vocabulary first, then register it word by word.
                var customWords = new List<string>();
                customWords.AddRange(TechnologyStack);
                customWords.AddRange(ProgramLanguage);
                customWords.AddRange(Job);

                foreach (var customWord in customWords)
                {
                    segmenter.AddWord(customWord);
                }
            }

            return segmenter.Cut(content).ToList();
        }
Beispiel #26
0
        /// <summary>
        /// Builds a full-text search query from <paramref name="keyword"/>.
        /// A keyword containing '&amp;' or '|' is split on that character and delegated to the
        /// AND / OR builders. Otherwise the keyword is converted with <c>HtmlToTextHelper</c>,
        /// segmented, and the upper-cased segments longer than one character are joined with " &amp; ".
        /// </summary>
        /// <param name="keyword">Search text entered by the user.</param>
        /// <returns>
        /// The parsed query, or null when the keyword is null/blank or when building the query fails.
        /// </returns>
        public NpgsqlTsQuery GetSerachNpgsqlTsQuery(string keyword)
        {
            // Check first: the Contains calls below threw NullReferenceException for a null keyword.
            if (string.IsNullOrWhiteSpace(keyword))
            {
                return null;
            }

            if (keyword.Contains('&'))
            {
                string[] keys = keyword.Split('&');
                return GetSerachNpgsqlTsQuery_And(keys);
            }
            if (keyword.Contains("|"))
            {
                string[] keys = keyword.Split('|');
                return GetSerachNpgsqlTsQuery_Or(keys);
            }

            try
            {
                var segmenter = new JiebaSegmenter();
                HtmlToTextHelper htmlToTextHelper = new HtmlToTextHelper();

                string noHtmlConent = htmlToTextHelper.Convert(keyword);

                // Keep only segments longer than one character, upper-cased.
                var cutList = new List<string>();
                foreach (var item in segmenter.Cut(noHtmlConent, hmm: true))
                {
                    if (item.Length > 1)
                    {
                        cutList.Add(item.ToUpper());
                    }
                }

                return NpgsqlTsQuery.Parse(string.Join(" & ", cutList));
            }
            catch (Exception)
            {
                // Deliberate best effort: any failure while building the query yields null.
                return null;
            }
        }
        /// <summary>
        /// Gets <paramref name="num"/> core sentences: extracts the keywords of the text,
        /// splits the text into sentences, segments every sentence and hands both to
        /// <c>GetSentenceList</c>.
        /// </summary>
        /// <param name="text">Text to analyse.</param>
        /// <param name="num">Number of core sentences; also passed to <c>ExtractTags</c>.</param>
        /// <param name="type">
        /// Keyword extractor: 1 uses <c>TfidfExtractor</c>, 2 uses <c>TextRankExtractor</c>;
        /// any other value leaves the keyword list empty.
        /// </param>
        public void GetList(string text, int num, int type)
        {
            keywordList.Clear();

            // Build the core keyword list.
            if (type == 1)
            {
                var tfidf = new TfidfExtractor();
                keywordList = tfidf.ExtractTags(text, num).ToList();
            }
            else if (type == 2)
            {
                var textRank = new TextRankExtractor();
                keywordList = textRank.ExtractTags(text, num).ToList();
            }

            AllsentenceList.Clear();
            keySentenceList.Clear();

            // Split the text into sentences; line breaks count as sentence ends.
            text = text.Replace(Environment.NewLine, " 。");
            //text = text.Replace(" ", "");
            AllsentenceList = text.Split('。', '?')
                                  .Where(part => !string.IsNullOrEmpty(part) && part != "undefined")
                                  .Select(part => part.Trim())
                                  .ToList();

            // Segment every sentence, remembering its position in the list.
            var segmented = new List<Sentence>();
            for (int index = 0; index < AllsentenceList.Count; index++)
            {
                AllsentenceList[index] += "。";
                var words = segmenter.Cut(AllsentenceList[index]);
                segmented.Add(new Sentence
                {
                    Sen   = string.Join(" ", words),
                    Index = index
                });
            }

            GetSentenceList(keywordList, segmented);
        }
Beispiel #28
0
        /// <summary>
        /// Prints the segmentation of several sample sentences using Cut (full and default mode)
        /// and CutForSearch, then runs the other demo methods of this class.
        /// </summary>
        public void CutDemo()
        {
            const string separator = "/ ";
            var jieba = new JiebaSegmenter();

            // Full mode (cutAll switched on).
            var words = jieba.Cut("我来到北京清华大学", cutAll: true);
            Console.WriteLine("【全模式】:{0}", string.Join(separator, words));

            // Accurate mode is the default.
            words = jieba.Cut("我来到北京清华大学");
            Console.WriteLine("【精确模式】:{0}", string.Join(separator, words));

            // Default accurate mode, which also uses the HMM model.
            words = jieba.Cut("他来到了网易杭研大厦");
            Console.WriteLine("【新词识别】:{0}", string.Join(separator, words));

            // Search-engine mode.
            words = jieba.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造");
            Console.WriteLine("【搜索引擎模式】:{0}", string.Join(separator, words));

            // Ambiguity samples.
            words = jieba.Cut("结过婚的和尚未结过婚的");
            Console.WriteLine("【歧义消除】:{0}", string.Join(separator, words));

            words = jieba.Cut("北京大学生喝进口红酒");
            Console.WriteLine("【歧义消除】:{0}", string.Join(separator, words));

            words = jieba.Cut("在北京大学生活区喝进口红酒");
            Console.WriteLine("【歧义消除】:{0}", string.Join(separator, words));

            words = jieba.Cut("腾讯视频致力于打造中国最大的在线视频媒体平台,以丰富的内容、极致的观看体验");
            Console.WriteLine("【精确模式】:{0}", string.Join(separator, words));

            // Remove and re-add a dictionary word before segmenting the address.
            jieba.DeleteWord("湖南");
            jieba.AddWord("湖南");
            //jieba.AddWord("长沙市");
            words = jieba.Cut("湖南长沙市天心区");
            Console.WriteLine("【精确模式】:{0}", string.Join(separator, words));

            // Remaining demos.
            TokenizeDemo();
            TokenizeSearchDemo();
            PosCutDemo();
            ExtractTagsDemo();
            ExtractTagsDemo2();
            TestWordFreq();
        }
Beispiel #29
0
        /// <summary>
        /// For every file in the hard-coded input directory, takes the text following
        /// "abstract:", segments it in full mode and appends every segment that is not a
        /// stop word to words.txt, one segment per line.
        /// Files without an "abstract:" section contribute the placeholder text "abstract".
        /// </summary>
        public static void method2()
        {
            // Length of the "abstract:" prefix that is stripped from the match.
            const int prefixLength = 9;

            string path = @"D:\c#\stopwords-master\baidu_stopwords.txt"; // stop-word list
            string str  = File.ReadAllText(path);

            // Strip the '\r' left behind by splitting CRLF files on '\n'; otherwise no
            // stop word (except possibly the last) could ever match a segment.
            var stop_words = new System.Collections.Generic.HashSet<string>();
            foreach (var entry in str.Split('\n'))
            {
                stop_words.Add(entry.TrimEnd('\r'));
            }

            path = @"D:\c#\图云词频计算\files";
            DirectoryInfo root = new DirectoryInfo(path);

            // Loop-invariant helpers are created once instead of once per file.
            Regex regex     = new Regex(@"abstract:[\S\s]+");
            var   segmenter = new JiebaSegmenter();

            foreach (FileInfo f in root.GetFiles())
            {
                var   text  = File.ReadAllText(f.FullName);
                Match match = regex.Match(text);

                // Match.Groups always contains group 0, so Success is the real test; on a
                // failed match group 0 is empty and Substring(9) used to throw.
                if (match.Success)
                {
                    text = match.Value.Substring(prefixLength);
                }
                else
                {
                    text = "abstract"; // avoid an empty string
                }
                if (text == "")
                {
                    text = "abstract";
                }

                // Full mode (cutAll).
                var segments = segmenter.Cut(text, cutAll: true);

                // Append to the end of the file instead of overwriting it; the using block
                // closes the writer even if a write fails.
                using (var file = new System.IO.StreamWriter(@"D:\c#\图云词频计算\words.txt", true))
                {
                    foreach (var temp in segments)
                    {
                        if (!stop_words.Contains(temp))
                        {
                            file.WriteLine(temp);
                        }
                    }
                }
            }
        }
Beispiel #30
0
        /// <summary>
        /// Segments every line of the file named by the first argument and prints the
        /// results, one input line per output line, with segments separated by "/ ".
        /// </summary>
        static void Main(string[] args)
        {
            if (args.Length < 1)
            {
                Console.WriteLine("No file specified");
                return;
            }

            var inputPath = Path.GetFullPath(args[0]);
            var lines = File.ReadAllLines(inputPath);

            var jieba = new JiebaSegmenter();

            // One joined output string per input line, at the same index.
            var segmented = new string[lines.Length];
            for (var index = 0; index < lines.Length; index++)
            {
                segmented[index] = string.Join("/ ", jieba.Cut(lines[index]));
            }

            Console.WriteLine(string.Join(Environment.NewLine, segmented));
        }
Beispiel #31
0
        /// <summary>
        /// Segments <paramref name="key"/> and returns the segments longer than one character.
        /// </summary>
        /// <param name="key">Text to segment.</param>
        /// <returns>The remaining segments; empty when nothing qualifies.</returns>
        protected static List<string> CutKeyWord(string key)
        {
            var kept = new List<string>();
            var words = new JiebaSegmenter().Cut(key);

            if (words == null)
            {
                return kept;
            }

            foreach (var word in words)
            {
                if (!string.IsNullOrEmpty(word) && word.Length > 1)
                {
                    kept.Add(word);
                }
            }

            return kept;
        }
Beispiel #32
0
        /// <summary>
        /// Segments every line of the file named by the first argument and prints the
        /// results, one input line per output line, with segments separated by "/ ".
        /// </summary>
        static void Main(string[] args)
        {
            if (args.Length >= 1)
            {
                var sourceLines = File.ReadAllLines(Path.GetFullPath(args[0]));
                var jieba = new JiebaSegmenter();

                // Convert each input line to its joined segmentation, preserving order.
                var outputLines = Array.ConvertAll(
                    sourceLines,
                    sourceLine => string.Join("/ ", jieba.Cut(sourceLine)));

                Console.WriteLine(string.Join(Environment.NewLine, outputLines));
            }
            else
            {
                Console.WriteLine("No file specified");
            }
        }
        /// <summary>
        /// Segments the text of <c>tb1</c> and writes it to <c>tb2</c>; in every segment longer
        /// than one character for which <c>NeedChaos()</c> returns true, two randomly chosen
        /// positions are swapped. Shows an error box when the input is empty.
        /// <c>tb1</c> is disabled while the handler runs and always re-enabled afterwards.
        /// </summary>
        private void GoButtonClick(object sender, RoutedEventArgs e)
        {
            tb1.IsEnabled = false;
            try
            {
                string srcText = tb1.Text;

                if (string.IsNullOrEmpty(srcText))
                {
                    MessageBox.Show("内容为空", "Error", MessageBoxButton.OK, MessageBoxImage.Error);
                    return;
                }

                JiebaSegmenter segmenter = new JiebaSegmenter();

                // One generator for the whole run; a new Random per segment can repeat the
                // same sequence when instances are created in quick succession.
                Random rd = new Random();
                var outText = new System.Text.StringBuilder();

                foreach (string s in segmenter.Cut(srcText))
                {
                    if (NeedChaos() && s.Length > 1)
                    {
                        // Pick two distinct positions (guaranteed to exist since Length > 1).
                        int n1 = rd.Next(0, s.Length);
                        int n2 = rd.Next(0, s.Length);
                        while (n2 == n1)
                        {
                            n2 = rd.Next(0, s.Length);
                        }

                        char[] arr = s.ToCharArray();
                        arr[n1] = s[n2];
                        arr[n2] = s[n1];
                        outText.Append(arr);
                    }
                    else
                    {
                        outText.Append(s);
                    }
                }

                tb2.Text = outText.ToString();
            }
            finally
            {
                // Previously the input box stayed disabled after the empty-input branch
                // or after an exception.
                tb1.IsEnabled = true;
            }
        }
Beispiel #34
0
    /// <summary>
    /// For every child paragraph of <paramref name="root"/>, segments the normalized text and,
    /// at each position where a segment equals <paramref name="KeyWord"/>, writes up to five
    /// preceding and five following segments to <c>Program.Training</c>.
    /// When a preceding segment is a full-width colon, the segments in the window before it
    /// are concatenated and printed to the console.
    /// </summary>
    /// <param name="root">Root node whose children are the paragraphs to scan.</param>
    /// <param name="KeyWord">Keyword to locate; it is also added to the segmenter's dictionary.</param>
    public static void AnlayzeEntitySurroundWords(HTMLEngine.MyRootHtmlNode root, string KeyWord)
    {
        // Number of segments reported on each side of a keyword hit.
        const int window = 5;

        Program.Training.WriteLine("关键字:[" + KeyWord + "]");
        JiebaSegmenter segmenter = new JiebaSegmenter();

        // Presumably registered so the keyword comes back as a single segment — confirm.
        segmenter.AddWord(KeyWord);
        foreach (var paragrah in root.Children)
        {
            // Accurate mode is the default.
            var segments = segmenter.Cut(paragrah.FullText.NormalizeKey()).ToList();

            // Locate every occurrence of the keyword.
            for (int i = 0; i < segments.Count; i++)
            {
                if (!segments[i].Equals(KeyWord))
                {
                    continue;
                }

                var startInx = Math.Max(0, i - window);

                // Exclusive upper bound. The following segments start at i + 1, so the bound
                // is i + 1 + window; the previous i + 5 reported only four of them.
                var EndInx = Math.Min(i + 1 + window, segments.Count);

                for (int s = startInx; s < i; s++)
                {
                    Program.Training.WriteLine("前导关键字:[" + segments[s] + "]");
                    if (segments[s] == ":")
                    {
                        // Everything in the window before the colon.
                        var leading = string.Concat(segments.GetRange(startInx, s - startInx));
                        Console.WriteLine("冒号前导词:" + leading);
                    }
                }

                Program.Training.WriteLine("关键字:[" + KeyWord + "]");

                for (int s = i + 1; s < EndInx; s++)
                {
                    Program.Training.WriteLine("后续关键字:[" + segments[s] + "]");
                }
            }
        }
    }
Beispiel #35
0
        /// <summary>
        /// Segments <paramref name="input"/> and passes the selected words, each mapped to
        /// an initial value of 0, to <c>FenCi.GetRepetitiveWordCount</c>, whose result is
        /// returned unchanged.
        /// </summary>
        /// <param name="input">Text to segment.</param>
        /// <param name="mode">Not read by this method.</param>
        /// <param name="checkRepetitiveWord">Not read by this method.</param>
        /// <returns>
        /// The dictionary produced by <c>FenCi.GetRepetitiveWordCount</c>
        /// (presumably word -> occurrence count; confirm against that helper).
        /// </returns>
        public static Dictionary <string, int> GetResult(string input, string mode = "", bool checkRepetitiveWord = false)
        {
            var candidates = new Dictionary<string, int>();

            foreach (var token in new JiebaSegmenter().Cut(input))
            {
                // A word is kept when it is a HanZi word of at least two characters,
                // or when it is an English word of any length.
                bool isMultiCharHanZi = (2 <= token.Length) && StringChecker.IsHanZi(token);
                if (isMultiCharHanZi || StringChecker.IsEnglish(token))
                {
                    candidates[token] = 0;
                }
            }

            return FenCi.GetRepetitiveWordCount(input, candidates);
        }
        /// <summary>
        /// Rough throughput benchmark: cuts the concatenated test sentences
        /// <c>n</c> times in accurate mode and <c>n</c> times in full mode, and prints the
        /// average time per run together with the resulting throughput.
        /// </summary>
        public void TestCutManySentences()
        {
            var text = GetTestSentences().Join(string.Empty);

            // NOTE(review): hard-coded size used for the throughput figure; presumably the
            // size of the test data - confirm it still matches GetTestSentences().
            var fileSize = 1532 * 100;

            var seg = new JiebaSegmenter();

            // Warm-up call before the timed section.
            seg.Cut("热身一下");

            Console.WriteLine("Start to cut");
            const int n = 20;
            var stopWatch = new Stopwatch();

            // Accurate mode
            stopWatch.Start();

            for (var i = 0; i < n; i++)
            {
                seg.Cut(text);
            }

            stopWatch.Stop();

            // The message labels the first value "ms", so report milliseconds per run;
            // the throughput is still computed per second.
            var msPerRun = (double)stopWatch.ElapsedMilliseconds / n;
            Console.WriteLine("Accurate mode: {0} ms, average: {1} / second",
                                msPerRun, fileSize / (msPerRun / 1000));

            // Full mode
            stopWatch.Reset();
            stopWatch.Start();

            for (var i = 0; i < n; i++)
            {
                seg.Cut(text, true);
            }

            stopWatch.Stop();

            msPerRun = (double)stopWatch.ElapsedMilliseconds / n;
            Console.WriteLine("Full mode: {0} ms, average: {1} / second",
                                msPerRun, fileSize / (msPerRun / 1000));
        }
Beispiel #37
0
        /// <summary>
        /// Times two ways of cutting the same 20000-line text: one call over the whole
        /// text, then one call per line through an ordered parallel query. Prints the
        /// segment count and elapsed time of each.
        /// </summary>
        public void TestCut()
        {
            // The first measurement also covers building the text and the warm-up call.
            var timer = Stopwatch.StartNew();

            var builder = new StringBuilder();
            for (var remaining = 20000; remaining > 0; remaining--)
            {
                builder.AppendLine("PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍");
            }

            var wholeText = builder.ToString();
            var splitLines = Regex.Split(wholeText, "\r?\n");

            var segmenter = new JiebaSegmenter();
            segmenter.Cut("热身");

            // Pass 1: the whole text in a single call.
            var wholeTextSegments = segmenter.Cut(wholeText);
            Console.WriteLine(wholeTextSegments.Count());

            timer.Stop();
            Console.WriteLine(timer.Elapsed);

            timer.Restart();

            // Pass 2: line by line, in parallel, keeping the original line order.
            var perLineSegments = splitLines
                .AsParallel()
                .AsOrdered()
                .Select(line => segmenter.Cut(line))
                .SelectMany(s => s);
            Console.WriteLine(perLineSegments.Count());

            timer.Stop();
            Console.WriteLine(timer.Elapsed);
        }
Beispiel #38
0
        /// <summary>
        /// Prints the segments of sentences containing special words (".NET", "U.S.A.")
        /// added via <c>AddWord</c>, then loads a user dictionary file and prints the
        /// segments of two further sentences.
        /// </summary>
        public void TestCutSpecialWords()
        {
            var segmenter = new JiebaSegmenter();
            segmenter.AddWord(".NET");
            segmenter.AddWord("U.S.A.");

            // Cuts one sentence and writes each segment on its own line.
            Action<string> cutAndPrint = sentence =>
            {
                foreach (var piece in segmenter.Cut(sentence))
                {
                    Console.WriteLine(piece);
                }
            };

            cutAndPrint(".NET平台是微软推出的, U.S.A.是美国的简写");

            // The remaining sentences are cut after the user dictionary has been loaded.
            segmenter.LoadUserDict(@"Resources\user_dict.txt");
            cutAndPrint("Steve Jobs重新定义了手机");
            cutAndPrint("我们所熟悉的一个版本是Mac OS X 10.11 EI Capitan,在2015年推出。");
        }
Beispiel #39
0
        /// <summary>
        /// Adds ".NET" and "U.S.A." to the segmenter and prints, one per line, the
        /// segments of two sample sentences.
        /// </summary>
        public void TestSpecialWords()
        {
            var segmenter = new JiebaSegmenter();
            segmenter.AddWord(".NET");
            segmenter.AddWord("U.S.A.");

            var sentences = new[]
            {
                ".NET平台是微软推出的, U.S.A.是美国的简写",
                "Steve Jobs重新定义了手机"
            };

            foreach (var sentence in sentences)
            {
                foreach (var piece in segmenter.Cut(sentence))
                {
                    Console.WriteLine(piece);
                }
            }
        }
Beispiel #40
0
        /// <summary>
        /// Adds several special words to the segmenter and, for each sample sentence,
        /// prints the default cut followed by the cut-all result.
        /// </summary>
        public void TestCutAllSpecialWords()
        {
            // TODO: Enable this test case after confirming with jieba py.
            var segmenter = new JiebaSegmenter();
            foreach (var word in new[] { ".NET", "U.S.A.", "Steve Jobs", "Mac OS X" })
            {
                segmenter.AddWord(word);
            }

            var sentences = new[]
            {
                ".NET平台是微软推出的, U.S.A.是美国的简写",
                "Steve Jobs重新定义了手机",
                "我们所熟悉的一个版本是Mac OS X 10.11 EI Capitan,在2015年推出。"
            };

            foreach (var sentence in sentences)
            {
                // Default mode first, then full mode, for the same sentence.
                var defaultCut = segmenter.Cut(sentence);
                Console.WriteLine("Cut: {0}", string.Join("/ ", defaultCut));

                var fullCut = segmenter.Cut(sentence, cutAll: true);
                Console.WriteLine("Cut All: {0}", string.Join("/ ", fullCut));
            }
        }
Beispiel #41
0
 /// <summary>
 /// Asserts that each sample English word is returned by <c>Cut</c> as exactly one
 /// segment equal to the input.
 /// </summary>
 public void TestEnglishWordsCut()
 {
     var segmenter = new JiebaSegmenter();
     var samples = new[] { "HighestDegree", "HelloWorld", "HelloWorldle", "HelloWorldlee" };

     foreach (var word in samples)
     {
         // Expected result: a single-element sequence holding the word itself.
         CollectionAssert.AreEqual(new[] { word }, segmenter.Cut(word));
     }
 }
Beispiel #42
0
        /// <summary>
        /// Checks that "机器" and "学习" are separate segments before "机器学习" is
        /// added, and that afterwards the combined word appears and "机器" no longer does.
        /// </summary>
        public void TestAddWord()
        {
            var segmenter = new JiebaSegmenter();
            var sentence = "小明最近在学习机器学习和自然语言处理";

            // Before adding the word: the two halves are separate segments.
            var before = segmenter.Cut(sentence);
            Assert.That(before, Contains.Item("机器"));
            Assert.That(before, Contains.Item("学习"));

            // After adding the word: it is emitted as one segment.
            segmenter.AddWord("机器学习");
            var after = segmenter.Cut(sentence);
            Assert.That(after, Contains.Item("机器学习"));
            Assert.That(after, Is.Not.Contains("机器"));
        }
Beispiel #43
0
 /// <summary>
 /// Cuts <paramref name="s"/> with the given segmenter and prints the segments on one
 /// line, separated by "/ ".
 /// </summary>
 private static void TestCutThenPrint(JiebaSegmenter segmenter, string s)
 {
     var pieces = segmenter.Cut(s);
     var joined = string.Join("/ ", pieces);
     Console.WriteLine(joined);
 }