Example #1
0
        /// <summary>
        /// Demo entry point: segments a sample sentence with PanGu, prints the words
        /// separated by spaces, then prints the pinyin of that segmented text.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            var str = "共产党是世界上最坏的政党,压迫人民,繁重的苛捐杂税,高压统治,反对人权, 自由万岁!";

            Segment.Init();

            // Work on clones of the configured options/parameters so that switching
            // off stop-word filtering only affects this run.
            var matchOptions = PanGu.Setting.PanGuSettings.Config.MatchOptions.Clone();
            var matchParameters = PanGu.Setting.PanGuSettings.Config.Parameters.Clone();
            matchOptions.FilterStopWords = false;

            var joined = new StringBuilder();
            foreach (WordInfo token in new Segment().DoSegment(str, matchOptions, matchParameters))
            {
                // Null entries in the result are skipped.
                if (token != null)
                {
                    joined.Append(token.Word).Append(' ');
                }
            }

            // The segmented text replaces the original and feeds the pinyin lookup.
            str = joined.ToString();
            Console.WriteLine(str);

            var zhlist = PinyinCommon.GetZhDictsList();
            Console.WriteLine(PinyinCommon.GetPinyin(str, zhlist));

            // Keep the console window open until a key is pressed.
            Console.ReadKey();
        }
Example #2
0
        /// <summary>
        /// Segments <paramref name="str"/> with PanGu and returns the words that are
        /// longer than one character, joined by commas.
        /// </summary>
        /// <param name="str">Text to segment.</param>
        /// <returns>
        /// The comma-separated words. If segmentation throws, the exception message is
        /// returned instead, so callers cannot tell the two outcomes apart by type.
        /// </returns>
        public static string StrSplit(string str)
        {
            try
            {
                Segment segment = new Segment();
                ICollection<WordInfo> words = segment.DoSegment(str, _Options, _Parameters);

                StringBuilder wordsString = new StringBuilder();
                foreach (WordInfo wordInfo in words)
                {
                    // Skip null entries and single-character words.
                    if (wordInfo == null || wordInfo.Word.Length <= 1)
                    {
                        continue;
                    }
                    wordsString.Append(wordInfo.Word).Append(',');
                }

                // Remove exactly the one trailing separator; trimming every trailing
                // comma would also eat commas that belong to the last word.
                if (wordsString.Length > 0)
                {
                    wordsString.Length--;
                }

                return wordsString.ToString();
            }
            catch (Exception ex)
            {
                // Best-effort behaviour kept for existing callers: report the failure as text.
                return ex.Message;
            }
        }
Example #3
0
        /// <summary>
        /// Segments <paramref name="textSource"/> (no explicit match options are passed)
        /// and renders the result as one string in which every word is followed by '/'.
        /// </summary>
        /// <param name="textSource">Text to segment.</param>
        /// <returns>The words in result order, each terminated by '/'.</returns>
        private string DisplaySegment(string textSource)
        {
            StringBuilder rendered = new StringBuilder();

            foreach (WordInfo token in new Segment().DoSegment(textSource))
            {
                // Null entries in the result are ignored.
                if (token != null)
                {
                    rendered.Append(token.Word).Append('/');
                }
            }

            return rendered.ToString();
        }
Example #4
0
        /// <summary>
        /// Click handler: segments the text in <c>txtContent</c> and shows one line per
        /// word in <c>ltlResult</c>, formatted as "word,part-of-speech&lt;br /&gt;".
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        protected void btnSplit_Click(object sender, EventArgs e)
        {
            ltlResult.Text = "";
            var segment = new PanGu.Segment();
            var words   = segment.DoSegment(txtContent.Text);

            // Build the markup once instead of re-assigning the control text per word.
            var html = new System.Text.StringBuilder();
            foreach (var word in words)
            {
                if (word == null)
                {
                    continue;
                }

                // The word comes straight from user input, so encode it before it is
                // emitted as HTML; only the "<br />" separator is real markup.
                html.Append(System.Web.HttpUtility.HtmlEncode(word.ToString()))
                    .Append(',')
                    .Append(GetChsPos(word.Pos))
                    .Append("<br />");
            }

            ltlResult.Text = html.ToString();
        }
Example #5
0
 /// <summary>
 /// Segments <paramref name="content"/> and counts how often each word occurs.
 /// </summary>
 /// <param name="content">Text to segment.</param>
 /// <returns>A map from word to its number of occurrences in the segmentation result.</returns>
 static Dictionary<string, int> ContetnWordSegment(string content)
 {
     Dictionary<string, int> ret = new Dictionary<string, int>();
     Segment seg = new Segment();
     foreach (WordInfo w in seg.DoSegment(content))
     {
         // A null entry or null word cannot be used as a dictionary key; skip it.
         if (w == null || w.Word == null)
         {
             continue;
         }

         // Single lookup: 'seen' stays 0 when the word has not been counted yet.
         int seen;
         ret.TryGetValue(w.Word, out seen);
         ret[w.Word] = seen + 1;
     }
     return ret;
 }
Example #6
0
        /// <summary>
        /// Segments <paramref name="s"/> with PanGu and returns the words as a list.
        /// </summary>
        /// <param name="s">Text to segment.</param>
        /// <returns>The word text of every non-null entry, in result order.</returns>
        public static List<string> panguDivide(String s)
        {
            List<string> tokens = new List<string>();

            foreach (WordInfo item in new Segment().DoSegment(s))
            {
                if (item != null)
                {
                    tokens.Add(item.Word);
                }
            }

            return tokens;
        }
Example #7
0
        /// <summary>
        /// Segments <paramref name="text"/> and returns every word longer than one
        /// character paired with a weight of 1.
        /// </summary>
        /// <param name="text">Text to segment.</param>
        /// <returns>One (word, 1.0) tuple per qualifying entry; duplicates are not merged.</returns>
        public static List <Tuple <string, double> > GetTermFreq(this string text)
        {
            Segment.Init();
            var seg = new PanGu.Segment();
            var rst = seg.DoSegment(text, new PanGu.Match.MatchOptions
            {
                FilterNumeric  = true,
                FrequencyFirst = true,
                IgnoreCapital  = true,
            });

            // Guard against null entries / null words before reading Length, in line
            // with the null checks other callers of DoSegment perform.
            return rst.Where(t => t != null && t.Word != null && t.Word.Length > 1)
                      .Select(t => Tuple.Create(t.Word, 1d))
                      .ToList();
        }
Example #8
0
        /// <summary>
        /// Segments the channel name of <paramref name="video"/> and formats every word
        /// as "word^N.0 ", where N is 3 raised to the word's rank, truncated to an int.
        /// </summary>
        /// <remarks>
        /// Earlier, disabled revisions of this method also appended catalog tags, actors
        /// and area tags with fixed boosts of 3, 2 and 1; only the channel name is used now.
        /// </remarks>
        /// <param name="video">Video whose <c>ChannelName</c> is segmented.</param>
        /// <returns>The concatenated boosted terms, each followed by a space.</returns>
        private static string SegmentKeyWord(VideoNode video)
        {
            var query = new StringBuilder(50);

            foreach (var token in new Segment().DoSegment(video.ChannelName))
            {
                if (token != null)
                {
                    var boost = (int)Math.Pow(3, token.Rank);
                    query.AppendFormat("{0}^{1}.0 ", token.Word, boost);
                }
            }

            return query.ToString();
        }
        /// <summary>
        /// Reads all text from the reader, segments it with PanGu and stores the
        /// resulting words in <c>_WordList</c>.
        /// </summary>
        /// <param name="input">Reader supplying the text to tokenize.</param>
        public PanGuTokenizer(TextReader input)
            : base(input)
        {
            // Initialisation is serialised through the shared lock object.
            lock (_LockObj)
            {
                InitPanGuSegment();
            }

            _InputText = base.input.ReadToEnd();

            if (string.IsNullOrEmpty(_InputText))
            {
                // ReadToEnd produced nothing: retry with chunked reads and keep
                // whatever those return.
                char[] chunk = new char[1024];
                StringBuilder collected = new StringBuilder(chunk.Length);

                for (int got = base.input.Read(chunk, 0, chunk.Length);
                     got > 0;
                     got = input.Read(chunk, 0, chunk.Length))
                {
                    collected.Append(chunk, 0, got);
                }

                if (collected.Length > 0)
                {
                    _InputText = collected.ToString();
                }
            }

            if (!string.IsNullOrEmpty(_InputText))
            {
                global::PanGu.Segment segment = new Segment();
                ICollection<WordInfo> wordInfos = segment.DoSegment(_InputText);

                // Snapshot the result into an array.
                _WordList = new WordInfo[wordInfos.Count];
                wordInfos.CopyTo(_WordList, 0);
            }
            else
            {
                // No text at all: an empty word list.
                _WordList = new WordInfo[0];
            }
        }
Example #10
0
        /// <summary>
        /// Click handler demonstrating two PanGu features on the console: plain
        /// segmentation of a sample sentence, and keyword highlighting of a sample text.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void button2_Click(object sender, EventArgs e)
        {
            PanGu.Segment.Init();

            // Part 1: print every word of the segmented sample sentence.
            var sampleSegmenter = new PanGu.Segment();
            foreach (var token in sampleSegmenter.DoSegment("山东落花生花落东山,长春市长春花店"))
            {
                Console.WriteLine(token.Word);
            }

            // Part 2: print the best fragment for the keywords, with matches wrapped in red <font> tags.
            var redFont = new PanGu.HighLight.SimpleHTMLFormatter("<font color=\"red\">", "</font>");
            var highlighter = new PanGu.HighLight.Highlighter(redFont, new PanGu.Segment())
            {
                // Number of characters per summary fragment.
                FragmentSize = 100
            };

            string keywords = "信号/道路/开通";
            string content  = @"高德完胜百度。我专门花了几个星期,在我所在的城市测试两个地图,高德数据不准确在少数,而百度就是家常便饭了,表现为:
已经管制一年的道路(双向变单向),百度仍然提示双向皆可走。
已经封闭数年的道路,百度仍然说是通的。
新修道路,还没有开通,百度居然让走。
有时候规划路线时明明是正确的,但是导航过程中,就出乱子,信号没问题、路线不复杂,明明是要左转,百度却叫右转。";

            Console.WriteLine(highlighter.GetBestFragment(keywords, content));
        }
Example #11
0
        /// <summary>
        /// Registers the term and offset attributes, reads all text from the reader,
        /// segments it with PanGu and stores the resulting words in <c>words</c>.
        /// </summary>
        /// <param name="input">Reader supplying the text to tokenize.</param>
        public PanGuTokenizer(TextReader input)
            : base(input)
        {
            termAttribute = AddAttribute<ITermAttribute>();
            offsetAttribute = AddAttribute<IOffsetAttribute>();

            inputText = base.input.ReadToEnd();

            if (string.IsNullOrEmpty(inputText)) {
                // ReadToEnd produced nothing: retry with chunked reads.
                char[] chunk = new char[1024];
                StringBuilder pending = new StringBuilder(chunk.Length);

                for (int got = base.input.Read(chunk, 0, chunk.Length);
                     got > 0;
                     got = input.Read(chunk, 0, chunk.Length)) {
                    pending.Append(chunk, 0, got);
                }

                if (pending.Length > 0) {
                    inputText = pending.ToString();
                }
            }

            if (!string.IsNullOrEmpty(inputText)) {
                global::PanGu.Segment segment = new Segment();
                ICollection<WordInfo> segmented = segment.DoSegment(inputText);

                // Snapshot the result into an array.
                words = new WordInfo[segmented.Count];
                segmented.CopyTo(words, 0);
            }
            else {
                // No text at all: an empty word list.
                words = new WordInfo[0];
            }
        }
Example #12
0
 /// <summary>
 /// Smoke test that exercises the search engine initialisation, the spell helpers,
 /// PanGu segmentation, keyword segmentation and the PanGu tokenizer in sequence.
 /// It performs no assertions; results are only held in locals for inspection.
 /// </summary>
 public void SpellTest()
 {
     var engine = new LuceneEngine();
     engine.Init();

     var cnSegments = SpellUtils.GetCnSegment("NBA常规赛-快船vs凯尔特人");
     var spellSegments = SpellUtils.GetSpellSegment("战重警和");

     // Collect the words of a segmented sample title.
     var collected = new List<string>();
     foreach (WordInfo entry in new Segment().DoSegment("国际足球100509K联赛釜山-大田"))
     {
         if (entry != null)
         {
             collected.Add(entry.Word);
         }
     }

     string mixedKeyword = "san国yan义";
     string boostedQuery = Synacast.LuceneNetSearcher.Searcher.Searcher.SegmentKeyWord(mixedKeyword);

     // Pull tokens until the reported end offset reaches the end of the text.
     string text = "重庆";
     PanGuTokenizer tokenizer = new PanGuTokenizer(new StringReader(text));
     string rendered = string.Empty;
     for (int end = 0; end < text.Length; )
     {
         Lucene.Net.Analysis.Token current = tokenizer.Next();
         end = current.EndOffset();
         rendered += current.TermText() + "/ ";
     }
 }
Example #13
0
 /// <summary>
 /// Segments the query strings, looks up the matching documents and renders them
 /// as an HTML ordered list.
 /// </summary>
 /// <param name="query">Query strings to segment.</param>
 /// <returns>
 /// An <c>&lt;ol&gt;</c> element with one <c>&lt;li&gt;</c> per result (title link, abstract, url),
 /// or containing a "no match" message when there are no results.
 /// </returns>
 static string ProcessQuery(string[] query)
 {
     /* Segment every query string; each distinct word is kept once, in first-seen order. */
     List<string> query_words = new List<string>();
     HashSet<string> seen = new HashSet<string>();   // O(1) membership instead of List.Contains
     PanGu.Segment.Init();
     Segment seg = new Segment();
     foreach (string q in query)
     {
         foreach (WordInfo wi in seg.DoSegment(q))
         {
             if (wi == null)
             {
                 continue;
             }
             if (seen.Add(wi.Word))
             {
                 query_words.Add(wi.Word);
             }
         }
     }

     /* Fetch the matching doc ids, pass them through SortResult, then load the display data. */
     MongodbAccess mongo = new MongodbAccess();
     List<ObjectId> docIds = mongo.GetDocIDByQuery(query_words);
     docIds = SortResult(docIds, query_words);
     List<DocUrlAbstractResult> result = GetResult(docIds);

     StringBuilder strbuilder = new StringBuilder();
     foreach (DocUrlAbstractResult duar in result)
     {
         // NOTE(review): title/abst/url are inserted without HTML encoding. If the stored
         // values are not already sanitised (or the abstract is not meant to carry markup),
         // they should be encoded here — confirm against how the documents are stored.
         strbuilder.AppendFormat(
             "<li><div><span><a href='{2}' target='_blank' class='link'>{0}</a></span><br/><span class='abstract'>{1}</span><br/><span class='url'>{2}</span></div></li>",
             duar.title, duar.abst, duar.url);
     }
     if (strbuilder.Length == 0)
     {
         strbuilder.Append("No pages match the query.");
     }
     return "<ol>" + strbuilder.ToString() + "</ol>";
 }
Example #14
0
 /// <summary>
 /// Creates a tokenizer that keeps the supplied match options and parameters and
 /// creates its own <c>Segment</c> instance.
 /// </summary>
 /// <param name="options">Match options stored in <c>_Options</c>.</param>
 /// <param name="parameters">Match parameters stored in <c>_Parameters</c>.</param>
 public PanguTokenizer(PanGu.Match.MatchOptions options, PanGu.Match.MatchParameter parameters)
 {
     segment = new Segment();

     _Options = options;
     _Parameters = parameters;
 }
Example #15
0
        /// <summary>
        /// Derives keywords (or tags) from an article title by segmenting it with an
        /// explicit, fixed set of PanGu match options and parameters.
        /// </summary>
        /// <param name="title">Article title to segment.</param>
        /// <returns>The words produced by the segmenter for <paramref name="title"/>.</returns>
        private static ICollection<WordInfo> TitleToKeywordWordInfos(string title)
        {
            PanGu.Segment segmenter = new Segment();

            PanGu.Match.MatchOptions options = new PanGu.Match.MatchOptions
            {
                // Chinese personal-name recognition
                ChineseNameIdentify = false,
                // Frequency-first matching
                FrequencyFirst = false,
                // Multi-dimensional segmentation
                MultiDimensionality = false,
                // English multi-dimensional segmentation; this switch splits letters and digits apart
                EnglishMultiDimensionality = false,
                // Filter stop words
                FilterStopWords = true,
                // Ignore spaces, line breaks and tabs
                IgnoreSpace = true,
                // Force single-character segmentation
                ForceSingleWord = false,
                // Traditional Chinese switch
                TraditionalChineseEnabled = false,
                // Output both simplified and traditional forms
                OutputSimplifiedTraditional = false,
                // Unknown (out-of-dictionary) word recognition
                UnknownWordIdentify = false,
                // Filter English; only effective while stop-word filtering is on
                FilterEnglish = true,
                // Filter numbers; only effective while stop-word filtering is on
                FilterNumeric = true,
                // Ignore English letter case
                IgnoreCapital = false,
                // English segmentation
                EnglishSegment = false,
                // Synonym output (generally meant for segmenting search strings; not recommended when indexing)
                SynonymOutput = false,
                // Wildcard match output
                WildcardOutput = false,
                // Segment the results of wildcard matches
                WildcardSegment = false
            };

            PanGu.Match.MatchParameter parameters = new PanGu.Match.MatchParameter
            {
                // Rank of unknown words
                UnknowRank = 1,
                // Rank of the best match
                BestRank = 5,
                // Rank of the second-best match
                SecRank = 3,
                // Rank of the third-best match
                ThirdRank = 2,
                // Rank of single characters that are forced into the output
                SingleRank = 1,
                // Rank of numbers
                NumericRank = 1,
                // Rank of English words
                EnglishRank = 5,
                // Rank of lower-cased English words
                EnglishLowerRank = 3,
                // Rank of English word stems
                EnglishStemRank = 2,
                // Rank of symbols
                SymbolRank = 1,
                // When both scripts are forced into the output, the rank of the characters that are
                // not from the original text (e.g. the traditional form for simplified input, and vice versa)
                SimplifiedTraditionalRank = 1,
                // Rank of synonyms
                SynonymRank = 1,
                // Rank of wildcard match results
                WildcardRank = 1,
                // With English filtering on, English longer than this is filtered
                FilterEnglishLength = 0,
                // With numeric filtering on, numbers longer than this are filtered
                FilterNumericLength = 0,
                // File name of the assembly holding user-defined rules
                CustomRuleAssemblyFileName = string.Empty,
                // Full (namespace-qualified) class name of the user-defined rule
                CustomRuleFullClassName = string.Empty,
                // Redundancy
                Redundancy = 0
            };

            return segmenter.DoSegment(title, options, parameters);
        }
Example #16
0
        /// <summary>
        /// Segments <paramref name="str"/> with PanGu.
        /// </summary>
        /// <param name="str">Text to segment; may be null or empty.</param>
        /// <returns>
        /// The segmenter's result, or an empty <c>LinkedList</c> when
        /// <paramref name="str"/> is null or empty.
        /// </returns>
        public ICollection<WordInfo> SegmentToWordInfos(string str)
        {
            if (!string.IsNullOrEmpty(str)) {
                Segment segmenter = new Segment();
                return segmenter.DoSegment(str);
            }

            // Nothing to segment.
            return new LinkedList<WordInfo>();
        }
Example #17
0
        /// <summary>
        /// Segments every string in <paramref name="content"/> and returns all resulting
        /// words as one flat list, in input order.
        /// </summary>
        /// <param name="content">Strings to segment.</param>
        /// <returns>The word text of every non-null entry across all inputs.</returns>
        private List<string> GetSegmentWords(List<string> content)
        {
            Segment segmenter = new Segment();
            List<string> collected = new List<string>();

            foreach (string text in content)
            {
                foreach (WordInfo token in segmenter.DoSegment(text))
                {
                    if (token != null)
                    {
                        collected.Add(token.Word);
                    }
                }
            }

            return collected;
        }
 /// <summary>
 /// Creates a highlighter that keeps the given formatter and PanGu segment.
 /// </summary>
 /// <param name="formatter">Formatter stored in <c>_Formatter</c>.</param>
 /// <param name="segment">Segment stored in <c>_PanGuSegment</c>.</param>
 public Highlighter(Formatter formatter, PanGu.Segment segment)
 {
     _PanGuSegment = segment;
     _Formatter = formatter;
 }
Example #19
0
        /// <summary>
        /// Segments <paramref name="s"/> with PanGu and returns the words as one string
        /// in which every word is followed by a tab character.
        /// </summary>
        /// <param name="s">Text to segment.</param>
        /// <returns>The tab-terminated words, in result order.</returns>
        private static string segment(string s)
        {
            StringBuilder line = new StringBuilder();

            foreach (WordInfo token in new Segment().DoSegment(s))
            {
                // Null entries in the result are ignored.
                if (token != null)
                {
                    line.Append(token.Word).Append('\t');
                }
            }

            return line.ToString();
        }
Example #20
0
        /// <summary>
        /// Reads the whole input and pre-computes its segments. Depending on the result
        /// of <c>AnalyzInput()</c>, the text is either segmented with PanGu (and expanded
        /// through <c>SpellUtils.GetSpellSegment</c>) or handled by one of three
        /// marker-based branches selected by how the text ends.
        /// </summary>
        /// <param name="input">Reader supplying the text to tokenize.</param>
        public PanGuTokenizer(TextReader input)
            : base(input)
        {
            _inputText = base.input.ReadToEnd();

            // ReadToEnd produced nothing: retry with chunked reads and keep whatever those return.
            if (string.IsNullOrEmpty(_inputText))
            {
                char[] readBuf = new char[1024];
                int relCount = base.input.Read(readBuf, 0, readBuf.Length);
                StringBuilder inputStr = new StringBuilder(readBuf.Length);

                while (relCount > 0)
                {
                    inputStr.Append(readBuf, 0, relCount);

                    relCount = input.Read(readBuf, 0, readBuf.Length);
                }

                if (inputStr.Length > 0)
                {
                    _inputText = inputStr.ToString();
                }
            }

            _isFlag = AnalyzInput();
            if (!_isFlag)       // PanGu segmentation
            {
                global::PanGu.Segment segment = new Segment();
                var wordInfos = segment.DoSegment(_inputText);
                // Each word's spell segments (when any are returned) go into _segmentList;
                // the words themselves are kept in _wordList.
                foreach (var wi in wordInfos)
                {
                    var list = SpellUtils.GetSpellSegment(wi.Word);
                    if (list != null)
                        _segmentList.AddRange(list);
                }
                _wordList = new List<WordInfo>(wordInfos);
            }
            else
            {
                if (_inputText.EndsWith(_indexFlag))  // comma/space-delimited segmentation
                {
                    // Strip the marker, split on _splitFlag, and record every piece
                    // followed by its spell segments (when any are returned).
                    string[] sources = _inputText.Replace(_indexFlag, "").Split(_splitFlag, StringSplitOptions.RemoveEmptyEntries);
                    foreach (string source in sources)
                    {
                        _segmentList.Add(source);
                        var spells = SpellUtils.GetSpellSegment(source);
                        if (spells != null)
                            _segmentList.AddRange(spells);
                    }
                    
                }
                else if (_inputText.EndsWith(_indexCnName))  // pinyin initial-letter segmentation
                {
                    string source = _inputText.Replace(_indexCnName, "");
                    _wordList = SpellUtils.GetCnSegment(source);
                }
                else  // flag segmentation
                {
                    // Remove the '|' separators, then for every start index i emit the single
                    // character followed by '|'-joined chains that grow one character at a time.
                    string source = _inputText.Replace("|", "");
                    for (int i = 0; i < source.Length; i++)
                    {
                        string first = source[i].ToString();
                        _segmentList.Add(first);
                        int f = 1;
                        for (int j = i + 1; j < source.Length; j++)
                        {
                            string s = string.Format("{0}|{1}", first, source[j]);
                            _segmentList.Add(s);
                            first = s;
                            // On reaching the last character, restart the chain from source[i];
                            // j is rewound so that, after the loop's j++, the next pass begins
                            // one position further right than the previous pass did.
                            if (j == source.Length - 1)
                            {
                                f++;
                                j = i + f - 1;
                                first = source[i].ToString();
                            }
                        }
                    }
                }
            }
        }
Example #21
0
 /// <summary>
 /// Creates a highlighter that keeps the given formatter and PanGu segment.
 /// </summary>
 /// <param name="formatter">Formatter stored in <c>_Formatter</c>.</param>
 /// <param name="segment">Segment stored in <c>_PanGuSegment</c>.</param>
 public Highlighter(Formatter formatter, PanGu.Segment segment)
 {
     _PanGuSegment = segment;
     _Formatter = formatter;
 }
Example #22
0
 /// <summary>
 /// Processes client input: segments <paramref name="keyword"/> using <c>_option</c>
 /// and formats every word as "word^N.0 ", where N is 3 raised to the word's rank,
 /// truncated to an int.
 /// </summary>
 /// <param name="keyword">Keyword text entered by the client.</param>
 /// <returns>The concatenated boosted terms, each followed by a space.</returns>
 public static string SegmentKeyWord(string keyword)
 {
     var query = new StringBuilder(20);

     foreach (var token in new Segment().DoSegment(keyword, _option))
     {
         if (token != null)
         {
             var boost = (int)Math.Pow(3, token.Rank);
             query.AppendFormat("{0}^{1}.0 ", token.Word, boost);
         }
     }

     return query.ToString();
 }
Example #23
0
 /// <summary>
 /// Shortens <paramref name="paragraph"/> to at most <c>truncateLimit</c> characters,
 /// preferring to cut at the end of the last segmented word that still fits.
 /// </summary>
 /// <param name="paragraph">Text to truncate; may be null or empty.</param>
 /// <returns>
 /// The paragraph unchanged when it already fits; otherwise the text up to the last
 /// fitting word boundary, or a hard cut at <c>truncateLimit</c> when that boundary
 /// would leave less than half of the limit.
 /// </returns>
 private string TruncateParagraph(string paragraph) {
     // Already short enough (this also covers null/empty): no segmentation needed.
     if (string.IsNullOrEmpty(paragraph) || paragraph.Length <= truncateLimit) {
         return paragraph;
     }

     Segment segment = new Segment();
     ICollection<WordInfo> words = segment.DoSegment(paragraph, segmentMatchOptions);
     int stop = 0;
     // The result must not be shorter than half of the requested length.
     int lowerBound = truncateLimit / 2;
     foreach (WordInfo word in words) {
         if (word == null) {
             continue;
         }
         int wordEnd = word.Position + word.Word.Length;
         if (wordEnd > truncateLimit) {
             break;
         }
         stop = wordEnd;
     }

     // Reached both when a word crosses the limit and when the words run out first
     // (e.g. the tail of the text was filtered from the result): the paragraph is
     // longer than the limit here, so it is always cut.
     return stop < lowerBound ?
         paragraph.Substring(0, truncateLimit) :
         paragraph.Substring(0, stop);
 }