示例#1
0
        /// <summary>
        /// Segments <paramref name="str"/> with PanGu, with synonym output enabled, and
        /// returns the string form of every produced token concatenated in order.
        /// </summary>
        /// <param name="str">Text to segment.</param>
        /// <returns>The tokens' <c>ToString()</c> values joined with no separator.</returns>
        public string PanGuFenCiTYC(string str)
        {
            // Initialise PanGu before creating a segmenter.
            PanGu.Segment.Init();
            Segment seg = new Segment();

            // Segmentation options for this call.
            PanGu.Match.MatchOptions m = new PanGu.Match.MatchOptions();
            m.ChineseNameIdentify = true;   // Chinese personal-name recognition
            m.SynonymOutput       = true;   // also output synonyms
            m.MultiDimensionality = true;   // multi-dimensional segmentation
            m.FilterStopWords     = true;   // filter stop words
            m.IgnoreSpace         = false;  // do not ignore spaces, carriage returns, tabs

            ICollection <WordInfo> words = seg.DoSegment(str, m);

            // Fully qualified so no extra using directive is needed.
            System.Text.StringBuilder strtext = new System.Text.StringBuilder();
            foreach (WordInfo item in words)
            {
                strtext.Append(item.ToString());
            }

            // The previous version returned the untouched input here, which discarded
            // the segmentation result built above.
            return strtext.ToString();
        }
示例#2
0
        /// <summary>
        /// Click handler for the "save config" button: binds the working option and
        /// parameter fields to the global PanGu configuration, applies the current
        /// settings via <c>UpdateSettings()</c>, and saves the configuration to disk.
        /// </summary>
        private void buttonSaveConfig_Click(object sender, EventArgs e)
        {
            const string settingsFileName = "PanGu.xml";

            // The live configuration objects are assigned here (no Clone()).
            // NOTE(review): presumably so that UpdateSettings() edits the very objects
            // that Save() persists - confirm against UpdateSettings().
            _Options = PanGu.Setting.PanGuSettings.Config.MatchOptions;
            _Parameters = PanGu.Setting.PanGuSettings.Config.Parameters;

            UpdateSettings();

            PanGu.Setting.PanGuSettings.Save(settingsFileName);
        }
        /// <summary>
        /// Command-line entry point. With at least two arguments
        /// (<c>&lt;input&gt; &lt;output&gt; [-pos]</c>) every line of the input file is
        /// segmented and written to the output file as one line of space-terminated
        /// tokens; with <c>-pos</c> each token is written as <c>word/(position,rank)</c>.
        /// With fewer than two arguments the usage text is printed and the demo form
        /// is shown instead.
        /// </summary>
        /// <param name="args">Command-line arguments: input path, output path, optional <c>-pos</c>.</param>
        static void Main(string[] args)
        {
            Console.WriteLine("PanGu Word Segment");

            if (args.Length < 2)
            {
                Console.WriteLine("Usage: wordsegment <input> <output> [-pos]");

                // A form shown with ShowDialog() is not disposed automatically on close.
                using (FormDemo demo = new FormDemo())
                {
                    demo.ShowDialog();
                }

                return;
            }

            string input  = args[0];
            string output = args[1];
            bool showPosition = args.Length >= 3 && args[2] == "-pos";

            // init
            PanGu.Segment.Init();
            PanGu.Match.MatchOptions   options    = PanGu.Setting.PanGuSettings.Config.MatchOptions;
            PanGu.Match.MatchParameter parameters = PanGu.Setting.PanGuSettings.Config.Parameters;

            // Read the input BEFORE opening the writer: creating the StreamWriter
            // creates/truncates the output file, so a missing or unreadable input would
            // otherwise leave an empty output file behind.
            string[] docs = System.IO.File.ReadAllLines(input);

            // Disposing the writer flushes and closes it; no explicit Flush()/Close() needed.
            using (System.IO.StreamWriter sw = new System.IO.StreamWriter(output))
            {
                foreach (string doc in docs)
                {
                    Segment segment = new Segment();
                    ICollection <WordInfo> words = segment.DoSegment(doc, options, parameters);

                    StringBuilder wordsString = new StringBuilder();
                    foreach (WordInfo wordInfo in words)
                    {
                        if (wordInfo == null)
                        {
                            continue;
                        }

                        if (showPosition)
                        {
                            wordsString.AppendFormat("{0}/({1},{2}) ", wordInfo.Word, wordInfo.Position, wordInfo.Rank);
                        }
                        else
                        {
                            wordsString.AppendFormat("{0} ", wordInfo.Word);
                        }
                    }

                    sw.WriteLine(wordsString.ToString());
                }
            }
        }
示例#4
0
        /// <summary>
        /// Initialises PanGu and segments <c>_InitSource</c> with only
        /// <c>MultiDimensionality</c> switched on. The result is not inspected.
        /// </summary>
        public void Test1()
        {
            PanGu.Segment.Init();

            Segment segmenter = new Segment();
            PanGu.Match.MatchOptions multiOptions = new PanGu.Match.MatchOptions
            {
                MultiDimensionality = true
            };

            // The returned collection is intentionally unused.
            segmenter.DoSegment(_InitSource, multiOptions);
        }
示例#5
0
        /// <summary>
        /// Creates a new <c>MatchOptions</c> instance and copies eight flags from
        /// <c>this.MatchOptions</c> into it.
        /// </summary>
        /// <remarks>
        /// Only the properties listed below are copied; every other member of the new
        /// instance keeps whatever value the <c>MatchOptions</c> constructor gives it.
        /// </remarks>
        /// <returns>The newly created options object.</returns>
        public Match.MatchOptions GetOptionsCopy()
        {
            return new PanGu.Match.MatchOptions
            {
                ChineseNameIdentify         = this.MatchOptions.ChineseNameIdentify,
                FrequencyFirst              = this.MatchOptions.FrequencyFirst,
                MultiDimensionality         = this.MatchOptions.MultiDimensionality,
                FilterStopWords             = this.MatchOptions.FilterStopWords,
                IgnoreSpace                 = this.MatchOptions.IgnoreSpace,
                ForceSingleWord             = this.MatchOptions.ForceSingleWord,
                TraditionalChineseEnabled   = this.MatchOptions.TraditionalChineseEnabled,
                OutputSimplifiedTraditional = this.MatchOptions.OutputSimplifiedTraditional
            };
        }
示例#6
0
        /// <summary>
        /// Click handler for the "segment" button: takes clones of the global options
        /// and parameters as the working copies, applies the current settings via
        /// <c>UpdateSettings()</c>, then renders the segmentation with or without
        /// positions depending on the "display position" check box.
        /// </summary>
        private void buttonSegment_Click(object sender, EventArgs e)
        {
            // Clones, not the live configuration objects.
            // NOTE(review): presumably so UpdateSettings() leaves the shared config untouched - confirm.
            _Options    = PanGu.Setting.PanGuSettings.Config.MatchOptions.Clone();
            _Parameters = PanGu.Setting.PanGuSettings.Config.Parameters.Clone();

            UpdateSettings();

            if (!checkBoxDisplayPosition.Checked)
            {
                DisplaySegment();
                return;
            }

            DisplaySegmentAndPostion();
        }
示例#7
0
        /// <summary>
        /// Populates the static segmentation options and parameters from the
        /// application's <c>appSettings</c>, builds the search <c>Config</c> from the
        /// "mongodb" connection-string entry, and registers the services with
        /// <c>CoreIoc</c> before building the container.
        /// </summary>
        public static void Init()
        {
            // Thin readers over appSettings; conversion is still done by Convert.*.
            Func<string, bool>  flag   = key => Convert.ToBoolean(ConfigurationManager.AppSettings[key]);
            Func<string, short> number = key => Convert.ToInt16(ConfigurationManager.AppSettings[key]);

            _Options    = new PanGu.Match.MatchOptions();
            _Parameters = new PanGu.Match.MatchParameter();

            // Note: FrequencyFirst is read from the key "checkBoxFreqFirst".
            _Options.FrequencyFirst              = flag("checkBoxFreqFirst");
            _Options.FilterStopWords             = flag("FilterStopWords");
            _Options.ChineseNameIdentify         = flag("ChineseNameIdentify");
            _Options.MultiDimensionality         = flag("MultiDimensionality");
            _Options.EnglishMultiDimensionality  = flag("EnglishMultiDimensionality");
            _Options.ForceSingleWord             = flag("ForceSingleWord");
            _Options.TraditionalChineseEnabled   = flag("TraditionalChineseEnabled");
            _Options.OutputSimplifiedTraditional = flag("OutputSimplifiedTraditional");
            _Options.UnknownWordIdentify         = flag("UnknownWordIdentify");
            _Options.FilterEnglish               = flag("FilterEnglish");
            _Options.FilterNumeric               = flag("FilterNumeric");
            _Options.IgnoreCapital               = flag("IgnoreCapital");
            _Options.EnglishSegment              = flag("EnglishSegment");
            _Options.SynonymOutput               = flag("SynonymOutput");
            _Options.WildcardOutput              = flag("WildcardOutput");
            _Options.WildcardSegment             = flag("WildcardSegment");
            _Options.CustomRule                  = flag("CustomRule");

            _Parameters.Redundancy          = number("Redundancy");
            _Parameters.FilterEnglishLength = number("FilterEnglishLength");
            _Parameters.FilterNumericLength = number("FilterNumericLength");

            // The "mongodb" connection-string entry is deserialised as a JSON list of server hosts.
            var config = new Config
            {
                ServerHosts     = JsonConvert.DeserializeObject <List <ServerHost> >(ConfigurationManager.ConnectionStrings["mongodb"].ToString()),
                BaseDbName      = "serch_base",
                IndexListDbName = "serch_index"
            };

            // Registrations are kept in their original order; Build() comes last.
            CoreIoc.Register(builder => builder.RegisterInstance(config).As <Config>().ExternallyOwned());
            CoreIoc.Register(builder => builder.RegisterType <ServerManage>().As <ServerManage>().SingleInstance());

            CoreIoc.Register(builder => builder.RegisterType <DocumentOp>().As <IDocument>().SingleInstance());
            CoreIoc.Register(builder => builder.RegisterInstance(new PanguTokenizer(_Options, _Parameters)).As <ITokenizer>().ExternallyOwned());
            CoreIoc.Register(builder => builder.RegisterType <Index>().As <IIndex>().SingleInstance());
            CoreIoc.Register(builder => builder.RegisterType <Query>().As <Iquery>().SingleInstance());
            CoreIoc.Build();
        }
示例#8
0
        /// <summary>
        /// Form load handler: shows the initial sample text, initialises PanGu, and
        /// mirrors the global match options and parameters into the form's controls.
        /// </summary>
        private void FormDemo_Load(object sender, EventArgs e)
        {
            textBoxSource.Text = _InitSource;
            PanGu.Segment.Init();

            // Check boxes <- global match options.
            PanGu.Match.MatchOptions matchOptions = PanGu.Setting.PanGuSettings.Config.MatchOptions;

            checkBoxFreqFirst.Checked          = matchOptions.FrequencyFirst;
            checkBoxFilterStopWords.Checked    = matchOptions.FilterStopWords;
            checkBoxMatchName.Checked          = matchOptions.ChineseNameIdentify;
            checkBoxMultiSelect.Checked        = matchOptions.MultiDimensionality;
            checkBoxEnglishMultiSelect.Checked = matchOptions.EnglishMultiDimensionality;
            checkBoxForceSingleWord.Checked    = matchOptions.ForceSingleWord;
            checkBoxTraditionalChs.Checked     = matchOptions.TraditionalChineseEnabled;
            checkBoxST.Checked                 = matchOptions.OutputSimplifiedTraditional;
            checkBoxUnknownWord.Checked        = matchOptions.UnknownWordIdentify;
            checkBoxFilterEnglish.Checked      = matchOptions.FilterEnglish;
            checkBoxFilterNumeric.Checked      = matchOptions.FilterNumeric;
            checkBoxIgnoreCapital.Checked      = matchOptions.IgnoreCapital;
            checkBoxEnglishSegment.Checked     = matchOptions.EnglishSegment;
            checkBoxSynonymOutput.Checked      = matchOptions.SynonymOutput;
            checkBoxWildcard.Checked           = matchOptions.WildcardOutput;
            checkBoxWildcardSegment.Checked    = matchOptions.WildcardSegment;
            checkBoxCustomRule.Checked         = matchOptions.CustomRule;
            checkBoxDisplayPosition.Checked    = matchOptions.DisplayPosition;
            checkBoxIgnoreSpace.Checked        = matchOptions.IgnoreSpace;
            checkBoxIgnoreEOL.Checked          = matchOptions.IgnoreEndOfLine;

            // Numeric inputs and divider text <- global match parameters.
            PanGu.Match.MatchParameter matchParameters = PanGu.Setting.PanGuSettings.Config.Parameters;

            numericUpDownRedundancy.Value          = matchParameters.Redundancy;
            numericUpDownFilterEnglishLength.Value = matchParameters.FilterEnglishLength;
            numericUpDownFilterNumericLength.Value = matchParameters.FilterNumericLength;
            divider.Text                           = matchParameters.CustomDivider;
        }
示例#9
0
        /// <summary>
        /// Populates the static segmentation options and parameters from the
        /// application's <c>appSettings</c>, builds the search <c>Config</c> from the
        /// "mongodb" connection-string entry, and registers the services with
        /// <c>CoreIoc</c> before building the container.
        /// </summary>
        public static void Init()
        {
            // Thin readers over appSettings; conversion is still done by Convert.*.
            Func<string, bool> readFlag = key => Convert.ToBoolean(ConfigurationManager.AppSettings[key]);
            Func<string, short> readShort = key => Convert.ToInt16(ConfigurationManager.AppSettings[key]);

            _Options = new PanGu.Match.MatchOptions();
            _Parameters = new PanGu.Match.MatchParameter();

            // Note: FrequencyFirst is read from the key "checkBoxFreqFirst".
            _Options.FrequencyFirst = readFlag("checkBoxFreqFirst");
            _Options.FilterStopWords = readFlag("FilterStopWords");
            _Options.ChineseNameIdentify = readFlag("ChineseNameIdentify");
            _Options.MultiDimensionality = readFlag("MultiDimensionality");
            _Options.EnglishMultiDimensionality = readFlag("EnglishMultiDimensionality");
            _Options.ForceSingleWord = readFlag("ForceSingleWord");
            _Options.TraditionalChineseEnabled = readFlag("TraditionalChineseEnabled");
            _Options.OutputSimplifiedTraditional = readFlag("OutputSimplifiedTraditional");
            _Options.UnknownWordIdentify = readFlag("UnknownWordIdentify");
            _Options.FilterEnglish = readFlag("FilterEnglish");
            _Options.FilterNumeric = readFlag("FilterNumeric");
            _Options.IgnoreCapital = readFlag("IgnoreCapital");
            _Options.EnglishSegment = readFlag("EnglishSegment");
            _Options.SynonymOutput = readFlag("SynonymOutput");
            _Options.WildcardOutput = readFlag("WildcardOutput");
            _Options.WildcardSegment = readFlag("WildcardSegment");
            _Options.CustomRule = readFlag("CustomRule");

            _Parameters.Redundancy = readShort("Redundancy");
            _Parameters.FilterEnglishLength = readShort("FilterEnglishLength");
            _Parameters.FilterNumericLength = readShort("FilterNumericLength");

            // The "mongodb" connection-string entry is deserialised as a JSON list of server hosts.
            var config = new Config
            {
                ServerHosts = JsonConvert.DeserializeObject<List<ServerHost>>(ConfigurationManager.ConnectionStrings["mongodb"].ToString()),
                BaseDbName = "serch_base",
                IndexListDbName = "serch_index"
            };

            // Registrations are kept in their original order; Build() comes last.
            CoreIoc.Register(builder => builder.RegisterInstance(config).As<Config>().ExternallyOwned());
            CoreIoc.Register(builder => builder.RegisterType<ServerManage>().As<ServerManage>().SingleInstance());

            CoreIoc.Register(builder => builder.RegisterType<DocumentOp>().As<IDocument>().SingleInstance());
            CoreIoc.Register(builder => builder.RegisterInstance(new PanguTokenizer(_Options, _Parameters)).As<ITokenizer>().ExternallyOwned());
            CoreIoc.Register(builder => builder.RegisterType<Index>().As<IIndex>().SingleInstance());
            CoreIoc.Register(builder => builder.RegisterType<Query>().As<Iquery>().SingleInstance());
            CoreIoc.Build();
        }
示例#10
0
 /// <summary>
 /// Creates a tokenizer that keeps the supplied match options and parameters
 /// and owns a new <c>Segment</c> instance.
 /// </summary>
 /// <param name="options">Match options stored as-is (no copy is made).</param>
 /// <param name="parameters">Match parameters stored as-is (no copy is made).</param>
 public PanguTokenizer(PanGu.Match.MatchOptions options, PanGu.Match.MatchParameter parameters)
 {
     _Options = options;
     _Parameters = parameters;

     segment = new Segment();
 }
示例#11
0
        /// <summary>
        /// Parses keywords (or tags) out of an article title by segmenting it with a
        /// fixed set of PanGu match options and rank parameters.
        /// </summary>
        /// <param name="title">The article title to segment.</param>
        /// <returns>The word infos produced by <c>Segment.DoSegment</c> for the title.</returns>
        private static ICollection<WordInfo> TitleToKeywordWordInfos(string title)
        {
            PanGu.Segment segmenter = new Segment();

            PanGu.Match.MatchOptions options = new PanGu.Match.MatchOptions
            {
                // Chinese personal-name recognition
                ChineseNameIdentify = false,
                // Frequency first
                FrequencyFirst = false,
                // Multi-dimensional segmentation
                MultiDimensionality = false,
                // English multi-dimensional segmentation; this switch separates letters from digits in English text
                EnglishMultiDimensionality = false,
                // Filter stop words
                FilterStopWords = true,
                // Ignore spaces, carriage returns and tabs
                IgnoreSpace = true,
                // Force single-character segmentation
                ForceSingleWord = false,
                // Traditional Chinese switch
                TraditionalChineseEnabled = false,
                // Output both simplified and traditional forms
                OutputSimplifiedTraditional = false,
                // Unknown (out-of-dictionary) word recognition
                UnknownWordIdentify = false,
                // Filter English; only effective when stop-word filtering is in effect
                FilterEnglish = true,
                // Filter numerics; only effective when stop-word filtering is in effect
                FilterNumeric = true,
                // Ignore English letter case
                IgnoreCapital = false,
                // English segmentation
                EnglishSegment = false,
                // Synonym output (generally meant for segmenting search strings; not recommended when indexing)
                SynonymOutput = false,
                // Wildcard match output
                WildcardOutput = false,
                // Segment the results of wildcard matching
                WildcardSegment = false
            };

            PanGu.Match.MatchParameter ranks = new PanGu.Match.MatchParameter
            {
                // Rank of unknown words
                UnknowRank = 1,
                // Rank of the best-matching word
                BestRank = 5,
                // Rank of the second-best match
                SecRank = 3,
                // Rank of the third-best match
                ThirdRank = 2,
                // Rank of single characters that are forcibly output
                SingleRank = 1,
                // Rank of numerics
                NumericRank = 1,
                // Rank of English words
                EnglishRank = 5,
                // Rank of lower-cased English words
                EnglishLowerRank = 3,
                // Rank of English word stems
                EnglishStemRank = 2,
                // Rank of symbols
                SymbolRank = 1,
                // When both simplified and traditional forms are forced out: rank of the form that is
                // NOT in the original text (traditional output for simplified input, and vice versa)
                SimplifiedTraditionalRank = 1,
                // Rank of synonyms
                SynonymRank = 1,
                // Rank of wildcard match results
                WildcardRank = 1,
                // With English filtering on, English longer than this length is filtered
                FilterEnglishLength = 0,
                // With numeric filtering on, numerics longer than this length are filtered
                FilterNumericLength = 0,
                // Assembly file name of the user-defined custom rule
                CustomRuleAssemblyFileName = string.Empty,
                // Full (namespace-qualified) class name of the user-defined custom rule
                CustomRuleFullClassName = string.Empty,
                // Redundancy
                Redundancy = 0
            };

            return segmenter.DoSegment(title, options, ranks);
        }
示例#12
0
        /// <summary>
        /// Click handler for the "save config" button: binds the working option and
        /// parameter fields to the global PanGu configuration, applies the current
        /// settings via <c>UpdateSettings()</c>, and saves the configuration to disk.
        /// </summary>
        private void buttonSaveConfig_Click(object sender, EventArgs e)
        {
            const string configFile = "PanGu.xml";

            // The live configuration objects are assigned here (no Clone()).
            // NOTE(review): presumably so that UpdateSettings() edits the very objects
            // that Save() persists - confirm against UpdateSettings().
            _Options = PanGu.Setting.PanGuSettings.Config.MatchOptions;
            _Parameters = PanGu.Setting.PanGuSettings.Config.Parameters;

            UpdateSettings();

            PanGu.Setting.PanGuSettings.Save(configFile);
        }
示例#13
0
        /// <summary>
        /// Click handler for the "segment" button: takes clones of the global options
        /// and parameters as the working copies, applies the current settings via
        /// <c>UpdateSettings()</c>, then renders the segmentation with or without
        /// positions depending on the "display position" check box.
        /// </summary>
        private void buttonSegment_Click(object sender, EventArgs e)
        {
            // Clones, not the live configuration objects.
            // NOTE(review): presumably so UpdateSettings() leaves the shared config untouched - confirm.
            _Options = PanGu.Setting.PanGuSettings.Config.MatchOptions.Clone();
            _Parameters = PanGu.Setting.PanGuSettings.Config.Parameters.Clone();

            UpdateSettings();

            if (!checkBoxDisplayPosition.Checked)
            {
                DisplaySegment();
                return;
            }

            DisplaySegmentAndPostion();
        }
示例#14
0
        /// <summary>
        /// Parses keywords (or tags) out of an article title by segmenting it with a
        /// fixed set of PanGu match options and rank parameters.
        /// </summary>
        /// <param name="title">The article title to segment.</param>
        /// <returns>The word infos produced by <c>Segment.DoSegment</c> for the title.</returns>
        private static ICollection <WordInfo> TitleToKeywordWordInfos(string title)
        {
            PanGu.Segment titleSegmenter = new Segment();

            PanGu.Match.MatchOptions keywordOptions = new PanGu.Match.MatchOptions
            {
                ChineseNameIdentify         = false, // Chinese personal-name recognition
                FrequencyFirst              = false, // frequency first
                MultiDimensionality         = false, // multi-dimensional segmentation
                EnglishMultiDimensionality  = false, // English multi-dimensional segmentation (separates letters from digits)
                FilterStopWords             = true,  // filter stop words
                IgnoreSpace                 = true,  // ignore spaces, carriage returns and tabs
                ForceSingleWord             = false, // force single-character segmentation
                TraditionalChineseEnabled   = false, // Traditional Chinese switch
                OutputSimplifiedTraditional = false, // output both simplified and traditional forms
                UnknownWordIdentify         = false, // unknown (out-of-dictionary) word recognition
                FilterEnglish               = true,  // filter English; only effective when stop-word filtering is in effect
                FilterNumeric               = true,  // filter numerics; only effective when stop-word filtering is in effect
                IgnoreCapital               = false, // ignore English letter case
                EnglishSegment              = false, // English segmentation
                SynonymOutput               = false, // synonym output (meant for search strings; not recommended when indexing)
                WildcardOutput              = false, // wildcard match output
                WildcardSegment             = false  // segment the results of wildcard matching
            };

            PanGu.Match.MatchParameter keywordRanks = new PanGu.Match.MatchParameter
            {
                UnknowRank                 = 1,            // rank of unknown words
                BestRank                   = 5,            // rank of the best-matching word
                SecRank                    = 3,            // rank of the second-best match
                ThirdRank                  = 2,            // rank of the third-best match
                SingleRank                 = 1,            // rank of forcibly output single characters
                NumericRank                = 1,            // rank of numerics
                EnglishRank                = 5,            // rank of English words
                EnglishLowerRank           = 3,            // rank of lower-cased English words
                EnglishStemRank            = 2,            // rank of English word stems
                SymbolRank                 = 1,            // rank of symbols
                // When both simplified and traditional forms are forced out: rank of the form that is
                // NOT in the original text (traditional output for simplified input, and vice versa).
                SimplifiedTraditionalRank  = 1,
                SynonymRank                = 1,            // rank of synonyms
                WildcardRank               = 1,            // rank of wildcard match results
                FilterEnglishLength        = 0,            // with English filtering on, filter English longer than this
                FilterNumericLength        = 0,            // with numeric filtering on, filter numerics longer than this
                CustomRuleAssemblyFileName = string.Empty, // assembly file name of the user-defined custom rule
                CustomRuleFullClassName    = string.Empty, // namespace-qualified class name of the custom rule
                Redundancy                 = 0             // redundancy
            };

            return titleSegmenter.DoSegment(title, keywordOptions, keywordRanks);
        }
示例#15
0
 /// <summary>
 /// Creates a tokenizer that keeps the supplied match options and parameters
 /// and owns a new <c>Segment</c> instance.
 /// </summary>
 /// <param name="options">Match options stored as-is (no copy is made).</param>
 /// <param name="parameters">Match parameters stored as-is (no copy is made).</param>
 public PanguTokenizer(PanGu.Match.MatchOptions options, PanGu.Match.MatchParameter parameters)
 {
     _Options = options;
     _Parameters = parameters;

     segment = new Segment();
 }
示例#16
0
        /// <summary>
        /// Creates a new <c>MatchOptions</c> instance and copies eight flags from
        /// <c>this.MatchOptions</c> into it.
        /// </summary>
        /// <remarks>
        /// Only the properties listed below are copied; every other member of the new
        /// instance keeps whatever value the <c>MatchOptions</c> constructor gives it.
        /// </remarks>
        /// <returns>The newly created options object.</returns>
        public Match.MatchOptions GetOptionsCopy()
        {
            return new PanGu.Match.MatchOptions
            {
                ChineseNameIdentify = this.MatchOptions.ChineseNameIdentify,
                FrequencyFirst = this.MatchOptions.FrequencyFirst,
                MultiDimensionality = this.MatchOptions.MultiDimensionality,
                FilterStopWords = this.MatchOptions.FilterStopWords,
                IgnoreSpace = this.MatchOptions.IgnoreSpace,
                ForceSingleWord = this.MatchOptions.ForceSingleWord,
                TraditionalChineseEnabled = this.MatchOptions.TraditionalChineseEnabled,
                OutputSimplifiedTraditional = this.MatchOptions.OutputSimplifiedTraditional
            };
        }