/// <summary>
/// Command-line entry point of the PanGu word segmenter.
/// </summary>
/// <remarks>
/// With at least two arguments, every line of the input file is segmented and written
/// to the output file as one line of space-terminated words. With fewer arguments the
/// usage text is printed and the <c>FormDemo</c> dialog is shown instead.
/// </remarks>
/// <param name="args">
/// <c>args[0]</c> is the input path and <c>args[1]</c> the output path. If
/// <c>args[2]</c> equals <c>-pos</c>, each word is written as <c>word/(Position,Rank)</c>.
/// </param>
static void Main(string[] args)
{
    Console.WriteLine("PanGu Word Segment");

    if (args.Length < 2)
    {
        Console.WriteLine("Usage: wordsegment <input> <output> [-pos]");

        // A form shown with ShowDialog is not disposed automatically when it closes.
        using (FormDemo demo = new FormDemo())
        {
            demo.ShowDialog();
        }

        return;
    }

    string input = args[0];
    string output = args[1];
    bool showPosition = args.Length >= 3 && args[2] == "-pos";

    // init
    PanGu.Segment.Init();
    PanGu.Match.MatchOptions options = PanGu.Setting.PanGuSettings.Config.MatchOptions;
    PanGu.Match.MatchParameter parameters = PanGu.Setting.PanGuSettings.Config.Parameters;

    // One segmenter serves every line; it does not need to be rebuilt per line.
    Segment segment = new Segment();

    // The writer is opened before the reader, matching the original order of side effects
    // (the output file is created/truncated even when the input cannot be opened).
    using (System.IO.StreamWriter sw = new System.IO.StreamWriter(output))
    using (System.IO.StreamReader sr = new System.IO.StreamReader(input))
    {
        // Stream the input line by line instead of loading the whole file into memory.
        string doc;
        while ((doc = sr.ReadLine()) != null)
        {
            ICollection<WordInfo> words = segment.DoSegment(doc, options, parameters);
            StringBuilder wordsString = new StringBuilder();

            foreach (WordInfo wordInfo in words)
            {
                if (wordInfo == null)
                {
                    continue;
                }

                if (showPosition)
                {
                    wordsString.AppendFormat("{0}/({1},{2}) ", wordInfo.Word, wordInfo.Position, wordInfo.Rank);
                }
                else
                {
                    wordsString.AppendFormat("{0} ", wordInfo.Word);
                }
            }

            sw.WriteLine(wordsString);
        }

        // No explicit Flush/Close: disposing the writer at the end of the using block does both.
    }
}
/// <summary>
/// Click handler for the save-config button.
/// </summary>
/// <remarks>
/// Points <c>_Options</c> and <c>_Parameters</c> at the live configuration objects (not
/// clones), calls <c>UpdateSettings()</c>, and then saves the settings to "PanGu.xml".
/// NOTE(review): <c>UpdateSettings</c> is defined elsewhere; presumably it copies the
/// form's control values into <c>_Options</c>/<c>_Parameters</c> — confirm.
/// </remarks>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void buttonSaveConfig_Click(object sender, EventArgs e)
{
    const string settingsFile = "PanGu.xml";

    // Work directly on the shared configuration so the changes are what gets saved.
    _Options = PanGu.Setting.PanGuSettings.Config.MatchOptions;
    _Parameters = PanGu.Setting.PanGuSettings.Config.Parameters;

    UpdateSettings();

    PanGu.Setting.PanGuSettings.Save(settingsFile);
}
/// <summary>
/// Click handler for the segment button.
/// </summary>
/// <remarks>
/// Takes clones of the configured match options and parameters, calls
/// <c>UpdateSettings()</c>, and then renders the result with or without positions
/// depending on the "display position" checkbox.
/// </remarks>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void buttonSegment_Click(object sender, EventArgs e)
{
    // Clones are used here, so the shared configuration objects are not the ones assigned.
    _Options = PanGu.Setting.PanGuSettings.Config.MatchOptions.Clone();
    _Parameters = PanGu.Setting.PanGuSettings.Config.Parameters.Clone();

    UpdateSettings();

    if (!checkBoxDisplayPosition.Checked)
    {
        DisplaySegment();
        return;
    }

    DisplaySegmentAndPostion();
}
/// <summary>
/// Creates a new <c>MatchParameter</c> populated from <c>this.Parameters</c>.
/// </summary>
/// <remarks>
/// Only the ten fields listed below are copied; every other field of the returned
/// object keeps whatever value the <c>MatchParameter</c> constructor gives it.
/// NOTE(review): confirm that leaving the remaining fields at their defaults is intended.
/// </remarks>
/// <returns>A new instance carrying the copied rank and redundancy values.</returns>
public Match.MatchParameter GetParameterCopy()
{
    return new PanGu.Match.MatchParameter
    {
        Redundancy = this.Parameters.Redundancy,
        UnknowRank = this.Parameters.UnknowRank,
        BestRank = this.Parameters.BestRank,
        SecRank = this.Parameters.SecRank,
        ThirdRank = this.Parameters.ThirdRank,
        SingleRank = this.Parameters.SingleRank,
        NumericRank = this.Parameters.NumericRank,
        EnglishRank = this.Parameters.EnglishRank,
        SymbolRank = this.Parameters.SymbolRank,
        SimplifiedTraditionalRank = this.Parameters.SimplifiedTraditionalRank
    };
}
/// <summary>
/// Loads the tokenizer settings from the application configuration and registers the
/// search components with <c>CoreIoc</c>.
/// </summary>
/// <remarks>
/// Match options and parameters are read from <c>appSettings</c>. A missing key yields
/// <c>false</c> / <c>0</c>, because <c>Convert.ToBoolean</c> and <c>Convert.ToInt16</c>
/// map a <c>null</c> string to those values; a malformed value still throws
/// <see cref="FormatException"/>. The server host list is deserialized from the JSON
/// stored in the "mongodb" connection string.
/// </remarks>
/// <exception cref="ConfigurationErrorsException">
/// The "mongodb" connection string is not configured.
/// </exception>
public static void Init()
{
    Func<string, bool> readFlag = key => Convert.ToBoolean(ConfigurationManager.AppSettings[key]);
    Func<string, short> readShort = key => Convert.ToInt16(ConfigurationManager.AppSettings[key]);

    _Options = new PanGu.Match.MatchOptions();
    _Parameters = new PanGu.Match.MatchParameter();

    // NOTE(review): this is the only key not named after its property ("checkBoxFreqFirst");
    // kept as-is because existing configuration files may rely on it — confirm.
    _Options.FrequencyFirst = readFlag("checkBoxFreqFirst");
    _Options.FilterStopWords = readFlag("FilterStopWords");
    _Options.ChineseNameIdentify = readFlag("ChineseNameIdentify");
    _Options.MultiDimensionality = readFlag("MultiDimensionality");
    _Options.EnglishMultiDimensionality = readFlag("EnglishMultiDimensionality");
    _Options.ForceSingleWord = readFlag("ForceSingleWord");
    _Options.TraditionalChineseEnabled = readFlag("TraditionalChineseEnabled");
    _Options.OutputSimplifiedTraditional = readFlag("OutputSimplifiedTraditional");
    _Options.UnknownWordIdentify = readFlag("UnknownWordIdentify");
    _Options.FilterEnglish = readFlag("FilterEnglish");
    _Options.FilterNumeric = readFlag("FilterNumeric");
    _Options.IgnoreCapital = readFlag("IgnoreCapital");
    _Options.EnglishSegment = readFlag("EnglishSegment");
    _Options.SynonymOutput = readFlag("SynonymOutput");
    _Options.WildcardOutput = readFlag("WildcardOutput");
    _Options.WildcardSegment = readFlag("WildcardSegment");
    _Options.CustomRule = readFlag("CustomRule");

    _Parameters.Redundancy = readShort("Redundancy");
    _Parameters.FilterEnglishLength = readShort("FilterEnglishLength");
    _Parameters.FilterNumericLength = readShort("FilterNumericLength");

    var config = new Config();

    // The indexer returns null for an unknown name; fail with a clear message instead
    // of a NullReferenceException.
    ConnectionStringSettings mongo = ConfigurationManager.ConnectionStrings["mongodb"];
    if (mongo == null)
    {
        throw new ConfigurationErrorsException(
            "Connection string 'mongodb' is missing from the application configuration.");
    }

    config.ServerHosts = JsonConvert.DeserializeObject<List<ServerHost>>(mongo.ToString());
    config.BaseDbName = "serch_base";
    config.IndexListDbName = "serch_index";

    CoreIoc.Register(o => o.RegisterInstance(config).As<Config>().ExternallyOwned());
    CoreIoc.Register(o => o.RegisterType<ServerManage>().As<ServerManage>().SingleInstance());
    CoreIoc.Register(o => o.RegisterType<DocumentOp>().As<IDocument>().SingleInstance());
    CoreIoc.Register(o => o.RegisterInstance(new PanguTokenizer(_Options, _Parameters)).As<ITokenizer>().ExternallyOwned());
    CoreIoc.Register(o => o.RegisterType<Index>().As<IIndex>().SingleInstance());
    CoreIoc.Register(o => o.RegisterType<Query>().As<Iquery>().SingleInstance());
    CoreIoc.Build();
}
/// <summary>
/// Load handler of the demo form: initializes PanGu and copies the configured match
/// options and parameters into the form's controls.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void FormDemo_Load(object sender, EventArgs e)
{
    textBoxSource.Text = _InitSource;

    PanGu.Segment.Init();

    // Option check boxes, assigned in the same order as before.
    PanGu.Match.MatchOptions matchOptions = PanGu.Setting.PanGuSettings.Config.MatchOptions;

    checkBoxFreqFirst.Checked = matchOptions.FrequencyFirst;
    checkBoxFilterStopWords.Checked = matchOptions.FilterStopWords;
    checkBoxMatchName.Checked = matchOptions.ChineseNameIdentify;
    checkBoxMultiSelect.Checked = matchOptions.MultiDimensionality;
    checkBoxEnglishMultiSelect.Checked = matchOptions.EnglishMultiDimensionality;
    checkBoxForceSingleWord.Checked = matchOptions.ForceSingleWord;
    checkBoxTraditionalChs.Checked = matchOptions.TraditionalChineseEnabled;
    checkBoxST.Checked = matchOptions.OutputSimplifiedTraditional;
    checkBoxUnknownWord.Checked = matchOptions.UnknownWordIdentify;
    checkBoxFilterEnglish.Checked = matchOptions.FilterEnglish;
    checkBoxFilterNumeric.Checked = matchOptions.FilterNumeric;
    checkBoxIgnoreCapital.Checked = matchOptions.IgnoreCapital;
    checkBoxEnglishSegment.Checked = matchOptions.EnglishSegment;
    checkBoxSynonymOutput.Checked = matchOptions.SynonymOutput;
    checkBoxWildcard.Checked = matchOptions.WildcardOutput;
    checkBoxWildcardSegment.Checked = matchOptions.WildcardSegment;
    checkBoxCustomRule.Checked = matchOptions.CustomRule;
    checkBoxDisplayPosition.Checked = matchOptions.DisplayPosition;
    checkBoxIgnoreSpace.Checked = matchOptions.IgnoreSpace;
    checkBoxIgnoreEOL.Checked = matchOptions.IgnoreEndOfLine;

    // Numeric parameters and the custom divider text.
    PanGu.Match.MatchParameter matchParameters = PanGu.Setting.PanGuSettings.Config.Parameters;

    numericUpDownRedundancy.Value = matchParameters.Redundancy;
    numericUpDownFilterEnglishLength.Value = matchParameters.FilterEnglishLength;
    numericUpDownFilterNumericLength.Value = matchParameters.FilterNumericLength;
    divider.Text = matchParameters.CustomDivider;
}
/// <summary>
/// Reads the tokenizer options from the application configuration and wires the search
/// services into <c>CoreIoc</c>.
/// </summary>
/// <remarks>
/// Every option is read from <c>appSettings</c> via <c>Convert.ToBoolean</c> /
/// <c>Convert.ToInt16</c>, which return <c>false</c> / <c>0</c> for a missing (null)
/// value. The "mongodb" connection string is expected to hold the JSON list of server
/// hosts.
/// </remarks>
/// <exception cref="ConfigurationErrorsException">
/// Thrown when no connection string named "mongodb" is configured.
/// </exception>
public static void Init()
{
    // Resolve the connection string entry up front so a missing entry is reported
    // clearly instead of surfacing later as a NullReferenceException.
    ConnectionStringSettings mongoSettings = ConfigurationManager.ConnectionStrings["mongodb"];
    if (mongoSettings == null)
    {
        throw new ConfigurationErrorsException(
            "Connection string 'mongodb' is missing from the application configuration.");
    }

    _Options = new PanGu.Match.MatchOptions();
    _Parameters = new PanGu.Match.MatchParameter();

    // Boolean switches. NOTE(review): the first key is "checkBoxFreqFirst", unlike the
    // others, which match their property names — left unchanged; confirm against config.
    _Options.FrequencyFirst = Convert.ToBoolean(ConfigurationManager.AppSettings["checkBoxFreqFirst"]);
    _Options.FilterStopWords = Convert.ToBoolean(ConfigurationManager.AppSettings["FilterStopWords"]);
    _Options.ChineseNameIdentify = Convert.ToBoolean(ConfigurationManager.AppSettings["ChineseNameIdentify"]);
    _Options.MultiDimensionality = Convert.ToBoolean(ConfigurationManager.AppSettings["MultiDimensionality"]);
    _Options.EnglishMultiDimensionality = Convert.ToBoolean(ConfigurationManager.AppSettings["EnglishMultiDimensionality"]);
    _Options.ForceSingleWord = Convert.ToBoolean(ConfigurationManager.AppSettings["ForceSingleWord"]);
    _Options.TraditionalChineseEnabled = Convert.ToBoolean(ConfigurationManager.AppSettings["TraditionalChineseEnabled"]);
    _Options.OutputSimplifiedTraditional = Convert.ToBoolean(ConfigurationManager.AppSettings["OutputSimplifiedTraditional"]);
    _Options.UnknownWordIdentify = Convert.ToBoolean(ConfigurationManager.AppSettings["UnknownWordIdentify"]);
    _Options.FilterEnglish = Convert.ToBoolean(ConfigurationManager.AppSettings["FilterEnglish"]);
    _Options.FilterNumeric = Convert.ToBoolean(ConfigurationManager.AppSettings["FilterNumeric"]);
    _Options.IgnoreCapital = Convert.ToBoolean(ConfigurationManager.AppSettings["IgnoreCapital"]);
    _Options.EnglishSegment = Convert.ToBoolean(ConfigurationManager.AppSettings["EnglishSegment"]);
    _Options.SynonymOutput = Convert.ToBoolean(ConfigurationManager.AppSettings["SynonymOutput"]);
    _Options.WildcardOutput = Convert.ToBoolean(ConfigurationManager.AppSettings["WildcardOutput"]);
    _Options.WildcardSegment = Convert.ToBoolean(ConfigurationManager.AppSettings["WildcardSegment"]);
    _Options.CustomRule = Convert.ToBoolean(ConfigurationManager.AppSettings["CustomRule"]);

    // Numeric parameters.
    _Parameters.Redundancy = Convert.ToInt16(ConfigurationManager.AppSettings["Redundancy"]);
    _Parameters.FilterEnglishLength = Convert.ToInt16(ConfigurationManager.AppSettings["FilterEnglishLength"]);
    _Parameters.FilterNumericLength = Convert.ToInt16(ConfigurationManager.AppSettings["FilterNumericLength"]);

    // Storage configuration.
    var config = new Config();
    config.ServerHosts = JsonConvert.DeserializeObject<List<ServerHost>>(mongoSettings.ToString());
    config.BaseDbName = "serch_base";
    config.IndexListDbName = "serch_index";

    // Container registrations, in the original order, followed by the build.
    CoreIoc.Register(o => o.RegisterInstance(config).As<Config>().ExternallyOwned());
    CoreIoc.Register(o => o.RegisterType<ServerManage>().As<ServerManage>().SingleInstance());
    CoreIoc.Register(o => o.RegisterType<DocumentOp>().As<IDocument>().SingleInstance());
    CoreIoc.Register(o => o.RegisterInstance(new PanguTokenizer(_Options, _Parameters)).As<ITokenizer>().ExternallyOwned());
    CoreIoc.Register(o => o.RegisterType<Index>().As<IIndex>().SingleInstance());
    CoreIoc.Register(o => o.RegisterType<Query>().As<Iquery>().SingleInstance());
    CoreIoc.Build();
}
/// <summary>
/// Creates a tokenizer that stores the given match options and parameters and owns a
/// new <c>Segment</c> instance.
/// </summary>
/// <param name="options">Match options stored in <c>_Options</c>.</param>
/// <param name="parameters">Match parameters stored in <c>_Parameters</c>.</param>
public PanguTokenizer(PanGu.Match.MatchOptions options, PanGu.Match.MatchParameter parameters)
{
    _Options = options;
    _Parameters = parameters;

    // The segmenter is created once here and kept in the field.
    segment = new Segment();
}
/// <summary>
/// Segments an article title into keyword (or tag) candidates.
/// </summary>
/// <param name="title">The article title to segment.</param>
/// <returns>The words produced by <c>Segment.DoSegment</c> for the title.</returns>
private static ICollection<WordInfo> TitleToKeywordWordInfos(string title)
{
    PanGu.Segment segment = new Segment();

    PanGu.Match.MatchOptions matchOptions = new PanGu.Match.MatchOptions
    {
        // Chinese personal-name recognition
        ChineseNameIdentify = false,
        // Frequency first
        FrequencyFirst = false,
        // Multi-dimensional segmentation
        MultiDimensionality = false,
        // English multi-dimensional segmentation; this switch separates letters from digits in English tokens
        EnglishMultiDimensionality = false,
        // Filter stop words
        FilterStopWords = true,
        // Ignore spaces, carriage returns and tabs
        IgnoreSpace = true,
        // Force single-character segmentation
        ForceSingleWord = false,
        // Traditional Chinese switch
        TraditionalChineseEnabled = false,
        // Output both simplified and traditional forms
        OutputSimplifiedTraditional = false,
        // Unknown (out-of-dictionary) word recognition
        UnknownWordIdentify = false,
        // Filter English; only effective when stop-word filtering is enabled
        FilterEnglish = true,
        // Filter numbers; only effective when stop-word filtering is enabled
        FilterNumeric = true,
        // Ignore English letter case
        IgnoreCapital = false,
        // English segmentation
        EnglishSegment = false,
        // Synonym output (normally used when segmenting search strings; not recommended for indexing)
        SynonymOutput = false,
        // Wildcard match output
        WildcardOutput = false,
        // Segment the results of wildcard matches
        WildcardSegment = false
    };

    PanGu.Match.MatchParameter matchParameter = new PanGu.Match.MatchParameter
    {
        // Rank of unknown words
        UnknowRank = 1,
        // Rank of the best match
        BestRank = 5,
        // Rank of the second-best match
        SecRank = 3,
        // Rank of the third-best match
        ThirdRank = 2,
        // Rank of single characters that are forcibly output
        SingleRank = 1,
        // Rank of numbers
        NumericRank = 1,
        // Rank of English words
        EnglishRank = 5,
        // Rank of lower-cased English words
        EnglishLowerRank = 3,
        // Rank of English word stems
        EnglishStemRank = 2,
        // Rank of symbols
        SymbolRank = 1,
        // When both simplified and traditional forms are output, the rank of the form that was
        // not in the original text (e.g. the traditional form for simplified input, and vice versa)
        SimplifiedTraditionalRank = 1,
        // Rank of synonyms
        SynonymRank = 1,
        // Rank of wildcard match results
        WildcardRank = 1,
        // With English filtering on, English longer than this length is filtered
        FilterEnglishLength = 0,
        // With numeric filtering on, numbers longer than this length are filtered
        FilterNumericLength = 0,
        // Assembly file name of the user-defined rule
        CustomRuleAssemblyFileName = string.Empty,
        // Full class name (including namespace) of the user-defined rule
        CustomRuleFullClassName = string.Empty,
        // Redundancy
        Redundancy = 0
    };

    return segment.DoSegment(title, matchOptions, matchParameter);
}
/// <summary>
/// Derives keyword (or tag) candidates from an article title by running it through the
/// PanGu segmenter with a fixed set of options and ranks.
/// </summary>
/// <param name="title">Title text to segment.</param>
/// <returns>The collection returned by <c>DoSegment</c> for <paramref name="title"/>.</returns>
private static ICollection<WordInfo> TitleToKeywordWordInfos(string title)
{
    PanGu.Segment segmenter = new Segment();

    // ---- Match options -------------------------------------------------------------
    PanGu.Match.MatchOptions options = new PanGu.Match.MatchOptions();
    options.ChineseNameIdentify = false;          // Chinese personal-name recognition
    options.FrequencyFirst = false;               // frequency first
    options.MultiDimensionality = false;          // multi-dimensional segmentation
    options.EnglishMultiDimensionality = false;   // English multi-dimensional segmentation (splits letters from digits)
    options.FilterStopWords = true;               // filter stop words
    options.IgnoreSpace = true;                   // ignore spaces, carriage returns, tabs
    options.ForceSingleWord = false;              // force single-character segmentation
    options.TraditionalChineseEnabled = false;    // traditional Chinese switch
    options.OutputSimplifiedTraditional = false;  // output simplified and traditional together
    options.UnknownWordIdentify = false;          // unknown-word recognition
    options.FilterEnglish = true;                 // filter English (only effective with stop-word filtering on)
    options.FilterNumeric = true;                 // filter numbers (only effective with stop-word filtering on)
    options.IgnoreCapital = false;                // ignore English letter case
    options.EnglishSegment = false;               // English segmentation
    options.SynonymOutput = false;                // synonym output (meant for search strings, not for indexing)
    options.WildcardOutput = false;               // wildcard match output
    options.WildcardSegment = false;              // segment wildcard match results

    // ---- Ranks and limits ----------------------------------------------------------
    PanGu.Match.MatchParameter ranks = new PanGu.Match.MatchParameter();
    ranks.UnknowRank = 1;                 // unknown words
    ranks.BestRank = 5;                   // best match
    ranks.SecRank = 3;                    // second-best match
    ranks.ThirdRank = 2;                  // third-best match
    ranks.SingleRank = 1;                 // forcibly output single characters
    ranks.NumericRank = 1;                // numbers
    ranks.EnglishRank = 5;                // English words
    ranks.EnglishLowerRank = 3;           // lower-cased English words
    ranks.EnglishStemRank = 2;            // English word stems
    ranks.SymbolRank = 1;                 // symbols
    // Rank of the converted (non-original) form when simplified and traditional
    // characters are both output, e.g. the traditional form for simplified input.
    ranks.SimplifiedTraditionalRank = 1;
    ranks.SynonymRank = 1;                // synonyms
    ranks.WildcardRank = 1;               // wildcard match results
    ranks.FilterEnglishLength = 0;        // with English filtering on, longer English is filtered
    ranks.FilterNumericLength = 0;        // with numeric filtering on, longer numbers are filtered
    ranks.CustomRuleAssemblyFileName = string.Empty;  // assembly file of the user-defined rule
    ranks.CustomRuleFullClassName = string.Empty;     // full (namespace-qualified) class name of the rule
    ranks.Redundancy = 0;                 // redundancy

    ICollection<WordInfo> keywords = segmenter.DoSegment(title, options, ranks);
    return keywords;
}
/// <summary>
/// Initializes the tokenizer with the supplied segmentation settings.
/// </summary>
/// <remarks>
/// The arguments are stored as given (no copy is made) and a fresh <c>Segment</c> is
/// assigned to the <c>segment</c> field.
/// </remarks>
/// <param name="options">Match options to keep in <c>_Options</c>.</param>
/// <param name="parameters">Match parameters to keep in <c>_Parameters</c>.</param>
public PanguTokenizer(PanGu.Match.MatchOptions options, PanGu.Match.MatchParameter parameters)
{
    // Settings are kept by reference.
    _Options = options;
    _Parameters = parameters;

    segment = new Segment();
}
/// <summary>
/// Returns a new <c>MatchParameter</c> carrying the redundancy and rank values of
/// <c>this.Parameters</c>.
/// </summary>
/// <remarks>
/// Exactly ten fields are transferred (see the body). Fields not listed are not copied.
/// NOTE(review): verify that callers do not expect a complete copy.
/// </remarks>
/// <returns>The newly created parameter object.</returns>
public Match.MatchParameter GetParameterCopy()
{
    var source = this.Parameters;
    Match.MatchParameter copy = new PanGu.Match.MatchParameter();

    copy.Redundancy = source.Redundancy;

    // Rank values
    copy.UnknowRank = source.UnknowRank;
    copy.BestRank = source.BestRank;
    copy.SecRank = source.SecRank;
    copy.ThirdRank = source.ThirdRank;
    copy.SingleRank = source.SingleRank;
    copy.NumericRank = source.NumericRank;
    copy.EnglishRank = source.EnglishRank;
    copy.SymbolRank = source.SymbolRank;
    copy.SimplifiedTraditionalRank = source.SimplifiedTraditionalRank;

    return copy;
}