/// <summary>
/// Demo entry point: segments a sample sentence with PanGu (stop-word filtering turned off),
/// prints the words separated by spaces, then prints the pinyin computed for that
/// space-separated text and waits for a key press.
/// </summary>
static void Main(string[] args)
{
    var str = "共产党是世界上最坏的政党,压迫人民,繁重的苛捐杂税,高压统治,反对人权, 自由万岁!";
    Segment.Init();

    // Work on copies of the configured options/parameters so the shared configuration is untouched.
    var options = PanGu.Setting.PanGuSettings.Config.MatchOptions.Clone();
    var parameters = PanGu.Setting.PanGuSettings.Config.Parameters.Clone();
    options.FilterStopWords = false;

    Segment segmenter = new Segment();
    StringBuilder spaced = new StringBuilder();
    foreach (WordInfo token in segmenter.DoSegment(str, options, parameters))
    {
        if (token != null)
        {
            spaced.AppendFormat("{0} ", token.Word);
        }
    }

    // The segmented text replaces the input and is what gets converted to pinyin below.
    str = spaced.ToString();
    Console.WriteLine(str);

    var zhlist = PinyinCommon.GetZhDictsList();
    Console.WriteLine(PinyinCommon.GetPinyin(str, zhlist));
    Console.ReadKey();
}
/// <summary>
/// Segments text with PanGu and returns the words that are longer than one character,
/// joined by commas.
/// </summary>
/// <param name="str">Text to segment.</param>
/// <returns>
/// The comma-separated words with trailing commas trimmed. If segmentation throws,
/// the exception message is returned instead (existing contract, kept for callers).
/// </returns>
public static string StrSplit(string str)
{
    try
    {
        Segment segment = new Segment();
        ICollection<WordInfo> words = segment.DoSegment(str, _Options, _Parameters);

        StringBuilder wordsString = new StringBuilder();
        foreach (WordInfo wordInfo in words)
        {
            // Skip missing entries, entries without text, and single-character words.
            if (wordInfo == null || wordInfo.Word == null || wordInfo.Word.Length <= 1)
            {
                continue;
            }

            wordsString.Append(wordInfo.Word).Append(',');
        }

        return wordsString.ToString().TrimEnd(',');
    }
    catch (Exception ex)
    {
        // NOTE(review): returning the message makes failures indistinguishable from results;
        // left unchanged because callers may depend on it.
        return ex.Message;
    }
}
/// <summary>
/// Segments the keywords and the content, lower-cases every segmented word, stores the
/// content, picks up and optimizes the keyword selection, and returns the fragments
/// built from that selection.
/// </summary>
/// <param name="keywords">Keyword text to segment.</param>
/// <param name="content">Content text to segment and search.</param>
/// <returns>The fragments produced for the optimized selection.</returns>
private List<Fragment> GetFragments(string keywords, string content)
{
    ICollection<WordInfo> keywordTokens = _PanGuSegment.DoSegment(keywords);
    LowerCaseWords(keywordTokens);

    ICollection<WordInfo> contentTokens = _PanGuSegment.DoSegment(content);
    LowerCaseWords(contentTokens);

    _Content = content;

    // Kept as two separate assignments, exactly as before.
    _Selection = PickupKeywords(keywordTokens, contentTokens);
    _Selection = Optimize(_Selection);

    return GetFragments(_Selection);
}

/// <summary>
/// Replaces the text of every non-null word that has text with its lower-case form.
/// </summary>
private static void LowerCaseWords(ICollection<WordInfo> tokens)
{
    foreach (WordInfo token in tokens)
    {
        if (token != null && token.Word != null)
        {
            token.Word = token.Word.ToLower();
        }
    }
}
/// <summary>
/// Segments the given text with PanGu and renders the words as a single string in which
/// every word is followed by a '/' separator.
/// </summary>
/// <param name="textSource">Text to segment.</param>
/// <returns>The words, each followed by "/"; empty when there are no words.</returns>
private string DisplaySegment(string textSource)
{
    Segment segmenter = new Segment();
    StringBuilder rendered = new StringBuilder();

    foreach (WordInfo token in segmenter.DoSegment(textSource))
    {
        // Null entries are skipped.
        if (token != null)
        {
            rendered.AppendFormat("{0}/", token.Word);
        }
    }

    return rendered.ToString();
}
/// <summary>
/// Click handler: segments the text of <c>txtContent</c> and shows one line per word in
/// <c>ltlResult</c>, in the form "word,part-of-speech" followed by an HTML line break.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
protected void btnSplit_Click(object sender, EventArgs e)
{
    // Clear first so a failed segmentation does not leave a stale result on screen.
    ltlResult.Text = "";

    var segment = new PanGu.Segment();
    var words = segment.DoSegment(txtContent.Text);

    // Build the markup once instead of repeatedly re-concatenating the control's Text.
    var html = new System.Text.StringBuilder();
    foreach (var word in words)
    {
        if (word == null)
        {
            continue;
        }

        // The word text originates from user input, so it is HTML-encoded before being
        // written into the literal. ToString() is used because the original concatenated
        // the word object itself.
        html.Append(System.Web.HttpUtility.HtmlEncode(word.ToString()));
        html.Append(',');
        html.Append(GetChsPos(word.Pos));
        html.Append("<br />");
    }

    ltlResult.Text = html.ToString();
}
/// <summary>
/// Segments the content with PanGu and counts how many times each word occurs.
/// </summary>
/// <param name="content">Text to segment.</param>
/// <returns>A dictionary mapping each word to its number of occurrences.</returns>
static Dictionary<string, int> ContetnWordSegment(string content)
{
    Dictionary<string, int> ret = new Dictionary<string, int>();
    Segment seg = new Segment();

    foreach (WordInfo w in seg.DoSegment(content))
    {
        // A null entry or null text cannot be used as a dictionary key; skip it.
        if (w == null || w.Word == null)
        {
            continue;
        }

        // Single lookup: count stays 0 when the word has not been seen yet.
        int count;
        ret.TryGetValue(w.Word, out count);
        ret[w.Word] = count + 1;
    }

    return ret;
}
/// <summary>
/// Segments a string with PanGu and returns the text of each word in output order.
/// </summary>
/// <param name="s">Text to segment.</param>
/// <returns>The word texts; null entries in the segmentation result are skipped.</returns>
public static List<string> panguDivide(String s)
{
    List<string> texts = new List<string>();
    Segment segmenter = new Segment();

    foreach (WordInfo token in segmenter.DoSegment(s))
    {
        if (token != null)
        {
            texts.Add(token.Word);
        }
    }

    return texts;
}
/// <summary>
/// Segments the text with PanGu (numeric filtering, frequency-first matching and
/// case-insensitive matching enabled) and returns every word longer than one character
/// paired with the value 1.
/// </summary>
/// <param name="text">Text to segment.</param>
/// <returns>One (word, 1) tuple per word whose length exceeds one character.</returns>
public static List<Tuple<string, double>> GetTermFreq(this string text)
{
    Segment.Init();
    var seg = new PanGu.Segment();

    var matchOptions = new PanGu.Match.MatchOptions
    {
        FilterNumeric = true,
        FrequencyFirst = true,
        // EnglishSegment = true,
        IgnoreCapital = true,
    };
    var rst = seg.DoSegment(text, matchOptions);

    var longWords =
        from t in rst
        where t.Word.Length > 1
        select new Tuple<string, double>(t.Word, 1);

    return longWords.ToList();
}
/// <summary>
/// Segments the video's channel name and renders each word as "word^N.0 ", where N is
/// 3 raised to the word's rank, truncated to an integer.
/// </summary>
/// <param name="video">Video whose <c>ChannelName</c> is segmented.</param>
/// <returns>The concatenated "word^N.0 " terms.</returns>
private static string SegmentKeyWord(VideoNode video)
{
    var terms = new StringBuilder(50);
    var segmenter = new Segment();

    foreach (var token in segmenter.DoSegment(video.ChannelName))
    {
        if (token != null)
        {
            int boost = (int)Math.Pow(3, token.Rank);
            terms.AppendFormat("{0}^{1}.0 ", token.Word, boost);
        }
    }

    // Only the channel name contributes terms; catalog, actor and area tags are not used.
    return terms.ToString();
}
/// <summary>
/// Creates a tokenizer over the given reader: initializes the PanGu segmenter under a lock,
/// reads the whole input, and segments it into the word list.
/// </summary>
/// <param name="input">Reader supplying the text to tokenize.</param>
public PanGuTokenizer(TextReader input)
    : base(input)
{
    lock (_LockObj)
    {
        InitPanGuSegment();
    }

    _InputText = base.input.ReadToEnd();

    if (string.IsNullOrEmpty(_InputText))
    {
        // ReadToEnd produced nothing: try again in 1024-character chunks. The first chunk
        // is read through base.input and later chunks through the constructor parameter,
        // exactly as before.
        char[] chunk = new char[1024];
        int read = base.input.Read(chunk, 0, chunk.Length);
        StringBuilder collected = new StringBuilder(chunk.Length);

        while (read > 0)
        {
            collected.Append(chunk, 0, read);
            read = input.Read(chunk, 0, chunk.Length);
        }

        if (collected.Length > 0)
        {
            _InputText = collected.ToString();
        }
    }

    if (string.IsNullOrEmpty(_InputText))
    {
        // No text at all: nothing to segment.
        _WordList = new WordInfo[0];
        return;
    }

    global::PanGu.Segment segmenter = new Segment();
    ICollection<WordInfo> tokens = segmenter.DoSegment(_InputText);
    _WordList = new WordInfo[tokens.Count];
    tokens.CopyTo(_WordList, 0);
}
/// <summary>
/// Click handler demo: prints the PanGu segmentation of a sample sentence, then prints the
/// best highlighted fragment of a sample text for a fixed set of keywords.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button2_Click(object sender, EventArgs e)
{
    PanGu.Segment.Init();

    // Part 1: segment a sample sentence and print one word per line.
    PanGu.Segment segmenter = new PanGu.Segment();
    ICollection<PanGu.WordInfo> tokens = segmenter.DoSegment("山东落花生花落东山,长春市长春花店");
    foreach (var token in tokens)
    {
        Console.WriteLine(token.Word);
    }

    // Part 2: highlight keywords inside a longer text, wrapping matches in red <font> tags.
    PanGu.HighLight.SimpleHTMLFormatter formatter =
        new PanGu.HighLight.SimpleHTMLFormatter("<font color=\"red\">", "</font>");
    PanGu.HighLight.Highlighter highlighter =
        new PanGu.HighLight.Highlighter(formatter, new PanGu.Segment());

    // Number of characters in each summary fragment.
    highlighter.FragmentSize = 100;

    string keywords = "信号/道路/开通";
    string content = @"高德完胜百度。我专门花了几个星期,在我所在的城市测试两个地图,高德数据不准确在少数,而百度就是家常便饭了,表现为: 已经管制一年的道路(双向变单向),百度仍然提示双向皆可走。 已经封闭数年的道路,百度仍然说是通的。 新修道路,还没有开通,百度居然让走。 有时候规划路线时明明是正确的,但是导航过程中,就出乱子,信号没问题、路线不复杂,明明是要左转,百度却叫右转。";

    Console.WriteLine(highlighter.GetBestFragment(keywords, content));
}
/// <summary>
/// Creates a tokenizer over the given reader: registers the term and offset attributes,
/// reads the whole input, and segments it with PanGu into the word array.
/// </summary>
/// <param name="input">Reader supplying the text to tokenize.</param>
public PanGuTokenizer(TextReader input)
    : base(input)
{
    termAttribute = AddAttribute<ITermAttribute>();
    offsetAttribute = AddAttribute<IOffsetAttribute>();

    inputText = base.input.ReadToEnd();

    if (string.IsNullOrEmpty(inputText))
    {
        // ReadToEnd produced nothing: try again in 1024-character chunks. The first chunk
        // is read through base.input and later chunks through the constructor parameter,
        // exactly as before.
        char[] chunk = new char[1024];
        int read = base.input.Read(chunk, 0, chunk.Length);
        StringBuilder collected = new StringBuilder(chunk.Length);

        while (read > 0)
        {
            collected.Append(chunk, 0, read);
            read = input.Read(chunk, 0, chunk.Length);
        }

        if (collected.Length > 0)
        {
            inputText = collected.ToString();
        }
    }

    if (string.IsNullOrEmpty(inputText))
    {
        // No text at all: nothing to segment.
        words = new WordInfo[0];
        return;
    }

    global::PanGu.Segment segmenter = new Segment();
    ICollection<WordInfo> tokens = segmenter.DoSegment(inputText);
    words = new WordInfo[tokens.Count];
    tokens.CopyTo(words, 0);
}
/// <summary>
/// Shortens a paragraph to at most <c>truncateLimit</c> characters, preferring to cut at
/// the end of the last segmented word that still fits.
/// </summary>
/// <param name="paragraph">Paragraph to truncate; may be null or empty.</param>
/// <returns>
/// The paragraph unchanged when it is null, empty, or already within the limit. Otherwise
/// the text up to the last word boundary inside the limit, or exactly
/// <c>truncateLimit</c> characters when that boundary would leave less than half the limit.
/// </returns>
private string TruncateParagraph(string paragraph)
{
    // Nothing to cut: skip segmentation, and avoid Substring ever being asked for more
    // characters than the paragraph holds.
    if (string.IsNullOrEmpty(paragraph) || paragraph.Length <= truncateLimit)
    {
        return paragraph;
    }

    Segment segment = new Segment();
    ICollection<WordInfo> words = segment.DoSegment(paragraph, segmentMatchOptions);

    // The result must not be shorter than half of the requested length.
    int lowerBound = truncateLimit / 2;
    int stop = 0;

    foreach (WordInfo word in words)
    {
        if (word == null || word.Word == null)
        {
            continue;
        }

        int wordEnd = word.Position + word.Word.Length;
        if (wordEnd > truncateLimit)
        {
            break;
        }

        stop = wordEnd;
    }

    // Reached when a word crossed the limit, and also when the text beyond the limit
    // produced no words at all; in both cases the paragraph is longer than the limit
    // and has to be cut.
    return stop < lowerBound
        ? paragraph.Substring(0, truncateLimit)
        : paragraph.Substring(0, stop);
}
/// <summary>
/// Segments the query strings into distinct words, looks up and sorts the matching
/// documents, and renders them as an HTML ordered list.
/// </summary>
/// <param name="query">Query strings to segment.</param>
/// <returns>
/// An <c>&lt;ol&gt;</c> element with one <c>&lt;li&gt;</c> per result (title link, abstract,
/// URL), or containing a "no match" message when there are no results.
/// </returns>
static string ProcessQuery(string[] query)
{
    // Segment the query, keeping each distinct word once in first-seen order.
    List<string> query_words = new List<string>();
    HashSet<string> seenWords = new HashSet<string>();

    PanGu.Segment.Init();
    Segment seg = new Segment();
    foreach (string q in query)
    {
        foreach (WordInfo wi in seg.DoSegment(q))
        {
            if (wi == null || wi.Word == null)
            {
                continue;
            }

            // Add returns false for a word that was already collected.
            if (seenWords.Add(wi.Word))
            {
                query_words.Add(wi.Word);
            }
        }
    }

    // Retrieve the ids of the documents that match, then order them.
    MongodbAccess mongo = new MongodbAccess();
    List<ObjectId> docIds = mongo.GetDocIDByQuery(query_words);
    docIds = SortResult(docIds, query_words);
    List<DocUrlAbstractResult> result = GetResult(docIds);

    StringBuilder strbuilder = new StringBuilder();
    foreach (DocUrlAbstractResult duar in result)
    {
        // NOTE(review): title, abstract and url are inserted without HTML encoding. If they
        // can contain untrusted text they should be encoded; left as-is here because they
        // may already carry markup (e.g. highlighting) — confirm against GetResult.
        strbuilder.AppendFormat(
            "<li><div><span><a href='{2}' target='_blank' class='link'>{0}</a></span><br/><span class='abstract'>{1}</span><br/><span class='url'>{2}</span></div></li>",
            duar.title, duar.abst, duar.url);
    }

    if (strbuilder.Length == 0)
    {
        // NOTE(review): "mathch" is a typo in this user-facing message; text kept as-is.
        strbuilder.Append("No pages mathch the query.");
    }

    return "<ol>" + strbuilder.ToString() + "</ol>";
}
/// <summary>
/// Parses keywords (or tags) out of an article title by segmenting it with PanGu using an
/// explicit set of match options and rank parameters.
/// </summary>
/// <param name="title">Article title to segment.</param>
/// <returns>The words produced by the segmentation.</returns>
private static ICollection<WordInfo> TitleToKeywordWordInfos(string title)
{
    PanGu.Segment segmenter = new Segment();

    PanGu.Match.MatchOptions matchOptions = new PanGu.Match.MatchOptions
    {
        // Chinese personal-name identification.
        ChineseNameIdentify = false,
        // Frequency-first matching.
        FrequencyFirst = false,
        // Multi-dimensional segmentation.
        MultiDimensionality = false,
        // English multi-dimensional segmentation; this switch separates letters from digits.
        EnglishMultiDimensionality = false,
        // Filter stop words.
        FilterStopWords = true,
        // Ignore spaces, carriage returns and tabs.
        IgnoreSpace = true,
        // Force single-character segmentation.
        ForceSingleWord = false,
        // Traditional Chinese switch.
        TraditionalChineseEnabled = false,
        // Output both simplified and traditional forms.
        OutputSimplifiedTraditional = false,
        // Identification of words missing from the dictionary.
        UnknownWordIdentify = false,
        // Filter English; only effective when stop-word filtering is on.
        FilterEnglish = true,
        // Filter numbers; only effective when stop-word filtering is on.
        FilterNumeric = true,
        // Ignore English letter case.
        IgnoreCapital = false,
        // English segmentation.
        EnglishSegment = false,
        // Synonym output (generally meant for segmenting search strings, not recommended when indexing).
        SynonymOutput = false,
        // Wildcard match output.
        WildcardOutput = false,
        // Segment the results of wildcard matches.
        WildcardSegment = false
    };

    PanGu.Match.MatchParameter matchParameter = new PanGu.Match.MatchParameter
    {
        // Rank of words missing from the dictionary.
        UnknowRank = 1,
        // Rank of the best match.
        BestRank = 5,
        // Rank of the second-best match.
        SecRank = 3,
        // Rank of the third-best match.
        ThirdRank = 2,
        // Rank of single characters that are forcibly output.
        SingleRank = 1,
        // Rank of numbers.
        NumericRank = 1,
        // Rank of English words.
        EnglishRank = 5,
        // Rank of lower-cased English words.
        EnglishLowerRank = 3,
        // Rank of English word stems.
        EnglishStemRank = 2,
        // Rank of symbols.
        SymbolRank = 1,
        // When simplified and traditional forms are both forced out, the rank of the form that
        // was not in the original text (e.g. the traditional output for simplified input, and vice versa).
        SimplifiedTraditionalRank = 1,
        // Rank of synonyms.
        SynonymRank = 1,
        // Rank of wildcard match results.
        WildcardRank = 1,
        // With English filtering on, English longer than this length is filtered.
        FilterEnglishLength = 0,
        // With numeric filtering on, numbers longer than this length are filtered.
        FilterNumericLength = 0,
        // Assembly file name of the user-defined rule.
        CustomRuleAssemblyFileName = string.Empty,
        // Fully qualified (namespace included) class name of the user-defined rule.
        CustomRuleFullClassName = string.Empty,
        // Redundancy.
        Redundancy = 0
    };

    return segmenter.DoSegment(title, matchOptions, matchParameter);
}
/// <summary>
/// Segments a string with PanGu.
/// </summary>
/// <param name="str">Text to segment; may be null or empty.</param>
/// <returns>The segmented words, or an empty linked list when the text is null or empty.</returns>
public ICollection<WordInfo> SegmentToWordInfos(string str)
{
    if (!string.IsNullOrEmpty(str))
    {
        return new Segment().DoSegment(str);
    }

    return new LinkedList<WordInfo>();
}
/// <summary>
/// Segments every string in the list with one shared PanGu segmenter and returns all
/// resulting word texts in order.
/// </summary>
/// <param name="content">Strings to segment.</param>
/// <returns>The word texts of all inputs, concatenated; null entries are skipped.</returns>
private List<string> GetSegmentWords(List<string> content)
{
    List<string> allWords = new List<string>();
    Segment segmenter = new Segment();

    foreach (string text in content)
    {
        foreach (WordInfo token in segmenter.DoSegment(text))
        {
            if (token != null)
            {
                allWords.Add(token.Word);
            }
        }
    }

    return allWords;
}
/// <summary>
/// Segments a string with PanGu and returns the words as one string in which every word
/// is followed by a tab character.
/// </summary>
/// <param name="s">Text to segment.</param>
/// <returns>The words, each followed by a tab; null entries are skipped.</returns>
private static string segment(string s)
{
    Segment segmenter = new Segment();
    StringBuilder tabbed = new StringBuilder();

    foreach (WordInfo token in segmenter.DoSegment(s))
    {
        if (token != null)
        {
            tabbed.AppendFormat("{0}\t", token.Word);
        }
    }

    return tabbed.ToString();
}
/// <summary>
/// Creates a tokenizer over the given reader. After reading the whole input, the result of
/// <c>AnalyzInput()</c> selects the mode: plain PanGu segmentation (plus spell segments for
/// every word), or one of three marker-driven modes chosen by how the text ends.
/// </summary>
/// <param name="input">Reader supplying the text to tokenize.</param>
public PanGuTokenizer(TextReader input)
    : base(input)
{
    _inputText = base.input.ReadToEnd();

    if (string.IsNullOrEmpty(_inputText))
    {
        // ReadToEnd produced nothing: try again in 1024-character chunks. The first chunk
        // is read through base.input and later chunks through the constructor parameter,
        // exactly as before.
        char[] chunk = new char[1024];
        int read = base.input.Read(chunk, 0, chunk.Length);
        StringBuilder collected = new StringBuilder(chunk.Length);

        while (read > 0)
        {
            collected.Append(chunk, 0, read);
            read = input.Read(chunk, 0, chunk.Length);
        }

        if (collected.Length > 0)
        {
            _inputText = collected.ToString();
        }
    }

    _isFlag = AnalyzInput();

    if (!_isFlag)
    {
        // PanGu segmentation: keep the words and add the spell segments of each one.
        global::PanGu.Segment segmenter = new Segment();
        var tokens = segmenter.DoSegment(_inputText);

        foreach (var token in tokens)
        {
            var spellSegments = SpellUtils.GetSpellSegment(token.Word);
            if (spellSegments != null)
            {
                _segmentList.AddRange(spellSegments);
            }
        }

        _wordList = new List<WordInfo>(tokens);
        return;
    }

    if (_inputText.EndsWith(_indexFlag))
    {
        // Comma/space segmentation: split on the separators, then add each part and its
        // spell segments.
        string stripped = _inputText.Replace(_indexFlag, "");
        string[] parts = stripped.Split(_splitFlag, StringSplitOptions.RemoveEmptyEntries);

        foreach (string part in parts)
        {
            _segmentList.Add(part);

            var spellSegments = SpellUtils.GetSpellSegment(part);
            if (spellSegments != null)
            {
                _segmentList.AddRange(spellSegments);
            }
        }
    }
    else if (_inputText.EndsWith(_indexCnName))
    {
        // Pinyin-initial segmentation.
        string stripped = _inputText.Replace(_indexCnName, "");
        _wordList = SpellUtils.GetCnSegment(stripped);
    }
    else
    {
        // Flag segmentation: with '|' removed, every character is emitted on its own,
        // followed by '|'-joined chains that start with it and run to the end of the text,
        // restarting each time from a later second character.
        string stripped = _inputText.Replace("|", "");

        for (int start = 0; start < stripped.Length; start++)
        {
            string chain = stripped[start].ToString();
            _segmentList.Add(chain);

            int restarts = 1;
            for (int next = start + 1; next < stripped.Length; next++)
            {
                string extended = chain + "|" + stripped[next];
                _segmentList.Add(extended);
                chain = extended;

                if (next == stripped.Length - 1)
                {
                    // End of text reached: rewind so the loop increment resumes one
                    // character later than the previous pass, with a fresh chain.
                    restarts++;
                    next = start + restarts - 1;
                    chain = stripped[start].ToString();
                }
            }
        }
    }
}
/// <summary>
/// Processes client input: segments the keyword with PanGu using the shared options and
/// renders each word as "word^N.0 ", where N is 3 raised to the word's rank, truncated to
/// an integer.
/// </summary>
/// <param name="keyword">Keyword text entered by the client.</param>
/// <returns>The concatenated "word^N.0 " terms.</returns>
public static string SegmentKeyWord(string keyword)
{
    var terms = new StringBuilder(20);
    var segmenter = new Segment();

    foreach (var token in segmenter.DoSegment(keyword, _option))
    {
        if (token != null)
        {
            int boost = (int)Math.Pow(3, token.Rank);
            terms.AppendFormat("{0}^{1}.0 ", token.Word, boost);
        }
    }

    return terms.ToString();
}
/// <summary>
/// Manual smoke test: initializes the Lucene engine, exercises the spell utilities, PanGu
/// segmentation and the searcher's keyword segmentation on sample strings, then pulls
/// tokens from a <c>PanGuTokenizer</c> until the end offset reaches the text length.
/// Nothing is asserted; the results are only held in locals.
/// </summary>
public void SpellTest()
{
    LuceneEngine engine = new LuceneEngine();
    engine.Init();

    var initials = SpellUtils.GetCnSegment("NBA常规赛-快船vs凯尔特人");
    var spellSegments = SpellUtils.GetSpellSegment("战重警和");

    // Collect the word texts of a segmented sample title.
    var segmenter = new Segment();
    var segmented = segmenter.DoSegment("国际足球100509K联赛釜山-大田");
    var wordTexts = new List<string>();
    foreach (WordInfo token in segmented)
    {
        if (token != null)
        {
            wordTexts.Add(token.Word);
        }
    }

    // Mixed pinyin/Chinese keyword through the searcher's segmentation.
    string mixedKeyword = "san国yan义";
    string boostedQuery = Synacast.LuceneNetSearcher.Searcher.Searcher.SegmentKeyWord(mixedKeyword);

    // Drain the tokenizer, joining the term texts with "/ ".
    string text = "重庆";
    TextReader reader = new StringReader(text);
    PanGuTokenizer tokenizer = new PanGuTokenizer(reader);

    string joined = string.Empty;
    int consumed = 0;
    while (consumed < text.Length)
    {
        Lucene.Net.Analysis.Token current = tokenizer.Next();
        consumed = current.EndOffset();
        joined = joined + current.TermText() + "/ ";
    }
}