/// <summary>
/// Word frequency for social media: collects the titles of the project's links
/// and runs them through <c>GetFrequency</c>.
/// </summary>
/// <param name="usr_id">Passed to <c>GetStopWord</c> to obtain the stop-word list.</param>
/// <param name="projectId">Project id, parsed as an <c>ObjectId</c>.</param>
/// <param name="categoryId">
/// Semicolon-separated category ids. Null, empty, or a single <c>ObjectId.Empty</c>
/// restricts the lookup to the root category (<c>ObjectId.Empty</c>).
/// </param>
/// <returns>The result of <c>GetFrequency</c> over the space-joined link titles.</returns>
public FrequencyResult MediaFrequency(string usr_id, string projectId, string categoryId)
{
    ObjectId projectObjId = new ObjectId(projectId);

    // Keyword mappings that are not deleted and belong to this project.
    var mappingFilters = Builders<MediaKeywordMappingMongo>.Filter;
    var mappingFilter = mappingFilters.Eq(x => x.IsDel, false)
                        & mappingFilters.Eq(x => x.ProjectId, projectObjId);

    // Work out whether the request addresses the root category only.
    List<ObjectId> categoryObjIds = null;
    if (!string.IsNullOrEmpty(categoryId))
    {
        categoryObjIds = categoryId.Split(';').Select(id => new ObjectId(id)).ToList();
    }
    bool rootOnly = categoryObjIds == null
                    || (categoryObjIds.Count == 1 && categoryObjIds[0].Equals(ObjectId.Empty));

    if (rootOnly)
    {
        mappingFilter &= mappingFilters.Eq(x => x.CategoryId, ObjectId.Empty);
    }
    else
    {
        // Drop the root node from an explicit category selection.
        categoryObjIds.Remove(ObjectId.Empty);
        mappingFilter &= mappingFilters.In(x => x.CategoryId, categoryObjIds);
    }

    List<string> keywordIds = MongoDBHelper.Instance.GetMediaKeywordMapping()
        .Find(mappingFilter)
        .Project(x => x.KeywordId.ToString())
        .ToList();

    // Ids of links excluded from the project (DataCleanStatus == 2, media source).
    var linkMapFilters = Builders<Dnl_LinkMapping_Baidu>.Filter;
    var linkMapFilter = linkMapFilters.Eq(x => x.ProjectId, projectObjId)
                        & linkMapFilters.Eq(x => x.DataCleanStatus, (byte)2)
                        & linkMapFilters.Eq(x => x.Source, SourceType.Media);
    var excludedLinkIds = MongoDBHelper.Instance.GetDnl_LinkMapping_Baidu()
        .Find(linkMapFilter)
        .Project(x => x.LinkId)
        .ToList();

    // Links attached to the selected keywords, minus the excluded ones.
    var linkFilters = Builders<WXLinkMainMongo>.Filter;
    var linkFilter = linkFilters.In(x => x.KeywordId, keywordIds)
                     & linkFilters.Nin(x => x._id, excludedLinkIds);
    var titles = MongoDBHelper.Instance.GetWXLinkMain()
        .Find(linkFilter)
        .Project(x => x.Title)
        .ToList();

    string text = string.Join(" ", titles);
    var stopWords = GetStopWord(usr_id).Words;
    return GetFrequency(text, stopWords);
}
/// <summary>
/// Segments a text and returns its most frequent nouns together with their counts.
/// </summary>
/// <param name="text">Text to analyse. Null or empty yields an empty result.</param>
/// <param name="stopWords">Words to leave out of the result. Null is treated as an empty list.</param>
/// <returns>
/// A <c>FrequencyResult</c> whose <c>noun</c> and <c>nounCount</c> lists are parallel,
/// ordered by descending token count and capped at 100 entries. <c>verb</c> and
/// <c>verbCount</c> are always empty: verb extraction is not performed by this method.
/// </returns>
public FrequencyResult GetFrequency(string text, List<string> stopWords)
{
    // Upper bound on the number of nouns reported.
    const int MaxNouns = 100;

    List<string> nouns = new List<string>();
    List<int> nounCounts = new List<int>();

    FrequencyResult result = new FrequencyResult
    {
        noun = nouns,
        nounCount = nounCounts,
        verb = new List<string>(),
        verbCount = new List<int>()
    };

    // Nothing to segment.
    if (string.IsNullOrEmpty(text))
    {
        return result;
    }

    // Set lookup instead of a linear List.Contains per candidate word;
    // the default comparer keeps the same string equality as the list lookup.
    HashSet<string> excluded = new HashSet<string>(stopWords ?? Enumerable.Empty<string>());

    // Count identical tokens (by their string form) and rank them, most frequent first.
    var rankedTokens = posSegementer.Cut(text)
        .GroupBy(x => x.ToString())
        .Select(g => new { Word = g.Key, Count = g.Count() })
        .OrderByDescending(x => x.Count);

    // Captures a run of two or more CJK characters followed by "/n" plus optional letters.
    // NOTE(review): this presumes a token's ToString() has the form "word/flag" with
    // noun flags starting with 'n' - confirm against the segmenter in use.
    Regex nounPattern = new Regex("(?<word>[\u4E00-\u9FA5][\u4E00-\u9FA5]+?)/n[a-zA-Z]*");

    foreach (var token in rankedTokens)
    {
        if (nouns.Count >= MaxNouns)
        {
            break;
        }

        string word = nounPattern.Match(token.Word).Groups["word"].Value;

        // Skip tokens that are not nouns and nouns on the stop-word list.
        if (string.IsNullOrEmpty(word) || excluded.Contains(word))
        {
            continue;
        }

        nouns.Add(word);
        nounCounts.Add(token.Count);
    }

    return result;
}
/// <summary>
/// Word frequency for social media, served from a stored chart when one exists
/// for the same category selection and computed from link titles and
/// descriptions otherwise.
/// </summary>
/// <param name="usr_id">Passed to <c>GetStopWord</c> to obtain the stop-word list.</param>
/// <param name="projectId">Project id, parsed as an <c>ObjectId</c>.</param>
/// <param name="categoryId">
/// Semicolon-separated category ids. Null, empty, or a single <c>ObjectId.Empty</c>
/// restricts the keyword lookup to the root category (<c>ObjectId.Empty</c>).
/// </param>
/// <returns>
/// The deserialized stored chart data when its <c>FactorJson</c> equals the current
/// parameters; otherwise the result of <c>GetFrequency</c>.
/// </returns>
public FrequencyResult MediaFrequency(string usr_id, string projectId, string categoryId)
{
    ObjectId projectObjId = new ObjectId(projectId);

    // Normalised category list (root removed, sorted) used as the chart parameter.
    List<string> sortedCategoryIds = new List<string>();
    if (!string.IsNullOrEmpty(categoryId))
    {
        sortedCategoryIds = CommonHelper.GetIdListFromStr(categoryId);
        sortedCategoryIds.Remove(ObjectId.Empty.ToString());
        sortedCategoryIds.Sort();
    }

    // Parameter JSON that a stored chart is compared against.
    JObject factorJson = new JObject();
    factorJson.Add(new JProperty("categoryIds", string.Join(";", sortedCategoryIds)));

    // Look up the stored word-frequency chart for this project and source.
    var chartFilters = Builders<PojectChartMongo>.Filter;
    var chartFilter = chartFilters.Eq(x => x.ProjectId, projectObjId)
                      & chartFilters.Eq(x => x.Type, ChartType.WordFrequence)
                      & chartFilters.Eq(x => x.Source, SourceType.Media)
                      & chartFilters.Eq(x => x.Name, "默认");
    var storedChart = MongoDBHelper.Instance.GetPojectChart().Find(chartFilter).FirstOrDefault();

    // Stored chart exists and was built with the same parameters: reuse its data.
    if (storedChart != null && storedChart.FactorJson == factorJson.ToString())
    {
        JavaScriptSerializer serializer = new JavaScriptSerializer();
        return serializer.Deserialize<FrequencyResult>(storedChart.DataJson);
    }

    // NOTE(review): nothing below writes the freshly computed result back to the
    // chart collection - confirm that the chart is persisted elsewhere.

    // Keyword mappings that are not deleted and belong to this project.
    var mappingFilters = Builders<MediaKeywordMappingMongo>.Filter;
    var mappingFilter = mappingFilters.Eq(x => x.IsDel, false)
                        & mappingFilters.Eq(x => x.ProjectId, projectObjId);

    // Work out whether the request addresses the root category only.
    List<ObjectId> categoryObjIds = null;
    if (!string.IsNullOrEmpty(categoryId))
    {
        categoryObjIds = categoryId.Split(';').Select(id => new ObjectId(id)).ToList();
    }
    bool rootOnly = categoryObjIds == null
                    || (categoryObjIds.Count == 1 && categoryObjIds[0].Equals(ObjectId.Empty));

    if (rootOnly)
    {
        mappingFilter &= mappingFilters.Eq(x => x.CategoryId, ObjectId.Empty);
    }
    else
    {
        // Drop the root node from an explicit category selection.
        categoryObjIds.Remove(ObjectId.Empty);
        mappingFilter &= mappingFilters.In(x => x.CategoryId, categoryObjIds);
    }

    List<string> keywordIds = MongoDBHelper.Instance.GetMediaKeywordMapping()
        .Find(mappingFilter)
        .Project(x => x.KeywordId.ToString())
        .ToList();

    // Ids of links excluded from the project (DataCleanStatus == 2).
    // NOTE(review): no Source restriction is applied to this lookup - confirm that is intended.
    var linkMapFilters = Builders<Dnl_LinkMapping_Baidu>.Filter;
    var linkMapFilter = linkMapFilters.Eq(x => x.ProjectId, projectObjId)
                        & linkMapFilters.Eq(x => x.DataCleanStatus, (byte)2);
    var excludedLinkIds = MongoDBHelper.Instance.GetDnl_LinkMapping_Baidu()
        .Find(linkMapFilter)
        .Project(x => x.LinkId)
        .ToList();

    // Title and description of every remaining link attached to the selected keywords.
    var linkFilters = Builders<WXLinkMainMongo>.Filter;
    var linkFilter = linkFilters.In(x => x.KeywordId, keywordIds)
                     & linkFilters.Nin(x => x._id, excludedLinkIds);
    var snippets = MongoDBHelper.Instance.GetWXLinkMain()
        .Find(linkFilter)
        .Project(x => string.Format("{0} {1}", x.Title, x.Description))
        .ToList();

    string text = string.Join(" ", snippets);
    var stopWords = GetStopWord(usr_id).Words;
    return GetFrequency(text, stopWords);
}