/// <summary>
        /// Builds the word-frequency result for a project's social-media links, using the link titles only.
        /// </summary>
        /// <param name="usr_id">User id handed to GetStopWord to fetch the stop-word list.</param>
        /// <param name="projectId">Project id as a string; parsed into an ObjectId.</param>
        /// <param name="categoryId">
        /// Semicolon-separated category ids. Null/empty, or a single ObjectId.Empty entry,
        /// restricts the keyword lookup to the root category (ObjectId.Empty).
        /// </param>
        /// <returns>The FrequencyResult produced by GetFrequency for the space-joined titles.</returns>
        public FrequencyResult MediaFrequency(string usr_id, string projectId, string categoryId)
        {
            var projectObjId = new ObjectId(projectId);

            // Keyword mappings that are not deleted and belong to this project.
            var mappingFilters = Builders<MediaKeywordMappingMongo>.Filter;
            var mappingFilter  = mappingFilters.Eq(x => x.IsDel, false) & mappingFilters.Eq(x => x.ProjectId, projectObjId);

            // Parse the requested categories (if any were supplied).
            List<ObjectId> categoryObjIds = null;
            if (!string.IsNullOrEmpty(categoryId))
            {
                categoryObjIds = categoryId.Split(';').Select(id => new ObjectId(id)).ToList();
            }

            // Nothing supplied, or only the root id supplied: match the root category exactly.
            bool rootOnly = categoryObjIds == null
                            || (categoryObjIds.Count == 1 && categoryObjIds[0].Equals(ObjectId.Empty));
            if (rootOnly)
            {
                mappingFilter &= mappingFilters.Eq(x => x.CategoryId, ObjectId.Empty);
            }
            else
            {
                // Drop the root node and match any of the remaining categories.
                categoryObjIds.Remove(ObjectId.Empty);
                mappingFilter &= mappingFilters.In(x => x.CategoryId, categoryObjIds);
            }

            List<string> keywordIds = MongoDBHelper.Instance.GetMediaKeywordMapping()
                                      .Find(mappingFilter)
                                      .Project(x => x.KeywordId.ToString())
                                      .ToList();

            // Ids of media links flagged with DataCleanStatus == 2 in this project (treated as deleted).
            var linkMapFilters = Builders<Dnl_LinkMapping_Baidu>.Filter;
            var linkMapFilter  = linkMapFilters.Eq(x => x.ProjectId, projectObjId)
                                 & linkMapFilters.Eq(x => x.DataCleanStatus, (byte)2)
                                 & linkMapFilters.Eq(x => x.Source, SourceType.Media);
            var excludedLinkIds = MongoDBHelper.Instance.GetDnl_LinkMapping_Baidu()
                                  .Find(linkMapFilter)
                                  .Project(x => x.LinkId)
                                  .ToList();

            // Titles of links that belong to the selected keywords and are not excluded.
            var linkFilters = Builders<WXLinkMainMongo>.Filter;
            var linkFilter  = linkFilters.In(x => x.KeywordId, keywordIds) & linkFilters.Nin(x => x._id, excludedLinkIds);
            var titles      = MongoDBHelper.Instance.GetWXLinkMain().Find(linkFilter).Project(x => x.Title).ToList();

            string joinedTitles = string.Join(" ", titles);

            return GetFrequency(joinedTitles, GetStopWord(usr_id).Words);
        }
Exemplo n.º 2
0
        /// <summary>
        /// Pattern applied to the string form of each segmented token: captures, as "word", a run of
        /// two or more characters in U+4E00..U+9FA5 that is immediately followed by "/n" and optional letters.
        /// NOTE(review): presumably the token string is "word/flag" and flags starting with "n" mark nouns - confirm against the segmenter.
        /// </summary>
        private static readonly Regex NounTokenPattern = new Regex("(?<word>[\u4E00-\u9FA5][\u4E00-\u9FA5]+?)/n[a-zA-Z]*");

        /// <summary>
        /// Segments the text and returns the most frequent nouns (at most 100) with their counts.
        /// The verb lists of the result are always returned empty; no verb extraction is performed.
        /// </summary>
        /// <param name="text">Text to analyse. Null or blank text yields empty lists.</param>
        /// <param name="stopWords">Words to exclude from the result. Null is treated as an empty list.</param>
        /// <returns>
        /// A FrequencyResult whose noun / nounCount lists are index-aligned and ordered by descending count.
        /// </returns>
        public FrequencyResult GetFrequency(string text, List<string> stopWords)
        {
            const int maxNouns = 100;

            var nouns      = new List<string>();
            var nounCounts = new List<int>();

            // Blank text cannot contain a matching token, so skip segmentation entirely.
            if (!string.IsNullOrWhiteSpace(text))
            {
                // Set lookup instead of scanning the stop-word list for every candidate word.
                var excluded = stopWords == null ? new HashSet<string>() : new HashSet<string>(stopWords);

                // Count identical tokens (by their string form) and visit the most frequent first.
                // NOTE(review): a word that appears under two different noun flags forms two groups
                // and can therefore be listed twice - unchanged from the previous behaviour.
                var tokenCounts = posSegementer.Cut(text)
                                  .GroupBy(x => x.ToString())
                                  .Select(g => new { Token = g.Key, Count = g.Count() })
                                  .OrderByDescending(t => t.Count);

                foreach (var token in tokenCounts)
                {
                    if (nouns.Count == maxNouns)
                    {
                        break;
                    }

                    string word = NounTokenPattern.Match(token.Token).Groups["word"].Value;

                    // Skip tokens that are not nouns and words on the stop list.
                    if (string.IsNullOrEmpty(word) || excluded.Contains(word))
                    {
                        continue;
                    }

                    nouns.Add(word);
                    nounCounts.Add(token.Count);
                }
            }

            FrequencyResult result = new FrequencyResult();

            result.noun      = nouns;
            result.nounCount = nounCounts;
            result.verb      = new List<string>();
            result.verbCount = new List<int>();
            return result;
        }
Exemplo n.º 3
0
        /// <summary>
        /// Returns the word-frequency result for a project's social-media links.
        /// If a stored word-frequency chart named "默认" exists for the project with the same
        /// category parameters, its serialized data is returned; otherwise the result is computed
        /// from the titles and descriptions of the matching links.
        /// </summary>
        /// <param name="usr_id">User id handed to GetStopWord to fetch the stop-word list.</param>
        /// <param name="projectId">Project id as a string; parsed into an ObjectId.</param>
        /// <param name="categoryId">
        /// Semicolon-separated category ids. Null/empty, or a single ObjectId.Empty entry,
        /// restricts the keyword lookup to the root category (ObjectId.Empty).
        /// </param>
        /// <returns>The stored or freshly computed FrequencyResult.</returns>
        public FrequencyResult MediaFrequency(string usr_id, string projectId, string categoryId)
        {
            ObjectId proObjId = new ObjectId(projectId);

            // Normalised category id list (root id removed, sorted) used as the chart parameter.
            var categoryKeys = new List<string>();
            if (!string.IsNullOrEmpty(categoryId))
            {
                categoryKeys = CommonHelper.GetIdListFromStr(categoryId);
                categoryKeys.Remove(ObjectId.Empty.ToString());
                categoryKeys.Sort();
            }

            // Parameter JSON that a stored chart must match.
            var factorJson = new JObject();
            factorJson.Add(new JProperty("categoryIds", string.Join(";", categoryKeys)));

            // Look up the stored chart for this project / chart type / source / name.
            var chartFilters = Builders<PojectChartMongo>.Filter;
            var chartFilter  = chartFilters.Eq(x => x.ProjectId, proObjId)
                               & chartFilters.Eq(x => x.Type, ChartType.WordFrequence)
                               & (chartFilters.Eq(x => x.Source, SourceType.Media) & chartFilters.Eq(x => x.Name, "默认"));
            var storedChart = MongoDBHelper.Instance.GetPojectChart().Find(chartFilter).FirstOrDefault();

            // A stored chart built with identical parameters is reused as-is.
            if (storedChart != null && storedChart.FactorJson == factorJson.ToString())
            {
                return new JavaScriptSerializer().Deserialize<FrequencyResult>(storedChart.DataJson);
            }

            // Keyword mappings that are not deleted and belong to this project.
            var mappingFilters = Builders<MediaKeywordMappingMongo>.Filter;
            var mappingFilter  = mappingFilters.Eq(x => x.IsDel, false) & mappingFilters.Eq(x => x.ProjectId, proObjId);

            List<ObjectId> categoryObjIds = null;
            if (!string.IsNullOrEmpty(categoryId))
            {
                categoryObjIds = categoryId.Split(';').Select(id => new ObjectId(id)).ToList();
            }

            // Nothing supplied, or only the root id supplied: match the root category exactly.
            bool rootOnly = categoryObjIds == null
                            || (categoryObjIds.Count == 1 && categoryObjIds[0].Equals(ObjectId.Empty));
            if (rootOnly)
            {
                mappingFilter &= mappingFilters.Eq(x => x.CategoryId, ObjectId.Empty);
            }
            else
            {
                // Drop the root node and match any of the remaining categories.
                categoryObjIds.Remove(ObjectId.Empty);
                mappingFilter &= mappingFilters.In(x => x.CategoryId, categoryObjIds);
            }

            List<string> keywordIds = MongoDBHelper.Instance.GetMediaKeywordMapping()
                                      .Find(mappingFilter)
                                      .Project(x => x.KeywordId.ToString())
                                      .ToList();

            // Ids of links flagged with DataCleanStatus == 2 in this project (treated as deleted).
            var linkMapFilters = Builders<Dnl_LinkMapping_Baidu>.Filter;
            var linkMapFilter  = linkMapFilters.Eq(x => x.ProjectId, proObjId) & linkMapFilters.Eq(x => x.DataCleanStatus, (byte)2);
            var excludedLinkIds = MongoDBHelper.Instance.GetDnl_LinkMapping_Baidu()
                                  .Find(linkMapFilter)
                                  .Project(x => x.LinkId)
                                  .ToList();

            // "Title Description" text of links that belong to the selected keywords and are not excluded.
            var linkFilters = Builders<WXLinkMainMongo>.Filter;
            var linkFilter  = linkFilters.In(x => x.KeywordId, keywordIds) & linkFilters.Nin(x => x._id, excludedLinkIds);
            var linkTexts   = MongoDBHelper.Instance.GetWXLinkMain()
                              .Find(linkFilter)
                              .Project(x => string.Format("{0} {1}", x.Title, x.Description))
                              .ToList();

            string joinedText = string.Join(" ", linkTexts);

            return GetFrequency(joinedText, GetStopWord(usr_id).Words);
        }