Example #1
        public void wordCut()
        {
            string[]     folders  = Directory.GetDirectories(path);
            string       filePath = savePath + @"\分词结果.txt";
            StreamWriter writer   = new StreamWriter(filePath);

            foreach (string folder in folders)
            {
                int    index      = folder.LastIndexOf('\\');
                string folderName = folder.Substring(index + 1);
                writer.Write("\"{0}\": [", folderName);
                Console.WriteLine(folderName);
                string[] files = Directory.GetFiles(folder);
                string   text  = "";
                foreach (string file in files)
                {
                    text += File.ReadAllText(file);
                }
                var extractor = new TfidfExtractor();
                var keywords  = extractor.ExtractTagsWithWeight(text);

                foreach (WordWeightPair w in keywords)
                {
                    writer.Write("[\"{0}\", {1}],", w.Word, (int)(w.Weight * 100 * 2));      //将权重按照一定倍数放大并取整,便于后期处理
                    Console.WriteLine("{0}: {1}", w.Word, (int)(w.Weight * 100 * 2));
                }
                writer.WriteLine("],");
            }

            writer.Close();
        }
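
Note that wordCut emits JSON-like text by hand and leaves trailing commas, which strict JSON parsers reject. Below is a minimal sketch of a variant that produces valid JSON through Json.NET; the method name WordCutJson and the output file name result.json are placeholders, and path/savePath are assumed to be the same fields the example uses.

    using System.Collections.Generic;
    using System.IO;
    using System.Linq;
    using JiebaNet.Analyser;
    using Newtonsoft.Json;

    public void WordCutJson()
    {
        var result = new Dictionary<string, List<object[]>>();
        foreach (var folder in Directory.GetDirectories(path))
        {
            // concatenate every file in the folder into one text
            var text     = string.Concat(Directory.GetFiles(folder).Select(File.ReadAllText));
            var keywords = new TfidfExtractor().ExtractTagsWithWeight(text);
            // same scaling as above: weight * 100 * 2, truncated to int
            result[Path.GetFileName(folder)] =
                keywords.Select(w => new object[] { w.Word, (int)(w.Weight * 100 * 2) }).ToList();
        }
        File.WriteAllText(Path.Combine(savePath, "result.json"),
                          JsonConvert.SerializeObject(result, Formatting.Indented));
    }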
        public void TestExtractTagsWithWeights()
        {
            var tfidf  = new TfidfExtractor();
            var text   = GetFileContents(@"Resources\article.txt");
            var result = tfidf.ExtractTagsWithWeight(text);

            foreach (var tag in result)
            {
                Console.WriteLine("({0}, {1})", tag.Word, tag.Weight);
            }
        }
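
When only the words are needed, TfidfExtractor also exposes ExtractTags, which skips the weights; a minimal sketch under the same setup:

    var tfidf = new TfidfExtractor();
    var text  = GetFileContents(@"Resources\article.txt");

    // top 20 keywords by TF-IDF, words only
    foreach (var word in tfidf.ExtractTags(text, 20))
    {
        Console.WriteLine(word);
    }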
Example #3
        private async void Button1_Click(object sender, EventArgs e)
        {
            if (textBox1.Text == "")
            {
                return;
            }
            _query = textBox1.Text;
            var result = "";
            var client = new HttpClient();
            var uri    = Host + "cx=" + Cx + "&key=" + Key + "&num=" + Num + "&start=" + Start + "&q=" + System.Net.WebUtility.UrlEncode(_query);

            var response = await client.GetAsync(uri);

            var contentString = await response.Content.ReadAsStringAsync();

            dynamic parsedJson = JsonConvert.DeserializeObject(contentString);

            var items = parsedJson?.items;

            // the API can return fewer than Num items, so bound the index before dereferencing
            for (var i = Start; items != null && i < items.Count && i < Num; i++)
            {
                result += items[i].snippet?.ToString();
            }

            var extractor = new TfidfExtractor();
            var pairs     = extractor.ExtractTagsWithWeight(result, 30);

            var words = new List <string>();
            var freqs = new List <int>();

            foreach (var pair in pairs)
            {
                if (pair.Word.Equals("..."))
                {
                    continue;
                }
                words.Add(pair.Word);
                freqs.Add(Convert.ToInt32(pair.Weight * Math.Pow(10, 6)));
            }

            var wc    = new WordCloud.WordCloud(1920, 1080);
            var image = wc.Draw(words, freqs);

            pictureBox1.Image = image;
            button2.Enabled   = true;
            button2.Visible   = true;
        }
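
Dynamic member access like parsedJson?.items throws a RuntimeBinderException when the response shape changes. Below is a sketch of the same snippet extraction with explicit JToken queries instead; FetchSnippetsAsync is a hypothetical helper, and the uri is assumed to be built as in the example.

    using System.Linq;
    using System.Net.Http;
    using System.Threading.Tasks;
    using Newtonsoft.Json.Linq;

    static async Task<string> FetchSnippetsAsync(HttpClient client, string uri)
    {
        var json = JObject.Parse(await client.GetStringAsync(uri));
        // "items" may be absent entirely on an empty result set
        var snippets = json["items"]?.Select(item => (string)item["snippet"])
                       ?? Enumerable.Empty<string>();
        return string.Concat(snippets);
    }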
Example #4
        public static double Similarity(string s1, IEnumerable <WordWeightPair> keywords2)
        {
            var keywords1 = extractor.ExtractTagsWithWeight(s1, 200);
            Dictionary <string, double> map = new Dictionary <string, double>();

            // merge the keyword sets of both texts into one map
            foreach (var keyword in keywords1)
            {
                map[keyword.Word] = 0;
            }
            foreach (var keyword in keywords2)
            {
                map[keyword.Word] = 0;
            }
            // give each of the two weight vectors a zero entry for every merged key
            Dictionary <string, double> map1 = new Dictionary <string, double>();
            Dictionary <string, double> map2 = new Dictionary <string, double>();

            foreach (var key in map.Keys)
            {
                map1.Add(key, 0);
            }
            foreach (var key in map.Keys)
            {
                map2.Add(key, 0);
            }

            // fill in the weights; keywords from the first text that match the pattern get a 3x boost
            foreach (var keyword in keywords1)
            {
                Match match = Regex.Match(keyword.Word, pattern);
                if (match.Length > 1)
                {
                    map1[keyword.Word] = keyword.Weight * 3;
                }
                else
                {
                    map1[keyword.Word] = keyword.Weight;
                }
            }
            foreach (var keyword in keywords2)
            {
                map2[keyword.Word] = keyword.Weight;
            }
            // collect just the weight values from each map
            List <double> list1 = new List <double>();
            List <double> list2 = new List <double>();

            foreach (var key in map1.Keys)
            {
                list1.Add(map1[key]);
            }
            foreach (var key in map2.Keys)
            {
                list2.Add(map2[key]);
            }

            // dot product of the two weight vectors
            double sum = 0;

            for (int m = 0; m < list1.Count; m++)
            {
                if (m >= list2.Count)
                {
                    break;
                }
                sum += list1[m] * list2[m];
            }
            double i_length = 0;
            double j_length = 0;

            // magnitude of document i's vector
            for (int n = 0; n < list1.Count; n++)
            {
                i_length += list1[n] * list1[n];
            }
            i_length = Math.Sqrt(i_length);
            // magnitude of document j's vector
            for (int n = 0; n < list2.Count; n++)
            {
                j_length += list2[n] * list2[n];
            }
            j_length = Math.Sqrt(j_length);
            // cosine of the angle between the vectors: dot product divided by the product of the magnitudes
            return(sum / (i_length * j_length));
        }
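
The inner-product loop above relies on map1 and map2 enumerating their keys in the same order; because both were filled from the same merged key set that holds in practice, but a key-based computation avoids the assumption altogether. A sketch (CosineSimilarity is a hypothetical helper):

    using System;
    using System.Collections.Generic;
    using System.Linq;

    static double CosineSimilarity(IReadOnlyDictionary<string, double> v1,
                                   IReadOnlyDictionary<string, double> v2)
    {
        // dot product over the keys the two vectors share
        double dot   = v1.Keys.Where(v2.ContainsKey).Sum(k => v1[k] * v2[k]);
        double norm1 = Math.Sqrt(v1.Values.Sum(w => w * w));
        double norm2 = Math.Sqrt(v2.Values.Sum(w => w * w));
        return (norm1 == 0 || norm2 == 0) ? 0 : dot / (norm1 * norm2);
    }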
        public double CalcTFIDFSimilarity(string paperText1, string paperText2)
        {
            ObjectArgs arg = new ObjectArgs();
            // strip everything except letters, digits, CJK characters and whitespace
            string txt1 = Regex.Replace(paperText1, @"[^a-zA-Z0-9\u4e00-\u9fa5\s]", "");
            string txt2 = Regex.Replace(paperText2, @"[^a-zA-Z0-9\u4e00-\u9fa5\s]", "");

            var segmenter = new JiebaSegmenter();

            JiebaNet.Analyser.TfidfExtractor tfd = new TfidfExtractor(segmenter);
            arg.ObjData = "开始分词";
            OnRaiseReporting(arg);
            Console.WriteLine("开始分词");
            Stopwatch sw = new Stopwatch();

            sw.Start();
            // materialize the token streams so Count() and the windowing below don't re-run the segmenter
            IEnumerable <string> segment1 = segmenter.Cut(txt1).ToList();
            IEnumerable <string> segment2 = segmenter.Cut(txt2).ToList();

            int num1 = segment1.Count();
            int num2 = segment2.Count();

            Console.WriteLine("文档1分词数:" + num1);

            Console.WriteLine("文档2分词数:" + num2);

            arg.ObjData = "文档1分词数:" + num1 + "\r\n文档2分词数: " + num2;
            OnRaiseReporting(arg);

            IEnumerable <string> seg1 = null; // the longer document
            IEnumerable <string> seg2 = null; // the shorter document

            if (num1 > num2)
            {
                seg1 = segment1;
                seg2 = segment2;
            }
            else
            {
                seg1 = segment2;
                seg2 = segment1;
            }

            int    maxLength = seg1.Count();
            int    minLength = seg2.Count();
            double similar   = 0;

            //Dictionary<string, double> seg2Dic = CalcTF(seg2);
            string str2 = GetText(seg2);

            for (int i = 0; i + minLength <= maxLength; i++)
            {
                //0-interval
                //1-interval+1
                //2-interval+2 ...
                IEnumerable <string> seg = seg1.Where((item, index) => index > i && index < i + minLength);//取i 到 i+minLength
                //从seg1 中截取与seg2相同数量的词集合seg
                //分别计算词频 seg2的词频只需要计算一次
                //计算cos ===相似度
                //Dictionary<string, double> dic = CalcTF(seg);

                //double s = CalcSimilar(dic, seg2Dic);
                string str    = GetText(seg);
                int    topNum = 500;
                IEnumerable <WordWeightPair> tf_a = tfd.ExtractTagsWithWeight(str2, topNum);
                IEnumerable <WordWeightPair> tf_b = tfd.ExtractTagsWithWeight(str, topNum);

                double molecular     = 0; // numerator (dot product)
                double denominator_a = 0; // squared magnitude of vector a
                double denominator_b = 0; // squared magnitude of vector b

                Dictionary <string, WordWeightPair> dic_a = new Dictionary <string, WordWeightPair>();
                Dictionary <string, WordWeightPair> dic_b = new Dictionary <string, WordWeightPair>();
                foreach (var a in tf_a)
                {
                    dic_a.Add(a.Word, a);
                }

                foreach (var b in tf_b)
                {
                    dic_b.Add(b.Word, b);
                }

                //Console.WriteLine("两篇文档相似的词有:");

                foreach (var k in dic_a.Keys)
                {
                    WordWeightPair a = dic_a[k];
                    WordWeightPair b;
                    dic_b.TryGetValue(k, out b);
                    denominator_a += a.Weight * a.Weight;

                    molecular += a.Weight * (null == b ? 0 : b.Weight);
                    //if (a != null && b != null)
                    //{

                    //    Console.WriteLine(a.Word + "  TF-IDF词频统计 文档一:" + a.Weight + "|文档二:"
                    //            + b.Weight);
                    //}
                }
                foreach (var k in dic_b.Keys)
                {
                    WordWeightPair b = dic_b[k];
                    denominator_b += b.Weight * b.Weight;
                }
                double s = 0;
                if (denominator_a != 0 && denominator_b != 0)
                {
                    s = (molecular / (Math.Sqrt(denominator_a) * Math.Sqrt(denominator_b)));
                }

                //Console.WriteLine("两篇文档相似度:" + s * 100 + "%");
                if ((i + 1) % 50 == 0)
                {
                    Console.WriteLine(string.Format("第{0}次计算出的相似度:{1}", i + 1, s));

                    arg.ObjData = string.Format("第{0}次计算出的相似度:{1}", i + 1, s);
                    OnRaiseReporting(arg);
                }
                if (s > similar)
                {
                    similar = s;
                }
                if (s >= 0.99)
                {
                    // extremely high similarity; report it immediately
                    Console.WriteLine(string.Format("Similarity from iteration {0}: {1}", i + 1, s));

                    arg.ObjData = string.Format("Similarity from iteration {0}: {1}", i + 1, s);
                    OnRaiseReporting(arg);
                }

                //Console.WriteLine("第"+i+"次花费时间:" + sw.ElapsedMilliseconds / 1000 + "秒");
            }
            sw.Stop();
            Console.WriteLine("两篇文章的相似度:" + similar);

            Console.WriteLine("花费时间:" + sw.ElapsedMilliseconds + "ms");
            arg.ObjData = string.Format("两篇文章的相似度:" + similar + "\r\n花费时间:" + sw.ElapsedMilliseconds + "ms");
            OnRaiseReporting(arg);
            return(similar);
        }
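
The indexed Where re-scans seg1 from the beginning on every iteration; with seg1 materialized as a list (as done above), Skip/Take expresses the same window more directly:

    // equivalent window over tokens i .. i + minLength - 1 (i and minLength from the loop above)
    IEnumerable<string> seg = seg1.Skip(i).Take(minLength);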
Example #6
        public string BaiduExtract(string usr_id, string projectId, string categoryId)
        {
            string               result     = "";
            ObjectId             proObjId   = new ObjectId(projectId);
            JavaScriptSerializer serializer = new JavaScriptSerializer();       // JSON serialization/deserialization

            var cateIds = new List <string>();

            if (!string.IsNullOrEmpty(categoryId))
            {
                cateIds = CommonHelper.GetIdListFromStr(categoryId);
                cateIds.Remove(ObjectId.Empty.ToString());
                cateIds.Sort();
            }

            // build the parameter JSON used for cache comparison
            JObject factorJson = new JObject();

            factorJson.Add(new JProperty("categoryIds", string.Join(";", cateIds)));

            // fetch the existing chart data
            var builderChart = Builders <PojectChartMongo> .Filter;
            var filterChart  = builderChart.Eq(x => x.ProjectId, proObjId) & builderChart.Eq(x => x.Type, ChartType.WordCloud);

            filterChart &= builderChart.Eq(x => x.Source, SourceType.Enginee) & builderChart.Eq(x => x.Name, "默认");
            var colChart   = MongoDBHelper.Instance.GetPojectChart();
            var queryChart = colChart.Find(filterChart).FirstOrDefault();

            /* check whether a chart for these settings already exists */
            // reuse the cached data when the chart exists and was built with the same parameters
            if (queryChart != null && queryChart.FactorJson == factorJson.ToString())
            {
                // deserialize the cached chart data
                result = serializer.Deserialize <string>(queryChart.DataJson);
            }

            else
            {
                // fetch the titles and abstracts of the links matching the keyword groups
                var builderLink = Builders <Dnl_Link_Baidu> .Filter;
                var builderMap  = Builders <Dnl_KeywordMapping> .Filter;
                var filterMap   = builderMap.Eq(x => x.IsDel, false) & builderMap.Eq(x => x.ProjectId, new ObjectId(projectId));

                List <string> commendIds = new List <string>();       // ids of the keyword groups
                if (!string.IsNullOrEmpty(categoryId))
                {
                    // check whether any categories were specified
                    var cateObjIds = categoryId.Split(';').Select(x => new ObjectId(x)).ToList();
                    if (cateObjIds.Count == 1 && cateObjIds[0].Equals(ObjectId.Empty))
                    {
                        // no grouping: use all words
                        filterMap &= builderMap.Eq(x => x.CategoryId, ObjectId.Empty);
                    }
                    else
                    {
                        // drop the root node
                        cateObjIds.Remove(ObjectId.Empty);
                        filterMap &= builderMap.In(x => x.CategoryId, cateObjIds);
                    }
                }
                else
                {
                    filterMap &= builderMap.Eq(x => x.CategoryId, ObjectId.Empty);
                }
                commendIds = MongoDBHelper.Instance.GetDnl_KeywordMapping().Find(filterMap).Project(x => x.KeywordId.ToString()).ToList();

                // collect the ids of links already removed from the project
                var builderLinkMap = Builders <Dnl_LinkMapping_Baidu> .Filter;
                var filterLinkMap  = builderLinkMap.Eq(x => x.ProjectId, new ObjectId(projectId)) & builderLinkMap.Eq(x => x.DataCleanStatus, (byte)2);
                var exLinkObjIds   = MongoDBHelper.Instance.GetDnl_LinkMapping_Baidu().Find(filterLinkMap).Project(x => x.LinkId).ToList();     // ids of links deleted from the project

                var filterLink = builderLink.In(x => x.SearchkeywordId, commendIds);
                filterLink &= builderLink.Nin(x => x._id, exLinkObjIds);
                var    TaskList = MongoDBHelper.Instance.GetDnl_Link_Baidu().Find(filterLink).Project(x => string.Format("{0} {1}", x.Title, x.Description)).ToList();
                string all      = string.Join(" ", TaskList.ToArray());

                // extract weighted keywords and convert them to the output format
                var wordlist = tfidf.ExtractTagsWithWeight(all, 40).ToList();

                // remove stop words
                var stopWords = GetStopWord(usr_id).Words;
                for (int i = 0; i < wordlist.Count;)
                {
                    if (stopWords.Contains(wordlist[i].Word))
                    {
                        wordlist.Remove(wordlist[i]);
                        continue;
                    }
                    i++;
                }
                if (wordlist.Count > 30)
                {
                    wordlist = wordlist.Take(30).ToList();
                }

                foreach (var x in wordlist)
                {
                    result += "<span data-weight=\"" + Convert.ToInt32(x.Weight * 100) + "\">" + x.Word + "</span>";
                }
            }

            return(result);
        }
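
The in-place stop-word removal loop can be written declaratively; a sketch of the same filter and top-30 cut with LINQ, using the stopWords and wordlist variables from the example:

    using System.Linq;

    // drop stop words, then keep at most the 30 highest-weighted keywords
    wordlist = wordlist.Where(p => !stopWords.Contains(p.Word))
                       .Take(30)
                       .ToList();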
Example #7
 private void button3_Click(object sender, EventArgs e)
 {
     try
     {
         // time the whole preprocessing step
         Stopwatch time = new Stopwatch();
         time.Start();
         // merge the files under the spam, inform and normal folders into gather\collect.txt
         File.Create(@"..\Debug\textgather\gather\collect.txt").Close();
         string[] sourceFolders =
         {
             @"..\Debug\textgather\spam\",
             @"..\Debug\textgather\inform\",
             @"..\Debug\textgather\normal\"
         };
         foreach (string folder in sourceFolders)
         {
             foreach (string path in Directory.GetFiles(folder))
             {
                 // each source file contributes only its first line
                 StreamReader sr      = new StreamReader(path, Encoding.Default);
                 string       content = sr.ReadLine();
                 sr.Close();
                 StreamWriter sw = new StreamWriter(@"..\Debug\textgather\gather\collect.txt", true, Encoding.Default);
                 sw.WriteLine(content);
                 sw.Close();
             }
         }
         // segment the gathered text and save the TF-IDF keywords to collecto.txt
         StreamReader srt  = new StreamReader(@"..\Debug\textgather\gather\collect.txt", Encoding.Default);
         string       cont = srt.ReadToEnd();
         srt.Close();
         // use jieba to extract keywords restricted to nouns and verbs
         var extractor = new TfidfExtractor();
         var keywords  = extractor.ExtractTagsWithWeight(cont, 6000, Constants.NounAndVerbPos);
         File.Create(@"..\Debug\textgather\gather\collecto.txt").Close();
         StreamWriter swt     = new StreamWriter(@"..\Debug\textgather\gather\collecto.txt", true, Encoding.Default);
         string       strline = null;
         foreach (var keyword in keywords)
         {
             strline = keyword.Word + "    " + keyword.Weight;
             swt.WriteLine(strline);
         }
         swt.Close();
         // stop timing and report
         time.Stop();
         TimeSpan ts2 = time.Elapsed;
         listBox1.Items.Clear();
         listBox1.Items.Add("Text preprocessing finished, total time: " + ts2.TotalSeconds + " s");
     }
     catch (Exception ex)
     {
         MessageBox.Show(ex.Message);
     }
 }
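
A note on the third argument: ExtractTagsWithWeight accepts an optional set of allowed part-of-speech tags, and Constants.NounAndVerbPos is the built-in noun+verb set. A sketch restricting extraction to nouns only; the tag list below is an assumption and should be adjusted to the dictionary in use:

    using System;
    using JiebaNet.Analyser;

    // assumed noun POS tags; adjust to the tag set of your dictionary
    var nounOnly  = new[] { "n", "ng", "nr", "ns", "nt", "nz" };
    var extractor = new TfidfExtractor();
    foreach (var pair in extractor.ExtractTagsWithWeight(cont, 100, nounOnly))
    {
        Console.WriteLine("{0}: {1}", pair.Word, pair.Weight);
    }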
        // Baidu keyword-weight cloud
        public string BaiduExtract(string usr_id, string projectId, string categoryId)
        {
            // fetch the titles and abstracts of the links matching the keyword groups
            var builderLink = Builders <Dnl_Link_Baidu> .Filter;
            var builderMap  = Builders <Dnl_KeywordMapping> .Filter;
            var filterMap   = builderMap.Eq(x => x.IsDel, false) & builderMap.Eq(x => x.ProjectId, new ObjectId(projectId));

            List <string> commendIds = new List <string>();       // ids of the keyword groups

            if (!string.IsNullOrEmpty(categoryId))
            {
                // check whether any categories were specified
                var cateIds = categoryId.Split(';').Select(x => new ObjectId(x)).ToList();
                if (cateIds.Count == 1 && cateIds[0].Equals(ObjectId.Empty))
                {
                    // no grouping: use all words
                    filterMap &= builderMap.Eq(x => x.CategoryId, ObjectId.Empty);
                }
                else
                {
                    // drop the root node
                    cateIds.Remove(ObjectId.Empty);
                    filterMap &= builderMap.In(x => x.CategoryId, cateIds);
                }
            }
            else
            {
                filterMap &= builderMap.Eq(x => x.CategoryId, ObjectId.Empty);
            }
            commendIds = MongoDBHelper.Instance.GetDnl_KeywordMapping().Find(filterMap).Project(x => x.KeywordId.ToString()).ToList();

            // collect the ids of links already removed from the project
            var builderLinkMap = Builders <Dnl_LinkMapping_Baidu> .Filter;
            var filterLinkMap  = builderLinkMap.Eq(x => x.ProjectId, new ObjectId(projectId)) & builderLinkMap.Eq(x => x.DataCleanStatus, (byte)2);

            filterLinkMap &= builderLinkMap.Eq(x => x.Source, SourceType.Enginee);
            var exLinkObjIds = MongoDBHelper.Instance.GetDnl_LinkMapping_Baidu().Find(filterLinkMap).Project(x => x.LinkId).ToList();       // ids of links deleted from the project

            var filterLink = builderLink.In(x => x.SearchkeywordId, commendIds);

            filterLink &= builderLink.Nin(x => x._id, exLinkObjIds);
            var    TaskList = MongoDBHelper.Instance.GetDnl_Link_Baidu().Find(filterLink).Project(x => string.Format("{0} {1}", x.Title, x.Description)).ToList();
            string all      = string.Join(" ", TaskList.ToArray());

            // extract weighted keywords and convert them to the output format
            string text     = "";
            var    wordlist = tfidf.ExtractTagsWithWeight(all, 40).ToList();

            // remove stop words
            var stopWords = GetStopWord(usr_id).Words;

            for (int i = 0; i < wordlist.Count;)
            {
                if (stopWords.Contains(wordlist[i].Word))
                {
                    wordlist.Remove(wordlist[i]);
                    continue;
                }
                i++;
            }
            if (wordlist.Count > 30)
            {
                wordlist = wordlist.Take(30).ToList();
            }

            foreach (var x in wordlist)
            {
                text += "<span data-weight=\"" + Convert.ToInt32(x.Weight * 100) + "\">" + x.Word + "</span>";
            }

            return(text);
        }
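
Both BaiduExtract variants grow the markup by string concatenation inside a loop; a sketch of the same output built with StringBuilder, with HtmlEncode added in case a keyword contains markup characters:

    using System;
    using System.Net;
    using System.Text;

    var sb = new StringBuilder();
    foreach (var x in wordlist)
    {
        // encode the word so markup characters in a keyword cannot break the HTML
        sb.AppendFormat("<span data-weight=\"{0}\">{1}</span>",
                        Convert.ToInt32(x.Weight * 100),
                        WebUtility.HtmlEncode(x.Word));
    }
    string html = sb.ToString();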