Example #1
0
        /// <summary>
        /// Form Load handler. Subscribes the two spider timers (leaving them disabled), formats the
        /// result grids, binds each grid to a table obtained from <c>GetReleaseInfoFormat</c>, and
        /// queries the stored records of every category through <c>SelReleaseInfo</c>.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void Monitoring_Load(object sender, EventArgs e)
        {
            // Both timers auto-reset but stay disabled here; nothing in this handler starts them.
            weiboSpiderTimer.Elapsed += weiboSpiderTimer_Elapsed;
            weiboSpiderTimer.AutoReset = true;
            weiboSpiderTimer.Enabled = false;

            webSpiderTimer.Elapsed += webSpiderTimer_Elapsed;
            webSpiderTimer.AutoReset = true;
            webSpiderTimer.Enabled = false;

            tri = new TbReleaseInfo();

            // Only the micro-blog grid is formatted with the flag set to true.
            FormatDataView(dvAll, false);
            FormatDataView(dvBBs, false);
            FormatDataView(dvBlog, false);
            FormatDataView(dvWBlog, true);
            FormatDataView(dvWeb, false);
            FormatDataView(dvtieba, false);
            FormatDataView(dvWeiXin, false);

            // Tables the grids are bound to.
            dtnewsinfo = tri.GetReleaseInfoFormat();
            dttiebainfo = tri.GetReleaseInfoFormat();
            dtbloginfo = tri.GetReleaseInfoFormat();
            dtbbsinfo = tri.GetReleaseInfoFormat();
            dtqueryinfo = tri.GetReleaseInfoFormat();
            dtweixininfo = tri.GetReleaseInfoFormat();

            // Read the clock once so all six queries share exactly the same window
            // (now back to 30 days ago) instead of re-reading DateTime.Now for every argument.
            const string stampFormat = "yyyy-MM-dd HH:mm:ss";
            DateTime loadedAt = DateTime.Now;
            string nowStamp = loadedAt.ToString(stampFormat);
            string cutoffStamp = loadedAt.AddDays(-30).ToString(stampFormat);

            // The third argument selects the category.
            // NOTE(review): the first value also carries a raw SQL fragment, so SelReleaseInfo
            // presumably splices it into the query text - confirm before passing anything non-constant.
            dtWebNewsInfo = tri.SelReleaseInfo(nowStamp, cutoffStamp, "0 AND webName<>'百度'");
            dvWeb.DataSource = dtnewsinfo;
            dtTieBaInfo = tri.SelReleaseInfo(nowStamp, cutoffStamp, "5");
            dvtieba.DataSource = dttiebainfo;
            dtWebBlogInfo = tri.SelReleaseInfo(nowStamp, cutoffStamp, "1");
            dvBlog.DataSource = dtbloginfo;
            dtWebBBSInfo = tri.SelReleaseInfo(nowStamp, cutoffStamp, "2");
            dvBBs.DataSource = dtbbsinfo;
            dtWebQueryInfo = tri.SelReleaseInfo(nowStamp, cutoffStamp, "4");
            dvAll.DataSource = dtqueryinfo;
            dtWeiXinInfo = tri.SelReleaseInfo(nowStamp, cutoffStamp, "6");
            dvWeiXin.DataSource = dtweixininfo;

            // "SoftVer" defaults to "1" when it is not configured; tabPage7 is kept only for version "3".
            SoftVer = GlobalPars.GloPars.ContainsKey("SoftVer") ? GlobalPars.GloPars["SoftVer"].ToString() : "1";
            if (!SoftVer.Equals("3"))
            {
                tabControl1.TabPages.RemoveByKey("tabPage7");
            }
        }
Example #2
0
        /// <summary>
        /// Drains <c>Queue</c> and writes every dequeued batch of records to the database.
        /// The same consumer loop is handed to <see cref="Parallel.Invoke(Action[])"/> twice, so this
        /// call does not return until both loops have observed <c>Program.ProClose</c>.
        /// </summary>
        /// <remarks>
        /// For each record of a batch:
        /// the record is dropped when it has keywords but neither its title nor its content contains
        /// all of the space-separated keywords; it is also dropped when
        /// <c>GetReleaseInfoCount</c> reports an existing row for the same source and keywords.
        /// The INSERT statements of the remaining records are concatenated and executed in one call.
        /// Any exception is logged and the batch is abandoned.
        /// </remarks>
        public void StartWrite()
        {
            Action actionG = () =>
            {
                List<ModelReleaseInfo> data;
                while (!Program.ProClose)
                {
                    if (!Queue.TryDequeue(out data))
                    {
                        // Nothing queued: wait a second before polling again.
                        System.Threading.Thread.Sleep(1000);
                        continue;
                    }

                    try
                    {
                        StringBuilder sb = new StringBuilder();
                        TbReleaseInfo tri = new TbReleaseInfo();
                        MySqlCmd cmd = new MySqlCmd();
                        foreach (var mri in data)
                        {
                            // Exact-match filter (added 2016-11-16): keep the record only when the title
                            // or the content contains every keyword.
                            string keywords = mri.KeyWords;
                            if (!string.IsNullOrEmpty(keywords))
                            {
                                // A missing title/content is treated as empty text so that one
                                // incomplete record cannot throw and discard the whole batch.
                                string title = mri.Title ?? string.Empty;
                                string context = mri.Contexts ?? string.Empty;

                                bool isFundTitle = true;
                                bool isFundContext = true;
                                foreach (string key in keywords.Split(' '))
                                {
                                    if (title.IndexOf(key) < 0)
                                    {
                                        isFundTitle = false;
                                    }
                                    if (context.IndexOf(key) < 0)
                                    {
                                        isFundContext = false;
                                    }
                                }

                                if (!isFundTitle && !isFundContext)
                                {
                                    continue;
                                }
                            }

                            // Skip records already stored for this source and keyword set.
                            if (tri.GetReleaseInfoCount(mri.InfoSource, mri.KeyWords) > 0)
                            {
                                continue;
                            }

                            string sql = tri.GetInsertStr(mri);
                            if (!sql.Trim().EndsWith(";", StringComparison.Ordinal))
                            {
                                // Only terminate the statement. The previous "sql += sql + \";\""
                                // appended the whole statement to itself before the terminator.
                                sql += ";";
                            }
                            sb.Append(sql);
                        }

                        if (sb.Length > 0)
                        {
                            // Execute all collected INSERT statements in one round trip.
                            cmd.ExecuteNonQuery(sb.ToString());
                            sb.Clear();
                        }

                        // Logged only when the batch went through without an exception.
                        log.Info("数据层写入数据库成功");
                    }
                    catch (Exception ex)
                    {
                        Comm.WriteErrorLog(ex.Message);
                        Comm.WriteErrorLog(ex.StackTrace);
                    }
                }
            };

            Parallel.Invoke(actionG, actionG);
        }
Example #3
0
        /// <summary>
        /// Collects Tieba-type records: for every <c>WebAddress</c> row with <c>pid=5</c> it downloads
        /// the listing page, follows the links that resemble the row's <c>likeurl</c> pattern, keeps the
        /// pages that match a configured keyword set, shows them in the Tieba grid and stores them
        /// through batched INSERT statements.
        /// </summary>
        /// <remarks>
        /// UI label updates are marshalled with <c>BeginInvoke</c>.
        /// NOTE(review): the bound table <c>dttiebainfo</c> and the grid <c>dvtieba</c> are still read and
        /// modified directly by this method; if it runs on a worker thread that access is not
        /// marshalled - confirm against the caller.
        /// NOTE(review): the shared field <c>sb</c> is replaced here; confirm no other crawler uses it
        /// at the same time.
        /// </remarks>
        private void GetWebTieBaInfo()
        {
            this.BeginInvoke(new MethodInvoker(delegate()
            {
                lbtieba.Text = "";
                lbtieba.Visible = true;
            }));

            DataBaseServer.SQLitecommand cmd = new SQLitecommand();
            // Keyword definitions used for matching titles and page contents.
            DataTable dtkey = cmd.GetTabel("select * from Keywords");
            // NOTE(review): this result is never read in this method (it is a local, not a field);
            // the query is kept only to leave the database traffic unchanged - confirm it can go.
            DataTable dtParts = cmd.GetTabel("SELECT * FROM partword");
            // Sources to crawl.
            DataTable dtXs = cmd.GetTabel("Select * from WebAddress WHERE pid=5");

            // Pattern used to recognise links that should be skipped as "home page" links.
            const string strTopFormat = "https?://.+/";
            // Pattern used to pull the URL out of an anchor element.
            const string strURLformat = "https?://.[^\"]+";

            sb = new StringBuilder();

            for (int xs = 0; xs < dtXs.Rows.Count; xs++)
            {
                // Stop visiting further sources once shutdown was requested; the inner loop
                // would break immediately anyway, after a pointless download.
                if (Program.ProClose == true) break;

                DataRow source = dtXs.Rows[xs];

                // Read the name before queuing the delegate: the delegate runs later, and capturing
                // the loop counter would make it read whatever row "xs" points at by then
                // (possibly one past the last row).
                string sourceName = source["name"].ToString();
                this.BeginInvoke(new MethodInvoker(delegate()
                {
                    lbtieba.ForeColor = Color.DarkBlue;
                    lbtieba.Text = "正在搜索:" + sourceName;
                }));

                string sourceUrl = source["url"].ToString();
                string listingHtml = HtmlUtil.getHtml(sourceUrl, "");
                // Reference URL the candidate links are compared against.
                string Similar = source["likeurl"].ToString();

                List<string> strList = HtmlUtil.GetElementsByTagNameList(listingHtml, "a");
                TbReleaseInfo ri = new TbReleaseInfo();
                string[] strA = GetLIstDate(strList.Distinct());

                for (int i = 0; i < strA.Length; i++)
                {
                    if (Program.ProClose == true) break;
                    Application.DoEvents();

                    // "name-id" -> 1 for every keyword set that matched this link.
                    Dictionary<string, int> events = new Dictionary<string, int>();
                    ModelReleaseInfo newsInfo = new ModelReleaseInfo();
                    string link;
                    try
                    {
                        string anchorHtml = strA[i];
                        newsInfo.Title = HtmlUtil.NoHTML(anchorHtml);

                        // First try to match the keyword sets against the anchor markup itself.
                        MatchTieBaKeywords(dtkey, anchorHtml, events, false);
                        newsInfo.KeyWords = AppendTieBaKeywords(newsInfo.KeyWords, events);

                        // Reduce the anchor to its URL; no URL means an empty link.
                        string[] urls = HtmlUtil.GetListByHtml(sourceUrl, anchorHtml, strURLformat);
                        link = urls.Length > 0 ? urls[0] : "";
                        link = HtmlUtil.UrlCl(link);

                        // Cut the link at a single quote if one is still present.
                        if (link.IndexOf("'") != -1)
                        {
                            link = HtmlUtil.GetstringByHtmlArray(link, "https?://.[^\']+");
                        }
                    }
                    catch (Exception)
                    {
                        // Best effort: a link that cannot be parsed is skipped.
                        continue;
                    }

                    // Only links at least 0.70 similar to the reference URL are followed.
                    if (!(HtmlUtil.getSimilarDegree(Similar, link) >= 0.70))
                    {
                        continue;
                    }

                    // Skip links recognised by the home-page pattern.
                    List<string> strTop = HtmlUtil.GetListByHtmlArray(link, strTopFormat);
                    if (strTop.Count != 0)
                    {
                        continue;
                    }

                    // Skip links already present in the grid table or in the stored records.
                    if (UrlThereare(link, this.dttiebainfo, dtTieBaInfo, true) != 0)
                    {
                        continue;
                    }

                    Thread.Sleep(2000);
                    string pageHtml = HtmlUtil.getHtml(link, "");
                    if (pageHtml.Length == 0) { continue; }

                    try
                    {
                        newsInfo.Uid = this.dvAll.Rows.Count + 1;

                        // Page text: every element with class "tieba", stripped of markup.
                        string[] strContext = HtmlUtil.GetElementsByClass(pageHtml, "tieba");
                        StringBuilder body = new StringBuilder();
                        for (int j = 0; j < strContext.Length; j++)
                        {
                            body.Append(HtmlUtil.NoHTML(strContext[j]));
                        }
                        newsInfo.Contexts = body.ToString();

                        // Without text there is nothing to match keywords against.
                        if (newsInfo.Contexts.Length == 0)
                        {
                            continue;
                        }

                        newsInfo.InfoSource = link.Trim();

                        // If the anchor did not match any keyword set, try the page text.
                        if (string.IsNullOrEmpty(newsInfo.KeyWords))
                        {
                            MatchTieBaKeywords(dtkey, newsInfo.Contexts, events, true);
                            newsInfo.KeyWords = AppendTieBaKeywords(newsInfo.KeyWords, events);
                        }

                        // KeyWords is still null when nothing matched, so test for null as well as
                        // empty; dereferencing it used to throw and fall through to the insert below.
                        if (string.IsNullOrEmpty(newsInfo.KeyWords)) { continue; }
                        // Drop the leading comma added by the first appended entry.
                        newsInfo.KeyWords = newsInfo.KeyWords.Substring(1);

                        newsInfo.CollectDate = DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss");

                        // Publisher, publish date and snapshot are not extracted; stored as empty.
                        newsInfo.ReleaseDate = "";
                        newsInfo.ReleaseName = "";
                        newsInfo.Snapshot = "";

                        newsInfo.Sheng = Convert.ToString(source["sheng"]);
                        newsInfo.Shi = Convert.ToString(source["shi"]);
                        newsInfo.Xian = Convert.ToString(source["xian"]);
                        newsInfo.WebName = source["Name"].ToString();
                        newsInfo.Pid = 5;
                        newsInfo.Part = GetParts(newsInfo.Contexts);
                        newsInfo.Reposts = 0;
                        newsInfo.Comments = 0;

                        DataRow dr = dttiebainfo.NewRow();
                        // Running number: 1 for an empty grid, otherwise last grid row's number + 1.
                        dr[0] = dvtieba.RowCount == 0
                            ? 1
                            : int.Parse(dvtieba.Rows[dvtieba.RowCount - 1].Cells[0].Value.ToString()) + 1;
                        dr[1] = newsInfo.Title;
                        dr[2] = newsInfo.Contexts;
                        dr[3] = newsInfo.ReleaseDate;
                        dr[4] = newsInfo.InfoSource;
                        // The grid shows only the name part of the first "name-id" entry.
                        dr[5] = newsInfo.KeyWords.Substring(0, newsInfo.KeyWords.IndexOf("-"));
                        dr[6] = newsInfo.ReleaseName;
                        dr[7] = newsInfo.CollectDate;
                        dr[8] = newsInfo.Snapshot;
                        dr[9] = newsInfo.WebName;
                        dr[10] = newsInfo.Pid;
                        dr[11] = newsInfo.Part;
                        dr[12] = newsInfo.Reposts;
                        dr[13] = newsInfo.Comments;

                        // Newest row first.
                        dttiebainfo.Rows.InsertAt(dr, 0);

                        // Keep at most 500 rows by trimming from the end. The previous
                        // "Count >= 500 -> RemoveAt(500)" indexed past the end at exactly 500 rows.
                        while (dttiebainfo.Rows.Count > 500)
                        {
                            dttiebainfo.Rows.RemoveAt(dttiebainfo.Rows.Count - 1);
                        }

                        this.BeginInvoke(new MethodInvoker(delegate()
                        {
                            dvtieba.Refresh();
                        }));
                    }
                    catch (Exception ex)
                    {
                        // A failure here does not stop the record from being queued for insertion below.
                        Comm.WriteErrorLog(ex.Message);
                        Comm.WriteErrorLog(ex.StackTrace);
                    }

                    try
                    {
                        // NOTE(review): the statement text comes from GetInsString and embeds crawled
                        // page content; confirm that helper escapes it.
                        sb.Append(ri.GetInsString(newsInfo) + ";");

                        // NOTE(review): the flush is keyed on the link index, not on the number of
                        // pending statements; whatever is left is written after the loops.
                        if (i % 10 == 0)
                        {
                            FlushTieBaInserts(cmd);
                        }
                    }
                    catch (Exception ex)
                    {
                        Comm.WriteErrorLog(ex.Message);
                        Comm.WriteErrorLog(ex.StackTrace);
                    }
                }
            }

            // Write whatever is still pending.
            try
            {
                FlushTieBaInserts(cmd);
            }
            catch (Exception ex)
            {
                Comm.WriteErrorLog(ex.Message);
                Comm.WriteErrorLog(ex.StackTrace);
            }

            // Reload the stored records so the next round's duplicate check sees this round's inserts.
            dtTieBaInfo = tri.SelReleaseInfo(DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"), DateTime.Now.AddDays(-15).ToString("yyyy-MM-dd HH:mm:ss"), "5");
            this.BeginInvoke(new MethodInvoker(delegate()
            {
                lbtieba.Text = "一轮搜索完毕!";
                lbtieba.ForeColor = Color.Red;
            }));
        }

        /// <summary>
        /// Adds a "name-id" entry (columns 1 and 6 of a keyword row) with value 1 to
        /// <paramref name="events"/> for every keyword row whose space-separated terms (column 4)
        /// all occur in <paramref name="text"/>, compared after lower-casing both sides.
        /// Rows whose "name-id" is already present are not evaluated again.
        /// </summary>
        /// <param name="dtkey">Keyword rows.</param>
        /// <param name="text">Text to search.</param>
        /// <param name="events">Receives the matching entries.</param>
        /// <param name="pumpMessages">When true, <c>Application.DoEvents</c> is called once per keyword row.</param>
        private static void MatchTieBaKeywords(DataTable dtkey, string text, Dictionary<string, int> events, bool pumpMessages)
        {
            if (dtkey.Rows.Count == 0)
            {
                return;
            }

            // Lower-case the text once instead of once per term.
            string haystack = text.ToLower();
            for (int j = 0; j < dtkey.Rows.Count; j++)
            {
                if (pumpMessages)
                {
                    Application.DoEvents();
                }

                DataRow row = dtkey.Rows[j];
                string[] terms = row[4].ToString().Split(new char[] { ' ' });
                string eventKey = row[1].ToString() + "-" + row[6].ToString();
                if (events.ContainsKey(eventKey))
                {
                    continue;
                }

                bool allPresent = true;
                foreach (string term in terms)
                {
                    if (!haystack.Contains(term.ToLower()))
                    {
                        allPresent = false;
                        break;
                    }
                }

                if (allPresent)
                {
                    events.Add(eventKey, 1);
                }
            }
        }

        /// <summary>
        /// Appends ",name-id" to <paramref name="keywords"/> for every entry of
        /// <paramref name="events"/> whose value is 1 and returns the result; the input is returned
        /// unchanged (possibly null) when there is no such entry.
        /// </summary>
        private static string AppendTieBaKeywords(string keywords, Dictionary<string, int> events)
        {
            foreach (KeyValuePair<string, int> ev in events)
            {
                if (ev.Value == 1)
                {
                    string[] parts = ev.Key.Split(new char[] { '-' });
                    keywords += "," + parts[0] + "-" + int.Parse(parts[1]);
                }
            }
            return keywords;
        }

        /// <summary>
        /// Executes the INSERT statements accumulated in <c>sb</c> (after removing the space inside
        /// "[ " and " ]") and clears the buffer. Does nothing when the buffer is empty; the buffer
        /// is left untouched when execution throws.
        /// </summary>
        private void FlushTieBaInserts(SQLitecommand cmd)
        {
            if (sb.Length == 0)
            {
                return;
            }

            string filterStr = sb.ToString().Replace("[ ", "[").Replace(" ]", "]");
            cmd.ExecuteNonQuery(filterStr);
            sb.Clear();
        }
Example #4
0
        /// <summary>
        /// Form Load handler. Starts the <c>Report</c> routine on a background thread, formats the
        /// result grids, binds each grid to a table obtained from <c>GetReleaseInfoFormat</c>, and
        /// queries the stored records of every category through <c>SelReleaseInfo</c>.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void Monitoring_Load(object sender, EventArgs e)
        {
            // NOTE(review): Report starts before the fields below are assigned; confirm it does not
            // depend on them.
            Thread re = new Thread(new ThreadStart(Report));
            re.IsBackground = true;
            re.Start();

            tri = new TbReleaseInfo();

            // Only the micro-blog grid is formatted with the flag set to true.
            FormatDataView(dvAll, false);
            FormatDataView(dvBBs, false);
            FormatDataView(dvBlog, false);
            FormatDataView(dvWBlog, true);
            FormatDataView(dvWeb, false);

            // Tables the grids are bound to.
            dtnewsinfo = tri.GetReleaseInfoFormat();
            dtbloginfo = tri.GetReleaseInfoFormat();
            dtbbsinfo = tri.GetReleaseInfoFormat();
            dtqueryinfo = tri.GetReleaseInfoFormat();

            // Read the clock once so all four queries share exactly the same window
            // (now back to 15 days ago) instead of re-reading DateTime.Now for every argument.
            const string stampFormat = "yyyy-MM-dd HH:mm:ss";
            DateTime loadedAt = DateTime.Now;
            string nowStamp = loadedAt.ToString(stampFormat);
            string cutoffStamp = loadedAt.AddDays(-15).ToString(stampFormat);

            // The third argument selects the category.
            // NOTE(review): two of the values also carry a raw SQL fragment, so SelReleaseInfo
            // presumably splices it into the query text - confirm before passing anything non-constant.
            dtWebNewsInfo = tri.SelReleaseInfo(nowStamp, cutoffStamp, "0 AND webName<>'百度'");
            dvWeb.DataSource = dtnewsinfo;
            dtWebBlogInfo = tri.SelReleaseInfo(nowStamp, cutoffStamp, "1");
            dvBlog.DataSource = dtbloginfo;
            dtWebBBSInfo = tri.SelReleaseInfo(nowStamp, cutoffStamp, "2");
            dvBBs.DataSource = dtbbsinfo;
            dtWebQueryInfo = tri.SelReleaseInfo(nowStamp, cutoffStamp, "0 AND webName='百度'");
            dvAll.DataSource = dtqueryinfo;
        }
Example #5
0
        /// <summary>
        /// 得到网站的新闻类数据
        /// </summary>
        private void GetWebNewsInfo()
        {
            lbWeb.Text = "";
            lbWeb.Visible = true;
            //相似链接
            string Similar = "";

            DataBaseServer.MySqlCmd cmd = new MySqlCmd();

            //得到相似表
            DataTable dtXs = new DataTable();
            dtXs = cmd.GetTabel("Select * from WebAddress WHERE pid=0");

            dtParts = cmd.GetTabel("SELECT * FROM partword");

            DataTable dtkey = new DataTable();
            dtkey = cmd.GetTabel("select * from Keywords");

            //相似表中的被抓取网址
            string webInfo = "";

            //要过滤链接中首页的正则
            string strTopFormat = "http://.+/";
            List<string> strTop = new List<string>();
            sb = new StringBuilder();
            sb.Append("");
            string filterStr = "";

            #region 读取相似度表中的数据据,循环抓取
            for (int xs = 0; xs < dtXs.Rows.Count; xs++)
            {
                lbWeb.ForeColor = Color.DarkBlue;
                lbWeb.Text = "正在搜索:" + dtXs.Rows[xs]["name"].ToString();
                //读取相似表中要抓取的网址
                webInfo = getHtml(dtXs.Rows[xs]["url"].ToString(), "");
                //读取相似链接
                Similar = dtXs.Rows[xs]["likeurl"].ToString();

                //取出
                //string[] strA = HtmlUtil.GetElementsByTagName(webInfo, "a");
                List<string> strList = HtmlUtil.GetElementsByTagNameList(webInfo, "a");

                string strURLformat = "http://.[^\"]+";

                TbReleaseInfo ri = new TbReleaseInfo();

                string[] strA = GetLIstDate(strList.Distinct());
                #region 逐个链接判断
                //循环时判断是否要验证
                bool isThere = false;

                for (int i = 0; i < strA.Length; i++)
                {
                    if (Program.ProClose == true) break;
                    Application.DoEvents();
                    try
                    {
                        //得到目标网址中的所有链接,如果未得到,那么就继续读取下一个
                        strA[i] = HtmlUtil.GetListByHtml(dtXs.Rows[xs]["url"].ToString(), strA[i], strURLformat)[0];
                        //处理含有单引号的链接
                        strA[i] = UrlCl(strA[i]);

                        //处理单引号的链接
                        if (strA[i].IndexOf("'") != -1)
                        {
                            strA[i] = GetstringByHtmlArray(strA[i], "http://.[^\']+");
                        }
                    }
                    catch (Exception)
                    {
                        continue;
                    }
                    //得到相似值,大于0.70的认为相同,并开始抓取
                    if (HtmlUtil.getSimilarDegree(Similar, strA[i]) >= 0.60)
                    {
                        //判断这个链接是否已经在库中或者列表中,如果存在,此次就不再执行
                        strTop = HtmlUtil.GetListByHtmlArray(strA[i], strTopFormat);
                        if (strTop.Count != 0)
                        {
                            //if (strTop[0] == "http://blog.sohu.com/")
                            continue;//同新闻,如果将首页去掉
                        }

                        if (isThere)
                        {
                            continue;
                        }
                        else
                        {
                            //if (strA[i] == "http://news.ifeng.com/mainland/detail_2013_10/18/30459577_0.shtml'>[详细]</a>")
                            //{
                            //    strA[i] = strA[i];
                            //}

                            if (UrlThereare(strA[i], this.dtnewsinfo, dtWebNewsInfo, true) != 0) { isThere = true; continue; }
                        }

                        //得到此链接的源码
                        webInfo = getHtml(strA[i], "");
                        if (webInfo.Length == 0) { continue; }

                        //创建数据对象
                        ModelReleaseInfo newsInfo = new ModelReleaseInfo();

                        try
                        {
                            //流水+1
                            newsInfo.Uid = this.dvAll.Rows.Count + 1;

                            //标题
                            string[] strT = HtmlUtil.GetElementsByTagName(webInfo, "title");
                            if (strT.Length == 0)
                            {
                                continue;
                            }
                            else
                            {
                                newsInfo.Title = HtmlUtil.NoHTML(HtmlUtil.GetElementsByTagName(webInfo, "title")[0]);
                            }

                            //得到正文,以P标签来区分
                            string[] strContext = HtmlUtil.GetElementsByTagName(webInfo, "p");
                            newsInfo.Contexts = "";
                            for (int j = 0; j < strContext.Length; j++)
                            {
                                //循环累加正文信息
                                newsInfo.Contexts += HtmlUtil.NoHTML(strContext[j]);
                            }

                            //如果正文信息为空,那么将无法做关键字对照,此条数据舍弃
                            if (newsInfo.Contexts.Length == 0)
                            {
                                continue;
                            }

                            //网站链接
                            newsInfo.InfoSource = strA[i].Trim();

                            //关键字的设置
                            newsInfo.KeyWords = "";
                            for (int j = 0; j < dtkey.Rows.Count; j++)
                            {
                                Application.DoEvents();
                                if (newsInfo.Contexts.IndexOf(dtkey.Rows[j][1].ToString()) > 0)
                                { newsInfo.KeyWords += dtkey.Rows[j][1].ToString() + ","; }
                                else
                                {

                                }
                            }
                            if (newsInfo.KeyWords.Length == 0) { continue; }
                            newsInfo.KeyWords = newsInfo.KeyWords.Substring(0, newsInfo.KeyWords.Length - 1);

                            //收集日期
                            newsInfo.CollectDate = string.Format(DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"));

                            //发布人和发布日期暂时无法取到,手工赋值为空
                            newsInfo.ReleaseDate = "";
                            newsInfo.ReleaseName = "";

                            //网页快照,这里为用户指定生成,如果未选择生成,那么为空
                            newsInfo.Snapshot = "";

                            //网站名
                            newsInfo.WebName = dtXs.Rows[xs]["Name"].ToString();
                            //pid
                            newsInfo.Pid = 0;
                            //part正负判断
                            newsInfo.Part = GetParts(newsInfo.Contexts);
                            //reposts
                            newsInfo.Reposts = 0;
                            //comments
                            newsInfo.Comments = 0;

                            //新建数据行
                            DataRow dr = dtnewsinfo.NewRow();
                            if (dvWeb.RowCount == 0)
                            {
                                dr[0] = 1;
                            }
                            else
                            {
                                dr[0] = int.Parse(dvWeb.Rows[dvWeb.RowCount - 1].Cells[0].Value.ToString()) + 1;
                            }
                            //dr[0] = newsInfo.Uid;
                            dr[1] = newsInfo.Title;
                            dr[2] = newsInfo.Contexts;
                            dr[3] = newsInfo.ReleaseDate;
                            dr[4] = newsInfo.InfoSource;
                            dr[5] = newsInfo.KeyWords;
                            dr[6] = newsInfo.ReleaseName;
                            dr[7] = newsInfo.CollectDate;
                            dr[8] = newsInfo.Snapshot;
                            dr[9] = newsInfo.WebName;
                            dr[10] = newsInfo.Pid;
                            dr[11] = newsInfo.Part;
                            dr[12] = newsInfo.Reposts;
                            dr[13] = newsInfo.Comments;

                            //把行加到DT中
                            dtnewsinfo.Rows.InsertAt(dr, 0);

                            //数据源刷新
                            if (dtnewsinfo.Rows.Count >= 500)
                            {
                                dtnewsinfo.Rows.RemoveAt(500);
                            }
                            dvWeb.Refresh();
                        }
                        catch (Exception ex)
                        {
                            StreamWriter sw = File.AppendText("log.txt");
                            sw.WriteLine(DateTime.Now.ToLongDateString());
                            sw.WriteLine("begin");
                            sw.WriteLine(ex.Message);
                            sw.WriteLine(sb.ToString());
                            sw.WriteLine("end");
                            sw.WriteLine("");

                            sw.Close();
                        }

                        ////总表刷新
                        //dt.Rows.Add(dr);
                        //dvAll.Refresh();

                        //得到插入语句
                        try
                        {
                            if (isThere)
                            {
                                continue;
                            }
                            else
                            {
                                sb.Append(ri.GetInsString(newsInfo) + ";");
                            }

                            //每10次执行一次插入数据库
                            if (sb.ToString().Length != 0)
                            {
                                if (i % 10 == 0)
                                {
                                    filterStr = sb.ToString();
                                    filterStr = filterStr.Replace("[ ", "[");
                                    filterStr = filterStr.Replace(" ]", "]");
                                    //执行插入
                                    cmd.ExecuteNonQuery(filterStr);
                                    //清除插入字段串
                                    sb.Clear();
                                    filterStr = "";
                                }
                            }
                        }
                        catch (Exception ex)
                        {
                            StreamWriter sw = File.AppendText("log.txt");
                            sw.WriteLine(DateTime.Now.ToLongDateString());
                            sw.WriteLine("begin");
                            sw.WriteLine(ex.Message);
                            sw.WriteLine(sb.ToString());
                            sw.WriteLine("end");
                            sw.WriteLine("");

                            sw.Close();
                        }
                    }
                }
                #endregion
            }
            #endregion

            try
            {
                if (sb.ToString().Length != 0)
                {
                    filterStr = sb.ToString();
                    filterStr = filterStr.Replace("[ ", "[");
                    filterStr = filterStr.Replace(" ]", "]");
                    //执行插入
                    cmd.ExecuteNonQuery(filterStr);
                    //清除插入字段串
                    sb.Clear();
                    filterStr = "";
                }
            }
            catch (Exception ex)
            {
                StreamWriter sw = File.AppendText("log.txt");
                sw.WriteLine(DateTime.Now.ToLongDateString());
                sw.WriteLine("begin");
                sw.WriteLine(ex.Message);
                sw.WriteLine(sb.ToString());
                sw.WriteLine("end");
                sw.WriteLine("");

                sw.Close();
            }

            //执行完毕后,重新获取一次数据库的数据
            dtWebNewsInfo = tri.SelReleaseInfo(DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"), DateTime.Now.AddDays(-15).ToString("yyyy-MM-dd HH:mm:ss"), "0 AND webName<>'百度'");
            //MessageBox.Show("ok");
            lbWeb.Text = "一轮搜索完毕!";
            lbWeb.ForeColor = Color.Red;
        }
Example #6
0
        /// <summary>
        /// Runs two database-writer loops via <c>Parallel.Invoke</c> and returns only
        /// once both loops have exited (i.e. after <c>Program.ProClose</c> becomes true).
        /// Each loop dequeues batches of <c>ModelReleaseInfo</c> from <c>Queue</c>,
        /// filters every record by its keywords, skips records that
        /// <c>GetReleaseInfoCount</c> reports as already present, and executes the
        /// remaining INSERT statements as one command per batch.
        /// </summary>
        /// <remarks>
        /// Errors raised while processing a batch are written through
        /// <c>Comm.WriteErrorLog</c> and the batch is dropped; the loop then carries on
        /// with the next batch.
        /// NOTE(review): the INSERT text comes from <c>TbReleaseInfo.GetInsertStr</c>;
        /// whether that helper escapes crawled text is not visible here - confirm.
        /// </remarks>
        public void StartWrite()
        {
            Action actionG = () =>
            {
                List<ModelReleaseInfo> data;
                while (!Program.ProClose)
                {
                    if (!Queue.TryDequeue(out data))
                    {
                        // Nothing queued: back off for a second before polling again.
                        System.Threading.Thread.Sleep(1000);
                        continue;
                    }

                    #region Persist the batch
                    try
                    {
                        StringBuilder sb = new StringBuilder();
                        TbReleaseInfo tri = new TbReleaseInfo();
                        MySqlCmd cmd = new MySqlCmd();
                        foreach (var mri in data)
                        {
                            #region 2016.11.16 exact keyword matching
                            string keywords = mri.KeyWords;
                            if (!string.IsNullOrEmpty(keywords))
                            {
                                string[] keys = keywords.Split(' ');
                                // Drop the record when neither the title nor the content
                                // contains every one of the keywords.
                                if (!ContainsEveryKeyword(mri.Title, keys)
                                    && !ContainsEveryKeyword(mri.Contexts, keys))
                                {
                                    continue;
                                }
                            }
                            #endregion

                            // Skip records already stored for this source/keyword pair.
                            if (tri.GetReleaseInfoCount(mri.InfoSource, mri.KeyWords) > 0) continue;

                            string sql = tri.GetInsertStr(mri);
                            if (!sql.Trim().EndsWith(";", StringComparison.Ordinal))
                            {
                                // Only terminate the statement. The previous code appended
                                // the statement to itself, producing a doubled INSERT.
                                sql += ";";
                            }
                            sb.Append(sql);
                        }

                        if (sb.Length > 0)
                        {
                            // Execute all collected INSERT statements in one call.
                            cmd.ExecuteNonQuery(sb.ToString());
                        }

                        // Logged inside the try so a failed batch is not reported as written.
                        log.Info("数据层写入数据库成功");
                    }
                    catch (Exception ex)
                    {
                        Comm.WriteErrorLog(ex.Message);
                        Comm.WriteErrorLog(ex.StackTrace);
                    }
                    #endregion
                }
            };

            // The same loop is started twice so two batches can be processed at once.
            Parallel.Invoke(actionG, actionG);
        }

        /// <summary>
        /// Returns true when <paramref name="text"/> contains every entry of
        /// <paramref name="keys"/>. A null text is treated as empty so that one record
        /// with a missing field cannot abort the whole batch with a NullReferenceException.
        /// </summary>
        private static bool ContainsEveryKeyword(string text, string[] keys)
        {
            string haystack = text ?? string.Empty;
            foreach (string key in keys)
            {
                // Same IndexOf overload as before, so matching semantics are unchanged.
                if (haystack.IndexOf(key) < 0)
                {
                    return false;
                }
            }
            return true;
        }
Example #7
0
        /// <summary>
        /// Load handler for the monitoring form. Hooks every crawler timer to its
        /// Elapsed handler (leaving all of them disabled), binds the result grid to an
        /// empty release-info table, applies the "SoftVer" setting, ticks the source
        /// check boxes, resets the event selectors, and fills <c>dicKeywords</c> from
        /// the <c>keywords</c> table.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void Monitoring_Load(object sender, EventArgs e)
        {
            #region Crawler timer setup
            // 2015.3.9 wangcg: generic crawl
            AttachStoppedTimer(GeneralWebSpiderTimer, GeneralWebSpiderTimer_Elapsed);

            // 2015.3.10 wangcg: mainstream media crawl
            AttachStoppedTimer(MediaWebSpiderTimer, MediaWebSpiderTimer_Elapsed);

            // 2015.3.11 wangcg: WeChat crawl
            AttachStoppedTimer(WeixinWebSpiderTimer, WeixinWebSpiderTimer_Elapsed);

            // 2015.3.11 wangcg: blog crawl
            AttachStoppedTimer(BlogWebSpiderTimer, BlogWebSpiderTimer_Elapsed);

            // 2015.3.11 wangcg: forum (BBS) crawl
            AttachStoppedTimer(BBSWebSpiderTimer, BBSWebSpiderTimer_Elapsed);

            // 2015.3.14 wangcg: Tieba crawl
            AttachStoppedTimer(TiebaWebSpiderTimer, TiebaWebSpiderTimer_Elapsed);

            // 2015.3.14 wangcg: Sina Weibo crawl
            AttachStoppedTimer(WeiboWebSpiderTimer, WeiboWebSpiderTimer_Elapsed);

            // 2015.3.17 wangcg: Baidu web-search crawl
            AttachStoppedTimer(BaiduWebWebSpiderTimer, BaiduWebWebSpiderTimer_Elapsed);
            #endregion

            // Timer that drives RefreshDataTimer_Elapsed.
            AttachStoppedTimer(RefreshWebSpiderTimer, RefreshDataTimer_Elapsed);

            #region Grid initialisation
            tri = new TbReleaseInfo();
            FormatDataView2(dvView);

            dtData = tri.GetReleaseInfoFormat();

            dvView.DataSource = dtData;
            #endregion

            // Read the "SoftVer" setting, falling back to "1" when it is absent.
            if (GlobalPars.GloPars.ContainsKey("SoftVer"))
            {
                SoftVer = GlobalPars.GloPars["SoftVer"].ToString();
            }
            else
            {
                SoftVer = "1";
            }

            bool weixinAllowed = SoftVer.Equals("3");
            if (!weixinAllowed)
            {
                // WeChat is not available for this version.
                //tabControl1.TabPages.RemoveByKey("tabPage7");
                chkWeixin.Visible = false;
            }

            // Tick every source check box (order kept as-is in case handlers are attached).
            chkAllWeb.Checked = true;
            chkBBS.Checked = true;
            chkBlog.Checked = true;
            chkCustom.Checked = true;
            chkMedia.Checked = true;
            chkTieba.Checked = true;
            chkWeixin.Checked = true;

            kidlist.SelectedIndex = 4;  // event type ("all" selected by default)
            kwlist.SelectedIndex = 0;   // event name (hidden at start-up)
            kwlist.Hide();  // event name list
            label8.Hide();  // event name label
            selectKID = -1;

            #region Load events and their keywords
            DataTable kwdtAll = cmd.GetTabel("select name, keyword from keywords");
            foreach (DataRow row in kwdtAll.Rows)
            {
                string eventName = row["name"].ToString();
                if (dicKeywords.ContainsKey(eventName))
                {
                    // Known event: append this keyword to its list.
                    dicKeywords[eventName].Add(row["keyword"].ToString());
                }
                else
                {
                    // First keyword seen for this event: start a new list.
                    dicKeywords.Add(eventName, new List<string> { row["keyword"].ToString() });
                }
            }
            #endregion
        }

        /// <summary>
        /// Subscribes <paramref name="onElapsed"/> to the timer's Elapsed event, turns
        /// AutoReset on and leaves the timer disabled.
        /// </summary>
        private static void AttachStoppedTimer(System.Timers.Timer timer, System.Timers.ElapsedEventHandler onElapsed)
        {
            timer.Elapsed += onElapsed;
            timer.AutoReset = true;
            timer.Enabled = false;
        }