예제 #1
0
        /// <summary>
        /// Inserts one row into the ReleaseInfo table using a parameterized INSERT.
        /// </summary>
        /// <param name="obj">Record whose Title, Contexts, ReleaseDate, InfoSource, KeyWords,
        /// ReleaseName, CollectDate and Snapshot values are written.</param>
        /// <returns>The value returned by <c>MySqlCmd.ExecuteNonQueryInt</c> for the statement.</returns>
        /// <exception cref="Exception">Thrown when the statement fails; the original failure is
        /// preserved as <see cref="Exception.InnerException"/>.</exception>
        public int InsReleaseInfo(ModelReleaseInfo obj)
        {
            // The "@RleaseDate" placeholder is spelled the same way in the SQL text and in the
            // parameter list below, so the two stay bound to each other.
            string sql = @"INSERT INTO ReleaseInfo(Title,Contexts,ReleaseDate,InfoSource,KeyWords,ReleaseName,CollectDate,Snapshot) 
                            VALUES(@Title,@Contexts,@RleaseDate,@InfoSource,@KeyWords,@ReleaseName,@CollectDate,@Snapshot) ";

            List<MySqlParameter> par = new List<MySqlParameter>
            {
                new MySqlParameter("@Title", obj.Title),
                new MySqlParameter("@Contexts", obj.Contexts),
                new MySqlParameter("@RleaseDate", obj.ReleaseDate),
                new MySqlParameter("@InfoSource", obj.InfoSource),
                new MySqlParameter("@KeyWords", obj.KeyWords),
                new MySqlParameter("@ReleaseName", obj.ReleaseName),
                new MySqlParameter("@CollectDate", obj.CollectDate),
                new MySqlParameter("@Snapshot", obj.Snapshot)
            };

            try
            {
                MySqlCmd dbobj = new MySqlCmd();
                return dbobj.ExecuteNonQueryInt(sql, par);
            }
            catch (Exception ex)
            {
                // Keep the caller-visible exception type and message, but attach the original
                // exception so its stack trace and provider details are not lost.
                throw new Exception("新建失败,位置:InsReleaseInfo.原因:" + ex.Message, ex);
            }
        }
예제 #2
0
        /// <summary>
        /// Updates the ReleaseInfo row identified by <c>obj.Uid</c> using a parameterized UPDATE.
        /// </summary>
        /// <param name="obj">Record carrying the key (Uid) and the new column values.</param>
        /// <returns>The value returned by <c>MySqlCmd.ExecuteNonQueryInt</c> for the statement.</returns>
        /// <exception cref="Exception">Thrown when the statement fails; the original failure is
        /// preserved as <see cref="Exception.InnerException"/>.</exception>
        public int FixReleaseInfo(ModelReleaseInfo obj)
        {
            // The table name must precede SET ("UPDATE <table> SET ..."); the previous text
            // "UPDATE SET ReleaseInfo ..." was not a valid statement.
            string sql = @"UPDATE ReleaseInfo SET Title=@Title,Contexts=@Contexts,ReleaseDate=@ReleaseDate,
                                InfoSource=@InfoSource,KeyWords=@KeyWords,ReleaseName=@ReleaseName,
                                CollectDate=@CollectDate,Snapshot=@Snapshot  
                        WHERE uid=@uid";

            // Every placeholder used in the SQL text needs a parameter with exactly the same
            // name; "@ReleaseDate" was previously supplied under the misspelled name "@RleaseDate".
            List<MySqlParameter> par = new List<MySqlParameter>
            {
                new MySqlParameter("@uid", obj.Uid),
                new MySqlParameter("@Title", obj.Title),
                new MySqlParameter("@Contexts", obj.Contexts),
                new MySqlParameter("@ReleaseDate", obj.ReleaseDate),
                new MySqlParameter("@InfoSource", obj.InfoSource),
                new MySqlParameter("@KeyWords", obj.KeyWords),
                new MySqlParameter("@ReleaseName", obj.ReleaseName),
                new MySqlParameter("@CollectDate", obj.CollectDate),
                new MySqlParameter("@Snapshot", obj.Snapshot)
            };

            try
            {
                DataBaseServer.MySqlCmd dbobj = new DataBaseServer.MySqlCmd();
                return dbobj.ExecuteNonQueryInt(sql, par);
            }
            catch (Exception ex)
            {
                // Report an update failure (the message used to say "create failed") and attach
                // the original exception so its stack trace is not lost.
                throw new Exception("修改失败,位置:FixReleaseInfo.原因:" + ex.Message, ex);
            }
        }
예제 #3
0
        /// <summary>
        /// Builds the INSERT statements for one record: one statement per comma-separated
        /// entry in <c>obj.KeyWords</c>. An entry of the form "name-kid" is split so that
        /// "name" goes to the KeyWords column and "kid" to the kid column; an entry without
        /// '-' is written to both columns unchanged.
        /// </summary>
        /// <param name="obj">Record to serialize. Its Title and Contexts are replaced by their
        /// <c>filtRiskChar</c>-filtered values when at least one statement is produced.</param>
        /// <returns>The concatenated statements, or an empty string when there are no keywords.</returns>
        public String GetInsString(ModelReleaseInfo obj)
        {
            // A record without keywords yields no statements (a null KeyWords used to throw).
            if (string.IsNullOrEmpty(obj.KeyWords))
            {
                return string.Empty;
            }

            string[] keywords = obj.KeyWords.Split(new string[] { "," }, StringSplitOptions.RemoveEmptyEntries);
            if (keywords.Length == 0)
            {
                return string.Empty;
            }

            // SECURITY NOTE(review): the statement is assembled by string formatting and only
            // Title and Contexts pass through filtRiskChar; every other field is embedded as-is.
            // The method's contract is to return SQL text, so it cannot be parameterized here.
            const string sql = @"INSERT INTO ReleaseInfo(Title,Contexts,ReleaseDate,InfoSource,KeyWords,ReleaseName,CollectDate,Snapshot,webName,pid,part,reposts,comments,kid,sheng,shi,xian) 
                            VALUES('{0}','{1}','{2}','{3}','{4}','{5}','{6}','{7}','{8}','{9}','{10}','{11}','{12}','{13}','{14}','{15}','{16}'); ";

            // Filter once, not once per keyword: the result is the same for every statement and
            // repeated filtering would stack up if filtRiskChar is not idempotent.
            obj.Title    = filtRiskChar(obj.Title);
            obj.Contexts = filtRiskChar(obj.Contexts);

            StringBuilder insertSql = new StringBuilder();
            foreach (string k in keywords)
            {
                // Split a single time; more than one part means the entry contained '-'.
                string[] parts = k.Split(new char[] { '-' });
                bool hasKid = parts.Length > 1;
                string keywordName = hasKid ? parts[0] : k;
                string kid = hasKid ? parts[1] : k;

                insertSql.AppendFormat(sql, obj.Title, obj.Contexts, obj.ReleaseDate, obj.InfoSource, keywordName,
                                       obj.ReleaseName, obj.CollectDate, obj.Snapshot, obj.WebName, obj.Pid, obj.Part,
                                       obj.Reposts, obj.Comments, kid, obj.Sheng, obj.Shi, obj.Xian);
            }
            return insertSql.ToString();
        }
예제 #4
0
        /// <summary>
        /// Builds a single INSERT statement for <paramref name="mri"/>, normalizing the release
        /// date to "yyyy-MM-dd HH:mm:ss". When the release date cannot be parsed, the collect
        /// date is written in its place.
        /// </summary>
        /// <param name="mri">Record to serialize. Its Title and Contexts are replaced by their
        /// <c>filtRiskChar</c>-filtered values.</param>
        /// <returns>The INSERT statement text.</returns>
        public String GetInsertStr(ModelReleaseInfo mri)
        {
            // SECURITY NOTE(review): the statement is assembled by string formatting and only
            // Title and Contexts pass through filtRiskChar; every other field is embedded as-is.
            // The method's contract is to return SQL text, so it cannot be parameterized here.
            string sql = @"INSERT INTO ReleaseInfo(Title,Contexts,ReleaseDate,InfoSource,KeyWords,ReleaseName,CollectDate,Snapshot,webName,pid,part,reposts,comments,kid,sheng,shi,xian) 
                            VALUES('{0}','{1}','{2}','{3}','{4}','{5}','{6}','{7}','{8}','{9}','{10}','{11}','{12}','{13}','{14}','{15}','{16}'); ";

            mri.Title    = filtRiskChar(mri.Title);
            mri.Contexts = filtRiskChar(mri.Contexts);

            // Normalize the release date.
            // When parsing, the single-letter specifiers (M, d, H, m, s) accept one or two
            // digits, so these three patterns cover every zero-padded / unpadded combination
            // the former ~100-entry table enumerated. Only 24-hour patterns are used: a 12-hour
            // "hh" pattern without an AM/PM designator reads "12" as midnight, and the old table
            // tried some "hh" patterns before their 24-hour equivalents.
            string[] formats = { "yyyy-M-d H:m:s", "yyyy-M-d H:m", "yyyy-M-d" };

            // The invariant culture keeps parsing and formatting independent of the machine's
            // calendar and time separator; the text is data, not UI.
            System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

            string date;
            DateTime dateValue;
            if (DateTime.TryParseExact(mri.ReleaseDate, formats, invariant,
                                       System.Globalization.DateTimeStyles.None, out dateValue))
            {
                date = dateValue.ToString("yyyy-MM-dd HH:mm:ss", invariant);
            }
            else
            {
                // Unparseable (or missing) release date: fall back to the collection time.
                date = mri.CollectDate;
            }

            return string.Format(sql, mri.Title, mri.Contexts, date, mri.InfoSource, mri.KeyWords,
                                 mri.ReleaseName, mri.CollectDate, mri.Snapshot, mri.WebName, mri.Pid, mri.Part,
                                 mri.Reposts, mri.Comments, mri.Kid, mri.Sheng, mri.Shi, mri.Xian);
        }
예제 #5
0
파일: Monitor.cs 프로젝트: wcgcw/Finder
        /// <summary>
        /// Runs one Baidu News search round: for every event in the Keywords table it queries
        /// news.baidu.com, extracts date, title, summary and link of each result, shows new
        /// hits in the <c>dtqueryinfo</c> table behind the grid, and stores them through batched
        /// INSERT statements. Failures for a single result are written to log.txt and the
        /// round continues with the next result.
        /// </summary>
        private void GetBaiduInfo()
        {
            this.BeginInvoke(new MethodInvoker(delegate()
            {
                lbAll.Text = "";
                lbAll.Visible = true;
            }));

            SQLitecommand cmd = new SQLitecommand();

            // Group the search terms of each event under the key "<Name>-<kid>". Every stored
            // value starts with a comma, which is stripped again before the value is used.
            DataTable dtkey = cmd.GetTabel("select * from Keywords");
            Dictionary<string, string> keywords = new Dictionary<string, string>();
            for (int kw = 0; kw < dtkey.Rows.Count; kw++)
            {
                DataRow keyRow = dtkey.Rows[kw];
                string eventKey = keyRow["Name"].ToString().Trim() + "-" + keyRow["kid"].ToString().Trim();
                string word = keyRow["KeyWord"].ToString().Trim();

                string joined;
                if (keywords.TryGetValue(eventKey, out joined))
                {
                    keywords[eventKey] = joined + "," + word;
                }
                else
                {
                    keywords.Add(eventKey, "," + word);
                }
            }

            // Pattern used to pull the link out of a result's first anchor tag.
            string aa = "https?://.[^\"]+";

            // Pending INSERT statements, flushed in batches below.
            sb = new StringBuilder();

            // One search per event.
            foreach (KeyValuePair<string, string> kv in keywords)
            {
                string k = kv.Key;
                string v = kv.Value.Substring(1);
                this.BeginInvoke(new MethodInvoker(delegate()
                {
                    lbAll.Text = "正在搜索事件为<" + k.Substring(0, k.IndexOf("-")) + ">的数据.";
                    lbAll.ForeColor = Color.DarkBlue;
                }));

                // Build the query URL and collect the result blocks of the page.
                string url = "http://news.baidu.com/ns?rn=20&word=" + v;
                List<string> lis = HtmlUtil.GetElementsByClassList(HtmlUtil.getHtml(url, "utf-8"), "result");

                // Nothing fetched for this event: move on to the next one. (A null result used
                // to return from the whole method, which also skipped the final flush below.)
                if (lis == null || lis.Count <= 0)
                {
                    continue;
                }

                // Set once a result is found to be already known; the remaining results of this
                // event are then skipped.
                bool isThere = false;

                for (int i = 0; i < lis.Count; i++)
                {
                    if (Program.ProClose == true) break;

                    ModelReleaseInfo mri = new ModelReleaseInfo();

                    // Release date: taken from the first <span class="g"> of the result.
                    string[] sDate = HtmlUtil.GetElementsByTagAndClass(lis[i], "span", "g");
                    if (sDate.Length <= 0) continue;

                    // The date is expected in the 17 characters starting at the first '2'.
                    // Skip the result when that window does not exist instead of letting
                    // Substring throw out of the whole crawl.
                    string rawDate = HtmlUtil.NoHTML(sDate[0]);
                    int dateStart = rawDate == null ? -1 : rawDate.IndexOf('2');
                    if (dateStart < 0 || dateStart + 17 > rawDate.Length)
                    {
                        continue;
                    }
                    mri.ReleaseDate = rawDate.Substring(dateStart, 17);

                    // Baidu's date is sometimes only 9 or 8 characters long; retry on
                    // progressively shorter windows when the text does not parse.
                    DateTime released;
                    if (!DateTime.TryParse(mri.ReleaseDate, out released))
                    {
                        mri.ReleaseDate = mri.ReleaseDate.Substring(1, 9);
                        if (!DateTime.TryParse(mri.ReleaseDate, out released))
                        {
                            mri.ReleaseDate = mri.ReleaseDate.Substring(1, 8);
                        }
                    }

                    // Normalize the date; a result whose date cannot be parsed is logged and skipped.
                    try
                    {
                        released = DateTime.Parse(mri.ReleaseDate);
                        mri.ReleaseDate = released.ToString("yyyy-MM-dd HH:mm:ss");
                    }
                    catch (Exception ex)
                    {
                        AppendBaiduErrorLog(ex);
                        continue;
                    }

                    // Only keep results published within the last 30 days.
                    if (released < DateTime.Now.AddDays(-30)) continue;

                    // True once every field of mri has been filled in; a half-filled record
                    // must not be turned into an INSERT statement.
                    bool recordReady = false;
                    try
                    {
                        // Title, summary and link of the result.
                        mri.Title = HtmlUtil.NoHTML(HtmlUtil.GetElementsByTagName(lis[i], "h3")[0]);
                        string[] temp = HtmlUtil.GetElementsByClass(lis[i], "c-summary");

                        // No summary block: skip this result.
                        if (temp.Length == 0)
                        {
                            continue;
                        }

                        mri.Contexts = HtmlUtil.NoHTML(temp[0]);
                        mri.InfoSource = HtmlUtil.GetListByHtml("", HtmlUtil.GetElementsByTagName(lis[i], "a")[0], aa)[0];

                        // Duplicate handling: stop processing this event's results after the
                        // first one that is already known.
                        if (isThere)
                        {
                            continue;
                        }
                        if (UrlThereare(mri.Title, this.dtqueryinfo, dtWebQueryInfo, false) != 0)
                        {
                            isThere = true;
                            continue;
                        }

                        mri.KeyWords = k;
                        mri.CollectDate = DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss");
                        mri.Snapshot = "";
                        mri.ReleaseName = "";
                        mri.Sheng = "";
                        mri.Shi = "";
                        mri.Xian = "";
                        mri.WebName = "百度";
                        mri.Pid = 0;
                        mri.Part = GetParts(mri.Contexts);
                        mri.Comments = 0;
                        mri.Reposts = 0;
                        recordReady = true;

                        // Show the hit at the top of the grid's table; the sequence number
                        // continues from the grid's last row.
                        DataRow dr = dtqueryinfo.NewRow();
                        if (dvAll.RowCount == 0)
                        {
                            dr[0] = 1;
                        }
                        else
                        {
                            dr[0] = int.Parse(dvAll.Rows[dvAll.RowCount - 1].Cells[0].Value.ToString()) + 1;
                        }
                        dr[1] = mri.Title;
                        dr[2] = mri.Contexts;
                        dr[3] = mri.ReleaseDate;
                        dr[4] = mri.InfoSource;
                        dr[5] = mri.KeyWords.Substring(0, k.IndexOf("-"));
                        dr[6] = mri.ReleaseName;
                        dr[7] = mri.CollectDate;
                        dr[8] = mri.Snapshot;
                        dr[9] = mri.WebName;
                        dr[10] = mri.Pid;
                        dr[11] = mri.Part;
                        dr[12] = mri.Reposts;
                        dr[13] = mri.Comments;

                        dtqueryinfo.Rows.InsertAt(dr, 0);

                        // Cap the table at 500 rows. Index 500 only exists once there are more
                        // than 500 rows; removing it at exactly 500 rows used to throw.
                        if (dtqueryinfo.Rows.Count > 500)
                        {
                            dtqueryinfo.Rows.RemoveAt(500);
                        }
                        this.BeginInvoke(new MethodInvoker(delegate()
                        {
                            dvAll.Refresh();
                        }));
                    }
                    catch (Exception ex)
                    {
                        AppendBaiduErrorLog(ex);
                    }

                    // Extraction failed before the record was complete: nothing to store.
                    if (!recordReady)
                    {
                        continue;
                    }

                    try
                    {
                        // Queue the INSERT statement(s) for this record.
                        sb.Append(tri.GetInsString(mri) + ";");

                        // Flush the queue to the database on every 10th result index.
                        if (sb.Length != 0 && i % 10 == 0)
                        {
                            cmd.ExecuteNonQuery(sb.ToString());
                            sb.Clear();
                        }
                    }
                    catch (Exception ex)
                    {
                        AppendBaiduErrorLog(ex);
                    }
                }
            }

            // Flush whatever is still queued after the last event.
            try
            {
                if (sb.Length != 0)
                {
                    cmd.ExecuteNonQuery(sb.ToString());
                    sb.Clear();
                }
            }
            catch (Exception ex)
            {
                AppendBaiduErrorLog(ex);
            }

            // Reload the stored Baidu records so the next round's duplicate check sees them.
            dtWebQueryInfo = tri.SelReleaseInfo(DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"), DateTime.Now.AddDays(-15).ToString("yyyy-MM-dd HH:mm:ss"), "0 AND webName='百度'");
            this.BeginInvoke(new MethodInvoker(delegate()
            {
                lbAll.Text = "一轮搜索完毕!";
                lbAll.ForeColor = Color.Red;
            }));
        }

        /// <summary>
        /// Appends one error entry (date, exception message and the currently queued SQL text)
        /// to log.txt. The writer is disposed even when a write fails.
        /// </summary>
        /// <param name="ex">The exception to record.</param>
        private void AppendBaiduErrorLog(Exception ex)
        {
            using (StreamWriter sw = File.AppendText("log.txt"))
            {
                sw.WriteLine(DateTime.Now.ToLongDateString());
                sw.WriteLine("begin");
                sw.WriteLine(ex.Message);
                sw.WriteLine(sb.ToString());
                sw.WriteLine("end");
                sw.WriteLine("");
            }
        }
예제 #6
0
파일: Monitor.cs 프로젝트: wcgcw/Finder
        /// <summary>
        /// 得到网站的贴吧类数据
        /// </summary>
        private void GetWebTieBaInfo()
        {
            this.BeginInvoke(new MethodInvoker(delegate()
            {
                lbtieba.Text = "";
                lbtieba.Visible = true;
            }));

            //相似链接
            string Similar = "";

            DataBaseServer.SQLitecommand cmd = new SQLitecommand();
            //得到关键字列表
            DataTable dtkey;
            DataTable dtParts;
            dtkey = cmd.GetTabel("select * from Keywords");
            dtParts = cmd.GetTabel("SELECT * FROM partword");
            //得到相似表
            DataTable dtXs = new DataTable();
            dtXs = cmd.GetTabel("Select * from WebAddress WHERE pid=5");

            //相似表中的被抓取网址
            string webInfo = "";

            //要过滤链接中首页的正则
            string strTopFormat = "https?://.+/";
            List<string> strTop = new List<string>();
            sb = new StringBuilder();
            sb.Append("");
            string filterStr = "";

            #region 读取相似度表中的数据据,循环抓取
            for (int xs = 0; xs < dtXs.Rows.Count; xs++)
            {
                this.BeginInvoke(new MethodInvoker(delegate()
                {
                    lbtieba.ForeColor = Color.DarkBlue;
                    lbtieba.Text = "正在搜索:" + dtXs.Rows[xs]["name"].ToString();
                }));

                //读取相似表中要抓取的网址
                webInfo = HtmlUtil.getHtml(dtXs.Rows[xs]["url"].ToString(), "");
                //读取相似链接
                Similar = dtXs.Rows[xs]["likeurl"].ToString();

                //取出
                //string[] strA = HtmlUtil.GetElementsByTagName(webInfo, "a");
                List<string> strList = HtmlUtil.GetElementsByTagNameList(webInfo, "a");

                string strURLformat = "https?://.[^\"]+";

                TbReleaseInfo ri = new TbReleaseInfo();

                string[] strA = GetLIstDate(strList.Distinct());
                #region 逐个链接判断
                //循环时判断是否要验证
                bool isThere = false;

                for (int i = 0; i < strA.Length; i++)
                {
                    if (Program.ProClose == true) break;
                    Application.DoEvents();
                    Dictionary<string, int> events = new Dictionary<string, int>();
                    //创建数据对象
                    ModelReleaseInfo newsInfo = new ModelReleaseInfo();
                    try
                    {
                        //得到目标网址中的所有链接,如果未得到,那么就继续读取下一个
                        string tmp = strA[i];
                        newsInfo.Title = HtmlUtil.NoHTML(tmp);
                        //得到目标网址中的所有链接,如果未得到,那么就继续读取下一个
                        for (int j = 0; j < dtkey.Rows.Count; j++)
                        {
                            //Application.DoEvents();
                            string[] keys = dtkey.Rows[j][4].ToString().Split(new char[] { ' ' });
                            if (!events.ContainsKey(dtkey.Rows[j][1].ToString() + "-" + dtkey.Rows[j][6].ToString()))
                            {
                                events.Add(dtkey.Rows[j][1].ToString() + "-" + dtkey.Rows[j][6].ToString(), 1);
                                foreach (string k in keys)
                                {
                                    if (!strA[i].ToLower().Contains(k.ToLower()))
                                    {
                                        events.Remove(dtkey.Rows[j][1].ToString() + "-" + dtkey.Rows[j][6].ToString());
                                        break;
                                    }
                                }
                            }
                        }
                        foreach (KeyValuePair<string, int> ev in events)
                        {
                            if (ev.Value == 1)
                            {
                                newsInfo.KeyWords += "," + ev.Key.Split(new char[] { '-' })[0] + "-" + int.Parse(ev.Key.Split(new char[] { '-' })[1]);
                            }
                        }

                        string[] _a = HtmlUtil.GetListByHtml(dtXs.Rows[xs]["url"].ToString(), strA[i], strURLformat);
                        if (_a.Length > 0)
                        {
                            strA[i] = _a[0];
                        }
                        else
                        {
                            strA[i] = "";
                        }
                        //处理含有单引号的链接
                        strA[i] = HtmlUtil.UrlCl(strA[i]);

                        //处理单引号的链接
                        if (strA[i].IndexOf("'") != -1)
                        {
                            strA[i] = HtmlUtil.GetstringByHtmlArray(strA[i], "https?://.[^\']+");
                        }
                    }
                    catch (Exception)
                    {
                        continue;
                    }
                    //得到相似值,大于0.70的认为相同,并开始抓取
                    if (HtmlUtil.getSimilarDegree(Similar, strA[i]) >= 0.70)
                    {
                        //判断这个链接是否已经在库中或者列表中,如果存在,此次就不再执行
                        strTop = HtmlUtil.GetListByHtmlArray(strA[i], strTopFormat);
                        if (strTop.Count != 0)
                        {
                            continue;//同新闻,如果将首页去掉
                        }

                        if (isThere)
                        {
                            continue;
                        }
                        else
                        {
                            if (UrlThereare(strA[i], this.dttiebainfo, dtTieBaInfo, true) != 0) {
                                //isThere = true;
                                continue;
                            }
                        }
                        Thread.Sleep(2000);
                        //得到此链接的源码
                        webInfo = HtmlUtil.getHtml(strA[i], "");
                        if (webInfo.Length == 0) { continue; }

                        try
                        {
                            //流水+1
                            newsInfo.Uid = this.dvAll.Rows.Count + 1;

                            //标题
                            //string[] strT = HtmlUtil.GetElementsByTagName(webInfo, "title");
                            //if (strT.Length == 0)
                            //{
                            //    continue;
                            //}
                            //else
                            //{
                            //    newsInfo.Title = HtmlUtil.NoHTML(HtmlUtil.GetElementsByTagName(webInfo, "title")[0]);
                            //}

                            //得到正文,以P标签来区分
                            string[] strContext = HtmlUtil.GetElementsByClass(webInfo, "tieba");
                            //string[] strContext = HtmlUtil.GetElementsByTagName(webInfo, "post_content");
                            newsInfo.Contexts = "";
                            for (int j = 0; j < strContext.Length; j++)
                            {
                                //循环累加正文信息
                                newsInfo.Contexts += HtmlUtil.NoHTML(strContext[j]);
                            }

                            //如果正文信息为空,那么将无法做关键字对照,此条数据舍弃
                            if (newsInfo.Contexts.Length == 0)
                            {
                                continue;
                            }

                            //网站链接
                            newsInfo.InfoSource = strA[i].Trim();

                            //关键字的设置
                            //newsInfo.KeyWords = "";
                            //Dictionary<string, int> events = new Dictionary<string, int>();
                            //for (int j = 0; j < dtkey.Rows.Count; j++)
                            //{
                            //    Application.DoEvents();
                            //    if (!events.ContainsKey(dtkey.Rows[j][1].ToString() + "-" + dtkey.Rows[j][6].ToString()))
                            //    {
                            //        events.Add(dtkey.Rows[j][1].ToString() + "-" + dtkey.Rows[j][6].ToString(), 1);
                            //    }
                            //    if (!newsInfo.Contexts.Contains(dtkey.Rows[j][4].ToString()))
                            //    {
                            //        if (events.ContainsKey(dtkey.Rows[j][1].ToString() + "-" + dtkey.Rows[j][6].ToString()) && events[dtkey.Rows[j][1].ToString() + "-" + dtkey.Rows[j][6].ToString()] == 1)
                            //        {
                            //            events[dtkey.Rows[j][1].ToString() + "-" + dtkey.Rows[j][6].ToString()] = 0;
                            //        }
                            //    }
                            //}
                            if (newsInfo.KeyWords == null || newsInfo.KeyWords.Length == 0)
                            {
                                for (int j = 0; j < dtkey.Rows.Count; j++)
                                {
                                    Application.DoEvents();
                                    string[] keys = dtkey.Rows[j][4].ToString().Split(new char[] { ' ' });
                                    if (!events.ContainsKey(dtkey.Rows[j][1].ToString() + "-" + dtkey.Rows[j][6].ToString()))
                                    {
                                        events.Add(dtkey.Rows[j][1].ToString() + "-" + dtkey.Rows[j][6].ToString(), 1);
                                        foreach (string k in keys)
                                        {
                                            if (!newsInfo.Contexts.ToLower().Contains(k.ToLower()))
                                            {
                                                events.Remove(dtkey.Rows[j][1].ToString() + "-" + dtkey.Rows[j][6].ToString());
                                                break;
                                            }
                                        }
                                    }
                                }
                                foreach (KeyValuePair<string, int> ev in events)
                                {
                                    if (ev.Value == 1)
                                    {
                                        newsInfo.KeyWords += "," + ev.Key.Split(new char[] { '-' })[0] + "-" + int.Parse(ev.Key.Split(new char[] { '-' })[1]);
                                    }
                                }
                            }
                            if (newsInfo.KeyWords.Length == 0) { continue; }
                            newsInfo.KeyWords = newsInfo.KeyWords.Substring(1);

                            //收集日期
                            newsInfo.CollectDate = string.Format(DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"));

                            //发布人和发布日期暂时无法取到,手工赋值为空
                            newsInfo.ReleaseDate = "";
                            newsInfo.ReleaseName = "";

                            //网页快照,这里为用户指定生成,如果未选择生成,那么为空
                            newsInfo.Snapshot = "";
                            newsInfo.Sheng = dtXs.Rows[xs]["sheng"] == null ? "" : dtXs.Rows[xs]["sheng"].ToString();
                            newsInfo.Shi = dtXs.Rows[xs]["shi"] == null ? "" : dtXs.Rows[xs]["shi"].ToString();
                            newsInfo.Xian = dtXs.Rows[xs]["xian"] == null ? "" : dtXs.Rows[xs]["xian"].ToString();
                            //网站名
                            newsInfo.WebName = dtXs.Rows[xs]["Name"].ToString();
                            //pid
                            newsInfo.Pid = 5;
                            //part正负判断
                            newsInfo.Part = GetParts(newsInfo.Contexts);
                            //reposts
                            newsInfo.Reposts = 0;
                            //comments
                            newsInfo.Comments = 0;

                            //新建数据行
                            DataRow dr = dttiebainfo.NewRow();
                            if (dvtieba.RowCount == 0)
                            {
                                dr[0] = 1;
                            }
                            else
                            {
                                dr[0] = int.Parse(dvtieba.Rows[dvtieba.RowCount - 1].Cells[0].Value.ToString()) + 1;
                            }
                            //dr[0] = newsInfo.Uid;
                            dr[1] = newsInfo.Title;
                            dr[2] = newsInfo.Contexts;
                            dr[3] = newsInfo.ReleaseDate;
                            dr[4] = newsInfo.InfoSource;
                            dr[5] = newsInfo.KeyWords.Substring(0, newsInfo.KeyWords.IndexOf("-"));
                            dr[6] = newsInfo.ReleaseName;
                            dr[7] = newsInfo.CollectDate;
                            dr[8] = newsInfo.Snapshot;
                            dr[9] = newsInfo.WebName;
                            dr[10] = newsInfo.Pid;
                            dr[11] = newsInfo.Part;
                            dr[12] = newsInfo.Reposts;
                            dr[13] = newsInfo.Comments;

                            //把行加到DT中
                            dttiebainfo.Rows.InsertAt(dr, 0);

                            //数据源刷新
                            if (dttiebainfo.Rows.Count >= 500)
                            {
                                dttiebainfo.Rows.RemoveAt(500);
                            }
                            this.BeginInvoke(new MethodInvoker(delegate()
                            {
                                dvtieba.Refresh();
                            }));
                        }
                        catch (Exception ex)
                        {
                            Comm.WriteErrorLog(ex.StackTrace);
                        }

                        ////总表刷新
                        //dt.Rows.Add(dr);
                        //dvAll.Refresh();

                        //得到插入语句
                        try
                        {
                            if (isThere)
                            {
                                continue;
                            }
                            else
                            {
                                sb.Append(ri.GetInsString(newsInfo) + ";");
                            }

                            //每10次执行一次插入数据库
                            if (sb.ToString().Length != 0)
                            {
                                if (i % 10 == 0)
                                {
                                    filterStr = sb.ToString();
                                    filterStr = filterStr.Replace("[ ", "[");
                                    filterStr = filterStr.Replace(" ]", "]");
                                    //执行插入
                                    cmd.ExecuteNonQuery(filterStr);
                                    //清除插入字段串
                                    sb.Clear();
                                    filterStr = "";
                                }
                            }
                        }
                        catch (Exception ex)
                        {
                            Comm.WriteErrorLog(ex.StackTrace);
                        }
                    }
                }
                #endregion
            }
            #endregion

            try
            {
                if (sb.ToString().Length != 0)
                {
                    filterStr = sb.ToString();
                    filterStr = filterStr.Replace("[ ", "[");
                    filterStr = filterStr.Replace(" ]", "]");
                    //执行插入
                    cmd.ExecuteNonQuery(filterStr);
                    //清除插入字段串
                    sb.Clear();
                    filterStr = "";
                }
            }
            catch (Exception ex)
            {
                Comm.WriteErrorLog(ex.StackTrace);
            }

            //执行完毕后,重新获取一次数据库的数据
            dtTieBaInfo = tri.SelReleaseInfo(DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"), DateTime.Now.AddDays(-15).ToString("yyyy-MM-dd HH:mm:ss"), "5");
            this.BeginInvoke(new MethodInvoker(delegate()
            {
                lbtieba.Text = "一轮搜索完毕!";
                lbtieba.ForeColor = Color.Red;
            }));
        }
예제 #7
0
파일: CrawlHtml.cs 프로젝트: wcgcw/Finder
        /// <summary>
        /// Builds a <see cref="ModelReleaseInfo"/> record for one crawled page.
        /// Event keywords are matched against <paramref name="html"/> first; only when that
        /// yields nothing are they matched against the tag-stripped <paramref name="webInfo"/>.
        /// </summary>
        /// <param name="html">Text that is tag-stripped into the title and searched (case-insensitively) for keywords.</param>
        /// <param name="url">Stored as the record's InfoSource.</param>
        /// <param name="dtkey">
        /// Keyword table. Columns 1 and 6 together identify an event (column 6 must parse as an
        /// integer); column 4 holds that event's space-separated keywords, all of which must occur
        /// in the text for the event to match.
        /// </param>
        /// <param name="sheng">Copied to Sheng; null becomes an empty string.</param>
        /// <param name="shi">Copied to Shi; null becomes an empty string.</param>
        /// <param name="xian">Copied to Xian; null becomes an empty string.</param>
        /// <param name="webName">Copied to WebName; null becomes an empty string.</param>
        /// <param name="webInfo">Text that is tag-stripped into Contexts and used as the fallback search text.</param>
        /// <param name="pid">Copied to Pid.</param>
        /// <returns>
        /// The populated record. If any step throws, the exception is logged and the record is
        /// returned with whatever fields had been assigned up to that point.
        /// </returns>
        public static ModelReleaseInfo CrawlHtmlSource(string html, string url, DataTable dtkey, string sheng, string shi, string xian, string webName, string webInfo, int pid)
        {
            // Matched events keyed by "<column 1>-<column 6>"; shared by both matching passes.
            Dictionary<string, int> events = new Dictionary<string, int>();
            // Create the data object.
            ModelReleaseInfo newsInfo = new ModelReleaseInfo();

            try
            {
                newsInfo.Title = HtmlUtil.NoHTML(html);

                // First pass: match keywords against the html text.
                string matched = MatchEventKeyWords(html, dtkey, events);
                if (matched.Length != 0)
                {
                    newsInfo.KeyWords += matched;
                }

                newsInfo.Contexts = HtmlUtil.NoHTML(webInfo);

                // Site link.
                newsInfo.InfoSource = url;

                // Second pass: nothing matched the html, so fall back to the stripped body text.
                if (string.IsNullOrEmpty(newsInfo.KeyWords))
                {
                    matched = MatchEventKeyWords(newsInfo.Contexts, dtkey, events);
                    if (matched.Length != 0)
                    {
                        newsInfo.KeyWords += matched;
                    }
                }

                // Drop the leading comma. The emptiness check matters: Substring(1) on an empty
                // string throws, which previously aborted the rest of the field assignments.
                if (!string.IsNullOrEmpty(newsInfo.KeyWords))
                {
                    newsInfo.KeyWords = newsInfo.KeyWords.Substring(1);
                }

                // Collection date.
                newsInfo.CollectDate = DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss");

                // Publisher and publish date are not available here; set them to empty by hand.
                newsInfo.ReleaseDate = "";
                newsInfo.ReleaseName = "";

                // Page snapshot: generated on user request elsewhere; empty here.
                newsInfo.Snapshot = "";
                newsInfo.Sheng = sheng ?? "";
                newsInfo.Shi = shi ?? "";
                newsInfo.Xian = xian ?? "";
                // Site name.
                newsInfo.WebName = webName ?? "";
                // pid
                newsInfo.Pid = pid;
                // part: positive/negative classification of the body text.
                newsInfo.Part = GetParts(newsInfo.Contexts);
                // reposts
                newsInfo.Reposts = 0;
                // comments
                newsInfo.Comments = 0;
            }
            catch (Exception ex)
            {
                // Log type, message and stack trace; the stack trace alone hides the cause.
                Comm.WriteErrorLog(ex.ToString());
            }

            return newsInfo;
        }

        /// <summary>
        /// Adds to <paramref name="events"/> every event from <paramref name="dtkey"/> whose keywords
        /// all occur in <paramref name="text"/>, then formats all entries of <paramref name="events"/>.
        /// </summary>
        /// <param name="text">Text to search; null is treated as empty.</param>
        /// <param name="dtkey">Keyword table (see <see cref="CrawlHtmlSource"/>).</param>
        /// <param name="events">Accumulator of matched event keys; keys already present are not re-tested.</param>
        /// <returns>",{name}-{id}" for each entry in <paramref name="events"/>, or an empty string when there are none.</returns>
        private static string MatchEventKeyWords(string text, DataTable dtkey, Dictionary<string, int> events)
        {
            // Lower-case the searched text once instead of once per keyword.
            string haystack = (text ?? string.Empty).ToLower();

            for (int j = 0; j < dtkey.Rows.Count; j++)
            {
                DataRow row = dtkey.Rows[j];
                string eventKey = row[1].ToString() + "-" + row[6].ToString();
                if (events.ContainsKey(eventKey))
                {
                    continue;
                }

                bool allPresent = true;
                foreach (string k in row[4].ToString().Split(new char[] { ' ' }))
                {
                    if (!haystack.Contains(k.ToLower()))
                    {
                        allPresent = false;
                        break;
                    }
                }

                if (allPresent)
                {
                    events.Add(eventKey, 1);
                }
            }

            string result = "";
            foreach (string eventKey in events.Keys)
            {
                // Split on the LAST '-' so an event name that itself contains '-' is kept whole
                // and the id is always read from the trailing segment.
                int sep = eventKey.LastIndexOf('-');
                result += "," + eventKey.Substring(0, sep) + "-" + int.Parse(eventKey.Substring(sep + 1));
            }

            return result;
        }