Example #1
0
        /// <summary>
        /// Form Load handler: reads the enabled site definitions from the ArticleParam table
        /// into <c>wInfoList</c> and then starts the <c>getTask</c> worker thread.
        /// </summary>
        /// <remarks>
        /// If reading or converting any row fails, the error is shown in a message box and the
        /// worker thread is not started; entries added before the failure stay in <c>wInfoList</c>.
        /// </remarks>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void CrawlerForm_Load(object sender, EventArgs e)
        {
            try
            {
                DataTable dt_param = db.selectDatas("SELECT * FROM getNetworkData..ArticleParam WHERE isqy=1");
                foreach (DataRow row in dt_param.Rows)
                {
                    WebInfo site = new WebInfo();
                    site.WebName = row["WebName"].ToString();
                    site.RkUrl   = row["RkUrl"].ToString();

                    RequestHeaders headers = new RequestHeaders();
                    headers.UserAgent = row["UserAgent"].ToString();
                    headers.Accept    = row["Accept"].ToString();
                    headers.Referer   = row["Referer"].ToString();
                    // Encoding.GetEncoding throws for an unknown charset name; that is reported by the catch below.
                    headers.Charset   = Encoding.GetEncoding(row["Charset"].ToString());
                    site.RequestInfo  = headers;

                    site.Pattern        = row["Pattern"].ToString();
                    site.PageNumPattern = row["PageNumPattern"].ToString();

                    // Use the value TryParse already produced instead of parsing the text a second time.
                    // When the column is not a valid integer, PageSize keeps WebInfo's default.
                    int pageSize;
                    if (int.TryParse(row["PageSize"].ToString(), out pageSize))
                    {
                        site.PageSize = pageSize;
                    }

                    wInfoList.Add(site);
                }
            }
            catch (Exception ecp)
            {
                MessageBox.Show("异常:" + ecp.Message);
                return;
            }
            //duibiTime = DateTime.Now.AddDays(-1).ToString("yyyy.MM.dd");
            getTaskThread = new Thread(getTask);
            getTaskThread.Start();
        }
Example #2
0
        /// <summary>
        /// Collection routine: fetches the search-result pages of one configured site for a
        /// keyword and queues an INSERT statement for each article that passes the keyword
        /// and cut-off-date (<c>duibiTime</c>) filters.
        /// </summary>
        /// <remarks>
        /// Paging stops when a page yields no matches, when <c>cClass.cfBool</c> returns null for
        /// the page, when an entry clears the continue flag (see the date checks below), or when
        /// the page limit is reached. Exceptions raised while fetching or parsing a page are
        /// written to the crawler log file and are not rethrown.
        /// </remarks>
        /// <param name="keyWords">Search keyword.</param>
        /// <param name="webName">Site name; must equal the WebName of an entry in <c>wInfoList</c>, otherwise nothing is done.</param>
        private void getArticleList(string keyWords, string webName)
        {
            WebInfo wInfo = wInfoList.Find(info => info.WebName == webName);
            if (wInfo == null)
            {
                return;
            }

            // These two sites receive the keyword URL-encoded as GB2312.
            string keyWordsTemp = keyWords;
            if (webName == "中国创新网" || webName == "万维家电")
            {
                keyWordsTemp = HttpUtility.UrlEncode(keyWords, Encoding.GetEncoding("GB2312"));
            }

            string rkUrl             = wInfo.RkUrl.Replace("[keyWords]", keyWordsTemp);
            bool   hasPageNumPattern = !string.IsNullOrEmpty(wInfo.PageNumPattern);

            int             pageTotal = 2;     // exclusive upper bound of the page loop; revised after every fetch
            bool            flag      = true;  // cleared when an entry asks for paging to stop
            MatchCollection mcLast    = null;  // value returned by the previous cClass.cfBool call

            // HTML of the most recently fetched page. It must outlive one loop iteration because
            // 中国创新网 takes the URL of page i from the "下一页" link of page i-1. (It used to be
            // reset to "" at the top of every iteration, so that link was never found.)
            string pageHtml = "";

            for (int i = 1; i < pageTotal && flag; i++)
            {
                string url = "";
                try
                {
                    // Build the URL of page i; the meaning of {p} differs per site.
                    if (webName == "百度学术" || webName == "techweb")
                    {
                        url = rkUrl.Replace("{p}", ((i - 1) * 10).ToString());
                    }
                    else if (webName == "光明家电")
                    {
                        url = rkUrl.Replace("{p}", (i - 1).ToString());
                    }
                    else if (webName == "中国创新网" && i > 1)
                    {
                        Regex r_rkUrl = new Regex("(?<=href=\").*?(?=\" class=\"f1\">下一页)");
                        url = r_rkUrl.Match(pageHtml).Value.Trim();
                        if (url == "")
                        {
                            // No "next page" link on the previous page: last page reached.
                            break;
                        }
                        url = "http://www.chinahightech.com/search/" + url;
                    }
                    else
                    {
                        url = rkUrl.Replace("{p}", i.ToString());
                    }
                    pageHtml = cClass.getHtmlCode(url, wInfo.RequestInfo);

                    #region Page limit
                    if (hasPageNumPattern)
                    {
                        string pageNumText = Regex.Match(pageHtml, wInfo.PageNumPattern).Value;
                        pageNumText = pageNumText.Replace(",", "").Replace(" ", "").Trim();

                        int resultCount;
                        // PageSize is 0 when the site configuration had no valid value; dividing by it
                        // would throw, so such a site falls back to the single-page limit.
                        if (wInfo.PageSize > 0 && int.TryParse(pageNumText, out resultCount))
                        {
                            pageTotal = resultCount / wInfo.PageSize;
                            if (pageTotal >= 10 && pageTotal < 50)
                            {
                                pageTotal = pageTotal / 2 + 1;
                            }
                            else if (pageTotal >= 50)
                            {
                                pageTotal = 11;
                            }
                        }
                        else
                        {
                            pageTotal = 2;
                        }
                    }
                    else
                    {
                        // No count available: allow one more page; the date checks below end the crawl.
                        pageTotal += 1;
                    }
                    #endregion

                    MatchCollection mList = Regex.Matches(pageHtml, wInfo.Pattern);
                    if (mList == null || mList.Count == 0)
                    {
                        break;
                    }

                    #region Process regex matches
                    mcLast = cClass.cfBool(mList, mcLast);
                    if (mcLast == null)
                    {
                        break;
                    }

                    foreach (Match match in mList)
                    {
                        try
                        {
                            ArticleInfo aInfo = new ArticleInfo();
                            aInfo.Url      = url;
                            aInfo.KeyWords = keyWords;
                            aInfo.WebName  = wInfo.WebName;

                            // Example of the captured text: 第一财经日报 2015-05-19 01:55:52
                            string sourceBy = match.Groups["sourceBy"].Value.Replace("\r", "").Replace("\n", "").Trim();
                            sourceBy = r.Replace(sourceBy, "").Trim();

                            #region Date checks
                            if (wInfo.WebName == "新浪科技" || wInfo.WebName == "techweb")
                            {
                                // On these sites the timestamp is embedded in the source text.
                                string riqi = Regex.Match(sourceBy, @"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}").Value;
                                DateTime stamp;
                                if (riqi.Length >= 10 && DateTime.TryParse(riqi.Substring(0, 10), out stamp))
                                {
                                    aInfo.Riqi = stamp.ToString("yyyy.MM.dd");
                                }
                                // NOTE(review): when no valid timestamp was found Riqi keeps its default;
                                // if that is null, Convert.ToDateTime yields DateTime.MinValue and the entry
                                // ends the crawl. Kept as before - confirm this is intended.
                                if (Convert.ToDateTime(aInfo.Riqi) < Convert.ToDateTime(duibiTime))
                                {
                                    flag = false;
                                    break;
                                }
                                aInfo.SourceBy = sourceBy.Replace(riqi, "").Trim();
                            }
                            else
                            {
                                DateTime articleDate = Convert.ToDateTime(match.Groups["riqi"].Value.Trim());
                                aInfo.Riqi     = articleDate.ToString("yyyy.MM.dd");
                                aInfo.SourceBy = sourceBy;

                                if (wInfo.WebName == "百度学术")
                                {
                                    // Compare by year only. The previous code ran int.Parse on the
                                    // "yyyy.MM.dd" text, which always threw and therefore stopped the
                                    // crawl at the first entry.
                                    // NOTE(review): assumes the "riqi" group holds a full date for this site - confirm.
                                    int  cutoffYear  = 0;
                                    bool cutoffKnown = duibiTime != null
                                                       && duibiTime.Length >= 4
                                                       && int.TryParse(duibiTime.Substring(0, 4), out cutoffYear);
                                    if (!cutoffKnown || articleDate.Year < cutoffYear)
                                    {
                                        flag = false;
                                        break;
                                    }
                                }
                                else if (!hasPageNumPattern)
                                {
                                    // Sites without a result count are cut off by date only.
                                    if (Convert.ToDateTime(aInfo.Riqi) < Convert.ToDateTime(duibiTime))
                                    {
                                        flag = false;
                                        break;
                                    }
                                }
                            }
                            #endregion

                            aInfo.Title   = r.Replace(match.Groups["title"].Value, "").Trim();
                            aInfo.Content = r.Replace(match.Groups["content"].Value, "").Trim();

                            // Keep only entries that mention the keyword and do not contain "车".
                            string text = aInfo.Title + aInfo.Content;
                            if (!containStr(text, aInfo.KeyWords) || text.Contains("车"))
                            {
                                continue;
                            }
                            if (Convert.ToDateTime(aInfo.Riqi) < Convert.ToDateTime(duibiTime))
                            {
                                continue;
                            }

                            #region Build INSERT statement
                            aInfo.Click = match.Groups["click"].Value.Trim();

                            aInfo.SourceUrl = r.Replace(match.Groups["sourceUrl"].Value.Trim(), "").Trim();
                            if (webName == "千龙网")
                            {
                                aInfo.SourceUrl = "http://www.chinaso.com" + aInfo.SourceUrl;
                            }
                            else if (webName == "中国创新网")
                            {
                                aInfo.SourceUrl = "http://www.chinahightech.com/" + aInfo.SourceUrl;
                            }

                            // The values come from scraped pages, so quotes are escaped before they are
                            // embedded in the statement. Kid and Did are fields of this class and are
                            // left as they were.
                            // NOTE(review): a parameterised command would be the proper fix, but the only
                            // visible DB entry point (ExecuteSqlTran) takes raw SQL strings.
                            string insertSql = "INSERT INTO getNetworkData..ArticleTable_刘大任(Kid,Did,抓取网页,来源,访问数量,日期,标题,内容摘要,关键词,新闻链接,渠道) VALUES('" + Kid + "','" + Did + "','" + escapeSqlText(aInfo.Url) + "','" + escapeSqlText(aInfo.SourceBy) + "','" + escapeSqlText(aInfo.Click) + "','" + escapeSqlText(aInfo.Riqi) + "','" + escapeSqlText(aInfo.Title) + "','" + escapeSqlText(aInfo.Content) + "','" + escapeSqlText(keyWords) + "','" + escapeSqlText(aInfo.SourceUrl) + "','" + escapeSqlText(aInfo.WebName) + "')";
                            #endregion

                            #region Queue statement and refresh progress label
                            lock (objLockSql)
                            {
                                SQLStringList.Add(insertSql);
                                cjCount += 1;
                                // Flush in batches of 200; the list is only reset when the transaction succeeds.
                                if (SQLStringList.Count >= 200 && db.ExecuteSqlTran(SQLStringList))
                                {
                                    SQLStringList = new ArrayList();
                                }
                                // NOTE(review): Invoke blocks on the UI thread while objLockSql is held - confirm
                                // the UI thread never waits on this lock.
                                this.Invoke(new ThreadStart(delegate()
                                {
                                    this.label2.Text = "当前已处理关键词:[" + keyWordsCountDone + " / " + keyWordsCount + "]  共采集数据:" + cjCount + " 条";
                                }));
                            }
                            #endregion
                        }
                        catch (Exception ecp)
                        {
                            // A bad entry is logged and skipped; the remaining matches are still processed.
                            writeLog(@"C:\Log_ArticleCrawler.log",
                                     DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss") + " 采集方法异常:" + ecp.Message + " url=[" + url + "]\r\n");
                        }
                    }
                    #endregion
                }
                catch (Exception ecp)
                {
                    // A failed page is logged; the loop then moves on to the next page number.
                    writeLog(@"C:\Log_ArticleCrawler.log",
                             DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss") + " 采集方法异常:" + ecp.Message + "\r\n");
                }
            }
        }

        /// <summary>
        /// Makes a text value safe to place between single quotes in a SQL statement by
        /// doubling every single quote; null becomes an empty string.
        /// </summary>
        /// <param name="value">Raw text value.</param>
        /// <returns>The text with every <c>'</c> replaced by <c>''</c>.</returns>
        private static string escapeSqlText(string value)
        {
            return value == null ? "" : value.Replace("'", "''");
        }