Esempio n. 1
0
        /// <summary>
        /// Click handler for the "Go" button: runs one grab session and reports the outcome in <c>txtMessage</c>.
        /// </summary>
        /// <remarks>
        /// Two modes are supported, selected by whether <c>txtListUrl</c> is empty:
        /// <list type="bullet">
        /// <item>ID mode: every ID in [start, end] is substituted into the <c>txtUrl</c> template and fetched.</item>
        /// <item>List mode: every page number in [start, end] is substituted into the <c>txtListUrl</c> template,
        /// the page is scanned for links matching the <c>txtUrl</c> template, and each linked ID is fetched.</item>
        /// </list>
        /// Any exception aborts the session; its message is shown in <c>txtMessage</c> and written to the log.
        /// NOTE(review): the loops call <c>Thread.Sleep</c> on the thread that raises the click event
        /// (presumably the UI thread), so the window is likely unresponsive while a session runs.
        /// </remarks>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void btnGo_Click(object sender, EventArgs e)
        {
            long     id      = 0;
            long     counter = 0;
            TraceLog log     = new TraceLog("GrabberLog.txt");

            try
            {
                log.AddLog("===============开始抓取===============");

                // Validate and normalise the input parameters.
                if (string.IsNullOrEmpty(txtUrl.Text))
                {
                    txtMessage.Text = "请输入要抓取的网页地址!";
                    return; // the finally block below still ends the log
                }
                if (!string.IsNullOrEmpty(txtOutput.Text))
                {
                    OUTPUT_PATH = txtOutput.Text;
                }
                if (!System.IO.Directory.Exists(OUTPUT_PATH))
                {
                    System.IO.Directory.CreateDirectory(OUTPUT_PATH);
                }
                long startID = Convert.ToInt64(txtStartID.Text);
                long endID   = Convert.ToInt64(txtEndID.Text);

                if (string.IsNullOrEmpty(txtListUrl.Text))
                {
                    // ID mode: request every ID in the range directly.
                    for (id = startID; id <= endID; id++)
                    {
                        System.Threading.Thread.Sleep(1000); // pause between requests
                        url = string.Format(txtUrl.Text, id);
                        // Fetch the content behind the detail URL.
                        if (CatchContent(id, url))
                        {
                            counter++;
                            log.AddLog(id.ToString());
                        }
                    }
                    // After the loop id == endID + 1, so id - startID is the number of requests issued.
                    txtMessage.Text = string.Format("抓取成功!累计请求{0},其中抓取有效数据{1}条,当前时间{2}",
                                                    id - startID, counter, DateTime.Now.ToString());
                }
                else
                {
                    // List mode: walk the list pages, collect the detail IDs linked from each page,
                    // then fetch every collected ID.
                    dataEncoding = cbbEncoding.Text;

                    // Turn the detail URL template into a regex. The template is formatted with a sentinel
                    // in place of the ID so that the literal text can be regex-escaped as a whole; the
                    // sentinel is then swapped for a named capture group holding the numeric ID. This keeps
                    // working when the template has text after {0} (e.g. ".html").
                    const string idSentinel = "\u0001";
                    string       strRegex   = Regex.Escape(string.Format(txtUrl.Text, idSentinel))
                                                   .Replace(idSentinel, "(?<id>\\d+)");
                    Regex        reg        = new Regex(strRegex);

                    // Prefix used to make relative detail URLs absolute.
                    string        siteRoot = GetSchemeAndHost(txtListUrl.Text);
                    List <string> listID   = new List <string>();

                    for (id = startID; id <= endID; id++)
                    {
                        System.Threading.Thread.Sleep(1000); // pause between list-page requests
                        url = string.Format(txtListUrl.Text, id);

                        // NOTE(review): "text\\html" evaluates to text\html; the usual MIME type is text/html.
                        // Left unchanged because WebAccess is not visible here; confirm what it expects.
                        data = WebAccess.Request(url, string.Empty, WebAccess.WebAccessMethod.POST, "text\\html", null, dataEncoding);
                        if (string.IsNullOrEmpty(data))
                        {
                            continue;
                        }

                        log.AddLog(string.Format("=============第{0}页=================", id));

                        // Collect the distinct IDs linked from this page, in order of first appearance.
                        foreach (Match match in reg.Matches(data))
                        {
                            string tempID = match.Groups["id"].Value;
                            if (!listID.Contains(tempID))
                            {
                                listID.Add(tempID);
                            }
                        }

                        // Download the detail pages as soon as the current list page has been scanned.
                        foreach (var item in listID)
                        {
                            System.Threading.Thread.Sleep(100);
                            url = string.Format(txtUrl.Text, item);
                            url = System.Web.HttpUtility.UrlDecode(url);
                            if (!url.Contains("http")) // relative URL: prefix it with the list page's scheme and host
                            {
                                url = string.Format("{0}/{1}", siteRoot, url.TrimStart('/'));
                            }
                            // IDs are 64-bit everywhere else in this handler, so parse them as such.
                            if (CatchContent(Convert.ToInt64(item), url))
                            {
                                counter++;
                                log.AddLog(item);
                            }
                        }
                        listID.Clear();
                    }
                    txtMessage.Text = string.Format("抓取成功!抓取有效数据{0}条,当前时间{1}", counter, DateTime.Now.ToString());
                }
                log.AddLog("============抓取结束==================");
            }
            catch (Exception ex)
            {
                txtMessage.Text = ex.Message;
                log.AddLog(ex.Message);
            }
            finally
            {
                // End the log on every exit path: success, validation failure, or exception.
                log.EndLog();
            }
        }

        /// <summary>
        /// Returns the leading "scheme://host[:port]" part of an absolute URL, without a trailing slash.
        /// </summary>
        /// <param name="absoluteUrl">URL (or URL template) to cut; must not be null.</param>
        /// <returns>
        /// Everything before the first '/', '?' or '#' that follows the "://" separator, or the whole
        /// input when no such character exists. If the input has no "://", the search starts at index 0.
        /// </returns>
        private static string GetSchemeAndHost(string absoluteUrl)
        {
            int schemeEnd = absoluteUrl.IndexOf("://", StringComparison.Ordinal);
            int hostStart = schemeEnd < 0 ? 0 : schemeEnd + 3;
            int hostEnd   = absoluteUrl.IndexOfAny(new[] { '/', '?', '#' }, hostStart);
            return hostEnd < 0 ? absoluteUrl : absoluteUrl.Substring(0, hostEnd);
        }