Example #1
0
 /// <summary>
 /// Downloads the page at <paramref name="url"/>, saves the HTML to a file
 /// named after the current page count, raises <c>PageDownloaded</c>, and
 /// hands the HTML to <c>Parse</c> to extract further links.
 /// Failures are logged and skipped (best-effort crawl).
 /// </summary>
 /// <param name="url">Crawl state for the page; its <c>html</c> field is filled in on success.</param>
 public void Process(UrlStates url)
 {
     try
     {
         // WebClient is IDisposable — dispose it so the underlying
         // connection resources are released promptly.
         using (WebClient webClient = new WebClient())
         {
             webClient.Encoding = Encoding.UTF8;
             string html     = webClient.DownloadString(url.url);
             // NOTE(review): file name is just the shared page counter —
             // concurrent Process calls could collide; confirm `count` handling.
             string fileName = count.ToString();
             File.WriteAllText(fileName, html, Encoding.UTF8);
             url.html = html;
             PageDownloaded(this, url);
             Parse(html, url.url); // parse the page and enqueue newly discovered links
         }
     }
     catch (Exception ex)
     {
         // Best-effort: a failed page is skipped, but record why instead of
         // swallowing the exception silently (empty catch hides real bugs).
         Console.Error.WriteLine($"Failed to process {url.url}: {ex.Message}");
     }
 }
Example #2
0
        /// <summary>
        /// Seeds the crawl with <c>startUrl</c>, derives the site prefix used to
        /// filter links (<c>startWith</c>), then repeatedly scans the URL list,
        /// spawning a worker thread per unprocessed URL until the page counter
        /// exceeds 20.
        /// </summary>
        public void Crawl()
        {
            UrlStates surl = new UrlStates()
            {
                url = startUrl, processing = false, html = ""
            };

            urls.Add(surl);

            // Capture the scheme-less host prefix of the start URL, e.g.
            // "www.example.com/" — used elsewhere to keep the crawl on-site.
            string str = @"(www\.){0,1}.*?\..*?/";
            Regex  r   = new Regex(str);
            Match  m   = r.Match(startUrl);

            startWith = m.Value;

            // Original code looped `while (true)` and only broke out of the
            // inner foreach, busy-spinning forever; terminate the outer loop
            // on the same page-count condition.
            while (count <= 20)
            {
                // NOTE(review): worker threads (via Parse) presumably append to
                // `urls` while this thread enumerates it — confirm the list is
                // synchronized, otherwise this can throw or miss entries.
                foreach (var url in urls)
                {
                    if (count > 20)
                    {
                        break;
                    }
                    if (url.processing)
                    {
                        continue;
                    }

                    // Copy into a per-iteration local so the lambda below
                    // captures THIS url, not a variable that is reassigned on
                    // the next iteration (closure-capture race in the original).
                    UrlStates current = url;
                    current.processing = true;
                    var t = new Thread(() => Process(current));
                    t.Start();
                    count++;
                }
            }
        }