/// <summary>
/// Placeholder for the censorship test of a single crawl entry.
/// </summary>
/// <remarks>
/// Entries whose <c>Depth</c> is 0 are never tested. For every other entry the
/// actual check is still to be written; until then the method has no effect.
/// </remarks>
/// <param name="spiderInfo">The crawl entry to examine; only its <c>Depth</c> is read.</param>
void CheckIfCensored(SpiderInfo spiderInfo)
{
    bool atDepthZero = spiderInfo.Depth == 0;

    // no test needed for depth-0 entries
    if (!atDepthZero)
    {
        try
        {
            // TODO: censored
        }
        catch (Exception)
        {
            // Best-effort: a failure of the (not yet implemented) check is ignored.
        }
    }
}
/// <summary>
/// Work item for one crawl entry: resolves the host through <c>openDnsResolver</c>,
/// requests "/" over a raw TCP connection to port 80, and appends every host matched
/// by <c>hrefMatch</c> that is not yet in <c>spidercheck</c> to <c>spiderlist</c>.
/// </summary>
/// <remarks>
/// <para>
/// <c>running</c> is incremented on entry; <c>pooled</c> and <c>running</c> are
/// decremented exactly once on every exit path, including exceptional ones.
/// </para>
/// <para>
/// Resolution failure (no address) and connect failure end the method silently.
/// Any exception while reading or recording links sets <c>ReadError</c> on the entry.
/// <c>lastfinshed</c> is updated only when the request was actually sent.
/// </para>
/// </remarks>
/// <param name="o">The <c>SpiderInfo</c> to crawl, passed as <see cref="object"/>.</param>
private void FindNewUrls(object o)
{
    var spiderInfo = (SpiderInfo)o;

    // The counters are touched by many work items at once, so plain ++/-- can lose
    // updates. NOTE(review): assumes running and pooled are int/long fields - confirm.
    Interlocked.Increment(ref running);
    try
    {
        // We cannot use WebClient or similar, since we cannot rely on the DNS resolution!
        // FirstOrDefault instead of First: an empty answer must reach the null check
        // below rather than throw InvalidOperationException.
        IPAddress ip = DNSHelper.ResolveUri(openDnsResolver, spiderInfo.URL).FirstOrDefault();

        //check for censorship
        CheckIfCensored(spiderInfo);

        if (ip == null)
        {
            // Invalid Response
            return;
        }

        // Disposing the client also closes its network stream, so the socket is
        // released on every path out of this block.
        using (TcpClient client = new TcpClient())
        {
            try
            {
                client.Connect(ip, 80);
            }
            catch (Exception)
            {
                // Host unreachable: skip it without flagging an error.
                return;
            }

            var stream = client.GetStream();

            //Send Request
            TextWriter tw = new StreamWriter(stream);
            // HTTP requires CRLF line endings regardless of the platform default.
            tw.NewLine = "\r\n";
            tw.WriteLine("GET / HTTP/1.1");
            tw.WriteLine("Host: " + spiderInfo.URL);
            tw.WriteLine("User-Agent: Mozilla/5.0 (compatible; zensorchecker/" + this.GetType().Assembly.GetName().Version.ToString() + "; http://zensorchecker.origo.ethz.ch/)");
            // Ask the server to close after the response; otherwise the read loop
            // below only ends when the keep-alive connection times out.
            tw.WriteLine("Connection: close");
            tw.WriteLine();
            tw.Flush();

            TextReader document = new StreamReader(stream);
            string line;
            try
            {
                while ((line = document.ReadLine()) != null)
                {
                    foreach (Match m in hrefMatch.Matches(line))
                    {
                        // Keep everything up to the first '/'; the appended '/'
                        // guarantees IndexOf never returns -1.
                        string href = m.Value + "/";
                        string url = href.Substring(0, href.IndexOf('/'));

                        // Check-then-add must be atomic across concurrent work items.
                        // NOTE(review): assumes spidercheck is a private, non-concurrent
                        // dictionary that nothing else locks on - confirm.
                        lock (spidercheck)
                        {
                            if (!spidercheck.ContainsKey(url))
                            {
                                spiderlist.Add(new SpiderInfo(url, spiderInfo.Depth + 1));
                                spidercheck.Add(url, true);
                            }
                        }
                    }
                }
            }
            catch (Exception)
            {
                spiderInfo.ReadError = true;
            }

            lastfinshed = spiderInfo;
        }
    }
    finally
    {
        Interlocked.Decrement(ref pooled);
        Interlocked.Decrement(ref running);
    }
}
/// <summary>
/// Dispatch loop: walks <c>spiderlist</c> in order and queues <c>FindNewUrls</c>
/// on the thread pool for each entry while <c>crawl</c> is true.
/// </summary>
/// <remarks>
/// <para>
/// When every known entry has been queued, or 100 work items are already pending,
/// the loop sleeps 500 ms, prints a status line to the console and retries.
/// </para>
/// <para>
/// <c>spiderlist</c> must contain at least one entry when this is called: the
/// first element is read unconditionally to seed <c>lastfinshed</c>.
/// </para>
/// </remarks>
public void CrawlSpiderList()
{
    // Upper bound on work items that are queued but not yet finished.
    const int maxPooled = 100;

    int index = 0;
    lastfinshed = spiderlist[0];

    while (crawl)
    {
        if (index >= spiderlist.Count || pooled >= maxPooled)
        {
            Thread.Sleep(500);
            Console.WriteLine("status: i" + (index - 1) + "|" + lastfinshed.URL + "|" + lastfinshed.Depth + "|c" + spiderlist.Count + "|" + running + "/" + pooled);
            continue;
        }

        // Queued work items decrement pooled from thread-pool threads, so the
        // increment must be atomic or the throttle count drifts.
        // NOTE(review): assumes pooled is an int/long field - confirm.
        Interlocked.Increment(ref pooled);
        ThreadPool.QueueUserWorkItem(FindNewUrls, spiderlist[index]);
        index++;
    }
}