Code example #1
void CheckIfCensored(SpiderInfo spiderInfo)
{
    if (spiderInfo.Depth == 0)
    {
        // Seed entries (depth 0) need no test.
        return;
    }
    try
    {
        // TODO: censored
    }
    catch (Exception)
    {
        // Failures during the check are swallowed.
    }
}
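The body of CheckIfCensored is censored in the source. Purely as a hypothetical sketch of what a DNS-based check could look like (the IsCensored property and the comparison below are assumptions, not the project's actual code), the try block might compare the open resolver's answer against the system resolver's:

    // Hypothetical only: the real body is censored in the source.
    // Compare the open resolver's answer with the system resolver's answer;
    // a mismatch suggests DNS-level tampering. SpiderInfo.IsCensored is an
    // assumed property, not shown anywhere in the source.
    IPAddress openAnswer = DNSHelper.ResolveUri(openDnsResolver, spiderInfo.URL).FirstOrDefault();
    IPAddress localAnswer = Dns.GetHostAddresses(spiderInfo.URL).FirstOrDefault();
    spiderInfo.IsCensored = openAnswer != null && !openAnswer.Equals(localAnswer);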
Code example #2
private void FindNewUrls(object o)
{
    var spiderInfo = (SpiderInfo)o;
    Interlocked.Increment(ref running);

    // We cannot use WebClient or similar, since we cannot rely on the
    // system DNS resolution: resolve through the open resolver instead
    // and connect to the raw IP.
    TcpClient client = new TcpClient();
    IPAddress ip = DNSHelper.ResolveUri(openDnsResolver, spiderInfo.URL).FirstOrDefault();

    // Check for censorship.
    CheckIfCensored(spiderInfo);

    if (ip == null)
    {
        // Invalid response: the name could not be resolved.
        Interlocked.Decrement(ref pooled);
        Interlocked.Decrement(ref running);
        return;
    }

    try
    {
        client.Connect(ip, 80);
    }
    catch (Exception)
    {
        Interlocked.Decrement(ref pooled);
        Interlocked.Decrement(ref running);
        return;
    }

    // Send a minimal HTTP request by hand; since we connected by IP,
    // the Host header carries the original name.
    TextWriter tw = new StreamWriter(client.GetStream());
    tw.NewLine = "\r\n"; // HTTP requires CRLF line endings
    tw.WriteLine("GET / HTTP/1.1");
    tw.WriteLine("Host: " + spiderInfo.URL);
    tw.WriteLine("User-Agent: Mozilla/5.0 (compatible; zensorchecker/"
        + this.GetType().Assembly.GetName().Version.ToString()
        + "; http://zensorchecker.origo.ethz.ch/)");
    tw.WriteLine();
    tw.Flush();

    // Scan the response line by line for href targets and queue every
    // host we have not seen before.
    TextReader document = new StreamReader(client.GetStream());
    string line;
    try
    {
        while ((line = document.ReadLine()) != null)
        {
            MatchCollection mc = hrefMatch.Matches(line);
            foreach (Match m in mc)
            {
                // Keep only the host part: everything up to the first '/'
                // (the appended '/' guarantees IndexOf finds one).
                string href = m.Value + "/";
                string url = href.Substring(0, href.IndexOf('/'));

                // This runs on thread-pool threads, so access to the
                // shared collections must be synchronized.
                lock (spidercheck)
                {
                    if (!spidercheck.ContainsKey(url))
                    {
                        spiderlist.Add(new SpiderInfo(url, spiderInfo.Depth + 1));
                        spidercheck.Add(url, true);
                    }
                }
            }
        }
    }
    catch (Exception)
    {
        spiderInfo.ReadError = true;
    }

    client.Close();
    lastfinshed = spiderInfo;
    Interlocked.Decrement(ref pooled);
    Interlocked.Decrement(ref running);
}
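The examples reference a SpiderInfo type and several fields of the enclosing class that the source never shows. A minimal sketch, inferred from usage above (all names kept from the source; all types and initial values are assumptions):

    // Inferred from usage; the real declarations are not in the source.
    class SpiderInfo
    {
        public string URL;
        public int Depth;
        public bool ReadError;

        public SpiderInfo(string url, int depth)
        {
            URL = url;
            Depth = depth;
        }
    }

    // Fields of the enclosing crawler class, as used by the methods shown:
    List<SpiderInfo> spiderlist = new List<SpiderInfo>();                  // work queue
    Dictionary<string, bool> spidercheck = new Dictionary<string, bool>(); // de-duplication set
    Regex hrefMatch = new Regex("(?<=href=\"http://)[^\"]+");              // assumed pattern
    IPAddress openDnsResolver = IPAddress.Parse("208.67.222.222");         // assumed: an OpenDNS server
    SpiderInfo lastfinshed;
    volatile bool crawl = true;
    int pooled, running;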
Code example #3
public void CrawlSpiderList()
{
    int index = 0;
    lastfinshed = spiderlist[0];
    while (crawl)
    {
        // Throttle: wait while the worklist is drained or 100 work items
        // are already queued on the thread pool, and print a status line.
        if (index >= spiderlist.Count || pooled >= 100)
        {
            Thread.Sleep(500);
            Console.WriteLine("status: i" + (index - 1) + "|" + lastfinshed.URL + "|"
                + lastfinshed.Depth + "|c" + spiderlist.Count + "|" + running + "/" + pooled);
            continue;
        }

        Interlocked.Increment(ref pooled);
        ThreadPool.QueueUserWorkItem(FindNewUrls, spiderlist[index]);
        index++;
    }
}
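DNSHelper.ResolveUri is also not shown. The comment in example #2 explains its purpose: the system resolver cannot be trusted, so queries must go straight to a chosen resolver. A rough, simplified sketch of what such a helper could look like, sending a single A-record query over UDP (no retries or truncation handling, and it assumes the answer uses a name-compression pointer; none of this is the project's actual implementation):

    // Hypothetical sketch, not the project's actual DNSHelper.
    using System;
    using System.Collections.Generic;
    using System.Net;
    using System.Net.Sockets;

    static class DNSHelper
    {
        // Sends one DNS A-record query directly to `resolver`, bypassing
        // the operating system's (possibly censored) resolver.
        public static IEnumerable<IPAddress> ResolveUri(IPAddress resolver, string host)
        {
            // Build a minimal query: 12-byte header + QNAME + QTYPE=A + QCLASS=IN.
            var query = new List<byte>
            {
                0x12, 0x34,             // transaction ID (arbitrary)
                0x01, 0x00,             // flags: standard query, recursion desired
                0x00, 0x01,             // QDCOUNT = 1
                0x00, 0x00, 0x00, 0x00, // ANCOUNT, NSCOUNT
                0x00, 0x00              // ARCOUNT
            };
            foreach (string label in host.Split('.'))
            {
                query.Add((byte)label.Length);
                foreach (char c in label) query.Add((byte)c);
            }
            query.Add(0x00);                                       // end of QNAME
            query.AddRange(new byte[] { 0x00, 0x01, 0x00, 0x01 }); // QTYPE=A, QCLASS=IN

            using (var udp = new UdpClient())
            {
                udp.Client.ReceiveTimeout = 5000;
                udp.Send(query.ToArray(), query.Count, new IPEndPoint(resolver, 53));
                var remote = new IPEndPoint(IPAddress.Any, 0);
                byte[] response = udp.Receive(ref remote);

                int answerCount = (response[6] << 8) | response[7];

                // Skip the echoed question section.
                int pos = 12;
                while (response[pos] != 0) pos += response[pos] + 1;
                pos += 5; // terminating zero + QTYPE + QCLASS

                for (int i = 0; i < answerCount; i++)
                {
                    pos += 2; // NAME (assumed to be a 2-byte compression pointer)
                    int type = (response[pos] << 8) | response[pos + 1];
                    pos += 8; // TYPE, CLASS, TTL
                    int rdlength = (response[pos] << 8) | response[pos + 1];
                    pos += 2;
                    if (type == 1 && rdlength == 4) // A record: 4-byte IPv4 address
                        yield return new IPAddress(new[] {
                            response[pos], response[pos + 1],
                            response[pos + 2], response[pos + 3] });
                    pos += rdlength;
                }
            }
        }
    }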