Example #1
0
 /// <summary>
 /// Adds <paramref name="child"/> to this site's <c>children</c> collection.
 /// A null child is silently ignored.
 /// </summary>
 /// <param name="child">The child site to record; may be null.</param>
 public void addorSetChild(siteChecked child)
 {
     // Guard clause: nothing to record for a missing child.
     if (child == null)
     {
         return;
     }

     children.Add(child);
 }
Example #2
0
        /// <summary>
        /// Reports every entry waiting in <c>crawledQue</c> via <see cref="showBase"/>,
        /// persists the first reported site with <see cref="saveData"/>, and finally
        /// prints <c>mark</c> to the console on its own line.
        /// </summary>
        /// <remarks>
        /// The queue is drained by this call: each <see cref="showBase"/> invocation
        /// dequeues one entry.
        /// </remarks>
        public void crawledResultShow()
        {
            // showBase() dequeues unconditionally, so only call it when there is
            // something to report; otherwise an empty crawl would throw before the
            // closing mark is printed.
            if (crawledQue.Count != 0)
            {
                siteChecked site = showBase();

                // Only the first reported site is written out with saveData.
                saveData(site);
            }

            while (crawledQue.Count != 0)
            {
                showBase();
            }

            Console.WriteLine("\n" + mark);
        }
Example #3
0
        /// <summary>
        /// Program entry point: seeds a crawler with a single start URL at level 0,
        /// runs the crawl with a level limit of 1, and then shows the results.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            Crawl crawler = new Crawl { levelLimit = 1 };

            // The seed site has level 0 and no parent.
            siteChecked seed = new siteChecked(@"https://www.qunar.com/", 0, null);
            crawler.crawlQueue.Enqueue(seed);

            crawler.startCrawl();
            crawler.crawledResultShow();
        }
Example #4
0
        /// <summary>
        /// Extracts the contents of the first <c>&lt;title&gt;</c> element in
        /// <paramref name="html"/> and stores it on <paramref name="site"/> via
        /// <c>setTitle</c>. An empty title is stored when there is no markup or no
        /// title element is found.
        /// </summary>
        /// <param name="site">The site whose title is being set.</param>
        /// <param name="html">The page markup; may be null or empty.</param>
        private void getTitle(siteChecked site, string html)
        {
            // Regex.Match throws on a null input, so treat missing markup as "no title".
            if (string.IsNullOrEmpty(html))
            {
                site.setTitle("");
                return;
            }

            // Lazy ".*?" stops at the first closing tag instead of the last one on the
            // line; Singleline lets the title span line breaks; IgnoreCase accepts
            // upper-case <TITLE> tags.
            Regex rt = new Regex(
                "<title[^>]*?>(?<title>.*?)</title>",
                RegexOptions.IgnoreCase | RegexOptions.Singleline);
            Match m = rt.Match(html);

            // Trim because a title that spans lines usually carries surrounding whitespace.
            site.setTitle(m.Success ? m.Groups["title"].Value.Trim() : "");
        }
Example #5
0
        /// <summary>
        /// Parses <paramref name="html"/> for URLs (using the host of
        /// <paramref name="head"/>) and, for every URL not contained in
        /// <c>crawled</c>, creates a child site one level below
        /// <paramref name="head"/>, attaches it to <paramref name="head"/> and
        /// enqueues it on <c>crawlQueue</c>.
        /// </summary>
        /// <param name="head">The page the markup was downloaded from; becomes the parent of each new site.</param>
        /// <param name="html">The markup handed to <c>parseForUrl</c>.</param>
        public void urlQueue(siteChecked head, string html)
        {
            HashSet<string> links = parseForUrl(head.getHost(), html);

            foreach (string link in links)
            {
                // Skip anything already present in the crawled collection.
                if (crawled.Contains(link))
                {
                    continue;
                }

                siteChecked discovered = new siteChecked(link, head.getLevel() + 1, head);
                head.addorSetChild(discovered);
                crawlQueue.Enqueue(discovered);
            }
        }
Example #6
0
        /// <summary>
        /// Writes a report for <paramref name="site"/> to a file named after the
        /// site's title (or <c>"temp"</c> when the title is missing or blank). The
        /// file holds the site's URL followed by one line per child containing the
        /// child's correct code and error code separated by a tab.
        /// Nothing is written when the site has no children.
        /// </summary>
        /// <param name="site">The site whose children are reported.</param>
        public void saveData(siteChecked site)
        {
            HashSet<siteChecked> children = site.getChildren();
            string name = site.getTitle();

            // No children means there is nothing to report, so no file is created.
            if (children == null || children.Count == 0)
            {
                return;
            }

            if (name != null)
            {
                // Page titles routinely contain characters that are illegal in file
                // names (e.g. '|', ':', '/'); replace them so FileStream does not throw.
                foreach (char invalid in Path.GetInvalidFileNameChars())
                {
                    name = name.Replace(invalid, '_');
                }
            }
            if (name == null || name.Trim().Length == 0)
            {
                name = "temp";
            }

            // 'using' guarantees the file handle is released even if a write fails.
            using (StreamWriter result = new StreamWriter(new FileStream(name, FileMode.Create), Encoding.Default))
            {
                result.WriteLine("Url: " + site.getUrl());
                foreach (siteChecked child in children)
                {
                    result.WriteLine(child.getCorrectCode() + "\t" + child.getErrorCode());
                }
            }
        }
Example #7
0
        /// <summary>
        /// Dequeues the next site from <c>crawledQue</c>, prints its details to the
        /// console and appends the same lines to <c>statusShow.txt</c> (written with
        /// <see cref="Encoding.Default"/>).
        /// </summary>
        /// <returns>The site that was dequeued and reported.</returns>
        public siteChecked showBase()
        {
            siteChecked site = crawledQue.Dequeue();

            // 'using' releases the append handle even when a write or getter throws;
            // otherwise the next call could fail to reopen the file.
            using (StreamWriter sr = new StreamWriter(new FileStream("statusShow.txt", FileMode.Append), Encoding.Default))
            {
                // Build the report once so the console and the file cannot drift apart.
                // The leading empty entry is the blank separator line.
                string[] report =
                {
                    "",
                    "CrawledUrl: " + site.getUrl(),
                    "Title: " + site.getTitle(),
                    "Host: " + site.getHost(),
                    "Level: " + site.getLevel(),
                    "CorrectCode: " + site.getCorrectCode(),
                    "ErrorCode: " + site.getErrorCode(),
                    "ChildSite: " + site.getChildCount()
                };

                foreach (string line in report)
                {
                    Console.WriteLine(line);
                }
                foreach (string line in report)
                {
                    sr.WriteLine(line);
                }
            }

            return(site);
        }
Example #8
0
 /// <summary>
 /// Creates a site record from its URL, its level and its parent site.
 /// </summary>
 /// <param name="url">The URL of this site.</param>
 /// <param name="level">The level stored for this site.</param>
 /// <param name="parent">The parent site; may be null.</param>
 public siteChecked(string url, int level, siteChecked parent)
 {
     this.parent = parent;
     this.level = level;
     this.url = url;
 }
Example #9
0
        /// <summary>
        /// Moves the entry at the front of <c>crawlQueue</c> to the back of
        /// <c>crawledQue</c>.
        /// </summary>
        private void dequeue()
        {
            crawledQue.Enqueue(crawlQueue.Dequeue());
        }
Example #10
0
        /// <summary>
        /// Downloads the page at the URL of <paramref name="site"/> and returns it
        /// decoded to a string. The request host and the response status code are
        /// recorded on <paramref name="site"/>; failures are recorded through
        /// <c>setErrorCode</c>.
        /// </summary>
        /// <param name="site">The site to download; updated with host, status and error information.</param>
        /// <returns>
        /// The decoded markup; an empty string when the site has no URL; or null when
        /// the download failed or no usable charset could be determined.
        /// </returns>
        public string downloadhtml(siteChecked site)
        {
            string url = site.getUrl();

            if (url == null || url.Length == 0)
            {
                return("");
            }
            string htmlDecoded = "";

            try
            {
                HttpWebRequest req = WebRequest.Create(url) as HttpWebRequest;
                if (req == null)
                {
                    // WebRequest.Create returned a non-HTTP request (e.g. ftp:, file:).
                    // Non-standard: reported the same way as an invalid URL.
                    site.setErrorCode(WebExceptionStatus.UnknownError);
                    return(null);
                }
                req.UserAgent   = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0";
                req.Credentials = CredentialCache.DefaultNetworkCredentials;

                // 'using' closes the response (and its stream) on every path,
                // including when decoding throws.
                using (HttpWebResponse resp = req.GetResponse() as HttpWebResponse)
                {
                    site.setHost(req.Headers["Host"]);
                    site.setCorrectCode(resp.StatusCode);
                    Stream   respStream = resp.GetResponseStream();
                    Encoding code       = analyzedCode(resp);
                    if (code != null)
                    {
                        using (StreamReader sr = new StreamReader(respStream, code))
                        {
                            htmlDecoded = sr.ReadToEnd();
                        }
                    }
                    else
                    {
                        byte[] htmlByte = getContentByte(respStream);
                        // Decode as UTF-8 first, only to inspect the <meta>/<?xml ...?>
                        // declaration and find the charset by hand.
                        string htmlForMeta  = Encoding.GetEncoding("utf-8").GetString(htmlByte);
                        string meta_charset = "(<meta[^>]*charset=(['\"]|)(?<charset>[^>\"']*)[\\s\\S]*?>)|(xml[^>]*encoding=('|\")(?<charset>[^>\"']*)[\\S\\s]*?>)";
                        Regex  reg          = new Regex(meta_charset, RegexOptions.IgnoreCase);
                        Match  m            = reg.Match(htmlForMeta);
                        string codeType     = m.Success ? m.Result("${charset}").Trim() : "";

                        // An unquoted declaration such as <meta charset=utf-8 /> captures
                        // trailing junk ("utf-8 /"); keep only the leading charset token.
                        int cut = codeType.IndexOfAny(new char[] { ' ', '\t', '\r', '\n', ';', '/' });
                        if (cut >= 0)
                        {
                            codeType = codeType.Substring(0, cut);
                        }

                        Encoding detected = null;
                        if (codeType != "")
                        {
                            try
                            {
                                detected = Encoding.GetEncoding(codeType);
                            }
                            catch (ArgumentException)
                            {
                                // Unknown charset name: handled like a missing charset.
                                detected = null;
                            }
                            catch (NotSupportedException)
                            {
                                // Charset not available on this platform.
                                detected = null;
                            }
                        }

                        if (detected == null)
                        {
                            // Non-standard use of NoContent: here it means "no usable charset".
                            site.setCorrectCode(HttpStatusCode.NoContent);
                            htmlDecoded = null;
                        }
                        else
                        {
                            htmlDecoded = detected.GetString(htmlByte);
                        }
                    }
                }
            }
            catch (UriFormatException)
            {
                // Non-standard use of UnknownError: here it means "invalid URL".
                site.setErrorCode(WebExceptionStatus.UnknownError);
                htmlDecoded = null;
            }
            catch (NotSupportedException)
            {
                // URL scheme with no registered handler (e.g. mailto:, javascript:);
                // reported the same way as an invalid URL.
                site.setErrorCode(WebExceptionStatus.UnknownError);
                htmlDecoded = null;
            }
            catch (WebException ee)
            {
                site.setErrorCode(ee.Status);
                // Protocol errors carry a response that must be closed to free the connection.
                if (ee.Response != null)
                {
                    ee.Response.Close();
                }
                htmlDecoded = null;
            }
            return(htmlDecoded);
        }