/// <summary>
/// Registers <paramref name="child"/> in this site's <c>children</c> collection.
/// </summary>
/// <param name="child">The child site to add; a null reference is ignored.</param>
public void addorSetChild(siteChecked child)
{
    // Nothing to register for a missing child.
    if (child == null)
    {
        return;
    }

    children.Add(child);
}
/// <summary>
/// Reports every entry currently in <c>crawledQue</c> via <see cref="showBase"/>,
/// persists the first reported entry with <see cref="saveData"/>, and finally
/// prints <c>mark</c> on its own line.
/// </summary>
/// <remarks>
/// When <c>crawledQue</c> is empty nothing is reported or saved and only the
/// trailing <c>mark</c> line is printed (previously the unconditional dequeue
/// threw <see cref="System.InvalidOperationException"/>).
/// </remarks>
public void crawledResultShow()
{
    if (crawledQue.Count != 0)
    {
        // Only the first dequeued entry is written out by saveData.
        siteChecked site = showBase();
        saveData(site);

        // Drain and report whatever is left in the queue.
        while (crawledQue.Count != 0)
        {
            showBase();
        }
    }

    Console.WriteLine("\n" + mark);
}
/// <summary>
/// Program entry point: seeds a <c>Crawl</c> instance with a single start URL at
/// level 0, runs the crawl with a level limit of 1, then shows the results.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    string startUrl = @"https://www.qunar.com/";

    Crawl crawler = new Crawl { levelLimit = 1 };

    // The seed entry has no parent and sits at level 0.
    siteChecked seed = new siteChecked(startUrl, 0, null);
    crawler.crawlQueue.Enqueue(seed);

    crawler.startCrawl();
    crawler.crawledResultShow();
}
/// <summary>
/// Extracts the text of the first HTML title element found in
/// <paramref name="html"/> and stores it on <paramref name="site"/> via
/// <c>setTitle</c>. An empty string is stored when no title is found.
/// </summary>
/// <param name="site">The site whose title is being set.</param>
/// <param name="html">The page markup; may be null or empty.</param>
/// <remarks>
/// The match is case-insensitive, non-greedy (stops at the first closing tag)
/// and may span line breaks. Runs of whitespace inside the title are collapsed
/// to a single space so the stored title is always a single line.
/// </remarks>
private void getTitle(siteChecked site, string html)
{
    // No markup to inspect: Regex.Match would throw on a null input.
    if (string.IsNullOrEmpty(html))
    {
        site.setTitle("");
        return;
    }

    Regex rt = new Regex(
        "<title[^>]*?>(?<title>.*?)</title>",
        RegexOptions.IgnoreCase | RegexOptions.Singleline);
    Match m = rt.Match(html);

    if (!m.Success)
    {
        site.setTitle("");
        return;
    }

    // Singleline lets the capture contain newlines; normalise them away.
    string title = Regex.Replace(m.Groups["title"].Value, "\\s+", " ").Trim();
    site.setTitle(title);
}
/// <summary>
/// Parses <paramref name="html"/> for URLs (via <c>parseForUrl</c>, using the
/// host of <paramref name="head"/>) and, for every URL not contained in
/// <c>crawled</c>, creates a child site one level below <paramref name="head"/>,
/// attaches it to <paramref name="head"/> and enqueues it on <c>crawlQueue</c>.
/// </summary>
/// <param name="head">The page the links were found on; becomes the parent of each new entry.</param>
/// <param name="html">The markup of <paramref name="head"/>.</param>
public void urlQueue(siteChecked head, string html)
{
    HashSet<string> links = parseForUrl(head.getHost(), html);

    foreach (string link in links)
    {
        // Skip anything already recorded in the crawled collection.
        if (crawled.Contains(link))
        {
            continue;
        }

        siteChecked discovered = new siteChecked(link, head.getLevel() + 1, head);
        head.addorSetChild(discovered);
        crawlQueue.Enqueue(discovered);
    }
}
/// <summary>
/// Writes a report for <paramref name="site"/> to a file named after the site's
/// title: one "Url: ..." header line followed by one line per child containing
/// the child's correct code and error code separated by a tab.
/// </summary>
/// <param name="site">The site whose children are written out.</param>
/// <remarks>
/// Nothing is written when the site has no children. The file name falls back to
/// "temp" when the title is null, empty or whitespace-only; characters that are
/// not allowed in file names are replaced with '_' so that arbitrary page titles
/// cannot make the file creation fail. The file is created (or overwritten) in
/// the current working directory.
/// </remarks>
public void saveData(siteChecked site)
{
    HashSet<siteChecked> children = site.getChildren();
    string name = site.getTitle();

    if (children.Count == 0)
    {
        return;
    }

    if (name == null || name.Trim().Length == 0)
    {
        name = "temp";
    }
    else
    {
        // Page titles routinely contain characters such as '|', ':' or '/'.
        foreach (char invalid in Path.GetInvalidFileNameChars())
        {
            name = name.Replace(invalid, '_');
        }
    }

    // 'using' guarantees the file handle is released even if a write throws.
    using (StreamWriter result = new StreamWriter(new FileStream(name, FileMode.Create), Encoding.Default))
    {
        result.WriteLine("Url: " + site.getUrl());
        foreach (siteChecked child in children)
        {
            result.WriteLine(child.getCorrectCode() + "\t" + child.getErrorCode());
        }
    }
}
/// <summary>
/// Removes the next entry from <c>crawledQue</c>, prints its details to the
/// console and appends the same lines to "statusShow.txt".
/// </summary>
/// <returns>The entry that was dequeued and reported.</returns>
/// <remarks>
/// The queue must not be empty: the dequeue is unconditional
/// (presumably a <c>Queue&lt;siteChecked&gt;</c>, whose <c>Dequeue</c> throws
/// <see cref="System.InvalidOperationException"/> when empty — confirm against
/// the field declaration).
/// </remarks>
public siteChecked showBase()
{
    siteChecked site = crawledQue.Dequeue();

    // Build the report once so console and file output cannot drift apart.
    string[] report =
    {
        "",
        "CrawledUrl: " + site.getUrl(),
        "Title: " + site.getTitle(),
        "Host: " + site.getHost(),
        "Level: " + site.getLevel(),
        "CorrectCode: " + site.getCorrectCode(),
        "ErrorCode: " + site.getErrorCode(),
        "ChildSite: " + site.getChildCount()
    };

    // 'using' releases the file handle even if a write fails part-way.
    using (StreamWriter sr = new StreamWriter(new FileStream("statusShow.txt", FileMode.Append), Encoding.Default))
    {
        foreach (string line in report)
        {
            Console.WriteLine(line);
        }

        foreach (string line in report)
        {
            sr.WriteLine(line);
        }
    }

    return site;
}
/// <summary>
/// Creates a site entry from its URL, its level and the entry it was found on.
/// </summary>
/// <param name="url">The URL stored for this entry.</param>
/// <param name="level">The level stored for this entry.</param>
/// <param name="parent">The parent entry; callers pass null for a root entry.</param>
public siteChecked(string url, int level, siteChecked parent)
{
    this.parent = parent;
    this.level = level;
    this.url = url;
}
/// <summary>
/// Moves the entry at the front of <c>crawlQueue</c> to the back of
/// <c>crawledQue</c>.
/// </summary>
private void dequeue()
{
    crawledQue.Enqueue(crawlQueue.Dequeue());
}
/// <summary>
/// Downloads the page at the URL of <paramref name="site"/> and returns its
/// decoded text, recording the host and the status/error code on
/// <paramref name="site"/> along the way.
/// </summary>
/// <param name="site">The site to fetch; its host, correct code and error code are updated.</param>
/// <returns>
/// The decoded markup; an empty string when the site has no URL; or null when
/// the request failed, the URL is not a usable HTTP(S) URL, or no usable charset
/// could be determined for the body.
/// </returns>
/// <remarks>
/// The encoding is first taken from <c>analyzedCode</c>. When that yields null,
/// the raw bytes are decoded as UTF-8 just far enough to look for a charset
/// declaration in a meta tag or XML prolog, and the bytes are then decoded with
/// that charset. The response and its stream are always released, including on
/// failure paths.
/// </remarks>
public string downloadhtml(siteChecked site)
{
    string url = site.getUrl();
    if (string.IsNullOrEmpty(url))
    {
        return "";
    }

    string htmlDecoded = "";
    try
    {
        HttpWebRequest req = WebRequest.Create(url) as HttpWebRequest;
        if (req == null)
        {
            // A valid but non-HTTP URL (ftp:, file:, ...) produces another request type.
            site.setErrorCode(WebExceptionStatus.UnknownError);
            return null;
        }

        req.UserAgent = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0";
        req.Credentials = CredentialCache.DefaultNetworkCredentials;

        using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
        {
            site.setHost(req.Headers["Host"]);
            site.setCorrectCode(resp.StatusCode);

            using (Stream respStream = resp.GetResponseStream())
            {
                Encoding code = analyzedCode(resp);
                if (code != null)
                {
                    using (StreamReader sr = new StreamReader(respStream, code))
                    {
                        htmlDecoded = sr.ReadToEnd();
                    }
                }
                else
                {
                    byte[] htmlByte = getContentByte(respStream);
                    Encoding sniffed = sniffCharset(htmlByte);
                    if (sniffed == null)
                    {
                        // Non-standard use of the status: here it means "no charset found".
                        site.setCorrectCode(HttpStatusCode.NoContent);
                        htmlDecoded = null;
                    }
                    else
                    {
                        htmlDecoded = sniffed.GetString(htmlByte);
                    }
                }
            }
        }
    }
    catch (UriFormatException)
    {
        // Non-standard use of the status: here it means "invalid URL".
        site.setErrorCode(WebExceptionStatus.UnknownError);
        htmlDecoded = null;
    }
    catch (NotSupportedException)
    {
        // URL scheme with no registered handler (e.g. mailto:, javascript:).
        site.setErrorCode(WebExceptionStatus.UnknownError);
        htmlDecoded = null;
    }
    catch (WebException ee)
    {
        site.setErrorCode(ee.Status);
        htmlDecoded = null;
    }
    catch (IOException)
    {
        // The connection broke while the body was being read.
        site.setErrorCode(WebExceptionStatus.ReceiveFailure);
        htmlDecoded = null;
    }

    return htmlDecoded;
}

// Looks for a charset declaration in the raw page bytes; returns null when none is found or the name is unknown.
private static Encoding sniffCharset(byte[] htmlByte)
{
    // Decode as UTF-8 to read the meta tag in the head and obtain the charset (manual analysis).
    string htmlForMeta = Encoding.GetEncoding("utf-8").GetString(htmlByte);
    string meta_charset = "(<meta[^>]*charset=(['\"]|)(?<charset>[^>\"']*)[\\s\\S]*?>)|(xml[^>]*encoding=('|\")(?<charset>[^>\"']*)[\\S\\s]*?>)";
    Regex reg = new Regex(meta_charset, RegexOptions.IgnoreCase);
    Match m = reg.Match(htmlForMeta);
    if (!m.Success)
    {
        return null;
    }

    // An unquoted value such as <meta charset=utf-8 /> is captured with trailing
    // junk ("utf-8 /"); keep only the leading token.
    string codeType = m.Groups["charset"].Value.Trim();
    int cut = codeType.IndexOfAny(new char[] { ' ', '\t', '\r', '\n', ';', '/' });
    if (cut >= 0)
    {
        codeType = codeType.Substring(0, cut);
    }

    if (codeType.Length == 0)
    {
        return null;
    }

    try
    {
        return Encoding.GetEncoding(codeType);
    }
    catch (ArgumentException)
    {
        // The page declared a charset name this runtime does not know.
        return null;
    }
}