/// <summary>
/// Downloads <paramref name="uri"/> by sending a hand-built HTTP/1.1 GET request over a
/// connection obtained from <c>GetTcpClient</c>/<c>GetStream</c>.
/// </summary>
/// <param name="uri">Absolute URI of the resource to fetch.</param>
/// <returns>
/// The result of <c>BuildTcpResponse</c> applied to the raw response text, or <c>null</c> when
/// the host cannot be resolved or no client/stream can be obtained.
/// </returns>
public static TcpResponse Download(Uri uri)
{
    var authority = uri.Authority;

    // Resolve the host through the DNS cache; a missing or expired entry is looked up again.
    ApplicationCache.Dns.TryGetValue(authority, out DnsResponse dns);
    if (dns == null || dns.ExpireDate < DateTime.Now)
    {
        if (dns != null)
        {
            // Expired entry: drop it before re-resolving so a failed lookup does not leave it behind.
            ApplicationCache.Dns.Remove(authority);
        }

        dns = DnsResolver.GetDnsAddress(authority);
        if (dns == null)
        {
            return null;
        }

        ApplicationCache.Dns.Add(authority, dns);
    }

    if (ApplicationCache.Dns.Count > RunSettings.MaxDnsSize)
    {
        ApplicationCache.TrimDnsCache();
    }

    // PathAndQuery (not AbsolutePath) so that "?key=value" is part of the request target;
    // otherwise every URL with a query string would fetch the query-less resource.
    var requestTarget = uri.PathAndQuery;

    var requestMessage = new StringBuilder();
    requestMessage.Append($"GET {requestTarget} HTTP/1.1\r\n");
    requestMessage.Append($"Host:{authority}\r\n"); // e.g. riweb.tibeica.com
    requestMessage.Append($"User-Agent:{RunSettings.UserAgent}\r\n");
    // "Connection:close" asks the server to close the connection after the response.
    // No Accept / Accept-Encoding headers are sent.
    requestMessage.Append("Connection:close\r\n");
    requestMessage.Append("\r\n");

    var client = GetTcpClient(uri, dns);
    if (client == null)
    {
        return null;
    }

    // Dispose both the client and the stream on every path (including exceptions) so that
    // each download does not leak a connection.
    using (client)
    using (var stream = GetStream(client, uri))
    {
        if (stream == null)
        {
            // NOTE(review): assumes GetStream may signal failure with null, like GetTcpClient — confirm.
            return null;
        }

        var sendBuffer = Encoding.ASCII.GetBytes(requestMessage.ToString());
        stream.Write(sendBuffer, 0, sendBuffer.Length);

        // NOTE(review): Encoding.Default is platform dependent; the response charset is not inspected here.
        var response = Encoding.Default.GetString(ReadFully(stream));
        return BuildTcpResponse(response);
    }
}
/// <summary>
/// Crawls a single URL: checks robots rules, downloads the page, hands redirects to
/// <c>ProcessRedirectedLocation</c>, stores indexable pages through
/// <c>FileTreeWorker.CreateWebsiteTree</c> and enqueues the followable links in the URL frontier.
/// </summary>
/// <param name="siteUrl">Absolute URL of the page to crawl.</param>
/// <param name="retries">
/// Number of attempts already made for this URL; processing stops once it reaches
/// <c>RunSettings.MaxRetries</c>. The incremented value is passed on when a redirect is processed.
/// </param>
private void ProcessUrl(string siteUrl, int retries = 0)
{
    // ">=" rather than "==" so a counter that somehow starts above the limit still stops.
    if (retries >= RunSettings.MaxRetries)
    {
        return;
    }
    retries++;

    var baseUrl = new Uri(siteUrl);

    // Robots rules: use the cached entry for this authority, otherwise fetch them.
    if (!ApplicationCache.Robots.TryGetValue(baseUrl.Authority, out RobotsTextResponse robots))
    {
        robots = TcpCrawlingClient.GetRobotsResponse(baseUrl);
    }
    if (ApplicationCache.Robots.Count > RunSettings.MaxRobotsSize)
    {
        ApplicationCache.TrimRobots();
    }

    if (!RobotsTextResponse.IsAllowed(robots, baseUrl))
    {
        return;
    }

    var page = TcpCrawlingClient.Download(baseUrl);

    // The URL is recorded as visited even when the download failed (page == null).
    if (ApplicationCache.VisitedUrls.Count > RunSettings.MaxVisitedUrls)
    {
        ApplicationCache.TrimVisitedUrls();
    }
    ApplicationCache.VisitedUrls.Add(siteUrl);

    if (page == null)
    {
        return;
    }

    var isRedirect = page.StatusCode == HttpStatusCode.MovedPermanently
                     || page.StatusCode == HttpStatusCode.TemporaryRedirect;
    if (isRedirect && page.Headers.ContainsKey("Location"))
    {
        ProcessRedirectedLocation(page, baseUrl, retries);
        return;
    }

    if (page.StatusCode != HttpStatusCode.OK)
    {
        return;
    }

    var doc = new HtmlDocument();
    doc.LoadHtml(page.Html);

    // <meta name="robots" content="...">: "noindex" means the page is not stored;
    // "nofollow" means the page may be stored but its links are not extracted.
    // Both the name and the directives are matched case-insensitively.
    var robotsMeta = doc.DocumentNode.Descendants("meta")
        .FirstOrDefault(x => string.Equals(x.Attributes["name"]?.Value, "robots", StringComparison.OrdinalIgnoreCase))
        ?.Attributes["content"]?.Value;

    var containsNoIndex = false;
    var containsNoFollow = false;
    if (robotsMeta != null)
    {
        containsNoFollow = robotsMeta.IndexOf("nofollow", StringComparison.OrdinalIgnoreCase) >= 0;
        containsNoIndex = robotsMeta.IndexOf("noindex", StringComparison.OrdinalIgnoreCase) >= 0;
    }

    if (!containsNoIndex)
    {
        // Save the page on disk so the indexer can do its job.
        FileTreeWorker.CreateWebsiteTree(baseUrl, page.Html);
    }

    if (containsNoFollow)
    {
        // Do not extract the links.
        return;
    }

    foreach (var a in doc.DocumentNode.Descendants("a"))
    {
        var currentAnchor = a.Attributes["href"]?.Value;
        if (string.IsNullOrEmpty(currentAnchor))
        {
            continue;
        }

        // rel is a whitespace-separated token list (e.g. "nofollow noopener"), so look for the
        // "nofollow" token instead of comparing the whole attribute value.
        var rel = a.Attributes["rel"]?.Value;
        if (rel != null
            && rel.Split((char[])null, StringSplitOptions.RemoveEmptyEntries)
                  .Contains("nofollow", StringComparer.OrdinalIgnoreCase))
        {
            continue;
        }

        UrlFrontier.Enqueue(baseUrl, currentAnchor);
    }
}