Exemple #1
0
        /// <summary>
        /// Fetches <paramref name="uri"/> by sending a hand-built HTTP/1.1 GET request over a
        /// TCP connection and parsing whatever the server sends back.
        /// </summary>
        /// <remarks>
        /// The DNS answer for the URI's authority is looked up in <c>ApplicationCache.Dns</c>;
        /// a missing or expired entry is resolved again through <c>DnsResolver</c> and stored.
        /// The cache is trimmed when it grows past <c>RunSettings.MaxDnsSize</c>.
        /// </remarks>
        /// <param name="uri">Absolute URI of the resource to request.</param>
        /// <returns>
        /// The response built by <c>BuildTcpResponse</c>, or <c>null</c> when the DNS lookup
        /// returns nothing or no TCP client could be obtained.
        /// </returns>
        public static TcpResponse Download(Uri uri)
        {
            // NOTE(review): Authority includes a non-default port ("host:8080"); confirm that
            // DnsResolver expects that form rather than uri.Host.
            var authority = uri.Authority;

            ApplicationCache.Dns.TryGetValue(authority, out DnsResponse dns);

            // Resolve (again) when nothing is cached or the cached answer has expired.
            if (dns == null || dns.ExpireDate < DateTime.Now)
            {
                if (dns != null)
                {
                    // Drop the stale entry first so the Add below does not collide with it.
                    ApplicationCache.Dns.Remove(authority);
                }

                dns = DnsResolver.GetDnsAddress(authority);
                if (dns == null)
                {
                    return(null);
                }
                ApplicationCache.Dns.Add(authority, dns);
            }

            if (ApplicationCache.Dns.Count > RunSettings.MaxDnsSize)
            {
                ApplicationCache.TrimDnsCache();
            }

            // PathAndQuery keeps the "?key=value" part of the URL; AbsolutePath alone would
            // request a different resource for any URL that carries a query string.
            var requestTarget  = uri.PathAndQuery;
            var requestMessage = new StringBuilder();

            requestMessage.Append($"GET {requestTarget} HTTP/1.1\r\n");
            requestMessage.Append($"Host:{authority}\r\n");
            requestMessage.Append($"User-Agent:{RunSettings.UserAgent}\r\n");
            // Ask the server to close the connection so the response can be read until end of stream.
            requestMessage.Append("Connection:close\r\n");
            // Blank line terminates the header section.
            requestMessage.Append("\r\n");

            var client = GetTcpClient(uri, dns);

            if (client == null)
            {
                return(null);
            }

            // Dispose the stream and the client on every exit path (including exceptions)
            // so that sockets are not leaked while crawling.
            using (client)
            using (var stream = GetStream(client, uri))
            {
                var sendBuffer = Encoding.ASCII.GetBytes(requestMessage.ToString());

                stream.Write(sendBuffer, 0, sendBuffer.Length);

                // NOTE(review): Encoding.Default depends on the platform/runtime; the response
                // charset is not inspected here.
                var response = Encoding.Default.GetString(ReadFully(stream));

                return(BuildTcpResponse(response));
            }
        }
Exemple #2
0
        /// <summary>
        /// Crawls one URL: checks robots.txt rules, downloads the page, follows a redirect
        /// if the server answers with one, stores the HTML for indexing and enqueues the
        /// links found on the page.
        /// </summary>
        /// <param name="siteUrl">Absolute URL of the page to crawl.</param>
        /// <param name="retries">
        /// Number of attempts already made for this chain of requests; processing stops once
        /// it reaches <c>RunSettings.MaxRetries</c>. The incremented value is passed on to
        /// <c>ProcessRedirectedLocation</c>.
        /// </param>
        private void ProcessUrl(string siteUrl, int retries = 0)
        {
            // ">=" instead of "==" so a counter that is already past the limit still stops.
            if (retries >= RunSettings.MaxRetries)
            {
                return;
            }
            retries++;

            var baseUrl = new Uri(siteUrl);

            // Robots rules: use the cached entry for this authority, otherwise fetch them.
            RobotsTextResponse robots = null;

            if (!ApplicationCache.Robots.TryGetValue(baseUrl.Authority, out robots))
            {
                robots = TcpCrawlingClient.GetRobotsResponse(baseUrl);
            }

            if (ApplicationCache.Robots.Count > RunSettings.MaxRobotsSize)
            {
                ApplicationCache.TrimRobots();
            }

            if (!RobotsTextResponse.IsAllowed(robots, baseUrl))
            {
                return;
            }

            var page = TcpCrawlingClient.Download(baseUrl);

            if (ApplicationCache.VisitedUrls.Count > RunSettings.MaxVisitedUrls)
            {
                ApplicationCache.TrimVisitedUrls();
            }

            // The URL counts as visited even when the download failed.
            ApplicationCache.VisitedUrls.Add(siteUrl);

            if (page == null)
            {
                return;
            }

            // 301, 302, 303, 307 and 308 all point at the new address through "Location".
            // 308 is compared by value because the enum member is missing on older frameworks.
            var isRedirect = page.StatusCode == HttpStatusCode.MovedPermanently
                             || page.StatusCode == HttpStatusCode.Found
                             || page.StatusCode == HttpStatusCode.SeeOther
                             || page.StatusCode == HttpStatusCode.TemporaryRedirect
                             || page.StatusCode == (HttpStatusCode)308;

            if (isRedirect && page.Headers.ContainsKey("Location"))
            {
                ProcessRedirectedLocation(page, baseUrl, retries);
                return;
            }

            if (page.StatusCode != HttpStatusCode.OK)
            {
                return;
            }

            var doc = new HtmlDocument();

            doc.LoadHtml(page.Html);

            // <meta name="robots">: "noindex" means the page is not stored,
            // "nofollow" means it is stored but its anchors are not extracted.
            // Name and directives are matched case-insensitively ("NOINDEX, NOFOLLOW" is common).
            bool containsNoIndex = false, containsNoFollow = false;
            var  robotsMeta = doc.DocumentNode
                                 .Descendants("meta")
                                 .FirstOrDefault(x => string.Equals(x.Attributes["name"]?.Value, "robots", StringComparison.OrdinalIgnoreCase))
                                 ?.Attributes["content"]?.Value;

            if (robotsMeta != null)
            {
                containsNoFollow = robotsMeta.IndexOf("nofollow", StringComparison.OrdinalIgnoreCase) >= 0;
                containsNoIndex  = robotsMeta.IndexOf("noindex", StringComparison.OrdinalIgnoreCase) >= 0;
            }

            if (!containsNoIndex)
            {
                // Save the page on disk so the indexer can do its job.
                FileTreeWorker.CreateWebsiteTree(baseUrl, page.Html);
            }
            if (containsNoFollow)
            {
                // Do not extract the links.
                return;
            }

            foreach (var a in doc.DocumentNode.Descendants("a"))
            {
                var currentAnchor = a.Attributes["href"]?.Value;
                if (string.IsNullOrEmpty(currentAnchor))
                {
                    continue;
                }

                // "rel" is a whitespace-separated token list (e.g. "nofollow noopener"),
                // so look for the token instead of comparing the whole attribute value.
                var rel = a.Attributes["rel"]?.Value;
                var isNoFollow = rel != null
                                 && rel.Split((char[])null, StringSplitOptions.RemoveEmptyEntries)
                                       .Any(token => string.Equals(token, "nofollow", StringComparison.OrdinalIgnoreCase));
                if (isNoFollow)
                {
                    continue;
                }
                UrlFrontier.Enqueue(baseUrl, currentAnchor);
            }
        }