Example 1
        public static RobotsTextResponse ParseText(string text)
        {
            var       response       = new RobotsTextResponse();
            var       startedParsing = false;
            UserAgent currentAgent   = null;
            int       index          = 0;

            while (index < text.Length)
            {
                var line = GetLine(text, ref index);
                if (line.Length == 0)
                {
                    //a blank line terminates the current record group
                    startedParsing = false;
                    continue;
                }

                if (line.StartsWith("User-agent:") && line.Length > 12 && (line[12] == '*' || IsMatchingUserAgent(line)))
                {
                    if (line[12] == '*')
                    {
                        response.AnyAgent = new UserAgent();
                        currentAgent      = response.AnyAgent;
                        currentAgent.Name = "*";
                        startedParsing    = true;
                    }
                    else if (IsMatchingUserAgent(line))
                    {
                        response.MyAgent  = new UserAgent();
                        currentAgent      = response.MyAgent;
                        currentAgent.Name = line.Substring(12);
                        startedParsing    = true;
                    }
                    else
                    {
                        continue;
                    }
                }
                if (!startedParsing)
                {
                    //skip rules that belong to a record we don't care about
                    continue;
                }

                if (line.StartsWith("Disallow:") && line.Length > 10)
                {
                    currentAgent.DisallowedUrls.Add(line.Substring(10));
                }
                else if (line.StartsWith("Allow:") && line.Length > 7)
                {
                    currentAgent.AllowedUrls.Add(line.Substring(7));
                }
            }
            return response;
        }
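
A minimal usage sketch for the parser above, assuming GetLine splits the text on newline characters. The sample robots.txt content and the list indexing are illustrative; only ParseText, AnyAgent, DisallowedUrls and AllowedUrls come from the code itself.

        //hypothetical input, using the single space after ':' that the offsets expect
        var sample = "User-agent: *\nDisallow: /private/\nAllow: /private/public.html\n";
        var parsed = RobotsTextResponse.ParseText(sample);

        Console.WriteLine(parsed.AnyAgent.Name);              // "*"
        Console.WriteLine(parsed.AnyAgent.DisallowedUrls[0]); // "/private/"
        Console.WriteLine(parsed.AnyAgent.AllowedUrls[0]);    // "/private/public.html"
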
Example 2
        public static bool IsAllowed(RobotsTextResponse robots, Uri baseUrl)
        {
            if (robots == null)
            {
                return true;
            }

            var allowed = true;

            //check the agent-specific record first, then fall back to the wildcard record
            if (robots.MyAgent != null)
            {
                allowed = IsAllowedUserAgent(robots.MyAgent, baseUrl);
            }
            else if (robots.AnyAgent != null)
            {
                allowed = IsAllowedUserAgent(robots.AnyAgent, baseUrl);
            }

            return allowed;
        }
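
IsAllowedUserAgent is not among the examples shown here. Below is a minimal sketch of the prefix matching it would typically perform, assuming DisallowedUrls and AllowedUrls are List<string> collections of path prefixes; the Allow-overrides-Disallow rule is an assumption, not necessarily the author's logic. Requires using System.Linq.

        //hypothetical sketch: a path is blocked when a Disallow prefix matches it
        //and no explicit Allow prefix matches it as well
        private static bool IsAllowedUserAgent(UserAgent agent, Uri baseUrl)
        {
            var path       = baseUrl.PathAndQuery;
            var disallowed = agent.DisallowedUrls.Any(rule => path.StartsWith(rule));
            var allowed    = agent.AllowedUrls.Any(rule => path.StartsWith(rule));

            return !disallowed || allowed;
        }
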
Example 3
        public static RobotsTextResponse GetRobotsResponse(Uri uri, int retries = 0, string cacheKey = null)
        {
            if (cacheKey == null)
            {
                //in case of redirects, keep the same cache key as the original host
                cacheKey = uri.Authority;
            }

            if (retries == RunSettings.MaxRetries)
            {
                //out of retries: cache the failure so this host isn't queried again
                ApplicationCache.Robots.Add(cacheKey, null);
                return null;
            }
            retries++;
            var robots = new Uri($"{uri.Scheme}://{uri.Authority}/robots.txt");
            var page   = Download(robots);

            if (page == null)
            {
                ApplicationCache.Robots.Add(cacheKey, null);
                return null;
            }
            if (page.StatusCode != HttpStatusCode.OK && page.StatusCode != HttpStatusCode.NotModified)
            {
                //the original "Redirect && TemporaryRedirect" could never both hold; "||" is intended
                if ((page.StatusCode == HttpStatusCode.Moved || page.StatusCode == HttpStatusCode.Redirect ||
                     page.StatusCode == HttpStatusCode.TemporaryRedirect) && page.Headers.ContainsKey("Location"))
                {
                    //follow the redirect, keeping the original cache key and retry budget
                    return GetRobotsResponse(new Uri(page.Headers["Location"]), retries, cacheKey);
                }
                ApplicationCache.Robots.Add(cacheKey, null);
                return null;
            }
            var parsedRobots = RobotsTextResponse.ParseText(page.Html);

            ApplicationCache.Robots.Add(cacheKey, parsedRobots);
            return parsedRobots;
        }
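
Putting the three methods together, a caller could gate a download like this. The target URL is hypothetical; everything else comes from the examples above, and GetRobotsResponse caches its result in ApplicationCache.Robots as a side effect.

        var target = new Uri("https://example.com/some/page"); //hypothetical URL
        var robots = TcpCrawlingClient.GetRobotsResponse(target);

        if (RobotsTextResponse.IsAllowed(robots, target))
        {
            //robots.txt is absent, unreadable, or permits this agent
            var page = TcpCrawlingClient.Download(target);
        }
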
Example 4
        private void ProcessUrl(string siteUrl, int retries = 0)
        {
            if (retries == RunSettings.MaxRetries)
            {
                return;
            }
            retries++;

            //Console.WriteLine($"Crawling {siteUrl}");
            var baseUrl = new Uri(siteUrl);
            //get robots
            RobotsTextResponse robots = null;

            if (!ApplicationCache.Robots.TryGetValue(baseUrl.Authority, out robots))
            {
                robots = TcpCrawlingClient.GetRobotsResponse(baseUrl);
            }

            if (ApplicationCache.Robots.Count > RunSettings.MaxRobotsSize)
            {
                ApplicationCache.TrimRobots();
            }

            //check robots
            if (!RobotsTextResponse.IsAllowed(robots, baseUrl))
            {
                return;
            }

            var page = TcpCrawlingClient.Download(baseUrl);

            if (ApplicationCache.VisitedUrls.Count > RunSettings.MaxVisitedUrls)
            {
                ApplicationCache.TrimVisitedUrls();
            }

            ApplicationCache.VisitedUrls.Add(siteUrl);

            if (page == null)
            {
                return;
            }
            if ((page.StatusCode == HttpStatusCode.MovedPermanently || page.StatusCode == HttpStatusCode.TemporaryRedirect) && page.Headers.ContainsKey("Location"))
            {
                ProcessRedirectedLocation(page, baseUrl, retries);
                return;
            }

            if (page.StatusCode != HttpStatusCode.OK)
            {
                return;
            }

            var doc = new HtmlDocument();

            doc.LoadHtml(page.Html);
            //noindex means we don't store the page; nofollow means we store it but don't follow its anchors
            var  metaTags = doc.DocumentNode.Descendants("meta").ToList();
            bool containsNoIndex = false, containsNoFollow = false;
            var  robotsMeta = metaTags.FirstOrDefault(x => x.Attributes["name"]?.Value == "robots")?.Attributes["content"]?.Value;

            if (robotsMeta != null)
            {
                containsNoFollow = robotsMeta.Contains("nofollow");
                containsNoIndex  = robotsMeta.Contains("noindex");
            }

            var anchors = doc.DocumentNode.Descendants("a");

            if (!containsNoIndex)
            {
                //we can save it on the disk so the indexer can do its job
                FileTreeWorker.CreateWebsiteTree(baseUrl, page.Html);
            }
            if (containsNoFollow)
            {
                //do not extract the links
                return;
            }

            foreach (var a in anchors)
            {
                var currentAnchor = a.Attributes["href"]?.Value;
                if (string.IsNullOrEmpty(currentAnchor))
                {
                    continue;
                }
                var rel = a.Attributes["rel"]?.Value;
                if (rel != null && rel == "nofollow")
                {
                    continue;
                }
                UrlFrontier.Enqueue(baseUrl, currentAnchor);
            }
        }
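
UrlFrontier.Enqueue receives both the page's base URL and the raw href, which suggests relative links are resolved inside the frontier. A sketch of that resolution step under stated assumptions: the backing queue and the duplicate check against ApplicationCache.VisitedUrls are both hypothetical, since the real frontier is not shown.

        public static void Enqueue(Uri baseUrl, string href)
        {
            //Uri.TryCreate(Uri, string, out Uri) resolves a possibly relative href
            //against the page it was found on
            if (!Uri.TryCreate(baseUrl, href, out var absolute))
            {
                return; //malformed href, skip it
            }
            if (ApplicationCache.VisitedUrls.Contains(absolute.AbsoluteUri))
            {
                return; //hypothetical duplicate check
            }
            pendingUrls.Enqueue(absolute.AbsoluteUri); //pendingUrls: assumed Queue<string>
        }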