Example #1
0
        // Shared generator for picking a front queue. Creating a new time-seeded Random on
        // every call can yield identical values for calls made in quick succession.
        // Random is not thread-safe, so every use is guarded by a lock on the instance.
        private static readonly Random FrontQueueRandom = new Random();

        /// <summary>
        /// Enqueues a URL on a randomly chosen front queue unless it was added before.
        /// </summary>
        /// <param name="url">The URL to schedule.</param>
        /// <returns>
        /// <c>true</c> if the URL was enqueued; <c>false</c> if the hash of its pretty URL
        /// is already recorded in <c>AllURLS</c>.
        /// </returns>
        public bool AddURLToFrontQueue(PrettyURL url)
        {
            // The seen-set stores hash codes rather than the URLs themselves, so two
            // different URLs whose hashes collide are treated as the same URL.
            int urlHash = url.GetPrettyURL.GetHashCode();

            // Check whether the URL has already been seen.
            if (AllURLS.Contains(urlHash))
            {
                return false;
            }

            int queueIndex;
            lock (FrontQueueRandom)
            {
                queueIndex = FrontQueueRandom.Next(0, FrontQueues.Count);
            }

            FrontQueues[queueIndex].Enqueue(url);
            AllURLS.Add(urlHash);

            return true;
        }
Example #2
0
        /// <summary>
        /// Returns the regex that matches URLs which robots.txt disallows for the wildcard
        /// user agent on the domain of <paramref name="url"/>. A cached regex younger than
        /// <c>MaxAge</c> is reused; otherwise a new one is built, cached and timed.
        /// Assumes robots.txt is properly formatted.
        /// </summary>
        /// <param name="url">URL whose domain the rules belong to; the domain is the cache key.</param>
        /// <param name="robotsContent">The lines of robots.txt; null or empty means no restrictions.</param>
        /// <returns>A regex that matches when a URL is disallowed.</returns>
        private Regex CalcRobotRegexForDomain(PrettyURL url, IEnumerable<string> robotsContent)
        {
            Tuple<DateTime, Regex> cached;
            if (CachedRegexes.TryGetValue(url.GetDomain, out cached) && DateTime.Now - cached.Item1 < MaxAge)
            {
                return cached.Item2;
            }

            regexCalcWatch.Restart();

            // NOTE(review): the pattern is prefixed with the domain because the regex is cached
            // per domain and robots.txt paths are relative to the site root. This assumes the
            // string tested by callers (url.GetPrettyURL) contains url.GetDomain - confirm.
            Regex regex = new Regex(BuildDisallowPattern(url.GetDomain, robotsContent));
            CachedRegexes[url.GetDomain] = new Tuple<DateTime, Regex>(DateTime.Now, regex);

            // The stopwatch is stopped on every path, including the "no restrictions" ones.
            regexCalcWatch.Stop();
            regexCalcTimes[url.GetDomain] = (int)regexCalcWatch.Elapsed.TotalMilliseconds;

            return regex;
        }

        /// <summary>
        /// Builds the pattern "domain(path1|path2|...)" from the Disallow lines of the first
        /// "User-Agent: *" group, or a pattern that never matches when nothing is disallowed.
        /// </summary>
        private static string BuildDisallowPattern(string domain, IEnumerable<string> robotsContent)
        {
            // An empty negative lookahead can never succeed, so this regex matches nothing.
            const string matchNothing = "(?!)";

            if (robotsContent == null)
            {
                return matchNothing;
            }

            // Materialized once so the query (and its regex checks) is not re-run per use.
            List<string> alternatives = robotsContent
                .SkipWhile(line => !Regex.IsMatch(line, @"User-Agent: \*", RegexOptions.IgnoreCase)) // Start from user agent *.
                .TakeWhile(line => !string.IsNullOrWhiteSpace(line)) // Read until the blank line that ends the group.
                .Skip(1) // Skip the user agent line itself.
                .Select(line => line.Trim())
                .Where(line => line.StartsWith("Disallow", StringComparison.OrdinalIgnoreCase))
                .Select(line => DisallowLineToPattern(line))
                .Where(pattern => pattern.Length > 0) // An empty Disallow value restricts nothing.
                .ToList();

            if (alternatives.Count == 0)
            {
                return matchNothing;
            }

            return Regex.Escape(domain) + "(" + string.Join("|", alternatives.ToArray()) + ")";
        }

        /// <summary>
        /// Converts one "Disallow: path" line into a regex fragment; returns an empty string
        /// when the line carries no path.
        /// </summary>
        private static string DisallowLineToPattern(string line)
        {
            // Everything after the FIRST colon is the value, so paths containing ':' survive.
            int colon = line.IndexOf(':');
            if (colon < 0)
            {
                return string.Empty;
            }

            string path = line.Substring(colon + 1);

            // Drop a trailing "# comment".
            int comment = path.IndexOf('#');
            if (comment >= 0)
            {
                path = path.Substring(0, comment);
            }

            path = path.Trim();
            if (path.Length == 0)
            {
                return string.Empty;
            }

            // Escape first so regex metacharacters in the path are literal, then turn the
            // robots.txt wildcard '*' into ".*" and a trailing '$' into an end anchor.
            string pattern = Regex.Escape(path).Replace("\\*", ".*");
            if (pattern.EndsWith("\\$", StringComparison.Ordinal))
            {
                pattern = pattern.Substring(0, pattern.Length - 2) + "$";
            }

            return pattern;
        }
Example #3
0
        /// <summary>
        /// Tells whether robots.txt permits visiting <paramref name="url"/>. The disallow
        /// regex of the URL's domain is taken from <c>CachedRegexes</c>; when there is no
        /// entry, or the entry is older than <c>MaxAge</c>, robots.txt is downloaded again
        /// and the entry is rebuilt.
        /// </summary>
        /// <param name="url">The URL about to be visited.</param>
        /// <returns><c>true</c> when the domain's disallow regex does not match the URL.</returns>
        public bool IsVisitAllowed(PrettyURL url)
        {
            Tuple<DateTime, Regex> entry;
            bool isUsable = CachedRegexes.TryGetValue(url.GetDomain, out entry)
                && DateTime.Now - entry.Item1 <= MaxAge;

            if (!isUsable)
            {
                // One refresh path for both "missing" and "too old", so both split the
                // file the same way (handles "\r\n" as well as "\n" line endings).
                var robotContent = DownloadRobotContent(url).Split(new string[] { "\r\n", "\n" }, StringSplitOptions.None);
                var regex = CalcRobotRegexForDomain(url, robotContent);

                entry = new Tuple<DateTime, Regex>(DateTime.Now, regex);
                CachedRegexes[url.GetDomain] = entry;
            }

            return !entry.Item2.IsMatch(url.GetPrettyURL);
        }
Example #4
0
 /// <summary>
 /// Removes the entry stored in <c>CachedRegexes</c> under the domain of <paramref name="url"/>.
 /// </summary>
 /// <param name="url">A URL whose domain identifies the entry to remove.</param>
 public void RemoveRobot(PrettyURL url)
 {
     var domain = url.GetDomain;
     CachedRegexes.Remove(domain);
 }
Example #5
0
        /// <summary>
        /// Downloads "http://{domain}/robots.txt" for the domain of <paramref name="url"/>
        /// and records how long the attempt took in <c>robotDownloadTimes</c>.
        /// </summary>
        /// <param name="url">A URL on the domain whose robots.txt is wanted.</param>
        /// <returns>The file's text, or an empty string when the download fails.</returns>
        private string DownloadRobotContent(PrettyURL url)
        {
            string content = "";
            robotDownloadWatch.Restart();

            try
            {
                string robot = "http://" + url.GetDomain + "/" + "robots.txt";

                // WebClient is IDisposable; release it as soon as the download finishes.
                using (var client = new System.Net.WebClient())
                {
                    content = client.DownloadString(robot);
                }
            }
            catch (Exception)
            {
                // Deliberate best effort: any failure (missing file, unreachable host,
                // malformed address) leaves the content empty.
            }

            robotDownloadWatch.Stop();
            robotDownloadTimes[url.GetDomain] = (int)robotDownloadWatch.Elapsed.TotalMilliseconds;

            return content;
        }
Example #6
0
        /// <summary>
        /// Collects the targets of <c>&lt;a href="..."&gt;</c> anchors in <paramref name="html"/>
        /// that are valid URLs on a ".dk" domain. Targets starting with "feed" or
        /// "javascript" are ignored.
        /// </summary>
        /// <param name="url">The page the HTML came from; used to complete links that start with "/".</param>
        /// <param name="html">The page markup; <c>null</c> yields an empty result.</param>
        /// <returns>The accepted links, in document order.</returns>
        public IEnumerable<PrettyURL> ExtractLinksFromHTML(PrettyURL url, string html)
        {
            if (html == null)
            {
                return Enumerable.Empty<PrettyURL>();
            }

            // Split WITHOUT removing empty entries: the first segment is always the text
            // before the first anchor (possibly empty), so Skip(1) never drops a real link,
            // even when the markup starts with an anchor. Empty targets are filtered after.
            var hrefs = html.Split(new string[] { "<a href=\"" }, StringSplitOptions.None)
                .Skip(1)
                .Where(s => s.Length > 0
                    && !s.StartsWith("feed", StringComparison.Ordinal)
                    && !s.StartsWith("javascript", StringComparison.Ordinal));

            // Links starting with "/" are relative to the site root, not to the current page.
            // When the page URL does not parse as an absolute http(s) URL, fall back to
            // prefixing the page URL itself.
            string rootPrefix = url.GetPrettyURL;
            string scheme = null;
            Uri pageUri;
            if (Uri.TryCreate(url.GetPrettyURL, UriKind.Absolute, out pageUri)
                && (pageUri.Scheme == Uri.UriSchemeHttp || pageUri.Scheme == Uri.UriSchemeHttps))
            {
                rootPrefix = pageUri.GetLeftPart(UriPartial.Authority);
                scheme = pageUri.Scheme;
            }

            var urls = new List<PrettyURL>();

            foreach (var href in hrefs)
            {
                var link = href.Split('\"').First();

                string fullPath;
                if (scheme != null && link.StartsWith("//", StringComparison.Ordinal))
                {
                    // Protocol-relative link: only the scheme is inherited from the page.
                    fullPath = scheme + ":" + link;
                }
                else if (link.StartsWith("/", StringComparison.Ordinal))
                {
                    fullPath = rootPrefix + link;
                }
                else
                {
                    fullPath = link;
                }

                if (PrettyURL.IsValidURL(fullPath))
                {
                    var pretty = new PrettyURL(fullPath);

                    // Only links on the .dk top-level domain are kept.
                    if (pretty.GetDomain.EndsWith(".dk", StringComparison.Ordinal))
                    {
                        urls.Add(pretty);
                    }
                }
            }

            return urls;
        }
Example #7
0
        /// <summary>
        /// Downloads the page at <paramref name="url"/> if <c>Robot_IAm.IsVisitAllowed</c>
        /// permits the visit.
        /// </summary>
        /// <param name="url">The page to fetch.</param>
        /// <returns>
        /// The downloaded text, or <c>null</c> when the visit is not allowed or the
        /// download fails.
        /// </returns>
        public string DownloadHTML(PrettyURL url)
        {
            // Check permission first so no client is created for disallowed URLs.
            if (!Robot_IAm.IsVisitAllowed(url))
            {
                return null;
            }

            try
            {
                // WebClient is IDisposable; release it as soon as the download finishes.
                using (var web = new WebClient())
                {
                    return web.DownloadString(url.GetPrettyURL);
                }
            }
            catch (Exception)
            {
                // Deliberate best effort: a failed download is reported as null.
                return null;
            }
        }