Ejemplo n.º 1
0
        public void DownloadAndEnqueue(string seed)
        {
            //validate
            try
            {
                var validate = ValidateUrl.Validate(seed);
                if (!validate.Item1)
                {
                    return;
                }

                //if is in hashset
                if (CrawledUrls.Contains(validate.Item2.AbsoluteUri))
                {
                    return;
                }

                CrawledUrls.Add(validate.Item2.AbsoluteUri);

                //download and save html file
                var html = new HtmlDownloader(validate.Item2).Load();
                if (html == null)
                {
                    return;
                }

                //parse html file to get urls
                var linkedPages = Parser.Parse(html, validate.Item2);
                foreach (string linkedPage in linkedPages)
                {
                    Queue.Enqueue(linkedPage);
                }
            }
            catch (Exception ex) { }
        }
        private List <UrlLink> GetChildren(string startingUrlAddress)
        {
            Stopwatch stopwatch = new Stopwatch();

            stopwatch.Start();
            var html = new HtmlDownloader(startingUrlAddress).HtmlText;

            //WriteToFile.WriteTextToFile(html, "BeforeMod.txt");

            // Restrictions for the Pcmag website -> CANNOT use links that are in elements with id beggining with comment-
            if (startingUrlAddress.Contains("konkurs.pcmagbg.net"))
            {
                html = RemoveTags(html, "id=\"comment-");
                html = RemoveTags(html, "id=\"recentcomments");
            }

            // Restrictions for TelerikAcademy website -> CANNOT use links that are in elements with attribute:
            // id="fb0021a1c"
            // id="RecentVideoHeader"
            // id="ForumPostsHeader"
            // id="BlogPostsHeader"
            // id="IndexCalendarHeader"
            if (startingUrlAddress.Contains("telerikacademy.com"))
            {
                html = RemoveTags(html, "id=\"f202640e64");
                html = RemoveTags(html, "id=\"RecentVideos");
                html = RemoveTags(html, "id=\"LatestForumPosts");
                html = RemoveTags(html, "id=\"BlogPosts");
                html = RemoveTags(html, "id=\"Calendar");
            }

            // Restrictions for youtube website -> can ONLY use links that are in elements with attribute:
            // id="eow-description"
            // class="primary-pane"
            if (startingUrlAddress.Contains("youtube.com"))
            {
                html = RetriveTags(html, "eow-description", "primary-pane");
            }

            //WriteToFile.WriteTextToFile(html, "AfterMod.txt");

            // Extract the anchor tags from text
            var tags = new LinkFilter(html).AnchorTags;
            //WriteToFile.WriteTextToFile(tags, "tags.txt");

            // Extract the href links from the anchor tags
            var hrefs = new LinkFilter(tags).LinksFromHrefs;
            //WriteToFile.WriteTextToFile(hrefs, "hrefs.txt");

            // Links can be only from the following sites:
            // http://academy.telerik.com
            // http://telerikacademy.com
            // http://konkurs.pcmagbg.net
            // http://www.youtube.com/playlist?list=
            // http://www.youtube.com/watch?v=
            // If NO protocol or server is present in the link -> relative link
            // All relative links are equal to the current website plus relative link
            // Current site is always Correct!
            // The resulting links are the allowed children links of the initial url address


            List <UrlLink> linksList = new List <UrlLink>();
            var            links     = hrefs.Split(new char[] { '\n' }, StringSplitOptions.RemoveEmptyEntries);

            for (int index = 0; index < links.Length; index++)
            {
                UrlLink validUrl;
                links[index] = links[index].Trim();
                links[index] = links[index].Trim('\r');
                if (!links[index].StartsWith("http"))
                {
                    if (startingUrlAddress[startingUrlAddress.Length - 1] == '/')
                    {
                        validUrl = new UrlLink(startingUrlAddress + links[index], links[index]);
                    }
                    else
                    {
                        validUrl = new UrlLink(startingUrlAddress + "/" + links[index], links[index]);
                    }

                    linksList.Add(validUrl);
                }
                else
                {
                    if (links[index].StartsWith("http://academy.telerik.com") ||
                        links[index].StartsWith("http://telerikacademy.com") ||
                        links[index].StartsWith("http://konkurs.pcmagbg.net") ||
                        links[index].StartsWith("http://www.youtube.com/playlist?list=") ||
                        links[index].StartsWith("http://www.youtube.com/watch?v=")
                        )
                    {
                        validUrl = new UrlLink(links[index], links[index]);
                        linksList.Add(validUrl);
                    }
                }
            }

            //StringBuilder sb = new StringBuilder();
            //foreach (var l in linksList)
            //{
            //    sb.AppendLine(l.ToString());
            //}
            //WriteToFile.WriteTextToFile(sb.ToString(), "links.txt");

            //Process.Start(Environment.CurrentDirectory + "\\" + "BeforeMod.txt");
            //Process.Start(Environment.CurrentDirectory + "\\" + "AfterMod.txt");
            //Process.Start(Environment.CurrentDirectory + "\\" + "tags.txt");
            //Process.Start(Environment.CurrentDirectory + "\\" + "hrefs.txt");
            //Process.Start(Environment.CurrentDirectory + "\\" + "links.txt");

            //stopwatch.Stop();
            ColorConsole.Write(string.Format("FINISHED with {0} in {1:0.00} seconds.\n", startingUrlAddress, stopwatch.Elapsed.TotalSeconds), ConsoleColor.Green);

            return(linksList);
        }