Example #1
0
        /// <summary>
        /// Crawls a single seed URL: validates it, downloads its HTML, and
        /// enqueues every link that the parser extracts from the page.
        /// </summary>
        /// <remarks>
        /// The method returns early when the seed fails validation, when its
        /// absolute URI is already present in <c>CrawledUrls</c>, or when the
        /// downloader returns <c>null</c>. The URI is recorded in
        /// <c>CrawledUrls</c> before the download is attempted, so a page whose
        /// download fails is not retried through this method.
        /// Exceptions are never propagated to the caller (best-effort crawl);
        /// they are reported through <see cref="System.Diagnostics.Trace"/>
        /// instead of being discarded silently.
        /// </remarks>
        /// <param name="seed">Text of the URL to crawl.</param>
        public void DownloadAndEnqueue(string seed)
        {
            try
            {
                // Validate the seed; Item1 is the success flag, Item2 the parsed URI.
                var validate = ValidateUrl.Validate(seed);
                if (!validate.Item1)
                {
                    return;
                }

                var target = validate.Item2;
                var absoluteUri = target.AbsoluteUri;

                // Skip pages that were already visited.
                if (CrawledUrls.Contains(absoluteUri))
                {
                    return;
                }

                // Mark as visited up front so the same URL is not processed twice.
                CrawledUrls.Add(absoluteUri);

                // Download the page; a null result means there is nothing to parse.
                var html = new HtmlDownloader(target).Load();
                if (html == null)
                {
                    return;
                }

                // Extract the outgoing links and schedule each of them.
                var linkedPages = Parser.Parse(html, target);
                foreach (string linkedPage in linkedPages)
                {
                    Queue.Enqueue(linkedPage);
                }
            }
            catch (Exception ex)
            {
                // Keep the crawl going, but leave a trace of what went wrong
                // instead of swallowing the failure without any record.
                System.Diagnostics.Trace.TraceError(
                    "DownloadAndEnqueue failed for '" + seed + "': " + ex);
            }
        }
Example #2
0
        /// <summary>
        /// Extracts the link targets of all <c>a</c> elements in
        /// <paramref name="html"/> and returns them as URL strings.
        /// </summary>
        /// <remarks>
        /// Empty or missing <c>href</c> values and any <c>href</c> containing
        /// <c>#</c> are skipped. Each remaining value is passed to
        /// <c>ValidateUrl.Validate</c>: when its <c>Item1</c> flag is
        /// <c>false</c> the value is resolved against <paramref name="url"/>
        /// and the resulting absolute URI is returned; otherwise the value is
        /// returned unchanged. A link whose resolution throws is reported via
        /// <c>Print.Show</c> and left out of the result.
        /// </remarks>
        /// <param name="html">Parsed document whose anchors are inspected.</param>
        /// <param name="url">Address of the document; used as the base for resolving links.</param>
        /// <returns>The collected link URLs, in document order.</returns>
        public IEnumerable<string> Parse(HtmlDocument html, Uri url)
        {
            Print.Show("Parsing url:" + url.AbsoluteUri);

            // Materialize the query once: it is needed both for the count that
            // is logged and for the loop below, and a deferred query would walk
            // the whole document tree again on every enumeration.
            var linkedPages = html.DocumentNode.Descendants("a")
                              .Select(a => a.GetAttributeValue("href", null))
                              .Where(u => !String.IsNullOrEmpty(u))
                              .ToList();

            Print.Show(linkedPages.Count + " urls in" + url.AbsoluteUri);

            List<string> uriList = new List<string>();

            foreach (string linkedPage in linkedPages)
            {
                // Links carrying a fragment marker are ignored entirely.
                if (linkedPage.Contains("#"))
                {
                    continue;
                }

                var result = ValidateUrl.Validate(linkedPage);
                try
                {
                    if (!result.Item1)
                    {
                        // Did not validate on its own: resolve it against the
                        // page address (new Uri may throw on malformed input).
                        uriList.Add(new Uri(url, linkedPage).AbsoluteUri);
                    }
                    else
                    {
                        uriList.Add(linkedPage);
                    }
                }
                catch (Exception ex)
                {
                    // One bad link must not discard the rest of the page.
                    Print.Show(ex.Message);
                }
            }

            return uriList;
        }