/// <summary>
/// Processes one seed URL: validates it, skips it when it was already
/// crawled, downloads its HTML, parses the linked pages out of it and
/// enqueues each of them on <c>Queue</c>.
/// </summary>
/// <param name="seed">The URL string to process.</param>
/// <remarks>
/// The method is best-effort: any exception raised while validating,
/// downloading or parsing is reported through
/// <see cref="System.Diagnostics.Trace"/> and the seed is abandoned; nothing
/// is rethrown to the caller.
/// </remarks>
public void DownloadAndEnqueue(string seed)
{
    try
    {
        // Validate returns a pair: Item1 is the success flag, Item2 the value
        // whose AbsoluteUri is used below.
        var validate = ValidateUrl.Validate(seed);
        if (!validate.Item1)
        {
            return;
        }

        // Skip URLs that were already crawled; otherwise mark this one as
        // crawled before downloading so it is not processed a second time.
        if (CrawledUrls.Contains(validate.Item2.AbsoluteUri))
        {
            return;
        }
        CrawledUrls.Add(validate.Item2.AbsoluteUri);

        // Download the page (the original comment says the downloader also
        // saves the HTML file - not verifiable from this block).
        var html = new HtmlDownloader(validate.Item2).Load();
        if (html == null)
        {
            return;
        }

        // Parse the downloaded document and enqueue every URL it links to.
        var linkedPages = Parser.Parse(html, validate.Item2);
        foreach (string linkedPage in linkedPages)
        {
            Queue.Enqueue(linkedPage);
        }
    }
    catch (Exception ex)
    {
        // Keep the crawl going on a bad seed, but do not hide the failure:
        // the original empty catch made download/parse errors invisible.
        System.Diagnostics.Trace.TraceError(
            "DownloadAndEnqueue failed for '" + seed + "': " + ex);
    }
}
/// <summary>
/// Collects the <c>href</c> targets of every <c>a</c> element in
/// <paramref name="html"/> and returns them as a list of URL strings.
/// </summary>
/// <param name="html">The parsed HTML document to scan for anchors.</param>
/// <param name="url">
/// The address of the page; used for log output and as the base
/// <see cref="Uri"/> when a link does not pass validation on its own.
/// </param>
/// <returns>
/// The collected URL strings. Links whose <c>href</c> is missing or empty,
/// or contains a <c>#</c>, are left out, as are links that fail while being
/// validated or resolved.
/// </returns>
public IEnumerable<string> Parse(HtmlDocument html, Uri url)
{
    Print.Show("Parsing url:" + url.AbsoluteUri);

    // Materialise the query once: it is needed both for the count that is
    // logged and for the loop below, and a deferred query would walk the
    // document twice.
    var linkedPages = html.DocumentNode.Descendants("a")
        .Select(a => a.GetAttributeValue("href", null))
        .Where(u => !String.IsNullOrEmpty(u))
        .ToList();
    Print.Show(linkedPages.Count + " urls in" + url.AbsoluteUri);

    List<string> uriList = new List<string>(linkedPages.Count);
    foreach (string linkedPage in linkedPages)
    {
        // Links containing a '#' are skipped entirely.
        if (linkedPage.Contains("#"))
        {
            continue;
        }

        try
        {
            // Validation lives inside the try so that a single link that
            // makes it throw is logged and skipped instead of aborting the
            // whole page.
            var result = ValidateUrl.Validate(linkedPage);
            if (!result.Item1)
            {
                // Not valid on its own: resolve it against the page URL.
                uriList.Add(new Uri(url, linkedPage).AbsoluteUri);
            }
            else
            {
                uriList.Add(linkedPage);
            }
        }
        catch (Exception ex)
        {
            Print.Show(ex.Message);
        }
    }

    return uriList;
}