/// <summary>
/// Fetches the page at <paramref name="URL"/>, collects its title, raw HTML, meta tags,
/// image sources and anchor targets into an <see cref="HtmlData"/>, and hands the result
/// to <paramref name="stmapper"/> via <c>WritePageStatic</c>.
/// </summary>
/// <param name="URL">Address of the page to fetch and parse.</param>
/// <param name="stmapper">Writer that receives the collected page data.</param>
/// <returns>
/// The collected page data. If an exception is raised while fetching or parsing, it is
/// logged and whatever had been collected up to that point is returned.
/// </returns>
private static HtmlData ParseURL(string URL, SiteMapWriter stmapper)
{
    HtmlData parsedData = new HtmlData();
    try
    {
        Document document = GetDocumentFromUrl(URL);

        // Parsing ...
        parsedData.Clear();
        parsedData.Url = document.BaseUri;
        parsedData.RawHtml = document.Html();
        parsedData.Title = document.Title;

        foreach (Element meta in document.Select("meta"))
        {
            parsedData.MetadataList.Add(meta.OuterHtml());
        }

        foreach (Element image in document.Select("img"))
        {
            parsedData.ImageList.Add(image.Attr("src"));
        }

        foreach (Element anchor in document.Select("a"))
        {
            // Read the attribute once; it is used for comparison, validation and logging.
            string href = anchor.Attr("href");

            // An anchor whose href equals the base URI (ignoring trailing slashes on the
            // base) is recorded under the untrimmed base URI; any other href must first
            // pass URLS.IsValidUri.
            string candidate;
            if (anchor.BaseUri.TrimEnd('/') == href)
            {
                candidate = anchor.BaseUri;
            }
            else if (URLS.IsValidUri(href))
            {
                candidate = href;
            }
            else
            {
                candidate = null;
            }

            // TryCreate instead of "new Uri(...)": a single malformed link must not throw
            // into the outer catch, which would drop the remaining anchors and skip
            // WritePageStatic for the whole page.
            Uri target;
            if (candidate != null && Uri.TryCreate(candidate, UriKind.Absolute, out target))
            {
                parsedData.AnchorList.Add(target);
            }
            else
            {
                log.Info("A href was invalid : " + href);
            }
        }

        // Links to static content such as images and to external URLs
        // (recorded, but not visited).
        stmapper.WritePageStatic(parsedData);
    }
    catch (Exception ex)
    {
        log.Error("There was a problem retriving URL : " + URL);
        log.Error(ex.Message);
    }

    return parsedData;
}
/// <summary>
/// Returns the entries of <paramref name="input"/> that do not refer to the root URI.
/// Both sides are compared by their <c>AbsoluteUri</c> after being passed through
/// <c>URLS.RemoveWWWfromUrl</c>.
/// </summary>
/// <param name="input">URIs to filter.</param>
/// <returns>A new list holding every URI whose normalized form differs from the root's.</returns>
public List<Uri> Filter(IEnumerable<Uri> input)
{
    IEnumerable<Uri> nonRootUris =
        from candidate in input
        where URLS.RemoveWWWfromUrl(candidate.AbsoluteUri) != URLS.RemoveWWWfromUrl(_root.AbsoluteUri)
        select candidate;

    return nonRootUris.ToList();
}