/// <summary>
/// Writes the collected data of one page through the <c>sw</c> writer: first a header
/// line containing the page title and URL, then every metadata entry, every anchor and
/// every image, each on its own line and in that order.
/// </summary>
/// <param name="pageData">The page whose title, URL and lists are written out.</param>
/// <remarks>
/// Any exception raised while writing is caught and reported through <c>Program.log</c>;
/// it is not rethrown, so callers are never interrupted by a failed write.
/// </remarks>
public void WritePageStatic(HtmlData pageData)
{
    try
    {
        // Header line identifying the page the following entries belong to.
        string header = "Current Page : " + pageData.Title + " URL :" + pageData.Url;
        sw.WriteLine(header);

        foreach (var metaTag in pageData.MetadataList)
        {
            sw.WriteLine(metaTag);
        }

        foreach (var link in pageData.AnchorList)
        {
            sw.WriteLine(link);
        }

        foreach (var imageSource in pageData.ImageList)
        {
            sw.WriteLine(imageSource);
        }
    }
    catch (Exception writeError)
    {
        // Best effort: record the failure and carry on.
        Program.log.Error("Error wrting page statistic");
        Program.log.Error(writeError.Message);
    }
}
/// <summary>
/// Obtains the document for <paramref name="URL"/> via <c>GetDocumentFromUrl</c> and
/// extracts its base URI, raw HTML, title, <c>meta</c> elements, <c>img</c> sources and
/// anchor targets into an <c>HtmlData</c>, then hands the result to
/// <paramref name="stmapper"/> to be written out.
/// </summary>
/// <param name="URL">Address of the page to parse.</param>
/// <param name="stmapper">Writer that receives the parsed page data.</param>
/// <returns>
/// The <c>HtmlData</c> instance created for this call. If an exception occurs part-way
/// through, the failure is logged and the instance is returned with whatever had been
/// filled in up to that point.
/// </returns>
private static HtmlData ParseURL(string URL, SiteMapWriter stmapper)
{
    HtmlData page = new HtmlData();

    try
    {
        Document doc = GetDocumentFromUrl(URL);

        page.Clear();
        page.Url = doc.BaseUri;
        page.RawHtml = doc.Html();
        page.Title = doc.Title;

        foreach (Element metaTag in doc.Select("meta"))
        {
            page.MetadataList.Add(metaTag.OuterHtml());
        }

        foreach (Element img in doc.Select("img"))
        {
            page.ImageList.Add(img.Attr("src"));
        }

        foreach (Element link in doc.Select("a"))
        {
            string trimmedBase = link.BaseUri.TrimEnd('/');
            string href = link.Attr("href");

            // An href equal to the base URI (ignoring trailing slashes on the base)
            // is recorded as the base URI itself.
            if (trimmedBase == href)
            {
                page.AnchorList.Add(new Uri(link.BaseUri));
                continue;
            }

            if (!URLS.IsValidUri(href))
            {
                log.Info("A href was invalid : " + href);
                continue;
            }

            page.AnchorList.Add(new Uri(href));
        }

        // Record the page's static content (e.g. images) and its links; the links are
        // only listed here, not visited.
        stmapper.WritePageStatic(page);
    }
    catch (Exception parseError)
    {
        log.Error("There was a problem retriving URL : " + URL);
        log.Error(parseError.Message);
    }

    return page;
}