Example #1
0
        /// <summary>
        /// Fetches the page at <paramref name="URL"/>, gathers its title, raw HTML,
        /// meta tags, image sources and anchor targets into an <see cref="HtmlData"/>,
        /// and hands the result to <paramref name="stmapper"/>.
        /// </summary>
        /// <param name="URL">Address of the page to fetch and parse.</param>
        /// <param name="stmapper">Writer that receives the populated page data.</param>
        /// <returns>
        /// The collected page data. If any step throws, the failure is logged and
        /// whatever had been collected up to that point is returned.
        /// </returns>
        private static HtmlData ParseURL(string URL, SiteMapWriter stmapper)
        {
            HtmlData pageData = new HtmlData();

            try
            {
                Document page = GetDocumentFromUrl(URL);

                // Reset the container, then copy the page-level fields.
                pageData.Clear();
                pageData.Url     = page.BaseUri;
                pageData.RawHtml = page.Html();
                pageData.Title   = page.Title;

                // Meta tags are stored as their full outer HTML.
                foreach (Element metaTag in page.Select("meta"))
                {
                    string metaHtml = metaTag.OuterHtml();
                    pageData.MetadataList.Add(metaHtml);
                }

                // Images are stored by their "src" attribute value.
                foreach (Element img in page.Select("img"))
                {
                    string source = img.Attr("src");
                    pageData.ImageList.Add(source);
                }

                foreach (Element link in page.Select("a"))
                {
                    string trimmedBase = link.BaseUri.TrimEnd('/');
                    string href        = link.Attr("href");

                    // A link whose href equals the base URI (ignoring trailing
                    // slashes on the base) is recorded as the base URI itself.
                    if (trimmedBase == href)
                    {
                        pageData.AnchorList.Add(new Uri(link.BaseUri));
                        continue;
                    }

                    // Anything rejected by the validator is logged and skipped.
                    if (!URLS.IsValidUri(href))
                    {
                        log.Info("A href was invalid : " + href);
                        continue;
                    }

                    pageData.AnchorList.Add(new Uri(href));
                }

                // Pass the collected page data (images, anchors, ...) to the site map writer.
                stmapper.WritePageStatic(pageData);
            }
            catch (Exception ex)
            {
                // Best effort: report the failure and fall through to return
                // the data gathered so far.
                log.Error("There was a problem retriving URL : " + URL);
                log.Error(ex.Message);
            }

            return pageData;
        }
Example #2
0
        /// <summary>
        /// Returns the URIs from <paramref name="input"/> whose absolute URI, after
        /// being passed through <c>URLS.RemoveWWWfromUrl</c>, differs from the root
        /// URI processed the same way.
        /// </summary>
        /// <param name="input">Candidate URIs to filter.</param>
        /// <returns>A new list holding every URI that does not match the root.</returns>
        public List<Uri> Filter(IEnumerable<Uri> input)
        {
            // The root is normalised inside the predicate, i.e. once per element,
            // and only when there is at least one element to test.
            IEnumerable<Uri> nonRoot =
                from candidate in input
                where URLS.RemoveWWWfromUrl(candidate.AbsoluteUri)
                      != URLS.RemoveWWWfromUrl(_root.AbsoluteUri)
                select candidate;

            return nonRoot.ToList();
        }