Exemplo n.º 1
0
        /// <summary>
        /// Classifies the URL captured by an href match and records it on
        /// <paramref name="page"/> as either an internal or an external link.
        /// </summary>
        /// <remarks>
        /// Blank URLs, the root URL itself, and fragment-only URLs (starting with
        /// "#" or "/#") are ignored. A URL is treated as internal when it contains
        /// <c>_domain</c> or starts with "/"; every other URL is treated as external.
        /// Each URL is stored at most once, keyed by itself.
        /// </remarks>
        /// <param name="page">Page whose <c>Links</c> / <c>ExternalLinks</c> collections receive the URL.</param>
        /// <param name="href">Match whose first capture group holds the raw href value.</param>
        private void ParseHref(FlatPage page, Match href)
        {
            string url = href.Groups[1].Value;

            // A blank href names no target; without this guard it would fall
            // through to the "external" branch and be stored under a blank key.
            if (string.IsNullOrWhiteSpace(url))
            {
                return;
            }

            // Lower-case with the invariant culture so the comparison does not
            // depend on the current thread culture (e.g. Turkish 'I' -> 'ı').
            // NOTE(review): assumes _rootUrl is stored lower-cased — confirm where it is assigned.
            if (url.ToLowerInvariant() == _rootUrl)
            {
                return;
            }

            // Fragment-only links point back into the same document.
            // Ordinal comparison: these are syntactic prefixes, not linguistic text.
            if (url.StartsWith("#", StringComparison.Ordinal) || url.StartsWith("/#", StringComparison.Ordinal))
            {
                return;
            }

            if (url.Contains(_domain) || url.StartsWith("/", StringComparison.Ordinal))
            {
                if (!page.Links.ContainsKey(url))
                {
                    page.Links.Add(url, url);
                }
            }
            else
            {
                if (!page.ExternalLinks.ContainsKey(url))
                {
                    page.ExternalLinks.Add(url, url);
                }
            }
        }
Exemplo n.º 2
0
        /// <summary>
        /// Finds every match of <c>AnchorRegex</c> in <paramref name="pageContent"/>,
        /// searches the first capture group of each match with <c>HrefRegex</c>, and
        /// forwards every successful href match to <c>ParseHref</c>.
        /// </summary>
        /// <param name="pageContent">Text to scan for anchors.</param>
        /// <param name="page">Page that receives the links found.</param>
        private void ParseLinks(string pageContent, FlatPage page)
        {
            foreach (Match anchorMatch in Regex.Matches(pageContent, AnchorRegex, RegexOptions.Singleline))
            {
                string anchorBody = anchorMatch.Groups[1].Value;
                Match hrefMatch = Regex.Match(anchorBody, HrefRegex, RegexOptions.Singleline);

                // Anchors without a recognisable href contribute nothing.
                if (!hrefMatch.Success)
                {
                    continue;
                }

                ParseHref(page, hrefMatch);
            }
        }
Exemplo n.º 3
0
        /// <summary>
        /// Builds a <c>FlatPage</c> for <paramref name="pageUrl"/> and fills it with
        /// the links and images found in <paramref name="pageContent"/>.
        /// </summary>
        /// <param name="pageContent">Text of the page to scan.</param>
        /// <param name="pageUrl">URL stored on the returned page.</param>
        /// <returns>The newly created page, after link and image parsing.</returns>
        /// <exception cref="ArgumentException">
        /// Thrown when <c>_domain</c> is null, empty or whitespace; the message asks
        /// the caller to invoke SetRootUrl first.
        /// </exception>
        public FlatPage Parse(string pageContent, string pageUrl)
        {
            bool domainConfigured = !string.IsNullOrWhiteSpace(_domain);

            if (!domainConfigured)
            {
                throw new ArgumentException("Please invoke SetRootUrl before parsing");
            }

            var flatPage = new FlatPage
            {
                Url = pageUrl
            };

            // Links first, then images — same order as the parsing helpers are declared to run.
            ParseLinks(pageContent, flatPage);
            ParseImages(pageContent, flatPage);

            return flatPage;
        }
Exemplo n.º 4
0
        /// <summary>
        /// Attaches the internal links, images and external links listed on
        /// <paramref name="flatPage"/> to <paramref name="page"/>, then marks
        /// <paramref name="page"/> as visited.
        /// </summary>
        /// <remarks>
        /// For each URL the matching <paramref name="siteDictionary"/> collection is
        /// consulted first: an already registered object is reused, otherwise a new
        /// one is created and registered before being attached to the page.
        /// </remarks>
        /// <param name="page">Page that receives the resolved objects.</param>
        /// <param name="flatPage">Source of the URL collections to merge.</param>
        /// <param name="siteDictionary">Site-wide lookup of pages, images and external links by URL.</param>
        private void MergeFlatPage(Page page, FlatPage flatPage, SiteDictionary siteDictionary)
        {
            // Internal links.
            foreach (var linkEntry in flatPage.Links)
            {
                string linkUrl = linkEntry.Value;
                Page linkedPage;

                if (!siteDictionary.Links.TryGetValue(linkUrl, out linkedPage))
                {
                    linkedPage = new Page { Url = linkUrl };
                    siteDictionary.Links.Add(linkUrl, linkedPage);
                }

                page.InternalLinks.Add(linkedPage);
            }

            // Images.
            foreach (var imageEntry in flatPage.Images)
            {
                string imageUrl = imageEntry.Value;
                Image pageImage;

                if (!siteDictionary.Images.TryGetValue(imageUrl, out pageImage))
                {
                    pageImage = new Image { Url = imageUrl };
                    siteDictionary.Images.Add(imageUrl, pageImage);
                }

                page.Images.Add(pageImage);
            }

            // External links.
            foreach (var externalEntry in flatPage.ExternalLinks)
            {
                string externalUrl = externalEntry.Value;
                ExternalLink externalLink;

                if (!siteDictionary.ExternalLinks.TryGetValue(externalUrl, out externalLink))
                {
                    externalLink = new ExternalLink { Url = externalUrl };
                    siteDictionary.ExternalLinks.Add(externalUrl, externalLink);
                }

                page.ExternalLinks.Add(externalLink);
            }

            page.Visited = true;
        }
Exemplo n.º 5
0
        /// <summary>
        /// Finds every match of <c>ImageRegex</c> in <paramref name="pageContent"/>,
        /// extracts a source URL from the first capture group of each match with
        /// <c>SrcRegex</c>, and records each distinct URL in <c>page.Images</c>,
        /// keyed by itself.
        /// </summary>
        /// <param name="pageContent">Text to scan for images.</param>
        /// <param name="page">Page whose <c>Images</c> collection receives the URLs.</param>
        private void ParseImages(string pageContent, FlatPage page)
        {
            foreach (Match imageMatch in Regex.Matches(pageContent, ImageRegex, RegexOptions.Singleline))
            {
                string imageBody = imageMatch.Groups[1].Value;
                Match srcMatch = Regex.Match(imageBody, SrcRegex, RegexOptions.Singleline);

                // Images without a recognisable source contribute nothing.
                if (!srcMatch.Success)
                {
                    continue;
                }

                string imageUrl = srcMatch.Groups[1].Value;

                // Skip URLs already recorded for this page.
                if (page.Images.ContainsKey(imageUrl))
                {
                    continue;
                }

                page.Images.Add(imageUrl, imageUrl);
            }
        }