Example #1
0
        /// <summary>
        /// Crawls a single page: downloads it, records it, parses its links
        /// and css classes, then recursively crawls every good link found.
        /// </summary>
        /// <param name="url">The url to crawl.</param>
        private static void CrawlPage(string url)
        {
            // Guard clause: never crawl the same url twice.
            if (PageHasBeenCrawled(url))
            {
                return;
            }

            string htmlText = GetWebText(url);

            Page page = new Page
            {
                Text = htmlText,
                Url = url
            };
            page.CalculateViewstateSize();

            _pages.Add(page);

            LinkParser linkParser = new LinkParser();
            linkParser.ParseLinks(page, url);

            CSSClassParser classParser = new CSSClassParser();
            classParser.ParseForCssClasses(page);

            // Merge this page's parse results into the shared result lists.
            AddRangeButNoDuplicates(_externalUrls, linkParser.ExternalUrls);
            AddRangeButNoDuplicates(_otherUrls, linkParser.OtherUrls);
            AddRangeButNoDuplicates(_failedUrls, linkParser.BadUrls);
            AddRangeButNoDuplicates(_classes, classParser.Classes);

            foreach (string exception in linkParser.Exceptions)
            {
                _exceptions.Add(exception);
            }

            // Recurse into every link discovered on this page.
            foreach (string link in linkParser.GoodUrls)
            {
                string formattedLink = link;
                try
                {
                    formattedLink = FixPath(url, formattedLink);

                    if (formattedLink != String.Empty)
                    {
                        CrawlPage(formattedLink);
                    }
                }
                catch (Exception exc)
                {
                    // Record the failure but keep crawling the remaining links.
                    _failedUrls.Add(formattedLink + " (on page at url " + url + ") - " + exc.Message);
                }
            }
        }
        /// <summary>
        /// Parses the page looking for css classes that are in use.
        /// Each class name found is added (without duplicates) to the
        /// parser's class list.
        /// </summary>
        /// <param name="page">The page to parse.</param>
        public void ParseForCssClasses(Page page)
        {
            MatchCollection matches = Regex.Matches(page.Text, _CSS_CLASS_REGEX);

            foreach (Match classMatch in matches)
            {
                // The match is expected to look like: class="name1 name2 ...".
                // Compute the quote positions once instead of re-scanning the
                // matched text three times as the original code did.
                string matchedText = classMatch.Value;
                int firstQuote = matchedText.IndexOf('"');
                int lastQuote = matchedText.LastIndexOf('"');

                // Take the text between the quotes and split it into the
                // individual class names, dropping any extra whitespace.
                string[] classesArray = matchedText
                    .Substring(firstQuote + 1, lastQuote - firstQuote - 1)
                    .Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

                foreach (string classValue in classesArray)
                {
                    // Linear Contains scan keeps _classes duplicate-free;
                    // acceptable for the expected small number of classes.
                    if (!_classes.Contains(classValue))
                    {
                        _classes.Add(classValue);
                    }
                }
            }
        }
Example #3
0
        /// <summary>
        /// Parses a page looking for links and sorts each found url into
        /// good (crawlable), external, other (non-page), or bad buckets.
        /// </summary>
        /// <param name="page">The page whose text is to be parsed.</param>
        /// <param name="sourceUrl">The source url of the page.</param>
        public void ParseLinks(Page page, string sourceUrl)
        {
            MatchCollection matches = Regex.Matches(page.Text, _LINK_REGEX);

            for (int i = 0; i <= matches.Count - 1; i++)
            {
                Match anchorMatch = matches[i];

                if (anchorMatch.Value == String.Empty)
                {
                    BadUrls.Add("Blank url value on page " + sourceUrl);
                    continue;
                }

                string foundHref;
                try
                {
                    // Strip the leading href=" and everything from the closing
                    // quote onwards to isolate the raw url.
                    foundHref = anchorMatch.Value.Replace("href=\"", "");
                    foundHref = foundHref.Substring(0, foundHref.IndexOf("\""));
                }
                catch (Exception exc)
                {
                    Exceptions.Add("Error parsing matched href: " + exc.Message);

                    // BUG FIX: the original fell through here with a null or
                    // half-parsed foundHref, which was then passed to
                    // IsExternalUrl/IsAWebPage and could throw or record a
                    // garbage url. Skip this match instead.
                    continue;
                }

                if (!GoodUrls.Contains(foundHref))
                {
                    if (IsExternalUrl(foundHref))
                    {
                        _externalUrls.Add(foundHref);
                    }
                    else if (!IsAWebPage(foundHref))
                    {
                        // Not a crawlable page (e.g. image, document); resolve
                        // it relative to the source page and file it separately.
                        foundHref = Crawler.FixPath(sourceUrl, foundHref);
                        _otherUrls.Add(foundHref);
                    }
                    else
                    {
                        GoodUrls.Add(foundHref);
                    }
                }
            }
        }