Пример #1
0
        /// <summary>
        /// Parses a page looking for links.
        /// </summary>
        /// <param name="page">The page whose text is to be parsed.</param>
        /// <param name="sourceUrl">The source url of the page.</param>
        public void ParseLinks(Page page, string sourceUrl)
        {
            MatchCollection matches = Regex.Matches(page.Text, _LINK_REGEX);

            for (int i = 0; i <= matches.Count - 1; i++)
            {
                Match anchorMatch = matches[i];

                if (anchorMatch.Value == String.Empty)
                {
                    BadUrls.Add("Blank url value on page " + sourceUrl);
                    continue;
                }

                string foundHref = null;
                try
                {
                    foundHref = anchorMatch.Value.Replace("href=\"", "");
                    foundHref = foundHref.Substring(0, foundHref.IndexOf("\""));
                }
                catch (Exception exc)
                {
                    Exceptions.Add("Error parsing matched href: " + exc.Message);
                }


                if (!GoodUrls.Contains(foundHref))
                {
                    if (foundHref != "/")
                    {
                        if (IsExternalUrl(foundHref))
                        {
                            _externalUrls.Add(foundHref);
                        }
                        else if (!IsAWebPage(foundHref))
                        {
                            foundHref = Crawler.FixPath(sourceUrl, foundHref);
                            _otherUrls.Add(foundHref);
                        }
                        else
                        {
                            GoodUrls.Add(foundHref);
                        }
                    }
                }
            }
        }
Пример #2
0
        /// <summary>
        ///     Parses a page looking for links.
        /// </summary>
        /// <param name="page">The page whose text is to be parsed.</param>
        /// <param name="sourceUrl">The source url of the page.</param>
        public void ParseLinks(InputSite inputSite, Page page, string sourceUrl)
        {
            if (sourceUrl.EndsWith(".xml"))
            {
                var matches = Regex.Matches(page.Text, _SITEMAP_REGEX);

                for (var i = 0; i <= matches.Count - 1; i++)
                {
                    var anchorMatch = matches[i];
                    var foundHref   = BddJson.NormalizeUrl(anchorMatch.Value);
                    // TODO faire un Regex Match
                    foundHref = foundHref.Replace("<loc>", "");
                    foundHref = foundHref.Replace("</loc>", "");

                    if (!IsBad(foundHref) && !GoodUrls.Contains(foundHref))
                    {
                        GoodUrls.Add(foundHref);
                    }
                }
            }
            else
            {
                var matches = Regex.Matches(page.Text, _LINK_REGEX);

                for (var i = 0; i <= matches.Count - 1; i++)
                {
                    var anchorMatch = matches[i];

                    if (anchorMatch.Value == string.Empty)
                    {
                        BadUrls.Add("Blank url value on page " + sourceUrl);
                        continue;
                    }

                    string foundHref = null;
                    try
                    {
                        foundHref = anchorMatch.Value.Replace("href=\"", "");
                        foundHref = foundHref.Substring(0, foundHref.IndexOf("\""));
                    }
                    catch (Exception exc)
                    {
                        Exceptions.Add("Error parsing matched href: " + exc.Message);
                    }

                    foundHref = BddJson.NormalizeUrl(foundHref);

                    if (!IsBad(foundHref) && !GoodUrls.Contains(foundHref))
                    {
                        if (IsExternalUrl(inputSite, foundHref))
                        {
                            ExternalUrls.Add(foundHref);
                        }
                        else if (!IsAWebPage(foundHref))
                        {
                            foundHref = Crawler.FixPath(inputSite, sourceUrl);
                            OtherUrls.Add(foundHref);
                        }
                        else
                        {
                            GoodUrls.Add(foundHref);
                        }
                    }
                }
            }
        }