Beispiel #1
0
 /// <summary>
 /// Writes the file named by <c>Filename</c> to the response, or sends the
 /// access-denied response when that file does not exist.
 /// </summary>
 /// <param name="context">HTTP context whose response is written to.</param>
 public void ProcessRequest(HttpContext context)
 {
     // Guard clause: without a file on disk there is nothing to serve.
     if (!System.IO.File.Exists(Filename))
     {
         RouteModule.SendAccessIsDaniedResponse(context.Response);
         return;
     }

     // The content type is looked up from the file extension before the file is written.
     var extension = System.IO.Path.GetExtension(Filename);
     context.Response.ContentType = ExtendedHtmlUtility.TranslateContentType(extension);
     context.Response.WriteFile(Filename);
 }
Beispiel #2
0
        /// <summary>
        /// Reads an XML sitemap response and queues every <c>loc</c> URL it lists as a new crawl step.
        /// </summary>
        /// <remarks>
        /// Returns without doing anything unless the status code is <see cref="HttpStatusCode.OK"/>
        /// and <c>IsXmlContent</c> accepts the content type.
        /// </remarks>
        /// <param name="crawler">Crawler that receives the new steps.</param>
        /// <param name="propertyBag">Result of the request being processed.</param>
        /// <returns>A task that completes once all steps have been added.</returns>
        public async Task ProcessAsync(ICrawler crawler, PropertyBag propertyBag)
        {
            if (propertyBag.StatusCode != HttpStatusCode.OK)
            {
                return;
            }

            if (!IsXmlContent(propertyBag.ContentType))
            {
                return;
            }

            using (var reader = propertyBag.GetResponse())
            using (var sr = new StreamReader(reader))
            {
                var mydoc = XDocument.Load(sr);
                if (mydoc.Root == null)
                {
                    return;
                }

                // <loc> elements in the sitemap 0.9 namespace carry the page URLs.
                // NOTE(review): only values starting with "http://" are followed, so
                // "https://" entries are skipped - confirm whether that is intended.
                var qualifiedName = XName.Get("loc", "http://www.sitemaps.org/schemas/sitemap/0.9");
                var urlNodes      =
                    from e in mydoc.Descendants(qualifiedName)
                    where !e.Value.IsNullOrEmpty() && e.Value.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                    select e.Value;

                foreach (var url in urlNodes)
                {
                    // add new crawler steps
                    var baseUrl        = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);
                    var decodedLink    = ExtendedHtmlUtility.HtmlEntityDecode(url);
                    var normalizedLink = NormalizeLink(baseUrl, decodedLink);

                    if (normalizedLink.IsNullOrEmpty())
                    {
                        continue;
                    }

                    // ConfigureAwait(false): nothing after the await needs the captured
                    // context; this matches the HTML processor's AddStepAsync call.
                    await crawler.AddStepAsync(new Uri(normalizedLink), propertyBag.Step.Depth + 1,
                                               propertyBag.Step, new Dictionary <string, object>
                    {
                        { Resources.PropertyBagKeyOriginalUrl, url },
                        { Resources.PropertyBagKeyOriginalReferrerUrl, propertyBag.ResponseUri }
                    }).ConfigureAwait(false);
                }
            }
        }
        /// <summary>
        /// Reads an XML sitemap response and hands every <c>loc</c> URL that starts with
        /// "http://" to <c>crawler.Crawl</c>.
        /// </summary>
        /// <remarks>
        /// Nothing is crawled unless the status code is <see cref="HttpStatusCode.OK"/>, the
        /// response buffer is non-null and non-empty, and <c>IsXmlContent</c> accepts the content type.
        /// </remarks>
        /// <param name="crawler">Crawler that is asked to crawl each listed URL.</param>
        /// <param name="propertyBag">Result of the request being processed; two of its entries are overwritten per URL.</param>
        /// <returns>A completed task whose result is always <c>true</c>.</returns>
        public Task <bool> Process(ICrawler crawler, PropertyBag propertyBag)
        {
            bool hasBody = propertyBag.StatusCode == HttpStatusCode.OK &&
                           propertyBag.Response != null &&
                           propertyBag.Response.Length != 0;

            if (!hasBody || !IsXmlContent(propertyBag.ContentType))
            {
                return(Task.FromResult(true));
            }

            using (var stream = new MemoryStream(propertyBag.Response))
            {
                var sitemap = XDocument.Load(stream);
                if (sitemap.Root != null)
                {
                    // <loc> elements in the sitemap 0.9 namespace carry the page URLs.
                    var locName   = XName.Get("loc", "http://www.sitemaps.org/schemas/sitemap/0.9");
                    var locations = sitemap
                                    .Descendants(locName)
                                    .Where(e => !e.Value.IsNullOrEmpty() && e.Value.StartsWith("http://", StringComparison.OrdinalIgnoreCase))
                                    .Select(e => e.Value);

                    foreach (string location in locations)
                    {
                        // Decode entities, then normalize against the response URL's path part.
                        string target = NormalizeLink(
                            propertyBag.ResponseUri.GetLeftPart(UriPartial.Path),
                            ExtendedHtmlUtility.HtmlEntityDecode(location));

                        if (!target.IsNullOrEmpty())
                        {
                            // Record where the link came from before handing it to the crawler.
                            propertyBag["PropertyBagKeyOriginalUrl"].Value         = location;
                            propertyBag["PropertyBagKeyOriginalReferrerUrl"].Value = propertyBag.ResponseUri;
                            crawler.Crawl(new Uri(target), propertyBag);
                        }
                    }
                }
            }

            return(Task.FromResult(true));
        }
Beispiel #4
0
 /// <summary>
 /// Writes the result of <c>ExtendedHtmlUtility.HtmlEntityEncode</c> for
 /// <paramref name="value"/> to <paramref name="output"/>.
 /// </summary>
 /// <param name="value">Text to encode.</param>
 /// <param name="output">Writer that receives the encoded text.</param>
 public static void HtmlEncode(string value, TextWriter output)
 {
     var encoded = ExtendedHtmlUtility.HtmlEntityEncode(value);
     output.Write(encoded);
 }
Beispiel #5
0
 /// <summary>
 /// Returns the result of <c>ExtendedHtmlUtility.HtmlEntityEncode</c> for <paramref name="value"/>.
 /// </summary>
 /// <param name="value">Text to encode.</param>
 /// <returns>The encoded text produced by the helper.</returns>
 public static string HtmlEncode(string value)
 {
     var encoded = ExtendedHtmlUtility.HtmlEntityEncode(value);
     return encoded;
 }
Beispiel #6
0
        /// <summary>
        /// Parses an HTML response, stores the parsed document, title, meta data and text on the
        /// property bag, and queues every link found in the document as a new crawl step.
        /// </summary>
        /// <remarks>
        /// Returns without doing anything unless the status code is <see cref="HttpStatusCode.OK"/>
        /// and <c>IsHtmlContent</c> accepts the content type.
        /// </remarks>
        /// <param name="crawler">Crawler that receives the discovered links; checked with <c>AspectF ... NotNull</c>.</param>
        /// <param name="propertyBag">Result of the request being processed; checked with <c>AspectF ... NotNull</c>.</param>
        /// <returns>A task that completes once all steps have been added.</returns>
        public virtual async Task ProcessAsync(ICrawler crawler, PropertyBag propertyBag)
        {
            AspectF.Define.
            NotNull(crawler, "crawler").
            NotNull(propertyBag, "propertyBag");

            if (propertyBag.StatusCode != HttpStatusCode.OK)
            {
                return;
            }

            if (!IsHtmlContent(propertyBag.ContentType))
            {
                return;
            }

            var htmlDoc = new HtmlDocument
            {
                OptionAddDebuggingAttributes = false,
                OptionAutoCloseOnEnd         = true,
                OptionFixNestedTags          = true,
                OptionReadEncoding           = true
            };

            using (var reader = propertyBag.GetResponse())
            {
                // Encoding detection consumes the stream, so rewind before the real load.
                var documentEncoding = htmlDoc.DetectEncoding(reader);
                reader.Seek(0, SeekOrigin.Begin);
                if (!documentEncoding.IsNull())
                {
                    htmlDoc.Load(reader, documentEncoding, true);
                }
                else
                {
                    htmlDoc.Load(reader, true);
                }
            }

            // Keep the unmodified markup: link extraction below restarts from it.
            var originalContent = htmlDoc.DocumentNode.OuterHtml;

            if (this.HasTextStripRules || this.HasSubstitutionRules)
            {
                var content = this.StripText(originalContent);
                content = this.Substitute(content, propertyBag.Step);
                using (TextReader tr = new StringReader(content))
                {
                    htmlDoc.Load(tr);
                }
            }

            propertyBag["HtmlDoc"].Value = htmlDoc;

            var nodes = htmlDoc.DocumentNode.SelectNodes("//title");

            // Extract Title
            if (!nodes.IsNull())
            {
                propertyBag.Title = string.Join(";", nodes.
                                                Select(n => n.InnerText).
                                                ToArray()).Trim();
            }

            // Extract Meta Data
            nodes = htmlDoc.DocumentNode.SelectNodes("//meta[@content and @name]");
            if (!nodes.IsNull())
            {
                propertyBag["Meta"].Value = (
                    from entry in nodes
                    let name = entry.Attributes["name"]
                    let content = entry.Attributes["content"]
                    where !name.IsNull() && !name.Value.IsNullOrEmpty() && !content.IsNull() && !content.Value.IsNullOrEmpty()
                    select name.Value + ": " + content.Value).ToArray();
            }

            // Extract text
            propertyBag.Text = htmlDoc.ExtractText().Trim();
            if (this.HasLinkStripRules || this.HasTextStripRules)
            {
                // Reload from the original markup with the link strip rules applied.
                var content = this.StripLinks(originalContent);
                using (TextReader tr = new StringReader(content))
                {
                    htmlDoc.Load(tr);
                }
            }

            var baseUrl = propertyBag.ResponseUri.GetLeftPath();

            // Extract Head Base
            nodes = htmlDoc.DocumentNode.SelectNodes("//head/base[@href]");
            if (!nodes.IsNull())
            {
                // Take the first <base> whose href is present, non-empty and well formed.
                // AddToEnd(baseUrl) supplies the response URL as a further candidate
                // (presumably appended last - confirm against AddToEnd).
                baseUrl =
                    nodes.
                    Select(entry => new { entry, href = entry.Attributes["href"] }).
                    Where(t => !t.href.IsNull() && !t.href.Value.IsNullOrEmpty() &&
                          Uri.IsWellFormedUriString(t.href.Value, UriKind.RelativeOrAbsolute)).
                    Select(t => t.href.Value).
                    AddToEnd(baseUrl).
                    FirstOrDefault();
            }

            // Extract Links
            var links = htmlDoc.GetLinks();

            foreach (var link in links.Links.Union(links.References))
            {
                if (link.IsNullOrEmpty())
                {
                    continue;
                }

                var decodedLink    = ExtendedHtmlUtility.HtmlEntityDecode(link);
                var normalizedLink = this.NormalizeLink(baseUrl, decodedLink);
                if (normalizedLink.IsNullOrEmpty())
                {
                    continue;
                }

                await crawler.AddStepAsync(new Uri(normalizedLink), propertyBag.Step.Depth + 1,
                                           propertyBag.Step, new Dictionary <string, object>
                {
                    { Resources.PropertyBagKeyOriginalUrl, link },
                    { Resources.PropertyBagKeyOriginalReferrerUrl, propertyBag.ResponseUri }
                }).ConfigureAwait(false);
            }
        }
        /// <summary>
        /// Parses an HTML response, stores the parsed document, title, meta data and text on the
        /// property bag, and hands every link found in the document to <c>crawler.Crawl</c>.
        /// </summary>
        /// <remarks>
        /// Nothing is processed unless the status code is <see cref="HttpStatusCode.OK"/>, the
        /// response buffer is non-null, and <c>IsHtmlContent</c> accepts the content type.
        /// </remarks>
        /// <param name="crawler">Crawler that is asked to crawl each link; checked with <c>AspectF ... NotNull</c>.</param>
        /// <param name="propertyBag">Result of the request being processed; checked with <c>AspectF ... NotNull</c>.</param>
        /// <returns>A completed task whose result is always <c>true</c>.</returns>
        public Task <bool> Process(ICrawler crawler, PropertyBag propertyBag)
        {
            AspectF.Define
            .NotNull(crawler, nameof(crawler))
            .NotNull(propertyBag, nameof(propertyBag));

            if (propertyBag.StatusCode != HttpStatusCode.OK)
            {
                return(Task.FromResult(true));
            }

            // A null buffer would make the MemoryStream constructor below throw.
            if (propertyBag.Response == null)
            {
                return(Task.FromResult(true));
            }

            if (!IsHtmlContent(propertyBag.ContentType))
            {
                return(Task.FromResult(true));
            }

            HtmlDocument htmlDoc = new HtmlDocument
            {
                OptionAddDebuggingAttributes = false,
                OptionAutoCloseOnEnd         = true,
                OptionFixNestedTags          = true,
                OptionReadEncoding           = true
            };

            using (MemoryStream ms = new MemoryStream(propertyBag.Response))
            {
                // Encoding detection consumes the stream, so rewind before the real load.
                Encoding documentEncoding = htmlDoc.DetectEncoding(ms);
                ms.Seek(0, SeekOrigin.Begin);
                if (!documentEncoding.IsNull())
                {
                    htmlDoc.Load(ms, documentEncoding, true);
                }
                else
                {
                    htmlDoc.Load(ms, true);
                }
            }

            // Keep the unmodified markup: link extraction below restarts from it.
            string originalContent = htmlDoc.DocumentNode.OuterHtml;

            if (HasTextStripRules || HasSubstitutionRules)
            {
                string content = StripText(originalContent);
                content = Substitute(content, propertyBag.Step);
                using (TextReader tr = new StringReader(content))
                {
                    htmlDoc.Load(tr);
                }
            }

            propertyBag["HtmlDoc"].Value = htmlDoc;

            HtmlNodeCollection nodes = htmlDoc.DocumentNode.SelectNodes("//title");

            // Extract Title
            if (!nodes.IsNull())
            {
                propertyBag.Title = string.Join(";", nodes.
                                                Select(n => n.InnerText).
                                                ToArray()).Trim();
            }

            // Extract Meta Data
            nodes = htmlDoc.DocumentNode.SelectNodes("//meta[@content and @name]");
            if (!nodes.IsNull())
            {
                propertyBag["Meta"].Value = (
                    from entry in nodes
                    let name = entry.Attributes["name"]
                    let content = entry.Attributes["content"]
                    where !name.IsNull() && !name.Value.IsNullOrEmpty() && !content.IsNull() && !content.Value.IsNullOrEmpty()
                    select $"{name.Value}: {content.Value}").ToArray();
            }

            // Extract text
            propertyBag.Text = htmlDoc.ExtractText().Trim();
            if (HasLinkStripRules || HasTextStripRules)
            {
                // Reload from the original markup with the link strip rules applied.
                string content = StripLinks(originalContent);
                using (TextReader tr = new StringReader(content))
                {
                    htmlDoc.Load(tr);
                }
            }

            string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);

            // Extract Head Base
            nodes = htmlDoc.DocumentNode.SelectNodes("//head/base[@href]");
            if (!nodes.IsNull())
            {
                baseUrl = nodes
                          .Select(entry => new { entry, href = entry.Attributes["href"] })
                          .Where(arg => !arg.href.IsNull() &&
                                 !arg.href.Value.IsNullOrEmpty() &&
                                 Uri.IsWellFormedUriString(arg.href.Value, UriKind.RelativeOrAbsolute))
                          .Select(t =>
                {
                    string href = t.href.Value;
                    if (!Uri.IsWellFormedUriString(href, UriKind.Relative))
                    {
                        return(href);
                    }

                    // Resolve a relative base against the response URL. Plain concatenation
                    // onto scheme+server is only right for hrefs that start with "/"
                    // ("sub/" would become "http://hostsub/").
                    Uri resolved;
                    if (Uri.TryCreate(propertyBag.ResponseUri, href, out resolved))
                    {
                        return(resolved.AbsoluteUri);
                    }

                    // Resolution failed: fall back to scheme+server concatenation.
                    return(propertyBag.ResponseUri.GetComponents(UriComponents.SchemeAndServer, UriFormat.Unescaped) + href);
                })
                          .AddToEnd(baseUrl)
                          .FirstOrDefault();
            }

            // Extract Links
            DocumentWithLinks links = htmlDoc.GetLinks();

            foreach (string link in links.Links.Union(links.References))
            {
                if (link.IsNullOrEmpty())
                {
                    continue;
                }

                string decodedLink    = ExtendedHtmlUtility.HtmlEntityDecode(link);
                string normalizedLink = NormalizeLink(baseUrl, decodedLink);
                if (normalizedLink.IsNullOrEmpty())
                {
                    continue;
                }

                crawler.Crawl(new Uri(normalizedLink), propertyBag);
            }

            return(Task.FromResult(true));
        }
Beispiel #8
0
        /// <summary>
        /// Parses an HTML response, stores title, meta data and text on the property bag,
        /// and adds every link found in the document to the crawler as a new step.
        /// </summary>
        /// <remarks>
        /// Returns without doing anything unless the status code is <see cref="HttpStatusCode.OK"/>
        /// and <c>IsHtmlContent</c> accepts the content type.
        /// </remarks>
        /// <param name="crawler">Crawler that receives the new steps; checked with <c>AspectF ... NotNull</c>.</param>
        /// <param name="propertyBag">Result of the request being processed; checked with <c>AspectF ... NotNull</c>.</param>
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            AspectF.Define
            .NotNull(crawler, "crawler")
            .NotNull(propertyBag, "propertyBag");

            if (propertyBag.StatusCode != HttpStatusCode.OK || !IsHtmlContent(propertyBag.ContentType))
            {
                return;
            }

            HtmlDocument document = new HtmlDocument
            {
                OptionAddDebuggingAttributes = false,
                OptionAutoCloseOnEnd         = true,
                OptionFixNestedTags          = true,
                OptionReadEncoding           = true
            };

            using (MemoryStream responseStream = propertyBag.GetResponseStream())
            {
                // Encoding detection consumes the stream, so rewind before the real load.
                Encoding detected = document.DetectEncoding(responseStream);
                responseStream.Seek(0, SeekOrigin.Begin);
                if (detected.IsNull())
                {
                    document.Load(responseStream, true);
                }
                else
                {
                    document.Load(responseStream, detected, true);
                }
            }

            // Keep the unmodified markup: link extraction below restarts from it.
            string rawHtml = document.DocumentNode.OuterHtml;

            if (HasTextStripRules || HasSubstitutionRules)
            {
                string rewritten = Substitute(StripText(rawHtml), propertyBag.Step);
                using (TextReader rewrittenReader = new StringReader(rewritten))
                {
                    document.Load(rewrittenReader);
                }
            }

            // Title: all <title> texts joined with ';'.
            HtmlNodeCollection titleNodes = document.DocumentNode.SelectNodes("//title");
            if (!titleNodes.IsNull())
            {
                string[] titles = titleNodes.Select(n => n.InnerText).ToArray();
                propertyBag.Title = string.Join(";", titles).Trim();
            }

            // Meta data: "name: content" for each <meta> with both values non-empty.
            HtmlNodeCollection metaNodes = document.DocumentNode.SelectNodes("//meta[@content and @name]");
            if (!metaNodes.IsNull())
            {
                propertyBag["Meta"].Value = metaNodes
                                            .Select(entry => new { name = entry.Attributes["name"], content = entry.Attributes["content"] })
                                            .Where(m => !m.name.IsNull() && !m.name.Value.IsNullOrEmpty() && !m.content.IsNull() && !m.content.Value.IsNullOrEmpty())
                                            .Select(m => m.name.Value + ": " + m.content.Value)
                                            .ToArray();
            }

            propertyBag.Text = document.ExtractText().Trim();

            if (HasLinkStripRules || HasTextStripRules)
            {
                // Reload from the original markup with the link strip rules applied.
                using (TextReader strippedReader = new StringReader(StripLinks(rawHtml)))
                {
                    document.Load(strippedReader);
                }
            }

            // Links and references are both followed.
            DocumentWithLinks found = document.GetLinks();

            foreach (string link in found.Links.Union(found.References))
            {
                if (!link.IsNullOrEmpty())
                {
                    string target = NormalizeLink(
                        propertyBag.ResponseUri.GetLeftPart(UriPartial.Path),
                        ExtendedHtmlUtility.HtmlEntityDecode(link));

                    if (!target.IsNullOrEmpty())
                    {
                        crawler.AddStep(new Uri(target), propertyBag.Step.Depth + 1,
                                        propertyBag.Step, new Dictionary <string, object>
                        {
                            { Resources.PropertyBagKeyOriginalUrl, link },
                            { Resources.PropertyBagKeyOriginalReferrerUrl, propertyBag.ResponseUri }
                        });
                    }
                }
            }
        }
        /// <summary>
        /// Writes the result of <c>ExtendedHtmlUtility.HtmlEntityDecode</c> for
        /// <paramref name="value"/> to <paramref name="output"/>.
        /// </summary>
        /// <param name="value">Text to decode.</param>
        /// <param name="output">Writer that receives the decoded text.</param>
        public static void HtmlDecode(string value, TextWriter output)
        {
            output.Write(ExtendedHtmlUtility.HtmlEntityDecode(value));
        }
        /// <summary>
        /// Parses an HTML response, stores the parsed document, title, meta data and text on the
        /// property bag, and passes every link found in the document to <c>AddStepToCrawler</c>.
        /// </summary>
        /// <remarks>
        /// Returns without doing anything unless the status code is <see cref="HttpStatusCode.OK"/>
        /// and <c>IsHtmlContent</c> accepts the content type. Links that raise
        /// <see cref="UriFormatException"/> while being normalized or added are skipped.
        /// </remarks>
        /// <param name="crawler">Crawler that receives the new steps; checked with <c>AspectF ... NotNull</c>.</param>
        /// <param name="propertyBag">Result of the request being processed; checked with <c>AspectF ... NotNull</c>.</param>
        public override void Process(Crawler crawler, PropertyBag propertyBag)
        {
            AspectF.Define.
            NotNull(crawler, "crawler").
            NotNull(propertyBag, "propertyBag");

            if (propertyBag.StatusCode != HttpStatusCode.OK)
            {
                return;
            }

            if (!IsHtmlContent(propertyBag.ContentType))
            {
                return;
            }

            HtmlDocument htmlDoc = new HtmlDocument
            {
                OptionAddDebuggingAttributes = false,
                OptionAutoCloseOnEnd         = true,
                OptionFixNestedTags          = true,
                OptionReadEncoding           = true
            };

            using (Stream reader = propertyBag.GetResponse())
            {
                // Encoding detection consumes the stream, so rewind before the real load.
                Encoding documentEncoding = htmlDoc.DetectEncoding(reader);
                reader.Seek(0, SeekOrigin.Begin);
                if (!documentEncoding.IsNull())
                {
                    htmlDoc.Load(reader, documentEncoding, true);
                }
                else
                {
                    htmlDoc.Load(reader, true);
                }
            }

            // Keep the unmodified markup: link extraction below restarts from it.
            string originalContent = htmlDoc.DocumentNode.OuterHtml;

            if (HasTextStripRules || HasSubstitutionRules)
            {
                string content = StripText(originalContent);
                content = Substitute(content, propertyBag.Step);
                using (TextReader tr = new StringReader(content))
                {
                    htmlDoc.Load(tr);
                }
            }

            propertyBag["HtmlDoc"].Value = htmlDoc;

            HtmlNodeCollection nodes = htmlDoc.DocumentNode.SelectNodes("//title");

            // Extract Title
            if (!nodes.IsNull())
            {
                propertyBag.Title = string.Join(";", nodes.
                                                Select(n => n.InnerText).
                                                ToArray()).Trim();
            }

            // Extract Meta Data
            nodes = htmlDoc.DocumentNode.SelectNodes("//meta[@content and @name]");
            if (!nodes.IsNull())
            {
                propertyBag["Meta"].Value = (
                    from entry in nodes
                    let name = entry.Attributes["name"]
                    let content = entry.Attributes["content"]
                    where !name.IsNull() && !name.Value.IsNullOrEmpty() && !content.IsNull() && !content.Value.IsNullOrEmpty()
                    select name.Value + ": " + content.Value).ToArray();
            }

            // Extract text
            propertyBag.Text = htmlDoc.ExtractText().Trim();
            if (HasLinkStripRules || HasTextStripRules)
            {
                // Reload from the original markup with the link strip rules applied.
                string content = StripLinks(originalContent);
                using (TextReader tr = new StringReader(content))
                {
                    htmlDoc.Load(tr);
                }
            }

            string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);

            // Extract Head Base
            nodes = htmlDoc.DocumentNode.SelectNodes("//head/base[@href]");
            if (!nodes.IsNull())
            {
                // Take the first <base> whose href is present, non-empty and well formed.
                // AddToEnd(baseUrl) supplies the response URL as a further candidate
                // (presumably appended last - confirm against AddToEnd).
                baseUrl =
                    nodes.
                    Select(entry => new { entry, href = entry.Attributes["href"] }).
                    Where(t => !t.href.IsNull() && !t.href.Value.IsNullOrEmpty() &&
                          Uri.IsWellFormedUriString(t.href.Value, UriKind.RelativeOrAbsolute)).
                    Select(t => t.href.Value).
                    AddToEnd(baseUrl).
                    FirstOrDefault();
            }

            // Extract Links
            DocumentWithLinks links = htmlDoc.GetLinks();

            foreach (string link in links.Links.Union(links.References))
            {
                if (link.IsNullOrEmpty())
                {
                    continue;
                }

                string decodedLink = ExtendedHtmlUtility.HtmlEntityDecode(link);
                try
                {
                    string normalizedLink = NormalizeLink(baseUrl, decodedLink);
                    if (normalizedLink.IsNullOrEmpty())
                    {
                        continue;
                    }
                    AddStepToCrawler(crawler, propertyBag, normalizedLink, link);
                }
                catch (UriFormatException)
                {
                    // A link that is not properly formatted must be ignored so the
                    // remaining links on the page are still processed.
                }
            }
        }