Пример #1
0
		/// <summary>
		/// Writes a successfully downloaded response to a temporary file (named with the
		/// extension mapped from the content type) and reads it back through a
		/// <c>FilterReader</c> to populate the property bag's text.
		/// </summary>
		/// <param name="crawler">The running crawler; not used by this step.</param>
		/// <param name="propertyBag">Download result; its Title and Text are assigned here.</param>
		public void Process(Crawler crawler, PropertyBag propertyBag)
		{
			// Only successful downloads are processed.
			if (propertyBag.StatusCode != HttpStatusCode.OK)
			{
				return;
			}

			// Content types without a mapped extension are skipped.
			string fileExtension = MapContentTypeToExtension(propertyBag.ContentType);
			if (fileExtension.IsNullOrEmpty())
			{
				return;
			}

			propertyBag.Title = propertyBag.Step.Uri.PathAndQuery;

			using (TempFile scratchFile = new TempFile())
			{
				scratchFile.FileName = scratchFile.FileName + "." + fileExtension;

				// Spool the response to disk first; the reader below is given a file name.
				using (FileStream output = new FileStream(scratchFile.FileName, FileMode.Create, FileAccess.Write, FileShare.Read, 4096))
				{
					using (Stream response = propertyBag.GetResponse())
					{
						response.CopyToStream(output);
					}
				}

				using (FilterReader textFilter = new FilterReader(scratchFile.FileName))
				{
					propertyBag.Text = textFilter.ReadToEnd().Trim();
				}
			}
		}
Пример #2
0
        /// <summary>
        ///     Stores the response in a temporary file whose extension is derived from the
        ///     content type, then reads that file through a <c>FilterReader</c> and assigns
        ///     the trimmed result to the property bag's text.
        /// </summary>
        /// <param name="crawler">The running crawler; not used by this step.</param>
        /// <param name="propertyBag">Download result; its Title and Text are assigned here.</param>
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            if (propertyBag.StatusCode != HttpStatusCode.OK)
            {
                return;
            }

            var mappedExtension = MapContentTypeToExtension(propertyBag.ContentType);
            if (mappedExtension.IsNullOrEmpty())
            {
                // No extension is mapped for this content type.
                return;
            }

            propertyBag.Title = propertyBag.Step.Uri.PathAndQuery;

            using (var workFile = new TempFile())
            {
                workFile.FileName = string.Concat(workFile.FileName, ".", mappedExtension);

                // The file stream is opened before the response stream, as in the original flow.
                using (var target = new FileStream(workFile.FileName, FileMode.Create, FileAccess.Write, FileShare.Read, 4096))
                using (var source = propertyBag.GetResponse())
                {
                    source.CopyToStream(target);
                }

                using (var extractor = new FilterReader(workFile.FileName))
                {
                    var extracted = extractor.ReadToEnd();
                    propertyBag.Text = extracted.Trim();
                }
            }
        }
Пример #3
0
        /// <summary>
        ///     Saves a successfully downloaded response to a temporary file, reads its ID3
        ///     tags with <c>UltraID3</c> and copies them into "MP3_*" properties.
        /// </summary>
        /// <param name="crawler">The running crawler; not used by this step.</param>
        /// <param name="propertyBag">Download result that receives the MP3 properties.</param>
        /// <returns>An already completed task; all work is done synchronously.</returns>
        public Task ProcessAsync(ICrawler crawler, PropertyBag propertyBag)
        {
            if (propertyBag.StatusCode == HttpStatusCode.OK)
            {
                using (var scratch = new TempFile())
                {
                    // Spool the response to disk; the tag reader is given a file name.
                    using (var output = new FileStream(scratch.FileName, FileMode.Create, FileAccess.Write, FileShare.Read, 4096))
                    {
                        using (var response = propertyBag.GetResponse())
                        {
                            response.CopyTo(output);
                        }
                    }

                    var tags = new UltraID3();
                    tags.Read(scratch.FileName);

                    propertyBag["MP3_Album"].Value = tags.Album;
                    propertyBag["MP3_Artist"].Value = tags.Artist;
                    propertyBag["MP3_Comments"].Value = tags.Comments;
                    propertyBag["MP3_Duration"].Value = tags.Duration;
                    propertyBag["MP3_Genre"].Value = tags.Genre;
                    propertyBag["MP3_Title"].Value = tags.Title;
                }
            }

            return Task.CompletedTask;
        }
Пример #4
0
        /// <summary>
        ///     Copies a successfully downloaded response into a temporary file, parses its
        ///     ID3 tags with <c>UltraID3</c> and exposes them as "MP3_*" properties.
        /// </summary>
        /// <param name="crawler">The running crawler; not used by this step.</param>
        /// <param name="propertyBag">Download result that receives the MP3 properties.</param>
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            if (propertyBag.StatusCode != HttpStatusCode.OK)
            {
                return;
            }

            using (TempFile scratch = new TempFile())
            {
                string path = scratch.FileName;

                // Write the response to disk before handing the path to the tag reader.
                using (FileStream output = new FileStream(path, FileMode.Create, FileAccess.Write, FileShare.Read, 4096))
                {
                    using (Stream response = propertyBag.GetResponse())
                    {
                        response.CopyToStream(output);
                    }
                }

                UltraID3 tags = new UltraID3();
                tags.Read(path);

                propertyBag["MP3_Album"].Value = tags.Album;
                propertyBag["MP3_Artist"].Value = tags.Artist;
                propertyBag["MP3_Comments"].Value = tags.Comments;
                propertyBag["MP3_Duration"].Value = tags.Duration;
                propertyBag["MP3_Genre"].Value = tags.Genre;
                propertyBag["MP3_Title"].Value = tags.Title;
            }
        }
Пример #5
0
        /// <summary>
        ///     Writes a successfully downloaded response to a temporary file named with the
        ///     extension mapped from the content type, then asynchronously reads it through
        ///     a <c>FilterReader</c> and stores the trimmed text in the property bag.
        /// </summary>
        /// <param name="crawler">The running crawler; not used by this step.</param>
        /// <param name="propertyBag">Download result; its Title and Text are assigned here.</param>
        /// <returns>A task that completes once the text has been stored.</returns>
        public async Task ProcessAsync(ICrawler crawler, PropertyBag propertyBag)
        {
            if (propertyBag.StatusCode != HttpStatusCode.OK)
            {
                return;
            }

            string fileExtension = MapContentTypeToExtension(propertyBag.ContentType);
            if (fileExtension.IsNullOrEmpty())
            {
                // No extension is mapped for this content type.
                return;
            }

            propertyBag.Title = propertyBag.Step.Uri.PathAndQuery;

            using (var scratch = new TempFile())
            {
                scratch.FileName = scratch.FileName + "." + fileExtension;

                using (var output = new FileStream(scratch.FileName, FileMode.Create, FileAccess.Write, FileShare.Read, 4096))
                {
                    using (var response = propertyBag.GetResponse())
                    {
                        response.CopyTo(output);
                    }
                }

                using (var textFilter = new FilterReader(scratch.FileName))
                {
                    propertyBag.Text = (await textFilter.ReadToEndAsync()).Trim();
                }
            }
        }
 /// <summary>
 /// Loads the response held by <paramref name="propertyBag"/> into an <c>HtmlDocument</c>.
 /// </summary>
 /// <remarks>
 /// The encoding named by <c>propertyBag.CharacterSet</c> is used when it resolves to a known
 /// encoding. When the charset is missing, unknown, or "ISO-8859-1" (compared case-insensitively),
 /// the encoding is detected from the document instead. Previously a missing or unknown charset
 /// made <c>Encoding.GetEncoding</c> throw, so the whole document was discarded.
 /// </remarks>
 /// <param name="propertyBag">Download result providing the response stream and charset.</param>
 /// <returns>The parsed document, or <c>null</c> when loading fails for any reason.</returns>
 public static HtmlDocument LoadFromHtml(PropertyBag propertyBag)
 {
     try
     {
         HtmlDocument htmlDoc = new HtmlDocument
         {
             OptionAddDebuggingAttributes = false,
             OptionAutoCloseOnEnd         = true,
             OptionFixNestedTags          = true,
             OptionReadEncoding           = true
         };
         using (Stream reader = propertyBag.GetResponse())
         {
             string   charset          = propertyBag.CharacterSet;
             Encoding documentEncoding = null;

             // Charset names are case-insensitive, so compare without regard to case.
             bool isLatin1 = string.Equals(charset, "ISO-8859-1", StringComparison.OrdinalIgnoreCase);
             if (!isLatin1 && !string.IsNullOrEmpty(charset))
             {
                 try
                 {
                     documentEncoding = Encoding.GetEncoding(charset);
                 }
                 catch (ArgumentException)
                 {
                     // Unknown charset name: fall through to detection below.
                 }
             }

             if (documentEncoding == null)
             {
                 documentEncoding = htmlDoc.DetectEncoding(reader);
             }

             // Rewind in case detection consumed part of the stream.
             reader.Seek(0, SeekOrigin.Begin);
             if (documentEncoding != null)
             {
                 htmlDoc.Load(reader, documentEncoding, true);
             }
             else
             {
                 htmlDoc.Load(reader, true);
             }
         }
         return htmlDoc;
     }
     catch (Exception)
     {
         // Best-effort load: callers receive null instead of an exception.
         return null;
     }
 }
Пример #7
0
        /// <summary>
        ///     Downloads <paramref name="crawlStep"/> through <c>DownloadAsync</c> and returns the
        ///     property bag delivered to the completion callback. When the bag exposes a response,
        ///     its content is copied into a byte array and <c>GetResponse</c> is replaced by a
        ///     factory that returns a new <see cref="MemoryStream"/> over that array.
        /// </summary>
        /// <param name="crawlStep">The step to download.</param>
        /// <param name="referrer">The referring step passed through to <c>DownloadAsync</c>.</param>
        /// <param name="method">The download method passed through to <c>DownloadAsync</c>.</param>
        /// <returns>The property bag reported by the callback.</returns>
        /// <exception cref="Exception">Wraps the exception reported by the callback, if any.</exception>
        private async Task <PropertyBag> DownloadInternalSync(CrawlStep crawlStep, CrawlStep referrer, DownloadMethod method)
        {
            PropertyBag downloaded = null;
            Exception   failure    = null;

            using (var completed = new ManualResetEvent(false))
            {
                await DownloadAsync <object>(crawlStep, referrer, method,
                                             (RequestState <object> state) =>
                {
                    if (!state.Exception.IsNull())
                    {
                        failure = state.Exception;
                    }
                    else
                    {
                        downloaded = state.PropertyBag;
                        if (!downloaded.GetResponse.IsNull())
                        {
                            using (var response = downloaded.GetResponse())
                            {
                                byte[] buffer;
                                var    inMemory = response as MemoryStream;
                                if (inMemory != null)
                                {
                                    buffer = inMemory.ToArray();
                                }
                                else
                                {
                                    using (var copy = response.CopyToMemory())
                                    {
                                        buffer = copy.ToArray();
                                    }
                                }

                                // Each later GetResponse() call gets its own stream over the buffered bytes.
                                downloaded.GetResponse = () => new MemoryStream(buffer);
                            }
                        }
                    }

                    // Signal completion whether the download succeeded or failed.
                    completed.Set();
                    return Task.FromResult(0);
                }, null, null);

                completed.WaitOne();
            }

            if (!failure.IsNull())
            {
                throw new Exception("Error write downloading {0}".FormatWith(crawlStep.Uri), failure);
            }

            return downloaded;
        }
        /// <summary>
        ///     Sends the leading part of the extracted text to the Google AJAX language detection
        ///     endpoint and stores the detected language in the "Language" and "LanguageCulture"
        ///     properties. Failures are logged and otherwise ignored.
        /// </summary>
        /// <param name="crawler">The running crawler; must not be null.</param>
        /// <param name="propertyBag">Download result whose Text is analysed; must not be null.</param>
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            AspectF.Define.
            NotNull(crawler, "crawler").
            NotNull(propertyBag, "propertyBag");

            string content = propertyBag.Text;

            if (content.IsNullOrEmpty())
            {
                return;
            }

            string contentLookupText = content.Max(MaxPostSize);

            try
            {
                // The text goes into a query-string value, so it must be escaped: raw '&', '#' or '+'
                // would otherwise truncate or corrupt the q parameter.
                string encodedRequestUrlFragment =
                    "http://ajax.googleapis.com/ajax/services/language/detect?v=1.0&q={0}".FormatWith(
                        Uri.EscapeDataString(contentLookupText));

                m_Logger.Verbose("Google language detection using: {0}", encodedRequestUrlFragment);

                IWebDownloader downloader = NCrawlerModule.Container.Resolve <IWebDownloader>();
                PropertyBag    result     = downloader.Download(new CrawlStep(new Uri(encodedRequestUrlFragment), 0), null, DownloadMethod.GET);
                if (result.IsNull())
                {
                    return;
                }

                using (Stream responseReader = result.GetResponse())
                using (StreamReader reader = new StreamReader(responseReader))
                {
                    string json = reader.ReadLine();
                    if (string.IsNullOrEmpty(json))
                    {
                        // Report an empty response explicitly rather than failing inside the serializer.
                        m_Logger.Error("Error during google language detection, the error was: {0}", "empty response");
                        return;
                    }

                    using (MemoryStream ms = new MemoryStream(Encoding.Unicode.GetBytes(json)))
                    {
                        DataContractJsonSerializer ser =
                            new DataContractJsonSerializer(typeof(LanguageDetector));
                        LanguageDetector detector = ser.ReadObject(ms) as LanguageDetector;

                        if (!detector.IsNull())
                        {
                            CultureInfo culture = CultureInfo.GetCultureInfo(detector.responseData.language);
                            propertyBag["Language"].Value        = detector.responseData.language;
                            propertyBag["LanguageCulture"].Value = culture;
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                // Language detection is best-effort; a failure must not abort the pipeline.
                m_Logger.Error("Error during google language detection, the error was: {0}", ex.ToString());
            }
        }
Пример #9
0
        /// <summary>
        ///     Parses a successfully downloaded XML response as a sitemap and queues every
        ///     "loc" entry that starts with "http://" as a new crawl step one level deeper.
        /// </summary>
        /// <param name="crawler">The crawler that receives the new steps.</param>
        /// <param name="propertyBag">Download result containing the sitemap response.</param>
        /// <returns>A task that completes once all steps have been added.</returns>
        public async Task ProcessAsync(ICrawler crawler, PropertyBag propertyBag)
        {
            if (propertyBag.StatusCode != HttpStatusCode.OK || !IsXmlContent(propertyBag.ContentType))
            {
                return;
            }

            using (var response = propertyBag.GetResponse())
            using (var textReader = new StreamReader(response))
            {
                var sitemap = XDocument.Load(textReader);
                if (sitemap.Root == null)
                {
                    return;
                }

                var locName   = XName.Get("loc", "http://www.sitemaps.org/schemas/sitemap/0.9");
                var locations = sitemap.Descendants(locName).
                                Select(element => element.Value).
                                Where(value => !value.IsNullOrEmpty() && value.StartsWith("http://", StringComparison.OrdinalIgnoreCase));

                foreach (var location in locations)
                {
                    // Resolve each entry against the response URL before queueing it.
                    var baseUrl    = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);
                    var decoded    = ExtendedHtmlUtility.HtmlEntityDecode(location);
                    var normalized = NormalizeLink(baseUrl, decoded);

                    if (!normalized.IsNullOrEmpty())
                    {
                        var stepProperties = new Dictionary <string, object>
                        {
                            { Resources.PropertyBagKeyOriginalUrl, location },
                            { Resources.PropertyBagKeyOriginalReferrerUrl, propertyBag.ResponseUri }
                        };

                        await crawler.AddStepAsync(new Uri(normalized), propertyBag.Step.Depth + 1,
                                                   propertyBag.Step, stepProperties);
                    }
                }
            }
        }
Пример #10
0
        /// <summary>
        ///     Reads a successfully downloaded XML response as a sitemap and adds a crawl step,
        ///     one level deeper, for every "loc" entry that starts with "http://".
        /// </summary>
        /// <param name="crawler">The crawler that receives the new steps.</param>
        /// <param name="propertyBag">Download result containing the sitemap response.</param>
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            if (propertyBag.StatusCode != HttpStatusCode.OK)
            {
                return;
            }

            if (!IsXmlContent(propertyBag.ContentType))
            {
                return;
            }

            using (Stream response = propertyBag.GetResponse())
            {
                using (StreamReader textReader = new StreamReader(response))
                {
                    XDocument sitemap = XDocument.Load(textReader);
                    if (sitemap.Root == null)
                    {
                        return;
                    }

                    XName locName = XName.Get("loc", "http://www.sitemaps.org/schemas/sitemap/0.9");
                    IEnumerable<string> locations = sitemap.Descendants(locName)
                        .Where(element => !element.Value.IsNullOrEmpty() &&
                                          element.Value.StartsWith("http://", StringComparison.OrdinalIgnoreCase))
                        .Select(element => element.Value);

                    foreach (string location in locations)
                    {
                        // Resolve the entry against the response URL; skip entries that do not normalize.
                        string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);
                        string normalized = NormalizeLink(baseUrl, ExtendedHtmlUtility.HtmlEntityDecode(location));
                        if (normalized.IsNullOrEmpty())
                        {
                            continue;
                        }

                        Dictionary<string, object> stepProperties = new Dictionary<string, object>
                            {
                                {Resources.PropertyBagKeyOriginalUrl, location},
                                {Resources.PropertyBagKeyOriginalReferrerUrl, propertyBag.ResponseUri}
                            };

                        crawler.AddStep(new Uri(normalized), propertyBag.Step.Depth + 1, propertyBag.Step, stepProperties);
                    }
                }
            }
        }
Пример #11
0
        /// <summary>
        ///     Reads the whole response of a successfully downloaded text document and stores
        ///     the trimmed content as the property bag's text.
        /// </summary>
        /// <param name="crawler">The running crawler; not used by this step.</param>
        /// <param name="propertyBag">Download result; its Text is assigned here.</param>
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            // Status is checked first; the content-type test only runs for successful downloads.
            if (propertyBag.StatusCode != HttpStatusCode.OK || !IsTextContent(propertyBag.ContentType))
            {
                return;
            }

            using (Stream response = propertyBag.GetResponse())
            {
                propertyBag.Text = response.ReadToEnd().Trim();
            }
        }
Пример #12
0
        /// <summary>
        ///     Loads the downloaded page as HTML, recognises the site type from the response URL
        ///     and runs <c>Parse</c> over the document. The parse result is not used here.
        /// </summary>
        /// <param name="crawler">The running crawler; not used by this step.</param>
        /// <param name="propertyBag">Download result providing the response and its URL.</param>
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            // The original opened this stream without ever disposing it. The call is kept so any
            // failure it raises still surfaces, but the stream is now released on exit.
            using (propertyBag.GetResponse())
            {
                try
                {
                    HtmlDocument htmlDoc  = HtmlParse.LoadFromHtml(propertyBag);
                    var          siteType = HtmlParse.RecogSite(propertyBag.ResponseUri);

                    // Result intentionally discarded; the former null check on it had no effect.
                    Parse(htmlDoc, siteType);
                }
                catch (NullReferenceException)
                {
                    // Deliberately ignored, presumably so an unparseable page does not stop the crawl.
                    // NOTE(review): consider logging here once a logger is available.
                }
            }
        }
Пример #13
0
		/// <summary>
		/// Stores the trimmed body of a successfully downloaded text response in the
		/// property bag's text.
		/// </summary>
		/// <param name="crawler">The running crawler; not used by this step.</param>
		/// <param name="propertyBag">Download result; its Text is assigned here.</param>
		public void Process(Crawler crawler, PropertyBag propertyBag)
		{
			bool downloadSucceeded = propertyBag.StatusCode == HttpStatusCode.OK;

			// The content type is only inspected for successful downloads.
			if (downloadSucceeded && IsTextContent(propertyBag.ContentType))
			{
				using (Stream body = propertyBag.GetResponse())
				{
					string rawText = body.ReadToEnd();
					propertyBag.Text = rawText.Trim();
				}
			}
		}
Пример #14
0
        /// <summary>
        ///     Asynchronously reads a successfully downloaded text response and stores its
        ///     trimmed content as the property bag's text.
        /// </summary>
        /// <param name="crawler">The running crawler; not used by this step.</param>
        /// <param name="propertyBag">Download result; its Text is assigned here.</param>
        /// <returns>Always 0.</returns>
        private async Task <int> ProcessCoreAsync(ICrawler crawler, PropertyBag propertyBag)
        {
            // The content type is only inspected for successful downloads.
            if (propertyBag.StatusCode == HttpStatusCode.OK && IsTextContent(propertyBag.ContentType))
            {
                using (var response = propertyBag.GetResponse())
                {
                    var rawText = await response.ReadToEndAsync().ConfigureAwait(false);
                    propertyBag.Text = rawText.Trim();
                }
            }

            return 0;
        }
Пример #15
0
        /// <summary>
        ///     Extracts the title and the text of every page from a successfully downloaded PDF
        ///     response and stores them in the property bag.
        /// </summary>
        /// <param name="crawler">The running crawler; must not be null.</param>
        /// <param name="propertyBag">Download result; its Title and Text are assigned here.</param>
        /// <returns>An already completed task; all work is done synchronously.</returns>
        public Task ProcessAsync(ICrawler crawler, PropertyBag propertyBag)
        {
            AspectF.Define.
            NotNull(crawler, "crawler").
            NotNull(propertyBag, "propertyBag");

            if (propertyBag.StatusCode != HttpStatusCode.OK)
            {
                return Task.CompletedTask;
            }

            if (!IsPdfContent(propertyBag.ContentType))
            {
                return Task.CompletedTask;
            }

            using (var input = propertyBag.GetResponse())
            {
                var pdfReader = new PdfReader(input);
                try
                {
                    if (pdfReader.Info.TryGetValue("Title", out var title))
                    {
                        propertyBag.Title = Convert.ToString(title, CultureInfo.InvariantCulture).Trim();
                    }

                    // A SimpleTextExtractionStrategy accumulates everything it has rendered, so sharing
                    // one instance across pages makes each later page repeat the earlier pages' text.
                    // A fresh strategy per page keeps each page's text separate.
                    propertyBag.Text = Enumerable.Range(1, pdfReader.NumberOfPages).
                                       Select(pageNumber => PdfTextExtractor.GetTextFromPage(pdfReader, pageNumber, new SimpleTextExtractionStrategy())).
                                       Join(Environment.NewLine);
                }
                finally
                {
                    pdfReader.Close();
                }
            }

            return Task.CompletedTask;
        }
		/// <summary>
		/// Extracts the title and the text of every page from a successfully downloaded PDF
		/// response and stores them in the property bag.
		/// </summary>
		/// <param name="crawler">The running crawler; must not be null.</param>
		/// <param name="propertyBag">Download result; its Title and Text are assigned here.</param>
		public void Process(Crawler crawler, PropertyBag propertyBag)
		{
			AspectF.Define.
				NotNull(crawler, "crawler").
				NotNull(propertyBag, "propertyBag");

			if (propertyBag.StatusCode != HttpStatusCode.OK)
			{
				return;
			}

			if (!IsPdfContent(propertyBag.ContentType))
			{
				return;
			}

			using (Stream input = propertyBag.GetResponse())
			{
				PdfReader pdfReader = new PdfReader(input);
				try
				{
					string title;
					if (pdfReader.Info.TryGetValue("Title", out title))
					{
						propertyBag.Title = Convert.ToString(title, CultureInfo.InvariantCulture).Trim();
					}

					// A SimpleTextExtractionStrategy accumulates everything it has rendered, so sharing
					// one instance across pages makes each later page repeat the earlier pages' text.
					// A fresh strategy per page keeps each page's text separate.
					propertyBag.Text = Enumerable.Range(1, pdfReader.NumberOfPages).
						Select(pageNumber => PdfTextExtractor.GetTextFromPage(pdfReader, pageNumber, new SimpleTextExtractionStrategy())).
						Join(Environment.NewLine);
				}
				finally
				{
					pdfReader.Close();
				}
			}
		}
        /// <summary>
        ///     Loads the downloaded page as HTML, parses it according to the site type recognised
        ///     from the response URL, and adds every resulting record to <c>DAL.Data</c> while
        ///     incrementing <c>ci.Count</c>.
        /// </summary>
        /// <param name="crawler">The running crawler; not used by this step.</param>
        /// <param name="propertyBag">Download result providing the response and its URL.</param>
        public void Process(NCrawler.Crawler crawler, PropertyBag propertyBag)
        {
            // The original opened this stream without ever disposing it. The call is kept so any
            // failure it raises still surfaces, but the stream is now released on exit.
            using (propertyBag.GetResponse())
            {
                try
                {
                    HtmlDocument htmlDoc  = HtmlParse.LoadFromHtml(propertyBag);
                    var          siteType = HtmlParse.RecogSite(propertyBag.ResponseUri);
                    var          records  = Parse(htmlDoc, siteType);
                    if (records == null)
                    {
                        return;
                    }

                    foreach (var record in records)
                    {
                        DAL.Data.Add(record);
                        ++ci.Count;
                    }
                }
                catch (NullReferenceException)
                {
                    // Deliberately ignored, presumably so an unparseable page does not stop the crawl.
                    // NOTE(review): consider logging here once a logger is available.
                }
            }
        }
Пример #18
0
        /// <summary>
        ///     Reads the title and a rough text rendering from a successfully downloaded PDF
        ///     response by walking the content tokens of every page.
        /// </summary>
        /// <param name="crawler">The running crawler; must not be null.</param>
        /// <param name="propertyBag">Download result; its Title and Text are assigned here.</param>
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            AspectF.Define.
                NotNull(crawler, "crawler").
                NotNull(propertyBag, "propertyBag");

            // The content type is only inspected for successful downloads.
            if (propertyBag.StatusCode != HttpStatusCode.OK || !IsPdfContent(propertyBag.ContentType))
            {
                return;
            }

            using (Stream pdfStream = propertyBag.GetResponse())
            {
                PdfReader document = new PdfReader(pdfStream);
                try
                {
                    // The title is only assigned when it is present and non-empty after trimming.
                    object rawTitle = document.Info["Title"];
                    if (!rawTitle.IsNull())
                    {
                        string trimmedTitle = Convert.ToString(rawTitle, CultureInfo.InvariantCulture).Trim();
                        if (!trimmedTitle.IsNullOrEmpty())
                        {
                            propertyBag.Title = trimmedTitle;
                        }
                    }

                    StringBuilder text = new StringBuilder();

                    // Token walk adapted from:
                    // http://www.vbforums.com/showthread.php?t=475759
                    for (int page = 1; page <= document.NumberOfPages; page++)
                    {
                        byte[] pageContent = document.GetPageContent(page);
                        if (!pageContent.IsNull())
                        {
                            PRTokeniser tokeniser = new PRTokeniser(pageContent);
                            while (tokeniser.NextToken())
                            {
                                int    kind  = tokeniser.TokenType;
                                string value = tokeniser.StringValue;

                                if (kind == PRTokeniser.TK_STRING)
                                {
                                    text.Append(value);
                                    text.Append(" ");
                                }
                                else if ((kind == 1 && value == "-600") || (kind == 10 && value == "TJ"))
                                {
                                    // These two token patterns only contribute a separating space.
                                    text.Append(" ");
                                }
                            }
                        }
                    }

                    propertyBag.Text = text.ToString();
                }
                finally
                {
                    document.Close();
                }
            }
        }
Пример #19
0
        /// <summary>
        ///     Downloads robots.txt from the start page's host and stores, in upper case, the
        ///     Disallow entries of every User-Agent section that names "*" or this downloader's
        ///     user agent. Any failure leaves the deny list untouched.
        /// </summary>
        private void Initialize()
        {
            try
            {
                Uri         robotsUri = new Uri("http://{0}/robots.txt".FormatWith(m_StartPageUri.Host));
                PropertyBag robots    = m_WebDownloader.Download(new CrawlStep(robotsUri, 0), null, DownloadMethod.GET);

                if (robots == null || robots.StatusCode != HttpStatusCode.OK)
                {
                    return;
                }

                string fileContents;
                using (StreamReader stream = new StreamReader(robots.GetResponse(), Encoding.ASCII))
                {
                    fileContents = stream.ReadToEnd();
                }

                // Split on CR and LF explicitly. Environment.NewLine is only "\n" on some platforms,
                // which would leave a trailing '\r' on every line of a CRLF-terminated file.
                string[] fileLines = fileContents.Split(new[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries);

                bool          rulesApply = false;
                List <string> rules      = new List <string>();
                foreach (string line in fileLines)
                {
                    RobotInstruction ri = new RobotInstruction(line);
                    if (ri.Instruction.IsNullOrEmpty())
                    {
                        continue;
                    }

                    switch (ri.Instruction[0])
                    {
                    case '#':                                 // comment - ignore
                        break;

                    case 'u':                                 // User-Agent
                        // The following rules apply when the section targets everyone or this agent.
                        rulesApply = (ri.UrlOrAgent.IndexOf("*") >= 0) ||
                                     (ri.UrlOrAgent.IndexOf(m_WebDownloader.UserAgent) >= 0);
                        break;

                    case 'd':                                 // Disallow
                        if (rulesApply)
                        {
                            rules.Add(ri.UrlOrAgent.ToUpperInvariant());
                        }
                        break;

                    case 'a':                                 // Allow - currently not evaluated
                        break;

                    default:
                        // empty/unknown/error
                        break;
                    }
                }

                m_DenyUrls = rules.ToArray();
            }
            catch (Exception)
            {
                // Best-effort: a missing or unreadable robots.txt must not prevent crawling.
            }
        }
        /// <summary>
        ///     Parses a successfully downloaded HTML response, publishes the parsed document under
        ///     "HtmlDoc", extracts title, meta data and text, and hands every discovered link to
        ///     <c>AddStepToCrawler</c> after normalising it against the page's base URL.
        /// </summary>
        /// <param name="crawler">The running crawler; must not be null.</param>
        /// <param name="propertyBag">Download result that is read and enriched; must not be null.</param>
        public override void Process(Crawler crawler, PropertyBag propertyBag)
        {
            AspectF.Define.
            NotNull(crawler, "crawler").
            NotNull(propertyBag, "propertyBag");

            if (propertyBag.StatusCode != HttpStatusCode.OK)
            {
                return;
            }

            if (!IsHtmlContent(propertyBag.ContentType))
            {
                return;
            }

            HtmlDocument htmlDoc = new HtmlDocument
            {
                OptionAddDebuggingAttributes = false,
                OptionAutoCloseOnEnd         = true,
                OptionFixNestedTags          = true,
                OptionReadEncoding           = true
            };

            using (Stream reader = propertyBag.GetResponse())
            {
                Encoding documentEncoding = htmlDoc.DetectEncoding(reader);

                // Rewind in case detection consumed part of the stream.
                reader.Seek(0, SeekOrigin.Begin);
                if (!documentEncoding.IsNull())
                {
                    htmlDoc.Load(reader, documentEncoding, true);
                }
                else
                {
                    htmlDoc.Load(reader, true);
                }
            }

            // Kept so link stripping further down works on the markup before text stripping.
            string originalContent = htmlDoc.DocumentNode.OuterHtml;

            if (HasTextStripRules || HasSubstitutionRules)
            {
                string content = StripText(originalContent);
                content = Substitute(content, propertyBag.Step);
                using (TextReader tr = new StringReader(content))
                {
                    htmlDoc.Load(tr);
                }
            }

            propertyBag["HtmlDoc"].Value = htmlDoc;

            HtmlNodeCollection nodes = htmlDoc.DocumentNode.SelectNodes("//title");

            // Extract Title
            if (!nodes.IsNull())
            {
                propertyBag.Title = string.Join(";", nodes.
                                                Select(n => n.InnerText).
                                                ToArray()).Trim();
            }

            // Extract Meta Data
            nodes = htmlDoc.DocumentNode.SelectNodes("//meta[@content and @name]");
            if (!nodes.IsNull())
            {
                propertyBag["Meta"].Value = (
                    from entry in nodes
                    let name = entry.Attributes["name"]
                    let content = entry.Attributes["content"]
                    where !name.IsNull() && !name.Value.IsNullOrEmpty() && !content.IsNull() && !content.Value.IsNullOrEmpty()
                    select name.Value + ": " + content.Value).ToArray();
            }

            // Extract text
            propertyBag.Text = htmlDoc.ExtractText().Trim();
            if (HasLinkStripRules || HasTextStripRules)
            {
                string content = StripLinks(originalContent);
                using (TextReader tr = new StringReader(content))
                {
                    htmlDoc.Load(tr);
                }
            }

            string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);

            // Extract Head Base: the first well-formed <base href> wins, otherwise the response URL.
            nodes = htmlDoc.DocumentNode.SelectNodes("//head/base[@href]");
            if (!nodes.IsNull())
            {
                baseUrl =
                    nodes.
                    Select(entry => new { entry, href = entry.Attributes["href"] }).
                    Where(t => !t.href.IsNull() && !t.href.Value.IsNullOrEmpty() &&
                          Uri.IsWellFormedUriString(t.href.Value, UriKind.RelativeOrAbsolute)).
                    Select(t => t.href.Value).
                    AddToEnd(baseUrl).
                    FirstOrDefault();
            }

            // Extract Links
            DocumentWithLinks links = htmlDoc.GetLinks();

            foreach (string link in links.Links.Union(links.References))
            {
                if (link.IsNullOrEmpty())
                {
                    continue;
                }

                string decodedLink = ExtendedHtmlUtility.HtmlEntityDecode(link);
                try
                {
                    string normalizedLink = NormalizeLink(baseUrl, decodedLink);
                    if (normalizedLink.IsNullOrEmpty())
                    {
                        continue;
                    }
                    AddStepToCrawler(crawler, propertyBag, normalizedLink, link);
                }
                catch (UriFormatException)
                {
                    // A link that is not properly formatted must be ignored.
                }
            }
        }
Пример #21
0
        /// <summary>
        ///     Parses a successfully downloaded HTML response, publishes the parsed document under
        ///     "HtmlDoc", extracts title, meta data and text, and adds a crawl step one level deeper
        ///     for every link or reference found in the page.
        /// </summary>
        /// <param name="crawler">The crawler that receives the new steps; must not be null.</param>
        /// <param name="propertyBag">Download result that is read and enriched; must not be null.</param>
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            AspectF.Define.
                NotNull(crawler, "crawler").
                NotNull(propertyBag, "propertyBag");

            if (propertyBag.StatusCode != HttpStatusCode.OK)
            {
                return;
            }

            if (!IsHtmlContent(propertyBag.ContentType))
            {
                return;
            }

            HtmlDocument htmlDoc = new HtmlDocument
                {
                    OptionAddDebuggingAttributes = false,
                    OptionAutoCloseOnEnd = true,
                    OptionFixNestedTags = true,
                    OptionReadEncoding = true
                };
            using (Stream reader = propertyBag.GetResponse())
            {
                // Detect the encoding first, then rewind so the document is loaded from the start.
                Encoding documentEncoding = htmlDoc.DetectEncoding(reader);
                reader.Seek(0, SeekOrigin.Begin);
                if (!documentEncoding.IsNull())
                {
                    htmlDoc.Load(reader, documentEncoding, true);
                }
                else
                {
                    htmlDoc.Load(reader, true);
                }
            }

            // Captured before any stripping so that StripLinks below works on the unmodified markup.
            string originalContent = htmlDoc.DocumentNode.OuterHtml;
            if (HasTextStripRules || HasSubstitutionRules)
            {
                // Reload the document from the stripped/substituted markup.
                string content = StripText(originalContent);
                content = Substitute(content, propertyBag.Step);
                using (TextReader tr = new StringReader(content))
                {
                    htmlDoc.Load(tr);
                }
            }

            // The same htmlDoc instance is reloaded again further down, so consumers of "HtmlDoc"
            // see whatever was loaded last.
            propertyBag["HtmlDoc"].Value = htmlDoc;

            HtmlNodeCollection nodes = htmlDoc.DocumentNode.SelectNodes("//title");
            // Extract Title (multiple title elements are joined with ';')
            if (!nodes.IsNull())
            {
                propertyBag.Title = string.Join(";", nodes.
                    Select(n => n.InnerText).
                    ToArray()).Trim();
            }

            // Extract Meta Data as "name: content" strings, skipping entries with empty values
            nodes = htmlDoc.DocumentNode.SelectNodes("//meta[@content and @name]");
            if (!nodes.IsNull())
            {
                propertyBag["Meta"].Value = (
                    from entry in nodes
                    let name = entry.Attributes["name"]
                    let content = entry.Attributes["content"]
                    where !name.IsNull() && !name.Value.IsNullOrEmpty() && !content.IsNull() && !content.Value.IsNullOrEmpty()
                    select name.Value + ": " + content.Value).ToArray();
            }

            // Extract text before the document is replaced by the link-stripped version.
            propertyBag.Text = htmlDoc.ExtractText().Trim();
            // NOTE(review): this also reloads when only text strip rules exist - confirm that is intended.
            if (HasLinkStripRules || HasTextStripRules)
            {
                string content = StripLinks(originalContent);
                using (TextReader tr = new StringReader(content))
                {
                    htmlDoc.Load(tr);
                }
            }

            // Extract Links
            DocumentWithLinks links = htmlDoc.GetLinks();
            foreach (string link in links.Links.Union(links.References))
            {
                if (link.IsNullOrEmpty())
                {
                    continue;
                }

                // Resolve the link against the response URL; links that do not normalize are skipped.
                string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);
                string decodedLink = ExtendedHtmlUtility.HtmlEntityDecode(link);
                string normalizedLink = NormalizeLink(baseUrl, decodedLink);
                if (normalizedLink.IsNullOrEmpty())
                {
                    continue;
                }

                // Queue the link one level deeper, recording the original URL and the referrer.
                crawler.AddStep(new Uri(normalizedLink), propertyBag.Step.Depth + 1,
                    propertyBag.Step, new Dictionary<string, object>
                        {
                            {Resources.PropertyBagKeyOriginalUrl, link},
                            {Resources.PropertyBagKeyOriginalReferrerUrl, propertyBag.ResponseUri}
                        });
            }
        }
Пример #22
0
        /// <summary>
        /// Parses an HTML response and publishes the results on <paramref name="propertyBag"/>:
        /// the parsed document (<c>"HtmlDoc"</c>), the page title, the named meta entries
        /// (<c>"Meta"</c>) and the extracted text. Every link found in the document is then
        /// normalized against the page's base URL and queued on <paramref name="crawler"/>
        /// as a new step one level deeper than the current one.
        /// </summary>
        /// <param name="crawler">Crawler that receives the discovered links via <c>AddStepAsync</c>.</param>
        /// <param name="propertyBag">
        /// Response being processed. Nothing is done unless its status code is
        /// <see cref="HttpStatusCode.OK"/> and its content type passes <c>IsHtmlContent</c>.
        /// </param>
        /// <returns>A task that completes once all discovered links have been queued.</returns>
        public virtual async Task ProcessAsync(ICrawler crawler, PropertyBag propertyBag)
        {
            // Argument guard; presumably throws when either argument is null (see AspectF.NotNull).
            AspectF.Define.
            NotNull(crawler, "crawler").
            NotNull(propertyBag, "propertyBag");

            if (propertyBag.StatusCode != HttpStatusCode.OK)
            {
                return;
            }

            if (!IsHtmlContent(propertyBag.ContentType))
            {
                return;
            }

            var htmlDoc = new HtmlDocument
            {
                OptionAddDebuggingAttributes = false,
                OptionAutoCloseOnEnd         = true,
                OptionFixNestedTags          = true,
                OptionReadEncoding           = true
            };

            using (var reader = propertyBag.GetResponse())
            {
                var documentEncoding = htmlDoc.DetectEncoding(reader);

                // Rewind so that Load starts from the beginning of the response stream.
                reader.Seek(0, SeekOrigin.Begin);
                if (!documentEncoding.IsNull())
                {
                    htmlDoc.Load(reader, documentEncoding, true);
                }
                else
                {
                    htmlDoc.Load(reader, true);
                }
            }

            // Keep the unmodified markup: both the text pass and the link pass below
            // start from it, each applying its own strip rules.
            var originalContent = htmlDoc.DocumentNode.OuterHtml;

            if (this.HasTextStripRules || this.HasSubstitutionRules)
            {
                var content = this.StripText(originalContent);
                content = this.Substitute(content, propertyBag.Step);
                using (TextReader tr = new StringReader(content))
                {
                    htmlDoc.Load(tr);
                }
            }

            // NOTE(review): the stored instance is the same object that is reloaded
            // further down for link extraction, so consumers see the reloaded document.
            propertyBag["HtmlDoc"].Value = htmlDoc;

            var nodes = htmlDoc.DocumentNode.SelectNodes("//title");

            // Extract Title (all <title> elements, joined with ';')
            if (!nodes.IsNull())
            {
                propertyBag.Title = string.Join(";", nodes.
                                                Select(n => n.InnerText).
                                                ToArray()).Trim();
            }

            // Extract Meta Data as "name: content" strings
            nodes = htmlDoc.DocumentNode.SelectNodes("//meta[@content and @name]");
            if (!nodes.IsNull())
            {
                propertyBag["Meta"].Value = (
                    from entry in nodes
                    let name = entry.Attributes["name"]
                    let content = entry.Attributes["content"]
                    where !name.IsNull() && !name.Value.IsNullOrEmpty() && !content.IsNull() && !content.Value.IsNullOrEmpty()
                    select name.Value + ": " + content.Value).ToArray();
            }

            // Extract text
            propertyBag.Text = htmlDoc.ExtractText().Trim();

            // Reload from the original markup with the link strip rules applied,
            // so that link extraction below works on the stripped content.
            if (this.HasLinkStripRules || this.HasTextStripRules)
            {
                var content = this.StripLinks(originalContent);
                using (TextReader tr = new StringReader(content))
                {
                    htmlDoc.Load(tr);
                }
            }

            var baseUrl = propertyBag.ResponseUri.GetLeftPath();

            // Extract Head Base: the first well-formed, non-empty <base href> wins;
            // the response URL is appended as the fallback candidate.
            nodes = htmlDoc.DocumentNode.SelectNodes("//head/base[@href]");
            if (!nodes.IsNull())
            {
                baseUrl =
                    nodes.
                    Select(entry => entry.Attributes["href"]).
                    Where(href => !href.IsNull() && !href.Value.IsNullOrEmpty() &&
                          Uri.IsWellFormedUriString(href.Value, UriKind.RelativeOrAbsolute)).
                    Select(href => href.Value).
                    AddToEnd(baseUrl).
                    FirstOrDefault();
            }

            // Extract Links
            var links = htmlDoc.GetLinks();

            foreach (var link in links.Links.Union(links.References))
            {
                if (link.IsNullOrEmpty())
                {
                    continue;
                }

                var decodedLink    = ExtendedHtmlUtility.HtmlEntityDecode(link);
                var normalizedLink = this.NormalizeLink(baseUrl, decodedLink);
                if (normalizedLink.IsNullOrEmpty())
                {
                    continue;
                }

                // Queue the link one level deeper, remembering the raw href and the referring page.
                await crawler.AddStepAsync(new Uri(normalizedLink), propertyBag.Step.Depth + 1,
                                           propertyBag.Step, new Dictionary <string, object>
                {
                    { Resources.PropertyBagKeyOriginalUrl, link },
                    { Resources.PropertyBagKeyOriginalReferrerUrl, propertyBag.ResponseUri }
                }).ConfigureAwait(false);
            }
        }