Beispiel #1
0
        /**************************************************************************/

        /// <summary>
        /// Creates a link from this document's URL to <paramref name="AbsoluteUrl"/>
        /// and appends it to this document's Outlinks.
        /// </summary>
        /// <param name="AbsoluteUrl">Target URL of the new link.</param>
        /// <param name="LinkType">Link type stored on the new link.</param>
        /// <param name="Follow">Follow flag stored on the new link.</param>
        /// <returns>
        /// The link that was added, or null when external links are not being
        /// checked and the allowed-hosts list rejects the target URL.
        /// </returns>
        private MacroscopeLink AddSitemapTextOutlink(
            string AbsoluteUrl,
            MacroscopeConstants.InOutLinkType LinkType,
            Boolean Follow
            )
        {
            // The allowed-hosts filter only applies when external links are not checked.
            if (!MacroscopePreferencesManager.GetCheckExternalLinks())
            {
                MacroscopeAllowedHosts PermittedHosts = this.DocCollection.GetAllowedHosts();
                bool Rejected = (PermittedHosts != null)
                                && !PermittedHosts.IsAllowedFromUrl(Url: AbsoluteUrl);

                if (Rejected)
                {
                    return null;
                }
            }

            MacroscopeLink CreatedLink = new MacroscopeLink(
                SourceUrl: this.GetUrl(),
                TargetUrl: AbsoluteUrl,
                LinkType: LinkType,
                Follow: Follow
                );

            this.Outlinks.Add(CreatedLink);

            return CreatedLink;
        }
        /**************************************************************************/

        /// <summary>
        /// Adds <paramref name="Link"/> to the list unless ContainsLink reports
        /// that it is already present. The check and the insert happen under a
        /// single lock on the list.
        /// </summary>
        /// <param name="Link">Link to store.</param>
        public void Add(MacroscopeLink Link)
        {
            lock (this.Links)
            {
                bool AlreadyStored = this.ContainsLink(Link: Link);

                if (AlreadyStored)
                {
                    return;
                }

                this.Links.Add(Link);
            }
        }
        /**************************************************************************/

        /// <summary>
        /// Removes every stored link that equals <paramref name="Link"/>.
        /// </summary>
        /// <param name="Link">Link to remove; a null reference removes nothing.</param>
        public void Remove(MacroscopeLink Link)
        {
            // Nothing can match a null reference; keep the list untouched.
            if (object.ReferenceEquals(Link, null))
            {
                return;
            }

            lock (this.Links)
            {
                // Removing from a collection inside a foreach over that same
                // collection invalidates the enumerator and throws
                // InvalidOperationException. Let the collection do the search
                // instead: Remove() returns false once no equal entry is left.
                // NOTE(review): relies on Links.Remove returning bool (as
                // ICollection<T>.Remove does) - confirm the field's type.
                while (this.Links.Remove(Link))
                {
                }
            }
        }
        /**************************************************************************/

        /// <summary>
        /// Reports whether <paramref name="Link"/> is currently in the list.
        /// The lookup runs while holding the lock on the list.
        /// </summary>
        /// <param name="Link">Link to look for.</param>
        /// <returns>true when the list contains the link; otherwise false.</returns>
        public bool ContainsLink(MacroscopeLink Link)
        {
            lock (this.Links)
            {
                return this.Links.Contains(Link);
            }
        }
        /**************************************************************************/

        /// <summary>
        /// Creates a link from this document's URL to <paramref name="AbsoluteUrl"/>
        /// and appends it to this document's Outlinks, unless one of the
        /// preference checks vetoes it.
        /// </summary>
        /// <param name="AbsoluteUrl">Target URL of the new link.</param>
        /// <param name="LinkType">Link type stored on the new link.</param>
        /// <param name="Follow">Follow flag stored on the new link.</param>
        /// <returns>
        /// The link that was added, or null when either (a) external links are
        /// not being checked and the allowed-hosts list rejects the URL, or
        /// (b) the link type is SITEMAPXML and XML fetching is switched off.
        /// </returns>
        private MacroscopeLink AddSitemapXmlOutlink(
            string AbsoluteUrl,
            MacroscopeConstants.InOutLinkType LinkType,
            Boolean Follow
            )
        {
            Boolean Permitted = true;

            // Host filter: only consulted when external links are not checked.
            if (!MacroscopePreferencesManager.GetCheckExternalLinks())
            {
                MacroscopeAllowedHosts PermittedHosts = this.DocCollection.GetAllowedHosts();

                if ((PermittedHosts != null) && !PermittedHosts.IsAllowedFromUrl(Url: AbsoluteUrl))
                {
                    Permitted = false;
                }
            }

            // XML sitemap links additionally require the fetch-XML preference.
            if ((LinkType == MacroscopeConstants.InOutLinkType.SITEMAPXML)
                && !MacroscopePreferencesManager.GetFetchXml())
            {
                Permitted = false;
            }

            if (!Permitted)
            {
                return null;
            }

            MacroscopeLink CreatedLink = new MacroscopeLink(
                SourceUrl: this.GetUrl(),
                TargetUrl: AbsoluteUrl,
                LinkType: LinkType,
                Follow: Follow
                );

            this.Outlinks.Add(CreatedLink);

            return CreatedLink;
        }
        /** -------------------------------------------------------------------- **/

        /// <summary>
        /// Issues a HEAD request for this document and, when a response is
        /// obtained, processes its headers, derives a title from the last path
        /// segment of DocUrl, and optionally looks for a QR code in the image.
        /// </summary>
        /// <remarks>
        /// Exceptions raised by the HEAD request are not rethrown: they are
        /// logged, recorded as a remark, turn the status code into BadRequest,
        /// and their message is finally stored in ErrorCondition.
        /// </remarks>
        private async Task _ProcessImagePage()
        {
            MacroscopeHttpTwoClient         HeadClient     = this.DocCollection.GetJobMaster().GetHttpClient();
            MacroscopeHttpTwoClientResponse HeadResponse   = null;
            string                          FailureMessage = null;

            try
            {
                HeadResponse = await HeadClient.Head(
                    this.GetUri(),
                    this.ConfigureImagePageRequestHeadersCallback,
                    this.PostProcessRequestHttpHeadersCallback
                    );
            }
            catch (MacroscopeDocumentException ex)
            {
                this.DebugMsg(string.Format("_ProcessImagePage :: MacroscopeDocumentException: {0}", ex.Message));
                FailureMessage = ex.Message;
                this.SetStatusCode(HttpStatusCode.BadRequest);
                this.AddRemark("_ProcessImagePage", ex.Message);
            }
            catch (Exception ex)
            {
                this.DebugMsg(string.Format("_ProcessImagePage :: Exception: {0}", ex.Message));
                FailureMessage = ex.Message;
                this.SetStatusCode(HttpStatusCode.BadRequest);
                this.AddRemark("_ProcessImagePage", ex.Message);
            }

            if (HeadResponse != null)
            {
                this.ProcessResponseHttpHeaders(Response: HeadResponse);

                /** Title ---------------------------------------------------------- **/
                // The title is whatever follows the final '/' of the document URL.
                string LastPathSegment = null;

                foreach (Match SegmentMatch in Regex.Matches(this.DocUrl, "/([^/]+)$"))
                {
                    string Captured = SegmentMatch.Groups[1].Value;

                    if (Captured.Length > 0)
                    {
                        LastPathSegment = Captured;
                        break;
                    }
                }

                if (LastPathSegment == null)
                {
                    this.DebugMsg(string.Format("TITLE: {0}", "MISSING"));
                }
                else
                {
                    this.SetTitle(LastPathSegment, MacroscopeConstants.TextProcessingMode.NO_PROCESSING);
                    this.DebugMsg(string.Format("TITLE: {0}", this.GetTitle()));
                }

                /** QR Codes ------------------------------------------------------- **/
                if (MacroscopePreferencesManager.GetDetectQrCodeInImage())
                {
                    MacroscopeHttpImageLoader Loader = new MacroscopeHttpImageLoader();
                    string DownloadedFile = await Loader.DownloadImageFromUriToFile(JobMaster: this.DocCollection.GetJobMaster(), TargetUri: this.GetUri());
                    string DecodedText    = null;

                    // Only attempt a decode when the download produced a file on disk.
                    if (!string.IsNullOrEmpty(DownloadedFile) && File.Exists(DownloadedFile))
                    {
                        MacroscopeQrCodeAnalysis Analyzer = new MacroscopeQrCodeAnalysis();
                        DecodedText = Analyzer.Decode(ImageFilename: DownloadedFile);
                    }

                    if (!string.IsNullOrEmpty(DecodedText))
                    {
                        Uri DecodedUri = null;

                        try
                        {
                            DecodedUri = new Uri(DecodedText);
                        }
                        catch (UriFormatException ex)
                        {
                            // Decoded payload is not a usable URI; log it and move on.
                            this.DebugMsg(string.Format("UriFormatException: {0}", DecodedText));
                            this.DebugMsg(string.Format("UriFormatException: {0}", ex.Message));
                        }

                        if (DecodedUri != null)
                        {
                            MacroscopeLink QrLink = this.AddDocumentOutlink(
                                AbsoluteUrl: DecodedUri.AbsoluteUri,
                                LinkType: MacroscopeConstants.InOutLinkType.QRCODE,
                                Follow: true
                                );

                            if (QrLink != null)
                            {
                                QrLink.SetRawTargetUrl(TargetUrl: DecodedUri.AbsoluteUri);
                                this.AddRemark("QRCODEIMAGE", "This image appears to be a QR Code.");
                            }
                        }
                    }
                }
                /** ---------------------------------------------------------------- **/
            }

            if (FailureMessage != null)
            {
                this.ErrorCondition = FailureMessage;
            }
        }
        /** -------------------------------------------------------------------- **/

        /// <summary>
        /// Issues a HEAD request for this document, records the crawl time,
        /// processes the response headers and, when the document is a redirect,
        /// registers the redirect target as an outlink.
        /// </summary>
        /// <remarks>
        /// No exception escapes this method: failures are logged and their
        /// message is handed to ProcessErrorCondition at the end. Exceptions
        /// thrown by the request itself additionally set the status code to
        /// BadRequest and add a remark.
        /// </remarks>
        private async Task _ExecuteHeadRequest()
        {
            MacroscopeHttpTwoClient         Client         = this.DocCollection.GetJobMaster().GetHttpClient();
            MacroscopeHttpTwoClientResponse ClientResponse = null;
            string ResponseErrorCondition = null;

            this.SetProcessInlinks();
            this.SetProcessHyperlinksIn();

            try
            {
                ClientResponse = await Client.Head(
                    this.GetUri(),
                    this.ConfigureHeadRequestHeadersCallback,
                    this.PostProcessRequestHttpHeadersCallback
                    );

                this.CrawledDate = DateTime.UtcNow;
            }
            catch (MacroscopeDocumentException ex)
            {
                this.DebugMsg(string.Format("_ExecuteHeadRequest :: MacroscopeDocumentException: {0}", ex.Message));
                ResponseErrorCondition = ex.Message;
                this.SetStatusCode(HttpStatusCode.BadRequest);
                this.AddRemark("_ExecuteHeadRequest", ex.Message);
            }
            catch (Exception ex)
            {
                this.DebugMsg(string.Format("_ExecuteHeadRequest :: Exception: {0}", ex.Message));
                ResponseErrorCondition = ex.Message;
                this.SetStatusCode(HttpStatusCode.BadRequest);
                this.AddRemark("_ExecuteHeadRequest", ex.Message);
            }

            if (ClientResponse != null)
            {
                try
                {
                    // Validate the wrapped response BEFORE touching its members,
                    // so a missing response surfaces as the intended
                    // "Bad Response" error rather than a NullReferenceException.
                    if (ClientResponse.GetResponse() == null)
                    {
                        throw new MacroscopeDocumentException("Bad Response in ExecuteHeadRequest");
                    }

                    this.DebugMsg(string.Format("StatusCode: {0}", ClientResponse.GetResponse().StatusCode));

                    this.SetErrorCondition(ClientResponse.GetResponse().ReasonPhrase);

                    this.ProcessResponseHttpHeaders(Response: ClientResponse);

                    if (this.GetIsRedirect())
                    {
                        string Location = this.GetUrlRedirectTo();

                        if (!string.IsNullOrEmpty(Location))
                        {
                            MacroscopeLink OutLink = null;

                            this.SetUrlRedirectTo(Url: Location);

                            OutLink = this.AddDocumentOutlink(
                                AbsoluteUrl: Location,
                                LinkType: MacroscopeConstants.InOutLinkType.REDIRECT,
                                Follow: true
                                );

                            // The other AddDocumentOutlink callers treat a null
                            // result as "no link was added"; do the same here
                            // instead of raising a spurious error condition.
                            if (OutLink != null)
                            {
                                OutLink.SetRawTargetUrl(TargetUrl: this.GetUrlRedirectToRaw());
                            }
                        }
                    }
                }
                catch (Exception ex)
                {
                    this.DebugMsg(string.Format("_ExecuteHeadRequest :: Exception: {0}", ex.Message));
                    ResponseErrorCondition = ex.Message;
                }
            }

            if (ResponseErrorCondition != null)
            {
                this.ProcessErrorCondition(ResponseErrorCondition);
            }
        }
        /**************************************************************************/

        /// <summary>
        /// Walks every property of every style rule in the stylesheet and, for
        /// "background-image" and "background" properties that carry a term,
        /// registers the resolved URL as a CSS hyperlink and as an IMAGE outlink.
        /// Does nothing when this document is external.
        /// </summary>
        /// <param name="CssStylesheet">Parsed stylesheet whose StyleRules are scanned.</param>
        private void ProcessCssOutlinks(Stylesheet CssStylesheet)
        {
            if (this.GetIsExternal())
            {
                return;
            }

            foreach (var CssRule in CssStylesheet.StyleRules)
            {
                // The loop variable already is the rule; looking its index up
                // again and re-indexing the list costs a linear search per rule
                // and would pick the wrong rule if two rules compare equal.
                foreach (Property pProp in CssRule.Declarations.Properties)
                {
                    // Invariant lower-casing: culture-sensitive ToLower() can
                    // mangle ASCII names (e.g. 'I' under a Turkish culture).
                    string PropertyName = pProp.Name.ToLowerInvariant();

                    switch (PropertyName)
                    {
                    case "background-image":
                    case "background":

                        if (pProp.Term != null)
                        {
                            string BackgroundImageUrl = pProp.Term.ToString();
                            string LinkUrlAbs         = this.ProcessCssBackImageUrl(BackgroundImageUrl);

                            // PropertyName is exactly one of the two case labels,
                            // so the emitted text matches the per-property messages.
                            DebugMsg(string.Format("ProcessCssHyperlinksOut: ({0}): {1}", PropertyName, BackgroundImageUrl));
                            DebugMsg(string.Format("ProcessCssHyperlinksOut: ({0}): {1}", PropertyName, LinkUrlAbs));

                            if (LinkUrlAbs != null)
                            {
                                this.HyperlinksOut.Add(
                                    LinkType: MacroscopeConstants.HyperlinkType.CSS,
                                    UrlTarget: LinkUrlAbs
                                    );

                                MacroscopeLink Outlink = this.AddDocumentOutlink(
                                    AbsoluteUrl: LinkUrlAbs,
                                    LinkType: MacroscopeConstants.InOutLinkType.IMAGE,
                                    Follow: true
                                    );

                                // A null result means no outlink was added.
                                if (Outlink != null)
                                {
                                    Outlink.SetRawTargetUrl(BackgroundImageUrl);
                                }
                            }
                        }

                        break;

                    default:
                        break;
                    }
                }
            }
        }
        /**************************************************************************/

        /// <summary>
        /// Collects the URL-typed values declared in style rules that have a
        /// background image, and registers each resolved URL as a CSS hyperlink
        /// and as an IMAGE outlink. Does nothing when this document is external.
        /// </summary>
        /// <remarks>
        /// The URL values are gathered from all data of each matching rule, not
        /// only from its background-image entry. If gathering them throws, the
        /// exception message is logged and no links are added.
        /// </remarks>
        /// <param name="Stylesheet">Parsed stylesheet whose AllStyleRules are scanned.</param>
        private void ProcessCssOutlinks(CssStyleSheet Stylesheet)
        {
            // https://github.com/Athari/CsCss

            // https://developer.mozilla.org/en-US/docs/Web/CSS/url

            if (this.GetIsExternal())
            {
                return;
            }

            List <string> CollectedUrls;

            try
            {
                CollectedUrls = Stylesheet.AllStyleRules
                                .Where(Rule => Rule.Declaration.BackgroundImage != null)
                                .SelectMany(Rule => Rule.Declaration.AllData)
                                .SelectMany(Datum => Datum.Value.Unit == CssUnit.List ? Datum.Value.List : new[] { Datum.Value })
                                .Where(Candidate => Candidate.Unit == CssUnit.Url)
                                .Select(Candidate => Candidate.OriginalUri)
                                .ToList();
            }
            catch (Exception ex)
            {
                DebugMsg(string.Format("ProcessCssOutlinks: {0}", ex.Message));
                return;
            }

            foreach (string RawUrl in CollectedUrls)
            {
                string ResolvedUrl = this.ProcessCssBackgroundImageUrl(RawUrl);

                DebugMsg(string.Format("ProcessCssOutlinks: (background-image): {0}", RawUrl));
                DebugMsg(string.Format("ProcessCssOutlinks: (background-image): {0}", ResolvedUrl));

                if (ResolvedUrl == null)
                {
                    continue;
                }

                this.HyperlinksOut.Add(
                    LinkType: MacroscopeConstants.HyperlinkType.CSS,
                    UrlTarget: ResolvedUrl
                    );

                MacroscopeLink ImageLink = this.AddDocumentOutlink(
                    AbsoluteUrl: ResolvedUrl,
                    LinkType: MacroscopeConstants.InOutLinkType.IMAGE,
                    Follow: true
                    );

                // A null result means no outlink was added.
                if (ImageLink != null)
                {
                    ImageLink.SetRawTargetUrl(RawUrl);
                }
            }
        }