/**************************************************************************/

/// <summary>
/// Builds a link from this document's URL to <paramref name="AbsoluteUrl"/>
/// and hands it to <c>this.Outlinks.Add</c>.
/// </summary>
/// <param name="AbsoluteUrl">Target URL of the new outlink.</param>
/// <param name="LinkType">Link type stored on the new outlink.</param>
/// <param name="Follow">Follow flag stored on the new outlink.</param>
/// <returns>
/// The newly created link, or <c>null</c> when external-link checking is
/// switched off and a non-null allowed-hosts object rejects the URL.
/// </returns>
private MacroscopeLink AddSitemapTextOutlink(
    string AbsoluteUrl,
    MacroscopeConstants.InOutLinkType LinkType,
    Boolean Follow
)
{
    // The allowed-hosts filter is only consulted when external links are not being checked.
    bool CheckExternal = MacroscopePreferencesManager.GetCheckExternalLinks();

    if (!CheckExternal)
    {
        MacroscopeAllowedHosts HostFilter = this.DocCollection.GetAllowedHosts();
        bool Rejected = (HostFilter != null) && !HostFilter.IsAllowedFromUrl(Url: AbsoluteUrl);

        if (Rejected)
        {
            return null;
        }
    }

    MacroscopeLink NewLink = new MacroscopeLink(
        SourceUrl: this.GetUrl(),
        TargetUrl: AbsoluteUrl,
        LinkType: LinkType,
        Follow: Follow
    );

    this.Outlinks.Add(NewLink);

    return NewLink;
}
/**************************************************************************/

/// <summary>
/// Stores <paramref name="Link"/> in <c>this.Links</c> unless
/// <c>ContainsLink</c> reports that it is already present.
/// The check and the insertion run under a lock on <c>this.Links</c>.
/// </summary>
/// <param name="Link">The link to store.</param>
public void Add(MacroscopeLink Link)
{
    lock (this.Links)
    {
        bool AlreadyStored = this.ContainsLink(Link: Link);

        if (AlreadyStored)
        {
            return;
        }

        this.Links.Add(Link);
    }
}
/**************************************************************************/

/// <summary>
/// Removes from <c>this.Links</c> every stored link whose <c>Equals</c>
/// returns true for <paramref name="Link"/>. Runs under a lock on
/// <c>this.Links</c>.
/// </summary>
/// <param name="Link">The link to compare stored entries against.</param>
public void Remove(MacroscopeLink Link)
{
    lock (this.Links)
    {
        // Gather the matches first: removing from a collection while a foreach
        // is enumerating it invalidates the enumerator (InvalidOperationException
        // for the standard .NET collections).
        System.Collections.Generic.List<MacroscopeLink> Matches =
            new System.Collections.Generic.List<MacroscopeLink>();

        foreach (MacroscopeLink LinkOld in this.Links)
        {
            if (LinkOld.Equals(Link))
            {
                Matches.Add(LinkOld);
            }
        }

        // Enumeration of this.Links has finished, so mutation is now safe.
        foreach (MacroscopeLink LinkOld in Matches)
        {
            this.Links.Remove(LinkOld);
        }
    }
}
/**************************************************************************/

/// <summary>
/// Reports whether <c>this.Links.Contains</c> finds <paramref name="Link"/>.
/// The lookup runs under a lock on <c>this.Links</c>.
/// </summary>
/// <param name="Link">The link to look for.</param>
/// <returns><c>true</c> when the link is present; otherwise <c>false</c>.</returns>
public bool ContainsLink(MacroscopeLink Link)
{
    lock (this.Links)
    {
        return this.Links.Contains(Link);
    }
}
/**************************************************************************/

/// <summary>
/// Builds a link from this document's URL to <paramref name="AbsoluteUrl"/>
/// and hands it to <c>this.Outlinks.Add</c>, provided neither the
/// allowed-hosts filter nor the fetch-XML preference vetoes it.
/// </summary>
/// <param name="AbsoluteUrl">Target URL of the new outlink.</param>
/// <param name="LinkType">Link type stored on the new outlink.</param>
/// <param name="Follow">Follow flag stored on the new outlink.</param>
/// <returns>
/// The newly created link, or <c>null</c> when either (a) external-link
/// checking is off and a non-null allowed-hosts object rejects the URL, or
/// (b) the link type is SITEMAPXML and <c>GetFetchXml()</c> is false.
/// </returns>
private MacroscopeLink AddSitemapXmlOutlink(
    string AbsoluteUrl,
    MacroscopeConstants.InOutLinkType LinkType,
    Boolean Follow
)
{
    // Veto 1: host filter, consulted only when external links are not being checked.
    bool HostPermitted = true;

    if (!MacroscopePreferencesManager.GetCheckExternalLinks())
    {
        MacroscopeAllowedHosts HostFilter = this.DocCollection.GetAllowedHosts();

        if ((HostFilter != null) && !HostFilter.IsAllowedFromUrl(Url: AbsoluteUrl))
        {
            HostPermitted = false;
        }
    }

    // Veto 2: XML sitemap links additionally require the fetch-XML preference.
    bool FetchPermitted = true;

    if (LinkType == MacroscopeConstants.InOutLinkType.SITEMAPXML)
    {
        FetchPermitted = MacroscopePreferencesManager.GetFetchXml();
    }

    if (!(HostPermitted && FetchPermitted))
    {
        return null;
    }

    MacroscopeLink NewLink = new MacroscopeLink(
        SourceUrl: this.GetUrl(),
        TargetUrl: AbsoluteUrl,
        LinkType: LinkType,
        Follow: Follow
    );

    this.Outlinks.Add(NewLink);

    return NewLink;
}
/** -------------------------------------------------------------------- **/

/// <summary>
/// Sends a HEAD request for this document's URI and, when a response comes
/// back, processes its headers, derives a title from the trailing path
/// segment of <c>this.DocUrl</c>, and (if the QR-code preference is enabled)
/// downloads the image, decodes it, and registers a decoded URI as a QRCODE
/// outlink. A request failure sets the status code to BadRequest, adds a
/// remark, and stores the exception message in <c>this.ErrorCondition</c>.
/// </summary>
private async Task _ProcessImagePage()
{
    MacroscopeHttpTwoClient HttpClient = this.DocCollection.GetJobMaster().GetHttpClient();
    MacroscopeHttpTwoClientResponse HeadResponse = null;
    string FailureMessage = null;

    try
    {
        HeadResponse = await HttpClient.Head(
            this.GetUri(),
            this.ConfigureImagePageRequestHeadersCallback,
            this.PostProcessRequestHttpHeadersCallback
        );
    }
    catch (Exception ex)
    {
        // Both failure kinds are handled identically; only the log label differs.
        string LogFormat = (ex is MacroscopeDocumentException)
            ? "_ProcessImagePage :: MacroscopeDocumentException: {0}"
            : "_ProcessImagePage :: Exception: {0}";

        this.DebugMsg(string.Format(LogFormat, ex.Message));
        FailureMessage = ex.Message;
        this.SetStatusCode(HttpStatusCode.BadRequest);
        this.AddRemark("_ProcessImagePage", ex.Message);
    }

    if (HeadResponse != null)
    {
        this.ProcessResponseHttpHeaders(Response: HeadResponse);

        // Title: use the text after the last '/' of the document URL.
        string TitleFromUrl = null;

        foreach (Match Candidate in Regex.Matches(this.DocUrl, "/([^/]+)$"))
        {
            string Segment = Candidate.Groups[1].Value;

            if (Segment.Length > 0)
            {
                TitleFromUrl = Segment;
                break;
            }
        }

        if (TitleFromUrl == null)
        {
            this.DebugMsg(string.Format("TITLE: {0}", "MISSING"));
        }
        else
        {
            this.SetTitle(TitleFromUrl, MacroscopeConstants.TextProcessingMode.NO_PROCESSING);
            this.DebugMsg(string.Format("TITLE: {0}", this.GetTitle()));
        }

        // QR codes: download the image, decode it, and link to the decoded URI.
        if (MacroscopePreferencesManager.GetDetectQrCodeInImage())
        {
            MacroscopeHttpImageLoader Loader = new MacroscopeHttpImageLoader();

            string DownloadedFile = await Loader.DownloadImageFromUriToFile(
                JobMaster: this.DocCollection.GetJobMaster(),
                TargetUri: this.GetUri()
            );

            string DecodedText = null;

            if (!string.IsNullOrEmpty(DownloadedFile) && File.Exists(DownloadedFile))
            {
                MacroscopeQrCodeAnalysis Analyzer = new MacroscopeQrCodeAnalysis();
                DecodedText = Analyzer.Decode(ImageFilename: DownloadedFile);
            }

            Uri DecodedUri = null;

            if (!string.IsNullOrEmpty(DecodedText))
            {
                try
                {
                    DecodedUri = new Uri(DecodedText);
                }
                catch (UriFormatException ex)
                {
                    // Decoded text is not a usable URI; log it and carry on without an outlink.
                    this.DebugMsg(string.Format("UriFormatException: {0}", DecodedText));
                    this.DebugMsg(string.Format("UriFormatException: {0}", ex.Message));
                }
            }

            if (DecodedUri != null)
            {
                string DecodedUrl = DecodedUri.AbsoluteUri;

                MacroscopeLink QrLink = this.AddDocumentOutlink(
                    AbsoluteUrl: DecodedUrl,
                    LinkType: MacroscopeConstants.InOutLinkType.QRCODE,
                    Follow: true
                );

                if (QrLink != null)
                {
                    QrLink.SetRawTargetUrl(TargetUrl: DecodedUrl);
                    this.AddRemark("QRCODEIMAGE", "This image appears to be a QR Code.");
                }
            }
        }
    }

    if (FailureMessage != null)
    {
        this.ErrorCondition = FailureMessage;
    }
}
/** -------------------------------------------------------------------- **/

/// <summary>
/// Sends a HEAD request for this document's URI, records the crawl time,
/// processes the response headers and, when the document is a redirect with
/// a non-empty target, registers that target as a REDIRECT outlink.
/// Any failure message collected along the way is passed to
/// <c>ProcessErrorCondition</c> at the end.
/// </summary>
private async Task _ExecuteHeadRequest()
{
    MacroscopeHttpTwoClient Client = this.DocCollection.GetJobMaster().GetHttpClient();
    MacroscopeHttpTwoClientResponse ClientResponse = null;
    string ResponseErrorCondition = null;

    this.SetProcessInlinks();
    this.SetProcessHyperlinksIn();

    try
    {
        ClientResponse = await Client.Head(
            this.GetUri(),
            this.ConfigureHeadRequestHeadersCallback,
            this.PostProcessRequestHttpHeadersCallback
        );

        this.CrawledDate = DateTime.UtcNow;
    }
    catch (MacroscopeDocumentException ex)
    {
        this.DebugMsg(string.Format("_ExecuteHeadRequest :: MacroscopeDocumentException: {0}", ex.Message));
        ResponseErrorCondition = ex.Message;
        this.SetStatusCode(HttpStatusCode.BadRequest);
        this.AddRemark("_ExecuteHeadRequest", ex.Message);
    }
    catch (Exception ex)
    {
        this.DebugMsg(string.Format("_ExecuteHeadRequest :: Exception: {0}", ex.Message));
        ResponseErrorCondition = ex.Message;
        this.SetStatusCode(HttpStatusCode.BadRequest);
        this.AddRemark("_ExecuteHeadRequest", ex.Message);
    }

    if (ClientResponse != null)
    {
        try
        {
            var InnerResponse = ClientResponse.GetResponse();

            // Validate before the first dereference, so a missing response is
            // reported with the intended message rather than a NullReferenceException.
            if (InnerResponse == null)
            {
                throw new MacroscopeDocumentException("Bad Response in ExecuteHeadRequest");
            }

            this.DebugMsg(string.Format("StatusCode: {0}", InnerResponse.StatusCode));
            this.SetErrorCondition(InnerResponse.ReasonPhrase);

            this.ProcessResponseHttpHeaders(Response: ClientResponse);

            if (this.GetIsRedirect())
            {
                string Location = this.GetUrlRedirectTo();

                if (!string.IsNullOrEmpty(Location))
                {
                    this.SetUrlRedirectTo(Url: Location);

                    MacroscopeLink OutLink = this.AddDocumentOutlink(
                        AbsoluteUrl: Location,
                        LinkType: MacroscopeConstants.InOutLinkType.REDIRECT,
                        Follow: true
                    );

                    // The other callers of AddDocumentOutlink in this file guard
                    // against a null return; do the same here so a declined outlink
                    // is not reported as a request error.
                    if (OutLink != null)
                    {
                        OutLink.SetRawTargetUrl(TargetUrl: this.GetUrlRedirectToRaw());
                    }
                }
            }
        }
        catch (Exception ex)
        {
            this.DebugMsg(string.Format("_ExecuteHeadRequest :: Exception: {0}", ex.Message));
            ResponseErrorCondition = ex.Message;
        }
    }

    if (ResponseErrorCondition != null)
    {
        this.ProcessErrorCondition(ResponseErrorCondition);
    }
}
/**************************************************************************/

/// <summary>
/// Walks every property of every style rule in <paramref name="CssStylesheet"/>
/// and, for <c>background-image</c> and <c>background</c> declarations with a
/// non-null term, resolves the term through <c>ProcessCssBackImageUrl</c> and
/// registers the result as a CSS hyperlink and as an IMAGE outlink.
/// Returns immediately when <c>GetIsExternal()</c> is true.
/// </summary>
/// <param name="CssStylesheet">The parsed stylesheet to scan.</param>
private void ProcessCssOutlinks(Stylesheet CssStylesheet)
{
    if (this.GetIsExternal())
    {
        return;
    }

    foreach (var CssRule in CssStylesheet.StyleRules)
    {
        // Use the loop variable directly; looking the rule up again by index
        // added a linear search per rule for no benefit.
        foreach (Property pProp in CssRule.Declarations.Properties)
        {
            // Invariant lower-casing: culture-sensitive ToLower() can mis-map
            // 'I' (e.g. under Turkish cultures) and miss "BACKGROUND-IMAGE".
            string PropertyName = pProp.Name.ToLowerInvariant();

            switch (PropertyName)
            {
                case "background-image":
                case "background":

                    if (pProp.Term == null)
                    {
                        break;
                    }

                    string BackgroundImageUrl = pProp.Term.ToString();
                    string LinkUrlAbs = this.ProcessCssBackImageUrl(BackgroundImageUrl);

                    // PropertyName is exactly "background-image" or "background" here,
                    // so the emitted text matches the per-case messages it replaces.
                    DebugMsg(string.Format("ProcessCssHyperlinksOut: ({0}): {1}", PropertyName, BackgroundImageUrl));
                    DebugMsg(string.Format("ProcessCssHyperlinksOut: ({0}): {1}", PropertyName, LinkUrlAbs));

                    if (LinkUrlAbs != null)
                    {
                        this.HyperlinksOut.Add(
                            LinkType: MacroscopeConstants.HyperlinkType.CSS,
                            UrlTarget: LinkUrlAbs
                        );

                        MacroscopeLink Outlink = this.AddDocumentOutlink(
                            AbsoluteUrl: LinkUrlAbs,
                            LinkType: MacroscopeConstants.InOutLinkType.IMAGE,
                            Follow: true
                        );

                        if (Outlink != null)
                        {
                            // Keep the term text exactly as it appeared in the stylesheet.
                            Outlink.SetRawTargetUrl(BackgroundImageUrl);
                        }
                    }

                    break;

                default:
                    break;
            }
        }
    }
}
/**************************************************************************/

/// <summary>
/// Collects URL-valued entries from the declarations of every style rule
/// whose <c>Declaration.BackgroundImage</c> is non-null, resolves each through
/// <c>ProcessCssBackgroundImageUrl</c>, and registers the result as a CSS
/// hyperlink and as an IMAGE outlink. Returns immediately when
/// <c>GetIsExternal()</c> is true; an exception during collection is logged
/// via <c>DebugMsg</c> and no links are added.
/// </summary>
/// <param name="Stylesheet">The parsed stylesheet to scan.</param>
private void ProcessCssOutlinks(CssStyleSheet Stylesheet)
{
    // https://github.com/Athari/CsCss
    // https://developer.mozilla.org/en-US/docs/Web/CSS/url

    if (this.GetIsExternal())
    {
        return;
    }

    List<string> UrlCandidates;

    try
    {
        // List-valued properties are flattened so each member is inspected on its own.
        var FlattenedValues = Stylesheet.AllStyleRules
            .Where(Rule => Rule.Declaration.BackgroundImage != null)
            .SelectMany(Rule => Rule.Declaration.AllData)
            .SelectMany(Prop => Prop.Value.Unit == CssUnit.List ? Prop.Value.List : new[] { Prop.Value });

        UrlCandidates = FlattenedValues
            .Where(Item => Item.Unit == CssUnit.Url)
            .Select(Item => Item.OriginalUri)
            .ToList();
    }
    catch (Exception ex)
    {
        DebugMsg(string.Format("ProcessCssOutlinks: {0}", ex.Message));
        return;
    }

    foreach (string RawUrl in UrlCandidates)
    {
        string ResolvedUrl = this.ProcessCssBackgroundImageUrl(RawUrl);

        DebugMsg(string.Format("ProcessCssOutlinks: (background-image): {0}", RawUrl));
        DebugMsg(string.Format("ProcessCssOutlinks: (background-image): {0}", ResolvedUrl));

        if (ResolvedUrl == null)
        {
            continue;
        }

        this.HyperlinksOut.Add(
            LinkType: MacroscopeConstants.HyperlinkType.CSS,
            UrlTarget: ResolvedUrl
        );

        MacroscopeLink ImageLink = this.AddDocumentOutlink(
            AbsoluteUrl: ResolvedUrl,
            LinkType: MacroscopeConstants.InOutLinkType.IMAGE,
            Follow: true
        );

        if (ImageLink != null)
        {
            // Keep the URL text exactly as it appeared in the stylesheet.
            ImageLink.SetRawTargetUrl(RawUrl);
        }
    }
}