/**************************************************************************/

/// <summary>
/// Reports whether <paramref name="HyperlinkOut"/> has already been recorded
/// against <paramref name="msDoc"/> in <c>NodeVisited</c>. When it has not,
/// the link is recorded before returning, so a second call with the same
/// arguments returns <c>true</c>.
/// </summary>
/// <param name="msDoc">Document used as the key into <c>NodeVisited</c>.</param>
/// <param name="HyperlinkOut">Link to look up, and to record when absent.</param>
/// <returns><c>true</c> if the link was already recorded; otherwise <c>false</c>.</returns>
private bool CheckNodeAlreadyVisited(
  MacroscopeDocument msDoc,
  MacroscopeHyperlinkOut HyperlinkOut
)
{

  // First time this document is seen: start its list with this link.
  if (!this.NodeVisited.ContainsKey(msDoc))
  {
    this.NodeVisited[msDoc] = new List<MacroscopeHyperlinkOut>();
    this.NodeVisited[msDoc].Add(HyperlinkOut);
    return (false);
  }

  var VisitedLinks = this.NodeVisited[msDoc];

  if (VisitedLinks.Contains(HyperlinkOut))
  {
    return (true);
  }

  // Known document, new link: remember it for subsequent calls.
  VisitedLinks.Add(HyperlinkOut);

  return (false);

}
/**************************************************************************/

/// <summary>
/// Checks whether <paramref name="Link"/> is present in <c>Links</c>.
/// The lookup is performed while holding a lock on <c>Links</c>.
/// </summary>
/// <param name="Link">The outbound hyperlink to look for.</param>
/// <returns><c>true</c> if <c>Links</c> contains the link; otherwise <c>false</c>.</returns>
public bool ContainsLink(MacroscopeHyperlinkOut Link)
{
  lock (this.Links)
  {
    return (this.Links.Contains(Link));
  }
}
/**************************************************************************/

/// <summary>
/// Removes every entry of <c>Links</c> that compares equal (via
/// <c>Equals</c>) to <paramref name="HyperlinkOut"/>. The whole operation
/// runs while holding a lock on <c>Links</c>.
/// </summary>
/// <param name="HyperlinkOut">The outbound hyperlink to remove.</param>
public void Remove(MacroscopeHyperlinkOut HyperlinkOut)
{
  lock (this.Links)
  {

    // Matches are collected first and removed afterwards: removing from
    // the collection while it is being enumerated invalidates the
    // enumerator and makes the foreach throw InvalidOperationException.
    List<MacroscopeHyperlinkOut> Matches = new List<MacroscopeHyperlinkOut>();

    foreach (MacroscopeHyperlinkOut HyperlinkOutOld in this.Links)
    {
      if (HyperlinkOutOld.Equals(HyperlinkOut))
      {
        Matches.Add(HyperlinkOutOld);
      }
    }

    foreach (MacroscopeHyperlinkOut Match in Matches)
    {
      this.Links.Remove(Match);
    }

  }
}
/**************************************************************************/

/// <summary>
/// Creates a new <c>MacroscopeHyperlinkOut</c>, sets its hyperlink type and
/// target URL, appends it to <c>Links</c> under a lock, and returns it.
/// </summary>
/// <param name="LinkType">Hyperlink type assigned to the new link.</param>
/// <param name="UrlTarget">Target URL assigned to the new link.</param>
/// <returns>The newly created link, already added to <c>Links</c>.</returns>
public MacroscopeHyperlinkOut Add(
  MacroscopeConstants.HyperlinkType LinkType,
  string UrlTarget
)
{

  MacroscopeHyperlinkOut NewLink = new MacroscopeHyperlinkOut();

  // Configure the link fully before it becomes visible through Links.
  NewLink.SetHyperlinkType(LinkType);
  NewLink.SetTargetUrl(UrlTarget);

  lock (this.Links)
  {
    this.Links.Add(NewLink);
  }

  return (NewLink);

}
/** -------------------------------------------------------------------- **/

/// <summary>
/// Downloads this document as a PDF and populates the document from it.
/// </summary>
/// <remarks>
/// Steps, in order:
/// 1. GET <c>this.GetUri()</c> with the HTTP client obtained from the job master.
///    Any exception is logged, recorded as a remark, and its message is kept
///    so it can be handed to <c>ProcessErrorCondition</c> at the end.
/// 2. If a response was received: process response headers, set locale /
///    hreflang / canonical, read the body bytes and parse them with
///    <c>MacroscopePdfTools</c>.
/// 3. If parsing produced a tools object: copy title, author, description,
///    keywords and body text into the document.
/// 4. Run the generic data extractors (when enabled for PDFs) and extract
///    out-links from the text and from the PDF link annotations.
/// </remarks>
private async Task _ProcessPdfPage()
{

  MacroscopeHttpTwoClient Client = this.DocCollection.GetJobMaster().GetHttpClient();
  MacroscopeHttpTwoClientResponse ClientResponse = null;
  string ResponseErrorCondition = null;

  try
  {
    ClientResponse = await Client.Get(
      this.GetUri(),
      this.ConfigurePdfPageRequestHeadersCallback,
      this.PostProcessRequestHttpHeadersCallback
    );
  }
  catch (MacroscopeDocumentException ex)
  {
    this.DebugMsg(string.Format("_ProcessPdfPage :: MacroscopeDocumentException: {0}", ex.Message));
    ResponseErrorCondition = ex.Message;
    this.AddRemark("_ProcessPdfPage", ex.Message);
  }
  catch (Exception ex)
  {
    this.DebugMsg(string.Format("_ProcessPdfPage :: Exception: {0}", ex.Message));
    ResponseErrorCondition = ex.Message;
    this.AddRemark("_ProcessPdfPage", ex.Message);
  }

  if (ClientResponse != null)
  {

    MacroscopePdfTools PdfTools;

    this.ProcessResponseHttpHeaders(Response: ClientResponse);

    { // Probe Locale
      // Locale probing is not implemented yet; every PDF is registered as "x-default".
      this.Locale = "x-default";
      this.SetHreflang(HrefLangLocale: this.Locale, Url: this.DocUrl);
    }

    { // Canonical
      this.Canonical = this.DocUrl;
      this.DebugMsg(string.Format("CANONICAL: {0}", this.Canonical));
    }

    /** Get Response Body ---------------------------------------------- **/

    try
    {

      byte[] RawData = ClientResponse.GetContentAsBytes();

      this.SetContentLength(Length: RawData.Length);

      PdfTools = new MacroscopePdfTools(PdfData: RawData);

      if (PdfTools.GetHasError())
      {
        this.AddRemark("CORRUPT_PDF", Observation: PdfTools.GetErrorMessage());
      }

      this.SetWasDownloaded(true);

    }
    catch (Exception ex)
    {
      // The body could not be read or parsed: flag the document as a bad
      // request and leave PdfTools null so the metadata sections are skipped.
      this.DebugMsg(string.Format("Exception: {0}", ex.Message));
      this.SetStatusCode(HttpStatusCode.BadRequest);
      PdfTools = null;
      this.SetContentLength(Length: 0);
    }

    /** Title ---------------------------------------------------------- **/

    if (PdfTools != null)
    {
      string Text = PdfTools.GetTitle();
      if (!string.IsNullOrEmpty(Text))
      {
        this.SetTitle(Text, MacroscopeConstants.TextProcessingMode.NO_PROCESSING);
        this.DebugMsg(string.Format("TITLE: {0}", this.GetTitle()));
      }
      else
      {
        this.DebugMsg(string.Format("TITLE: {0}", "MISSING"));
      }
    }

    /** Author --------------------------------------------------------- **/

    if (PdfTools != null)
    {
      string Text = PdfTools.GetAuthor();
      if (!string.IsNullOrEmpty(Text))
      {
        this.SetAuthor(AuthorText: Text, ProcessingMode: MacroscopeConstants.TextProcessingMode.NO_PROCESSING);
        this.DebugMsg(string.Format("AUTHOR: {0}", this.GetAuthor()));
      }
      else
      {
        this.DebugMsg(string.Format("AUTHOR: {0}", "MISSING"));
      }
    }

    /** Description ---------------------------------------------------- **/

    if (PdfTools != null)
    {
      string Text = PdfTools.GetDescription();
      if (!string.IsNullOrEmpty(Text))
      {
        this.SetDescription(Text, MacroscopeConstants.TextProcessingMode.NO_PROCESSING);
        // Label fixed: this section previously logged under "TITLE".
        this.DebugMsg(string.Format("DESCRIPTION: {0}", this.GetDescription()));
      }
      else
      {
        this.DebugMsg(string.Format("DESCRIPTION: {0}", "MISSING"));
      }
    }

    /** Metadata Keywords ---------------------------------------------- **/

    if (PdfTools != null)
    {
      string Text = PdfTools.GetKeywords();
      if (!string.IsNullOrEmpty(Text))
      {
        this.SetKeywords(KeywordsText: Text);
        this.DebugMsg(string.Format("KEYWORDS: {0}", this.GetKeywords()));
      }
      else
      {
        this.DebugMsg(string.Format("KEYWORDS: {0}", "MISSING"));
      }
    }

    /** Body Text ------------------------------------------------------ **/

    if (PdfTools != null)
    {

      // Start from an empty body; it is only replaced when text extraction succeeds.
      this.SetBodyText(Text: "");

      if (PdfTools.GetHasError())
      {
        this.AddRemark("PDF_ERROR", Observation: PdfTools.GetErrorMessage());
      }
      else
      {
        string Text = PdfTools.GetTextAsString();
        if (!string.IsNullOrEmpty(Text))
        {
          this.SetDocumentText(Text: Text);
          this.SetBodyText(Text: Text);
        }
      }

      this.DebugMsg(string.Format("BODY TEXT: {0}", this.GetBodyTextRaw()));

    }

    /** Data Extractors ------------------------------------------------ **/

    if (
      !string.IsNullOrEmpty(this.GetBodyTextRaw())
      && MacroscopePreferencesManager.GetDataExtractorsEnable()
      && MacroscopePreferencesManager.GetDataExtractorsApplyToPdf()
    )
    {
      string Text = this.GetBodyTextRaw();
      this.ProcessGenericDataExtractors(GenericText: Text);
    }

    /** Out Links Text ------------------------------------------------- **/

    if (this.GetDocumentTextRawLength() > 0)
    {
      if (this.GetIsInternal())
      {
        string Text = this.GetDocumentTextRaw();
        this.ProcessPureTextOutlinks(TextDoc: Text, LinkType: MacroscopeConstants.InOutLinkType.PDF);
      }
    }

    /** Out Links in Annotations --------------------------------------- **/

    // PdfTools is null when the body could not be read or parsed above;
    // guard explicitly instead of relying on the document text being empty.
    if (this.GetIsInternal() && (this.GetDocumentTextRawLength() > 0) && (PdfTools != null))
    {

      List<KeyValuePair<string, string>> AnnotationOutLinks = PdfTools.GetOutLinks();

      // TODO: Implement extraction of text that underlies the link annotation

      foreach (KeyValuePair<string, string> AnnotationOutLinkPair in AnnotationOutLinks)
      {

        // Key is used as the link URL; Value is used for alt text, anchor text and title.
        string AnnotationOutLinkUrlAbs = MacroscopeHttpUrlUtils.MakeUrlAbsolute(
          BaseHref: this.BaseHref,
          BaseUrl: this.DocUrl,
          Url: AnnotationOutLinkPair.Key
        );

        MacroscopeHyperlinkOut HyperlinkOut = this.HyperlinksOut.Add(
          LinkType: MacroscopeConstants.HyperlinkType.PDF,
          UrlTarget: AnnotationOutLinkUrlAbs
        );

        HyperlinkOut.SetRawTargetUrl(TargetUrl: AnnotationOutLinkUrlAbs);
        HyperlinkOut.SetAltText(AnnotationOutLinkPair.Value);
        HyperlinkOut.SetAnchorText(AnnotationOutLinkPair.Value);
        HyperlinkOut.SetTitle(AnnotationOutLinkPair.Value);
        HyperlinkOut.SetDoFollow();
        HyperlinkOut.SetMethod(Method: "GET");

        this.AddDocumentOutlink(
          AbsoluteUrl: AnnotationOutLinkUrlAbs,
          LinkType: MacroscopeConstants.InOutLinkType.PDF,
          Follow: true
        );

      }

    }

    /** ---------------------------------------------------------------- **/

  }

  // Report a failed request only after any response handling has run.
  if (ResponseErrorCondition != null)
  {
    this.ProcessErrorCondition(ResponseErrorCondition);
  }

}
/**************************************************************************/

/// <summary>
/// Scans every style rule of <paramref name="CssStylesheet"/> for
/// <c>background-image</c> and <c>background</c> properties and registers
/// the URL resolved from each one as a CSS hyperlink and as an IMAGE outlink
/// of this document. Does nothing when the document is external.
/// </summary>
/// <param name="CssStylesheet">The parsed stylesheet whose rules are scanned.</param>
private void ProcessCssOutlinks(Stylesheet CssStylesheet)
{

  if (this.GetIsExternal())
  {
    return;
  }

  foreach (var CssRule in CssStylesheet.StyleRules)
  {

    // Use the iterated rule directly; looking it up again through
    // IndexOf() + indexer was redundant work for every rule.
    foreach (Property pProp in CssRule.Declarations.Properties)
    {

      // Invariant lower-casing: culture-sensitive ToLower() can fail to
      // match these ASCII names under some locales (e.g. Turkish 'I').
      string PropertyName = pProp.Name.ToLowerInvariant();

      switch (PropertyName)
      {

        case "background-image":
        case "background":

          if (pProp.Term != null)
          {

            string BackgroundImageUrl = pProp.Term.ToString();
            string LinkUrlAbs = this.ProcessCssBackImageUrl(BackgroundImageUrl);

            DebugMsg(string.Format("ProcessCssHyperlinksOut: ({0}): {1}", PropertyName, BackgroundImageUrl));
            DebugMsg(string.Format("ProcessCssHyperlinksOut: ({0}): {1}", PropertyName, LinkUrlAbs));

            if (LinkUrlAbs != null)
            {

              this.HyperlinksOut.Add(
                LinkType: MacroscopeConstants.HyperlinkType.CSS,
                UrlTarget: LinkUrlAbs
              );

              MacroscopeLink Outlink = this.AddDocumentOutlink(
                AbsoluteUrl: LinkUrlAbs,
                LinkType: MacroscopeConstants.InOutLinkType.IMAGE,
                Follow: true
              );

              // Keep the unresolved value from the stylesheet alongside the absolute URL.
              if (Outlink != null)
              {
                Outlink.SetRawTargetUrl(BackgroundImageUrl);
              }

            }

          }

          break;

        default:
          break;

      }

    }

  }

}
/**************************************************************************/

/// <summary>
/// Collects URL values from the style rules of <paramref name="Stylesheet"/>
/// that declare a background image, and registers each resolved URL as a CSS
/// hyperlink and as an IMAGE outlink of this document. Does nothing when the
/// document is external. A failure while querying the stylesheet is logged
/// and no links are added.
/// </summary>
/// <param name="Stylesheet">The parsed stylesheet to scan.</param>
private void ProcessCssOutlinks(CssStyleSheet Stylesheet)
{

  // https://github.com/Athari/CsCss
  // https://developer.mozilla.org/en-US/docs/Web/CSS/url

  if (this.GetIsExternal())
  {
    return;
  }

  List<string> ImageUrls;

  try
  {
    ImageUrls = Stylesheet.AllStyleRules
      .Where(Rule => Rule.Declaration.BackgroundImage != null)
      .SelectMany(Rule => Rule.Declaration.AllData)
      .SelectMany(Datum => Datum.Value.Unit == CssUnit.List ? Datum.Value.List : new[] { Datum.Value })
      .Where(Candidate => Candidate.Unit == CssUnit.Url)
      .Select(Candidate => Candidate.OriginalUri)
      .ToList();
  }
  catch (Exception ex)
  {
    // Best effort: without a URL list there is nothing left to process.
    DebugMsg(string.Format("ProcessCssOutlinks: {0}", ex.Message));
    return;
  }

  foreach (string ImageUrl in ImageUrls)
  {

    string AbsoluteImageUrl = this.ProcessCssBackgroundImageUrl(ImageUrl);

    DebugMsg(string.Format("ProcessCssOutlinks: (background-image): {0}", ImageUrl));
    DebugMsg(string.Format("ProcessCssOutlinks: (background-image): {0}", AbsoluteImageUrl));

    if (AbsoluteImageUrl == null)
    {
      continue;
    }

    this.HyperlinksOut.Add(
      LinkType: MacroscopeConstants.HyperlinkType.CSS,
      UrlTarget: AbsoluteImageUrl
    );

    MacroscopeLink ImageOutlink = this.AddDocumentOutlink(
      AbsoluteUrl: AbsoluteImageUrl,
      LinkType: MacroscopeConstants.InOutLinkType.IMAGE,
      Follow: true
    );

    // Keep the unresolved value from the stylesheet alongside the absolute URL.
    if (ImageOutlink != null)
    {
      ImageOutlink.SetRawTargetUrl(ImageUrl);
    }

  }

}