Esempio n. 1
0
        /**************************************************************************/

        /// <summary>
        /// Reports whether <paramref name="HyperlinkOut"/> has already been recorded against
        /// <paramref name="msDoc"/> in <c>NodeVisited</c>, and records it when it has not.
        /// </summary>
        /// <param name="msDoc">Document used as the key into <c>NodeVisited</c>.</param>
        /// <param name="HyperlinkOut">Outbound hyperlink to look up and, if absent, record.</param>
        /// <returns>
        /// <c>true</c> if the link was already recorded for the document;
        /// <c>false</c> if this call recorded it for the first time.
        /// </returns>
        private bool CheckNodeAlreadyVisited(
            MacroscopeDocument msDoc,
            MacroscopeHyperlinkOut HyperlinkOut
            )
        {
            List <MacroscopeHyperlinkOut> VisitedLinks;

            // One lookup replaces ContainsKey followed by repeated indexer reads.
            if (!this.NodeVisited.TryGetValue(msDoc, out VisitedLinks))
            {
                // First time this document is seen: start its list of visited links.
                VisitedLinks = new List <MacroscopeHyperlinkOut> ();
                this.NodeVisited[msDoc] = VisitedLinks;
            }
            else if (VisitedLinks.Contains(HyperlinkOut))
            {
                return(true);
            }

            VisitedLinks.Add(HyperlinkOut);

            return(false);
        }
    /**************************************************************************/

    /// <summary>
    /// Tests whether the given link is present in <c>Links</c>.
    /// </summary>
    /// <param name="Link">Outbound hyperlink to search for.</param>
    /// <returns>
    /// <c>true</c> when <c>Links.Contains</c> finds the link; otherwise <c>false</c>.
    /// </returns>
    public bool ContainsLink ( MacroscopeHyperlinkOut  Link )
    {
      // The lookup runs while holding the monitor on the Links collection itself.
      lock( this.Links )
      {
        return( this.Links.Contains( Link ) );
      }
    }
Esempio n. 3
0
        /**************************************************************************/

        /// <summary>
        /// Removes from <c>Links</c> every entry that reports itself equal to
        /// <paramref name="HyperlinkOut"/>.
        /// </summary>
        /// <param name="HyperlinkOut">Outbound hyperlink whose equal entries are removed.</param>
        public void Remove(MacroscopeHyperlinkOut HyperlinkOut)
        {
            lock (this.Links)
            {
                // Collect the matches first: removing from a collection while a foreach is
                // enumerating it invalidates the enumerator (List<T> throws
                // InvalidOperationException on the next step).
                List <MacroscopeHyperlinkOut> Matches = new List <MacroscopeHyperlinkOut> ();

                foreach (MacroscopeHyperlinkOut HyperlinkOutOld in this.Links)
                {
                    if (HyperlinkOutOld.Equals(HyperlinkOut))
                    {
                        Matches.Add(HyperlinkOutOld);
                    }
                }

                foreach (MacroscopeHyperlinkOut Match in Matches)
                {
                    this.Links.Remove(Match);
                }
            }
        }
    /**************************************************************************/

    /// <summary>
    /// Creates a new outbound hyperlink with the given type and target URL, appends it to
    /// <c>Links</c>, and hands it back to the caller.
    /// </summary>
    /// <param name="LinkType">Hyperlink type applied via <c>SetHyperlinkType</c>.</param>
    /// <param name="UrlTarget">Target URL applied via <c>SetTargetUrl</c>.</param>
    /// <returns>The newly created hyperlink that was added to <c>Links</c>.</returns>
    public MacroscopeHyperlinkOut Add (
      MacroscopeConstants.HyperlinkType LinkType,
      string UrlTarget
    )
    {
      // Configure the link fully before it becomes visible through the collection.
      MacroscopeHyperlinkOut NewLink = new MacroscopeHyperlinkOut ();

      NewLink.SetHyperlinkType( LinkType );
      NewLink.SetTargetUrl( UrlTarget );

      lock( this.Links )
      {
        this.Links.Add( NewLink );
      }

      return( NewLink );
    }
        /** -------------------------------------------------------------------- **/

        /// <summary>
        /// Requests this document's URI through the job master's HTTP client and, when a
        /// response is obtained, treats the response body as PDF data: sets the locale,
        /// hreflang and canonical values, then extracts title, author, description, keywords,
        /// body text and out-links from it.
        /// </summary>
        /// <remarks>
        /// Exceptions thrown by the request are caught, logged via <c>DebugMsg</c>, recorded
        /// with <c>AddRemark</c>, and their message is passed to <c>ProcessErrorCondition</c>
        /// at the end of the method. A failure while reading or parsing the body sets the
        /// status code to <c>BadRequest</c>, the content length to 0, and skips all
        /// PDF-dependent steps.
        /// </remarks>
        /// <returns>A task that completes when processing has finished.</returns>
        private async Task _ProcessPdfPage()
        {
            MacroscopeHttpTwoClient         Client         = this.DocCollection.GetJobMaster().GetHttpClient();
            MacroscopeHttpTwoClientResponse ClientResponse = null;
            string ResponseErrorCondition = null;

            try
            {
                ClientResponse = await Client.Get(
                    this.GetUri(),
                    this.ConfigurePdfPageRequestHeadersCallback,
                    this.PostProcessRequestHttpHeadersCallback
                    );
            }
            catch (MacroscopeDocumentException ex)
            {
                this.DebugMsg(string.Format("_ProcessPdfPage :: MacroscopeDocumentException: {0}", ex.Message));
                ResponseErrorCondition = ex.Message;
                this.AddRemark("_ProcessPdfPage", ex.Message);
            }
            catch (Exception ex)
            {
                this.DebugMsg(string.Format("_ProcessPdfPage :: Exception: {0}", ex.Message));
                ResponseErrorCondition = ex.Message;
                this.AddRemark("_ProcessPdfPage", ex.Message);
            }

            if (ClientResponse != null)
            {
                // Stays null when the response body cannot be read or parsed; every
                // PDF-dependent step below checks for that.
                MacroscopePdfTools PdfTools;

                this.ProcessResponseHttpHeaders(Response: ClientResponse);

                { // Probe Locale
                    // TODO: Implement locale probing; a fixed value is used until then.
                    this.Locale = "x-default";
                    this.SetHreflang(HrefLangLocale: this.Locale, Url: this.DocUrl);
                }

                { // Canonical
                    this.Canonical = this.DocUrl;
                    this.DebugMsg(string.Format("CANONICAL: {0}", this.Canonical));
                }

                /** Get Response Body ---------------------------------------------- **/

                try
                {
                    byte[] RawData = ClientResponse.GetContentAsBytes();
                    this.SetContentLength(Length: RawData.Length);

                    PdfTools = new MacroscopePdfTools(PdfData: RawData);

                    if (PdfTools.GetHasError())
                    {
                        this.AddRemark("CORRUPT_PDF", Observation: PdfTools.GetErrorMessage());
                    }

                    this.SetWasDownloaded(true);
                }
                catch (Exception ex)
                {
                    this.DebugMsg(string.Format("Exception: {0}", ex.Message));
                    this.SetStatusCode(HttpStatusCode.BadRequest);
                    PdfTools = null;
                    this.SetContentLength(Length: 0);
                }

                /** Title ---------------------------------------------------------- **/

                if (PdfTools != null)
                {
                    string Text = PdfTools.GetTitle();

                    if (!string.IsNullOrEmpty(Text))
                    {
                        this.SetTitle(Text, MacroscopeConstants.TextProcessingMode.NO_PROCESSING);
                        this.DebugMsg(string.Format("TITLE: {0}", this.GetTitle()));
                    }
                    else
                    {
                        this.DebugMsg(string.Format("TITLE: {0}", "MISSING"));
                    }
                }

                /** Author --------------------------------------------------------- **/

                if (PdfTools != null)
                {
                    string Text = PdfTools.GetAuthor();

                    if (!string.IsNullOrEmpty(Text))
                    {
                        this.SetAuthor(AuthorText: Text, ProcessingMode: MacroscopeConstants.TextProcessingMode.NO_PROCESSING);
                        this.DebugMsg(string.Format("AUTHOR: {0}", this.GetAuthor()));
                    }
                    else
                    {
                        this.DebugMsg(string.Format("AUTHOR: {0}", "MISSING"));
                    }
                }

                /** Description ---------------------------------------------------- **/

                if (PdfTools != null)
                {
                    string Text = PdfTools.GetDescription();

                    // The debug label previously read "TITLE" here, which made description
                    // output indistinguishable from title output in the log.
                    if (!string.IsNullOrEmpty(Text))
                    {
                        this.SetDescription(Text, MacroscopeConstants.TextProcessingMode.NO_PROCESSING);
                        this.DebugMsg(string.Format("DESCRIPTION: {0}", this.GetDescription()));
                    }
                    else
                    {
                        this.DebugMsg(string.Format("DESCRIPTION: {0}", "MISSING"));
                    }
                }

                /** Metadata Keywords ---------------------------------------------- **/

                if (PdfTools != null)
                {
                    string Text = PdfTools.GetKeywords();

                    if (!string.IsNullOrEmpty(Text))
                    {
                        this.SetKeywords(KeywordsText: Text);
                        this.DebugMsg(string.Format("KEYWORDS: {0}", this.GetKeywords()));
                    }
                    else
                    {
                        this.DebugMsg(string.Format("KEYWORDS: {0}", "MISSING"));
                    }
                }

                /** Body Text ------------------------------------------------------ **/

                if (PdfTools != null)
                {
                    // Reset the body text first so a PDF with errors ends up with an empty body.
                    this.SetBodyText(Text: "");

                    if (PdfTools.GetHasError())
                    {
                        this.AddRemark("PDF_ERROR", Observation: PdfTools.GetErrorMessage());
                    }
                    else
                    {
                        string Text = PdfTools.GetTextAsString();
                        if (!string.IsNullOrEmpty(Text))
                        {
                            this.SetDocumentText(Text: Text);
                            this.SetBodyText(Text: Text);
                        }
                    }

                    this.DebugMsg(string.Format("BODY TEXT: {0}", this.GetBodyTextRaw()));
                }

                /** Data Extractors ------------------------------------------------ **/

                if (!string.IsNullOrEmpty(this.GetBodyTextRaw()))
                {
                    if (MacroscopePreferencesManager.GetDataExtractorsEnable())
                    {
                        if (MacroscopePreferencesManager.GetDataExtractorsApplyToPdf())
                        {
                            string Text = this.GetBodyTextRaw();
                            this.ProcessGenericDataExtractors(GenericText: Text);
                        }
                    }
                }

                /** Out Links Text ------------------------------------------------- **/

                if (this.GetDocumentTextRawLength() > 0)
                {
                    if (this.GetIsInternal())
                    {
                        string Text = this.GetDocumentTextRaw();
                        this.ProcessPureTextOutlinks(TextDoc: Text, LinkType: MacroscopeConstants.InOutLinkType.PDF);
                    }
                }

                /** Out Links in Annotations --------------------------------------- **/

                // PdfTools is null when the body could not be parsed; without this guard a
                // document that already carries text would dereference null here.
                if (this.GetIsInternal() && (this.GetDocumentTextRawLength() > 0) && (PdfTools != null))
                {
                    List <KeyValuePair <string, string> > AnnotationOutLinks = PdfTools.GetOutLinks();

                    // TODO: Implement extraction of text that underlies the link annotation

                    foreach (KeyValuePair <string, string> AnnotationOutLinkPair in AnnotationOutLinks)
                    {
                        MacroscopeHyperlinkOut HyperlinkOut = null;
                        string AnnotationOutLinkUrlAbs;

                        // The pair's key is used as the link URL and its value as the link text.
                        AnnotationOutLinkUrlAbs = MacroscopeHttpUrlUtils.MakeUrlAbsolute(
                            BaseHref: this.BaseHref,
                            BaseUrl: this.DocUrl,
                            Url: AnnotationOutLinkPair.Key
                            );

                        HyperlinkOut = this.HyperlinksOut.Add(LinkType: MacroscopeConstants.HyperlinkType.PDF, UrlTarget: AnnotationOutLinkUrlAbs);
                        HyperlinkOut.SetRawTargetUrl(TargetUrl: AnnotationOutLinkUrlAbs);
                        HyperlinkOut.SetAltText(AnnotationOutLinkPair.Value);
                        HyperlinkOut.SetAnchorText(AnnotationOutLinkPair.Value);
                        HyperlinkOut.SetTitle(AnnotationOutLinkPair.Value);
                        HyperlinkOut.SetDoFollow();
                        HyperlinkOut.SetMethod(Method: "GET");

                        this.AddDocumentOutlink(AbsoluteUrl: AnnotationOutLinkUrlAbs, LinkType: MacroscopeConstants.InOutLinkType.PDF, Follow: true);
                    }
                }

                /** ---------------------------------------------------------------- **/
            }

            // Report a request failure only after any response handling above has run.
            if (ResponseErrorCondition != null)
            {
                this.ProcessErrorCondition(ResponseErrorCondition);
            }
        }
        /**************************************************************************/

        /// <summary>
        /// Walks every style rule of the stylesheet and, for each <c>background-image</c> or
        /// <c>background</c> property that has a term, resolves the term through
        /// <c>ProcessCssBackImageUrl</c> and records the result as a CSS hyperlink and an
        /// image outlink of this document.
        /// </summary>
        /// <remarks>Does nothing when <c>GetIsExternal()</c> returns true.</remarks>
        /// <param name="CssStylesheet">Parsed stylesheet whose rules are scanned.</param>
        private void ProcessCssOutlinks(Stylesheet CssStylesheet)
        {
            if (this.GetIsExternal())
            {
                return;
            }

            foreach (var CssRule in CssStylesheet.StyleRules)
            {
                // Use the enumerated rule directly; looking its index up again with IndexOf
                // made the scan quadratic and could land on an earlier, equal rule.
                foreach (Property pProp in CssRule.Declarations.Properties)
                {
                    // Invariant lower-casing keeps the match independent of the current
                    // culture (e.g. the Turkish dotless i in "background-image").
                    string PropertyName = pProp.Name.ToLowerInvariant();

                    switch (PropertyName)
                    {
                    case "background-image":
                    case "background":

                        if (pProp.Term != null)
                        {
                            string BackgroundImageUrl = pProp.Term.ToString();
                            string LinkUrlAbs         = this.ProcessCssBackImageUrl(BackgroundImageUrl);

                            DebugMsg(string.Format("ProcessCssHyperlinksOut: ({0}): {1}", PropertyName, BackgroundImageUrl));
                            DebugMsg(string.Format("ProcessCssHyperlinksOut: ({0}): {1}", PropertyName, LinkUrlAbs));

                            if (LinkUrlAbs != null)
                            {
                                this.HyperlinksOut.Add(
                                    LinkType: MacroscopeConstants.HyperlinkType.CSS,
                                    UrlTarget: LinkUrlAbs
                                    );

                                MacroscopeLink Outlink = this.AddDocumentOutlink(
                                    AbsoluteUrl: LinkUrlAbs,
                                    LinkType: MacroscopeConstants.InOutLinkType.IMAGE,
                                    Follow: true
                                    );

                                // Keep the unresolved term text alongside the absolute URL.
                                if (Outlink != null)
                                {
                                    Outlink.SetRawTargetUrl(BackgroundImageUrl);
                                }
                            }
                        }

                        break;

                    default:
                        break;
                    }
                }
            }
        }
        /**************************************************************************/

        /// <summary>
        /// Collects the URL values from the declarations of every style rule that has a
        /// background image, resolves each through <c>ProcessCssBackgroundImageUrl</c>, and
        /// records the result as a CSS hyperlink and an image outlink of this document.
        /// </summary>
        /// <remarks>
        /// Does nothing when <c>GetIsExternal()</c> returns true. A failure while querying the
        /// stylesheet is logged via <c>DebugMsg</c> and no links are recorded.
        /// </remarks>
        /// <param name="Stylesheet">Parsed stylesheet whose rules are queried.</param>
        private void ProcessCssOutlinks(CssStyleSheet Stylesheet)
        {
            // https://github.com/Athari/CsCss
            // https://developer.mozilla.org/en-US/docs/Web/CSS/url

            if (this.GetIsExternal())
            {
                return;
            }

            List <string> RawUrls = null;

            try
            {
                // List-valued properties are flattened so each member is tested on its own.
                RawUrls = Stylesheet.AllStyleRules
                          .Where(Rule => Rule.Declaration.BackgroundImage != null)
                          .SelectMany(Rule => Rule.Declaration.AllData)
                          .SelectMany(Entry => Entry.Value.Unit == CssUnit.List ? Entry.Value.List : new[] { Entry.Value })
                          .Where(Item => Item.Unit == CssUnit.Url)
                          .Select(Item => Item.OriginalUri)
                          .ToList();
            }
            catch (Exception ex)
            {
                DebugMsg(string.Format("ProcessCssOutlinks: {0}", ex.Message));
            }

            // Still null only when the query above threw.
            if (RawUrls == null)
            {
                return;
            }

            foreach (string RawUrl in RawUrls)
            {
                string AbsoluteUrl = this.ProcessCssBackgroundImageUrl(RawUrl);

                DebugMsg(string.Format("ProcessCssOutlinks: (background-image): {0}", RawUrl));
                DebugMsg(string.Format("ProcessCssOutlinks: (background-image): {0}", AbsoluteUrl));

                if (AbsoluteUrl == null)
                {
                    continue;
                }

                this.HyperlinksOut.Add(
                    LinkType: MacroscopeConstants.HyperlinkType.CSS,
                    UrlTarget: AbsoluteUrl
                    );

                MacroscopeLink ImageOutlink = this.AddDocumentOutlink(
                    AbsoluteUrl: AbsoluteUrl,
                    LinkType: MacroscopeConstants.InOutLinkType.IMAGE,
                    Follow: true
                    );

                // Keep the URL as written in the stylesheet alongside the absolute form.
                if (ImageOutlink != null)
                {
                    ImageOutlink.SetRawTargetUrl(RawUrl);
                }
            }
        }