Exemple #1
0
 /// <summary>
 /// For every PDF fixture in <c>PdfDocsData</c>, parses the document and checks
 /// that the title it reports equals the value stored under the same key in
 /// <c>PdfDocsTitle</c>.
 /// </summary>
 public void TestPdfTitle()
 {
     foreach (string DocName in this.PdfDocsData.Keys)
     {
         // Parse first, then look up the expectation, then query the parser.
         var Tools = new MacroscopePdfTools(PdfData: this.PdfDocsData[DocName]);
         var ExpectedTitle = this.PdfDocsTitle[DocName];
         var ActualTitle = Tools.GetTitle();

         Assert.AreEqual(ExpectedTitle, ActualTitle);
     }
 }
Exemple #2
0
 /// <summary>
 /// For every PDF fixture in <c>PdfDocsData</c>, parses the document and checks
 /// that the parser does not report an error.
 /// </summary>
 public void TestLoadPdf()
 {
     foreach (string DocName in this.PdfDocsData.Keys)
     {
         var Tools = new MacroscopePdfTools(PdfData: this.PdfDocsData[DocName]);
         bool LoadFailed = Tools.GetHasError();

         Assert.IsFalse(LoadFailed);
     }
 }
Exemple #3
0
 /// <summary>
 /// For every PDF fixture in <c>PdfDocsData</c>, parses the document and checks
 /// that the extracted text is not empty.
 /// </summary>
 public void TestPdfTextAsString()
 {
     foreach (string DocName in this.PdfDocsData.Keys)
     {
         var Tools = new MacroscopePdfTools(PdfData: this.PdfDocsData[DocName]);
         int ExtractedLength = Tools.GetTextAsString().Length;

         Assert.Greater(ExtractedLength, 0);
     }
 }
Exemple #4
0
 /// <summary>
 /// For every PDF fixture in <c>PdfDocsData</c>, parses the document.
 /// </summary>
 /// <remarks>
 /// The description comparison is currently disabled, so this test only
 /// exercises construction of the parser and asserts nothing.
 /// </remarks>
 public void TestPdfDescription()
 {
     foreach (string DocName in this.PdfDocsData.Keys)
     {
         MacroscopePdfTools Tools = new MacroscopePdfTools(PdfData: this.PdfDocsData[DocName]);

         // Disabled check, kept for reference:
         // Assert.AreEqual( this.PdfDocsDescription[ DocName ], Tools.GetDescription() );
     }
 }
        /** -------------------------------------------------------------------- **/

        /// <summary>
        /// Fetches this document's URI with the job master's HTTP client, parses the
        /// response body as a PDF, and copies the PDF's title, author, description,
        /// keywords, body text and out-links onto this document.
        /// </summary>
        /// <remarks>
        /// Any exception raised by the HTTP request is logged, recorded as a remark,
        /// and finally passed to <c>ProcessErrorCondition</c>; it is not rethrown.
        /// If the body cannot be read or parsed, the status code is set to
        /// <c>BadRequest</c>, the content length to 0, and all metadata sections
        /// are skipped.
        /// </remarks>
        /// <returns>A task that completes when processing has finished.</returns>
        private async Task _ProcessPdfPage()
        {
            MacroscopeHttpTwoClient         Client         = this.DocCollection.GetJobMaster().GetHttpClient();
            MacroscopeHttpTwoClientResponse ClientResponse = null;
            string ResponseErrorCondition = null;

            try
            {
                ClientResponse = await Client.Get(
                    this.GetUri(),
                    this.ConfigurePdfPageRequestHeadersCallback,
                    this.PostProcessRequestHttpHeadersCallback
                    );
            }
            catch (MacroscopeDocumentException ex)
            {
                this.DebugMsg(string.Format("_ProcessPdfPage :: MacroscopeDocumentException: {0}", ex.Message));
                ResponseErrorCondition = ex.Message;
                this.AddRemark("_ProcessPdfPage", ex.Message);
            }
            catch (Exception ex)
            {
                this.DebugMsg(string.Format("_ProcessPdfPage :: Exception: {0}", ex.Message));
                ResponseErrorCondition = ex.Message;
                this.AddRemark("_ProcessPdfPage", ex.Message);
            }

            if (ClientResponse != null)
            {
                MacroscopePdfTools PdfTools;

                this.ProcessResponseHttpHeaders(Response: ClientResponse);

                {                              // Probe Locale
                    // Locale probing is not implemented; a fixed value is used.
                    this.Locale = "x-default";
                    this.SetHreflang(HrefLangLocale: this.Locale, Url: this.DocUrl);
                }

                { // Canonical
                    // The document's own URL is used as its canonical URL.
                    this.Canonical = this.DocUrl;
                    this.DebugMsg(string.Format("CANONICAL: {0}", this.Canonical));
                }

                /** Get Response Body ---------------------------------------------- **/

                try
                {
                    byte[] RawData = ClientResponse.GetContentAsBytes();
                    this.SetContentLength(Length: RawData.Length);

                    PdfTools = new MacroscopePdfTools(PdfData: RawData);

                    if (PdfTools.GetHasError())
                    {
                        this.AddRemark("CORRUPT_PDF", Observation: PdfTools.GetErrorMessage());
                    }

                    this.SetWasDownloaded(true);
                }
                catch (Exception ex)
                {
                    // Leaving PdfTools null makes every section below skip itself.
                    this.DebugMsg(string.Format("Exception: {0}", ex.Message));
                    this.SetStatusCode(HttpStatusCode.BadRequest);
                    PdfTools = null;
                    this.SetContentLength(Length: 0);
                }

                /** Title ---------------------------------------------------------- **/

                if (PdfTools != null)
                {
                    string Text = PdfTools.GetTitle();

                    if (!string.IsNullOrEmpty(Text))
                    {
                        this.SetTitle(Text, MacroscopeConstants.TextProcessingMode.NO_PROCESSING);
                        this.DebugMsg(string.Format("TITLE: {0}", this.GetTitle()));
                    }
                    else
                    {
                        this.DebugMsg(string.Format("TITLE: {0}", "MISSING"));
                    }
                }

                /** Author --------------------------------------------------------- **/

                if (PdfTools != null)
                {
                    string Text = PdfTools.GetAuthor();

                    if (!string.IsNullOrEmpty(Text))
                    {
                        this.SetAuthor(AuthorText: Text, ProcessingMode: MacroscopeConstants.TextProcessingMode.NO_PROCESSING);
                        this.DebugMsg(string.Format("AUTHOR: {0}", this.GetAuthor()));
                    }
                    else
                    {
                        this.DebugMsg(string.Format("AUTHOR: {0}", "MISSING"));
                    }
                }

                /** Description ---------------------------------------------------- **/

                if (PdfTools != null)
                {
                    string Text = PdfTools.GetDescription();

                    if (!string.IsNullOrEmpty(Text))
                    {
                        this.SetDescription(Text, MacroscopeConstants.TextProcessingMode.NO_PROCESSING);
                        // Logged under its own label; it was previously mislabelled "TITLE".
                        this.DebugMsg(string.Format("DESCRIPTION: {0}", this.GetDescription()));
                    }
                    else
                    {
                        this.DebugMsg(string.Format("DESCRIPTION: {0}", "MISSING"));
                    }
                }

                /** Metadata Keywords ---------------------------------------------- **/

                if (PdfTools != null)
                {
                    string Text = PdfTools.GetKeywords();

                    if (!string.IsNullOrEmpty(Text))
                    {
                        this.SetKeywords(KeywordsText: Text);
                        this.DebugMsg(string.Format("KEYWORDS: {0}", this.GetKeywords()));
                    }
                    else
                    {
                        this.DebugMsg(string.Format("KEYWORDS: {0}", "MISSING"));
                    }
                }

                /** Body Text ------------------------------------------------------ **/

                if (PdfTools != null)
                {
                    // Start from an empty body so a failed extraction leaves no stale text.
                    this.SetBodyText(Text: "");

                    if (PdfTools.GetHasError())
                    {
                        this.AddRemark("PDF_ERROR", Observation: PdfTools.GetErrorMessage());
                    }
                    else
                    {
                        string Text = PdfTools.GetTextAsString();
                        if (!string.IsNullOrEmpty(Text))
                        {
                            this.SetDocumentText(Text: Text);
                            this.SetBodyText(Text: Text);
                        }
                    }

                    this.DebugMsg(string.Format("BODY TEXT: {0}", this.GetBodyTextRaw()));
                }

                /** Data Extractors ------------------------------------------------ **/

                if (!string.IsNullOrEmpty(this.GetBodyTextRaw()))
                {
                    if (MacroscopePreferencesManager.GetDataExtractorsEnable())
                    {
                        if (MacroscopePreferencesManager.GetDataExtractorsApplyToPdf())
                        {
                            string Text = this.GetBodyTextRaw();
                            this.ProcessGenericDataExtractors(GenericText: Text);
                        }
                    }
                }

                /** Out Links Text ------------------------------------------------- **/

                if (this.GetDocumentTextRawLength() > 0)
                {
                    if (this.GetIsInternal())
                    {
                        string Text = this.GetDocumentTextRaw();
                        this.ProcessPureTextOutlinks(TextDoc: Text, LinkType: MacroscopeConstants.InOutLinkType.PDF);
                    }
                }

                /** Out Links in Annotations --------------------------------------- **/

                // PdfTools is null when the body could not be parsed; guard here as in
                // every other section so a parse failure cannot cause a NullReferenceException.
                if ((PdfTools != null) && this.GetIsInternal() && (this.GetDocumentTextRawLength() > 0))
                {
                    List <KeyValuePair <string, string> > AnnotationOutLinks = PdfTools.GetOutLinks();

                    // TODO: Implement extraction of text that underlies the link annotation

                    if (AnnotationOutLinks != null)
                    {
                        foreach (KeyValuePair <string, string> AnnotationOutLinkPair in AnnotationOutLinks)
                        {
                            MacroscopeHyperlinkOut HyperlinkOut = null;
                            string AnnotationOutLinkUrlAbs;

                            // The pair's key is the link URL; its value is reused below
                            // as alt text, anchor text and title.
                            AnnotationOutLinkUrlAbs = MacroscopeHttpUrlUtils.MakeUrlAbsolute(
                                BaseHref: this.BaseHref,
                                BaseUrl: this.DocUrl,
                                Url: AnnotationOutLinkPair.Key
                                );

                            HyperlinkOut = this.HyperlinksOut.Add(LinkType: MacroscopeConstants.HyperlinkType.PDF, UrlTarget: AnnotationOutLinkUrlAbs);
                            HyperlinkOut.SetRawTargetUrl(TargetUrl: AnnotationOutLinkUrlAbs);
                            HyperlinkOut.SetAltText(AnnotationOutLinkPair.Value);
                            HyperlinkOut.SetAnchorText(AnnotationOutLinkPair.Value);
                            HyperlinkOut.SetTitle(AnnotationOutLinkPair.Value);
                            HyperlinkOut.SetDoFollow();
                            HyperlinkOut.SetMethod(Method: "GET");

                            this.AddDocumentOutlink(AbsoluteUrl: AnnotationOutLinkUrlAbs, LinkType: MacroscopeConstants.InOutLinkType.PDF, Follow: true);
                        }
                    }
                }

                /** ---------------------------------------------------------------- **/
            }

            if (ResponseErrorCondition != null)
            {
                this.ProcessErrorCondition(ResponseErrorCondition);
            }
        }
Exemple #6
0
        /**************************************************************************/

        /// <summary>
        /// Issues a synchronous GET for <c>DocUrl</c>, reads the response body as a
        /// PDF, and copies the PDF's title onto this document.
        /// </summary>
        /// <remarks>
        /// A <c>UriFormatException</c> or <c>WebException</c> raised by the request is
        /// logged and finally passed to <c>ProcessErrorCondition</c>; it is not
        /// rethrown. The response and its stream are always released, including when
        /// processing the response throws.
        /// </remarks>
        private void ProcessPdfPage()
        {
            HttpWebRequest  req = null;
            HttpWebResponse res = null;
            string          ResponseErrorCondition = null;
            Boolean         Authenticating         = false;

            try
            {
                req           = WebRequest.CreateHttp(this.DocUrl);
                req.Method    = "GET";
                req.Timeout   = this.Timeout;
                req.KeepAlive = false;
                req.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

                this.PrepareRequestHttpHeaders(req: req);

                Authenticating = this.AuthenticateRequest(req);

                MacroscopePreferencesManager.EnableHttpProxy(req);

                res = ( HttpWebResponse )req.GetResponse();
            }
            catch (UriFormatException ex)
            {
                DebugMsg(string.Format("ProcessPdfPage :: UriFormatException: {0}", ex.Message));
                ResponseErrorCondition = ex.Message;
            }
            catch (WebException ex)
            {
                DebugMsg(string.Format("ProcessPdfPage :: WebException: {0}", ex.Message));
                DebugMsg(string.Format("ProcessPdfPage :: WebException: {0}", ex.Status));
                DebugMsg(string.Format("ProcessPdfPage :: WebException: {0}", ( int )ex.Status));

                ResponseErrorCondition = ex.Status.ToString();

                // The error response attached to the exception holds a connection;
                // nothing below uses it, so release it here.
                if (ex.Response != null)
                {
                    ex.Response.Close();
                }
            }

            if (res != null)
            {
                try
                {
                    MacroscopePdfTools pdfTools;

                    this.ProcessResponseHttpHeaders(req, res);

                    if (Authenticating)
                    {
                        this.VerifyOrPurgeCredential();
                    }

                    {                              // Probe Locale
                        // Locale probing is not implemented; a fixed value is used.
                        this.Locale = "x-default";
                        this.SetHreflang(HrefLangLocale: this.Locale, Url: this.DocUrl);
                    }

                    { // Canonical
                        // The document's own URL is used as its canonical URL.
                        this.Canonical = this.DocUrl;
                        DebugMsg(string.Format("CANONICAL: {0}", this.Canonical));
                    }

                    { // Get Response Body
                        try
                        {
                            byte [] RawData;

                            // Read the whole body in buffered chunks and dispose the
                            // stream deterministically.
                            using (Stream ResponseStream = res.GetResponseStream())
                            using (MemoryStream RawDataBuffer = new MemoryStream())
                            {
                                ResponseStream.CopyTo(RawDataBuffer);
                                RawData = RawDataBuffer.ToArray();
                            }

                            this.ContentLength = RawData.Length;

                            pdfTools = new MacroscopePdfTools(RawData);

                            if (pdfTools.GetHasError())
                            {
                                this.AddRemark(Observation: pdfTools.GetErrorMessage());
                            }

                            this.SetWasDownloaded(true);
                        }
                        catch (WebException ex)
                        {
                            DebugMsg(string.Format("WebException: {0}", ex.Message));

                            if (ex.Response != null)
                            {
                                this.SetStatusCode((( HttpWebResponse )ex.Response).StatusCode);
                            }
                            else
                            {
                                // NOTE(review): this casts a WebExceptionStatus value to
                                // HttpStatusCode; the two enums are unrelated, so confirm
                                // that consumers of the status code expect this.
                                this.SetStatusCode(( HttpStatusCode )ex.Status);
                            }

                            pdfTools           = null;
                            this.ContentLength = 0;
                        }
                        catch (Exception ex)
                        {
                            DebugMsg(string.Format("Exception: {0}", ex.Message));
                            this.SetStatusCode(HttpStatusCode.BadRequest);
                            pdfTools           = null;
                            this.ContentLength = 0;
                        }
                    }

                    /** Title ---------------------------------------------------------- **/

                    {
                        // pdfTools is null when the body could not be read or parsed.
                        if (pdfTools != null)
                        {
                            string DocumentTitle = pdfTools.GetTitle();
                            if (DocumentTitle != null)
                            {
                                this.SetTitle(DocumentTitle, MacroscopeConstants.TextProcessingMode.NO_PROCESSING);
                                DebugMsg(string.Format("TITLE: {0}", this.GetTitle()));
                            }
                            else
                            {
                                DebugMsg(string.Format("TITLE: {0}", "MISSING"));
                            }
                        }
                    }

                    /** ---------------------------------------------------------------- **/
                }
                finally
                {
                    // Release the connection even if processing above threw.
                    res.Close();

                    res.Dispose();
                }
            }

            if (ResponseErrorCondition != null)
            {
                this.ProcessErrorCondition(ResponseErrorCondition);
            }
        }