/**************************************************************************/
/// <summary>
/// Copies the current values held by <c>MacroscopePreferencesManager</c>
/// into the controls of this form.
/// </summary>
public void SetPrefsFormControlFields()
{
    /** Display Options ------------------------------------------------ **/

    checkBoxPauseDisplayDuringScan.Checked = MacroscopePreferencesManager.GetPauseDisplayDuringScan();
    checkBoxShowProgressDialogues.Checked = MacroscopePreferencesManager.GetShowProgressDialogues();

    /** Spidering Control: numeric bounds ------------------------------ **/

    // The bounds are applied before the corresponding Value properties are
    // assigned further down.
    // NOTE(review): presumably -1 means "unlimited" for depth and page limit - confirm.
    numericUpDownDepth.Minimum = -1;
    numericUpDownDepth.Maximum = 10000;

    numericUpDownPageLimit.Minimum = -1;
    numericUpDownPageLimit.Maximum = 10000;

    numericUpDownCrawlDelay.Minimum = 0;
    numericUpDownCrawlDelay.Maximum = 60;

    numericUpDownMaxRetries.Minimum = 0;
    numericUpDownMaxRetries.Maximum = 10;

    /** WebProxy Options ----------------------------------------------- **/

    comboBoxProxyType.SelectedIndex = (int)MacroscopePreferencesManager.GetProxyType();

    /** Server Certificate Options ------------------------------------- **/

    checkBoxServerCertificateValidation.Checked = MacroscopePreferencesManager.GetServerCertificateValidation();

    /** Spidering Control ---------------------------------------------- **/

    numericUpDownMaxThreads.Value = MacroscopePreferencesManager.GetMaxThreads();
    numericUpDownDepth.Value = MacroscopePreferencesManager.GetDepth();
    numericUpDownPageLimit.Value = MacroscopePreferencesManager.GetPageLimit();
    numericUpDownCrawlDelay.Value = MacroscopePreferencesManager.GetCrawlDelay();
    numericUpDownRequestTimeout.Value = (decimal)MacroscopePreferencesManager.GetRequestTimeout();
    numericUpDownMaxRetries.Value = (decimal)MacroscopePreferencesManager.GetMaxRetries();

    checkBoxCrawlStrictUrlCheck.Checked = MacroscopePreferencesManager.GetCrawlStrictUrlCheck();
    checkBoxCheckExternalLinks.Checked = MacroscopePreferencesManager.GetCheckExternalLinks();
    checkBoxFetchExternalLinks.Checked = MacroscopePreferencesManager.GetFetchExternalLinks();

    checkBoxFollowRobotsProtocol.Checked = MacroscopePreferencesManager.GetFollowRobotsProtocol();
    checkBoxFollowSitemapLinks.Checked = MacroscopePreferencesManager.GetFollowSitemapLinks();
    checkBoxProbeHumansText.Checked = MacroscopePreferencesManager.GetProbeHumansText();

    checkBoxCheckRedirects.Checked = MacroscopePreferencesManager.GetCheckRedirects();
    checkBoxFollowRedirects.Checked = MacroscopePreferencesManager.GetFollowRedirects();
    checkBoxFollowNoFollow.Checked = MacroscopePreferencesManager.GetFollowNoFollow();
    checkBoxIgnoreQueries.Checked = MacroscopePreferencesManager.GetIgnoreQueries();
    checkBoxIgnoreHashFragments.Checked = MacroscopePreferencesManager.GetIgnoreHashFragments();
    checkBoxFollowCanonicalLinks.Checked = MacroscopePreferencesManager.GetFollowCanonicalLinks();
    checkBoxFollowAlternateLinks.Checked = MacroscopePreferencesManager.GetFollowAlternateLinks();
    checkBoxFollowHrefLangLinks.Checked = MacroscopePreferencesManager.GetFollowHrefLangLinks();
    checkBoxDowncaseLinks.Checked = MacroscopePreferencesManager.GetDowncaseLinks();

    checkBoxFetchStylesheets.Checked = MacroscopePreferencesManager.GetFetchStylesheets();
    checkBoxFetchJavascripts.Checked = MacroscopePreferencesManager.GetFetchJavascripts();
    checkBoxFetchImages.Checked = MacroscopePreferencesManager.GetFetchImages();
    checkBoxFetchAudio.Checked = MacroscopePreferencesManager.GetFetchAudio();
    checkBoxFetchVideo.Checked = MacroscopePreferencesManager.GetFetchVideo();
    checkBoxFetchXml.Checked = MacroscopePreferencesManager.GetFetchXml();
    checkBoxFetchBinaries.Checked = MacroscopePreferencesManager.GetFetchBinaries();

    checkBoxScanSitesInList.Checked = MacroscopePreferencesManager.GetScanSitesInList();
    checkBoxProbeParentFolderUrls.Checked = MacroscopePreferencesManager.GetProbeParentFolderUrls();
    checkBoxProbeHead404sWithGet.Checked = MacroscopePreferencesManager.GetProbeHead404sWithGet();

    /** Analysis Options ----------------------------------------------- **/

    checkBoxResolveAddresses.Checked = MacroscopePreferencesManager.GetResolveAddresses();
    checkBoxCheckHreflangs.Checked = MacroscopePreferencesManager.GetCheckHreflangs();
    checkBoxDetectLanguage.Checked = MacroscopePreferencesManager.GetDetectLanguage();

    checkBoxProcessStylesheets.Checked = MacroscopePreferencesManager.GetProcessStylesheets();
    checkBoxProcessJavascripts.Checked = MacroscopePreferencesManager.GetProcessJavascripts();
    checkBoxProcessImages.Checked = MacroscopePreferencesManager.GetProcessImages();
    checkBoxProcessPdfs.Checked = MacroscopePreferencesManager.GetProcessPdfs();
    checkBoxProcessAudio.Checked = MacroscopePreferencesManager.GetProcessAudio();
    checkBoxProcessVideo.Checked = MacroscopePreferencesManager.GetProcessVideo();
    checkBoxProcessXml.Checked = MacroscopePreferencesManager.GetProcessXml();
    checkBoxProcessBinaries.Checked = MacroscopePreferencesManager.GetProcessBinaries();

    numericUpDownRedirectChainsMaxHops.Value = MacroscopePreferencesManager.GetRedirectChainsMaxHops();
    checkBoxWarnAboutInsecureLinks.Checked = MacroscopePreferencesManager.GetWarnAboutInsecureLinks();

    checkBoxEnableTextIndexing.Checked = MacroscopePreferencesManager.GetEnableTextIndexing();
    checkBoxCaseSensitiveTextIndexing.Checked = MacroscopePreferencesManager.GetCaseSensitiveTextIndexing();

    checkBoxDisregardHtml5ElementNav.Checked = MacroscopePreferencesManager.GetDisregardHtml5ElementNav();
    checkBoxDisregardHtml5ElementHeader.Checked = MacroscopePreferencesManager.GetDisregardHtml5ElementHeader();
    checkBoxDisregardHtml5ElementFooter.Checked = MacroscopePreferencesManager.GetDisregardHtml5ElementFooter();

    checkBoxDetectQrCodeInImage.Checked = MacroscopePreferencesManager.GetDetectQrCodeInImage();

    /** SEO Options ---------------------------------------------------- **/

    numericUpDownTitleMinLen.Value = MacroscopePreferencesManager.GetTitleMinLen();
    numericUpDownTitleMaxLen.Value = MacroscopePreferencesManager.GetTitleMaxLen();
    numericUpDownTitleMinWords.Value = MacroscopePreferencesManager.GetTitleMinWords();
    numericUpDownTitleMaxWords.Value = MacroscopePreferencesManager.GetTitleMaxWords();
    numericUpDownTitleMaxPixelWidth.Value = MacroscopePreferencesManager.GetTitleMaxPixelWidth();

    numericUpDownDescriptionMinLen.Value = MacroscopePreferencesManager.GetDescriptionMinLen();
    numericUpDownDescriptionMaxLen.Value = MacroscopePreferencesManager.GetDescriptionMaxLen();
    numericUpDownDescriptionMinWords.Value = MacroscopePreferencesManager.GetDescriptionMinWords();
    numericUpDownDescriptionMaxWords.Value = MacroscopePreferencesManager.GetDescriptionMaxWords();

    numericUpDownMaxHeadingDepth.Value = MacroscopePreferencesManager.GetMaxHeadingDepth();

    checkBoxAnalyzeKeywordsInText.Checked = MacroscopePreferencesManager.GetAnalyzeKeywordsInText();

    checkBoxAnalyzeTextReadability.Checked = MacroscopePreferencesManager.GetAnalyzeTextReadability();
    comboBoxAnalyzeTextReadabilityEnglishAlgorithm.SelectedIndex = (int)MacroscopePreferencesManager.GetAnalyzeTextReadabilityEnglishAlgorithm();

    checkBoxEnableLevenshteinDeduplication.Checked = MacroscopePreferencesManager.GetEnableLevenshteinDeduplication();
    comboBoxLevenshteinAnalysisLevel.SelectedIndex = (int)MacroscopePreferencesManager.GetLevenshteinAnalysisLevel();
    numericUpDownMaxLevenshteinSizeDifference.Value = MacroscopePreferencesManager.GetMaxLevenshteinSizeDifference();
    numericUpDownMaxLevenshteinDistance.Value = MacroscopePreferencesManager.GetMaxLevenshteinDistance();

    checkBoxAnalyzeClickPaths.Checked = MacroscopePreferencesManager.GetAnalyzeClickPaths();

    // TODO: Finish implementing click path analysis.
    // Until then the page navigation analysis group is only shown in DEBUG builds.
#if DEBUG
    groupBoxPageNavigationAnalysis.Visible = true;
#else
    groupBoxPageNavigationAnalysis.Visible = false;
#endif

    /** Custom Filter Options ------------------------------------------ **/

    checkBoxCustomFiltersEnable.Checked = MacroscopePreferencesManager.GetCustomFiltersEnable();
    numericUpDownCustomFiltersMaxItems.Value = MacroscopePreferencesManager.GetCustomFiltersMaxItems();

    checkBoxCustomFiltersApplyToHtml.Checked = MacroscopePreferencesManager.GetCustomFiltersApplyToHtml();
    checkBoxCustomFiltersApplyToCss.Checked = MacroscopePreferencesManager.GetCustomFiltersApplyToCss();
    checkBoxCustomFiltersApplyToJavascripts.Checked = MacroscopePreferencesManager.GetCustomFiltersApplyToJavascripts();
    checkBoxCustomFiltersApplyToText.Checked = MacroscopePreferencesManager.GetCustomFiltersApplyToText();
    checkBoxCustomFiltersApplyToXml.Checked = MacroscopePreferencesManager.GetCustomFiltersApplyToXml();

    /** Extractor Options ---------------------------------------------- **/

    checkBoxDataExtractorsEnable.Checked = MacroscopePreferencesManager.GetDataExtractorsEnable();
    checkBoxDataExtractorsCleanWhiteSpace.Checked = MacroscopePreferencesManager.GetDataExtractorsCleanWhiteSpace();

    numericUpDownDataExtractorsMaxItemsCssSelectors.Value = MacroscopePreferencesManager.GetDataExtractorsMaxItemsCssSelectors();
    numericUpDownDataExtractorsMaxItemsRegexes.Value = MacroscopePreferencesManager.GetDataExtractorsMaxItemsRegexes();
    numericUpDownDataExtractorsMaxItemsXpaths.Value = MacroscopePreferencesManager.GetDataExtractorsMaxItemsXpaths();

    checkBoxDataExtractorsApplyToHtml.Checked = MacroscopePreferencesManager.GetDataExtractorsApplyToHtml();
    checkBoxDataExtractorsApplyToCss.Checked = MacroscopePreferencesManager.GetDataExtractorsApplyToCss();
    checkBoxDataExtractorsApplyToJavascripts.Checked = MacroscopePreferencesManager.GetDataExtractorsApplyToJavascripts();
    checkBoxDataExtractorsApplyToText.Checked = MacroscopePreferencesManager.GetDataExtractorsApplyToText();
    checkBoxDataExtractorsApplyToPdf.Checked = MacroscopePreferencesManager.GetDataExtractorsApplyToPdf();
    checkBoxDataExtractorsApplyToXml.Checked = MacroscopePreferencesManager.GetDataExtractorsApplyToXml();

    /** Export Options ------------------------------------------------- **/

    checkBoxSitemapIncludeLinkedPdfs.Checked = MacroscopePreferencesManager.GetSitemapIncludeLinkedPdfs();

    /** Ignore Errors Settings ----------------------------------------- **/

    checkBoxIgnoreErrors410.Checked = MacroscopePreferencesManager.GetIgnoreErrors410();
    checkBoxIgnoreErrors451.Checked = MacroscopePreferencesManager.GetIgnoreErrors451();
}
/** -------------------------------------------------------------------- **/
/// <summary>
/// Requests this document's URI through the job master's HTTP client and
/// processes the response body as JavaScript text: records content length
/// and checksum, runs the custom filters and data extractors when they are
/// enabled for JavaScript, and derives the title from the last URL path
/// segment.
/// </summary>
/// <remarks>
/// A failed request is recorded as a remark and a BadRequest status, and the
/// exception message is passed to <c>ProcessErrorCondition</c> at the end.
/// </remarks>
private async Task _ProcessJavascriptPage()
{
    const string remarkName = "_ProcessJavascriptPage";

    MacroscopeHttpTwoClient httpClient = this.DocCollection.GetJobMaster().GetHttpClient();
    MacroscopeHttpTwoClientResponse clientResponse = null;
    string errorCondition = null;

    try
    {
        clientResponse = await httpClient.Get(
            this.GetUri(),
            this.ConfigureJavascriptPageRequestHeadersCallback,
            this.PostProcessRequestHttpHeadersCallback
        );
    }
    catch (Exception ex)
    {
        // Both failure kinds are handled identically; only the debug label differs.
        string debugFormat = (ex is MacroscopeDocumentException)
            ? "_ProcessJavascriptPage :: MacroscopeDocumentException: {0}"
            : "_ProcessJavascriptPage :: Exception: {0}";

        this.DebugMsg(string.Format(debugFormat, ex.Message));
        errorCondition = ex.Message;
        this.SetStatusCode(HttpStatusCode.BadRequest);
        this.AddRemark(remarkName, ex.Message);
    }

    if (clientResponse != null)
    {
        string scriptSource = "";

        this.ProcessResponseHttpHeaders(Response: clientResponse);

        /** Get Response Body ------------------------------------------ **/

        try
        {
            DebugMsg(string.Format("MIME TYPE: {0}", this.MimeType));

            scriptSource = clientResponse.GetContentAsString();

            // Length in characters, not bytes; may need to find bytes length.
            this.SetContentLength(Length: scriptSource.Length);

            // NOTE(review): unlike _ProcessXmlPage and _ProcessTextPage, this
            // method does not call SetWasDownloaded(true) - confirm that is intended.
            this.SetChecksum(scriptSource);
        }
        catch (Exception ex)
        {
            DebugMsg(string.Format("Exception: {0}", ex.Message));
            this.SetStatusCode(HttpStatusCode.Ambiguous);
            scriptSource = "";
            this.SetContentLength(Length: 0);
        }

        bool hasSource = !string.IsNullOrEmpty(scriptSource);

        /** Custom Filters --------------------------------------------- **/

        if (hasSource
            && MacroscopePreferencesManager.GetCustomFiltersEnable()
            && MacroscopePreferencesManager.GetCustomFiltersApplyToJavascripts())
        {
            MacroscopeCustomFilters customFilter = this.DocCollection.GetJobMaster().GetCustomFilter();

            if ((customFilter != null) && customFilter.IsEnabled())
            {
                this.ProcessGenericCustomFiltered(
                    CustomFilter: customFilter,
                    GenericText: scriptSource
                );
            }
        }

        /** Data Extractors -------------------------------------------- **/

        if (hasSource
            && MacroscopePreferencesManager.GetDataExtractorsEnable()
            && MacroscopePreferencesManager.GetDataExtractorsApplyToJavascripts())
        {
            this.ProcessGenericDataExtractors(GenericText: scriptSource);
        }

        /** Title ------------------------------------------------------ **/

        // The pattern is anchored at the end of the URL, so a single match
        // yields the final non-empty path segment.
        Match lastSegment = Regex.Match(this.DocUrl, "/([^/]+)$");

        if (lastSegment.Success)
        {
            this.SetTitle(lastSegment.Groups[1].Value, MacroscopeConstants.TextProcessingMode.NO_PROCESSING);
            DebugMsg(string.Format("TITLE: {0}", this.GetTitle()));
        }
        else
        {
            DebugMsg(string.Format("TITLE: {0}", "MISSING"));
        }
    }

    if (errorCondition != null)
    {
        this.ProcessErrorCondition(errorCondition);
    }
}
/** -------------------------------------------------------------------- **/
/// <summary>
/// Requests this document's URI through the job master's HTTP client and
/// processes the response body as XML: records content length and checksum,
/// parses the body into an <see cref="XmlDocument"/>, runs the custom
/// filters and data extractors when they are enabled for XML, detects
/// sitemap documents, and stores the raw text as the document text.
/// </summary>
/// <remarks>
/// A failed request is recorded as a remark and a BadRequest status, and the
/// exception message is passed to <c>ProcessErrorCondition</c> at the end.
/// XML parse failures are only logged; the raw text is still processed.
/// </remarks>
private async Task _ProcessXmlPage()
{
    XmlDocument XmlDoc = null;
    MacroscopeHttpTwoClient Client = this.DocCollection.GetJobMaster().GetHttpClient();
    MacroscopeHttpTwoClientResponse Response = null;
    string ResponseErrorCondition = null;

    try
    {
        Response = await Client.Get(
            this.GetUri(),
            this.ConfigureXmlPageRequestHeadersCallback,
            this.PostProcessRequestHttpHeadersCallback
        );
    }
    catch (MacroscopeDocumentException ex)
    {
        this.DebugMsg(string.Format("_ProcessXmlPage :: MacroscopeDocumentException: {0}", ex.Message));
        ResponseErrorCondition = ex.Message;
        this.SetStatusCode(HttpStatusCode.BadRequest);
        this.AddRemark("_ProcessXmlPage", ex.Message);
    }
    catch (Exception ex)
    {
        this.DebugMsg(string.Format("_ProcessXmlPage :: Exception: {0}", ex.Message));
        ResponseErrorCondition = ex.Message;
        this.SetStatusCode(HttpStatusCode.BadRequest);
        this.AddRemark("_ProcessXmlPage", ex.Message);
    }

    if (Response != null)
    {
        string RawData = "";

        this.ProcessResponseHttpHeaders(Response: Response);

        /** Get Response Body ------------------------------------------ **/

        try
        {
            DebugMsg(string.Format("MIME TYPE: {0}", this.MimeType));
            RawData = Response.GetContentAsString();
            this.SetContentLength(Length: RawData.Length); // May need to find bytes length
            this.SetWasDownloaded(true);
            this.SetChecksum(RawData);
        }
        catch (Exception ex)
        {
            DebugMsg(string.Format("Exception: {0}", ex.Message));
            this.SetStatusCode(HttpStatusCode.BadRequest);
            RawData = "";
            this.SetContentLength(Length: 0);
        }

        /** Parse XML -------------------------------------------------- **/

        if (!string.IsNullOrEmpty(RawData))
        {
            XmlDoc = new XmlDocument();

            // The body comes from a remote server and is untrusted: never let
            // the parser fetch external DTDs or entities it references.
            XmlDoc.XmlResolver = null;

            try
            {
                XmlDoc.LoadXml(RawData);
            }
            catch (XmlException ex)
            {
                DebugMsg(string.Format("XmlException: {0}", ex.Message));
            }
            catch (Exception ex)
            {
                DebugMsg(string.Format("Exception: {0}", ex.Message));
            }

            DebugMsg(string.Format("XmlDoc: {0}", XmlDoc));
        }
        else
        {
            DebugMsg(string.Format("RawData: {0}", "EMPTY"));
        }

        /** Custom Filters --------------------------------------------- **/

        if (!string.IsNullOrEmpty(RawData))
        {
            if (
                MacroscopePreferencesManager.GetCustomFiltersEnable()
                && MacroscopePreferencesManager.GetCustomFiltersApplyToXml())
            {
                MacroscopeCustomFilters CustomFilter = this.DocCollection.GetJobMaster().GetCustomFilter();

                if ((CustomFilter != null) && (CustomFilter.IsEnabled()))
                {
                    this.ProcessGenericCustomFiltered(
                        CustomFilter: CustomFilter,
                        GenericText: RawData
                    );
                }
            }
        }

        /** Data Extractors -------------------------------------------- **/

        if (!string.IsNullOrEmpty(RawData))
        {
            if (
                MacroscopePreferencesManager.GetDataExtractorsEnable()
                && MacroscopePreferencesManager.GetDataExtractorsApplyToXml())
            {
                this.ProcessGenericDataExtractors(GenericText: RawData);
            }
        }

        /** Sitemap Detection ------------------------------------------ **/

        // DocumentElement stays null when LoadXml failed above, so a broken
        // document is never inspected as a sitemap.
        if ((XmlDoc != null) && (XmlDoc.DocumentElement != null))
        {
            if (this.DetectSitemapXmlDocument(XmlDoc))
            {
                DebugMsg(string.Format("ProcessXmlPage: {0} :: {1}", "SITEMAP DETECTED", this.GetUrl()));
                this.SetDocumentType(Type: MacroscopeConstants.DocumentType.SITEMAPXML);
                this.ProcessSitemapXmlOutlinks(XmlDoc);
            }
        }

        /** Document Text ---------------------------------------------- **/

        if (RawData != null)
        {
            this.SetDocumentText(Text: RawData);
        }
    }

    if (ResponseErrorCondition != null)
    {
        this.ProcessErrorCondition(ResponseErrorCondition);
    }
}
/**************************************************************************/
/// <summary>
/// Synchronously fetches <c>DocUrl</c> with an <see cref="HttpWebRequest"/>
/// and processes the response body as JavaScript text: records content
/// length and checksum, runs the custom filters and data extractors when
/// they are enabled for JavaScript, and derives the title from the last URL
/// path segment.
/// </summary>
/// <remarks>
/// The response is closed in a <c>finally</c> block, and the response stream
/// and reader are disposed, so the connection is released even when one of
/// the processing steps throws.
/// </remarks>
private void ProcessJavascriptPage()
{
    HttpWebRequest req = null;
    HttpWebResponse res = null;
    string ResponseErrorCondition = null;
    Boolean IsAuthenticating = false;

    try
    {
        req = WebRequest.CreateHttp(this.DocUrl);
        req.Method = "GET";
        req.Timeout = this.Timeout;
        req.KeepAlive = false;
        req.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

        this.PrepareRequestHttpHeaders(req: req);

        IsAuthenticating = this.AuthenticateRequest(req);

        MacroscopePreferencesManager.EnableHttpProxy(req);

        res = (HttpWebResponse)req.GetResponse();
    }
    catch (UriFormatException ex)
    {
        DebugMsg(string.Format("ProcessJavascriptPage :: UriFormatException: {0}", ex.Message));
        ResponseErrorCondition = ex.Message;
    }
    catch (TimeoutException ex)
    {
        DebugMsg(string.Format("ProcessJavascriptPage :: TimeoutException: {0}", ex.Message));
        ResponseErrorCondition = ex.Message;
    }
    catch (WebException ex)
    {
        DebugMsg(string.Format("ProcessJavascriptPage :: WebException: {0}", ex.Message));
        DebugMsg(string.Format("ProcessJavascriptPage :: WebException: {0}", ex.Status));
        DebugMsg(string.Format("ProcessJavascriptPage :: WebException: {0}", (int)ex.Status));
        ResponseErrorCondition = ex.Status.ToString();
    }

    if (res != null)
    {
        try
        {
            string RawData = "";

            this.ProcessResponseHttpHeaders(req, res);

            if (IsAuthenticating)
            {
                this.VerifyOrPurgeCredential();
            }

            /** Get Response Body -------------------------------------- **/

            try
            {
                DebugMsg(string.Format("MIME TYPE: {0}", this.MimeType));

                // Prefer the encoding already determined for this document;
                // otherwise fall back to sniffing.
                Encoding encUseEncoding = this.GetCharacterEncoding();

                if (encUseEncoding == null)
                {
                    encUseEncoding = this.JavascriptSniffCharset();
                }

                using (Stream ResponseStream = res.GetResponseStream())
                using (StreamReader ResponseStreamReader = new StreamReader(ResponseStream, encUseEncoding))
                {
                    RawData = ResponseStreamReader.ReadToEnd();
                }

                this.ContentLength = RawData.Length; // May need to find bytes length
                this.SetChecksum(RawData);
            }
            catch (WebException ex)
            {
                DebugMsg(string.Format("WebException: {0}", ex.Message));

                if (ex.Response != null)
                {
                    this.SetStatusCode(((HttpWebResponse)ex.Response).StatusCode);
                }
                else
                {
                    // NOTE(review): this casts a WebExceptionStatus value to
                    // HttpStatusCode; the two enums are unrelated, so the
                    // resulting status is not a real HTTP code. Kept as-is to
                    // preserve existing behaviour - confirm what is intended.
                    this.SetStatusCode((HttpStatusCode)ex.Status);
                }

                RawData = "";
                this.ContentLength = 0;
            }
            catch (Exception ex)
            {
                DebugMsg(string.Format("Exception: {0}", ex.Message));
                this.SetStatusCode(HttpStatusCode.BadRequest);
                RawData = "";
                this.ContentLength = 0;
            }

            /** Custom Filters ----------------------------------------- **/

            if (!string.IsNullOrEmpty(RawData))
            {
                if (
                    MacroscopePreferencesManager.GetCustomFiltersEnable()
                    && MacroscopePreferencesManager.GetCustomFiltersApplyToJavascripts())
                {
                    MacroscopeCustomFilters CustomFilter = this.DocCollection.GetJobMaster().GetCustomFilter();

                    if ((CustomFilter != null) && (CustomFilter.IsEnabled()))
                    {
                        this.ProcessGenericCustomFiltered(
                            CustomFilter: CustomFilter,
                            GenericText: RawData
                        );
                    }
                }
            }

            /** Data Extractors ---------------------------------------- **/

            if (!string.IsNullOrEmpty(RawData))
            {
                if (
                    MacroscopePreferencesManager.GetDataExtractorsEnable()
                    && MacroscopePreferencesManager.GetDataExtractorsApplyToJavascripts())
                {
                    this.ProcessGenericDataExtractors(GenericText: RawData);
                }
            }

            /** Title -------------------------------------------------- **/

            {
                // Use the last non-empty path segment of the URL as the title.
                MatchCollection reMatches = Regex.Matches(this.DocUrl, "/([^/]+)$");
                string DocumentTitle = null;

                foreach (Match match in reMatches)
                {
                    if (match.Groups[1].Value.Length > 0)
                    {
                        DocumentTitle = match.Groups[1].Value;
                        break;
                    }
                }

                if (DocumentTitle != null)
                {
                    this.SetTitle(DocumentTitle, MacroscopeConstants.TextProcessingMode.NO_PROCESSING);
                    DebugMsg(string.Format("TITLE: {0}", this.GetTitle()));
                }
                else
                {
                    DebugMsg(string.Format("TITLE: {0}", "MISSING"));
                }
            }
        }
        finally
        {
            // Always release the connection, even if processing above threw.
            res.Close();
            res.Dispose();
        }
    }

    if (ResponseErrorCondition != null)
    {
        this.ProcessErrorCondition(ResponseErrorCondition);
    }
}
/** -------------------------------------------------------------------- **/
/// <summary>
/// Requests this document's URI through the job master's HTTP client and
/// processes the response body as a PDF: records content length, extracts
/// title, author, description, keywords and body text via
/// <c>MacroscopePdfTools</c>, runs the data extractors when enabled for PDF,
/// and collects out-links from the text and from the link annotations.
/// </summary>
/// <remarks>
/// A failed request is recorded as a remark, and the exception message is
/// passed to <c>ProcessErrorCondition</c> at the end. If the body cannot be
/// read or parsed, the status is set to BadRequest and all PDF-specific
/// steps are skipped.
/// </remarks>
private async Task _ProcessPdfPage()
{
    MacroscopeHttpTwoClient Client = this.DocCollection.GetJobMaster().GetHttpClient();
    MacroscopeHttpTwoClientResponse ClientResponse = null;
    string ResponseErrorCondition = null;

    try
    {
        ClientResponse = await Client.Get(
            this.GetUri(),
            this.ConfigurePdfPageRequestHeadersCallback,
            this.PostProcessRequestHttpHeadersCallback
        );
    }
    catch (MacroscopeDocumentException ex)
    {
        this.DebugMsg(string.Format("_ProcessPdfPage :: MacroscopeDocumentException: {0}", ex.Message));
        ResponseErrorCondition = ex.Message;
        this.AddRemark("_ProcessPdfPage", ex.Message);
    }
    catch (Exception ex)
    {
        this.DebugMsg(string.Format("_ProcessPdfPage :: Exception: {0}", ex.Message));
        ResponseErrorCondition = ex.Message;
        this.AddRemark("_ProcessPdfPage", ex.Message);
    }

    if (ClientResponse != null)
    {
        MacroscopePdfTools PdfTools = null;

        this.ProcessResponseHttpHeaders(Response: ClientResponse);

        /** Locale ----------------------------------------------------- **/

        // TODO: Implement locale probing; a fixed "x-default" is used for now.
        this.Locale = "x-default";
        this.SetHreflang(HrefLangLocale: this.Locale, Url: this.DocUrl);

        /** Canonical -------------------------------------------------- **/

        this.Canonical = this.DocUrl;
        this.DebugMsg(string.Format("CANONICAL: {0}", this.Canonical));

        /** Get Response Body ------------------------------------------ **/

        try
        {
            byte[] RawData = ClientResponse.GetContentAsBytes();

            this.SetContentLength(Length: RawData.Length);

            PdfTools = new MacroscopePdfTools(PdfData: RawData);

            if (PdfTools.GetHasError())
            {
                this.AddRemark("CORRUPT_PDF", Observation: PdfTools.GetErrorMessage());
            }

            this.SetWasDownloaded(true);
        }
        catch (Exception ex)
        {
            this.DebugMsg(string.Format("Exception: {0}", ex.Message));
            this.SetStatusCode(HttpStatusCode.BadRequest);
            PdfTools = null;
            this.SetContentLength(Length: 0);
        }

        if (PdfTools != null)
        {
            /** Title -------------------------------------------------- **/

            string TitleText = PdfTools.GetTitle();

            if (!string.IsNullOrEmpty(TitleText))
            {
                this.SetTitle(TitleText, MacroscopeConstants.TextProcessingMode.NO_PROCESSING);
                this.DebugMsg(string.Format("TITLE: {0}", this.GetTitle()));
            }
            else
            {
                this.DebugMsg(string.Format("TITLE: {0}", "MISSING"));
            }

            /** Author ------------------------------------------------- **/

            string AuthorText = PdfTools.GetAuthor();

            if (!string.IsNullOrEmpty(AuthorText))
            {
                this.SetAuthor(AuthorText: AuthorText, ProcessingMode: MacroscopeConstants.TextProcessingMode.NO_PROCESSING);
                this.DebugMsg(string.Format("AUTHOR: {0}", this.GetAuthor()));
            }
            else
            {
                this.DebugMsg(string.Format("AUTHOR: {0}", "MISSING"));
            }

            /** Description -------------------------------------------- **/

            string DescriptionText = PdfTools.GetDescription();

            if (!string.IsNullOrEmpty(DescriptionText))
            {
                this.SetDescription(DescriptionText, MacroscopeConstants.TextProcessingMode.NO_PROCESSING);
                this.DebugMsg(string.Format("DESCRIPTION: {0}", this.GetDescription()));
            }
            else
            {
                this.DebugMsg(string.Format("DESCRIPTION: {0}", "MISSING"));
            }

            /** Metadata Keywords -------------------------------------- **/

            string KeywordsText = PdfTools.GetKeywords();

            if (!string.IsNullOrEmpty(KeywordsText))
            {
                this.SetKeywords(KeywordsText: KeywordsText);
                this.DebugMsg(string.Format("KEYWORDS: {0}", this.GetKeywords()));
            }
            else
            {
                this.DebugMsg(string.Format("KEYWORDS: {0}", "MISSING"));
            }

            /** Body Text ---------------------------------------------- **/

            this.SetBodyText(Text: "");

            if (PdfTools.GetHasError())
            {
                this.AddRemark("PDF_ERROR", Observation: PdfTools.GetErrorMessage());
            }
            else
            {
                string BodyText = PdfTools.GetTextAsString();

                if (!string.IsNullOrEmpty(BodyText))
                {
                    this.SetDocumentText(Text: BodyText);
                    this.SetBodyText(Text: BodyText);
                }
            }

            this.DebugMsg(string.Format("BODY TEXT: {0}", this.GetBodyTextRaw()));
        }

        /** Data Extractors -------------------------------------------- **/

        if (!string.IsNullOrEmpty(this.GetBodyTextRaw()))
        {
            if (MacroscopePreferencesManager.GetDataExtractorsEnable())
            {
                if (MacroscopePreferencesManager.GetDataExtractorsApplyToPdf())
                {
                    string Text = this.GetBodyTextRaw();
                    this.ProcessGenericDataExtractors(GenericText: Text);
                }
            }
        }

        /** Out Links Text --------------------------------------------- **/

        if (this.GetDocumentTextRawLength() > 0)
        {
            if (this.GetIsInternal())
            {
                string Text = this.GetDocumentTextRaw();
                this.ProcessPureTextOutlinks(TextDoc: Text, LinkType: MacroscopeConstants.InOutLinkType.PDF);
            }
        }

        /** Out Links in Annotations ----------------------------------- **/

        // PdfTools is null when the body could not be read or parsed, so it
        // must be checked before the annotations are requested.
        if ((PdfTools != null) && this.GetIsInternal() && (this.GetDocumentTextRawLength() > 0))
        {
            List<KeyValuePair<string, string>> AnnotationOutLinks = PdfTools.GetOutLinks();

            // TODO: Implement extraction of text that underlies the link annotation

            if (AnnotationOutLinks != null)
            {
                foreach (KeyValuePair<string, string> AnnotationOutLinkPair in AnnotationOutLinks)
                {
                    MacroscopeHyperlinkOut HyperlinkOut = null;
                    string AnnotationOutLinkUrlAbs;

                    AnnotationOutLinkUrlAbs = MacroscopeHttpUrlUtils.MakeUrlAbsolute(
                        BaseHref: this.BaseHref,
                        BaseUrl: this.DocUrl,
                        Url: AnnotationOutLinkPair.Key
                    );

                    HyperlinkOut = this.HyperlinksOut.Add(LinkType: MacroscopeConstants.HyperlinkType.PDF, UrlTarget: AnnotationOutLinkUrlAbs);

                    // The pair's value is used for alt text, anchor text and title alike.
                    HyperlinkOut.SetRawTargetUrl(TargetUrl: AnnotationOutLinkUrlAbs);
                    HyperlinkOut.SetAltText(AnnotationOutLinkPair.Value);
                    HyperlinkOut.SetAnchorText(AnnotationOutLinkPair.Value);
                    HyperlinkOut.SetTitle(AnnotationOutLinkPair.Value);
                    HyperlinkOut.SetDoFollow();
                    HyperlinkOut.SetMethod(Method: "GET");

                    this.AddDocumentOutlink(AbsoluteUrl: AnnotationOutLinkUrlAbs, LinkType: MacroscopeConstants.InOutLinkType.PDF, Follow: true);
                }
            }
        }
    }

    if (ResponseErrorCondition != null)
    {
        this.ProcessErrorCondition(ResponseErrorCondition);
    }
}
/** -------------------------------------------------------------------- **/
/// <summary>
/// Requests this document's URI through the job master's HTTP client and
/// processes the response body as plain text: records content length and
/// checksum, splits the body into lines, runs the custom filters and data
/// extractors when they are enabled for text, and then handles the lines
/// either as a robots.txt file or as a pure text document.
/// </summary>
/// <remarks>
/// A failed request is recorded as a remark and a BadRequest status, and the
/// exception message is passed to <c>ProcessErrorCondition</c> at the end.
/// </remarks>
private async Task _ProcessTextPage()
{
    const string remarkName = "_ProcessTextPage";

    List<string> textLines = new List<string>();
    MacroscopeHttpTwoClient httpClient = this.DocCollection.GetJobMaster().GetHttpClient();
    MacroscopeHttpTwoClientResponse clientResponse = null;
    string errorCondition = null;

    try
    {
        clientResponse = await httpClient.Get(
            this.GetUri(),
            this.ConfigureTextPageRequestHeadersCallback,
            this.PostProcessRequestHttpHeadersCallback
        );
    }
    catch (Exception ex)
    {
        // Both failure kinds are handled identically; only the debug label differs.
        string debugFormat = (ex is MacroscopeDocumentException)
            ? "_ProcessTextPage :: MacroscopeDocumentException: {0}"
            : "_ProcessTextPage :: Exception: {0}";

        this.DebugMsg(string.Format(debugFormat, ex.Message));
        errorCondition = ex.Message;
        this.SetStatusCode(HttpStatusCode.BadRequest);
        this.AddRemark(remarkName, ex.Message);
    }

    if (clientResponse != null)
    {
        string bodyText = "";

        this.ProcessResponseHttpHeaders(Response: clientResponse);

        /** Get Response Body ------------------------------------------ **/

        try
        {
            DebugMsg(string.Format("MIME TYPE: {0}", this.MimeType));

            bodyText = clientResponse.GetContentAsString();

            // Length in characters, not bytes; may need to find bytes length.
            this.SetContentLength(Length: bodyText.Length);
            this.SetWasDownloaded(true);
            this.SetChecksum(bodyText);
        }
        catch (Exception ex)
        {
            DebugMsg(string.Format("Exception: {0}", ex.Message));
            this.SetStatusCode(HttpStatusCode.BadRequest);
            bodyText = "";
            this.SetContentLength(Length: 0);
        }

        bool hasBody = !string.IsNullOrEmpty(bodyText);

        /** Split Into Lines ------------------------------------------- **/

        if (hasBody)
        {
            // Runs of CR/LF are treated as one separator, so blank lines are dropped.
            textLines = new List<string>(Regex.Split(bodyText, @"[\r\n]+"));
            DebugMsg(string.Format("TextDoc: {0}", textLines.Count));
        }
        else
        {
            DebugMsg(string.Format("RawData: {0}", "EMPTY"));
        }

        /** Custom Filters --------------------------------------------- **/

        if (hasBody
            && MacroscopePreferencesManager.GetCustomFiltersEnable()
            && MacroscopePreferencesManager.GetCustomFiltersApplyToText())
        {
            MacroscopeCustomFilters customFilter = this.DocCollection.GetJobMaster().GetCustomFilter();

            if ((customFilter != null) && customFilter.IsEnabled())
            {
                this.ProcessGenericCustomFiltered(
                    CustomFilter: customFilter,
                    GenericText: bodyText
                );
            }
        }

        /** Data Extractors -------------------------------------------- **/

        if (hasBody
            && MacroscopePreferencesManager.GetDataExtractorsEnable()
            && MacroscopePreferencesManager.GetDataExtractorsApplyToText())
        {
            this.ProcessGenericDataExtractors(GenericText: bodyText);
        }

        /** Process Text Document -------------------------------------- **/

        if (textLines.Count == 0)
        {
            this.SetDocumentText(Text: "");
        }
        else
        {
            this.SetDocumentText(Text: string.Join(Environment.NewLine, textLines));

            bool isRobotsFile = this.GetPath().EndsWith("robots.txt", StringComparison.InvariantCultureIgnoreCase);

            if (isRobotsFile)
            {
                // The size is read before the outlinks are processed, as before.
                long? contentSize = this.GetContentLength();
                long? robotsSizeLimit = 1024 * 512;

                this.ProcessRobotsTextOutlinks(TextDoc: textLines);

                if (this.DetectSitemapTextDocument(TextDoc: textLines))
                {
                    DebugMsg(string.Format("ProcessTextPage: {0} :: {1}", "SITEMAP DETECTED", this.GetUrl()));
                    this.SetDocumentType(Type: MacroscopeConstants.DocumentType.SITEMAPTEXT);
                    this.ProcessSitemapTextOutlinks(TextDoc: textLines);
                }

                if (contentSize > robotsSizeLimit)
                {
                    this.AddRemark("ROBOTS_TOO_BIG", "Robots.txt is larger than 512KB");
                }
            }
            else if (this.GetIsInternal())
            {
                this.ProcessPureTextOutlinks(TextDoc: textLines, LinkType: MacroscopeConstants.InOutLinkType.PURETEXT);
            }
        }
    }

    if (errorCondition != null)
    {
        this.ProcessErrorCondition(errorCondition);
    }
}
/**************************************************************************/
/// <summary>
/// Synchronously fetches <c>DocUrl</c> with an <see cref="HttpWebRequest"/>
/// and processes the response body as plain text: records content length
/// and checksum, splits the body into lines, runs the custom filters and
/// data extractors when they are enabled for text, and processes robots.txt
/// and text sitemap out-links.
/// </summary>
/// <remarks>
/// The response is closed in a <c>finally</c> block, and the response stream
/// and reader are disposed, so the connection is released even when one of
/// the processing steps throws.
/// </remarks>
private void ProcessTextPage()
{
    List<string> TextDoc = new List<string>();
    HttpWebRequest req = null;
    HttpWebResponse res = null;
    string ResponseErrorCondition = null;
    Boolean IsAuthenticating = false;

    try
    {
        req = WebRequest.CreateHttp(this.DocUrl);
        req.Method = "GET";
        req.Timeout = this.Timeout;
        req.KeepAlive = false;
        req.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

        this.PrepareRequestHttpHeaders(req: req);

        IsAuthenticating = this.AuthenticateRequest(req);

        MacroscopePreferencesManager.EnableHttpProxy(req);

        res = (HttpWebResponse)req.GetResponse();
    }
    catch (UriFormatException ex)
    {
        DebugMsg(string.Format("ProcessTextPage :: UriFormatException: {0}", ex.Message));
        ResponseErrorCondition = ex.Message;
    }
    catch (WebException ex)
    {
        DebugMsg(string.Format("ProcessTextPage :: WebException: {0}", ex.Message));
        DebugMsg(string.Format("ProcessTextPage :: WebException: {0}", this.DocUrl));
        DebugMsg(string.Format("ProcessTextPage :: WebExceptionStatus: {0}", ex.Status));
        ResponseErrorCondition = ex.Status.ToString();
    }

    if (res != null)
    {
        try
        {
            string RawData = "";

            this.ProcessResponseHttpHeaders(req, res);

            if (IsAuthenticating)
            {
                this.VerifyOrPurgeCredential();
            }

            /** Get Response Body -------------------------------------- **/

            try
            {
                DebugMsg(string.Format("MIME TYPE: {0}", this.MimeType));

                // Assume UTF-8
                using (Stream ResponseStream = res.GetResponseStream())
                using (StreamReader ResponseStreamReader = new StreamReader(ResponseStream, Encoding.UTF8))
                {
                    RawData = ResponseStreamReader.ReadToEnd();
                }

                this.ContentLength = RawData.Length; // May need to find bytes length
                this.SetWasDownloaded(true);
                this.SetChecksum(RawData);
            }
            catch (WebException ex)
            {
                DebugMsg(string.Format("WebException: {0}", ex.Message));

                if (ex.Response != null)
                {
                    this.SetStatusCode(((HttpWebResponse)ex.Response).StatusCode);
                }
                else
                {
                    // NOTE(review): this casts a WebExceptionStatus value to
                    // HttpStatusCode; the two enums are unrelated, so the
                    // resulting status is not a real HTTP code. Kept as-is to
                    // preserve existing behaviour - confirm what is intended.
                    this.SetStatusCode((HttpStatusCode)ex.Status);
                }

                RawData = "";
                this.ContentLength = 0;
            }
            catch (Exception ex)
            {
                DebugMsg(string.Format("Exception: {0}", ex.Message));
                this.SetStatusCode(HttpStatusCode.BadRequest);
                RawData = "";
                this.ContentLength = 0;
            }

            /** Split Into Lines --------------------------------------- **/

            if (!string.IsNullOrEmpty(RawData))
            {
                // Runs of CR/LF are treated as one separator, so blank lines are dropped.
                string[] Lines = Regex.Split(RawData, @"[\r\n]+");
                TextDoc = Lines.ToList();
                DebugMsg(string.Format("TextDoc: {0}", TextDoc.Count));
            }
            else
            {
                DebugMsg(string.Format("RawData: {0}", "EMPTY"));
            }

            /** Custom Filters ----------------------------------------- **/

            if (!string.IsNullOrEmpty(RawData))
            {
                if (
                    MacroscopePreferencesManager.GetCustomFiltersEnable()
                    && MacroscopePreferencesManager.GetCustomFiltersApplyToText())
                {
                    MacroscopeCustomFilters CustomFilter = this.DocCollection.GetJobMaster().GetCustomFilter();

                    if ((CustomFilter != null) && (CustomFilter.IsEnabled()))
                    {
                        this.ProcessGenericCustomFiltered(
                            CustomFilter: CustomFilter,
                            GenericText: RawData
                        );
                    }
                }
            }

            /** Data Extractors ---------------------------------------- **/

            if (!string.IsNullOrEmpty(RawData))
            {
                if (
                    MacroscopePreferencesManager.GetDataExtractorsEnable()
                    && MacroscopePreferencesManager.GetDataExtractorsApplyToText())
                {
                    this.ProcessGenericDataExtractors(GenericText: RawData);
                }
            }

            /** Process Text Document ---------------------------------- **/

            if ((TextDoc != null) && (TextDoc.Count > 0))
            {
                if (this.GetPath().EndsWith("robots.txt", StringComparison.InvariantCultureIgnoreCase))
                {
                    this.ProcessRobotsTextOutlinks(TextDoc: TextDoc);
                }

                if (this.DetectSitemapTextDocument(TextDoc: TextDoc))
                {
                    DebugMsg(string.Format("ProcessTextPage: {0} :: {1}", "SITEMAP DETECTED", this.GetUrl()));
                    this.SetIsSitemapText();
                    this.ProcessSitemapTextOutlinks(TextDoc);
                }
            }
        }
        finally
        {
            // Always release the connection, even if processing above threw.
            res.Close();
            res.Dispose();
        }
    }

    if (ResponseErrorCondition != null)
    {
        this.ProcessErrorCondition(ResponseErrorCondition);
    }
}
/** -------------------------------------------------------------------- **/

/// <summary>
/// Fetches this document through the job master's HTTP client and processes
/// the response as a CSS stylesheet.
/// </summary>
/// <remarks>
/// Steps, in order:
/// fetch the URI; process the response headers; read the body as a string;
/// parse it and process its outlinks; run the custom filters and data
/// extractors when the corresponding CSS preferences are enabled; and set
/// the title from the text after the last "/" of <c>this.DocUrl</c>.
/// Any exception raised by the fetch is not rethrown; its message is recorded
/// as a remark and passed to <c>ProcessErrorCondition</c> at the end.
/// </remarks>
/// <returns>A task that completes when processing has finished.</returns>
private async Task _ProcessCssPage()
{
    MacroscopeHttpTwoClient httpClient = this.DocCollection.GetJobMaster().GetHttpClient();
    MacroscopeHttpTwoClientResponse cssResponse = null;
    string errorCondition = null;

    DebugMsg(string.Format("ProcessCssPage: {0}", ""));

    try
    {
        cssResponse = await httpClient.Get(
            this.GetUri(),
            this.ConfigureCssPageRequestHeadersCallback,
            this.PostProcessRequestHttpHeadersCallback
        );
    }
    catch (Exception ex)
    {
        // Both failure kinds are handled the same way; only the debug label differs.
        string debugTemplate = (ex is MacroscopeDocumentException)
            ? "_ProcessCssPage :: MacroscopeDocumentException: {0}"
            : "_ProcessCssPage :: Exception: {0}";

        this.DebugMsg(string.Format(debugTemplate, ex.Message));
        errorCondition = ex.Message;
        this.SetStatusCode(HttpStatusCode.BadRequest);
        this.AddRemark("_ProcessCssPage", ex.Message);
    }

    if (cssResponse != null)
    {
        string cssText = "";

        this.ProcessResponseHttpHeaders(Response: cssResponse);

        /** Get Response Body ---------------------------------------------- **/

        try
        {
            DebugMsg(string.Format("MIME TYPE: {0}", this.MimeType));
            cssText = cssResponse.GetContentAsString();
            this.SetContentLength(Length: cssText.Length); // May need to find bytes length
            this.SetWasDownloaded(true);
        }
        catch (Exception ex)
        {
            DebugMsg(string.Format("Exception: {0}", ex.Message));
            this.SetStatusCode(HttpStatusCode.Ambiguous);
            this.SetContentLength(Length: 0);
        }

        // The body is not modified below, so the emptiness check is made once.
        bool hasCssText = !string.IsNullOrEmpty(cssText);

        /** Parse Stylesheet ----------------------------------------------- **/

        if (!hasCssText)
        {
            DebugMsg(string.Format("ProcessCssPage: ERROR: {0}", this.GetUrl()));
        }
        else
        {
            try
            {
                Stylesheet parsedSheet = new StylesheetParser().Parse(cssText);
                this.ProcessCssOutlinks(CssStylesheet: parsedSheet);
            }
            catch (Exception ex)
            {
                this.DebugMsg(string.Format("ProcessHtmlAttributeCssLinks: {0}", ex.Message));
                this.AddRemark("ProcessHtmlAttributeCssLinks", ex.Message);
            }
        }

        /** Custom Filters ------------------------------------------------- **/

        if (hasCssText
            && MacroscopePreferencesManager.GetCustomFiltersEnable()
            && MacroscopePreferencesManager.GetCustomFiltersApplyToCss())
        {
            MacroscopeCustomFilters activeFilter = this.DocCollection.GetJobMaster().GetCustomFilter();

            if ((activeFilter != null) && activeFilter.IsEnabled())
            {
                this.ProcessGenericCustomFiltered(
                    CustomFilter: activeFilter,
                    GenericText: cssText
                );
            }
        }

        /** Data Extractors ------------------------------------------------ **/

        if (hasCssText
            && MacroscopePreferencesManager.GetDataExtractorsEnable()
            && MacroscopePreferencesManager.GetDataExtractorsApplyToCss())
        {
            this.ProcessGenericDataExtractors(GenericText: cssText);
        }

        /** Title ---------------------------------------------------------- **/

        // The title is whatever follows the final "/" of the document URL.
        Match lastSegment = Regex.Match(this.DocUrl, "/([^/]+)$");
        string titleText = null;

        if (lastSegment.Success && (lastSegment.Groups[1].Value.Length > 0))
        {
            titleText = lastSegment.Groups[1].Value;
        }

        if (titleText == null)
        {
            DebugMsg(string.Format("TITLE: {0}", "MISSING"));
        }
        else
        {
            this.SetTitle(titleText, MacroscopeConstants.TextProcessingMode.NO_PROCESSING);
            DebugMsg(string.Format("TITLE: {0}", this.GetTitle()));
        }
    }

    if (errorCondition != null)
    {
        this.ProcessErrorCondition(errorCondition);
    }
}