/**************************************************************************/

/// <summary>
/// Sends a HEAD request for <paramref name="TargetUri"/> through the HTTP client
/// obtained from <paramref name="JobMaster"/> and reports the MIME type of the reply.
/// </summary>
/// <param name="JobMaster">Job master whose GetHttpClient() supplies the client.</param>
/// <param name="TargetUri">URL to probe.</param>
/// <returns>
/// The string form of the response's MIME type, or null when no response was
/// obtained or an exception was caught (exceptions are logged, never rethrown).
/// </returns>
public static async Task<string> GetMimeTypeOfUrl(MacroscopeJobMaster JobMaster, Uri TargetUri)
{
  MacroscopeHttpTwoClient HeadClient = JobMaster.GetHttpClient();
  string ProbedMimeType = null;

  try
  {
    MacroscopeHttpTwoClientResponse HeadResponse = await HeadClient.Head(
      TargetUri,
      ConfigureHeadRequestHeadersCallback,
      PostProcessRequestHttpHeadersCallback
    );

    if (HeadResponse != null)
    {
      ProbedMimeType = HeadResponse.GetMimeType().ToString();
    }
  }
  catch (Exception ex)
  {
    // Both failure kinds are logged the same way; only the label differs.
    string LogFormat = (ex is MacroscopeDocumentException)
      ? "MacroscopeDocumentException: {0}"
      : "Exception: {0}";

    DebugMsgStatic(string.Format(LogFormat, ex.Message));
    DebugMsgStatic(string.Format(LogFormat, TargetUri.ToString()));
  }

  return ProbedMimeType;
}
// Fetches each URL in a fixed list with a fresh MacroscopeHttpTwoClient and
// asserts that the reply has status 200 and a non-empty body.
public async Task TestHttpTwoClientGet()
{
  MacroscopeHttpTwoClient Client = new MacroscopeHttpTwoClient();
  List<Uri> UrlList = new List<Uri>
  {
    new Uri("https://nazuke.github.io/robots.txt")
  };

  foreach (Uri Url in UrlList)
  {
    this.DebugMsg(string.Format("Url: {0}", Url));

    MacroscopeHttpTwoClientResponse ClientResponse = await Client.Get(
      Url,
      this.PreProcessHeadRequestHeadersCallback,
      this.PostProcessRequestHttpHeadersCallback
    );

    HttpResponseMessage Message = ClientResponse.GetResponse();
    this.DebugMsg(string.Format("Response.Version: {0}", Message.Version));

    int NumericStatus = (int)Message.StatusCode;
    Assert.AreEqual(200, NumericStatus);

    int BodyLength = ClientResponse.GetContentAsString().Length;
    Assert.Greater(BodyLength, 0);
  }
}
/** Cookie Request Header *************************************************/

/// <summary>
/// Copies the cookies held in the shared cookie container for the request's
/// URI into the request's "Cookie" header.
/// </summary>
/// <param name="Request">Outgoing request; its RequestUri selects the cookies.</param>
private void ConfigureRequestHeadersAddCookieHeaders(HttpRequestMessage Request)
{
  CookieContainer BiscuitTin = MacroscopeHttpTwoClient.GetCookieMonster();
  string Biscuit = BiscuitTin.GetCookieHeader(uri: Request.RequestUri);

  // GetCookieHeader yields an empty string when no cookie applies to the URI.
  // Adding that would put an empty "Cookie:" header on the wire, so only add
  // the header when there is something to send.
  if (!string.IsNullOrEmpty(Biscuit))
  {
    Request.Headers.Add(name: "Cookie", value: Biscuit);
  }
}
/**************************************************************************/

/// <summary>
/// Creates a robots handler with debug output suppressed, empty RobotSquad and
/// BadRobots lookup tables, and a newly constructed HTTP client.
/// </summary>
public MacroscopeRobots()
{
  // Both lookup tables start with the same initial capacity.
  const int InitialCapacity = 8;

  SuppressDebugMsg = true;
  RobotSquad = new Dictionary<string, Robots>(InitialCapacity);
  BadRobots = new Dictionary<Uri, bool>(InitialCapacity);
  Client = new MacroscopeHttpTwoClient();
}
/**************************************************************************/

/// <summary>
/// Downloads <paramref name="TargetUri"/> with the job master's HTTP client,
/// stores the body in a temporary file and loads that file as an Image.
/// </summary>
/// <param name="JobMaster">Job master whose GetHttpClient() supplies the client.</param>
/// <param name="TargetUri">URL of the image to fetch.</param>
/// <returns>
/// The loaded image, or null when the request failed, the body was empty, or
/// the data could not be written or decoded (failures are logged, not rethrown).
/// </returns>
public async Task<Image> LoadImageFromUri(MacroscopeJobMaster JobMaster, Uri TargetUri)
{
  MacroscopeHttpTwoClient Client = JobMaster.GetHttpClient();
  MacroscopeHttpTwoClientResponse Response = null;
  Image LoadedImage = null;

  try
  {
    Response = await Client.Get(
      TargetUri,
      this.ConfigureHeadRequestHeadersCallback,
      this.PostProcessRequestHttpHeadersCallback
    );
  }
  catch (MacroscopeDocumentException ex)
  {
    this.DebugMsg(string.Format("MacroscopeDocumentException: {0}", ex.Message));
    this.DebugMsg(string.Format("MacroscopeDocumentException: {0}", TargetUri.ToString()));
  }
  catch (Exception ex)
  {
    this.DebugMsg(string.Format("Exception: {0}", ex.Message));
    this.DebugMsg(string.Format("Exception: {0}", TargetUri.ToString()));
  }

  if (Response != null)
  {
    try
    {
      // Fetch the payload first so that no temporary file is created when
      // there is nothing to store.
      byte[] ByteData = Response.GetContentAsBytes();

      if ((ByteData != null) && (ByteData.Length > 0))
      {
        string ImageFilename = Path.GetTempFileName();

        // Register the file as soon as it exists, so it is still tracked if
        // writing or decoding fails below.
        // NOTE(review): TemporaryFiles is presumably the list used for later
        // clean-up of temp files; confirm against its declaration.
        TemporaryFiles.Add(ImageFilename);

        // Single bulk write instead of one WriteByte call per byte.
        File.WriteAllBytes(ImageFilename, ByteData);

        LoadedImage = Image.FromFile(ImageFilename);
      }
      else
      {
        this.DebugMsg(string.Format("Empty image data: {0}", TargetUri));
      }
    }
    catch (Exception ex)
    {
      this.DebugMsg(string.Format("Exception: {0}", ex.Message));
    }
  }

  return LoadedImage;
}
/** -------------------------------------------------------------------- **/

/// <summary>
/// Issues a HEAD request for this.Url and reports whether it answered 200 OK.
/// When a response message is available its headers are also passed to
/// ProcessResponseHttpHeaders.
/// </summary>
/// <returns>true only when the HEAD response status is HttpStatusCode.OK.</returns>
private async Task<bool> _ExecuteHeadCheck()
{
  bool IsAvailableCheck = false;
  MacroscopeHttpTwoClient Client = this.MsJobMaster.GetHttpClient();
  MacroscopeHttpTwoClientResponse ClientResponse = null;
  Uri DocUri = null;

  try
  {
    DocUri = new Uri(this.Url);
    ClientResponse = await Client.Head(
      DocUri,
      this.ConfigureHeadRequestHeadersCallback,
      this.PostProcessRequestHttpHeadersCallback
    );
  }
  catch (MacroscopeDocumentException ex)
  {
    this.DebugMsg(string.Format("_ExecuteHeadCheck :: MacroscopeDocumentException: {0}", ex.Message));
  }
  catch (Exception ex)
  {
    this.DebugMsg(string.Format("_ExecuteHeadCheck :: Exception: {0}", ex.Message));
  }

  if (ClientResponse != null)
  {
    HttpResponseMessage ResponseMessage = null;

    try
    {
      ResponseMessage = ClientResponse.GetResponse();

      // Check for a missing message BEFORE touching its members; otherwise a
      // NullReferenceException hides the intended "Bad Response" error.
      if (ResponseMessage == null)
      {
        throw new MacroscopeDocumentException("Bad Response in _ExecuteHeadCheck");
      }

      this.DebugMsg(string.Format("StatusCode: {0}", ResponseMessage.StatusCode));

      if (ResponseMessage.StatusCode == HttpStatusCode.OK)
      {
        IsAvailableCheck = true;
      }
    }
    catch (Exception ex)
    {
      this.DebugMsg(string.Format("_ExecuteHeadCheck :: Exception: {0}", ex.Message));
    }

    // ProcessResponseHttpHeaders dereferences the response message
    // immediately, so only call it when there is one.
    if (ResponseMessage != null)
    {
      this.ProcessResponseHttpHeaders(Response: ClientResponse);
    }
  }

  return IsAvailableCheck;
}
// Follows an httpbin.org redirect chain of MaxHops hops and asserts that the
// analyzer returns MaxHops + 1 entries.
public async Task TestAnalyzeRedirectChains()
{
  MacroscopeHttpTwoClient Transport = new MacroscopeHttpTwoClient();
  MacroscopeRedirectChainAnalysis ChainAnalyzer = new MacroscopeRedirectChainAnalysis(Client: Transport);

  // Raise the hop limit before analysing the chain.
  MacroscopePreferencesManager.SetRedirectChainsMaxHops(Max: 100);

  string FirstUrl = string.Format("https://httpbin.org/redirect/{0}", MaxHops);
  string SecondUrl = string.Format("https://httpbin.org/redirect/{0}", MaxHops - 1);

  List<MacroscopeRedirectChainDocStruct> Chain = await ChainAnalyzer.AnalyzeRedirectChains(
    StatusCode: HttpStatusCode.Redirect,
    StartUrl: FirstUrl,
    RedirectUrl: SecondUrl
  );

  this.DebugMsg(string.Format("AnalyzedRedirectChain: {0}", Chain.GetHashCode()));

  Assert.AreEqual(MaxHops + 1, Chain.Count);
}
/** -------------------------------------------------------------------- **/

/// <summary>
/// Fetches <paramref name="TargetUri"/> with the job master's HTTP client and
/// returns the response body as raw bytes.
/// </summary>
/// <param name="JobMaster">Job master whose GetHttpClient() supplies the client.</param>
/// <param name="TargetUri">URL to download.</param>
/// <returns>
/// The body bytes, or null when the request failed or reading the body threw
/// (all failures are logged, none are rethrown).
/// </returns>
private async Task<byte[]> _LoadMemoryStreamFromUrl(MacroscopeJobMaster JobMaster, Uri TargetUri)
{
  MacroscopeHttpTwoClient Downloader = JobMaster.GetHttpClient();
  MacroscopeHttpTwoClientResponse Fetched = null;

  try
  {
    Fetched = await Downloader.Get(
      TargetUri,
      this.ConfigureHeadRequestHeadersCallback,
      this.PostProcessRequestHttpHeadersCallback
    );
  }
  catch (Exception ex)
  {
    // Both failure kinds are logged the same way; only the label differs.
    string LogFormat = (ex is MacroscopeDocumentException)
      ? "MacroscopeDocumentException: {0}"
      : "Exception: {0}";

    this.DebugMsg(string.Format(LogFormat, ex.Message));
    this.DebugMsg(string.Format(LogFormat, TargetUri.ToString()));
  }

  if (Fetched == null)
  {
    this.DebugMsg("NULL");
    return null;
  }

  try
  {
    return Fetched.GetContentAsBytes();
  }
  catch (Exception ex)
  {
    this.DebugMsg(string.Format("Exception: {0}", ex.Message));
    return null;
  }
}
/**************************************************************************/

/// <summary>
/// Downloads the published version string from
/// MacroscopeConstants.CheckForUpdateUrl and compares it with the running
/// version.
/// </summary>
/// <returns>
/// true when IsVersionNewer reports the published version as newer than the
/// current one; false otherwise, including when no version data was retrieved.
/// </returns>
public async Task<bool> PhoneHome()
{
  MacroscopeHttpUrlLoader UrlLoader = new MacroscopeHttpUrlLoader();
  MacroscopeHttpTwoClient Client = new MacroscopeHttpTwoClient();
  Uri TargetUri = new Uri(MacroscopeConstants.CheckForUpdateUrl);

  byte[] Data = await UrlLoader.LoadImmediateDataFromUrl(Client: Client, TargetUri: TargetUri);

  // Encoding.GetString throws ArgumentNullException on a null buffer, and an
  // empty payload cannot name a newer version.
  // NOTE(review): assumes the loader signals a failed download with
  // null/empty data rather than an exception; confirm against
  // LoadImmediateDataFromUrl.
  if ((Data == null) || (Data.Length == 0))
  {
    return false;
  }

  string PublishedVersion = System.Text.Encoding.UTF8.GetString(Data);
  string CurrentVersion = Macroscope.GetVersion();

  return this.IsVersionNewer(CurrentVersion: CurrentVersion, CompareVersion: PublishedVersion);
}
/**************************************************************************/

/// <summary>
/// Copies status, header and cookie information from an HTTP response onto
/// this document: status code and error condition, redirect flag, raw headers,
/// server name, MIME type, character set, content length and encoding, the
/// Date/Last-Modified/Expires dates, HSTS flag, redirect location, Link
/// headers, ETag, WWW-Authenticate details, document type and cookies.
/// </summary>
/// <param name="Response">Client response whose GetResponse() message is inspected.</param>
private void ProcessResponseHttpHeaders(MacroscopeHttpTwoClientResponse Response)
{
  HttpResponseMessage ResponseMessage = Response.GetResponse();
  HttpResponseHeaders ResponseHeaders = ResponseMessage.Headers;
  HttpContentHeaders ContentHeaders = ResponseMessage.Content.Headers;

  /** Status Code ------------------------------------------------------ **/

  this.SetStatusCode(ResponseMessage.StatusCode);
  this.SetErrorCondition(ResponseMessage.ReasonPhrase);

  try
  {
    switch (this.GetStatusCode())
    {
      // 200 Range

      case HttpStatusCode.OK:
        this.SetIsNotRedirect();
        break;

      // 300 Range

      case HttpStatusCode.Moved:
        this.SetErrorCondition(HttpStatusCode.Moved.ToString());
        this.SetIsRedirect();
        break;

      case HttpStatusCode.SeeOther:
        this.SetErrorCondition(HttpStatusCode.SeeOther.ToString());
        this.SetIsRedirect();
        break;

      case HttpStatusCode.Found:
        this.SetErrorCondition(HttpStatusCode.Redirect.ToString());
        this.SetIsRedirect();
        break;

      // 400 Range

      case HttpStatusCode.BadRequest:
        this.SetErrorCondition(HttpStatusCode.BadRequest.ToString());
        this.SetIsNotRedirect();
        break;

      case HttpStatusCode.Unauthorized:
        this.SetErrorCondition(HttpStatusCode.Unauthorized.ToString());
        this.SetIsNotRedirect();
        break;

      case HttpStatusCode.PaymentRequired:
        this.SetErrorCondition(HttpStatusCode.PaymentRequired.ToString());
        this.SetIsNotRedirect();
        break;

      case HttpStatusCode.Forbidden:
        this.SetErrorCondition(HttpStatusCode.Forbidden.ToString());
        this.SetIsNotRedirect();
        break;

      case HttpStatusCode.NotFound:
        this.SetErrorCondition(HttpStatusCode.NotFound.ToString());
        this.SetIsNotRedirect();
        break;

      case HttpStatusCode.MethodNotAllowed:
        this.SetErrorCondition(HttpStatusCode.MethodNotAllowed.ToString());
        this.SetIsNotRedirect();
        break;

      case HttpStatusCode.Gone:
        this.SetErrorCondition(HttpStatusCode.Gone.ToString());
        this.SetIsNotRedirect();
        break;

      case HttpStatusCode.RequestUriTooLong:
        this.SetErrorCondition(HttpStatusCode.RequestUriTooLong.ToString());
        this.SetIsNotRedirect();
        break;

      // Unhandled: any other status is only logged by the catch below.

      default:
        throw new MacroscopeDocumentException("Unhandled HttpStatusCode Type");
    }
  }
  catch (MacroscopeDocumentException ex)
  {
    this.DebugMsg(string.Format("MacroscopeDocumentException: {0}", ex.Message));
  }

  /** Raw HTTP Headers ------------------------------------------------- **/

  this.SetHttpResponseStatusLine(Response: Response);
  this.SetHttpResponseHeaders(Response: Response);

  /** PROBE HTTP HEADERS ----------------------------------------------- **/

  // Pattern used by most probes below: read the strongly typed header
  // property first; if that throws, fall back to a name-based lookup in the
  // response headers and then in the content headers.

  /** Server HTTP Header ----------------------------------------------- **/

  try
  {
    HttpHeaderValueCollection<ProductInfoHeaderValue> HeaderValue = ResponseHeaders.Server;
    if (HeaderValue != null)
    {
      if (HeaderValue.FirstOrDefault() != null)
      {
        this.SetServerName(HeaderValue.FirstOrDefault().ToString());
      }
    }
  }
  catch (Exception ex)
  {
    this.DebugMsg(ex.Message);
    FindHttpResponseHeaderCallback Callback = delegate(IEnumerable<string> HeaderValues)
    {
      this.SetServerName(HeaderValues.First().ToString());
      return (true);
    };
    if (!this.FindHttpResponseHeader(ResponseHeaders: ResponseHeaders, HeaderName: "server", Callback: Callback))
    {
      this.FindHttpContentHeader(ContentHeaders: ContentHeaders, HeaderName: "server", Callback: Callback);
    }
  }

  this.DebugMsg(string.Format("this.ServerName: {0}", this.ServerName));

  /** Content-Type HTTP Header ----------------------------------------- **/

  try
  {
    MediaTypeHeaderValue HeaderValue = ContentHeaders.ContentType;
    if (HeaderValue != null)
    {
      this.DebugMsg(string.Format("HeaderValue: {0}", HeaderValue));
      this.MimeType = HeaderValue.MediaType;
      if (HeaderValue.CharSet != null)
      {
        this.SetCharacterSet(HeaderValue.CharSet);
        // TODO: Implement character set probing
        this.SetCharacterEncoding(NewEncoding: new UTF8Encoding());
      }
    }
  }
  catch (Exception ex)
  {
    this.DebugMsg(string.Format("MediaType Exception: {0}", ex.Message));
    this.MimeType = MacroscopeConstants.DefaultMimeType;
  }

  this.DebugMsg(string.Format("this.MimeType: {0}", this.MimeType));

  /** Content-Length HTTP Header --------------------------------------- **/

  try
  {
    long? HeaderValue = null;
    if (ContentHeaders.Contains("Content-Length"))
    {
      HeaderValue = ContentHeaders.ContentLength;
    }
    if (HeaderValue != null)
    {
      this.ContentLength = HeaderValue;
    }
    else
    {
      this.ContentLength = 0;
    }
  }
  catch (Exception ex)
  {
    this.DebugMsg(ex.Message);
    this.SetContentLength(Length: 0);
    FindHttpResponseHeaderCallback Callback = delegate(IEnumerable<string> HeaderValues)
    {
      this.SetContentLength(Length: long.Parse(HeaderValues.FirstOrDefault()));
      return (true);
    };
    if (!this.FindHttpResponseHeader(ResponseHeaders: ResponseHeaders, HeaderName: "content-length", Callback: Callback))
    {
      this.FindHttpContentHeader(ContentHeaders: ContentHeaders, HeaderName: "content-length", Callback: Callback);
    }
  }

  this.DebugMsg(string.Format("this.GetContentLength(): {0}", this.GetContentLength()));

  /** Content-Encoding HTTP Header ------------------------------------- **/

  try
  {
    ICollection<string> HeaderValue = ContentHeaders.ContentEncoding;
    if (HeaderValue != null)
    {
      this.ContentEncoding = HeaderValue.FirstOrDefault();
    }
  }
  catch (Exception ex)
  {
    this.DebugMsg(ex.Message);
    FindHttpResponseHeaderCallback Callback = delegate(IEnumerable<string> HeaderValues)
    {
      this.ContentEncoding = HeaderValues.FirstOrDefault();
      return (true);
    };
    if (!this.FindHttpResponseHeader(ResponseHeaders: ResponseHeaders, HeaderName: "content-encoding", Callback: Callback))
    {
      this.FindHttpContentHeader(ContentHeaders: ContentHeaders, HeaderName: "content-encoding", Callback: Callback);
    }
  }

  // A content encoding implies compression, unless a compression method was
  // already recorded.
  if (string.IsNullOrEmpty(this.CompressionMethod) && (!string.IsNullOrEmpty(this.ContentEncoding)))
  {
    this.IsCompressed = true;
    this.CompressionMethod = this.ContentEncoding;
  }

  this.DebugMsg(string.Format("this.ContentEncoding: {0}", this.ContentEncoding));
  this.DebugMsg(string.Format("this.CompressionMethod: {0}", this.CompressionMethod));

  /** Date HTTP Header ------------------------------------------------- **/

  try
  {
    DateTimeOffset? HeaderValue = ResponseHeaders.Date;
    if (HeaderValue != null)
    {
      this.DateServer = MacroscopeDateTools.ParseHttpDate(DateString: HeaderValue.ToString());
    }
  }
  catch (Exception ex)
  {
    this.DebugMsg(ex.Message);
    this.DateServer = new DateTime();
    FindHttpResponseHeaderCallback Callback = delegate(IEnumerable<string> HeaderValues)
    {
      this.DateServer = MacroscopeDateTools.ParseHttpDate(DateString: HeaderValues.First().ToString());
      return (true);
    };
    if (!this.FindHttpResponseHeader(ResponseHeaders: ResponseHeaders, HeaderName: "date", Callback: Callback))
    {
      this.FindHttpContentHeader(ContentHeaders: ContentHeaders, HeaderName: "date", Callback: Callback);
    }
  }

  this.DebugMsg(string.Format("this.DateServer: {0}", this.DateServer));

  /** Last-Modified HTTP Header ---------------------------------------- **/

  try
  {
    DateTimeOffset? HeaderValue = ContentHeaders.LastModified;
    if (HeaderValue != null)
    {
      this.DateModified = MacroscopeDateTools.ParseHttpDate(DateString: HeaderValue.ToString());
    }
  }
  catch (Exception ex)
  {
    this.DebugMsg(ex.Message);
    this.DateModified = new DateTime();
    FindHttpResponseHeaderCallback Callback = delegate(IEnumerable<string> HeaderValues)
    {
      this.DateModified = MacroscopeDateTools.ParseHttpDate(DateString: HeaderValues.First().ToString());
      return (true);
    };
    if (!this.FindHttpResponseHeader(ResponseHeaders: ResponseHeaders, HeaderName: "last-modified", Callback: Callback))
    {
      this.FindHttpContentHeader(ContentHeaders: ContentHeaders, HeaderName: "last-modified", Callback: Callback);
    }
  }

  this.DebugMsg(string.Format("this.DateModified: {0}", this.DateModified));

  /** Expires HTTP Header ---------------------------------------------- **/

  try
  {
    DateTimeOffset? HeaderValue = ContentHeaders.Expires;
    if (HeaderValue != null)
    {
      this.DateExpires = MacroscopeDateTools.ParseHttpDate(DateString: HeaderValue.ToString());
    }
  }
  catch (Exception ex)
  {
    this.DebugMsg(ex.Message);
    this.DateExpires = new DateTime();
    FindHttpResponseHeaderCallback Callback = delegate(IEnumerable<string> HeaderValues)
    {
      this.DateExpires = MacroscopeDateTools.ParseHttpDate(DateString: HeaderValues.First().ToString());
      return (true);
    };
    if (!this.FindHttpResponseHeader(ResponseHeaders: ResponseHeaders, HeaderName: "expires", Callback: Callback))
    {
      this.FindHttpContentHeader(ContentHeaders: ContentHeaders, HeaderName: "expires", Callback: Callback);
    }
  }

  this.DebugMsg(string.Format("this.DateExpires: {0}", this.DateExpires));

  /** HTST Policy HTTP Header ------------------------------------------ **/
  // https://www.owasp.org/index.php/HTTP_Strict_Transport_Security_Cheat_Sheet
  // Strict-Transport-Security: max-age=31536000; includeSubDomains; preload

  {
    // Only the presence of the header is recorded; its value is not parsed.
    FindHttpResponseHeaderCallback Callback = delegate(IEnumerable<string> HeaderValues)
    {
      this.HypertextStrictTransportPolicy = true;
      return (true);
    };
    if (!this.FindHttpResponseHeader(ResponseHeaders: ResponseHeaders, HeaderName: "strict-transport-security", Callback: Callback))
    {
      this.FindHttpContentHeader(ContentHeaders: ContentHeaders, HeaderName: "strict-transport-security", Callback: Callback);
    }
  }

  this.DebugMsg(string.Format("this.HypertextStrictTransportPolicy: {0}", this.HypertextStrictTransportPolicy));

  /** Location (Redirect) HTTP Header ---------------------------------- **/

  try
  {
    Uri HeaderValue = ResponseHeaders.Location;
    if (HeaderValue != null)
    {
      this.SetUrlRedirectTo(Url: HeaderValue.ToString());
    }
  }
  catch (Exception ex)
  {
    this.DebugMsg(ex.Message);
    FindHttpResponseHeaderCallback Callback = delegate(IEnumerable<string> HeaderValues)
    {
      this.SetUrlRedirectTo(Url: HeaderValues.FirstOrDefault().ToString());
      return (true);
    };
    if (!this.FindHttpResponseHeader(ResponseHeaders: ResponseHeaders, HeaderName: "location", Callback: Callback))
    {
      this.FindHttpContentHeader(ContentHeaders: ContentHeaders, HeaderName: "location", Callback: Callback);
    }
  }

  this.DebugMsg(string.Format("this.GetIsRedirect(): {0}", this.GetIsRedirect()));
  this.DebugMsg(string.Format("this.GetUrlRedirectTo(): {0}", this.GetUrlRedirectTo()));

  /** Link HTTP Headers ------------------------------------------------ **/

  {
    FindHttpResponseHeaderCallback Callback = delegate(IEnumerable<string> HeaderValues)
    {
      foreach (string HeaderValue in HeaderValues)
      {
        this.DebugMsg(string.Format("HeaderValue: {0}", HeaderValue));
        this.ProcessHttpLinkHeader(HttpLinkHeader: HeaderValue);
      }
      return (true);
    };
    if (!this.FindHttpResponseHeader(ResponseHeaders: ResponseHeaders, HeaderName: "link", Callback: Callback))
    {
      this.FindHttpContentHeader(ContentHeaders: ContentHeaders, HeaderName: "link", Callback: Callback);
    }
  }

  /** ETag HTTP Header ------------------------------------------------- **/

  try
  {
    EntityTagHeaderValue HeaderValue = ResponseHeaders.ETag;
    if (HeaderValue != null)
    {
      string ETagValue = HeaderValue.Tag;
      if (!string.IsNullOrEmpty(ETagValue))
      {
        this.SetEtag(HeaderValue.Tag);
      }
    }
  }
  catch (Exception ex)
  {
    this.DebugMsg(ex.Message);
    FindHttpResponseHeaderCallback Callback = delegate(IEnumerable<string> HeaderValues)
    {
      string HeaderValue = HeaderValues.FirstOrDefault();
      if (HeaderValue != null)
      {
        if (!string.IsNullOrEmpty(HeaderValue))
        {
          this.SetEtag(HeaderValue);
        }
      }
      return (true);
    };
    if (!this.FindHttpResponseHeader(ResponseHeaders: ResponseHeaders, HeaderName: "etag", Callback: Callback))
    {
      this.FindHttpContentHeader(ContentHeaders: ContentHeaders, HeaderName: "etag", Callback: Callback);
    }
  }

  this.DebugMsg(string.Format("this.Etag: {0}", this.Etag));

  /** WWW-AUTHENTICATE HTTP Header ------------------------------------- **/
  // Reference: http://httpbin.org/basic-auth/user/passwd

  try
  {
    HttpHeaderValueCollection<AuthenticationHeaderValue> HeaderValue = ResponseHeaders.WwwAuthenticate;
    if (HeaderValue != null)
    {
      string Scheme = null;
      string Realm = null;
      foreach (AuthenticationHeaderValue AuthenticationValue in HeaderValue)
      {
        Scheme = AuthenticationValue.Scheme;
        string Parameter = AuthenticationValue.Parameter;
        // The first double-quoted value in the parameter is taken as the realm.
        // NOTE(review): Parameter is null for scheme-only challenges; Regex.Match
        // then throws and the catch below absorbs it.
        Match Matched = Regex.Match(Parameter, "^[^\"]+\"([^\"]+)\"");
        if (Matched.Success)
        {
          Realm = Matched.Groups[1].Value;
        }
      }
      if (!string.IsNullOrEmpty(Scheme) && !string.IsNullOrEmpty(Realm))
      {
        if (Scheme.ToLower() == "basic")
        {
          this.SetAuthenticationType(MacroscopeConstants.AuthenticationType.BASIC);
          this.SetAuthenticationRealm(Realm);
        }
        else
        {
          this.SetAuthenticationType(MacroscopeConstants.AuthenticationType.UNSUPPORTED);
        }
      }
    }
  }
  catch (Exception ex)
  {
    this.DebugMsg(ex.Message);
  }

  this.DebugMsg(string.Format("WwwAuthenticate: \"{0}\", Realm: \"{1}\"", this.GetAuthenticationType(), this.GetAuthenticationRealm()));

  /** Process Dates ---------------------------------------------------- **/

  {
    // A default-valued DateTime means "not set": fall back to now for the
    // server date, and to the server date for the modified date.
    if (this.DateServer.Date == new DateTime().Date)
    {
      this.DateServer = DateTime.UtcNow;
    }
    if (this.DateModified.Date == new DateTime().Date)
    {
      this.DateModified = this.DateServer;
    }
  }

  /** Process MIME Type ------------------------------------------------ **/

  {
    // "+" and "." are regex metacharacters and must be escaped to be matched
    // literally. Unescaped, "xhtml+xml" never matches the real MIME type
    // "application/xhtml+xml".
    Regex reIsHtml = new Regex(@"^(text/html|application/xhtml\+xml)", RegexOptions.IgnoreCase);
    Regex reIsCss = new Regex(@"^text/css", RegexOptions.IgnoreCase);
    Regex reIsJavascript = new Regex(@"^(application/javascript|text/javascript)", RegexOptions.IgnoreCase);
    Regex reIsImage = new Regex(@"^image/(gif|png|jpeg|bmp|webp|vnd\.microsoft\.icon|x-icon)", RegexOptions.IgnoreCase);
    Regex reIsPdf = new Regex(@"^application/pdf", RegexOptions.IgnoreCase);
    Regex reIsAudio = new Regex(@"^audio/[a-z0-9]+", RegexOptions.IgnoreCase);
    Regex reIsVideo = new Regex(@"^video/[a-z0-9]+", RegexOptions.IgnoreCase);
    Regex reIsXml = new Regex(@"^(application|text)/(atom\+xml|xml)", RegexOptions.IgnoreCase);
    Regex reIsText = new Regex(@"^(text)/(plain)", RegexOptions.IgnoreCase);

    // Regex.IsMatch throws on null input. A response without a Content-Type
    // header leaves MimeType untouched above, so probe an empty string in
    // that case and let it fall through to BINARY.
    string ProbeMimeType = this.MimeType ?? "";

    if (reIsHtml.IsMatch(ProbeMimeType))
    {
      this.SetDocumentType(Type: MacroscopeConstants.DocumentType.HTML);
    }
    else if (reIsCss.IsMatch(ProbeMimeType))
    {
      this.SetDocumentType(Type: MacroscopeConstants.DocumentType.CSS);
    }
    else if (reIsJavascript.IsMatch(ProbeMimeType))
    {
      this.SetDocumentType(Type: MacroscopeConstants.DocumentType.JAVASCRIPT);
    }
    else if (reIsImage.IsMatch(ProbeMimeType))
    {
      this.SetDocumentType(Type: MacroscopeConstants.DocumentType.IMAGE);
    }
    else if (reIsPdf.IsMatch(ProbeMimeType))
    {
      this.SetDocumentType(Type: MacroscopeConstants.DocumentType.PDF);
    }
    else if (reIsAudio.IsMatch(ProbeMimeType))
    {
      this.SetDocumentType(Type: MacroscopeConstants.DocumentType.AUDIO);
    }
    else if (reIsVideo.IsMatch(ProbeMimeType))
    {
      this.SetDocumentType(Type: MacroscopeConstants.DocumentType.VIDEO);
    }
    else if (reIsXml.IsMatch(ProbeMimeType))
    {
      this.SetDocumentType(Type: MacroscopeConstants.DocumentType.XML);
    }
    else if (reIsText.IsMatch(ProbeMimeType))
    {
      this.SetDocumentType(Type: MacroscopeConstants.DocumentType.TEXT);
    }
    else
    {
      this.SetDocumentType(Type: MacroscopeConstants.DocumentType.BINARY);
    }
  }

  /** Process Cookies -------------------------------------------------- **/
  // https://stackoverflow.com/questions/29224734/how-to-read-cookies-from-httpresponsemessage

  {
    try
    {
      // Cookies are read from the shared cookie container for this
      // document's URI, not from the response headers directly.
      CookieContainer CookieMonster = MacroscopeHttpTwoClient.GetCookieMonster();
      CookieCollection Biscuits = CookieMonster.GetCookies(uri: this.GetUri());
      this.AddCookies(Cookies: Biscuits);
      this.DebugMsg("cookies");
    }
    catch (Exception ex)
    {
      this.DebugMsg(ex.Message);
    }
  }

  return;
}
/**************************************************************************/

/// <summary>
/// Creates a redirect chain analyzer that uses the supplied HTTP client, with
/// debug output suppressed and an empty redirect chain document cache.
/// </summary>
/// <param name="Client">HTTP client stored for use by this analyzer.</param>
public MacroscopeRedirectChainAnalysis(MacroscopeHttpTwoClient Client)
  : base()
{
  this.SuppressDebugMsg = true;

  this.RedirectChainDocCache = new Dictionary<string, MacroscopeRedirectChainDocStruct>();
  this.HttpClient = Client;
}
/** -------------------------------------------------------------------- **/

/// <summary>
/// Downloads this document as XML, records its headers, length and checksum,
/// parses the body, applies the optional custom filters and data extractors,
/// detects sitemap documents and stores the raw text on the document.
/// Request failures are logged, recorded as a BadRequest status plus remark,
/// and finally passed to ProcessErrorCondition.
/// </summary>
private async Task _ProcessXmlPage()
{
  XmlDocument XmlDoc = null;
  MacroscopeHttpTwoClient Client = this.DocCollection.GetJobMaster().GetHttpClient();
  MacroscopeHttpTwoClientResponse Response = null;
  string ResponseErrorCondition = null;

  try
  {
    Response = await Client.Get(
      this.GetUri(),
      this.ConfigureXmlPageRequestHeadersCallback,
      this.PostProcessRequestHttpHeadersCallback
    );
  }
  catch (MacroscopeDocumentException ex)
  {
    this.DebugMsg(string.Format("_ProcessXmlPage :: MacroscopeDocumentException: {0}", ex.Message));
    ResponseErrorCondition = ex.Message;
    this.SetStatusCode(HttpStatusCode.BadRequest);
    this.AddRemark("_ProcessXmlPage", ex.Message);
  }
  catch (Exception ex)
  {
    this.DebugMsg(string.Format("_ProcessXmlPage :: Exception: {0}", ex.Message));
    ResponseErrorCondition = ex.Message;
    this.SetStatusCode(HttpStatusCode.BadRequest);
    this.AddRemark("_ProcessXmlPage", ex.Message);
  }

  if (Response != null)
  {
    string RawData = "";

    this.ProcessResponseHttpHeaders(Response: Response);

    /** Get Response Body ---------------------------------------------- **/

    try
    {
      DebugMsg(string.Format("MIME TYPE: {0}", this.MimeType));
      RawData = Response.GetContentAsString();
      this.SetContentLength(Length: RawData.Length); // May need to find bytes length
      this.SetWasDownloaded(true);
      this.SetChecksum(RawData);
    }
    catch (Exception ex)
    {
      DebugMsg(string.Format("Exception: {0}", ex.Message));
      this.SetStatusCode(HttpStatusCode.BadRequest);
      RawData = "";
      this.SetContentLength(Length: 0);
    }

    if (!string.IsNullOrEmpty(RawData))
    {
      XmlDoc = new XmlDocument();

      // The XML comes from a crawled, untrusted site. Clearing the resolver
      // stops external DTDs and entities from being fetched while parsing
      // (XXE / SSRF).
      XmlDoc.XmlResolver = null;

      try
      {
        XmlDoc.LoadXml(RawData);
      }
      catch (XmlException ex)
      {
        DebugMsg(string.Format("XmlException: {0}", ex.Message));
      }
      catch (Exception ex)
      {
        DebugMsg(string.Format("Exception: {0}", ex.Message));
      }
      DebugMsg(string.Format("XmlDoc: {0}", XmlDoc));
    }
    else
    {
      DebugMsg(string.Format("RawData: {0}", "EMPTY"));
    }

    /** Custom Filters ------------------------------------------------- **/

    if (!string.IsNullOrEmpty(RawData))
    {
      if (
        MacroscopePreferencesManager.GetCustomFiltersEnable()
        && MacroscopePreferencesManager.GetCustomFiltersApplyToXml())
      {
        MacroscopeCustomFilters CustomFilter = this.DocCollection.GetJobMaster().GetCustomFilter();
        if ((CustomFilter != null) && (CustomFilter.IsEnabled()))
        {
          this.ProcessGenericCustomFiltered(
            CustomFilter: CustomFilter,
            GenericText: RawData
          );
        }
      }
    }

    /** Data Extractors ------------------------------------------------ **/

    if (!string.IsNullOrEmpty(RawData))
    {
      if (
        MacroscopePreferencesManager.GetDataExtractorsEnable()
        && MacroscopePreferencesManager.GetDataExtractorsApplyToXml())
      {
        this.ProcessGenericDataExtractors(GenericText: RawData);
      }
    }

    /** Sitemap Detection ---------------------------------------------- **/

    // A failed parse leaves XmlDoc without a DocumentElement, so this also
    // skips documents that did not load.
    if ((XmlDoc != null) && (XmlDoc.DocumentElement != null))
    {
      if (this.DetectSitemapXmlDocument(XmlDoc))
      {
        DebugMsg(string.Format("ProcessXmlPage: {0} :: {1}", "SITEMAP DETECTED", this.GetUrl()));
        this.SetDocumentType(Type: MacroscopeConstants.DocumentType.SITEMAPXML);
        this.ProcessSitemapXmlOutlinks(XmlDoc);
      }
    }

    /** Document Text -------------------------------------------------- **/

    // RawData is never null here: it starts as "" and is reset to "" when
    // reading the body fails.
    this.SetDocumentText(Text: RawData);
  }

  if (ResponseErrorCondition != null)
  {
    this.ProcessErrorCondition(ResponseErrorCondition);
  }
}
/** -------------------------------------------------------------------- **/

/// <summary>
/// Issues a HEAD request for this image document, records its headers, derives
/// a title from the last path segment of the URL and, when QR code detection is
/// enabled, downloads the image and adds any decoded absolute URL as a QRCODE
/// outlink. Request failures are logged, recorded as a BadRequest status plus
/// remark, and stored in ErrorCondition.
/// </summary>
private async Task _ProcessImagePage()
{
  MacroscopeJobMaster OwningJobMaster = this.DocCollection.GetJobMaster();
  MacroscopeHttpTwoClient Client = OwningJobMaster.GetHttpClient();
  MacroscopeHttpTwoClientResponse Response = null;
  string ResponseErrorCondition = null;

  try
  {
    Response = await Client.Head(
      this.GetUri(),
      this.ConfigureImagePageRequestHeadersCallback,
      this.PostProcessRequestHttpHeadersCallback
    );
  }
  catch (Exception ex)
  {
    // Both failure kinds are handled identically; only the log label differs.
    string LogFormat = (ex is MacroscopeDocumentException)
      ? "_ProcessImagePage :: MacroscopeDocumentException: {0}"
      : "_ProcessImagePage :: Exception: {0}";

    this.DebugMsg(string.Format(LogFormat, ex.Message));
    ResponseErrorCondition = ex.Message;
    this.SetStatusCode(HttpStatusCode.BadRequest);
    this.AddRemark("_ProcessImagePage", ex.Message);
  }

  if (Response != null)
  {
    this.ProcessResponseHttpHeaders(Response: Response);

    /** Title ---------------------------------------------------------- **/

    {
      // The title is the text after the final "/" of the document URL.
      string DocumentTitle = null;
      MatchCollection TailMatches = Regex.Matches(this.DocUrl, "/([^/]+)$");

      for (int i = 0; (DocumentTitle == null) && (i < TailMatches.Count); i++)
      {
        string Candidate = TailMatches[i].Groups[1].Value;
        if (Candidate.Length > 0)
        {
          DocumentTitle = Candidate;
        }
      }

      if (DocumentTitle == null)
      {
        this.DebugMsg(string.Format("TITLE: {0}", "MISSING"));
      }
      else
      {
        this.SetTitle(DocumentTitle, MacroscopeConstants.TextProcessingMode.NO_PROCESSING);
        this.DebugMsg(string.Format("TITLE: {0}", this.GetTitle()));
      }
    }

    /** QR Codes ------------------------------------------------------- **/

    if (MacroscopePreferencesManager.GetDetectQrCodeInImage())
    {
      MacroscopeHttpImageLoader ImageLoader = new MacroscopeHttpImageLoader();
      string QrCodeImageFilename = await ImageLoader.DownloadImageFromUriToFile(
        JobMaster: this.DocCollection.GetJobMaster(),
        TargetUri: this.GetUri()
      );

      // Decode only when the download produced a file on disk.
      string ResultText = null;
      if ((!string.IsNullOrEmpty(QrCodeImageFilename)) && File.Exists(QrCodeImageFilename))
      {
        MacroscopeQrCodeAnalysis QrCodeAnalysis = new MacroscopeQrCodeAnalysis();
        ResultText = QrCodeAnalysis.Decode(ImageFilename: QrCodeImageFilename);
      }

      if (!string.IsNullOrEmpty(ResultText))
      {
        Uri QrCodeImageUri = null;

        try
        {
          QrCodeImageUri = new Uri(ResultText);
        }
        catch (UriFormatException ex)
        {
          this.DebugMsg(string.Format("UriFormatException: {0}", ResultText));
          this.DebugMsg(string.Format("UriFormatException: {0}", ex.Message));
        }

        if (QrCodeImageUri != null)
        {
          string DecodedUrl = QrCodeImageUri.AbsoluteUri;
          MacroscopeLink Outlink = this.AddDocumentOutlink(
            AbsoluteUrl: DecodedUrl,
            LinkType: MacroscopeConstants.InOutLinkType.QRCODE,
            Follow: true
          );

          if (Outlink != null)
          {
            Outlink.SetRawTargetUrl(TargetUrl: DecodedUrl);
            this.AddRemark("QRCODEIMAGE", "This image appears to be a QR Code.");
          }
        }
      }
    }
  }

  if (ResponseErrorCondition != null)
  {
    this.ErrorCondition = ResponseErrorCondition;
  }
}
/** -------------------------------------------------------------------- **/

/// <summary>
/// Issues a HEAD request for this document, records the crawl date and
/// response headers and, when the response is a redirect with a target,
/// registers that target as a REDIRECT outlink. Failures are logged and
/// finally passed to ProcessErrorCondition.
/// </summary>
private async Task _ExecuteHeadRequest()
{
  MacroscopeHttpTwoClient Client = this.DocCollection.GetJobMaster().GetHttpClient();
  MacroscopeHttpTwoClientResponse ClientResponse = null;
  string ResponseErrorCondition = null;

  this.SetProcessInlinks();
  this.SetProcessHyperlinksIn();

  try
  {
    ClientResponse = await Client.Head(
      this.GetUri(),
      this.ConfigureHeadRequestHeadersCallback,
      this.PostProcessRequestHttpHeadersCallback
    );
    this.CrawledDate = DateTime.UtcNow;
  }
  catch (MacroscopeDocumentException ex)
  {
    this.DebugMsg(string.Format("_ExecuteHeadRequest :: MacroscopeDocumentException: {0}", ex.Message));
    ResponseErrorCondition = ex.Message;
    this.SetStatusCode(HttpStatusCode.BadRequest);
    this.AddRemark("_ExecuteHeadRequest", ex.Message);
  }
  catch (Exception ex)
  {
    this.DebugMsg(string.Format("_ExecuteHeadRequest :: Exception: {0}", ex.Message));
    ResponseErrorCondition = ex.Message;
    this.SetStatusCode(HttpStatusCode.BadRequest);
    this.AddRemark("_ExecuteHeadRequest", ex.Message);
  }

  if (ClientResponse != null)
  {
    try
    {
      HttpResponseMessage ResponseMessage = ClientResponse.GetResponse();

      // Check for a missing message BEFORE touching its members, so the
      // intended error text is reported rather than a NullReferenceException.
      if (ResponseMessage == null)
      {
        throw new MacroscopeDocumentException("Bad Response in ExecuteHeadRequest");
      }

      this.DebugMsg(string.Format("StatusCode: {0}", ResponseMessage.StatusCode));
      this.SetErrorCondition(ResponseMessage.ReasonPhrase);

      this.ProcessResponseHttpHeaders(Response: ClientResponse);

      if (this.GetIsRedirect())
      {
        string Location = this.GetUrlRedirectTo();

        if (!string.IsNullOrEmpty(Location))
        {
          this.SetUrlRedirectTo(Url: Location);

          MacroscopeLink OutLink = this.AddDocumentOutlink(
            AbsoluteUrl: Location,
            LinkType: MacroscopeConstants.InOutLinkType.REDIRECT,
            Follow: true
          );

          // AddDocumentOutlink may hand back null (other callers check for
          // it); a rejected outlink must not turn a valid redirect into an
          // error condition.
          if (OutLink != null)
          {
            OutLink.SetRawTargetUrl(TargetUrl: this.GetUrlRedirectToRaw());
          }
        }
      }
    }
    catch (Exception ex)
    {
      this.DebugMsg(string.Format("_ExecuteHeadRequest :: Exception: {0}", ex.Message));
      ResponseErrorCondition = ex.Message;
    }
  }

  if (ResponseErrorCondition != null)
  {
    this.ProcessErrorCondition(ResponseErrorCondition);
  }

  return;
}
/** -------------------------------------------------------------------- **/

/// <summary>
/// Requests this video document with GET, records its response headers and
/// derives a title from the last path segment of the URL. Request failures are
/// logged, recorded as a BadRequest status plus remark, and stored in
/// ErrorCondition.
/// </summary>
private async Task _ProcessVideoPage()
{
  MacroscopeHttpTwoClient Client = this.DocCollection.GetJobMaster().GetHttpClient();
  MacroscopeHttpTwoClientResponse Response = null;
  string ResponseErrorCondition = null;

  try
  {
    Response = await Client.Get(
      this.GetUri(),
      this.ConfigureVideoPageRequestHeadersCallback,
      this.PostProcessRequestHttpHeadersCallback
    );
  }
  catch (Exception ex)
  {
    // Both failure kinds are handled identically; only the log label differs.
    string LogFormat = (ex is MacroscopeDocumentException)
      ? "_ProcessVideoPage :: MacroscopeDocumentException: {0}"
      : "_ProcessVideoPage :: Exception: {0}";

    this.DebugMsg(string.Format(LogFormat, ex.Message));
    ResponseErrorCondition = ex.Message;
    this.SetStatusCode(HttpStatusCode.BadRequest);
    this.AddRemark("_ProcessVideoPage", ex.Message);
  }

  if (Response != null)
  {
    this.ProcessResponseHttpHeaders(Response: Response);

    /** Title ---------------------------------------------------------- **/

    // The title is the text after the final "/" of the document URL.
    string DocumentTitle = null;
    MatchCollection TailMatches = Regex.Matches(this.DocUrl, "/([^/]+)$");

    for (int i = 0; (DocumentTitle == null) && (i < TailMatches.Count); i++)
    {
      string Candidate = TailMatches[i].Groups[1].Value;
      if (Candidate.Length > 0)
      {
        DocumentTitle = Candidate;
      }
    }

    if (DocumentTitle == null)
    {
      this.DebugMsg(string.Format("TITLE: {0}", "MISSING"));
    }
    else
    {
      this.SetTitle(DocumentTitle, MacroscopeConstants.TextProcessingMode.NO_PROCESSING);
      this.DebugMsg(string.Format("TITLE: {0}", this.GetTitle()));
    }
  }

  if (ResponseErrorCondition != null)
  {
    this.ErrorCondition = ResponseErrorCondition;
  }
}
/** -------------------------------------------------------------------- **/

/// <summary>
/// Fetches this document with a GET request configured for JavaScript pages,
/// records its length and checksum, optionally runs the custom filters and
/// data extractors over the script text, and derives the title from the last
/// path segment of <c>DocUrl</c>.
/// </summary>
/// <remarks>
/// Request failures set the status code to <c>BadRequest</c>; a failure while
/// reading the body sets it to <c>Ambiguous</c> and zeroes the content length.
/// Any request error message is passed to <c>ProcessErrorCondition</c>.
/// </remarks>
private async Task _ProcessJavascriptPage()
{
    MacroscopeHttpTwoClient HttpClient = this.DocCollection.GetJobMaster().GetHttpClient();
    MacroscopeHttpTwoClientResponse ScriptResponse = null;
    string FetchError = null;

    try
    {
        ScriptResponse = await HttpClient.Get(
            this.GetUri(),
            this.ConfigureJavascriptPageRequestHeadersCallback,
            this.PostProcessRequestHttpHeadersCallback
        );
    }
    catch (MacroscopeDocumentException ex)
    {
        this.DebugMsg(string.Format("_ProcessJavascriptPage :: MacroscopeDocumentException: {0}", ex.Message));
        FetchError = ex.Message;
        this.SetStatusCode(HttpStatusCode.BadRequest);
        this.AddRemark("_ProcessJavascriptPage", ex.Message);
    }
    catch (Exception ex)
    {
        this.DebugMsg(string.Format("_ProcessJavascriptPage :: Exception: {0}", ex.Message));
        FetchError = ex.Message;
        this.SetStatusCode(HttpStatusCode.BadRequest);
        this.AddRemark("_ProcessJavascriptPage", ex.Message);
    }

    if (ScriptResponse != null)
    {
        string ScriptText = "";

        this.ProcessResponseHttpHeaders(Response: ScriptResponse);

        /** Get Response Body ---------------------------------------------- **/

        try
        {
            this.DebugMsg(string.Format("MIME TYPE: {0}", this.MimeType));

            // Explicit character-encoding selection (declared charset, else a
            // sniffed one) is currently disabled; the body is taken from
            // GetContentAsString() as-is.
            // NOTE(review): unlike the text/CSS/PDF processors, this method does
            // not call SetWasDownloaded(true) - confirm whether that is intended.
            ScriptText = ScriptResponse.GetContentAsString();
            this.SetContentLength(Length: ScriptText.Length); // May need to find bytes length
            this.SetChecksum(ScriptText);
        }
        catch (Exception ex)
        {
            this.DebugMsg(string.Format("Exception: {0}", ex.Message));
            this.SetStatusCode(HttpStatusCode.Ambiguous);
            ScriptText = "";
            this.SetContentLength(Length: 0);
        }

        bool HasBody = !string.IsNullOrEmpty(ScriptText);

        /** Custom Filters ------------------------------------------------- **/

        if (HasBody
            && MacroscopePreferencesManager.GetCustomFiltersEnable()
            && MacroscopePreferencesManager.GetCustomFiltersApplyToJavascripts())
        {
            MacroscopeCustomFilters Filter = this.DocCollection.GetJobMaster().GetCustomFilter();

            if ((Filter != null) && Filter.IsEnabled())
            {
                this.ProcessGenericCustomFiltered(
                    CustomFilter: Filter,
                    GenericText: ScriptText
                );
            }
        }

        /** Data Extractors ------------------------------------------------ **/

        if (HasBody
            && MacroscopePreferencesManager.GetDataExtractorsEnable()
            && MacroscopePreferencesManager.GetDataExtractorsApplyToJavascripts())
        {
            this.ProcessGenericDataExtractors(GenericText: ScriptText);
        }

        /** Title ---------------------------------------------------------- **/

        // The title is the text after the final '/' of the document URL.
        Match TitleMatch = Regex.Match(this.DocUrl, "/([^/]+)$");

        if (TitleMatch.Success)
        {
            this.SetTitle(TitleMatch.Groups[1].Value, MacroscopeConstants.TextProcessingMode.NO_PROCESSING);
            this.DebugMsg(string.Format("TITLE: {0}", this.GetTitle()));
        }
        else
        {
            this.DebugMsg(string.Format("TITLE: {0}", "MISSING"));
        }

        /** ---------------------------------------------------------------- **/
    }

    if (FetchError != null)
    {
        this.ProcessErrorCondition(FetchError);
    }
}
/** -------------------------------------------------------------------- **/

/// <summary>
/// Fetches this document with a GET request configured for plain-text pages,
/// records length, download flag and checksum, splits the body into lines,
/// optionally runs custom filters and data extractors, stores the document
/// text, and extracts outlinks.
/// </summary>
/// <remarks>
/// Paths ending in "robots.txt" get robots-specific outlink processing, sitemap
/// detection, and a remark when the content length exceeds 512KB. Other text
/// documents have pure-text outlinks processed only when the document is internal.
/// Request and body-read failures set the status code to <c>BadRequest</c>.
/// </remarks>
private async Task _ProcessTextPage()
{
    List<string> TextLines = new List<string>();
    MacroscopeHttpTwoClient HttpClient = this.DocCollection.GetJobMaster().GetHttpClient();
    MacroscopeHttpTwoClientResponse TextResponse = null;
    string FetchError = null;

    try
    {
        TextResponse = await HttpClient.Get(
            this.GetUri(),
            this.ConfigureTextPageRequestHeadersCallback,
            this.PostProcessRequestHttpHeadersCallback
        );
    }
    catch (MacroscopeDocumentException ex)
    {
        this.DebugMsg(string.Format("_ProcessTextPage :: MacroscopeDocumentException: {0}", ex.Message));
        FetchError = ex.Message;
        this.SetStatusCode(HttpStatusCode.BadRequest);
        this.AddRemark("_ProcessTextPage", ex.Message);
    }
    catch (Exception ex)
    {
        this.DebugMsg(string.Format("_ProcessTextPage :: Exception: {0}", ex.Message));
        FetchError = ex.Message;
        this.SetStatusCode(HttpStatusCode.BadRequest);
        this.AddRemark("_ProcessTextPage", ex.Message);
    }

    if (TextResponse != null)
    {
        string Body = "";

        this.ProcessResponseHttpHeaders(Response: TextResponse);

        /** Get Response Body ---------------------------------------------- **/

        try
        {
            this.DebugMsg(string.Format("MIME TYPE: {0}", this.MimeType));
            Body = TextResponse.GetContentAsString();
            this.SetContentLength(Length: Body.Length); // May need to find bytes length
            this.SetWasDownloaded(true);
            this.SetChecksum(Body);
        }
        catch (Exception ex)
        {
            this.DebugMsg(string.Format("Exception: {0}", ex.Message));
            this.SetStatusCode(HttpStatusCode.BadRequest);
            Body = "";
            this.SetContentLength(Length: 0);
        }

        bool HasBody = !string.IsNullOrEmpty(Body);

        /** Split Into Lines ----------------------------------------------- **/

        if (HasBody)
        {
            // Runs of CR/LF characters act as a single line separator.
            TextLines = new List<string>(Regex.Split(Body, @"[\r\n]+"));
            this.DebugMsg(string.Format("TextDoc: {0}", TextLines.Count));
        }
        else
        {
            this.DebugMsg(string.Format("RawData: {0}", "EMPTY"));
        }

        /** Custom Filters ------------------------------------------------- **/

        if (HasBody
            && MacroscopePreferencesManager.GetCustomFiltersEnable()
            && MacroscopePreferencesManager.GetCustomFiltersApplyToText())
        {
            MacroscopeCustomFilters Filter = this.DocCollection.GetJobMaster().GetCustomFilter();

            if ((Filter != null) && Filter.IsEnabled())
            {
                this.ProcessGenericCustomFiltered(
                    CustomFilter: Filter,
                    GenericText: Body
                );
            }
        }

        /** Data Extractors ------------------------------------------------ **/

        if (HasBody
            && MacroscopePreferencesManager.GetDataExtractorsEnable()
            && MacroscopePreferencesManager.GetDataExtractorsApplyToText())
        {
            this.ProcessGenericDataExtractors(GenericText: Body);
        }

        /** Process Text Document ------------------------------------------ **/

        if (TextLines.Count == 0)
        {
            this.SetDocumentText(Text: "");
        }
        else
        {
            this.SetDocumentText(Text: string.Join(Environment.NewLine, TextLines));

            if (this.GetPath().EndsWith("robots.txt", StringComparison.InvariantCultureIgnoreCase))
            {
                long? TextSize = this.GetContentLength();
                long? RobotsMaxTextSize = 1024 * 512;

                this.ProcessRobotsTextOutlinks(TextDoc: TextLines);

                if (this.DetectSitemapTextDocument(TextDoc: TextLines))
                {
                    this.DebugMsg(string.Format("ProcessTextPage: {0} :: {1}", "SITEMAP DETECTED", this.GetUrl()));
                    this.SetDocumentType(Type: MacroscopeConstants.DocumentType.SITEMAPTEXT);
                    this.ProcessSitemapTextOutlinks(TextDoc: TextLines);
                }

                if (TextSize > RobotsMaxTextSize)
                {
                    this.AddRemark("ROBOTS_TOO_BIG", "Robots.txt is larger than 512KB");
                }
            }
            else if (this.GetIsInternal())
            {
                this.ProcessPureTextOutlinks(TextDoc: TextLines, LinkType: MacroscopeConstants.InOutLinkType.PURETEXT);
            }
        }

        /** ---------------------------------------------------------------- **/
    }

    if (FetchError != null)
    {
        this.ProcessErrorCondition(FetchError);
    }
}
/** -------------------------------------------------------------------- **/

/// <summary>
/// Fetches this document with a GET request configured for PDF pages, then
/// extracts title, author, description, keywords and body text through
/// <c>MacroscopePdfTools</c>, optionally runs the data extractors, and
/// registers outlinks found in the text and in the PDF's link annotations.
/// </summary>
/// <remarks>
/// Locale is set to "x-default" and the canonical URL to <c>DocUrl</c>. A failure
/// while reading or parsing the body sets the status code to <c>BadRequest</c>,
/// zeroes the content length, and skips all PDF-derived metadata. Any request
/// error message is passed to <c>ProcessErrorCondition</c>.
/// </remarks>
private async Task _ProcessPdfPage()
{
    MacroscopeHttpTwoClient Client = this.DocCollection.GetJobMaster().GetHttpClient();
    MacroscopeHttpTwoClientResponse ClientResponse = null;
    string ResponseErrorCondition = null;

    try
    {
        ClientResponse = await Client.Get(
            this.GetUri(),
            this.ConfigurePdfPageRequestHeadersCallback,
            this.PostProcessRequestHttpHeadersCallback
        );
    }
    catch (MacroscopeDocumentException ex)
    {
        // NOTE(review): the other page processors also set the status code to
        // BadRequest here; this one does not - confirm whether that is intended.
        this.DebugMsg(string.Format("_ProcessPdfPage :: MacroscopeDocumentException: {0}", ex.Message));
        ResponseErrorCondition = ex.Message;
        this.AddRemark("_ProcessPdfPage", ex.Message);
    }
    catch (Exception ex)
    {
        this.DebugMsg(string.Format("_ProcessPdfPage :: Exception: {0}", ex.Message));
        ResponseErrorCondition = ex.Message;
        this.AddRemark("_ProcessPdfPage", ex.Message);
    }

    if (ClientResponse != null)
    {
        MacroscopePdfTools PdfTools;

        this.ProcessResponseHttpHeaders(Response: ClientResponse);

        { // Probe Locale
            this.Locale = "x-default"; // Implement locale probing
            this.SetHreflang(HrefLangLocale: this.Locale, Url: this.DocUrl);
        }

        { // Canonical
            this.Canonical = this.DocUrl;
            this.DebugMsg(string.Format("CANONICAL: {0}", this.Canonical));
        }

        /** Get Response Body ---------------------------------------------- **/

        try
        {
            byte[] RawData = ClientResponse.GetContentAsBytes();

            this.SetContentLength(Length: RawData.Length);

            PdfTools = new MacroscopePdfTools(PdfData: RawData);

            if (PdfTools.GetHasError())
            {
                this.AddRemark("CORRUPT_PDF", Observation: PdfTools.GetErrorMessage());
            }

            this.SetWasDownloaded(true);
        }
        catch (Exception ex)
        {
            this.DebugMsg(string.Format("Exception: {0}", ex.Message));
            this.SetStatusCode(HttpStatusCode.BadRequest);
            PdfTools = null;
            this.SetContentLength(Length: 0);
        }

        if (PdfTools != null)
        {
            /** Title ---------------------------------------------------------- **/

            string TitleText = PdfTools.GetTitle();

            if (!string.IsNullOrEmpty(TitleText))
            {
                this.SetTitle(TitleText, MacroscopeConstants.TextProcessingMode.NO_PROCESSING);
                this.DebugMsg(string.Format("TITLE: {0}", this.GetTitle()));
            }
            else
            {
                this.DebugMsg(string.Format("TITLE: {0}", "MISSING"));
            }

            /** Author --------------------------------------------------------- **/

            string AuthorText = PdfTools.GetAuthor();

            if (!string.IsNullOrEmpty(AuthorText))
            {
                this.SetAuthor(AuthorText: AuthorText, ProcessingMode: MacroscopeConstants.TextProcessingMode.NO_PROCESSING);
                this.DebugMsg(string.Format("AUTHOR: {0}", this.GetAuthor()));
            }
            else
            {
                this.DebugMsg(string.Format("AUTHOR: {0}", "MISSING"));
            }

            /** Description ---------------------------------------------------- **/

            string DescriptionText = PdfTools.GetDescription();

            // The debug label was previously a copy-pasted "TITLE", which made
            // the description indistinguishable from the title in the debug log.
            if (!string.IsNullOrEmpty(DescriptionText))
            {
                this.SetDescription(DescriptionText, MacroscopeConstants.TextProcessingMode.NO_PROCESSING);
                this.DebugMsg(string.Format("DESCRIPTION: {0}", this.GetDescription()));
            }
            else
            {
                this.DebugMsg(string.Format("DESCRIPTION: {0}", "MISSING"));
            }

            /** Metadata Keywords ---------------------------------------------- **/

            string KeywordsText = PdfTools.GetKeywords();

            if (!string.IsNullOrEmpty(KeywordsText))
            {
                this.SetKeywords(KeywordsText: KeywordsText);
                this.DebugMsg(string.Format("KEYWORDS: {0}", this.GetKeywords()));
            }
            else
            {
                this.DebugMsg(string.Format("KEYWORDS: {0}", "MISSING"));
            }

            /** Body Text ------------------------------------------------------ **/

            this.SetBodyText(Text: "");

            if (PdfTools.GetHasError())
            {
                this.AddRemark("PDF_ERROR", Observation: PdfTools.GetErrorMessage());
            }
            else
            {
                string BodyText = PdfTools.GetTextAsString();

                if (!string.IsNullOrEmpty(BodyText))
                {
                    this.SetDocumentText(Text: BodyText);
                    this.SetBodyText(Text: BodyText);
                }
            }

            this.DebugMsg(string.Format("BODY TEXT: {0}", this.GetBodyTextRaw()));
        }

        /** Data Extractors ------------------------------------------------ **/

        if (!string.IsNullOrEmpty(this.GetBodyTextRaw())
            && MacroscopePreferencesManager.GetDataExtractorsEnable()
            && MacroscopePreferencesManager.GetDataExtractorsApplyToPdf())
        {
            string Text = this.GetBodyTextRaw();
            this.ProcessGenericDataExtractors(GenericText: Text);
        }

        /** Out Links Text ------------------------------------------------- **/

        if ((this.GetDocumentTextRawLength() > 0) && this.GetIsInternal())
        {
            string Text = this.GetDocumentTextRaw();
            this.ProcessPureTextOutlinks(TextDoc: Text, LinkType: MacroscopeConstants.InOutLinkType.PDF);
        }

        /** Out Links in Annotations --------------------------------------- **/

        // PdfTools is null when the body could not be read or parsed; guard
        // explicitly because this section runs outside any try/catch.
        if ((PdfTools != null) && this.GetIsInternal() && (this.GetDocumentTextRawLength() > 0))
        {
            List<KeyValuePair<string, string>> AnnotationOutLinks = PdfTools.GetOutLinks();

            // TODO: Implement extraction of text that underlies the link annotation

            if (AnnotationOutLinks != null)
            {
                foreach (KeyValuePair<string, string> AnnotationOutLinkPair in AnnotationOutLinks)
                {
                    MacroscopeHyperlinkOut HyperlinkOut = null;
                    string AnnotationOutLinkUrlAbs;

                    AnnotationOutLinkUrlAbs = MacroscopeHttpUrlUtils.MakeUrlAbsolute(
                        BaseHref: this.BaseHref,
                        BaseUrl: this.DocUrl,
                        Url: AnnotationOutLinkPair.Key
                    );

                    HyperlinkOut = this.HyperlinksOut.Add(LinkType: MacroscopeConstants.HyperlinkType.PDF, UrlTarget: AnnotationOutLinkUrlAbs);

                    // The pair's value is used for alt text, anchor text and title alike.
                    HyperlinkOut.SetRawTargetUrl(TargetUrl: AnnotationOutLinkUrlAbs);
                    HyperlinkOut.SetAltText(AnnotationOutLinkPair.Value);
                    HyperlinkOut.SetAnchorText(AnnotationOutLinkPair.Value);
                    HyperlinkOut.SetTitle(AnnotationOutLinkPair.Value);
                    HyperlinkOut.SetDoFollow();
                    HyperlinkOut.SetMethod(Method: "GET");

                    this.AddDocumentOutlink(AbsoluteUrl: AnnotationOutLinkUrlAbs, LinkType: MacroscopeConstants.InOutLinkType.PDF, Follow: true);
                }
            }
        }

        /** ---------------------------------------------------------------- **/
    }

    if (ResponseErrorCondition != null)
    {
        this.ProcessErrorCondition(ResponseErrorCondition);
    }
}
/** -------------------------------------------------------------------- **/

/// <summary>
/// Fetches this document with a GET request configured for CSS pages, parses
/// the stylesheet to extract outlinks, optionally runs custom filters and data
/// extractors over the raw text, and derives the title from the last path
/// segment of <c>DocUrl</c>.
/// </summary>
/// <remarks>
/// Request failures set the status code to <c>BadRequest</c>; a failure while
/// reading the body sets it to <c>Ambiguous</c> and zeroes the content length.
/// Stylesheet parse failures are recorded as a remark and do not stop processing.
/// Any request error message is passed to <c>ProcessErrorCondition</c>.
/// </remarks>
private async Task _ProcessCssPage()
{
    MacroscopeHttpTwoClient HttpClient = this.DocCollection.GetJobMaster().GetHttpClient();
    MacroscopeHttpTwoClientResponse CssResponse = null;
    string FetchError = null;

    this.DebugMsg(string.Format("ProcessCssPage: {0}", ""));

    try
    {
        CssResponse = await HttpClient.Get(
            this.GetUri(),
            this.ConfigureCssPageRequestHeadersCallback,
            this.PostProcessRequestHttpHeadersCallback
        );
    }
    catch (MacroscopeDocumentException ex)
    {
        this.DebugMsg(string.Format("_ProcessCssPage :: MacroscopeDocumentException: {0}", ex.Message));
        FetchError = ex.Message;
        this.SetStatusCode(HttpStatusCode.BadRequest);
        this.AddRemark("_ProcessCssPage", ex.Message);
    }
    catch (Exception ex)
    {
        this.DebugMsg(string.Format("_ProcessCssPage :: Exception: {0}", ex.Message));
        FetchError = ex.Message;
        this.SetStatusCode(HttpStatusCode.BadRequest);
        this.AddRemark("_ProcessCssPage", ex.Message);
    }

    if (CssResponse != null)
    {
        string CssText = "";

        this.ProcessResponseHttpHeaders(Response: CssResponse);

        /** Get Response Body ---------------------------------------------- **/

        try
        {
            this.DebugMsg(string.Format("MIME TYPE: {0}", this.MimeType));
            CssText = CssResponse.GetContentAsString();
            this.SetContentLength(Length: CssText.Length); // May need to find bytes length
            this.SetWasDownloaded(true);
        }
        catch (Exception ex)
        {
            this.DebugMsg(string.Format("Exception: {0}", ex.Message));
            this.SetStatusCode(HttpStatusCode.Ambiguous);
            this.SetContentLength(Length: 0);
        }

        bool HasBody = !string.IsNullOrEmpty(CssText);

        /** Parse Stylesheet ----------------------------------------------- **/

        if (!HasBody)
        {
            this.DebugMsg(string.Format("ProcessCssPage: ERROR: {0}", this.GetUrl()));
        }
        else
        {
            try
            {
                StylesheetParser Parser = new StylesheetParser();
                Stylesheet ParsedSheet = Parser.Parse(CssText);
                this.ProcessCssOutlinks(CssStylesheet: ParsedSheet);
            }
            catch (Exception ex)
            {
                this.DebugMsg(string.Format("ProcessHtmlAttributeCssLinks: {0}", ex.Message));
                this.AddRemark("ProcessHtmlAttributeCssLinks", ex.Message);
            }
        }

        /** Custom Filters ------------------------------------------------- **/

        if (HasBody
            && MacroscopePreferencesManager.GetCustomFiltersEnable()
            && MacroscopePreferencesManager.GetCustomFiltersApplyToCss())
        {
            MacroscopeCustomFilters Filter = this.DocCollection.GetJobMaster().GetCustomFilter();

            if ((Filter != null) && Filter.IsEnabled())
            {
                this.ProcessGenericCustomFiltered(
                    CustomFilter: Filter,
                    GenericText: CssText
                );
            }
        }

        /** Data Extractors ------------------------------------------------ **/

        if (HasBody
            && MacroscopePreferencesManager.GetDataExtractorsEnable()
            && MacroscopePreferencesManager.GetDataExtractorsApplyToCss())
        {
            this.ProcessGenericDataExtractors(GenericText: CssText);
        }

        /** Title ---------------------------------------------------------- **/

        // The title is the text after the final '/' of the document URL.
        Match TitleMatch = Regex.Match(this.DocUrl, "/([^/]+)$");

        if (TitleMatch.Success)
        {
            this.SetTitle(TitleMatch.Groups[1].Value, MacroscopeConstants.TextProcessingMode.NO_PROCESSING);
            this.DebugMsg(string.Format("TITLE: {0}", this.GetTitle()));
        }
        else
        {
            this.DebugMsg(string.Format("TITLE: {0}", "MISSING"));
        }
    }

    if (FetchError != null)
    {
        this.ProcessErrorCondition(FetchError);
    }
}