// Fetches every URL in a fixed list through MacroscopeHttpTwoClient.Get and
// asserts that each response has status code 200 and a non-empty body.
// NOTE(review): the only URL points at a public host, so this presumably needs
// network access when run — confirm before adding it to an offline suite.
public async Task TestHttpTwoClientGet() {
    MacroscopeHttpTwoClient TwoClient = new MacroscopeHttpTwoClient();
    Uri[] Targets = { new Uri("https://nazuke.github.io/robots.txt") };

    for (int Index = 0; Index < Targets.Length; Index++) {
        Uri Target = Targets[Index];

        this.DebugMsg(string.Format("Url: {0}", Target));

        MacroscopeHttpTwoClientResponse Wrapped = await TwoClient.Get(
            Target,
            this.PreProcessHeadRequestHeadersCallback,
            this.PostProcessRequestHttpHeadersCallback
        );
        HttpResponseMessage Message = Wrapped.GetResponse();

        this.DebugMsg(string.Format("Response.Version: {0}", Message.Version));

        // Status is compared numerically, as an int.
        int StatusNumber = (int)Message.StatusCode;
        Assert.AreEqual(200, StatusNumber);

        string Body = Wrapped.GetContentAsString();
        Assert.Greater(Body.Length, 0);
    }
}
/** -------------------------------------------------------------------- **/
/// <summary>
/// Downloads this document's URI as XML, records body metadata (content length,
/// downloaded flag, checksum), runs the optional custom filters and data
/// extractors over the raw text, and processes sitemap outlinks when the parsed
/// document is detected as a sitemap.
/// </summary>
/// <remarks>
/// Exceptions thrown by the HTTP request are caught here: the status code is set
/// to <c>BadRequest</c>, a remark is added, and the message is passed to
/// <c>ProcessErrorCondition</c> at the end of the method. XML parse failures are
/// only logged; the raw text is still stored as the document text.
/// </remarks>
private async Task _ProcessXmlPage() {
    XmlDocument XmlDoc = null;
    MacroscopeHttpTwoClient Client = this.DocCollection.GetJobMaster().GetHttpClient();
    MacroscopeHttpTwoClientResponse Response = null;
    string ResponseErrorCondition = null;

    try {
        Response = await Client.Get(
            this.GetUri(),
            this.ConfigureXmlPageRequestHeadersCallback,
            this.PostProcessRequestHttpHeadersCallback
        );
    } catch (Exception ex) {
        // Both exception kinds are handled identically; only the log label differs.
        string LogTemplate = (ex is MacroscopeDocumentException)
            ? "_ProcessXmlPage :: MacroscopeDocumentException: {0}"
            : "_ProcessXmlPage :: Exception: {0}";

        this.DebugMsg(string.Format(LogTemplate, ex.Message));
        ResponseErrorCondition = ex.Message;
        this.SetStatusCode(HttpStatusCode.BadRequest);
        this.AddRemark("_ProcessXmlPage", ex.Message);
    }

    if (Response != null) {
        string RawData = "";

        this.ProcessResponseHttpHeaders(Response: Response);

        /** Get Response Body ---------------------------------------------- **/
        try {
            DebugMsg(string.Format("MIME TYPE: {0}", this.MimeType));
            RawData = Response.GetContentAsString();
            this.SetContentLength(Length: RawData.Length); // May need to find bytes length
            this.SetWasDownloaded(true);
            this.SetChecksum(RawData);
        } catch (Exception ex) {
            DebugMsg(string.Format("Exception: {0}", ex.Message));
            this.SetStatusCode(HttpStatusCode.BadRequest);
            // Also covers a null body: RawData.Length throws and lands here,
            // so RawData is never null past this point.
            RawData = "";
            this.SetContentLength(Length: 0);
        }

        bool HasBody = !string.IsNullOrEmpty(RawData);

        /** Parse XML ------------------------------------------------------ **/
        if (HasBody) {
            // The body comes from a crawled site and is untrusted. Explicitly
            // disable the resolver so a DOCTYPE cannot make the parser fetch
            // external DTDs or entities (XXE / request forgery).
            XmlDoc = new XmlDocument { XmlResolver = null };

            try {
                XmlDoc.LoadXml(RawData);
            } catch (XmlException ex) {
                DebugMsg(string.Format("XmlException: {0}", ex.Message));
            } catch (Exception ex) {
                DebugMsg(string.Format("Exception: {0}", ex.Message));
            }

            DebugMsg(string.Format("XmlDoc: {0}", XmlDoc));
        } else {
            DebugMsg(string.Format("RawData: {0}", "EMPTY"));
        }

        if (HasBody) {
            /** Custom Filters --------------------------------------------- **/
            if (
                MacroscopePreferencesManager.GetCustomFiltersEnable()
                && MacroscopePreferencesManager.GetCustomFiltersApplyToXml()) {
                MacroscopeCustomFilters CustomFilter = this.DocCollection.GetJobMaster().GetCustomFilter();

                if ((CustomFilter != null) && (CustomFilter.IsEnabled())) {
                    this.ProcessGenericCustomFiltered(
                        CustomFilter: CustomFilter,
                        GenericText: RawData
                    );
                }
            }

            /** Data Extractors -------------------------------------------- **/
            if (
                MacroscopePreferencesManager.GetDataExtractorsEnable()
                && MacroscopePreferencesManager.GetDataExtractorsApplyToXml()) {
                this.ProcessGenericDataExtractors(GenericText: RawData);
            }
        }

        /** Sitemap detection ---------------------------------------------- **/
        // A failed LoadXml leaves XmlDoc without a root element, so the
        // DocumentElement check also filters out unparseable bodies.
        if ((XmlDoc != null) && (XmlDoc.DocumentElement != null)) {
            if (this.DetectSitemapXmlDocument(XmlDoc)) {
                DebugMsg(string.Format("ProcessXmlPage: {0} :: {1}", "SITEMAP DETECTED", this.GetUrl()));
                this.SetDocumentType(Type: MacroscopeConstants.DocumentType.SITEMAPXML);
                this.ProcessSitemapXmlOutlinks(XmlDoc);
            }
        }

        /** Document Text -------------------------------------------------- **/
        // RawData is always non-null here (see the catch above), so the text is
        // stored unconditionally; an empty body stores an empty string.
        this.SetDocumentText(Text: RawData);
    }

    if (ResponseErrorCondition != null) {
        this.ProcessErrorCondition(ResponseErrorCondition);
    }
}
/** Fetch Robots Text *****************************************************/
/// <summary>
/// Fetches the text at <paramref name="RobotsUri"/> and returns it.
/// </summary>
/// <param name="RobotsUri">URI of the robots file to fetch.</param>
/// <returns>
/// The response body, or an empty string when the hostname check fails, the
/// request yields no response, reading the body throws, or the body is
/// null or empty.
/// </returns>
/// <remarks>
/// When the request yields no response (including when it throws), the URI is
/// added to <c>BadRobots</c> if it is not already present.
/// NOTE(review): the request-header callback passed to <c>Get</c> is named
/// <c>ConfigureHeadRequestHeadersCallback</c> — confirm it is the intended one.
/// </remarks>
private async Task <string> FetchRobotTextFile(Uri RobotsUri) {
    string RobotsUrlText = RobotsUri.ToString();

    if (!MacroscopeDnsTools.CheckValidHostname(Url: RobotsUrlText)) {
        DebugMsg(string.Format("FetchRobotTextFile :: CheckValidHostname: {0}", "NOT OK"));
        return("");
    }

    MacroscopeHttpTwoClientResponse Fetched = null;

    try {
        Fetched = await this.Client.Get(
            RobotsUri,
            this.ConfigureHeadRequestHeadersCallback,
            this.PostProcessRequestHttpHeadersCallback
        );
    } catch (MacroscopeDocumentException ex) {
        DebugMsg(string.Format("MacroscopeDocumentException: {0}", ex.Message));
        DebugMsg(string.Format("MacroscopeDocumentException: {0}", RobotsUrlText));
    } catch (Exception ex) {
        DebugMsg(string.Format("Exception: {0}", ex.Message));
        DebugMsg(string.Format("Exception: {0}", RobotsUrlText));
    }

    // No response at all: record the URI as bad and give up.
    if (Fetched == null) {
        lock (this.BadRobots) {
            if (!this.BadRobots.ContainsKey(RobotsUri)) {
                this.BadRobots.Add(RobotsUri, true);
            }
        }

        return("");
    }

    string Body;

    try {
        Body = Fetched.GetContentAsString();
    } catch (Exception ex) {
        DebugMsg(string.Format("FetchRobotTextFile: Exception: {0}", ex.Message));
        Body = "";
    }

    // A null body is normalised to an empty string.
    return(string.IsNullOrEmpty(Body) ? "" : Body);
}
/** -------------------------------------------------------------------- **/
/// <summary>
/// Downloads this document's URI as JavaScript, records content length and
/// checksum, runs the optional custom filters and data extractors over the raw
/// text, and sets the document title from the text after the final '/' in
/// <c>DocUrl</c>.
/// </summary>
/// <remarks>
/// Exceptions thrown by the HTTP request are caught here: the status code is set
/// to <c>BadRequest</c>, a remark is added, and the message is passed to
/// <c>ProcessErrorCondition</c> at the end of the method.
/// NOTE(review): the XML, text and CSS counterparts of this method call
/// <c>SetWasDownloaded(true)</c> after reading the body; this one does not.
/// It also reports <c>HttpStatusCode.Ambiguous</c> (not <c>BadRequest</c>) when
/// reading the body fails. Confirm both are intentional.
/// </remarks>
private async Task _ProcessJavascriptPage() {
    MacroscopeHttpTwoClient HttpTwo = this.DocCollection.GetJobMaster().GetHttpClient();
    MacroscopeHttpTwoClientResponse Reply = null;
    string FailureText = null;

    try {
        Reply = await HttpTwo.Get(
            this.GetUri(),
            this.ConfigureJavascriptPageRequestHeadersCallback,
            this.PostProcessRequestHttpHeadersCallback
        );
    } catch (Exception ex) {
        // Both exception kinds are handled identically; only the log label differs.
        string LogTemplate = (ex is MacroscopeDocumentException)
            ? "_ProcessJavascriptPage :: MacroscopeDocumentException: {0}"
            : "_ProcessJavascriptPage :: Exception: {0}";

        this.DebugMsg(string.Format(LogTemplate, ex.Message));
        FailureText = ex.Message;
        this.SetStatusCode(HttpStatusCode.BadRequest);
        this.AddRemark("_ProcessJavascriptPage", ex.Message);
    }

    if (Reply != null) {
        string Body = "";

        this.ProcessResponseHttpHeaders(Response: Reply);

        /** Get Response Body ---------------------------------------------- **/
        try {
            DebugMsg(string.Format("MIME TYPE: {0}", this.MimeType));

            // Charset selection/sniffing used to be sketched here but is
            // disabled; the body is used exactly as GetContentAsString returns it.
            Body = Reply.GetContentAsString();

            this.SetContentLength(Length: Body.Length); // May need to find bytes length
            this.SetChecksum(Body);
        } catch (Exception ex) {
            DebugMsg(string.Format("Exception: {0}", ex.Message));
            this.SetStatusCode(HttpStatusCode.Ambiguous);
            Body = "";
            this.SetContentLength(Length: 0);
        }

        if (!string.IsNullOrEmpty(Body)) {
            /** Custom Filters --------------------------------------------- **/
            bool FiltersWanted =
                MacroscopePreferencesManager.GetCustomFiltersEnable()
                && MacroscopePreferencesManager.GetCustomFiltersApplyToJavascripts();

            if (FiltersWanted) {
                MacroscopeCustomFilters Filter = this.DocCollection.GetJobMaster().GetCustomFilter();

                if ((Filter != null) && Filter.IsEnabled()) {
                    this.ProcessGenericCustomFiltered(
                        CustomFilter: Filter,
                        GenericText: Body
                    );
                }
            }

            /** Data Extractors -------------------------------------------- **/
            bool ExtractorsWanted =
                MacroscopePreferencesManager.GetDataExtractorsEnable()
                && MacroscopePreferencesManager.GetDataExtractorsApplyToJavascripts();

            if (ExtractorsWanted) {
                this.ProcessGenericDataExtractors(GenericText: Body);
            }
        }

        /** Title ---------------------------------------------------------- **/
        // The title is whatever follows the final '/' of DocUrl; it is set
        // even when the body is empty.
        string TrailingSegment = null;

        for (Match Hit = Regex.Match(this.DocUrl, "/([^/]+)$"); Hit.Success; Hit = Hit.NextMatch()) {
            string Captured = Hit.Groups[1].Value;

            if (Captured.Length > 0) {
                TrailingSegment = Captured;
                break;
            }
        }

        if (TrailingSegment == null) {
            DebugMsg(string.Format("TITLE: {0}", "MISSING"));
        } else {
            this.SetTitle(TrailingSegment, MacroscopeConstants.TextProcessingMode.NO_PROCESSING);
            DebugMsg(string.Format("TITLE: {0}", this.GetTitle()));
        }
    }

    if (FailureText != null) {
        this.ProcessErrorCondition(FailureText);
    }
}
/** -------------------------------------------------------------------- **/
/// <summary>
/// Downloads this document's URI as plain text, records body metadata (content
/// length, downloaded flag, checksum), runs the optional custom filters and data
/// extractors over the raw text, stores the line-split text as the document
/// text, and processes outlinks.
/// </summary>
/// <remarks>
/// Paths ending in "robots.txt" get robots outlink processing, sitemap
/// detection, and a remark when the recorded content length exceeds
/// 512 * 1024. Other documents get pure-text outlink processing, but only when
/// <c>GetIsInternal()</c> is true.
/// Exceptions thrown by the HTTP request are caught here: the status code is set
/// to <c>BadRequest</c>, a remark is added, and the message is passed to
/// <c>ProcessErrorCondition</c> at the end of the method.
/// </remarks>
private async Task _ProcessTextPage() {
    // Threshold for the ROBOTS_TOO_BIG remark. It is compared against the
    // recorded content length, which is currently a character count.
    const long RobotsMaxTextSize = 1024 * 512;

    List <string> TextDoc = new List <string>();
    MacroscopeHttpTwoClient Client = this.DocCollection.GetJobMaster().GetHttpClient();
    MacroscopeHttpTwoClientResponse Response = null;
    string ResponseErrorCondition = null;

    try {
        Response = await Client.Get(
            this.GetUri(),
            this.ConfigureTextPageRequestHeadersCallback,
            this.PostProcessRequestHttpHeadersCallback
        );
    } catch (Exception ex) {
        // Both exception kinds are handled identically; only the log label differs.
        string LogTemplate = (ex is MacroscopeDocumentException)
            ? "_ProcessTextPage :: MacroscopeDocumentException: {0}"
            : "_ProcessTextPage :: Exception: {0}";

        this.DebugMsg(string.Format(LogTemplate, ex.Message));
        ResponseErrorCondition = ex.Message;
        this.SetStatusCode(HttpStatusCode.BadRequest);
        this.AddRemark("_ProcessTextPage", ex.Message);
    }

    if (Response != null) {
        string RawData = "";

        this.ProcessResponseHttpHeaders(Response: Response);

        /** Get Response Body ---------------------------------------------- **/
        try {
            DebugMsg(string.Format("MIME TYPE: {0}", this.MimeType));
            RawData = Response.GetContentAsString();
            this.SetContentLength(Length: RawData.Length); // May need to find bytes length
            this.SetWasDownloaded(true);
            this.SetChecksum(RawData);
        } catch (Exception ex) {
            DebugMsg(string.Format("Exception: {0}", ex.Message));
            this.SetStatusCode(HttpStatusCode.BadRequest);
            RawData = "";
            this.SetContentLength(Length: 0);
        }

        bool HasBody = !string.IsNullOrEmpty(RawData);

        /** Split Into Lines ----------------------------------------------- **/
        if (HasBody) {
            // Runs of CR/LF are one separator, so blank lines are collapsed.
            string[] Lines = Regex.Split(RawData, @"[\r\n]+");
            TextDoc = new List <string>(Lines);
            DebugMsg(string.Format("TextDoc: {0}", TextDoc.Count));
        } else {
            DebugMsg(string.Format("RawData: {0}", "EMPTY"));
        }

        if (HasBody) {
            /** Custom Filters --------------------------------------------- **/
            if (
                MacroscopePreferencesManager.GetCustomFiltersEnable()
                && MacroscopePreferencesManager.GetCustomFiltersApplyToText()) {
                MacroscopeCustomFilters CustomFilter = this.DocCollection.GetJobMaster().GetCustomFilter();

                if ((CustomFilter != null) && (CustomFilter.IsEnabled())) {
                    this.ProcessGenericCustomFiltered(
                        CustomFilter: CustomFilter,
                        GenericText: RawData
                    );
                }
            }

            /** Data Extractors -------------------------------------------- **/
            if (
                MacroscopePreferencesManager.GetDataExtractorsEnable()
                && MacroscopePreferencesManager.GetDataExtractorsApplyToText()) {
                this.ProcessGenericDataExtractors(GenericText: RawData);
            }
        }

        /** Process Text Document ------------------------------------------ **/
        // TextDoc is never null: it is either the initial empty list or the
        // list built from the split lines.
        if (TextDoc.Count > 0) {
            this.SetDocumentText(Text: string.Join(Environment.NewLine, TextDoc));

            // A URL path is an identifier, not linguistic text, so compare it
            // ordinally; culture-sensitive comparison can ignore certain
            // Unicode characters and match paths that merely look similar.
            if (this.GetPath().EndsWith("robots.txt", StringComparison.OrdinalIgnoreCase)) {
                long?TextSize = this.GetContentLength();

                this.ProcessRobotsTextOutlinks(TextDoc: TextDoc);

                if (this.DetectSitemapTextDocument(TextDoc: TextDoc)) {
                    DebugMsg(string.Format("ProcessTextPage: {0} :: {1}", "SITEMAP DETECTED", this.GetUrl()));
                    this.SetDocumentType(Type: MacroscopeConstants.DocumentType.SITEMAPTEXT);
                    this.ProcessSitemapTextOutlinks(TextDoc: TextDoc);
                }

                // A null TextSize compares false, so no remark is added.
                if (TextSize > RobotsMaxTextSize) {
                    this.AddRemark("ROBOTS_TOO_BIG", "Robots.txt is larger than 512KB");
                }
            } else if (this.GetIsInternal()) {
                this.ProcessPureTextOutlinks(TextDoc: TextDoc, LinkType: MacroscopeConstants.InOutLinkType.PURETEXT);
            }
        } else {
            this.SetDocumentText(Text: "");
        }
    }

    if (ResponseErrorCondition != null) {
        this.ProcessErrorCondition(ResponseErrorCondition);
    }
}
/** -------------------------------------------------------------------- **/
/// <summary>
/// Downloads this document's URI as a stylesheet, parses it with
/// <c>StylesheetParser</c> and processes its CSS outlinks, runs the optional
/// custom filters and data extractors over the raw text, and sets the document
/// title from the text after the final '/' in <c>DocUrl</c>.
/// </summary>
/// <remarks>
/// Exceptions thrown by the HTTP request are caught here: the status code is set
/// to <c>BadRequest</c>, a remark is added, and the message is passed to
/// <c>ProcessErrorCondition</c> at the end of the method.
/// NOTE(review): the XML, text and JavaScript counterparts of this method call
/// <c>SetChecksum</c> on the body; this one does not. The parse-failure remark
/// is also keyed "ProcessHtmlAttributeCssLinks", which does not match this
/// method's name. Confirm both are intentional.
/// </remarks>
private async Task _ProcessCssPage() {
    MacroscopeHttpTwoClient HttpTwo = this.DocCollection.GetJobMaster().GetHttpClient();
    MacroscopeHttpTwoClientResponse Reply = null;
    string FailureText = null;

    DebugMsg(string.Format("ProcessCssPage: {0}", ""));

    try {
        Reply = await HttpTwo.Get(
            this.GetUri(),
            this.ConfigureCssPageRequestHeadersCallback,
            this.PostProcessRequestHttpHeadersCallback
        );
    } catch (Exception ex) {
        // Both exception kinds are handled identically; only the log label differs.
        string LogTemplate = (ex is MacroscopeDocumentException)
            ? "_ProcessCssPage :: MacroscopeDocumentException: {0}"
            : "_ProcessCssPage :: Exception: {0}";

        this.DebugMsg(string.Format(LogTemplate, ex.Message));
        FailureText = ex.Message;
        this.SetStatusCode(HttpStatusCode.BadRequest);
        this.AddRemark("_ProcessCssPage", ex.Message);
    }

    if (Reply != null) {
        string Body = "";

        this.ProcessResponseHttpHeaders(Response: Reply);

        /** Get Response Body ---------------------------------------------- **/
        try {
            DebugMsg(string.Format("MIME TYPE: {0}", this.MimeType));
            Body = Reply.GetContentAsString();
            this.SetContentLength(Length: Body.Length); // May need to find bytes length
            this.SetWasDownloaded(true);
        } catch (Exception ex) {
            // Body is deliberately left as-is here (it may be null if the
            // client returned null); the IsNullOrEmpty check below covers that.
            DebugMsg(string.Format("Exception: {0}", ex.Message));
            this.SetStatusCode(HttpStatusCode.Ambiguous);
            this.SetContentLength(Length: 0);
        }

        if (string.IsNullOrEmpty(Body)) {
            DebugMsg(string.Format("ProcessCssPage: ERROR: {0}", this.GetUrl()));
        } else {
            /** Parse Stylesheet ------------------------------------------- **/
            try {
                StylesheetParser Parser = new StylesheetParser();
                Stylesheet Parsed = Parser.Parse(Body);

                this.ProcessCssOutlinks(CssStylesheet: Parsed);
            } catch (Exception ex) {
                this.DebugMsg(string.Format("ProcessHtmlAttributeCssLinks: {0}", ex.Message));
                this.AddRemark("ProcessHtmlAttributeCssLinks", ex.Message);
            }

            /** Custom Filters --------------------------------------------- **/
            bool FiltersWanted =
                MacroscopePreferencesManager.GetCustomFiltersEnable()
                && MacroscopePreferencesManager.GetCustomFiltersApplyToCss();

            if (FiltersWanted) {
                MacroscopeCustomFilters Filter = this.DocCollection.GetJobMaster().GetCustomFilter();

                if ((Filter != null) && Filter.IsEnabled()) {
                    this.ProcessGenericCustomFiltered(
                        CustomFilter: Filter,
                        GenericText: Body
                    );
                }
            }

            /** Data Extractors -------------------------------------------- **/
            bool ExtractorsWanted =
                MacroscopePreferencesManager.GetDataExtractorsEnable()
                && MacroscopePreferencesManager.GetDataExtractorsApplyToCss();

            if (ExtractorsWanted) {
                this.ProcessGenericDataExtractors(GenericText: Body);
            }
        }

        /** Title ---------------------------------------------------------- **/
        // The title is whatever follows the final '/' of DocUrl; it is set
        // even when the body is empty.
        string TrailingSegment = null;

        for (Match Hit = Regex.Match(this.DocUrl, "/([^/]+)$"); Hit.Success; Hit = Hit.NextMatch()) {
            string Captured = Hit.Groups[1].Value;

            if (Captured.Length > 0) {
                TrailingSegment = Captured;
                break;
            }
        }

        if (TrailingSegment == null) {
            DebugMsg(string.Format("TITLE: {0}", "MISSING"));
        } else {
            this.SetTitle(TrailingSegment, MacroscopeConstants.TextProcessingMode.NO_PROCESSING);
            DebugMsg(string.Format("TITLE: {0}", this.GetTitle()));
        }
    }

    if (FailureText != null) {
        this.ProcessErrorCondition(FailureText);
    }
}