/**************************************************************************/
/// <summary>
/// Sends a HEAD request to <c>this.Url</c> and reports whether it answered
/// with HTTP 200 (OK). On an OK answer the response headers are handed to
/// <c>ProcessResponseHttpHeaders</c>.
/// </summary>
/// <returns>
/// <c>true</c> when the status code was <c>HttpStatusCode.OK</c>; otherwise
/// <c>false</c>. A <c>UriFormatException</c> or <c>WebException</c> is logged
/// and does not propagate.
/// </returns>
private Boolean Check()
{
    // TODO: Increase level of detail here.
    Boolean available = false;

    try
    {
        HttpWebRequest request = WebRequest.CreateHttp(this.Url);

        request.Method = "HEAD";
        request.Timeout = 10000; // milliseconds
        request.KeepAlive = false;
        request.Host = MacroscopeUrlUtils.GetHostnameAndPortFromUrl(this.Url);
        request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

        MacroscopePreferencesManager.EnableHttpProxy(request);

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            DebugMsg(string.Format("MacroscopeHrefLang Status: {0}", response.StatusCode));

            // The flag is set before the headers are processed, so a handled
            // exception raised by header processing still leaves it set.
            available = (response.StatusCode == HttpStatusCode.OK);

            if (available)
            {
                this.ProcessResponseHttpHeaders(req: request, res: response);
            }

            response.Close();
        }
    }
    catch (UriFormatException ex)
    {
        DebugMsg(string.Format("MacroscopeHrefLang UriFormatException: {0}", ex.Message));
    }
    catch (WebException ex)
    {
        DebugMsg(string.Format("MacroscopeHrefLang WebException: {0}", ex.Message));
    }

    return available;
}
/**************************************************************************/
/// <summary>
/// Issues a HEAD request to <paramref name="Url"/> without following
/// redirects and returns the value of the Content-Type response header.
/// </summary>
/// <param name="Url">Absolute URL to probe.</param>
/// <returns>
/// The raw Content-Type header value, or <c>null</c> when the request fails
/// with one of the handled exceptions (which are logged, not rethrown).
/// </returns>
public static string GetMimeTypeOfUrl(string Url)
{
    string MimeType = null;

    try
    {
        HttpWebRequest req = WebRequest.CreateHttp(Url);

        req.Method = "HEAD";
        // NOTE(review): presumably the preference is in seconds and is converted to milliseconds here - confirm.
        req.Timeout = MacroscopePreferencesManager.GetRequestTimeout() * 1000;
        req.KeepAlive = false;
        req.AllowAutoRedirect = false;
        req.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

        MacroscopePreferencesManager.EnableHttpProxy(req);

        // The using block releases the response even if reading the header throws.
        using (HttpWebResponse res = (HttpWebResponse)req.GetResponse())
        {
            MimeType = res.Headers[HttpResponseHeader.ContentType];
        }
    }
    // NOTE(review): the log prefixes below say "ExecuteHeadRequest" rather than
    // this method's name; they are left unchanged.
    catch (UriFormatException ex)
    {
        DebugMsg(string.Format("ExecuteHeadRequest :: UriFormatException: {0}", ex.Message), true);
    }
    catch (TimeoutException ex)
    {
        DebugMsg(string.Format("ExecuteHeadRequest :: TimeoutException: {0}", ex.Message), true);
    }
    catch (WebException ex)
    {
        DebugMsg(string.Format("ExecuteHeadRequest :: WebException: {0}", ex.Message), true);

        // A protocol error carries the server's response; close it so the
        // underlying connection is released.
        if (ex.Response != null)
        {
            ex.Response.Close();
        }
    }

    return MimeType;
}
/**************************************************************************/
/// <summary>
/// Downloads <paramref name="Url"/> with a GET request and returns the whole
/// response body as an in-memory stream.
/// </summary>
/// <param name="Url">Absolute URL to download.</param>
/// <returns>
/// A <see cref="MemoryStream"/> over the downloaded bytes, or <c>null</c> when
/// the request failed with a handled exception, the body could not be read
/// because of a <c>WebException</c>, or the body was empty.
/// </returns>
public MemoryStream LoadMemoryStreamFromUrl(string Url)
{
    HttpWebResponse res = null;
    MemoryStream msStream = null;

    try
    {
        HttpWebRequest req = WebRequest.CreateHttp(Url);

        req.Method = "GET";
        // NOTE(review): hard-coded 1000 ms timeout - confirm this is intended.
        req.Timeout = 1000;
        req.KeepAlive = false;
        req.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

        MacroscopePreferencesManager.EnableHttpProxy(req);

        res = (HttpWebResponse)req.GetResponse();
    }
    catch (UriFormatException ex)
    {
        DebugMsg(string.Format("LoadFromUrl :: UriFormatException: {0}", ex.Message));
    }
    catch (WebException ex)
    {
        this.DebugMsg(string.Format("LoadFromUrl :: WebException: {0}", ex.Message));
        this.DebugMsg(string.Format("LoadFromUrl :: WebException: {0}", Url));

        // Close the error response attached to the exception, if any.
        if (ex.Response != null)
        {
            ex.Response.Close();
        }
    }

    if (res == null)
    {
        return msStream;
    }

    try
    {
        using (Stream sStream = res.GetResponseStream())
        using (MemoryStream msBuffer = new MemoryStream())
        {
            // Buffered copy instead of one ReadByte() call per byte.
            sStream.CopyTo(msBuffer);

            if (msBuffer.Length > 0)
            {
                // Hand back a fresh stream positioned at the start of the data.
                msStream = new MemoryStream(msBuffer.ToArray());
            }
        }
    }
    catch (WebException ex)
    {
        this.DebugMsg(string.Format("LoadFromUrl :: WebException: {0}", ex.Message));
    }
    finally
    {
        // Runs for every exit path, including exceptions that are not caught here.
        res.Close();
        res.Dispose();
    }

    return msStream;
}
/**************************************************************************/
/// <summary>
/// Fetches the JavaScript document at <c>DocUrl</c> with a GET request,
/// records its response headers, body length and checksum, optionally runs
/// the custom filters and data extractors over the body, and sets the title
/// from the text following the last '/' of <c>DocUrl</c>.
/// </summary>
/// <remarks>
/// A failure of the initial request is logged and passed to
/// <c>ProcessErrorCondition</c> at the end. The response is released in a
/// <c>finally</c> block so it is closed even when a processing step throws.
/// </remarks>
private void ProcessJavascriptPage()
{
    HttpWebRequest req = null;
    HttpWebResponse res = null;
    string ResponseErrorCondition = null;
    Boolean IsAuthenticating = false;

    try
    {
        req = WebRequest.CreateHttp(this.DocUrl);

        req.Method = "GET";
        req.Timeout = this.Timeout;
        req.KeepAlive = false;
        req.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

        this.PrepareRequestHttpHeaders(req: req);

        IsAuthenticating = this.AuthenticateRequest(req);

        MacroscopePreferencesManager.EnableHttpProxy(req);

        res = (HttpWebResponse)req.GetResponse();
    }
    catch (UriFormatException ex)
    {
        DebugMsg(string.Format("ProcessJavascriptPage :: UriFormatException: {0}", ex.Message));
        ResponseErrorCondition = ex.Message;
    }
    catch (TimeoutException ex)
    {
        DebugMsg(string.Format("ProcessJavascriptPage :: TimeoutException: {0}", ex.Message));
        ResponseErrorCondition = ex.Message;
    }
    catch (WebException ex)
    {
        DebugMsg(string.Format("ProcessJavascriptPage :: WebException: {0}", ex.Message));
        DebugMsg(string.Format("ProcessJavascriptPage :: WebException: {0}", ex.Status));
        DebugMsg(string.Format("ProcessJavascriptPage :: WebException: {0}", (int)ex.Status));
        ResponseErrorCondition = ex.Status.ToString();

        // Close the error response attached to the exception, if any.
        if (ex.Response != null)
        {
            ex.Response.Close();
        }
    }

    if (res != null)
    {
        try
        {
            string RawData = "";

            this.ProcessResponseHttpHeaders(req, res);

            /** ---------------------------------------------------------------- **/

            if (IsAuthenticating)
            {
                this.VerifyOrPurgeCredential();
            }

            /** Get Response Body ---------------------------------------------- **/

            try
            {
                DebugMsg(string.Format("MIME TYPE: {0}", this.MimeType));

                // Use the encoding already known for the document; otherwise sniff it.
                Encoding encUseEncoding = this.GetCharacterEncoding() ?? this.JavascriptSniffCharset();

                using (Stream ResponseStream = res.GetResponseStream())
                using (StreamReader ResponseStreamReader = new StreamReader(ResponseStream, encUseEncoding))
                {
                    RawData = ResponseStreamReader.ReadToEnd();
                }

                this.ContentLength = RawData.Length; // May need to find bytes length
                this.SetChecksum(RawData);
            }
            catch (WebException ex)
            {
                DebugMsg(string.Format("WebException: {0}", ex.Message));

                if (ex.Response != null)
                {
                    this.SetStatusCode(((HttpWebResponse)ex.Response).StatusCode);
                }
                else
                {
                    // NOTE(review): this casts a WebExceptionStatus value to HttpStatusCode - confirm intended.
                    this.SetStatusCode((HttpStatusCode)ex.Status);
                }

                RawData = "";
                this.ContentLength = 0;
            }
            catch (Exception ex)
            {
                DebugMsg(string.Format("Exception: {0}", ex.Message));
                this.SetStatusCode(HttpStatusCode.BadRequest);
                RawData = "";
                this.ContentLength = 0;
            }

            /** Custom Filters ------------------------------------------------- **/

            if (!string.IsNullOrEmpty(RawData))
            {
                if (MacroscopePreferencesManager.GetCustomFiltersEnable()
                    && MacroscopePreferencesManager.GetCustomFiltersApplyToJavascripts())
                {
                    MacroscopeCustomFilters CustomFilter = this.DocCollection.GetJobMaster().GetCustomFilter();

                    if ((CustomFilter != null) && (CustomFilter.IsEnabled()))
                    {
                        this.ProcessGenericCustomFiltered(
                            CustomFilter: CustomFilter,
                            GenericText: RawData
                        );
                    }
                }
            }

            /** Data Extractors ------------------------------------------------ **/

            if (!string.IsNullOrEmpty(RawData))
            {
                if (MacroscopePreferencesManager.GetDataExtractorsEnable()
                    && MacroscopePreferencesManager.GetDataExtractorsApplyToJavascripts())
                {
                    this.ProcessGenericDataExtractors(GenericText: RawData);
                }
            }

            /** Title ---------------------------------------------------------- **/

            {
                // The pattern is anchored at the end, so a single Match is sufficient.
                Match TitleMatch = Regex.Match(this.DocUrl, "/([^/]+)$");
                string DocumentTitle = null;

                if (TitleMatch.Success && (TitleMatch.Groups[1].Value.Length > 0))
                {
                    DocumentTitle = TitleMatch.Groups[1].Value;
                }

                if (DocumentTitle != null)
                {
                    this.SetTitle(DocumentTitle, MacroscopeConstants.TextProcessingMode.NO_PROCESSING);
                    DebugMsg(string.Format("TITLE: {0}", this.GetTitle()));
                }
                else
                {
                    DebugMsg(string.Format("TITLE: {0}", "MISSING"));
                }
            }
        }
        finally
        {
            /** ---------------------------------------------------------------- **/
            // Release the response on every exit path.
            res.Close();
            res.Dispose();
        }
    }

    if (ResponseErrorCondition != null)
    {
        this.ProcessErrorCondition(ResponseErrorCondition);
    }
}
/**************************************************************************/
/// <summary>
/// Probes the image at <c>DocUrl</c> with a HEAD request, records its
/// response headers and sets the title from the text following the last '/'
/// of <c>DocUrl</c>. No response body is read.
/// </summary>
/// <remarks>
/// A failure of the request is logged and stored in <c>ErrorCondition</c>.
/// The response is released in a <c>finally</c> block so it is closed even
/// when a processing step throws.
/// </remarks>
private void ProcessImagePage()
{
    HttpWebRequest req = null;
    HttpWebResponse res = null;
    string ResponseErrorCondition = null;
    Boolean IsAuthenticating = false;

    try
    {
        req = WebRequest.CreateHttp(this.DocUrl);

        req.Method = "HEAD";
        req.Timeout = this.Timeout;
        req.KeepAlive = false;
        req.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

        this.PrepareRequestHttpHeaders(req: req);

        IsAuthenticating = this.AuthenticateRequest(req);

        MacroscopePreferencesManager.EnableHttpProxy(req);

        res = (HttpWebResponse)req.GetResponse();
    }
    catch (UriFormatException ex)
    {
        DebugMsg(string.Format("ProcessImagePage :: UriFormatException: {0}", ex.Message));
        ResponseErrorCondition = ex.Message;
    }
    catch (WebException ex)
    {
        DebugMsg(string.Format("ProcessImagePage :: WebException: {0}", ex.Message));
        DebugMsg(string.Format("ProcessImagePage :: WebException: {0}", ex.Status));
        DebugMsg(string.Format("ProcessImagePage :: WebException: {0}", (int)ex.Status));
        ResponseErrorCondition = ex.Status.ToString();

        // Close the error response attached to the exception, if any.
        if (ex.Response != null)
        {
            ex.Response.Close();
        }
    }

    if (res != null)
    {
        try
        {
            this.ProcessResponseHttpHeaders(req, res);

            if (IsAuthenticating)
            {
                this.VerifyOrPurgeCredential();
            }

            { // Title
                // The pattern is anchored at the end, so a single Match is sufficient.
                Match TitleMatch = Regex.Match(this.DocUrl, "/([^/]+)$");
                string DocumentTitle = null;

                if (TitleMatch.Success && (TitleMatch.Groups[1].Value.Length > 0))
                {
                    DocumentTitle = TitleMatch.Groups[1].Value;
                }

                if (DocumentTitle != null)
                {
                    this.SetTitle(DocumentTitle, MacroscopeConstants.TextProcessingMode.NO_PROCESSING);
                    DebugMsg(string.Format("TITLE: {0}", this.GetTitle()));
                }
                else
                {
                    DebugMsg(string.Format("TITLE: {0}", "MISSING"));
                }
            }
        }
        finally
        {
            // Release the response on every exit path.
            res.Close();
            res.Dispose();
        }
    }

    if (ResponseErrorCondition != null)
    {
        // NOTE(review): the other Process*Page methods in this file call
        // ProcessErrorCondition() here; confirm the direct assignment is intended.
        this.ErrorCondition = ResponseErrorCondition;
    }
}
/**************************************************************************/
/// <summary>
/// Fetches the plain-text document at <c>DocUrl</c> with a GET request,
/// records its response headers, body length and checksum, optionally runs
/// the custom filters and data extractors over the body, and then processes
/// the text as a robots.txt file and/or a text sitemap when it qualifies.
/// </summary>
/// <remarks>
/// The body is decoded as UTF-8. A failure of the initial request is logged
/// and passed to <c>ProcessErrorCondition</c> at the end. The response is
/// released in a <c>finally</c> block so it is closed even when a processing
/// step throws.
/// </remarks>
private void ProcessTextPage()
{
    List<string> TextDoc = new List<string>();
    HttpWebRequest req = null;
    HttpWebResponse res = null;
    string ResponseErrorCondition = null;
    Boolean IsAuthenticating = false;

    try
    {
        req = WebRequest.CreateHttp(this.DocUrl);

        req.Method = "GET";
        req.Timeout = this.Timeout;
        req.KeepAlive = false;
        req.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

        this.PrepareRequestHttpHeaders(req: req);

        IsAuthenticating = this.AuthenticateRequest(req);

        MacroscopePreferencesManager.EnableHttpProxy(req);

        res = (HttpWebResponse)req.GetResponse();
    }
    catch (UriFormatException ex)
    {
        DebugMsg(string.Format("ProcessTextPage :: UriFormatException: {0}", ex.Message));
        ResponseErrorCondition = ex.Message;
    }
    catch (WebException ex)
    {
        DebugMsg(string.Format("ProcessTextPage :: WebException: {0}", ex.Message));
        DebugMsg(string.Format("ProcessTextPage :: WebException: {0}", this.DocUrl));
        DebugMsg(string.Format("ProcessTextPage :: WebExceptionStatus: {0}", ex.Status));
        ResponseErrorCondition = ex.Status.ToString();

        // Close the error response attached to the exception, if any.
        if (ex.Response != null)
        {
            ex.Response.Close();
        }
    }

    if (res != null)
    {
        try
        {
            string RawData = "";

            this.ProcessResponseHttpHeaders(req, res);

            if (IsAuthenticating)
            {
                this.VerifyOrPurgeCredential();
            }

            // Get Response Body
            try
            {
                DebugMsg(string.Format("MIME TYPE: {0}", this.MimeType));

                using (Stream ResponseStream = res.GetResponseStream())
                using (StreamReader ResponseStreamReader = new StreamReader(ResponseStream, Encoding.UTF8)) // Assume UTF-8
                {
                    RawData = ResponseStreamReader.ReadToEnd();
                }

                this.ContentLength = RawData.Length; // May need to find bytes length
                this.SetWasDownloaded(true);
                this.SetChecksum(RawData);
            }
            catch (WebException ex)
            {
                DebugMsg(string.Format("WebException: {0}", ex.Message));

                if (ex.Response != null)
                {
                    this.SetStatusCode(((HttpWebResponse)ex.Response).StatusCode);
                }
                else
                {
                    // NOTE(review): this casts a WebExceptionStatus value to HttpStatusCode - confirm intended.
                    this.SetStatusCode((HttpStatusCode)ex.Status);
                }

                RawData = "";
                this.ContentLength = 0;
            }
            catch (Exception ex)
            {
                DebugMsg(string.Format("Exception: {0}", ex.Message));
                this.SetStatusCode(HttpStatusCode.BadRequest);
                RawData = "";
                this.ContentLength = 0;
            }

            /** ---------------------------------------------------------------- **/

            if (!string.IsNullOrEmpty(RawData))
            {
                // Split on runs of CR/LF so blank lines do not yield empty entries between lines.
                string[] Lines = Regex.Split(RawData, @"[\r\n]+");
                TextDoc = Lines.ToList();
                DebugMsg(string.Format("TextDoc: {0}", TextDoc.Count));
            }
            else
            {
                DebugMsg(string.Format("RawData: {0}", "EMPTY"));
            }

            /** Custom Filters ------------------------------------------------- **/

            if (!string.IsNullOrEmpty(RawData))
            {
                if (MacroscopePreferencesManager.GetCustomFiltersEnable()
                    && MacroscopePreferencesManager.GetCustomFiltersApplyToText())
                {
                    MacroscopeCustomFilters CustomFilter = this.DocCollection.GetJobMaster().GetCustomFilter();

                    if ((CustomFilter != null) && (CustomFilter.IsEnabled()))
                    {
                        this.ProcessGenericCustomFiltered(
                            CustomFilter: CustomFilter,
                            GenericText: RawData
                        );
                    }
                }
            }

            /** Data Extractors ------------------------------------------------ **/

            if (!string.IsNullOrEmpty(RawData))
            {
                if (MacroscopePreferencesManager.GetDataExtractorsEnable()
                    && MacroscopePreferencesManager.GetDataExtractorsApplyToText())
                {
                    this.ProcessGenericDataExtractors(GenericText: RawData);
                }
            }

            /** Process Text Document ------------------------------------------ **/

            if ((TextDoc != null) && (TextDoc.Count > 0))
            {
                if (this.GetPath().EndsWith("robots.txt", StringComparison.InvariantCultureIgnoreCase))
                {
                    this.ProcessRobotsTextOutlinks(TextDoc: TextDoc);
                }

                if (this.DetectSitemapTextDocument(TextDoc: TextDoc))
                {
                    DebugMsg(string.Format("ProcessTextPage: {0} :: {1}", "SITEMAP DETECTED", this.GetUrl()));
                    this.SetIsSitemapText();
                    this.ProcessSitemapTextOutlinks(TextDoc);
                }
            }
        }
        finally
        {
            /** ---------------------------------------------------------------- **/
            // Release the response on every exit path.
            res.Close();
            res.Dispose();
        }
    }

    if (ResponseErrorCondition != null)
    {
        this.ProcessErrorCondition(ResponseErrorCondition);
    }
}
/**************************************************************************/
/// <summary>
/// Fetches the PDF document at <c>DocUrl</c> with a GET request, records its
/// response headers, sets a placeholder locale and a self-referencing
/// canonical URL, downloads the body into memory, hands it to
/// <c>MacroscopePdfTools</c>, and sets the title from the value those tools
/// report.
/// </summary>
/// <remarks>
/// A failure of the initial request is logged and passed to
/// <c>ProcessErrorCondition</c> at the end. The response is released in a
/// <c>finally</c> block so it is closed even when a processing step throws.
/// </remarks>
private void ProcessPdfPage()
{
    HttpWebRequest req = null;
    HttpWebResponse res = null;
    string ResponseErrorCondition = null;
    Boolean Authenticating = false;

    try
    {
        req = WebRequest.CreateHttp(this.DocUrl);

        req.Method = "GET";
        req.Timeout = this.Timeout;
        req.KeepAlive = false;
        req.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

        this.PrepareRequestHttpHeaders(req: req);

        Authenticating = this.AuthenticateRequest(req);

        MacroscopePreferencesManager.EnableHttpProxy(req);

        res = (HttpWebResponse)req.GetResponse();
    }
    catch (UriFormatException ex)
    {
        DebugMsg(string.Format("ProcessPdfPage :: UriFormatException: {0}", ex.Message));
        ResponseErrorCondition = ex.Message;
    }
    catch (WebException ex)
    {
        DebugMsg(string.Format("ProcessPdfPage :: WebException: {0}", ex.Message));
        DebugMsg(string.Format("ProcessPdfPage :: WebException: {0}", ex.Status));
        DebugMsg(string.Format("ProcessPdfPage :: WebException: {0}", (int)ex.Status));
        ResponseErrorCondition = ex.Status.ToString();

        // Close the error response attached to the exception, if any.
        if (ex.Response != null)
        {
            ex.Response.Close();
        }
    }

    if (res != null)
    {
        try
        {
            MacroscopePdfTools pdfTools = null;

            this.ProcessResponseHttpHeaders(req, res);

            if (Authenticating)
            {
                this.VerifyOrPurgeCredential();
            }

            { // Probe Locale
                //this.Locale = "en"; // Implement locale probing
                this.Locale = "x-default"; // Implement locale probing
                this.SetHreflang(HrefLangLocale: this.Locale, Url: this.DocUrl);
            }

            { // Canonical
                this.Canonical = this.DocUrl;
                DebugMsg(string.Format("CANONICAL: {0}", this.Canonical));
            }

            { // Get Response Body
                try
                {
                    byte[] RawData;

                    using (Stream ResponseStream = res.GetResponseStream())
                    using (MemoryStream BodyBuffer = new MemoryStream())
                    {
                        // Buffered copy instead of one ReadByte() call per byte.
                        ResponseStream.CopyTo(BodyBuffer);
                        RawData = BodyBuffer.ToArray();
                    }

                    this.ContentLength = RawData.Length;

                    pdfTools = new MacroscopePdfTools(RawData);

                    if (pdfTools.GetHasError())
                    {
                        this.AddRemark(Observation: pdfTools.GetErrorMessage());
                    }

                    this.SetWasDownloaded(true);
                }
                catch (WebException ex)
                {
                    DebugMsg(string.Format("WebException: {0}", ex.Message));

                    if (ex.Response != null)
                    {
                        this.SetStatusCode(((HttpWebResponse)ex.Response).StatusCode);
                    }
                    else
                    {
                        // NOTE(review): this casts a WebExceptionStatus value to HttpStatusCode - confirm intended.
                        this.SetStatusCode((HttpStatusCode)ex.Status);
                    }

                    pdfTools = null;
                    this.ContentLength = 0;
                }
                catch (Exception ex)
                {
                    DebugMsg(string.Format("Exception: {0}", ex.Message));
                    this.SetStatusCode(HttpStatusCode.BadRequest);
                    pdfTools = null;
                    this.ContentLength = 0;
                }
            }

            /** Title ---------------------------------------------------------- **/

            {
                // pdfTools is null when the body could not be downloaded or parsed.
                if (pdfTools != null)
                {
                    string DocumentTitle = pdfTools.GetTitle();

                    if (DocumentTitle != null)
                    {
                        this.SetTitle(DocumentTitle, MacroscopeConstants.TextProcessingMode.NO_PROCESSING);
                        DebugMsg(string.Format("TITLE: {0}", this.GetTitle()));
                    }
                    else
                    {
                        DebugMsg(string.Format("TITLE: {0}", "MISSING"));
                    }
                }
            }
        }
        finally
        {
            /** ---------------------------------------------------------------- **/
            // Release the response on every exit path.
            res.Close();
            res.Dispose();
        }
    }

    if (ResponseErrorCondition != null)
    {
        this.ProcessErrorCondition(ResponseErrorCondition);
    }
}
/** Fetch Robots Text *****************************************************/
/// <summary>
/// Downloads the robots.txt document at <paramref name="RobotsUri"/> and
/// returns its text.
/// </summary>
/// <param name="RobotsUri">Location of the robots.txt file to fetch.</param>
/// <returns>
/// The response body, or an empty string when the hostname check fails, the
/// request fails, or the body cannot be read. When the request itself fails,
/// <paramref name="RobotsUri"/> is also recorded in <c>BadRobots</c>.
/// </returns>
private string FetchRobotTextFile(Uri RobotsUri)
{
    string robotsAddress = RobotsUri.ToString();

    if (!MacroscopeDnsTools.CheckValidHostname(Url: robotsAddress))
    {
        DebugMsg(string.Format("FetchRobotTextFile :: CheckValidHostname: {0}", "NOT OK"));
        return "";
    }

    HttpWebResponse response = null;
    Boolean requestSucceeded = false;

    try
    {
        HttpWebRequest request = WebRequest.CreateHttp(RobotsUri);

        request.Method = "GET";
        request.Timeout = MacroscopePreferencesManager.GetRequestTimeout() * 1000;
        request.KeepAlive = false;
        request.UserAgent = this.UserAgent();
        request.Host = RobotsUri.Host;
        request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

        MacroscopePreferencesManager.EnableHttpProxy(request);

        response = (HttpWebResponse)request.GetResponse();
        requestSucceeded = true;
    }
    catch (UriFormatException ex)
    {
        DebugMsg(string.Format("UriFormatException: {0}", ex.Message));
        DebugMsg(string.Format("Exception: {0}", robotsAddress));
    }
    catch (WebException ex)
    {
        DebugMsg(string.Format("WebException: {0}", ex.Message));
        DebugMsg(string.Format("WebException: {0}", robotsAddress));
        DebugMsg(string.Format("WebExceptionStatus: {0}", ex.Status));
    }
    catch (NotSupportedException ex)
    {
        DebugMsg(string.Format("NotSupportedException: {0}", ex.Message));
        DebugMsg(string.Format("NotSupportedException: {0}", robotsAddress));
    }
    catch (Exception ex)
    {
        DebugMsg(string.Format("Exception: {0}", ex.Message));
        DebugMsg(string.Format("Exception: {0}", robotsAddress));
    }

    // No usable response: remember this URI as bad and give up.
    if (!requestSucceeded || (response == null))
    {
        lock (this.BadRobots)
        {
            if (!this.BadRobots.ContainsKey(RobotsUri))
            {
                this.BadRobots.Add(RobotsUri, true);
            }
        }

        return "";
    }

    string body = "";

    try
    {
        Stream bodyStream = response.GetResponseStream();
        StreamReader bodyReader = new StreamReader(bodyStream);
        body = bodyReader.ReadToEnd();
    }
    catch (WebException ex)
    {
        DebugMsg(string.Format("FetchRobotTextFile: WebException: {0}", ex.Message));
        body = "";
    }
    catch (Exception ex)
    {
        DebugMsg(string.Format("FetchRobotTextFile: Exception: {0}", ex.Message));
        body = "";
    }

    response.Close();
    response.Dispose();

    return string.IsNullOrEmpty(body) ? "" : body;
}