/** -------------------------------------------------------------------- **/

/// <summary>
/// Normalises a URL according to the current preferences, adds it to the
/// named URL queue when the job history has not seen it yet, and then
/// passes the normalised URL to <c>AddToProgress</c> in every case.
/// </summary>
/// <param name="Url">The URL to enqueue.</param>
public void AddUrlQueueItem(string Url)
{
    string normalisedUrl = Url;

    // Optional normalisation steps, each controlled by a preference.
    if (MacroscopePreferencesManager.GetIgnoreQueries())
    {
        normalisedUrl = MacroscopeUrlUtils.StripQueryString(Url: normalisedUrl);
    }

    if (MacroscopePreferencesManager.GetIgnoreHashFragments())
    {
        normalisedUrl = MacroscopeUrlUtils.StripHashFragment(Url: normalisedUrl);
    }

    bool alreadySeen = this.JobHistory.SeenHistoryItem(Url: normalisedUrl);

    if (!alreadySeen)
    {
        try
        {
            this.NamedQueueJobItems.AddToNamedQueue(
                Name: MacroscopeConstants.NamedQueueUrlList,
                Item: new MacroscopeJobItem(Url: normalisedUrl)
            );
        }
        catch (MacroscopeNamedQueueException ex)
        {
            // Queue failures are only logged; the URL is still reported to progress below.
            this.DebugMsg(string.Format("AddUrlQueueItem: {0}", ex.Message));
        }
    }

    this.AddToProgress(Url: normalisedUrl);
}
// Verifies that relative image paths (plain, "../" and "../../") resolve
// against a page URL to the expected absolute URLs.
public void TestMakeUrlAbsoluteUrls()
{
    const string BaseUrl = "http://www.host.com/path/to/page/";
    const string Filename = "index.html";
    string pageUrl = string.Concat(BaseUrl, Filename);

    // Key: relative URL found on the page; value: expected absolute URL.
    Dictionary<string, string> expectations = new Dictionary<string, string>
    {
        {
            @"path/to/images/picture.gif",
            @"http://www.host.com/path/to/page/path/to/images/picture.gif"
        },
        {
            @"../path/to/images/picture.gif",
            @"http://www.host.com/path/to/path/to/images/picture.gif"
        },
        {
            @"../../path/to/images/picture.gif",
            @"http://www.host.com/path/path/to/images/picture.gif"
        }
    };

    foreach (KeyValuePair<string, string> expectation in expectations)
    {
        string resolved = MacroscopeUrlUtils.MakeUrlAbsolute(pageUrl, expectation.Key);

        Assert.AreEqual(expectation.Value, resolved, "DO NOT MATCH");
    }
}
/**************************************************************************/

/// <summary>
/// Cleans a CSS background-image value with <c>CleanUrlCss</c> and resolves
/// the result against this document's URL.
/// </summary>
/// <param name="BackgroundImageUrl">Raw CSS value to process.</param>
/// <returns>
/// The absolute URL, or null when cleaning yields null or when resolution
/// throws <c>MacroscopeUriFormatException</c>.
/// </returns>
private string ProcessCssBackImageUrl(string BackgroundImageUrl)
{
    string cleanedUrl = MacroscopeUrlUtils.CleanUrlCss(BackgroundImageUrl);

    if (cleanedUrl == null)
    {
        return null;
    }

    string absoluteUrl = null;

    try
    {
        absoluteUrl = MacroscopeUrlUtils.MakeUrlAbsolute(
            BaseUrl: this.DocUrl,
            Url: cleanedUrl
        );
    }
    catch (MacroscopeUriFormatException ex)
    {
        // Resolution failure is logged only; absoluteUrl stays null.
        DebugMsg(string.Format("ProcessCssBackImageUrl: {0}", ex.Message));
    }

    DebugMsg(string.Format("ProcessCssBackImageUrl: {0}", cleanedUrl));
    DebugMsg(string.Format("ProcessCssBackImageUrl: this.DocUrl: {0}", this.DocUrl));
    DebugMsg(string.Format("ProcessCssBackImageUrl: LinkUrlAbs: {0}", absoluteUrl));

    return absoluteUrl;
}
// Verifies the three-argument MakeUrlAbsolute overload, which takes a
// base href in addition to the base URL.
public void TestMakeUrlAbsoluteUrlsWithBaseHref()
{
    /*
     * Columns of each row:
     * [0] Base HREF
     * [1] Base URL
     * [2] Page URL
     * [3] Expected absolute URL
     */
    string[][] cases =
    {
        new[]
        {
            "http://www.host.com/BASEHREF/index.html",
            "http://www.host.com/path/to/page/",
            "http://www.host.com/path/to/page/to/pages/index.html",
            "http://www.host.com/path/to/page/to/pages/index.html"
        },
        new[]
        {
            "http://www.host.com/BASEHREF/index.html",
            "http://www.host.com/path/to/page/",
            "path/to/pages/index.html",
            "http://www.host.com/BASEHREF/path/to/pages/index.html"
        },
        new[]
        {
            "http://www.host.com/BASEHREF/index.html",
            "http://www.host.com/path/to/page/",
            "../path/to/pages/index.html",
            "http://www.host.com/path/to/pages/index.html"
        },
        new[]
        {
            "http://www.host.com/BASEHREF/index.html",
            "http://www.host.com/path/to/page/",
            "../../path/to/pages/index.html",
            "http://www.host.com/path/to/pages/index.html"
        }
    };

    foreach (string[] row in cases)
    {
        string expected = row[3];

        string resolved = MacroscopeUrlUtils.MakeUrlAbsolute(
            BaseHref: row[0],
            BaseUrl: row[1],
            Url: row[2]
        );

        Assert.AreEqual(expected, resolved, "DO NOT MATCH");
    }
}
/**************************************************************************/

/// <summary>
/// Reports whether the MIME type returned by <c>GetMimeTypeOfUrl</c> for the
/// URL is <c>application/pdf</c> (case-insensitive).
/// </summary>
/// <param name="Url">The URL to classify.</param>
/// <returns>
/// True when a non-empty MIME type is returned and it matches
/// <c>application/pdf</c>; otherwise false.
/// </returns>
public Boolean IsPdfUrl(string Url)
{
    string mimeType = MacroscopeUrlUtils.GetMimeTypeOfUrl(Url: Url);

    if (string.IsNullOrEmpty(mimeType))
    {
        return false;
    }

    return Regex.IsMatch(mimeType, "^application/pdf$", RegexOptions.IgnoreCase);
}
/**************************************************************************/

/// <summary>
/// Sends a HEAD request to <c>this.Url</c> and reports whether the server
/// answered with HTTP 200 OK. On success the response headers are handed to
/// <c>ProcessResponseHttpHeaders</c>.
/// </summary>
/// <returns>
/// True only when a response with status code OK was received; false for any
/// other status, or when a <see cref="UriFormatException"/> or
/// <see cref="WebException"/> is caught.
/// </returns>
private Boolean Check()
{
    // TODO: Increase level of detail here.

    Boolean IsAvailableCheck = false;

    try
    {
        HttpWebRequest req = WebRequest.CreateHttp(this.Url);

        req.Method = "HEAD";
        req.Timeout = 10000;
        req.KeepAlive = false;
        req.Host = MacroscopeUrlUtils.GetHostnameAndPortFromUrl(this.Url);
        req.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

        MacroscopePreferencesManager.EnableHttpProxy(req);

        // The using statement disposes (and thereby closes) the response,
        // so no explicit Close() call is needed inside the block.
        using (HttpWebResponse res = ( HttpWebResponse )req.GetResponse())
        {
            DebugMsg(string.Format("MacroscopeHrefLang Status: {0}", res.StatusCode));

            if (res.StatusCode == HttpStatusCode.OK)
            {
                IsAvailableCheck = true;
                this.ProcessResponseHttpHeaders(req: req, res: res);
            }
        }
    }
    catch (UriFormatException ex)
    {
        DebugMsg(string.Format("MacroscopeHrefLang UriFormatException: {0}", ex.Message));
    }
    catch (WebException ex)
    {
        DebugMsg(string.Format("MacroscopeHrefLang WebException: {0}", ex.Message));

        // A protocol error (e.g. 404) carries a live response object on the
        // exception; close it so the underlying connection is released.
        if (ex.Response != null)
        {
            ex.Response.Close();
        }
    }

    return IsAvailableCheck;
}
/** Execute Job ***********************************************************/

/// <summary>
/// Runs the crawl job: sanitises the start URL, registers it with the
/// document collection and allowed hosts, seeds the URL queue when
/// <c>PeekUrlQueue</c> returns false, probes robots and crawl delay, spawns
/// the workers, and finally notifies the task controller (if any).
/// </summary>
/// <returns>Always true.</returns>
public Boolean Execute()
{
    string startMessage = string.Format("Start URL: {0}", this.StartUrl);
    DebugMsg(startMessage);

    this.StartUrl = MacroscopeUrlUtils.SanitizeUrl(Url: this.StartUrl);

    this.DocCollection.SetStartUrl(Url: this.StartUrl);

    this.DetermineStartingDirectory();

    this.SetThreadsStop(Stopped: false);

    this.AllowedHosts.AddFromUrl(Url: this.StartUrl);

    // Seed the queue only when PeekUrlQueue() reports false
    // (presumably: nothing is queued yet - confirm against PeekUrlQueue).
    bool peeked = this.PeekUrlQueue();

    if (peeked == false)
    {
        string robotsTxtUrl = MacroscopeRobots.GenerateRobotUrl(Url: this.StartUrl);

        if (string.IsNullOrEmpty(robotsTxtUrl) == false)
        {
            this.AddUrlQueueItem(Url: robotsTxtUrl);
        }

        this.IncludeExcludeUrls.AddExplicitIncludeUrl(Url: this.StartUrl);

        this.AddUrlQueueItem(Url: this.StartUrl);
    }

    this.ProbeRobotsFile(Url: this.StartUrl);

    this.SetCrawlDelay(Url: this.StartUrl);

    this.SpawnWorkers();

    string foundMessage = string.Format("Pages Found: {0}", this.GetPagesFound());
    DebugMsg(foundMessage);

    if (this.TaskController != null)
    {
        this.TaskController.ICallbackScanComplete();
    }

    this.AddUpdateDisplayQueue(Url: this.StartUrl);

    return true;
}
// Verifies MacroscopeUrlUtils.ValidateUrl against URLs paired with the
// validity flag each one is expected to produce.
public void TestValidateUrls()
{
    // Key: candidate URL; value: expected ValidateUrl result.
    Dictionary<string, Boolean> expectations = new Dictionary<string, Boolean>
    {
        { "http://www.host.com/", true },
        { "http://www.host.com/index.html", true },
        { "http://www.host.com/path/path/to/images/picture.gif", true },
        { "http://www.host.com/??", true },
        { "http://www.host.com/ ", true },
        { "http:// www.host.com/", false }
    };

    foreach (KeyValuePair<string, Boolean> expectation in expectations)
    {
        string candidate = expectation.Key;
        Boolean actual = MacroscopeUrlUtils.ValidateUrl(candidate);

        Assert.AreEqual(expectation.Value, actual, string.Format("NOT VALID: {0}", candidate));
    }
}
// Verifies that MacroscopeUrlUtils.StripHashFragment removes a trailing
// "#fragment" while leaving the path and any query string untouched.
public void TestStripHashFragment()
{
    // Key: input URL; value: expected URL after stripping.
    Dictionary<string, string> expectations = new Dictionary<string, string>
    {
        {
            "http://www.host.com/#aberdeen-angus",
            "http://www.host.com/"
        },
        {
            "http://www.host.com/product/list/#boris",
            "http://www.host.com/product/list/"
        },
        {
            "http://www.host.com/product/list/index.html#boris",
            "http://www.host.com/product/list/index.html"
        },
        {
            "http://www.host.com/?key1=value1&key2=value2&key3=value3",
            "http://www.host.com/?key1=value1&key2=value2&key3=value3"
        },
        {
            "http://www.host.com/?key1=value1&key2=value2&key3=value3#gonzo",
            "http://www.host.com/?key1=value1&key2=value2&key3=value3"
        },
        {
            "http://www.host.com/index.html?key1=value1&key2=value2&key3=value3#gonzo",
            "http://www.host.com/index.html?key1=value1&key2=value2&key3=value3"
        }
    };

    foreach (KeyValuePair<string, string> expectation in expectations)
    {
        string input = expectation.Key;
        string stripped = MacroscopeUrlUtils.StripHashFragment(input);

        Assert.AreEqual(expectation.Value, stripped, string.Format("NOT VALID: {0}", input));
    }
}
/** -------------------------------------------------------------------- **/

/// <summary>
/// Normalises a URL with the same preference-driven steps used when
/// enqueueing, then asks the named URL queue to forget the matching item.
/// </summary>
/// <param name="Url">The URL whose queue item should be forgotten.</param>
public void ForgetUrlQueueItem(string Url)
{
    string normalisedUrl = Url;

    if (MacroscopePreferencesManager.GetIgnoreQueries())
    {
        normalisedUrl = MacroscopeUrlUtils.StripQueryString(Url: normalisedUrl);
    }

    if (MacroscopePreferencesManager.GetIgnoreHashFragments())
    {
        normalisedUrl = MacroscopeUrlUtils.StripHashFragment(Url: normalisedUrl);
    }

    this.NamedQueueJobItems.ForgetNamedQueueItem(
        Name: MacroscopeConstants.NamedQueueUrlList,
        Item: new MacroscopeJobItem(Url: normalisedUrl)
    );
}
/** -------------------------------------------------------------------- **/

/// <summary>
/// Appends to <paramref name="SitemapText"/> every outbound hyperlink of
/// <paramref name="msDoc"/> whose path ends in ".pdf", that has not been
/// handled before, is on an allowed host, and passes
/// <c>VerifySameHost</c> against the document's own URL.
/// </summary>
/// <param name="msDoc">Document whose outbound hyperlinks are scanned.</param>
/// <param name="SitemapText">List that receives the qualifying URLs.</param>
/// <param name="Dedupe">
/// URLs already considered. Every non-empty target URL seen here is added,
/// whether or not it turns out to be a PDF.
/// </param>
private void GenerateTextSitemapPdfEntries(
    MacroscopeDocument msDoc,
    List<string> SitemapText,
    Dictionary<string, Boolean> Dedupe
)
{
    foreach (MacroscopeHyperlinkOut HyperlinkOut in msDoc.IterateHyperlinksOut())
    {
        string Url = HyperlinkOut.GetTargetUrl();
        Uri UrlParsed;

        // A link without a target cannot be a sitemap entry.
        if (string.IsNullOrEmpty(Url))
        {
            continue;
        }

        // Dedupe before parsing so repeated links cost only a lookup.
        if (Dedupe.ContainsKey(Url))
        {
            continue;
        }

        Dedupe.Add(Url, true);

        // Skip malformed or relative targets instead of letting one bad
        // link throw and abort the whole sitemap.
        if (!Uri.TryCreate(Url, UriKind.Absolute, out UrlParsed))
        {
            continue;
        }

        // The comparison is already case-insensitive; no lower-casing needed.
        if (!UrlParsed.AbsolutePath.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        if (!this.DocCollection.GetAllowedHosts().IsAllowedFromUrl(Url: Url))
        {
            continue;
        }

        if (!MacroscopeUrlUtils.VerifySameHost(BaseUrl: msDoc.GetUrl(), Url: Url))
        {
            continue;
        }

        SitemapText.Add(Url);
    }
}
/**************************************************************************/

/*
 *
 * Reference: https://www.w3.org/TR/html5/document-metadata.html#the-base-element
 *
 */

/// <summary>
/// Resolves <paramref name="Url"/> to an absolute URL, taking an optional
/// base href into account. When a base href is given it is first resolved
/// against <paramref name="BaseUrl"/>, and the result becomes the base for
/// resolving <paramref name="Url"/>. Otherwise <paramref name="Url"/> is
/// resolved directly against <paramref name="BaseUrl"/>.
/// </summary>
/// <param name="BaseHref">Base href value; may be null or empty.</param>
/// <param name="BaseUrl">URL of the containing document.</param>
/// <param name="Url">The URL to resolve.</param>
/// <returns>The value returned by the two-argument overload.</returns>
public static string MakeUrlAbsolute(
    string BaseHref,
    string BaseUrl,
    string Url
)
{
    // No base href: plain resolution against the document URL.
    if (string.IsNullOrEmpty(value: BaseHref))
    {
        return MacroscopeUrlUtils.MakeUrlAbsolute(
            BaseUrl: BaseUrl,
            Url: Url
        );
    }

    string resolvedBaseHref = MacroscopeUrlUtils.MakeUrlAbsolute(
        BaseUrl: BaseUrl,
        Url: BaseHref
    );

    DebugMsg(string.Format("BASEHREF: {0}", BaseHref), true);
    DebugMsg(string.Format("ABSOLUTEBASEHREF: {0}", resolvedBaseHref), true);

    string resolvedUrl = MacroscopeUrlUtils.MakeUrlAbsolute(
        BaseUrl: resolvedBaseHref,
        Url: Url
    );

    DebugMsg(string.Format("URL: {0}", Url), true);
    DebugMsg(string.Format("URLFIXED: {0}", resolvedUrl), true);

    return resolvedUrl;
}
// Verifies that MacroscopeUrlUtils.CleanUrlCss extracts the first url(...)
// target from a CSS declaration, and yields null for "none" values.
public void TestCleanUrlCss()
{
    // Key: CSS declaration; value: expected extracted URL (null when none).
    Dictionary<string, string> PropertiesTable = new Dictionary<string, string>
    {
        {
            "background-image:none;",
            null
        },
        {
            "background: #0b7bee url(none) no-repeat center center/cover;",
            null
        },
        {
            "background: #0b7bee url(images/video-bg.jpg) no-repeat center center/cover;",
            "images/video-bg.jpg"
        },
        {
            "background: #0b7bee url(\"images/video-bg.jpg\") no-repeat center center/cover;",
            "images/video-bg.jpg"
        },
        {
            "src: url(\"fonts/company/latin-e-bold-eot.eot\");",
            "fonts/company/latin-e-bold-eot.eot"
        },
        {
            "src: url(\"fonts/company/latin-e-bold-eot.eot?#iefix\") format(\"embedded-opentype\"),url(\"fonts/company/latin-e-bold-woff.woff\") format(\"woff\"),url(\"fonts/company/latin-e-bold-ttf.ttf\") format(\"truetype\");",
            "fonts/company/latin-e-bold-eot.eot?#iefix"
        },
        {
            "background: #ffffff url(images/services/features-background.png) no-repeat left bottom;",
            "images/services/features-background.png"
        },
        {
            "background: transparent url(\"images/home/mouse.png\") no-repeat 90% top;",
            "images/home/mouse.png"
        },
        {
            "background: #0b7bee url(images/services/features-background_hover.png) no-repeat left bottom;",
            "images/services/features-background_hover.png"
        },
        {
            "background-image: url(\"images/global/page-head-trans.png\");",
            "images/global/page-head-trans.png"
        },
        {
            "background-image: url(\"images/heroes/hero.jpg\");",
            "images/heroes/hero.jpg"
        }
    };

    foreach (KeyValuePair<string, string> Entry in PropertiesTable)
    {
        string PropertyKey = Entry.Key;
        string Cleaned = MacroscopeUrlUtils.CleanUrlCss(PropertyKey);

        // Report the input declaration on failure: the assertion already
        // shows expected vs. actual, and the cleaned value may be null.
        Assert.AreEqual(Entry.Value, Cleaned, string.Format("NOT VALID: {0}", PropertyKey));
    }
}
/**************************************************************************/

/// <summary>
/// Fetches one URL: obtains or creates its document, applies the hostname,
/// robots, already-seen and depth checks, executes the document, and queues
/// follow-up work (credential requests, redirect targets, hreflang links and
/// outlinks).
/// </summary>
/// <param name="Url">The URL to fetch.</param>
/// <returns>
/// ALREADY_SEEN when the collection already holds a non-dirty document for
/// the URL; SKIPPED when a depth limit greater than zero is set and the URL
/// is deeper than it; SUCCESS when the document's <c>Execute</c> returns
/// true; ERROR otherwise.
/// </returns>
private MacroscopeConstants.FetchStatus Fetch(string Url)
{
    MacroscopeDocument msDoc = this.DocCollection.GetDocument(Url);
    MacroscopeConstants.FetchStatus status = MacroscopeConstants.FetchStatus.VOID;

    if (msDoc == null)
    {
        msDoc = this.DocCollection.CreateDocument(Url);
    }
    else if (
        msDoc.GetAuthenticationRealm() != null
        && msDoc.GetAuthenticationType() == MacroscopeConstants.AuthenticationType.BASIC
    )
    {
        // A known document behind BASIC auth: recreate it with stored
        // credentials when some are available for its host and realm.
        MacroscopeCredential credential = this.JobMaster.GetCredentialsHttp().GetCredential(
            msDoc.GetHostAndPort(),
            msDoc.GetAuthenticationRealm()
        );

        if (credential != null)
        {
            msDoc = this.DocCollection.CreateDocument(
                Credential: credential,
                Url: Url
            );
        }
    }

    msDoc.SetFetchStatus(MacroscopeConstants.FetchStatus.OK);

    // NOTE(review): the NETWORK_ERROR and ROBOTS_DISALLOWED values stored in
    // the local status below are always overwritten before this method
    // returns; only the document's own fetch status keeps them. Confirm
    // that this is intended.
    bool hostnameIsValid = MacroscopeDnsTools.CheckValidHostname(Url: Url);

    if (!hostnameIsValid)
    {
        DebugMsg(string.Format("Fetch :: CheckValidHostname: {0}", "NOT OK"));
        msDoc.SetStatusCode(HttpStatusCode.BadGateway);
        status = MacroscopeConstants.FetchStatus.NETWORK_ERROR;
        msDoc.SetFetchStatus(MacroscopeConstants.FetchStatus.NETWORK_ERROR);
    }

    bool allowedByRobots = this.JobMaster.GetRobots().ApplyRobotRule(Url);

    if (allowedByRobots)
    {
        this.JobMaster.RemoveFromBlockedByRobots(Url);
    }
    else
    {
        DebugMsg(string.Format("Disallowed by robots.txt: {0}", Url));
        this.JobMaster.AddToBlockedByRobots(Url);
        status = MacroscopeConstants.FetchStatus.ROBOTS_DISALLOWED;
        msDoc.SetFetchStatus(MacroscopeConstants.FetchStatus.ROBOTS_DISALLOWED);
        this.JobMaster.GetJobHistory().VisitedHistoryItem(msDoc.GetUrl());
    }

    this.JobMaster.GetJobHistory().AddHistoryItem(Url);

    if (this.AllowedHosts.IsExternalUrl(Url: Url))
    {
        DebugMsg(string.Format("IsExternalUrl: {0}", Url));
        msDoc.SetIsExternal(State: true);
    }

    // A document already in the collection is only re-fetched when dirty.
    if (
        this.DocCollection.ContainsDocument(Url)
        && !this.DocCollection.GetDocument(Url).GetIsDirty()
    )
    {
        status = MacroscopeConstants.FetchStatus.ALREADY_SEEN;
        return status;
    }

    // A depth limit of zero or less means no depth check is applied.
    if (this.JobMaster.GetDepth() > 0)
    {
        int urlDepth = MacroscopeUrlUtils.FindUrlDepth(Url);

        if (urlDepth > this.JobMaster.GetDepth())
        {
            DebugMsg(string.Format("TOO DEEP: {0}", urlDepth));
            status = MacroscopeConstants.FetchStatus.SKIPPED;
            return status;
        }
    }

    if (!msDoc.Execute())
    {
        DebugMsg(string.Format("EXECUTE FAILED: {0}", Url));
        status = MacroscopeConstants.FetchStatus.ERROR;
        return status;
    }

    this.DocCollection.AddDocument(Url, msDoc);

    if (
        msDoc.GetStatusCode() == HttpStatusCode.Unauthorized
        && msDoc.GetAuthenticationType() == MacroscopeConstants.AuthenticationType.BASIC
    )
    {
        // Ask for credentials and re-queue the URL.
        MacroscopeCredentialsHttp credentialsHttp = this.JobMaster.GetCredentialsHttp();

        credentialsHttp.EnqueueCredentialRequest(
            Domain: msDoc.GetHostAndPort(),
            Realm: msDoc.GetAuthenticationRealm(),
            Url: msDoc.GetUrl()
        );

        this.JobMaster.AddUrlQueueItem(Url: msDoc.GetUrl());
    }

    this.JobMaster.GetJobHistory().VisitedHistoryItem(msDoc.GetUrl());

    this.JobMaster.IncPageLimitCount();

    if (msDoc.GetIsRedirect())
    {
        DebugMsg(string.Format("REDIRECTION DETECTED GetUrl: {0}", msDoc.GetUrl()));
        DebugMsg(string.Format("REDIRECTION DETECTED From: {0}", msDoc.GetUrlRedirectFrom()));

        if (MacroscopePreferencesManager.GetFollowRedirects())
        {
            // Only the redirect target and its hostname are logged; the other
            // two values are computed but not used further.
            string hostAndPort = msDoc.GetHostAndPort();
            string redirectFromHost = MacroscopeAllowedHosts.ParseHostnameFromUrl(msDoc.GetUrlRedirectFrom());
            string redirectTarget = msDoc.GetUrlRedirectTo();
            string redirectTargetHost = MacroscopeAllowedHosts.ParseHostnameFromUrl(redirectTarget);

            DebugMsg(string.Format("REDIRECTION DETECTED UrlRedirectTo: {0}", redirectTarget));
            DebugMsg(string.Format("REDIRECTION DETECTED HostnameTo: {0}", redirectTargetHost));
        }

        // The redirect target is queued whether or not the preference is set.
        this.JobMaster.AddUrlQueueItem(Url: msDoc.GetUrlRedirectTo());
    }
    else
    {
        // Process languages from HrefLang
        this.ProcessHrefLangLanguages(msDoc);

        // Process outlinks from document
        this.ProcessOutlinks(msDoc);
    }

    status = MacroscopeConstants.FetchStatus.SUCCESS;

    return status;
}
/** -------------------------------------------------------------------- **/

/// <summary>
/// Appends a <c>url</c> element (with <c>loc</c>, <c>changefreq</c> and
/// <c>priority</c> children) to <paramref name="UrlSetNode"/> for every
/// outbound hyperlink of <paramref name="msDoc"/> whose path ends in ".pdf",
/// that has not been handled before, is on an allowed host, and passes
/// <c>VerifySameHost</c> against the document's own URL.
/// </summary>
/// <param name="msDoc">Document whose outbound hyperlinks are scanned.</param>
/// <param name="SitemapXml">XML document used to create the new nodes.</param>
/// <param name="UrlSetNode">Parent element that receives the <c>url</c> elements.</param>
/// <param name="Dedupe">
/// URLs already considered. Every non-empty target URL seen here is added,
/// whether or not it turns out to be a PDF.
/// </param>
private void GenerateXmlSitemapPdfEntries(
    MacroscopeDocument msDoc,
    XmlDocument SitemapXml,
    XmlElement UrlSetNode,
    Dictionary<string, Boolean> Dedupe
)
{
    foreach (MacroscopeHyperlinkOut HyperlinkOut in msDoc.IterateHyperlinksOut())
    {
        string Url = HyperlinkOut.GetTargetUrl();
        Uri UrlParsed;

        // A link without a target cannot be a sitemap entry.
        if (string.IsNullOrEmpty(Url))
        {
            continue;
        }

        // Dedupe before parsing so repeated links cost only a lookup.
        if (Dedupe.ContainsKey(Url))
        {
            continue;
        }

        Dedupe.Add(Url, true);

        // Skip malformed or relative targets instead of letting one bad
        // link throw and abort the whole sitemap.
        if (!Uri.TryCreate(Url, UriKind.Absolute, out UrlParsed))
        {
            continue;
        }

        // The comparison is already case-insensitive; no lower-casing needed.
        if (!UrlParsed.AbsolutePath.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        if (!this.DocCollection.GetAllowedHosts().IsAllowedFromUrl(Url: Url))
        {
            continue;
        }

        if (!MacroscopeUrlUtils.VerifySameHost(BaseUrl: msDoc.GetUrl(), Url: Url))
        {
            continue;
        }

        XmlElement UrlNode = SitemapXml.CreateElement(string.Empty, "url", MacroscopeSitemapGenerator.XmlNamespace);

        UrlSetNode.AppendChild(UrlNode);

        // Child element name / text content pairs, in output order.
        string[][] Entries =
        {
            new[] { "loc", Url },
            new[] { "changefreq", "daily" },
            new[] { "priority", "1.0" }
        };

        foreach (string[] Entry in Entries)
        {
            XmlElement EntryNode = SitemapXml.CreateElement(string.Empty, Entry[0], MacroscopeSitemapGenerator.XmlNamespace);
            XmlText TextNode = SitemapXml.CreateTextNode(Entry[1]);

            UrlNode.AppendChild(EntryNode);
            EntryNode.AppendChild(TextNode);
        }
    }
}