// Resolves a table of relative image URLs against a fixed page URL with
// MacroscopeHttpUrlUtils.MakeUrlAbsolute and compares each result with the
// absolute URL listed for it.
public void TestMakeUrlAbsoluteUrls()
{
    const string BaseUrl = "http://www.host.com/path/to/page/";
    const string Filename = "index.html";

    // Relative URL => absolute URL expected after resolution.
    Dictionary<string, string> ExpectedByRelativeUrl = new Dictionary<string, string>
    {
        { @"path/to/images/picture.gif", @"http://www.host.com/path/to/page/path/to/images/picture.gif" },
        { @"../path/to/images/picture.gif", @"http://www.host.com/path/to/path/to/images/picture.gif" },
        { @"../../path/to/images/picture.gif", @"http://www.host.com/path/path/to/images/picture.gif" }
    };

    // The page URL is the base directory followed by the file name.
    string PageUrl = BaseUrl + Filename;

    foreach (KeyValuePair<string, string> Expectation in ExpectedByRelativeUrl)
    {
        string Resolved = MacroscopeHttpUrlUtils.MakeUrlAbsolute(PageUrl, Expectation.Key);

        Assert.AreEqual(Expectation.Value, Resolved, "DO NOT MATCH");
    }
}
/**************************************************************************/
// Cleans a CSS property value with MacroscopeHttpUrlUtils.CleanUrlCss and, when
// that yields a URL, resolves it against this document's URL.
// Returns the absolute URL, or null when nothing was extracted or when
// MakeUrlAbsolute threw a MacroscopeUriFormatException (which is only logged).
private string ProcessCssBackImageUrl(string BackgroundImageUrl)
{
    string CleanedUrl = MacroscopeHttpUrlUtils.CleanUrlCss(BackgroundImageUrl);

    if (CleanedUrl == null)
    {
        return null;
    }

    string AbsoluteUrl = null;

    try
    {
        AbsoluteUrl = MacroscopeHttpUrlUtils.MakeUrlAbsolute(BaseUrl: this.DocUrl, Url: CleanedUrl);
    }
    catch (MacroscopeUriFormatException ex)
    {
        DebugMsg(string.Format("ProcessCssBackImageUrl: {0}", ex.Message));
    }

    DebugMsg(string.Format("ProcessCssBackImageUrl: {0}", CleanedUrl));
    DebugMsg(string.Format("ProcessCssBackImageUrl: this.DocUrl: {0}", this.DocUrl));
    DebugMsg(string.Format("ProcessCssBackImageUrl: LinkUrlAbs: {0}", AbsoluteUrl));

    return AbsoluteUrl;
}
/** Sitemaps **************************************************************/
// Returns the sitemap URLs listed by the robots data fetched for Url, each made
// absolute against Url. The list is empty when the "follow robots protocol"
// preference is off, when no robots data or sitemap list is available, or when
// an exception interrupts the enumeration (entries added before it are kept).
public async Task <List <string> > GetSitemapsAsList(string Url)
{
    List<string> Collected = new List<string>();

    if (!MacroscopePreferencesManager.GetFollowRobotsProtocol())
    {
        return Collected;
    }

    // Fetched outside the try block, so a failure here propagates to the caller.
    Robots robot = await this.FetchRobot(Url: Url);

    try
    {
        bool HasSitemaps = (robot != null) && (robot.Sitemaps != null);

        if (HasSitemaps)
        {
            foreach (Sitemap Entry in robot.Sitemaps)
            {
                string RawSitemapUrl = Entry.Url.ToString();

                Collected.Add(MacroscopeHttpUrlUtils.MakeUrlAbsolute(BaseUrl: Url, Url: RawSitemapUrl));

                this.DebugMsg(string.Format("ROBOTS SitemapUrl: {0}", RawSitemapUrl));
            }
        }
    }
    catch (Exception ex)
    {
        this.DebugMsg(ex.Message);
    }

    return Collected;
}
// Exercises the three-argument MacroscopeHttpUrlUtils.MakeUrlAbsolute overload,
// which is given a base HREF in addition to the base URL.
public void TestMakeUrlAbsoluteUrlsWithBaseHref()
{
    const string SharedBaseHref = "http://www.host.com/BASEHREF/index.html";
    const string SharedBaseUrl = "http://www.host.com/path/to/page/";

    // Columns of each row: Base HREF, Base URL, Page URL, expected Absolute URL.
    string[][] Cases = new string[][]
    {
        new string[]
        {
            SharedBaseHref,
            SharedBaseUrl,
            "http://www.host.com/path/to/page/to/pages/index.html",
            "http://www.host.com/path/to/page/to/pages/index.html"
        },
        new string[]
        {
            SharedBaseHref,
            SharedBaseUrl,
            "path/to/pages/index.html",
            "http://www.host.com/BASEHREF/path/to/pages/index.html"
        },
        new string[]
        {
            SharedBaseHref,
            SharedBaseUrl,
            "../path/to/pages/index.html",
            "http://www.host.com/path/to/pages/index.html"
        },
        new string[]
        {
            SharedBaseHref,
            SharedBaseUrl,
            "../../path/to/pages/index.html",
            "http://www.host.com/path/to/pages/index.html"
        }
    };

    foreach (string[] Row in Cases)
    {
        string Expected = Row[3];

        string Resolved = MacroscopeHttpUrlUtils.MakeUrlAbsolute(
            BaseHref: Row[0],
            BaseUrl: Row[1],
            Url: Row[2]
        );

        Assert.AreEqual(Expected, Resolved, "DO NOT MATCH");
    }
}
// Asserts that URLs located below the start URL's folder are reported as being
// within the child directory by MacroscopeHttpUrlUtils.IsWithinChildDirectory.
public void TestIsWithinChildDirectory()
{
    const string StartUrl = "http://www.companyname.com/path/to/some/deep/folder/index.html";

    string[] DescendantUrls =
    {
        "http://www.companyname.com/path/to/some/deep/folder/sub-folder/sub-folder/index.html",
        "http://www.companyname.com/path/to/some/deep/folder/sub-folder/image"
    };

    foreach (string Candidate in DescendantUrls)
    {
        bool IsWithin = MacroscopeHttpUrlUtils.IsWithinChildDirectory(StartUrl: StartUrl, Url: Candidate);

        Assert.IsTrue(IsWithin, string.Format("FAIL: {0}", Candidate));
    }
}
/**************************************************************************/
// Asks MacroscopeHttpUrlUtils.GetMimeTypeOfUrl for the MIME type of Url and
// reports whether it is "application/pdf" (compared case-insensitively).
// Url is parsed with the Uri constructor first, so a string that is not a
// valid URI makes that constructor throw.
public async Task <bool> IsPdfUrl(MacroscopeJobMaster JobMaster, string Url)
{
    Uri TargetUri = new Uri(Url);

    string ReportedMimeType = await MacroscopeHttpUrlUtils.GetMimeTypeOfUrl(
        JobMaster: JobMaster,
        TargetUri: TargetUri
    );

    if (string.IsNullOrEmpty(ReportedMimeType))
    {
        return false;
    }

    return Regex.IsMatch(ReportedMimeType, "^application/pdf$", RegexOptions.IgnoreCase);
}
// Feeds mixed-case URLs to MacroscopeHttpUrlUtils.DowncaseUrl and compares the
// output with the expected form. In the expectations below the path is
// lower-cased while the query string keeps its original case.
public void TestDowncaseUrl()
{
    // Input URL => expected downcased URL.
    Dictionary<string, string> ExpectedByInput = new Dictionary<string, string>
    {
        { "https://nazuke.github.io/", "https://nazuke.github.io/" },
        { "https://nazuke.github.io/ABC.html", "https://nazuke.github.io/abc.html" },
        { "https://nazuke.github.io/ABC/ABC.html", "https://nazuke.github.io/abc/abc.html" },
        { "https://nazuke.github.io/ABC/ABC/ABC.HTML", "https://nazuke.github.io/abc/abc/abc.html" },
        { "https://nazuke.github.io/ABC/ABC/ABC/ABC/ABC.html?key=value", "https://nazuke.github.io/abc/abc/abc/abc/abc.html?key=value" },
        { "https://nazuke.github.io/ABC/ABC/ABC/ABC/ABC.html?key=value&name=bongo", "https://nazuke.github.io/abc/abc/abc/abc/abc.html?key=value&name=bongo" },
        { "https://nazuke.github.io/ABC/ABC/ABC/ABC/ABC.html?KEY=value&Name=Bongo", "https://nazuke.github.io/abc/abc/abc/abc/abc.html?KEY=value&Name=Bongo" }
    };

    foreach (KeyValuePair<string, string> Expectation in ExpectedByInput)
    {
        string Actual = MacroscopeHttpUrlUtils.DowncaseUrl(Url: Expectation.Key);

        Assert.AreEqual(Expectation.Value, Actual);
    }
}
// Checks how many parent-folder URLs MacroscopeHttpUrlUtils.GetParentFolderUrls
// returns for URLs of increasing depth.
public void TestParentFolderUrlsDepth()
{
    // URL => expected number of parent folder URLs.
    Dictionary<string, int> ExpectedCountByUrl = new Dictionary<string, int>
    {
        { "https://nazuke.github.io/", 0 },
        { "https://nazuke.github.io/0.html", 0 },
        { "https://nazuke.github.io/0/1.html", 1 },
        { "https://nazuke.github.io/0/1/2.html", 2 },
        { "https://nazuke.github.io/0/1/2/", 3 },
        { "https://nazuke.github.io/0/1/2/3.html", 3 },
        { "https://nazuke.github.io/0/1/2/3.html/", 4 },
        { "https://nazuke.github.io/0/1/2/3/4.html?key=value", 4 }
    };

    foreach (KeyValuePair<string, int> Expectation in ExpectedCountByUrl)
    {
        List<string> Parents = MacroscopeHttpUrlUtils.GetParentFolderUrls(Url: Expectation.Key);

        Assert.AreEqual(Expectation.Value, Parents.Count);
    }
}
/** Target URL ************************************************************/
// Stores the target URL. When the "downcase links" preference is enabled the
// URL is passed through MacroscopeHttpUrlUtils.DowncaseUrl first; if that
// returns null the URL is stored as given.
public void SetTargetUrl(string TargetUrl)
{
    string UrlToStore = TargetUrl;

    if (MacroscopePreferencesManager.GetDowncaseLinks())
    {
        // Fall back to the unmodified URL when downcasing yields nothing.
        UrlToStore = MacroscopeHttpUrlUtils.DowncaseUrl(Url: TargetUrl) ?? TargetUrl;
    }

    this.TargetUrl = UrlToStore;
}
// Checks the depth reported by MacroscopeHttpUrlUtils.FindUrlDepth for URLs
// with an increasing number of path segments.
public void TestFindUrlDepth()
{
    // URL => expected depth.
    Dictionary<string, int> ExpectedDepthByUrl = new Dictionary<string, int>
    {
        { "https://nazuke.github.io/", 0 },
        { "https://nazuke.github.io/0.html", 0 },
        { "https://nazuke.github.io/0/1.html", 1 },
        { "https://nazuke.github.io/0/1/2.html", 2 },
        { "https://nazuke.github.io/0/1/2/", 2 },
        { "https://nazuke.github.io/0/1/2/3.html", 3 },
        { "https://nazuke.github.io/0/1/2/3.html/", 3 },
        { "https://nazuke.github.io/0/1/2/3/4.html?key=value", 4 }
    };

    foreach (KeyValuePair<string, int> Expectation in ExpectedDepthByUrl)
    {
        // Log the case before running it so a failure can be traced in the debug output.
        this.DebugMsg(string.Format("{0}: {1}", Expectation.Value, Expectation.Key));

        int ActualDepth = MacroscopeHttpUrlUtils.FindUrlDepth(Url: Expectation.Key);

        Assert.AreEqual(Expectation.Value, ActualDepth);
    }
}
/**************************************************************************/
// Reads every <loc> element (in the sitemap XML namespace) from XmlDoc and
// registers its text as a SITEMAPXML outlink of this document. The link is
// made absolute against this document's URL; the text as found in the XML is
// kept as the raw target URL of the created outlink.
private void ProcessSitemapXmlOutlinks(XmlDocument XmlDoc)
{
    XmlNodeList OutlinksList = XmlDoc.GetElementsByTagName("loc", MacroscopeConstants.SitemapXmlNamespace);

    // Check for null before the list is touched, so that the guard actually
    // protects the Count access below.
    if (OutlinksList == null)
    {
        return;
    }

    DebugMsg(string.Format("ProcessSitemapXmlOutlinks nlOutlinks: {0}", OutlinksList.Count));

    foreach (XmlNode LinkNode in OutlinksList)
    {
        string LinkUrl = null;

        try
        {
            LinkUrl = LinkNode.InnerText;
            DebugMsg(string.Format("ProcessSitemapXmlOutlinks sLinkUrl: {0}", LinkUrl));
        }
        catch (Exception ex)
        {
            DebugMsg(string.Format("ProcessSitemapXmlOutlinks: {0}", ex.Message));
        }

        if (LinkUrl == null)
        {
            continue;
        }

        string LinkUrlAbs = MacroscopeHttpUrlUtils.MakeUrlAbsolute(BaseUrl: this.GetUrl(), Url: LinkUrl);

        MacroscopeLink Outlink = this.AddDocumentOutlink(
            AbsoluteUrl: LinkUrlAbs,
            LinkType: MacroscopeConstants.InOutLinkType.SITEMAPXML,
            Follow: true
        );

        // AddDocumentOutlink may return null; only a created outlink gets the raw URL.
        if (Outlink != null)
        {
            Outlink.SetRawTargetUrl(LinkUrl);
        }
    }
}
// Runs MacroscopeHttpUrlUtils.ValidateUrl over a set of URLs and compares the
// verdict for each with the expected one.
public void TestValidateUrls()
{
    // URL => whether it is expected to validate.
    Dictionary<string, bool> ExpectedValidityByUrl = new Dictionary<string, bool>
    {
        { "http://www.host.com/", true },
        { "http://www.host.com/index.html", true },
        { "http://www.host.com/path/path/to/images/picture.gif", true },
        { "http://www.host.com/??", true },
        { "http://www.host.com/ ", true },
        { "http:// www.host.com/", false }
    };

    foreach (KeyValuePair<string, bool> Expectation in ExpectedValidityByUrl)
    {
        bool Verdict = MacroscopeHttpUrlUtils.ValidateUrl(Expectation.Key);

        Assert.AreEqual(Expectation.Value, Verdict, string.Format("NOT VALID: {0}", Expectation.Key));
    }
}
// Checks that MacroscopeHttpUrlUtils.StripHashFragment removes the "#fragment"
// part of a URL and leaves the path and query string as they were.
public void TestStripHashFragment()
{
    // Input URL => URL expected once the fragment has been stripped.
    Dictionary<string, string> ExpectedByInput = new Dictionary<string, string>
    {
        { "http://www.host.com/#aberdeen-angus", "http://www.host.com/" },
        { "http://www.host.com/product/list/#boris", "http://www.host.com/product/list/" },
        { "http://www.host.com/product/list/index.html#boris", "http://www.host.com/product/list/index.html" },
        { "http://www.host.com/?key1=value1&key2=value2&key3=value3", "http://www.host.com/?key1=value1&key2=value2&key3=value3" },
        { "http://www.host.com/?key1=value1&key2=value2&key3=value3#gonzo", "http://www.host.com/?key1=value1&key2=value2&key3=value3" },
        { "http://www.host.com/index.html?key1=value1&key2=value2&key3=value3#gonzo", "http://www.host.com/index.html?key1=value1&key2=value2&key3=value3" }
    };

    foreach (KeyValuePair<string, string> Expectation in ExpectedByInput)
    {
        string Stripped = MacroscopeHttpUrlUtils.StripHashFragment(Expectation.Key);

        Assert.AreEqual(Expectation.Value, Stripped, string.Format("NOT VALID: {0}", Expectation.Key));
    }
}
/** -------------------------------------------------------------------- **/
// Appends to SitemapText the target URL of every outgoing hyperlink of msDoc
// whose path ends in ".pdf", whose host is allowed, and which is on the same
// host as msDoc. Dedupe records every URL seen, including ones that are then
// filtered out, so each URL is considered only once.
private void GenerateTextSitemapPdfEntries(
    MacroscopeDocument msDoc,
    List <string> SitemapText,
    Dictionary <string, bool> Dedupe
)
{
    foreach (MacroscopeHyperlinkOut Link in msDoc.IterateHyperlinksOut())
    {
        string TargetUrl = Link.GetTargetUrl();

        // Parsed before the dedupe lookup; the Uri constructor throws on an unparsable URL.
        Uri ParsedTarget = new Uri(uriString: TargetUrl);

        if (Dedupe.ContainsKey(TargetUrl))
        {
            continue;
        }

        Dedupe.Add(TargetUrl, true);

        // The three checks short-circuit in this order.
        bool Qualifies =
            ParsedTarget.AbsolutePath.ToLower().EndsWith(".pdf", StringComparison.InvariantCultureIgnoreCase)
            && this.DocCollection.GetAllowedHosts().IsAllowedFromUrl(Url: TargetUrl)
            && MacroscopeHttpUrlUtils.VerifySameHost(BaseUrl: msDoc.GetUrl(), Url: TargetUrl);

        if (Qualifies)
        {
            SitemapText.Add(TargetUrl);
        }
    }
}
/**************************************************************************/
/*
 * Makes Url absolute, taking an optional base HREF into account.
 *
 * With a null or empty BaseHref, Url is resolved directly against BaseUrl.
 * Otherwise BaseHref is first resolved against BaseUrl, and Url is then
 * resolved against that absolute base HREF.
 *
 * Reference: https://www.w3.org/TR/html5/document-metadata.html#the-base-element
 */
public static string MakeUrlAbsolute(
    string BaseHref,
    string BaseUrl,
    string Url
)
{
    if (string.IsNullOrEmpty(value: BaseHref))
    {
        return MacroscopeHttpUrlUtils.MakeUrlAbsolute(BaseUrl: BaseUrl, Url: Url);
    }

    string ResolvedBaseHref = MacroscopeHttpUrlUtils.MakeUrlAbsolute(BaseUrl: BaseUrl, Url: BaseHref);

    DebugMsgStatic(string.Format("BASEHREF: {0}", BaseHref));
    DebugMsgStatic(string.Format("ABSOLUTEBASEHREF: {0}", ResolvedBaseHref));

    string ResolvedUrl = MacroscopeHttpUrlUtils.MakeUrlAbsolute(BaseUrl: ResolvedBaseHref, Url: Url);

    DebugMsgStatic(string.Format("URL: {0}", Url));
    DebugMsgStatic(string.Format("URLFIXED: {0}", ResolvedUrl));

    return ResolvedUrl;
}
// Passes CSS declarations to MacroscopeHttpUrlUtils.CleanUrlCss and compares the
// extracted URL with the expected one. A null expectation means no URL should
// be extracted; where a declaration carries several url(...) values, the
// expectation is the first one.
public void TestCleanUrlCss()
{
    // CSS declaration => expected extracted URL (or null).
    Dictionary<string, string> ExpectedByDeclaration = new Dictionary<string, string>
    {
        { "background-image:none;", null },
        { "background: #0b7bee url(none) no-repeat center center/cover;", null },
        { "background: #0b7bee url(images/video-bg.jpg) no-repeat center center/cover;", "images/video-bg.jpg" },
        { "background: #0b7bee url(\"images/video-bg.jpg\") no-repeat center center/cover;", "images/video-bg.jpg" },
        { "src: url(\"fonts/company/latin-e-bold-eot.eot\");", "fonts/company/latin-e-bold-eot.eot" },
        { "src: url(\"fonts/company/latin-e-bold-eot.eot?#iefix\") format(\"embedded-opentype\"),url(\"fonts/company/latin-e-bold-woff.woff\") format(\"woff\"),url(\"fonts/company/latin-e-bold-ttf.ttf\") format(\"truetype\");", "fonts/company/latin-e-bold-eot.eot?#iefix" },
        { "background: #ffffff url(images/services/features-background.png) no-repeat left bottom;", "images/services/features-background.png" },
        { "background: transparent url(\"images/home/mouse.png\") no-repeat 90% top;", "images/home/mouse.png" },
        { "background: #0b7bee url(images/services/features-background_hover.png) no-repeat left bottom;", "images/services/features-background_hover.png" },
        { "background-image: url(\"images/global/page-head-trans.png\");", "images/global/page-head-trans.png" },
        { "background-image: url(\"images/heroes/hero.jpg\");", "images/heroes/hero.jpg" }
    };

    foreach (KeyValuePair<string, string> Expectation in ExpectedByDeclaration)
    {
        string Extracted = MacroscopeHttpUrlUtils.CleanUrlCss(Expectation.Key);

        Assert.AreEqual(Expectation.Value, Extracted, string.Format("NOT VALID: {0}", Extracted));
    }
}
/**************************************************************************/
// Fetches and processes a single URL on behalf of this worker.
//
// Url               - the URL to fetch.
// RedirectedFromUrl - optional; when non-empty it is recorded on the document
//                     as the URL that redirected here.
//
// Returns a MacroscopeConstants.FetchStatus. The early exits return VOID (page
// limit reached), ALREADY_SEEN or SKIPPED; every path that reaches the end of
// the method returns SUCCESS (see the NOTE(review) further down).
private async Task <MacroscopeConstants.FetchStatus> Fetch(string Url, string RedirectedFromUrl = null)
{
    MacroscopeDocument msDoc = null;
    MacroscopeConstants.FetchStatus FetchStatus = MacroscopeConstants.FetchStatus.VOID;
    bool BlockedByRobotsRule;

    // A page limit of -1 disables this check. When the limit has been reached
    // the method returns the initial VOID status without touching any document.
    if (MacroscopePreferencesManager.GetPageLimit() > -1)
    {
        int PagesFound = this.JobMaster.GetPagesFound();
        int PageLimit = MacroscopePreferencesManager.GetPageLimit();
        if (PagesFound >= PageLimit)
        {
            this.DebugMsg(string.Format("PAGE LIMIT REACHED: {0} :: {1}", PageLimit, PagesFound));
            return(FetchStatus);
        }
    }

    // Reuse the document already in the collection, or create a new one.
    // An existing document that reported a BASIC authentication realm is
    // re-created with the stored credential, when one is available.
    if (this.DocCollection.ContainsDocument(Url: Url))
    {
        msDoc = this.DocCollection.GetDocumentByUrl(Url: Url);
        if (msDoc.GetAuthenticationRealm() != null)
        {
            if (msDoc.GetAuthenticationType() == MacroscopeConstants.AuthenticationType.BASIC)
            {
                MacroscopeCredential Credential;
                Credential = this.JobMaster.GetCredentialsHttp().GetCredential( msDoc.GetHostAndPort(), msDoc.GetAuthenticationRealm() );
                if (Credential != null)
                {
                    msDoc = this.DocCollection.CreateDocument( Credential: Credential, Url: Url );
                }
            }
        }
    }
    else
    {
        msDoc = this.DocCollection.CreateDocument(Url: Url);
    }

    if (!string.IsNullOrEmpty(RedirectedFromUrl))
    {
        msDoc.SetUrlRedirectFrom(Url: RedirectedFromUrl);
    }

    msDoc.SetFetchStatus(MacroscopeConstants.FetchStatus.OK);

    // A failed hostname check marks the document as 502 / NETWORK_ERROR.
    // NOTE(review): there is no return here, so processing continues below.
    if (!MacroscopeDnsTools.CheckValidHostname(Url: Url))
    {
        this.DebugMsg(string.Format("Fetch :: CheckValidHostname: {0}", "NOT OK"));
        msDoc.SetStatusCode(HttpStatusCode.BadGateway);
        FetchStatus = MacroscopeConstants.FetchStatus.NETWORK_ERROR;
        msDoc.SetFetchStatus(FetchStatus);
    }

    // Record the result of CheckRobotRule on the document.
    if (await this.JobMaster.GetRobots().CheckRobotRule(Url: Url))
    {
        msDoc.SetAllowedByRobots(true);
    }
    else
    {
        msDoc.SetAllowedByRobots(false);
    }

    // NOTE(review): the "Disallowed by robots.txt" branch runs when
    // ApplyRobotRule returns false, so the variable name BlockedByRobotsRule
    // reads inverted relative to how it is used - confirm against ApplyRobotRule.
    // This branch also does not return; processing continues below.
    BlockedByRobotsRule = await this.JobMaster.GetRobots().ApplyRobotRule(Url: Url);
    if (!BlockedByRobotsRule)
    {
        this.DebugMsg(string.Format("Disallowed by robots.txt: {0}", Url));
        this.JobMaster.AddToBlockedByRobots(Url);
        FetchStatus = MacroscopeConstants.FetchStatus.ROBOTS_DISALLOWED;
        msDoc.SetFetchStatus(FetchStatus);
        JobHistory.VisitedHistoryItem(Url: msDoc.GetUrl());
    }
    else
    {
        this.JobMaster.RemoveFromBlockedByRobots(Url);
    }

    if (this.AllowedHosts.IsExternalUrl(Url: Url))
    {
        this.DebugMsg(string.Format("IsExternalUrl: {0}", Url));
        msDoc.SetIsExternal(State: true);
    }

    // A document that is in the collection and not flagged dirty is not fetched again.
    if (this.DocCollection.ContainsDocument(Url: Url))
    {
        if (!this.DocCollection.GetDocumentByUrl(Url: Url).GetIsDirty())
        {
            FetchStatus = MacroscopeConstants.FetchStatus.ALREADY_SEEN;
            return(FetchStatus);
        }
    }

    // A negative depth preference disables the depth limit.
    if (MacroscopePreferencesManager.GetDepth() >= 0)
    {
        int Depth = MacroscopeHttpUrlUtils.FindUrlDepth(Url: Url);
        if (Depth > MacroscopePreferencesManager.GetDepth())
        {
            this.DebugMsg(string.Format("URL Too Deep: {0}", Depth));
            FetchStatus = MacroscopeConstants.FetchStatus.SKIPPED;
            return(FetchStatus);
        }
    }

    /** ------------------------------------------------------------------ **/

    if (!await msDoc.Execute())
    {
        this.DebugMsg(string.Format("EXECUTE FAILED: {0}", Url));
        FetchStatus = MacroscopeConstants.FetchStatus.ERROR;
    }

    /** ------------------------------------------------------------------ **/

    /** ------------------------------------------------------------------ **/

    // NOTE(review): the block below runs whether or not msDoc.Execute()
    // succeeded, and ends by setting FetchStatus to SUCCESS unconditionally.
    // That overwrites any ERROR, NETWORK_ERROR or ROBOTS_DISALLOWED value
    // assigned above, so the caller never sees those statuses from this
    // method - confirm this is intended.
    {
        // A 401 response with BASIC authentication queues a credential request
        // and re-queues the URL.
        if (msDoc.GetStatusCode() == HttpStatusCode.Unauthorized)
        {
            if (msDoc.GetAuthenticationType() == MacroscopeConstants.AuthenticationType.BASIC)
            {
                MacroscopeCredentialsHttp CredentialsHttp = this.JobMaster.GetCredentialsHttp();
                CredentialsHttp.EnqueueCredentialRequest( Domain: msDoc.GetHostAndPort(), Realm: msDoc.GetAuthenticationRealm(), Url: msDoc.GetUrl() );
                this.JobMaster.AddUrlQueueItem(Url: msDoc.GetUrl());
            }
        }

        if (msDoc.GetIsRedirect())
        {
            this.DebugMsg(string.Format("REDIRECTION DETECTED GetUrl: {0}", msDoc.GetUrl()));
            this.DebugMsg(string.Format("REDIRECTION DETECTED From: {0}", msDoc.GetUrlRedirectFrom()));
            if (MacroscopePreferencesManager.GetCheckRedirects())
            {
                // Hostname and HostnameFrom are not used further in this method.
                string Hostname = msDoc.GetHostAndPort();
                string HostnameFrom = MacroscopeAllowedHosts.ParseHostnameFromUrl(msDoc.GetUrlRedirectFrom());
                string UrlRedirectTo = msDoc.GetUrlRedirectTo();
                string HostnameTo = MacroscopeAllowedHosts.ParseHostnameFromUrl(UrlRedirectTo);
                this.DebugMsg(string.Format("REDIRECTION DETECTED UrlRedirectTo: {0}", UrlRedirectTo));
                this.DebugMsg(string.Format("REDIRECTION DETECTED HostnameTo: {0}", HostnameTo));
                // When following redirects, the redirect target is added to the
                // allowed hosts: always if external links are checked, otherwise
                // only when the target is already an internal URL.
                if (MacroscopePreferencesManager.GetFollowRedirects())
                {
                    if (MacroscopePreferencesManager.GetCheckExternalLinks())
                    {
                        this.AllowedHosts.AddFromUrl(Url: UrlRedirectTo);
                    }
                    else
                    {
                        if (this.AllowedHosts.IsInternalUrl(Url: UrlRedirectTo))
                        {
                            this.AllowedHosts.AddFromUrl(Url: UrlRedirectTo);
                        }
                    }
                }
            }
            // The redirect target is queued regardless of the redirect preferences above.
            this.JobMaster.AddUrlQueueItem(Url: msDoc.GetUrlRedirectTo());
        }
        else
        {
            this.ProcessHrefLangLanguages(msDoc); // Process Languages from HrefLang
            this.JobMaster.ProcessOutlinks(msDoc: msDoc); // Process Outlinks from document
        }

        FetchStatus = MacroscopeConstants.FetchStatus.SUCCESS;
    }

    /** ------------------------------------------------------------------ **/

    // Mark the URL as visited only when the document is in the collection.
    if (DocCollection.ContainsDocument(msDoc: msDoc))
    {
        JobHistory.VisitedHistoryItem(Url: Url);
    }
    else
    {
        this.DebugMsg(string.Format("OOPS: {0}", Url));
    }

    /** ------------------------------------------------------------------ **/

    return(FetchStatus);
}
/**************************************************************************/
// Worker loop: takes up to GetMaxFetchesPerWorker() items from the job master's
// URL queue, filters each URL (include/exclude rules, parent/child directory
// preferences, depth limit) and fetches the ones that remain, retrying a fetch
// that reports ERROR or NETWORK_ERROR up to GetMaxRetries() times.
// The loop stops early when the job master signals a stop; NotifyWorkersDone()
// is called once the loop has finished.
//
// NOTE(review): this is an "async void" method, so callers cannot await it or
// observe exceptions that escape it. The signature is kept unchanged for
// existing callers.
public async void Execute()
{
    int MaxFetches = MacroscopePreferencesManager.GetMaxFetchesPerWorker();

    while (MaxFetches > 0)
    {
        if (this.JobMaster.GetThreadsStop())
        {
            this.DebugMsg(string.Format("JobMaster.GetThreadsStop: {0}", this.JobMaster.GetThreadsStop()));
            break;
        }

        MacroscopeJobItem JobItem = this.JobMaster.GetUrlQueueItem();
        string Url = null;
        string RedirectedFromUrl = null;

        if (JobItem != null)
        {
            Url = JobItem.GetItemUrl();
            RedirectedFromUrl = JobItem.GetItemRedirectedFromUrl();
        }

        // From here on, setting Url to null means "skip this queue item".

        if (!string.IsNullOrEmpty(Url))
        {
            if (!this.CheckIncludeExcludeUrl(Url))
            {
                Url = null;
            }
        }

        if (!string.IsNullOrEmpty(Url))
        {
            bool CrawlParents = MacroscopePreferencesManager.GetCrawlParentDirectories();
            bool CrawlChildren = MacroscopePreferencesManager.GetCrawlChildDirectories();

            if (!CrawlParents && !CrawlChildren && Url != this.JobMaster.GetStartUrl())
            {
                // Neither direction is enabled: only the start URL itself may be crawled.
                Url = null;
            }
            else if (!CrawlParents || !CrawlChildren)
            {
                this.DebugMsg(string.Format("Running Parent/Child Check: {0}", Url));

                if (CrawlParents && (!string.IsNullOrEmpty(Url)))
                {
                    if (!MacroscopeHttpUrlUtils.IsWithinParentDirectory(StartUrl: this.JobMaster.GetParentStartingDirectory(), Url: Url))
                    {
                        Url = null;
                    }
                }

                if (CrawlChildren && (!string.IsNullOrEmpty(Url)))
                {
                    if (!MacroscopeHttpUrlUtils.IsWithinChildDirectory(StartUrl: this.JobMaster.GetChildStartingDirectory(), Url: Url))
                    {
                        Url = null;
                    }
                }
            }
            else
            {
                this.DebugMsg(string.Format("Skipping Parent/Child Check: {0}", Url));
            }
        }

        if (!string.IsNullOrEmpty(Url))
        {
            // A negative depth preference disables the depth limit.
            if (MacroscopePreferencesManager.GetDepth() >= 0)
            {
                if (MacroscopeHttpUrlUtils.FindUrlDepth(Url: Url) > MacroscopePreferencesManager.GetDepth())
                {
                    this.DebugMsg(string.Format("URL Too Deep: {0}", Url));
                    Url = null;
                }
            }
        }

        if (!string.IsNullOrEmpty(Url))
        {
            this.DebugMsg(string.Format("Execute: {0}", Url));

            int Tries = MacroscopePreferencesManager.GetMaxRetries();

            JobHistory.AddHistoryItem(Url: Url);

            do
            {
                this.DebugMsg(string.Format("Trying Fetch: {0} :: {1}", Tries, Url));

                MacroscopeConstants.FetchStatus FetchStatus = MacroscopeConstants.FetchStatus.VOID;

                try
                {
                    // Fetch ignores a null or empty RedirectedFromUrl, so the
                    // value can be passed through without branching here.
                    FetchStatus = await this.Fetch(Url, RedirectedFromUrl);
                }
                catch (Exception ex)
                {
                    // FetchStatus stays VOID here, which takes the default
                    // (no retry) branch below.
                    this.DebugMsg(string.Format("FetchStatus: {0}", ex.Message));
                    this.DebugMsg(string.Format("Url: {0}", Url));
                    this.DebugMsg(string.Format("FetchStatus: {0}", FetchStatus));
                }

                switch (FetchStatus)
                {
                    case MacroscopeConstants.FetchStatus.ERROR:
                    case MacroscopeConstants.FetchStatus.NETWORK_ERROR:
                        this.DebugMsg(string.Format("Fetch Failed: {0} :: {1}", Tries, Url));
                        // Short pause before retrying, without blocking the thread.
                        await Task.Delay(25);
                        break;
                    default:
                        this.JobMaster.NotifyWorkersFetched(Url: Url);
                        // Any other status ends the retry loop.
                        Tries = 0;
                        break;
                }

                Tries--;
            }
            while (Tries > 0);

            if (this.CrawlDelay > 0)
            {
                this.DebugMsg(string.Format("CRAWL DELAY: Sleeping for {0} seconds...", this.CrawlDelay));
                // CrawlDelay is in seconds; Task.Delay takes milliseconds.
                await Task.Delay(this.CrawlDelay * 1000);
            }
        }

        MaxFetches--;
    }

    this.JobMaster.NotifyWorkersDone();
}
/** -------------------------------------------------------------------- **/
// Reports whether Url lies in, or below, the starting directory derived from
// StartUrl by DetermineStartingDirectory.
//
// Url is reduced to scheme://host:port/directory/ (the last path segment is
// dropped) and must start with the starting directory, compared ordinally.
// Returns false when Url cannot be parsed or is not http/https.
public static bool IsWithinChildDirectory(string StartUrl, string Url)
{
    Uri TargetUri = null;
    string PortSuffix = "";

    try
    {
        TargetUri = new Uri(Url);

        if (TargetUri.Port > 0)
        {
            PortSuffix = string.Format(":{0}", TargetUri.Port);
        }
    }
    catch (UriFormatException ex)
    {
        DebugMsgStatic(string.Format("UriFormatException: {0}", ex.Message));
    }
    catch (Exception ex)
    {
        DebugMsgStatic(string.Format("Exception: {0}", ex.Message));
    }

    if (TargetUri == null)
    {
        return false;
    }

    string Scheme = TargetUri.Scheme.ToLower();

    if ((Scheme != "http") && (Scheme != "https"))
    {
        return false;
    }

    string StartingUrl = MacroscopeHttpUrlUtils.DetermineStartingDirectory(Url: StartUrl);

    // Drop everything after the last "/" so that only the directory remains.
    string DirectoryPath = Regex.Replace(TargetUri.AbsolutePath, "/[^/]*$", "/", RegexOptions.IgnoreCase);

    if (DirectoryPath.Length == 0)
    {
        DirectoryPath = "/";
    }

    string TargetDirectoryUrl = string.Concat(TargetUri.Scheme, "://", TargetUri.Host, PortSuffix, DirectoryPath);

    return (TargetDirectoryUrl.Length >= StartingUrl.Length)
        && TargetDirectoryUrl.StartsWith(StartingUrl, StringComparison.Ordinal);
}
/**************************************************************************/
// Finds the valid documents in DocCollection that no other valid document
// links to. A document counts as linked when the target URL or the raw target
// URL of any outgoing hyperlink of another valid document compares equal to its
// URL (MacroscopeHttpUrlUtils.CompareUrls).
//
// Orphans are returned in a new list and receive the ORPHAN1 / ORPHAN2 remarks;
// those remarks are removed from valid documents that turn out to be linked.
public MacroscopeDocumentList AnalyzeOrphanedDocumentsInCollection(MacroscopeDocumentCollection DocCollection)
{
    MacroscopeDocumentList Orphans = new MacroscopeDocumentList();

    foreach (MacroscopeDocument Candidate in DocCollection.IterateDocuments())
    {
        string CandidateUrl = Candidate.GetUrl();

        if (!IsValidDocument(msDoc: Candidate))
        {
            continue;
        }

        bool IsLinkedTo = false;

        foreach (MacroscopeDocument Referrer in DocCollection.IterateDocuments())
        {
            // A document cannot vouch for itself, and invalid documents are ignored.
            bool IsSelf = MacroscopeHttpUrlUtils.CompareUrls(UrlLeft: CandidateUrl, UrlRight: Referrer.GetUrl());

            if (IsSelf || !this.IsValidDocument(msDoc: Referrer))
            {
                continue;
            }

            foreach (MacroscopeHyperlinkOut Link in Referrer.IterateHyperlinksOut())
            {
                string LinkTarget = Link.GetTargetUrl();
                string LinkTargetRaw = Link.GetRawTargetUrl();

                if (MacroscopeHttpUrlUtils.CompareUrls(UrlLeft: CandidateUrl, UrlRight: LinkTarget)
                    || MacroscopeHttpUrlUtils.CompareUrls(UrlLeft: CandidateUrl, UrlRight: LinkTargetRaw))
                {
                    IsLinkedTo = true;
                    break;
                }
            }

            // One inbound link is enough; stop scanning the other documents.
            if (IsLinkedTo)
            {
                break;
            }
        }

        if (IsLinkedTo)
        {
            Candidate.RemoveRemark("ORPHAN1");
            Candidate.RemoveRemark("ORPHAN2");
        }
        else
        {
            Orphans.AddDocument(msDoc: Candidate);
            Candidate.AddRemark("ORPHAN1", "This appears to be an orphaned page, not linked to from any other HTML page in this collection.");
            Candidate.AddRemark("ORPHAN2", "This page appears to only be referenced from one or more sitemaps.");
        }
    }

    return Orphans;
}
/** -------------------------------------------------------------------- **/
// Downloads this document as a PDF and populates the document from it:
// response headers, locale / hreflang, canonical, content length, title,
// author, description, keywords, body text, data extractors, and outlinks
// found in the text and in the PDF annotations.
//
// A failed request is recorded as a remark and passed to ProcessErrorCondition
// at the end. A failure while reading the response body sets the status code
// to BadRequest and the content length to 0, and skips every step that needs
// the parsed PDF.
private async Task _ProcessPdfPage()
{
    MacroscopeHttpTwoClient Client = this.DocCollection.GetJobMaster().GetHttpClient();
    MacroscopeHttpTwoClientResponse ClientResponse = null;
    string ResponseErrorCondition = null;

    try
    {
        ClientResponse = await Client.Get(
            this.GetUri(),
            this.ConfigurePdfPageRequestHeadersCallback,
            this.PostProcessRequestHttpHeadersCallback
        );
    }
    catch (MacroscopeDocumentException ex)
    {
        this.DebugMsg(string.Format("_ProcessPdfPage :: MacroscopeDocumentException: {0}", ex.Message));
        ResponseErrorCondition = ex.Message;
        this.AddRemark("_ProcessPdfPage", ex.Message);
    }
    catch (Exception ex)
    {
        this.DebugMsg(string.Format("_ProcessPdfPage :: Exception: {0}", ex.Message));
        ResponseErrorCondition = ex.Message;
        this.AddRemark("_ProcessPdfPage", ex.Message);
    }

    if (ClientResponse != null)
    {
        MacroscopePdfTools PdfTools;

        this.ProcessResponseHttpHeaders(Response: ClientResponse);

        { // Probe Locale
            //this.Locale = "en"; // Implement locale probing
            this.Locale = "x-default"; // Implement locale probing
            this.SetHreflang(HrefLangLocale: this.Locale, Url: this.DocUrl);
        }

        { // Canonical
            this.Canonical = this.DocUrl;
            this.DebugMsg(string.Format("CANONICAL: {0}", this.Canonical));
        }

        /** Get Response Body ---------------------------------------------- **/

        try
        {
            byte[] RawData = ClientResponse.GetContentAsBytes();
            this.SetContentLength(Length: RawData.Length);
            PdfTools = new MacroscopePdfTools(PdfData: RawData);
            if (PdfTools.GetHasError())
            {
                this.AddRemark("CORRUPT_PDF", Observation: PdfTools.GetErrorMessage());
            }
            this.SetWasDownloaded(true);
        }
        catch (Exception ex)
        {
            this.DebugMsg(string.Format("Exception: {0}", ex.Message));
            this.SetStatusCode(HttpStatusCode.BadRequest);
            // A null PdfTools marks the PDF as unreadable for all later steps.
            PdfTools = null;
            this.SetContentLength(Length: 0);
        }

        if (PdfTools != null)
        {
            /** Title ---------------------------------------------------------- **/

            string TitleText = PdfTools.GetTitle();
            if (!string.IsNullOrEmpty(TitleText))
            {
                this.SetTitle(TitleText, MacroscopeConstants.TextProcessingMode.NO_PROCESSING);
                this.DebugMsg(string.Format("TITLE: {0}", this.GetTitle()));
            }
            else
            {
                this.DebugMsg(string.Format("TITLE: {0}", "MISSING"));
            }

            /** Author --------------------------------------------------------- **/

            string AuthorText = PdfTools.GetAuthor();
            if (!string.IsNullOrEmpty(AuthorText))
            {
                this.SetAuthor(AuthorText: AuthorText, ProcessingMode: MacroscopeConstants.TextProcessingMode.NO_PROCESSING);
                this.DebugMsg(string.Format("AUTHOR: {0}", this.GetAuthor()));
            }
            else
            {
                this.DebugMsg(string.Format("AUTHOR: {0}", "MISSING"));
            }

            /** Description ---------------------------------------------------- **/

            string DescriptionText = PdfTools.GetDescription();
            if (!string.IsNullOrEmpty(DescriptionText))
            {
                this.SetDescription(DescriptionText, MacroscopeConstants.TextProcessingMode.NO_PROCESSING);
                // Labelled DESCRIPTION so these lines are not mistaken for the title output.
                this.DebugMsg(string.Format("DESCRIPTION: {0}", this.GetDescription()));
            }
            else
            {
                this.DebugMsg(string.Format("DESCRIPTION: {0}", "MISSING"));
            }

            /** Metadata Keywords ---------------------------------------------- **/

            string KeywordsText = PdfTools.GetKeywords();
            if (!string.IsNullOrEmpty(KeywordsText))
            {
                this.SetKeywords(KeywordsText: KeywordsText);
                this.DebugMsg(string.Format("KEYWORDS: {0}", this.GetKeywords()));
            }
            else
            {
                this.DebugMsg(string.Format("KEYWORDS: {0}", "MISSING"));
            }

            /** Body Text ------------------------------------------------------ **/

            // Reset the body text first; it is only filled in when the PDF parsed cleanly.
            this.SetBodyText(Text: "");
            if (PdfTools.GetHasError())
            {
                this.AddRemark("PDF_ERROR", Observation: PdfTools.GetErrorMessage());
            }
            else
            {
                string BodyText = PdfTools.GetTextAsString();
                if (!string.IsNullOrEmpty(BodyText))
                {
                    this.SetDocumentText(Text: BodyText);
                    this.SetBodyText(Text: BodyText);
                }
            }
            this.DebugMsg(string.Format("BODY TEXT: {0}", this.GetBodyTextRaw()));
        }

        /** Data Extractors ------------------------------------------------ **/

        if (!string.IsNullOrEmpty(this.GetBodyTextRaw()))
        {
            if (MacroscopePreferencesManager.GetDataExtractorsEnable())
            {
                if (MacroscopePreferencesManager.GetDataExtractorsApplyToPdf())
                {
                    string Text = this.GetBodyTextRaw();
                    this.ProcessGenericDataExtractors(GenericText: Text);
                }
            }
        }

        /** Out Links Text ------------------------------------------------- **/

        if (this.GetDocumentTextRawLength() > 0)
        {
            if (this.GetIsInternal())
            {
                string Text = this.GetDocumentTextRaw();
                this.ProcessPureTextOutlinks(TextDoc: Text, LinkType: MacroscopeConstants.InOutLinkType.PDF);
            }
        }

        /** Out Links in Annotations --------------------------------------- **/

        // PdfTools is null when the response body could not be read; without
        // this check the GetOutLinks() call would dereference null.
        if ((PdfTools != null) && this.GetIsInternal() && (this.GetDocumentTextRawLength() > 0))
        {
            List <KeyValuePair <string, string> > AnnotationOutLinks = PdfTools.GetOutLinks();

            // TODO: Implement extraction of text that underlies the link annotation

            foreach (KeyValuePair <string, string> AnnotationOutLinkPair in AnnotationOutLinks)
            {
                MacroscopeHyperlinkOut HyperlinkOut = null;
                string AnnotationOutLinkUrlAbs;

                AnnotationOutLinkUrlAbs = MacroscopeHttpUrlUtils.MakeUrlAbsolute(
                    BaseHref: this.BaseHref,
                    BaseUrl: this.DocUrl,
                    Url: AnnotationOutLinkPair.Key
                );

                HyperlinkOut = this.HyperlinksOut.Add(LinkType: MacroscopeConstants.HyperlinkType.PDF, UrlTarget: AnnotationOutLinkUrlAbs);
                HyperlinkOut.SetRawTargetUrl(TargetUrl: AnnotationOutLinkUrlAbs);
                // The pair's value is used for the alt text, anchor text and title alike.
                HyperlinkOut.SetAltText(AnnotationOutLinkPair.Value);
                HyperlinkOut.SetAnchorText(AnnotationOutLinkPair.Value);
                HyperlinkOut.SetTitle(AnnotationOutLinkPair.Value);
                HyperlinkOut.SetDoFollow();
                HyperlinkOut.SetMethod(Method: "GET");

                this.AddDocumentOutlink(AbsoluteUrl: AnnotationOutLinkUrlAbs, LinkType: MacroscopeConstants.InOutLinkType.PDF, Follow: true);
            }
        }

        /** ---------------------------------------------------------------- **/
    }

    if (ResponseErrorCondition != null)
    {
        this.ProcessErrorCondition(ResponseErrorCondition);
    }
}
/** -------------------------------------------------------------------- **/
// Appends a <url> element to UrlSetNode for every outgoing hyperlink of msDoc
// whose path ends in ".pdf", whose host is allowed, and which is on the same
// host as msDoc. Each <url> gets <loc> (the URL), <changefreq> ("daily") and
// <priority> ("1.0") children. Dedupe records every URL seen, including ones
// that are then filtered out, so each URL is considered only once.
private void GenerateXmlSitemapPdfEntries(
    MacroscopeDocument msDoc,
    XmlDocument SitemapXml,
    XmlElement UrlSetNode,
    Dictionary <string, bool> Dedupe
)
{
    foreach (MacroscopeHyperlinkOut Link in msDoc.IterateHyperlinksOut())
    {
        string TargetUrl = Link.GetTargetUrl();

        // Parsed before the dedupe lookup; the Uri constructor throws on an unparsable URL.
        Uri ParsedTarget = new Uri(uriString: TargetUrl);

        if (Dedupe.ContainsKey(TargetUrl))
        {
            continue;
        }

        Dedupe.Add(TargetUrl, true);

        // The three checks short-circuit in this order.
        bool Qualifies =
            ParsedTarget.AbsolutePath.ToLower().EndsWith(".pdf", StringComparison.InvariantCultureIgnoreCase)
            && this.DocCollection.GetAllowedHosts().IsAllowedFromUrl(Url: TargetUrl)
            && MacroscopeHttpUrlUtils.VerifySameHost(BaseUrl: msDoc.GetUrl(), Url: TargetUrl);

        if (!Qualifies)
        {
            continue;
        }

        XmlElement UrlNode = SitemapXml.CreateElement(string.Empty, "url", MacroscopeSitemapGenerator.XmlNamespace);
        UrlSetNode.AppendChild(UrlNode);

        // Child element name => text content, in document order.
        string[][] ChildEntries = new string[][]
        {
            new string[] { "loc", TargetUrl },
            new string[] { "changefreq", "daily" },
            new string[] { "priority", "1.0" }
        };

        foreach (string[] ChildEntry in ChildEntries)
        {
            XmlElement EntryNode = SitemapXml.CreateElement(string.Empty, ChildEntry[0], MacroscopeSitemapGenerator.XmlNamespace);
            XmlText TextNode = SitemapXml.CreateTextNode(ChildEntry[1]);

            UrlNode.AppendChild(EntryNode);
            EntryNode.AppendChild(TextNode);
        }
    }
}
/**************************************************************************/

/// <summary>
/// Builds the redirect chain that starts at <paramref name="StartUrl"/>.
/// The first entry is created from the supplied status code and URLs; each
/// further entry is obtained by probing the previous entry's redirect URL,
/// resolved against the previous entry's URL.
/// </summary>
/// <param name="StatusCode">Status code already received for <paramref name="StartUrl"/>.</param>
/// <param name="StartUrl">URL at which the chain begins.</param>
/// <param name="RedirectUrl">Redirect target reported for <paramref name="StartUrl"/>.</param>
/// <returns>
/// The hops collected so far. Exceptions are logged through <c>DebugMsg</c> and
/// never propagate, so the list may be shorter than the real chain, or empty.
/// </returns>
public async Task<List<MacroscopeRedirectChainDocStruct>> AnalyzeRedirectChains(
    HttpStatusCode StatusCode,
    string StartUrl,
    string RedirectUrl
)
{
    List<MacroscopeRedirectChainDocStruct> RedirectChain = new List<MacroscopeRedirectChainDocStruct>();
    int MaxHops = MacroscopePreferencesManager.GetRedirectChainsMaxHops();
    MacroscopeRedirectChainDocStruct StructStart;
    int IHOP = 0;
    bool KeepFollowing = true;
    string PrevUrl = null;
    string NextUrl = null;

    try
    {
        try
        {
            StructStart = new MacroscopeRedirectChainDocStruct(
                NewStatusCode: StatusCode,
                NewUrl: StartUrl,
                NewRedirectUrl: RedirectUrl
            );
            RedirectChain.Add(StructStart);
            PrevUrl = StructStart.Url;
            NextUrl = StructStart.RedirectUrl;
        }
        catch (Exception ex)
        {
            this.DebugMsg(ex.Message);
        }

        do
        {
            // Nothing to follow: either the start entry could not be built or the
            // last hop did not report a redirect target. Probing a null or empty
            // URL would only fail once per remaining hop.
            if (string.IsNullOrEmpty(NextUrl))
            {
                break;
            }

            try
            {
                if (!string.IsNullOrEmpty(PrevUrl))
                {
                    // A redirect target may be relative to the URL that issued it.
                    NextUrl = MacroscopeHttpUrlUtils.MakeUrlAbsolute(PrevUrl, NextUrl);
                }

                MacroscopeRedirectChainDocStruct StructNext = await this.Probe(Url: NextUrl);
                RedirectChain.Add(StructNext);
                PrevUrl = StructNext.Url;
                NextUrl = StructNext.RedirectUrl;

                switch (StructNext.StatusCode)
                {
                    case HttpStatusCode.Moved:
                    case HttpStatusCode.Found:
                    case HttpStatusCode.SeeOther:
                    case HttpStatusCode.TemporaryRedirect:
                    // 308 Permanent Redirect; written as a cast because older
                    // framework versions have no named enum member for it.
                    case (HttpStatusCode)308:
                        break;

                    default:
                        // Any non-redirect status is the end of the chain.
                        KeepFollowing = false;
                        break;
                }
            }
            catch (Exception ex)
            {
                // NOTE(review): a failed hop is logged and the same URL is tried
                // again on the next iteration, as before; confirm this retry is intended.
                this.DebugMsg(ex.Message);
            }

            IHOP++;
        }
        while (KeepFollowing && (IHOP < MaxHops));
    }
    catch (Exception ex)
    {
        this.DebugMsg(ex.Message);
    }

    return (RedirectChain);
}
/**************************************************************************/

/// <summary>
/// Parses an HTTP <c>Link</c> response header. For every link value found it
/// records the well-known relation types (canonical, shortlink, first, prev,
/// next, last) on this document, and adds the link target as a followable
/// out-link of type <c>RELATED</c>.
/// </summary>
/// <param name="HttpLinkHeader">
/// Raw header value; may hold several comma-separated link values.
/// A null or empty value is ignored.
/// </param>
private void ProcessHttpLinkHeader(string HttpLinkHeader)
{
    // https://webmasters.googleblog.com/2011/09/pagination-with-relnext-and-relprev.html
    // Link: <http://www.example.com/downloads/white-paper.pdf>; rel="canonical"

    if (string.IsNullOrEmpty(HttpLinkHeader))
    {
        return;
    }

    // Match every link value directly against the whole header. Splitting on
    // commas first would break targets that contain a comma, and matching the
    // whole header once per split item only ever yielded the last link.
    // The parameter name "rel" is matched case-insensitively, with any amount
    // of whitespace after the semicolon.
    MatchCollection matches = Regex.Matches(
        HttpLinkHeader,
        "<([^<>]+)>\\s*;\\s*rel=\"([^\"]+)\"",
        RegexOptions.IgnoreCase
    );

    foreach (Match match in matches)
    {
        string Url = match.Groups[1].Value;
        string Rel = match.Groups[2].Value;

        if (string.IsNullOrEmpty(Rel) || string.IsNullOrEmpty(Url))
        {
            continue;
        }

        // Invariant lower-casing so that the comparison does not depend on the
        // current culture.
        switch (Rel.ToLowerInvariant())
        {
            case @"canonical":
                this.SetCanonical(Url: Url);
                break;

            case @"shortlink":
                this.SetLinkShortLink(Url: Url);
                break;

            case @"first":
                this.SetLinkFirst(Url: Url);
                break;

            case @"prev":
                this.SetLinkPrev(Url: Url);
                break;

            case @"next":
                this.SetLinkNext(Url: Url);
                break;

            case @"last":
                this.SetLinkLast(Url: Url);
                break;

            default:
                this.DebugMsgForced(string.Format("Link Rel: {0} :: {1}", Rel, Url));
                break;
        }

        string LinkUrl = Uri.UnescapeDataString(stringToUnescape: Url);

        // The guard must test the unescaped URL. Testing the not-yet-computed
        // absolute URL meant that no out-link was ever added.
        if (string.IsNullOrEmpty(LinkUrl))
        {
            continue;
        }

        try
        {
            string LinkUrlAbs = MacroscopeHttpUrlUtils.MakeUrlAbsolute(
                BaseHref: this.GetBaseHref(),
                BaseUrl: this.DocUrl,
                Url: LinkUrl
            );

            if (!string.IsNullOrEmpty(LinkUrlAbs))
            {
                this.AddDocumentOutlink(
                    AbsoluteUrl: LinkUrlAbs,
                    LinkType: MacroscopeConstants.InOutLinkType.RELATED,
                    Follow: true
                );
            }
        }
        catch (Exception ex)
        {
            // One unresolvable target must not stop the remaining links in the
            // header from being processed.
            this.DebugMsgForced(string.Format("ProcessHttpLinkHeader: {0}", ex.Message));
        }
    }
}