/**************************************************************************/

/// <summary>
/// Adds the location of a document to the tree view as a chain of nested
/// nodes, one node per "/"-separated element of the string formed by joining
/// <c>msDoc.GetHostAndPort()</c> and <c>msDoc.GetPath()</c> with "/".
/// Nodes that already exist (matched by key) are reused; missing nodes are
/// created with the element as both key and text.
/// </summary>
/// <param name="msDoc">Document whose host/port and path define the branch.</param>
/// <param name="Url">URL stored in the Tag of every node created by this call.</param>
protected override void RenderTreeView( MacroscopeDocument msDoc, string Url )
{
    // The branch is always built beneath the first top-level node.
    // NOTE(review): this assumes the tree already has a root node - confirm with the caller.
    TreeNode CurrentNode = this.tvTreeView.Nodes[0];

    string sPath = string.Join("/", msDoc.GetHostAndPort(), msDoc.GetPath());

    DebugMsg(string.Format("HIERARCHY PATH: {0}", sPath));

    string[] Elements = sPath.Split(new char[] { '/' }, StringSplitOptions.RemoveEmptyEntries);

    // string.Join never returns null, so a null test cannot detect a missing
    // hierarchy; an empty element list is the condition that can occur.
    if (Elements.Length == 0)
    {
        DebugMsg(string.Format("HIERARCHY ERROR: {0}", Url));
        return;
    }

    foreach (string sElementName in Elements)
    {
        // The keyed indexer returns null when no child has this key,
        // so a single lookup covers both the "exists" test and the fetch.
        TreeNode ChildNode = CurrentNode.Nodes[sElementName];

        if (ChildNode == null)
        {
            // Add( key, text ) sets both Name and Text of the new node.
            ChildNode = CurrentNode.Nodes.Add(sElementName, sElementName);
            ChildNode.Tag = Url;
        }

        CurrentNode = ChildNode;
    }
}
/**************************************************************************/

/// <summary>
/// Fetches a single URL: obtains or creates its document, records hostname
/// and robots.txt results on it, executes the document, and then queues any
/// follow-up work (credential requests, redirect targets, hreflang links and
/// outlinks).
/// </summary>
/// <param name="Url">URL to fetch.</param>
/// <param name="RedirectedFromUrl">
/// When not null or empty, recorded on the document via SetUrlRedirectFrom.
/// </param>
/// <returns>
/// VOID when the configured page limit has been reached;
/// ALREADY_SEEN when the collection holds a non-dirty document for the URL;
/// SKIPPED when the URL is deeper than the configured depth;
/// ERROR when <c>msDoc.Execute()</c> returns false;
/// otherwise SUCCESS.
/// NETWORK_ERROR and ROBOTS_DISALLOWED are recorded on the document but are
/// superseded in the return value once the document has been executed.
/// </returns>
private async Task<MacroscopeConstants.FetchStatus> Fetch(string Url, string RedirectedFromUrl = null)
{
    MacroscopeDocument msDoc;
    MacroscopeConstants.FetchStatus FetchStatus = MacroscopeConstants.FetchStatus.VOID;

    /** Page limit ------------------------------------------------------- **/

    int PageLimit = MacroscopePreferencesManager.GetPageLimit();

    if (PageLimit > -1)
    {
        int PagesFound = this.JobMaster.GetPagesFound();

        if (PagesFound >= PageLimit)
        {
            this.DebugMsg(string.Format("PAGE LIMIT REACHED: {0} :: {1}", PageLimit, PagesFound));
            return(FetchStatus);
        }
    }

    /** Obtain or create the document ------------------------------------ **/

    if (this.DocCollection.ContainsDocument(Url: Url))
    {
        msDoc = this.DocCollection.GetDocumentByUrl(Url: Url);

        // A known document that announced a BASIC realm is re-created with
        // the stored credential, when one is available for that realm.
        if (
            (msDoc.GetAuthenticationRealm() != null)
            && (msDoc.GetAuthenticationType() == MacroscopeConstants.AuthenticationType.BASIC))
        {
            MacroscopeCredential Credential = this.JobMaster.GetCredentialsHttp().GetCredential(
                msDoc.GetHostAndPort(),
                msDoc.GetAuthenticationRealm()
            );

            if (Credential != null)
            {
                msDoc = this.DocCollection.CreateDocument(
                    Credential: Credential,
                    Url: Url
                );
            }
        }
    }
    else
    {
        msDoc = this.DocCollection.CreateDocument(Url: Url);
    }

    if (!string.IsNullOrEmpty(RedirectedFromUrl))
    {
        msDoc.SetUrlRedirectFrom(Url: RedirectedFromUrl);
    }

    msDoc.SetFetchStatus(MacroscopeConstants.FetchStatus.OK);

    /** Hostname check --------------------------------------------------- **/

    if (!MacroscopeDnsTools.CheckValidHostname(Url: Url))
    {
        this.DebugMsg(string.Format("Fetch :: CheckValidHostname: {0}", "NOT OK"));
        msDoc.SetStatusCode(HttpStatusCode.BadGateway);
        FetchStatus = MacroscopeConstants.FetchStatus.NETWORK_ERROR;
        msDoc.SetFetchStatus(FetchStatus);
    }

    /** robots.txt ------------------------------------------------------- **/

    msDoc.SetAllowedByRobots(await this.JobMaster.GetRobots().CheckRobotRule(Url: Url));

    // ApplyRobotRule yields false when the URL is disallowed.
    bool PassedRobotRule = await this.JobMaster.GetRobots().ApplyRobotRule(Url: Url);

    if (!PassedRobotRule)
    {
        this.DebugMsg(string.Format("Disallowed by robots.txt: {0}", Url));
        this.JobMaster.AddToBlockedByRobots(Url);
        FetchStatus = MacroscopeConstants.FetchStatus.ROBOTS_DISALLOWED;
        msDoc.SetFetchStatus(FetchStatus);
        JobHistory.VisitedHistoryItem(Url: msDoc.GetUrl());
    }
    else
    {
        this.JobMaster.RemoveFromBlockedByRobots(Url);
    }

    /** External flag ---------------------------------------------------- **/

    if (this.AllowedHosts.IsExternalUrl(Url: Url))
    {
        this.DebugMsg(string.Format("IsExternalUrl: {0}", Url));
        msDoc.SetIsExternal(State: true);
    }

    /** Already seen / too deep ------------------------------------------ **/

    if (
        this.DocCollection.ContainsDocument(Url: Url)
        && !this.DocCollection.GetDocumentByUrl(Url: Url).GetIsDirty())
    {
        FetchStatus = MacroscopeConstants.FetchStatus.ALREADY_SEEN;
        return(FetchStatus);
    }

    int MaxDepth = MacroscopePreferencesManager.GetDepth();

    if (MaxDepth >= 0)
    {
        int Depth = MacroscopeHttpUrlUtils.FindUrlDepth(Url: Url);

        if (Depth > MaxDepth)
        {
            this.DebugMsg(string.Format("URL Too Deep: {0}", Depth));
            FetchStatus = MacroscopeConstants.FetchStatus.SKIPPED;
            return(FetchStatus);
        }
    }

    /** Execute ---------------------------------------------------------- **/

    bool Executed = await msDoc.Execute();

    if (!Executed)
    {
        this.DebugMsg(string.Format("EXECUTE FAILED: {0}", Url));
        FetchStatus = MacroscopeConstants.FetchStatus.ERROR;
    }

    /** Post-processing (runs whether or not Execute succeeded) ---------- **/

    if (
        (msDoc.GetStatusCode() == HttpStatusCode.Unauthorized)
        && (msDoc.GetAuthenticationType() == MacroscopeConstants.AuthenticationType.BASIC))
    {
        // Ask for credentials for this realm and re-queue the URL.
        MacroscopeCredentialsHttp CredentialsHttp = this.JobMaster.GetCredentialsHttp();

        CredentialsHttp.EnqueueCredentialRequest(
            Domain: msDoc.GetHostAndPort(),
            Realm: msDoc.GetAuthenticationRealm(),
            Url: msDoc.GetUrl()
        );

        this.JobMaster.AddUrlQueueItem(Url: msDoc.GetUrl());
    }

    if (msDoc.GetIsRedirect())
    {
        this.DebugMsg(string.Format("REDIRECTION DETECTED GetUrl: {0}", msDoc.GetUrl()));
        this.DebugMsg(string.Format("REDIRECTION DETECTED From: {0}", msDoc.GetUrlRedirectFrom()));

        if (MacroscopePreferencesManager.GetCheckRedirects())
        {
            string UrlRedirectTo = msDoc.GetUrlRedirectTo();
            string HostnameTo = MacroscopeAllowedHosts.ParseHostnameFromUrl(UrlRedirectTo);

            this.DebugMsg(string.Format("REDIRECTION DETECTED UrlRedirectTo: {0}", UrlRedirectTo));
            this.DebugMsg(string.Format("REDIRECTION DETECTED HostnameTo: {0}", HostnameTo));

            // When following redirects, the target host is added to the allowed
            // hosts if external links are checked, or if the target is internal.
            if (
                MacroscopePreferencesManager.GetFollowRedirects()
                && (MacroscopePreferencesManager.GetCheckExternalLinks()
                    || this.AllowedHosts.IsInternalUrl(Url: UrlRedirectTo)))
            {
                this.AllowedHosts.AddFromUrl(Url: UrlRedirectTo);
            }
        }

        this.JobMaster.AddUrlQueueItem(Url: msDoc.GetUrlRedirectTo());
    }
    else
    {
        this.ProcessHrefLangLanguages(msDoc); // Process Languages from HrefLang
        this.JobMaster.ProcessOutlinks(msDoc: msDoc); // Process Outlinks from document
    }

    // Report SUCCESS only for a successful Execute(); assigning it
    // unconditionally would discard the ERROR status recorded above.
    if (Executed)
    {
        FetchStatus = MacroscopeConstants.FetchStatus.SUCCESS;
    }

    /** History ---------------------------------------------------------- **/

    if (DocCollection.ContainsDocument(msDoc: msDoc))
    {
        JobHistory.VisitedHistoryItem(Url: Url);
    }
    else
    {
        this.DebugMsg(string.Format("OOPS: {0}", Url));
    }

    return(FetchStatus);
}
/**************************************************************************/

// TODO: Finish this.

/// <summary>
/// Depth-first walk over the outgoing hyperlinks of <paramref name="ParentDoc"/>.
/// The parent's URL is pushed onto <paramref name="PageChain"/> for the duration
/// of the call. After all eligible links have been descended into, a copy of the
/// chain (up to and including the first entry equal to the parent's URL) is
/// stored in <c>PageChains</c> under the chain's last URL, and the parent's URL
/// is popped again.
/// </summary>
/// <param name="PageChain">Chain of URLs from the walk's start to the current document; mutated in place.</param>
/// <param name="ParentDoc">Document whose outgoing hyperlinks are followed.</param>
private void Descend( LinkedList <string> PageChain, MacroscopeDocument ParentDoc )
{
    string parentUrl = ParentDoc.GetUrl();

    PageChain.AddLast(parentUrl);

    foreach (MacroscopeHyperlinkOut outlink in ParentDoc.IterateHyperlinksOut())
    {
        // Skip links back to the parent itself and links already visited.
        if (
            outlink.GetTargetUrl().Equals(parentUrl)
            || this.CheckNodeAlreadyVisited(msDoc: ParentDoc, HyperlinkOut: outlink))
        {
            continue;
        }

        MacroscopeDocument childDoc = this.DocCollection.GetDocumentByUrl(Url: outlink.GetTargetUrl());

        // Only descend into known documents on the same host and port as the parent.
        if ((childDoc != null) && childDoc.GetHostAndPort().Equals(ParentDoc.GetHostAndPort()))
        {
            this.Descend(
                PageChain: PageChain,
                ParentDoc: childDoc
            );
        }
    }

    // Copy the chain, stopping after the first entry that equals the parent's URL.
    LinkedList <string> chainSnapshot = new LinkedList <string> ();

    for (LinkedListNode <string> node = PageChain.First; node != null; node = node.Next)
    {
        chainSnapshot.AddLast(node.Value);

        if (ParentDoc.GetUrl().Equals(node.Value))
        {
            break;
        }
    }

    // File the snapshot under the URL currently at the end of the chain.
    string chainKey = PageChain.Last.Value;

    if (!this.PageChains.ContainsKey(chainKey))
    {
        this.PageChains.Add(chainKey, new List <LinkedList <string> > ());
    }

    this.PageChains[chainKey].Add(chainSnapshot);

    PageChain.RemoveLast();
}
/**************************************************************************/

/// <summary>
/// Fetches a single URL synchronously: obtains or creates its document,
/// records hostname and robots.txt results on it, adds the URL to the job
/// history, executes the document and, on success, stores it in the
/// collection and queues follow-up work (credential requests, redirect
/// target, hreflang links and outlinks).
/// </summary>
/// <param name="Url">URL to fetch.</param>
/// <returns>
/// ALREADY_SEEN when the collection holds a non-dirty document for the URL;
/// SKIPPED when the URL is deeper than the job's depth setting;
/// ERROR when <c>msDoc.Execute()</c> returns false; otherwise SUCCESS.
/// NETWORK_ERROR and ROBOTS_DISALLOWED are only recorded on the document;
/// they are never the return value.
/// </returns>
private MacroscopeConstants.FetchStatus Fetch(string Url)
{
    MacroscopeDocument msDoc = this.DocCollection.GetDocument(Url);

    if (msDoc == null)
    {
        msDoc = this.DocCollection.CreateDocument(Url);
    }
    else if (
        (msDoc.GetAuthenticationRealm() != null)
        && (msDoc.GetAuthenticationType() == MacroscopeConstants.AuthenticationType.BASIC))
    {
        // A known document that announced a BASIC realm is re-created with
        // the stored credential, when one is available for that realm.
        MacroscopeCredential storedCredential = this.JobMaster.GetCredentialsHttp().GetCredential(
            msDoc.GetHostAndPort(),
            msDoc.GetAuthenticationRealm()
        );

        if (storedCredential != null)
        {
            msDoc = this.DocCollection.CreateDocument(
                Credential: storedCredential,
                Url: Url
            );
        }
    }

    msDoc.SetFetchStatus(MacroscopeConstants.FetchStatus.OK);

    // An invalid hostname is recorded on the document; processing continues.
    if (!MacroscopeDnsTools.CheckValidHostname(Url: Url))
    {
        DebugMsg(string.Format("Fetch :: CheckValidHostname: {0}", "NOT OK"));
        msDoc.SetStatusCode(HttpStatusCode.BadGateway);
        msDoc.SetFetchStatus(MacroscopeConstants.FetchStatus.NETWORK_ERROR);
    }

    // A robots.txt disallow is likewise recorded; processing continues.
    if (this.JobMaster.GetRobots().ApplyRobotRule(Url))
    {
        this.JobMaster.RemoveFromBlockedByRobots(Url);
    }
    else
    {
        DebugMsg(string.Format("Disallowed by robots.txt: {0}", Url));
        this.JobMaster.AddToBlockedByRobots(Url);
        msDoc.SetFetchStatus(MacroscopeConstants.FetchStatus.ROBOTS_DISALLOWED);
        this.JobMaster.GetJobHistory().VisitedHistoryItem(msDoc.GetUrl());
    }

    this.JobMaster.GetJobHistory().AddHistoryItem(Url);

    if (this.AllowedHosts.IsExternalUrl(Url: Url))
    {
        DebugMsg(string.Format("IsExternalUrl: {0}", Url));
        msDoc.SetIsExternal(State: true);
    }

    if (this.DocCollection.ContainsDocument(Url) && !this.DocCollection.GetDocument(Url).GetIsDirty())
    {
        return(MacroscopeConstants.FetchStatus.ALREADY_SEEN);
    }

    // The depth limit is applied only when the job's depth setting is above zero.
    if (this.JobMaster.GetDepth() > 0)
    {
        int urlDepth = MacroscopeUrlUtils.FindUrlDepth(Url);

        if (urlDepth > this.JobMaster.GetDepth())
        {
            DebugMsg(string.Format("TOO DEEP: {0}", urlDepth));
            return(MacroscopeConstants.FetchStatus.SKIPPED);
        }
    }

    if (!msDoc.Execute())
    {
        DebugMsg(string.Format("EXECUTE FAILED: {0}", Url));
        return(MacroscopeConstants.FetchStatus.ERROR);
    }

    this.DocCollection.AddDocument(Url, msDoc);

    if (
        (msDoc.GetStatusCode() == HttpStatusCode.Unauthorized)
        && (msDoc.GetAuthenticationType() == MacroscopeConstants.AuthenticationType.BASIC))
    {
        // Ask for credentials for this realm and re-queue the URL.
        MacroscopeCredentialsHttp credentialStore = this.JobMaster.GetCredentialsHttp();

        credentialStore.EnqueueCredentialRequest(
            Domain: msDoc.GetHostAndPort(),
            Realm: msDoc.GetAuthenticationRealm(),
            Url: msDoc.GetUrl()
        );

        this.JobMaster.AddUrlQueueItem(Url: msDoc.GetUrl());
    }

    this.JobMaster.GetJobHistory().VisitedHistoryItem(msDoc.GetUrl());
    this.JobMaster.IncPageLimitCount();

    if (!msDoc.GetIsRedirect())
    {
        this.ProcessHrefLangLanguages(msDoc); // Process Languages from HrefLang
        this.ProcessOutlinks(msDoc); // Process Outlinks from document
    }
    else
    {
        DebugMsg(string.Format("REDIRECTION DETECTED GetUrl: {0}", msDoc.GetUrl()));
        DebugMsg(string.Format("REDIRECTION DETECTED From: {0}", msDoc.GetUrlRedirectFrom()));

        if (MacroscopePreferencesManager.GetFollowRedirects())
        {
            // The first two values are read but not used further in this method.
            string sourceHost = msDoc.GetHostAndPort();
            string redirectFromHost = MacroscopeAllowedHosts.ParseHostnameFromUrl(msDoc.GetUrlRedirectFrom());
            string redirectTarget = msDoc.GetUrlRedirectTo();
            string redirectTargetHost = MacroscopeAllowedHosts.ParseHostnameFromUrl(redirectTarget);

            DebugMsg(string.Format("REDIRECTION DETECTED UrlRedirectTo: {0}", redirectTarget));
            DebugMsg(string.Format("REDIRECTION DETECTED HostnameTo: {0}", redirectTargetHost));
        }

        // The redirect target is queued regardless of the follow-redirects preference.
        this.JobMaster.AddUrlQueueItem(Url: msDoc.GetUrlRedirectTo());
    }

    return(MacroscopeConstants.FetchStatus.SUCCESS);
}