// Runs ProcessPureTextOutlinks over the TextDoc fixture and checks that every
// outlink found is listed in TextLinks, and that exactly five outlinks result.
public void TestLinksInTextDocs()
{
    const string DummyUrl = @"https://nazuke.github.io/dummy.txt";

    MacroscopeJobMaster Master = new MacroscopeJobMaster(
        JobRunTimeMode: MacroscopeConstants.RunTimeMode.LIVE,
        TaskController: this
    );
    MacroscopeDocumentCollection Collection = new MacroscopeDocumentCollection(JobMaster: Master);
    MacroscopeDocument TextDocument = Collection.CreateDocument(Url: DummyUrl);

    Assert.IsNotNull(TextDocument, string.Format("FAIL: {0}", DummyUrl));

    TextDocument.ProcessPureTextOutlinks(
        TextDoc: this.TextDoc,
        LinkType: MacroscopeConstants.InOutLinkType.PURETEXT
    );

    // Each discovered target must be one of the expected links.
    foreach (MacroscopeLink Link in TextDocument.IterateOutlinks())
    {
        Assert.Contains(Link.GetTargetUrl(), this.TextLinks);
    }

    Assert.AreEqual(5, TextDocument.CountOutlinks());
}
/**************************************************************************/
/// <summary>
/// Builds a map keyed by sitemap URL; each value maps the target URL of every
/// outlink of that sitemap to an initial flag of <c>false</c>.
/// </summary>
/// <param name="DocCollection">Collection passed to FindSitemaps to locate the sitemap documents.</param>
/// <returns>
/// The map built so far. If an exception is raised part-way through, it is logged
/// via DebugMsg and the partially filled map is returned.
/// </returns>
private Dictionary <string, Dictionary <string, bool> > BuildSitemapUrlList(MacroscopeDocumentCollection DocCollection)
{
    Dictionary<string, Dictionary<string, bool>> Result = new Dictionary<string, Dictionary<string, bool>>();

    try
    {
        MacroscopeDocumentList Sitemaps = this.FindSitemaps(DocCollection: DocCollection);

        foreach (MacroscopeDocument Sitemap in Sitemaps.IterateDocuments())
        {
            string SitemapKey = Sitemap.GetUrl();
            Dictionary<string, bool> Targets;

            // Create the per-sitemap bucket the first time this sitemap URL is seen.
            if (!Result.TryGetValue(SitemapKey, out Targets))
            {
                Targets = new Dictionary<string, bool>();
                Result.Add(SitemapKey, Targets);
            }

            foreach (MacroscopeLink Link in Sitemap.IterateOutlinks())
            {
                string Target = Link.GetTargetUrl();

                if (!Targets.ContainsKey(Target))
                {
                    Targets.Add(Target, false);
                }
            }
        }
    }
    catch (Exception ex)
    {
        this.DebugMsg(string.Format("BuildSitemapUrlList: {0}", ex.Message));
    }

    return Result;
}
/** Robots.txt Out Links **************************************************/
/// <summary>
/// Scans robots.txt lines for "Sitemap:" directives and registers each
/// parseable sitemap URL as a followed ROBOTSTEXT outlink of this document.
/// Does nothing when this document reports itself as external.
/// </summary>
/// <param name="TextDoc">The robots.txt content, one entry per line.</param>
private void ProcessRobotsTextOutlinks(List <string> TextDoc)
{
    if (this.GetIsExternal())
    {
        return;
    }

    foreach (string Line in TextDoc)
    {
        // Only "Sitemap: <something>" lines are of interest.
        if (!Regex.IsMatch(Line, @"^Sitemap:\s*[^\s]+", RegexOptions.IgnoreCase))
        {
            continue;
        }

        string Candidate = Regex.Replace(Line, @"^(Sitemap:\s*)", "", RegexOptions.IgnoreCase).Trim();

        if (string.IsNullOrEmpty(Candidate))
        {
            continue;
        }

        // The Uri constructor is used purely as a validity check; the original text is kept.
        bool IsValid = false;

        try
        {
            new Uri(Candidate);
            IsValid = true;
        }
        catch (Exception ex)
        {
            DebugMsg(string.Format("ProcessRobotsTextOutlinks: {0}", ex.Message));
        }

        if (!IsValid)
        {
            continue;
        }

        MacroscopeLink SitemapLink = this.AddDocumentOutlink(
            AbsoluteUrl: Candidate,
            LinkType: MacroscopeConstants.InOutLinkType.ROBOTSTEXT,
            Follow: true
        );

        if (SitemapLink != null)
        {
            SitemapLink.SetRawTargetUrl(Candidate);
        }
    }
}
/**************************************************************************/
/// <summary>
/// Writes the sitemap errors sheet to the CSV writer: a header record, then one
/// record per outlink of an internal XML sitemap whose linked document is
/// internal and either has a 4xx/5xx status code or is not allowed by robots.
/// </summary>
/// <param name="JobMaster">Job master that supplies the document collection.</param>
/// <param name="ws">CSV writer that receives the header and data records.</param>
private void BuildWorksheetSitemapErrors(
    MacroscopeJobMaster JobMaster,
    CsvWriter ws
)
{
    MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

    // Header record.
    ws.WriteField("Sitemap URL");
    ws.WriteField("Status Code");
    ws.WriteField("Robots");
    ws.WriteField("URL");
    ws.NextRecord();

    foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
    {
        if (!(msDoc.GetIsInternal() && msDoc.IsDocumentType(Type: MacroscopeConstants.DocumentType.SITEMAPXML)))
        {
            continue;
        }

        foreach (MacroscopeLink Outlink in msDoc.IterateOutlinks())
        {
            string TargetUrl = Outlink.GetTargetUrl();
            MacroscopeDocument msDocLinked = DocCollection.GetDocumentByUrl(Url: TargetUrl);

            // A sitemap may list a URL that has no document in the collection;
            // there is nothing to report for it, and dereferencing it would throw.
            if (msDocLinked == null)
            {
                continue;
            }

            bool InsertRow = false;

            if (msDocLinked.GetIsInternal())
            {
                int StatusCode = (int)msDocLinked.GetStatusCode();

                if ((StatusCode >= 400) && (StatusCode <= 599))
                {
                    InsertRow = true;
                }

                if (!msDocLinked.GetAllowedByRobots())
                {
                    InsertRow = true;
                }
            }

            if (InsertRow)
            {
                // NOTE(review): the status code and robots cells are written from the
                // sitemap document (msDoc), not the linked document — confirm this is intended.
                this.InsertAndFormatUrlCell(ws, msDoc);
                this.InsertAndFormatStatusCodeCell(ws, msDoc);
                this.InsertAndFormatRobotsCell(ws, msDoc);
                this.InsertAndFormatUrlCell(ws, TargetUrl);
                ws.NextRecord();
            }
        }
    }
}
/**************************************************************************/
/// <summary>
/// Queues the followable, not-yet-seen outlinks of a document for crawling,
/// as long as the job's page limit (when one is set) has not been reached.
/// </summary>
/// <param name="msDoc">Document whose outlinks are examined.</param>
private void ProcessOutlinks(MacroscopeDocument msDoc)
{
    // In the list/sitemap run modes, outlinks are only queued when the
    // "scan sites in list" preference is enabled.
    switch (this.JobMaster.GetRunTimeMode())
    {
        case MacroscopeConstants.RunTimeMode.LISTFILE:
        case MacroscopeConstants.RunTimeMode.LISTTEXT:
        case MacroscopeConstants.RunTimeMode.SITEMAP:
            if (!MacroscopePreferencesManager.GetScanSitesInList())
            {
                return;
            }
            break;
    }

    foreach (MacroscopeLink Link in msDoc.IterateOutlinks())
    {
        if (!Link.GetDoFollow())
        {
            continue;
        }

        string Target = Link.GetTargetUrl();

        if (Target == null)
        {
            continue;
        }

        if (this.JobMaster.GetJobHistory().SeenHistoryItem(Target))
        {
            continue;
        }

        // A page limit of -1 or lower means no limit is applied.
        bool LimitReached =
            (this.JobMaster.GetPageLimit() > -1)
            && (this.JobMaster.GetPageLimitCount() >= this.JobMaster.GetPageLimit());

        if (LimitReached)
        {
            this.DebugMsg(
                string.Format(
                    "PAGE LIMIT REACHED: {0} :: {1}",
                    this.JobMaster.GetPageLimit(),
                    this.JobMaster.GetPageLimitCount()
                )
            );
            continue;
        }

        this.JobMaster.AddUrlQueueItem(
            Url: Target,
            Check: true
        );
    }
}
/** Text Sitemap Out Links ************************************************/
/// <summary>
/// Treats each line of a text sitemap as a URL: strips all whitespace, checks
/// that it parses as a Uri, and registers it as a followed SITEMAPTEXT outlink.
/// Does nothing when this document reports itself as external.
/// </summary>
/// <param name="TextDoc">The text sitemap content, one entry per line.</param>
private void ProcessSitemapTextOutlinks(List <string> TextDoc)
{
    if (this.GetIsExternal())
    {
        return;
    }

    foreach (string Line in TextDoc)
    {
        string Candidate = Regex.Replace(Line, @"\s+", "");

        if (string.IsNullOrEmpty(Candidate))
        {
            continue;
        }

        // The Uri constructor is used purely as a validity check; the original text is kept.
        bool IsValid = false;

        try
        {
            new Uri(Candidate);
            IsValid = true;
        }
        catch (Exception ex)
        {
            DebugMsg(string.Format("ProcessSitemapTextOutlinks: {0}", ex.Message));
        }

        if (!IsValid)
        {
            continue;
        }

        MacroscopeLink SitemapLink = this.AddSitemapTextOutlink(
            AbsoluteUrl: Candidate,
            LinkType: MacroscopeConstants.InOutLinkType.SITEMAPTEXT,
            Follow: true
        );

        if (SitemapLink != null)
        {
            SitemapLink.SetRawTargetUrl(Candidate);
        }
    }
}
/**************************************************************************/
/// <summary>
/// Collects every &lt;loc&gt; element in the sitemap XML namespace, makes its text
/// absolute against this document's URL, and registers it as a followed
/// SITEMAPXML outlink. The raw (un-resolved) text is stored on the outlink.
/// </summary>
/// <param name="XmlDoc">Parsed sitemap XML document.</param>
private void ProcessSitemapXmlOutlinks(XmlDocument XmlDoc)
{
    XmlNodeList OutlinksList = XmlDoc.GetElementsByTagName("loc", MacroscopeConstants.SitemapXmlNamespace);

    // Guard before touching the list: the count is logged below, so the
    // null check has to come first to be of any use.
    if (OutlinksList == null)
    {
        return;
    }

    DebugMsg(string.Format("ProcessSitemapXmlOutlinks nlOutlinks: {0}", OutlinksList.Count));

    foreach (XmlNode LinkNode in OutlinksList)
    {
        string LinkUrl = null;

        try
        {
            LinkUrl = LinkNode.InnerText;
            DebugMsg(string.Format("ProcessSitemapXmlOutlinks sLinkUrl: {0}", LinkUrl));
        }
        catch (Exception ex)
        {
            DebugMsg(string.Format("ProcessSitemapXmlOutlinks: {0}", ex.Message));
        }

        if (LinkUrl == null)
        {
            continue;
        }

        string LinkUrlAbs = MacroscopeHttpUrlUtils.MakeUrlAbsolute(BaseUrl: this.GetUrl(), Url: LinkUrl);

        MacroscopeLink Outlink = this.AddDocumentOutlink(
            AbsoluteUrl: LinkUrlAbs,
            LinkType: MacroscopeConstants.InOutLinkType.SITEMAPXML,
            Follow: true
        );

        if (Outlink != null)
        {
            Outlink.SetRawTargetUrl(LinkUrl);
        }
    }
}
/** -------------------------------------------------------------------- **/
/// <summary>
/// Finds http/https URLs in free text and registers each one that parses as a
/// Uri as a followed outlink of the given link type.
/// </summary>
/// <param name="TextDoc">Text to scan; null or empty text yields no outlinks.</param>
/// <param name="LinkType">Link type recorded on every outlink that is added.</param>
public void ProcessPureTextOutlinks(string TextDoc, MacroscopeConstants.InOutLinkType LinkType)
{
    if (string.IsNullOrEmpty(TextDoc))
    {
        return;
    }

    // Punctuation that commonly surrounds a URL in prose. It is stripped in a
    // single pass so that mixed runs such as ".)" or ")," are removed whatever
    // their order; trimming one character at a time left some of them behind.
    char[] SurroundingPunctuation = { ',', '.', '(', ')', '"', '\'' };

    Regex UrlRegex = new Regex(
        @"(https?://[^/]+/[^\s]*)",
        RegexOptions.IgnoreCase
    );

    // The for-loop form guarantees NextMatch() runs on every iteration, so no
    // early "continue" can leave the loop stuck on the same match.
    for (Match UrlMatch = UrlRegex.Match(TextDoc); UrlMatch.Success; UrlMatch = UrlMatch.NextMatch())
    {
        string UrlProcessing = UrlMatch.Value.Trim().Trim(SurroundingPunctuation);

        if (string.IsNullOrEmpty(UrlProcessing))
        {
            continue;
        }

        string UrlCleaned = null;

        try
        {
            // The Uri constructor is used purely as a validity check; the trimmed text is kept.
            Uri PureTextUri = new Uri(UrlProcessing);

            if (PureTextUri != null)
            {
                UrlCleaned = UrlProcessing;
            }
        }
        catch (Exception ex)
        {
            this.DebugMsg(string.Format("ProcessPureTextOutlinks: {0}", ex.Message));
        }

        if (UrlCleaned == null)
        {
            continue;
        }

        MacroscopeLink Outlink = this.AddDocumentOutlink(
            AbsoluteUrl: UrlCleaned,
            LinkType: LinkType,
            Follow: true
        );

        if (Outlink != null)
        {
            Outlink.SetRawTargetUrl(TargetUrl: UrlCleaned);
        }
    }
}
/**************************************************************************/
/// <summary>
/// Adds a worksheet of sitemap errors to the workbook: a header row, then one
/// row per outlink of an internal XML sitemap whose linked document is internal
/// and either has a 4xx/5xx status code or is not allowed by robots. The filled
/// range is finally turned into a table.
/// </summary>
/// <param name="JobMaster">Job master that supplies the document collection and allowed hosts.</param>
/// <param name="wb">Workbook that receives the new worksheet.</param>
/// <param name="WorksheetLabel">Name given to the new worksheet.</param>
private void BuildWorksheetSitemapXmlErrors(
    MacroscopeJobMaster JobMaster,
    XLWorkbook wb,
    string WorksheetLabel
)
{
    var ws = wb.Worksheets.Add(WorksheetLabel);

    int iRow = 1;
    int iCol = 1;
    int iColMax = 1;

    MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();
    MacroscopeAllowedHosts AllowedHosts = JobMaster.GetAllowedHosts();

    // Header row.
    ws.Cell(iRow, iCol).Value = "Sitemap URL";
    iCol++;
    ws.Cell(iRow, iCol).Value = "Status Code";
    iCol++;
    ws.Cell(iRow, iCol).Value = "Robots";
    iCol++;
    ws.Cell(iRow, iCol).Value = "URL";

    iColMax = iCol;
    iRow++;

    foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
    {
        if (!(msDoc.GetIsInternal() && msDoc.IsDocumentType(Type: MacroscopeConstants.DocumentType.SITEMAPXML)))
        {
            continue;
        }

        foreach (MacroscopeLink Outlink in msDoc.IterateOutlinks())
        {
            string TargetUrl = Outlink.GetTargetUrl();
            MacroscopeDocument msDocLinked = DocCollection.GetDocumentByUrl(Url: TargetUrl);

            // A sitemap may list a URL that has no document in the collection;
            // there is nothing to report for it, and dereferencing it would throw.
            if (msDocLinked == null)
            {
                continue;
            }

            bool InsertRow = false;

            if (msDocLinked.GetIsInternal())
            {
                int StatusCode = (int)msDocLinked.GetStatusCode();

                if ((StatusCode >= 400) && (StatusCode <= 599))
                {
                    InsertRow = true;
                }

                if (!msDocLinked.GetAllowedByRobots())
                {
                    InsertRow = true;
                }
            }

            if (!InsertRow)
            {
                continue;
            }

            iCol = 1;

            // NOTE(review): the status code and robots cells are written from the
            // sitemap document (msDoc), not the linked document — confirm this is intended.
            this.InsertAndFormatUrlCell(ws, iRow, iCol, msDoc);

            if (AllowedHosts.IsInternalUrl(Url: msDoc.GetUrl()))
            {
                ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Green);
            }
            else
            {
                ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Gray);
            }

            iCol++;

            this.InsertAndFormatStatusCodeCell(ws, iRow, iCol, msDoc);

            iCol++;

            this.InsertAndFormatRobotsCell(ws, iRow, iCol, msDoc);

            iCol++;

            this.InsertAndFormatUrlCell(ws, iRow, iCol, TargetUrl);

            if (AllowedHosts.IsInternalUrl(Url: TargetUrl))
            {
                ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Green);
            }
            else
            {
                ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Gray);
            }

            iRow++;
        }
    }

    // Convert everything written (header plus data rows) into a table.
    var rangeData = ws.Range(1, 1, iRow - 1, iColMax);
    rangeData.CreateTable();
}