/**************************************************************************/
/// <summary>
/// Creates a worker bound to <paramref name="JobMaster"/>, storing the job's
/// document collection, allowed hosts and include/exclude URL settings, and
/// selecting the crawl delay to use.
/// </summary>
/// <param name="JobMaster">The job master this worker reads its shared state from.</param>
public MacroscopeJobWorker(MacroscopeJobMaster JobMaster) {
    this.SuppressDebugMsg = true;

    // Everything below is read back through the stored job master reference.
    this.JobMaster = JobMaster;
    this.DocCollection = this.JobMaster.GetDocCollection();
    this.AllowedHosts = this.JobMaster.GetAllowedHosts();
    this.IncludeExcludeUrls = this.JobMaster.GetIncludeExcludeUrls();

    // A positive delay from the preferences is applied first ...
    if (MacroscopePreferencesManager.GetCrawlDelay() > 0) {
        this.CrawlDelay = MacroscopePreferencesManager.GetCrawlDelay();
    }

    // ... and, when the robots protocol is followed, a positive delay
    // reported by the job master replaces it.
    if (MacroscopePreferencesManager.GetFollowRobotsProtocol() && (this.JobMaster.GetCrawlDelay() > 0)) {
        this.CrawlDelay = this.JobMaster.GetCrawlDelay();
    }
}
/**************************************************************************/
/// <summary>
/// Writes a header record ("Email Address", "URL") followed by one record
/// for every email address key reported by each HTML document in the job's
/// document collection.
/// </summary>
/// <param name="JobMaster">Job master whose document collection is reported on.</param>
/// <param name="ws">CSV writer that receives the header and data records.</param>
private void BuildWorksheetEmailAddresses(MacroscopeJobMaster JobMaster, CsvWriter ws) {
    MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

    // Header record.
    ws.WriteField("Email Address");
    ws.WriteField("URL");
    ws.NextRecord();

    foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments()) {
        // Only HTML documents are reported.
        if (!msDoc.IsDocumentType(Type: MacroscopeConstants.DocumentType.HTML)) {
            continue;
        }

        Dictionary<string, string> EmailAddresses = msDoc.GetEmailAddresses();

        // Only the dictionary keys are written; the values are not used here.
        foreach (string EmailAddress in EmailAddresses.Keys) {
            this.InsertAndFormatContentCell(ws, EmailAddress);
            this.InsertAndFormatUrlCell(ws, msDoc);
            ws.NextRecord();
        }
    }
}
/**************************************************************************/
/// <summary>
/// Cleanses the URL list and, if any URLs remain, switches the job master to
/// list-file run-time mode, registers each URL's host as allowed and queues
/// each URL for crawling.
/// </summary>
/// <returns>
/// <c>true</c> when at least one URL was present after cleansing;
/// otherwise <c>false</c>.
/// </returns>
public bool Execute() {
    MacroscopeAllowedHosts hosts = this.JobMaster.GetAllowedHosts();

    this.CleanseList();

    // Nothing left to queue: leave the run-time mode untouched.
    if (this.UrlList.Count <= 0) {
        return false;
    }

    this.JobMaster.SetRunTimeMode(
        JobRunTimeMode: MacroscopeConstants.RunTimeMode.LISTFILE
    );

    // Index-based walk; the count is re-read on every pass.
    int index = 0;
    while (index < this.UrlList.Count) {
        string listedUrl = this.UrlList[index];
        hosts.AddFromUrl(listedUrl);
        this.JobMaster.AddUrlQueueItem(listedUrl);
        index++;
    }

    return true;
}
/**************************************************************************/
/// <summary>
/// Writes a header record followed by one record per non-empty
/// label/value pair extracted via XPath, for every document that the
/// configured XPath data extractors can be applied to.
/// </summary>
/// <param name="JobMaster">Job master whose document collection is reported on.</param>
/// <param name="ws">CSV writer that receives the header and data records.</param>
private void BuildWorksheetXpaths(MacroscopeJobMaster JobMaster, CsvWriter ws) {
    MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

    // Header record.
    ws.WriteField(MacroscopeConstants.Url);
    ws.WriteField(MacroscopeConstants.StatusCode);
    ws.WriteField(MacroscopeConstants.Status);
    ws.WriteField(MacroscopeConstants.ContentType);
    ws.WriteField("Extracted Label");
    ws.WriteField("Extracted Value");
    ws.NextRecord();

    foreach (string Url in DocCollection.DocumentKeys()) {
        MacroscopeDocument msDoc = DocCollection.GetDocument(Url);

        if (!this.DataExtractorXpaths.CanApplyDataExtractorsToDocument(msDoc: msDoc)) {
            continue;
        }

        // Per-document values are only needed once the document qualifies.
        string Status = msDoc.GetStatusCode().ToString();
        string MimeType = msDoc.GetMimeType();

        foreach (KeyValuePair<string, string> DataExtractedPair in msDoc.IterateDataExtractedXpaths()) {
            string ExtractedLabel = DataExtractedPair.Key;
            string ExtractedValue = DataExtractedPair.Value;

            // Pairs with a missing label or value are skipped.
            if (string.IsNullOrEmpty(ExtractedLabel) || string.IsNullOrEmpty(ExtractedValue)) {
                continue;
            }

            this.InsertAndFormatUrlCell(ws, msDoc);
            this.InsertAndFormatStatusCodeCell(ws, msDoc);
            this.InsertAndFormatContentCell(ws, this.FormatIfMissing(Status));
            this.InsertAndFormatContentCell(ws, this.FormatIfMissing(MimeType));
            this.InsertAndFormatContentCell(ws, this.FormatIfMissing(ExtractedLabel));
            this.InsertAndFormatContentCell(ws, this.FormatIfMissing(ExtractedValue));
            ws.NextRecord();
        }
    }
}
/// <summary>
/// Adds three URLs to a fresh allowed-hosts instance, checks that the count
/// equals the number of URLs added, then removes the second URL and checks
/// that the count dropped by one.
/// </summary>
public void TestRemoveFromUrl() {
    MacroscopeAllowedHosts hosts = new MacroscopeAllowedHosts();

    List<string> urls = new List<string> {
        "https://nazuke.github.io/SEOMacroscope/",
        "https://bogus.bogus.com/some/path/index.html",
        "https://www.google.com/"
    };

    foreach (string candidate in urls) {
        hosts.AddFromUrl(Url: candidate);
    }

    // Every URL added should be counted.
    Assert.AreEqual(
        urls.Count,
        hosts.Count(),
        string.Format("FAIL: {0} :: {1}", urls.Count, hosts.Count())
    );

    // Trace the URL being removed and the count before and after removal.
    this.DebugMsg(urls[1]);
    this.DebugMsg(hosts.Count().ToString());
    hosts.RemoveFromUrl(Url: urls[1]);
    this.DebugMsg(hosts.Count().ToString());

    // Removing one URL should reduce the count by exactly one.
    Assert.AreEqual(
        urls.Count - 1,
        hosts.Count(),
        string.Format("FAIL: {0} :: {1}", urls.Count - 1, hosts.Count())
    );
}
/**************************************************************************/
/// <summary>
/// Writes a header record ("Telephone Number", "URL") followed by one record
/// for every telephone number key reported by each document for which
/// <c>GetIsHtml()</c> is true.
/// </summary>
/// <param name="JobMaster">Job master whose document collection is reported on.</param>
/// <param name="ws">CSV writer that receives the header and data records.</param>
private void BuildWorksheetTelephoneNumbers(MacroscopeJobMaster JobMaster, CsvWriter ws) {
    MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

    // Header record.
    ws.WriteField("Telephone Number");
    ws.WriteField("URL");
    ws.NextRecord();

    foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments()) {
        if (!msDoc.GetIsHtml()) {
            continue;
        }

        Dictionary<string, string> TelephoneNumbers = msDoc.GetTelephoneNumbers();

        // Only the dictionary keys are written; the values are not used here.
        foreach (string TelephoneNumber in TelephoneNumbers.Keys) {
            this.InsertAndFormatContentCell(ws, TelephoneNumber);
            this.InsertAndFormatUrlCell(ws, msDoc);
            ws.NextRecord();
        }
    }
}
/**************************************************************************/
/// <summary>
/// Records an outlink from this document to <paramref name="AbsoluteUrl"/>,
/// unless external-link checking is disabled and the allowed-hosts list
/// (when available) does not allow the target URL.
/// </summary>
/// <param name="AbsoluteUrl">Target URL of the outlink.</param>
/// <param name="LinkType">Link type stored on the new outlink.</param>
/// <param name="Follow">Follow flag stored on the new outlink.</param>
/// <returns>The outlink that was added, or <c>null</c> when the URL was rejected.</returns>
private MacroscopeLink AddSitemapTextOutlink(
    string AbsoluteUrl,
    MacroscopeConstants.InOutLinkType LinkType,
    Boolean Follow
) {
    // The host filter only applies when external links are not being checked.
    if (!MacroscopePreferencesManager.GetCheckExternalLinks()) {
        MacroscopeAllowedHosts hosts = this.DocCollection.GetAllowedHosts();
        bool rejected = (hosts != null) && !hosts.IsAllowedFromUrl(Url: AbsoluteUrl);
        if (rejected) {
            return null;
        }
    }

    MacroscopeLink link = new MacroscopeLink(
        SourceUrl: this.GetUrl(),
        TargetUrl: AbsoluteUrl,
        LinkType: LinkType,
        Follow: Follow
    );

    this.Outlinks.Add(link);

    return link;
}
/**************************************************************************/
/// <summary>
/// Writes a header record and then, for every document whose status code is
/// in the 400-599 range and that has inbound hyperlinks, one record per
/// inbound link that has a non-empty source URL.
/// </summary>
/// <param name="JobMaster">Job master whose document collection is reported on.</param>
/// <param name="ws">CSV writer that receives the header and data records.</param>
private void BuildWorksheetPageBrokenLinks(MacroscopeJobMaster JobMaster, CsvWriter ws) {
    MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

    // Header record.
    ws.WriteField("Status Code");
    ws.WriteField("Status");
    ws.WriteField("Anchor Text");
    ws.WriteField("Alt Text");
    ws.WriteField("Origin URL");
    ws.WriteField("Destination URL");
    ws.NextRecord();

    foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments()) {
        MacroscopeHyperlinksIn HyperlinksIn = DocCollection.GetDocumentHyperlinksIn(msDoc.GetUrl());
        int StatusCode = (int)msDoc.GetStatusCode();
        string Status = msDoc.GetStatusCode().ToString();

        bool IsErrorStatus = (StatusCode >= 400) && (StatusCode <= 599);
        if (!IsErrorStatus || (HyperlinksIn == null)) {
            continue;
        }

        foreach (MacroscopeHyperlinkIn HyperlinkIn in HyperlinksIn.IterateLinks()) {
            string OriginUrl = HyperlinkIn.GetSourceUrl();
            string AnchorText = HyperlinkIn.GetAnchorText();
            string AltText = HyperlinkIn.GetAltText();

            // Links without a source URL cannot be attributed to a page.
            if (string.IsNullOrEmpty(OriginUrl)) {
                continue;
            }

            this.InsertAndFormatContentCell(ws, StatusCode.ToString());
            this.InsertAndFormatContentCell(ws, Status);
            this.InsertAndFormatContentCell(ws, AnchorText);
            this.InsertAndFormatContentCell(ws, AltText);
            this.InsertAndFormatUrlCell(ws, OriginUrl);
            this.InsertAndFormatUrlCell(ws, msDoc);
            ws.NextRecord();
        }
    }
}
/**************************************************************************/
/// <summary>
/// Adds a worksheet listing every document that is internal and not allowed
/// by robots, with its URL (green when the allowed-hosts list reports the
/// URL as internal, gray otherwise) and its robots cell, then converts the
/// written range into a table.
/// </summary>
/// <param name="JobMaster">Job master supplying the document collection and allowed hosts.</param>
/// <param name="wb">Workbook the worksheet is added to.</param>
/// <param name="WorksheetLabel">Name of the new worksheet.</param>
private void BuildWorksheetBlockedByRobotsInternal(
    MacroscopeJobMaster JobMaster,
    XLWorkbook wb,
    string WorksheetLabel
) {
    const int urlColumn = 1;
    const int statusColumn = 2;

    var sheet = wb.Worksheets.Add(WorksheetLabel);
    MacroscopeDocumentCollection docs = JobMaster.GetDocCollection();
    MacroscopeAllowedHosts hosts = JobMaster.GetAllowedHosts();

    // Header occupies row 1; data starts on row 2.
    sheet.Cell(1, urlColumn).Value = "URL";
    sheet.Cell(1, statusColumn).Value = "Status";
    int row = 2;

    foreach (MacroscopeDocument msDoc in docs.IterateDocuments()) {
        // Skip anything external, or internal but permitted by robots.
        if (!msDoc.GetIsInternal() || msDoc.GetAllowedByRobots()) {
            continue;
        }

        this.InsertAndFormatUrlCell(sheet, row, urlColumn, msDoc);

        var urlColour = hosts.IsInternalUrl(Url: msDoc.GetUrl()) ? XLColor.Green : XLColor.Gray;
        sheet.Cell(row, urlColumn).Style.Font.SetFontColor(urlColour);

        this.InsertAndFormatRobotsCell(sheet, row, statusColumn, msDoc);

        row++;
    }

    // Header plus every data row written above.
    sheet.Range(1, 1, row - 1, statusColumn).CreateTable();
}
/**************************************************************************/
/// <summary>
/// Writes a header record and then, for every internal XML sitemap document,
/// one record per outlink whose target document is internal and either has a
/// status code in the 400-599 range or is not allowed by robots.
/// </summary>
/// <remarks>
/// Outlink targets that are not present in the document collection are
/// skipped instead of being dereferenced.
/// </remarks>
/// <param name="JobMaster">Job master whose document collection is reported on.</param>
/// <param name="ws">CSV writer that receives the header and data records.</param>
private void BuildWorksheetSitemapErrors(MacroscopeJobMaster JobMaster, CsvWriter ws) {
    MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

    // Header record.
    ws.WriteField("Sitemap URL");
    ws.WriteField("Status Code");
    ws.WriteField("Robots");
    ws.WriteField("URL");
    ws.NextRecord();

    foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments()) {
        bool IsInternalSitemap =
            msDoc.GetIsInternal()
            && msDoc.IsDocumentType(Type: MacroscopeConstants.DocumentType.SITEMAPXML);

        if (!IsInternalSitemap) {
            continue;
        }

        foreach (MacroscopeLink Outlink in msDoc.IterateOutlinks()) {
            string TargetUrl = Outlink.GetTargetUrl();
            MacroscopeDocument msDocLinked = DocCollection.GetDocumentByUrl(Url: TargetUrl);

            // The lookup yields null for URLs that are not in the collection
            // (for example, listed in the sitemap but never crawled).
            if ((msDocLinked == null) || !msDocLinked.GetIsInternal()) {
                continue;
            }

            int StatusCode = (int)msDocLinked.GetStatusCode();
            bool IsErrorStatus = (StatusCode >= 400) && (StatusCode <= 599);
            bool InsertRow = IsErrorStatus || !msDocLinked.GetAllowedByRobots();

            if (!InsertRow) {
                continue;
            }

            // NOTE(review): the status-code and robots cells are written from the
            // sitemap document (msDoc), not from the linked document whose state
            // triggered the row - confirm this is intended.
            this.InsertAndFormatUrlCell(ws, msDoc);
            this.InsertAndFormatStatusCodeCell(ws, msDoc);
            this.InsertAndFormatRobotsCell(ws, msDoc);
            this.InsertAndFormatUrlCell(ws, TargetUrl);
            ws.NextRecord();
        }
    }
}
/**************************************************************************/
/// <summary>
/// Writes a header record followed by one record per redirect document that
/// has both a non-empty redirect-from URL and a non-empty redirect-to URL.
/// </summary>
/// <param name="JobMaster">Job master whose document collection is reported on.</param>
/// <param name="ws">CSV writer that receives the header and data records.</param>
private void BuildWorksheetPageRedirectsAudit(MacroscopeJobMaster JobMaster, CsvWriter ws) {
    MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

    // Header record.
    ws.WriteField("Origin URL");
    ws.WriteField("Status Code");
    ws.WriteField("Status");
    ws.WriteField("Destination URL");
    ws.NextRecord();

    foreach (string Url in DocCollection.DocumentKeys()) {
        MacroscopeDocument msDoc = DocCollection.GetDocument(Url: Url);

        if (!msDoc.GetIsRedirect()) {
            continue;
        }

        string OriginURL = msDoc.GetUrlRedirectFrom();
        string StatusCode = ((int)msDoc.GetStatusCode()).ToString();
        string Status = msDoc.GetStatusCode().ToString();
        string DestinationURL = msDoc.GetUrlRedirectTo();

        // A redirect is only reportable when both ends are known.
        if (string.IsNullOrEmpty(OriginURL) || string.IsNullOrEmpty(DestinationURL)) {
            continue;
        }

        this.InsertAndFormatUrlCell(ws, OriginURL);
        this.InsertAndFormatContentCell(ws, StatusCode);
        this.InsertAndFormatContentCell(ws, Status);
        this.InsertAndFormatUrlCell(ws, DestinationURL);
        ws.NextRecord();
    }
}
/**************************************************************************/
/// <summary>
/// Adds a worksheet with an "Email Address" / "URL" header row followed by
/// one row for every email address key reported by each document for which
/// <c>GetIsHtml()</c> is true, then converts the written range into a table.
/// </summary>
/// <param name="JobMaster">Job master whose document collection is reported on.</param>
/// <param name="wb">Workbook the worksheet is added to.</param>
/// <param name="WorksheetLabel">Name of the new worksheet.</param>
private void BuildWorksheetEmailAddresses(
    MacroscopeJobMaster JobMaster,
    XLWorkbook wb,
    string WorksheetLabel
) {
    var ws = wb.Worksheets.Add(WorksheetLabel);
    int iRow = 1;
    int iCol = 1;
    int iColMax = 1;
    MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

    // Header row.
    ws.Cell(iRow, iCol).Value = "Email Address";
    iCol++;
    ws.Cell(iRow, iCol).Value = "URL";

    iColMax = iCol;
    iRow++;

    foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments()) {
        if (!msDoc.GetIsHtml()) {
            continue;
        }

        Dictionary<string, string> EmailAddresses = msDoc.GetEmailAddresses();

        // Only the dictionary keys are written; the values are not used here.
        foreach (string EmailAddress in EmailAddresses.Keys) {
            iCol = 1;
            this.InsertAndFormatContentCell(ws, iRow, iCol, EmailAddress);
            iCol++;
            this.InsertAndFormatUrlCell(ws, iRow, iCol, msDoc);
            iRow++;
        }
    }

    // Header plus every data row written above.
    ws.Range(1, 1, iRow - 1, iColMax).CreateTable();
}
/**************************************************************************/
/// <summary>
/// Adds a worksheet with a "Telephone Number" / "URL" header row followed by
/// one row for every telephone number key reported by each HTML document,
/// then converts the written range into a table.
/// </summary>
/// <param name="JobMaster">Job master whose document collection is reported on.</param>
/// <param name="wb">Workbook the worksheet is added to.</param>
/// <param name="WorksheetLabel">Name of the new worksheet.</param>
private void BuildWorksheetTelephoneNumbers(
    MacroscopeJobMaster JobMaster,
    XLWorkbook wb,
    string WorksheetLabel
) {
    var ws = wb.Worksheets.Add(WorksheetLabel);
    int iRow = 1;
    int iCol = 1;
    int iColMax = 1;
    MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

    // Header row.
    ws.Cell(iRow, iCol).Value = "Telephone Number";
    iCol++;
    ws.Cell(iRow, iCol).Value = "URL";

    iColMax = iCol;
    iRow++;

    foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments()) {
        // Only HTML documents are reported.
        if (!msDoc.IsDocumentType(Type: MacroscopeConstants.DocumentType.HTML)) {
            continue;
        }

        Dictionary<string, string> TelephoneNumbers = msDoc.GetTelephoneNumbers();

        // Only the dictionary keys are written; the values are not used here.
        foreach (string TelephoneNumber in TelephoneNumbers.Keys) {
            iCol = 1;
            this.InsertAndFormatContentCell(ws, iRow, iCol, TelephoneNumber);
            iCol++;
            this.InsertAndFormatUrlCell(ws, iRow, iCol, msDoc);
            iRow++;
        }
    }

    // Header plus every data row written above.
    ws.Range(1, 1, iRow - 1, iColMax).CreateTable();
}
/**************************************************************************/
/// <summary>
/// Records an outlink from this document to <paramref name="AbsoluteUrl"/>
/// unless it is filtered out: either external-link checking is disabled and
/// the allowed-hosts list (when available) does not allow the URL, or the
/// link is an XML sitemap link and XML fetching is disabled.
/// </summary>
/// <param name="AbsoluteUrl">Target URL of the outlink.</param>
/// <param name="LinkType">Link type stored on the new outlink.</param>
/// <param name="Follow">Follow flag stored on the new outlink.</param>
/// <returns>The outlink that was added, or <c>null</c> when it was filtered out.</returns>
private MacroscopeLink AddSitemapXmlOutlink(
    string AbsoluteUrl,
    MacroscopeConstants.InOutLinkType LinkType,
    Boolean Follow
) {
    Boolean accepted = true;

    // Host filter: only consulted when external links are not being checked.
    if (!MacroscopePreferencesManager.GetCheckExternalLinks()) {
        MacroscopeAllowedHosts hosts = this.DocCollection.GetAllowedHosts();
        if ((hosts != null) && !hosts.IsAllowedFromUrl(Url: AbsoluteUrl)) {
            accepted = false;
        }
    }

    // XML sitemap links additionally require XML fetching to be enabled.
    if ((LinkType == MacroscopeConstants.InOutLinkType.SITEMAPXML)
        && !MacroscopePreferencesManager.GetFetchXml()) {
        accepted = false;
    }

    if (!accepted) {
        return null;
    }

    MacroscopeLink link = new MacroscopeLink(
        SourceUrl: this.GetUrl(),
        TargetUrl: AbsoluteUrl,
        LinkType: LinkType,
        Follow: Follow
    );

    this.Outlinks.Add(link);

    return link;
}
/**************************************************************************/
/// <summary>
/// Writes a header record followed by one record per document, giving its
/// URL, status code, status, the number of occurrences the collection
/// reports for the document's checksum, and the checksum itself.
/// </summary>
/// <param name="JobMaster">Job master whose document collection is reported on.</param>
/// <param name="ws">CSV writer that receives the header and data records.</param>
private void BuildWorksheetPageUriAnalysis(MacroscopeJobMaster JobMaster, CsvWriter ws) {
    MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

    // Header record.
    ws.WriteField("URL");
    ws.WriteField("Status Code");
    ws.WriteField("Status");
    ws.WriteField("Occurrences");
    ws.WriteField("Checksum");
    ws.NextRecord();

    foreach (string Url in DocCollection.DocumentKeys()) {
        MacroscopeDocument msDoc = DocCollection.GetDocument(Url: Url);
        string StatusCode = ((int)msDoc.GetStatusCode()).ToString();
        string Status = msDoc.GetStatusCode().ToString();
        string Checksum = msDoc.GetChecksum();
        int Count = DocCollection.GetStatsChecksumCount(Checksum: Checksum);

        this.InsertAndFormatUrlCell(ws, msDoc);
        this.InsertAndFormatContentCell(ws, StatusCode);
        this.InsertAndFormatContentCell(ws, Status);
        this.InsertAndFormatContentCell(ws, Count.ToString());
        this.InsertAndFormatContentCell(ws, Checksum);
        ws.NextRecord();
    }
}
/**************************************************************************/
/// <summary>
/// Writes a header record followed by one record per remark on each
/// document, giving the document's URL, status code, status and the remark
/// value.
/// </summary>
/// <param name="JobMaster">Job master whose document collection is reported on.</param>
/// <param name="ws">CSV writer that receives the header and data records.</param>
private void BuildWorksheetPageObservations(MacroscopeJobMaster JobMaster, CsvWriter ws) {
    MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

    // Header record.
    ws.WriteField("URL");
    ws.WriteField("Status Code");
    ws.WriteField("Status");
    ws.WriteField("Observation");
    ws.NextRecord();

    foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments()) {
        string StatusCode = ((int)msDoc.GetStatusCode()).ToString();
        string Status = msDoc.GetStatusCode().ToString();

        // Only the remark value is written; the key is not used here.
        foreach (KeyValuePair<string, string> RemarkPair in msDoc.IterateRemarks()) {
            this.InsertAndFormatUrlCell(ws, msDoc);
            this.InsertAndFormatContentCell(ws, StatusCode);
            this.InsertAndFormatContentCell(ws, Status);
            this.InsertAndFormatContentCell(ws, RemarkPair.Value);
            ws.NextRecord();
        }
    }
}
/**************************************************************************/
/// <summary>
/// Writes a header record followed by one record for every document whose
/// status code is in the 400-599 range.
/// </summary>
/// <param name="JobMaster">Job master whose document collection is reported on.</param>
/// <param name="ws">CSV writer that receives the header and data records.</param>
private void BuildWorksheetErrors(MacroscopeJobMaster JobMaster, CsvWriter ws) {
    MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

    // Header record.
    ws.WriteField("Status Code");
    ws.WriteField("Status");
    ws.WriteField("URL");
    ws.NextRecord();

    foreach (string Url in DocCollection.DocumentKeys()) {
        MacroscopeDocument msDoc = DocCollection.GetDocument(Url);
        int StatusCode = (int)msDoc.GetStatusCode();

        if ((StatusCode < 400) || (StatusCode > 599)) {
            continue;
        }

        string Status = msDoc.GetStatusCode().ToString();

        this.InsertAndFormatContentCell(ws, StatusCode.ToString());
        this.InsertAndFormatContentCell(ws, Status);
        this.InsertAndFormatUrlCell(ws, Url);
        ws.NextRecord();
    }
}
/**************************************************************************/
/// <summary>
/// Adds a worksheet with one row per remark on each document (URL, status
/// code, status, remark value). The URL cell is green when the allowed-hosts
/// list reports the URL as internal and gray otherwise. The written range is
/// then converted into a table.
/// </summary>
/// <param name="JobMaster">Job master supplying the document collection and allowed hosts.</param>
/// <param name="wb">Workbook the worksheet is added to.</param>
/// <param name="WorksheetLabel">Name of the new worksheet.</param>
private void BuildWorksheetPageObservations(
    MacroscopeJobMaster JobMaster,
    XLWorkbook wb,
    string WorksheetLabel
) {
    const int urlColumn = 1;
    const int statusCodeColumn = 2;
    const int statusColumn = 3;
    const int observationColumn = 4;

    var sheet = wb.Worksheets.Add(WorksheetLabel);
    MacroscopeDocumentCollection docs = JobMaster.GetDocCollection();
    MacroscopeAllowedHosts hosts = JobMaster.GetAllowedHosts();

    // Header occupies row 1; data starts on row 2.
    sheet.Cell(1, urlColumn).Value = "URL";
    sheet.Cell(1, statusCodeColumn).Value = "Status Code";
    sheet.Cell(1, statusColumn).Value = "Status";
    sheet.Cell(1, observationColumn).Value = "Observation";
    int row = 2;

    foreach (MacroscopeDocument msDoc in docs.IterateDocuments()) {
        string docUrl = msDoc.GetUrl();
        string codeText = ((int)msDoc.GetStatusCode()).ToString();
        string statusText = msDoc.GetStatusCode().ToString();

        foreach (KeyValuePair<string, string> remark in msDoc.IterateRemarks()) {
            this.InsertAndFormatUrlCell(sheet, row, urlColumn, msDoc);

            var urlColour = hosts.IsInternalUrl(Url: docUrl) ? XLColor.Green : XLColor.Gray;
            sheet.Cell(row, urlColumn).Style.Font.SetFontColor(urlColour);

            this.InsertAndFormatContentCell(sheet, row, statusCodeColumn, codeText);
            this.InsertAndFormatContentCell(sheet, row, statusColumn, statusText);
            this.InsertAndFormatContentCell(sheet, row, observationColumn, remark.Value);

            row++;
        }
    }

    // Header plus every data row written above.
    sheet.Range(1, 1, row - 1, observationColumn).CreateTable();
}
/**************************************************************************/
/// <summary>
/// Adds a worksheet with one row per (term, document) pair: the value stored
/// for the term in <paramref name="DicTerms"/>, the term, and the URL of each
/// document in the term's document list. Progress is reported on the
/// progress form per term (minor) and per document (sub-minor). The written
/// range is then converted into a table.
/// </summary>
/// <param name="JobMaster">Job master whose document collection is queried.</param>
/// <param name="wb">Workbook the worksheet is added to.</param>
/// <param name="WorksheetLabel">Name of the new worksheet.</param>
/// <param name="DicTerms">Terms to report, mapped to the number written in the "Occurrences" column.</param>
private void BuildWorksheetKeywordTerms(
    MacroscopeJobMaster JobMaster,
    XLWorkbook wb,
    string WorksheetLabel,
    Dictionary<string, int> DicTerms
) {
    var ws = wb.Worksheets.Add(WorksheetLabel);
    decimal TermTotal = DicTerms.Count;
    decimal TermCount = 0;
    int iRow = 1;
    int iCol = 1;
    int iColMax = 1;
    MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

    // Header row.
    ws.Cell(iRow, iCol).Value = "Occurrences";
    iCol++;
    ws.Cell(iRow, iCol).Value = "Term";
    iCol++;
    ws.Cell(iRow, iCol).Value = "URL";

    iColMax = iCol;
    iRow++;

    // Iterating the pairs avoids a second dictionary lookup per written row.
    foreach (KeyValuePair<string, int> TermPair in DicTerms) {
        string Term = TermPair.Key;
        string Occurrences = TermPair.Value.ToString();

        MacroscopeDocumentList DocumentList = DocCollection.GetDeepKeywordAnalysDocumentList(Term);
        decimal DocTotal = (decimal)DocumentList.CountDocuments();
        decimal DocCount = 0;

        TermCount++;

        if (TermTotal > 0) {
            this.ProgressForm.UpdatePercentages(
                Title: null,
                Message: null,
                MajorPercentage: -1,
                ProgressLabelMajor: null,
                MinorPercentage: ((decimal)100 / TermTotal) * TermCount,
                ProgressLabelMinor: "Keywords Processed",
                SubMinorPercentage: -1,
                ProgressLabelSubMinor: null
            );
        }

        foreach (MacroscopeDocument msDoc in DocumentList.IterateDocuments()) {
            DocCount++;

            // Guard against dividing by zero when the list reports no documents.
            if (DocTotal > 0) {
                this.ProgressForm.UpdatePercentages(
                    Title: null,
                    Message: null,
                    MajorPercentage: -1,
                    ProgressLabelMajor: null,
                    MinorPercentage: -1,
                    ProgressLabelMinor: null,
                    SubMinorPercentage: ((decimal)100 / DocTotal) * DocCount,
                    ProgressLabelSubMinor: "Documents Processed"
                );
            }

            iCol = 1;
            this.InsertAndFormatContentCell(ws, iRow, iCol, this.FormatIfMissing(Occurrences));
            iCol++;
            this.InsertAndFormatContentCell(ws, iRow, iCol, this.FormatIfMissing(Term));
            iCol++;
            this.InsertAndFormatUrlCell(ws, iRow, iCol, msDoc.GetUrl());
            iRow++;
        }
    }

    // Header plus every data row written above.
    ws.Range(1, 1, iRow - 1, iColMax).CreateTable();
}
/**************************************************************************/
/// <summary>
/// Writes a header record followed by one record for every document whose
/// non-empty ETag is shared by more than one document in the collection.
/// Progress is reported on the progress form while scanning.
/// </summary>
/// <param name="JobMaster">Job master whose document collection is reported on.</param>
/// <param name="ws">CSV writer that receives the header and data records.</param>
private void BuildWorksheetPageDuplicateEtags(MacroscopeJobMaster JobMaster, CsvWriter ws) {
    decimal CountOuter = 0;
    decimal CountInner = 0;
    MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();
    decimal DocCount = (decimal)DocCollection.CountDocuments();

    // ETag -> number of documents carrying it.
    Dictionary<string, int> DuplicatesList =
        new Dictionary<string, int>(DocCollection.CountDocuments());

    // URL -> document, restricted to documents that have an ETag.
    Dictionary<string, MacroscopeDocument> DuplicatesDocList =
        new Dictionary<string, MacroscopeDocument>(DocCollection.CountDocuments());

    foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments()) {
        string Etag = msDoc.GetEtag();

        if (string.IsNullOrEmpty(Etag)) {
            continue;
        }

        string DocUrl = msDoc.GetUrl();
        if (!DuplicatesDocList.ContainsKey(DocUrl)) {
            DuplicatesDocList.Add(DocUrl, msDoc);
        }

        // TryGetValue leaves Seen at 0 for a new ETag, so this covers both
        // the first sighting and every later increment with one lookup.
        int Seen;
        DuplicatesList.TryGetValue(Etag, out Seen);
        DuplicatesList[Etag] = Seen + 1;
    }

    // Header record.
    ws.WriteField("Status Code");
    ws.WriteField("Status");
    ws.WriteField("Occurrences");
    ws.WriteField("ETag");
    ws.WriteField("URL");
    ws.NextRecord();

    foreach (KeyValuePair<string, int> EtagPair in DuplicatesList) {
        string Etag = EtagPair.Key;
        int Occurrences = EtagPair.Value;

        CountOuter++;
        CountInner = 0;

        // ETags seen only once are not duplicates.
        if (Occurrences <= 1) {
            continue;
        }

        foreach (MacroscopeDocument msDoc in DuplicatesDocList.Values) {
            CountInner++;

            // Guard against dividing by zero on an empty collection.
            if (DocCount > 0) {
                this.ProgressForm.UpdatePercentages(
                    Title: null,
                    Message: null,
                    MajorPercentage: -1,
                    ProgressLabelMajor: string.Format("Documents Processed: {0}", CountOuter),
                    MinorPercentage: ((decimal)100 / DocCount) * CountOuter,
                    ProgressLabelMinor: Etag,
                    SubMinorPercentage: ((decimal)100 / DocCount) * CountInner,
                    ProgressLabelSubMinor: msDoc.GetUrl()
                );
            }

            if (msDoc.GetEtag() != Etag) {
                continue;
            }

            int StatusCode = (int)msDoc.GetStatusCode();
            HttpStatusCode Status = msDoc.GetStatusCode();

            this.InsertAndFormatStatusCodeCell(ws, StatusCode);
            this.InsertAndFormatStatusCodeCell(ws, Status);
            this.InsertAndFormatContentCell(ws, Occurrences);
            this.InsertAndFormatContentCell(ws, msDoc.GetEtag());
            this.InsertAndFormatUrlCell(ws, msDoc);
            ws.NextRecord();
        }
    }
}
/**************************************************************************/
/// <summary>
/// For every outbound hyperlink of <paramref name="msDoc"/> whose target URL
/// contains <paramref name="UrlFragment"/>, updates the matching row already
/// in the display list view or builds a new row and appends it to
/// <paramref name="ListViewItems"/>, then colours the row's sub-items.
/// </summary>
/// <param name="ListViewItems">Receives newly created rows.</param>
/// <param name="msDoc">Document whose outbound hyperlinks are rendered.</param>
/// <param name="Url">Source URL shown in the URL column and used in the row key.</param>
/// <param name="UrlFragment">Text a target URL must contain for its link to be rendered.</param>
private void RenderListViewSearchTargetUrls(
    List<ListViewItem> ListViewItems,
    MacroscopeDocument msDoc,
    string Url,
    string UrlFragment
) {
    MacroscopeAllowedHosts hosts = this.MainForm.GetJobMaster().GetAllowedHosts();
    MacroscopeHyperlinksOut linksOut = msDoc.GetHyperlinksOut();
    MacroscopeDocumentCollection docs = this.MainForm.GetJobMaster().GetDocCollection();

    foreach (MacroscopeHyperlinkOut link in linksOut.IterateLinks()) {
        string targetUrl = link.GetTargetUrl();

        // Defaults used when the target is not in the document collection.
        HttpStatusCode statusCode = HttpStatusCode.NotFound;
        string statusCodeText = "Not crawled";
        string statusText = "Not crawled";

        // Row key combines the digests of the source and target URLs.
        string pairKey = string.Join(":", UrlToDigest(Url: Url), UrlToDigest(Url: targetUrl)).ToString();

        string linkTarget = link.GetLinkTarget();
        string anchorText = link.GetAnchorText();
        string titleText = link.GetTitle();
        string altText = link.GetAltText();

        try {
            if (docs.ContainsDocument(Url: link.GetTargetUrl())) {
                statusCode = docs.GetDocumentByUrl(Url: link.GetTargetUrl()).GetStatusCode();
                statusCodeText = ((int)statusCode).ToString();
                statusText = statusCode.ToString();
            } else {
                DebugMsg("Not in DocCollection");
            }
        } catch (Exception ex) {
            this.DebugMsg(ex.Message);
        }

        string followLabel = link.GetDoFollow() ? "Follow" : "No Follow";

        // Empty texts are displayed as "MISSING".
        bool anchorMissing = (anchorText.Length == 0);
        string anchorLabel = anchorMissing ? "MISSING" : anchorText;
        bool titleMissing = (titleText.Length == 0);
        string titleLabel = titleMissing ? "MISSING" : titleText;
        bool altMissing = (altText.Length == 0);
        string altLabel = altMissing ? "MISSING" : altText;

        // Only links whose target contains the fragment are rendered.
        if ((targetUrl == null) || (targetUrl.IndexOf(UrlFragment, StringComparison.CurrentCulture) < 0)) {
            continue;
        }

        ListViewItem row = null;

        if (this.DisplayListView.Items.ContainsKey(pairKey)) {
            // Refresh the row that is already displayed.
            try {
                row = this.DisplayListView.Items[pairKey];
                row.SubItems[ColUrl].Text = Url;
                row.SubItems[ColUrlTarget].Text = targetUrl;
                row.SubItems[ColStatusCode].Text = statusCodeText;
                row.SubItems[ColStatus].Text = statusText;
                row.SubItems[ColDoFollow].Text = followLabel;
                row.SubItems[ColLinkTarget].Text = linkTarget;
                row.SubItems[ColLinkAnchorTextLabel].Text = anchorLabel;
                row.SubItems[ColLinkTitleLabel].Text = titleLabel;
                row.SubItems[ColAltTextLabel].Text = altLabel;
            } catch (Exception ex) {
                this.DebugMsg(string.Format("MacroscopeDisplayLinks 1: {0}", ex.Message));
            }
        } else {
            // Build a new row and hand it back through ListViewItems.
            try {
                row = new ListViewItem(pairKey);
                row.UseItemStyleForSubItems = false;
                row.Name = pairKey;
                row.SubItems[ColUrl].Text = Url;
                row.SubItems.Add(targetUrl);
                row.SubItems.Add(statusCodeText);
                row.SubItems.Add(statusText);
                row.SubItems.Add(followLabel);
                row.SubItems.Add(linkTarget);
                row.SubItems.Add(anchorLabel);
                row.SubItems.Add(titleLabel);
                row.SubItems.Add(altLabel);
                ListViewItems.Add(row);
            } catch (Exception ex) {
                this.DebugMsg(string.Format("MacroscopeDisplayLinks 2: {0}", ex.Message));
            }
        }

        if (row == null) {
            continue;
        }

        // Baseline colour for every column.
        foreach (ListViewItem.ListViewSubItem cell in row.SubItems) {
            cell.ForeColor = Color.Blue;
        }

        // Source and target URL columns: green when allowed, gray otherwise.
        Color sourceColour = hosts.IsAllowedFromUrl(Url) ? Color.Green : Color.Gray;
        row.SubItems[ColUrl].ForeColor = sourceColour;

        Color targetColour = hosts.IsAllowedFromUrl(targetUrl) ? Color.Green : Color.Gray;
        row.SubItems[ColUrlTarget].ForeColor = targetColour;

        // Follow column: gray for disallowed sources, else green/red by follow state.
        Color followColour;
        if (!hosts.IsAllowedFromUrl(Url)) {
            followColour = Color.Gray;
        } else {
            followColour = link.GetDoFollow() ? Color.Green : Color.Red;
        }
        row.SubItems[ColDoFollow].ForeColor = followColour;

        // Individually missing texts are gray ...
        if (anchorMissing) {
            row.SubItems[ColLinkAnchorTextLabel].ForeColor = Color.Gray;
        }
        if (titleMissing) {
            row.SubItems[ColLinkTitleLabel].ForeColor = Color.Gray;
        }
        if (altMissing) {
            row.SubItems[ColAltTextLabel].ForeColor = Color.Gray;
        }

        // ... but a link with no text at all is flagged red in all three columns.
        if (anchorMissing && titleMissing && altMissing) {
            row.SubItems[ColLinkAnchorTextLabel].ForeColor = Color.Red;
            row.SubItems[ColLinkTitleLabel].ForeColor = Color.Red;
            row.SubItems[ColAltTextLabel].ForeColor = Color.Red;
        }
    }
}
/**************************************************************************/
/// <summary>
/// Adds a worksheet with one row per redirect chain. Each hop in a chain
/// occupies two columns: the hop's URL (green when the allowed-hosts list
/// reports it as internal, gray otherwise) and its status code. Row 1 holds
/// "Hop N URL" / "Hop N Status" headers for every hop column written.
/// </summary>
/// <remarks>
/// The internal/external colour is applied to the URL cell before the column
/// index is advanced, so it no longer lands on the following status cell.
/// </remarks>
/// <param name="JobMaster">Job master supplying the document collection and allowed hosts.</param>
/// <param name="wb">Workbook the worksheet is added to.</param>
/// <param name="WorksheetLabel">Name of the new worksheet.</param>
private void BuildWorksheetPageRedirectChains(
    MacroscopeJobMaster JobMaster,
    XLWorkbook wb,
    string WorksheetLabel
) {
    var ws = wb.Worksheets.Add(WorksheetLabel);
    int iRow = 1;
    int iCol = 1;
    int iColMax = 1;
    MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();
    MacroscopeAllowedHosts AllowedHosts = JobMaster.GetAllowedHosts();
    List<List<MacroscopeRedirectChainDocStruct>> RedirectChains = DocCollection.GetMacroscopeRedirectChains();

    // Placeholder headers; overwritten by the per-hop headers once a chain is written.
    ws.Cell(iRow, iCol).Value = "Hop";
    iCol++;
    ws.Cell(iRow, iCol).Value = "Status";

    iRow++;

    foreach (List<MacroscopeRedirectChainDocStruct> DocList in RedirectChains) {
        int iHop = 1;

        iCol = 1;

        foreach (MacroscopeRedirectChainDocStruct RedirectChainDocStruct in DocList) {
            string Url = RedirectChainDocStruct.Url;
            string StatusCode = RedirectChainDocStruct.StatusCode.ToString();

            // URL column for this hop.
            ws.Cell(1, iCol).Value = string.Format("Hop {0} URL", iHop);
            this.InsertAndFormatUrlCell(ws, iRow, iCol, Url);

            // Colour the URL cell while iCol still points at it.
            if (AllowedHosts.IsInternalUrl(Url: Url)) {
                ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Green);
            } else {
                ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Gray);
            }

            iCol++;

            // Status column for this hop.
            ws.Cell(1, iCol).Value = string.Format("Hop {0} Status", iHop);
            this.InsertAndFormatContentCell(ws, iRow, iCol, StatusCode);

            iCol++;
            iHop++;
        }

        // iCol is one past the last column written for this chain.
        if (iCol > iColMax) {
            iColMax = iCol;
        }

        iRow++;
    }

    // Only build a table when at least one hop (two columns) was written.
    if ((iRow > 1) && (iColMax > 2)) {
        ws.Range(1, 1, iRow - 1, iColMax - 1).CreateTable();
    }
}
/**************************************************************************/
/// <summary>
/// Renders one row (URL, status code, canonical) for a non-redirect HTML
/// document, updating the row already in the display list view or adding a
/// new one, and colours each column.
/// </summary>
/// <remarks>
/// Status codes outside 100-599 colour the status column gray; the fallback
/// no longer targets the canonical column. A null canonical is treated the
/// same as an empty one.
/// NOTE(review): new rows are added straight to <c>DisplayListView.Items</c>
/// and <paramref name="ListViewItems"/> is not used - confirm against the
/// base class contract.
/// </remarks>
/// <param name="ListViewItems">Not used by this override.</param>
/// <param name="DocCollection">Not used by this override.</param>
/// <param name="msDoc">Document to render.</param>
/// <param name="Url">URL shown in the first column and used to derive the row key.</param>
protected override void RenderListView(
    List<ListViewItem> ListViewItems,
    MacroscopeDocumentCollection DocCollection,
    MacroscopeDocument msDoc,
    string Url
) {
    if (msDoc.GetIsRedirect()) {
        return;
    }

    if (!msDoc.IsDocumentType(Type: MacroscopeConstants.DocumentType.HTML)) {
        return;
    }

    MacroscopeAllowedHosts AllowedHosts = this.MainForm.GetJobMaster().GetAllowedHosts();
    string Canonical = msDoc.GetCanonical();
    HttpStatusCode StatusCode = msDoc.GetStatusCode();
    bool CanonicalMissing = string.IsNullOrEmpty(Canonical);
    string CanonicalLabel = CanonicalMissing ? "MISSING" : Canonical;
    ListViewItem lvItem = null;
    string PairKey = UrlToDigest(Url: Url).ToString();

    if (DisplayListView.Items.ContainsKey(PairKey)) {
        // Refresh the row that is already displayed.
        try {
            lvItem = DisplayListView.Items[PairKey];
            lvItem.SubItems[0].Text = Url;
            lvItem.SubItems[1].Text = StatusCode.ToString();
            lvItem.SubItems[2].Text = CanonicalLabel;
        } catch (Exception ex) {
            DebugMsg(string.Format("MacroscopeDisplayCanonical 1: {0}", ex.Message));
        }
    } else {
        // Build and display a new row.
        try {
            lvItem = new ListViewItem(PairKey);
            lvItem.UseItemStyleForSubItems = false;
            lvItem.Name = PairKey;
            lvItem.SubItems[0].Text = Url;
            lvItem.SubItems.Add(StatusCode.ToString());
            lvItem.SubItems.Add(CanonicalLabel);
            DisplayListView.Items.Add(lvItem);
        } catch (Exception ex) {
            DebugMsg(string.Format("MacroscopeDisplayCanonical 2: {0}", ex.Message));
        }
    }

    if (lvItem == null) {
        return;
    }

    bool IsInternal = AllowedHosts.IsInternalUrl(Url);
    int Code = (int)StatusCode;

    lvItem.ForeColor = Color.Gray;

    // URL column.
    lvItem.SubItems[0].ForeColor = IsInternal ? Color.Green : Color.Gray;

    // Status column.
    if ((Code >= 100) && (Code <= 299)) {
        lvItem.SubItems[1].ForeColor = Color.Green;
    } else if ((Code >= 300) && (Code <= 399)) {
        lvItem.SubItems[1].ForeColor = Color.Orange;
    } else if ((Code >= 400) && (Code <= 599)) {
        lvItem.SubItems[1].ForeColor = Color.Red;
    } else {
        // Sub-item styles are independent of the row style, so reset explicitly.
        lvItem.SubItems[1].ForeColor = Color.Gray;
    }

    // Canonical column: a missing canonical is an error only on internal
    // pages; a present canonical must itself be internal.
    if (CanonicalMissing) {
        lvItem.SubItems[2].ForeColor = IsInternal ? Color.Red : Color.Gray;
    } else {
        lvItem.SubItems[2].ForeColor = AllowedHosts.IsInternalUrl(Canonical) ? Color.Green : Color.Red;
    }
}
/**************************************************************************/

/// <summary>
/// Renders one list view row per document named in <paramref name="UrlList"/>,
/// showing its URL, status code, status, MIME type and the result of every
/// custom filter slot.
/// </summary>
/// <remarks>
/// Rows already present in DisplayListView (keyed by the document URL) are
/// updated in place; new rows are collected and added with a single AddRange.
/// URLs that have no document in the collection, and documents rejected by
/// CanApplyCustomFiltersToDocument, are skipped and are not counted in the
/// progress display.
/// </remarks>
/// <param name="DocCollection">Collection the documents are looked up in.</param>
/// <param name="UrlList">URLs of the documents to render.</param>
/// <param name="CustomFilter">Filter set whose slots map to the trailing columns.</param>
/// <exception cref="Exception">Thrown when FilterColOffset is still -1.</exception>
private void RenderListView(
  MacroscopeDocumentCollection DocCollection,
  List<string> UrlList,
  MacroscopeCustomFilters CustomFilter
) {

  if (this.FilterColOffset == -1) {
    throw (new Exception("this.FilterColOffset invalid"));
  }

  // Also protects the percentage calculation below from dividing by zero.
  if (DocCollection.CountDocuments() == 0) {
    return;
  }

  bool ShowProgress = MacroscopePreferencesManager.GetShowProgressDialogues();
  int FilterCount = CustomFilter.GetSize();
  Dictionary<string, int> FilterColsTable = new Dictionary<string, int>(FilterCount);
  List<ListViewItem> ListViewItems = new List<ListViewItem>();
  decimal Count = 0;
  decimal TotalDocs = (decimal)DocCollection.CountDocuments();
  decimal MajorPercentage = ((decimal)100 / TotalDocs) * Count;

  MacroscopeSinglePercentageProgressForm ProgressForm = new MacroscopeSinglePercentageProgressForm(this.MainForm);

  try {

    if (ShowProgress) {
      ProgressForm.UpdatePercentages(
        Title: "Preparing Display",
        Message: "Processing document collection for display:",
        MajorPercentage: MajorPercentage,
        ProgressLabelMajor: string.Format("Document {0} / {1}", Count, TotalDocs)
      );
    }

    // Map each filter pattern to its 1-based column slot. A pattern that was
    // already registered gets a placeholder key so that Add() cannot collide.
    for (int Slot = 0; Slot < FilterCount; Slot++) {
      string FilterPattern = CustomFilter.GetPattern(Slot).Key;
      if (FilterColsTable.ContainsKey(FilterPattern)) {
        FilterColsTable.Add(string.Format("EMPTY{0}", Slot + 1), Slot + 1);
      } else {
        FilterColsTable.Add(FilterPattern, Slot + 1);
      }
    }

    foreach (string Url in UrlList) {

      MacroscopeDocument msDoc = DocCollection.GetDocumentByUrl(Url: Url);

      if (msDoc == null) {
        continue;
      }

      string DocUrl = msDoc.GetUrl();
      string PairKey = DocUrl;
      string StatusCode = ((int)msDoc.GetStatusCode()).ToString();
      string Status = msDoc.GetStatusCode().ToString();
      string MimeType = msDoc.GetMimeType();

      if (!CustomFilter.CanApplyCustomFiltersToDocument(msDoc: msDoc)) {
        continue;
      }

      ListViewItem lvItem = null;

      if (this.DisplayListView.Items.ContainsKey(PairKey)) {
        lvItem = this.DisplayListView.Items[PairKey];
      } else {
        lvItem = new ListViewItem(PairKey);
        lvItem.UseItemStyleForSubItems = false;
        lvItem.Name = PairKey;
        // Four fixed sub-items, then one per filter slot.
        for (int i = 0; i < 4 + FilterCount; i++) {
          lvItem.SubItems.Add("");
        }
        ListViewItems.Add(lvItem);
      }

      if (lvItem == null) {
        // Nothing to colour; skip the row instead of dereferencing null below.
        DebugMsg(string.Format("MacroscopeDisplayCustomFilters MISSING: {0}", PairKey));
        continue;
      }

      try {

        lvItem.SubItems[ColUrl].Text = DocUrl;
        lvItem.SubItems[ColStatusCode].Text = StatusCode;
        lvItem.SubItems[ColStatus].Text = Status;
        lvItem.SubItems[ColMimeType].Text = MimeType;

        for (int Slot = 0; Slot < FilterCount; Slot++) {

          string FilterPattern = CustomFilter.GetPattern(Slot: Slot).Key;
          KeyValuePair<string, MacroscopeConstants.TextPresence> Pair = msDoc.GetCustomFilteredItem(Text: FilterPattern);
          int ColOffset = this.FilterColOffset + FilterColsTable[FilterPattern];

          if ((Pair.Key == null) || (Pair.Value == MacroscopeConstants.TextPresence.UNDEFINED)) {
            continue;
          }

          lvItem.SubItems[ColOffset].Text = MacroscopeConstants.TextPresenceLabels[Pair.Value];

          switch (Pair.Value) {
            case MacroscopeConstants.TextPresence.CONTAINS_STRING:
            case MacroscopeConstants.TextPresence.NOT_CONTAINS_STRING:
            case MacroscopeConstants.TextPresence.CONTAINS_REGEX:
            case MacroscopeConstants.TextPresence.NOT_CONTAINS_REGEX:
              lvItem.SubItems[ColOffset].ForeColor = Color.Green;
              break;
            case MacroscopeConstants.TextPresence.MUST_CONTAIN_STRING:
            case MacroscopeConstants.TextPresence.SHOULD_NOT_CONTAIN_STRING:
            case MacroscopeConstants.TextPresence.MUST_CONTAIN_REGEX:
            case MacroscopeConstants.TextPresence.SHOULD_NOT_CONTAIN_REGEX:
              lvItem.SubItems[ColOffset].ForeColor = Color.Red;
              break;
            default:
              lvItem.SubItems[ColOffset].ForeColor = Color.Gray;
              break;
          }

        }

      } catch (Exception ex) {
        DebugMsg(string.Format("MacroscopeDisplayCustomFilters: {0}", ex.Message));
        DebugMsg(string.Format("MacroscopeDisplayCustomFilters: {0}", ex.StackTrace));
      }

      if (msDoc.GetIsInternal()) {
        lvItem.SubItems[ColUrl].ForeColor = Color.Green;
      } else {
        lvItem.SubItems[ColUrl].ForeColor = Color.Gray;
      }

      // Colour both status columns by the leading digit of the status code.
      Color StatusColor;
      if (Regex.IsMatch(StatusCode, "^[2]")) {
        StatusColor = Color.Green;
      } else if (Regex.IsMatch(StatusCode, "^[3]")) {
        StatusColor = Color.Goldenrod;
      } else if (Regex.IsMatch(StatusCode, "^[45]")) {
        StatusColor = Color.Red;
      } else {
        StatusColor = Color.Blue;
      }
      lvItem.SubItems[ColStatusCode].ForeColor = StatusColor;
      lvItem.SubItems[ColStatus].ForeColor = StatusColor;

      if (ShowProgress) {
        Count++;
        MajorPercentage = ((decimal)100 / TotalDocs) * Count;
        ProgressForm.UpdatePercentages(
          Title: null,
          Message: null,
          MajorPercentage: MajorPercentage,
          ProgressLabelMajor: string.Format("Document {0} / {1}", Count, TotalDocs)
        );
      }

    }

    this.DisplayListView.Items.AddRange(ListViewItems.ToArray());

    this.DisplayListView.AutoResizeColumns(ColumnHeaderAutoResizeStyle.HeaderSize);

    this.DisplayListView.Columns[ColUrl].Width = 300;
    this.DisplayListView.Columns[ColStatusCode].Width = 100;
    this.DisplayListView.Columns[ColStatus].Width = 100;
    this.DisplayListView.Columns[ColMimeType].Width = 100;

  } finally {
    // Runs on every exit path so the progress form is always released,
    // including when rendering throws part-way through.
    if (ShowProgress) {
      ProgressForm.DoClose();
    }
    ProgressForm.Dispose();
  }

}
/**************************************************************************/

/// <summary>
/// Adds a worksheet listing every outlink of every document in the job's
/// document collection, one row per link, and wraps the result in a table.
/// </summary>
/// <param name="JobMaster">Supplies the document collection and allowed hosts.</param>
/// <param name="wb">Workbook that receives the new worksheet.</param>
/// <param name="WorksheetLabel">Name given to the new worksheet.</param>
private void BuildWorksheetPageLinks(
  MacroscopeJobMaster JobMaster,
  XLWorkbook wb,
  string WorksheetLabel
) {

  var ws = wb.Worksheets.Add(WorksheetLabel);

  MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();
  MacroscopeAllowedHosts AllowedHosts = JobMaster.GetAllowedHosts();

  string[] Headings = {
    "URL",
    "Link Type",
    "Source URL",
    "Target URL",
    "Follow",
    "Alt Text",
    "Raw Source URL",
    "Raw Target URL"
  };

  // Header occupies row 1; data starts on row 2.
  for (int Index = 0; Index < Headings.Length; Index++) {
    ws.Cell(1, Index + 1).Value = Headings[Index];
  }

  int LastCol = Headings.Length;
  int Row = 2;

  foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments()) {

    foreach (MacroscopeLink Link in msDoc.IterateOutlinks()) {

      string LinkType = Link.GetLinkType().ToString();
      string SourceUrl = Link.GetSourceUrl();
      string TargetUrl = Link.GetTargetUrl();
      // Null text is normalised to the empty string before formatting.
      string AltText = Link.GetAltText() ?? "";
      string RawSourceUrl = Link.GetRawSourceUrl() ?? "";
      string RawTargetUrl = Link.GetRawTargetUrl() ?? "";
      string DoFollow = Link.GetDoFollow() ? "Follow" : "No Follow";

      int Col = 1;

      this.InsertAndFormatUrlCell(ws, Row, Col, msDoc);

      XLColor UrlColor = AllowedHosts.IsInternalUrl(Url: msDoc.GetUrl()) ? XLColor.Green : XLColor.Gray;
      ws.Cell(Row, Col).Style.Font.SetFontColor(UrlColor);

      // Remaining columns, in header order.
      string[] Fields = {
        LinkType,
        SourceUrl,
        TargetUrl,
        DoFollow,
        AltText,
        RawSourceUrl,
        RawTargetUrl
      };

      foreach (string Field in Fields) {
        Col++;
        this.InsertAndFormatContentCell(ws, Row, Col, this.FormatIfMissing(Field));
      }

      Row++;

    }

  }

  ws.Range(1, 1, Row - 1, LastCol).CreateTable();

}
/**************************************************************************/

/// <summary>
/// Adds a worksheet listing, for every document whose status code is in the
/// 300-399 range, each inbound hyperlink source (origin) alongside the
/// document itself (destination), and wraps the result in a table.
/// </summary>
/// <remarks>
/// Documents without inbound hyperlink data, and inbound links whose source
/// URL is null or empty, produce no rows.
/// </remarks>
/// <param name="JobMaster">Supplies the document collection and allowed hosts.</param>
/// <param name="wb">Workbook that receives the new worksheet.</param>
/// <param name="WorksheetLabel">Name given to the new worksheet.</param>
private void BuildWorksheetPageRedirectedLinks(
  MacroscopeJobMaster JobMaster,
  XLWorkbook wb,
  string WorksheetLabel
) {

  var ws = wb.Worksheets.Add(WorksheetLabel);

  int iRow = 1;
  int iCol = 1;
  int iColMax = 1;

  MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();
  MacroscopeAllowedHosts AllowedHosts = JobMaster.GetAllowedHosts();

  {
    ws.Cell(iRow, iCol).Value = "Status Code";
    iCol++;
    ws.Cell(iRow, iCol).Value = "Status";
    iCol++;
    ws.Cell(iRow, iCol).Value = "Origin URL";
    iCol++;
    ws.Cell(iRow, iCol).Value = "Destination URL";
  }

  iColMax = iCol;
  iRow++;

  foreach (string Url in DocCollection.DocumentKeys()) {

    MacroscopeDocument msDoc = DocCollection.GetDocument(Url);
    MacroscopeHyperlinksIn HyperlinksIn = DocCollection.GetDocumentHyperlinksIn(Url);
    int StatusCode = (int)msDoc.GetStatusCode();
    string Status = msDoc.GetStatusCode().ToString();

    // Only redirects that have inbound link data are reported.
    if ((StatusCode < 300) || (StatusCode > 399) || (HyperlinksIn == null)) {
      continue;
    }

    foreach (MacroscopeHyperlinkIn HyperlinkIn in HyperlinksIn.IterateLinks()) {

      string OriginUrl = HyperlinkIn.GetSourceUrl();

      if (string.IsNullOrEmpty(OriginUrl)) {
        continue;
      }

      iCol = 1;

      // The status cells are always blue here: the former 400-599 "red"
      // test could never be true inside the 300-399 guard above.
      this.InsertAndFormatContentCell(ws, iRow, iCol, StatusCode.ToString());
      ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Blue);

      iCol++;

      this.InsertAndFormatContentCell(ws, iRow, iCol, Status);
      ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Blue);

      iCol++;

      this.InsertAndFormatUrlCell(ws, iRow, iCol, OriginUrl);

      if (AllowedHosts.IsInternalUrl(Url: OriginUrl)) {
        ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Green);
      } else {
        ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Gray);
      }

      iCol++;

      this.InsertAndFormatUrlCell(ws, iRow, iCol, msDoc);

      if (AllowedHosts.IsInternalUrl(Url: Url)) {
        ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Green);
      } else {
        ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Gray);
      }

      iRow++;

    }

  }

  {
    var rangeData = ws.Range(1, 1, iRow - 1, iColMax);
    var excelTable = rangeData.CreateTable();
  }

}
/**************************************************************************/

/// <summary>
/// Adds a worksheet listing every document whose checksum is shared with at
/// least one other document, grouped by checksum, and wraps it in a table.
/// </summary>
/// <remarks>
/// Documents with a null or empty checksum are ignored. Progress is pushed to
/// this.ProgressForm once per document examined for each duplicated checksum.
/// </remarks>
/// <param name="JobMaster">Supplies the document collection.</param>
/// <param name="wb">Workbook that receives the new worksheet.</param>
/// <param name="WorksheetLabel">Name given to the new worksheet.</param>
private void BuildWorksheetPageDuplicateChecksums(
  MacroscopeJobMaster JobMaster,
  XLWorkbook wb,
  string WorksheetLabel
) {

  var ws = wb.Worksheets.Add(WorksheetLabel);

  MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();
  MacroscopeAllowedHosts AllowedHosts = JobMaster.GetAllowedHosts();

  int Capacity = DocCollection.CountDocuments();
  Dictionary<string, int> ChecksumTally = new Dictionary<string, int>(Capacity);
  Dictionary<string, MacroscopeDocument> DocsByUrl = new Dictionary<string, MacroscopeDocument>(Capacity);
  decimal DocTotal = (decimal)DocCollection.CountDocuments();

  // Pass 1: count how often each checksum occurs, remembering each document once per URL.
  foreach (MacroscopeDocument Candidate in DocCollection.IterateDocuments()) {

    string Digest = Candidate.GetChecksum();

    if (string.IsNullOrEmpty(Digest)) {
      continue;
    }

    string CandidateUrl = Candidate.GetUrl();

    if (!DocsByUrl.ContainsKey(CandidateUrl)) {
      DocsByUrl.Add(CandidateUrl, Candidate);
    }

    int Seen;
    ChecksumTally.TryGetValue(Digest, out Seen);
    ChecksumTally[Digest] = Seen + 1;

  }

  string[] Headings = { "Status Code", "Status", "Occurrences", "Checksum", "URL" };

  for (int Index = 0; Index < Headings.Length; Index++) {
    ws.Cell(1, Index + 1).Value = Headings[Index];
  }

  int LastCol = Headings.Length;
  int Row = 2;
  decimal ChecksumsVisited = 0;

  // Pass 2: for every duplicated checksum, emit one row per matching document.
  foreach (KeyValuePair<string, int> Entry in ChecksumTally) {

    string Checksum = Entry.Key;
    int Occurrences = Entry.Value;

    ChecksumsVisited++;

    if (Occurrences <= 1) {
      continue;
    }

    decimal DocsVisited = 0;

    foreach (MacroscopeDocument msDoc in DocsByUrl.Values) {

      DocsVisited++;

      if (DocTotal > 0) {
        this.ProgressForm.UpdatePercentages(
          Title: null,
          Message: null,
          MajorPercentage: -1,
          ProgressLabelMajor: string.Format("Documents Processed: {0}", ChecksumsVisited),
          MinorPercentage: ((decimal)100 / DocTotal) * ChecksumsVisited,
          ProgressLabelMinor: Checksum,
          SubMinorPercentage: ((decimal)100 / DocTotal) * DocsVisited,
          ProgressLabelSubMinor: msDoc.GetUrl()
        );
      }

      if (msDoc.GetChecksum() != Checksum) {
        continue;
      }

      HttpStatusCode Status = msDoc.GetStatusCode();
      int StatusCode = (int)Status;
      int Col = 1;

      this.InsertAndFormatStatusCodeCell(ws, Row, Col, StatusCode);
      Col++;
      this.InsertAndFormatStatusCodeCell(ws, Row, Col, Status);
      Col++;
      this.InsertAndFormatContentCell(ws, Row, Col, Occurrences);
      Col++;
      this.InsertAndFormatContentCell(ws, Row, Col, msDoc.GetChecksum());
      Col++;
      this.InsertAndFormatUrlCell(ws, Row, Col, msDoc);

      Row++;

    }

  }

  ws.Range(1, 1, Row - 1, LastCol).CreateTable();

}
/**************************************************************************/

/// <summary>
/// Adds a worksheet listing internal HTML and PDF documents whose title
/// occurs more than once according to GetStatsTitleCount, and wraps the
/// result in a table.
/// </summary>
/// <remarks>
/// Progress is pushed to this.ProgressForm once per document visited.
/// </remarks>
/// <param name="JobMaster">Supplies the document collection and allowed hosts.</param>
/// <param name="wb">Workbook that receives the new worksheet.</param>
/// <param name="WorksheetLabel">Name given to the new worksheet.</param>
private void BuildWorksheetPageDuplicateTitles(
  MacroscopeJobMaster JobMaster,
  XLWorkbook wb,
  string WorksheetLabel
) {

  var ws = wb.Worksheets.Add(WorksheetLabel);

  int iRow = 1;
  int iCol = 1;
  int iColMax = 1;

  decimal Count = 0;
  decimal DocCount = 0;

  MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();
  MacroscopeAllowedHosts AllowedHosts = JobMaster.GetAllowedHosts();

  DocCount = (decimal)DocCollection.CountDocuments();

  {
    ws.Cell(iRow, iCol).Value = "URL";
    iCol++;
    ws.Cell(iRow, iCol).Value = "Occurrences";
    iCol++;
    ws.Cell(iRow, iCol).Value = "Title";
  }

  iColMax = iCol;
  iRow++;

  foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments()) {

    bool Proceed = false;

    if (DocCount > 0) {
      Count++;
      this.ProgressForm.UpdatePercentages(
        Title: null,
        Message: null,
        MajorPercentage: -1,
        ProgressLabelMajor: string.Format("Documents Processed: {0}", Count),
        MinorPercentage: ((decimal)100 / DocCount) * Count,
        ProgressLabelMinor: msDoc.GetUrl(),
        SubMinorPercentage: -1,
        ProgressLabelSubMinor: null
      );
    }

    // Only internal HTML and PDF documents are considered.
    if (AllowedHosts.IsInternalUrl(Url: msDoc.GetUrl())) {
      switch (msDoc.GetDocumentType()) {
        case MacroscopeConstants.DocumentType.HTML:
        case MacroscopeConstants.DocumentType.PDF:
          Proceed = true;
          break;
        default:
          Proceed = false;
          break;
      }
    }

    if (!Proceed) {
      continue;
    }

    string Title = msDoc.GetTitle();
    int Occurrences = DocCollection.GetStatsTitleCount(msDoc: msDoc);

    if (Occurrences <= 1) {
      continue;
    }

    iCol = 1;

    this.InsertAndFormatUrlCell(ws, iRow, iCol, msDoc);

    if (msDoc.GetIsInternal()) {
      ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Green);
    } else {
      ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Gray);
    }

    iCol++;

    // Every row written here has Occurrences > 1, so the count is always
    // orange; the former green branch could never run.
    this.InsertAndFormatContentCell(ws, iRow, iCol, Occurrences);
    ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Orange);

    iCol++;

    this.InsertAndFormatContentCell(ws, iRow, iCol, this.FormatIfMissing(Title));

    iRow++;

  }

  {
    var rangeData = ws.Range(1, 1, iRow - 1, iColMax);
    var excelTable = rangeData.CreateTable();
  }

}
/**************************************************************************/

/// <summary>
/// Renders one list view row per (document, CSS selector label, extracted
/// value) triple for the documents named in <paramref name="UrlList"/>.
/// </summary>
/// <remarks>
/// Rows are keyed by a digest built from the document URL, the label and the
/// value; rows already present in DisplayListView are updated in place and new
/// rows are added with a single AddRange. URLs with no document in the
/// collection, and documents rejected by CanApplyDataExtractorsToDocument, are
/// skipped and are not counted in the progress display. Pairs with an empty
/// label or value produce no row.
/// </remarks>
/// <param name="DocCollection">Collection the documents are looked up in.</param>
/// <param name="UrlList">URLs of the documents to render.</param>
/// <param name="DataExtractor">Decides whether extraction applies to a document.</param>
private void RenderListView(
  MacroscopeDocumentCollection DocCollection,
  List<string> UrlList,
  MacroscopeDataExtractorCssSelectors DataExtractor
) {

  // Also protects the percentage calculation below from dividing by zero.
  if (DocCollection.CountDocuments() == 0) {
    return;
  }

  bool ShowProgress = MacroscopePreferencesManager.GetShowProgressDialogues();
  List<ListViewItem> ListViewItems = new List<ListViewItem>();
  decimal Count = 0;
  decimal TotalDocs = (decimal)DocCollection.CountDocuments();
  decimal MajorPercentage = ((decimal)100 / TotalDocs) * Count;

  MacroscopeSinglePercentageProgressForm ProgressForm = new MacroscopeSinglePercentageProgressForm(this.MainForm);

  try {

    if (ShowProgress) {
      ProgressForm.UpdatePercentages(
        Title: "Preparing Display",
        Message: "Processing document collection for display:",
        MajorPercentage: MajorPercentage,
        ProgressLabelMajor: string.Format("Document {0} / {1}", Count, TotalDocs)
      );
    }

    foreach (string Url in UrlList) {

      MacroscopeDocument msDoc = DocCollection.GetDocumentByUrl(Url: Url);

      // The lookup can miss; skip unknown URLs rather than dereferencing null.
      if (msDoc == null) {
        continue;
      }

      string DocUrl = msDoc.GetUrl();
      string StatusCode = ((int)msDoc.GetStatusCode()).ToString();
      string Status = msDoc.GetStatusCode().ToString();
      string MimeType = msDoc.GetMimeType();

      if (!DataExtractor.CanApplyDataExtractorsToDocument(msDoc: msDoc)) {
        continue;
      }

      foreach (KeyValuePair<string, string> DataExtractedPair in msDoc.IterateDataExtractedCssSelectors()) {

        ListViewItem lvItem = null;
        string CssSelectorLabel = DataExtractedPair.Key;
        string ExtractedValue = DataExtractedPair.Value;
        string PairKey = null;

        if (string.IsNullOrEmpty(CssSelectorLabel) || string.IsNullOrEmpty(ExtractedValue)) {
          continue;
        }

        PairKey = string.Join(
          ":",
          UrlToDigest(DocUrl),
          UrlToDigest(Macroscope.GetStringDigest(Text: CssSelectorLabel)),
          UrlToDigest(Macroscope.GetStringDigest(Text: ExtractedValue))
        );

        if (this.DisplayListView.Items.ContainsKey(PairKey)) {
          lvItem = this.DisplayListView.Items[PairKey];
        } else {
          lvItem = new ListViewItem(PairKey);
          lvItem.UseItemStyleForSubItems = false;
          lvItem.Name = PairKey;
          for (int i = 0; i < 6; i++) {
            lvItem.SubItems.Add("");
          }
          ListViewItems.Add(lvItem);
        }

        if (lvItem == null) {
          // Nothing to colour; skip the row instead of dereferencing null below.
          DebugMsg(string.Format("MacroscopeDisplayDataExtractorCssSelectors MISSING: {0}", PairKey));
          continue;
        }

        try {
          lvItem.SubItems[ColUrl].Text = DocUrl;
          lvItem.SubItems[ColStatusCode].Text = StatusCode;
          lvItem.SubItems[ColStatus].Text = Status;
          lvItem.SubItems[ColMimeType].Text = MimeType;
          lvItem.SubItems[ColCssSelectorLabel].Text = CssSelectorLabel;
          lvItem.SubItems[ColExtractedValue].Text = ExtractedValue;
        } catch (Exception ex) {
          DebugMsg(string.Format("MacroscopeDisplayDataExtractorCssSelectors: {0}", ex.Message));
          DebugMsg(string.Format("MacroscopeDisplayDataExtractorCssSelectors: {0}", ex.StackTrace));
        }

        if (msDoc.GetIsInternal()) {
          lvItem.SubItems[ColUrl].ForeColor = Color.Green;
        } else {
          lvItem.SubItems[ColUrl].ForeColor = Color.Gray;
        }

        // Colour both status columns by the leading digit of the status code.
        Color StatusColor;
        if (Regex.IsMatch(StatusCode, "^[2]")) {
          StatusColor = Color.Green;
        } else if (Regex.IsMatch(StatusCode, "^[3]")) {
          StatusColor = Color.Goldenrod;
        } else if (Regex.IsMatch(StatusCode, "^[45]")) {
          StatusColor = Color.Red;
        } else {
          StatusColor = Color.Blue;
        }
        lvItem.SubItems[ColStatusCode].ForeColor = StatusColor;
        lvItem.SubItems[ColStatus].ForeColor = StatusColor;

      }

      if (ShowProgress) {
        Count++;
        MajorPercentage = ((decimal)100 / TotalDocs) * Count;
        ProgressForm.UpdatePercentages(
          Title: null,
          Message: null,
          MajorPercentage: MajorPercentage,
          ProgressLabelMajor: string.Format("Document {0} / {1}", Count, TotalDocs)
        );
      }

    }

    this.DisplayListView.Items.AddRange(ListViewItems.ToArray());

    this.DisplayListView.AutoResizeColumns(ColumnHeaderAutoResizeStyle.ColumnContent);

    this.DisplayListView.Columns[ColUrl].Width = 300;
    this.DisplayListView.Columns[ColStatusCode].Width = 100;
    this.DisplayListView.Columns[ColStatus].Width = 100;
    this.DisplayListView.Columns[ColMimeType].Width = 100;

  } finally {
    // Runs on every exit path so the progress form is always released,
    // including when rendering throws part-way through.
    if (ShowProgress) {
      ProgressForm.DoClose();
    }
    ProgressForm.Dispose();
  }

}
/**************************************************************************/

/// <summary>
/// Renders the URI queue into DisplayListView, one row per queue item, with
/// the 1-based queue position in the first column and the URL in the second.
/// </summary>
/// <remarks>
/// Rows already present (keyed by URL) only have their position refreshed; new
/// rows are collected and added with a single AddRange. A null or empty queue
/// is a no-op.
/// </remarks>
/// <param name="UriQueue">Queue items to display, in display order.</param>
private void RenderListView(MacroscopeJobItem[] UriQueue) {

  // The length check also protects the percentage calculation from dividing by zero.
  if ((UriQueue == null) || (UriQueue.Length == 0)) {
    return;
  }

  bool ShowProgress = MacroscopePreferencesManager.GetShowProgressDialogues();
  // At most one new row per queue item.
  List<ListViewItem> ListViewItems = new List<ListViewItem>(UriQueue.Length);
  MacroscopeAllowedHosts AllowedHosts = this.MainForm.GetJobMaster().GetAllowedHosts();
  decimal Count = 0;
  decimal TotalDocs = (decimal)UriQueue.Length;
  decimal MajorPercentage = ((decimal)100 / TotalDocs) * Count;

  MacroscopeSinglePercentageProgressForm ProgressForm = new MacroscopeSinglePercentageProgressForm(this.MainForm);

  try {

    if (ShowProgress) {
      ProgressForm.UpdatePercentages(
        Title: "Preparing Display",
        Message: "Processing URI Queue for display:",
        MajorPercentage: MajorPercentage,
        ProgressLabelMajor: string.Format("URL {0} / {1}", Count, TotalDocs)
      );
    }

    for (int i = 0; i < UriQueue.Length; i++) {

      ListViewItem lvItem = null;
      string Url = UriQueue[i].GetItemUrl();
      int Item = i + 1;

      if (this.DisplayListView.Items.ContainsKey(Url)) {

        try {
          lvItem = this.DisplayListView.Items[Url];
          lvItem.SubItems[0].Text = Item.ToString();
        } catch (Exception ex) {
          DebugMsg(string.Format("RenderListView 1: {0}", ex.Message));
        }

      } else {

        try {
          lvItem = new ListViewItem(Url);
          lvItem.UseItemStyleForSubItems = false;
          lvItem.Name = Url;
          lvItem.SubItems[0].Text = Item.ToString();
          lvItem.SubItems.Add(Url);
          ListViewItems.Add(lvItem);
        } catch (Exception ex) {
          DebugMsg(string.Format("RenderListView 2: {0}", ex.Message));
        }

      }

      if (lvItem != null) {

        lvItem.ForeColor = Color.Blue;
        lvItem.SubItems[0].ForeColor = Color.Blue;

        if (AllowedHosts.IsInternalUrl(Url)) {
          lvItem.SubItems[1].ForeColor = Color.Green;
        } else {
          lvItem.SubItems[1].ForeColor = Color.Gray;
        }

      }

      if (ShowProgress) {
        Count++;
        MajorPercentage = ((decimal)100 / TotalDocs) * Count;
        ProgressForm.UpdatePercentages(
          Title: null,
          Message: null,
          MajorPercentage: MajorPercentage,
          ProgressLabelMajor: string.Format("URL {0} / {1}", Count, TotalDocs)
        );
      }

    }

    this.DisplayListView.Items.AddRange(ListViewItems.ToArray());

  } finally {
    // Runs on every exit path so the progress form is always released,
    // including when rendering throws part-way through.
    if (ShowProgress) {
      ProgressForm.DoClose();
    }
    ProgressForm.Dispose();
  }

}