/**************************************************************************/
/// <summary>
/// Writes the "Email Address" / "URL" CSV worksheet: a header record, then
/// one record per email address key found on each HTML document.
/// </summary>
/// <param name="JobMaster">Job whose document collection is exported.</param>
/// <param name="ws">CSV writer that receives the records.</param>
private void BuildWorksheetEmailAddresses(
  MacroscopeJobMaster JobMaster,
  CsvWriter ws
)
{
  MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

  // Header record.
  ws.WriteField("Email Address");
  ws.WriteField("URL");
  ws.NextRecord();

  foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
  {
    if (!msDoc.IsDocumentType(Type: MacroscopeConstants.DocumentType.HTML))
    {
      continue;
    }

    // Only the dictionary keys are exported; the values are not used here.
    Dictionary<string, string> EmailAddresses = msDoc.GetEmailAddresses();

    foreach (string EmailAddress in EmailAddresses.Keys)
    {
      this.InsertAndFormatContentCell(ws, EmailAddress);
      this.InsertAndFormatUrlCell(ws, msDoc);
      ws.NextRecord();
    }
  }
}
/**************************************************************************/
/// <summary>
/// Builds list view rows for every document in the collection by calling the
/// per-document RenderListViewSearchTargetUrls overload, optionally reporting
/// progress through a percentage dialogue, then adds the rows to
/// DisplayListView in a single AddRange call.
/// </summary>
/// <param name="DocCollection">Documents to render.</param>
/// <param name="UrlFragment">Fragment passed through to the per-document overload.</param>
public void RenderListViewSearchTargetUrls(
  MacroscopeDocumentCollection DocCollection,
  string UrlFragment
)
{
  int DocumentTotal = DocCollection.CountDocuments();

  // Nothing to render; this also keeps the percentage maths below from
  // dividing by a zero document total (decimal division by zero throws).
  if (DocumentTotal == 0)
  {
    return;
  }

  List<ListViewItem> ListViewItems = new List<ListViewItem>(DocumentTotal);

  // Read the preference once so the dialogue is opened, updated and closed consistently.
  bool ShowProgress = MacroscopePreferencesManager.GetShowProgressDialogues();

  MacroscopeSinglePercentageProgressForm ProgressForm = new MacroscopeSinglePercentageProgressForm(this.MainForm);

  try
  {
    decimal Count = 0;
    decimal TotalDocs = (decimal)DocumentTotal;

    if (ShowProgress)
    {
      ProgressForm.UpdatePercentages(
        Title: "Displaying Links",
        Message: "Processing links in document collection for display:",
        MajorPercentage: ((decimal)100 / TotalDocs) * Count,
        ProgressLabelMajor: string.Format("Document {0} / {1}", Count, TotalDocs)
      );
    }

    foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
    {
      // The URL is read only after the null check; previously GetUrl() was
      // called first, which made the check useless.
      if (msDoc != null)
      {
        this.RenderListViewSearchTargetUrls(
          ListViewItems: ListViewItems,
          msDoc: msDoc,
          Url: msDoc.GetUrl(),
          UrlFragment: UrlFragment
        );
      }

      if (ShowProgress)
      {
        Count++;
        ProgressForm.UpdatePercentages(
          Title: null,
          Message: null,
          MajorPercentage: ((decimal)100 / TotalDocs) * Count,
          ProgressLabelMajor: string.Format("Document {0} / {1}", Count, TotalDocs)
        );
      }
    }

    this.DisplayListView.Items.AddRange(ListViewItems.ToArray());

    if (ShowProgress)
    {
      ProgressForm.DoClose();
    }
  }
  finally
  {
    // Always release the form, even when rendering throws.
    ProgressForm.Dispose();
  }
}
/**************************************************************************/
/// <summary>
/// Writes a CSV worksheet with "URL" and "Status" columns containing every
/// document for which GetIsInternal() is true and GetAllowedByRobots() is false.
/// </summary>
/// <param name="JobMaster">Job whose document collection is exported.</param>
/// <param name="ws">CSV writer that receives the records.</param>
private void BuildWorksheetBlockedByRobotsInternal(
  MacroscopeJobMaster JobMaster,
  CsvWriter ws
)
{
  MacroscopeDocumentCollection Documents = JobMaster.GetDocCollection();

  // Header record.
  ws.WriteField("URL");
  ws.WriteField("Status");
  ws.NextRecord();

  foreach (MacroscopeDocument Document in Documents.IterateDocuments())
  {
    // Skip anything that is external, or that robots rules allow.
    if (!Document.GetIsInternal() || Document.GetAllowedByRobots())
    {
      continue;
    }

    this.InsertAndFormatUrlCell(ws, Document);
    this.InsertAndFormatRobotsCell(ws, Document);
    ws.NextRecord();
  }
}
/**************************************************************************/
/// <summary>
/// Writes the "Telephone Number" / "URL" CSV worksheet: a header record, then
/// one record per telephone number key found on each document for which
/// GetIsHtml() is true.
/// </summary>
/// <param name="JobMaster">Job whose document collection is exported.</param>
/// <param name="ws">CSV writer that receives the records.</param>
private void BuildWorksheetTelephoneNumbers(
  MacroscopeJobMaster JobMaster,
  CsvWriter ws
)
{
  MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

  // Header record.
  ws.WriteField("Telephone Number");
  ws.WriteField("URL");
  ws.NextRecord();

  foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
  {
    if (!msDoc.GetIsHtml())
    {
      continue;
    }

    // Only the dictionary keys are exported; the values are not used here.
    Dictionary<string, string> TelephoneNumbers = msDoc.GetTelephoneNumbers();

    foreach (string TelephoneNumber in TelephoneNumbers.Keys)
    {
      this.InsertAndFormatContentCell(ws, TelephoneNumber);
      this.InsertAndFormatUrlCell(ws, msDoc);
      ws.NextRecord();
    }
  }
}
/** Render Entire DocCollection *******************************************/
/// <summary>
/// Renders every non-null document of the collection into the tree view via
/// the per-document RenderTreeView overload, optionally reporting progress
/// through a percentage dialogue. Returns immediately for an empty collection.
/// </summary>
/// <param name="DocCollection">Documents to render.</param>
public void RenderTreeView(MacroscopeDocumentCollection DocCollection)
{
  // Read the count once: the same value guards the early return and is the
  // divisor for the percentage, so the divisor can never be zero.
  int DocumentTotal = DocCollection.CountDocuments();

  if (DocumentTotal == 0)
  {
    return;
  }

  // Read the preference once so the dialogue is opened, updated and closed consistently.
  bool ShowProgress = MacroscopePreferencesManager.GetShowProgressDialogues();

  MacroscopeSinglePercentageProgressForm ProgressForm = new MacroscopeSinglePercentageProgressForm(this.MainForm);

  try
  {
    decimal Count = 0;
    decimal TotalDocs = (decimal)DocumentTotal;
    decimal MajorPercentage = ((decimal)100 / TotalDocs) * Count;

    if (ShowProgress)
    {
      ProgressForm.UpdatePercentages(
        Title: "Preparing Display",
        Message: "Processing document collection for display:",
        MajorPercentage: MajorPercentage,
        ProgressLabelMajor: string.Format("Document {0} / {1}", Count, TotalDocs)
      );
    }

    foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
    {
      // Pump pending window messages between documents.
      Application.DoEvents();

      if (msDoc == null)
      {
        continue;
      }

      string Url = msDoc.GetUrl();

      this.RenderTreeView(msDoc, Url);

      if (ShowProgress)
      {
        Count++;
        MajorPercentage = ((decimal)100 / TotalDocs) * Count;
        ProgressForm.UpdatePercentages(
          Title: null,
          Message: null,
          MajorPercentage: MajorPercentage,
          ProgressLabelMajor: string.Format("Document {0} / {1}", Count, TotalDocs)
        );
      }
    }

    if (ShowProgress)
    {
      ProgressForm.DoClose();
    }
  }
  finally
  {
    // Always release the form, even when rendering throws.
    ProgressForm.Dispose();
  }
}
/**************************************************************************/
/// <summary>
/// Writes the regular-expression data extraction CSV worksheet. For each
/// document accepted by DataExtractorRegexes.CanApplyDataExtractorsToDocument,
/// one record is written per extracted label/value pair whose label and
/// value are both non-empty.
/// </summary>
/// <param name="JobMaster">Job whose document collection is exported.</param>
/// <param name="ws">CSV writer that receives the records.</param>
private void BuildWorksheetRegularExpressions(
  MacroscopeJobMaster JobMaster,
  CsvWriter ws
)
{
  MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

  // Header record.
  ws.WriteField(MacroscopeConstants.Url);
  ws.WriteField(MacroscopeConstants.StatusCode);
  ws.WriteField(MacroscopeConstants.Status);
  ws.WriteField(MacroscopeConstants.ContentType);
  ws.WriteField("Extracted Label");
  ws.WriteField("Extracted Value");
  ws.NextRecord();

  foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
  {
    // Check applicability first so no per-document values are computed for skipped documents.
    if (!this.DataExtractorRegexes.CanApplyDataExtractorsToDocument(msDoc: msDoc))
    {
      continue;
    }

    string Status = msDoc.GetStatusCode().ToString();
    string MimeType = msDoc.GetMimeType();

    foreach (KeyValuePair<string, string> DataExtractedPair in msDoc.IterateDataExtractedRegexes())
    {
      string ExtractedLabel = DataExtractedPair.Key;
      string ExtractedValue = DataExtractedPair.Value;

      if (string.IsNullOrEmpty(ExtractedLabel) || string.IsNullOrEmpty(ExtractedValue))
      {
        continue;
      }

      this.InsertAndFormatUrlCell(ws, msDoc);
      this.InsertAndFormatStatusCodeCell(ws, msDoc);
      this.InsertAndFormatContentCell(ws, this.FormatIfMissing(Status));
      this.InsertAndFormatContentCell(ws, this.FormatIfMissing(MimeType));
      this.InsertAndFormatContentCell(ws, this.FormatIfMissing(ExtractedLabel));
      this.InsertAndFormatContentCell(ws, this.FormatIfMissing(ExtractedValue));
      ws.NextRecord();
    }
  }
}
/**************************************************************************/
/// <summary>
/// Writes the broken links CSV worksheet. For each document whose status code
/// is in the 400-599 range, one record is written per inbound hyperlink that
/// has a non-empty source URL.
/// </summary>
/// <param name="JobMaster">Job whose document collection is exported.</param>
/// <param name="ws">CSV writer that receives the records.</param>
private void BuildWorksheetPageBrokenLinks(
  MacroscopeJobMaster JobMaster,
  CsvWriter ws
)
{
  MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

  // Header record.
  ws.WriteField("Status Code");
  ws.WriteField("Status");
  ws.WriteField("Anchor Text");
  ws.WriteField("Alt Text");
  ws.WriteField("Origin URL");
  ws.WriteField("Destination URL");
  ws.NextRecord();

  foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
  {
    int StatusCode = (int)msDoc.GetStatusCode();

    if ((StatusCode < 400) || (StatusCode > 599))
    {
      continue;
    }

    // Look up inbound links only for documents that are actually in error.
    MacroscopeHyperlinksIn HyperlinksIn = DocCollection.GetDocumentHyperlinksIn(msDoc.GetUrl());

    if (HyperlinksIn == null)
    {
      continue;
    }

    string Status = msDoc.GetStatusCode().ToString();

    foreach (MacroscopeHyperlinkIn HyperlinkIn in HyperlinksIn.IterateLinks())
    {
      string OriginUrl = HyperlinkIn.GetSourceUrl();
      string AnchorText = HyperlinkIn.GetAnchorText();
      string AltText = HyperlinkIn.GetAltText();

      if (string.IsNullOrEmpty(OriginUrl))
      {
        continue;
      }

      this.InsertAndFormatContentCell(ws, StatusCode.ToString());
      this.InsertAndFormatContentCell(ws, Status);
      this.InsertAndFormatContentCell(ws, AnchorText);
      this.InsertAndFormatContentCell(ws, AltText);
      this.InsertAndFormatUrlCell(ws, OriginUrl);
      this.InsertAndFormatUrlCell(ws, msDoc);
      ws.NextRecord();
    }
  }
}
/**************************************************************************/
/// <summary>
/// Builds list view rows for every document whose URL contains
/// <paramref name="UrlFragment"/> (culture-sensitive IndexOf), optionally
/// reporting progress through a percentage dialogue, then adds the rows to
/// DisplayListView in a single AddRange call.
/// </summary>
/// <param name="DocCollection">Documents to search.</param>
/// <param name="UrlFragment">Substring that a document URL must contain.</param>
public void RenderListViewSearchSourceUrls(
  MacroscopeDocumentCollection DocCollection,
  string UrlFragment
)
{
  int DocumentTotal = DocCollection.CountDocuments();

  // Nothing to render; this also keeps the percentage maths below from
  // dividing by a zero document total (decimal division by zero throws).
  if (DocumentTotal == 0)
  {
    return;
  }

  List<ListViewItem> ListViewItems = new List<ListViewItem>(DocumentTotal);

  // Read the preference once so the dialogue is opened, updated and closed consistently.
  bool ShowProgress = MacroscopePreferencesManager.GetShowProgressDialogues();

  MacroscopeSinglePercentageProgressForm ProgressForm = new MacroscopeSinglePercentageProgressForm(this.MainForm);

  try
  {
    decimal Count = 0;
    decimal TotalDocs = (decimal)DocumentTotal;

    if (ShowProgress)
    {
      ProgressForm.UpdatePercentages(
        Title: "Displaying Links",
        Message: "Processing links in document collection for display:",
        MajorPercentage: ((decimal)100 / TotalDocs) * Count,
        ProgressLabelMajor: "Documents Processed"
      );
    }

    foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
    {
      string Url = msDoc.GetUrl();

      if (Url.IndexOf(UrlFragment, StringComparison.CurrentCulture) >= 0)
      {
        this.RenderListView(
          ListViewItems: ListViewItems,
          DocCollection: DocCollection,
          msDoc: msDoc,
          Url: Url
        );
      }

      if (ShowProgress)
      {
        Count++;

        // The total is re-read on every pass, as before; guard the division
        // in case the re-read value is zero.
        TotalDocs = (decimal)DocCollection.CountDocuments();
        decimal MajorPercentage = (TotalDocs > 0) ? ((decimal)100 / TotalDocs) * Count : 100;

        ProgressForm.UpdatePercentages(
          Title: null,
          Message: null,
          MajorPercentage: MajorPercentage,
          ProgressLabelMajor: null
        );
      }
    }

    this.DisplayListView.Items.AddRange(ListViewItems.ToArray());

    if (ShowProgress)
    {
      ProgressForm.DoClose();
    }
  }
  finally
  {
    // Always release the form, even when rendering throws.
    ProgressForm.Dispose();
  }
}
/**************************************************************************/
/// <summary>
/// Calls ClearLevenshteinNearDuplicates() on every document in the collection
/// and then returns the result of AnalyzeDocCollection for the same collection.
/// </summary>
/// <param name="DocCollection">Documents to reset and analyse again.</param>
/// <returns>Whatever AnalyzeDocCollection returns for the collection.</returns>
public Dictionary<MacroscopeDocument, int> ReanalyzeDocCollection(MacroscopeDocumentCollection DocCollection)
{
  // Reset each document before the analysis is run again.
  foreach (MacroscopeDocument Document in DocCollection.IterateDocuments())
  {
    Document.ClearLevenshteinNearDuplicates();
  }

  Dictionary<MacroscopeDocument, int> Analysis = this.AnalyzeDocCollection(DocCollection: DocCollection);

  return Analysis;
}
/**************************************************************************/
/// <summary>
/// Adds an Excel worksheet with "URL" and "Status" columns listing every
/// document for which GetIsInternal() is true and GetAllowedByRobots() is
/// false, then turns the filled range into a table.
/// </summary>
/// <param name="JobMaster">Job whose document collection is exported.</param>
/// <param name="wb">Workbook that receives the new worksheet.</param>
/// <param name="WorksheetLabel">Name of the worksheet to add.</param>
private void BuildWorksheetBlockedByRobotsInternal(
  MacroscopeJobMaster JobMaster,
  XLWorkbook wb,
  string WorksheetLabel
)
{
  var Sheet = wb.Worksheets.Add(WorksheetLabel);

  // Fixed column layout of this sheet.
  int UrlColumn = 1;
  int StatusColumn = 2;
  int Row = 1;

  MacroscopeDocumentCollection Documents = JobMaster.GetDocCollection();
  MacroscopeAllowedHosts Hosts = JobMaster.GetAllowedHosts();

  // Header row.
  Sheet.Cell(Row, UrlColumn).Value = "URL";
  Sheet.Cell(Row, StatusColumn).Value = "Status";
  Row++;

  foreach (MacroscopeDocument Document in Documents.IterateDocuments())
  {
    // Skip anything that is external, or that robots rules allow.
    if (!Document.GetIsInternal() || Document.GetAllowedByRobots())
    {
      continue;
    }

    this.InsertAndFormatUrlCell(Sheet, Row, UrlColumn, Document);

    // Green for URLs the allowed-hosts list treats as internal, gray otherwise.
    XLColor UrlColour = Hosts.IsInternalUrl(Url: Document.GetUrl()) ? XLColor.Green : XLColor.Gray;
    Sheet.Cell(Row, UrlColumn).Style.Font.SetFontColor(UrlColour);

    this.InsertAndFormatRobotsCell(Sheet, Row, StatusColumn, Document);

    Row++;
  }

  // Row now points one past the last written row.
  Sheet.Range(1, 1, Row - 1, StatusColumn).CreateTable();
}
/**************************************************************************/
/// <summary>
/// Writes the sitemap errors CSV worksheet. For each internal XML sitemap
/// document, every outlink is resolved to a document in the collection; a
/// record is written when that linked document is internal and either has a
/// status code in the 400-599 range or is not allowed by robots rules.
/// </summary>
/// <param name="JobMaster">Job whose document collection is exported.</param>
/// <param name="ws">CSV writer that receives the records.</param>
private void BuildWorksheetSitemapErrors(
  MacroscopeJobMaster JobMaster,
  CsvWriter ws
)
{
  MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

  // Header record.
  ws.WriteField("Sitemap URL");
  ws.WriteField("Status Code");
  ws.WriteField("Robots");
  ws.WriteField("URL");
  ws.NextRecord();

  foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
  {
    if (!msDoc.GetIsInternal() || !msDoc.IsDocumentType(Type: MacroscopeConstants.DocumentType.SITEMAPXML))
    {
      continue;
    }

    foreach (MacroscopeLink Outlink in msDoc.IterateOutlinks())
    {
      string TargetUrl = Outlink.GetTargetUrl();
      MacroscopeDocument msDocLinked = DocCollection.GetDocumentByUrl(Url: TargetUrl);

      // NOTE(review): assumes GetDocumentByUrl can return null for a URL that
      // is not in the collection; skip it rather than dereference null.
      if ((msDocLinked == null) || !msDocLinked.GetIsInternal())
      {
        continue;
      }

      int StatusCode = (int)msDocLinked.GetStatusCode();
      bool IsErrorStatus = (StatusCode >= 400) && (StatusCode <= 599);

      if (IsErrorStatus || !msDocLinked.GetAllowedByRobots())
      {
        // NOTE(review): the status code and robots cells are written from the
        // sitemap document (msDoc), not from the linked document - confirm
        // this is intended.
        this.InsertAndFormatUrlCell(ws, msDoc);
        this.InsertAndFormatStatusCodeCell(ws, msDoc);
        this.InsertAndFormatRobotsCell(ws, msDoc);
        this.InsertAndFormatUrlCell(ws, TargetUrl);
        ws.NextRecord();
      }
    }
  }
}
/**************************************************************************/
/// <summary>
/// Adds an Excel worksheet with "Email Address" and "URL" columns containing
/// one row per email address key found on each document for which
/// GetIsHtml() is true, then turns the filled range into a table.
/// </summary>
/// <param name="JobMaster">Job whose document collection is exported.</param>
/// <param name="wb">Workbook that receives the new worksheet.</param>
/// <param name="WorksheetLabel">Name of the worksheet to add.</param>
private void BuildWorksheetEmailAddresses(
  MacroscopeJobMaster JobMaster,
  XLWorkbook wb,
  string WorksheetLabel
)
{
  var ws = wb.Worksheets.Add(WorksheetLabel);
  int iRow = 1;
  int iCol = 1;
  int iColMax = 1;

  MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

  // Header row.
  ws.Cell(iRow, iCol).Value = "Email Address";
  iCol++;
  ws.Cell(iRow, iCol).Value = "URL";

  iColMax = iCol;
  iRow++;

  foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
  {
    if (!msDoc.GetIsHtml())
    {
      continue;
    }

    // Only the dictionary keys are exported; the values are not used here.
    Dictionary<string, string> EmailAddresses = msDoc.GetEmailAddresses();

    foreach (string EmailAddress in EmailAddresses.Keys)
    {
      iCol = 1;
      this.InsertAndFormatContentCell(ws, iRow, iCol, EmailAddress);
      iCol++;
      this.InsertAndFormatUrlCell(ws, iRow, iCol, msDoc);
      iRow++;
    }
  }

  // iRow now points one past the last written row.
  ws.Range(1, 1, iRow - 1, iColMax).CreateTable();
}
/**************************************************************************/
/// <summary>
/// Adds an Excel worksheet with "Telephone Number" and "URL" columns
/// containing one row per telephone number key found on each HTML document,
/// then turns the filled range into a table.
/// </summary>
/// <param name="JobMaster">Job whose document collection is exported.</param>
/// <param name="wb">Workbook that receives the new worksheet.</param>
/// <param name="WorksheetLabel">Name of the worksheet to add.</param>
private void BuildWorksheetTelephoneNumbers(
  MacroscopeJobMaster JobMaster,
  XLWorkbook wb,
  string WorksheetLabel
)
{
  var ws = wb.Worksheets.Add(WorksheetLabel);
  int iRow = 1;
  int iCol = 1;
  int iColMax = 1;

  MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

  // Header row.
  ws.Cell(iRow, iCol).Value = "Telephone Number";
  iCol++;
  ws.Cell(iRow, iCol).Value = "URL";

  iColMax = iCol;
  iRow++;

  foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
  {
    if (!msDoc.IsDocumentType(Type: MacroscopeConstants.DocumentType.HTML))
    {
      continue;
    }

    // Only the dictionary keys are exported; the values are not used here.
    Dictionary<string, string> TelephoneNumbers = msDoc.GetTelephoneNumbers();

    foreach (string TelephoneNumber in TelephoneNumbers.Keys)
    {
      iCol = 1;
      this.InsertAndFormatContentCell(ws, iRow, iCol, TelephoneNumber);
      iCol++;
      this.InsertAndFormatUrlCell(ws, iRow, iCol, msDoc);
      iRow++;
    }
  }

  // iRow now points one past the last written row.
  ws.Range(1, 1, iRow - 1, iColMax).CreateTable();
}
/**************************************************************************/
/// <summary>
/// Writes the redirects audit CSV worksheet: one record per redirecting
/// document that has both a non-empty URL and a non-empty redirect
/// destination URL.
/// </summary>
/// <param name="JobMaster">Job whose document collection is exported.</param>
/// <param name="ws">CSV writer that receives the records.</param>
private void BuildWorksheetPageRedirectsAudit(
  MacroscopeJobMaster JobMaster,
  CsvWriter ws
)
{
  MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

  // Header record.
  ws.WriteField("URL");
  ws.WriteField("Status Code");
  ws.WriteField("Status");
  ws.WriteField("Destination URL");
  ws.NextRecord();

  foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
  {
    if (!msDoc.GetIsRedirect())
    {
      continue;
    }

    string Url = msDoc.GetUrl();
    string DestinationURL = msDoc.GetUrlRedirectTo();

    // Both ends of the redirect are required for a meaningful record.
    if (string.IsNullOrEmpty(Url) || string.IsNullOrEmpty(DestinationURL))
    {
      continue;
    }

    string StatusCode = ((int)msDoc.GetStatusCode()).ToString();
    string Status = msDoc.GetStatusCode().ToString();

    this.InsertAndFormatUrlCell(ws, Url);
    this.InsertAndFormatContentCell(ws, StatusCode);
    this.InsertAndFormatContentCell(ws, Status);
    this.InsertAndFormatUrlCell(ws, DestinationURL);
    ws.NextRecord();
  }
}
/**************************************************************************/
/// <summary>
/// Splits the non-external HTML documents of the collection into those whose
/// URL appears in at least one entry of the map built by BuildSitemapUrlList
/// and those whose URL appears in none.
/// </summary>
/// <param name="DocCollection">Documents to classify.</param>
/// <returns>
/// A two-element list: index 0 holds the documents not found in any sitemap,
/// index 1 holds the documents found in a sitemap, each annotated with the
/// last matching sitemap URL as its note.
/// </returns>
public List<MacroscopeDocumentList> AnalyzeInSitemaps(MacroscopeDocumentCollection DocCollection)
{
  Dictionary<string, Dictionary<string, bool>> SitemapUrlMap = this.BuildSitemapUrlList(DocCollection: DocCollection);

  MacroscopeDocumentList Listed = new MacroscopeDocumentList();
  MacroscopeDocumentList Unlisted = new MacroscopeDocumentList();

  List<MacroscopeDocumentList> Result = new List<MacroscopeDocumentList>(2) { Unlisted, Listed };

  foreach (MacroscopeDocument Document in DocCollection.IterateDocuments())
  {
    string DocumentUrl = Document.GetUrl();

    if (Document.GetIsExternal())
    {
      continue;
    }

    if (!Document.IsDocumentType(Type: MacroscopeConstants.DocumentType.HTML))
    {
      continue;
    }

    // Every sitemap is inspected; when several contain the URL, the last one wins.
    string MatchedSitemapUrl = null;

    foreach (KeyValuePair<string, Dictionary<string, bool>> SitemapEntry in SitemapUrlMap)
    {
      if (SitemapEntry.Value.ContainsKey(DocumentUrl))
      {
        MatchedSitemapUrl = SitemapEntry.Key;
      }
    }

    if (MatchedSitemapUrl == null)
    {
      Unlisted.AddDocument(msDoc: Document);
    }
    else
    {
      Listed.AddDocument(msDoc: Document);
      Listed.AddDocumentNote(msDoc: Document, Note: MatchedSitemapUrl);
    }
  }

  return Result;
}
/**************************************************************************/
/// <summary>
/// Collects every document of type SITEMAPTEXT or SITEMAPXML from the
/// collection into a new document list.
/// </summary>
/// <param name="DocCollection">Documents to scan.</param>
/// <returns>A list containing the sitemap documents that were found.</returns>
private MacroscopeDocumentList FindSitemaps(MacroscopeDocumentCollection DocCollection)
{
  MacroscopeDocumentList Sitemaps = new MacroscopeDocumentList();

  foreach (MacroscopeDocument Document in DocCollection.IterateDocuments())
  {
    bool IsSitemap =
      Document.IsDocumentType(Type: MacroscopeConstants.DocumentType.SITEMAPTEXT)
      || Document.IsDocumentType(Type: MacroscopeConstants.DocumentType.SITEMAPXML);

    if (!IsSitemap)
    {
      continue;
    }

    Sitemaps.AddDocument(msDoc: Document);
  }

  return Sitemaps;
}
/**************************************************************************/
/// <summary>
/// Writes the URI analysis CSV worksheet: one record per document with its
/// URL, status code, status, the collection's count for the document's
/// checksum, and the checksum itself.
/// </summary>
/// <param name="JobMaster">Job whose document collection is exported.</param>
/// <param name="ws">CSV writer that receives the records.</param>
private void BuildWorksheetPageUriAnalysis(
  MacroscopeJobMaster JobMaster,
  CsvWriter ws
)
{
  MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

  // Header record.
  ws.WriteField("URL");
  ws.WriteField("Status Code");
  ws.WriteField("Status");
  ws.WriteField("Occurrences");
  ws.WriteField("Checksum");
  ws.NextRecord();

  foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
  {
    string StatusCode = ((int)msDoc.GetStatusCode()).ToString();
    string Status = msDoc.GetStatusCode().ToString();
    string Checksum = msDoc.GetChecksum();
    int Count = DocCollection.GetStatsChecksumCount(Checksum: Checksum);

    this.InsertAndFormatUrlCell(ws, msDoc);
    this.InsertAndFormatContentCell(ws, StatusCode);
    this.InsertAndFormatContentCell(ws, Status);
    this.InsertAndFormatContentCell(ws, Count.ToString());
    this.InsertAndFormatContentCell(ws, Checksum);
    ws.NextRecord();
  }
}
/**************************************************************************/
/// <summary>
/// Builds list view rows for every document whose URL contains
/// <paramref name="UrlFragment"/> (culture-sensitive IndexOf) and adds them
/// to DisplayListView in a single AddRange call.
/// </summary>
/// <param name="DocCollection">Documents to search.</param>
/// <param name="UrlFragment">Substring that a document URL must contain.</param>
public void RenderListViewSearchSourceUrls(
  MacroscopeDocumentCollection DocCollection,
  string UrlFragment
)
{
  List<ListViewItem> Rows = new List<ListViewItem>(DocCollection.CountDocuments());

  foreach (MacroscopeDocument Document in DocCollection.IterateDocuments())
  {
    string DocumentUrl = Document.GetUrl();
    int FragmentPosition = DocumentUrl.IndexOf(UrlFragment, StringComparison.CurrentCulture);

    // A negative position means the fragment does not occur in this URL.
    if (FragmentPosition < 0)
    {
      continue;
    }

    this.RenderListView(
      ListViewItems: Rows,
      msDoc: Document,
      Url: DocumentUrl
    );
  }

  this.DisplayListView.Items.AddRange(Rows.ToArray());
}
/**************************************************************************/
/// <summary>
/// Writes the observations CSV worksheet: one record per remark returned by
/// IterateRemarks() for each document, carrying the document URL, status
/// code, status and the remark value.
/// </summary>
/// <param name="JobMaster">Job whose document collection is exported.</param>
/// <param name="ws">CSV writer that receives the records.</param>
private void BuildWorksheetPageObservations(
  MacroscopeJobMaster JobMaster,
  CsvWriter ws
)
{
  MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

  // Header record.
  ws.WriteField("URL");
  ws.WriteField("Status Code");
  ws.WriteField("Status");
  ws.WriteField("Observation");
  ws.NextRecord();

  foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
  {
    string StatusCode = ((int)msDoc.GetStatusCode()).ToString();
    string Status = msDoc.GetStatusCode().ToString();

    // Only the value of each remark pair is exported; the key is not used here.
    foreach (KeyValuePair<string, string> RemarkPair in msDoc.IterateRemarks())
    {
      this.InsertAndFormatUrlCell(ws, msDoc);
      this.InsertAndFormatContentCell(ws, StatusCode);
      this.InsertAndFormatContentCell(ws, Status);
      this.InsertAndFormatContentCell(ws, RemarkPair.Value);
      ws.NextRecord();
    }
  }
}
/**************************************************************************/
/// <summary>
/// Writes the errors CSV worksheet: one record (status code, status, URL)
/// per document whose status code is in the 400-599 range.
/// </summary>
/// <param name="JobMaster">Job whose document collection is exported.</param>
/// <param name="ws">CSV writer that receives the records.</param>
private void BuildWorksheetErrors(
  MacroscopeJobMaster JobMaster,
  CsvWriter ws
)
{
  MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

  // Header record.
  ws.WriteField("Status Code");
  ws.WriteField("Status");
  ws.WriteField("URL");
  ws.NextRecord();

  foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
  {
    int StatusCode = (int)msDoc.GetStatusCode();

    if ((StatusCode < 400) || (StatusCode > 599))
    {
      continue;
    }

    string Status = msDoc.GetStatusCode().ToString();

    this.InsertAndFormatContentCell(ws, StatusCode.ToString());
    this.InsertAndFormatContentCell(ws, Status);
    this.InsertAndFormatUrlCell(ws, msDoc.GetUrl());
    ws.NextRecord();
  }
}
/**************************************************************************/
/// <summary>
/// Builds list view rows for every non-null document in the collection by
/// calling the per-document RenderListViewSearchTargetUrls overload, then
/// adds the rows to DisplayListView in a single AddRange call.
/// </summary>
/// <param name="DocCollection">Documents to render.</param>
/// <param name="UrlFragment">Fragment passed through to the per-document overload.</param>
public void RenderListViewSearchTargetUrls(
  MacroscopeDocumentCollection DocCollection,
  string UrlFragment
)
{
  List<ListViewItem> ListViewItems = new List<ListViewItem>(DocCollection.CountDocuments());

  foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
  {
    // The URL is read only after the null check; previously GetUrl() was
    // called first, which made the check useless.
    if (msDoc == null)
    {
      continue;
    }

    this.RenderListViewSearchTargetUrls(
      ListViewItems: ListViewItems,
      msDoc: msDoc,
      Url: msDoc.GetUrl(),
      UrlFragment: UrlFragment
    );
  }

  this.DisplayListView.Items.AddRange(ListViewItems.ToArray());
}
/**************************************************************************/
/// <summary>
/// Writes the page text CSV worksheet: one record per HTML or PDF document
/// that is neither external nor a redirect, carrying locale, language,
/// detected language, word count and readability details.
/// </summary>
/// <param name="JobMaster">Job whose document collection is exported.</param>
/// <param name="ws">CSV writer that receives the records.</param>
private void BuildWorksheetPageText(
  MacroscopeJobMaster JobMaster,
  CsvWriter ws
)
{
  MacroscopeDocumentCollection Documents = JobMaster.GetDocCollection();

  string[] ColumnHeadings =
  {
    "URL",
    "Page Locale",
    "Page Language",
    "Detected Language",
    "Word Count",
    "Readability Method",
    "Readability Grade",
    "Readability Grade Description"
  };

  foreach (string ColumnHeading in ColumnHeadings)
  {
    ws.WriteField(ColumnHeading);
  }

  ws.NextRecord();

  foreach (MacroscopeDocument Document in Documents.IterateDocuments())
  {
    if (Document.GetIsExternal())
    {
      continue;
    }

    if (Document.GetIsRedirect())
    {
      continue;
    }

    // Only HTML and PDF documents are exported.
    MacroscopeConstants.DocumentType Kind = Document.GetDocumentType();

    if ((Kind != MacroscopeConstants.DocumentType.HTML) && (Kind != MacroscopeConstants.DocumentType.PDF))
    {
      continue;
    }

    string Locale = Document.GetLocale();
    string Language = Document.GetIsoLanguageCode();
    string TextLanguage = Document.GetDocumentTextLanguage();
    int Words = Document.GetWordCount();
    string GradeMethod = MacroscopeAnalyzeReadability.FormatAnalyzeReadabilityMethod(ReadabilityMethod: Document.GetReadabilityGradeMethod());
    string Grade = Document.GetReadabilityGrade().ToString("00.00");
    string GradeDescription = Document.GetReadabilityGradeDescription();

    // Null values become empty strings before they reach FormatIfMissing.
    Locale = string.IsNullOrEmpty(Locale) ? "" : Locale;
    Language = string.IsNullOrEmpty(Language) ? "" : Language;
    TextLanguage = string.IsNullOrEmpty(TextLanguage) ? "" : TextLanguage;

    this.InsertAndFormatUrlCell(ws, Document);
    this.InsertAndFormatContentCell(ws, this.FormatIfMissing(Locale));
    this.InsertAndFormatContentCell(ws, this.FormatIfMissing(Language));
    this.InsertAndFormatContentCell(ws, this.FormatIfMissing(TextLanguage));
    this.InsertAndFormatContentCell(ws, this.FormatIfMissing(Words.ToString()));
    this.InsertAndFormatContentCell(ws, this.FormatIfMissing(GradeMethod));
    this.InsertAndFormatContentCell(ws, this.FormatIfMissing(Grade));
    this.InsertAndFormatContentCell(ws, this.FormatIfMissing(GradeDescription));
    ws.NextRecord();
  }
}
/**************************************************************************/
/// <summary>
/// Recomputes the site speed panel from the collection: gathers the download
/// duration of every internal, downloaded document, picks up to 20 of the
/// lowest and up to 20 of the highest durations, computes the average
/// duration, and renders all three on the UI thread. Does nothing when either
/// list view is disposed or the collection is empty.
/// </summary>
/// <remarks>
/// Durations are used as keys, so when several documents share exactly the
/// same duration only the last URL seen for that duration is kept.
/// </remarks>
/// <param name="DocCollection">Documents to measure.</param>
public void RefreshSiteSpeedData(MacroscopeDocumentCollection DocCollection)
{
  if (this.lvListViewFastest.IsDisposed || this.lvListViewSlowest.IsDisposed)
  {
    return;
  }

  if (!(DocCollection.CountDocuments() > 0))
  {
    return;
  }

  const int MeasurePages = 20;
  decimal TotalDuration = 0;
  int MeasuredCount = 0;

  SortedList<decimal, string> UrlByDuration = new SortedList<decimal, string>(DocCollection.CountDocuments(), this.DecimalSorterAscending);
  SortedList<decimal, string> Slowest = new SortedList<decimal, string>(MeasurePages, this.DecimalSorterDescending);
  SortedList<decimal, string> Fastest = new SortedList<decimal, string>(MeasurePages, this.DecimalSorterAscending);

  foreach (MacroscopeDocument Document in DocCollection.IterateDocuments())
  {
    string DocumentUrl = Document.GetUrl();
    decimal Seconds = Document.GetDurationInSeconds();

    if (!Document.GetIsInternal() || !Document.GetWasDownloaded())
    {
      continue;
    }

    MeasuredCount++;
    TotalDuration += Seconds;

    // The indexer adds a new key or overwrites the URL stored for an existing one.
    UrlByDuration[Seconds] = DocumentUrl;
  }

  // Keys are in ascending order: the head holds the fastest pages, the tail the slowest.
  foreach (decimal Seconds in UrlByDuration.Keys.Take(MeasurePages))
  {
    Fastest.Add(Seconds, UrlByDuration[Seconds]);
  }

  foreach (decimal Seconds in UrlByDuration.Keys.Reverse().Take(MeasurePages))
  {
    Slowest.Add(Seconds, UrlByDuration[Seconds]);
  }

  decimal Average = (MeasuredCount > 0) ? (TotalDuration / MeasuredCount) : 0;

  // One rendering routine, run directly or marshalled through MainForm.Invoke as required.
  MethodInvoker Render = delegate
  {
    Cursor.Current = Cursors.WaitCursor;
    this.RenderSiteSpeedListView(this.lvListViewSlowest, Slowest);
    this.RenderSiteSpeedListView(this.lvListViewFastest, Fastest);
    this.UpdateAverageLabel(Average);
    Cursor.Current = Cursors.Default;
  };

  if (this.MainForm.InvokeRequired)
  {
    this.MainForm.Invoke(Render);
  }
  else
  {
    Render();
  }
}
/**************************************************************************/
/// <summary>
/// Adds an Excel worksheet listing documents whose checksum occurs more than
/// once in the collection. A first pass counts occurrences per checksum; a
/// second pass writes one row per document for every checksum counted more
/// than once, updating the progress form as it goes, and finally turns the
/// filled range into a table.
/// </summary>
/// <param name="JobMaster">Job whose document collection is exported.</param>
/// <param name="wb">Workbook that receives the new worksheet.</param>
/// <param name="WorksheetLabel">Name of the worksheet to add.</param>
private void BuildWorksheetPageDuplicateChecksums(
  MacroscopeJobMaster JobMaster,
  XLWorkbook wb,
  string WorksheetLabel
)
{
  var ws = wb.Worksheets.Add(WorksheetLabel);
  int iRow = 1;
  int iCol = 1;
  int iColMax = 1;

  decimal CountOuter = 0;
  decimal CountInner = 0;

  MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

  // Checksum -> number of documents carrying it.
  Dictionary<string, int> DuplicatesList = new Dictionary<string, int>(DocCollection.CountDocuments());

  // URL -> document, for every document that has a checksum.
  Dictionary<string, MacroscopeDocument> DuplicatesDocList = new Dictionary<string, MacroscopeDocument>(DocCollection.CountDocuments());

  decimal DocCount = (decimal)DocCollection.CountDocuments();

  foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
  {
    string Checksum = msDoc.GetChecksum();

    if (string.IsNullOrEmpty(Checksum))
    {
      continue;
    }

    string Url = msDoc.GetUrl();

    if (!DuplicatesDocList.ContainsKey(Url))
    {
      DuplicatesDocList.Add(Url, msDoc);
    }

    // TryGetValue leaves Seen at 0 when the checksum has not been counted yet.
    int Seen;
    DuplicatesList.TryGetValue(Checksum, out Seen);
    DuplicatesList[Checksum] = Seen + 1;
  }

  // Header row.
  ws.Cell(iRow, iCol).Value = "Status Code";
  iCol++;
  ws.Cell(iRow, iCol).Value = "Status";
  iCol++;
  ws.Cell(iRow, iCol).Value = "Occurrences";
  iCol++;
  ws.Cell(iRow, iCol).Value = "Checksum";
  iCol++;
  ws.Cell(iRow, iCol).Value = "URL";

  iColMax = iCol;
  iRow++;

  foreach (KeyValuePair<string, int> Duplicate in DuplicatesList)
  {
    string Checksum = Duplicate.Key;
    int Occurrences = Duplicate.Value;

    CountOuter++;
    CountInner = 0;

    // Checksums seen only once are not duplicates.
    if (Occurrences <= 1)
    {
      continue;
    }

    foreach (MacroscopeDocument msDoc in DuplicatesDocList.Values)
    {
      CountInner++;

      if (DocCount > 0)
      {
        this.ProgressForm.UpdatePercentages(
          Title: null,
          Message: null,
          MajorPercentage: -1,
          ProgressLabelMajor: string.Format("Documents Processed: {0}", CountOuter),
          MinorPercentage: ((decimal)100 / DocCount) * CountOuter,
          ProgressLabelMinor: Checksum,
          SubMinorPercentage: ((decimal)100 / DocCount) * CountInner,
          ProgressLabelSubMinor: msDoc.GetUrl()
        );
      }

      if (msDoc.GetChecksum() != Checksum)
      {
        continue;
      }

      iCol = 1;

      int StatusCode = (int)msDoc.GetStatusCode();
      HttpStatusCode Status = msDoc.GetStatusCode();

      this.InsertAndFormatStatusCodeCell(ws, iRow, iCol, StatusCode);
      iCol++;
      this.InsertAndFormatStatusCodeCell(ws, iRow, iCol, Status);
      iCol++;
      this.InsertAndFormatContentCell(ws, iRow, iCol, Occurrences);
      iCol++;
      this.InsertAndFormatContentCell(ws, iRow, iCol, msDoc.GetChecksum());
      iCol++;
      this.InsertAndFormatUrlCell(ws, iRow, iCol, msDoc);

      iRow++;
    }
  }

  // iRow now points one past the last written row.
  ws.Range(1, 1, iRow - 1, iColMax).CreateTable();
}
/**************************************************************************/
/// <summary>
/// Adds an Excel worksheet with one row per remark returned by
/// IterateRemarks() for each document (URL, status code, status,
/// observation), colours the URL cell green or gray depending on
/// AllowedHosts.IsInternalUrl, and turns the filled range into a table.
/// </summary>
/// <param name="JobMaster">Job whose document collection is exported.</param>
/// <param name="wb">Workbook that receives the new worksheet.</param>
/// <param name="WorksheetLabel">Name of the worksheet to add.</param>
private void BuildWorksheetPageObservations(
  MacroscopeJobMaster JobMaster,
  XLWorkbook wb,
  string WorksheetLabel
)
{
  var Sheet = wb.Worksheets.Add(WorksheetLabel);

  MacroscopeDocumentCollection Documents = JobMaster.GetDocCollection();
  MacroscopeAllowedHosts Hosts = JobMaster.GetAllowedHosts();

  string[] ColumnHeadings = { "URL", "Status Code", "Status", "Observation" };
  int LastColumn = ColumnHeadings.Length;
  int Row = 1;

  // Header row: columns are 1-based.
  for (int Index = 0; Index < ColumnHeadings.Length; Index++)
  {
    Sheet.Cell(Row, Index + 1).Value = ColumnHeadings[Index];
  }

  Row++;

  foreach (MacroscopeDocument Document in Documents.IterateDocuments())
  {
    string DocumentUrl = Document.GetUrl();
    string StatusCodeText = ((int)Document.GetStatusCode()).ToString();
    string StatusText = Document.GetStatusCode().ToString();

    foreach (KeyValuePair<string, string> Remark in Document.IterateRemarks())
    {
      int Column = 1;

      this.InsertAndFormatUrlCell(Sheet, Row, Column, Document);

      // Green for URLs the allowed-hosts list treats as internal, gray otherwise.
      XLColor UrlColour = Hosts.IsInternalUrl(Url: DocumentUrl) ? XLColor.Green : XLColor.Gray;
      Sheet.Cell(Row, Column).Style.Font.SetFontColor(UrlColour);

      Column++;
      this.InsertAndFormatContentCell(Sheet, Row, Column, StatusCodeText);

      Column++;
      this.InsertAndFormatContentCell(Sheet, Row, Column, StatusText);

      Column++;
      this.InsertAndFormatContentCell(Sheet, Row, Column, Remark.Value);

      Row++;
    }
  }

  // Row now points one past the last written row.
  Sheet.Range(1, 1, Row - 1, LastColumn).CreateTable();
}
/**************************************************************************/
/// <summary>
/// Writes the duplicate ETags CSV worksheet. A first pass counts occurrences
/// per ETag; a second pass writes one record per document for every ETag
/// counted more than once, updating the progress form as it goes.
/// </summary>
/// <param name="JobMaster">Job whose document collection is exported.</param>
/// <param name="ws">CSV writer that receives the records.</param>
private void BuildWorksheetPageDuplicateEtags(
  MacroscopeJobMaster JobMaster,
  CsvWriter ws
)
{
  decimal CountOuter = 0;
  decimal CountInner = 0;

  MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

  // ETag -> number of documents carrying it.
  Dictionary<string, int> DuplicatesList = new Dictionary<string, int>(DocCollection.CountDocuments());

  // URL -> document, for every document that has an ETag.
  Dictionary<string, MacroscopeDocument> DuplicatesDocList = new Dictionary<string, MacroscopeDocument>(DocCollection.CountDocuments());

  decimal DocCount = (decimal)DocCollection.CountDocuments();

  foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
  {
    string Etag = msDoc.GetEtag();

    if (string.IsNullOrEmpty(Etag))
    {
      continue;
    }

    string Url = msDoc.GetUrl();

    if (!DuplicatesDocList.ContainsKey(Url))
    {
      DuplicatesDocList.Add(Url, msDoc);
    }

    // TryGetValue leaves Seen at 0 when the ETag has not been counted yet.
    int Seen;
    DuplicatesList.TryGetValue(Etag, out Seen);
    DuplicatesList[Etag] = Seen + 1;
  }

  // Header record.
  ws.WriteField("Status Code");
  ws.WriteField("Status");
  ws.WriteField("Occurrences");
  ws.WriteField("ETag");
  ws.WriteField("URL");
  ws.NextRecord();

  foreach (KeyValuePair<string, int> Duplicate in DuplicatesList)
  {
    string Etag = Duplicate.Key;
    int Occurrences = Duplicate.Value;

    CountOuter++;
    CountInner = 0;

    // ETags seen only once are not duplicates.
    if (Occurrences <= 1)
    {
      continue;
    }

    foreach (MacroscopeDocument msDoc in DuplicatesDocList.Values)
    {
      CountInner++;

      if (DocCount > 0)
      {
        this.ProgressForm.UpdatePercentages(
          Title: null,
          Message: null,
          MajorPercentage: -1,
          ProgressLabelMajor: string.Format("Documents Processed: {0}", CountOuter),
          MinorPercentage: ((decimal)100 / DocCount) * CountOuter,
          ProgressLabelMinor: Etag,
          SubMinorPercentage: ((decimal)100 / DocCount) * CountInner,
          ProgressLabelSubMinor: msDoc.GetUrl()
        );
      }

      if (msDoc.GetEtag() != Etag)
      {
        continue;
      }

      int StatusCode = (int)msDoc.GetStatusCode();
      HttpStatusCode Status = msDoc.GetStatusCode();

      this.InsertAndFormatStatusCodeCell(ws, StatusCode);
      this.InsertAndFormatStatusCodeCell(ws, Status);
      this.InsertAndFormatContentCell(ws, Occurrences);
      this.InsertAndFormatContentCell(ws, msDoc.GetEtag());
      this.InsertAndFormatUrlCell(ws, msDoc);
      ws.NextRecord();
    }
  }
}
/**************************************************************************/
/// <summary>
/// Writes the overview CSV worksheet: a header record followed by one record
/// per document with its crawl, HTTP, link and metadata details.
/// </summary>
/// <remarks>
/// The header for the inbound hyperlink ratio column is "Ratio In"; it was
/// previously emitted with a typo as "Ration In".
/// </remarks>
/// <param name="JobMaster">Job whose document collection is exported.</param>
/// <param name="ws">CSV writer that receives the records.</param>
private void BuildWorksheetOverview(
  MacroscopeJobMaster JobMaster,
  CsvWriter ws
)
{
  MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

  // Header record; the order must match the cell order written per document below.
  string[] ColumnHeadings =
  {
    "URL",
    "Status Code",
    "Status",
    "Redirect",
    "Robots",
    "Duration",
    "Crawled Date",
    "Server Date",
    "Modified Date",
    "Expires Date",
    "Content-Type",
    "Charset",
    "Locale",
    "Language",
    "Canonical",
    "Page Depth",
    "Links In",
    "Links Out",
    "Hyperlinks In",
    "Hyperlinks Out",
    "Ratio In",
    "Ratio Out",
    "Author",
    "Title",
    "Title Length",
    "Description",
    "Description Length",
    "Keywords",
    "Keywords Length",
    "Keywords Count",
    "Error Condition"
  };

  foreach (string ColumnHeading in ColumnHeadings)
  {
    ws.WriteField(ColumnHeading);
  }

  ws.NextRecord();

  foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
  {
    // Element 0 is written under "Ratio In", element 1 under "Ratio Out".
    List<decimal> HyperlinkRatio = DocCollection.GetDocumentHyperlinksRatio(Url: msDoc.GetUrl());

    this.InsertAndFormatUrlCell(ws, msDoc);
    this.InsertAndFormatStatusCodeCell(ws, msDoc);
    this.InsertAndFormatContentCell(ws, msDoc.GetStatusCode().ToString());
    this.InsertAndFormatRedirectCell(ws, msDoc);
    this.InsertAndFormatRobotsCell(ws, msDoc);
    this.InsertAndFormatContentCell(ws, msDoc.GetDurationInSecondsFormatted());
    this.InsertAndFormatContentCell(ws, this.FormatIfMissing(msDoc.GetCrawledDate()));
    this.InsertAndFormatContentCell(ws, this.FormatIfMissing(msDoc.GetDateServer()));
    this.InsertAndFormatContentCell(ws, this.FormatIfMissing(msDoc.GetDateModified()));
    this.InsertAndFormatContentCell(ws, this.FormatIfMissing(msDoc.GetDateExpires()));
    this.InsertAndFormatContentCell(ws, this.FormatIfMissing(msDoc.GetMimeType()));
    this.InsertAndFormatContentCell(ws, this.FormatIfMissing(msDoc.GetCharacterSet()));
    this.InsertAndFormatContentCell(ws, this.FormatIfMissing(msDoc.GetLocale()));
    this.InsertAndFormatContentCell(ws, this.FormatIfMissing(msDoc.GetIsoLanguageCode()));
    this.InsertAndFormatContentCell(ws, this.FormatIfMissing(msDoc.GetCanonical()));
    this.InsertAndFormatContentCell(ws, msDoc.GetDepth().ToString());
    this.InsertAndFormatContentCell(ws, msDoc.CountInlinks().ToString());
    this.InsertAndFormatContentCell(ws, msDoc.CountOutlinks().ToString());
    this.InsertAndFormatContentCell(ws, msDoc.CountHyperlinksIn().ToString());
    this.InsertAndFormatContentCell(ws, msDoc.CountHyperlinksOut().ToString());
    this.InsertAndFormatContentCell(ws, this.FormatIfMissing(string.Format("{0:0.00}%", HyperlinkRatio[0])));
    this.InsertAndFormatContentCell(ws, this.FormatIfMissing(string.Format("{0:0.00}%", HyperlinkRatio[1])));
    this.InsertAndFormatContentCell(ws, msDoc.GetAuthor());
    this.InsertAndFormatContentCell(ws, this.FormatIfMissing(msDoc.GetTitle()));
    this.InsertAndFormatContentCell(ws, msDoc.GetTitleLength().ToString());
    this.InsertAndFormatContentCell(ws, this.FormatIfMissing(msDoc.GetDescription()));
    this.InsertAndFormatContentCell(ws, msDoc.GetDescriptionLength().ToString());
    this.InsertAndFormatContentCell(ws, this.FormatIfMissing(msDoc.GetKeywords()));
    this.InsertAndFormatContentCell(ws, msDoc.GetKeywordsLength().ToString());
    this.InsertAndFormatContentCell(ws, msDoc.GetKeywordsCount().ToString());
    this.InsertAndFormatContentCell(ws, this.FormatIfMissing(msDoc.GetErrorCondition()));
    ws.NextRecord();
  }
}
/**************************************************************************/
/// <summary>
/// Writes the page headings worksheet: a header record with the columns
/// URL, Occurrences, Order and H1..H6, followed by one record per heading
/// found in each HTML document that is neither external nor a redirect.
/// </summary>
/// <remarks>
/// Each heading's text is written under the H-column that matches its
/// heading level; the other heading columns of that record are left empty
/// so every data record lines up with the header.
/// </remarks>
/// <param name="JobMaster">Job whose document collection is reported.</param>
/// <param name="ws">CSV writer that receives the header and data records.</param>
private void BuildWorksheetPageHeadings(
    MacroscopeJobMaster JobMaster,
    CsvWriter ws
)
{
    // Number of heading columns (H1..H6) declared in the header record.
    const int HeadingColumnCount = 6;

    MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

    ws.WriteField("URL");
    ws.WriteField("Occurrences");
    ws.WriteField("Order");

    for (int i = 1; i <= HeadingColumnCount; i++)
    {
        ws.WriteField(string.Format("H{0}", i));
    }

    ws.NextRecord();

    foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
    {
        if (msDoc.GetIsExternal() || msDoc.GetIsRedirect())
        {
            continue;
        }

        if (!msDoc.IsDocumentType(Type: MacroscopeConstants.DocumentType.HTML))
        {
            continue;
        }

        for (ushort HeadingLevel = 1; HeadingLevel <= MacroscopePreferencesManager.GetMaxHeadingDepth(); HeadingLevel++)
        {
            List<string> HeadingsList = msDoc.GetHeadings(HeadingLevel);

            for (int Order = 0; Order < HeadingsList.Count; Order++)
            {
                string HeadingText = HeadingsList[Order];
                int Occurrences = DocCollection.GetStatsHeadingsCount(HeadingLevel: HeadingLevel, Text: HeadingText);

                this.InsertAndFormatUrlCell(ws, msDoc);
                this.InsertAndFormatContentCell(ws, Occurrences.ToString());
                this.InsertAndFormatContentCell(ws, this.FormatIfMissing((Order + 1).ToString()));

                // Empty cells for the heading levels above this one, so the
                // text lands under the column of its own level.
                for (int Column = 1; Column < HeadingLevel; Column++)
                {
                    ws.WriteField("");
                }

                this.InsertAndFormatContentCell(ws, this.FormatIfMissing(HeadingText));

                // Empty cells for the remaining heading columns, so the record
                // has as many fields as the header.
                for (int Column = HeadingLevel + 1; Column <= HeadingColumnCount; Column++)
                {
                    ws.WriteField("");
                }

                ws.NextRecord();
            }
        }
    }
}
/**************************************************************************/
/// <summary>
/// Adds a worksheet named <paramref name="WorksheetLabel"/> to the workbook
/// and fills it with one row per outlink of every document in the job's
/// document collection, then turns the written range into a table.
/// </summary>
/// <param name="JobMaster">Job supplying the document collection and allowed hosts.</param>
/// <param name="wb">Workbook that receives the new worksheet.</param>
/// <param name="WorksheetLabel">Name of the worksheet to add.</param>
private void BuildWorksheetPageLinks(
    MacroscopeJobMaster JobMaster,
    XLWorkbook wb,
    string WorksheetLabel
)
{
    var ws = wb.Worksheets.Add(WorksheetLabel);

    MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();
    MacroscopeAllowedHosts AllowedHosts = JobMaster.GetAllowedHosts();

    // Header row; the array order defines the column order.
    string[] ColumnTitles =
    {
        "URL",
        "Link Type",
        "Source URL",
        "Target URL",
        "Follow",
        "Alt Text",
        "Raw Source URL",
        "Raw Target URL"
    };

    for (int Column = 1; Column <= ColumnTitles.Length; Column++)
    {
        ws.Cell(1, Column).Value = ColumnTitles[Column - 1];
    }

    // Data rows start directly below the header.
    int RowIndex = 2;

    foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
    {
        foreach (MacroscopeLink Link in msDoc.IterateOutlinks())
        {
            string LinkType = Link.GetLinkType().ToString();
            string SourceUrl = Link.GetSourceUrl();
            string TargetUrl = Link.GetTargetUrl();

            // Null values are replaced by empty strings before formatting.
            string AltText = Link.GetAltText() ?? "";
            string RawSourceUrl = Link.GetRawSourceUrl() ?? "";
            string RawTargetUrl = Link.GetRawTargetUrl() ?? "";

            string FollowLabel = Link.GetDoFollow() ? "Follow" : "No Follow";

            // First column: the document URL, green when the allowed hosts
            // report it as internal, gray otherwise.
            this.InsertAndFormatUrlCell(ws, RowIndex, 1, msDoc);

            XLColor UrlColour = AllowedHosts.IsInternalUrl(Url: msDoc.GetUrl())
                ? XLColor.Green
                : XLColor.Gray;

            ws.Cell(RowIndex, 1).Style.Font.SetFontColor(UrlColour);

            // Remaining columns, in header order, starting at column 2.
            string[] CellValues =
            {
                LinkType,
                SourceUrl,
                TargetUrl,
                FollowLabel,
                AltText,
                RawSourceUrl,
                RawTargetUrl
            };

            for (int Offset = 0; Offset < CellValues.Length; Offset++)
            {
                this.InsertAndFormatContentCell(ws, RowIndex, Offset + 2, this.FormatIfMissing(CellValues[Offset]));
            }

            RowIndex++;
        }
    }

    // Cover the header plus every written row with a table.
    ws.Range(1, 1, RowIndex - 1, ColumnTitles.Length).CreateTable();
}
/**************************************************************************/
/// <summary>
/// Adds a worksheet named <paramref name="WorksheetLabel"/> to the workbook
/// listing documents whose title occurs more than once, then turns the
/// written range into a table.
/// </summary>
/// <remarks>
/// Only documents whose URL the allowed hosts report as internal, and whose
/// document type is HTML or PDF, are considered. The progress form is
/// updated once per document, whether or not a row is written for it.
/// </remarks>
/// <param name="JobMaster">Job supplying the document collection and allowed hosts.</param>
/// <param name="wb">Workbook that receives the new worksheet.</param>
/// <param name="WorksheetLabel">Name of the worksheet to add.</param>
private void BuildWorksheetPageDuplicateTitles(
    MacroscopeJobMaster JobMaster,
    XLWorkbook wb,
    string WorksheetLabel
)
{
    const int UrlColumn = 1;
    const int OccurrencesColumn = 2;
    const int TitleColumn = 3;

    var ws = wb.Worksheets.Add(WorksheetLabel);

    int iRow = 1;
    decimal Count = 0;

    MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();
    MacroscopeAllowedHosts AllowedHosts = JobMaster.GetAllowedHosts();

    decimal DocCount = (decimal)DocCollection.CountDocuments();

    // Header row.
    ws.Cell(iRow, UrlColumn).Value = "URL";
    ws.Cell(iRow, OccurrencesColumn).Value = "Occurrences";
    ws.Cell(iRow, TitleColumn).Value = "Title";
    iRow++;

    foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
    {
        // Guard against dividing by zero when computing the percentage.
        if (DocCount > 0)
        {
            Count++;

            this.ProgressForm.UpdatePercentages(
                Title: null,
                Message: null,
                MajorPercentage: -1,
                ProgressLabelMajor: string.Format("Documents Processed: {0}", Count),
                MinorPercentage: ((decimal)100 / DocCount) * Count,
                ProgressLabelMinor: msDoc.GetUrl(),
                SubMinorPercentage: -1,
                ProgressLabelSubMinor: null
            );
        }

        if (!AllowedHosts.IsInternalUrl(Url: msDoc.GetUrl()))
        {
            continue;
        }

        bool Proceed;

        switch (msDoc.GetDocumentType())
        {
            case MacroscopeConstants.DocumentType.HTML:
            case MacroscopeConstants.DocumentType.PDF:
                Proceed = true;
                break;
            default:
                Proceed = false;
                break;
        }

        if (!Proceed)
        {
            continue;
        }

        string Title = msDoc.GetTitle();
        int Occurrences = DocCollection.GetStatsTitleCount(msDoc: msDoc);

        // Only titles that occur more than once are reported.
        if (Occurrences <= 1)
        {
            continue;
        }

        this.InsertAndFormatUrlCell(ws, iRow, UrlColumn, msDoc);

        if (msDoc.GetIsInternal())
        {
            ws.Cell(iRow, UrlColumn).Style.Font.SetFontColor(XLColor.Green);
        }
        else
        {
            ws.Cell(iRow, UrlColumn).Style.Font.SetFontColor(XLColor.Gray);
        }

        // Every row written here has Occurrences > 1, so the count is
        // always shown in orange.
        this.InsertAndFormatContentCell(ws, iRow, OccurrencesColumn, Occurrences);
        ws.Cell(iRow, OccurrencesColumn).Style.Font.SetFontColor(XLColor.Orange);

        this.InsertAndFormatContentCell(ws, iRow, TitleColumn, this.FormatIfMissing(Title));

        iRow++;
    }

    // Cover the header plus every written row with a table.
    ws.Range(1, 1, iRow - 1, TitleColumn).CreateTable();
}