/** Render Document List **************************************************/

/// <summary>
/// Renders each non-null document in <paramref name="DocList"/> through the
/// per-document RenderListView overload, then appends the collected rows to
/// DisplayListView with a single AddRange call.
/// </summary>
/// <param name="DocList">Documents to render. A null or empty list is a no-op.</param>
public void RenderListView(List<MacroscopeDocument> DocList)
{

  if ((DocList == null) || (DocList.Count == 0))
  {
    return;
  }

  MacroscopeDocumentCollection DocCollection = this.MainForm.GetJobMaster().GetDocCollection();
  List<ListViewItem> ListViewItems = new List<ListViewItem>(DocList.Count);

  // Read the preference once so the "update" and "close" decisions always agree.
  bool ShowProgress = MacroscopePreferencesManager.GetShowProgressDialogues();

  decimal Count = 0;
  decimal TotalDocs = (decimal)DocList.Count;
  decimal MajorPercentage = ((decimal)100 / TotalDocs) * Count;

  MacroscopeSinglePercentageProgressForm ProgressForm = new MacroscopeSinglePercentageProgressForm(this.MainForm);

  try
  {

    if (ShowProgress)
    {
      ProgressForm.ControlBox = false;
      ProgressForm.UpdatePercentages(
        Title: "Preparing Display",
        Message: "Processing document collection for display:",
        MajorPercentage: MajorPercentage,
        ProgressLabelMajor: string.Format("Document {0} / {1}", Count, TotalDocs)
      );
    }

    for (int i = 0; i < DocList.Count; i++)
    {

      MacroscopeDocument msDoc = DocList[i];

      if (msDoc != null)
      {
        this.RenderListView(
          ListViewItems: ListViewItems,
          DocCollection: DocCollection,
          msDoc: msDoc,
          Url: msDoc.GetUrl()
        );
      }

      if (ShowProgress)
      {
        Count++;
        MajorPercentage = ((decimal)100 / TotalDocs) * Count;
        ProgressForm.UpdatePercentages(
          Title: null,
          Message: null,
          MajorPercentage: MajorPercentage,
          ProgressLabelMajor: string.Format("Document {0} / {1}", Count, TotalDocs)
        );
      }

    }

    this.DisplayListView.Items.AddRange(ListViewItems.ToArray());

  }
  finally
  {
    // Always release the progress form, even when a renderer throws; its
    // ControlBox is disabled above, so the user could not dismiss it manually.
    if (ShowProgress)
    {
      ProgressForm.DoClose();
    }
    ProgressForm.Dispose();
  }

}
/**************************************************************************/

/// <summary>
/// Adds or refreshes the list row for one JavaScript document.
/// Redirects and documents that are not of type JAVASCRIPT are ignored.
/// A row already present in DisplayListView (looked up by key) is updated in
/// place; otherwise a new row is appended to <paramref name="ListViewItems"/>.
/// </summary>
/// <param name="ListViewItems">Receives newly created rows.</param>
/// <param name="DocCollection">Not used by this override.</param>
/// <param name="msDoc">Document to render.</param>
/// <param name="Url">URL shown in the first column and used as the row key.</param>
protected override void RenderListView(
  List<ListViewItem> ListViewItems,
  MacroscopeDocumentCollection DocCollection,
  MacroscopeDocument msDoc,
  string Url
)
{

  bool IsJavascript = !msDoc.GetIsRedirect()
                      && msDoc.IsDocumentType(Type: MacroscopeConstants.DocumentType.JAVASCRIPT);

  if (!IsJavascript)
  {
    return;
  }

  string CodeText = msDoc.GetStatusCode().ToString();
  string ContentType = msDoc.GetMimeType();
  string SizeText = msDoc.GetContentLength().ToString();
  string RowKey = string.Join("", Url);

  ListViewItem Row = null;
  bool AlreadyListed = this.DisplayListView.Items.ContainsKey(RowKey);

  try
  {
    if (AlreadyListed)
    {
      Row = this.DisplayListView.Items[RowKey];
      Row.SubItems[0].Text = Url;
      Row.SubItems[1].Text = CodeText;
      Row.SubItems[2].Text = ContentType;
      Row.SubItems[3].Text = SizeText;
    }
    else
    {
      Row = new ListViewItem(RowKey);
      Row.UseItemStyleForSubItems = false;
      Row.Name = RowKey;
      Row.SubItems[0].Text = Url;
      Row.SubItems.Add(CodeText);
      Row.SubItems.Add(ContentType);
      Row.SubItems.Add(SizeText);
      ListViewItems.Add(Row);
    }
  }
  catch (Exception ex)
  {
    // "1" marks a failed update of an existing row, "2" a failed creation.
    DebugMsg(
      string.Format(
        AlreadyListed ? "MacroscopeDisplayJavascripts 1: {0}" : "MacroscopeDisplayJavascripts 2: {0}",
        ex.Message
      )
    );
  }

  if (Row == null)
  {
    return;
  }

  Row.ForeColor = Color.Blue;

  // URL: green for internal documents, gray otherwise.
  Color UrlColor = msDoc.GetIsInternal() ? Color.Green : Color.Gray;
  Row.SubItems[0].ForeColor = UrlColor;

  // Status code: green only for HttpStatusCode.OK, red for anything else.
  Color CodeColor = (msDoc.GetStatusCode() != HttpStatusCode.OK) ? Color.Red : Color.Green;
  Row.SubItems[1].ForeColor = CodeColor;

}
/**************************************************************************/

/// <summary>
/// Adds or refreshes one list row per outlink of <paramref name="msDoc"/> whose
/// target URL contains <paramref name="UrlFragment"/> (culture-sensitive match).
/// Rows already present in DisplayListView are updated in place; new rows are
/// appended to <paramref name="ListViewItems"/>.
/// </summary>
/// <param name="ListViewItems">Receives newly created rows.</param>
/// <param name="msDoc">Document whose outlinks are examined.</param>
/// <param name="Url">URL of the source document, shown in the URL column.</param>
/// <param name="UrlFragment">Substring that the link target must contain.</param>
private void RenderListViewSearchTargetUrls(
  List<ListViewItem> ListViewItems,
  MacroscopeDocument msDoc,
  string Url,
  string UrlFragment
)
{

  MacroscopeAllowedHosts AllowedHosts = this.MainForm.GetJobMaster().GetAllowedHosts();
  MacroscopeDocumentCollection DocCollection = this.MainForm.GetJobMaster().GetDocCollection();

  foreach (MacroscopeLink Link in msDoc.IterateOutlinks())
  {

    string UrlTarget = Link.GetTargetUrl();

    // Filter first: links that do not match need no digest or collection lookup,
    // and a null target must not reach UrlToDigest.
    if ((UrlTarget == null) || (UrlTarget.IndexOf(UrlFragment, StringComparison.CurrentCulture) < 0))
    {
      continue;
    }

    string LinkType = Link.GetLinkType().ToString();
    string StatusCodeText = "Not crawled";
    string StatusText = "Not crawled";
    string PairKey = string.Join(":", UrlToDigest(Url: Url), UrlToDigest(Url: UrlTarget)).ToString();
    bool IsDoFollow = Link.GetDoFollow();
    string DoFollow = IsDoFollow ? "Follow" : "No Follow";
    string AltText = Link.GetAltText();
    string AltTextLabel = string.IsNullOrEmpty(AltText) ? "" : AltText;
    string RawSourceUrl = Link.GetRawSourceUrl() ?? "";
    string RawTargetUrl = Link.GetRawTargetUrl() ?? "";

    // Status columns keep the "Not crawled" defaults unless the target is in the collection.
    try
    {
      if (DocCollection.ContainsDocument(Url: UrlTarget))
      {
        HttpStatusCode StatusCode = DocCollection.GetDocumentByUrl(Url: UrlTarget).GetStatusCode();
        StatusCodeText = ((int)StatusCode).ToString();
        StatusText = StatusCode.ToString();
      }
    }
    catch (Exception ex)
    {
      this.DebugMsg(ex.Message);
    }

    ListViewItem lvItem = null;

    if (this.DisplayListView.Items.ContainsKey(PairKey))
    {
      try
      {
        lvItem = this.DisplayListView.Items[PairKey];
        lvItem.SubItems[ColType].Text = LinkType;
        lvItem.SubItems[ColUrl].Text = Url;
        lvItem.SubItems[ColUrlTarget].Text = UrlTarget;
        lvItem.SubItems[ColStatusCode].Text = StatusCodeText;
        lvItem.SubItems[ColStatus].Text = StatusText;
        lvItem.SubItems[ColDoFollow].Text = DoFollow;
        lvItem.SubItems[ColAltTextLabel].Text = AltTextLabel;
        lvItem.SubItems[ColRawSourceUrl].Text = RawSourceUrl;
        lvItem.SubItems[ColRawTargetUrl].Text = RawTargetUrl;
      }
      catch (Exception ex)
      {
        this.DebugMsg(string.Format("MacroscopeDisplayLinks 1: {0}", ex.Message));
      }
    }
    else
    {
      try
      {
        // NOTE(review): this path relies on ColType being the first column and on the
        // other Col* constants matching the Add order below - confirm against their definitions.
        lvItem = new ListViewItem(PairKey);
        lvItem.UseItemStyleForSubItems = false;
        lvItem.Name = PairKey;
        lvItem.SubItems[ColType].Text = LinkType;
        lvItem.SubItems.Add(Url);
        lvItem.SubItems.Add(UrlTarget);
        lvItem.SubItems.Add(StatusCodeText);
        lvItem.SubItems.Add(StatusText);
        lvItem.SubItems.Add(DoFollow);
        lvItem.SubItems.Add(AltTextLabel);
        lvItem.SubItems.Add(RawSourceUrl);
        lvItem.SubItems.Add(RawTargetUrl);
        ListViewItems.Add(lvItem);
      }
      catch (Exception ex)
      {
        this.DebugMsg(string.Format("MacroscopeDisplayLinks 2: {0}", ex.Message));
      }
    }

    if (lvItem == null)
    {
      continue;
    }

    // Default every cell to blue, then override the cells that carry meaning.
    for (int i = 0; i < lvItem.SubItems.Count; i++)
    {
      lvItem.SubItems[i].ForeColor = Color.Blue;
    }

    bool TargetAllowed = AllowedHosts.IsAllowedFromUrl(UrlTarget);

    lvItem.SubItems[ColUrl].ForeColor = AllowedHosts.IsAllowedFromUrl(Url) ? Color.Green : Color.Gray;
    lvItem.SubItems[ColUrlTarget].ForeColor = TargetAllowed ? Color.Green : Color.Gray;

    // Follow/no-follow is only highlighted for allowed targets.
    if (TargetAllowed)
    {
      lvItem.SubItems[ColDoFollow].ForeColor = IsDoFollow ? Color.Green : Color.Red;
    }
    else
    {
      lvItem.SubItems[ColDoFollow].ForeColor = Color.Gray;
    }

  }

}
/**************************************************************************/

/// <summary>
/// Adds or refreshes the list row for one sitemap document, showing its URL and
/// its outlink count. Documents whose type is neither SITEMAPXML nor SITEMAPTEXT
/// are ignored.
/// </summary>
/// <param name="ListViewItems">Receives newly created rows.</param>
/// <param name="DocCollection">Not used by this override.</param>
/// <param name="msDoc">Document to render.</param>
/// <param name="Url">URL shown in the first column and used as the row key.</param>
protected override void RenderListView(
  List<ListViewItem> ListViewItems,
  MacroscopeDocumentCollection DocCollection,
  MacroscopeDocument msDoc,
  string Url
)
{

  MacroscopeConstants.DocumentType DocType = msDoc.GetDocumentType();

  if ((DocType != MacroscopeConstants.DocumentType.SITEMAPXML)
      && (DocType != MacroscopeConstants.DocumentType.SITEMAPTEXT))
  {
    return;
  }

  string RowKey = string.Join("", Url);
  ListViewItem Row = null;
  int OutlinkCount = msDoc.CountOutlinks();
  bool AlreadyListed = this.DisplayListView.Items.ContainsKey(RowKey);

  try
  {
    if (AlreadyListed)
    {
      Row = this.DisplayListView.Items[RowKey];
      Row.SubItems[0].Text = Url;
      Row.SubItems[1].Text = OutlinkCount.ToString();
    }
    else
    {
      Row = new ListViewItem(RowKey);
      Row.UseItemStyleForSubItems = false;
      Row.Name = RowKey;
      Row.SubItems[0].Text = Url;
      Row.SubItems.Add(OutlinkCount.ToString());
      ListViewItems.Add(Row);
    }
  }
  catch (Exception ex)
  {
    // "1" marks a failed update of an existing row, "2" a failed creation.
    DebugMsg(
      string.Format(
        AlreadyListed ? "MacroscopeDisplaySitemaps 1: {0}" : "MacroscopeDisplaySitemaps 2: {0}",
        ex.Message
      )
    );
  }

  if (Row == null)
  {
    return;
  }

  Row.ForeColor = Color.Blue;

  if (!msDoc.GetIsInternal())
  {
    Row.SubItems[0].ForeColor = Color.Gray;
    return;
  }

  Row.SubItems[0].ForeColor = Color.Green;

  // An internal sitemap without any outlinks is flagged in red.
  if (OutlinkCount <= 0)
  {
    Row.SubItems[0].ForeColor = Color.Red;
    Row.SubItems[1].ForeColor = Color.Red;
  }

}
/**************************************************************************/

/// <summary>
/// Adds or refreshes the list row for one redirecting document: URL, numeric
/// status code, status name and redirect destination. Documents that are not
/// redirects, or that have no redirect destination, are ignored.
/// </summary>
/// <param name="ListViewItems">Receives newly created rows.</param>
/// <param name="DocCollection">Not used by this override.</param>
/// <param name="msDoc">Document to render.</param>
/// <param name="Url">URL shown in the URL column and used as the row key.</param>
protected override void RenderListView(
  List<ListViewItem> ListViewItems,
  MacroscopeDocumentCollection DocCollection,
  MacroscopeDocument msDoc,
  string Url
)
{

  if (!msDoc.GetIsRedirect())
  {
    return;
  }

  MacroscopeAllowedHosts AllowedHosts = this.MainForm.GetJobMaster().GetAllowedHosts();
  int Code = (int)msDoc.GetStatusCode();
  string StatusName = msDoc.GetStatusCode().ToString();
  string Destination = msDoc.GetUrlRedirectTo();
  string RowKey = string.Join("", Url);

  if (string.IsNullOrEmpty(StatusName) || string.IsNullOrEmpty(Destination))
  {
    return;
  }

  ListViewItem Row = null;
  bool AlreadyListed = this.DisplayListView.Items.ContainsKey(RowKey);

  try
  {
    if (AlreadyListed)
    {
      Row = this.DisplayListView.Items[RowKey];
      Row.SubItems[COLURL].Text = Url;
      Row.SubItems[COLSTATUSCODE].Text = Code.ToString();
      Row.SubItems[COLSTATUS].Text = StatusName;
      Row.SubItems[COLDESTINATIONURL].Text = Destination;
    }
    else
    {
      Row = new ListViewItem(RowKey);
      Row.UseItemStyleForSubItems = false;
      Row.Name = RowKey;
      Row.SubItems[COLURL].Text = Url;
      Row.SubItems.Add(Code.ToString());
      Row.SubItems.Add(StatusName);
      Row.SubItems.Add(Destination);
      ListViewItems.Add(Row);
    }
  }
  catch (Exception ex)
  {
    // "1" marks a failed update of an existing row, "2" a failed creation.
    this.DebugMsg(
      string.Format(
        AlreadyListed ? "MacroscopeDisplayRedirectsAudit 1: {0}" : "MacroscopeDisplayRedirectsAudit 2: {0}",
        ex.Message
      )
    );
  }

  if (Row == null)
  {
    return;
  }

  if (msDoc.GetIsInternal())
  {

    // Internal rows start blue and are then recoloured by status class.
    for (int Column = 0; Column < 4; Column++)
    {
      Row.SubItems[Column].ForeColor = Color.Blue;
    }

    Color? ClassColor = null;

    switch (Code / 100)
    {
      case 2:
        ClassColor = Color.Green;
        break;
      case 3:
        ClassColor = Color.Goldenrod;
        break;
      case 4:
      case 5:
        ClassColor = Color.Red;
        break;
      default:
        break;
    }

    if (ClassColor.HasValue)
    {
      for (int Column = 0; Column < 4; Column++)
      {
        Row.SubItems[Column].ForeColor = ClassColor.Value;
      }
    }

  }
  else
  {
    for (int Column = 0; Column < 4; Column++)
    {
      Row.SubItems[Column].ForeColor = Color.Gray;
    }
  }

  // The destination cell is always coloured by whether the destination is internal.
  Color DestinationColor = AllowedHosts.IsInternalUrl(Destination) ? Color.Green : Color.Gray;
  Row.SubItems[COLDESTINATIONURL].ForeColor = DestinationColor;

}
/**************************************************************************/

/// <summary>
/// Writes the custom-filter report as CSV: one header record, then one record
/// per document that the custom filters can be applied to. Each data record
/// holds the URL, status code, status, content type and one cell per filter slot.
/// </summary>
/// <param name="JobMaster">Source of the document collection.</param>
/// <param name="ws">CSV writer that receives the records.</param>
private void BuildWorksheetCustomFilter(
  MacroscopeJobMaster JobMaster,
  CsvWriter ws
)
{

  MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();
  int FilterSize = this.CustomFilter.GetSize();

  // Tracks the column labels already emitted so duplicate patterns get a placeholder label.
  // NOTE(review): the stored column index is never read in this method, and an offset of 3
  // does not obviously match the four fixed columns written before the filter columns - confirm.
  Dictionary<string, int> FilterColsTable = new Dictionary<string, int>(FilterSize);
  const int FilterColOffset = 3;

  {

    ws.WriteField(MacroscopeConstants.Url);
    ws.WriteField(MacroscopeConstants.StatusCode);
    ws.WriteField(MacroscopeConstants.Status);
    ws.WriteField(MacroscopeConstants.ContentType);

    for (int Slot = 0; Slot < FilterSize; Slot++)
    {

      string FilterPattern = this.CustomFilter.GetPattern(Slot).Key;
      string ColumnLabel = FilterPattern;

      // Test for null/empty FIRST: Dictionary.ContainsKey throws on a null key.
      if (string.IsNullOrEmpty(FilterPattern) || FilterColsTable.ContainsKey(FilterPattern))
      {
        ColumnLabel = string.Format("EMPTY{0}", Slot + 1);
      }

      // Indexer assignment cannot throw if a pattern happens to equal a placeholder label.
      FilterColsTable[ColumnLabel] = Slot + FilterColOffset;
      ws.WriteField(ColumnLabel);

    }

    ws.NextRecord();

  }

  foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
  {

    if (!this.CustomFilter.CanApplyCustomFiltersToDocument(msDoc: msDoc))
    {
      continue;
    }

    this.InsertAndFormatUrlCell(ws, msDoc);
    this.InsertAndFormatStatusCodeCell(ws, msDoc);
    this.InsertAndFormatContentCell(ws, this.FormatIfMissing(msDoc.GetStatusCode().ToString()));
    this.InsertAndFormatContentCell(ws, this.FormatIfMissing(msDoc.GetMimeType()));

    for (int Slot = 0; Slot < FilterSize; Slot++)
    {

      string FilterPattern = this.CustomFilter.GetPattern(Slot: Slot).Key;
      KeyValuePair<string, MacroscopeConstants.TextPresence> Pair = msDoc.GetCustomFilteredItem(Text: FilterPattern);

      if ((Pair.Key != null) && (Pair.Value != MacroscopeConstants.TextPresence.UNDEFINED))
      {
        this.InsertAndFormatContentCell(ws, MacroscopeConstants.TextPresenceLabels[Pair.Value]);
      }
      else
      {
        this.InsertAndFormatContentCell(ws, "");
      }

    }

    // End the record once per document, after all filter cells, so every data
    // row lines up with the header row.
    ws.NextRecord();

  }

}
/**************************************************************************/

/// <summary>
/// Adds or refreshes the list row for one document, showing its URL, numeric
/// status code and status name. A row already present in DisplayListView is
/// updated in place; otherwise a new row is appended to
/// <paramref name="ListViewItems"/>.
/// </summary>
/// <param name="ListViewItems">Receives newly created rows.</param>
/// <param name="DocCollection">Not used by this override.</param>
/// <param name="msDoc">Document to render.</param>
/// <param name="Url">URL shown in the URL column and used as the row key.</param>
protected override void RenderListView(
  List<ListViewItem> ListViewItems,
  MacroscopeDocumentCollection DocCollection,
  MacroscopeDocument msDoc,
  string Url
)
{

  string CodeText = ((int)msDoc.GetStatusCode()).ToString();
  string StatusName = msDoc.GetStatusCode().ToString();
  string RowKey = string.Join("", Url);

  ListViewItem Row = null;
  bool AlreadyListed = this.DisplayListView.Items.ContainsKey(RowKey);

  try
  {
    if (AlreadyListed)
    {
      Row = this.DisplayListView.Items[RowKey];
      Row.SubItems[COL_URL].Text = Url;
      Row.SubItems[COL_STATUS_CODE].Text = CodeText;
      Row.SubItems[COL_STATUS].Text = StatusName;
    }
    else
    {
      Row = new ListViewItem(RowKey);
      Row.UseItemStyleForSubItems = false;
      Row.Name = RowKey;
      Row.SubItems[COL_URL].Text = Url;
      Row.SubItems.Add(CodeText);
      Row.SubItems.Add(StatusName);
      ListViewItems.Add(Row);
    }
  }
  catch (Exception ex)
  {
    // "1" marks a failed update of an existing row, "2" a failed creation.
    DebugMsg(
      string.Format(
        AlreadyListed ? "MacroscopeDisplayOrphanedPages 1: {0}" : "MacroscopeDisplayOrphanedPages 2: {0}",
        ex.Message
      )
    );
  }

  if (Row == null)
  {
    return;
  }

  Row.ForeColor = Color.Blue;

  Color UrlColor = msDoc.GetIsInternal() ? Color.Green : Color.Gray;
  Row.SubItems[COL_URL].ForeColor = UrlColor;

  // Colour both status cells by the leading digit of the numeric status code.
  Color StatusColor;

  if (Regex.IsMatch(CodeText, "^[2]"))
  {
    StatusColor = Color.Green;
  }
  else if (Regex.IsMatch(CodeText, "^[3]"))
  {
    StatusColor = Color.Goldenrod;
  }
  else if (Regex.IsMatch(CodeText, "^[45]"))
  {
    StatusColor = Color.Red;
  }
  else
  {
    StatusColor = Color.Blue;
  }

  Row.SubItems[COL_STATUS_CODE].ForeColor = StatusColor;
  Row.SubItems[COL_STATUS].ForeColor = StatusColor;

}
/**************************************************************************/

/// <summary>
/// Adds a worksheet listing every document key in the collection with its
/// status code, status, checksum and the number of documents sharing that
/// checksum, then turns the written range into a table.
/// </summary>
/// <param name="JobMaster">Source of the document collection and allowed hosts.</param>
/// <param name="wb">Workbook that receives the new worksheet.</param>
/// <param name="WorksheetLabel">Name of the worksheet to add.</param>
private void BuildWorksheetPageUriAnalysis(
  MacroscopeJobMaster JobMaster,
  XLWorkbook wb,
  string WorksheetLabel
)
{

  var ws = wb.Worksheets.Add(WorksheetLabel);

  MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();
  MacroscopeAllowedHosts AllowedHosts = JobMaster.GetAllowedHosts();

  // Header row.
  string[] Headings = { "URL", "Status Code", "Status", "Occurrences", "Checksum" };

  for (int Index = 0; Index < Headings.Length; Index++)
  {
    ws.Cell(1, Index + 1).Value = Headings[Index];
  }

  int LastColumn = Headings.Length;
  int Row = 2;

  foreach (string Url in DocCollection.DocumentKeys())
  {

    MacroscopeDocument msDoc = DocCollection.GetDocument(Url);
    string CodeText = ((int)msDoc.GetStatusCode()).ToString();
    string StatusName = msDoc.GetStatusCode().ToString();
    string Checksum = msDoc.GetChecksum();
    int Occurrences = DocCollection.GetStatsChecksumCount(Checksum: Checksum);
    int Column = 1;

    // URL: green when internal, gray otherwise.
    this.InsertAndFormatUrlCell(ws, Row, Column, msDoc);
    XLColor UrlColor = AllowedHosts.IsInternalUrl(Url: Url) ? XLColor.Green : XLColor.Gray;
    ws.Cell(Row, Column).Style.Font.SetFontColor(UrlColor);
    Column++;

    this.InsertAndFormatContentCell(ws, Row, Column, CodeText);
    Column++;

    this.InsertAndFormatContentCell(ws, Row, Column, StatusName);
    Column++;

    // Occurrences and checksum are red when the checksum is shared by more than one document.
    this.InsertAndFormatContentCell(ws, Row, Column, Occurrences);
    XLColor DuplicateColor = (Occurrences > 1) ? XLColor.Red : XLColor.Blue;
    ws.Cell(Row, Column).Style.Font.SetFontColor(DuplicateColor);
    Column++;

    this.InsertAndFormatContentCell(ws, Row, Column, Checksum);
    ws.Cell(Row, Column).Style.Font.SetFontColor(DuplicateColor);

    Row++;

  }

  ws.Range(1, 1, Row - 1, LastColumn).CreateTable();

}
/**************************************************************************/

/// <summary>
/// Writes the duplicate-pages report as CSV. For every internal HTML document a
/// Levenshtein analysis is run against the collection, and one record is written
/// per similar document found: status code, status, origin URL, distance and
/// similar URL. Progress is reported through ProgressForm, and the loops stop
/// when ProgressForm reports cancellation.
/// </summary>
/// <param name="JobMaster">Source of the document collection.</param>
/// <param name="ws">CSV writer that receives the records.</param>
private void BuildWorksheetPageDuplicatePages(
  MacroscopeJobMaster JobMaster,
  CsvWriter ws
)
{

  decimal OuterDone = 0;
  decimal InnerDone = 0;
  decimal MatchTotal = 0;

  MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();
  MacroscopeAllowedHosts AllowedHosts = JobMaster.GetAllowedHosts();

  Dictionary<string, bool> CrossCheckList = MacroscopeLevenshteinAnalysis.GetCrossCheckList(
    Capacity: DocCollection.CountDocuments()
  );

  decimal DocTotal = (decimal)DocCollection.CountDocuments();

  // Header record.
  foreach (string Heading in new string[] { "Status Code", "Status", "Origin URL", "Distance", "Similar URL" })
  {
    ws.WriteField(Heading);
  }
  ws.NextRecord();

  foreach (string UrlLeft in DocCollection.DocumentUrls())
  {

    MacroscopeDocument msDocLeft = DocCollection.GetDocumentByUrl(Url: UrlLeft);

    OuterDone++;
    InnerDone = 0;

    if (DocTotal > 0)
    {
      this.ProgressForm.UpdatePercentages(
        Title: null,
        Message: null,
        MajorPercentage: -1,
        ProgressLabelMajor: string.Format("Documents Processed: {0}", OuterDone),
        MinorPercentage: ((decimal)100 / DocTotal) * OuterDone,
        ProgressLabelMinor: UrlLeft,
        SubMinorPercentage: 0,
        ProgressLabelSubMinor: ""
      );
    }

    // Only internal HTML documents are analysed.
    if (msDocLeft.GetIsExternal()
        || !msDocLeft.IsDocumentType(Type: MacroscopeConstants.DocumentType.HTML))
    {
      continue;
    }

    MacroscopeLevenshteinAnalysis Analysis = new MacroscopeLevenshteinAnalysis(
      msDoc: msDocLeft,
      SizeDifference: MacroscopePreferencesManager.GetMaxLevenshteinSizeDifference(),
      Threshold: MacroscopePreferencesManager.GetMaxLevenshteinDistance(),
      CrossCheckList: CrossCheckList,
      IPercentageDone: this
    );

    Dictionary<MacroscopeDocument, int> Matches = Analysis.AnalyzeDocCollection(
      DocCollection: DocCollection
    );

    MatchTotal = (decimal)Matches.Count;

    foreach (KeyValuePair<MacroscopeDocument, int> Match in Matches)
    {

      // The status columns describe the origin document, not the similar one.
      int CodeLeft = (int)msDocLeft.GetStatusCode();
      HttpStatusCode StatusLeft = msDocLeft.GetStatusCode();
      string UrlDuplicate = Match.Key.GetUrl();
      int Distance = Match.Value;

      InnerDone++;

      if (DocTotal > 0)
      {
        this.ProgressForm.UpdatePercentages(
          Title: null,
          Message: null,
          MajorPercentage: -1,
          ProgressLabelMajor: string.Format("Documents Processed: {0}", OuterDone),
          MinorPercentage: ((decimal)100 / DocTotal) * OuterDone,
          ProgressLabelMinor: UrlLeft,
          SubMinorPercentage: ((decimal)100 / MatchTotal) * InnerDone,
          ProgressLabelSubMinor: UrlDuplicate
        );
      }

      this.InsertAndFormatStatusCodeCell(ws, CodeLeft);
      this.InsertAndFormatStatusCodeCell(ws, StatusLeft);
      this.InsertAndFormatUrlCell(ws, UrlLeft);
      this.InsertAndFormatContentCell(ws, Distance.ToString());
      this.InsertAndFormatUrlCell(ws, UrlDuplicate);
      ws.NextRecord();

      if (this.ProgressForm.Cancelled())
      {
        break;
      }

    }

    if (this.ProgressForm.Cancelled())
    {
      break;
    }

  }

}
/**************************************************************************/

/// <summary>
/// Writes the duplicate-titles report as CSV: one record (URL, occurrences,
/// title) for every internal HTML or PDF document whose title count in the
/// collection is greater than one. Progress is reported through ProgressForm.
/// </summary>
/// <param name="JobMaster">Source of the document collection and allowed hosts.</param>
/// <param name="ws">CSV writer that receives the records.</param>
private void BuildWorksheetPageDuplicateTitles(
  MacroscopeJobMaster JobMaster,
  CsvWriter ws
)
{

  decimal Processed = 0;

  MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();
  MacroscopeAllowedHosts AllowedHosts = JobMaster.GetAllowedHosts();

  decimal DocTotal = (decimal)DocCollection.CountDocuments();

  // Header record.
  ws.WriteField("URL");
  ws.WriteField("Occurrences");
  ws.WriteField("Title");
  ws.NextRecord();

  foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
  {

    if (DocTotal > 0)
    {
      Processed++;
      this.ProgressForm.UpdatePercentages(
        Title: null,
        Message: null,
        MajorPercentage: -1,
        ProgressLabelMajor: string.Format("Documents Processed: {0}", Processed),
        MinorPercentage: ((decimal)100 / DocTotal) * Processed,
        ProgressLabelMinor: msDoc.GetUrl(),
        SubMinorPercentage: -1,
        ProgressLabelSubMinor: null
      );
    }

    if (!AllowedHosts.IsInternalUrl(Url: msDoc.GetUrl()))
    {
      continue;
    }

    MacroscopeConstants.DocumentType DocType = msDoc.GetDocumentType();

    if ((DocType != MacroscopeConstants.DocumentType.HTML)
        && (DocType != MacroscopeConstants.DocumentType.PDF))
    {
      continue;
    }

    string Title = msDoc.GetTitle();
    int Occurrences = DocCollection.GetStatsTitleCount(msDoc: msDoc);

    // Only titles that occur more than once are reported.
    if (Occurrences <= 1)
    {
      continue;
    }

    this.InsertAndFormatUrlCell(ws, msDoc);
    this.InsertAndFormatContentCell(ws, Occurrences);
    this.InsertAndFormatContentCell(ws, this.FormatIfMissing(Title));
    ws.NextRecord();

  }

}
/**************************************************************************/

/// <summary>
/// Adds or refreshes one list row per remark of <paramref name="msDoc"/>. Each
/// row shows the URL, numeric status code, status name and the remark text, and
/// is keyed by the URL joined with the remark text.
/// </summary>
/// <param name="ListViewItems">Receives newly created rows.</param>
/// <param name="DocCollection">Not used by this override.</param>
/// <param name="msDoc">Document whose remarks are rendered.</param>
/// <param name="Url">URL shown in the URL column and used in the row key.</param>
protected override void RenderListView(
  List<ListViewItem> ListViewItems,
  MacroscopeDocumentCollection DocCollection,
  MacroscopeDocument msDoc,
  string Url
)
{

  foreach (KeyValuePair<string, string> RemarkPair in msDoc.IterateRemarks())
  {

    ListViewItem Row = null;
    string Observation = RemarkPair.Value;
    string RowKey = string.Join(@"::::", Url, Observation);
    string CodeText = ((int)msDoc.GetStatusCode()).ToString();
    string StatusName = msDoc.GetStatusCode().ToString();
    bool AlreadyListed = this.DisplayListView.Items.ContainsKey(RowKey);

    try
    {
      if (AlreadyListed)
      {
        Row = this.DisplayListView.Items[RowKey];
        Row.SubItems[ColUrl].Text = Url;
        Row.SubItems[ColStatusCode].Text = CodeText;
        Row.SubItems[ColStatus].Text = StatusName;
        Row.SubItems[ColObservation].Text = Observation;
      }
      else
      {
        Row = new ListViewItem(RowKey);
        Row.UseItemStyleForSubItems = false;
        Row.Name = RowKey;
        Row.SubItems[ColUrl].Text = Url;
        Row.SubItems.Add(CodeText);
        Row.SubItems.Add(StatusName);
        Row.SubItems.Add(Observation);
        ListViewItems.Add(Row);
      }
    }
    catch (Exception ex)
    {
      // "1" marks a failed update of an existing row, "2" a failed creation.
      DebugMsg(
        string.Format(
          AlreadyListed ? "MacroscopeDisplayRemarks 1: {0}" : "MacroscopeDisplayRemarks 2: {0}",
          ex.Message
        )
      );
    }

    if (Row == null)
    {
      continue;
    }

    Row.ForeColor = Color.Blue;

    // URL: green for internal documents, gray otherwise.
    Color UrlColor = msDoc.GetIsInternal() ? Color.Green : Color.Gray;
    Row.SubItems[ColUrl].ForeColor = UrlColor;

  }

}
/** Override Render One ***************************************************/
// Per-document rendering hook that concrete displays must implement.
// The collection-level RenderListView overloads in this file call it once per
// selected document and afterwards add everything in ListViewItems to
// DisplayListView in one AddRange call.
//   ListViewItems : list that receives rows created for the document.
//   DocCollection : the document collection currently being rendered.
//   msDoc         : the document to render.
//   Url           : URL of msDoc, as passed by the caller.
abstract protected void RenderListView( List <ListViewItem> ListViewItems, MacroscopeDocumentCollection DocCollection, MacroscopeDocument msDoc, string Url );
/** Render DocCollection Filtered by URL Fragment *************************/

/// <summary>
/// Renders every document in <paramref name="DocCollection"/> whose URL contains
/// <paramref name="UrlFragment"/> (culture-sensitive match) through the
/// per-document RenderListView overload, then appends the collected rows to
/// DisplayListView with a single AddRange call.
/// </summary>
/// <param name="DocCollection">Documents to scan. A null or empty collection is a no-op.</param>
/// <param name="UrlFragment">Substring that a document URL must contain to be rendered.</param>
public void RenderListView(
  MacroscopeDocumentCollection DocCollection,
  string UrlFragment
)
{

  if (DocCollection == null)
  {
    return;
  }

  int DocTotal = DocCollection.CountDocuments();

  if (DocTotal == 0)
  {
    return;
  }

  List<ListViewItem> ListViewItems = new List<ListViewItem>(DocTotal);

  // Read the preference once so the "update" and "close" decisions always agree.
  bool ShowProgress = MacroscopePreferencesManager.GetShowProgressDialogues();

  decimal Count = 0;
  decimal TotalDocs = (decimal)DocTotal;
  decimal MajorPercentage = ((decimal)100 / TotalDocs) * Count;

  MacroscopeSinglePercentageProgressForm ProgressForm = new MacroscopeSinglePercentageProgressForm(this.MainForm);

  try
  {

    if (ShowProgress)
    {
      ProgressForm.ControlBox = false;
      ProgressForm.UpdatePercentages(
        Title: "Preparing Display",
        Message: "Processing document collection for display:",
        MajorPercentage: MajorPercentage,
        ProgressLabelMajor: string.Format("Document {0} / {1}", Count, TotalDocs)
      );
    }

    foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
    {

      Application.DoEvents();

      if (msDoc != null)
      {

        string Url = msDoc.GetUrl();

        // A document without a URL cannot match the fragment; skip it instead of crashing.
        if ((Url != null) && (Url.IndexOf(UrlFragment, StringComparison.CurrentCulture) >= 0))
        {
          this.RenderListView(
            ListViewItems: ListViewItems,
            DocCollection: DocCollection,
            msDoc: msDoc,
            Url: Url
          );
        }

      }

      if (ShowProgress)
      {
        Count++;
        MajorPercentage = ((decimal)100 / TotalDocs) * Count;
        ProgressForm.UpdatePercentages(
          Title: null,
          Message: null,
          MajorPercentage: MajorPercentage,
          ProgressLabelMajor: string.Format("Document {0} / {1}", Count, TotalDocs)
        );
      }

    }

    this.DisplayListView.Items.AddRange(ListViewItems.ToArray());

  }
  finally
  {
    // Always release the progress form, even when a renderer throws; its
    // ControlBox is disabled above, so the user could not dismiss it manually.
    if (ShowProgress)
    {
      ProgressForm.DoClose();
    }
    ProgressForm.Dispose();
  }

}
/** Render Filtered DocCollection *******************************************/

/// <summary>
/// Renders the documents in <paramref name="DocCollection"/> selected by
/// <paramref name="DocumentType"/> through the per-document RenderListView
/// overload, then appends the collected rows to DisplayListView with a single
/// AddRange call.
/// </summary>
/// <param name="DocCollection">Documents to scan. A null or empty collection is a no-op.</param>
/// <param name="DocumentType">
/// INTERNALURL selects internal documents, EXTERNALURL selects external
/// documents, ALL selects everything; any other value selects documents whose
/// own document type equals it.
/// </param>
public void RenderListView(
  MacroscopeDocumentCollection DocCollection,
  MacroscopeConstants.DocumentType DocumentType
)
{

  if (DocCollection == null)
  {
    return;
  }

  int DocTotal = DocCollection.CountDocuments();

  if (DocTotal == 0)
  {
    return;
  }

  List<ListViewItem> ListViewItems = new List<ListViewItem>(DocTotal);

  // Read the preference once so the "update" and "close" decisions always agree.
  bool ShowProgress = MacroscopePreferencesManager.GetShowProgressDialogues();

  decimal Count = 0;
  decimal TotalDocs = (decimal)DocTotal;
  decimal MajorPercentage = ((decimal)100 / TotalDocs) * Count;

  MacroscopeSinglePercentageProgressForm ProgressForm = new MacroscopeSinglePercentageProgressForm(this.MainForm);

  try
  {

    if (ShowProgress)
    {
      ProgressForm.ControlBox = false;
      ProgressForm.UpdatePercentages(
        Title: "Preparing Display",
        Message: "Processing document collection for display:",
        MajorPercentage: MajorPercentage,
        ProgressLabelMajor: string.Format("Document {0} / {1}", Count, TotalDocs)
      );
    }

    foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
    {

      Application.DoEvents();

      if (msDoc != null)
      {

        bool Selected;

        switch (DocumentType)
        {
          case MacroscopeConstants.DocumentType.INTERNALURL:
            Selected = msDoc.GetIsInternal();
            break;
          case MacroscopeConstants.DocumentType.EXTERNALURL:
            Selected = msDoc.GetIsExternal();
            break;
          default:
            Selected = (msDoc.GetDocumentType() == DocumentType)
                       || (DocumentType == MacroscopeConstants.DocumentType.ALL);
            break;
        }

        if (Selected)
        {
          this.RenderListView(
            ListViewItems: ListViewItems,
            DocCollection: DocCollection,
            msDoc: msDoc,
            Url: msDoc.GetUrl()
          );
        }

      }

      if (ShowProgress)
      {
        Count++;
        MajorPercentage = ((decimal)100 / TotalDocs) * Count;
        ProgressForm.UpdatePercentages(
          Title: null,
          Message: null,
          MajorPercentage: MajorPercentage,
          ProgressLabelMajor: string.Format("Document {0} / {1}", Count, TotalDocs)
        );
      }

    }

    this.DisplayListView.Items.AddRange(ListViewItems.ToArray());

  }
  finally
  {
    // Always release the progress form, even when a renderer throws; its
    // ControlBox is disabled above, so the user could not dismiss it manually.
    if (ShowProgress)
    {
      ProgressForm.DoClose();
    }
    ProgressForm.Dispose();
  }

}
/**************************************************************************/

/// <summary>
/// Renders the regex data-extractor results for the documents named in
/// <paramref name="UrlList"/>: one row per (document, regex label, extracted
/// value) triple. Existing rows in DisplayListView are updated in place, new
/// rows are added in one AddRange call, and afterwards the list is
/// de-duplicated and its columns are resized.
/// </summary>
/// <param name="DocCollection">Collection the URLs are resolved against; nothing is rendered when it is empty.</param>
/// <param name="UrlList">URLs of the documents to render.</param>
/// <param name="DataExtractor">Decides whether extractors apply to a given document.</param>
private void RenderListView(
  MacroscopeDocumentCollection DocCollection,
  List<string> UrlList,
  MacroscopeDataExtractorRegexes DataExtractor
)
{

  if (DocCollection.CountDocuments() == 0)
  {
    return;
  }

  List<ListViewItem> ListViewItems = new List<ListViewItem>();

  // Read the preference once so the "update" and "close" decisions always agree.
  bool ShowProgress = MacroscopePreferencesManager.GetShowProgressDialogues();

  // The loop below walks UrlList, so UrlList (not the whole collection) is the progress total.
  decimal Count = 0;
  decimal TotalDocs = (decimal)UrlList.Count;
  decimal MajorPercentage = 0;

  MacroscopeSinglePercentageProgressForm ProgressForm = new MacroscopeSinglePercentageProgressForm(this.MainForm);

  try
  {

    if (ShowProgress)
    {
      ProgressForm.UpdatePercentages(
        Title: "Preparing Display",
        Message: "Processing document collection for display:",
        MajorPercentage: MajorPercentage,
        ProgressLabelMajor: string.Format("Document {0} / {1}", Count, TotalDocs)
      );
    }

    foreach (string Url in UrlList)
    {

      MacroscopeDocument msDoc = DocCollection.GetDocument(Url: Url);

      // Skipped documents still fall through to the progress tick below.
      if ((msDoc != null) && DataExtractor.CanApplyDataExtractorsToDocument(msDoc: msDoc))
      {

        string DocUrl = msDoc.GetUrl();
        string StatusCode = ((int)msDoc.GetStatusCode()).ToString();
        string Status = msDoc.GetStatusCode().ToString();
        string MimeType = msDoc.GetMimeType();

        foreach (KeyValuePair<string, string> DataExtractedPair in msDoc.IterateDataExtractedRegexes())
        {

          string RegexLabel = DataExtractedPair.Key;
          string ExtractedValue = DataExtractedPair.Value;

          if (string.IsNullOrEmpty(RegexLabel) || string.IsNullOrEmpty(ExtractedValue))
          {
            continue;
          }

          string PairKey = string.Join(
            "::",
            DocUrl,
            Macroscope.GetStringDigest(Text: RegexLabel),
            Macroscope.GetStringDigest(Text: ExtractedValue)
          );

          ListViewItem lvItem = null;

          if (this.DisplayListView.Items.ContainsKey(PairKey))
          {
            lvItem = this.DisplayListView.Items[PairKey];
          }
          else
          {
            lvItem = new ListViewItem(PairKey);
            lvItem.UseItemStyleForSubItems = false;
            lvItem.Name = PairKey;
            // Pre-create empty cells so the column-indexed assignments below work.
            for (int i = 0; i < 6; i++)
            {
              lvItem.SubItems.Add("");
            }
            ListViewItems.Add(lvItem);
          }

          if (lvItem == null)
          {
            // Nothing to fill in or colour; log it and move on rather than dereferencing null.
            DebugMsg(string.Format("MacroscopeDisplayDataExtractorRegexes MISSING: {0}", PairKey));
            continue;
          }

          try
          {
            lvItem.SubItems[ColUrl].Text = DocUrl;
            lvItem.SubItems[ColStatusCode].Text = StatusCode;
            lvItem.SubItems[ColStatus].Text = Status;
            lvItem.SubItems[ColMimeType].Text = MimeType;
            lvItem.SubItems[ColRegexLabel].Text = RegexLabel;
            lvItem.SubItems[ColExtractedValue].Text = ExtractedValue;
          }
          catch (Exception ex)
          {
            DebugMsg(string.Format("MacroscopeDisplayDataExtractorRegexes: {0}", ex.Message));
            DebugMsg(string.Format("MacroscopeDisplayDataExtractorRegexes: {0}", ex.StackTrace));
          }

          // URL: green for internal documents, gray otherwise.
          lvItem.SubItems[ColUrl].ForeColor = msDoc.GetIsInternal() ? Color.Green : Color.Gray;

          // Colour both status cells by the leading digit of the numeric status code.
          Color StatusColor;

          if (Regex.IsMatch(StatusCode, "^[2]"))
          {
            StatusColor = Color.Green;
          }
          else if (Regex.IsMatch(StatusCode, "^[3]"))
          {
            StatusColor = Color.Goldenrod;
          }
          else if (Regex.IsMatch(StatusCode, "^[45]"))
          {
            StatusColor = Color.Red;
          }
          else
          {
            StatusColor = Color.Blue;
          }

          lvItem.SubItems[ColStatusCode].ForeColor = StatusColor;
          lvItem.SubItems[ColStatus].ForeColor = StatusColor;

        }

      }

      if (ShowProgress)
      {
        Count++;
        MajorPercentage = ((decimal)100 / TotalDocs) * Count;
        ProgressForm.UpdatePercentages(
          Title: null,
          Message: null,
          MajorPercentage: MajorPercentage,
          ProgressLabelMajor: string.Format("Document {0} / {1}", Count, TotalDocs)
        );
      }

    }

    this.DisplayListView.Items.AddRange(ListViewItems.ToArray());

    this.DeduplicateListView(DuplicatedListView: this.DisplayListView);

    this.DisplayListView.AutoResizeColumns(ColumnHeaderAutoResizeStyle.ColumnContent);

    // Fixed widths for the leading columns override the auto-resize result.
    this.DisplayListView.Columns[ColUrl].Width = 300;
    this.DisplayListView.Columns[ColStatusCode].Width = 100;
    this.DisplayListView.Columns[ColStatus].Width = 100;
    this.DisplayListView.Columns[ColMimeType].Width = 100;

  }
  finally
  {
    // Always release the progress form, even when rendering throws.
    if (ShowProgress)
    {
      ProgressForm.DoClose();
    }
    ProgressForm.Dispose();
  }

}
/**************************************************************************/

/// <summary>
/// Adds a worksheet with one row per (document, keyword) pair returned by the
/// collection's intense keyword analysis, showing presence status, keyword and
/// document URL, then turns the written range into a table. Progress is
/// reported through ProgressForm.
/// </summary>
/// <param name="JobMaster">Not used by this method.</param>
/// <param name="wb">Workbook that receives the new worksheet.</param>
/// <param name="WorksheetLabel">Name of the worksheet to add.</param>
/// <param name="DocCollection">Documents to analyse.</param>
private void BuildWorksheetKeywordsPresence(
  MacroscopeJobMaster JobMaster,
  XLWorkbook wb,
  string WorksheetLabel,
  MacroscopeDocumentCollection DocCollection
)
{

  var ws = wb.Worksheets.Add(WorksheetLabel);

  int iRow = 1;
  int iCol = 1;
  int iColMax = 1;

  decimal DocCount = 0;
  decimal DocTotal = (decimal)DocCollection.CountDocuments();

  {

    ws.Cell(iRow, iCol).Value = "Presence";
    iCol++;

    ws.Cell(iRow, iCol).Value = "Keyword";
    iCol++;

    ws.Cell(iRow, iCol).Value = "URL";

  }

  iColMax = iCol;

  iRow++;

  foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
  {

    List<KeyValuePair<string, MacroscopeIntenseKeywordAnalysis.KEYWORD_STATUS>> KeywordPresence;

    KeywordPresence = DocCollection.GetIntenseKeywordAnalysis(msDoc: msDoc);

    // Count the document before reporting, and guard on the total (the divisor),
    // so the first document is reported and the last one reaches 100%.
    DocCount++;

    if (DocTotal > 0)
    {
      this.ProgressForm.UpdatePercentages(
        Title: null,
        Message: null,
        MajorPercentage: -1,
        ProgressLabelMajor: null,
        MinorPercentage: ((decimal)100 / DocTotal) * DocCount,
        ProgressLabelMinor: "Documents Processed"
      );
    }

    if (KeywordPresence == null)
    {
      continue;
    }

    foreach (KeyValuePair<string, MacroscopeIntenseKeywordAnalysis.KEYWORD_STATUS> Pair in KeywordPresence)
    {

      MacroscopeIntenseKeywordAnalysis.KEYWORD_STATUS Present = Pair.Value;
      string Keyword = Pair.Key;

      iCol = 1;

      this.InsertAndFormatContentCell(ws, iRow, iCol, Present.ToString());

      // Missing or empty keywords are red, keywords found in the body text are green.
      switch (Present)
      {
        case MacroscopeIntenseKeywordAnalysis.KEYWORD_STATUS.KEYWORDS_METATAG_EMPTY:
        case MacroscopeIntenseKeywordAnalysis.KEYWORD_STATUS.MISSING_IN_BODY_TEXT:
          ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Red);
          break;
        case MacroscopeIntenseKeywordAnalysis.KEYWORD_STATUS.PRESENT_IN_BODY_TEXT:
          ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Green);
          break;
        default:
          break;
      }

      iCol++;

      this.InsertAndFormatContentCell(ws, iRow, iCol, Keyword);

      iCol++;

      this.InsertAndFormatUrlCell(ws, iRow, iCol, msDoc.GetUrl());

      iRow++;

    }

  }

  ws.Range(1, 1, iRow - 1, iColMax).CreateTable();

}
/**************************************************************************/
// Adds a worksheet named WorksheetLabel to wb listing, for every document whose
// numeric status code is in the 400-599 range and which has inbound hyperlinks,
// one row per inbound hyperlink with a non-empty source URL:
// status code, status name, origin URL, destination (document) URL.
// The used range is finally converted to a table.
private void BuildWorksheetPageBrokenLinks(
    MacroscopeJobMaster JobMaster,
    XLWorkbook wb,
    string WorksheetLabel
)
{
    var ws = wb.Worksheets.Add(WorksheetLabel);
    int iRow = 1;
    int iCol = 1;
    int iColMax = 1;
    MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();
    MacroscopeAllowedHosts AllowedHosts = JobMaster.GetAllowedHosts();

    {
        ws.Cell(iRow, iCol).Value = "Status Code";
        iCol++;
        ws.Cell(iRow, iCol).Value = "Status";
        iCol++;
        ws.Cell(iRow, iCol).Value = "Origin URL";
        iCol++;
        ws.Cell(iRow, iCol).Value = "Destination URL";
    }

    iColMax = iCol;
    iRow++;

    foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
    {
        MacroscopeHyperlinksIn HyperlinksIn = DocCollection.GetDocumentHyperlinksIn(msDoc.GetUrl());
        int StatusCode = (int)msDoc.GetStatusCode();
        string Status = msDoc.GetStatusCode().ToString();

        // Only error responses that something links to are reported.
        if ((StatusCode < 400) || (StatusCode > 599) || (HyperlinksIn == null))
        {
            continue;
        }

        foreach (MacroscopeHyperlinkIn HyperlinkIn in HyperlinksIn.IterateLinks())
        {
            string OriginUrl = HyperlinkIn.GetSourceUrl();

            if (string.IsNullOrEmpty(OriginUrl))
            {
                continue;
            }

            iCol = 1;

            // Every row written here is already known to be in the 400-599 range,
            // so the status cells are always red (the former blue branch could never run).
            this.InsertAndFormatContentCell(ws, iRow, iCol, StatusCode.ToString());
            ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Red);

            iCol++;

            this.InsertAndFormatContentCell(ws, iRow, iCol, Status);
            ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Red);

            iCol++;

            // Origin URL: green when AllowedHosts reports it as internal, otherwise gray.
            this.InsertAndFormatUrlCell(ws, iRow, iCol, OriginUrl);

            if (AllowedHosts.IsInternalUrl(Url: OriginUrl))
            {
                ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Green);
            }
            else
            {
                ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Gray);
            }

            iCol++;

            // Destination URL: the broken document itself, coloured the same way.
            this.InsertAndFormatUrlCell(ws, iRow, iCol, msDoc);

            if (AllowedHosts.IsInternalUrl(Url: msDoc.GetUrl()))
            {
                ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Green);
            }
            else
            {
                ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Gray);
            }

            iRow++;
        }
    }

    {
        var rangeData = ws.Range(1, 1, iRow - 1, iColMax);
        rangeData.CreateTable();
    }
}
/**************************************************************************/
// Adds an overview worksheet named WorksheetLabel to wb: a bold header row
// followed by one row per document in the job's document collection, with
// one cell per header label below. The used range is finally converted to a table.
private void BuildWorksheetOverview(
    MacroscopeJobMaster JobMaster,
    XLWorkbook wb,
    string WorksheetLabel
)
{
    var ws = wb.Worksheets.Add(WorksheetLabel);
    int iRow = 1;
    MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

    // Column order here must stay in step with the per-document cell writes below.
    string[] HeaderLabels =
    {
        "URL",
        "Status Code",
        "Status",
        "Redirect",
        "Robots",
        "Duration",
        "Crawled Date",
        "Server Date",
        "Modified Date",
        "Expires Date",
        "Content-Type",
        "Charset",
        "Locale",
        "Language",
        "Canonical",
        "Page Depth",
        "Links In",
        "Links Out",
        "Hyperlinks In",
        "Hyperlinks Out",
        "Ratio In",
        "Ratio Out",
        "Author",
        "Title",
        "Title Length",
        "Description",
        "Description Length",
        "Keywords",
        "Keywords Length",
        "Keywords Count",
        "Error Condition"
    };
    int iColMax = HeaderLabels.Length;

    for (int iHeader = 1; iHeader <= iColMax; iHeader++)
    {
        ws.Cell(iRow, iHeader).Value = HeaderLabels[iHeader - 1];
        ws.Cell(iRow, iHeader).Style.Font.SetBold();
    }

    iRow++;

    foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
    {
        // Two-element list: index 0 feeds "Ratio In", index 1 feeds "Ratio Out".
        List<decimal> HyperlinkRatio = DocCollection.GetDocumentHyperlinksRatio(Url: msDoc.GetUrl());
        int iCol = 1;

        this.InsertAndFormatUrlCell(ws, iRow, iCol++, msDoc);
        this.InsertAndFormatStatusCodeCell(ws, iRow, iCol++, msDoc);
        this.InsertAndFormatContentCell(ws, iRow, iCol++, this.FormatIfMissing(msDoc.GetStatusCode().ToString()));
        this.InsertAndFormatRedirectCell(ws, iRow, iCol++, msDoc);
        this.InsertAndFormatRobotsCell(ws, iRow, iCol++, msDoc);
        this.InsertAndFormatContentCell(ws, iRow, iCol++, msDoc.GetDurationInSecondsFormatted());

        // Dates.
        this.InsertAndFormatDateCell(ws, iRow, iCol++, msDoc.GetCrawledDate());
        this.InsertAndFormatDateCell(ws, iRow, iCol++, msDoc.GetDateServer());
        this.InsertAndFormatDateCell(ws, iRow, iCol++, msDoc.GetDateModified());
        this.InsertAndFormatDateCell(ws, iRow, iCol++, msDoc.GetDateExpires());

        // Content metadata.
        this.InsertAndFormatContentCell(ws, iRow, iCol++, this.FormatIfMissing(msDoc.GetMimeType()));
        this.InsertAndFormatContentCell(ws, iRow, iCol++, this.FormatIfMissing(msDoc.GetCharacterSet()));
        this.InsertAndFormatContentCell(ws, iRow, iCol++, this.FormatIfMissing(msDoc.GetLocale()));
        this.InsertAndFormatContentCell(ws, iRow, iCol++, this.FormatIfMissing(msDoc.GetIsoLanguageCode()));
        this.InsertAndFormatContentCell(ws, iRow, iCol++, this.FormatIfMissing(msDoc.GetCanonical()));

        // Link structure.
        this.InsertAndFormatContentCell(ws, iRow, iCol++, msDoc.GetDepth().ToString());
        this.InsertAndFormatContentCell(ws, iRow, iCol++, msDoc.CountInlinks());
        this.InsertAndFormatContentCell(ws, iRow, iCol++, msDoc.CountOutlinks());
        this.InsertAndFormatContentCell(ws, iRow, iCol++, msDoc.CountHyperlinksIn());
        this.InsertAndFormatContentCell(ws, iRow, iCol++, msDoc.CountHyperlinksOut());
        this.InsertAndFormatContentCell(ws, iRow, iCol++, string.Format("{0:0.00}%", HyperlinkRatio[0]));
        this.InsertAndFormatContentCell(ws, iRow, iCol++, string.Format("{0:0.00}%", HyperlinkRatio[1]));

        // Page metadata.
        this.InsertAndFormatContentCell(ws, iRow, iCol++, msDoc.GetAuthor());
        this.InsertAndFormatContentCell(ws, iRow, iCol++, this.FormatIfMissing(msDoc.GetTitle()));
        this.InsertAndFormatContentCell(ws, iRow, iCol++, msDoc.GetTitleLength().ToString());
        this.InsertAndFormatContentCell(ws, iRow, iCol++, this.FormatIfMissing(msDoc.GetDescription()));
        this.InsertAndFormatContentCell(ws, iRow, iCol++, msDoc.GetDescriptionLength());
        this.InsertAndFormatContentCell(ws, iRow, iCol++, this.FormatIfMissing(msDoc.GetKeywords()));
        this.InsertAndFormatContentCell(ws, iRow, iCol++, msDoc.GetKeywordsLength().ToString());
        this.InsertAndFormatContentCell(ws, iRow, iCol++, msDoc.GetKeywordsCount().ToString());
        this.InsertAndFormatContentCell(ws, iRow, iCol++, this.FormatIfMissing(msDoc.GetErrorCondition()));

        iRow++;
    }

    // Wrap header + data rows in a table.
    ws.Range(1, 1, iRow - 1, iColMax).CreateTable();
}
/**************************************************************************/
// Writes the page-descriptions report to the CSV writer ws: a header record,
// then one record per HTML or PDF document that is neither external nor a
// redirect, containing URL, page language, detected language, occurrence
// count, description and description length.
private void BuildWorksheetPageDescriptions(
    MacroscopeJobMaster JobMaster,
    CsvWriter ws
)
{
    MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

    string[] HeaderLabels =
    {
        "URL",
        "Page Language",
        "Detected Language",
        "Occurrences",
        "Description",
        "Description Length"
    };

    foreach (string HeaderLabel in HeaderLabels)
    {
        ws.WriteField(HeaderLabel);
    }
    ws.NextRecord();

    foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
    {
        if (msDoc.GetIsExternal())
        {
            continue;
        }

        if (msDoc.GetIsRedirect())
        {
            continue;
        }

        // Only HTML and PDF documents are reported.
        var DocType = msDoc.GetDocumentType();
        bool IsReportable =
            (DocType == MacroscopeConstants.DocumentType.HTML)
            || (DocType == MacroscopeConstants.DocumentType.PDF);

        if (!IsReportable)
        {
            continue;
        }

        string Description = msDoc.GetDescription();
        string PageLanguage = msDoc.GetIsoLanguageCode();
        // NOTE(review): the "Detected Language" column is fed from GetTitleLanguage();
        // confirm whether a description-specific language was intended here.
        string DetectedLanguage = msDoc.GetTitleLanguage();
        int DescriptionLength = msDoc.GetDescriptionLength();
        int Occurrences = 0;

        // The occurrence count is only looked up for documents that have a description.
        if (DescriptionLength > 0)
        {
            Occurrences = DocCollection.GetStatsDescriptionCount(msDoc: msDoc);
        }

        this.InsertAndFormatUrlCell(ws, msDoc);
        this.InsertAndFormatContentCell(ws, this.FormatIfMissing(PageLanguage));
        this.InsertAndFormatContentCell(ws, this.FormatIfMissing(DetectedLanguage));
        this.InsertAndFormatContentCell(ws, this.FormatIfMissing(Occurrences.ToString()));
        this.InsertAndFormatContentCell(ws, this.FormatIfMissing(Description));
        this.InsertAndFormatContentCell(ws, this.FormatIfMissing(DescriptionLength.ToString()));

        ws.NextRecord();
    }
}
/**************************************************************************/
// Adds a worksheet named WorksheetLabel to wb with one row per XPath-extracted
// (label, value) pair of every document that DataExtractorRegexes accepts via
// CanApplyDataExtractorsToDocument(): URL, status code, status name,
// content type, extracted label, extracted value. Pairs with an empty label
// or value are skipped. The used range is finally converted to a table.
private void BuildWorksheetXpaths(
    MacroscopeJobMaster JobMaster,
    XLWorkbook wb,
    string WorksheetLabel
)
{
    var ws = wb.Worksheets.Add(WorksheetLabel);
    int iRow = 1;
    int iCol = 1;
    int iColMax = 1;
    MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

    {
        ws.Cell(iRow, iCol).Value = MacroscopeConstants.Url;
        iCol++;
        ws.Cell(iRow, iCol).Value = MacroscopeConstants.StatusCode;
        iCol++;
        ws.Cell(iRow, iCol).Value = MacroscopeConstants.Status;
        iCol++;
        ws.Cell(iRow, iCol).Value = MacroscopeConstants.ContentType;
        iCol++;
        ws.Cell(iRow, iCol).Value = "Extracted Label";
        iCol++;
        ws.Cell(iRow, iCol).Value = "Extracted Value";
    }

    iColMax = iCol;
    iRow++;

    foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
    {
        if (!this.DataExtractorRegexes.CanApplyDataExtractorsToDocument(msDoc: msDoc))
        {
            continue;
        }

        // Per-document values are only read once the document is known to be eligible.
        string Status = msDoc.GetStatusCode().ToString();
        string MimeType = msDoc.GetMimeType();

        foreach (KeyValuePair<string, string> DataExtractedPair in msDoc.IterateDataExtractedXpaths())
        {
            string ExtractedLabel = DataExtractedPair.Key;
            string ExtractedValue = DataExtractedPair.Value;

            if (string.IsNullOrEmpty(ExtractedLabel) || string.IsNullOrEmpty(ExtractedValue))
            {
                continue;
            }

            iCol = 1;

            // URL cell: green for internal documents, gray otherwise.
            this.InsertAndFormatUrlCell(ws, iRow, iCol, msDoc);

            if (msDoc.GetIsInternal())
            {
                ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Green);
            }
            else
            {
                ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Gray);
            }

            iCol++;

            this.InsertAndFormatStatusCodeCell(ws, iRow, iCol, msDoc);

            iCol++;

            this.InsertAndFormatContentCell(ws, iRow, iCol, this.FormatIfMissing(Status));

            iCol++;

            this.InsertAndFormatContentCell(ws, iRow, iCol, this.FormatIfMissing(MimeType));

            iCol++;

            this.InsertAndFormatContentCell(ws, iRow, iCol, this.FormatIfMissing(ExtractedLabel));

            iCol++;

            this.InsertAndFormatContentCell(ws, iRow, iCol, this.FormatIfMissing(ExtractedValue));

            iRow++;
        }
    }

    {
        var rangeData = ws.Range(1, 1, iRow - 1, iColMax);
        rangeData.CreateTable();
    }
}
/**************************************************************************/
// Adds a page-titles worksheet named WorksheetLabel to wb: a header row, then
// one row per HTML or PDF document that is neither external nor a redirect,
// with URL, page language, detected language, title occurrence count, title,
// title length and title pixel width. Cells are coloured according to the
// checks annotated below, and the used range is finally converted to a table.
private void BuildWorksheetPageTitles(
    MacroscopeJobMaster JobMaster,
    XLWorkbook wb,
    string WorksheetLabel
)
{
    var ws = wb.Worksheets.Add(WorksheetLabel);
    int iRow = 1;
    MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

    string[] HeaderLabels =
    {
        "URL",
        "Page Language",
        "Detected Language",
        "Occurrences",
        "Title",
        "Title Length",
        "Pixel Width"
    };
    int iColMax = HeaderLabels.Length;

    for (int iHeader = 0; iHeader < HeaderLabels.Length; iHeader++)
    {
        ws.Cell(iRow, iHeader + 1).Value = HeaderLabels[iHeader];
    }

    iRow++;

    foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
    {
        if (msDoc.GetIsExternal())
        {
            continue;
        }

        if (msDoc.GetIsRedirect())
        {
            continue;
        }

        // Only HTML and PDF documents are reported.
        var DocType = msDoc.GetDocumentType();
        bool IsHtml = (DocType == MacroscopeConstants.DocumentType.HTML);
        bool IsPdf = (DocType == MacroscopeConstants.DocumentType.PDF);

        if (!IsHtml && !IsPdf)
        {
            continue;
        }

        int iCol = 1;
        string PageLanguage = msDoc.GetIsoLanguageCode();
        string DetectedLanguage = msDoc.GetTitleLanguage();
        string Title = msDoc.GetTitle();
        int TitleLength = msDoc.GetTitleLength();
        int TitlePixelWidth = msDoc.GetTitlePixelWidth();
        int Occurrences = 0;

        // The occurrence count is only looked up for documents that have a title.
        if (TitleLength > 0)
        {
            Occurrences = DocCollection.GetStatsTitleCount(msDoc: msDoc);
        }

        // Both language cells are red when declared and detected languages differ.
        XLColor LanguageColor = (PageLanguage != DetectedLanguage) ? XLColor.Red : XLColor.Green;

        // URL: green for internal documents, gray otherwise.
        this.InsertAndFormatUrlCell(ws, iRow, iCol, msDoc);
        ws.Cell(iRow, iCol).Style.Font.SetFontColor(msDoc.GetIsInternal() ? XLColor.Green : XLColor.Gray);
        iCol++;

        // Page language: HTML goes through FormatIfMissing, PDF is written as-is.
        if (IsHtml)
        {
            this.InsertAndFormatContentCell(ws, iRow, iCol, this.FormatIfMissing(PageLanguage));
        }
        else if (IsPdf)
        {
            this.InsertAndFormatContentCell(ws, iRow, iCol, PageLanguage);
        }
        ws.Cell(iRow, iCol).Style.Font.SetFontColor(LanguageColor);
        iCol++;

        // Detected language.
        this.InsertAndFormatContentCell(ws, iRow, iCol, this.FormatIfMissing(DetectedLanguage));
        ws.Cell(iRow, iCol).Style.Font.SetFontColor(LanguageColor);
        iCol++;

        // Occurrences: orange when the title is used more than once.
        this.InsertAndFormatContentCell(ws, iRow, iCol, this.FormatIfMissing(Occurrences.ToString()));
        ws.Cell(iRow, iCol).Style.Font.SetFontColor((Occurrences > 1) ? XLColor.Orange : XLColor.Green);
        iCol++;

        // Title: an empty title is shown as a red "MISSING".
        this.InsertAndFormatContentCell(ws, iRow, iCol, this.FormatIfMissing(Title));
        if (TitleLength > 0)
        {
            ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Green);
        }
        else
        {
            ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Red);
            ws.Cell(iRow, iCol).Value = "MISSING";
        }
        iCol++;

        // Title length: red when outside the preferred min/max range.
        this.InsertAndFormatContentCell(ws, iRow, iCol, TitleLength);
        if ((TitleLength < MacroscopePreferencesManager.GetTitleMinLen())
            || (TitleLength > MacroscopePreferencesManager.GetTitleMaxLen()))
        {
            ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Red);
        }
        else
        {
            ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Green);
        }
        iCol++;

        // Pixel width: red when over, or within 20 pixels of, the preferred maximum;
        // orange when the width is zero or negative; green otherwise.
        this.InsertAndFormatContentCell(ws, iRow, iCol, TitlePixelWidth);
        if ((TitlePixelWidth > MacroscopePreferencesManager.GetTitleMaxPixelWidth())
            || (TitlePixelWidth >= (MacroscopePreferencesManager.GetTitleMaxPixelWidth() - 20)))
        {
            ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Red);
        }
        else if (TitlePixelWidth <= 0)
        {
            ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Orange);
        }
        else
        {
            ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Green);
        }

        iRow++;
    }

    // Wrap header + data rows in a table.
    ws.Range(1, 1, iRow - 1, iColMax).CreateTable();
}
/**************************************************************************/
// Renders the keywords row for one document. Documents that are external,
// redirects, or neither HTML nor PDF are ignored. If DisplayListView already
// holds an item keyed by Url + keywords text, its sub-items are refreshed in
// place; otherwise a new item is built and appended to ListViewItems.
// Sub-items: URL, occurrences, keywords text, keywords length, keywords count.
protected override void RenderListView(
    List<ListViewItem> ListViewItems,
    MacroscopeDocumentCollection DocCollection,
    MacroscopeDocument msDoc,
    string Url
)
{
    if (msDoc.GetIsExternal())
    {
        return;
    }

    if (msDoc.GetIsRedirect())
    {
        return;
    }

    var DocType = msDoc.GetDocumentType();

    if ((DocType != MacroscopeConstants.DocumentType.HTML)
        && (DocType != MacroscopeConstants.DocumentType.PDF))
    {
        return;
    }

    ListViewItem lvItem = null;
    string KeywordsText = msDoc.GetKeywords();
    int KeywordsLength = msDoc.GetKeywordsLength();
    int KeywordsCount = msDoc.GetKeywordsCount();
    string PairKey = string.Join("", Url, KeywordsText);
    int Occurrences = 0;

    // The occurrence count is only looked up for documents that have keywords.
    if (KeywordsLength > 0)
    {
        Occurrences = DocCollection.GetStatsKeywordsCount(msDoc);
    }

    if (!this.DisplayListView.Items.ContainsKey(PairKey))
    {
        // New row: built here, added to the list view later by the caller via ListViewItems.
        try
        {
            lvItem = new ListViewItem(PairKey);
            lvItem.UseItemStyleForSubItems = false;
            lvItem.Name = PairKey;

            lvItem.SubItems[0].Text = Url;
            lvItem.SubItems.Add(Occurrences.ToString());
            lvItem.SubItems.Add(KeywordsText);
            lvItem.SubItems.Add(KeywordsLength.ToString());
            lvItem.SubItems.Add(KeywordsCount.ToString());

            ListViewItems.Add(lvItem);
        }
        catch (Exception ex)
        {
            DebugMsg(string.Format("MacroscopeDisplayKeywords 2: {0}", ex.Message));
        }
    }
    else
    {
        // Existing row: refresh its texts in place.
        try
        {
            lvItem = this.DisplayListView.Items[PairKey];

            lvItem.SubItems[0].Text = Url;
            lvItem.SubItems[1].Text = Occurrences.ToString();
            lvItem.SubItems[2].Text = KeywordsText;
            lvItem.SubItems[3].Text = KeywordsLength.ToString();
            lvItem.SubItems[4].Text = KeywordsCount.ToString();
        }
        catch (Exception ex)
        {
            DebugMsg(string.Format("MacroscopeDisplayKeywords 1: {0}", ex.Message));
        }
    }

    if (lvItem == null)
    {
        return;
    }

    lvItem.ForeColor = Color.Blue;

    // URL -------------------------------------------------------------//

    lvItem.SubItems[0].ForeColor = msDoc.GetIsInternal() ? Color.Green : Color.Gray;

    // Check Missing Text ----------------------------------------------//

    // Internal documents: red when no keywords are present, green otherwise.
    // Non-internal documents: gray.
    Color DetailColor;

    if (!msDoc.GetIsInternal())
    {
        DetailColor = Color.Gray;
    }
    else if (KeywordsLength <= 0)
    {
        DetailColor = Color.Red;
    }
    else
    {
        DetailColor = Color.Green;
    }

    for (int iSubItem = 1; iSubItem <= 4; iSubItem++)
    {
        lvItem.SubItems[iSubItem].ForeColor = DetailColor;
    }
}
/**************************************************************************/
// Writes the page-titles report to the CSV writer ws: a header record, then
// one record per HTML or PDF document that is neither external nor a redirect,
// containing URL, page language, detected language, title occurrence count,
// title, title length and title pixel width.
private void BuildWorksheetPageTitles(
    MacroscopeJobMaster JobMaster,
    CsvWriter ws
)
{
    MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

    string[] HeaderLabels =
    {
        "URL",
        "Page Language",
        "Detected Language",
        "Occurrences",
        "Title",
        "Title Length",
        "Pixel Width"
    };

    foreach (string HeaderLabel in HeaderLabels)
    {
        ws.WriteField(HeaderLabel);
    }
    ws.NextRecord();

    foreach (string Url in DocCollection.DocumentKeys())
    {
        MacroscopeDocument msDoc = DocCollection.GetDocument(Url);

        if (msDoc.GetIsExternal())
        {
            continue;
        }

        if (msDoc.GetIsRedirect())
        {
            continue;
        }

        // Only HTML and PDF documents are reported.
        if (!msDoc.GetIsHtml() && !msDoc.GetIsPdf())
        {
            continue;
        }

        string Title = msDoc.GetTitle();
        string PageLanguage = msDoc.GetIsoLanguageCode();
        string DetectedLanguage = msDoc.GetTitleLanguage();
        int TitleLength = msDoc.GetTitleLength();
        int TitlePixelWidth = msDoc.GetTitlePixelWidth();
        int Occurrences = 0;

        // The occurrence count is only looked up for documents that have a title.
        if (TitleLength > 0)
        {
            Occurrences = DocCollection.GetStatsTitleCount(msDoc: msDoc);
        }

        this.InsertAndFormatUrlCell(ws, msDoc);
        this.InsertAndFormatContentCell(ws, this.FormatIfMissing(PageLanguage));
        this.InsertAndFormatContentCell(ws, this.FormatIfMissing(DetectedLanguage));
        this.InsertAndFormatContentCell(ws, this.FormatIfMissing(Occurrences.ToString()));
        this.InsertAndFormatContentCell(ws, this.FormatIfMissing(Title));
        this.InsertAndFormatContentCell(ws, this.FormatIfMissing(TitleLength.ToString()));
        this.InsertAndFormatContentCell(ws, this.FormatIfMissing(TitlePixelWidth.ToString()));

        ws.NextRecord();
    }
}
/**************************************************************************/
// Writes the overview report to the CSV writer ws: a header record followed by
// one record per document in the job's document collection, with one field
// per header label below.
private void BuildWorksheetOverview(
    MacroscopeJobMaster JobMaster,
    CsvWriter ws
)
{
    MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

    {
        ws.WriteField("URL");
        ws.WriteField("Status Code");
        ws.WriteField("Status");
        ws.WriteField("Redirect");
        ws.WriteField("Robots");
        ws.WriteField("Duration");
        ws.WriteField("Crawled Date");
        ws.WriteField("Server Date");
        ws.WriteField("Modified Date");
        ws.WriteField("Expires Date");
        ws.WriteField("Content-Type");
        ws.WriteField("Charset");
        ws.WriteField("Locale");
        ws.WriteField("Language");
        ws.WriteField("Canonical");
        ws.WriteField("Page Depth");
        ws.WriteField("Links In");
        ws.WriteField("Links Out");
        ws.WriteField("Hyperlinks In");
        ws.WriteField("Hyperlinks Out");
        // Previously misspelled "Ration In".
        ws.WriteField("Ratio In");
        ws.WriteField("Ratio Out");
        ws.WriteField("Author");
        ws.WriteField("Title");
        ws.WriteField("Title Length");
        ws.WriteField("Description");
        ws.WriteField("Description Length");
        ws.WriteField("Keywords");
        ws.WriteField("Keywords Length");
        ws.WriteField("Keywords Count");
        ws.WriteField("Error Condition");

        ws.NextRecord();
    }

    foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
    {
        // Two-element list: index 0 feeds "Ratio In", index 1 feeds "Ratio Out".
        List<decimal> HyperlinkRatio = DocCollection.GetDocumentHyperlinksRatio(Url: msDoc.GetUrl());

        this.InsertAndFormatUrlCell(ws, msDoc);
        this.InsertAndFormatStatusCodeCell(ws, msDoc);
        this.InsertAndFormatContentCell(ws, msDoc.GetStatusCode().ToString());
        this.InsertAndFormatRedirectCell(ws, msDoc);
        this.InsertAndFormatRobotsCell(ws, msDoc);
        this.InsertAndFormatContentCell(ws, msDoc.GetDurationInSecondsFormatted());

        // Dates.
        this.InsertAndFormatContentCell(ws, this.FormatIfMissing(msDoc.GetCrawledDate()));
        this.InsertAndFormatContentCell(ws, this.FormatIfMissing(msDoc.GetDateServer()));
        this.InsertAndFormatContentCell(ws, this.FormatIfMissing(msDoc.GetDateModified()));
        this.InsertAndFormatContentCell(ws, this.FormatIfMissing(msDoc.GetDateExpires()));

        // Content metadata.
        this.InsertAndFormatContentCell(ws, this.FormatIfMissing(msDoc.GetMimeType()));
        this.InsertAndFormatContentCell(ws, this.FormatIfMissing(msDoc.GetCharacterSet()));
        this.InsertAndFormatContentCell(ws, this.FormatIfMissing(msDoc.GetLocale()));
        this.InsertAndFormatContentCell(ws, this.FormatIfMissing(msDoc.GetIsoLanguageCode()));
        this.InsertAndFormatContentCell(ws, this.FormatIfMissing(msDoc.GetCanonical()));

        // Link structure.
        this.InsertAndFormatContentCell(ws, msDoc.GetDepth().ToString());
        this.InsertAndFormatContentCell(ws, msDoc.CountInlinks().ToString());
        this.InsertAndFormatContentCell(ws, msDoc.CountOutlinks().ToString());
        this.InsertAndFormatContentCell(ws, msDoc.CountHyperlinksIn().ToString());
        this.InsertAndFormatContentCell(ws, msDoc.CountHyperlinksOut().ToString());
        this.InsertAndFormatContentCell(ws, this.FormatIfMissing(string.Format("{0:0.00}%", HyperlinkRatio[0])));
        this.InsertAndFormatContentCell(ws, this.FormatIfMissing(string.Format("{0:0.00}%", HyperlinkRatio[1])));

        // Page metadata.
        this.InsertAndFormatContentCell(ws, msDoc.GetAuthor());
        this.InsertAndFormatContentCell(ws, this.FormatIfMissing(msDoc.GetTitle()));
        this.InsertAndFormatContentCell(ws, msDoc.GetTitleLength().ToString());
        this.InsertAndFormatContentCell(ws, this.FormatIfMissing(msDoc.GetDescription()));
        this.InsertAndFormatContentCell(ws, msDoc.GetDescriptionLength().ToString());
        this.InsertAndFormatContentCell(ws, this.FormatIfMissing(msDoc.GetKeywords()));
        this.InsertAndFormatContentCell(ws, msDoc.GetKeywordsLength().ToString());
        this.InsertAndFormatContentCell(ws, msDoc.GetKeywordsCount().ToString());
        this.InsertAndFormatContentCell(ws, this.FormatIfMissing(msDoc.GetErrorCondition()));

        ws.NextRecord();
    }
}
/**************************************************************************/
// Adds a page-authors worksheet named WorksheetLabel to wb: a header row, then
// one row per HTML or PDF document that is neither external nor a redirect,
// with URL, page language, detected language and author.
// The used range is finally converted to a table.
private void BuildWorksheetPageAuthors(
    MacroscopeJobMaster JobMaster,
    XLWorkbook wb,
    string WorksheetLabel
)
{
    var ws = wb.Worksheets.Add(WorksheetLabel);
    int iRow = 1;
    int iCol = 1;
    int iColMax = 1;
    MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

    {
        ws.Cell(iRow, iCol).Value = "URL";
        iCol++;
        ws.Cell(iRow, iCol).Value = "Page Language";
        iCol++;
        ws.Cell(iRow, iCol).Value = "Detected Language";
        iCol++;
        ws.Cell(iRow, iCol).Value = "Author";
    }

    iColMax = iCol;
    iRow++;

    foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
    {
        bool Proceed = false;

        if (msDoc.GetIsExternal())
        {
            continue;
        }

        if (msDoc.GetIsRedirect())
        {
            continue;
        }

        // Only HTML and PDF documents are reported.
        switch (msDoc.GetDocumentType())
        {
            case MacroscopeConstants.DocumentType.HTML:
            case MacroscopeConstants.DocumentType.PDF:
                Proceed = true;
                break;
            default:
                break;
        }

        if (!Proceed)
        {
            continue;
        }

        iCol = 1;

        string PageLanguage = msDoc.GetIsoLanguageCode();
        string DetectedLanguage = msDoc.GetTitleLanguage();
        string Author = msDoc.GetAuthor();

        // URL: green for internal documents, gray otherwise.
        this.InsertAndFormatUrlCell(ws, iRow, iCol, msDoc);

        if (msDoc.GetIsInternal())
        {
            ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Green);
        }
        else
        {
            ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Gray);
        }

        iCol++;

        // Both language cells are red when declared and detected languages differ.
        this.InsertAndFormatContentCell(ws, iRow, iCol, this.FormatIfMissing(PageLanguage));

        if (PageLanguage != DetectedLanguage)
        {
            ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Red);
        }
        else
        {
            ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Green);
        }

        iCol++;

        this.InsertAndFormatContentCell(ws, iRow, iCol, this.FormatIfMissing(DetectedLanguage));

        if (PageLanguage != DetectedLanguage)
        {
            ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Red);
        }
        else
        {
            ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Green);
        }

        iCol++;

        this.InsertAndFormatContentCell(ws, iRow, iCol, this.FormatIfMissing(Author));

        // Null-safe: a document without an author must not abort the whole export.
        // A missing author keeps whatever colour the cell formatting applied.
        if (!string.IsNullOrEmpty(Author))
        {
            ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Green);
        }

        iRow++;
    }

    {
        var rangeData = ws.Range(1, 1, iRow - 1, iColMax);
        rangeData.CreateTable();
    }
}
/**************************************************************************/
// Rebuilds the HrefLang list view from scratch. Clears items and columns,
// creates five fixed columns plus three columns per entry in LocalesList
// (URL, server date, modified date), then adds one row per internal,
// non-redirect HTML document. Does nothing when DocCollection is empty.
// A progress form is updated per document when progress dialogues are enabled.
private void RenderListView(
    MacroscopeDocumentCollection DocCollection,
    Dictionary<string, string> LocalesList
)
{
    MacroscopeAllowedHosts AllowedHosts = this.MainForm.GetJobMaster().GetAllowedHosts();

    // Maps a locale (and its two date-column labels) to its column index.
    SortedDictionary<string, int> LocaleColsTable = new SortedDictionary<string, int>();

    // Also protects the percentage division below from a zero divisor.
    if (DocCollection.CountDocuments() == 0)
    {
        return;
    }

    List<ListViewItem> ListViewItems = new List<ListViewItem>();

    MacroscopeSinglePercentageProgressForm ProgressForm = new MacroscopeSinglePercentageProgressForm(this.MainForm);
    decimal Count = 0;
    decimal TotalDocs = (decimal)DocCollection.CountDocuments();
    decimal MajorPercentage = ((decimal)100 / TotalDocs) * Count;

    if (MacroscopePreferencesManager.GetShowProgressDialogues())
    {
        ProgressForm.UpdatePercentages(
            Title: "Preparing Display",
            Message: "Processing document collection for display:",
            MajorPercentage: MajorPercentage,
            ProgressLabelMajor: string.Format("Document {0} / {1}", Count, TotalDocs)
        );
    }

    this.DisplayListView.Items.Clear();
    this.DisplayListView.Columns.Clear();

    {
        // Index of the first per-locale column, after the five fixed columns.
        int LocaleColCount = 5;

        this.DisplayListView.Columns.Add("URL", "URL");
        this.DisplayListView.Columns.Add("Status Code", "Status Code");
        this.DisplayListView.Columns.Add("Site Locale", "Site Locale");
        this.DisplayListView.Columns.Add("HrefLang Present", "HrefLang Present");
        this.DisplayListView.Columns.Add("Title", "Title");

        foreach (string Locale in LocalesList.Keys)
        {
            string LocaleLabel = Locale.ToUpper();
            string DateServerLabel = string.Format("{0} Date Server", Locale.ToUpper());
            string DateModifiedLabel = string.Format("{0} Date Modified", Locale.ToUpper());

            this.DisplayListView.Columns.Add(LocaleLabel, LocaleLabel);
            this.DisplayListView.Columns.Add(DateServerLabel, DateServerLabel);
            this.DisplayListView.Columns.Add(DateModifiedLabel, DateModifiedLabel);

            LocaleColsTable[Locale] = LocaleColCount;
            LocaleColCount++;
            LocaleColsTable[DateServerLabel] = LocaleColCount;
            LocaleColCount++;
            LocaleColsTable[DateModifiedLabel] = LocaleColCount;
            LocaleColCount++;
        }
    }

    foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
    {
        // Only internal, non-redirect HTML documents get a row.
        bool Proceed = false;

        if (msDoc.GetIsInternal())
        {
            Proceed = true;

            if (msDoc.GetIsRedirect())
            {
                Proceed = false;
            }

            if (!msDoc.IsDocumentType(Type: MacroscopeConstants.DocumentType.HTML))
            {
                Proceed = false;
            }
        }

        if (Proceed)
        {
            Dictionary<string, MacroscopeHrefLang> HrefLangsTable = msDoc.GetHrefLangs();
            string DocUrl = msDoc.GetUrl();
            string PairKey = UrlToDigest(DocUrl).ToString();
            HttpStatusCode StatusCode = msDoc.GetStatusCode();
            int StatusCodeNum = (int)StatusCode;
            MacroscopeConstants.Specifiers HrefLangPresent = MacroscopeConstants.Specifiers.UNSPECIFIED;
            string DocLocale = msDoc.GetLocale();
            string DocTitle = msDoc.GetTitle();
            ListViewItem lvItem = null;

            // Treated as specified only when the table holds more than one entry.
            if ((HrefLangsTable != null) && (HrefLangsTable.Count > 1))
            {
                HrefLangPresent = MacroscopeConstants.Specifiers.SPECIFIED;
            }
            else
            {
                HrefLangPresent = MacroscopeConstants.Specifiers.UNSPECIFIED;
            }

            if (this.DisplayListView.Items.ContainsKey(PairKey))
            {
                lvItem = this.DisplayListView.Items[PairKey];
            }
            else
            {
                lvItem = new ListViewItem(PairKey);
                lvItem.UseItemStyleForSubItems = false;
                lvItem.Name = PairKey;

                // Placeholder sub-items for the fixed columns...
                lvItem.SubItems.Add("");
                lvItem.SubItems.Add("");
                lvItem.SubItems.Add("");
                lvItem.SubItems.Add("");
                lvItem.SubItems.Add("");

                // ...and three per locale.
                for (int i = 0; i < LocalesList.Keys.Count; i++)
                {
                    lvItem.SubItems.Add("");
                    lvItem.SubItems.Add("");
                    lvItem.SubItems.Add("");
                }

                ListViewItems.Add(lvItem);
            }

            if (lvItem != null)
            {
                try
                {
                    lvItem.SubItems[ColUrl].Text = DocUrl;
                    lvItem.SubItems[ColStatusCode].Text = StatusCode.ToString();
                    lvItem.SubItems[ColSiteLocale].Text = DocLocale;
                    lvItem.SubItems[ColHrefLangPresent].Text = "";
                    lvItem.SubItems[ColTitle].Text = DocTitle;

                    switch (HrefLangPresent)
                    {
                        case MacroscopeConstants.Specifiers.SPECIFIED:
                            lvItem.SubItems[ColHrefLangPresent].ForeColor = Color.Green;
                            lvItem.SubItems[ColHrefLangPresent].Text = "SPECIFIED";
                            break;
                        default:
                            lvItem.SubItems[ColHrefLangPresent].ForeColor = Color.Red;
                            lvItem.SubItems[ColHrefLangPresent].Text = "UNSPECIFIED";
                            break;
                    }

                    if (AllowedHosts.IsInternalUrl(DocUrl))
                    {
                        lvItem.SubItems[ColUrl].ForeColor = Color.Green;
                    }
                    else
                    {
                        lvItem.SubItems[ColUrl].ForeColor = Color.Gray;
                    }

                    // Status code colour: 1xx/2xx green, 3xx orange, 4xx/5xx red, anything else gray.
                    if ((StatusCodeNum >= 100) && (StatusCodeNum <= 299))
                    {
                        lvItem.SubItems[ColStatusCode].ForeColor = Color.Green;
                    }
                    else if ((StatusCodeNum >= 300) && (StatusCodeNum <= 399))
                    {
                        lvItem.SubItems[ColStatusCode].ForeColor = Color.Orange;
                    }
                    else if ((StatusCodeNum >= 400) && (StatusCodeNum <= 599))
                    {
                        lvItem.SubItems[ColStatusCode].ForeColor = Color.Red;
                    }
                    else
                    {
                        // Fallback colours the status-code cell itself; this branch
                        // previously coloured the site-locale cell by mistake.
                        lvItem.SubItems[ColStatusCode].ForeColor = Color.Gray;
                    }

                    foreach (string Locale in LocalesList.Keys)
                    {
                        if (!string.IsNullOrEmpty(Locale))
                        {
                            string HrefLangUrl = null;
                            DateTime HrefLangDateServer = new DateTime();
                            DateTime HrefLangDateModified = new DateTime();
                            int LocaleCol = LocaleColsTable[Locale];

                            if ((HrefLangsTable != null) && (HrefLangsTable.Count > 0))
                            {
                                if (HrefLangsTable.ContainsKey(Locale))
                                {
                                    MacroscopeHrefLang HrefLangAlternate = HrefLangsTable[Locale];

                                    if (HrefLangAlternate != null)
                                    {
                                        HrefLangUrl = HrefLangAlternate.GetUrl();
                                        HrefLangDateServer = HrefLangAlternate.GetDateServer();
                                        HrefLangDateModified = HrefLangAlternate.GetDateModified();
                                    }
                                }
                            }

                            // The locale's URL column is followed by its server-date and modified-date columns.
                            if (!string.IsNullOrEmpty(HrefLangUrl))
                            {
                                lvItem.SubItems[LocaleCol].ForeColor = Color.Blue;
                                lvItem.SubItems[LocaleCol].Text = HrefLangUrl;
                                lvItem.SubItems[LocaleCol + 1].Text = HrefLangDateServer.ToString();
                                lvItem.SubItems[LocaleCol + 2].Text = HrefLangDateModified.ToString();
                            }
                            else
                            {
                                lvItem.SubItems[LocaleCol].ForeColor = Color.Red;
                                lvItem.SubItems[LocaleCol].Text = "NOT SPECIFIED";
                                lvItem.SubItems[LocaleCol + 1].Text = "NOT SPECIFIED";
                                lvItem.SubItems[LocaleCol + 2].Text = "NOT SPECIFIED";
                            }
                        }
                    }
                }
                catch (Exception ex)
                {
                    DebugMsg(string.Format("MacroscopeDisplayHrefLang: {0}", ex.Message));
                    DebugMsg(string.Format("MacroscopeDisplayHrefLang: {0}", ex.StackTrace));
                }
            }
            else
            {
                DebugMsg(string.Format("MacroscopeDisplayHrefLang NOT SPECIFIED: {0}", PairKey));
            }
        }

        if (MacroscopePreferencesManager.GetShowProgressDialogues())
        {
            Count++;
            MajorPercentage = ((decimal)100 / TotalDocs) * Count;

            ProgressForm.UpdatePercentages(
                Title: null,
                Message: null,
                MajorPercentage: MajorPercentage,
                ProgressLabelMajor: string.Format("Document {0} / {1}", Count, TotalDocs)
            );
        }
    }

    this.DisplayListView.Items.AddRange(ListViewItems.ToArray());

    // Auto-size to content first, then pin the fixed columns to set widths.
    this.DisplayListView.AutoResizeColumns(ColumnHeaderAutoResizeStyle.ColumnContent);

    this.DisplayListView.Columns[ColUrl].Width = 300;
    this.DisplayListView.Columns[ColStatusCode].Width = 80;
    this.DisplayListView.Columns[ColSiteLocale].Width = 100;
    this.DisplayListView.Columns[ColTitle].Width = 100;

    if (MacroscopePreferencesManager.GetShowProgressDialogues())
    {
        ProgressForm.DoClose();
    }

    if (ProgressForm != null)
    {
        ProgressForm.Dispose();
    }
}
/**************************************************************************/

/// <summary>
/// Renders into the list view every document of <paramref name="DocCollection"/>
/// whose URL contains <paramref name="UrlFragment"/>, optionally reporting
/// progress through a percentage dialogue.
/// </summary>
/// <param name="DocCollection">Collection whose documents are scanned.</param>
/// <param name="UrlFragment">
/// Substring searched for in each document URL, using
/// <see cref="StringComparison.CurrentCulture"/>.
/// </param>
public void RenderListViewSearchSourceUrls(
  MacroscopeDocumentCollection DocCollection,
  string UrlFragment
)
{

  List<ListViewItem> ListViewItems = new List<ListViewItem>(DocCollection.CountDocuments());

  MacroscopeSinglePercentageProgressForm ProgressForm = new MacroscopeSinglePercentageProgressForm(this.MainForm);

  // try/finally guarantees the progress form is disposed even if rendering throws.
  try
  {

    decimal Count = 0;
    decimal TotalDocs = (decimal)DocCollection.CountDocuments();

    if (MacroscopePreferencesManager.GetShowProgressDialogues())
    {
      ProgressForm.UpdatePercentages(
        Title: "Displaying Links",
        Message: "Processing links in document collection for display:",
        // An empty collection would otherwise raise DivideByZeroException.
        MajorPercentage: (TotalDocs > 0) ? (((decimal)100 / TotalDocs) * Count) : 0m,
        ProgressLabelMajor: "Documents Processed"
      );
    }

    foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
    {

      string Url = msDoc.GetUrl();

      // A document without a URL cannot match the fragment; skip it instead of crashing.
      if ((Url != null) && (Url.IndexOf(UrlFragment, StringComparison.CurrentCulture) >= 0))
      {
        this.RenderListView(
          ListViewItems: ListViewItems,
          DocCollection: DocCollection,
          msDoc: msDoc,
          Url: Url
        );
      }

      if (MacroscopePreferencesManager.GetShowProgressDialogues())
      {

        Count++;

        // The total is re-read on every iteration, as the count may change while iterating.
        TotalDocs = (decimal)DocCollection.CountDocuments();

        ProgressForm.UpdatePercentages(
          Title: null,
          Message: null,
          MajorPercentage: (TotalDocs > 0) ? (((decimal)100 / TotalDocs) * Count) : 0m,
          ProgressLabelMajor: null
        );

      }

    }

    // Newly created items are added in one batch.
    this.DisplayListView.Items.AddRange(ListViewItems.ToArray());

    if (MacroscopePreferencesManager.GetShowProgressDialogues())
    {
      ProgressForm.DoClose();
    }

  }
  finally
  {
    ProgressForm.Dispose();
  }

}
/**************************************************************************/

/// <summary>
/// Resets this job master's state for a new job: run-time mode, credentials,
/// document collection, allowed hosts, named queues, worker-thread bookkeeping,
/// crawl limits, job history, progress, locales and robots data.
/// </summary>
/// <param name="JobRunTimeMode">Run-time mode stored in <c>RunTimeMode</c>.</param>
private void InitializeJobMaster(MacroscopeConstants.RunTimeMode JobRunTimeMode)
{

  GC.Collect();

  /*
   * Event-log based job logging, currently disabled:
   *
   * {
   *   this.JobMasterLog = new EventLog ();
   *   this.JobMasterLog.Source = MacroscopeConstants.MainEventLogSourceName;
   *   this.JobGuid = Guid.NewGuid();
   *   this.LogEntry( string.Format( "Starting Job" ) );
   * }
   */

  this.RunTimeMode = JobRunTimeMode;

  // HTTP credentials are only taken over when a task controller is attached.
  if (this.TaskController != null)
  {
    this.CredentialsHttp = this.TaskController.IGetCredentialsHttp();
  }

  this.DocCollection = new MacroscopeDocumentCollection(JobMaster: this);

  this.AllowedHosts = new MacroscopeAllowedHosts();

  /** BEGIN: Named Queues *************************************************/

  this.NamedQueueJobItems = new MacroscopeNamedQueue<MacroscopeJobItem>();

  this.NamedQueueJobItems.CreateNamedQueue(
    Name: MacroscopeConstants.NamedQueueUrlList,
    QueueMode: MacroscopeNamedQueue<MacroscopeJobItem>.MODE.USE_HISTORY
  );

  this.NamedQueue = new MacroscopeNamedQueue<string>();

  // Display queues, created in exactly this order.
  var DisplayQueueNames = new[] {
    MacroscopeConstants.NamedQueueDisplayQueue,
    MacroscopeConstants.NamedQueueDisplayStructure,
    MacroscopeConstants.NamedQueueDisplayStructureLinkCounts,
    MacroscopeConstants.NamedQueueDisplayHierarchy,
    MacroscopeConstants.NamedQueueDisplayCanonicalAnalysis,
    MacroscopeConstants.NamedQueueDisplayHrefLang,
    MacroscopeConstants.NamedQueueDisplayErrors,
    MacroscopeConstants.NamedQueueDisplayHostnames,
    MacroscopeConstants.NamedQueueDisplayRedirectsAudit,
    MacroscopeConstants.NamedQueueDisplayLinks,
    MacroscopeConstants.NamedQueueDisplayHyperlinks,
    MacroscopeConstants.NamedQueueDisplayUriAnalysis,
    MacroscopeConstants.NamedQueueDisplayPageTitles,
    MacroscopeConstants.NamedQueueDisplayPageDescriptions,
    MacroscopeConstants.NamedQueueDisplayPageKeywords,
    MacroscopeConstants.NamedQueueDisplayPageHeadings,
    MacroscopeConstants.NamedQueueDisplayPageText,
    MacroscopeConstants.NamedQueueDisplayStylesheets,
    MacroscopeConstants.NamedQueueDisplayImages,
    MacroscopeConstants.NamedQueueDisplayJavascripts,
    MacroscopeConstants.NamedQueueDisplayAudios,
    MacroscopeConstants.NamedQueueDisplayVideos,
    MacroscopeConstants.NamedQueueDisplaySitemaps,
    MacroscopeConstants.NamedQueueDisplayEmailAddresses,
    MacroscopeConstants.NamedQueueDisplayTelephoneNumbers,
    MacroscopeConstants.NamedQueueDisplayCustomFilters,
    MacroscopeConstants.NamedQueueDisplayDataExtractorsCssSelectors,
    MacroscopeConstants.NamedQueueDisplayDataExtractorsRegexes,
    MacroscopeConstants.NamedQueueDisplayDataExtractorsXpaths,
    MacroscopeConstants.NamedQueueDisplayRemarks
  };

  foreach (var DisplayQueueName in DisplayQueueNames)
  {
    this.NamedQueue.CreateNamedQueue(Name: DisplayQueueName);
  }

  /** END: Named Queues ***************************************************/

  this.CrawlDelay = 0;

  // AdjustThreadsMax() runs before ThreadsMax is used to size the semaphore.
  this.AdjustThreadsMax();

  this.ThreadsRunning = 0;
  this.ThreadsStop = false;
  this.ThreadsDict = new Dictionary<int, Boolean>();

  // The semaphore starts with no free slots and is then released up to ThreadsMax.
  this.SemaphoreWorkers = new Semaphore(0, this.ThreadsMax);
  this.SemaphoreWorkers.Release(this.ThreadsMax);

  this.Depth = MacroscopePreferencesManager.GetDepth();

  this.PageLimit = MacroscopePreferencesManager.GetPageLimit();
  this.PageLimitCount = 0;

  this.PagesFound = 0;

  this.ParentStartingDirectory = "";
  this.ChildStartingDirectory = "";

  this.JobHistory = new MacroscopeJobHistory();

  this.InitProgress();

  this.Locales = new Dictionary<string, string>(32);

  this.Robots = new MacroscopeRobots();

  this.BlockedByRobots = new Dictionary<string, Boolean>();

}
/**************************************************************************/

/// <summary>
/// Writes a CSV worksheet listing every document whose ETag is shared with at
/// least one other document. Each row holds the numeric status code, the
/// status, the number of occurrences of the ETag, the ETag and the URL.
/// </summary>
/// <param name="JobMaster">Job master supplying the document collection.</param>
/// <param name="ws">CSV writer receiving the header row and the data rows.</param>
private void BuildWorksheetPageDuplicateEtags(
  MacroscopeJobMaster JobMaster,
  CsvWriter ws
)
{

  decimal CountOuter = 0;
  decimal CountInner = 0;

  MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();
  int Capacity = DocCollection.CountDocuments();

  // ETag => number of documents carrying that ETag.
  Dictionary<string, int> DuplicatesList = new Dictionary<string, int>(Capacity);

  // URL => document, restricted to documents that have a non-empty ETag.
  Dictionary<string, MacroscopeDocument> DuplicatesDocList = new Dictionary<string, MacroscopeDocument>(Capacity);

  foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
  {

    string Etag = msDoc.GetEtag();

    if (string.IsNullOrEmpty(Etag))
    {
      continue;
    }

    string Url = msDoc.GetUrl();

    if (!DuplicatesDocList.ContainsKey(Url))
    {
      DuplicatesDocList.Add(Url, msDoc);
    }

    // TryGetValue leaves Seen at 0 for a new ETag, so one lookup covers both cases.
    int Seen;
    DuplicatesList.TryGetValue(Etag, out Seen);
    DuplicatesList[Etag] = Seen + 1;

  }

  {
    ws.WriteField("Status Code");
    ws.WriteField("Status");
    ws.WriteField("Occurrences");
    ws.WriteField("ETag");
    ws.WriteField("URL");
    ws.NextRecord();
  }

  // Each progress bar is measured against the collection it actually walks,
  // so that it reaches 100% on the last element. Both totals are non-zero
  // whenever the corresponding loop body executes.
  decimal TotalEtags = (decimal)DuplicatesList.Count;
  decimal TotalEtagDocs = (decimal)DuplicatesDocList.Count;

  foreach (KeyValuePair<string, int> EtagEntry in DuplicatesList)
  {

    string Etag = EtagEntry.Key;
    int Occurrences = EtagEntry.Value;

    CountOuter++;
    CountInner = 0;

    // Only ETags seen more than once are duplicates.
    if (Occurrences <= 1)
    {
      continue;
    }

    foreach (MacroscopeDocument msDoc in DuplicatesDocList.Values)
    {

      CountInner++;

      this.ProgressForm.UpdatePercentages(
        Title: null,
        Message: null,
        MajorPercentage: -1,
        ProgressLabelMajor: string.Format("Documents Processed: {0}", CountOuter),
        MinorPercentage: ((decimal)100 / TotalEtags) * CountOuter,
        ProgressLabelMinor: Etag,
        SubMinorPercentage: ((decimal)100 / TotalEtagDocs) * CountInner,
        ProgressLabelSubMinor: msDoc.GetUrl()
      );

      if (msDoc.GetEtag() == Etag)
      {

        HttpStatusCode Status = msDoc.GetStatusCode();
        int StatusCode = (int)Status;

        this.InsertAndFormatStatusCodeCell(ws, StatusCode);
        this.InsertAndFormatStatusCodeCell(ws, Status);
        this.InsertAndFormatContentCell(ws, Occurrences);
        this.InsertAndFormatContentCell(ws, msDoc.GetEtag());
        this.InsertAndFormatUrlCell(ws, msDoc);

        ws.NextRecord();

      }

    }

  }

}
/**************************************************************************/

/// <summary>
/// Renders the sitemap error table of <paramref name="DocCollection"/> into the
/// list view, one row per sitemap URL / target URL pair. Existing rows (matched
/// by key) are updated in place; new rows are collected and added in one batch.
/// </summary>
/// <param name="DocCollection">
/// Collection supplying the sitemap error table and the documents used for colouring.
/// </param>
private void RenderListViewSitemapErrors(MacroscopeDocumentCollection DocCollection)
{

  List<Dictionary<string, string>> CompiledTable = DocCollection.GetSitemapErrorsAsTable();
  List<ListViewItem> ListViewItems = new List<ListViewItem>(CompiledTable.Count);

  foreach (Dictionary<string, string> Entry in CompiledTable)
  {

    string SitemapUrl = Entry["sitemap_url"];
    string StatusCode = Entry["status_code"];
    string Robots = Entry["robots"];
    string TargetUrl = Entry["target_url"];

    string PairKey = string.Join("::::::::", SitemapUrl, TargetUrl);

    MacroscopeDocument msDoc = DocCollection.GetDocumentByUrl(Url: SitemapUrl);
    MacroscopeDocument msDocLinked = DocCollection.GetDocumentByUrl(Url: TargetUrl);

    ListViewItem lvItem = null;

    if (this.DisplayListView.Items.ContainsKey(PairKey))
    {

      try
      {
        lvItem = this.DisplayListView.Items[PairKey];
        lvItem.SubItems[0].Text = SitemapUrl;
        lvItem.SubItems[1].Text = StatusCode;
        lvItem.SubItems[2].Text = Robots;
        lvItem.SubItems[3].Text = TargetUrl;
      }
      catch (Exception ex)
      {
        DebugMsg(string.Format("RenderListViewSitemapErrors 1: {0}", ex.Message));
      }

    }
    else
    {

      try
      {
        lvItem = new ListViewItem(PairKey);
        lvItem.UseItemStyleForSubItems = false;
        lvItem.Name = PairKey;
        lvItem.SubItems[0].Text = SitemapUrl;
        lvItem.SubItems.Add(StatusCode);
        lvItem.SubItems.Add(Robots);
        lvItem.SubItems.Add(TargetUrl);
        ListViewItems.Add(lvItem);
      }
      catch (Exception ex)
      {
        DebugMsg(string.Format("RenderListViewSitemapErrors 2: {0}", ex.Message));
      }

    }

    // No item could be obtained (the failure was logged above); nothing to colour.
    if (lvItem == null)
    {
      continue;
    }

    lvItem.ForeColor = Color.Blue;

    // Sitemap URL ------------------------------------------------------//
    // NOTE(review): GetDocumentByUrl is assumed to be able to return null for
    // URLs missing from the collection; such URLs are shown as external (gray).

    if ((msDoc != null) && msDoc.GetIsInternal())
    {
      lvItem.SubItems[0].ForeColor = Color.Green;
    }
    else
    {
      lvItem.SubItems[0].ForeColor = Color.Gray;
    }

    // Robots -----------------------------------------------------------//
    // Left at the item default when the target document is unknown.

    if (msDocLinked != null)
    {
      if (!msDocLinked.GetAllowedByRobots())
      {
        lvItem.SubItems[2].ForeColor = Color.Red;
      }
      else
      {
        lvItem.SubItems[2].ForeColor = Color.Green;
      }
    }

    // Target URL -------------------------------------------------------//
    // The gray branch belongs to this check, not to the lvItem null check.

    if ((msDocLinked != null) && msDocLinked.GetIsInternal())
    {
      lvItem.SubItems[3].ForeColor = Color.Green;
    }
    else
    {
      lvItem.SubItems[3].ForeColor = Color.Gray;
    }

  }

  this.DisplayListView.Items.AddRange(ListViewItems.ToArray());

}