/**************************************************************************/
/// <summary>
/// Builds a lookup from each sitemap document's URL to the outlink target URLs
/// found in that sitemap document.
/// </summary>
/// <param name="DocCollection">Collection that is searched for sitemap documents.</param>
/// <returns>
/// A map keyed by sitemap URL; each value is keyed by outlink target URL with the
/// flag initialised to false. If an exception is raised while building, it is
/// reported through DebugMsg and whatever was collected so far is returned.
/// </returns>
private Dictionary<string, Dictionary<string, bool>> BuildSitemapUrlList(MacroscopeDocumentCollection DocCollection)
{
    Dictionary<string, Dictionary<string, bool>> sitemapTargets = new Dictionary<string, Dictionary<string, bool>>();

    try
    {
        MacroscopeDocumentList sitemaps = this.FindSitemaps(DocCollection: DocCollection);

        foreach (MacroscopeDocument sitemapDoc in sitemaps.IterateDocuments())
        {
            string sitemapUrl = sitemapDoc.GetUrl();
            Dictionary<string, bool> targets;

            // Reuse the entry when the same sitemap URL has been seen before.
            if (!sitemapTargets.TryGetValue(sitemapUrl, out targets))
            {
                targets = new Dictionary<string, bool>();
                sitemapTargets.Add(sitemapUrl, targets);
            }

            foreach (MacroscopeLink outlink in sitemapDoc.IterateOutlinks())
            {
                string targetUrl = outlink.GetTargetUrl();

                if (!targets.ContainsKey(targetUrl))
                {
                    targets.Add(targetUrl, false);
                }
            }
        }
    }
    catch (Exception ex)
    {
        this.DebugMsg(string.Format("BuildSitemapUrlList: {0}", ex.Message));
    }

    return sitemapTargets;
}
/** -------------------------------------------------------------------- **/
/// <summary>
/// Runs single-term or multi-word phrase analysis over the lower-cased text and
/// records the document against every keyword that the analysis returns.
/// </summary>
/// <param name="msDoc">Document to associate with each keyword.</param>
/// <param name="Text">Text to analyse; it is lower-cased before being passed on.</param>
/// <param name="Terms">Term dictionary handed through to AnalyzeTerm / AnalyzePhrase.</param>
/// <param name="Words">
/// 1 selects AnalyzeTerm, values above 1 select AnalyzePhrase; for any other value
/// no analysis is run and nothing is recorded.
/// </param>
public void Analyze(
    MacroscopeDocument msDoc,
    string Text,
    Dictionary<string, int> Terms,
    int Words
)
{
    Dictionary<string, int> extracted;

    if (Words == 1)
    {
        extracted = this.AnalyzeTerm(
            Text: Text.ToLower(),
            Terms: Terms
        );
    }
    else if (Words > 1)
    {
        extracted = this.AnalyzePhrase(
            Text: Text.ToLower(),
            Terms: Terms,
            Words: Words
        );
    }
    else
    {
        extracted = null;
    }

    if ((this.DocList == null) || (extracted == null))
    {
        return;
    }

    // Updates to the keyword -> documents map are serialised on the map itself.
    lock (this.DocList)
    {
        foreach (string keyword in extracted.Keys)
        {
            if (!this.DocList.ContainsKey(keyword))
            {
                this.DocList.Add(keyword, new MacroscopeDocumentList());
            }

            this.DocList[keyword].AddDocument(msDoc);
        }
    }
}
/**************************************************************************/
/// <summary>
/// Splits the collection's HTML documents that are not flagged external into
/// those whose URL appears as an outlink of a sitemap document and those whose
/// URL does not.
/// </summary>
/// <param name="DocCollection">Collection to analyse.</param>
/// <returns>
/// A two-element list: index 0 holds the documents not found in any sitemap,
/// index 1 holds the documents found in a sitemap. Each document in the second
/// list carries a note with the URL of the last matching sitemap enumerated.
/// </returns>
public List<MacroscopeDocumentList> AnalyzeInSitemaps(MacroscopeDocumentCollection DocCollection)
{
    Dictionary<string, Dictionary<string, bool>> sitemapTargets = this.BuildSitemapUrlList(DocCollection: DocCollection);

    MacroscopeDocumentList listed = new MacroscopeDocumentList();
    MacroscopeDocumentList unlisted = new MacroscopeDocumentList();

    List<MacroscopeDocumentList> result = new List<MacroscopeDocumentList>(2) { unlisted, listed };

    foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
    {
        string url = msDoc.GetUrl();

        if (msDoc.GetIsExternal() || !msDoc.IsDocumentType(Type: MacroscopeConstants.DocumentType.HTML))
        {
            continue;
        }

        // Every sitemap is checked; when several list the URL, the last one wins.
        string matchedSitemapUrl = null;

        foreach (KeyValuePair<string, Dictionary<string, bool>> sitemapEntry in sitemapTargets)
        {
            if (sitemapEntry.Value.ContainsKey(url))
            {
                matchedSitemapUrl = sitemapEntry.Key;
            }
        }

        // Dictionary keys are never null, so null here means "no sitemap matched".
        if (matchedSitemapUrl == null)
        {
            unlisted.AddDocument(msDoc: msDoc);
            continue;
        }

        listed.AddDocument(msDoc: msDoc);
        listed.AddDocumentNote(msDoc: msDoc, Note: matchedSitemapUrl);
    }

    return result;
}
/**************************************************************************/
/// <summary>
/// Collects every document in the collection whose type is SITEMAPTEXT or SITEMAPXML.
/// </summary>
/// <param name="DocCollection">Collection to scan.</param>
/// <returns>A new document list containing the sitemap documents found (possibly none).</returns>
private MacroscopeDocumentList FindSitemaps(MacroscopeDocumentCollection DocCollection)
{
    MacroscopeDocumentList sitemaps = new MacroscopeDocumentList();

    foreach (MacroscopeDocument candidate in DocCollection.IterateDocuments())
    {
        bool isSitemap = candidate.IsDocumentType(Type: MacroscopeConstants.DocumentType.SITEMAPTEXT);

        if (!isSitemap)
        {
            isSitemap = candidate.IsDocumentType(Type: MacroscopeConstants.DocumentType.SITEMAPXML);
        }

        if (!isSitemap)
        {
            continue;
        }

        sitemaps.AddDocument(msDoc: candidate);
    }

    return sitemaps;
}
/**************************************************************************/
/// <summary>
/// Renders the sitemaps audit: first the documents not found in sitemaps,
/// then the documents that were found in sitemaps.
/// </summary>
/// <param name="DocCollection">Collection supplying both document lists.</param>
private void RenderListViewSitemapsAudit(MacroscopeDocumentCollection DocCollection)
{
    MacroscopeDocumentList missingFromSitemaps = DocCollection.GetDocumentsNotInSitemaps();
    MacroscopeDocumentList presentInSitemaps = DocCollection.GetDocumentsInSitemaps();

    // InOut: false marks rows as "not in sitemap".
    this._RenderListViewSitemapsAudit(
        DocCollection: DocCollection,
        DocumentList: missingFromSitemaps,
        InOut: false
    );

    // InOut: true marks rows as "in sitemap".
    this._RenderListViewSitemapsAudit(
        DocCollection: DocCollection,
        DocumentList: presentInSitemaps,
        InOut: true
    );
}
/**************************************************************************/
/// <summary>
/// Clears the display and renders one row per document in the collection's
/// orphaned document list; when that list is null only the clear takes effect.
/// </summary>
/// <param name="DocCollection">Collection supplying the orphaned document list.</param>
public new void RenderListView(MacroscopeDocumentCollection DocCollection)
{
    List<ListViewItem> pendingItems = new List<ListViewItem>(1);
    MacroscopeDocumentList orphans = DocCollection.GetOrphanedDocumentList();

    this.ClearData();

    if (orphans != null)
    {
        foreach (MacroscopeDocument orphan in orphans.IterateDocuments())
        {
            string orphanUrl = orphan.GetUrl();

            this.RenderListView(
                ListViewItems: pendingItems,
                DocCollection: DocCollection,
                msDoc: orphan,
                Url: orphanUrl
            );
        }
    }

    // Items are collected first and handed to the list view in one batch.
    this.DisplayListView.Items.AddRange(pendingItems.ToArray());
}
/**************************************************************************/
/// <summary>
/// Writes the sitemaps report workbook ("Sitemap XML Errors",
/// "Sitemaps Audit - Missing" and "Sitemaps Audit - Present" worksheets)
/// to the given file.
/// </summary>
/// <param name="JobMaster">Job whose document collection supplies the report data.</param>
/// <param name="OutputFilename">Path of the Excel file to write.</param>
/// <exception cref="MacroscopeSaveExcelFileException">
/// Thrown when saving the workbook raises an <see cref="IOException"/>.
/// </exception>
public void WriteXslx(MacroscopeJobMaster JobMaster, string OutputFilename)
{
    // The workbook is disposed on every exit path, including when saving fails.
    using (XLWorkbook Workbook = new XLWorkbook())
    {
        MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();
        MacroscopeDocumentList DocumentsNotInSitemaps = DocCollection.GetDocumentsNotInSitemaps();
        MacroscopeDocumentList DocumentsInSitemaps = DocCollection.GetDocumentsInSitemaps();

        this.BuildWorksheetSitemapXmlErrors(JobMaster, Workbook, "Sitemap XML Errors");
        this.BuildWorksheetSitemapsAudit(JobMaster, Workbook, "Sitemaps Audit - Missing", DocumentsNotInSitemaps, false);
        this.BuildWorksheetSitemapsAudit(JobMaster, Workbook, "Sitemaps Audit - Present", DocumentsInSitemaps, true);

        try
        {
            Workbook.SaveAs(OutputFilename);
        }
        catch (IOException)
        {
            // Translate the I/O failure into the exception type callers handle.
            throw new MacroscopeSaveExcelFileException(
                string.Format("Cannot write to Excel file at {0}", OutputFilename)
            );
        }
    }
}
/**************************************************************************/
/// <summary>
/// Writes the orphaned pages report as CSV: a header record
/// ("URL", "Status Code", "Status") followed by one record per document in the
/// collection's orphaned document list. Only the header is written when that
/// list is null.
/// </summary>
/// <param name="JobMaster">Job whose document collection supplies the orphaned documents.</param>
/// <param name="ws">CSV writer receiving the records.</param>
private void BuildWorksheetPageOrphanedPages(
    MacroscopeJobMaster JobMaster,
    CsvWriter ws
)
{
    MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();
    MacroscopeDocumentList OrphanedDocumentList = DocCollection.GetOrphanedDocumentList();

    ws.WriteField("URL");
    ws.WriteField("Status Code");
    ws.WriteField("Status");
    ws.NextRecord();

    if (OrphanedDocumentList == null)
    {
        return;
    }

    foreach (MacroscopeDocument msDoc in OrphanedDocumentList.IterateDocuments())
    {
        // The status is emitted twice: once as its numeric value, once as its name.
        var DocStatus = msDoc.GetStatusCode();
        string StatusCode = ((int)DocStatus).ToString();
        string Status = DocStatus.ToString();

        this.InsertAndFormatUrlCell(ws, msDoc);
        this.InsertAndFormatContentCell(ws, StatusCode);
        this.InsertAndFormatContentCell(ws, Status);

        ws.NextRecord();
    }
}
/**************************************************************************/
/// <summary>
/// Adds a worksheet listing, for every term in <paramref name="DicTerms"/>, one
/// row per document returned for that term by the collection's deep keyword
/// analysis lookup. Columns are "Occurrences", "Term" and "URL". Progress is
/// reported through ProgressForm while terms and documents are processed.
/// </summary>
/// <param name="JobMaster">Job whose document collection supplies the per-term document lists.</param>
/// <param name="wb">Workbook that receives the new worksheet.</param>
/// <param name="WorksheetLabel">Name of the worksheet to add.</param>
/// <param name="DicTerms">Terms mapped to their occurrence counts.</param>
private void BuildWorksheetKeywordTerms(
    MacroscopeJobMaster JobMaster,
    XLWorkbook wb,
    string WorksheetLabel,
    Dictionary<string, int> DicTerms
)
{
    var ws = wb.Worksheets.Add(WorksheetLabel);

    // decimal is used so the percentage arithmetic below is not integer division.
    decimal TermTotal = DicTerms.Count;
    decimal TermCount = 0;

    int iRow = 1;
    int iCol = 1;
    int iColMax;

    MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();

    // Header row.
    ws.Cell(iRow, iCol).Value = "Occurrences";
    iCol++;
    ws.Cell(iRow, iCol).Value = "Term";
    iCol++;
    ws.Cell(iRow, iCol).Value = "URL";

    iColMax = iCol;
    iRow++;

    foreach (KeyValuePair<string, int> TermEntry in DicTerms)
    {
        string Term = TermEntry.Key;
        int Occurrences = TermEntry.Value;

        MacroscopeDocumentList DocumentList = DocCollection.GetDeepKeywordAnalysDocumentList(Term);
        decimal DocTotal = (decimal)DocumentList.CountDocuments();
        decimal DocCount = 0;

        TermCount++;

        if (TermTotal > 0)
        {
            // Only the minor bar is updated here; -1 / null is passed for the others.
            this.ProgressForm.UpdatePercentages(
                Title: null,
                Message: null,
                MajorPercentage: -1,
                ProgressLabelMajor: null,
                MinorPercentage: ((decimal)100 / TermTotal) * TermCount,
                ProgressLabelMinor: "Keywords Processed",
                SubMinorPercentage: -1,
                ProgressLabelSubMinor: null
            );
        }

        foreach (MacroscopeDocument msDoc in DocumentList.IterateDocuments())
        {
            DocCount++;

            if (DocTotal > 0)
            {
                // Only the sub-minor bar is updated here.
                this.ProgressForm.UpdatePercentages(
                    Title: null,
                    Message: null,
                    MajorPercentage: -1,
                    ProgressLabelMajor: null,
                    MinorPercentage: -1,
                    ProgressLabelMinor: null,
                    SubMinorPercentage: ((decimal)100 / DocTotal) * DocCount,
                    ProgressLabelSubMinor: "Documents Processed"
                );
            }

            iCol = 1;
            this.InsertAndFormatContentCell(ws, iRow, iCol, this.FormatIfMissing(Occurrences.ToString()));

            iCol++;
            this.InsertAndFormatContentCell(ws, iRow, iCol, this.FormatIfMissing(Term));

            iCol++;
            this.InsertAndFormatUrlCell(ws, iRow, iCol, msDoc.GetUrl());

            iRow++;
        }
    }

    // Turn the header plus all written rows into an Excel table.
    var rangeData = ws.Range(1, 1, iRow - 1, iColMax);
    rangeData.CreateTable();
}
/**************************************************************************/
/// <summary>
/// Finds documents that no other document links to. A valid document counts as
/// linked when some other valid document (one whose URL does not compare equal
/// to it) has an outgoing hyperlink whose target URL or raw target URL compares
/// equal to the document's URL.
/// </summary>
/// <param name="DocCollection">Collection to analyse; every document is compared against every other.</param>
/// <returns>
/// A new list of the orphaned documents. Each orphan receives the ORPHAN1 and
/// ORPHAN2 remarks; those remarks are removed from valid documents that are linked.
/// </returns>
public MacroscopeDocumentList AnalyzeOrphanedDocumentsInCollection(MacroscopeDocumentCollection DocCollection)
{
    MacroscopeDocumentList orphans = new MacroscopeDocumentList();

    foreach (MacroscopeDocument candidate in DocCollection.IterateDocuments())
    {
        string candidateUrl = candidate.GetUrl();

        if (!IsValidDocument(msDoc: candidate))
        {
            continue;
        }

        bool hasInboundLink = false;

        foreach (MacroscopeDocument referrer in DocCollection.IterateDocuments())
        {
            // Links from the document to itself do not count, nor do links from invalid documents.
            bool isSameDocument = MacroscopeHttpUrlUtils.CompareUrls(UrlLeft: candidateUrl, UrlRight: referrer.GetUrl());

            if (isSameDocument || !this.IsValidDocument(msDoc: referrer))
            {
                continue;
            }

            foreach (MacroscopeHyperlinkOut hyperlink in referrer.IterateHyperlinksOut())
            {
                string targetUrl = hyperlink.GetTargetUrl();
                string rawTargetUrl = hyperlink.GetRawTargetUrl();

                hasInboundLink =
                    MacroscopeHttpUrlUtils.CompareUrls(UrlLeft: candidateUrl, UrlRight: targetUrl)
                    || MacroscopeHttpUrlUtils.CompareUrls(UrlLeft: candidateUrl, UrlRight: rawTargetUrl);

                if (hasInboundLink)
                {
                    break;
                }
            }

            // One inbound link is enough; stop scanning further referrers.
            if (hasInboundLink)
            {
                break;
            }
        }

        if (hasInboundLink)
        {
            msDocLeftRemarksCleared: ;
            candidate.RemoveRemark("ORPHAN1");
            candidate.RemoveRemark("ORPHAN2");
        }
        else
        {
            orphans.AddDocument(msDoc: candidate);
            candidate.AddRemark("ORPHAN1", "This appears to be an orphaned page, not linked to from any other HTML page in this collection.");
            candidate.AddRemark("ORPHAN2", "This page appears to only be referenced from one or more sitemaps.");
        }
    }

    return orphans;
}
/**************************************************************************/
/// <summary>
/// Adds a sitemaps audit worksheet with one row per document in
/// <paramref name="DocumentList"/>, skipping documents that are not HTML or are
/// flagged external. Columns are "URL", "In Sitemap", "Status Code",
/// "Is Redirect", "Robots" and "Sitemap".
/// </summary>
/// <param name="JobMaster">Job supplying the allowed hosts used to colour the sitemap URL.</param>
/// <param name="wb">Workbook that receives the new worksheet.</param>
/// <param name="WorksheetLabel">Name of the worksheet to add.</param>
/// <param name="DocumentList">Documents to list; the sitemap URL is read from each document's note in this list.</param>
/// <param name="InOut">Value written to the "In Sitemap" column for every row (green when true, red when false).</param>
private void BuildWorksheetSitemapsAudit(
    MacroscopeJobMaster JobMaster,
    XLWorkbook wb,
    string WorksheetLabel,
    MacroscopeDocumentList DocumentList,
    bool InOut
)
{
    var ws = wb.Worksheets.Add(WorksheetLabel);

    int iRow = 1;
    int iCol = 1;
    int iColMax;

    MacroscopeAllowedHosts AllowedHosts = JobMaster.GetAllowedHosts();

    // Header row.
    ws.Cell(iRow, iCol).Value = "URL";
    iCol++;
    ws.Cell(iRow, iCol).Value = "In Sitemap";
    iCol++;
    ws.Cell(iRow, iCol).Value = "Status Code";
    iCol++;
    ws.Cell(iRow, iCol).Value = "Is Redirect";
    iCol++;
    ws.Cell(iRow, iCol).Value = "Robots";
    iCol++;
    ws.Cell(iRow, iCol).Value = "Sitemap";

    iColMax = iCol;
    iRow++;

    foreach (MacroscopeDocument msDoc in DocumentList.IterateDocuments())
    {
        if (!msDoc.IsDocumentType(Type: MacroscopeConstants.DocumentType.HTML))
        {
            continue;
        }

        if (msDoc.GetIsExternal())
        {
            continue;
        }

        string SitemapUrl = DocumentList.GetDocumentNote(msDoc: msDoc);

        // URL column.
        iCol = 1;
        this.InsertAndFormatUrlCell(ws, iRow, iCol, msDoc);

        if (msDoc.GetIsInternal())
        {
            ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Green);
        }
        else
        {
            ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Gray);
        }

        // "In Sitemap" column.
        iCol++;
        this.InsertAndFormatContentCell(ws, iRow, iCol, InOut.ToString());

        if (InOut)
        {
            ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Green);
        }
        else
        {
            ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Red);
        }

        iCol++;
        this.InsertAndFormatStatusCodeCell(ws, iRow, iCol, msDoc);

        iCol++;
        this.InsertAndFormatRedirectCell(ws, iRow, iCol, msDoc);

        iCol++;
        this.InsertAndFormatRobotsCell(ws, iRow, iCol, msDoc);

        // "Sitemap" column, coloured by whether the sitemap URL is on an allowed host.
        iCol++;
        this.InsertAndFormatUrlCell(ws, iRow, iCol, SitemapUrl);

        if (AllowedHosts.IsInternalUrl(Url: SitemapUrl))
        {
            ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Green);
        }
        else
        {
            ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Gray);
        }

        iRow++;
    }

    // Turn the header plus all written rows into an Excel table.
    var rangeData = ws.Range(1, 1, iRow - 1, iColMax);
    rangeData.CreateTable();
}
/**************************************************************************/
/// <summary>
/// Adds an orphaned pages worksheet with the columns "URL", "Status Code" and
/// "Status" and one row per document in the collection's orphaned document
/// list. The written range is always converted to a table, including when the
/// orphaned document list is null and only the header row exists.
/// </summary>
/// <param name="JobMaster">Job supplying the document collection and allowed hosts.</param>
/// <param name="wb">Workbook that receives the new worksheet.</param>
/// <param name="WorksheetLabel">Name of the worksheet to add.</param>
private void BuildWorksheetPageOrphanedPages(
    MacroscopeJobMaster JobMaster,
    XLWorkbook wb,
    string WorksheetLabel
)
{
    var ws = wb.Worksheets.Add(WorksheetLabel);

    int iRow = 1;
    int iCol = 1;
    int iColMax;

    MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();
    MacroscopeAllowedHosts AllowedHosts = JobMaster.GetAllowedHosts();
    MacroscopeDocumentList OrphanedDocumentList = DocCollection.GetOrphanedDocumentList();

    // Header row.
    ws.Cell(iRow, iCol).Value = "URL";
    iCol++;
    ws.Cell(iRow, iCol).Value = "Status Code";
    iCol++;
    ws.Cell(iRow, iCol).Value = "Status";

    iColMax = iCol;
    iRow++;

    if (OrphanedDocumentList != null)
    {
        foreach (MacroscopeDocument msDoc in OrphanedDocumentList.IterateDocuments())
        {
            string Url = msDoc.GetUrl();

            // The status is emitted twice: once as its numeric value, once as its name.
            string StatusCode = ((int)msDoc.GetStatusCode()).ToString();
            string Status = msDoc.GetStatusCode().ToString();

            iCol = 1;
            this.InsertAndFormatUrlCell(ws, iRow, iCol, msDoc);

            if (AllowedHosts.IsInternalUrl(Url: Url))
            {
                ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Green);
            }
            else
            {
                ws.Cell(iRow, iCol).Style.Font.SetFontColor(XLColor.Gray);
            }

            iCol++;
            this.InsertAndFormatContentCell(ws, iRow, iCol, StatusCode);

            iCol++;
            this.InsertAndFormatContentCell(ws, iRow, iCol, Status);

            iRow++;
        }
    }

    // Created outside the null check so the sheet is a table even with no orphan list,
    // matching the other worksheet builders.
    var rangeData = ws.Range(1, 1, iRow - 1, iColMax);
    rangeData.CreateTable();
}
/** -------------------------------------------------------------------- **/
/// <summary>
/// Renders one list view row per document in <paramref name="DocumentList"/>,
/// skipping documents that are not HTML or are flagged external. A row already
/// present in the list view under the same key is updated in place; otherwise a
/// new row is created and added to the list view in one batch at the end.
/// Failures while building or colouring a row are reported through DebugMsg and
/// do not stop the remaining rows.
/// </summary>
/// <param name="DocCollection">Not used by this method.</param>
/// <param name="DocumentList">Documents to render; the sitemap URL is read from each document's note in this list.</param>
/// <param name="InOut">Value shown in the "in sitemap" column for every row (green when true, red when false).</param>
private void _RenderListViewSitemapsAudit(
    MacroscopeDocumentCollection DocCollection,
    MacroscopeDocumentList DocumentList,
    bool InOut
)
{
    List<ListViewItem> ListViewItems = new List<ListViewItem>(1);

    foreach (MacroscopeDocument msDoc in DocumentList.IterateDocuments())
    {
        ListViewItem lvItem = null;

        if (!msDoc.IsDocumentType(Type: MacroscopeConstants.DocumentType.HTML))
        {
            continue;
        }

        if (msDoc.GetIsExternal())
        {
            continue;
        }

        string Url = msDoc.GetUrl();
        int StatusCode = (int)msDoc.GetStatusCode();
        string Robots = msDoc.GetAllowedByRobotsAsString();
        string SitemapUrl = DocumentList.GetDocumentNote(msDoc: msDoc);

        // NOTE(review): only a single value is joined, so the separator presumably
        // never appears and the key is effectively the URL - confirm.
        string PairKey = string.Join("::::::::", Url);

        if (this.DisplayListView.Items.ContainsKey(PairKey))
        {
            // Row already displayed: refresh its cells.
            try
            {
                lvItem = this.DisplayListView.Items[PairKey];
                lvItem.SubItems[COL_URL].Text = Url;
                lvItem.SubItems[COL_IN_SITEMAP].Text = InOut.ToString();
                lvItem.SubItems[COL_STATUS_CODE].Text = msDoc.GetStatusCode().ToString();
                lvItem.SubItems[COL_IS_REDIRECT].Text = msDoc.GetIsRedirect().ToString();
                lvItem.SubItems[COL_ROBOTS].Text = Robots;
                lvItem.SubItems[COL_SITEMAP].Text = SitemapUrl;
            }
            catch (Exception ex)
            {
                DebugMsg(string.Format("_RenderListViewSitemapsAudit 1: {0}", ex.Message));
            }
        }
        else
        {
            // New row: sub-items are appended in column order after the URL.
            try
            {
                lvItem = new ListViewItem(PairKey);
                lvItem.UseItemStyleForSubItems = false;
                lvItem.Name = PairKey;
                lvItem.SubItems[COL_URL].Text = Url;
                lvItem.SubItems.Add(InOut.ToString());
                lvItem.SubItems.Add(msDoc.GetStatusCode().ToString());
                lvItem.SubItems.Add(msDoc.GetIsRedirect().ToString());
                lvItem.SubItems.Add(Robots);
                lvItem.SubItems.Add(SitemapUrl);
                ListViewItems.Add(lvItem);
            }
            catch (Exception ex)
            {
                DebugMsg(string.Format("_RenderListViewSitemapsAudit 2: {0}", ex.Message));
            }
        }

        // No item means the lookup/creation above failed and was already logged;
        // there is nothing to colour, so move on instead of dereferencing null.
        if (lvItem == null)
        {
            continue;
        }

        try
        {
            lvItem.ForeColor = Color.Blue;

            lvItem.SubItems[COL_URL].ForeColor = msDoc.GetIsInternal() ? Color.Green : Color.Gray;

            lvItem.SubItems[COL_IN_SITEMAP].ForeColor = InOut ? Color.Green : Color.Red;

            // Status colour by range; 410 is singled out and overrides the 4xx/5xx colour.
            Color StatusColor;

            if (StatusCode == 410)
            {
                StatusColor = Color.Purple;
            }
            else if ((StatusCode >= 200) && (StatusCode <= 299))
            {
                StatusColor = Color.Green;
            }
            else if ((StatusCode >= 300) && (StatusCode <= 399))
            {
                StatusColor = Color.Goldenrod;
            }
            else if ((StatusCode >= 400) && (StatusCode <= 599))
            {
                StatusColor = Color.Red;
            }
            else
            {
                StatusColor = Color.Blue;
            }

            lvItem.SubItems[COL_STATUS_CODE].ForeColor = StatusColor;

            lvItem.SubItems[COL_IS_REDIRECT].ForeColor = msDoc.GetIsRedirect() ? Color.Goldenrod : Color.Gray;

            lvItem.SubItems[COL_ROBOTS].ForeColor = msDoc.GetAllowedByRobots() ? Color.Green : Color.Red;

            lvItem.SubItems[COL_SITEMAP].ForeColor = msDoc.GetIsInternal() ? Color.Green : Color.Gray;
        }
        catch (Exception ex)
        {
            DebugMsg(string.Format("_RenderListViewSitemapsAudit 3: {0}", ex.Message));
        }
    }

    this.DisplayListView.Items.AddRange(ListViewItems.ToArray());
}