/**************************************************************************/

/// <summary>
/// Reports whether any outbound hyperlink held in this.Links targets the given URL.
/// </summary>
/// <param name="Url">URL compared, using string equality, against each link's target URL.</param>
/// <returns>true if at least one link's target URL equals Url; otherwise false.</returns>
public bool ContainsHyperlinkOut ( string Url )
{
  // The link list is locked for the whole scan so it cannot change underneath the enumeration.
  lock( this.Links )
  {
    foreach( MacroscopeHyperlinkOut HyperlinkOut in this.Links )
    {
      if( HyperlinkOut.GetTargetUrl() == Url )
      {
        // The first match settles the answer; stop here rather than
        // walking the rest of the list while still holding the lock.
        return( true );
      }
    }
  }

  return( false );
}
/** -------------------------------------------------------------------- **/

/// <summary>
/// Appends to SitemapText the target URL of every outbound hyperlink of msDoc
/// whose path ends in ".pdf", is accepted by the collection's allowed hosts,
/// and is on the same host as msDoc.
/// </summary>
/// <param name="msDoc">Document whose outbound hyperlinks are examined.</param>
/// <param name="SitemapText">List that receives the accepted PDF URLs.</param>
/// <param name="Dedupe">
/// URLs already examined. Every parseable URL is recorded here on first sight,
/// whether or not it ends up in the sitemap, so it is only examined once.
/// </param>
private void GenerateTextSitemapPdfEntries(
  MacroscopeDocument msDoc,
  List<string> SitemapText,
  Dictionary<string, bool> Dedupe
)
{
  foreach (MacroscopeHyperlinkOut HyperlinkOut in msDoc.IterateHyperlinksOut())
  {
    string Url = HyperlinkOut.GetTargetUrl();
    Uri UrlParsed;

    // A null, relative or malformed target must not abort the whole sitemap:
    // skip it instead of letting the Uri constructor throw.
    if (!Uri.TryCreate(Url, UriKind.Absolute, out UrlParsed))
    {
      continue;
    }

    if (Dedupe.ContainsKey(Url))
    {
      continue;
    }

    Dedupe.Add(Url, true);

    // Case-insensitive ordinal match on the file extension; no need to lower-case first.
    if (!UrlParsed.AbsolutePath.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase))
    {
      continue;
    }

    if (!this.DocCollection.GetAllowedHosts().IsAllowedFromUrl(Url: Url))
    {
      continue;
    }

    if (!MacroscopeHttpUrlUtils.VerifySameHost(BaseUrl: msDoc.GetUrl(), Url: Url))
    {
      continue;
    }

    SitemapText.Add(Url);
  }
}
/**************************************************************************/

/// <summary>
/// Builds or refreshes list view rows for those outbound hyperlinks of msDoc
/// whose target URL contains UrlFragment.
/// </summary>
/// <param name="ListViewItems">Receives newly created rows; rows already present in DisplayListView are updated in place and not added here.</param>
/// <param name="msDoc">Document whose outbound hyperlinks are rendered.</param>
/// <param name="Url">URL shown in the source URL column and used, together with the target URL, to derive the row key.</param>
/// <param name="UrlFragment">Text that must occur (case-sensitive, ordinal) in a link's target URL for the link to be listed.</param>
private void RenderListViewSearchTargetUrls(
  List<ListViewItem> ListViewItems,
  MacroscopeDocument msDoc,
  string Url,
  string UrlFragment
)
{
  MacroscopeAllowedHosts AllowedHosts = this.MainForm.GetJobMaster().GetAllowedHosts();
  MacroscopeHyperlinksOut HyperlinksOut = msDoc.GetHyperlinksOut();
  MacroscopeDocumentCollection DocCollection = this.MainForm.GetJobMaster().GetDocCollection();

  foreach (MacroscopeHyperlinkOut HyperlinkOut in HyperlinksOut.IterateLinks())
  {
    string UrlTarget = HyperlinkOut.GetTargetUrl();

    // Filter first: the null guard has to run before UrlTarget is used for
    // anything else, and non-matching links need no digest or status lookup.
    // URLs are not linguistic text, so the match is ordinal.
    if ((UrlTarget == null) || (UrlTarget.IndexOf(UrlFragment, StringComparison.Ordinal) < 0))
    {
      continue;
    }

    // Defaults shown when the target has not been crawled.
    HttpStatusCode StatusCode = HttpStatusCode.NotFound;
    string StatusCodeText = "Not crawled";
    string StatusText = "Not crawled";

    string PairKey = string.Join(":", UrlToDigest(Url: Url), UrlToDigest(Url: UrlTarget));
    string LinkTarget = HyperlinkOut.GetLinkTarget();
    string LinkText = HyperlinkOut.GetAnchorText();
    string LinkTitle = HyperlinkOut.GetTitle();
    string AltText = HyperlinkOut.GetAltText();
    string LinkTextLabel = LinkText;
    string LinkTitleLabel = LinkTitle;
    string AltTextLabel = AltText;
    string DoFollow = "No Follow";

    try
    {
      if (DocCollection.ContainsDocument(Url: UrlTarget))
      {
        StatusCode = DocCollection.GetDocumentByUrl(Url: UrlTarget).GetStatusCode();
        StatusCodeText = ((int)StatusCode).ToString();
        StatusText = StatusCode.ToString();
      }
      else
      {
        this.DebugMsg("Not in DocCollection");
      }
    }
    catch (Exception ex)
    {
      // Best effort: a failed lookup leaves the "Not crawled" defaults in place.
      this.DebugMsg(ex.Message);
    }

    if (HyperlinkOut.GetDoFollow())
    {
      DoFollow = "Follow";
    }

    if (LinkText.Length == 0)
    {
      LinkTextLabel = "MISSING";
    }

    if (LinkTitle.Length == 0)
    {
      LinkTitleLabel = "MISSING";
    }

    if (AltText.Length == 0)
    {
      AltTextLabel = "MISSING";
    }

    ListViewItem lvItem = null;

    if (this.DisplayListView.Items.ContainsKey(PairKey))
    {
      // Row already displayed: refresh its cells in place.
      try
      {
        lvItem = this.DisplayListView.Items[PairKey];
        lvItem.SubItems[ColUrl].Text = Url;
        lvItem.SubItems[ColUrlTarget].Text = UrlTarget;
        lvItem.SubItems[ColStatusCode].Text = StatusCodeText;
        lvItem.SubItems[ColStatus].Text = StatusText;
        lvItem.SubItems[ColDoFollow].Text = DoFollow;
        lvItem.SubItems[ColLinkTarget].Text = LinkTarget;
        lvItem.SubItems[ColLinkAnchorTextLabel].Text = LinkTextLabel;
        lvItem.SubItems[ColLinkTitleLabel].Text = LinkTitleLabel;
        lvItem.SubItems[ColAltTextLabel].Text = AltTextLabel;
      }
      catch (Exception ex)
      {
        this.DebugMsg(string.Format("MacroscopeDisplayLinks 1: {0}", ex.Message));
      }
    }
    else
    {
      // New row: sub-items are appended in column order and the row is
      // handed back to the caller through ListViewItems.
      try
      {
        lvItem = new ListViewItem(PairKey);
        lvItem.UseItemStyleForSubItems = false;
        lvItem.Name = PairKey;
        lvItem.SubItems[ColUrl].Text = Url;
        lvItem.SubItems.Add(UrlTarget);
        lvItem.SubItems.Add(StatusCodeText);
        lvItem.SubItems.Add(StatusText);
        lvItem.SubItems.Add(DoFollow);
        lvItem.SubItems.Add(LinkTarget);
        lvItem.SubItems.Add(LinkTextLabel);
        lvItem.SubItems.Add(LinkTitleLabel);
        lvItem.SubItems.Add(AltTextLabel);
        ListViewItems.Add(lvItem);
      }
      catch (Exception ex)
      {
        this.DebugMsg(string.Format("MacroscopeDisplayLinks 2: {0}", ex.Message));
      }
    }

    if (lvItem == null)
    {
      continue;
    }

    // Colouring: everything starts blue, then individual cells are overridden.
    for (int i = 0; i < lvItem.SubItems.Count; i++)
    {
      lvItem.SubItems[i].ForeColor = Color.Blue;
    }

    // Evaluated once per row; it drives both the source URL and the follow colours.
    bool SourceAllowed = AllowedHosts.IsAllowedFromUrl(Url);

    if (SourceAllowed)
    {
      lvItem.SubItems[ColUrl].ForeColor = Color.Green;
    }
    else
    {
      lvItem.SubItems[ColUrl].ForeColor = Color.Gray;
    }

    if (AllowedHosts.IsAllowedFromUrl(UrlTarget))
    {
      lvItem.SubItems[ColUrlTarget].ForeColor = Color.Green;
    }
    else
    {
      lvItem.SubItems[ColUrlTarget].ForeColor = Color.Gray;
    }

    if (SourceAllowed)
    {
      if (HyperlinkOut.GetDoFollow())
      {
        lvItem.SubItems[ColDoFollow].ForeColor = Color.Green;
      }
      else
      {
        lvItem.SubItems[ColDoFollow].ForeColor = Color.Red;
      }
    }
    else
    {
      lvItem.SubItems[ColDoFollow].ForeColor = Color.Gray;
    }

    if (LinkText.Length == 0)
    {
      lvItem.SubItems[ColLinkAnchorTextLabel].ForeColor = Color.Gray;
    }

    if (LinkTitle.Length == 0)
    {
      lvItem.SubItems[ColLinkTitleLabel].ForeColor = Color.Gray;
    }

    if (AltText.Length == 0)
    {
      lvItem.SubItems[ColAltTextLabel].ForeColor = Color.Gray;
    }

    // A link with no anchor text, no title and no alt text is flagged in red.
    if ((LinkText.Length == 0) && (LinkTitle.Length == 0) && (AltText.Length == 0))
    {
      lvItem.SubItems[ColLinkAnchorTextLabel].ForeColor = Color.Red;
      lvItem.SubItems[ColLinkTitleLabel].ForeColor = Color.Red;
      lvItem.SubItems[ColAltTextLabel].ForeColor = Color.Red;
    }
  }
}
/**************************************************************************/

// TODO: Finish this.
/// <summary>
/// Recursively follows the outbound hyperlinks of ParentDoc into documents
/// that are present in the collection and share its host and port, then
/// records a copy of the current page chain in this.PageChains, keyed by
/// the URL at the end of the chain.
/// </summary>
/// <param name="PageChain">URLs visited on the way to ParentDoc; ParentDoc's URL is pushed on entry and popped on exit.</param>
/// <param name="ParentDoc">Document whose outbound hyperlinks are followed.</param>
private void Descend(
  LinkedList<string> PageChain,
  MacroscopeDocument ParentDoc
)
{
  string ParentUrl = ParentDoc.GetUrl();

  PageChain.AddLast(ParentUrl);

  foreach (MacroscopeHyperlinkOut Link in ParentDoc.IterateHyperlinksOut())
  {
    string TargetUrl = Link.GetTargetUrl();

    // Self-links lead nowhere new.
    if (TargetUrl.Equals(ParentUrl))
    {
      continue;
    }

    if (this.CheckNodeAlreadyVisited(msDoc: ParentDoc, HyperlinkOut: Link))
    {
      continue;
    }

    MacroscopeDocument ChildDoc = this.DocCollection.GetDocumentByUrl(Url: TargetUrl);

    // Only recurse into documents that exist in the collection and live on the same host and port.
    bool IsSameHostChild = (ChildDoc != null)
      && ChildDoc.GetHostAndPort().Equals(ParentDoc.GetHostAndPort());

    if (IsSameHostChild)
    {
      this.Descend(PageChain: PageChain, ParentDoc: ChildDoc);
    }
  }

  // Copy the chain from its start up to and including the first occurrence of this document's URL.
  LinkedList<string> ChainSnapshot = new LinkedList<string>();

  foreach (string ChainUrl in PageChain)
  {
    ChainSnapshot.AddLast(ChainUrl);

    if (ParentUrl.Equals(ChainUrl))
    {
      break;
    }
  }

  string ChainKey = PageChain.Last.Value;

  if (!this.PageChains.ContainsKey(ChainKey))
  {
    this.PageChains.Add(ChainKey, new List<LinkedList<string>>());
  }

  this.PageChains[ChainKey].Add(ChainSnapshot);

  PageChain.RemoveLast();
}
/**************************************************************************/

/// <summary>
/// Finds the valid documents in DocCollection that no other valid document
/// links to, either by target URL or by raw target URL.
/// </summary>
/// <param name="DocCollection">Collection whose documents are checked against one another.</param>
/// <returns>
/// The documents judged to be orphans. Each one also receives the ORPHAN1 and
/// ORPHAN2 remarks; valid documents that turn out to be linked have those remarks removed.
/// </returns>
public MacroscopeDocumentList AnalyzeOrphanedDocumentsInCollection(MacroscopeDocumentCollection DocCollection)
{
  MacroscopeDocumentList Orphans = new MacroscopeDocumentList();

  foreach (MacroscopeDocument Candidate in DocCollection.IterateDocuments())
  {
    bool IsLinkedFromElsewhere = false;
    string CandidateUrl = Candidate.GetUrl();

    // Documents that fail validation are left completely untouched.
    if (!IsValidDocument(msDoc: Candidate))
    {
      continue;
    }

    foreach (MacroscopeDocument Referrer in DocCollection.IterateDocuments())
    {
      // A document linking to itself does not count as a reference.
      if (MacroscopeHttpUrlUtils.CompareUrls(UrlLeft: CandidateUrl, UrlRight: Referrer.GetUrl()))
      {
        continue;
      }

      if (!this.IsValidDocument(msDoc: Referrer))
      {
        continue;
      }

      foreach (MacroscopeHyperlinkOut Link in Referrer.IterateHyperlinksOut())
      {
        string ResolvedTarget = Link.GetTargetUrl();
        string RawTarget = Link.GetRawTargetUrl();

        bool PointsAtCandidate =
          MacroscopeHttpUrlUtils.CompareUrls(UrlLeft: CandidateUrl, UrlRight: ResolvedTarget)
          || MacroscopeHttpUrlUtils.CompareUrls(UrlLeft: CandidateUrl, UrlRight: RawTarget);

        if (PointsAtCandidate)
        {
          IsLinkedFromElsewhere = true;
          break;
        }
      }

      // One inbound link is enough; stop searching the other documents.
      if (IsLinkedFromElsewhere)
      {
        break;
      }
    }

    if (IsLinkedFromElsewhere)
    {
      Candidate.RemoveRemark("ORPHAN1");
      Candidate.RemoveRemark("ORPHAN2");
    }
    else
    {
      Orphans.AddDocument(msDoc: Candidate);
      Candidate.AddRemark("ORPHAN1", "This appears to be an orphaned page, not linked to from any other HTML page in this collection.");
      Candidate.AddRemark("ORPHAN2", "This page appears to only be referenced from one or more sitemaps.");
    }
  }

  return (Orphans);
}
/** -------------------------------------------------------------------- **/

/// <summary>
/// Appends a "url" element to UrlSetNode for every outbound hyperlink of msDoc
/// whose path ends in ".pdf", is accepted by the collection's allowed hosts,
/// and is on the same host as msDoc. Each "url" element gets "loc",
/// "changefreq" ("daily") and "priority" ("1.0") children.
/// </summary>
/// <param name="msDoc">Document whose outbound hyperlinks are examined.</param>
/// <param name="SitemapXml">XML document used to create the new nodes.</param>
/// <param name="UrlSetNode">Element that receives the new "url" elements.</param>
/// <param name="Dedupe">
/// URLs already examined. Every parseable URL is recorded here on first sight,
/// whether or not it ends up in the sitemap, so it is only examined once.
/// </param>
private void GenerateXmlSitemapPdfEntries(
  MacroscopeDocument msDoc,
  XmlDocument SitemapXml,
  XmlElement UrlSetNode,
  Dictionary<string, bool> Dedupe
)
{
  foreach (MacroscopeHyperlinkOut HyperlinkOut in msDoc.IterateHyperlinksOut())
  {
    string Url = HyperlinkOut.GetTargetUrl();
    Uri UrlParsed;

    // A null, relative or malformed target must not abort the whole sitemap:
    // skip it instead of letting the Uri constructor throw.
    if (!Uri.TryCreate(Url, UriKind.Absolute, out UrlParsed))
    {
      continue;
    }

    if (Dedupe.ContainsKey(Url))
    {
      continue;
    }

    Dedupe.Add(Url, true);

    // Case-insensitive ordinal match on the file extension; no need to lower-case first.
    if (!UrlParsed.AbsolutePath.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase))
    {
      continue;
    }

    if (!this.DocCollection.GetAllowedHosts().IsAllowedFromUrl(Url: Url))
    {
      continue;
    }

    if (!MacroscopeHttpUrlUtils.VerifySameHost(BaseUrl: msDoc.GetUrl(), Url: Url))
    {
      continue;
    }

    XmlElement UrlNode = SitemapXml.CreateElement(string.Empty, "url", MacroscopeSitemapGenerator.XmlNamespace);
    UrlSetNode.AppendChild(UrlNode);

    // Child elements of <url>, in output order: element name, text content.
    string[][] ChildEntries = new string[][]
    {
      new string[] { "loc", Url },
      new string[] { "changefreq", "daily" },
      new string[] { "priority", "1.0" }
    };

    foreach (string[] ChildEntry in ChildEntries)
    {
      XmlElement EntryNode = SitemapXml.CreateElement(string.Empty, ChildEntry[0], MacroscopeSitemapGenerator.XmlNamespace);
      XmlText TextNode = SitemapXml.CreateTextNode(ChildEntry[1]);
      UrlNode.AppendChild(EntryNode);
      EntryNode.AppendChild(TextNode);
    }
  }
}
/**************************************************************************/

/// <summary>
/// Adds a worksheet named WorksheetLabel to wb and fills it with one row per
/// outbound hyperlink of every document in the job's collection, then turns
/// the filled range into a table.
/// </summary>
/// <param name="JobMaster">Supplies the document collection and the allowed-hosts rules.</param>
/// <param name="wb">Workbook that receives the new worksheet.</param>
/// <param name="WorksheetLabel">Name of the worksheet to add.</param>
private void BuildWorksheetPageHyperlinks(
  MacroscopeJobMaster JobMaster,
  XLWorkbook wb,
  string WorksheetLabel
)
{
  // Column positions, in the same order as the header captions below.
  const int ColSourceUrl = 1;
  const int ColTargetUrl = 2;
  const int ColFollow = 3;
  const int ColTarget = 4;
  const int ColAnchorText = 5;
  const int ColTitle = 6;
  const int ColAltText = 7;
  const int ColRawTargetUrl = 8;

  string[] HeaderCaptions =
  {
    "Source URL",
    "Target URL",
    "Follow",
    "Target",
    "Anchor Text",
    "Title",
    "Alt Text",
    "Raw Target URL"
  };

  var ws = wb.Worksheets.Add(WorksheetLabel);
  MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();
  MacroscopeAllowedHosts AllowedHosts = JobMaster.GetAllowedHosts();

  int iRow = 1;

  for (int HeaderIndex = 0; HeaderIndex < HeaderCaptions.Length; HeaderIndex++)
  {
    ws.Cell(iRow, HeaderIndex + 1).Value = HeaderCaptions[HeaderIndex];
  }

  int iColMax = HeaderCaptions.Length;

  iRow++;

  foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
  {
    MacroscopeHyperlinksOut OutboundLinks = msDoc.GetHyperlinksOut();

    foreach (MacroscopeHyperlinkOut Link in OutboundLinks.IterateLinks())
    {
      string TargetUrl = Link.GetTargetUrl();
      string LinkTarget = Link.GetLinkTarget();
      string AnchorText = Link.GetAnchorText();
      string Title = Link.GetTitle();
      string AltText = Link.GetAltText();
      string RawTargetUrl = Link.GetRawTargetUrl();

      if (TargetUrl == null)
      {
        TargetUrl = "";
      }

      string FollowLabel = Link.GetDoFollow() ? "Follow" : "No Follow";

      // Source URL: green when internal, gray otherwise.
      this.InsertAndFormatUrlCell(ws, iRow, ColSourceUrl, msDoc);

      if (AllowedHosts.IsInternalUrl(Url: msDoc.GetUrl()))
      {
        ws.Cell(iRow, ColSourceUrl).Style.Font.SetFontColor(XLColor.Green);
      }
      else
      {
        ws.Cell(iRow, ColSourceUrl).Style.Font.SetFontColor(XLColor.Gray);
      }

      // Target URL: green when internal, gray when external; anything else
      // (including an empty URL) is rewritten via FormatIfMissing and shown in red.
      this.InsertAndFormatUrlCell(ws, iRow, ColTargetUrl, TargetUrl);

      bool HasTargetUrl = (TargetUrl.Length > 0);

      if (HasTargetUrl && AllowedHosts.IsInternalUrl(Url: TargetUrl))
      {
        ws.Cell(iRow, ColTargetUrl).Style.Font.SetFontColor(XLColor.Green);
      }
      else if (HasTargetUrl && AllowedHosts.IsExternalUrl(Url: TargetUrl))
      {
        ws.Cell(iRow, ColTargetUrl).Style.Font.SetFontColor(XLColor.Gray);
      }
      else
      {
        this.InsertAndFormatContentCell(ws, iRow, ColTargetUrl, this.FormatIfMissing(TargetUrl));
        ws.Cell(iRow, ColTargetUrl).Style.Font.SetFontColor(XLColor.Red);
      }

      this.InsertAndFormatContentCell(ws, iRow, ColFollow, FollowLabel);
      this.InsertAndFormatContentCell(ws, iRow, ColTarget, LinkTarget);
      this.InsertAndFormatContentCell(ws, iRow, ColAnchorText, this.FormatIfMissing(AnchorText));
      this.InsertAndFormatContentCell(ws, iRow, ColTitle, this.FormatIfMissing(Title));
      this.InsertAndFormatContentCell(ws, iRow, ColAltText, this.FormatIfMissing(AltText));
      this.InsertAndFormatContentCell(ws, iRow, ColRawTargetUrl, RawTargetUrl);

      iRow++;
    }
  }

  // iRow is one past the last written row.
  ws.Range(1, 1, iRow - 1, iColMax).CreateTable();
}
/**************************************************************************/

/// <summary>
/// Builds or refreshes one list view row per outbound hyperlink of msDoc and
/// colours the cells according to host rules, follow state and missing texts.
/// </summary>
/// <param name="ListViewItems">Receives newly created rows; rows already present in DisplayListView are updated in place and not added here.</param>
/// <param name="DocCollection">Not used by this method body.</param>
/// <param name="msDoc">Document whose outbound hyperlinks are rendered.</param>
/// <param name="Url">URL shown in the source URL column and used, together with the target URL, to derive the row key.</param>
protected override void RenderListView(
  List<ListViewItem> ListViewItems,
  MacroscopeDocumentCollection DocCollection,
  MacroscopeDocument msDoc,
  string Url
)
{
  MacroscopeAllowedHosts HostRules = this.MainForm.GetJobMaster().GetAllowedHosts();
  MacroscopeHyperlinksOut OutboundLinks = msDoc.GetHyperlinksOut();

  foreach (MacroscopeHyperlinkOut Link in OutboundLinks.IterateLinks())
  {
    ListViewItem Row = null;

    string TargetUrl = Link.GetTargetUrl();
    string RowKey = string.Join(":", UrlToDigest(Url), UrlToDigest(TargetUrl));
    string LinkTarget = Link.GetLinkTarget();
    string LinkText = Link.GetLinkText();
    string LinkTitle = Link.GetLinkTitle();
    string AltText = Link.GetAltText();
    string RawTargetUrl = Link.GetRawTargetUrl();
    string FollowLabel = Link.GetDoFollow() ? "Follow" : "No Follow";

    bool IsTextMissing = (LinkText.Length == 0);
    bool IsTitleMissing = (LinkTitle.Length == 0);
    bool IsAltMissing = (AltText.Length == 0);

    string LinkTextLabel = IsTextMissing ? "MISSING" : LinkText;
    string LinkTitleLabel = IsTitleMissing ? "MISSING" : LinkTitle;
    string AltTextLabel = IsAltMissing ? "MISSING" : AltText;

    if (!this.DisplayListView.Items.ContainsKey(RowKey))
    {
      // New row: sub-items are appended in column order and the row is
      // handed back to the caller through ListViewItems.
      try
      {
        Row = new ListViewItem(RowKey);
        Row.UseItemStyleForSubItems = false;
        Row.Name = RowKey;
        Row.SubItems[ColUrl].Text = Url;
        Row.SubItems.Add(TargetUrl);
        Row.SubItems.Add(FollowLabel);
        Row.SubItems.Add(LinkTarget);
        Row.SubItems.Add(LinkTextLabel);
        Row.SubItems.Add(LinkTitleLabel);
        Row.SubItems.Add(AltTextLabel);
        Row.SubItems.Add(RawTargetUrl);
        ListViewItems.Add(Row);
      }
      catch (Exception ex)
      {
        this.DebugMsg(string.Format("MacroscopeDisplayLinks 2: {0}", ex.Message));
      }
    }
    else
    {
      // Row already displayed: refresh its cells in place.
      try
      {
        Row = this.DisplayListView.Items[RowKey];
        Row.SubItems[ColUrl].Text = Url;
        Row.SubItems[ColUrlTarget].Text = TargetUrl;
        Row.SubItems[ColDoFollow].Text = FollowLabel;
        Row.SubItems[ColLinkTarget].Text = LinkTarget;
        Row.SubItems[ColLinkTextLabel].Text = LinkTextLabel;
        Row.SubItems[ColLinkTitleLabel].Text = LinkTitleLabel;
        Row.SubItems[ColAltTextLabel].Text = AltTextLabel;
        Row.SubItems[ColRawTargetUrl].Text = RawTargetUrl;
      }
      catch (Exception ex)
      {
        this.DebugMsg(string.Format("MacroscopeDisplayLinks 1: {0}", ex.Message));
      }
    }

    if (Row == null)
    {
      continue;
    }

    // Everything starts blue; individual cells are overridden below.
    foreach (ListViewItem.ListViewSubItem Cell in Row.SubItems)
    {
      Cell.ForeColor = Color.Blue;
    }

    Color SourceColor = HostRules.IsAllowedFromUrl(Url) ? Color.Green : Color.Gray;
    Row.SubItems[ColUrl].ForeColor = SourceColor;

    Color TargetColor = HostRules.IsAllowedFromUrl(TargetUrl) ? Color.Green : Color.Gray;
    Row.SubItems[ColUrlTarget].ForeColor = TargetColor;

    // Follow state is only judged (green/red) when the source URL is allowed.
    Color FollowColor;

    if (!HostRules.IsAllowedFromUrl(Url))
    {
      FollowColor = Color.Gray;
    }
    else
    {
      FollowColor = Link.GetDoFollow() ? Color.Green : Color.Red;
    }

    Row.SubItems[ColDoFollow].ForeColor = FollowColor;

    if (IsTextMissing)
    {
      Row.SubItems[ColLinkTextLabel].ForeColor = Color.Gray;
    }

    if (IsTitleMissing)
    {
      Row.SubItems[ColLinkTitleLabel].ForeColor = Color.Gray;
    }

    if (IsAltMissing)
    {
      Row.SubItems[ColAltTextLabel].ForeColor = Color.Gray;
    }

    // A link with no text, no title and no alt text is flagged in red.
    if (IsTextMissing && IsTitleMissing && IsAltMissing)
    {
      Row.SubItems[ColLinkTextLabel].ForeColor = Color.Red;
      Row.SubItems[ColLinkTitleLabel].ForeColor = Color.Red;
      Row.SubItems[ColAltTextLabel].ForeColor = Color.Red;
    }
  }
}
/**************************************************************************/

/// <summary>
/// Writes a header record followed by one CSV record per outbound hyperlink
/// of every document in the job's collection.
/// </summary>
/// <param name="JobMaster">Supplies the document collection that is exported.</param>
/// <param name="ws">CSV writer that receives the records.</param>
private void BuildWorksheetPageHyperlinks(
  MacroscopeJobMaster JobMaster,
  CsvWriter ws
)
{
  MacroscopeDocumentCollection DocCollection = JobMaster.GetDocCollection();
  MacroscopeAllowedHosts AllowedHosts = JobMaster.GetAllowedHosts();

  string[] HeaderCaptions =
  {
    "Source URL",
    "Target URL",
    "Follow",
    "Target",
    "Anchor Text",
    "Title",
    "Alt Text",
    "Raw Target URL"
  };

  foreach (string Caption in HeaderCaptions)
  {
    ws.WriteField(Caption);
  }

  ws.NextRecord();

  foreach (MacroscopeDocument msDoc in DocCollection.IterateDocuments())
  {
    MacroscopeHyperlinksOut OutboundLinks = msDoc.GetHyperlinksOut();

    foreach (MacroscopeHyperlinkOut Link in OutboundLinks.IterateLinks())
    {
      // A null target URL is exported as an empty field.
      string TargetUrl = Link.GetTargetUrl() ?? "";
      string LinkTarget = Link.GetLinkTarget();
      string AnchorText = Link.GetAnchorText();
      string Title = Link.GetTitle();
      string AltText = Link.GetAltText();
      string RawTargetUrl = Link.GetRawTargetUrl();
      string FollowLabel = Link.GetDoFollow() ? "Follow" : "No Follow";

      // Fields are written in the same order as the header captions.
      this.InsertAndFormatUrlCell(ws, msDoc);
      this.InsertAndFormatUrlCell(ws, TargetUrl);
      this.InsertAndFormatContentCell(ws, FollowLabel);
      this.InsertAndFormatContentCell(ws, LinkTarget);
      this.InsertAndFormatContentCell(ws, this.FormatIfMissing(AnchorText));
      this.InsertAndFormatContentCell(ws, this.FormatIfMissing(Title));
      this.InsertAndFormatContentCell(ws, this.FormatIfMissing(AltText));
      this.InsertAndFormatContentCell(ws, RawTargetUrl);

      ws.NextRecord();
    }
  }
}