/**************************************************************************/
/// <summary>
/// Creates an outlink from this document to <paramref name="AbsoluteUrl"/> and
/// appends it to this document's Outlinks collection.
/// </summary>
/// <param name="AbsoluteUrl">Target URL of the new link.</param>
/// <param name="LinkType">Link type recorded on the new link.</param>
/// <param name="Follow">Follow flag recorded on the new link.</param>
/// <returns>
/// The link that was added, or null when
/// MacroscopePreferencesManager.GetCheckExternalLinks() returns false and the
/// document collection's allowed-hosts object reports the URL as not allowed.
/// </returns>
private MacroscopeLink AddSitemapTextOutlink(
    string AbsoluteUrl,
    MacroscopeConstants.InOutLinkType LinkType,
    Boolean Follow
)
{
    Boolean CheckExternal = MacroscopePreferencesManager.GetCheckExternalLinks();

    if (!CheckExternal)
    {
        MacroscopeAllowedHosts PermittedHosts = this.DocCollection.GetAllowedHosts();

        // A missing allowed-hosts object imposes no restriction.
        Boolean Rejected = (PermittedHosts != null)
                           && !PermittedHosts.IsAllowedFromUrl(Url: AbsoluteUrl);

        if (Rejected)
        {
            return null;
        }
    }

    MacroscopeLink CreatedLink = new MacroscopeLink(
        SourceUrl: this.GetUrl(),
        TargetUrl: AbsoluteUrl,
        LinkType: LinkType,
        Follow: Follow
    );

    this.Outlinks.Add(CreatedLink);

    return CreatedLink;
}
/** Pure Text Out Links ***************************************************/
/// <summary>
/// Runs the single-string ProcessPureTextOutlinks overload over every entry of
/// <paramref name="TextDoc"/>, in list order.
/// </summary>
/// <param name="TextDoc">Text fragments to process one at a time.</param>
/// <param name="LinkType">Link type forwarded unchanged for every fragment.</param>
private void ProcessPureTextOutlinks(List <string> TextDoc, MacroscopeConstants.InOutLinkType LinkType)
{
    foreach (string Fragment in TextDoc)
    {
        this.ProcessPureTextOutlinks(
            TextDoc: Fragment,
            LinkType: LinkType
        );
    }
}
/**************************************************************************/
/// <summary>
/// Creates an outlink from this document to <paramref name="AbsoluteUrl"/> and
/// appends it to this document's Outlinks collection, unless one of the two
/// preference checks below vetoes it.
/// </summary>
/// <param name="AbsoluteUrl">Target URL of the new link.</param>
/// <param name="LinkType">Link type recorded on the new link.</param>
/// <param name="Follow">Follow flag recorded on the new link.</param>
/// <returns>
/// The link that was added, or null when either (a)
/// MacroscopePreferencesManager.GetCheckExternalLinks() returns false and the
/// allowed-hosts object reports the URL as not allowed, or (b) the link type is
/// SITEMAPXML and MacroscopePreferencesManager.GetFetchXml() returns false.
/// </returns>
private MacroscopeLink AddSitemapXmlOutlink(
    string AbsoluteUrl,
    MacroscopeConstants.InOutLinkType LinkType,
    Boolean Follow
)
{
    // Host check: only consulted when external links are not being checked.
    Boolean HostPermitted = true;

    if (!MacroscopePreferencesManager.GetCheckExternalLinks())
    {
        MacroscopeAllowedHosts PermittedHosts = this.DocCollection.GetAllowedHosts();

        if ((PermittedHosts != null) && !PermittedHosts.IsAllowedFromUrl(Url: AbsoluteUrl))
        {
            HostPermitted = false;
        }
    }

    // Type check: XML sitemap links additionally depend on the fetch-XML preference.
    Boolean TypePermitted = true;

    if (LinkType == MacroscopeConstants.InOutLinkType.SITEMAPXML)
    {
        if (!MacroscopePreferencesManager.GetFetchXml())
        {
            TypePermitted = false;
        }
    }

    if (!(HostPermitted && TypePermitted))
    {
        return null;
    }

    MacroscopeLink CreatedLink = new MacroscopeLink(
        SourceUrl: this.GetUrl(),
        TargetUrl: AbsoluteUrl,
        LinkType: LinkType,
        Follow: Follow
    );

    this.Outlinks.Add(CreatedLink);

    return CreatedLink;
}
/**************************************************************************/
/// <summary>
/// Builds a link record with a freshly generated GUID. The source and target
/// URLs are stored twice: once in SourceUrl/TargetUrl and once, unchanged, in
/// RawSourceUrl/RawTargetUrl.
/// </summary>
/// <param name="SourceUrl">URL the link originates from.</param>
/// <param name="TargetUrl">URL the link points to.</param>
/// <param name="LinkType">Link type stored on the record.</param>
/// <param name="Follow">Value stored in DoFollow.</param>
public MacroscopeLink(
    string SourceUrl,
    string TargetUrl,
    MacroscopeConstants.InOutLinkType LinkType,
    Boolean Follow
)
{
    // Identity and classification.
    this.LinkGuid = Guid.NewGuid();
    this.LinkType = LinkType;
    this.DoFollow = Follow;

    // Source side.
    this.SourceUrl = SourceUrl;
    this.RawSourceUrl = SourceUrl;

    // Target side.
    this.TargetUrl = TargetUrl;
    this.RawTargetUrl = TargetUrl;
}
/**************************************************************************/
/// <summary>
/// Builds a link record with a freshly generated GUID. The source and target
/// URLs are handed to SetSourceUrl/SetTargetUrl, and the values exactly as
/// passed in are also kept in RawSourceUrl/RawTargetUrl.
/// </summary>
/// <param name="SourceUrl">URL the link originates from.</param>
/// <param name="TargetUrl">URL the link points to.</param>
/// <param name="LinkType">Link type stored on the record.</param>
/// <param name="Follow">Value stored in DoFollow.</param>
public MacroscopeLink(
    string SourceUrl,
    string TargetUrl,
    MacroscopeConstants.InOutLinkType LinkType,
    bool Follow
)
{
    this.LinkGuid = Guid.NewGuid();
    this.LinkType = LinkType;

    // URLs go through the setters rather than being assigned directly.
    this.SetSourceUrl(SourceUrl: SourceUrl);
    this.SetTargetUrl(TargetUrl: TargetUrl);

    this.DoFollow = Follow;

    // Keep the caller-supplied strings alongside whatever the setters stored.
    this.RawSourceUrl = SourceUrl;
    this.RawTargetUrl = TargetUrl;
}
/** -------------------------------------------------------------------- **/
/// <summary>
/// Scans plain text for http/https URLs and registers each one that parses as
/// a URI as an outlink of this document via AddDocumentOutlink.
/// </summary>
/// <param name="TextDoc">Text to scan. Null or empty text is ignored.</param>
/// <param name="LinkType">Link type passed to AddDocumentOutlink for every URL found.</param>
public void ProcessPureTextOutlinks(string TextDoc, MacroscopeConstants.InOutLinkType LinkType)
{
    // Regex.Match throws on null input; there is nothing to find in empty text either.
    if (string.IsNullOrEmpty(TextDoc))
    {
        return;
    }

    // Punctuation that often surrounds a URL in running text. Trimming the whole
    // set in one call removes mixed runs such as ")." or ",." that sequential
    // single-character trims leave behind.
    char[] SurroundingPunctuation = { ',', '.', '(', ')', '"', '\'' };

    Regex UrlRegex = new Regex(
        @"(https?://[^/]+/[^\s]*)",
        RegexOptions.IgnoreCase
    );

    // The for-loop header guarantees NextMatch() runs on every iteration,
    // including those that bail out early with 'continue'.
    for (Match UrlMatch = UrlRegex.Match(TextDoc); UrlMatch.Success; UrlMatch = UrlMatch.NextMatch())
    {
        string UrlProcessing = UrlMatch.Value.Trim().Trim(SurroundingPunctuation);

        if (string.IsNullOrEmpty(UrlProcessing))
        {
            continue;
        }

        string UrlCleaned = null;

        try
        {
            // Constructed only to validate the candidate; the Uri itself is not used.
            new Uri(UrlProcessing);
            UrlCleaned = UrlProcessing;
        }
        catch (Exception ex)
        {
            // Best effort: a candidate that does not parse is logged and skipped.
            this.DebugMsg(string.Format("ProcessPureTextOutlinks: {0}", ex.Message));
        }

        if (UrlCleaned == null)
        {
            continue;
        }

        MacroscopeLink Outlink = this.AddDocumentOutlink(
            AbsoluteUrl: UrlCleaned,
            LinkType: LinkType,
            Follow: true
        );

        // AddDocumentOutlink's result is null-checked before use.
        if (Outlink != null)
        {
            Outlink.SetRawTargetUrl(TargetUrl: UrlCleaned);
        }
    }
}
/** Link Type *************************************************************/
/// <summary>
/// Replaces the link type stored on this link.
/// </summary>
/// <param name="LinkType">New value for this link's LinkType member.</param>
public void SetLinkType(MacroscopeConstants.InOutLinkType LinkType)
{
    this.LinkType = LinkType;
}
/**************************************************************************/
/// <summary>
/// Parses an HTTP "Link" header value and, for every <c>&lt;url&gt;; rel="..."</c>
/// entry found, records the relation on this document and registers the URL as
/// a RELATED outlink.
/// </summary>
/// <param name="HttpLinkHeader">
/// Raw header value, possibly holding several comma-separated entries. Null or
/// empty input is ignored.
/// </param>
private void ProcessHttpLinkHeader(string HttpLinkHeader)
{
    // https://webmasters.googleblog.com/2011/09/pagination-with-relnext-and-relprev.html
    // Link: <http://www.example.com/downloads/white-paper.pdf>; rel="canonical"

    if (string.IsNullOrEmpty(HttpLinkHeader))
    {
        return;
    }

    // Match every entry directly on the full header instead of splitting on
    // commas first: each entry is then processed exactly once, and a comma
    // inside a URL cannot break an entry apart. Whitespace after ';' is optional.
    MatchCollection LinkMatches = Regex.Matches(
        HttpLinkHeader,
        "<([^<>]+)>\\s*;\\s*rel=\"([^\"]+)\""
    );

    foreach (Match LinkMatch in LinkMatches)
    {
        string Url = LinkMatch.Groups[1].Value;
        string Rel = LinkMatch.Groups[2].Value;

        if (string.IsNullOrEmpty(Rel) || string.IsNullOrEmpty(Url))
        {
            continue;
        }

        // Invariant lower-casing keeps the comparison independent of the
        // current culture (e.g. Turkish dotless i).
        switch (Rel.ToLowerInvariant())
        {
            case @"canonical":
                this.SetCanonical(Url: Url);
                break;
            case @"shortlink":
                this.SetLinkShortLink(Url: Url);
                break;
            case @"first":
                this.SetLinkFirst(Url: Url);
                break;
            case @"prev":
                this.SetLinkPrev(Url: Url);
                break;
            case @"next":
                this.SetLinkNext(Url: Url);
                break;
            case @"last":
                this.SetLinkLast(Url: Url);
                break;
            default:
                this.DebugMsgForced(string.Format("Link Rel: {0} :: {1}", Rel, Url));
                break;
        }

        string LinkUrl = Uri.UnescapeDataString(stringToUnescape: Url);

        // Test the unescaped URL here, not the absolute URL, which has not been
        // computed yet at this point.
        if (string.IsNullOrEmpty(LinkUrl))
        {
            continue;
        }

        // NOTE(review): MakeUrlAbsolute's failure behaviour is not visible from
        // here - confirm whether it can throw on malformed input.
        string LinkUrlAbs = MacroscopeHttpUrlUtils.MakeUrlAbsolute(
            BaseHref: this.GetBaseHref(),
            BaseUrl: this.DocUrl,
            Url: LinkUrl
        );

        if (!string.IsNullOrEmpty(LinkUrlAbs))
        {
            this.AddDocumentOutlink(
                AbsoluteUrl: LinkUrlAbs,
                LinkType: MacroscopeConstants.InOutLinkType.RELATED,
                Follow: true
            );
        }
    }
}