/// <summary>
/// Gets the URL collection for the invocation target according to configuration.
/// </summary>
/// <param name="invocation"></param>
/// <returns></returns>
private IList<URL> GetUrls(IInvocation invocation)
{
    // Parameter check: the target type must be registered in either the good or the bad URL map.
    if (!Urls.ContainsKey(invocation.TargetType.FullName)
        && !BadUrls.ContainsKey(invocation.TargetType.FullName))
    {
        throw new Exception($"cannot find the {invocation.TargetType.FullName}'s urls, please config it");
    }

    // Prefer healthy URLs when any pass the filter.
    if (Urls.ContainsKey(invocation.TargetType.FullName)
        && Urls[invocation.TargetType.FullName]?.Count() > 0)
    {
        var result = filterUrls(invocation, Urls[invocation.TargetType.FullName]);
        if (result?.Count > 0)
        {
            Logger().LogInformation("from good urls");
            return result;
        }
    }

    // Fall back to previously isolated (bad) URLs.
    if (BadUrls.ContainsKey(invocation.TargetType.FullName)
        && BadUrls[invocation.TargetType.FullName]?.Count() > 0)
    {
        var result = filterUrls(invocation, BadUrls[invocation.TargetType.FullName].Select(w => w.Url).ToList());
        if (result?.Count > 0)
        {
            Logger().LogInformation("from bad urls");
            return result;
        }
    }

    throw new Exception($"cannot find the {invocation.AppPoint()}{invocation.TargetType.FullName}{invocation.PointVersion()}'s urls, please config it; the config version must be <= the Url version");
}
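The method above is a two-tier failover lookup: healthy URLs are tried first, and previously isolated ones are only consulted when nothing healthy passes the filter. The following is a minimal, self-contained sketch of that pattern; the dictionaries, the service key, and the filter predicate are hypothetical stand-ins for illustration, not the actual Zooyard types.

using System;
using System.Collections.Generic;
using System.Linq;

public static class FailoverLookup
{
    // Minimal sketch: prefer healthy endpoints, fall back to isolated ones.
    public static IList<string> Resolve(
        string service,
        IDictionary<string, IList<string>> goodUrls,
        IDictionary<string, IList<string>> badUrls,
        Func<string, bool> filter)
    {
        if (!goodUrls.ContainsKey(service) && !badUrls.ContainsKey(service))
        {
            throw new InvalidOperationException($"no urls configured for {service}");
        }

        if (goodUrls.TryGetValue(service, out var healthy))
        {
            var result = healthy.Where(filter).ToList();
            if (result.Count > 0) return result; // healthy endpoints win
        }

        if (badUrls.TryGetValue(service, out var isolated))
        {
            var result = isolated.Where(filter).ToList();
            if (result.Count > 0) return result; // degraded fallback
        }

        throw new InvalidOperationException($"no matching urls for {service}");
    }
}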
/// <summary>
/// Parses a page looking for links.
/// </summary>
/// <param name="page">The page whose text is to be parsed.</param>
/// <param name="sourceUrl">The source url of the page.</param>
public void ParseLinks(Page page, string sourceUrl)
{
    MatchCollection matches = Regex.Matches(page.Text, _LINK_REGEX);
    for (int i = 0; i <= matches.Count - 1; i++)
    {
        Match anchorMatch = matches[i];
        if (anchorMatch.Value == String.Empty)
        {
            BadUrls.Add("Blank url value on page " + sourceUrl);
            continue;
        }

        string foundHref = null;
        try
        {
            // Strip the leading href=" and the trailing quote from the match.
            foundHref = anchorMatch.Value.Replace("href=\"", "");
            foundHref = foundHref.Substring(0, foundHref.IndexOf("\""));
        }
        catch (Exception exc)
        {
            Exceptions.Add("Error parsing matched href: " + exc.Message);
            continue; // skip the malformed href instead of classifying a partial value
        }

        if (!GoodUrls.Contains(foundHref) && foundHref != "/")
        {
            if (IsExternalUrl(foundHref))
            {
                _externalUrls.Add(foundHref);
            }
            else if (!IsAWebPage(foundHref))
            {
                foundHref = Crawler.FixPath(sourceUrl, foundHref);
                _otherUrls.Add(foundHref);
            }
            else
            {
                GoodUrls.Add(foundHref);
            }
        }
    }
}
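The Replace/Substring/IndexOf sequence above is the fragile part: an unterminated quote throws, and the catch path has to bail out. A capture group in the pattern extracts the href value in one step with no exception path. This is a minimal sketch with a simplified pattern; the project's actual _LINK_REGEX is not shown in this section, so the regex here is an assumption for illustration only.

using System;
using System.Text.RegularExpressions;

public static class HrefExtractor
{
    // Simplified pattern: the named group "url" captures the quoted href value.
    private static readonly Regex LinkRegex =
        new Regex("href=\"(?<url>[^\"]*)\"", RegexOptions.IgnoreCase);

    public static void Demo()
    {
        const string html = "<a href=\"/about\">About</a> <a href=\"https://example.com\">Ext</a>";
        foreach (Match m in LinkRegex.Matches(html))
        {
            // The group already excludes the quotes, so no Replace/Substring
            // post-processing is needed.
            Console.WriteLine(m.Groups["url"].Value);
        }
    }
}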
private void OnChanged(ZooyardOption value, string name)
{
    Logger().LogInformation($"{name} has changed:{value}");
    Console.WriteLine($"{name} has changed:{value}");

    this.Address = URL.ValueOf(value.RegisterUrl);

    foreach (var item in value.Clients)
    {
        var list = item.Value.Urls
            .Select(w => URL.ValueOf(w).AddParameterIfAbsent("interface", item.Value.Service.FullName))
            .ToList();

        // First drop isolated (bad) URLs that no longer appear in the new configuration.
        if (this.BadUrls.ContainsKey(item.Key))
        {
            var removeUrls = new List<BadUrl>();
            foreach (var badUrl in this.BadUrls[item.Key])
            {
                var exitsUrl = list.FirstOrDefault(w => w.ToIdentityString() == badUrl.Url.ToIdentityString());
                if (exitsUrl == null)
                {
                    removeUrls.Add(badUrl);
                }
            }
            foreach (var url in removeUrls)
            {
                this.BadUrls[item.Key].Remove(url);
            }
        }

        if (this.Urls.ContainsKey(item.Key))
        {
            // Remove providers that have been deregistered.
            var removeUrls = new List<URL>();
            foreach (var url in this.Urls[item.Key])
            {
                var exitsUrl = list.FirstOrDefault(w => w.ToIdentityString() == url.ToIdentityString());
                if (exitsUrl == null)
                {
                    removeUrls.Add(url);
                }
            }
            foreach (var url in removeUrls)
            {
                this.Urls[item.Key].Remove(url);
            }

            // Discover newly registered providers.
            foreach (var i in list)
            {
                URL exitsUrl = null;
                if (this.Urls.TryGetValue(item.Key, out IList<URL> urlList))
                {
                    exitsUrl = urlList.FirstOrDefault(w => w.ToIdentityString() == i.ToIdentityString());
                }

                BadUrl exitsBadUrl = null;
                if (BadUrls.TryGetValue(item.Key, out IList<BadUrl> badUrlList))
                {
                    // Assign the lookup result so known-bad providers are not re-added as healthy.
                    exitsBadUrl = badUrlList.FirstOrDefault(w => w.Url.ToIdentityString() == i.ToIdentityString());
                }

                if (exitsUrl == null && exitsBadUrl == null)
                {
                    this.Urls[item.Key].Add(i);
                }
            }
        }
        else
        {
            this.Urls.TryAdd(item.Key, list);
        }
    }
}
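At its core, OnChanged is diffing the currently known provider set against the incoming snapshot, keyed on each URL's identity string: entries missing from the snapshot are removed, entries new to the snapshot are added. A minimal sketch of that diff, assuming identity reduces to a plain string key (the set types here are hypothetical stand-ins, not the Zooyard ones):

using System.Collections.Generic;
using System.Linq;

public static class ProviderReconciler
{
    // Reconcile a mutable "current" set against an incoming snapshot.
    public static void Reconcile(ISet<string> current, IReadOnlyCollection<string> incoming)
    {
        // Remove providers that no longer appear in the new configuration.
        foreach (var stale in current.Except(incoming).ToList())
        {
            current.Remove(stale);
        }

        // Add providers that are newly registered.
        foreach (var added in incoming.Except(current).ToList())
        {
            current.Add(added);
        }
    }
}

The .ToList() calls snapshot each difference before mutating the set, avoiding modification during enumeration; the real method achieves the same by collecting removeUrls first and removing afterwards.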
/// <summary>
/// Parses a page looking for links.
/// </summary>
/// <param name="inputSite">The site being crawled.</param>
/// <param name="page">The page whose text is to be parsed.</param>
/// <param name="sourceUrl">The source url of the page.</param>
public void ParseLinks(InputSite inputSite, Page page, string sourceUrl)
{
    if (sourceUrl.EndsWith(".xml"))
    {
        // Sitemap page: collect each <loc> entry.
        var matches = Regex.Matches(page.Text, _SITEMAP_REGEX);
        for (var i = 0; i <= matches.Count - 1; i++)
        {
            var anchorMatch = matches[i];
            var foundHref = BddJson.NormalizeUrl(anchorMatch.Value);
            // TODO: extract the value with a regex capture group instead of Replace.
            foundHref = foundHref.Replace("<loc>", "");
            foundHref = foundHref.Replace("</loc>", "");
            if (!IsBad(foundHref) && !GoodUrls.Contains(foundHref))
            {
                GoodUrls.Add(foundHref);
            }
        }
    }
    else
    {
        var matches = Regex.Matches(page.Text, _LINK_REGEX);
        for (var i = 0; i <= matches.Count - 1; i++)
        {
            var anchorMatch = matches[i];
            if (anchorMatch.Value == string.Empty)
            {
                BadUrls.Add("Blank url value on page " + sourceUrl);
                continue;
            }

            string foundHref = null;
            try
            {
                // Strip the leading href=" and the trailing quote from the match.
                foundHref = anchorMatch.Value.Replace("href=\"", "");
                foundHref = foundHref.Substring(0, foundHref.IndexOf("\""));
            }
            catch (Exception exc)
            {
                Exceptions.Add("Error parsing matched href: " + exc.Message);
                continue; // skip the malformed href instead of classifying a partial value
            }

            foundHref = BddJson.NormalizeUrl(foundHref);
            if (!IsBad(foundHref) && !GoodUrls.Contains(foundHref))
            {
                if (IsExternalUrl(inputSite, foundHref))
                {
                    ExternalUrls.Add(foundHref);
                }
                else if (!IsAWebPage(foundHref))
                {
                    // Resolve the found href, not the source url, before recording it.
                    foundHref = Crawler.FixPath(inputSite, foundHref);
                    OtherUrls.Add(foundHref);
                }
                else
                {
                    GoodUrls.Add(foundHref);
                }
            }
        }
    }
}
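Stripping <loc> tags with string Replace works for well-formed sitemaps, but parsing the XML directly is sturdier and honors the standard sitemap namespace. A minimal sketch using System.Xml.Linq; the inline sitemap string is a hypothetical example for illustration:

using System;
using System.Xml.Linq;

public static class SitemapReader
{
    public static void Demo()
    {
        // Hypothetical sitemap snippet.
        const string xml =
            "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">" +
            "<url><loc>https://example.com/</loc></url>" +
            "<url><loc>https://example.com/about</loc></url>" +
            "</urlset>";

        XNamespace ns = "http://www.sitemaps.org/schemas/sitemap/0.9";
        var doc = XDocument.Parse(xml);

        // Descendants(ns + "loc") yields each <loc> element regardless of nesting,
        // so no string Replace of the tags is needed.
        foreach (var loc in doc.Descendants(ns + "loc"))
        {
            Console.WriteLine(loc.Value.Trim());
        }
    }
}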