Пример #1
0
        /// <summary>
        /// Resolves the candidate URLs for the invoked service from the configured URL tables.
        /// The regular ("good") URLs are tried first; the quarantined ("bad") URLs are only
        /// used when the good ones yield no result after filtering.
        /// </summary>
        /// <param name="invocation">The invocation whose target type's full name is used as the lookup key.</param>
        /// <returns>The non-empty list produced by <c>filterUrls</c> for the first table that yields a match.</returns>
        /// <exception cref="Exception">
        /// Thrown when the service has no entry in either table, or when filtering leaves no URL in both tables.
        /// </exception>
        private IList <URL> GetUrls(IInvocation invocation)
        {
            var serviceName = invocation.TargetType.FullName;

            // A single TryGetValue per table replaces the ContainsKey + indexer pairs, so a key
            // removed between the check and the read can no longer cause a KeyNotFoundException.
            var hasGoodEntry = Urls.TryGetValue(serviceName, out var goodUrls);
            var hasBadEntry = BadUrls.TryGetValue(serviceName, out var badUrls);

            // Argument check: the service must be configured in at least one table.
            if (!hasGoodEntry && !hasBadEntry)
            {
                throw new Exception($"not find the {invocation.TargetType.FullName}'s urls,please config it ");
            }

            if (hasGoodEntry && goodUrls?.Count > 0)
            {
                var result = filterUrls(invocation, goodUrls);
                if (result?.Count > 0)
                {
                    Logger().LogInformation("from good urls");
                    return result;
                }
            }

            if (hasBadEntry && badUrls?.Count > 0)
            {
                // Fall back to the quarantined entries, unwrapping each one to its URL.
                var result = filterUrls(invocation, badUrls.Select(w => w.Url).ToList());
                if (result?.Count > 0)
                {
                    Logger().LogInformation("from bad urls");
                    return result;
                }
            }

            throw new Exception($"not find the {invocation.AppPoint()}{invocation.TargetType.FullName}{invocation.PointVersion()}'s urls,please config it,config version must <= Url version ");
        }
Пример #2
0
        /// <summary>
        /// Parses a page looking for links and sorts each newly found href into the
        /// external, other (non-web-page) or good URL collection.
        /// </summary>
        /// <param name="page">The page whose text is to be parsed.</param>
        /// <param name="sourceUrl">The source url of the page.</param>
        public void ParseLinks(Page page, string sourceUrl)
        {
            MatchCollection matches = Regex.Matches(page.Text, _LINK_REGEX);

            foreach (Match anchorMatch in matches)
            {
                if (anchorMatch.Value == String.Empty)
                {
                    BadUrls.Add("Blank url value on page " + sourceUrl);
                    continue;
                }

                string foundHref;
                try
                {
                    foundHref = anchorMatch.Value.Replace("href=\"", "");
                    foundHref = foundHref.Substring(0, foundHref.IndexOf("\""));
                }
                catch (Exception exc)
                {
                    Exceptions.Add("Error parsing matched href: " + exc.Message);
                    // The href could not be extracted; skip this match instead of
                    // classifying the half-parsed text as if it were a URL.
                    continue;
                }

                // Already recorded, or just the site root: nothing to do.
                if (GoodUrls.Contains(foundHref) || foundHref == "/")
                {
                    continue;
                }

                if (IsExternalUrl(foundHref))
                {
                    _externalUrls.Add(foundHref);
                }
                else if (!IsAWebPage(foundHref))
                {
                    foundHref = Crawler.FixPath(sourceUrl, foundHref);
                    _otherUrls.Add(foundHref);
                }
                else
                {
                    GoodUrls.Add(foundHref);
                }
            }
        }
Пример #3
0
        /// <summary>
        /// Handles a configuration change: updates the registry address and reconciles the
        /// good and quarantined URL tables of every configured client with the new URL lists.
        /// </summary>
        /// <param name="value">The new option values.</param>
        /// <param name="name">The option name, used only for logging.</param>
        private void OnChanged(ZooyardOption value, string name)
        {
            Logger().LogInformation($"{name} has changed:{ value}");
            Console.WriteLine($"{name} has changed:{ value}");

            this.Address = URL.ValueOf(value.RegisterUrl);

            foreach (var item in value.Clients)
            {
                var list = item.Value.Urls.Select(w => URL.ValueOf(w).AddParameterIfAbsent("interface", item.Value.Service.FullName)).ToList();

                // First drop quarantined URLs that are no longer present in the configuration.
                var hasBadUrls = this.BadUrls.TryGetValue(item.Key, out IList<BadUrl> badUrlList);
                if (hasBadUrls)
                {
                    var staleBadUrls = badUrlList
                        .Where(bad => !list.Any(w => w.ToIdentityString() == bad.Url.ToIdentityString()))
                        .ToList();
                    foreach (var stale in staleBadUrls)
                    {
                        badUrlList.Remove(stale);
                    }
                }

                if (!this.Urls.TryGetValue(item.Key, out IList<URL> urlList))
                {
                    // First time this client is seen: register its whole list.
                    this.Urls.TryAdd(item.Key, list);
                    continue;
                }

                // Remove providers that have been deregistered.
                var staleUrls = urlList
                    .Where(url => !list.Any(w => w.ToIdentityString() == url.ToIdentityString()))
                    .ToList();
                foreach (var stale in staleUrls)
                {
                    urlList.Remove(stale);
                }

                // Discover new providers: add only URLs that are neither already known
                // nor currently quarantined. The quarantine lookup result used to be
                // discarded, which re-added quarantined URLs to the good list.
                foreach (var candidate in list)
                {
                    var isKnown = urlList.Any(w => w.ToIdentityString() == candidate.ToIdentityString());
                    var isQuarantined = hasBadUrls
                        && badUrlList.Any(w => w.Url.ToIdentityString() == candidate.ToIdentityString());
                    if (!isKnown && !isQuarantined)
                    {
                        urlList.Add(candidate);
                    }
                }
            }
        }
Пример #4
0
        /// <summary>
        ///     Parses a page looking for links. Pages whose source url ends with ".xml" are
        ///     scanned with the sitemap pattern; every other page is scanned for href anchors
        ///     that are sorted into the external, other or good URL collection.
        /// </summary>
        /// <param name="inputSite">The site being crawled; passed to <c>IsExternalUrl</c> and <c>Crawler.FixPath</c>.</param>
        /// <param name="page">The page whose text is to be parsed.</param>
        /// <param name="sourceUrl">The source url of the page.</param>
        public void ParseLinks(InputSite inputSite, Page page, string sourceUrl)
        {
            if (sourceUrl.EndsWith(".xml", StringComparison.Ordinal))
            {
                var matches = Regex.Matches(page.Text, _SITEMAP_REGEX);

                foreach (Match anchorMatch in matches)
                {
                    var foundHref = BddJson.NormalizeUrl(anchorMatch.Value);
                    // TODO: extract the location with a regex group instead of stripping the tags.
                    foundHref = foundHref.Replace("<loc>", "");
                    foundHref = foundHref.Replace("</loc>", "");

                    if (!IsBad(foundHref) && !GoodUrls.Contains(foundHref))
                    {
                        GoodUrls.Add(foundHref);
                    }
                }

                return;
            }

            var linkMatches = Regex.Matches(page.Text, _LINK_REGEX);

            foreach (Match anchorMatch in linkMatches)
            {
                if (anchorMatch.Value == string.Empty)
                {
                    BadUrls.Add("Blank url value on page " + sourceUrl);
                    continue;
                }

                string foundHref;
                try
                {
                    foundHref = anchorMatch.Value.Replace("href=\"", "");
                    foundHref = foundHref.Substring(0, foundHref.IndexOf("\""));
                }
                catch (Exception exc)
                {
                    Exceptions.Add("Error parsing matched href: " + exc.Message);
                    // The href could not be extracted; skip this match instead of
                    // normalizing and classifying the half-parsed text.
                    continue;
                }

                foundHref = BddJson.NormalizeUrl(foundHref);

                if (IsBad(foundHref) || GoodUrls.Contains(foundHref))
                {
                    continue;
                }

                if (IsExternalUrl(inputSite, foundHref))
                {
                    ExternalUrls.Add(foundHref);
                }
                else if (!IsAWebPage(foundHref))
                {
                    // NOTE(review): this passes sourceUrl, so the found href is discarded and the
                    // fixed source url is stored instead — confirm whether foundHref was intended.
                    foundHref = Crawler.FixPath(inputSite, sourceUrl);
                    OtherUrls.Add(foundHref);
                }
                else
                {
                    GoodUrls.Add(foundHref);
                }
            }
        }