Example #1
0
 /// <summary>
 /// Decides whether a newly discovered URL may be queued, using <c>filter</c> to reject URLs seen before.
 /// </summary>
 /// <param name="args">Event data; only <c>args.Url</c> is read.</param>
 /// <returns>
 /// <c>true</c> if the URL was not yet in the filter (it is recorded before returning);
 /// <c>false</c> if it was already present, meaning it must not be added to the queue.
 /// </returns>
 private static bool MasterAddUrlEvent(AddUrlEventArgs args)
 {
     bool alreadySeen = filter.Contains(args.Url);
     if (alreadySeen)
     {
         // false means: do not add to the queue.
         return false;
     }

     // Not seen yet: remember it so the next occurrence is rejected.
     filter.Add(args.Url);
     return true;
 }
Example #2
0
        /// <summary>
        /// Callback that decides whether a discovered URL should be queued.
        /// </summary>
        /// <param name="args">
        /// Event data; only <c>args.Url</c> is read.
        /// </param>
        /// <returns>
        /// <c>true</c> when the URL was not yet in <c>filter</c> (it is added and echoed to the console);
        /// <c>false</c> when it was already present, meaning it must not be added to the queue.
        /// </returns>
        private static bool MasterAddUrlEvent(AddUrlEventArgs args)
        {
            if (filter.Contains(args.Url))
            {
                // false means: do not add to the queue.
                return false;
            }

            // First sighting: record it, then print it.
            filter.Add(args.Url);
            Console.WriteLine(args.Url);
            return true;
        }
Example #3
0
        /// <summary>
        /// Handles an "add URL" notification: registers the URL in <c>urlPool</c>, queues it one level
        /// deeper when the pool reports it as not already present, then forwards the notification.
        /// </summary>
        /// <param name="sender">Original event source, passed through to <c>AddUrlEventHandler</c>.</param>
        /// <param name="args">Event data; <c>Url</c> and <c>Depth</c> are read.</param>
        private void OnAddUrlEventHandler(object sender, AddUrlEventArgs args)
        {
            // The result of urlPool.Add is treated as "already existed".
            bool alreadyKnown = urlPool.Add(args.Url);
            if (alreadyKnown == false)
            {
                var next = new UrlInfo(args.Url, args.Depth + 1);
                urlQueue.EnQueue(next);
            }

            // Forward to the outer subscriber, if there is one.
            if (AddUrlEventHandler == null)
            {
                return;
            }

            AddUrlEventHandler(sender, args);
        }
Example #4
0
        /// <summary>
        /// Callback that decides whether a discovered URL should be queued.
        /// </summary>
        /// <param name="args">
        /// Event data; passed to <c>simpleCrawler.CanAddUrl</c>, and <c>args.Url</c> is checked against <c>filter</c>.
        /// </param>
        /// <returns>
        /// <c>false</c> when <c>simpleCrawler.CanAddUrl</c> rejects the URL or the URL is already in
        /// <c>filter</c> (meaning: do not add to the queue); otherwise <c>true</c>, after the URL has been
        /// recorded in the filter and written to the console.
        /// </returns>
        private static bool MasterAddUrlEvent(AddUrlEventArgs args)
        {
            // The crawler-specific check runs first; the filter is only consulted when it accepts the URL.
            bool rejected = !simpleCrawler.CanAddUrl(args) || filter.Contains(args.Url);
            if (rejected)
            {
                // false means: do not add to the queue.
                return false;
            }

            filter.Add(args.Url);
            Console.WriteLine(args.Url);
            return true;
        }
Example #5
0
 /// <summary>
 /// Checks whether a URL is excluded or has been seen before, and records it when it is new.
 /// </summary>
 /// <param name="args">Event data; only <c>args.Url</c> is read.</param>
 /// <returns>
 /// <c>false</c> (do not add to the queue) when the URL contains any non-blank entry of
 /// <c>UrlDebar</c> or is already in <c>filter</c>; otherwise <c>true</c>, after adding it to the filter.
 /// </returns>
 private static bool MasterAddUrlEvent(AddUrlEventArgs args)
 {
     foreach (string excluded in UrlDebar)
     {
         // Blank entries are ignored rather than matched.
         if (string.IsNullOrWhiteSpace(excluded))
         {
             continue;
         }

         if (args.Url.Contains(excluded))
         {
             // false means: do not add to the queue.
             return false;
         }
     }

     if (filter.Contains(args.Url))
     {
         return false;
     }

     filter.Add(args.Url);
     return true;
 }
        /// <summary>
        /// URL pre-check: reports whether the URL may be added.
        /// </summary>
        /// <param name="args">Event data; only <c>args.Url</c> is read.</param>
        /// <returns>
        /// <c>false</c> when the URL's query string carries a non-empty <c>building</c> value that
        /// <c>BloomBuildingIds</c> already contains; <c>true</c> in every other case.
        /// </returns>
        public bool CanAddUrl(AddUrlEventArgs args)
        {
            // TODO (temporary): duplicate detection keyed on the "building" query parameter.
            var queryStr = GetQueryString(args.Url);
            if (string.IsNullOrEmpty(queryStr))
            {
                return true;
            }

            var dic = HttpUtility.ParseQueryString(queryStr);
            var rawBuilding = dic["building"];
            var buildId = rawBuilding != null ? rawBuilding.ToString() : string.Empty;
            if (string.IsNullOrEmpty(buildId))
            {
                return true;
            }

            // A building id that is already known makes the URL a duplicate.
            return !BloomBuildingIds.Contains(buildId);
        }
        /// <summary>
        /// URL pre-check: reports whether the URL may be added.
        /// </summary>
        /// <param name="args">Event data for the candidate URL; not inspected by this implementation.</param>
        /// <returns>Always <c>true</c>.</returns>
        public bool CanAddUrl(AddUrlEventArgs args)
        {
            // No filtering is applied here: every URL is accepted.
            return true;
        }
Example #8
0
        /// <summary>
        /// Decides whether a discovered URL should be queued, rejecting URLs already held in <c>filter</c>.
        /// </summary>
        /// <param name="args">
        /// Event data; only <c>args.Url</c> is read.
        /// </param>
        /// <returns>
        /// <c>true</c> for a URL not yet in the filter (it is added and printed to the console);
        /// <c>false</c> for one already present, meaning it must not be added to the queue.
        /// </returns>
        private static bool MasterAddUrlEvent(AddUrlEventArgs args)
        {
            bool isNew = !filter.Contains(args.Url);
            if (isNew)
            {
                filter.Add(args.Url);
                Console.WriteLine(args.Url);
            }

            // false means: do not add to the queue.
            return isNew;
        }
Example #9
0
        /// <summary>
        /// Decides whether a discovered URL should be queued, and logs each newly accepted URL.
        /// </summary>
        /// <param name="args">
        /// Event data; only <c>args.Url</c> is read.
        /// </param>
        /// <returns>
        /// <c>true</c> for a URL not yet in <c>filter</c>; it is added to the filter, written to the
        /// console and appended as one line to the file at <c>urlFilePath</c>.
        /// <c>false</c> for a URL already present, meaning it must not be added to the queue.
        /// </returns>
        private static bool MasterAddUrlEvent(AddUrlEventArgs args)
        {
            if (filter.Contains(args.Url))
            {
                // false means: do not add to the queue.
                return false;
            }

            // Not in the filter yet, so add it and log it.
            filter.Add(args.Url);
            Console.WriteLine(args.Url);
            File.AppendAllText(urlFilePath, args.Url + "\r\n");
            return true;
        }
Example #10
0
        /// <summary>
        /// Parses the anchor links of a downloaded page and enqueues those that pass every filter.
        /// </summary>
        /// <remarks>
        /// Nothing is parsed once <c>currentUrl.Depth</c> has reached a positive <c>_settings.CrawlDepth</c>.
        /// A link is dropped when its raw href ends with an entry of <c>_escapeLinks</c>; when
        /// <c>_keywords</c> is set and the link text contains none of them; when it is empty, a fragment,
        /// <c>mailto:</c> or <c>javascript:</c>; when it cannot be parsed as a URI; when host locking
        /// rejects it; when <c>IsUrlMatchRegex</c> rejects it; or when <c>AddUrlEvent</c> returns <c>false</c>.
        /// </remarks>
        /// <param name="currentUrl">The page the HTML came from; its <c>Url</c> is the base for relative links.</param>
        /// <param name="html">Raw HTML of the page.</param>
        private void ParseLinks(UrlInfo currentUrl, string html)
        {
            if (_settings.CrawlDepth > 0 && currentUrl.Depth >= _settings.CrawlDepth)
            {
                return;
            }

            // Collect all anchors: href is the key, tag-stripped link text is the value.
            Dictionary<string, string> urls = new Dictionary<string, string>();
            Match match = Regex.Match(html, "(?i)<a .*?href=\"([^\"]+)\"[^>]*>(.*?)</a>");
            while (match.Success)
            {
                urls[match.Groups[1].Value] = Regex.Replace(match.Groups[2].Value, "(?i)<.*?>", "");
                match = match.NextMatch();
            }

            // Base URI of the current page; built once, on the first link that needs it.
            Uri uri = null;

            foreach (var linknode in urls)
            {
                string href     = linknode.Key;
                string linkText = linknode.Value;

                bool canBeAdd = true;
                if (_escapeLinks != null)
                {
                    foreach (var node in _escapeLinks)
                    {
                        if (href.EndsWith(node, StringComparison.OrdinalIgnoreCase))
                        {
                            canBeAdd = false;
                            break;
                        }
                    }
                }

                if (_keywords != null && !_keywords.Any(linkText.Contains))
                {
                    canBeAdd = false;
                }

                if (!canBeAdd)
                {
                    continue;
                }

                // Undo a few common encodings before the URL is resolved.
                string url = href
                             .Replace("%3f", "?")
                             .Replace("%3d", "=")
                             .Replace("%2f", "/")
                             .Replace("&amp;", "&");

                if (String.IsNullOrEmpty(url) ||
                    url.StartsWith("#", StringComparison.Ordinal) ||
                    url.StartsWith("mailto:", StringComparison.OrdinalIgnoreCase) ||
                    url.StartsWith("javascript:", StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                if (uri == null)
                {
                    uri = new Uri(currentUrl.Url);
                }

                // A malformed href must only cost us that one link, not the rest of the page,
                // so parse with TryCreate instead of the throwing constructor.
                Uri thisUri;
                bool parsed = url.StartsWith("http", StringComparison.OrdinalIgnoreCase)
                                  ? Uri.TryCreate(url, UriKind.Absolute, out thisUri)
                                  : Uri.TryCreate(uri, url, out thisUri);
                if (!parsed)
                {
                    continue;
                }

                url = thisUri.AbsoluteUri;

                // With host locking, e.g. new.baidu.com and www.baidu.com count as the same site:
                // hosts are compared after removing their first label.
                if (_settings.LockHost &&
                    StripFirstHostLabel(uri.Host) != StripFirstHostLabel(thisUri.Host))
                {
                    continue;
                }

                if (!IsUrlMatchRegex(url))
                {
                    continue;
                }

                AddUrlEventArgs args = new AddUrlEventArgs {
                    Title = linkText, Depth = currentUrl.Depth + 1, Url = url
                };
                if (AddUrlEvent != null && !AddUrlEvent(args))
                {
                    continue;
                }

                lock (this)
                {
                    _urlQueue.Enqueue(new UrlInfo {
                        Url = url, Depth = currentUrl.Depth + 1
                    });
                }
            }
        }

        /// <summary>
        /// Returns <paramref name="host"/> without its first dot-separated label
        /// ("new.baidu.com" becomes "baidu.com"). A host with no dot is returned unchanged,
        /// so single-label hosts such as "localhost" compare by their full name.
        /// </summary>
        private static string StripFirstHostLabel(string host)
        {
            int firstDot = host.IndexOf('.');
            return firstDot < 0 ? host : host.Substring(firstDot + 1);
        }
Example #11
0
 /// <summary>
 /// Decides whether a discovered URL should be queued by trying to add it to <c>_linkStorage</c>.
 /// </summary>
 /// <param name="args">Event data; only <c>args.Url</c> is read.</param>
 /// <returns>The result of <c>_linkStorage.TryAdd</c> for the URL.</returns>
 private bool MasterAddUrlEvent(AddUrlEventArgs args)
 {
     var wasAdded = _linkStorage.TryAdd(args.Url);
     return wasAdded;
 }
Example #12
0
        /// <summary>
        /// Decides whether a discovered URL should be queued.
        /// </summary>
        /// <param name="args">
        /// Event data; only <c>args.Url</c> is read.
        /// </param>
        /// <returns>
        /// <c>false</c> (do not add to the queue) when the URL contains any entry of
        /// <c>urlFilterKeyWord</c> or is already in <c>filter</c>; otherwise <c>true</c>, after the URL
        /// has been added to the filter and written to the console.
        /// </returns>
        private static bool MasterAddUrlEvent(AddUrlEventArgs args)
        {
            // Keyword-based URL filtering comes first.
            bool blockedByKeyword = urlFilterKeyWord.Any(keyword => args.Url.Contains(keyword));
            if (blockedByKeyword)
            {
                return false;
            }

            if (filter.Contains(args.Url))
            {
                // false means: do not add to the queue.
                return false;
            }

            filter.Add(args.Url);
            Console.WriteLine(args.Url);
            return true;
        }
Example #13
0
        /// <summary>
        /// Worker loop of one crawl thread: takes URLs from <c>UrlQueue.Instance</c>, downloads them,
        /// expands successful "Type" pages into further URLs and reports every result through
        /// <c>DataReceivedEvent</c>.
        /// </summary>
        /// <remarks>
        /// The loop ends when the queue is empty and every entry of <c>threadStatus</c> is <c>true</c>.
        /// </remarks>
        /// <param name="threadIndex">
        /// Boxed <see cref="int"/> index of this worker's slot in <c>threadStatus</c>.
        /// </param>
        private void CrawlProcess(object threadIndex)
        {
            var currentThreadIndex = (int)threadIndex;

            while (true)
            {
                // Use the queue length and the other workers' idle flags to decide whether
                // this thread sleeps or exits.
                if (UrlQueue.Instance.Count == 0)
                {
                    this.threadStatus[currentThreadIndex] = true;
                    if (!this.threadStatus.Any(t => t == false))
                    {
                        break;
                    }

                    Thread.Sleep(2000);
                    continue;
                }

                this.threadStatus[currentThreadIndex] = false;

                // Re-check: the queue may have been emptied since the first check.
                if (UrlQueue.Instance.Count == 0)
                {
                    continue;
                }

                UrlInfo urlInfo = UrlQueue.Instance.DeQueue();
                if (urlInfo == null)
                {
                    continue;
                }

                HttpHandle.HttpResult httpResult = HttpHandle.Get(urlInfo);
                if (httpResult == null)
                {
                    continue;
                }

                string html = httpResult.Body;
                string code = httpResult.Code.ToString();

                if (urlInfo.Type && httpResult.Code == 200)
                {
                    this.ParseLinks(urlInfo, html);

                    // NOTE(review): this pattern appears to contain zero-width characters (inside "www."
                    // and after "#?"), presumably a copy/paste artifact. They are kept byte-for-byte
                    // because removing them would change what the pattern matches.
                    Match mc = Regex.Match(urlInfo.UrlString, @"((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+(:[0-9]+)?|(?:ww‌​w.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w-_]*)?\??(?:[-\+=&;%@.\w_]*)#?‌​(?:[\w]*))?)", RegexOptions.IgnoreCase);

                    // Without a match there is no base URL to extend. Skipping here avoids the
                    // out-of-range Substring that an empty match value would otherwise cause.
                    if (mc.Success && mc.Value.Length > 0)
                    {
                        // The base is the same for every suffix, so compute it once.
                        string baseUrl = mc.Value;
                        if (baseUrl.EndsWith("/", StringComparison.Ordinal))
                        {
                            baseUrl = baseUrl.Substring(0, baseUrl.Length - 1);
                        }

                        for (int i = 0; i < Program.UrlSuffix.Count; i++)
                        {
                            string url1 = baseUrl + Program.UrlSuffix[i];
                            var addUrlEventArgs1 = new AddUrlEventArgs {
                                Title = url1, Depth = urlInfo.Depth + 1, Url = url1
                            };
                            if (this.AddUrlEvent != null && !this.AddUrlEvent(addUrlEventArgs1))
                            {
                                continue;
                            }

                            // NOTE(review): the queued entry uses a fixed Depth of 1 while the event
                            // args above carry urlInfo.Depth + 1 - confirm this is intended.
                            UrlQueue.Instance.EnQueue(new UrlInfo(url1)
                            {
                                Depth = 1, Type = false
                            });
                        }
                    }
                }
                else
                {
                    String regex = @"<title>.+</title>";

                    // Reduce the body to the page title (double quotes removed); from here on
                    // html holds only that title, and that is what gets reported below.
                    String title = Regex.Match(html, regex).ToString();
                    html = Regex.Replace(title, @"[\""]+", "");

                    // A title containing any of these markers is reported as a 404.
                    string[] notFoundMarkers =
                    {
                        "404", "Page Not Found", "未找到", "不存在", "错误", "网站防火墙", "请联系空间提供商"
                    };
                    foreach (string marker in notFoundMarkers)
                    {
                        if (html.IndexOf(marker) > -1)
                        {
                            code = "404";
                            break;
                        }
                    }
                }

                if (this.DataReceivedEvent != null)
                {
                    this.DataReceivedEvent(
                        new DataReceivedEventArgs
                    {
                        Url   = urlInfo.UrlString,
                        Depth = urlInfo.Depth,
                        Html  = html,
                        Code  = code,
                        Type  = urlInfo.Type
                    });
                }
            }
        }
Example #14
0
        /// <summary>
        /// Scans the page text for URL-like strings and enqueues each accepted one a level deeper.
        /// </summary>
        /// <remarks>
        /// Only matches that look like absolute <c>http(s)://</c> URLs become candidates. A candidate is
        /// dropped when it cannot be parsed as a URI or when <c>AddUrlEvent</c> returns <c>false</c>.
        /// </remarks>
        /// <param name="urlInfo">
        /// The page the text came from; its <c>UrlString</c> is the base for relative URLs and its
        /// <c>Depth</c> + 1 is assigned to every queued URL.
        /// </param>
        /// <param name="html">
        /// The downloaded page text.
        /// </param>
        private void ParseLinks(UrlInfo urlInfo, string html)
        {
            // Shape a match must have to be kept as a candidate; hoisted because it never changes.
            const string absoluteUrlPattern = @"^http(s)?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?$";

            var urlDictionary = new Dictionary <string, string>();

            // NOTE(review): this pattern appears to contain zero-width characters (inside "www."
            // and after "#?"), presumably a copy/paste artifact. They are kept byte-for-byte
            // because removing them would change what the pattern matches.
            Match match = Regex.Match(html, @"((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+(:[0-9]+)?|(?:ww‌​w.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w-_]*)?\??(?:[-\+=&;%@.\w_]*)#?‌​(?:[\w]*))?)");

            while (match.Success)
            {
                // The raw match is the key; the tag-stripped match is the value.
                string urlKey   = match.Value;
                string urlValue = Regex.Replace(match.Value, "(?i)<.*?>", string.Empty);
                if (Regex.IsMatch(urlValue, absoluteUrlPattern))
                {
                    urlDictionary[urlKey] = urlValue;
                }

                match = match.NextMatch();
            }

            // Base URI for relative URLs; built at most once, and only if one is actually needed.
            Uri baseUri = null;

            foreach (var item in urlDictionary)
            {
                string href = item.Key;
                string text = item.Value;

                if (string.IsNullOrEmpty(href))
                {
                    continue;
                }

                // Undo a few common encodings before the URL is resolved.
                string url = href.Replace("%3f", "?")
                             .Replace("%3d", "=")
                             .Replace("%2f", "/")
                             .Replace("&amp;", "&");
                if (string.IsNullOrEmpty(url) || url.StartsWith("#", StringComparison.Ordinal) ||
                    url.StartsWith("mailto:", StringComparison.OrdinalIgnoreCase) ||
                    url.StartsWith("javascript:", StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                // An unparsable candidate must only cost us that one URL, not the rest of the page,
                // so parse with TryCreate instead of the throwing constructor.
                Uri currentUri;
                bool parsed;
                if (url.StartsWith("http", StringComparison.OrdinalIgnoreCase))
                {
                    parsed = Uri.TryCreate(url, UriKind.Absolute, out currentUri);
                }
                else
                {
                    if (baseUri == null)
                    {
                        baseUri = new Uri(urlInfo.UrlString);
                    }

                    parsed = Uri.TryCreate(baseUri, url, out currentUri);
                }

                if (!parsed)
                {
                    continue;
                }

                url = currentUri.AbsoluteUri;

                var addUrlEventArgs = new AddUrlEventArgs {
                    Title = text, Depth = urlInfo.Depth + 1, Url = url
                };
                if (this.AddUrlEvent != null && !this.AddUrlEvent(addUrlEventArgs))
                {
                    continue;
                }

                UrlQueue.Instance.EnQueue(new UrlInfo(url)
                {
                    Depth = urlInfo.Depth + 1, Type = true
                });
            }
        }