Пример #1
0
        /// <summary>
        /// Handles the Fetch button click: derives the robots.txt address from the text in
        /// <c>tbox_Url</c>, downloads it, and writes the resolved entries to <c>lbl_Result</c>.
        /// </summary>
        /// <param name="sender">The control that raised the event.</param>
        /// <param name="e">Event data of the routed click event.</param>
        private async void btn_Fetch_Click(object sender, RoutedEventArgs e)
        {
            var url = this.tbox_Url.Text.Trim();

            if (string.IsNullOrEmpty(url))
            {
                EMessageBox.Show("请输入网址");
                return;
            }

            // Ordinal comparison: this is a URL separator check, not linguistic text matching.
            var robotsUrl = url.EndsWith("/", System.StringComparison.Ordinal)
                ? url + "robots.txt"
                : url + "/robots.txt";

            robotsUrl = UrlUtil.FixUrl(robotsUrl);

            if (WebUtil.IsResourceAvailable(robotsUrl) == false)
            {
                this.lbl_Result.Text = "该网站没有爬虫协议";
                return;
            }

            var stream = await WebUtil.GetHtmlStreamAsync(robotsUrl);

            string resultText;
            try
            {
                var result = WebUtil.ResolveRobotsProtocol(stream);

                // Build the whole text once instead of re-assigning the label for every entry.
                var builder = new System.Text.StringBuilder(
                    $"从【{url}】获取到的爬虫协议如下(目录以;分隔)\r\n\n");
                foreach (var item in result)
                {
                    builder.Append(item.ToString());
                }

                resultText = builder.ToString();
            }
            finally
            {
                // Close the stream even when resolving or formatting the entries throws.
                stream.Close();
            }

            // Show the result.
            this.lbl_Result.Text = resultText;
        }
Пример #2
0
        /// <summary>
        /// Normalizes <paramref name="url"/> with <c>UrlUtil.FixUrl</c>, fetches the page source
        /// through <c>WebUtil.GetHtmlSource</c>, and hands it to <paramref name="act"/>.
        /// Any exception raised along the way is reported via <c>ShowStatusText</c>.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <param name="act">Callback that receives the downloaded source; skipped when null.</param>
        public async void SurfingByFCL(string url, Action <string> act)
        {
            try
            {
                // Normalize the address before requesting it.
                var fixedUrl = UrlUtil.FixUrl(url);
                string pageSource = await WebUtil.GetHtmlSource(fixedUrl);

                if (act != null)
                {
                    act(pageSource);
                }
            }
            catch (Exception error)
            {
                // TODO
                ShowStatusText(error.Message);
            }
        }
Пример #3
0
        /// <summary>
        /// Handles the Surfing button click: validates the text in <c>tbox_Url</c>, stores the
        /// fixed-up address in <c>baseUrl</c>, resets state, and starts <c>Surfing</c>.
        /// </summary>
        /// <param name="sender">The control that raised the event.</param>
        /// <param name="e">Event data of the routed click event.</param>
        private void btn_Surfing_Click(object sender, RoutedEventArgs e)
        {
            // Trim first so whitespace-only input is rejected by the empty check below and
            // stray leading/trailing spaces never reach the URL helpers.
            string url = this.tbox_Url.Text.Trim();

            if (string.IsNullOrEmpty(url))
            {
                ShowStatusText("请输入Url");
                return;
            }

            // The format check only applies when URL checking is enabled in the configuration.
            if (globalData.CrawlerConfig.CommonConfig.UrlCheck == true
                && RegexUtil.IsUrl(url) == false)
            {
                ShowStatusText("网址输入有误");
                return;
            }

            baseUrl = UrlUtil.FixUrl(url);

            Reset();
            Surfing(url);
        }
Пример #4
0
        /// <summary>
        /// Handles the Surfing button click: validates the trimmed text of <c>tbox_Url</c>,
        /// clears <c>imageCollection</c>, stores the fixed-up address in <c>BaseUrl</c>, and
        /// starts <c>Surfing</c>.
        /// </summary>
        /// <param name="sender">The control that raised the event.</param>
        /// <param name="e">Event data of the routed click event.</param>
        private void btn_Surfing_Click(object sender, RoutedEventArgs e)
        {
            var inputUrl = this.tbox_Url.Text.Trim();

            if (string.IsNullOrEmpty(inputUrl))
            {
                ShowStatusText("请输入Url");
                return;
            }

            // The format check runs only while the HttpWebRequest box is ticked.
            if (cbx_HttpWebRequest.IsChecked == true && RegexUtil.IsUrl(inputUrl) == false)
            {
                ShowStatusText("网址输入有误");
                return;
            }

            imageCollection.Clear();
            BaseUrl = UrlUtil.FixUrl(inputUrl);

            Surfing(inputUrl);
        }
Пример #5
0
        /// <summary>
        /// Downloads the page at <paramref name="url"/>, passes every non-empty anchor
        /// <c>href</c> found in it to <c>AddToCollection</c>, and then recurses into each of
        /// those links. Exceptions are reported through <c>ShowStatusText</c>.
        /// </summary>
        /// <param name="url">Address of the page whose links are extracted.</param>
        /// <param name="depth">
        /// Current recursion level; the method returns immediately once it exceeds
        /// <c>recursionDepth</c>.
        /// </param>
        private async void GetUrlRecursion(string url, int depth)
        {
            // Pause between two consecutive links, in milliseconds.
            const int delayBetweenLinksMs = 3000;

            if (depth > recursionDepth)
            {
                return;
            }

            try
            {
                // Url Check
                url = UrlUtil.FixUrl(url);

                string html = await WebUtil.GetHtmlSource(url);

                var recursionBaseUrl = UrlUtil.ExtractBaseUrl(url);

                await Task.Run(async () =>
                {
                    HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
                    doc.LoadHtml(html);
                    HtmlAgilityPack.HtmlNodeCollection nodeCollection = doc.DocumentNode.SelectNodes("//a");

                    if (nodeCollection == null)
                    {
                        return;
                    }

                    for (int i = 0; i < nodeCollection.Count; i++)
                    {
                        var hrefAttribute = nodeCollection[i].Attributes["href"];
                        if (hrefAttribute == null)
                        {
                            continue;
                        }

                        string extractUrl = hrefAttribute.Value;
                        if (string.IsNullOrEmpty(extractUrl))
                        {
                            continue;
                        }

                        // Links starting with "/" get the current page's base URL prepended.
                        if (extractUrl.StartsWith("/", System.StringComparison.Ordinal))
                        {
                            extractUrl = recursionBaseUrl + extractUrl;
                        }

                        AddToCollection(new UrlStruct()
                        {
                            Id = (i + 1), Status = "", Title = "", Url = extractUrl
                        }, globalBaseUrl);

                        // Wait without blocking a thread-pool thread for the whole delay.
                        await Task.Delay(delayBetweenLinksMs);

                        // Descend one level. Passing the unchanged depth would make the
                        // recursionDepth guard above unreachable and the crawl unbounded.
                        GetUrlRecursion(extractUrl, depth + 1);
                    }
                });
            }
            catch (Exception ex)
            {
                ShowStatusText(ex.Message);
            }
        }