/// <summary>
/// Click handler for the fetch button: builds the robots.txt address for the URL typed
/// into <c>tbox_Url</c>, downloads it and shows the parsed entries in <c>lbl_Result</c>.
/// </summary>
/// <param name="sender">The control that raised the event (unused).</param>
/// <param name="e">Routed event data (unused).</param>
private async void btn_Fetch_Click(object sender, RoutedEventArgs e)
{
    var url = this.tbox_Url.Text.Trim();
    if (string.IsNullOrEmpty(url))
    {
        EMessageBox.Show("请输入网址");
        return;
    }

    try
    {
        // Append "robots.txt" to the entered URL, inserting a slash only when needed.
        var robotsUrl = url.EndsWith("/", StringComparison.Ordinal)
            ? url + "robots.txt"
            : url + "/robots.txt";
        robotsUrl = UrlUtil.FixUrl(robotsUrl);

        if (WebUtil.IsResourceAvailable(robotsUrl) == false)
        {
            this.lbl_Result.Text = "该网站没有爬虫协议";
            return;
        }

        // Build the whole text first and assign it once, instead of re-assigning
        // the label's Text for every parsed entry.
        var text = new System.Text.StringBuilder(
            $"从【{url}】获取到的爬虫协议如下(目录以;分隔)\r\n\n");

        var stream = await WebUtil.GetHtmlStreamAsync(robotsUrl);
        try
        {
            foreach (var item in WebUtil.ResolveRobotsProtocol(stream))
            {
                text.Append(item.ToString());
            }
        }
        finally
        {
            // Release the stream even when parsing throws.
            stream?.Close();
        }

        // Show the result.
        this.lbl_Result.Text = text.ToString();
    }
    catch (Exception ex)
    {
        // An exception escaping an async void handler cannot be observed by any caller,
        // so failures (network errors, malformed content) are reported to the user here.
        EMessageBox.Show(ex.Message);
    }
}
/// <summary>
/// Downloads the HTML source of <paramref name="url"/> (after passing it through
/// <c>UrlUtil.FixUrl</c>) and hands the result to <paramref name="act"/>.
/// Any exception is reported through <c>ShowStatusText</c> rather than rethrown.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="act">Callback that receives the downloaded HTML; may be null.</param>
public async void SurfingByFCL(string url, Action<string> act)
{
    try
    {
        // Url check
        string fixedUrl = UrlUtil.FixUrl(url);
        string pageSource = await WebUtil.GetHtmlSource(fixedUrl);

        if (act != null)
        {
            act(pageSource);
        }
    }
    catch (Exception ex)
    {
        // TODO
        ShowStatusText(ex.Message);
    }
}
/// <summary>
/// Click handler for the surfing button: validates the URL typed into <c>tbox_Url</c>,
/// stores its fixed form in <c>baseUrl</c>, resets state and starts surfing.
/// </summary>
/// <param name="sender">The control that raised the event (unused).</param>
/// <param name="e">Routed event data (unused).</param>
private void btn_Surfing_Click(object sender, RoutedEventArgs e)
{
    // Trim first so whitespace-only input is rejected and stray leading/trailing
    // blanks (e.g. from a pasted address) do not fail validation or reach Surfing.
    string url = (this.tbox_Url.Text ?? string.Empty).Trim();
    if (string.IsNullOrEmpty(url))
    {
        ShowStatusText("请输入Url");
        return;
    }

    // Format validation is optional and controlled by the crawler configuration.
    if (globalData.CrawlerConfig.CommonConfig.UrlCheck == true
        && RegexUtil.IsUrl(url) == false)
    {
        ShowStatusText("网址输入有误");
        return;
    }

    baseUrl = UrlUtil.FixUrl(url);
    Reset();
    Surfing(url);
}
/// <summary>
/// Click handler for the surfing button: reads the trimmed URL from <c>tbox_Url</c>,
/// optionally validates it, clears <c>imageCollection</c>, stores the fixed URL in
/// <c>BaseUrl</c> and starts surfing.
/// </summary>
/// <param name="sender">The control that raised the event (unused).</param>
/// <param name="e">Routed event data (unused).</param>
private void btn_Surfing_Click(object sender, RoutedEventArgs e)
{
    string inputUrl = this.tbox_Url.Text.Trim();

    if (inputUrl.Length == 0)
    {
        ShowStatusText("请输入Url");
        return;
    }

    // The format check only runs while the cbx_HttpWebRequest box is ticked.
    bool mustValidate = cbx_HttpWebRequest.IsChecked == true;
    if (mustValidate && RegexUtil.IsUrl(inputUrl) == false)
    {
        ShowStatusText("网址输入有误");
        return;
    }

    imageCollection.Clear();
    BaseUrl = UrlUtil.FixUrl(inputUrl);
    Surfing(inputUrl);
}
/// <summary>
/// Downloads the page at <paramref name="url"/>, adds every anchor href found on it to
/// the collection, and recurses into each extracted link until
/// <paramref name="depth"/> exceeds <c>recursionDepth</c>.
/// Exceptions are reported through <c>ShowStatusText</c>.
/// </summary>
/// <param name="url">Address of the page to scan for links.</param>
/// <param name="depth">Current recursion level; each nested call receives depth + 1.</param>
private async void GetUrlRecursion(string url, int depth)
{
    if (depth > recursionDepth)
    {
        return;
    }

    try
    {
        // Url check
        url = UrlUtil.FixUrl(url);
        string html = await WebUtil.GetHtmlSource(url);
        var recursionBaseUrl = UrlUtil.ExtractBaseUrl(url);

        // Parse and walk the links off the calling context.
        await Task.Run(async () =>
        {
            var doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(html);

            HtmlAgilityPack.HtmlNodeCollection nodeCollection = doc.DocumentNode.SelectNodes("//a");
            if (nodeCollection == null)
            {
                return;
            }

            for (int i = 0; i < nodeCollection.Count; i++)
            {
                var hrefAttribute = nodeCollection[i].Attributes["href"];
                if (hrefAttribute == null)
                {
                    continue;
                }

                string extractUrl = hrefAttribute.Value;
                if (string.IsNullOrEmpty(extractUrl))
                {
                    continue;
                }

                // Root-relative links are resolved against the base of the current page.
                if (extractUrl.StartsWith("/"))
                {
                    extractUrl = recursionBaseUrl + extractUrl;
                }

                AddToCollection(
                    new UrlStruct() { Id = (i + 1), Status = "", Title = "", Url = extractUrl },
                    globalBaseUrl);

                // Pause between requests without blocking a thread-pool thread.
                await Task.Delay(3000);

                // Go one level deeper. Passing the unchanged depth (as before) meant the
                // recursionDepth limit was never reached and the crawl never stopped.
                GetUrlRecursion(extractUrl, depth + 1);
            }
        });
    }
    catch (Exception ex)
    {
        ShowStatusText(ex.Message);
    }
}