/// <summary>
/// Renders the About view for a page fetched from <paramref name="url"/>.
/// </summary>
/// <param name="url">Address of the page to download with <c>HtmlWeb</c>.</param>
/// <returns>
/// The About view bound to a <c>CrawlerViewModel</c> carrying the display names
/// declared on <c>ScrapingXpath</c>, the requested URL and, when a URL was
/// supplied, the inner HTML of the downloaded document.
/// </returns>
public IActionResult About(string url)
{
    // Collect the [Display(Name = ...)] value of every ScrapingXpath property.
    // Properties without the attribute are skipped instead of making First() throw.
    var list = typeof(ScrapingXpath)
        .GetProperties()
        .Select(property => property.GetCustomAttributes(typeof(DisplayAttribute)).FirstOrDefault() as DisplayAttribute)
        .Where(display => display != null)
        .Select(display => display.Name)
        .ToList();

    var model = new CrawlerViewModel
    {
        KeyValuePairs = new List<KeyValuePair<string, string>>(),
        MapTags = list,
        Url = url
    };

    // Without a URL there is nothing to download; HtmlWeb.Load would throw.
    // Render the view with the tag list only.
    if (string.IsNullOrWhiteSpace(url))
    {
        return this.View(model);
    }

    var web = new HtmlWeb();
    var doc = web.Load(url);
    model.HtmlDoc = doc.DocumentNode.InnerHtml;

    return this.View(model);
}
/// <summary>
/// Renders the About view before any URL has been submitted.
/// </summary>
/// <returns>
/// The About view bound to a <c>CrawlerViewModel</c> whose <c>MapTags</c> holds
/// the display names declared on the properties of <c>ScrapingXpath</c>.
/// </returns>
public IActionResult About()
{
    // Properties without a [Display] attribute are skipped instead of making First() throw.
    var list = typeof(ScrapingXpath)
        .GetProperties()
        .Select(property => property.GetCustomAttributes(typeof(DisplayAttribute)).FirstOrDefault() as DisplayAttribute)
        .Where(display => display != null)
        .Select(display => display.Name)
        .ToList();

    var model = new CrawlerViewModel { MapTags = list };

    // Pass the populated model to the view; previously a fresh, empty view model
    // was returned and the tag list computed above was discarded.
    return this.View(model);
}
/// <summary>
/// Runs the crawler for the submitted view model and renders the result.
/// </summary>
/// <param name="viewModel">Crawl request; it is passed to the crawler and then to the view.</param>
/// <returns>The default view bound to <paramref name="viewModel"/>.</returns>
public ActionResult Default(CrawlerViewModel viewModel)
{
    try
    {
        crawler.Crawl(viewModel);
    }
    catch (Exception crawlFailure)
    {
        // Report any failure through the view model instead of letting it propagate.
        viewModel.ErrorMsg = crawlFailure.Message;
    }

    var page = View(viewModel);
    return page;
}
/// <summary>
/// Crawls <c>viewModel.UrlToCrawl</c> and appends the collected links to
/// <c>viewModel.CrawledLinks</c>.
/// </summary>
/// <param name="viewModel">
/// Request/response object: supplies the start URL and receives the crawled
/// links plus either <c>ErrorMsg</c> or <c>SuccessMsg</c>.
/// </param>
/// <returns>The same <paramref name="viewModel"/> instance, updated in place.</returns>
public CrawlerViewModel Crawl(CrawlerViewModel viewModel)
{
    if (!Helper.IsValidUrl(viewModel.UrlToCrawl))
    {
        // NOTE(review): this branch rejects an invalid URL, yet the text asks for a
        // "mail adress" - confirm the intended wording before changing the message.
        viewModel.ErrorMsg = " Please enter mail adress";
        return viewModel;
    }

    // Links are accumulated into allLinksOnPage while the crawl runs
    // (presumably by crawler_PageCrawlCompleted - defined outside this block).
    allLinksOnPage = new List<Uri>();
    CrawlConfiguration config = new CrawlerNetConfig().Initalize();
    this.crawler = new PoliteWebCrawler(config);
    crawler.PageCrawlCompleted += crawler_PageCrawlCompleted;

    CrawlResult result = crawler.Crawl(new Uri(viewModel.UrlToCrawl));
    if (result.ErrorOccurred)
    {
        viewModel.ErrorMsg = String.Format("Crawler completed with error: {0}", result.ErrorException.Message);
    }

    // A missing or malformed "IsProd" setting is treated as non-production instead
    // of throwing NullReferenceException / FormatException.
    bool isProd;
    bool.TryParse(ConfigurationManager.AppSettings["IsProd"], out isProd);

    if (isProd)
    {
        viewModel.CrawledLinks.AddRange(allLinksOnPage);
    }
    else
    {
        // Outside production only the first ten links are returned.
        viewModel.CrawledLinks.AddRange(allLinksOnPage.Take(10));
    }

    // Only report success when the crawl finished without an error, so the
    // view model never carries an error and a success message at the same time.
    if (!result.ErrorOccurred)
    {
        viewModel.SuccessMsg = " Successfully Listed !";
    }

    return viewModel;
}
/// <summary>
/// Creates the window, backing it with a new <c>CrawlerViewModel</c>.
/// </summary>
public MainWindow()
{
    var viewModel = new CrawlerViewModel();
    cvm = viewModel;

    // The data context is assigned before InitializeComponent runs.
    DataContext = viewModel;

    InitializeComponent();
}
/// <summary>
/// Downloads each remote image listed in <paramref name="source"/> and uploads it
/// through <c>Upload</c>, reporting a per-item outcome.
/// </summary>
/// <param name="repository">Passed through to <c>Upload</c>.</param>
/// <param name="siteName">Passed through to <c>Upload</c>.</param>
/// <param name="folder">Passed through to <c>Upload</c>.</param>
/// <param name="source">URLs of the images to fetch.</param>
/// <param name="catcherPathFormat">Format handed to <c>PathFormatter.Format</c> to build the stored file name.</param>
/// <returns>
/// A <c>CrawlerViewModel</c> whose <c>state</c> is "SUCCESS" (or an error text when
/// no sources were supplied) and whose <c>list</c> holds one entry per source URL.
/// </returns>
public CrawlerViewModel Crawler(
    Repository repository,
    string siteName,
    MediaFolder folder,
    string[] source,
    string catcherPathFormat)
{
    var result = new CrawlerViewModel { state = "SUCCESS" };
    if (source == null || source.Length == 0)
    {
        result.state = "参数错误:没有指定抓取源";
        return result;
    }

    // Materialise the results now: a deferred Select would repeat every download
    // and upload each time the list is enumerated.
    result.list = source
        .Select(item => CrawlSingleSource(repository, siteName, folder, item, catcherPathFormat))
        .ToList();

    return result;
}

/// <summary>
/// Fetches one image URL and uploads it; never throws - failures are reported in <c>state</c>.
/// </summary>
private CrawlerItemViewModel CrawlSingleSource(
    Repository repository,
    string siteName,
    MediaFolder folder,
    string item,
    string catcherPathFormat)
{
    // Record the source up front so rejected items can still be correlated by the caller.
    var crawler = new CrawlerItemViewModel { source = item };

    if (!IsExternalIPAddress(item))
    {
        crawler.state = "INVALID_URL";
        return crawler;
    }

    try
    {
        var request = WebRequest.Create(item) as HttpWebRequest;
        if (request == null)
        {
            // Not an http/https address, so there is no HTTP response to inspect.
            crawler.state = "INVALID_URL";
            return crawler;
        }

        using (var response = (HttpWebResponse)request.GetResponse())
        {
            if (response.StatusCode != HttpStatusCode.OK)
            {
                crawler.state = "Url returns " + response.StatusCode + ", " + response.StatusDescription;
                return crawler;
            }

            // Media types are case-insensitive, so compare ordinally and ignore case.
            var contentType = response.ContentType ?? string.Empty;
            if (contentType.IndexOf("image", StringComparison.OrdinalIgnoreCase) < 0)
            {
                crawler.state = "Url is not an image";
                return crawler;
            }

            var name = Path.GetFileName(item);
            if (!Path.HasExtension(item))
            {
                name += ".jpg";
            }

            var fileName = PathFormatter.Format(name, catcherPathFormat);

            using (var stream = response.GetResponseStream())
            using (var ms = new MemoryStream())
            {
                stream.CopyTo(ms);

                // NOTE(review): as before, ms is handed over positioned at its end;
                // presumably Upload rewinds or reads the whole buffer - confirm.
                var res = Upload(repository, siteName, folder, fileName, ms);
                crawler.url = res.url;
            }

            crawler.state = "SUCCESS";
        }
    }
    catch (WebException e)
    {
        // GetResponse throws for protocol errors such as 404; report them in the
        // same format as the status check above when a response is available.
        using (var errorResponse = e.Response as HttpWebResponse)
        {
            crawler.state = errorResponse != null
                ? "Url returns " + errorResponse.StatusCode + ", " + errorResponse.StatusDescription
                : "抓取错误:" + e.Message;
        }
    }
    catch (Exception e)
    {
        crawler.state = "抓取错误:" + e.Message;
    }

    return crawler;
}