/// <summary>
/// Reads the html of the specified target and returns the href of every anchor tag
/// that either is a well formed absolute uri or can be resolved to one against Root.
/// </summary>
/// <param name="target">The uri of the page whose anchor tags are read.</param>
/// <returns>
/// The absolute uris that were found. Anchors without an href, anchors whose href is "#",
/// and hrefs that can't be resolved to a well formed absolute uri are skipped.
/// </returns>
public override string[] ExtractTargets(string target)
{
    List<string> results = new List<string>();
    Url url = Url.FromUri(target);
    CQ q = CQ.Create(url.Html);

    // Build the base uri once instead of once per anchor. Root is treated as a
    // directory (a trailing slash is appended when missing) so that plain relative
    // hrefs are still appended to Root the way the previous Path.Combine call did.
    string root = Root;
    Uri rootUri = null;
    if (!string.IsNullOrEmpty(root))
    {
        if (!root.EndsWith("/", StringComparison.Ordinal))
        {
            root += "/";
        }

        if (!Uri.TryCreate(root, UriKind.Absolute, out rootUri))
        {
            rootUri = null;
        }
    }

    q["a"].Each((i, d) =>
    {
        string href = d.Attributes["href"];
        if (href == null || href.Equals("#"))
        {
            return;
        }

        if (!Uri.IsWellFormedUriString(href, UriKind.Absolute))
        {
            // Resolve with System.Uri rather than System.IO.Path.Combine: Path.Combine
            // is a file system api that discards Root when href is rooted ("/about")
            // and uses the platform directory separator, which is '\' on Windows.
            Uri resolved;
            if (rootUri == null || !Uri.TryCreate(rootUri, href, out resolved))
            {
                return;
            }

            href = resolved.AbsoluteUri;
            if (!Uri.IsWellFormedUriString(href, UriKind.Absolute))
            {
                return;
            }
        }

        results.Add(href);
    });

    return results.ToArray();
}
/// <summary>
/// Determines whether an Image entry already references the url of the specified target.
/// </summary>
/// <param name="target">The uri of the page to check.</param>
/// <returns>
/// false if the url for the target has no Id; otherwise true when an Image
/// whose UrlId equals that Id is found.
/// </returns>
public override bool WasProcessed(string target = "")
{
    Url page = Url.FromUri(target);

    // Short-circuits before the Image lookup when the url has no Id.
    bool hasId = page.Id != null;
    return hasId && Image.OneWhere(cols => cols.UrlId == page.Id) != null;
}
/// <summary>
/// Reads the html of the target and, for every img tag that has a src, makes sure
/// an Image entry exists for the image url and then raises OnImageFound.
/// </summary>
/// <param name="target">The uri of the page to scan for img tags.</param>
/// <remarks>
/// Failures while handling a single img tag are logged and do not stop the
/// remaining tags from being processed.
/// </remarks>
public override void ProcessTarget(string target)
{
    Url url = Url.FromUri(target);
    if (url.Id == null)
    {
        url.Save();
    }

    // Base uri used to resolve relative src values; stays null when the target
    // itself is not an absolute uri, in which case the legacy concatenation is used.
    Uri pageUri;
    if (!Uri.TryCreate(target, UriKind.Absolute, out pageUri))
    {
        pageUri = null;
    }

    CQ q = CQ.Create(url.Html);
    q["img"].Each((i, d) =>
    {
        try
        {
            string imgUrl = d.Attributes["src"];
            if (string.IsNullOrWhiteSpace(imgUrl))
            {
                // An img tag without a src has nothing to record; previously this
                // fell through to new Uri(null) and was logged as an error.
                return;
            }

            if (Uri.IsWellFormedUriString(imgUrl, UriKind.Relative))
            {
                // Resolve against the page uri so that root relative ("/img/a.png"),
                // protocol relative ("//cdn/a.png") and dot segment ("../a.png")
                // values produce the correct absolute uri instead of being appended
                // verbatim to the page path.
                Uri resolved;
                if (pageUri != null && Uri.TryCreate(pageUri, imgUrl, out resolved))
                {
                    imgUrl = resolved.AbsoluteUri;
                }
                else
                {
                    imgUrl = $"{url.ProtocolOfProtocolId.Value}://{url.DomainOfDomainId.Value}{url.PathOfPathId.Value}{imgUrl}";
                }
            }

            Url image = Url.FromUri(new Uri(imgUrl), true);
            Image img = Image.OneWhere(c => c.UrlId == image.Id);
            if (img == null)
            {
                Crawler cr = Crawler.OneWhere(c => c.Name == this.Name);
                if (cr == null)
                {
                    // Caught and logged below; gives a clearer entry than the
                    // NullReferenceException this used to surface as.
                    throw new InvalidOperationException($"No Crawler entry was found with the name '{this.Name}'");
                }

                // NOTE(review): RootUrl is assigned but cr is never saved in this method;
                // confirm whether the change is meant to be persisted.
                cr.RootUrl = target;

                img = new Image();
                img.UrlId = image.Id;
                img.Date = DateTime.UtcNow;
                img.CrawlerId = cr.Id;
                img.Save();
            }

            // Copy to a local so a handler removed between the null check and the
            // call can't cause a NullReferenceException.
            var handler = OnImageFound;
            if (handler != null)
            {
                handler(url, imgUrl);
            }
        }
        catch (Exception ex)
        {
            Logging.Log.AddEntry("Error occurred in image crawler: {0}", ex, ex.Message);
        }
    });
}