/// <summary>
/// Downloads the root page, extracts and filters its links, and attaches every
/// link that has a non-empty Href to <c>SiteNode</c>.
/// </summary>
/// <returns>
/// The filtered links of the root page, or <c>null</c> when
/// <c>CrawlerParser.GetPageLinks</c> returns <c>null</c>.
/// </returns>
private async Task<IEnumerable<Link>> GetRootPageLinksAsync()
{
    var html = await GetRootPageHtmlAsync();

    IEnumerable<Link> links = CrawlerParser.GetPageLinks(this.RootUri, html);
    if (links == null)
    {
        return null;
    }

    links = FilterLinks(links);

    foreach (var link in links)
    {
        try
        {
            if (link != null && !string.IsNullOrEmpty(link.Href))
            {
                SiteNode.AddChild(link);
            }
        }
        catch (Exception ex)
        {
            // Best effort: one bad link must not abort the remaining links,
            // but the failure is reported instead of being silently discarded.
            System.Diagnostics.Debug.WriteLine("GetRootPageLinksAsync: failed to add link to the site tree: {0}", ex);
        }
    }

    return links;
}
private static readonly string[] staticFileExtensions = { ".css", ".ico", ".pdf", ".json", ".xml", ".jpg", ".jpeg", ".bmp", ".gif", ".png", ".js" }; // etc

// Pattern for href attributes, built once and reused by every call.
// The first alternative captures a double-quoted attribute value.
// NOTE(review): inside this verbatim string "\\S+" reaches the regex engine as an
// escaped backslash followed by "S+", so the second alternative only matches a
// literal backslash followed by S characters; unquoted and single-quoted href values
// are therefore never matched. The pattern is intentionally left unchanged because
// ParseTextFromQuote's handling of unquoted input is not visible here - confirm
// before widening it.
private static readonly Regex hrefAttributeRegex = new Regex(
    @"href\s*=\s*(?:\""(?<1>[^\""]*)\""|(?<1>\\S+))",
    RegexOptions.IgnoreCase | RegexOptions.Multiline);

/// <summary>
/// Get a list of Link objects from html.
/// </summary>
/// <param name="rootUrl">Root url, forwarded to <c>ParseLink</c> for every match.</param>
/// <param name="html">Page Html</param>
/// <returns>
/// <c>null</c> when <paramref name="html"/> is null or empty; otherwise the parsed
/// links, one per distinct Href, in order of first appearance.
/// </returns>
public static IEnumerable<Link> GetPageLinks(string rootUrl, string html)
{
    if (string.IsNullOrEmpty(html))
    {
        // Callers test the result for null, so the null contract is kept.
        return null;
    }

    var list = new List<Link>();

    // Ordinal set of the Hrefs already collected. Unlike a linear scan calling
    // Href.Equals on every stored link, this tolerates a null Href.
    var seenHrefs = new HashSet<string>();

    foreach (Match match in hrefAttributeRegex.Matches(html))
    {
        var link = CrawlerParser.ParseLink(rootUrl, match.Value);

        // Only add if Link(:href) does not exist in the list
        if (seenHrefs.Add(link.Href))
        {
            list.Add(link);
        }
    }

    return list;
}
/// <summary>
/// Builds a <c>Link</c> from one raw href match produced by <c>GetPageLinks</c>.
/// </summary>
/// <param name="rootUri">Root uri, passed to <c>Utilities.GetFileNameFromUrl</c>.</param>
/// <param name="raw">Raw text of the regex match.</param>
/// <returns>A new <c>Link</c> describing the parsed hyperlink.</returns>
public static Link ParseLink(string rootUri, string raw)
{
    // Extract the url from the raw match text.
    string target = CrawlerParser.ParseTextFromQuote(raw);

    // Classify the url.
    bool belongsToWipro = CrawlerParser.IsWipro(target);
    bool relative = CrawlerParser.IsRelativeUrl(target);
    bool scriptOrHash = CrawlerParser.IsJavaScriptOrHashLink(raw, target);
    bool staticResource = CrawlerParser.IsStatic(target);

    // A file name is resolved only for Wipro links that are neither
    // javascript/hash links nor static files.
    string resolvedFileName = null;
    string fileExtension = string.Empty;
    bool skipFileName = !belongsToWipro || scriptOrHash || staticResource;
    if (!skipFileName)
    {
        resolvedFileName = Utilities.GetFileNameFromUrl(rootUri, target, out fileExtension);
    }

    // The href type is derived from the extension (empty when no file name was resolved).
    var hrefType = Utilities.GetHrefType(fileExtension);

    return new Link
    {
        Raw = raw,
        Href = target,
        IsRelativeUrl = relative,
        IsWipro = belongsToWipro,
        IsJavaScript = scriptOrHash,
        IsStatic = staticResource,
        Type = hrefType,
        FileName = resolvedFileName
    };
}
/// <summary>
/// Collects the image tags found in html as <c>Link</c> objects flagged as images.
/// </summary>
/// <param name="html">Page html; may be null or empty.</param>
/// <returns>
/// One <c>Link</c> per matched image tag (duplicates are kept); an empty list when
/// <paramref name="html"/> is null or empty or contains no match.
/// </returns>
public IEnumerable<Link> GetPageImages(string html)
{
    var list = new List<Link>();

    // Regex methods throw ArgumentNullException on null input, so bail out early.
    if (string.IsNullOrEmpty(html))
    {
        return list;
    }

    // regex: searches image tags; group 1 is the quoted src value
    var regex = new Regex(@"<img.+?src=[""'](.+?)[""'].*?>", RegexOptions.IgnoreCase | RegexOptions.Multiline);

    foreach (Match match in regex.Matches(html))
    {
        var source = match.Groups[1].Value;

        list.Add(new Link
        {
            Raw = match.Value,                                  // store raw search
            Href = source,                                      // image source url
            IsRelativeUrl = CrawlerParser.IsRelativeUrl(source), // true if the url is relative
            IsImage = true                                      // mark as image link
        });
    }

    return list;
}
/// <summary>
/// Synchronously crawls a page: extracts and filters its links, records them in
/// <c>SiteNode</c> (and partly in <c>Links</c>), and recurses into each link.
/// </summary>
/// <param name="url">
/// Page to crawl. For the root call it is passed to <c>GetPageLinks</c> as the root url;
/// otherwise it is resolved against <c>RootUri</c> before the page is fetched.
/// </param>
/// <param name="isRoot"><c>true</c> to crawl the root page, <c>false</c> for a child page.</param>
/// <returns>
/// The filtered links of the page, or <c>null</c> when the page html is empty or no
/// link list could be produced.
/// </returns>
public IEnumerable<Link> ExtractAll(string url, bool isRoot)
{
    IEnumerable<Link> links = null;

    if (isRoot)
    {
        var html = GetRootPageHtml();
        if (string.IsNullOrEmpty(html))
        {
            return null;
        }

        links = FilterLinks(CrawlerParser.GetPageLinks(url, html));

        foreach (var link in links)
        {
            if (!this.Links.Contains(link))
            {
                this.Links.Add(link);
            }

            SiteNode.AddChild(link);
            ExtractAll(link.Href, false);
        }
    }
    else
    {
        url = CrawlerParser.GetAbsoluteUri(this.RootUri, url);

        var html = GetPageHtml(url);
        if (string.IsNullOrEmpty(html))
        {
            return null;
        }

        links = CrawlerParser.GetPageLinks(this.RootUri, html);
        if (links == null)
        {
            return null;
        }

        links = FilterLinks(links);

        foreach (var link in links)
        {
            var node = this.SiteNode.FindTreeNode(x => x.Data.Href.Equals(link.Href));

            if (node == null)
            {
                this.SiteNode.AddChild(link);

                // depthCounter2 is only ever incremented in this method, so this check
                // caps the total number of recursive descents.
                if (depthCounter2 < MAX_DEPTH_COUNT)
                {
                    depthCounter2++;

                    // The recursive call returns null for pages without html or links;
                    // AddRange would throw ArgumentNullException on that.
                    var descendants = ExtractAll(link.Href, false);
                    if (descendants != null)
                    {
                        this.Links.AddRange(descendants);
                    }
                }
            }
            else
            {
                if (depthCounter2 < MAX_DEPTH_COUNT)
                {
                    depthCounter2++;
                    node.AddChild(link);
                    link.Links = ExtractAll(link.Href, false);
                }
            }
        }
    }

    return links;
}
/// <summary>
/// Asynchronously crawls a page: extracts its links, keeps the Wipro links that are
/// neither javascript nor static, records them in <c>SiteNode</c> and recurses into
/// each link that is not black-listed.
/// </summary>
/// <param name="url">
/// Page to crawl; ignored for the root call, otherwise resolved against <c>RootUri</c>.
/// </param>
/// <param name="isRoot"><c>true</c> to crawl the root page, <c>false</c> for a child page.</param>
/// <returns>A task that completes when the page and its descendants have been processed.</returns>
public async Task ExtractAllAsync(string url, bool isRoot)
{
    IEnumerable<Link> links;

    if (isRoot)
    {
        var html = await GetRootPageHtmlAsync();

        links = CrawlerParser.GetPageLinks(this.RootUri, html);
        if (links == null)
        {
            return;
        }

        links = links.Where(x => !x.IsJavaScript && !x.IsStatic && x.IsWipro);

        foreach (var link in links)
        {
            try
            {
                if (link != null && !string.IsNullOrEmpty(link.Href))
                {
                    SiteNode.AddChild(link);

                    if (!blackListUrls.Contains(link.Href))
                    {
                        await ExtractAllAsync(link.Href, false);
                    }
                }
            }
            catch (Exception ex)
            {
                // Best effort: keep crawling the remaining links, but report the failure.
                System.Diagnostics.Debug.WriteLine(
                    "ExtractAllAsync: failed to process root link '{0}': {1}",
                    link != null ? link.Href : null,
                    ex);
            }
        }
    }
    else
    {
        url = CrawlerParser.GetAbsoluteUri(this.RootUri, url);

        string html = string.Empty;
        try
        {
            html = await GetPageAsync(url);
        }
        catch (Exception ex)
        {
            // A page that cannot be downloaded is treated as empty.
            System.Diagnostics.Debug.WriteLine("ExtractAllAsync: failed to download '{0}': {1}", url, ex);
        }

        links = CrawlerParser.GetPageLinks(this.RootUri, html);
        if (links == null)
        {
            return;
        }

        links = links.Where(x => !x.IsJavaScript && !x.IsStatic && x.IsWipro);

        foreach (var link in links)
        {
            try
            {
                var node = this.SiteNode.FindTreeNode(x => x.Data.Href.Equals(link.Href));

                // Unknown link: register it under the site root unless already present there.
                if (node == null && !this.SiteNode.Children.Any(x => x.Data.Href.Equals(link.Href)))
                {
                    this.SiteNode.AddChild(link);
                }

                // Descend only into links that are not black-listed, are not the root
                // itself (compared without culture-sensitive case folding) and while the
                // descent budget is not exhausted.
                bool descend = !blackListUrls.Contains(link.Href)
                    && !string.Equals(link.Href, this.RootUri, StringComparison.OrdinalIgnoreCase)
                    && depthCounter2 < MAX_DEPTH_COUNT;
                if (!descend)
                {
                    continue;
                }

                depthCounter2++;

                // Known, non-root node: attach the link beneath it once.
                if (node != null && !node.IsRoot && !node.Children.Any(x => x.Data.Href.Equals(link.Href)))
                {
                    node.AddChild(link);
                }

                await ExtractAllAsync(link.Href, false);

                if (node == null)
                {
                    Console.WriteLine("Href: {0}\nChildren Count: {1}\nElements Index: {2}", link.Href, SiteNode.Children.Count, SiteNode.ElementsIndex.Count);
                }
            }
            catch (Exception ex)
            {
                // Best effort: keep crawling the remaining links, but report the failure.
                System.Diagnostics.Debug.WriteLine(
                    "ExtractAllAsync: failed to process link '{0}': {1}",
                    link != null ? link.Href : null,
                    ex);
            }
        }
    }
}
/// <summary>
/// Downloads a child page, extracts and filters its links, records them in
/// <c>SiteNode</c> and recursively collects the links of each linked page into
/// <c>Link.Links</c>.
/// </summary>
/// <param name="url">Url of the page; resolved against <c>RootUri</c> before download.</param>
/// <returns>
/// The filtered links of the page, or <c>null</c> when
/// <c>CrawlerParser.GetPageLinks</c> returns <c>null</c> (for example after a failed download).
/// </returns>
private async Task<IEnumerable<Link>> GetChildPagesLinksAsync(string url)
{
    url = CrawlerParser.GetAbsoluteUri(this.RootUri, url);

    string html = string.Empty;
    try
    {
        html = await GetPageAsync(url);
    }
    catch (Exception ex)
    {
        // A page that cannot be downloaded is treated as empty.
        Debug.WriteLine("GetChildPagesLinksAsync: failed to download '{0}': {1}", url, ex);
    }

    IEnumerable<Link> links = CrawlerParser.GetPageLinks(this.RootUri, html);
    if (links == null)
    {
        return null;
    }

    links = FilterLinks(links);

    // The links are processed one after another and awaited. Passing an async lambda
    // to Parallel.ForEach creates async void delegates: the loop returns at the first
    // await, so this method would complete before the child pages were crawled, and
    // depthCounter2 and the site tree would be mutated concurrently.
    foreach (var link in links)
    {
        try
        {
            var node = this.SiteNode.FindTreeNode(x => x.Data.Href.Equals(link.Href));

            // Unknown link: register it under the site root unless already present there.
            if (node == null && !this.SiteNode.Children.Any(x => x.Data.Href.Equals(link.Href)))
            {
                this.SiteNode.AddChild(link);
            }

            if (depthCounter2 >= MAX_DEPTH_COUNT)
            {
                continue;
            }

            depthCounter2++;

            // Known, non-root node: attach the link beneath it once.
            if (node != null && !node.IsRoot && !node.Children.Any(x => x.Data.Href.Equals(link.Href)))
            {
                node.AddChild(link);
            }

            link.Links = await GetChildPagesLinksAsync(link.Href);

            Debug.WriteLine("Href: {0}\nChildren Count: {1}\nElements Index: {2}\nDepth: {3}", link.Href, SiteNode.Children.Count, SiteNode.ElementsIndex.Count, depthCounter2);
        }
        catch (Exception ex)
        {
            // Best effort: keep crawling the remaining links, but report the failure.
            Debug.WriteLine(
                "GetChildPagesLinksAsync: failed to process link '{0}': {1}",
                link != null ? link.Href : null,
                ex);
        }
    }

    return links;
}