async Task handle(string url) { if (VisitedUrl.Add(url)) { var context = BrowsingContext.New(Configuration.Default.WithDefaultLoader()); try { var document = await context.OpenAsync(url); foreach (var item in document.QuerySelectorAll("[href]")) { var u = fixUrl(item.GetAttribute("href")); try { if (new Uri(u).Host.EndsWith(StartUrl.Host) && !PendingUrl.Contains(u) && !VisitedUrl.Contains(u)) { PendingUrl.Enqueue(u); } } catch (System.UriFormatException) { ErrorUrl.Add(u); } } foreach (var item in document.QuerySelectorAll("[src]")) { var u = fixUrl(item.GetAttribute("src")); if (FileTypes.Contains(u.Split('.')[^ 1]) && VisitedUrl.Add(u))
/// <summary>
/// Crawls a given XML document (sitemap) by URI, checking each found URI's date if possible
/// and filtering out links disallowed or older than 2 months (62 days). Places found and
/// valid XMLs/URLs into the relevant Azure queue for processing.
/// </summary>
/// <param name="data">
/// Crawler data helper. Ref.
/// </param>
/// <param name="storage">
/// Crawler azure storage helper. Ref.
/// </param>
/// <param name="xml">
/// URI to XML document (sitemap) to crawl.
/// </param>
public static void CrawlXml(ref CrawlrDataHelper data, ref CrawlrStorageManager storage, string xml)
{
    // Respect robots/allow-list before touching the sitemap at all.
    if (!data.ChkIfUriAllowed(xml))
    {
        return;
    }

    try
    {
        // Ordinal comparison: ".xml" is a machine token, not linguistic text (CA1310).
        if (!xml.EndsWith(".xml", StringComparison.Ordinal))
        {
            return;
        }

        XElement sitemap = XElement.Load(xml);
        string sitemapType = sitemap.Name.LocalName;
        string nameSpace = sitemap.GetDefaultNamespace().ToString();

        // News/video sitemaps carry a publication date under their own namespace;
        // plain sitemaps leave dateParent/date null.
        XName dateParent = null;
        XName date = null;
        string sitemapText = sitemap.ToString();
        if (sitemapText.Contains(@"xmlns:news"))
        {
            string dateNameSpace = sitemap.GetNamespaceOfPrefix("news").ToString();
            dateParent = XName.Get("news", dateNameSpace);
            date = XName.Get("publication_date", dateNameSpace);
        }
        else if (sitemapText.Contains(@"xmlns:video"))
        {
            string dateNameSpace = sitemap.GetNamespaceOfPrefix("video").ToString();
            dateParent = XName.Get("video", dateNameSpace);
            date = XName.Get("publication_date", dateNameSpace);
        }

        XName loc = XName.Get("loc", nameSpace);
        XName lastMod = XName.Get("lastmod", nameSpace);

        // A <sitemapindex> nests <sitemap> children; a plain sitemap nests <url>.
        XName elementSelector = sitemapType == "sitemapindex"
            ? XName.Get("sitemap", nameSpace)
            : XName.Get("url", nameSpace);

        List<string> xmlsToQueue = new List<string>();
        List<string> urlsToQueue = new List<string>();

        foreach (var element in sitemap.Elements(elementSelector))
        {
            // A malformed entry without <loc> is skipped instead of aborting the
            // whole sitemap with a NullReferenceException.
            var currLocElement = element.Element(loc);
            if (currLocElement == null)
            {
                continue;
            }

            string currLocValue = currLocElement.Value;

            // Prefer <lastmod>; fall back to news:/video: publication_date if present.
            var currLastModElement = element.Element(lastMod);
            if (currLastModElement == null && dateParent != null)
            {
                currLastModElement = element.Element(dateParent);
                currLastModElement = currLastModElement == null
                    ? null
                    : currLastModElement.Element(date);
            }

            // Entries without any date pass the age filter; dated entries must be
            // at most 62 days old. Sitemap dates are W3C datetime, so parse
            // culture-invariantly (CA1305) and compare in UTC; an unparseable
            // date marks the entry stale instead of aborting the crawl.
            bool validDateIfExists = true;
            if (currLastModElement != null)
            {
                validDateIfExists =
                    DateTime.TryParse(
                        currLastModElement.Value,
                        System.Globalization.CultureInfo.InvariantCulture,
                        System.Globalization.DateTimeStyles.AssumeUniversal
                            | System.Globalization.DateTimeStyles.AdjustToUniversal,
                        out DateTime parsedDate)
                    && parsedDate >= DateTime.UtcNow - TimeSpan.FromDays(62);
            }

            if (currLocValue.Contains(".xml"))
            {
                if (validDateIfExists && !data.QueuedXmls.Contains(currLocValue))
                {
                    xmlsToQueue.Add(currLocValue);
                }
            }
            else if (validDateIfExists && !data.QueuedUrls.Contains(currLocValue))
            {
                urlsToQueue.Add(currLocValue);
            }
        }

        foreach (string newXml in xmlsToQueue)
        {
            CloudQueueMessage msg = new CloudQueueMessage(newXml);
            storage.XmlQueue.AddMessage(msg);
            data.QueuedXmls.Add(newXml);
            data.NumXmlsQueued++;
        }

        foreach (string newUrl in urlsToQueue)
        {
            UrlCrawlr.ChkAndAddUrl(newUrl, xml, null, ref data, ref storage);
        }
    }
    catch (Exception ex)
    {
        // Record the failing sitemap URI and exception for later inspection.
        ErrorUrl errorUrl = new ErrorUrl(xml, ex.ToString());
        TableOperation insertErrorUrl = TableOperation.InsertOrReplace(errorUrl);
        storage.ErrorTable.Execute(insertErrorUrl);
    }
}
/// <summary>
/// Crawls a given URL, queueing all found URLs and storing information about
/// the given URL for later querying.
/// </summary>
/// <param name="data">
/// Crawler data helper. Ref.
/// </param>
/// <param name="storage">
/// Crawler azure storage helper. Ref.
/// </param>
/// <param name="url">
/// The given URL to crawl.
/// </param>
public static void CrawlUrl(ref CrawlrDataHelper data, ref CrawlrStorageManager storage, string url)
{
    // Respect robots/allow-list before fetching the page.
    if (!data.ChkIfUriAllowed(url))
    {
        return;
    }

    try
    {
        var web = new HtmlWeb();
        var currDoc = web.Load(url);

        // A page with no <title> is still crawled; the previous .First() call
        // threw and sent an otherwise-valid page to the error table.
        var titleNode = currDoc.DocumentNode.Descendants("title").FirstOrDefault();
        string urlPageTitle = titleNode != null ? titleNode.InnerText : "NULL";

        // Publication date, if the page declares <meta name="pubdate" content="...">.
        // TryParse: a malformed pubdate on a third-party page should not abort
        // the crawl of that page.
        var urlLastModNode = currDoc.DocumentNode.Descendants("meta")
            .Where(y => y.Attributes.Contains("name"))
            .Where(y => y.Attributes["name"].Value == "pubdate")
            .ToList();
        DateTime? urlLastMod = null;
        if (urlLastModNode.Count > 0
            && DateTime.TryParse(
                urlLastModNode[0].Attributes["content"].Value,
                out DateTime parsedLastMod))
        {
            urlLastMod = parsedLastMod;
        }

        // Collect every href on the page, then queue them for crawling.
        List<string> urlsToQueue = new List<string>();
        foreach (var urlNode in currDoc.DocumentNode.Descendants("a"))
        {
            if (urlNode.Attributes.Contains("href"))
            {
                urlsToQueue.Add(urlNode.Attributes["href"].Value);
            }
        }

        foreach (string newUrl in urlsToQueue)
        {
            ChkAndAddUrl(newUrl, url, urlLastMod, ref data, ref storage);
        }

        if (!data.AddedUrls.Contains(url))
        {
            data.AddedUrls.Add(url);
            data.NumUrlsIndexed++;
        }

        data.NumUrlsCrawled++;

        // Persist the crawled page and updated counters.
        FoundUrl finishedUrl = new FoundUrl(
            urlPageTitle,
            urlLastMod != null ? urlLastMod.ToString() : "NULL",
            url);
        UrlTableCount newCount = new UrlTableCount(data.NumUrlsCrawled, data.NumUrlsIndexed);
        TableOperation insertUrl = TableOperation.InsertOrReplace(finishedUrl);
        TableOperation insertCount = TableOperation.InsertOrReplace(newCount);
        storage.UrlTable.Execute(insertUrl);
        storage.UrlTable.Execute(insertCount);

        // Keep a rolling window of the ten most recently crawled URLs.
        if (data.LastTenUrls.Count == 10)
        {
            data.LastTenUrls.Dequeue();
        }

        data.LastTenUrls.Enqueue(url);
    }
    catch (Exception ex)
    {
        // Record the failing URL and exception for later inspection.
        ErrorUrl errorUrl = new ErrorUrl(url, ex.ToString());
        TableOperation insertErrorUrl = TableOperation.InsertOrReplace(errorUrl);
        storage.ErrorTable.Execute(insertErrorUrl);
    }
}