/// <summary>
/// Process a single pending crawl item and describe the outcome as a <c>WebPageWithDocument</c>.
/// Anything thrown while fetching or parsing the page is converted into an error-flavoured
/// web page object rather than propagated to the caller.
/// </summary>
/// <param name="item">
/// The item to process. External items are reported straight away without being fetched.
/// </param>
/// <param name="queue">
/// Queue that receives one new pending item for every <c>WebUrlLink</c> found on an HTML page.
/// </param>
/// <returns>
/// A result wrapping one of: <c>WebPage.External</c>, <c>WebPage.ExternalRedirect</c> (3xx status),
/// <c>WebPage.Internal</c> together with its parsed document (text/html), <c>WebPage.OtherContent</c>
/// (XML served as HTML, or any non-HTML content type), <c>WebPage.Timeout</c>,
/// <c>WebPage.LoadError</c> or <c>WebPage.ExceptionError</c>.
/// </returns>
private static WebPageWithDocument ProcessNextItem(PendingCrawlItem item, QueueWithDeduping<PendingCrawlItem> queue)
{
    Uri urlRoot = item.UrlRoot;

    // For external items we don't load or crawl them
    if (item is PendingCrawlItem.PendingExternalWebPage)
    {
        // We don't care about date in this case
        return new WebPageWithDocument { WebPage = new WebPage.External(item.Uri, item.UrlRoot, DateTime.UtcNow) };
    }

    // It must be an internal page ...
    try
    {
        Uri url = item.Uri;

        // Dispose the response on every exit path so the underlying connection is always released.
        // Every value read from it below is consumed before the using block ends.
        using (HttpWebResponse webResponse = FetchWebPageWithRetries(url))
        {
            DateTime lastModified = webResponse.LastModified;
            int statusCode = (int)webResponse.StatusCode;

            if (statusCode >= 300 && statusCode <= 399)
            {
                // Per the original author's note: this means we have been directed off the domain we were on.
                // The Location header may be absent, in which case null is passed through unchanged.
                string uriString = webResponse.Headers["Location"];
                return new WebPageWithDocument { WebPage = new WebPage.ExternalRedirect(url, uriString, urlRoot, lastModified) };
            }

            // Content types are protocol tokens, so compare ordinally rather than by culture.
            if (!webResponse.ContentType.StartsWith("text/html", StringComparison.OrdinalIgnoreCase))
            {
                // OTHER TYPES OF CONTENT - the body is never read; disposing the response
                // (via the enclosing using) closes its stream for us.
                return new WebPageWithDocument { WebPage = new WebPage.OtherContent(item.Uri, item.UrlRoot, webResponse.ContentType, lastModified) };
            }

            // COULD LOOK AT LAST-MODIFIED DATE AND PULL FROM CACHE ONLY AS NECESSARY / NOT READ THE WHOLE PAGE OVER AND OVER!
            HtmlDocument doc = new HtmlDocument();
            using (var resultStream = webResponse.GetResponseStream())
            {
                // The HtmlAgilityPack parser reads the whole stream here, so nothing below needs it open.
                doc.Load(resultStream, System.Text.Encoding.UTF8);
            }

            // Some servers label XML as text/html; detect it from the leading "?xml" node.
            var firstNode = doc.DocumentNode != null ? doc.DocumentNode.FirstChild : null;
            if (firstNode != null && firstNode.Name == "?xml")
            {
                Console.WriteLine("********* NOT AN HTML DOCUMENT, THIS IS XML ************");
                return new WebPageWithDocument { WebPage = new WebPage.OtherContent(url, urlRoot, "text/xml", lastModified) };
            }

            WebPage.Internal result = new WebPage.Internal(uri: url, root: urlRoot, dateTimeLastModified: lastModified);

            // TODO: Look at HttpStatus Code: 500 or 404 is not OK (but that would throw exception, right?)
            result.ProcessDocumentToFindLinks(doc);

            if (webResponse.StatusCode != HttpStatusCode.OK)
            {
                Console.WriteLine("********* ODD, WHY DIDN'T THIS THROW AN EXCEPTION ************");
            }

            // NOTE: an earlier heuristic treated pages whose title contained "error report" as load
            // errors. It was removed because it misclassified a genuine blog page; error pages should
            // be recognised from HTTP status codes instead.

            // Queue every link found on the page for a later visit.
            foreach (var link in result.Links.OfType<WebUrlLink>())
            {
                var pendingItem = PendingCrawlItem.Factory(urlRoot: urlRoot, href: link.Href, referrer: url);
                queue.Enqueue(pendingItem);
            }

            // Per the original note, the page keeps the document in a weak reference.
            result.HtmlDocument = doc;

            // Success, hand off the page
            return new WebPageWithDocument { WebPage = result, HtmlDocument = doc };
        }
    }
    // And if we got any kind of exception at all, return an appropriate web page problem object
    catch (WebException webException)
    {
        // Stays 0 when the failure carried no HTTP response (DNS failure, connection refused, ...).
        HttpStatusCode httpCode = 0;
        var errorResponse = webException.Response as HttpWebResponse;
        if (errorResponse != null)
        {
            httpCode = errorResponse.StatusCode;
            errorResponse.Close(); // release the connection held by the error response
        }

        // Prefer the structured status; keep the message checks as a fallback, done ordinally
        // so they do not depend on the current culture's casing rules.
        string message = webException.Message ?? string.Empty;
        bool timedOut = webException.Status == WebExceptionStatus.Timeout
            || message.IndexOf("timeout", StringComparison.OrdinalIgnoreCase) >= 0
            || message.IndexOf("timed out", StringComparison.OrdinalIgnoreCase) >= 0;

        if (timedOut)
        {
            return new WebPageWithDocument { WebPage = new WebPage.Timeout(item.Uri, urlRoot, DateTime.UtcNow) };
        }

        return new WebPageWithDocument { WebPage = new WebPage.LoadError(item.Uri, httpCode, item.UriReferrer, urlRoot, DateTime.UtcNow) };
    }
    catch (Exception exception)
    {
        return new WebPageWithDocument { WebPage = new WebPage.ExceptionError(item.Uri, exception, urlRoot, DateTime.UtcNow) };
    }
}
/// <summary>
/// Crawl a web site (or part of a web site) starting at <paramref name="urlRoot"/>, yielding one
/// result per processed queue item: internal pages as WebPage.Internal, plus every external page
/// (or other Url) linked to the web site as a WebPage.External, plus any error/other-content results.
/// </summary>
/// <remarks>
/// Filter on the <c>WebPage</c> property (e.g. <c>r.WebPage is WebPage.Internal</c>) to get just the
/// internal pages if that's what you want.
/// This is an iterator: the root request and the crawl only start once the sequence is enumerated,
/// and each further page is fetched as the caller asks for the next element.
/// Visiting each link just once relies on the de-duplicating queue (assumed from its type name;
/// its implementation is not visible here).
/// </remarks>
/// <param name="urlRoot">
/// Where to start. If the root request is redirected, the redirect target replaces this value as the root.
/// </param>
/// <param name="delayBetweenPagesInSeconds">
/// Pause, in seconds, after every result that is not a WebPage.External. Zero or negative means no pause.
/// </param>
/// <param name="excludedPaths">
/// Items whose absolute URI matches any of these patterns are skipped. Null means nothing is excluded.
/// </param>
/// <returns>A lazily produced sequence of crawl results.</returns>
public static IEnumerable<WebPageWithDocument> GetAllPagesUnderWithDocument(Uri urlRoot, int delayBetweenPagesInSeconds, Regex[] excludedPaths)
{
    // Get the root page ... only to discover whether the root itself redirects somewhere else.
    // The response is disposed straight away; ResponseUri stays usable afterwards.
    using (HttpWebResponse webResponse = FetchWebPageWithRetries(urlRoot))
    {
        if (webResponse.ResponseUri.AbsoluteUri != urlRoot.AbsoluteUri)
        {
            Console.WriteLine("*** ROOT REQUEST WAS REDIRECTED USING: " + webResponse.ResponseUri + "***");
            urlRoot = webResponse.ResponseUri;
        }
    }

    var queue = new QueueWithDeduping<PendingCrawlItem>();
    var start = PendingCrawlItem.Factory(urlRoot: urlRoot, href: urlRoot, referrer: urlRoot);
    queue.Enqueue(start);

    while (queue.Count > 0)
    {
        // pull an item off the queue, inspect it and deal with it
        var nextItem = queue.Dequeue();

        if (excludedPaths != null && excludedPaths.Any(p => p.IsMatch(nextItem.Uri.AbsoluteUri)))
        {
            Console.WriteLine("Skipping " + nextItem.Uri + " because path is excluded");
            continue;
        }

        // Processing may enqueue further items, which keeps this loop going.
        var result = ProcessNextItem(nextItem, queue);
        yield return result;

        // And delay but only on internal pages - external pages don't count.
        // Thread.Sleep(int) takes milliseconds, so convert the seconds value explicitly.
        if (delayBetweenPagesInSeconds > 0 && !(result.WebPage is WebPage.External))
        {
            Thread.Sleep(TimeSpan.FromSeconds(delayBetweenPagesInSeconds));
        }
    }
}