/// <summary>
/// Crawls the site breadth-first, starting from <c>WebUri</c>, and lazily yields one
/// <c>ParsedHtmlDocumentResult</c> per page parsed.
/// </summary>
/// <remarks>
/// This is an iterator: nothing runs (including the start-URL check) until the
/// returned sequence is enumerated. <c>WebUri</c> is overwritten with the value returned by
/// <c>WebCrawlerUtil.GetResponseUri</c> (presumably the final response URI - confirm against that helper).
/// </remarks>
/// <returns>The start page result first, then each queued page in the order it was discovered.</returns>
/// <exception cref="Exception">
/// Thrown on first enumeration when <c>WebCrawlerUtil.GetResponseUri</c> returns null for the start URL.
/// </exception>
public IEnumerable<ParsedHtmlDocumentResult> Craw()
{
    // FIFO of pages discovered but not yet parsed; FIFO order is what makes the crawl breadth-first.
    var pending = new Queue<Uri>();

    // Every page parsed so far, keyed by its URI, so no page is parsed twice.
    IDictionary<Uri, ParsedHtmlDocumentResult> crawled = new Dictionary<Uri, ParsedHtmlDocumentResult>();

    // Check that the start URL responds before doing anything else.
    var resolvedUri = WebCrawlerUtil.GetResponseUri(this.WebUri);
    if (resolvedUri is null)
    {
        throw new Exception(string.Format("The URL is either invalid or not found: {0}", this.WebUri.AbsoluteUri));
    }

    this.WebUri = resolvedUri;

    // The start page is parsed unconditionally; parsing it seeds the queue.
    var rootResult = this.ParseHtmlDoc(this.WebUri, pending, crawled);
    crawled.Add(this.WebUri, rootResult);
    yield return rootResult;

    // Drain the queue; ParseHtmlDoc may append newly discovered pages as we go.
    while (pending.Count != 0)
    {
        var nextUri = pending.Dequeue();
        if (crawled.ContainsKey(nextUri))
        {
            continue;
        }

        var pageResult = this.ParseHtmlDoc(nextUri, pending, crawled);
        crawled.Add(nextUri, pageResult);
        yield return pageResult;
    }
}
/// <summary>
/// Downloads and parses one page with HTML Agility Pack and classifies every link found on it.
/// </summary>
/// <remarks>
/// For each anchor, stylesheet link, image and JavaScript file reference on the page:
/// links whose host differs from <c>WebUri.Host</c> are recorded as external links;
/// same-host anchors are enqueued for crawling unless already visited or already queued;
/// any other same-host reference is recorded as static content.
/// Links that resolve to null, to <c>WebUri</c>, or to the page itself are skipped.
/// </remarks>
/// <param name="uri">The page to download and parse.</param>
/// <param name="pageQueue">Queue of pages awaiting a crawl; newly discovered internal pages are appended.</param>
/// <param name="visitedPages">Pages already crawled; only read here to avoid re-queueing.</param>
/// <returns>
/// The result for <paramref name="uri"/>. If downloading or parsing throws, the exception is not
/// propagated; its message is stored in <c>ErrorMessage</c> and the partially filled result is returned.
/// </returns>
private ParsedHtmlDocumentResult ParseHtmlDoc(Uri uri, Queue<Uri> pageQueue, IDictionary<Uri, ParsedHtmlDocumentResult> visitedPages)
{
    var parsedHtmlDoc = new ParsedHtmlDocumentResult(uri);
    try
    {
        // Loading happens inside the try so that a failed download is recorded on this
        // page's result instead of aborting the whole crawl.
        var web = new HtmlWeb();
        var htmlDoc = web.Load(uri);

        // XPath 1.0 has no wildcard matching: @src='*.js' would only match the literal
        // string "*.js". contains() is used to select script references to .js files.
        var nodes = htmlDoc.DocumentNode.SelectNodes(
            "//a[@href] | //link[@rel='stylesheet' and @href] | //img[@src] | //script[@type='text/javascript' and contains(@src, '.js')]");

        // SelectNodes returns null (not an empty collection) when nothing matches.
        if (nodes == null)
        {
            return parsedHtmlDoc;
        }

        foreach (var node in nodes)
        {
            var linkUrl = this.GetNodeLink(node);
            var linkUri = WebCrawlerUtil.ConvertToAbsoluteUri(linkUrl, parsedHtmlDoc.Uri);

            // Skip unresolvable links and links back to the site root or to this page.
            if (linkUri == null || linkUri == this.WebUri || linkUri == parsedHtmlDoc.Uri)
            {
                //log messages
                continue;
            }

            if (linkUri.Host != this.WebUri.Host)
            {
                // links to external pages
                parsedHtmlDoc.AddExternalLink(linkUri);
                continue;
            }

            if (node.Name != "a")
            {
                // internal stylesheet / image / script
                parsedHtmlDoc.AddStaticContent(linkUri);
                continue;
            }

            // links to internal pages: schedule each page at most once
            if (!visitedPages.ContainsKey(linkUri) && !pageQueue.Contains(linkUri))
            {
                pageQueue.Enqueue(linkUri);
            }
        }

        return parsedHtmlDoc;
    }
    catch (Exception ex)
    {
        parsedHtmlDoc.ErrorMessage = string.Format("Exception occured while peocessing {0}; message: {1}", parsedHtmlDoc.Uri.AbsoluteUri, ex.Message);
        return parsedHtmlDoc;
    }
}