Esempio n. 1
0
        //Use HTML agility pack to parse HTML pages;
        //if the page is within the same domain and it has not been visited and is not in the queue, add to the queue for processing
        /// <summary>
        /// Loads the page at <paramref name="uri"/> with HtmlAgilityPack and classifies every
        /// anchor, stylesheet, image and script reference found in it.
        /// </summary>
        /// <remarks>
        /// Classification, relative to the host of <c>this.WebUri</c>:
        /// same-host anchors are enqueued on <paramref name="pageQueue"/> unless they are already
        /// a key of <paramref name="visitedPages"/> or already queued; other same-host references
        /// are added to the result as static content; references to any other host are added as
        /// external links. References that cannot be resolved, or that resolve to the site root
        /// or to the page itself, are skipped.
        /// The page is loaded before the try block, so exceptions thrown by the load propagate to
        /// the caller; exceptions thrown while scanning the document are captured in the result's
        /// <c>ErrorMessage</c> instead.
        /// </remarks>
        /// <param name="uri">Address of the page to load and scan.</param>
        /// <param name="pageQueue">Queue of pages awaiting processing; newly discovered internal pages are appended.</param>
        /// <param name="visitedPages">Pages already processed; only consulted for membership.</param>
        /// <returns>The result object for <paramref name="uri"/>, with <c>ErrorMessage</c> set if scanning failed.</returns>
        private ParsedHtmlDocumentResult ParseHtmlDoc(Uri uri, Queue<Uri> pageQueue, IDictionary<Uri, ParsedHtmlDocumentResult> visitedPages)
        {
            var web           = new HtmlWeb();
            var htmlDoc       = web.Load(uri);
            var parsedHtmlDoc = new ParsedHtmlDocumentResult(uri);

            try
            {
                // XPath 1.0 compares attribute values literally (no wildcards), so the former
                // "@src='*.js'" test only matched a src that was exactly "*.js". Select every
                // script element that references an external file instead.
                var nodes = htmlDoc.DocumentNode.SelectNodes(
                    "//a[@href] | //link[@rel='stylesheet' and @href] | //img[@src] | //script[@src]");
                if (nodes == null)
                {
                    // SelectNodes yields null when nothing matches: nothing to classify.
                    return parsedHtmlDoc;
                }

                foreach (var node in nodes)
                {
                    var linkUrl = this.GetNodeLink(node);

                    // Use a dedicated local rather than overwriting the 'uri' parameter.
                    Uri linkUri = WebCrawlerUtil.ConvertToAbsoluteUri(linkUrl, parsedHtmlDoc.Uri);
                    if (linkUri == null || linkUri == this.WebUri || linkUri == parsedHtmlDoc.Uri)
                    {
                        // Unresolvable link, or a link back to the site root / current page.
                        continue;
                    }

                    if (linkUri.Host != this.WebUri.Host)
                    {
                        // Link to another host.
                        parsedHtmlDoc.AddExternalLink(linkUri);
                        continue;
                    }

                    if (node.Name != "a")
                    {
                        // Same-host stylesheet, image or script.
                        parsedHtmlDoc.AddStaticContent(linkUri);
                        continue;
                    }

                    // Same-host page: schedule it only once.
                    if (!visitedPages.ContainsKey(linkUri) && !pageQueue.Contains(linkUri))
                    {
                        pageQueue.Enqueue(linkUri);
                    }
                }

                return parsedHtmlDoc;
            }
            catch (Exception ex)
            {
                parsedHtmlDoc.ErrorMessage = string.Format("Exception occurred while processing {0}; message: {1}",
                                                           parsedHtmlDoc.Uri.AbsoluteUri, ex.Message);
                return parsedHtmlDoc;
            }
        }
Esempio n. 2
0
        /// <summary>
        /// Writes a report for one crawled page to the console: a header line with the page
        /// number and absolute URI, followed by its static contents and its external links.
        /// </summary>
        /// <param name="item">Crawl result whose contents are printed.</param>
        /// <param name="pageNumber">Number shown in the header line.</param>
        private static void DisplayResult(ParsedHtmlDocumentResult item, int pageNumber)
        {
            // Console.WriteLine forwards to Console.Out, so writing through it directly is equivalent.
            var stdout = Console.Out;

            var pageAddress = item.Uri.AbsoluteUri;
            stdout.WriteLine("Page #{0}: {1}", pageNumber, pageAddress);

            // Section 1: static content entries, one per line.
            stdout.WriteLine("\tInternal Static Content");
            foreach (var staticEntry in item.StaticContents)
            {
                stdout.WriteLine("\t\t{0}", staticEntry);
            }

            // Section 2: external link entries, one per line.
            stdout.WriteLine("\tExternal Links:");
            foreach (var externalEntry in item.ExternalLinks)
            {
                stdout.WriteLine("\t\t{0}", externalEntry);
            }
        }