/// <summary>
/// Flattens a page tree into a single list. The root is placed first and is followed by
/// every page reachable through <c>DirectChildren</c>, each parent appearing before its
/// own children.
/// </summary>
/// <param name="p_pgRootPage">Root of the tree to flatten.</param>
/// <returns>A newly created list holding the root and all of its descendants.</returns>
public static List<Page> ConvertTreeToListOfPages(Page p_pgRootPage)
{
    List<Page> lstFlattened = new List<Page>();

    // The recursive helper adds the page it is given and then walks that page's
    // children, so handing it the root covers the whole tree in one call.
    AddPageToCollection(p_pgRootPage, lstFlattened);

    return lstFlattened;
}
/// <summary>
/// Creates a page attached to the given parent, with a fresh empty document and empty
/// collections for meta tags, meta links, anchors, images, other tags and child pages.
/// </summary>
/// <param name="p_pParentPage">
/// Page stored in <c>Parent</c>. The value is not validated, so <c>null</c> is stored as-is.
/// </param>
public Page(Page p_pParentPage)
{
    // Position in the page tree
    Parent = p_pParentPage;
    DirectChildren = new List<Page>();

    // Parsed document and the data extracted from it; all start out empty
    Document = new HtmlDocument();
    MetaInfo = new List<PageMeta>();
    MetaLinkInfo = new List<PageMetaLink>();
    AnchorList = new List<Anchor>();
    ImageList = new List<Image>();
    OtherTags = new Dict<string, string>();
}
/// <summary>
/// Sends a single HTTP request to <paramref name="p_strURL"/> without following redirects
/// and records what came back on a new <see cref="Page"/>: the status code, HTML-formatted
/// dumps of the request and response headers, and (when it could be captured) the remote IP.
/// </summary>
/// <param name="p_strURL">Absolute URL to request.</param>
/// <param name="p_strMethod">HTTP method to use; it is upper-cased before being sent.</param>
/// <returns>
/// A new page whose <c>URL</c> is <paramref name="p_strURL"/>. Failures are not thrown to the
/// caller; the error text is stored in <c>LastError</c> instead.
/// </returns>
public Page Sniff(string p_strURL, string p_strMethod = "GET")
{
    // TODO: check URL well-formedness
    Page pCurrentPage = new Page();
    pCurrentPage.URL = p_strURL;

    try
    {
        IPEndPoint iepRemoteEP = null;

        HttpWebRequest hwrqRequest = (HttpWebRequest)WebRequest.Create(new Uri(p_strURL));
        hwrqRequest.AllowAutoRedirect = false; // Switch off AutoRedirect so that we can capture 301s & 302s

        // The bind delegate is only used to observe the remote end point; returning null
        // leaves the choice of local end point to the framework.
        hwrqRequest.ServicePoint.BindIPEndPointDelegate =
            delegate(ServicePoint p_spServicePoint, IPEndPoint p_iepRemoteEP, int p_intRetryCount)
            {
                iepRemoteEP = p_iepRemoteEP;
                return null;
            };

        // Invariant casing: HTTP method tokens must not depend on the current culture
        // (e.g. "options" upper-cases to a dotted capital I under Turkish cultures).
        hwrqRequest.Method = p_strMethod.ToUpperInvariant();

        WebResponse wrpResponse = null;

        // Getting a 404 throws a WebException so handle that here.
        try
        {
            wrpResponse = hwrqRequest.GetResponse();
        }
        catch (WebException wex)
        {
            // The exception may still carry the server's response; if it does, the real
            // status code replaces this NotFound placeholder further down.
            wrpResponse = (WebResponse)wex.Response;
            pCurrentPage.StatusCode = HttpStatusCode.NotFound;
            pCurrentPage.LastError = wex.ToString();
        }
        catch (Exception ex)
        {
            pCurrentPage.LastError = ex.ToString();
        }

        // If we've got a response, continue grabbing details
        if (wrpResponse != null)
        {
            // 'using' guarantees the response is released even if building the dumps throws.
            using (wrpResponse)
            {
                HttpWebResponse hwrpResponse = (HttpWebResponse)wrpResponse;

                pCurrentPage.StatusCode = hwrpResponse.StatusCode;

                // Only headers and properties are reported, so the response body is
                // neither downloaded nor parsed here.
                pCurrentPage.RequestHeader = BuildRequestHeaderHtml(hwrqRequest);
                pCurrentPage.ResponseHeader = BuildResponseHeaderHtml(hwrpResponse);

                // The bind delegate runs when a connection is being opened, so the end point
                // may not have been captured (presumably e.g. when a pooled connection is
                // reused). In that case RemoteIP is simply left untouched.
                if (iepRemoteEP != null)
                {
                    pCurrentPage.RemoteIP = iepRemoteEP.Address + "";
                }
            }
        }
    }
    catch (Exception ex)
    {
        pCurrentPage.LastError = ex.Message;
    }

    return pCurrentPage;
}

/// <summary>
/// Builds the HTML dump of a request: every header sorted by key, then a fixed list of
/// <see cref="HttpWebRequest"/> properties.
/// </summary>
private static string BuildRequestHeaderHtml(HttpWebRequest p_hwrqRequest)
{
    System.Text.StringBuilder sbHtml = new System.Text.StringBuilder();

    sbHtml.Append("<strong>----------- By Keys -----------</strong><br /> ");
    foreach (string strKey in p_hwrqRequest.Headers.AllKeys.OrderBy(key => key))
    {
        AppendHeaderHtmlLine(sbHtml, strKey, p_hwrqRequest.Headers[strKey]);
    }

    sbHtml.Append("<strong>----------- By Properties -----------</strong><br /> ");
    AppendHeaderHtmlLine(sbHtml, "Accept", p_hwrqRequest.Accept);
    AppendHeaderHtmlLine(sbHtml, "Address", p_hwrqRequest.Address);
    AppendHeaderHtmlLine(sbHtml, "AllowAutoRedirect", p_hwrqRequest.AllowAutoRedirect);
    AppendHeaderHtmlLine(sbHtml, "AllowWriteStreamBuffering", p_hwrqRequest.AllowWriteStreamBuffering);
    AppendHeaderHtmlLine(sbHtml, "AutomaticDecompression", p_hwrqRequest.AutomaticDecompression);
    AppendHeaderHtmlLine(sbHtml, "CachePolicy", p_hwrqRequest.CachePolicy);
    AppendHeaderHtmlLine(sbHtml, "Connection", p_hwrqRequest.Connection);
    AppendHeaderHtmlLine(sbHtml, "ContentLength", p_hwrqRequest.ContentLength);
    AppendHeaderHtmlLine(sbHtml, "ContentType", p_hwrqRequest.ContentType);
    AppendHeaderHtmlLine(sbHtml, "Date", p_hwrqRequest.Date);
    AppendHeaderHtmlLine(sbHtml, "HaveResponse", p_hwrqRequest.HaveResponse);
    AppendHeaderHtmlLine(sbHtml, "Host", p_hwrqRequest.Host);
    AppendHeaderHtmlLine(sbHtml, "IfModifiedSince", p_hwrqRequest.IfModifiedSince);
    AppendHeaderHtmlLine(sbHtml, "KeepAlive", p_hwrqRequest.KeepAlive);
    AppendHeaderHtmlLine(sbHtml, "MaximumAutomaticRedirections", p_hwrqRequest.MaximumAutomaticRedirections);
    AppendHeaderHtmlLine(sbHtml, "MaximumResponseHeadersLength", p_hwrqRequest.MaximumResponseHeadersLength);
    AppendHeaderHtmlLine(sbHtml, "MediaType", p_hwrqRequest.MediaType);
    AppendHeaderHtmlLine(sbHtml, "Method", p_hwrqRequest.Method);
    AppendHeaderHtmlLine(sbHtml, "Pipelined", p_hwrqRequest.Pipelined);
    AppendHeaderHtmlLine(sbHtml, "PreAuthenticate", p_hwrqRequest.PreAuthenticate);
    AppendHeaderHtmlLine(sbHtml, "ProtocolVersion", p_hwrqRequest.ProtocolVersion);
    // Passed as an object so that a null proxy renders as empty text instead of throwing.
    AppendHeaderHtmlLine(sbHtml, "Proxy", p_hwrqRequest.Proxy);
    AppendHeaderHtmlLine(sbHtml, "ReadWriteTimeout", p_hwrqRequest.ReadWriteTimeout);
    AppendHeaderHtmlLine(sbHtml, "Referer", p_hwrqRequest.Referer);
    AppendHeaderHtmlLine(sbHtml, "RequestUri", p_hwrqRequest.RequestUri);
    AppendHeaderHtmlLine(sbHtml, "SendChunked", p_hwrqRequest.SendChunked);
    AppendHeaderHtmlLine(sbHtml, "ServicePoint Address", p_hwrqRequest.ServicePoint.Address);
    AppendHeaderHtmlLine(sbHtml, "Timeout", p_hwrqRequest.Timeout);
    AppendHeaderHtmlLine(sbHtml, "TransferEncoding", p_hwrqRequest.TransferEncoding);
    AppendHeaderHtmlLine(sbHtml, "UnsafeAuthenticatedConnectionSharing", p_hwrqRequest.UnsafeAuthenticatedConnectionSharing);
    AppendHeaderHtmlLine(sbHtml, "UseDefaultCredentials", p_hwrqRequest.UseDefaultCredentials);
    AppendHeaderHtmlLine(sbHtml, "UserAgent", p_hwrqRequest.UserAgent);
    AppendHeaderHtmlLine(sbHtml, "Remote Address", p_hwrqRequest.Headers["REMOTE_ADDR"]);

    return sbHtml.ToString();
}

/// <summary>
/// Builds the HTML dump of a response: every header sorted by key, then a fixed list of
/// <see cref="HttpWebResponse"/> properties.
/// </summary>
private static string BuildResponseHeaderHtml(HttpWebResponse p_hwrpResponse)
{
    System.Text.StringBuilder sbHtml = new System.Text.StringBuilder();

    sbHtml.Append("<strong>----------- By Keys -----------</strong><br /> ");
    foreach (string strKey in p_hwrpResponse.Headers.AllKeys.OrderBy(key => key))
    {
        AppendHeaderHtmlLine(sbHtml, strKey, p_hwrpResponse.Headers[strKey]);
    }

    sbHtml.Append("<strong>----------- By Properties -----------</strong><br /> ");
    AppendHeaderHtmlLine(sbHtml, "CharacterSet", p_hwrpResponse.CharacterSet);
    AppendHeaderHtmlLine(sbHtml, "ContentEncoding", p_hwrpResponse.ContentEncoding);
    AppendHeaderHtmlLine(sbHtml, "ContentLength", p_hwrpResponse.ContentLength);
    AppendHeaderHtmlLine(sbHtml, "ContentType", p_hwrpResponse.ContentType);
    AppendHeaderHtmlLine(sbHtml, "IsFromCache", p_hwrpResponse.IsFromCache);
    AppendHeaderHtmlLine(sbHtml, "IsMutuallyAuthenticated", p_hwrpResponse.IsMutuallyAuthenticated);
    AppendHeaderHtmlLine(sbHtml, "LastModified", p_hwrpResponse.LastModified);
    AppendHeaderHtmlLine(sbHtml, "Method", p_hwrpResponse.Method);
    AppendHeaderHtmlLine(sbHtml, "ProtocolVersion", p_hwrpResponse.ProtocolVersion);
    AppendHeaderHtmlLine(sbHtml, "ResponseUri", p_hwrpResponse.ResponseUri);
    AppendHeaderHtmlLine(sbHtml, "Server", p_hwrpResponse.Server);
    AppendHeaderHtmlLine(sbHtml, "StatusCode", p_hwrpResponse.StatusCode);
    AppendHeaderHtmlLine(sbHtml, "StatusDescription", p_hwrpResponse.StatusDescription);
    AppendHeaderHtmlLine(sbHtml, "Remote Address", p_hwrpResponse.Headers["REMOTE_ADDR"]);

    return sbHtml.ToString();
}

/// <summary>
/// Appends one "<strong>name:</strong> value<br />" line. A null value renders as empty
/// text. Name and value are HTML-encoded because header content comes from the remote
/// server and must not be able to inject markup into the dump.
/// </summary>
private static void AppendHeaderHtmlLine(System.Text.StringBuilder p_sbHtml, string p_strName, object p_objValue)
{
    string strValue = p_objValue == null ? string.Empty : p_objValue.ToString();

    p_sbHtml.Append("<strong>")
            .Append(WebUtility.HtmlEncode(p_strName))
            .Append(":</strong> ")
            .Append(WebUtility.HtmlEncode(strValue))
            .Append("<br />");
}
/// <summary>
/// Requests <paramref name="p_strURL"/> (redirects are not followed) and, when the server
/// answers 200 OK, parses the page and fills in a new <see cref="Page"/> with its document,
/// doctype, title, meta info, meta links, anchors and images. A successfully parsed page is
/// added to <c>m_lstGlobalList</c> and to the parent's <c>DirectChildren</c>.
/// When <paramref name="blnIsRecursive"/> is true, every anchor that resolves to the same
/// scheme and host and whose URL is not yet in <c>m_lstGlobalList</c> is traversed as a child.
/// </summary>
/// <param name="p_strURL">URL of the page to fetch.</param>
/// <param name="p_pParentPage">Parent page, or <c>null</c> for the first page of a crawl.</param>
/// <param name="blnIsRecursive">Whether same-site links on the page should be followed.</param>
/// <returns>
/// The page created for <paramref name="p_strURL"/> (possibly carrying only a status code or
/// <c>LastError</c>), or <c>null</c> when a top-level or non-recursive call is given a URL
/// that is not a well-formed absolute URI.
/// </returns>
public Page TraversePages(string p_strURL, Page p_pParentPage = null, bool blnIsRecursive = false)
{
    // Grab the site domain on the very first call of the function (the only case when the parent is null).
    // This works for recursive calls, but not for non-recursive page crawls with a parent, hence the second condition.
    if (p_pParentPage == null || !blnIsRecursive)
    {
        if (!Uri.IsWellFormedUriString(p_strURL, UriKind.Absolute))
        {
            return null;
        }

        // Extract scheme + domain info (i.e. http(s)://www.domain.name)
        m_strSiteDomain = UsefulStuff.GrabHostAndScheme(p_strURL);
    }

    // Create new page instance (even if the parent's null)
    Page pCurrentPage = new Page(p_pParentPage);
    pCurrentPage.URL = p_strURL;

    // If the URL is not well formed, try to repair it against the site domain;
    // give up with an error on the page when that fails.
    if (!Uri.IsWellFormedUriString(p_strURL, UriKind.RelativeOrAbsolute))
    {
        pCurrentPage.LastError = "URL is not well formed.";
        p_strURL = UsefulStuff.CreateWellFormedUrl(p_strURL, m_strSiteDomain);

        if (string.IsNullOrEmpty(p_strURL))
        {
            pCurrentPage.LastError += " Could not resolve ill-formedness.";
            return pCurrentPage;
        }
    }

    // Only absolute URLs can be requested.
    if (!Uri.IsWellFormedUriString(p_strURL, UriKind.Absolute))
    {
        return pCurrentPage;
    }

    HttpWebRequest hwrqRequest = (HttpWebRequest)WebRequest.Create(p_strURL);
    hwrqRequest.AllowAutoRedirect = false; // Switch off AutoRedirect so that we can capture 301s & 302s

    WebResponse wrpResponse = null;

    // Getting a 404 throws a WebException so handle that here.
    try
    {
        wrpResponse = hwrqRequest.GetResponse();
    }
    catch (WebException wex)
    {
        // The exception may still carry the server's response; if it does, the real
        // status code replaces this NotFound placeholder below.
        wrpResponse = (WebResponse)wex.Response;
        pCurrentPage.StatusCode = HttpStatusCode.NotFound;
        pCurrentPage.LastError = wex.ToString();
    }
    catch (Exception ex)
    {
        pCurrentPage.LastError = ex.ToString();
        return pCurrentPage;
    }

    if (wrpResponse == null)
    {
        return pCurrentPage;
    }

    // Read everything that is needed from the response and release it straight away, so
    // that no response or stream is held open while the page is parsed and its links are
    // crawled recursively.
    string strPageHtmlContent = null;
    using (wrpResponse)
    {
        HttpWebResponse hwrpResponse = (HttpWebResponse)wrpResponse;

        // Add status code to page properties
        pCurrentPage.StatusCode = hwrpResponse.StatusCode;

        // Only pages that come back with a 200 are analysed.
        if (hwrpResponse.StatusCode == HttpStatusCode.OK)
        {
            using (Stream stmPageStream = hwrpResponse.GetResponseStream())
            using (StreamReader srdrPageReader = new StreamReader(stmPageStream))
            {
                strPageHtmlContent = srdrPageReader.ReadToEnd();
            }
        }
    }

    if (strPageHtmlContent == null)
    {
        return pCurrentPage;
    }

    // The body is read exactly once and the same text feeds both the HTML parser and
    // GetDocType; a reader that the parser has already consumed would yield an empty string.
    HtmlDocument htmlDocDocument = new HtmlDocument();
    htmlDocDocument.LoadHtml(strPageHtmlContent);

    pCurrentPage.Document = htmlDocDocument;
    pCurrentPage.DocType = GetDocType(strPageHtmlContent);
    pCurrentPage.Title = GetPageTitle(htmlDocDocument);
    pCurrentPage.MetaInfo = GetPageMetaInfo(htmlDocDocument);
    pCurrentPage.MetaLinkInfo = GetPageMetaLinkInfo(htmlDocDocument);
    pCurrentPage.AnchorList = GetAnchorList(htmlDocDocument);
    pCurrentPage.ImageList = GetImageList(htmlDocDocument);
    pCurrentPage.URL = UsefulStuff.CreateWellFormedUrl(p_strURL, m_strSiteDomain);

    // Register the page before following its links, so that links pointing back to it
    // are recognised as already traversed.
    m_lstGlobalList.Add(pCurrentPage);
    if (p_pParentPage != null)
    {
        p_pParentPage.DirectChildren.Add(pCurrentPage);
    }

    // If we're getting the page recursively (i.e. all the links on the page as well)
    // and we've got a list of anchors, follow them.
    if (blnIsRecursive && pCurrentPage.AnchorList != null)
    {
        foreach (Anchor anchor in pCurrentPage.AnchorList)
        {
            string strWellFormedHref = UsefulStuff.CreateWellFormedUrl(anchor.Href, m_strSiteDomain);

            // Check if we're staying on the site, otherwise we'll grab the whole internet ;)
            if (UsefulStuff.GrabHostAndScheme(strWellFormedHref) != m_strSiteDomain)
            {
                continue;
            }

            // Ignore pages that have been traversed before
            if (!m_lstGlobalList.Any(x => x.URL == strWellFormedHref))
            {
                TraversePages(strWellFormedHref, pCurrentPage, true);
            }
        }
    }

    return pCurrentPage;
}
/// <summary>
/// Appends a page to the target list and then does the same, recursively, for each of its
/// direct children, so every page ends up in the list ahead of its own descendants.
/// </summary>
/// <param name="p_pgPage">Page to add; its <c>DirectChildren</c> are visited afterwards.</param>
/// <param name="p_lstPgAllPages">List that receives the pages; it is modified in place.</param>
private static void AddPageToCollection(Page p_pgPage, List<Page> p_lstPgAllPages)
{
    // The page itself goes in first...
    p_lstPgAllPages.Add(p_pgPage);

    // ...followed by each child's whole subtree, in child order.
    foreach (Page pgChild in p_pgPage.DirectChildren)
    {
        AddPageToCollection(pgChild, p_lstPgAllPages);
    }
}