Beispiel #1
0
 /// <summary>
 /// Flattens a page tree into a single list: the root page first, followed by
 /// every descendant reached through <c>DirectChildren</c>.
 /// </summary>
 /// <param name="p_pgRootPage">Root of the tree to flatten.</param>
 /// <returns>A new list holding the root and all pages below it.</returns>
 public static List<Page> ConvertTreeToListOfPages(Page p_pgRootPage)
 {
     var lstFlattened = new List<Page>();

     //  AddPageToCollection appends the page it is given before walking that page's
     //  children, so starting it at the root yields "root, then each subtree in order".
     AddPageToCollection(p_pgRootPage, lstFlattened);

     return lstFlattened;
 }
Beispiel #2
0
 /// <summary>
 /// Creates a page linked to the given parent and gives every document/collection
 /// property a fresh, empty instance.
 /// </summary>
 /// <param name="p_pParentPage">Page stored as this page's <c>Parent</c>.</param>
 public Page(Page p_pParentPage)
 {
     //  Position in the page tree
     Parent = p_pParentPage;

     //  Parsed document holder
     Document = new HtmlDocument();

     //  Per-page collections, all starting out empty
     MetaInfo = new List<PageMeta>();
     MetaLinkInfo = new List<PageMetaLink>();
     AnchorList = new List<Anchor>();
     ImageList = new List<Image>();
     OtherTags = new Dict<string, string>();
     DirectChildren = new List<Page>();
 }
Beispiel #3
0
        /// <summary>
        /// Sends a single HTTP request to <paramref name="p_strURL"/> (redirects are not followed) and
        /// records the status code, an HTML dump of the request and response headers, and the remote IP
        /// on a new <see cref="Page"/>.
        /// </summary>
        /// <param name="p_strURL">URL to request; it is passed to <c>new Uri(...)</c> as-is.</param>
        /// <param name="p_strMethod">HTTP method name; upper-cased (culture-invariant) before use.</param>
        /// <returns>
        /// The page that was filled in. Failures do not propagate to the caller: they are stored
        /// in the page's <c>LastError</c>.
        /// </returns>
        public Page Sniff(string p_strURL, string p_strMethod = "GET")
        {
            //  TODO: check URL well-formedness

            Page pCurrentPage = new Page();
            pCurrentPage.URL = p_strURL;

            try
            {
                //  Build up the request to the URL
                IPEndPoint iepRemoteEP = null;
                HttpWebRequest hwrqRequest = (HttpWebRequest)WebRequest.Create(new Uri(p_strURL));
                hwrqRequest.AllowAutoRedirect = false;  //  Switch off AutoRedirect so that we can capture 301s & 302s

                //  Capture the remote end point the connection is bound to.
                //  NOTE(review): the delegate lives on the ServicePoint, which appears to be shared per host,
                //  and it may not run when an existing connection is reused - hence the null check below.
                hwrqRequest.ServicePoint.BindIPEndPointDelegate = delegate(ServicePoint p_spServicePoint, IPEndPoint p_iepRemoteEP, int p_intRetryCount)
                {
                    iepRemoteEP = p_iepRemoteEP;
                    return null;
                };

                //  Invariant casing: HTTP method names must not depend on the current culture
                hwrqRequest.Method = p_strMethod.ToUpperInvariant();

                WebResponse wrpResponse = null;

                //  Getting a 404 throws a WebException so handle that here.
                try
                {
                    wrpResponse = hwrqRequest.GetResponse();
                }
                catch (WebException wex)
                {
                    //  The exception may still carry a response; if it does, its real status
                    //  code replaces the NotFound placeholder further down.
                    wrpResponse = (WebResponse)wex.Response;
                    pCurrentPage.StatusCode = HttpStatusCode.NotFound;
                    pCurrentPage.LastError = wex.ToString();
                }
                catch (Exception ex)
                {
                    pCurrentPage.LastError = ex.ToString();
                }

                //  If we've got a response, continue grabbing details
                if (wrpResponse != null)
                {
                    try
                    {
                        HttpWebResponse hwrpResponse = (HttpWebResponse)wrpResponse;

                        //  Add status code to page properties
                        pCurrentPage.StatusCode = hwrpResponse.StatusCode;

                        using (Stream stmPageStream = wrpResponse.GetResponseStream())
                        {
                            using (StreamReader srdrPageReader = new StreamReader(stmPageStream))
                            {
                                //  The parsed document is not stored on the page; the load is kept so
                                //  the response body is still read exactly as before.
                                HtmlDocument htmlDocDocument = new HtmlDocument();
                                htmlDocDocument.Load(srdrPageReader);

                                pCurrentPage.RequestHeader = BuildRequestHeaderHtml(hwrqRequest);
                                pCurrentPage.ResponseHeader = BuildResponseHeaderHtml(hwrpResponse);

                                //  Empty when the bind delegate never ran, instead of failing the whole sniff
                                pCurrentPage.RemoteIP = (iepRemoteEP == null) ? string.Empty : iepRemoteEP.Address + "";
                            }
                        }
                    }
                    finally
                    {
                        //  Always release the response, even when reading or dumping it failed
                        wrpResponse.Close();
                    }
                }
            }
            catch (Exception ex)
            {
                pCurrentPage.LastError = ex.Message;
            }

            return pCurrentPage;
        }

        /// <summary>
        /// Renders the request's header collection (sorted by key) followed by a fixed list of
        /// request properties as HTML lines.
        /// </summary>
        private static string BuildRequestHeaderHtml(HttpWebRequest p_hwrqRequest)
        {
            System.Text.StringBuilder sbHtml = new System.Text.StringBuilder();

            sbHtml.Append("<strong>----------- By Keys -----------</strong><br /> ");
            foreach (string strKey in p_hwrqRequest.Headers.AllKeys.OrderBy(key => key))
            {
                AppendHeaderLine(sbHtml, strKey, p_hwrqRequest.Headers[strKey]);
            }

            sbHtml.Append("<strong>----------- By Properties -----------</strong><br /> ");
            AppendHeaderLine(sbHtml, "Accept", p_hwrqRequest.Accept);
            AppendHeaderLine(sbHtml, "Address", p_hwrqRequest.Address);
            AppendHeaderLine(sbHtml, "AllowAutoRedirect", p_hwrqRequest.AllowAutoRedirect);
            AppendHeaderLine(sbHtml, "AllowWriteStreamBuffering", p_hwrqRequest.AllowWriteStreamBuffering);
            AppendHeaderLine(sbHtml, "AutomaticDecompression", p_hwrqRequest.AutomaticDecompression);
            AppendHeaderLine(sbHtml, "CachePolicy", p_hwrqRequest.CachePolicy);
            AppendHeaderLine(sbHtml, "Connection", p_hwrqRequest.Connection);
            AppendHeaderLine(sbHtml, "ContentLength", p_hwrqRequest.ContentLength);
            AppendHeaderLine(sbHtml, "ContentType", p_hwrqRequest.ContentType);
            AppendHeaderLine(sbHtml, "Date", p_hwrqRequest.Date);
            AppendHeaderLine(sbHtml, "HaveResponse", p_hwrqRequest.HaveResponse);
            AppendHeaderLine(sbHtml, "Host", p_hwrqRequest.Host);
            AppendHeaderLine(sbHtml, "IfModifiedSince", p_hwrqRequest.IfModifiedSince);
            AppendHeaderLine(sbHtml, "KeepAlive", p_hwrqRequest.KeepAlive);
            AppendHeaderLine(sbHtml, "MaximumAutomaticRedirections", p_hwrqRequest.MaximumAutomaticRedirections);
            AppendHeaderLine(sbHtml, "MaximumResponseHeadersLength", p_hwrqRequest.MaximumResponseHeadersLength);
            AppendHeaderLine(sbHtml, "MediaType", p_hwrqRequest.MediaType);
            AppendHeaderLine(sbHtml, "Method", p_hwrqRequest.Method);
            AppendHeaderLine(sbHtml, "Pipelined", p_hwrqRequest.Pipelined);
            AppendHeaderLine(sbHtml, "PreAuthenticate", p_hwrqRequest.PreAuthenticate);
            AppendHeaderLine(sbHtml, "ProtocolVersion", p_hwrqRequest.ProtocolVersion);
            //  Passed as an object so a null proxy renders as empty text instead of throwing
            AppendHeaderLine(sbHtml, "Proxy", p_hwrqRequest.Proxy);
            AppendHeaderLine(sbHtml, "ReadWriteTimeout", p_hwrqRequest.ReadWriteTimeout);
            AppendHeaderLine(sbHtml, "Referer", p_hwrqRequest.Referer);
            AppendHeaderLine(sbHtml, "RequestUri", p_hwrqRequest.RequestUri);
            AppendHeaderLine(sbHtml, "SendChunked", p_hwrqRequest.SendChunked);
            AppendHeaderLine(sbHtml, "ServicePoint Address", p_hwrqRequest.ServicePoint.Address);
            AppendHeaderLine(sbHtml, "Timeout", p_hwrqRequest.Timeout);
            AppendHeaderLine(sbHtml, "TransferEncoding", p_hwrqRequest.TransferEncoding);
            AppendHeaderLine(sbHtml, "UnsafeAuthenticatedConnectionSharing", p_hwrqRequest.UnsafeAuthenticatedConnectionSharing);
            AppendHeaderLine(sbHtml, "UseDefaultCredentials", p_hwrqRequest.UseDefaultCredentials);
            AppendHeaderLine(sbHtml, "UserAgent", p_hwrqRequest.UserAgent);
            AppendHeaderLine(sbHtml, "Remote Address", p_hwrqRequest.Headers["REMOTE_ADDR"]);

            return sbHtml.ToString();
        }

        /// <summary>
        /// Renders the response's header collection (sorted by key) followed by a fixed list of
        /// response properties as HTML lines.
        /// </summary>
        private static string BuildResponseHeaderHtml(HttpWebResponse p_hwrpResponse)
        {
            System.Text.StringBuilder sbHtml = new System.Text.StringBuilder();

            sbHtml.Append("<strong>----------- By Keys -----------</strong><br /> ");
            foreach (string strKey in p_hwrpResponse.Headers.AllKeys.OrderBy(key => key))
            {
                AppendHeaderLine(sbHtml, strKey, p_hwrpResponse.Headers[strKey]);
            }

            sbHtml.Append("<strong>----------- By Properties -----------</strong><br /> ");
            AppendHeaderLine(sbHtml, "CharacterSet", p_hwrpResponse.CharacterSet);
            AppendHeaderLine(sbHtml, "ContentEncoding", p_hwrpResponse.ContentEncoding);
            AppendHeaderLine(sbHtml, "ContentLength", p_hwrpResponse.ContentLength);
            AppendHeaderLine(sbHtml, "ContentType", p_hwrpResponse.ContentType);
            AppendHeaderLine(sbHtml, "IsFromCache", p_hwrpResponse.IsFromCache);
            AppendHeaderLine(sbHtml, "IsMutuallyAuthenticated", p_hwrpResponse.IsMutuallyAuthenticated);
            AppendHeaderLine(sbHtml, "LastModified", p_hwrpResponse.LastModified);
            AppendHeaderLine(sbHtml, "Method", p_hwrpResponse.Method);
            AppendHeaderLine(sbHtml, "ProtocolVersion", p_hwrpResponse.ProtocolVersion);
            AppendHeaderLine(sbHtml, "ResponseUri", p_hwrpResponse.ResponseUri);
            AppendHeaderLine(sbHtml, "Server", p_hwrpResponse.Server);
            AppendHeaderLine(sbHtml, "StatusCode", p_hwrpResponse.StatusCode);
            AppendHeaderLine(sbHtml, "StatusDescription", p_hwrpResponse.StatusDescription);
            AppendHeaderLine(sbHtml, "Remote Address", p_hwrpResponse.Headers["REMOTE_ADDR"]);

            return sbHtml.ToString();
        }

        /// <summary>
        /// Appends one line made of a bold label, a colon, the value and a line break. Label and value
        /// are HTML-encoded because header names and values come from the remote server or the
        /// requested URL; a null value is rendered as empty text.
        /// </summary>
        private static void AppendHeaderLine(System.Text.StringBuilder p_sbTarget, string p_strLabel, object p_objValue)
        {
            string strValue = (p_objValue == null) ? string.Empty : p_objValue.ToString();

            p_sbTarget.Append("<strong>")
                      .Append(WebUtility.HtmlEncode(p_strLabel))
                      .Append(":</strong> ")
                      .Append(WebUtility.HtmlEncode(strValue))
                      .Append("<br />");
        }
Beispiel #4
0
        /// <summary>
        /// Requests <paramref name="p_strURL"/>, fills a new <see cref="Page"/> from the response and, when
        /// <paramref name="blnIsRecursive"/> is set, follows the page's anchors that stay on the same
        /// scheme + host and whose URL is not yet in the global page list.
        /// </summary>
        /// <param name="p_strURL">URL of the page to fetch.</param>
        /// <param name="p_pParentPage">
        /// Parent page, or null on the first call. A page that answered 200 and was parsed is added to
        /// the parent's <c>DirectChildren</c>.
        /// </param>
        /// <param name="blnIsRecursive">True to also traverse the anchors found on the page.</param>
        /// <returns>
        /// The page for this URL (with <c>LastError</c> set on failure), or null when a first/non-recursive
        /// call receives a URL that is not a well-formed absolute URI.
        /// </returns>
        public Page TraversePages(string p_strURL, Page p_pParentPage = null, bool blnIsRecursive = false)
        {
            //  Grab the site domain on the very first call of the function (the only case when the parent is null)
            //  This works for recursive calls, but not for non-recursive page crawls with a parent, so we need a second condition
            if (p_pParentPage == null || !blnIsRecursive)
            {
                if (!Uri.IsWellFormedUriString(p_strURL, UriKind.Absolute))
                {
                    return null;
                }

                //  Extract scheme + domain info (i.e. http(s)://www.domain.name)
                m_strSiteDomain = UsefulStuff.GrabHostAndScheme(p_strURL);
            }

            //  Create new page instance (even if the parent's null)
            Page pCurrentPage = new Page(p_pParentPage);
            pCurrentPage.URL = p_strURL;

            //  Check if URL is well-formed. If not, try to repair it; give up with an error if that fails.
            if (!Uri.IsWellFormedUriString(p_strURL, UriKind.RelativeOrAbsolute))
            {
                pCurrentPage.LastError = "URL is not well formed.";

                p_strURL = UsefulStuff.CreateWellFormedUrl(p_strURL, m_strSiteDomain);
                if (string.IsNullOrEmpty(p_strURL))
                {
                    pCurrentPage.LastError += " Could not resolve ill-formedness.";
                    return pCurrentPage;
                }
            }

            //  Only absolute URLs can be requested
            if (!Uri.IsWellFormedUriString(p_strURL, UriKind.Absolute))
            {
                return pCurrentPage;
            }

            //  We've got a working URL, let's get on with grabbing more page info
            HttpWebRequest hwrqRequest = (HttpWebRequest)WebRequest.Create(p_strURL);
            hwrqRequest.AllowAutoRedirect = false;  //  Switch off AutoRedirect so that we can capture 301s & 302s

            WebResponse wrpResponse = null;

            //  Getting a 404 throws a WebException so handle that here.
            try
            {
                wrpResponse = hwrqRequest.GetResponse();
            }
            catch (WebException wex)
            {
                //  The exception may still carry a response; if it does, its real status
                //  code replaces the NotFound placeholder below.
                wrpResponse = (WebResponse)wex.Response;
                pCurrentPage.StatusCode = HttpStatusCode.NotFound;
                pCurrentPage.LastError = wex.ToString();
            }
            catch (Exception ex)
            {
                pCurrentPage.LastError = ex.ToString();
                return pCurrentPage;
            }

            if (wrpResponse == null)
            {
                return pCurrentPage;
            }

            //  Set once the page answered 200 and its content was analysed
            bool blnPageParsed = false;

            try
            {
                HttpWebResponse hwrpResponse = (HttpWebResponse)wrpResponse;

                //  Add status code to page properties
                pCurrentPage.StatusCode = hwrpResponse.StatusCode;

                //  If the status is OK, carry on by analysing the page data with HtmlAgility
                if (hwrpResponse.StatusCode == HttpStatusCode.OK)
                {
                    string strPageHtmlContent;
                    using (Stream stmPageStream = wrpResponse.GetResponseStream())
                    {
                        using (StreamReader srdrPageReader = new StreamReader(stmPageStream))
                        {
                            //  Read the body exactly once. Parsing straight from the reader would
                            //  consume it and leave nothing for the doctype detection.
                            strPageHtmlContent = srdrPageReader.ReadToEnd();
                        }
                    }

                    HtmlDocument htmlDocDocument = new HtmlDocument();
                    htmlDocDocument.LoadHtml(strPageHtmlContent);

                    pCurrentPage.Document = htmlDocDocument;
                    pCurrentPage.DocType = GetDocType(strPageHtmlContent);
                    pCurrentPage.Title = GetPageTitle(htmlDocDocument);
                    pCurrentPage.MetaInfo = GetPageMetaInfo(htmlDocDocument);
                    pCurrentPage.MetaLinkInfo = GetPageMetaLinkInfo(htmlDocDocument);
                    pCurrentPage.AnchorList = GetAnchorList(htmlDocDocument);
                    pCurrentPage.ImageList = GetImageList(htmlDocDocument);
                    pCurrentPage.URL = UsefulStuff.CreateWellFormedUrl(p_strURL, m_strSiteDomain);

                    m_lstGlobalList.Add(pCurrentPage);
                    if (p_pParentPage != null)
                    {
                        p_pParentPage.DirectChildren.Add(pCurrentPage);
                    }

                    blnPageParsed = true;
                }
            }
            finally
            {
                //  Release the response before any recursion (and also when parsing throws), so that
                //  nested requests do not each keep a connection to the same host open.
                wrpResponse.Close();
            }

            //  If we're getting the page recursively (i.e. all the links on page as well) and we've got a list of anchors,
            //  then get on and grab the stuff
            if (blnPageParsed && blnIsRecursive && pCurrentPage.AnchorList != null)
            {
                foreach (Anchor anchor in pCurrentPage.AnchorList)
                {
                    //  Check if we're staying on the site, otherwise we'll grab the whole internet ;)
                    string strWellFormedHref = UsefulStuff.CreateWellFormedUrl(anchor.Href, m_strSiteDomain);
                    if (UsefulStuff.GrabHostAndScheme(strWellFormedHref) != m_strSiteDomain)
                    {
                        continue;
                    }

                    //  Check if the page has been traversed before, ignore if so
                    if (!m_lstGlobalList.Any(x => x.URL == strWellFormedHref))
                    {
                        TraversePages(strWellFormedHref, pCurrentPage, true);
                    }
                }
            }

            return pCurrentPage;
        }
Beispiel #5
0
 /// <summary>
 /// Appends a page and everything below it to the given list: each page is added
 /// before its children, and children are visited in <c>DirectChildren</c> order.
 /// </summary>
 /// <param name="p_pgPage">Page whose subtree is appended.</param>
 /// <param name="p_lstPgAllPages">List that receives the pages; it is modified in place.</param>
 private static void AddPageToCollection(Page p_pgPage, List<Page> p_lstPgAllPages)
 {
     //  Explicit stack standing in for the call stack of a recursive walk
     Stack<Page> stkPending = new Stack<Page>();
     stkPending.Push(p_pgPage);

     while (stkPending.Count > 0)
     {
         Page pgCurrent = stkPending.Pop();
         p_lstPgAllPages.Add(pgCurrent);

         //  Collect the children first ...
         List<Page> lstChildren = new List<Page>();
         foreach (Page pgChild in pgCurrent.DirectChildren)
         {
             lstChildren.Add(pgChild);
         }

         //  ... then push them back-to-front so the first child is popped (and added) first
         for (int intIndex = lstChildren.Count - 1; intIndex >= 0; intIndex--)
         {
             stkPending.Push(lstChildren[intIndex]);
         }
     }
 }