/// <summary>
/// Requests <paramref name="url"/> up to <paramref name="tries"/> times and returns the first
/// response whose <c>IsValid</c> flag is set.
/// </summary>
/// <param name="url">Address to fetch.</param>
/// <param name="tries">Maximum number of calls to <c>GetPage</c>; zero or less means no call is made.</param>
/// <returns>
/// The first valid response, or a freshly constructed <see cref="CommunicationResponse"/> when
/// no attempt produced one.
/// </returns>
public static CommunicationResponse GetPageWithTries(string url, int tries)
{
    for (int remaining = tries; remaining > 0; remaining--)
    {
        CommunicationResponse attempt = GetPage(url, string.Empty);
        if (!attempt.IsValid)
        {
            continue;
        }

        return attempt;
    }

    // Every attempt was invalid (or none was made): hand back an empty response object.
    return new CommunicationResponse();
}
/// <summary>
/// Performs an HTTP GET for <paramref name="url"/> and wraps the outcome in a
/// <see cref="CommunicationResponse"/>.
/// </summary>
/// <remarks>
/// <para>
/// When an attempt fails with an exception and <paramref name="retryCount"/> is below 3, the
/// method calls itself with the counter incremented and returns the result of that retry.
/// Once the retries are used up, a <see cref="WebException"/> that carries an HTTP response
/// yields that response's status, cookies, URI and (best effort) body; any other exception
/// yields a cleared response.
/// </para>
/// <para>
/// <c>CheckAndSleep</c> is invoked before every attempt, and the static <c>COOKIES</c> field is
/// overwritten with the cookie container of each attempt that received an HTTP response.
/// </para>
/// </remarks>
/// <param name="url">Address to request.</param>
/// <param name="referer">Value for the Referer header; the header is not set when this is null or empty.</param>
/// <param name="resetCookies">
/// Currently has no effect: every call starts from a new, empty cookie container regardless of
/// this flag. Kept so existing callers continue to compile.
/// </param>
/// <param name="userAgent">Key (via <c>ToSafeString</c>) used to look up the User-Agent header in <c>userAgents</c>.</param>
/// <param name="cb">Optional callback applied once to the response that is about to be returned; its return value replaces that response.</param>
/// <param name="retryCount">Number of retries already performed. External callers normally leave this at 0.</param>
/// <returns>The populated response, after <paramref name="cb"/> (if any) has been applied.</returns>
public static CommunicationResponse GetPage(string url, string referer = "", bool resetCookies = false, USER_AGENT userAgent = USER_AGENT.Windows, CallBack cb = null, int retryCount = 0)
{
    const int MaxRetries = 3;

    CheckAndSleep();

    CommunicationResponse communication = new CommunicationResponse(url);
    // GET requests always start with an empty cookie container.
    CookieContainer cookies = new CookieContainer();
    HttpWebResponse response = null;
    bool retry = false;

    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.UserAgent = userAgents[userAgent.ToSafeString()];
        request.Method = "GET";
        request.AllowAutoRedirect = true;
        request.CookieContainer = cookies;
        if (!string.IsNullOrEmpty(referer))
        {
            request.Referer = referer;
        }

        response = (HttpWebResponse)request.GetResponse();

        // Read the body for every status that did not raise an exception, not only 200 OK.
        communication.Html = ReadResponseHtml(response);
        communication.StatusCode = response.StatusCode;
        communication.Cookies = response.Cookies;
        communication.Uri = response.ResponseUri;
        cookies.Add(communication.Cookies);
        COOKIES = cookies;
    }
    catch (WebException e)
    {
        retry = retryCount < MaxRetries;

        // The error response is always captured (and disposed) so COOKIES reflects the latest
        // exchange even when this attempt's result is superseded by a retry.
        using (WebResponse errorResponse = e.Response)
        {
            // "as" instead of a hard cast: a non-HTTP response must not throw from inside the handler.
            HttpWebResponse httpResponse = errorResponse as HttpWebResponse;
            if (httpResponse != null)
            {
                communication.StatusCode = httpResponse.StatusCode;
                communication.Cookies = httpResponse.Cookies;
                communication.Uri = httpResponse.ResponseUri;
                try
                {
                    communication.Html = ReadResponseHtml(httpResponse);
                }
                catch (Exception)
                {
                    // Deliberate best effort: an unreadable error body leaves Html unset
                    // while status, cookies and URI are still reported.
                }

                cookies.Add(communication.Cookies);
                COOKIES = cookies;
            }
        }
    }
    catch (Exception)
    {
        if (retryCount < MaxRetries)
        {
            retry = true;
        }
        else
        {
            communication.Clear();
        }
    }
    finally
    {
        if (response != null)
        {
            response.Close();
        }
    }

    if (retry)
    {
        // Return the retry's outcome; ignoring it would make the retry pointless.
        // The callback is applied by the attempt that finally produces the result.
        return GetPage(url, referer, resetCookies, userAgent, cb, retryCount + 1);
    }

    if (cb != null)
    {
        communication = cb(communication);
    }

    return communication;
}

/// <summary>
/// Reads the complete body of <paramref name="response"/> as text. When
/// <c>SourceCharsetEncoding</c> is set, the body is decoded with that encoding and passed
/// through <c>SournceEncodingToUTF8</c>; otherwise the reader's default decoding is used.
/// </summary>
private static string ReadResponseHtml(HttpWebResponse response)
{
    if (string.IsNullOrEmpty(SourceCharsetEncoding))
    {
        using (StreamReader reader = new StreamReader(response.GetResponseStream()))
        {
            return reader.ReadToEnd();
        }
    }

    using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding(SourceCharsetEncoding)))
    {
        return SournceEncodingToUTF8(reader.ReadToEnd());
    }
}
/// <summary>
/// Fetches the feed configured in <paramref name="feedsSetting"/>, selects its items with
/// <c>FeedItemsPath</c>, and reads each configured field (title, short description, publish
/// date, image, detail-page URL) from every item via XPath. When an item has a detail-page
/// URL, that page is requested as well.
/// </summary>
/// <remarks>
/// NOTE(review): the parsed <see cref="ModelFeedDetail"/> objects are not stored anywhere yet
/// and detail-page scraping is still a placeholder, so the returned list is always empty.
/// Populating it requires knowledge of <c>ModelFeedsData</c> that is not visible here.
/// </remarks>
/// <param name="feedsSetting">Feed URL plus the XPath expressions used to locate items and their fields.</param>
/// <returns>The list of scraped feeds (currently always empty, see remarks).</returns>
private static List<ModelFeedsData> StartScrapper(ModelFeed feedsSetting)
{
    List<ModelFeedsData> feedList = new List<ModelFeedsData>();

    // Exceptions are intentionally left to propagate unchanged so callers see the original
    // stack trace (a catch block that does "throw ex;" would reset it).
    CommunicationResponse resp = CommunicationResponse.GetPage(feedsSetting.FeedURL);
    if (!resp.IsValid)
    {
        return feedList;
    }

    // The document is loaded directly from the URL; the response above only serves as a
    // reachability/validity check.
    XmlDocument xd = new XmlDocument();
    xd.Load(feedsSetting.FeedURL);

    XmlNodeList items = xd.SelectNodes(feedsSetting.FeedItemsPath);
    if (items == null || items.Count == 0)
    {
        return feedList;
    }

    foreach (XmlNode item in items)
    {
        // A fresh instance per item, so values (notably the detail URL) never leak from the
        // previous item when a node is missing in this one.
        ModelFeedDetail detail = new ModelFeedDetail();

        string title = ReadNodeText(item, feedsSetting.FeedTitlePath);
        if (title != null)
        {
            detail.PostTitle = title;
        }

        string shortDescription = ReadNodeText(item, feedsSetting.FeedShortDescPath);
        if (shortDescription != null)
        {
            detail.ShortDescription = shortDescription;
        }

        string publishDate = ReadNodeText(item, feedsSetting.FeedPubDatePath);
        if (publishDate != null)
        {
            detail.PublishDate = publishDate;
        }

        string image = ReadNodeText(item, feedsSetting.FeedImagePath);
        if (image != null)
        {
            detail.RssImage = image;
        }

        if (!String.IsNullOrWhiteSpace(feedsSetting.FeedDetailPageURLPath))
        {
            string detailUrl = ReadNodeText(item, feedsSetting.FeedDetailPageURLPath);
            if (detailUrl != null)
            {
                detail.DetailPageURL = detailUrl;
            }

            if (!String.IsNullOrEmpty(detail.DetailPageURL))
            {
                CommunicationResponse detailResponse = CommunicationResponse.GetPage(detail.DetailPageURL);
                XmlDocument detailPage = detailResponse.ToXml();
                if (detailPage != null)
                {
                    // TODO: scrape the detail page (image, post body) here.
                }
            }
        }
    }

    return feedList;
}

/// <summary>
/// Returns the inner text of the node selected by <paramref name="xpath"/> relative to
/// <paramref name="item"/>, or <c>null</c> when the path is null/blank or selects nothing.
/// </summary>
private static string ReadNodeText(XmlNode item, string xpath)
{
    if (String.IsNullOrWhiteSpace(xpath))
    {
        return null;
    }

    // Evaluate the XPath once and reuse the node.
    XmlNode node = item.SelectSingleNode(xpath);
    return node == null ? null : node.InnerText;
}
/// <summary>
/// Issues an HTTP GET to <paramref name="url"/> and returns the result as a
/// <see cref="CommunicationResponse"/>.
/// </summary>
/// <remarks>
/// <para>
/// If an attempt ends in an exception while <paramref name="retryCount"/> is still below 3,
/// the method re-invokes itself with the counter incremented and returns what that retry
/// produced. After the last retry, a <see cref="WebException"/> carrying an HTTP response is
/// reported through that response's status, cookies, URI and (best effort) body, whereas any
/// other exception results in a cleared response.
/// </para>
/// <para>
/// <c>CheckAndSleep</c> runs before each attempt. The static <c>COOKIES</c> field is replaced
/// by the cookie container of every attempt that obtained an HTTP response.
/// </para>
/// </remarks>
/// <param name="url">Address to request.</param>
/// <param name="referer">Referer header value; left unset when null or empty.</param>
/// <param name="resetCookies">
/// Has no effect at present: each call creates a new, empty cookie container whatever the
/// value. Retained for signature compatibility.
/// </param>
/// <param name="userAgent">Selects the User-Agent header from <c>userAgents</c> (keyed by <c>ToSafeString</c>).</param>
/// <param name="cb">Optional callback invoked once on the response about to be returned; its return value is returned instead.</param>
/// <param name="retryCount">How many retries have already happened. Outside callers normally pass nothing (0).</param>
/// <returns>The resulting response, post-processed by <paramref name="cb"/> when one is supplied.</returns>
public static CommunicationResponse GetPage(string url, string referer = "", bool resetCookies = false, USER_AGENT userAgent = USER_AGENT.Windows, CallBack cb = null, int retryCount = 0)
{
    const int RetryLimit = 3;

    CheckAndSleep();

    CommunicationResponse communication = new CommunicationResponse(url);
    // Every GET begins with a fresh cookie container.
    CookieContainer cookies = new CookieContainer();
    HttpWebResponse response = null;
    bool attemptAgain = false;

    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.UserAgent = userAgents[userAgent.ToSafeString()];
        request.Method = "GET";
        request.AllowAutoRedirect = true;
        request.CookieContainer = cookies;
        if (!string.IsNullOrEmpty(referer))
        {
            request.Referer = referer;
        }

        response = (HttpWebResponse)request.GetResponse();

        // The body is read whatever the status code is, so non-200 responses that do not
        // raise an exception are handled without touching a null reader.
        communication.Html = ReadBodyAsHtml(response);
        communication.StatusCode = response.StatusCode;
        communication.Cookies = response.Cookies;
        communication.Uri = response.ResponseUri;
        cookies.Add(communication.Cookies);
        COOKIES = cookies;
    }
    catch (WebException e)
    {
        attemptAgain = retryCount < RetryLimit;

        // Always capture and dispose the error response; this keeps COOKIES up to date even
        // if a retry ends up supplying the returned value.
        using (WebResponse errorResponse = e.Response)
        {
            // Safe cast: a non-HTTP response must not cause a second exception in the handler.
            HttpWebResponse httpResponse = errorResponse as HttpWebResponse;
            if (httpResponse != null)
            {
                communication.StatusCode = httpResponse.StatusCode;
                communication.Cookies = httpResponse.Cookies;
                communication.Uri = httpResponse.ResponseUri;
                try
                {
                    communication.Html = ReadBodyAsHtml(httpResponse);
                }
                catch (Exception)
                {
                    // Best effort by design: if the error body cannot be read, Html stays
                    // unset and the remaining fields are still reported.
                }

                cookies.Add(communication.Cookies);
                COOKIES = cookies;
            }
        }
    }
    catch (Exception)
    {
        if (retryCount < RetryLimit)
        {
            attemptAgain = true;
        }
        else
        {
            communication.Clear();
        }
    }
    finally
    {
        if (response != null)
        {
            response.Close();
        }
    }

    if (attemptAgain)
    {
        // Propagate the retry's result to the caller; the callback is applied by whichever
        // attempt ultimately returns.
        return GetPage(url, referer, resetCookies, userAgent, cb, retryCount + 1);
    }

    if (cb != null)
    {
        communication = cb(communication);
    }

    return communication;
}

/// <summary>
/// Reads the entire body of <paramref name="response"/> as text. With
/// <c>SourceCharsetEncoding</c> set, the body is decoded using that encoding and then run
/// through <c>SournceEncodingToUTF8</c>; otherwise the reader's default decoding applies.
/// </summary>
private static string ReadBodyAsHtml(HttpWebResponse response)
{
    if (string.IsNullOrEmpty(SourceCharsetEncoding))
    {
        using (StreamReader reader = new StreamReader(response.GetResponseStream()))
        {
            return reader.ReadToEnd();
        }
    }

    using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding(SourceCharsetEncoding)))
    {
        return SournceEncodingToUTF8(reader.ReadToEnd());
    }
}