/// <summary>
/// Reads the total number of search results from a results page.
/// </summary>
/// <param name="htmlDocument">Raw HTML of the results page.</param>
/// <returns>
/// The number found in the first <c>strong</c> element under the
/// <c>headerResults</c> span, or 0 when that element is missing or its text
/// is not a whole number.
/// </returns>
public int GetTotalResults(string htmlDocument)
{
    HtmlAgilityPack.HtmlDocument _doc = new HtmlAgilityPack.HtmlDocument();
    _doc.LoadHtml(htmlDocument);

    // Expected markup: <span id="headerResults"><strong>39</strong></span>
    var _header = HtmlUtil.GetNode(_doc.DocumentNode, "span", "id", "headerResults");
    if (_header == null)
    {
        return 0;
    }

    var _countNode = _header.Descendants("strong").FirstOrDefault();
    if (_countNode == null)
    {
        return 0;
    }

    // TryParse instead of Parse inside a swallow-all catch: same "0 on bad
    // input" contract without using exceptions for control flow.
    // AllowThousands lets a grouped count such as "1,234" parse as 1234
    // instead of silently becoming 0.
    int _total;
    bool _parsed = int.TryParse(
        _countNode.InnerText,
        System.Globalization.NumberStyles.Integer | System.Globalization.NumberStyles.AllowThousands,
        System.Globalization.CultureInfo.InvariantCulture,
        out _total);

    return _parsed ? _total : 0;
}
/// <summary>
/// Extracts every listing found in a search results page.
/// </summary>
/// <remarks>
/// Listings are the <c>li</c> descendants of the <c>searchResultListings</c>
/// list whose <c>class</c> attribute contains <c>listingContainer</c>. Each
/// one is passed to <c>EscrapeAdInfo</c> and, when that succeeds, to
/// <c>EscrapeAdInfoExtend</c>. The result is added to the collection even
/// when it is null, so positions in the collection match positions on the page.
/// Any exception is wrapped and forwarded to <c>frameworkExceptionInvoke</c>
/// (when set) rather than propagated.
/// </remarks>
/// <param name="htmlDocument">Raw HTML of the results page.</param>
/// <param name="actionAdExtracted">
/// Optional callback invoked after each listing with the extracted ad, the
/// time spent on it and its 1-based position on the page.
/// </param>
/// <returns>
/// The extracted ads, or null when no listing was found or an exception
/// occurred before the collection was created.
/// </returns>
public Models.Advertisement.Advertisements ExtractAds(string htmlDocument, Action<Handlers.EventHandlers.AdExtractedEventArgs> actionAdExtracted)
{
    Models.Advertisement.Advertisements _ads = null;
    try
    {
        HtmlAgilityPack.HtmlDocument _doc = new HtmlAgilityPack.HtmlDocument();
        _doc.LoadHtml(htmlDocument);

        var _listNode = HtmlUtil.GetNode(_doc.DocumentNode, "ul", "id", "searchResultListings");
        if (_listNode == null)
        {
            // Previously this surfaced as an anonymous NullReferenceException.
            // It is still reported through the same handler below, but now
            // with a message that says what is actually wrong.
            throw new InvalidOperationException("Result list element <ul id=\"searchResultListings\"> was not found in the page.");
        }

        HtmlAgilityPack.HtmlNode[] _nodes = _listNode.Descendants("li")
            .Where(li => li.Attributes.Contains("class") && li.Attributes[@"class"].Value.Contains("listingContainer"))
            .ToArray();

        if (_nodes.Length > 0)
        {
            _ads = new Models.Advertisement.Advertisements();
            int _pgItemIdx = 0;

            foreach (HtmlAgilityPack.HtmlNode _n in _nodes)
            {
                _pgItemIdx++;

                // Times both the listing scrape and the detail-page scrape.
                Stopwatch _stopwatch = Stopwatch.StartNew();
                Models.Advertisement _ad = this.EscrapeAdInfo(_n);
                if (_ad != null)
                {
                    _ad = this.EscrapeAdInfoExtend(_ad);
                }
                _stopwatch.Stop();

                _ads.Add(_ad);

                if (actionAdExtracted != null)
                {
                    actionAdExtracted(new Handlers.EventHandlers.AdExtractedEventArgs(_ad, _stopwatch.Elapsed, _pgItemIdx));
                }
            }
        }
    }
    catch (Exception ex)
    {
        if (this.frameworkExceptionInvoke != null)
        {
            Exception _ex = new Exception(string.Format("Exception in {0}.{1}(?)", this.directoryProviderSetting.ServicedCountry.ToString(), "ExtractAds"), ex);
            this.frameworkExceptionInvoke(new Handlers.EventHandlers.FrameworkExceptionEventArgs(_ex));
        }
    }

    return _ads;
}
/// <summary>
/// Checks whether a page is a "no results" page or a "page not found" page.
/// </summary>
/// <param name="htmlDocument">Raw HTML of the page to inspect.</param>
/// <param name="_errMsg">
/// Receives the message text when an error element is found; left untouched otherwise.
/// </param>
/// <returns>
/// True when a <c>zeroResultsMessage</c> div is present, or when a
/// <c>fourOFourMessage</c> div containing an <c>h2</c> is present; false otherwise.
/// </returns>
public bool CheckPageIfError(string htmlDocument, ref string _errMsg)
{
    HtmlAgilityPack.HtmlDocument page = new HtmlAgilityPack.HtmlDocument();
    page.LoadHtml(htmlDocument);

    /* Case 1 - no results, e.g.
     * http://www.yellowpages.com.au/search/listings?clue=kobo&locationClue=lyoo&selectedViewMode=LIST&emsLocationId=
     *
     * <div id="zeroResultsMessage">
     *     We couldn't find any results for <strong>kobo</strong> in
     *     <strong>Lue, NSW 2850</strong> and/or nearby areas.
     * </div>
     */
    HtmlAgilityPack.HtmlNode zeroResults = HtmlUtil.GetNode(page.DocumentNode, "div", "id", "zeroResultsMessage");
    if (zeroResults != null)
    {
        _errMsg = zeroResults.InnerText;
        _errMsg = _errMsg.Replace("<strong>", "").Replace("</strong>", "");
        return true;
    }

    /* Case 2 - page not found:
     *
     * <div id="fourOFourMessage">
     *     <h2>
     *         Sorry we couldn't find that page.<br>
     *         Perhaps the page doesn't exist or address was mistyped.
     *     </h2>
     *     You could try:
     *     <ul>
     *         <li>to do a business search in the search fields at the top of the page</li>
     *         <li>go to the <a href="http://www.yellowpages.com.au">Yellow Pages<sup>®</sup></a> home page</li>
     *     </ul>
     * </div>
     */
    HtmlAgilityPack.HtmlNode notFound = HtmlUtil.GetNode(page.DocumentNode, "div", "id", "fourOFourMessage");
    if (notFound == null)
    {
        return false;
    }

    // Only the first heading is reported; a 404 block without an <h2> is not
    // treated as an error.
    HtmlAgilityPack.HtmlNode heading = notFound.Descendants("h2").FirstOrDefault();
    if (heading == null)
    {
        return false;
    }

    _errMsg = heading.InnerText;
    return true;
}
/// <summary>
/// Checks whether a page contains one of the known error elements.
/// </summary>
/// <param name="htmlDocument">Raw HTML of the page to inspect.</param>
/// <param name="_errMsg">
/// Receives the inner text of the error element (with literal
/// <c>&lt;strong&gt;</c> markers removed) when one is found; left untouched otherwise.
/// </param>
/// <returns>
/// True when a <c>div</c> with class <c>ypgErrorText</c> or, failing that,
/// a <c>div</c> with id <c>ypgSearchErrorMessage</c> is present; false otherwise.
/// </returns>
public bool CheckPageIfError(string htmlDocument, ref string _errMsg)
{
    HtmlAgilityPack.HtmlDocument page = new HtmlAgilityPack.HtmlDocument();
    page.LoadHtml(htmlDocument);

    // The second lookup only runs when the first one finds nothing.
    HtmlAgilityPack.HtmlNode errorNode =
        HtmlUtil.GetNode(page.DocumentNode, "div", "class", "ypgErrorText")
        ?? HtmlUtil.GetNode(page.DocumentNode, "div", "id", "ypgSearchErrorMessage");

    if (errorNode == null)
    {
        return false;
    }

    _errMsg = errorNode.InnerText;
    _errMsg = _errMsg.Replace("<strong>", "").Replace("</strong>", "");
    return true;
}
/// <summary>
/// Parses the proxy list table of a page and enqueues every proxy whose
/// host is not already present in <c>FreeProxies</c>.
/// </summary>
/// <remarks>
/// Rows come from the <c>tr</c> descendants of the <c>listtable</c> table.
/// Row 0 is skipped (presumably the header row - confirm against the page).
/// Rows that fail to parse are ignored.
/// </remarks>
/// <param name="hmaPgDoc">Raw HTML of the proxy list page.</param>
private void ExtractProxies(string hmaPgDoc)
{
    HtmlAgilityPack.HtmlDocument _doc = new HtmlAgilityPack.HtmlDocument();
    _doc.LoadHtml(hmaPgDoc);

    var _tblNode = HtmlUtil.GetNode(_doc.DocumentNode, "table", "id", "listtable");
    if (_tblNode == null)
    {
        return;
    }

    // Build the row array once. The previous code rebuilt it (and re-counted
    // the rows) on every loop iteration, which made the loop quadratic.
    var _rows = _tblNode.Descendants("tr").ToArray();
    if (_rows.Length == 0)
    {
        return;
    }

    if (this.FreeProxies == null)
    {
        this.FreeProxies = new Queue<FreeProxy>();
    }

    for (int ctr = 1; ctr < _rows.Length; ctr++)
    {
        FreeProxy _freeProxy = null;
        try
        {
            if (!ParseRowToProxy(_rows[ctr], ref _freeProxy) || _freeProxy == null)
            {
                continue;
            }

            // Any() walks the queue directly instead of copying it to a
            // list for every row.
            var _host = _freeProxy.Host;
            bool _alreadyQueued = this.FreeProxies.Any(p => p.Host.Equals(_host));
            if (!_alreadyQueued)
            {
                this.FreeProxies.Enqueue(_freeProxy);
            }
        }
        catch (Exception)
        {
            // Deliberate best effort: a row that cannot be parsed is skipped
            // so one bad row does not discard the rest of the table.
        }
    }
}
/// <summary>
/// Parses the proxy list table of a page and raises the "proxy fetched"
/// event for every proxy that satisfies the configured filters
/// (<c>Protocol</c>, <c>AnonymityLevel</c>, <c>Speed</c>, <c>ConnectionTime</c>).
/// </summary>
/// <remarks>
/// A filter that has no value accepts every proxy. A <c>Medium</c> speed or
/// connection-time filter also accepts <c>Fast</c> proxies; any other
/// non-<c>Fast</c> filter value accepts everything. Row 0 is skipped
/// (presumably the header row - confirm against the page). All proxies from
/// one call share the same <c>RotationTokenId</c>.
/// </remarks>
/// <param name="hmaPgDoc">Raw HTML of the proxy list page.</param>
/// <returns>
/// The number of <c>tr</c> rows found in the table (including row 0), or 0
/// when the table is missing.
/// </returns>
private int ExtractProxies(string hmaPgDoc)
{
    HtmlAgilityPack.HtmlDocument _doc = new HtmlAgilityPack.HtmlDocument();
    _doc.LoadHtml(hmaPgDoc);

    var _tblNode = HtmlUtil.GetNode(_doc.DocumentNode, "table", "id", "listtable");
    if (_tblNode == null)
    {
        return 0;
    }

    // Build the row array once instead of re-enumerating the descendants on
    // every loop iteration.
    var _rows = _tblNode.Descendants("tr").ToArray();
    Guid _usageId = Guid.NewGuid();

    for (int ctr = 1; ctr < _rows.Length; ctr++)
    {
        FreeProxy _freeProxy = null;
        try
        {
            // Fix: when the row could not be turned into a proxy the filter
            // flags used to stay true, so the event fired with a null proxy.
            if (!ParseRowToProxy(_rows[ctr], ref _freeProxy) || _freeProxy == null)
            {
                continue;
            }

            _freeProxy.RotationTokenId = _usageId;

            bool _ProtocolOK = !this.Protocol.HasValue
                || this.Protocol.Value == _freeProxy.Protocol;

            bool _AnonymityLevelOK = !this.AnonymityLevel.HasValue
                || this.AnonymityLevel.Value == _freeProxy.AnonymityLevel;

            bool _SpeedOK = true;
            if (this.Speed.HasValue)
            {
                if (this.Speed.Value == ProxySpeedEnum.Fast)
                {
                    _SpeedOK = _freeProxy.Speed == ProxySpeedEnum.Fast;
                }
                else if (this.Speed.Value == ProxySpeedEnum.Medium)
                {
                    _SpeedOK = _freeProxy.Speed == ProxySpeedEnum.Medium
                        || _freeProxy.Speed == ProxySpeedEnum.Fast;
                }
            }

            bool _ConnectionTimeOK = true;
            if (this.ConnectionTime.HasValue)
            {
                if (this.ConnectionTime.Value == ProxyConnectionSpeedEnum.Fast)
                {
                    _ConnectionTimeOK = _freeProxy.ConnectionTime == ProxyConnectionSpeedEnum.Fast;
                }
                else if (this.ConnectionTime.Value == ProxyConnectionSpeedEnum.Medium)
                {
                    _ConnectionTimeOK = _freeProxy.ConnectionTime == ProxyConnectionSpeedEnum.Medium
                        || _freeProxy.ConnectionTime == ProxyConnectionSpeedEnum.Fast;
                }
            }

            if (_ProtocolOK && _AnonymityLevelOK && _SpeedOK && _ConnectionTimeOK)
            {
                this.InvokeEventFreeIPProxyFetched(new EventHandlers.FreeIPProxyFetchedEventArgs(_freeProxy));
            }
        }
        catch (Exception)
        {
            // Deliberate best effort: skip rows that cannot be parsed. The
            // original author's note records one such failure:
            // "Requested value 'LEBANON' was not found."
        }
    }

    return _rows.Length;
}
/// <summary>
/// Converts one table row of the proxy list into a <c>FreeProxy</c>.
/// </summary>
/// <remarks>
/// Cells are read by position: 1 = host, 2 = port, 3 = country,
/// 4 = response-time bar, 5 = connection-time bar, 6 = protocol,
/// 7 = anonymity level. Bar widths (the percentage in the inner div's
/// <c>style</c>) are bucketed as: up to 35 = Slow, 36 to 65 = Medium,
/// above 65 = Fast.
/// Failures are not handled here: a non-numeric port or rate, an unknown
/// enum name, a row with too few cells or a missing speed bar all throw,
/// and the caller decides what to do with the row.
/// </remarks>
/// <param name="rowNode">The <c>tr</c> node to parse.</param>
/// <param name="freeProxy">
/// Receives the parsed proxy on success. It is only assigned once the whole
/// row has parsed, so a failing row never leaves a half-filled object behind.
/// </param>
/// <returns>True when the row was parsed; false when it has no <c>td</c> cells.</returns>
private bool ParseRowToProxy(HtmlNode rowNode, ref FreeProxy freeProxy)
{
    // Build the cell array once instead of on every cell access.
    var _cells = rowNode.Descendants("td").ToArray();
    if (_cells.Length == 0)
    {
        return false;
    }

    FreeProxy _proxy = new FreeProxy();
    _proxy.Host = SnatchIPAddress(_cells[1].InnerHtml);
    _proxy.Port = int.Parse(_cells[2].InnerText);

    // Country text is mapped onto enum member names: "; " -> "___",
    // ", " -> "__", " " -> "_". Invariant upper-casing keeps the mapping
    // independent of the current culture (e.g. Turkish dotted I).
    string _country = _cells[3].InnerText.Trim().ToUpperInvariant();
    _country = _country.Replace("; ", "___").Replace(", ", "__").Replace(" ", "_");
    _proxy.Country = (ProxyCountry)Enum.Parse(typeof(ProxyCountry), _country);

    var _connspeedNode = HtmlUtil.GetNode(_cells[5], "div", "class", "speedbar connection_time");
    _connspeedNode = _connspeedNode.Descendants("div").First();
    string _rate = _connspeedNode.Attributes["style"].Value.Replace("width:", "").Replace("%", "");
    _proxy.ConnectionTimeRate = int.Parse(_rate);
    if (_proxy.ConnectionTimeRate <= 35)
    {
        _proxy.ConnectionTime = ProxyConnectionSpeedEnum.Slow;
    }
    else if (_proxy.ConnectionTimeRate <= 65)
    {
        _proxy.ConnectionTime = ProxyConnectionSpeedEnum.Medium;
    }
    else
    {
        // Fix: the top bucket used to be assigned Medium, so Fast was never produced.
        _proxy.ConnectionTime = ProxyConnectionSpeedEnum.Fast;
    }

    var _speedNode = HtmlUtil.GetNode(_cells[4], "div", "class", "speedbar response_time");
    _speedNode = _speedNode.Descendants("div").First();
    _rate = _speedNode.Attributes["style"].Value.Replace("width:", "").Replace("%", "");
    _proxy.SpeedRate = int.Parse(_rate);
    if (_proxy.SpeedRate <= 35)
    {
        _proxy.Speed = ProxySpeedEnum.Slow;
    }
    else if (_proxy.SpeedRate <= 65)
    {
        _proxy.Speed = ProxySpeedEnum.Medium;
    }
    else
    {
        // Fix: same copy/paste slip as above.
        _proxy.Speed = ProxySpeedEnum.Fast;
    }

    string _protocol = _cells[6].InnerText.Replace("/", "_");
    _proxy.Protocol = (ProxyProtocolEnum)Enum.Parse(typeof(ProxyProtocolEnum), _protocol);

    string _anonymity = _cells[7].InnerText.Replace(" +", "__");
    _proxy.AnonymityLevel = (ProxyAnonymityLevelEnum)Enum.Parse(typeof(ProxyAnonymityLevelEnum), _anonymity);

    // The former catch block did nothing but "throw ex;", which only reset
    // the stack trace. Exceptions now propagate unchanged.
    freeProxy = _proxy;
    return true;
}
/// <summary>
/// Enriches an ad with data taken from its own detail page.
/// </summary>
/// <remarks>
/// Downloads the page at <c>AdvertiserLink</c> and fills in
/// <c>Keywords</c> and <c>Description</c> (from the matching <c>meta</c>
/// tags) and <c>EmailAddress</c> (first <c>span</c> inside the
/// <c>mainEmailAddressLink</c> anchor). <c>GoogleMap</c> and
/// <c>Locations</c> are always set to "[NA]". Values that are not found
/// default to "[NA]". The ad is only modified after every lookup has
/// succeeded; if anything throws, the exception is forwarded to
/// <c>frameworkExceptionInvoke</c> (when set) and the ad is returned unchanged.
/// </remarks>
/// <param name="adInf">The ad to enrich; may be null.</param>
/// <returns>The same <paramref name="adInf"/> instance (or null when null was passed).</returns>
public Models.Advertisement EscrapeAdInfoExtend(Models.Advertisement adInf)
{
    if (adInf == null || string.IsNullOrEmpty(adInf.AdvertiserLink))
    {
        return adInf;
    }

    try
    {
        string pageHtml = HtmlUtil.GetPageDocument(adInf.AdvertiserLink);
        HtmlAgilityPack.HtmlDocument detailPage = new HtmlAgilityPack.HtmlDocument();
        detailPage.LoadHtml(pageHtml);

        HtmlAgilityPack.HtmlNode keywordsMeta = HtmlUtil.GetNode(detailPage.DocumentNode, "meta", "name", "keywords");
        string keywords = keywordsMeta != null
            ? keywordsMeta.Attributes["content"].Value
            : "[NA]";

        HtmlAgilityPack.HtmlNode descriptionMeta = HtmlUtil.GetNode(detailPage.DocumentNode, "meta", "name", "description");
        string description = descriptionMeta != null
            ? descriptionMeta.Attributes["content"].Value
            : "[NA]";

        /* Expected markup for the e-mail link:
         * <a id="mainEmailAddressLink" class="emailBusinessLink" rel="nofollow" href="..." title="Contact ...">
         *     <img class="emailAddressIcon" src="/ui/standard/bpp/email_icon.png" alt="Main Email Address">
         *     <span>(the e-mail address)</span>
         * </a>
         */
        HtmlAgilityPack.HtmlNode emailLink = HtmlUtil.GetNode(detailPage.DocumentNode, "a", "id", "mainEmailAddressLink");
        string email = emailLink != null
            ? emailLink.Descendants("span").ToArray()[0].InnerText
            : "[NA]";

        // Assign only now, so that a failure above leaves the ad untouched.
        adInf.GoogleMap = "[NA]";
        adInf.Keywords = keywords;
        adInf.Description = description;
        adInf.EmailAddress = email;
        adInf.Locations = "[NA]";
    }
    catch (Exception ex)
    {
        if (this.frameworkExceptionInvoke != null)
        {
            Exception wrapped = new Exception(string.Format("Exception in {0}.{1}(?)", this.directoryProviderSetting.ServicedCountry.ToString(), "EscrapeAdInfoExtend"), ex);
            this.frameworkExceptionInvoke(new Handlers.EventHandlers.FrameworkExceptionEventArgs(wrapped));
        }
    }

    return adInf;
}
/// <summary>
/// Builds an ad from one listing node of a search results page.
/// </summary>
/// <remarks>
/// The business name (<c>meta itemprop="name"</c>) is mandatory: when it is
/// missing the resulting exception is reported and null is returned.
/// Description, phone, website, latitude, longitude and rating are
/// optional and default to "[NA]"; the ad link defaults to "[ERROR]".
/// <c>Fax</c> is always "[NA]". <c>AdvertiserLink</c> is the provider's
/// <c>BaseUrl</c> followed by the ad link. Exceptions are forwarded to
/// <c>frameworkExceptionInvoke</c> (when set) rather than propagated.
/// </remarks>
/// <param name="htmNode">The listing node; may be null.</param>
/// <returns>The populated ad, or null when <paramref name="htmNode"/> is null or extraction failed.</returns>
public Models.Advertisement EscrapeAdInfo(HtmlAgilityPack.HtmlNode htmNode)
{
    Models.Advertisement _adInfo = null;
    if (htmNode == null)
    {
        return _adInfo;
    }

    try
    {
        // Mandatory: a listing without a name ends up in the catch below.
        HtmlAgilityPack.HtmlNode __n = HtmlUtil.GetNode(htmNode, "meta", "itemprop", "name");
        string _businessName = Models.Advertisement.Resolve(__n.Attributes["content"].Value);

        string _description = "[NA]";
        __n = HtmlUtil.GetNode(htmNode, "div", "class", "enhancedTextDesc paragraph");
        if (__n != null)
        {
            _description = __n.InnerText;
        }

        /* <div class="preferredContact paragraph">
         *     <span class="prefix">ph:</span>
         *     <span preferredcontact="1">(02) 8222 3333</span>
         * </div>
         * The number is the second span. Explicit checks replace the former
         * empty catch around this lookup.
         */
        string _phone = "[NA]";
        __n = HtmlUtil.GetNode(htmNode, "div", "class", "preferredContact paragraph");
        if (__n != null)
        {
            HtmlAgilityPack.HtmlNode[] _spans = __n.Descendants("span").ToArray();
            if (_spans.Length > 1)
            {
                _phone = _spans[1].InnerText;
            }
        }

        string _fax = "[NA]";

        // <span class="address">Level 11/ 75 Elizabeth St, Sydney NSW 2000</span>
        string _fullAddress = HtmlUtil.GetInnerText(htmNode, "span", "class", "address");
        string _streetBlk = string.Empty;
        string _locality = string.Empty;
        string _region = string.Empty;
        string _postalCode = string.Empty;
        Parsers.SplitAddresses(_fullAddress, ref _streetBlk, ref _locality, ref _region, ref _postalCode);

        string _website = "[NA]";
        __n = HtmlUtil.GetNode(htmNode, "a", "name", "listing_website");
        if (__n != null)
        {
            _website = Models.Advertisement.Resolve(__n.InnerText);
        }

        /* <li flagnumber="1"
         *     class="gold mappableListing listingContainer omnitureListing"
         *     longitude="151.210118"
         *     latitude="-33.867857" ...>
         * Fix: the "[NA]" defaults were dead code. A listing without these
         * attributes threw and the whole ad was discarded.
         */
        string _latitude = "[NA]";
        string _longitude = "[NA]";
        var _latAttr = htmNode.Attributes["latitude"];
        if (_latAttr != null)
        {
            _latitude = _latAttr.Value;
        }
        var _lonAttr = htmNode.Attributes["longitude"];
        if (_lonAttr != null)
        {
            _longitude = _lonAttr.Value;
        }

        // <div class="yelp-rating review-rating" review="5"></div>
        string _rating = "[NA]";
        __n = HtmlUtil.GetNode(htmNode, "div", "class", "yelp-rating review-rating");
        if (__n != null)
        {
            var _reviewAttr = __n.Attributes["review"];
            if (_reviewAttr != null)
            {
                _rating = _reviewAttr.Value;
            }
        }

        /* <a href="/nsw/sydney/edwards-barrie-13025623-listing.html?context=businessTypeSearch&referredBy=YOL"
         *    name="listing_name" class="omnitureListingNameLink" id="listing-name-link-25">
         *     <span id="listing-name-25">Edwards Barrie</span>
         * </a>
         */
        string _adLink = "[ERROR]";
        __n = HtmlUtil.GetNode(htmNode, "a", "class", "omnitureListingNameLink");
        if (__n != null)
        {
            var _hrefAttr = __n.Attributes["href"];
            if (_hrefAttr != null)
            {
                _adLink = _hrefAttr.Value;
            }
        }

        _adInfo = new Models.Advertisement()
        {
            BusinessName = _businessName,
            Description = _description,
            Phone = _phone,
            Fax = _fax,
            FullAddress = _fullAddress,
            StreetBlk = _streetBlk,
            Locality = _locality,
            Region = _region,
            PostalCode = _postalCode,
            Website = _website,
            Latitude = _latitude,
            Longtitude = _longitude,
            Rating = _rating,
            AdvertiserLink = string.Format("{0}{1}", this.directoryProviderSetting.BaseUrl, _adLink),
        };
    }
    catch (Exception ex)
    {
        if (this.frameworkExceptionInvoke != null)
        {
            Exception _ex = new Exception(string.Format("Exception in {0}.{1}(?)", this.directoryProviderSetting.ServicedCountry.ToString(), "EscrapeAdInfo"), ex);
            this.frameworkExceptionInvoke(new Handlers.EventHandlers.FrameworkExceptionEventArgs(_ex));
        }
    }

    return _adInfo;
}
/// <summary>
/// Enriches an ad with data taken from its own detail page.
/// </summary>
/// <remarks>
/// Downloads the page at <c>AdvertiserLink</c> and fills in
/// <c>Latitude</c>/<c>Longtitude</c> (from the map script inside
/// <c>ypgMapContainer</c>), <c>Keywords</c> and <c>Description</c> (from the
/// matching <c>meta</c> tags) and <c>EmailAddress</c>. <c>GoogleMap</c>,
/// <c>Rating</c> and <c>Locations</c> are always set to "[NA]". Values that
/// are not found default to "[NA]". If anything throws, the exception is
/// forwarded to <c>frameworkExceptionInvoke</c> (when set) and the ad is
/// returned unchanged.
/// </remarks>
/// <param name="adInf">The ad to enrich; may be null.</param>
/// <returns>The same <paramref name="adInf"/> instance (or null when null was passed).</returns>
public Models.Advertisement EscrapeAdInfoExtend(Models.Advertisement adInf)
{
    if (adInf == null || string.IsNullOrEmpty(adInf.AdvertiserLink))
    {
        return adInf;
    }

    try
    {
        string _htmlDoc = HtmlUtil.GetPageDocument(adInf.AdvertiserLink);
        HtmlAgilityPack.HtmlDocument htmDocAg = new HtmlAgilityPack.HtmlDocument();
        htmDocAg.LoadHtml(_htmlDoc);

        string _latitude = "[NA]";
        string _longitude = "[NA]";
        HtmlAgilityPack.HtmlNode _mapDatNode = HtmlUtil.GetNode(htmDocAg.DocumentNode, "div", "id", "ypgMapContainer");
        if (_mapDatNode != null)
        {
            // A map script without usable coordinates now leaves "[NA]"
            // instead of throwing and aborting the other lookups below.
            string[] _coords = ParseVELatLong(_mapDatNode.InnerText);
            if (_coords != null)
            {
                _latitude = _coords[0];
                _longitude = _coords[1];
            }
        }

        string _keywords = "[NA]";
        HtmlAgilityPack.HtmlNode _metaKeywords = HtmlUtil.GetNode(htmDocAg.DocumentNode, "meta", "name", "keywords");
        if (_metaKeywords != null)
        {
            _keywords = _metaKeywords.Attributes["content"].Value;
        }

        string _description = "[NA]";
        HtmlAgilityPack.HtmlNode _metaDesc = HtmlUtil.GetNode(htmDocAg.DocumentNode, "meta", "name", "description");
        if (_metaDesc != null)
        {
            _description = _metaDesc.Attributes["content"].Value;
        }

        // Explicit checks replace the former empty catch around this lookup.
        // NOTE(review): the address is read from the "content" attribute of
        // the first anchor, as before - confirm that is the intended attribute.
        string _emailAdd = "[NA]";
        HtmlAgilityPack.HtmlNode _emailAdNode = HtmlUtil.GetNode(htmDocAg.DocumentNode, "div", "class", "busCardLeftLinks");
        if (_emailAdNode != null)
        {
            var _anchor = _emailAdNode.Descendants("a").FirstOrDefault();
            var _contentAttr = _anchor != null ? _anchor.Attributes["content"] : null;
            if (_contentAttr != null)
            {
                _emailAdd = _contentAttr.Value;
            }
        }

        adInf.Latitude = _latitude;
        adInf.Longtitude = _longitude;
        adInf.GoogleMap = "[NA]";
        adInf.Keywords = _keywords;
        adInf.Description = _description;
        adInf.Rating = "[NA]";
        adInf.EmailAddress = _emailAdd;
        adInf.Locations = "[NA]";
    }
    catch (Exception ex)
    {
        if (this.frameworkExceptionInvoke != null)
        {
            Exception _ex = new Exception(string.Format("Exception in {0}.{1}(?)", this.directoryProviderSetting.ServicedCountry.ToString(), "EscrapeAdInfoExtend"), ex);
            this.frameworkExceptionInvoke(new Handlers.EventHandlers.FrameworkExceptionEventArgs(_ex));
        }
    }

    return adInf;
}

/// <summary>
/// Pulls the two coordinates out of map script text such as
/// <c>latLong = new VELatLong(43.8087172232, -79.5469648855); map.CreateAndLoadMap</c>.
/// </summary>
/// <param name="mapScript">Inner text of the map container.</param>
/// <returns>
/// A two-element array { latitude, longitude } with surrounding whitespace
/// removed, or null when the markers or the two values cannot be found.
/// </returns>
private static string[] ParseVELatLong(string mapScript)
{
    const string _startMarker = "VELatLong(";
    const string _endMarker = "map.CreateAndLoadMap";

    if (string.IsNullOrEmpty(mapScript))
    {
        return null;
    }

    int _start = mapScript.IndexOf(_startMarker, StringComparison.Ordinal);
    if (_start < 0)
    {
        return null;
    }
    _start += _startMarker.Length;

    int _end = mapScript.IndexOf(_endMarker, _start, StringComparison.Ordinal);
    if (_end < 0)
    {
        return null;
    }

    string _args = mapScript.Substring(_start, _end - _start).Replace(");", "");
    string[] _parts = _args.Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
    if (_parts.Length < 2)
    {
        return null;
    }

    // Fix: the longitude used to keep the space that follows the comma.
    return new string[] { _parts[0].Trim(), _parts[1].Trim() };
}
/// <summary>
/// Builds an ad from one listing node of a search results page.
/// </summary>
/// <remarks>
/// Reads the business name (<c>span.listingTitle</c>), phone
/// (<c>div.phoneNumber</c>) and address (<c>div.address</c>), splits the
/// address into its parts, and looks up the website and ad link. Website
/// and ad link default to "[NA]"; <c>Fax</c> is always "[NA]".
/// <c>AdvertiserLink</c> is the provider's <c>BaseUrl</c> followed by the
/// ad link. Exceptions are forwarded to <c>frameworkExceptionInvoke</c>
/// (when set) rather than propagated.
/// </remarks>
/// <param name="htmNode">The listing node; may be null.</param>
/// <returns>The populated ad, or null when <paramref name="htmNode"/> is null or extraction failed.</returns>
public Models.Advertisement EscrapeAdInfo(HtmlAgilityPack.HtmlNode htmNode)
{
    if (htmNode == null)
    {
        return null;
    }

    Models.Advertisement result = null;
    try
    {
        string businessName = Models.Advertisement.Resolve(
            HtmlUtil.GetInnerText(htmNode, "span", "class", "listingTitle"));
        string phone = HtmlUtil.GetInnerText(htmNode, "div", "class", "phoneNumber");
        string fullAddress = HtmlUtil.GetInnerText(htmNode, "div", "class", "address");

        string streetBlk = string.Empty;
        string locality = string.Empty;
        string region = string.Empty;
        string postalCode = string.Empty;
        Parsers.SplitAddresses(fullAddress, ref streetBlk, ref locality, ref region, ref postalCode);

        // Website: only attempted when the links list exists. Any failure
        // while digging out the anchor leaves the "[NA]" default.
        string website = "[NA]";
        if (HtmlUtil.GetNode(htmNode, "ul", "class", "ypgListingLinks") != null)
        {
            try
            {
                // NOTE(review): the search starts from the listing node, not
                // from the links list found above - confirm this is intended.
                HtmlAgilityPack.HtmlNode linkItem = HtmlUtil.GetNode(htmNode, "li", "class", "noPrint");
                HtmlAgilityPack.HtmlNode linkAnchor = linkItem.Descendants("a").ToArray()[0];
                website = linkAnchor.Attributes["href"].Value.Replace("/gourl/", string.Empty);
            }
            catch
            {
            }
        }

        // Ad link: href of the first anchor inside the title line.
        string adLink = "[NA]";
        HtmlAgilityPack.HtmlNode titleLine = HtmlUtil.GetNode(htmNode, "h3", "class", "listingTitleLine");
        if (titleLine != null)
        {
            HtmlAgilityPack.HtmlNode titleAnchor = titleLine.Descendants("a").ToArray()[0];
            adLink = titleAnchor.Attributes["href"].Value;
        }

        // Populate a draft first, so the result stays null if anything
        // below throws.
        Models.Advertisement draft = new Models.Advertisement();
        draft.BusinessName = businessName;
        draft.Phone = phone;
        draft.Fax = "[NA]";
        draft.FullAddress = fullAddress;
        draft.StreetBlk = streetBlk;
        draft.Locality = locality;
        draft.Region = region;
        draft.PostalCode = postalCode;
        draft.Website = website;
        draft.AdvertiserLink = string.Format("{0}{1}", this.directoryProviderSetting.BaseUrl, adLink);
        result = draft;
    }
    catch (Exception ex)
    {
        if (this.frameworkExceptionInvoke != null)
        {
            Exception wrapped = new Exception(string.Format("Exception in {0}.{1}(?)", this.directoryProviderSetting.ServicedCountry.ToString(), "EscrapeAdInfo"), ex);
            this.frameworkExceptionInvoke(new Handlers.EventHandlers.FrameworkExceptionEventArgs(wrapped));
        }
    }

    return result;
}