/// <summary>
/// Builds a summary <see cref="RightmovePropertyListing"/> for each
/// ".l-searchResult" element in the current document.
/// </summary>
/// <param name="html">
/// Not read by this method; the search results are queried from <c>_document</c>.
/// </param>
/// <returns>
/// The listings gathered from the page, or <c>null</c> when a result is still
/// marked as loading. In that case the browser is reloaded and the crawler
/// error counter is incremented before returning.
/// </returns>
private List<RightmovePropertyListing> FindPropertyListings(string html)
{
    //TODO: Validate the current HTML and throw an exception if the elements
    //      we expect are missing from the page. That validation has to allow
    //      for the case handled below, where the DOM simply hasn't caught up
    //      with the crawler. See ref xxx1.
    const string siteRoot = "https://www.rightmove.co.uk";

    var listings = new List<RightmovePropertyListing>();
    var resultCards = _document.QuerySelectorAll(".l-searchResult");

    // Only the search results page is read here. The details pages are not
    // crawled yet: the listings are filtered later using what is gathered in
    // this pass, which cuts down the number of requests, speeds up crawling
    // and makes the traffic look more human.
    foreach (IElement card in resultCards)
    {
        if (card.ClassList.Contains("l-searchResult-loading"))
        {
            // The listings sometimes haven't finished loading, so refresh the
            // page and let the caller see a null result. Ref: xxx1
            _browser.Reload(true);
            _crawlerErrors++;
            return null;
        }

        var listing = new RightmovePropertyListing();
        listing.Html = card.Html();

        var linkElement = card.QuerySelector(".propertyCard-link");
        listing.Link = siteRoot + linkElement.GetAttribute("href");

        var titleElement = card.QuerySelector(".propertyCard-title");
        listing.Title = titleElement.Text();

        // This is only the truncated description shown on the results page.
        // It is stored anyway because it may be enough for the later filter
        // to discard the listing without requesting the details page.
        var descriptionElement = card.QuerySelector(".propertyCard-description span span");
        listing.Description = descriptionElement.Text();

        var priceText = card.QuerySelector(".propertyCard-priceValue").Text();
        var qualifierText = card.QuerySelector(".propertyCard-priceQualifier")?.Text();
        SetPrice(listing, priceText, qualifierText);

        listings.Add(listing);
    }

    return listings;
}
/// <summary>
/// Stores the price and price qualifier on a listing, treating the labels
/// "POA", "Offers Invited" and "Sale by Tender" as listings without a
/// numeric price.
/// </summary>
/// <param name="rightmovePropertyListing">The listing to update.</param>
/// <param name="price">The price text scraped from the page.</param>
/// <param name="priceQualifier">The scraped price qualifier; may be <c>null</c>.</param>
private void SetPrice(RightmovePropertyListing rightmovePropertyListing, string price, string priceQualifier)
{
    //REVIEW: It might make sense just to consider any non-numeric as a price qualifier.
    switch (price)
    {
        case "POA":
        case "Offers Invited":
        case "Sale by Tender":
            // Price on application and similar. No property on Rightmove can
            // be priced at £0 (£1 is the minimum), so 0 is a safe marker for
            // "no numeric price". The label itself becomes the qualifier.
            rightmovePropertyListing.Price = 0;
            rightmovePropertyListing.PriceQualifier = price;
            break;

        default:
            rightmovePropertyListing.Price = ParsePrice(price);
            rightmovePropertyListing.PriceQualifier = priceQualifier;
            break;
    }
}
/// <summary>
/// Builds a <see cref="RightmovePropertyListing"/> from the property details
/// page currently held in <c>_document</c>.
/// </summary>
/// <returns>
/// A listing carrying the page HTML, browser address, title, description
/// markup, price, address, Open Graph image URLs and the date first listed.
/// <c>DateAdded</c> is <see cref="DateTime.MinValue"/> when the page has no
/// "#firstListedDateValue" text.
/// </returns>
private RightmovePropertyListing CrawlPropertyDetailsPage()
{
    //TODO: Validate the current HTML and throw an exception if the elements
    //      we expect to find are not present in the page.
    var listing = new RightmovePropertyListing
    {
        Html = _currentHtml,
        Link = _browser.Address.Trim(),
        Title = _document.QuerySelector("h1").Text(),
        Description = _document.QuerySelector(".agent-content").OuterHtml
    };

    var priceText = _document.QuerySelector(".property-header-price strong").Text();
    var qualifierText = _document.QuerySelector(".property-header-qualifier")?.Text();
    SetPrice(listing, priceText, qualifierText);

    var addressText = _document.QuerySelector(".property-header address").Text();
    listing.Address = addressText.Trim();

    // The image URLs are taken from the page's og:image meta tags.
    foreach (IElement metaTag in _document.QuerySelectorAll("meta[property=\"og:image\"]"))
    {
        listing.Images.Add(metaTag.GetAttribute("content"));
    }

    // Annoyingly, not all properties on Rightmove list when they were added,
    // so DateTime.MinValue stands in for "unknown".
    var firstListedText = _document.QuerySelector("#firstListedDateValue")?.Text();
    listing.DateAdded = string.IsNullOrEmpty(firstListedText)
        ? DateTime.MinValue
        : DateTime.ParseExact(firstListedText, "dd MMMM yyyy", new CultureInfo("en-GB"));

    return listing;
}
/// <summary>
/// Reports whether the listing's link is already present in <c>_urlLog</c>.
/// </summary>
/// <param name="rightmovePropertyListing">The listing whose <c>Link</c> is looked up.</param>
/// <returns><c>true</c> when <c>_urlLog</c> contains the link; otherwise <c>false</c>.</returns>
private bool HasPropertyAlreadyBeenCrawled(RightmovePropertyListing rightmovePropertyListing)
    => _urlLog.Contains(rightmovePropertyListing.Link);
/// <summary>
/// Decides whether a listing passes the current shortlist's price, term and
/// postcode filters.
/// </summary>
/// <param name="rightmovePropertyListing">The listing to test.</param>
/// <param name="fullTest">
/// When <c>true</c> (the default) the required-term and excluded-postcode
/// filters are applied in addition to the price and excluded-term filters.
/// Pass <c>false</c> for listings taken from the search results page, whose
/// description is incomplete.
/// </param>
/// <returns>
/// <c>true</c> when no filter rejects the listing; otherwise <c>false</c>.
/// </returns>
/// <remarks>
/// When <c>OptionsForm.DebugingEnabled</c> is set, the reason for a rejection
/// is stored in <c>_removedProperties</c> under the listing's link.
/// </remarks>
private bool PropertyMatchesShortlist(RightmovePropertyListing rightmovePropertyListing, bool fullTest = true)
{
    var listing = rightmovePropertyListing;

    // A Price of 0 is what SetPrice stores for listings without a numeric
    // price (POA etc.), so every price-range check requires Price > 0.
    if (_shortlist.MinimumPrice > 0 && listing.Price > 0 && listing.Price < _shortlist.MinimumPrice)
    {
        RecordRemovalReason(listing, "Price too low: " + listing.PriceQualifier + " " + listing.Price);
        return false;
    }

    if (_shortlist.MaximumPrice > 0 && listing.Price > 0 && listing.Price > _shortlist.MaximumPrice)
    {
        RecordRemovalReason(listing, "Price too high: " + listing.PriceQualifier + " " + listing.Price);
        return false;
    }

    // "Offers in Excess of" exactly at the maximum means the real price will
    // end up above the maximum.
    if (_shortlist.MaximumPrice > 0 && listing.Price > 0 && listing.Price == _shortlist.MaximumPrice
        && listing.PriceQualifier == "Offers in Excess of")
    {
        RecordRemovalReason(listing, "Price too high: Offers in Excess of " + listing.PriceQualifier + " " + listing.Price);
        return false;
    }

    if (_shortlist.ExcludePriceOnApplication && listing.PriceQualifier == "POA")
    {
        RecordRemovalReason(listing, "POA excluded.");
        return false;
    }

    if (_shortlist.ExcludeOffersInvited && listing.PriceQualifier == "Offers Invited")
    {
        RecordRemovalReason(listing, "Offers Invited excluded.");
        return false;
    }

    if (_shortlist.ExcludeTerms != null)
    {
        foreach (string term in _shortlist.ExcludeTerms)
        {
            // A blank term would produce the pattern \b\b, which matches at
            // any word boundary. Skip it so it can neither reject a listing
            // nor stop the remaining terms from being checked.
            if (string.IsNullOrEmpty(term))
            {
                continue;
            }

            if (ListingMentionsTerm(listing, term))
            {
                RecordRemovalReason(listing, "Excluded term found: " + term);
                return false;
            }
        }
    }

    if (fullTest)
    {
        // Required terms are only tested in a full test: the search listings
        // page doesn't include the full description, so testing earlier
        // could filter out properties that would match once the complete
        // description is available.
        //
        // The requirement is driven by IncludeTerms itself. It only applies
        // when at least one non-blank include term is configured.
        if (_shortlist.IncludeTerms != null)
        {
            bool hasRequiredTerms = false;
            bool requiredTermFound = false;

            foreach (string term in _shortlist.IncludeTerms)
            {
                if (string.IsNullOrEmpty(term))
                {
                    continue;
                }

                hasRequiredTerms = true;
                if (ListingMentionsTerm(listing, term))
                {
                    requiredTermFound = true;
                    break;
                }
            }

            if (hasRequiredTerms && !requiredTermFound)
            {
                RecordRemovalReason(listing, "One or more required terms were not found.");
                return false;
            }
        }

        if (_shortlist.ExcludePostcodes.Count() > 0)
        {
            // The postcode is read from the page HTML. A listing whose HTML
            // has no "propertyPostcode" entry is not filtered by postcode.
            var postcodeMatch = Regex.Match(listing.Html, "propertyPostcode: \"([^\"]+)\",");
            if (postcodeMatch.Success)
            {
                var postcode = Postcode.Parse(postcodeMatch.Groups[1].Value);
                foreach (string excludePostcode in _shortlist.ExcludePostcodes)
                {
                    if (Postcode.Parse(excludePostcode).IsPartcialMatch(postcode))
                    {
                        RecordRemovalReason(listing, "Postcode excluded: " + excludePostcode);
                        return false;
                    }
                }
            }
        }

        //TODO: finish filtering the listings.... AddedAfter, etc.
        //      Annoyingly, not all properties on Rightmove list when they
        //      were added, so any with a DateAdded of DateTime.MinValue were
        //      added on an unknown date.
    }

    return true;
}

/// <summary>
/// Tests whether a term occurs, as a whole word and ignoring case, in the
/// listing's title, description or price qualifier (checked in that order).
/// </summary>
/// <param name="rightmovePropertyListing">The listing whose text is searched.</param>
/// <param name="term">The term to look for.</param>
/// <returns><c>true</c> as soon as one of the three fields matches.</returns>
private static bool ListingMentionsTerm(RightmovePropertyListing rightmovePropertyListing, string term)
{
    // NOTE(review): the term is inserted into the pattern unescaped, so regex
    // metacharacters in a term are interpreted as regex syntax. Confirm
    // whether Regex.Escape is wanted here.
    //
    // RegexOptions.Compiled is deliberately not used: this regex is built for
    // a single listing and discarded, so compiling it would only add cost.
    var wholeWord = new Regex(@"\b" + term + @"\b", RegexOptions.IgnoreCase);

    var searchableTexts = new[]
    {
        rightmovePropertyListing.Title,
        rightmovePropertyListing.Description,
        rightmovePropertyListing.PriceQualifier
    };

    foreach (string text in searchableTexts)
    {
        // Null or empty fields are skipped; Regex.IsMatch throws on null.
        if (!string.IsNullOrEmpty(text) && wholeWord.IsMatch(text))
        {
            return true;
        }
    }

    return false;
}

/// <summary>
/// Records why a listing was filtered out, when debugging is enabled.
/// </summary>
/// <param name="rightmovePropertyListing">The rejected listing.</param>
/// <param name="reason">The text stored against the listing's link.</param>
private void RecordRemovalReason(RightmovePropertyListing rightmovePropertyListing, string reason)
{
    if (OptionsForm.DebugingEnabled)
    {
        // The same property can be rejected more than once (e.g. a featured
        // property), so the indexer is used rather than Add() so that a
        // repeat overwrites the earlier entry.
        _removedProperties[rightmovePropertyListing.Link] = reason;
    }
}