示例#1
0
        /// <summary>
        /// Builds a listing for every <c>.l-searchResult</c> element in the current
        /// document, using only what the search results page itself exposes.
        /// </summary>
        /// <param name="html">Not read by this method; results are taken from <c>_document</c>.</param>
        /// <returns>
        /// The listings found, or <c>null</c> when a result was still loading, in which
        /// case the browser is reloaded and the crawler error count is incremented.
        /// </returns>
        private List<RightmovePropertyListing> FindPropertyListings(string html)
        {
            //TODO: Validate the current HTML and throw an exception if the
            // elements we expect to find are not present in the page. Take
            // into account the issue below whereby the DOM simply hasn't
            // caught up with the crawler. See ref xxx1.

            var listings = new List<RightmovePropertyListing>();

            // Gather as much as possible from the results page first. The listings
            // are filtered later using this information, which reduces the number
            // of requests needed, speeds up crawling and makes the traffic look
            // more human. That is why the property details pages are not crawled here.
            foreach (IElement resultCard in _document.QuerySelectorAll(".l-searchResult"))
            {
                bool stillLoading = resultCard.ClassList.Contains("l-searchResult-loading");
                if (stillLoading)
                {
                    // Sometimes the property listings don't load quickly enough,
                    // so the page has to be refreshed to get them. Ref: xxx1
                    _browser.Reload(true);
                    _crawlerErrors++;
                    return null;
                }

                var listing = new RightmovePropertyListing
                {
                    Html = resultCard.Html(),
                    Link = "https://www.rightmove.co.uk" + resultCard.QuerySelector(".propertyCard-link").GetAttribute("href"),
                    Title = resultCard.QuerySelector(".propertyCard-title").Text(),

                    // Only the truncated description is available here; it is kept anyway
                    // because it may let the later filtering step avoid a request. The
                    // complete description comes from the property details page.
                    Description = resultCard.QuerySelector(".propertyCard-description span span").Text()
                };

                // The qualifier element is optional, hence the null-conditional access.
                var priceText = resultCard.QuerySelector(".propertyCard-priceValue").Text();
                var qualifierText = resultCard.QuerySelector(".propertyCard-priceQualifier")?.Text();
                SetPrice(listing, priceText, qualifierText);

                listings.Add(listing);
            }

            return listings;
        }
示例#2
0
 /// <summary>
 /// Populates the price and price qualifier of a listing from the raw price text.
 /// </summary>
 /// <param name="rightmovePropertyListing">The listing to update.</param>
 /// <param name="price">The price text as scraped from the page.</param>
 /// <param name="priceQualifier">The scraped qualifier text; may be <c>null</c>.</param>
 private void SetPrice(RightmovePropertyListing rightmovePropertyListing, string price, string priceQualifier)
 {
     //REVIEW: It might make sense just to consider any non-numeric as a price qualifier.
     switch (price)
     {
         case "POA":
         case "Offers Invited":
         case "Sale by Tender":
             // Price on application and similar: there is no number to parse, so
             // the text itself becomes the qualifier. No property on Rightmove can
             // have a price of £0 (£1 is the minimum property price), so 0 is a
             // safe marker for "no numeric price".
             rightmovePropertyListing.Price          = 0;
             rightmovePropertyListing.PriceQualifier = price;
             break;

         default:
             rightmovePropertyListing.Price          = ParsePrice(price);
             rightmovePropertyListing.PriceQualifier = priceQualifier;
             break;
     }
 }
示例#3
0
        /// <summary>
        /// Builds a listing from the property details page currently held in
        /// <c>_document</c>, <c>_currentHtml</c> and the browser address.
        /// </summary>
        /// <returns>
        /// The populated listing. <c>DateAdded</c> is <see cref="DateTime.MinValue"/>
        /// when the page does not show a first-listed date.
        /// </returns>
        private RightmovePropertyListing CrawlPropertyDetailsPage()
        {
            //TODO: Validate the current HTML and throw an exception if the
            // elements we expect to find are not present in the page.

            var listing = new RightmovePropertyListing
            {
                Html = _currentHtml,
                Link = _browser.Address.Trim(),
                Title = _document.QuerySelector("h1").Text(),
                Description = _document.QuerySelector(".agent-content").OuterHtml
            };

            // The qualifier element is optional, hence the null-conditional access.
            var headlinePrice = _document.QuerySelector(".property-header-price strong").Text();
            var headlineQualifier = _document.QuerySelector(".property-header-qualifier")?.Text();
            SetPrice(listing, headlinePrice, headlineQualifier);

            listing.Address = _document.QuerySelector(".property-header address").Text().Trim();

            // Image URLs are read from the page's og:image meta tags.
            foreach (IElement ogImage in _document.QuerySelectorAll("meta[property=\"og:image\"]"))
            {
                listing.Images.Add(ogImage.GetAttribute("content"));
            }

            // Annoyingly, not all properties on Rightmove list when they were added;
            // those get DateTime.MinValue.
            var firstListed = _document.QuerySelector("#firstListedDateValue")?.Text();
            listing.DateAdded = string.IsNullOrEmpty(firstListed)
                ? DateTime.MinValue
                : DateTime.ParseExact(firstListed, "dd MMMM yyyy", new CultureInfo("en-GB"));

            return listing;
        }
示例#4
0
 /// <summary>
 /// Reports whether the listing's link is already present in <c>_urlLog</c>.
 /// </summary>
 /// <param name="rightmovePropertyListing">The listing whose link is looked up.</param>
 /// <returns><c>true</c> if <c>_urlLog</c> contains the link; otherwise <c>false</c>.</returns>
 private bool HasPropertyAlreadyBeenCrawled(RightmovePropertyListing rightmovePropertyListing)
     => _urlLog.Contains(rightmovePropertyListing.Link);
示例#5
0
        /// <summary>
        /// Decides whether a listing passes the filters of the current shortlist.
        /// </summary>
        /// <param name="rightmovePropertyListing">The listing to test.</param>
        /// <param name="fullTest">
        /// When <c>true</c>, the required-term and excluded-postcode filters are applied
        /// in addition to the price, qualifier and excluded-term filters. Pass
        /// <c>false</c> for listings built from the search results page, which only
        /// carries a partial description.
        /// </param>
        /// <returns>
        /// <c>true</c> if no filter rejected the listing; otherwise <c>false</c>. When
        /// <c>OptionsForm.DebugingEnabled</c> is set, the reason for a rejection is
        /// stored in <c>_removedProperties</c> under the listing's link.
        /// </returns>
        private bool PropertyMatchesShortlist(RightmovePropertyListing rightmovePropertyListing, bool fullTest = true)
        {
            var price = rightmovePropertyListing.Price;
            var qualifier = rightmovePropertyListing.PriceQualifier;

            // A price of 0 marks a listing without a numeric price (POA etc.), so the
            // price range filters only apply to prices above 0.
            bool hasNumericPrice = price > 0;

            if (hasNumericPrice && _shortlist.MinimumPrice > 0 && price < _shortlist.MinimumPrice)
            {
                RecordRemovedProperty(rightmovePropertyListing, "Price too low: " + qualifier + " " + price);
                return false;
            }

            if (hasNumericPrice && _shortlist.MaximumPrice > 0)
            {
                if (price > _shortlist.MaximumPrice)
                {
                    RecordRemovedProperty(rightmovePropertyListing, "Price too high: " + qualifier + " " + price);
                    return false;
                }

                // "Offers in Excess of" exactly at the maximum means the real price
                // will be above the maximum.
                if (price == _shortlist.MaximumPrice && qualifier == "Offers in Excess of")
                {
                    RecordRemovedProperty(rightmovePropertyListing, "Price too high: " + qualifier + " " + price);
                    return false;
                }
            }

            if (_shortlist.ExcludePriceOnApplication && qualifier == "POA")
            {
                RecordRemovedProperty(rightmovePropertyListing, "POA excluded.");
                return false;
            }

            if (_shortlist.ExcludeOffersInvited && qualifier == "Offers Invited")
            {
                RecordRemovedProperty(rightmovePropertyListing, "Offers Invited excluded.");
                return false;
            }

            foreach (string term in _shortlist.ExcludeTerms)
            {
                // An empty term can never be a meaningful exclusion; skip it so that
                // it cannot stop the remaining terms from being checked.
                if (string.IsNullOrEmpty(term))
                {
                    continue;
                }

                if (ListingMentionsTerm(rightmovePropertyListing, term))
                {
                    RecordRemovedProperty(rightmovePropertyListing, "Excluded term found: " + term);
                    return false;
                }
            }

            if (!fullTest)
            {
                return true;
            }

            // Required terms are only tested in a full test: the search listings page
            // doesn't include the full description, so testing earlier could filter
            // out properties that would match once the full description is known.
            // The guard must look at IncludeTerms (not ExcludeTerms), otherwise an
            // empty include list would reject every listing.
            if (_shortlist.IncludeTerms.Count() > 0)
            {
                bool requiredTermFound = false;
                foreach (string term in _shortlist.IncludeTerms)
                {
                    if (ListingMentionsTerm(rightmovePropertyListing, term))
                    {
                        requiredTermFound = true;
                        break;
                    }
                }

                if (!requiredTermFound)
                {
                    RecordRemovedProperty(rightmovePropertyListing, "One or more required terms were not found.");
                    return false;
                }
            }

            if (_shortlist.ExcludePostcodes.Count() > 0)
            {
                var postcodeMatch = _propertyPostcodeRegex.Match(rightmovePropertyListing.Html);
                if (postcodeMatch.Success)
                {
                    var postcode = Postcode.Parse(postcodeMatch.Groups[1].Value);
                    foreach (string excludePostcode in _shortlist.ExcludePostcodes)
                    {
                        var excludedPostcode = Postcode.Parse(excludePostcode);
                        if (excludedPostcode.IsPartcialMatch(postcode))
                        {
                            RecordRemovedProperty(rightmovePropertyListing, "Postcode excluded: " + excludePostcode);
                            return false;
                        }
                    }
                }
            }

            //TODO: finish filtering the listings (AddedAfter, etc.). Annoyingly, not
            // all properties on Rightmove list when they were added, so any with a
            // DateAdded of DateTime.MinValue were added on an unknown date.

            return true;
        }

        // Extracts the value of a  propertyPostcode: "...",  fragment from the listing HTML.
        private static readonly Regex _propertyPostcodeRegex = new Regex("propertyPostcode: \"([^\"]+)\",", RegexOptions.Compiled);

        // Returns true when the term occurs, case-insensitively and between word boundaries,
        // in the listing's title, description or price qualifier. Null or empty fields never match.
        private static bool ListingMentionsTerm(RightmovePropertyListing listing, string term)
        {
            // NOTE(review): the term is spliced into the pattern unescaped, so regex
            // metacharacters in a term are interpreted as regex syntax (and an invalid
            // pattern throws). Confirm whether terms are meant to be literal before
            // wrapping this in Regex.Escape.
            // RegexOptions.Compiled is deliberately not used: this regex is built for a
            // single listing and then discarded, so compiling it costs more than it saves.
            var termRegex = new Regex(@"\b" + term + @"\b", RegexOptions.IgnoreCase);

            return (!string.IsNullOrEmpty(listing.Title) && termRegex.IsMatch(listing.Title))
                || (!string.IsNullOrEmpty(listing.Description) && termRegex.IsMatch(listing.Description))
                || (!string.IsNullOrEmpty(listing.PriceQualifier) && termRegex.IsMatch(listing.PriceQualifier));
        }

        // Stores why a listing was filtered out, but only when debugging is enabled.
        private void RecordRemovedProperty(RightmovePropertyListing listing, string reason)
        {
            if (OptionsForm.DebugingEnabled)
            {
                // The property might be excluded more than once if it is a featured
                // property, hence the indexer rather than _removedProperties.Add().
                _removedProperties[listing.Link] = reason;
            }
        }