Пример #1
0
        /// <summary>
        /// Processes every advertisement URL returned by <c>Source.GetList()</c> and saves the
        /// accumulated changes once the loop has ended.
        /// </summary>
        /// <remarks>
        /// The loop stops early after five failed crawls in a row, or as soon as a request is
        /// answered with HTTP 403 (Forbidden). Any other exception is traced and the loop moves on
        /// to the next item. Changes tracked up to that point are saved in every one of these cases.
        /// </remarks>
        static async Task Execute()
        {
            // Number of consecutive failed crawls after which the run is aborted
            const int maxConsecutiveFailures = 5;

            using (var dbContext = new RealEstateCrawlerContext())
            {
                var list             = Source.GetList();
                var crawlFailedCount = 0;

                foreach (var item in list)
                {
                    try
                    {
                        var crawlFailed = await ProcessAdvertisement(item, dbContext);

                        // Only failures in a row count; a single non-failed item resets the counter
                        crawlFailedCount = crawlFailed ? crawlFailedCount + 1 : 0;

                        if (crawlFailedCount == maxConsecutiveFailures)
                        {
                            Trace.WriteLine("");
                            Trace.WriteLine($"Crawl failed {maxConsecutiveFailures} times in a row, aborting...");
                            break;
                        }
                    }
                    catch (WebException exception)
                    {
                        Trace.WriteLine(exception);

                        // Response is null when the request never received an answer (timeout, DNS
                        // failure, ...) and is not guaranteed to be an HttpWebResponse. Dereferencing
                        // it blindly would throw from inside this handler, leave the loop and skip
                        // the save below.
                        var response = exception.Response as HttpWebResponse;
                        if (response != null && response.StatusCode == HttpStatusCode.Forbidden)
                        {
                            Trace.WriteLine("");
                            Trace.WriteLine("Blocked by the host, aborting...");
                            break;
                        }
                    }
                    catch (Exception exception)
                    {
                        // Best effort: a single broken advertisement must not stop the whole run
                        Trace.WriteLine(exception);
                    }

                    // Separator between the trace output of two advertisements
                    Trace.WriteLine("-");
                }

                await dbContext.SaveChangesAsync();
            }
        }
Пример #2
0
        /// <summary>
        /// Crawls a single advertisement page and copies the parsed values onto the matching
        /// <c>Advertisement</c> entity. When no entity with the same advertisement number exists,
        /// a new one is created and added to <paramref name="dbContext"/> before the page is requested.
        /// </summary>
        /// <param name="advertisementUrl">
        /// Advertisement url; the advertisement number is the text between its last "-" and its last "/".
        /// </param>
        /// <param name="dbContext">
        /// Context used to look up and track the entity. Nothing is saved by this method.
        /// </param>
        /// <returns>
        /// Crawl failed? <c>true</c> only when the downloaded page has no "classifiedInfo" node;
        /// <c>false</c> when the advertisement was skipped or parsed successfully.
        /// </returns>
        /// <remarks>
        /// Parsing errors (unexpected url or page layout, non-numeric values) surface as exceptions.
        /// The entity is only modified after every value has been parsed, so a failing page never
        /// leaves it partially filled.
        /// </remarks>
        static async Task<bool> ProcessAdvertisement(string advertisementUrl, RealEstateCrawlerContext dbContext)
        {
            // Advertisement no
            var advertisementNoStart = advertisementUrl.LastIndexOf("-", StringComparison.InvariantCulture) + 1;
            var advertisementNoEnd   = advertisementUrl.LastIndexOf("/", StringComparison.InvariantCulture);
            var advertisementNoText  = advertisementUrl.Substring(advertisementNoStart, advertisementNoEnd - advertisementNoStart);

            var advertisementNo = int.Parse(advertisementNoText);

            // Look for an existing advertisement
            var advertisement = dbContext.AdvertisementSet.SingleOrDefault(adv => adv.AdvertisementNo == advertisementNo);

            // New advertisement?
            var newAdvertisement = advertisement == null;

            if (newAdvertisement)
            {
                advertisement = new Advertisement
                {
                    AdvertisementNo  = advertisementNo,
                    AdvertisementUrl = advertisementUrl
                };

                dbContext.AdvertisementSet.Add(advertisement);
            }

            // Already deleted?
            if (!newAdvertisement && advertisement.Deleted)
            {
                Trace.WriteLine($"Already deleted: {advertisementNo}");
                return false;
            }

            // Already crawled?
            if (!newAdvertisement && advertisement.AdvertisementDate.HasValue)
            {
                // Todo Recrawl
                Trace.WriteLine($"Already crawled: {advertisementNo}");
                return false;
            }

            // Previous crawl failed, but not old enough?
            // Deliberately switched off through the leading "false &&"; kept so it can be re-enabled.
            if (false && !newAdvertisement && DateTime.UtcNow.Subtract(advertisement.ModifiedOn).Days < 2)
            {
                Trace.WriteLine($"Not crawled, but not old enough: {advertisementNo}");
                return false;
            }

            // Process html
            var html = await RequestHtml(advertisement);

            var doc = new HtmlDocument();

            doc.LoadHtml(html);

            // Main node: the first node whose trimmed class attribute is exactly "classifiedInfo"
            var infoNode = doc.DocumentNode
                           .Descendants()
                           .FirstOrDefault(node =>
                           {
                               var classAttribute = node.Attributes["class"];
                               return classAttribute != null && classAttribute.Value.TrimEtc() == "classifiedInfo";
                           });

            // Crawl failed?
            if (infoNode == null)
            {
                Trace.WriteLine($"Crawl failed: {advertisement.AdvertisementNo}");
                return true;
            }

            // Price: thousands separators and the currency suffix are stripped from the text
            var price = infoNode.Descendants("h3")
                        .Single()
                        .FirstChild
                        .InnerText
                        .Replace(".", "")
                        .Replace(" TL", "")
                        .TrimEtc();

            // Location
            var locationNodes = infoNode.Descendants("h2").Single().Descendants("a").ToList();
            var province      = locationNodes[0].InnerText.TrimEtc();
            var district      = locationNodes[1].InnerText.TrimEtc();
            var neighborhood  = locationNodes[2].InnerText.TrimEtc();

            // Info list: labels and values are read from the same single <ul>
            var infoList       = infoNode.Descendants("ul").Single();
            var infoListLabels = infoList.Descendants("strong").ToList();
            var infoListValues = infoList.Descendants("span").ToList();

            // Advertisement date: "<day> <month name> <year>", split on the first and last space
            var advertisementDateText = infoListValues[1].InnerText.TrimEtc();
            var firstSpace            = advertisementDateText.IndexOf(" ", StringComparison.InvariantCulture);
            var lastSpace             = advertisementDateText.LastIndexOf(" ", StringComparison.InvariantCulture);

            var advertisementDay   = int.Parse(advertisementDateText.Substring(0, firstSpace));
            var advertisementMonth = Utils.GetMonth(advertisementDateText.Substring(firstSpace + 1, lastSpace - (firstSpace + 1)));
            var advertisementYear  = int.Parse(advertisementDateText.Substring(lastSpace + 1));
            var advertisementDate  = new DateTime(advertisementYear, advertisementMonth, advertisementDay);

            var advertisementType = infoListValues[2].InnerText.TrimEtc();
            var squareMeters      = infoListValues[3].InnerText.TrimEtc();
            var numberOfRooms     = infoListValues[4].InnerText.TrimEtc();
            var buildingAge       = infoListValues[5].InnerText.TrimEtc();
            var floor             = infoListValues[6].InnerText.TrimEtc();
            var numberOfFloors    = infoListValues[7].InnerText.TrimEtc();
            var heatingSystem     = infoListValues[8].InnerText.TrimEtc();
            var numberOfToilets   = infoListValues[9].InnerText.TrimEtc();
            var furnished         = infoListValues[10].InnerText.TrimEtc();
            var currentState      = infoListValues[11].InnerText.TrimEtc();
            var inComplex         = infoListValues[12].InnerText.TrimEtc();
            var subscriptionCosts = infoListValues[13].InnerText.TrimEtc();

            // Only some of the advertisements have "Complex Name" ("Site Adı");
            // when it is present, every following value is shifted by one position
            var hasComplexName = infoListLabels[14].InnerText == "Site Adı";
            var complexName    = hasComplexName ? infoListValues[14].InnerText.TrimEtc() : string.Empty;

            var suitableForLoanIndex    = hasComplexName ? 15 : 14;
            var advertisementOwnerIndex = hasComplexName ? 16 : 15;
            var swappableIndex          = hasComplexName ? 17 : 16;

            var suitableForLoan    = infoListValues[suitableForLoanIndex].InnerText.TrimEtc();
            var advertisementOwner = infoListValues[advertisementOwnerIndex].InnerText.TrimEtc();
            var swappable          = infoListValues[swappableIndex].InnerText.TrimEtc();

            Trace.WriteLine($"advertisementNo: {advertisementNo}");
            Trace.WriteLine($"advertisementDate: {advertisementDate}");
            Trace.WriteLine($"province: {province}");
            Trace.WriteLine($"district: {district}");
            Trace.WriteLine($"neighborhood: {neighborhood}");
            Trace.WriteLine($"price: {price}");
            Trace.WriteLine($"advertisementType: {advertisementType}");
            Trace.WriteLine($"squareMeters: {squareMeters}");
            Trace.WriteLine($"numberOfRooms: {numberOfRooms}");
            Trace.WriteLine($"buildingAge: {buildingAge}");
            Trace.WriteLine($"floor: {floor}");
            // NOTE(review): prints the same value as the "numberOfFloors" line below; kept so the
            // trace output stays unchanged
            Trace.WriteLine($"totalFloors: {numberOfFloors}");
            Trace.WriteLine($"heatingSystem: {heatingSystem}");
            Trace.WriteLine($"numberOfFloors: {numberOfFloors}");
            Trace.WriteLine($"numberOfToilets: {numberOfToilets}");
            Trace.WriteLine($"furnished: {furnished}");
            Trace.WriteLine($"currentState: {currentState}");
            Trace.WriteLine($"inComplex: {inComplex}");
            Trace.WriteLine($"subscriptionCosts: {subscriptionCosts}");
            Trace.WriteLine($"complexName: {complexName}");
            Trace.WriteLine($"suitableForLoan: {suitableForLoan}");
            Trace.WriteLine($"advertisementOwner: {advertisementOwner}");
            Trace.WriteLine($"swappable: {swappable}");

            // Parse the numeric values BEFORE touching the entity. If one of them throws, the
            // tracked entity must stay untouched; otherwise a half-filled row (with its
            // AdvertisementDate already set) could be saved and then skipped as "Already crawled".
            // NOTE(review): both calls parse with the current culture - confirm that matches the
            // number format of the crawled site.
            var priceValue        = decimal.Parse(price);
            var squareMetersValue = int.Parse(squareMeters);

            advertisement.Province           = province;
            advertisement.District           = district;
            advertisement.Neighborhood       = neighborhood;
            advertisement.Price              = priceValue;
            advertisement.AdvertisementDate  = advertisementDate;
            advertisement.AdvertisementType  = advertisementType;
            advertisement.SquareMeters       = squareMetersValue;
            advertisement.NumberOfRooms      = numberOfRooms;
            advertisement.BuildingAge        = buildingAge;
            advertisement.Floor              = floor;
            advertisement.NumberOfFloors     = numberOfFloors;
            advertisement.HeatingSystem      = heatingSystem;
            advertisement.NumberOfToilets    = numberOfToilets;
            advertisement.Furnished          = furnished;
            advertisement.CurrentState       = currentState;
            advertisement.InComplex          = inComplex;
            advertisement.SubscriptionCosts  = subscriptionCosts;
            advertisement.ComplexName        = complexName;
            advertisement.SuitableForLoad    = suitableForLoan;
            advertisement.AdvertisementOwner = advertisementOwner;
            advertisement.Swappable          = swappable;
            advertisement.ModifiedOn         = DateTime.UtcNow;

            return false;
        }