/// <summary>
/// Processes every advertisement URL returned by <c>Source.GetList()</c> and saves the
/// accumulated changes once, after the loop has finished or was aborted.
/// </summary>
/// <remarks>
/// The loop is aborted after five consecutive failed crawls, or when a
/// <see cref="WebException"/> carries an HTTP 403 (Forbidden) response. Every other
/// exception is traced and the next item is processed.
/// </remarks>
static async Task Execute()
{
    // Number of consecutive failed crawls after which the run is aborted.
    const int maxConsecutiveFailures = 5;

    using (var dbContext = new RealEstateCrawlerContext())
    {
        var list = Source.GetList();
        var crawlFailedCount = 0;

        foreach (var item in list)
        {
            try
            {
                var crawlFailed = await ProcessAdvertisement(item, dbContext);

                // Any non-failed item resets the streak, so only failures in a row count.
                crawlFailedCount = crawlFailed ? crawlFailedCount + 1 : 0;

                if (crawlFailedCount == maxConsecutiveFailures)
                {
                    Trace.WriteLine("");
                    Trace.WriteLine($"Crawl failed {maxConsecutiveFailures} times in a row, aborting...");
                    break;
                }
            }
            catch (WebException exception)
            {
                Trace.WriteLine(exception);

                // Response is null when no response was received at all (timeout, DNS or
                // connection failure) and may be a non-HTTP response type. A hard cast
                // followed by a member access would throw from inside this handler, escape
                // the loop and skip the final save.
                var response = exception.Response as HttpWebResponse;

                if (response != null && response.StatusCode == HttpStatusCode.Forbidden)
                {
                    Trace.WriteLine("");
                    Trace.WriteLine("Blocked by the host, aborting...");
                    break;
                }
            }
            catch (Exception exception)
            {
                // Best effort: a failure on one item must not stop the remaining items.
                Trace.WriteLine(exception);
            }

            // Separator between items in the trace output.
            Trace.WriteLine("-");
        }

        await dbContext.SaveChangesAsync();
    }
}
/// <summary>
/// Crawls a single advertisement page and copies its details onto the matching
/// <c>Advertisement</c> entity, adding a new entity to the context when none exists yet.
/// </summary>
/// <param name="advertisementUrl">
/// Advertisement URL; the advertisement number is the text between its last "-" and its last "/".
/// </param>
/// <param name="dbContext">
/// Context used to look up and track the advertisement. This method does not save changes.
/// </param>
/// <returns>
/// Crawl failed? <c>true</c> only when the downloaded page contains no "classifiedInfo"
/// element; <c>false</c> when the advertisement was skipped or processed.
/// </returns>
static async Task<bool> ProcessAdvertisement(string advertisementUrl, RealEstateCrawlerContext dbContext)
{
    // Advertisement no
    var advertisementNo = ParseAdvertisementNo(advertisementUrl);

    // Look for an existing advertisement
    var advertisement = dbContext.AdvertisementSet.SingleOrDefault(adv => adv.AdvertisementNo == advertisementNo);

    // New advertisement?
    var newAdvertisement = advertisement == null;

    if (newAdvertisement)
    {
        advertisement = new Advertisement { AdvertisementNo = advertisementNo, AdvertisementUrl = advertisementUrl };
        dbContext.AdvertisementSet.Add(advertisement);
    }
    else
    {
        // Already deleted?
        if (advertisement.Deleted)
        {
            Trace.WriteLine($"Already deleted: {advertisementNo}");
            return false;
        }

        // Already crawled?
        if (advertisement.AdvertisementDate.HasValue)
        {
            // Todo Recrawl
            Trace.WriteLine($"Already crawled: {advertisementNo}");
            return false;
        }

        // TODO: a check that skipped advertisements whose ModifiedOn is less than 2 days old
        // ("Not crawled, but not old enough") was disabled here with a constant-false
        // condition; restore it once retry throttling is wanted again.
    }

    // Process html
    var html = await RequestHtml(advertisement);
    var doc = new HtmlDocument();
    doc.LoadHtml(html);

    // Main node: the first element whose class attribute is exactly "classifiedInfo"
    var infoNode = doc.DocumentNode
        .Descendants()
        .FirstOrDefault(node => node.Attributes["class"] != null
                                && node.Attributes["class"].Value.TrimEtc() == "classifiedInfo");

    // Crawl failed?
    if (infoNode == null)
    {
        Trace.WriteLine($"Crawl failed: {advertisement.AdvertisementNo}");
        return true;
    }

    // Price: strip the "." characters and the " TL" suffix
    var price = infoNode.Descendants("h3")
        .Single()
        .FirstChild
        .InnerText
        .Replace(".", "")
        .Replace(" TL", "")
        .TrimEtc();

    // Location
    var locationNodes = infoNode.Descendants("h2").Single().Descendants("a").ToList();
    var province = locationNodes[0].InnerText.TrimEtc();
    var district = locationNodes[1].InnerText.TrimEtc();
    var neighborhood = locationNodes[2].InnerText.TrimEtc();

    // Info list: labels and values are read from the same <ul>, looked up once
    var infoList = infoNode.Descendants("ul").Single();
    var infoListLabels = infoList.Descendants("strong").ToList();
    var infoListValues = infoList.Descendants("span").ToList();

    // Advertisement date
    var advertisementDate = ParseAdvertisementDate(infoListValues[1].InnerText.TrimEtc());

    var advertisementType = infoListValues[2].InnerText.TrimEtc();
    var squareMeters = infoListValues[3].InnerText.TrimEtc();
    var numberOfRooms = infoListValues[4].InnerText.TrimEtc();
    var buildingAge = infoListValues[5].InnerText.TrimEtc();
    var floor = infoListValues[6].InnerText.TrimEtc();
    var numberOfFloors = infoListValues[7].InnerText.TrimEtc();
    var heatingSystem = infoListValues[8].InnerText.TrimEtc();
    var numberOfToilets = infoListValues[9].InnerText.TrimEtc();
    var furnished = infoListValues[10].InnerText.TrimEtc();
    var currentState = infoListValues[11].InnerText.TrimEtc();
    var inComplex = infoListValues[12].InnerText.TrimEtc();
    var subscriptionCosts = infoListValues[13].InnerText.TrimEtc();

    // Only some of the advertisements have "Complex Name"; when it is present, the
    // values that follow it are shifted by one position.
    var hasComplexName = infoListLabels[14].InnerText == "Site Adı";
    var complexName = hasComplexName ? infoListValues[14].InnerText.TrimEtc() : string.Empty;
    var suitableForLoanIndex = hasComplexName ? 15 : 14;
    var advertisementOwnerIndex = hasComplexName ? 16 : 15;
    var swappableIndex = hasComplexName ? 17 : 16;

    var suitableForLoan = infoListValues[suitableForLoanIndex].InnerText.TrimEtc();
    var advertisementOwner = infoListValues[advertisementOwnerIndex].InnerText.TrimEtc();
    var swappable = infoListValues[swappableIndex].InnerText.TrimEtc();

    // numberOfFloors is traced under two labels ("totalFloors" and "numberOfFloors");
    // both lines are kept so the trace output stays the same.
    Trace.WriteLine($"advertisementNo: {advertisementNo}");
    Trace.WriteLine($"advertisementDate: {advertisementDate}");
    Trace.WriteLine($"province: {province}");
    Trace.WriteLine($"district: {district}");
    Trace.WriteLine($"neighborhood: {neighborhood}");
    Trace.WriteLine($"price: {price}");
    Trace.WriteLine($"advertisementType: {advertisementType}");
    Trace.WriteLine($"squareMeters: {squareMeters}");
    Trace.WriteLine($"numberOfRooms: {numberOfRooms}");
    Trace.WriteLine($"buildingAge: {buildingAge}");
    Trace.WriteLine($"floor: {floor}");
    Trace.WriteLine($"totalFloors: {numberOfFloors}");
    Trace.WriteLine($"heatingSystem: {heatingSystem}");
    Trace.WriteLine($"numberOfFloors: {numberOfFloors}");
    Trace.WriteLine($"numberOfToilets: {numberOfToilets}");
    Trace.WriteLine($"furnished: {furnished}");
    Trace.WriteLine($"currentState: {currentState}");
    Trace.WriteLine($"inComplex: {inComplex}");
    Trace.WriteLine($"subscriptionCosts: {subscriptionCosts}");
    Trace.WriteLine($"complexName: {complexName}");
    Trace.WriteLine($"suitableForLoan: {suitableForLoan}");
    Trace.WriteLine($"advertisementOwner: {advertisementOwner}");
    Trace.WriteLine($"swappable: {swappable}");

    // Parse the numeric values BEFORE touching the entity. If a parse threw halfway through
    // the assignments below, the tracked entity would be left half populated, and once
    // AdvertisementDate is set it would be skipped as "Already crawled" on later runs.
    var priceValue = decimal.Parse(price);
    var squareMetersValue = int.Parse(squareMeters);

    advertisement.Province = province;
    advertisement.District = district;
    advertisement.Neighborhood = neighborhood;
    advertisement.Price = priceValue;
    advertisement.AdvertisementDate = advertisementDate;
    advertisement.AdvertisementType = advertisementType;
    advertisement.SquareMeters = squareMetersValue;
    advertisement.NumberOfRooms = numberOfRooms;
    advertisement.BuildingAge = buildingAge;
    advertisement.Floor = floor;
    advertisement.NumberOfFloors = numberOfFloors;
    advertisement.HeatingSystem = heatingSystem;
    advertisement.NumberOfToilets = numberOfToilets;
    advertisement.Furnished = furnished;
    advertisement.CurrentState = currentState;
    advertisement.InComplex = inComplex;
    advertisement.SubscriptionCosts = subscriptionCosts;
    advertisement.ComplexName = complexName;
    advertisement.SuitableForLoad = suitableForLoan;
    advertisement.AdvertisementOwner = advertisementOwner;
    advertisement.Swappable = swappable;
    advertisement.ModifiedOn = DateTime.UtcNow;

    return false;
}

/// <summary>
/// Extracts the advertisement number: the text between the last "-" and the last "/" of the URL.
/// </summary>
static int ParseAdvertisementNo(string advertisementUrl)
{
    var start = advertisementUrl.LastIndexOf("-", StringComparison.InvariantCulture) + 1;
    var end = advertisementUrl.LastIndexOf("/", StringComparison.InvariantCulture);

    return int.Parse(advertisementUrl.Substring(start, end - start));
}

/// <summary>
/// Parses a "day month year" text: the day precedes the first space, the year follows the
/// last space, and the text in between is converted by <c>Utils.GetMonth</c>.
/// </summary>
static DateTime ParseAdvertisementDate(string advertisementDateText)
{
    var firstSpace = advertisementDateText.IndexOf(" ", StringComparison.InvariantCulture);
    var lastSpace = advertisementDateText.LastIndexOf(" ", StringComparison.InvariantCulture);

    var day = int.Parse(advertisementDateText.Substring(0, firstSpace));
    var month = Utils.GetMonth(advertisementDateText.Substring(firstSpace + 1, lastSpace - (firstSpace + 1)));
    var year = int.Parse(advertisementDateText.Substring(lastSpace + 1));

    return new DateTime(year, month, day);
}