/// <summary>
/// Loads the stored extraction with the highest Id that matches the requested make, model,
/// year range and source system, and returns the vehicles recorded against it.
/// </summary>
/// <param name="arguments">Supplies the Make, Model, From and To values the stored extraction must equal.</param>
/// <param name="sourceSystem">Source system value the stored extraction must equal.</param>
/// <returns>
/// The results populated with one vehicle per stored row, or <c>null</c> when no stored extraction matches.
/// </returns>
public IExtractionResults Read(IExtractionArguments arguments, string sourceSystem)
{
    // A single FirstOrDefault replaces the original Any() check followed by First(),
    // so the same filter is no longer evaluated twice.
    var mostRecentExtractionResult = _dataContext.ExtractionResultsViews
        .Where(sqlResult => sqlResult.MakeName == arguments.Make
                            && sqlResult.ModelName == arguments.Model
                            && sqlResult.From == arguments.From
                            && sqlResult.To == arguments.To
                            && sqlResult.SourceSystem == sourceSystem)
        .OrderByDescending(p => p.Id)
        .FirstOrDefault();

    if (mostRecentExtractionResult == null)
    {
        // Callers receive null (not an empty result) when nothing has been stored.
        return null;
    }

    IExtractionResults results = new ExtractionResults();

    var vehicles = _dataContext.Vehicles.Where(p => p.ExtractionResults_FK == mostRecentExtractionResult.Id);
    foreach (var vehicle in vehicles)
    {
        results.Vehicles.Add(new Core.Extraction.Vehicle()
        {
            // Make and model come from the extraction header; the rest from the vehicle row.
            Make = mostRecentExtractionResult.MakeName,
            Model = mostRecentExtractionResult.ModelName,
            Milage = vehicle.Milage.ToString(),
            Price = Convert.ToDouble(vehicle.Price),
            Title = vehicle.Title,
            Year = vehicle.Year
        });
    }

    return results;
}
/// <summary>
/// Scrapes the first results page, then every remaining page it links to, appending all
/// scraped vehicles to <paramref name="extractionResults"/>.
/// </summary>
/// <param name="args">Extraction arguments handed to the page scraper.</param>
/// <param name="extractionResults">Receives the vehicles; Start() is called before scraping and Stop() after.</param>
/// <exception cref="ArgumentNullException"><paramref name="extractionResults"/> is null.</exception>
public void Extract(IExtractionArguments args, IExtractionResults extractionResults)
{
    if (extractionResults == null)
    {
        throw new ArgumentNullException("extractionResults");
    }

    _log.Debug("Extraction starting");
    extractionResults.Start();

    // The first page is loaded on the calling thread; it also yields the remaining page URLs.
    var startUrl = _pageScraper.GetFirstPageUrl(args);
    HtmlDocument startPage = _htmlWebWrapper.Load(startUrl.OriginalString);
    var collected = extractionResults.Vehicles;
    collected.AddRange(_pageScraper.Scrape(args, startPage));

    List<string> otherPageUrls = _pageScraper.GetRemainingUrls(startPage);

    Parallel.ForEach(otherPageUrls, url =>
    {
        // Pages load concurrently; scraping and appending are serialized on _lockObject.
        var page = _htmlWebWrapper.Load(url);

        lock (_lockObject)
        {
            var target = extractionResults.Vehicles;
            target.AddRange(_pageScraper.Scrape(args, page));
        }
    });

    extractionResults.Stop();
}
/// <summary>
/// Builds the first-page URL for the requested make and model.
/// </summary>
/// <remarks>Example result: http://www.goo-net.com/usedcar/LEXUS__IS_F.html</remarks>
/// <param name="extractionArgs">Supplies the Make and Model placed in the URL.</param>
/// <returns>A <see cref="Uri"/> of the form {root}/{make}__{model}.html.</returns>
public Uri GetFirstPageUrl(IExtractionArguments extractionArgs)
{
    // Make and model are joined by a double underscore.
    const string pageTemplate = "{0}/{1}__{2}.html";

    return new Uri(string.Format(
        pageTemplate,
        GooNetArgumentBuilder.RootUrl,
        extractionArgs.Make,
        extractionArgs.Model));
}
/// <summary>
/// Builds the first-page URL: the base URI formatted with make and model, followed by one
/// formatted year segment per year from args.From to args.To (inclusive), followed by "search".
/// The result is also stored in the _firstPageUrl field.
/// </summary>
/// <param name="args">Supplies Make, Model and the inclusive From/To year range.</param>
/// <returns>The completed first-page URL.</returns>
public Uri GetFirstPageUrl(IExtractionArguments args)
{
    var makeModelUrl = string.Format(_baseUri.OriginalString, args.Make, args.Model);
    _firstPageUrl = new Uri(makeModelUrl);

    // Append one year segment per year in the range; an empty range adds nothing.
    var yearSegments = string.Empty;
    int year = args.From;
    while (year <= args.To)
    {
        yearSegments = yearSegments + string.Format(yearString, year);
        year++;
    }

    _firstPageUrl = new Uri(string.Concat(_firstPageUrl.OriginalString, yearSegments, "search"));
    return _firstPageUrl;
}
/// <summary>
/// Test setup: creates the mocks and fixtures, then builds the engine under test from them.
/// </summary>
public void Init()
{
    // Plain fixtures.
    _dummyUri = new Uri("http://dummyUrl.com");
    _dummyFirstDocument = new HtmlDocument();

    // Collaborators with no stubbed behaviour.
    _log = MockRepository.GenerateMock<ILog>();
    _htmlWrapper = MockRepository.GenerateMock<IHtmlWebWrapper>();
    _extractionArgs = MockRepository.GenerateMock<IExtractionArguments>();

    // Results mock exposes an empty vehicle list.
    _extractionResults = MockRepository.GenerateMock<IExtractionResults>();
    _extractionResults
        .Stub(p => p.Vehicles)
        .Return(new List<IVehicle>());

    // Scraper mock returns an empty vehicle list for any arguments and any page.
    _pageScraper = MockRepository.GenerateMock<IAutoTraderZaPageScraper>();
    _pageScraper
        .Stub(p => p.Scrape(Arg<IExtractionArguments>.Is.Anything, Arg<HtmlDocument>.Is.Anything))
        .Return(new List<IVehicle>());

    _extractorEngine = new AutoTraderExtractionEngine(_htmlWrapper, _log, _pageScraper);
}
/// <summary>
/// Extracts one vehicle per listing block found on the page. Listing blocks are the divs whose
/// class is exactly 'box_item_detail section ' or 'box_item_detail section no_goo_area'.
/// </summary>
/// <param name="args">Supplies the Make and Model copied onto every scraped vehicle.</param>
/// <param name="page">The parsed results page.</param>
/// <returns>The scraped vehicles; an empty list when the page contains no listing blocks.</returns>
public IList<IVehicle> Scrape(IExtractionArguments args, HtmlDocument page)
{
    IList<IVehicle> vehicles = new List<IVehicle>();

    var nodes = page.DocumentNode.SelectNodes("//div[@class='box_item_detail section ']");
    var gooNodes = page.DocumentNode.SelectNodes("//div[@class='box_item_detail section no_goo_area']");

    // Merge both result sets into a separate list (same order as before: regular listings first).
    // Either lookup can yield null; previously a page where both did caused a
    // NullReferenceException in the loop below. Copying also avoids mutating the
    // collection returned by SelectNodes.
    var listings = new List<HtmlNode>();
    if (nodes != null)
    {
        listings.AddRange(nodes);
    }
    if (gooNodes != null)
    {
        listings.AddRange(gooNodes);
    }

    foreach (var node in listings)
    {
        Vehicle vehicle = new Vehicle();

        // NOTE(review): as written, both arguments of the first Replace look identical (a no-op);
        // confirm whether the first was meant to be a non-breaking space.
        vehicle.Title = node.SelectNodes("div/div[@class='heading_inner']")
            .First()
            .InnerText.Trim()
            .Replace(" ", " ")
            .Replace("\t", string.Empty)
            .Replace("\n", string.Empty);

        vehicle.Make = args.Make;
        vehicle.Model = args.Model;
        vehicle.Milage = node.SelectNodes("div/div/table/tr/td[@class='w63']").First().InnerText;

        // An unparsable price is deliberately stored as 0 (the TryParse result is ignored).
        double price = 0;
        double.TryParse(node.SelectNodes("div/div/table/tr/td/div[@class='priceInfo']/p/em").First().InnerText, out price);
        vehicle.Price = price;

        // The year is the text before the first "(" in the cell. When there is no "(",
        // parse the whole cell instead of failing inside Substring with a negative length.
        var dirtyYear = node.SelectNodes("div/div/table/tr/td[@class='w66']")[0].InnerText;
        var parenIndex = dirtyYear.IndexOf('(');
        if (parenIndex >= 0)
        {
            dirtyYear = dirtyYear.Substring(0, parenIndex);
        }
        vehicle.Year = int.Parse(dirtyYear);

        vehicles.Add(vehicle);
    }

    return vehicles;
}
/// <summary>
/// Extracts one vehicle per div whose class is exactly 'searchResult ' on the page.
/// </summary>
/// <param name="args">Supplies the Make and Model copied onto every scraped vehicle.</param>
/// <param name="page">The parsed results page.</param>
/// <returns>The scraped vehicles; an empty list when the page has no matching divs.</returns>
public IList<IVehicle> Scrape(IExtractionArguments args, HtmlDocument page)
{
    var scraped = new List<IVehicle>();
    var resultNodes = page.DocumentNode.SelectNodes("//div[@class='searchResult ']");

    if (resultNodes != null)
    {
        foreach (var resultNode in resultNodes)
        {
            IVehicle current = new Vehicle();
            current.Make = args.Make;
            current.Model = args.Model;

            // The title is the trimmed text of the result's h2.serpTitle element.
            var titleNode = resultNode.SelectSingleNode(".//h2[@class='serpTitle']");
            current.Title = titleNode.InnerText.Trim();

            current.Price = GetVehiclePrice(resultNode);
            current.Milage = GetMilage(resultNode);
            current.Year = GetAge(resultNode);

            scraped.Add(current);
            _log.DebugFormat("Added {0}", current);
        }
    }

    return scraped;
}
/// <summary>
/// Test setup: creates the mocked arguments and log, then builds the page scraper under test
/// with the mocked log.
/// </summary>
public void Init()
{
    _extractionArguments = MockRepository.GenerateMock<IExtractionArguments>();

    // The scraper takes the log in its constructor, so the log mock is created first.
    _log = MockRepository.GenerateMock<ILog>();
    _pageScraper = new AutoTraderZaPageScraper(_log);
}
/// <summary>
/// Stores an extraction: one header row for the make/model/year range under the given source
/// system, plus one vehicle row per scraped vehicle, all submitted together.
/// </summary>
/// <param name="arguments">Supplies Make, Model, From and To for the header row.</param>
/// <param name="extractionResults">Supplies the vehicles to store.</param>
/// <param name="sourceSystem">Name of an existing source system.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="arguments"/> or <paramref name="extractionResults"/> is null.
/// </exception>
/// <exception cref="ArgumentException">
/// The source system, make or model is not present in the database.
/// </exception>
public void Write(IExtractionArguments arguments, IExtractionResults extractionResults, string sourceSystem)
{
    // Fail before touching the database; previously a null extractionResults was only
    // discovered after the header row had already been submitted.
    if (arguments == null)
    {
        throw new ArgumentNullException("arguments");
    }
    if (extractionResults == null)
    {
        throw new ArgumentNullException("extractionResults");
    }

    _log.DebugFormat("Writing extractionResults for {0}", extractionResults);

    var sourceSystemEntity = _dataContext.SourceSystems.FirstOrDefault(p => p.Name == sourceSystem);
    if (sourceSystemEntity == null)
    {
        throw new ArgumentException("SourceSystem does not exist in the database", "sourceSystem");
    }

    var makeEntity = _dataContext.Makes.FirstOrDefault(p => p.MakeName == arguments.Make && p.SourceSystem == sourceSystemEntity);
    if (makeEntity == null)
    {
        throw new ArgumentException("Make does not exist in the database", "arguments");
    }

    var modelEntity = makeEntity.Models.FirstOrDefault(p => p.ModelName == arguments.Model);
    if (modelEntity == null)
    {
        throw new ArgumentException("Model does not exist in the database", "arguments");
    }

    var resultEntity = new ExtractionResult()
    {
        Model = modelEntity,
        From = arguments.From,
        To = arguments.To,
        ExtractionDateTime = DateTime.Now
    };
    _dataContext.ExtractionResults.InsertOnSubmit(resultEntity);

    foreach (var vehicle in extractionResults.Vehicles)
    {
        _dataContext.Vehicles.InsertOnSubmit(new Vehicle()
        {
            // Linking through the association (not a raw key) lets the header and its
            // vehicles be queued together before anything is submitted.
            ExtractionResult = resultEntity,
            Milage = SafeInt(vehicle.Milage),
            Year = vehicle.Year,
            Price = Convert.ToDecimal(vehicle.Price),
            Title = vehicle.Title
        });
    }

    // Single submit for header and vehicles. The original submitted the header first, so a
    // failure while adding vehicles left a header row with no vehicles behind.
    // NOTE(review): assumes _dataContext has LINQ to SQL DataContext semantics (pending
    // inserts ordered by association within one SubmitChanges) — confirm.
    _dataContext.SubmitChanges();
}
/// <summary>
/// Not implemented; every call throws.
/// </summary>
/// <param name="arguments">Unused.</param>
/// <param name="results">Never assigned, because the method always throws.</param>
/// <returns>Never returns.</returns>
/// <exception cref="NotImplementedException">Always thrown.</exception>
public bool TryRead(IExtractionArguments arguments, out IExtractionResults results)
{
    throw new NotImplementedException();
}