/// <summary>
/// Assigns <c>scraper</c> a new <see cref="TableScraper"/> targeting the
/// uf2013.htm page on sii.cl with a fixed <see cref="TableLayout"/>, then
/// calls <c>Load()</c> on it.
/// </summary>
public void CreateYear2013Scraper()
{
    const string pageUrl = "http://www.sii.cl/pagina/valores/uf/uf2013.htm";

    // NOTE(review): "row" and "column" in the template are presumably
    // placeholders substituted by TableScraper - confirm against TableLayout.
    var layout = new TableLayout
    {
        XPathTemplate = "//*[@id=\"contenido\"]/table/tbody/tr[row]/td[column]",
        RowOffset = 1,
        RowCount = 1,
        ColumnOffset = 31,
        ColumnCount = 12
    };

    scraper = new TableScraper(pageUrl, layout);
    scraper.Load();
}
/// <summary>
/// Creates a new <c>TableScraper</c> from the values returned by
/// <c>Path()</c> and <c>CurrentLayout()</c>, stores it in the
/// <c>TableScraper</c> member, and adds <c>CellReading</c> as a handler
/// on its <c>CellReader</c>.
/// </summary>
public void InitializeTableScraper()
{
    // Evaluate the constructor arguments in the same order as a direct call would.
    var source = Path();
    var layout = CurrentLayout();

    TableScraper = new TableScraper(source, layout);
    TableScraper.CellReader += CellReading;
}
/// <summary>
/// Scrapes the "clients/contacts" listing for the requested window and then,
/// for every contact found, scrapes its detail form and its enquiry list.
/// Contacts are processed in groups of <c>BatchSize</c>; the requests inside
/// one group are started together and awaited with <see cref="Task.WhenAll{TResult}(IEnumerable{Task{TResult}})"/>,
/// while the groups themselves run one after another.
/// </summary>
/// <param name="start">Start value forwarded to the listing request.</param>
/// <param name="limit">Limit value forwarded to the listing request.</param>
/// <returns>
/// The scraped contacts, each with its form fields applied and its
/// <c>Enquiries</c> populated (every enquiry's <c>ContactId</c> set to the owning contact's id).
/// </returns>
/// <exception cref="System.Net.Http.HttpRequestException">
/// Propagated from <c>EnsureSuccessStatusCode()</c> when a contact detail request
/// does not succeed (assuming <c>_htmlClient</c> is an <c>HttpClient</c>).
/// </exception>
public async Task<ICollection<Contact>> GetContactsAsync(int start, int limit)
{
    _logger.LogInformation("Scraping contacts");

    var contactScraper = new TableScraper<Contact>();
    var enquiryScraper = new TableScraper<Enquiry>();

    if (start > 0)
    {
        // When start > 0, the server doesn't send a header,
        // so we need to go grab the first record to get the header.
        // The scraped rows are not needed; ToList() only forces the scrape to run
        // (presumably so contactScraper remembers the header - confirm in TableScraper).
        var headerHtml = await RunAsync("clients/contacts", 0, 1);
        _ = contactScraper.ScrapeTable(headerHtml.DocumentNode).ToList();
    }

    var tableHtml = await RunAsync("clients/contacts", start, limit);

    // Split the listing into consecutive groups of BatchSize contacts.
    var batches = contactScraper.ScrapeTable(tableHtml.DocumentNode)
        .Select((contact, i) => (batch: i / BatchSize, contact: contact))
        .GroupBy(x => x.batch)
        .Select(grp => grp.Select(x => x.contact).ToList());

    var contacts = new List<Contact>();

    foreach (var batch in batches)
    {
        _logger.LogInformation("Scraping contact ids = " + string.Join(", ", batch.Select(x => x.Id)));

        var batchTasks = batch.Select(async contact =>
        {
            string htmlString;

            // Dispose the response as soon as its body has been read into a string;
            // previously one undisposed response was left behind per contact.
            using (var response = await _htmlClient.GetAsync($"clients/contacts/{contact.Id}"))
            {
                response.EnsureSuccessStatusCode();
                htmlString = await response.Content.ReadAsStringAsync();
            }

            var html = new HtmlDocument();
            html.LoadHtml(htmlString);

            // Fill the contact with the fields found in the detail form.
            contactScraper.ScrapeForm(html, "clients_contact", contact);

            var enquiryTable = await RunTabAsync(
                html.DocumentNode,
                "load_contact_enqlist_tab",
                ("contactid", contact.Id),
                ("enquiryid", "0"));

            contact.Enquiries = enquiryScraper.ScrapeTable(enquiryTable.DocumentNode).ToList();

            // Link every enquiry back to the contact it was scraped for.
            foreach (var enquiry in contact.Enquiries)
            {
                enquiry.ContactId = contact.Id;
            }

            return contact;
        });

        // Wait for the whole batch before starting the next one.
        var batchContacts = await Task.WhenAll(batchTasks);
        contacts.AddRange(batchContacts);
    }

    return contacts;
}