示例#1
0
 /// <summary>
 /// Builds a <c>TableScraper</c> for the sii.cl "uf2013" page, stores it in
 /// <c>scraper</c>, and immediately loads it.
 /// </summary>
 public void CreateYear2013Scraper()
 {
     const string pageUrl = "http://www.sii.cl/pagina/valores/uf/uf2013.htm";

     // NOTE(review): offsets and counts are carried over unchanged; confirm them
     // against TableLayout's row/column semantics, which are not visible here.
     var layout = new TableLayout
     {
         XPathTemplate = "//*[@id=\"contenido\"]/table/tbody/tr[row]/td[column]",
         RowOffset     = 1,
         RowCount      = 1,
         ColumnOffset  = 31,
         ColumnCount   = 12
     };

     scraper = new TableScraper(pageUrl, layout);
     scraper.Load();
 }
示例#2
0
 /// <summary>
 /// Creates the table scraper from <c>Path()</c> and <c>CurrentLayout()</c> and
 /// subscribes <c>CellReading</c> to its <c>CellReader</c>.
 /// </summary>
 public void InitializeTableScraper()
 {
     // Evaluate the constructor arguments in the same order as a direct call would.
     var sourcePath   = Path();
     var sourceLayout = CurrentLayout();

     TableScraper = new TableScraper(sourcePath, sourceLayout);
     TableScraper.CellReader += CellReading;
 }
示例#3
0
        /// <summary>
        /// Scrapes a page of the "clients/contacts" listing, then visits each contact's
        /// detail page to fill in its form fields and its enquiries.
        /// </summary>
        /// <param name="start">Start value forwarded to the listing request.</param>
        /// <param name="limit">Limit value forwarded to the listing request.</param>
        /// <returns>
        /// The scraped contacts in listing order, each with <c>Enquiries</c> assigned and
        /// every enquiry's <c>ContactId</c> set to the owning contact's id.
        /// </returns>
        public async Task <ICollection <Contact> > GetContactsAsync(int start, int limit)
        {
            _logger.LogInformation("Scraping contacts");

            var contactScraper = new TableScraper <Contact>();
            var enquiryScraper = new TableScraper <Enquiry>();

            // Completes a single contact from its detail page and its enquiry tab.
            async Task <Contact> LoadDetailsAsync(Contact contact)
            {
                var response = await _htmlClient.GetAsync($"clients/contacts/{contact.Id}");
                response.EnsureSuccessStatusCode();
                var markup = await response.Content.ReadAsStringAsync();

                var detailPage = new HtmlDocument();
                detailPage.LoadHtml(markup);

                contactScraper.ScrapeForm(detailPage, "clients_contact", contact);

                var enquiryPage = await RunTabAsync(detailPage.DocumentNode,
                                                    "load_contact_enqlist_tab",
                                                    ("contactid", contact.Id),
                                                    ("enquiryid", "0"));

                contact.Enquiries = enquiryScraper.ScrapeTable(enquiryPage.DocumentNode).ToList();
                foreach (var enquiry in contact.Enquiries)
                {
                    enquiry.ContactId = contact.Id;
                }

                return contact;
            }

            if (start > 0)
            {
                // For start > 0 the server omits the header row, so the first record is
                // fetched and scraped up front to get the header. The result itself is
                // not needed; ToList() only forces the scrape to run.
                var headerPage = await RunAsync("clients/contacts", 0, 1);
                contactScraper.ScrapeTable(headerPage.DocumentNode).ToList();
            }

            var listingPage = await RunAsync("clients/contacts", start, limit);

            // Lazily split the listing into consecutive groups of BatchSize contacts.
            var groupedContacts = contactScraper.ScrapeTable(listingPage.DocumentNode)
                                  .Select((item, position) => (slot: position / BatchSize, item: item))
                                  .GroupBy(entry => entry.slot)
                                  .Select(slotGroup => slotGroup.Select(entry => entry.item).ToList());

            var results = new List <Contact>();

            // Contacts within a group are fetched concurrently; groups run one after another.
            foreach (var group in groupedContacts)
            {
                var idList = string.Join(", ", group.Select(member => member.Id));
                _logger.LogInformation("Scraping contact ids = " + idList);

                var pending   = group.Select(member => LoadDetailsAsync(member));
                var completed = await Task.WhenAll(pending);

                results.AddRange(completed);
            }

            return results;
        }