/// <summary>
/// Crawls http://books.toscrape.com/ and prints every entry of <c>items</c>
/// (price and title) to the console once the crawl finishes.
/// </summary>
/// <remarks>
/// <c>items</c> is reset at the start of each call. The two item callbacks
/// registered below are expected to fill it (they are defined elsewhere).
/// </remarks>
public static void run()
{
    items = new List<BookData>();
    var spider = new SimpleSpider("BooksToScrape", new Uri("http://books.toscrape.com/"));

    // Callback to gather links.
    // This callback can be replaced by:
    //   spider.Configuration.Auto_AnchorsLinks = true; (which is enabled by default)
    // and is here for demonstration purposes.
    spider.FetchCompleted += (s, a) =>
    {
        // Collect all "<a>" tags of the fetched page
        // (per the original note, via a simple SubString-based split)
        var links = AnchorHelper.GetAnchors(a.Link.Uri, a.Html);

        // Add the collected links to the queue.
        // A direct cast is used instead of "as": an unexpected sender then fails
        // with a descriptive InvalidCastException rather than a NullReferenceException.
        ((SimpleSpider)s).AddPages(links, a.Link);
    };

    // Callbacks to gather items
    spider.FetchCompleted += fetchCompleted_items_XPath;   // Sample using XPath
    spider.FetchCompleted += fetchCompleted_items_HObject; // Sample using HObject

    // Ignore (cancel) the pages containing "/reviews/"
    spider.ShouldFetch += (s, a) =>
    {
        a.Cancel = a.Link.Uri.ToString().Contains("/reviews/");
    };

    // Execute from the first page
    spider.Execute();

    // List all books
    foreach (var b in items)
    {
        Console.WriteLine($" > {b.Price:C2} {b.Title}");
    }
}
/// <summary>
/// Resets <c>Senadores</c>, reads the senator ids from the senate transparency
/// page and queues one page per id on a spider, which is then executed.
/// </summary>
/// <param name="ano">Year forwarded to <c>montarUriSenador</c> when building each senator's page address.</param>
public void Catalogar(int ano)
{
    Senadores = new List<Lib.Senado.Leg.Senador>();

    // Get the listing: fetch the page (with caching) and wrap the element matched by "//select"
    var listingUri = new Uri("https://www25.senado.leg.br/web/transparencia/sen/");
    var listingPage = FetchHelper.FetchResourceDocument(listingUri, enableCaching: true);
    var dropdown = new Select(listingPage.DocumentNode.SelectSingleNode("//select"));

    // Trimmed, non-empty option values are taken as senator ids (lazily evaluated)
    var idsSenadores = dropdown
        .GetItems()
        .Select(id => id.Value.Trim())
        .Where(id => !string.IsNullOrEmpty(id));

    // Walk the senate site to obtain the remaining data;
    // automatic link following is disabled, only the pages queued below are seeded
    var parameters = InitializationParams
        .Default002()
        .SetConfig(c => c.Disable_AutoAnchorsLinks());

    var spider = new SimpleSpider("Senado.Leg", new System.Uri("https://www6g.senado.leg.br/"), parameters);
    spider.FetchCompleted += Spider_FetchCompleted;

    // One page per senator id
    foreach (var senatorId in idsSenadores)
    {
        spider.AddPage(montarUriSenador(ano, senatorId), spider.BaseUri);
    }

    spider.Execute();
}
/// <summary>
/// Crawls http://quotes.toscrape.com/ using a SQLite storage for <c>Quote</c>
/// items, then prints the quotes whose "Author" is "Albert Einstein" followed
/// by all stored quotes.
/// </summary>
/// <remarks>
/// Similar to [RafaelEstevam.Simple.Spider.Test.Sample.BooksToScrape];
/// see that sample for a more in-depth coverage of the crawling part.
/// </remarks>
public static void run()
{
    // Create the storage instance
    var storage = new Storage.SQLiteStorage<Quote>();

    // Set the spider up to use it
    var parameters = new InitializationParams().SetStorage(storage);
    var startUri = new Uri("http://quotes.toscrape.com/");
    var spider = new SimpleSpider("QuotesToScrape", startUri, parameters);

    Console.WriteLine($"The sqlite database is at {storage.DatabaseFilePath}");
    Console.WriteLine($"The quotes are being stored in the table {storage.TableNameOfT}");

    spider.FetchCompleted += spider_FetchCompleted;
    spider.ShouldFetch += Spider_ShouldFetch;
    spider.Execute();

    // Query the storage directly, filtering on the "Author" column
    Console.WriteLine("Quotes from Albert Einstein");
    foreach (Quote q in storage.GetItemsWith("Author", "Albert Einstein"))
    {
        Console.WriteLine($"{q.Author}: {q.Text}");
    }

    // Retrieve everything through the spider's storage reference
    Console.WriteLine("All Quotes");
    foreach (Quote q in spider.Storage.RetrieveAllItems())
    {
        Console.WriteLine($"{q.Author}: {q.Text}");
    }
}
/// <summary>
/// Demonstrates the chained (fluent) configuration style: builds the
/// initialization parameters, registers chained ShouldFetch / ShouldUseCache
/// rules and runs the spider against http://quotes.toscrape.com/.
/// </summary>
public static void run()
{
    var parameters = new InitializationParams()
        .SetCacher(new ContentCacher())
        .SetDownloader(new WebClientDownloader())
        // .SetOfflineMode() could be chained here instead:
        // it removes all downloaders (sets a NullDownloader)
        .SetConfig(c => c
            .Enable_Caching()
            .Disable_Cookies()
            .Enable_AutoAnchorsLinks() // enable automatic link following
            .Set_CachingNoLimit()
            .Set_DownloadDelay(5000)); // presumably milliseconds - TODO confirm

    var startUri = new Uri("http://quotes.toscrape.com/");
    var spider = new SimpleSpider("QuotesToScrape", startUri, parameters);

    // Define the pages that should not be fetched.
    // ShouldFetch args support chaining to make cancelling resources easy.
    // Note: the order of the calls is very important.
    spider.ShouldFetch += (_, args) => args
        .CancelIfContains("/login")
        .CancelIfContains("/tag/")         // do not fetch tags
        .AllowIfContains("/tag/choices/"); // I like to have choices =)

    // Define the pages (or ages) that should not use the local cache;
    // this one can be chained too.
    spider.ShouldUseCache += (_, args) => args.CancelIfOlderThan(TimeSpan.FromHours(24));

    // Set up the fetch completed callback
    spider.FetchCompleted += Spider_FetchCompleted;

    // Execute
    spider.Execute();
}
/// <summary>
/// Crawls http://quotes.toscrape.com/ starting from page 1 (as built by
/// <c>buildPageUri</c>), handling parsed JSON through <c>json_ParsedData</c>.
/// </summary>
public static void run()
{
    var startUri = new Uri("http://quotes.toscrape.com/");
    var spider = new SimpleSpider("QuotesToScrape", startUri);

    // Hook the callback onto the first JsonParser registered on the spider
    // (First() throws if there is none)
    var jsonParser = spider.Parsers.OfType<JsonParser>().First();
    jsonParser.ParsedData += json_ParsedData;

    // Queue the first page
    spider.AddPage(buildPageUri(1), spider.BaseUri);

    // Run the crawl
    spider.Execute();
}
/// <summary>
/// Crawls http://quotes.toscrape.com/ starting from page 1, with a
/// <c>JsonDeserializeParser</c> for <c>QuotesObject</c> that reports to
/// <c>parsedResult_event</c>.
/// </summary>
public static void run()
{
    var startUri = new Uri("http://quotes.toscrape.com/");
    var spider = new SimpleSpider("QuotesToScrape", startUri);

    // Register a json parser for our QuotesObject class
    var quotesParser = new JsonDeserializeParser<QuotesObject>(parsedResult_event);
    spider.Parsers.Add(quotesParser);

    // Queue the first page: /api/quotes?page={pageNo}
    var firstPage = buildPageUri(1);
    spider.AddPage(firstPage, spider.BaseUri);

    // Run the crawl
    spider.Execute();
}
/// <summary>
/// Crawls http://books.toscrape.com/ with a JsonLines storage engine
/// configured, skipping every link whose address contains "/reviews/".
/// </summary>
public static void run()
{
    // Define a storage engine:
    // all stored items will be in the spider folder as JsonLines (https://jsonlines.org/)
    var jsonLinesStorage = new Storage.JsonLinesStorage();
    var parameters = new InitializationParams().SetStorage(jsonLinesStorage);

    var startUri = new Uri("http://books.toscrape.com/");
    var spider = new SimpleSpider("BooksToScrape", startUri, parameters);

    // Callback to gather items
    spider.FetchCompleted += fetchCompleted_items;

    // Ignore (cancel) the pages containing "/reviews/"
    spider.ShouldFetch += (s, a) => a.Cancel = a.Link.Uri.ToString().Contains("/reviews/");

    // Execute from the first page
    spider.Execute();
}
/// <summary>
/// Stores <paramref name="database"/> in <c>db</c>, creates the tables for
/// <c>PessoalModel</c> and <c>Deputado</c>, then crawls www.camara.leg.br
/// starting from the "quem-sao" deputies page.
/// </summary>
/// <param name="database">Database kept in the <c>db</c> field and used to create the tables.</param>
public static void run(SqliteDB database)
{
    db = database;

    // Make sure the tables for both models exist
    db.CreateTables()
      .Add<PessoalModel>()
      .Add<Deputado>()
      .Commit();

    // Default parameters, without automatic link following and with a download delay of 0
    var parameters = InitializationParams
        .Default002()
        .SetConfig(c => c
            .Disable_AutoAnchorsLinks()
            .Set_DownloadDelay(0));

    var spider = new SimpleSpider("camara_gabinete", new Uri("https://www.camara.leg.br/"), parameters);
    spider.FetchCompleted += Spider_FetchCompleted;

    // Seed the crawl with the deputies listing page
    var seedPage = new Uri("https://www.camara.leg.br/deputados/quem-sao");
    spider.AddPage(seedPage, spider.BaseUri);

    spider.Execute();
}
/// <summary>
/// Crawls http://quotes.toscrape.com/ starting from page 1, with the parser,
/// the storage and the configuration all supplied through the
/// initialization parameters.
/// </summary>
public static void run()
{
    var parameters = new InitializationParams()
        .SetCacher(new ContentCacher())
        .SetDownloader(new WebClientDownloader())
        // Json parser for our QuotesObject class (the received Json class)
        .AddParser(new Parsers.JsonDeserializeParser<QuotesObject>(parsedResult_event))
        // SQLite storage to keep all collected quotes (the single quote class)
        .SetStorage(new Storage.SQLiteStorage<Quote>())
        .SetConfig(c => c
            .Enable_Caching()
            .Disable_Cookies()
            .Disable_AutoAnchorsLinks()
            .Set_CachingNoLimit()
            .Set_DownloadDelay(5000)); // presumably milliseconds - TODO confirm

    var startUri = new Uri("http://quotes.toscrape.com/");
    var spider = new SimpleSpider("QuotesToScrape", startUri, parameters);

    // Queue the first page
    spider.AddPage(buildPageUri(1), spider.BaseUri);

    // Run the crawl
    spider.Execute();
}
/// <summary>
/// Configures and runs the "cota_camara" spider, starting from the
/// parliamentary quota index page of www.camara.leg.br.
/// </summary>
public void Executar()
{
    var parameters = InitializationParams
        .Default002() // Use the default settings
        .SetConfig(c => c
            // NOTE(review): the original comment said to wait 2s between requests,
            // but the configured delay is zero - confirm which one is intended.
            .Set_DownloadDelay(0)
            .Disable_AutoAnchorsLinks()); // Do not follow links automatically

    var spider = new SimpleSpider("cota_camara", new Uri("https://www.camara.leg.br"), parameters);

    // Allow leaving the *.camara.leg.br domain
    spider.Configuration.SpiderAllowHostViolation = true;

    // Get all the parliamentarians
    var indexPage = new Uri("https://www.camara.leg.br/cota-parlamentar/index.jsp");
    spider.AddPage(indexPage, spider.BaseUri);

    // Page handling callbacks
    spider.FetchCompleted += Spider_FetchCompleted;
    spider.FetchFailed += spider_FetchFailed;

    // Ignore some addresses
    spider.ShouldFetch += spider_ShouldFetch;

    // Go!
    spider.Execute();
}