public static void run()
{
    // Build the spider configuration fluently: cache content to disk,
    // download via WebClient, throttle to one request every 5 seconds.
    var initParams = new InitializationParams()
        .SetCacher(new ContentCacher())
        .SetDownloader(new WebClientDownloader())
        //.SetOfflineMode() // Remove all downloaders (sets a NullDownloader)
        .SetConfig(cfg => cfg.Enable_Caching()
                             .Disable_Cookies()
                             .Enable_AutoAnchorsLinks() // follow <a> links automatically
                             .Set_CachingNoLimit()
                             .Set_DownloadDelay(5000));

    var spider = new SimpleSpider("QuotesToScrape",
                                  new Uri("http://quotes.toscrape.com/"),
                                  initParams);

    // Filter which pages get fetched. ShouldFetch args support chaining;
    // note that the order of the chained rules is very important.
    spider.ShouldFetch += (_, args)
        => args.CancelIfContains("/login")
               .CancelIfContains("/tag/")         // do not fetch tag listings...
               .AllowIfContains("/tag/choices/"); // ...except this one =)

    // Define which pages should not use the local cache (chainable too):
    // reject cached copies older than one day.
    spider.ShouldUseCache += (_, args)
        => args.CancelIfOlderThan(new TimeSpan(24, 0, 0));

    // Process each completed download
    spider.FetchCompleted += Spider_FetchCompleted;

    // Crawl
    spider.Execute();
}
static void Main(string[] args)
{
    // Sample launcher: uncomment one line at a time and use the debugger's
    // Step Into feature (F11) to watch the spider in action.

    // Spider samples
    //Sample.BooksToScrape.run();
    //Sample.QuotesToScrape_Chaining.run();
    //Sample.QuotesToScrapeAPI_Chaining.run(); // similar, but Json-based (API)
    //Sample.QuotesToScrapeAPI_Scroll.run();
    //Sample.QuotesToScrapeAPI_Scroll_Deserialize.run();

    // Helper samples
    //Sample.QuotesToScrape_Login.run();    // Capture a form element and submit it
    //Sample.ApiPooler_FetcherHelper.run(); // Easy fetch of single resources

    // Easy-parsing sample
    //Sample.QuotesToScrape_HObject.run();  // Parse quotes with HObject

    // Storage engines
    //Sample.BooksToScrape_StorageJsonLines.run();
    //ModulesSamples.Storage_Sqlite_Quotes.run();

    // Scratch area: add a file named IgnoreMe.cs with a static void run()
    // on it to play around with the spider without touching the samples.
    //IgnoreMe.run();

    SimpleSpider.HowToUse_PrintToConsole();

    Console.WriteLine("-END-");
    Console.ReadKey();
}
private void processaNF(SimpleSpider spider, FetchCompleteEventArgs args)
{
    // Extracts the NF-e (electronic invoice) redirect URL from pages marked
    // with the <!--NFE-API--> sentinel, records it, and queues it for fetching.
    //
    // Rewritten with guard clauses instead of empty if/else blocks, a single
    // IndexOf scan instead of Contains + IndexOf, and bounds checks so a
    // malformed page cannot trigger ArgumentOutOfRangeException.

    int marker = args.Html.IndexOf("<!--NFE-API-->");
    if (marker < 0) return; // Nothing to do yet

    var html = args.Html.Substring(marker);

    // The URL is assigned via a javascript ".location" redirect
    if (!html.Contains(".location")) return; // Nothing to do yet

    int urlStart = html.IndexOf("http");
    if (urlStart < 0) return; // sentinel present but no URL found

    html = html.Substring(urlStart);

    int closingQuote = html.IndexOf("\"");
    if (closingQuote < 0) return; // unterminated URL literal

    var uri = new Uri(html.Substring(0, closingQuote));
    listaNotasAcessar.Add(uri);
    spider.AddPage(uri, args.Link);
}
// Similar to [RafaelEstevam.Simple.Spider.Test.Sample.BooksToScrape],
// see for more in depth cover of the crawling part
public static void run()
{
    // Persist every collected Quote in a sqlite table
    var sqliteStorage = new Storage.SQLiteStorage<Quote>();

    // Tell the spider to use that storage engine
    var initParams = new InitializationParams().SetStorage(sqliteStorage);

    var spider = new SimpleSpider("QuotesToScrape",
                                  new Uri("http://quotes.toscrape.com/"),
                                  initParams);

    Console.WriteLine($"The sqlite database is at {sqliteStorage.DatabaseFilePath}");
    Console.WriteLine($"The quotes are being stored in the table {sqliteStorage.TableNameOfT}");

    spider.FetchCompleted += spider_FetchCompleted;
    spider.ShouldFetch += Spider_ShouldFetch;
    spider.Execute();

    // Query back through the storage engine directly...
    Console.WriteLine("Quotes from Albert Einstein");
    foreach (Quote q in sqliteStorage.GetItemsWith("Author", "Albert Einstein"))
    {
        Console.WriteLine($"{q.Author}: {q.Text}");
    }

    // ...or through the spider's Storage property
    Console.WriteLine("All Quotes");
    foreach (Quote q in spider.Storage.RetrieveAllItems())
    {
        Console.WriteLine($"{q.Author}: {q.Text}");
    }
}
public void Catalogar(int ano)
{
    Senadores = new List<Lib.Senado.Leg.Senador>();

    // Fetch the senator listing page (cached) and pull the ids
    // out of the first <select> element
    var page = FetchHelper.FetchResourceDocument(
        new Uri("https://www25.senado.leg.br/web/transparencia/sen/"),
        enableCaching: true);
    var select = new Select(page.DocumentNode.SelectSingleNode("//select"));

    var idsSenadores = select.GetItems()
                             .Select(item => item.Value.Trim())
                             .Where(value => !string.IsNullOrEmpty(value));

    // Crawl the senate site for the remaining data; disable automatic
    // anchor following so only explicitly queued pages are fetched
    var initParams = InitializationParams
        .Default002()
        .SetConfig(cfg => cfg.Disable_AutoAnchorsLinks());

    var spider = new SimpleSpider("Senado.Leg",
                                  new System.Uri("https://www6g.senado.leg.br/"),
                                  initParams);
    spider.FetchCompleted += Spider_FetchCompleted;

    // Queue one detail page per senator for the given year
    foreach (var idSenador in idsSenadores)
    {
        spider.AddPage(montarUriSenador(ano, idSenador), spider.BaseUri);
    }
    spider.Execute();
}
private void processaSumarizado(SimpleSpider spider, FetchCompleteEventArgs args)
{
    // Registers the Deputado (representative) described on this page
    var tag = new Tag(args.GetDocument());

    // The id comes from the first query-string pair: the value after '='
    var firstQueryPair = args.Link.Uri.Query.Split('&')[0];
    var id = firstQueryPair.Split('=')[1].ToInt();

    var h3 = tag.SelectTag("//h3[@class=\"header\"]");
    var deputado = new Deputado()
    {
        Id = id,
        Nome = h3.SelectTag<Anchor>().InnerText.Trim(),
        PartidoLideranca = h3.Node.ChildNodes[2].InnerText.Trim()
    };
    ListaDeputados.Add(deputado);

    // Queue the expense rows: the first row is the header and the
    // last one is the total, neither holds data
    var linhasDespesa = tag.SelectTags("//table//tr")
                           .Skip(1)
                           .SkipLast(1);
    var linksDespesa = linhasDespesa.Select(tr => tr.SelectTag<Anchor>());
    spider.AddPages(linksDespesa, args.Link);
}
public static void run()
{
    items = new List<BookData>();

    var spider = new SimpleSpider("BooksToScrape", new Uri("http://books.toscrape.com/"));

    // Link-gathering callback. Shown for demonstration only: setting
    // spider.Configuration.Auto_AnchorsLinks = true (the default)
    // achieves the same thing automatically.
    spider.FetchCompleted += (sender, eventArgs) =>
    {
        // Collect all "<a>" tags with a simple SubString-based split
        var anchors = AnchorHelper.GetAnchors(eventArgs.Link.Uri, eventArgs.Html);
        // Queue everything that was found
        (sender as SimpleSpider).AddPages(anchors, eventArgs.Link);
    };

    // Item-gathering callbacks, one per parsing technique
    spider.FetchCompleted += fetchCompleted_items_XPath;   // Sample using XPath
    spider.FetchCompleted += fetchCompleted_items_HObject; // Sample using HObject

    // Ignore (cancel) the pages containing "/reviews/"
    spider.ShouldFetch += (sender, eventArgs) =>
    {
        eventArgs.Cancel = eventArgs.Link.Uri.ToString().Contains("/reviews/");
    };

    // Crawl starting from the base address
    spider.Execute();

    // Print every collected book
    foreach (var book in items)
    {
        Console.WriteLine($" > {book.Price:C2} {book.Title}");
    }
}
void IParserBase.Parse(SimpleSpider spider, FetchCompleteEventArgs FetchInfo)
{
    // Raises ParsedData with the fetched content parsed as a JObject.
    // Null-conditional Invoke replaces the check-then-call pattern, which
    // could throw NullReferenceException if the last subscriber detached
    // between the null check and the invocation; argument evaluation
    // (including JObject.Parse) is still skipped when there is no handler.
    ParsedData?.Invoke(spider,
                       new ParserEventArgs<JObject>(FetchInfo: FetchInfo,
                                                    Data: JObject.Parse(FetchInfo.Html)));
}
void IParserBase.Parse(SimpleSpider spider, FetchCompleteEventArgs FetchInfo)
{
    // Raises ParsedData with the raw response bytes decoded via the
    // configured Encoding and parsed as XML. Null-conditional Invoke
    // replaces the check-then-call pattern, which could throw
    // NullReferenceException if the last subscriber detached between
    // the null check and the invocation.
    ParsedData?.Invoke(spider,
                       new ParserEventArgs<XElement>(FetchInfo,
                                                     XElement.Parse(Encoding.GetString(FetchInfo.Result))));
}
public static void run()
{
    var spider = new SimpleSpider("QuotesToScrape", new Uri("http://quotes.toscrape.com/"));

    // Register a json parser that deserializes each response into a
    // QuotesObject and hands it to parsedResult_event
    var jsonParser = new JsonDeserializeParser<QuotesObject>(parsedResult_event);
    spider.Parsers.Add(jsonParser);

    // Seed the queue with the first API page: /api/quotes?page={pageNo}
    spider.AddPage(buildPageUri(1), spider.BaseUri);

    // Crawl
    spider.Execute();
}
public static void run()
{
    var spider = new SimpleSpider("QuotesToScrape", new Uri("http://quotes.toscrape.com/"));

    // Hook the built-in json parser's callback
    // (First() throws if no JsonParser is registered — expected here)
    var jsonParser = spider.Parsers.OfType<JsonParser>().First();
    jsonParser.ParsedData += json_ParsedData;

    // Seed the queue with the first page
    spider.AddPage(buildPageUri(1), spider.BaseUri);

    // Crawl
    spider.Execute();
}
private void processaDocumento(SimpleSpider spider, FetchCompleteEventArgs args)
{
    // Parses an expense-receipt detail page and appends a Despesa record
    // to ListaDespesas. Field positions below are tied to the page layout:
    // four consecutive <ul class="listaDefinicao"> blocks whose <span>
    // children are addressed by fixed indices — do not reorder.

    // Deputy id: value of the first query-string pair
    int idDeputado = args.Link.Uri.Query
        .Split('&')[0]
        .Split('=')[1]
        .ToInt();

    var hObj = args.GetHObject();
    var ULs = hObj["ul > .listaDefinicao"].ToArray();
    // NOTE(review): index 0 is skipped — presumably a block without
    // expense data; confirm against the live page layout.
    var ulDados = ULs[1];        // document data (number, dates)
    var ulFornecedor = ULs[2];   // supplier data
    var ulValorDespesa = ULs[3]; // expense value / reimbursement
    var ulValores2 = ULs[4];     // deductions, disallowances, refunds

    // Expense type code: value of the fourth query-string pair
    int codigoDespesa = args.Link.Uri.Query
        .Split('&')[3]
        .Split('=')[1]
        .ToInt();

    //string nomeDepsesa = ulDados["span"][1].Trim();
    string numero = ulDados["span"][5].Trim();      // document number
    string dtEmissao = ulDados["span"][7].Trim();   // issue date
    string competencia = ulDados["span"][9].Trim(); // reference month (MM/yyyy)

    string fornecedorNome = ulFornecedor["span"][1].Trim();
    string fornecedorCnpj = ulFornecedor["span"][5].Trim();

    string valorDespesa = ulValorDespesa["span"][0].Trim();
    string deducoes = ulValores2["span"][0].Trim();
    string glosas = ulValores2["span"][1].Trim();
    string restituicoes = ulValores2["span"][2].Trim();
    string reembolso = ulValorDespesa["span"][5].Trim();

    ListaDespesas.Add(new Despesa()
    {
        IdDeputado = idDeputado,
        TipoDespesa = (TiposDespesa)codigoDespesa,
        Numero = numero,
        DocumentoFornecedor = fornecedorCnpj,
        // Supplier name may contain HTML entities; decode before storing
        NomeFornecedor = WebUtility.HtmlDecode(fornecedorNome),
        DataEmissao = LocalizationHelper.ParseDatetime(dtEmissao),
        // Competencia is month/year only; prepend day 01 to parse it
        DataCompetencia = LocalizationHelper.ParseDatetime("01/" + competencia),
        ValorDespesa = converteValor(valorDespesa),
        Deducoes = converteValor(deducoes),
        Glosas = converteValor(glosas),
        Restituicoes = converteValor(restituicoes),
        Reembolso = converteValor(reembolso)
    });
}
public static void run()
{
    // Configure a JsonLines storage engine: every stored item is written
    // into the spider folder, one json document per line.
    // JsonLines: https://jsonlines.org/
    var initParams = new InitializationParams()
        .SetStorage(new Storage.JsonLinesStorage());

    var spider = new SimpleSpider("BooksToScrape",
                                  new Uri("http://books.toscrape.com/"),
                                  initParams);

    // Gather items on every fetched page
    spider.FetchCompleted += fetchCompleted_items;

    // Ignore (cancel) the pages containing "/reviews/"
    spider.ShouldFetch += (sender, eventArgs) =>
    {
        eventArgs.Cancel = eventArgs.Link.Uri.ToString().Contains("/reviews/");
    };

    // Crawl starting from the first page
    spider.Execute();
}
public static void run(SqliteDB database)
{
    db = database;

    // Make sure the destination tables exist before crawling
    db.CreateTables()
      .Add<PessoalModel>()
      .Add<Deputado>()
      .Commit();

    // Default settings, but fetch only explicitly queued pages
    // and do not wait between requests
    var initParams = InitializationParams.Default002()
        .SetConfig(cfg => cfg.Disable_AutoAnchorsLinks()
                             .Set_DownloadDelay(0));

    var spider = new SimpleSpider("camara_gabinete",
                                  new Uri("https://www.camara.leg.br/"),
                                  initParams);
    spider.FetchCompleted += Spider_FetchCompleted;

    // Entry point: the "who are they" listing page
    spider.AddPage(new Uri("https://www.camara.leg.br/deputados/quem-sao"), spider.BaseUri);
    spider.Execute();
}
private void processaAnalitico(SimpleSpider spider, FetchCompleteEventArgs args)
{
    // Walks the detail table and queues the pages of interest;
    // the last row is the grand total and is skipped
    var linhas = new Tag(args.GetDocument())
        .SelectTags("//table/tbody/tr")
        .SkipLast(1);

    foreach (var linha in linhas)
    {
        var anchor = linha.SelectTag<Anchor>(".//a");

        // Receipt detail page
        if (anchor.Href.Contains("/documento?nuDeputadoId"))
        {
            spider.AddPage(anchor, args.Link);
        }
        // Electronic invoice page, e.g.:
        // https://www.camara.leg.br/cota-parlamentar/nota-fiscal-eletronica?ideDocumentoFiscal=0000000
        if (anchor.Href.Contains("/nota-fiscal-eletronica?"))
        {
            spider.AddPage(anchor, args.Link);
        }
    }
}
public static void run()
{
    // Full pipeline: disk cache + WebClient downloader + typed json
    // parser + sqlite storage, throttled to one request every 5 seconds
    var initParams = new InitializationParams()
        .SetCacher(new ContentCacher())
        .SetDownloader(new WebClientDownloader())
        // Deserialize each response into QuotesObject and raise parsedResult_event
        .AddParser(new Parsers.JsonDeserializeParser<QuotesObject>(parsedResult_event))
        // Keep every collected quote in a sqlite database
        .SetStorage(new Storage.SQLiteStorage<Quote>())
        .SetConfig(cfg => cfg.Enable_Caching()
                             .Disable_Cookies()
                             .Disable_AutoAnchorsLinks()
                             .Set_CachingNoLimit()
                             .Set_DownloadDelay(5000));

    var spider = new SimpleSpider("QuotesToScrape",
                                  new Uri("http://quotes.toscrape.com/"),
                                  initParams);

    // Seed the queue with the first page
    spider.AddPage(buildPageUri(1), spider.BaseUri);

    // Crawl
    spider.Execute();
}
public void Executar()
{
    // Default settings; no delay between requests and no automatic
    // anchor following (only explicitly queued pages are fetched)
    var initParams = InitializationParams
        .Default002()
        .SetConfig(cfg => cfg.Set_DownloadDelay(000)
                             .Disable_AutoAnchorsLinks());

    var spider = new SimpleSpider("cota_camara",
                                  new Uri("https://www.camara.leg.br"),
                                  initParams);
    // Allow the spider to leave the *.camara.leg.br domain
    spider.Configuration.SpiderAllowHostViolation = true;

    // Entry point listing every representative
    spider.AddPage(new Uri("https://www.camara.leg.br/cota-parlamentar/index.jsp"), spider.BaseUri);

    // Page-processing and failure callbacks
    spider.FetchCompleted += Spider_FetchCompleted;
    spider.FetchFailed += spider_FetchFailed;

    // Skip some addresses
    spider.ShouldFetch += spider_ShouldFetch;

    // Go
    spider.Execute();
}
/// <summary>
/// Adds a collection of Anchors to the fetch queue
/// </summary>
/// <param name="spider">Spider to add the pages to</param>
/// <param name="anchors">Anchors to fetch</param>
/// <param name="SourcePage">Uri where the Anchors were found</param>
/// <returns>Array of Links</returns>
public static Spider.Link[] AddPages(this SimpleSpider spider, IEnumerable<Anchor> anchors, Uri SourcePage)
{
    // Queue each anchor individually and collect the resulting links
    var links = new List<Spider.Link>();
    foreach (var anchor in anchors)
    {
        links.Add(AddPage(spider, anchor, SourcePage));
    }
    return links.ToArray();
}
/// <summary>
/// Adds a single Anchor to the fetch queue
/// </summary>
/// <param name="spider">Spider to add the page to</param>
/// <param name="anchor">Anchor to fetch</param>
/// <param name="SourcePage">Uri where the Anchor was found</param>
/// <returns>Link object</returns>
public static Spider.Link AddPage(this SimpleSpider spider, Anchor anchor, Uri SourcePage)
{
    // Resolve the (possibly relative) anchor against the page it came from
    var resolvedUri = anchor.GetUri(SourcePage);
    return spider.AddPage(resolvedUri, SourcePage);
}
void IParserBase.Parse(SimpleSpider spider, FetchCompleteEventArgs fetchInfo)
{
    // Deserializes the fetched content into T and raises ParsedData.
    // Null-conditional Invoke fixes a NullReferenceException when no
    // handler is attached — the other parsers guard against a null
    // delegate, but this one invoked it unconditionally.
    var result = JsonConvert.DeserializeObject<T>(fetchInfo.Html);
    ParsedData?.Invoke(spider, new ParserEventArgs<T>(fetchInfo, result));
}
private void processaCupom(SimpleSpider spider, FetchCompleteEventArgs args)
{
    // Processes a fetched consumer receipt (cupom fiscal) page:
    // skips state portals the spider cannot handle yet, follows iframe
    // redirections, and counts pages that identify the buyer by CPF.

    if (args.Html.Contains("Nota não encontrada")) return; // receipt not found
    if (args.Html == "") return; // empty response, reason unknown

    // State portals that cannot be processed with a plain GET.
    // Table-driven instead of one if/return per host so new states
    // are a one-line change.
    var host = args.Link.Uri.Host;
    string[] skippedHostFragments =
    {
        ".rj.gov.", // uses POST
        ".es.gov.", // uses POST
        ".rr.gov.", // uses POST
        ".pb.gov.", // captcha
        ".ro.gov.", // captcha
        ".ma.gov.", // captcha
        ".ap.gov.", // redirect -> captcha
        ".pi.gov.", // internal error; maybe it comes back some day?
    };
    foreach (var fragment in skippedHostFragments)
    {
        if (host.Contains(fragment)) return;
    }

    // Some portals wrap the receipt in an iframe: queue its source instead
    if (args.Html.Contains("iframe"))
    {
        var frame = new Tag(args.GetDocument()).SelectTag<IFrame>();
        if (frame == null) return; // don't know how to grab this one yet
        spider.AddPage(new Uri(frame.Src), args.Link);
        return;
    }

    if (args.Html.Contains("CPF"))
    {
        contagemCPF++;
    }
    // NOTE(review): the original also split the page on "CNPJ" and tested
    // for more than one occurrence, but both branches were empty — dead
    // code (apparently unfinished CNPJ counting), removed here.
}