/// <summary>
/// Rebuilds the senator catalogue: reads the senator ids from the select element
/// of the transparency listing page and crawls one page per id with a spider.
/// </summary>
/// <param name="ano">Year forwarded to <c>montarUriSenador</c> when building each senator page address.</param>
public void Catalogar(int ano)
{
    Senadores = new List<Lib.Senado.Leg.Senador>();

    // Fetch the listing page (with caching enabled) and wrap its first <select> element.
    var listingUri = new Uri("https://www25.senado.leg.br/web/transparencia/sen/");
    var listing = FetchHelper.FetchResourceDocument(listingUri, enableCaching: true);
    var senatorSelect = new Select(listing.DocumentNode.SelectSingleNode("//select"));
    var items = senatorSelect.GetItems();

    // Walk the senate site to collect the remaining data; automatic anchor links are disabled.
    var init = InitializationParams
        .Default002()
        .SetConfig(cfg => cfg.Disable_AutoAnchorsLinks());
    var spider = new SimpleSpider("Senado.Leg", new System.Uri("https://www6g.senado.leg.br/"), init);
    spider.FetchCompleted += Spider_FetchCompleted;

    foreach (var item in items)
    {
        var id = item.Value.Trim();
        if (string.IsNullOrEmpty(id))
        {
            // Items with a blank value carry no senator id.
            continue;
        }

        spider.AddPage(montarUriSenador(ano, id), spider.BaseUri);
    }

    spider.Execute();
}
/// <summary>
/// Looks for the redirect target that follows the <c>&lt;!--NFE-API--&gt;</c> marker in the
/// fetched HTML and, when one is found, records it in <c>listaNotasAcessar</c> and queues it
/// on the spider.
/// </summary>
/// <param name="spider">Spider that receives the extracted address.</param>
/// <param name="args">Fetch result whose <c>Html</c> is scanned and whose <c>Link</c> is used as the source page.</param>
/// <remarks>
/// Returns without doing anything when the marker, the <c>.location</c> text, the <c>http</c>
/// prefix, the closing quote, or a well-formed absolute URI is missing.
/// </remarks>
private void processaNF(SimpleSpider spider, FetchCompleteEventArgs args)
{
    const string marker = "<!--NFE-API-->";

    // Ordinal search: one scan both detects the marker and gives its position.
    var markerIndex = args.Html.IndexOf(marker, StringComparison.Ordinal);
    if (markerIndex < 0)
    {
        // Nothing to do yet.
        return;
    }

    var html = args.Html.Substring(markerIndex);
    if (!html.Contains(".location"))
    {
        // Nothing to do yet.
        return;
    }

    // The address is the text from the first "http" up to the next double quote.
    var urlStart = html.IndexOf("http", StringComparison.Ordinal);
    if (urlStart < 0)
    {
        return;
    }

    html = html.Substring(urlStart);
    var urlEnd = html.IndexOf('"');
    if (urlEnd < 0)
    {
        return;
    }

    if (!Uri.TryCreate(html.Substring(0, urlEnd), UriKind.Absolute, out var uri))
    {
        // Malformed address: skip instead of throwing from the fetch callback.
        return;
    }

    listaNotasAcessar.Add(uri);
    spider.AddPage(uri, args.Link);
}
/// <summary>
/// Crawls quotes.toscrape.com, subscribing <c>json_ParsedData</c> to the spider's
/// first registered <c>JsonParser</c> and starting from page 1.
/// </summary>
public static void run()
{
    var siteUri = new Uri("http://quotes.toscrape.com/");
    var spider = new SimpleSpider("QuotesToScrape", siteUri);

    // Hook the callback onto the first JsonParser among the spider's parsers.
    var jsonParser = spider.Parsers.OfType<JsonParser>().First();
    jsonParser.ParsedData += json_ParsedData;

    // Seed the crawl with the first page.
    var firstPage = buildPageUri(1);
    spider.AddPage(firstPage, spider.BaseUri);

    spider.Execute();
}
/// <summary>
/// Crawls quotes.toscrape.com with a <c>JsonDeserializeParser</c> for
/// <c>QuotesObject</c> added to the spider's parsers, starting from page 1.
/// </summary>
public static void run()
{
    var siteUri = new Uri("http://quotes.toscrape.com/");
    var spider = new SimpleSpider("QuotesToScrape", siteUri);

    // Register a JSON parser for QuotesObject, constructed with parsedResult_event.
    var quotesParser = new JsonDeserializeParser<QuotesObject>(parsedResult_event);
    spider.Parsers.Add(quotesParser);

    // Seed the crawl with the first page: /api/quotes?page={pageNo}
    var firstPage = buildPageUri(1);
    spider.AddPage(firstPage, spider.BaseUri);

    spider.Execute();
}
/// <summary>
/// Scans the rows of the fetched table and queues the document and
/// electronic-invoice links found in them.
/// </summary>
/// <param name="spider">Spider that receives the discovered links.</param>
/// <param name="args">Fetch result providing the document to scan and the source link.</param>
private void processaAnalitico(SimpleSpider spider, FetchCompleteEventArgs args)
{
    var linhas = new Tag(args.GetDocument())
        .SelectTags("//table/tbody/tr")
        .SkipLast(1); // The last row is the total

    foreach (var linha in linhas)
    {
        var lnk = linha.SelectTag<Anchor>(".//a");
        var href = lnk?.Href;
        if (string.IsNullOrEmpty(href))
        {
            // Row without a usable anchor: nothing to follow.
            continue;
        }

        if (href.Contains("/documento?nuDeputadoId"))
        {
            spider.AddPage(lnk, args.Link);
        }

        // e.g. https://www.camara.leg.br/cota-parlamentar/nota-fiscal-eletronica?ideDocumentoFiscal=0000000
        if (href.Contains("/nota-fiscal-eletronica?"))
        {
            spider.AddPage(lnk, args.Link);
        }
    }
}
/// <summary>
/// Stores the database, creates the <c>PessoalModel</c> and <c>Deputado</c> tables,
/// and crawls www.camara.leg.br starting from the "quem-sao" deputies page.
/// </summary>
/// <param name="database">Database assigned to <c>db</c> and used to create the tables.</param>
public static void run(SqliteDB database)
{
    db = database;

    // Create the tables for the two model types.
    db.CreateTables()
        .Add<PessoalModel>()
        .Add<Deputado>()
        .Commit();

    // Default002 preset with automatic anchor links disabled and a download delay of 0.
    var init = InitializationParams
        .Default002()
        .SetConfig(cfg => cfg.Disable_AutoAnchorsLinks().Set_DownloadDelay(0));

    var siteUri = new Uri("https://www.camara.leg.br/");
    var spider = new SimpleSpider("camara_gabinete", siteUri, init);
    spider.FetchCompleted += Spider_FetchCompleted;

    var startPage = new Uri("https://www.camara.leg.br/deputados/quem-sao");
    spider.AddPage(startPage, spider.BaseUri);

    spider.Execute();
}
/// <summary>
/// Crawls quotes.toscrape.com with an explicitly assembled configuration: content
/// cacher, web-client downloader, a JSON parser for <c>QuotesObject</c> and a
/// SQLite storage for <c>Quote</c>, starting from page 1.
/// </summary>
public static void run()
{
    var siteUri = new Uri("http://quotes.toscrape.com/");

    var init = new InitializationParams()
        .SetCacher(new ContentCacher())
        .SetDownloader(new WebClientDownloader())
        // JSON parser for the received QuotesObject class, constructed with parsedResult_event
        .AddParser(new Parsers.JsonDeserializeParser<QuotesObject>(parsedResult_event))
        // SQLite storage that keeps every collected quote (single Quote class)
        .SetStorage(new Storage.SQLiteStorage<Quote>())
        .SetConfig(cfg => cfg
            .Enable_Caching()
            .Disable_Cookies()
            .Disable_AutoAnchorsLinks()
            .Set_CachingNoLimit()
            .Set_DownloadDelay(5000));

    var spider = new SimpleSpider("QuotesToScrape", siteUri, init);

    // Seed the crawl with the first page.
    var firstPage = buildPageUri(1);
    spider.AddPage(firstPage, spider.BaseUri);

    spider.Execute();
}
/// <summary>
/// Runs the "cota_camara" spider: starts at the parliamentary-quota index page and
/// processes results through the fetch-completed, fetch-failed and should-fetch handlers.
/// </summary>
public void Executar()
{
    // Default002 preset, a download delay of 0, and no automatic anchor navigation.
    var init = InitializationParams
        .Default002()
        .SetConfig(cfg => cfg.Set_DownloadDelay(0).Disable_AutoAnchorsLinks());

    var siteUri = new Uri("https://www.camara.leg.br");
    var spider = new SimpleSpider("cota_camara", siteUri, init);

    // Allow the crawl to leave the *.camara.leg.br domain.
    spider.Configuration.SpiderAllowHostViolation = true;

    // Entry point: the page listing all parliamentarians.
    var indexPage = new Uri("https://www.camara.leg.br/cota-parlamentar/index.jsp");
    spider.AddPage(indexPage, spider.BaseUri);

    // Page handling.
    spider.FetchCompleted += Spider_FetchCompleted;
    spider.FetchFailed += spider_FetchFailed;

    // Filter that skips some addresses.
    spider.ShouldFetch += spider_ShouldFetch;

    // Go.
    spider.Execute();
}
/// <summary>
/// Queues the address of an anchor on the spider.
/// </summary>
/// <param name="spider">Spider that receives the page</param>
/// <param name="anchor">Anchor whose address is fetched</param>
/// <param name="SourcePage">Uri of the page where the anchor was found; passed to <c>anchor.GetUri</c> and reported as the source</param>
/// <returns>Link object returned by the spider</returns>
public static Spider.Link AddPage(this SimpleSpider spider, Anchor anchor, Uri SourcePage)
{
    var target = anchor.GetUri(SourcePage);
    return spider.AddPage(target, SourcePage);
}
// Host fragments of portals whose receipt pages this crawler skips.
private static readonly string[] hostsNaoSuportados =
{
    ".rj.gov.", // uses POST
    ".es.gov.", // uses POST
    ".rr.gov.", // uses POST
    ".pb.gov.", // captcha
    ".ro.gov.", // captcha
    ".ma.gov.", // captcha
    ".ap.gov.", // redirect -> captcha
    ".pi.gov.", // internal error; may come back some day
};

/// <summary>
/// Handles a fetched receipt page: skips missing, empty or unsupported pages,
/// follows an embedded iframe when there is one, and otherwise counts pages
/// that mention "CPF".
/// </summary>
/// <param name="spider">Spider that receives the iframe address, when one is followed.</param>
/// <param name="args">Fetch result providing the HTML, the document and the link of the page.</param>
private void processaCupom(SimpleSpider spider, FetchCompleteEventArgs args)
{
    if (args.Html.Contains("Nota não encontrada"))
    {
        return;
    }

    if (args.Html == "")
    {
        return; // ??
    }

    var host = args.Link.Uri.Host;
    if (Array.Exists(hostsNaoSuportados, fragment => host.Contains(fragment)))
    {
        return;
    }

    if (args.Html.Contains("iframe"))
    {
        var frame = new Tag(args.GetDocument()).SelectTag<IFrame>();
        if (frame == null || string.IsNullOrWhiteSpace(frame.Src))
        {
            return; // No way to retrieve this one yet
        }

        // Resolve against the current page so a relative src no longer throws;
        // an absolute src is used unchanged.
        if (Uri.TryCreate(args.Link.Uri, frame.Src, out var frameUri))
        {
            spider.AddPage(frameUri, args.Link);
        }

        return;
    }

    if (args.Html.Contains("CPF"))
    {
        contagemCPF++;
    }

    // Pages mentioning only "CNPJ", and pages mentioning neither, are not processed yet.
}