public static void run()
        {
            // Initialization: content cache + WebClient downloader + tuned configuration
            var initParams = new InitializationParams()
                             .SetCacher(new ContentCacher())
                             .SetDownloader(new WebClientDownloader())
                             //.SetOfflineMode() // Remove all downloaders (sets a NullDownloader)
                             .SetConfig(cfg => cfg.Enable_Caching()
                                                  .Disable_Cookies()
                                                  .Enable_AutoAnchorsLinks() // follow anchor links automatically
                                                  .Set_CachingNoLimit()
                                                  .Set_DownloadDelay(5000));

            var quotesSpider = new SimpleSpider("QuotesToScrape", new Uri("http://quotes.toscrape.com/"), initParams);

            // Filter which pages get fetched.
            // ShouldFetch args support chaining to easily cancel resources;
            // note that the evaluation order matters.
            quotesSpider.ShouldFetch += (_, args) =>
                args.CancelIfContains("/login")
                    .CancelIfContains("/tag/")         // skip all tag pages...
                    .AllowIfContains("/tag/choices/"); // ...except this one (I like to have choices =)

            // Decide which pages may be served from the local cache (age-based; chainable too)
            quotesSpider.ShouldUseCache += (_, args) =>
                args.CancelIfOlderThan(new TimeSpan(24, 0, 0));

            // Callback invoked after every successful fetch
            quotesSpider.FetchCompleted += Spider_FetchCompleted;
            // Start crawling
            quotesSpider.Execute();
        }
Example #2
0
        // Entry point: a menu of runnable samples.
        // Uncomment one of the calls below and use the Debug Step Into feature (F11)
        // to follow the spider in action.
        static void Main(string[] args)
        {
            // Spider samples:
            //Sample.BooksToScrape.run();
            //Sample.QuotesToScrape_Chaining.run();
            //Sample.QuotesToScrapeAPI_Chaining.run(); // similar, but Json-based (API)
            //Sample.QuotesToScrapeAPI_Scroll.run();
            //Sample.QuotesToScrapeAPI_Scroll_Deserialize.run();

            // Helper samples:
            //Sample.QuotesToScrape_Login.run();    // Captures a form element and submits it
            //Sample.ApiPooler_FetcherHelper.run(); // Easy fetch of single resources

            // Easy-parsing sample, use the Debug Step Into feature (F11) to see the action:
            //Sample.QuotesToScrape_HObject.run(); // Parses quotes with HObject

            // Storage engines:
            //Sample.BooksToScrape_StorageJsonLines.run();
            //ModulesSamples.Storage_Sqlite_Quotes.run();

            // IgnoreMe file for internal experimentation:
            //  Add a file named IgnoreMe.cs with a static void run() on it to play around with the spider
            //IgnoreMe.run();

            SimpleSpider.HowToUse_PrintToConsole();

            Console.WriteLine("-END-");
            Console.ReadKey();
        }
        /// <summary>
        /// Processes an electronic invoice (NF-e) page: locates the redirect Url
        /// embedded after the "NFE-API" marker and queues it for fetching.
        /// </summary>
        /// <param name="spider">Spider used to enqueue the extracted page</param>
        /// <param name="args">Fetch result containing the page Html</param>
        private void processaNF(SimpleSpider spider, FetchCompleteEventArgs args)
        {
            // Guard clauses replace the original empty-if/else blocks
            if (!args.Html.Contains("<!--NFE-API-->"))
            {
                return; // Nothing to do yet
            }

            var html = args.Html.Substring(args.Html.IndexOf("<!--NFE-API-->"));

            if (!html.Contains(".location"))
            {
                return; // Nothing to do yet
            }

            // The redirect target is the first quoted "http..." Url after the marker
            html = html.Substring(html.IndexOf("http"));
            var uri = new Uri(html.Substring(0, html.IndexOf("\"")));

            listaNotasAcessar.Add(uri);
            spider.AddPage(uri, args.Link);
        }
        // Similar to [RafaelEstevam.Simple.Spider.Test.Sample.BooksToScrape],
        //   which covers the crawling part in more depth
        public static void run()
        {
            // Create the SQLite storage engine and tell the spider to use it
            var sqliteStorage = new Storage.SQLiteStorage<Quote>();
            var initParams = new InitializationParams()
                             .SetStorage(sqliteStorage);

            var quotesSpider = new SimpleSpider("QuotesToScrape",
                                                new Uri("http://quotes.toscrape.com/"),
                                                initParams);

            Console.WriteLine($"The sqlite database is at {sqliteStorage.DatabaseFilePath}");
            Console.WriteLine($"The quotes are being stored in the table {sqliteStorage.TableNameOfT}");

            quotesSpider.FetchCompleted += spider_FetchCompleted;
            quotesSpider.ShouldFetch += Spider_ShouldFetch;
            quotesSpider.Execute();

            // Query the storage for a single author...
            Console.WriteLine("Quotes from Albert Einstein");
            foreach (Quote einsteinQuote in sqliteStorage.GetItemsWith("Author", "Albert Einstein"))
            {
                Console.WriteLine($"{einsteinQuote.Author}: {einsteinQuote.Text}");
            }
            // ...then dump everything that was collected
            Console.WriteLine("All Quotes");
            foreach (Quote storedQuote in quotesSpider.Storage.RetrieveAllItems())
            {
                Console.WriteLine($"{storedQuote.Author}: {storedQuote.Text}");
            }
        }
        /// <summary>
        /// Builds the senator catalog for a given year by walking the senate website.
        /// </summary>
        /// <param name="ano">Year to collect data for</param>
        public void Catalogar(int ano)
        {
            Senadores = new List<Lib.Senado.Leg.Senador>();

            // Fetch the listing page containing every senator
            var listingPage = FetchHelper.FetchResourceDocument(new Uri("https://www25.senado.leg.br/web/transparencia/sen/"),
                                                                enableCaching: true);
            var senatorSelect = new Select(listingPage.DocumentNode.SelectSingleNode("//select"));
            var senatorIds = senatorSelect.GetItems()
                                          .Select(item => item.Value.Trim())
                                          .Where(value => !string.IsNullOrEmpty(value));

            // Walk the senate site collecting the remaining data
            var initParams = InitializationParams
                             .Default002()
                             .SetConfig(cfg => cfg.Disable_AutoAnchorsLinks());
            var senateSpider = new SimpleSpider("Senado.Leg",
                                                new System.Uri("https://www6g.senado.leg.br/"),
                                                initParams);

            senateSpider.FetchCompleted += Spider_FetchCompleted;

            foreach (var senatorId in senatorIds)
            {
                senateSpider.AddPage(montarUriSenador(ano, senatorId),
                                     senateSpider.BaseUri);
            }

            senateSpider.Execute();
        }
        /// <summary>
        /// Catalogs a deputy from the summarized expenses page and queues the
        /// per-expense detail links found in the page's table.
        /// </summary>
        private void processaSumarizado(SimpleSpider spider, FetchCompleteEventArgs args)
        {
            var doc = new Tag(args.GetDocument());

            // Deputy id comes from the query string: first block, value after '='
            var deputyId = args.Link.Uri.Query
                           .Split('&')[0]
                           .Split('=')[1]
                           .ToInt();

            var header = doc.SelectTag("//h3[@class=\"header\"]");

            ListaDeputados.Add(new Deputado()
            {
                Id               = deputyId,
                Nome             = header.SelectTag<Anchor>().InnerText.Trim(),
                PartidoLideranca = header.Node.ChildNodes[2].InnerText.Trim()
            });

            // Load expenses: skip the header row and the trailing totals row
            var expenseRows = doc.SelectTags("//table//tr")
                                 .Skip(1)
                                 .SkipLast(1);

            var expenseLinks = expenseRows.Select(row => row.SelectTag<Anchor>());

            spider.AddPages(expenseLinks, args.Link);
        }
        public static void run()
        {
            items = new List<BookData>();
            var bookSpider = new SimpleSpider("BooksToScrape", new Uri("http://books.toscrape.com/"));

            // Link-gathering callback.
            // Equivalent to: spider.Configuration.Auto_AnchorsLinks = true (enabled by default);
            // shown here for demonstration purposes only.
            bookSpider.FetchCompleted += (sender, evt) =>
            {
                // Simple SubString-based split that extracts every "<a>" tag
                var anchors = AnchorHelper.GetAnchors(evt.Link.Uri, evt.Html);
                // Queue everything that was found
                (sender as SimpleSpider).AddPages(anchors, evt.Link);
            };
            // Item-gathering callbacks
            bookSpider.FetchCompleted += fetchCompleted_items_XPath;   // XPath-based sample
            bookSpider.FetchCompleted += fetchCompleted_items_HObject; // HObject-based sample
            // Skip (cancel) pages whose address contains "/reviews/"
            bookSpider.ShouldFetch += (sender, evt) => { evt.Cancel = evt.Link.Uri.ToString().Contains("/reviews/"); };

            // Start crawling from the first page
            bookSpider.Execute();

            // Print every collected book
            foreach (var book in items)
            {
                Console.WriteLine($" > {book.Price:C2} {book.Title}");
            }
        }
Example #8
0
        /// <summary>
        /// Parses the fetched content as a Json object and raises ParsedData.
        /// </summary>
        /// <param name="spider">Spider that fetched the resource</param>
        /// <param name="FetchInfo">Fetch result whose Html is parsed as Json</param>
        void IParserBase.Parse(SimpleSpider spider, FetchCompleteEventArgs FetchInfo)
        {
            // Null-conditional invoke removes the check-then-call race on the event;
            // arguments (including JObject.Parse) are only evaluated when a handler exists
            ParsedData?.Invoke(spider, new ParserEventArgs<JObject>(FetchInfo: FetchInfo, Data: JObject.Parse(FetchInfo.Html)));
        }
Example #9
0
        /// <summary>
        /// Parses the fetched content as Xml and raises ParsedData.
        /// </summary>
        /// <param name="spider">Spider that fetched the resource</param>
        /// <param name="FetchInfo">Fetch result whose raw bytes are decoded and parsed as Xml</param>
        void IParserBase.Parse(SimpleSpider spider, FetchCompleteEventArgs FetchInfo)
        {
            // Null-conditional invoke removes the check-then-call race on the event;
            // arguments (including XElement.Parse) are only evaluated when a handler exists
            ParsedData?.Invoke(spider, new ParserEventArgs<XElement>(FetchInfo, XElement.Parse(Encoding.GetString(FetchInfo.Result))));
        }
Example #10
0
        public static void run()
        {
            var quotesSpider = new SimpleSpider("QuotesToScrape", new Uri("http://quotes.toscrape.com/"));

            // Register a json parser bound to the QuotesObject class
            quotesSpider.Parsers.Add(new JsonDeserializeParser<QuotesObject>(parsedResult_event));
            // Seed the queue with page 1: /api/quotes?page={pageNo}
            quotesSpider.AddPage(buildPageUri(1), quotesSpider.BaseUri);
            // Run
            quotesSpider.Execute();
        }
Example #11
0
        public static void run()
        {
            var quotesSpider = new SimpleSpider("QuotesToScrape", new Uri("http://quotes.toscrape.com/"));

            // Hook a callback on every json page the spider parses
            quotesSpider.Parsers.OfType<JsonParser>().First().ParsedData += json_ParsedData;
            // Seed the queue with the first page
            quotesSpider.AddPage(buildPageUri(1), quotesSpider.BaseUri);
            // Run
            quotesSpider.Execute();
        }
        /// <summary>
        /// Parses an expense-document detail page and stores a new Despesa.
        /// Relies on the page's fixed layout: four "listaDefinicao" lists with
        /// values at hard-coded span positions.
        /// </summary>
        /// <param name="spider">Spider that fetched the page (unused here)</param>
        /// <param name="args">Fetch result; the link's query string carries the ids</param>
        private void processaDocumento(SimpleSpider spider, FetchCompleteEventArgs args)
        {
            // Deputy id: first query-string block, value after '='
            int idDeputado = args.Link.Uri.Query
                             .Split('&')[0]
                             .Split('=')[1]
                             .ToInt();

            var hObj = args.GetHObject();
            var ULs  = hObj["ul > .listaDefinicao"].ToArray();

            // NOTE(review): indices assume the page always renders at least five
            // "listaDefinicao" lists in this exact order — verify against the live page
            var ulDados        = ULs[1];
            var ulFornecedor   = ULs[2];
            var ulValorDespesa = ULs[3];
            var ulValores2     = ULs[4];

            // Expense-type code: fourth query-string block, value after '='
            int codigoDespesa = args.Link.Uri.Query
                                .Split('&')[3]
                                .Split('=')[1]
                                .ToInt();
            //string nomeDepsesa = ulDados["span"][1].Trim();

            // Document data (span positions are layout-dependent)
            string numero      = ulDados["span"][5].Trim();
            string dtEmissao   = ulDados["span"][7].Trim();
            string competencia = ulDados["span"][9].Trim();

            // Supplier data
            string fornecedorNome = ulFornecedor["span"][1].Trim();
            string fornecedorCnpj = ulFornecedor["span"][5].Trim();

            // Monetary values (still localized strings at this point)
            string valorDespesa = ulValorDespesa["span"][0].Trim();
            string deducoes     = ulValores2["span"][0].Trim();
            string glosas       = ulValores2["span"][1].Trim();
            string restituicoes = ulValores2["span"][2].Trim();
            string reembolso    = ulValorDespesa["span"][5].Trim();

            ListaDespesas.Add(new Despesa()
            {
                IdDeputado  = idDeputado,
                TipoDespesa = (TiposDespesa)codigoDespesa,
                Numero      = numero,

                DocumentoFornecedor = fornecedorCnpj,
                NomeFornecedor      = WebUtility.HtmlDecode(fornecedorNome),

                // "competencia" looks like "MM/yyyy"; "01/" is prefixed to make a full date
                DataEmissao     = LocalizationHelper.ParseDatetime(dtEmissao),
                DataCompetencia = LocalizationHelper.ParseDatetime("01/" + competencia),

                ValorDespesa = converteValor(valorDespesa),
                Deducoes     = converteValor(deducoes),
                Glosas       = converteValor(glosas),
                Restituicoes = converteValor(restituicoes),
                Reembolso    = converteValor(reembolso)
            });
        }
Example #13
0
        public static void run()
        {
            // Configure a storage engine: every stored item goes to the spider
            // folder as JsonLines (https://jsonlines.org/)
            var initParams = new InitializationParams()
                             .SetStorage(new Storage.JsonLinesStorage());

            var bookSpider = new SimpleSpider("BooksToScrape", new Uri("http://books.toscrape.com/"), initParams);

            // Item-gathering callback
            bookSpider.FetchCompleted += fetchCompleted_items;
            // Skip (cancel) pages whose address contains "/reviews/"
            bookSpider.ShouldFetch += (sender, evt) => { evt.Cancel = evt.Link.Uri.ToString().Contains("/reviews/"); };
            // Start crawling from the first page
            bookSpider.Execute();
        }
        public static void run(SqliteDB database)
        {
            db = database;
            // Make sure the tables exist before crawling
            db.CreateTables()
              .Add<PessoalModel>()
              .Add<Deputado>()
              .Commit();

            var initParams = InitializationParams.Default002()
                             .SetConfig(cfg => cfg.Disable_AutoAnchorsLinks()
                                                  .Set_DownloadDelay(0));

            var chamberSpider = new SimpleSpider("camara_gabinete", new Uri("https://www.camara.leg.br/"), initParams);

            chamberSpider.FetchCompleted += Spider_FetchCompleted;
            chamberSpider.AddPage(new Uri("https://www.camara.leg.br/deputados/quem-sao"), chamberSpider.BaseUri);
            chamberSpider.Execute();
        }
        /// <summary>
        /// Processes an analytic expenses table: queues the document and
        /// electronic-invoice links found on each table row.
        /// </summary>
        /// <param name="spider">Spider used to enqueue the found links</param>
        /// <param name="args">Fetch result containing the table page</param>
        private void processaAnalitico(SimpleSpider spider, FetchCompleteEventArgs args)
        {
            var linhas = new Tag(args.GetDocument())
                         .SelectTags("//table/tbody/tr")
                         .SkipLast(1); // Skips the totals row

            foreach (var linha in linhas)
            {
                var lnk = linha.SelectTag<Anchor>(".//a");
                if (lnk == null)
                {
                    continue; // Row without an anchor; SelectTag returns null when not found
                }

                if (lnk.Href.Contains("/documento?nuDeputadoId"))
                {
                    spider.AddPage(lnk, args.Link);
                }
                // https://www.camara.leg.br/cota-parlamentar/nota-fiscal-eletronica?ideDocumentoFiscal=0000000
                if (lnk.Href.Contains("/nota-fiscal-eletronica?"))
                {
                    spider.AddPage(lnk, args.Link);
                }
            }
        }
Example #16
0
        public static void run()
        {
            var initParams = new InitializationParams()
                             .SetCacher(new ContentCacher())
                             .SetDownloader(new WebClientDownloader())
                             // Json parser bound to the QuotesObject class (received Json)
                             .AddParser(new Parsers.JsonDeserializeParser<QuotesObject>(parsedResult_event))
                             // SQLite storage keeps every collected quote (single Quote class)
                             .SetStorage(new Storage.SQLiteStorage<Quote>())
                             .SetConfig(cfg => cfg.Enable_Caching()
                                                  .Disable_Cookies()
                                                  .Disable_AutoAnchorsLinks()
                                                  .Set_CachingNoLimit()
                                                  .Set_DownloadDelay(5000));

            var quotesSpider = new SimpleSpider("QuotesToScrape", new Uri("http://quotes.toscrape.com/"), initParams);

            // Seed the queue with the first page
            quotesSpider.AddPage(buildPageUri(1), quotesSpider.BaseUri);
            // Run
            quotesSpider.Execute();
        }
        /// <summary>
        /// Crawls the chamber's parliamentary-quota site collecting expense pages.
        /// </summary>
        public void Executar()
        {
            var init = InitializationParams
                       .Default002()                             // Use default configuration
                       .SetConfig(c => c.Set_DownloadDelay(0)    // No delay between requests
                                                                 // (the original comment claimed "2s", but the value was 000)
                                        .Disable_AutoAnchorsLinks()); // Do not auto-follow anchors

            var spider = new SimpleSpider("cota_camara", new Uri("https://www.camara.leg.br"), init);

            spider.Configuration.SpiderAllowHostViolation = true; // Allow leaving the *.camara.leg.br domain

            // Seed: page listing all parliamentarians
            spider.AddPage(new Uri("https://www.camara.leg.br/cota-parlamentar/index.jsp"), spider.BaseUri);
            // Collect pages
            spider.FetchCompleted += Spider_FetchCompleted;

            spider.FetchFailed += spider_FetchFailed;

            // Skip some addresses
            spider.ShouldFetch += spider_ShouldFetch;
            // Go
            spider.Execute();
        }
Example #18
0
 /// <summary>
 /// Adds a collection of anchors to the fetch queue
 /// </summary>
 /// <param name="spider">Spider to add the anchors to</param>
 /// <param name="anchors">Anchors to fetch</param>
 /// <param name="SourcePage">Uri where the anchors were found</param>
 /// <returns>Array of the created Links</returns>
 public static Spider.Link[] AddPages(this SimpleSpider spider, IEnumerable <Anchor> anchors, Uri SourcePage)
 {
     var links = new List<Spider.Link>();
     foreach (var anchor in anchors)
     {
         links.Add(AddPage(spider, anchor, SourcePage));
     }
     return links.ToArray();
 }
Example #19
0
 /// <summary>
 /// Adds a single anchor to the fetch queue
 /// </summary>
 /// <param name="spider">Spider to add the anchor to</param>
 /// <param name="anchor">Anchor to fetch</param>
 /// <param name="SourcePage">Uri where the anchor was found</param>
 /// <returns>The created Link</returns>
 public static Spider.Link AddPage(this SimpleSpider spider, Anchor anchor, Uri SourcePage)
 {
     // Resolve the anchor against the page it was found on, then enqueue
     var target = anchor.GetUri(SourcePage);
     return spider.AddPage(target, SourcePage);
 }
Example #20
0
        /// <summary>
        /// Deserializes the fetched Json into T and raises ParsedData.
        /// </summary>
        /// <param name="spider">Spider that fetched the resource</param>
        /// <param name="fetchInfo">Fetch result whose Html is deserialized</param>
        void IParserBase.Parse(SimpleSpider spider, FetchCompleteEventArgs fetchInfo)
        {
            // Missing in the original: siblings guard on ParsedData == null.
            // Also skips the deserialization work when nobody is listening.
            if (ParsedData == null)
            {
                return;
            }

            var result = JsonConvert.DeserializeObject<T>(fetchInfo.Html);

            ParsedData(spider, new ParserEventArgs<T>(fetchInfo, result));
        }
        /// <summary>
        /// Processes a fetched fiscal-coupon page: skips unsupported state portals,
        /// queues iframe-wrapped pages for a second fetch, and counts CPF pages.
        /// </summary>
        /// <param name="spider">Spider used to enqueue iframe targets</param>
        /// <param name="args">Fetch result containing the page Html</param>
        private void processaCupom(SimpleSpider spider, FetchCompleteEventArgs args)
        {
            if (args.Html.Contains("Nota não encontrada"))
            {
                return;
            }

            if (args.Html == "")
            {
                return; // ??
            }

            // State portals that cannot be processed yet (POST-only, captcha, ...)
            if (hostNaoSuportado(args.Link.Uri.Host))
            {
                return;
            }

            if (args.Html.Contains("iframe"))
            {
                // Content lives inside an iframe: queue its source instead
                var frame = new Tag(args.GetDocument()).SelectTag<IFrame>();
                if (frame == null)
                {
                    return; // Don't know how to extract this one yet
                }

                var newUri = frame.Src;
                spider.AddPage(new Uri(newUri), args.Link);
                return;
            }

            if (args.Html.Contains("CPF"))
            {
                contagemCPF++;
            }
            // NOTE(review): the original also split the Html on "CNPJ" and checked
            // the occurrence count, but all branches were empty — dead code removed.
        }

        // True when the host belongs to a state portal that cannot be scraped yet
        private static bool hostNaoSuportado(string host)
        {
            // Uses POST
            if (host.Contains(".rj.gov.") || host.Contains(".es.gov.") || host.Contains(".rr.gov."))
            {
                return true;
            }
            // Captcha
            if (host.Contains(".pb.gov.") || host.Contains(".ro.gov.") || host.Contains(".ma.gov."))
            {
                return true;
            }
            if (host.Contains(".ap.gov."))
            {
                return true; // Redirect -> Captcha
            }
            if (host.Contains(".pi.gov."))
            {
                return true; // Internal error; may come back one day?
            }
            return false;
        }