예제 #1
0
        /// <summary>
        /// Crawls http://books.toscrape.com/ with a <c>SimpleSpider</c>, registers the link-gathering
        /// and item-gathering callbacks, and prints every entry of <c>items</c> once the crawl ends.
        /// </summary>
        public static void run()
        {
            items = new List<BookData>();
            var spider = new SimpleSpider("BooksToScrape", new Uri("http://books.toscrape.com/"));

            // Callback to gather links.
            // According to the sample's own notes, this callback can be replaced by
            //   spider.Configuration.Auto_AnchorsLinks = true; (which is enabled by default)
            // and is kept here for demonstration purposes only.
            spider.FetchCompleted += (s, a) =>
            {
                // Collect the anchors ("<a>" tags) found in the fetched HTML
                var links = AnchorHelper.GetAnchors(a.Link.Uri, a.Html);

                // Add the collected links to the queue.
                // A direct cast is used instead of "as": if the sender is ever not a SimpleSpider
                // this fails with a descriptive InvalidCastException rather than a
                // NullReferenceException on the member access.
                ((SimpleSpider)s).AddPages(links, a.Link);
            };

            // Callbacks to gather items
            spider.FetchCompleted += fetchCompleted_items_XPath;   // sample using XPath
            spider.FetchCompleted += fetchCompleted_items_HObject; // sample using HObject

            // Ignore (cancel) the pages containing "/reviews/"
            spider.ShouldFetch += (s, a) => { a.Cancel = a.Link.Uri.ToString().Contains("/reviews/"); };

            // Execute from the first page
            spider.Execute();

            // List all books
            foreach (var b in items)
            {
                Console.WriteLine($" > {b.Price:C2} {b.Title}");
            }
        }
        /// <summary>
        /// Resets <c>Senadores</c>, reads the option values of the first <c>select</c> element on the
        /// Senate transparency page, queues one page per non-empty value and runs the spider.
        /// </summary>
        /// <param name="ano">Year forwarded to <c>montarUriSenador</c> when building each page address.</param>
        public void Catalogar(int ano)
        {
            Senadores = new List<Lib.Senado.Leg.Senador>();

            // Fetch the listing page (with caching enabled)
            var listingUri  = new Uri("https://www25.senado.leg.br/web/transparencia/sen/");
            var listingPage = FetchHelper.FetchResourceDocument(listingUri, enableCaching: true);

            // Trimmed, non-empty values of the items exposed by the first <select> on the page
            var senatorSelect = new Select(listingPage.DocumentNode.SelectSingleNode("//select"));
            var senatorIds = from option in senatorSelect.GetItems()
                             select option.Value.Trim() into id
                             where !string.IsNullOrEmpty(id)
                             select id;

            // Walk the Senate site to obtain the remaining data; automatic anchor following is disabled
            var initParams = InitializationParams.Default002()
                                                 .SetConfig(cfg => cfg.Disable_AutoAnchorsLinks());
            var spider = new SimpleSpider("Senado.Leg", new System.Uri("https://www6g.senado.leg.br/"), initParams);

            spider.FetchCompleted += Spider_FetchCompleted;

            // Queue one page per id, using the spider's base address as the source
            foreach (var senatorId in senatorIds)
            {
                spider.AddPage(montarUriSenador(ano, senatorId), spider.BaseUri);
            }

            spider.Execute();
        }
예제 #3
0
        // Similar to [RafaelEstevam.Simple.Spider.Test.Sample.BooksToScrape],
        //   see for more in depth cover of the crawling part
        /// <summary>
        /// Crawls http://quotes.toscrape.com/ using a <c>SQLiteStorage</c> of <c>Quote</c> as the
        /// spider storage, then prints the quotes whose "Author" is "Albert Einstein" followed by
        /// every stored quote.
        /// </summary>
        public static void run()
        {
            // Create the storage instance and hand it to the spider through the initialization parameters
            var quoteStorage = new Storage.SQLiteStorage<Quote>();
            var initParams   = new InitializationParams().SetStorage(quoteStorage);
            var spider       = new SimpleSpider("QuotesToScrape", new Uri("http://quotes.toscrape.com/"), initParams);

            // Report where the data is being kept
            Console.WriteLine($"The sqlite database is at {quoteStorage.DatabaseFilePath}");
            Console.WriteLine($"The quotes are being stored in the table {quoteStorage.TableNameOfT}");

            // Wire the callbacks and crawl
            spider.FetchCompleted += spider_FetchCompleted;
            spider.ShouldFetch    += Spider_ShouldFetch;
            spider.Execute();

            // Query the storage directly, filtering on the "Author" column
            Console.WriteLine("Quotes from Albert Einstein");
            foreach (Quote quote in quoteStorage.GetItemsWith("Author", "Albert Einstein"))
            {
                Console.WriteLine($"{quote.Author}: {quote.Text}");
            }

            // Retrieve everything through the storage exposed by the spider
            Console.WriteLine("All Quotes");
            foreach (Quote quote in spider.Storage.RetrieveAllItems())
            {
                Console.WriteLine($"{quote.Author}: {quote.Text}");
            }
        }
        /// <summary>
        /// Crawls http://quotes.toscrape.com/ with an explicitly configured cacher, downloader and
        /// configuration, registering chained <c>ShouldFetch</c> and <c>ShouldUseCache</c> rules
        /// before executing the spider.
        /// </summary>
        public static void run()
        {
            var initParams = new InitializationParams()
                .SetCacher(new ContentCacher())
                .SetDownloader(new WebClientDownloader())
                // Per the sample's notes, .SetOfflineMode() would remove all downloaders (sets a NullDownloader)
                .SetConfig(cfg => cfg.Enable_Caching()
                                     .Disable_Cookies()
                                     .Enable_AutoAnchorsLinks() // enable automatic link following
                                     .Set_CachingNoLimit()
                                     .Set_DownloadDelay(5000));

            var spider = new SimpleSpider("QuotesToScrape", new Uri("http://quotes.toscrape.com/"), initParams);

            // Define the pages that should not be fetched.
            // ShouldFetch args support chaining to cancel resources easily.
            // Note: the order of the chained rules is very important.
            spider.ShouldFetch += (sender, e) =>
                e.CancelIfContains("/login")
                 .CancelIfContains("/tag/")         // do not fetch tags
                 .AllowIfContains("/tag/choices/"); // ...except this one ("I like to have choices")

            // Define the pages (or cache ages) that should not use the local cache; can be chained too.
            // 24 hours, same value as new TimeSpan(24, 0, 0).
            spider.ShouldUseCache += (sender, e) => e.CancelIfOlderThan(TimeSpan.FromHours(24));

            // Set up the fetch-completed callback
            spider.FetchCompleted += Spider_FetchCompleted;

            // Execute
            spider.Execute();
        }
예제 #5
0
        /// <summary>
        /// Crawls http://quotes.toscrape.com/ starting from the address returned by
        /// <c>buildPageUri(1)</c>, handling parsed JSON through <c>json_ParsedData</c>.
        /// </summary>
        public static void run()
        {
            var spider = new SimpleSpider("QuotesToScrape", new Uri("http://quotes.toscrape.com/"));

            // Hook the callback on the first JsonParser registered in the spider
            // (First() throws if no JsonParser is present)
            var jsonParser = spider.Parsers.OfType<JsonParser>().First();
            jsonParser.ParsedData += json_ParsedData;

            // Queue the first page, then run
            spider.AddPage(buildPageUri(1), spider.BaseUri);
            spider.Execute();
        }
예제 #6
0
        /// <summary>
        /// Crawls http://quotes.toscrape.com/ with a <c>JsonDeserializeParser</c> for
        /// <c>QuotesObject</c> added to the spider, starting from <c>buildPageUri(1)</c>.
        /// </summary>
        public static void run()
        {
            var spider = new SimpleSpider("QuotesToScrape", new Uri("http://quotes.toscrape.com/"));

            // Create a JSON parser for our QuotesObject class and register it
            var quotesParser = new JsonDeserializeParser<QuotesObject>(parsedResult_event);
            spider.Parsers.Add(quotesParser);

            // Queue the first page: /api/quotes?page={pageNo}
            spider.AddPage(buildPageUri(1), spider.BaseUri);

            // Run the crawl
            spider.Execute();
        }
예제 #7
0
        /// <summary>
        /// Crawls http://books.toscrape.com/ with a <c>JsonLinesStorage</c> as the storage engine,
        /// gathering items in <c>fetchCompleted_items</c> and skipping pages containing "/reviews/".
        /// </summary>
        public static void run()
        {
            // Define a storage engine.
            // Per the sample's notes, all stored items will be in the spider folder as JsonLines
            // (JsonLines: https://jsonlines.org/)
            var initParams = new InitializationParams().SetStorage(new Storage.JsonLinesStorage());

            var spider = new SimpleSpider("BooksToScrape", new Uri("http://books.toscrape.com/"), initParams);

            // Callback to gather items
            spider.FetchCompleted += fetchCompleted_items;

            // Ignore (cancel) the pages containing "/reviews/"
            spider.ShouldFetch += (sender, args) =>
            {
                var address = args.Link.Uri.ToString();
                args.Cancel = address.Contains("/reviews/");
            };

            // Execute from the first page
            spider.Execute();
        }
예제 #8
0
        /// <summary>
        /// Stores <paramref name="database"/> in <c>db</c>, creates the tables for
        /// <c>PessoalModel</c> and <c>Deputado</c>, then crawls the "quem-sao" deputies page of
        /// www.camara.leg.br with automatic anchor following disabled.
        /// </summary>
        /// <param name="database">Database instance kept in <c>db</c> and used to create the tables.</param>
        public static void run(SqliteDB database)
        {
            db = database;

            // Create the tables required by this crawl
            db.CreateTables()
              .Add<PessoalModel>()
              .Add<Deputado>()
              .Commit();

            // Default parameters, with automatic anchor links disabled and a download delay of 0
            var initParams = InitializationParams
                .Default002()
                .SetConfig(cfg => cfg.Disable_AutoAnchorsLinks()
                                     .Set_DownloadDelay(0));

            var spider = new SimpleSpider("camara_gabinete", new Uri("https://www.camara.leg.br/"), initParams);

            spider.FetchCompleted += Spider_FetchCompleted;

            // Seed the crawl with the deputies listing page and run
            spider.AddPage(new Uri("https://www.camara.leg.br/deputados/quem-sao"), spider.BaseUri);
            spider.Execute();
        }
예제 #9
0
        /// <summary>
        /// Crawls http://quotes.toscrape.com/ with everything supplied through
        /// <c>InitializationParams</c>: cacher, downloader, a JSON parser for <c>QuotesObject</c>,
        /// a SQLite storage for <c>Quote</c> and the spider configuration.
        /// </summary>
        public static void run()
        {
            var initParams = new InitializationParams()
                .SetCacher(new ContentCacher())
                .SetDownloader(new WebClientDownloader())
                // Create a JSON parser for our QuotesObject class (the received JSON class)
                .AddParser(new Parsers.JsonDeserializeParser<QuotesObject>(parsedResult_event))
                // Add a SQLite storage to keep all collected quotes (single quote class)
                .SetStorage(new Storage.SQLiteStorage<Quote>())
                .SetConfig(cfg => cfg.Enable_Caching()
                                     .Disable_Cookies()
                                     .Disable_AutoAnchorsLinks()
                                     .Set_CachingNoLimit()
                                     .Set_DownloadDelay(5000));

            var spider = new SimpleSpider("QuotesToScrape", new Uri("http://quotes.toscrape.com/"), initParams);

            // Queue the first page
            spider.AddPage(buildPageUri(1), spider.BaseUri);

            // Run the crawl
            spider.Execute();
        }
        /// <summary>
        /// Crawls the parliamentary-quota index page of www.camara.leg.br with automatic anchor
        /// following disabled and host violations allowed, wiring the fetch-completed,
        /// fetch-failed and should-fetch handlers before executing.
        /// </summary>
        public void Executar()
        {
            var initParams = InitializationParams
                .Default002() // use the default settings
                // Download delay is set to 0.
                // NOTE(review): an earlier comment claimed a 2 s wait between requests — confirm the intended delay.
                .SetConfig(cfg => cfg.Set_DownloadDelay(0)
                                     .Disable_AutoAnchorsLinks()); // do not wander off following links

            var spider = new SimpleSpider("cota_camara", new Uri("https://www.camara.leg.br"), initParams);

            // Allow leaving the *.camara.leg.br domain
            spider.Configuration.SpiderAllowHostViolation = true;

            // Seed page used to obtain all the parliamentarians
            spider.AddPage(new Uri("https://www.camara.leg.br/cota-parlamentar/index.jsp"), spider.BaseUri);

            // Handle fetched pages
            spider.FetchCompleted += Spider_FetchCompleted;

            // Handle failed fetches
            spider.FetchFailed += spider_FetchFailed;

            // Ignore some addresses
            spider.ShouldFetch += spider_ShouldFetch;

            // Go!
            spider.Execute();
        }