/// <summary>
/// Sample: builds a <c>SimpleSpider</c> named "QuotesToScrape" from chained
/// <c>InitializationParams</c>, registers fetch and cache filters, and hooks
/// the fetch-completed callback.
/// </summary>
public static void run()
            var init = new InitializationParams()
                       .SetCacher(new ContentCacher())
                       .SetDownloader(new WebClientDownloader())
                       //.SetOfflineMode() // Remove all downloaders (sets a NullDownloader)
                       .SetConfig(c => c.Enable_Caching()
                                  .Enable_AutoAnchorsLinks() // enable automatic link following
            // NOTE(review): the SetConfig(...) call above is not closed in this excerpt; confirm against the original file

            // NOTE(review): the base Uri literal is empty in this excerpt
            var spider = new SimpleSpider("QuotesToScrape", new Uri(""), init);

            // Defines pages that should not be fetched
            spider.ShouldFetch += (_, args)
                                                                    // ShouldFetch args also support chaining to easily cancel resources
                                                                    // Note: the order of the chained calls is very important
                                  => args.CancelIfContains("/login")
                                  .CancelIfContains("/tag/")        // do not fetch tags
                                  .AllowIfContains("/tag/choices/") // I like to have choices =)
            // NOTE(review): the chained expression above has no terminating ';' in this excerpt

            // Defines pages (or ages) that should not use the local cache
            spider.ShouldUseCache += (_, args)
                                     // can be chained too
                                     => args.CancelIfOlderThan(new TimeSpan(24, 0, 0)); // TimeSpan(24, 0, 0) is 24 hours

            // Sets up the fetch completed callback
            spider.FetchCompleted += Spider_FetchCompleted;
            // execute (the execution call itself is not visible in this excerpt)
Ejemplo n.º 2
        /// <summary>
        /// Program entry point. In this excerpt only the comments that list the
        /// available samples remain; the sample calls themselves are not visible.
        /// </summary>
        static void Main(string[] args)
            // See some samples here
            // Uncomment one and use the debugger's Step Into feature (F11) to see it in action

            // Spider samples:
            //; // similar, but Json-based (API)

            // Helper samples
            //;    // Capture a form element and submit it
            //; // Easy fetch of single resources

            // Easy-parsing sample, use the debugger's Step Into feature (F11) to see it in action
            //; // Parse quotes with HObject

            // Storage engines

            // IgnoreMe file for internal experimentation
            //  Add a file named IgnoreMe.cs with a static void run() in it to play around with the spider


        /// <summary>
        /// Looks for the "&lt;!--NFE-API--&gt;" marker in the fetched HTML, takes the text
        /// from the first "http" after it up to the next double quote, and queues that
        /// address on the spider with the current link as its source.
        /// </summary>
        /// <param name="spider">Spider that receives the new page</param>
        /// <param name="args">Fetch result; its <c>Html</c> and <c>Link</c> are read</param>
        private void processaNF(SimpleSpider spider, FetchCompleteEventArgs args)
            if (args.Html.Contains("<!--NFE-API-->"))
                // Nothing to do yet
                // NOTE(review): the body of this 'if' is not visible in this excerpt; confirm the intended condition and action

            // Keep only the text starting at the marker
            var html = args.Html.Substring(args.Html.IndexOf("<!--NFE-API-->"));

            if (html.Contains(".location"))
                // Nothing to do yet
                // NOTE(review): the body of this 'if' is not visible in this excerpt either

            // Cut to the first "http" and read up to the next double quote
            html = html.Substring(html.IndexOf("http"));
            var uri = new Uri(html.Substring(0, html.IndexOf("\"")));

            spider.AddPage(uri, args.Link);
Ejemplo n.º 4
        // Similar to [RafaelEstevam.Simple.Spider.Test.Sample.BooksToScrape],
        //   see that sample for more in-depth coverage of the crawling part
        /// <summary>
        /// Sample: crawls "QuotesToScrape" with a <c>SQLiteStorage&lt;Quote&gt;</c> and then
        /// prints the stored quotes to the console.
        /// </summary>
        public static void run()
            // Creates a new storage instance
            var storage = new Storage.SQLiteStorage <Quote>();
            // set the spider to use it
            // NOTE(review): the initializer chain below is truncated in this excerpt
            var init = new InitializationParams()

            // NOTE(review): the constructor call below is truncated and the Uri literal is empty in this excerpt
            var spider = new SimpleSpider("QuotesToScrape",
                                          new Uri(""),

            Console.WriteLine($"The sqlite database is at {storage.DatabaseFilePath}");
            Console.WriteLine($"The quotes are being stored in the table {storage.TableNameOfT}");

            spider.FetchCompleted += spider_FetchCompleted;
            spider.ShouldFetch    += Spider_ShouldFetch;

            // Print the items whose "Author" is "Albert Einstein"
            Console.WriteLine("Quotes from Albert Einstein");
            foreach (Quote q in storage.GetItemsWith("Author", "Albert Einstein"))
                Console.WriteLine($"{q.Author}: {q.Text}");
            // Print every item, read through the spider's Storage property
            Console.WriteLine("All Quotes");
            foreach (Quote q in spider.Storage.RetrieveAllItems())
                Console.WriteLine($"{q.Author}: {q.Text}");
        /// <summary>
        /// Resets <c>Senadores</c>, reads the ids from the first &lt;select&gt; of a listing page,
        /// and queues one page per id on a "Senado.Leg" spider.
        /// </summary>
        /// <param name="ano">Value passed to <c>montarUriSenador</c> together with each id</param>
        public void Catalogar(int ano)
            Senadores = new List <Lib.Senado.Leg.Senador>();

            // get the listing
            // NOTE(review): the Uri literal is empty in this excerpt
            var page = FetchHelper.FetchResourceDocument(new Uri(""),
                                                         enableCaching: true);
            var select       = new Select(page.DocumentNode.SelectSingleNode("//select"));
            // Trimmed, non-empty item values (deferred LINQ query)
            var idsSenadores = select.GetItems()
                               .Select(id => id.Value.Trim())
                               .Where(id => !string.IsNullOrEmpty(id));

            // crawl the Senate website to collect the remaining data
            // NOTE(review): part of the initializer chain appears to be missing between the two lines below
            var init = InitializationParams
                       .SetConfig(c => c.Disable_AutoAnchorsLinks());
            // NOTE(review): the constructor call below is truncated in this excerpt
            var spider = new SimpleSpider("Senado.Leg",
                                          new System.Uri(""),

            spider.FetchCompleted += Spider_FetchCompleted;

            foreach (var i in idsSenadores)
                // NOTE(review): the AddPage call below is truncated in this excerpt
                spider.AddPage(montarUriSenador(ano, i),

        /// <summary>
        /// Handles a summary page: adds a <c>Deputado</c> built from the page header to
        /// <c>ListaDeputados</c> and queues the anchors found in the table rows.
        /// </summary>
        /// <param name="spider">Spider that receives the new pages</param>
        /// <param name="args">Fetch result; its document and <c>Link</c> are read</param>
        private void processaSumarizado(SimpleSpider spider, FetchCompleteEventArgs args)
            // catalog the deputy
            var tag = new Tag(args.GetDocument());

            // NOTE(review): the statement below has no terminator in this excerpt
            var id = args.Link.Uri.Query
                     .Split('&')[0] // first block
                     .Split('=')[1] // after the equals sign

            var h3 = tag.SelectTag("//h3[@class=\"header\"]");

            // NOTE(review): the braces and closing of the object initializer below are not visible in this excerpt
            ListaDeputados.Add(new Deputado()
                Id               = id,
                Nome             = h3.SelectTag <Anchor>().InnerText.Trim(),
                PartidoLideranca = h3.Node.ChildNodes[2].InnerText.Trim()

            // Load expenses
            var linhas = tag.SelectTags("//table//tr")
                         .Skip(1)      // Skip the header row
                         .SkipLast(1); // Skip the total row

            // One anchor per remaining row
            var dados = linhas
                        .Select(tr => tr.SelectTag <Anchor>());

            spider.AddPages(dados, args.Link);
Ejemplo n.º 7
        /// <summary>
        /// Sample: crawls "BooksToScrape", gathering links and items through
        /// <c>FetchCompleted</c> callbacks, then prints every collected book.
        /// </summary>
        public static void run()
            items = new List <BookData>();
            // NOTE(review): the Uri literal is empty in this excerpt
            var spider = new SimpleSpider("BooksToScrape", new Uri(""));

            // callback to gather links
            // NOTE(review): the braces of the lambda below are not visible in this excerpt
            spider.FetchCompleted += (s, a) =>
                // This callback can be replaced by:
                //  spider.Configuration.Auto_AnchorsLinks = true; (which is enabled by default)
                // and is here for demonstration purposes

                // Use a simple SubString-based split to get all "<a>" tags
                var links = AnchorHelper.GetAnchors(a.Link.Uri, a.Html);
                // Add the collected links to the queue
                (s as SimpleSpider).AddPages(links, a.Link);
            // callbacks to gather items
            spider.FetchCompleted += fetchCompleted_items_XPath;   // Sample using XPath
            spider.FetchCompleted += fetchCompleted_items_HObject; // Sample using HObject
            // Ignore (cancel) the pages containing "/reviews/"
            spider.ShouldFetch += (s, a) => { a.Cancel = a.Link.Uri.ToString().Contains("/reviews/"); };

            // execute from first page (the execution call itself is not visible in this excerpt)

            // List all books
            foreach (var b in items)
                Console.WriteLine($" > {b.Price:C2} {b.Title}");
Ejemplo n.º 8
        /// <summary>
        /// Parses the fetched content as JSON and raises <c>ParsedData</c> with the
        /// resulting <c>JObject</c>.
        /// </summary>
        /// <param name="spider">Spider that fetched the resource</param>
        /// <param name="FetchInfo">Fetch result whose <c>Html</c> is parsed</param>
        /// <remarks>
        /// When nothing is attached to <c>ParsedData</c> the call is skipped entirely,
        /// including the JSON parsing: the null-conditional operator does not evaluate
        /// the arguments when the receiver is null.
        /// </remarks>
        void IParserBase.Parse(SimpleSpider spider, FetchCompleteEventArgs FetchInfo)
            => ParsedData?.Invoke(spider, new ParserEventArgs <JObject>(FetchInfo: FetchInfo, Data: JObject.Parse(FetchInfo.Html)));
Ejemplo n.º 9
        /// <summary>
        /// Decodes the fetched bytes with <c>Encoding</c>, parses the text as XML and
        /// raises <c>ParsedData</c> with the resulting <c>XElement</c>.
        /// </summary>
        /// <param name="spider">Spider that fetched the resource</param>
        /// <param name="FetchInfo">Fetch result whose <c>Result</c> bytes are parsed</param>
        /// <remarks>
        /// When nothing is attached to <c>ParsedData</c> the call is skipped entirely,
        /// including decoding and XML parsing: the null-conditional operator does not
        /// evaluate the arguments when the receiver is null.
        /// </remarks>
        void IParserBase.Parse(SimpleSpider spider, FetchCompleteEventArgs FetchInfo)
            => ParsedData?.Invoke(spider, new ParserEventArgs <XElement>(FetchInfo, XElement.Parse(Encoding.GetString(FetchInfo.Result))));
Ejemplo n.º 10
        /// <summary>
        /// Sample: crawls "QuotesToScrape" using a <c>JsonDeserializeParser&lt;QuotesObject&gt;</c>
        /// that reports results to <c>parsedResult_event</c>.
        /// </summary>
        public static void run()
            // NOTE(review): the Uri literal is empty in this excerpt
            var spider = new SimpleSpider("QuotesToScrape", new Uri(""));

            // create a json parser for our QuotesObject class
            spider.Parsers.Add(new JsonDeserializeParser <QuotesObject>(parsedResult_event));
            // add first page /api/quotes?page={pageNo}
            spider.AddPage(buildPageUri(1), spider.BaseUri);
            // execute (the execution call itself is not visible in this excerpt)
Ejemplo n.º 11
        /// <summary>
        /// Sample: crawls "QuotesToScrape" and subscribes <c>json_ParsedData</c> to the
        /// first <c>JsonParser</c> found in the spider's parser list.
        /// </summary>
        public static void run()
            // NOTE(review): the Uri literal is empty in this excerpt
            var spider = new SimpleSpider("QuotesToScrape", new Uri(""));

            // add callback to json pages; First() throws if the list holds no JsonParser
            spider.Parsers.OfType <JsonParser>().First().ParsedData += json_ParsedData;
            // add first page
            spider.AddPage(buildPageUri(1), spider.BaseUri);
            // execute (the execution call itself is not visible in this excerpt)
        /// <summary>
        /// Handles an expense document page: reads the text of several "span" elements
        /// from the "ul &gt; .listaDefinicao" lists and adds a <c>Despesa</c> built from them
        /// to <c>ListaDespesas</c>.
        /// </summary>
        /// <param name="spider">Spider that fetched the page (not used in the visible lines)</param>
        /// <param name="args">Fetch result; its <c>Link</c> and HObject are read</param>
        private void processaDocumento(SimpleSpider spider, FetchCompleteEventArgs args)
            // NOTE(review): the statement below is truncated in this excerpt
            int idDeputado = args.Link.Uri.Query

            var hObj = args.GetHObject();
            var ULs  = hObj["ul > .listaDefinicao"].ToArray();

            // Fixed positions within the matched lists; index 0 is not used here
            var ulDados        = ULs[1];
            var ulFornecedor   = ULs[2];
            var ulValorDespesa = ULs[3];
            var ulValores2     = ULs[4];

            // NOTE(review): the statement below is truncated in this excerpt
            int codigoDespesa = args.Link.Uri.Query
            //string nomeDepsesa = ulDados["span"][1].Trim();

            // Document number, issue date and competence period, read by span position
            string numero      = ulDados["span"][5].Trim();
            string dtEmissao   = ulDados["span"][7].Trim();
            string competencia = ulDados["span"][9].Trim();

            // Supplier name and document (the variable name suggests a CNPJ; verify)
            string fornecedorNome = ulFornecedor["span"][1].Trim();
            string fornecedorCnpj = ulFornecedor["span"][5].Trim();

            // Monetary values, still as text at this point
            string valorDespesa = ulValorDespesa["span"][0].Trim();
            string deducoes     = ulValores2["span"][0].Trim();
            string glosas       = ulValores2["span"][1].Trim();
            string restituicoes = ulValores2["span"][2].Trim();
            string reembolso    = ulValorDespesa["span"][5].Trim();

            // NOTE(review): the braces and closing of the object initializer below are not visible in this excerpt
            ListaDespesas.Add(new Despesa()
                IdDeputado  = idDeputado,
                TipoDespesa = (TiposDespesa)codigoDespesa,
                Numero      = numero,

                DocumentoFornecedor = fornecedorCnpj,
                NomeFornecedor      = WebUtility.HtmlDecode(fornecedorNome),

                DataEmissao     = LocalizationHelper.ParseDatetime(dtEmissao),
                // "01/" is prepended to the competence text before it is parsed as a date
                DataCompetencia = LocalizationHelper.ParseDatetime("01/" + competencia),

                ValorDespesa = converteValor(valorDespesa),
                Deducoes     = converteValor(deducoes),
                Glosas       = converteValor(glosas),
                Restituicoes = converteValor(restituicoes),
                Reembolso    = converteValor(reembolso)
Ejemplo n.º 13
        /// <summary>
        /// Sample: crawls "BooksToScrape" with a <c>JsonLinesStorage</c> as the storage
        /// engine and skips pages whose address contains "/reviews/".
        /// </summary>
        public static void run()
            var iP = new InitializationParams()
                                                                  // Defines a storage engine
                                                                  // All stored items will be in the spider folder as JsonLines
                     .SetStorage(new Storage.JsonLinesStorage()); // JsonLines:

            // NOTE(review): the Uri literal is empty in this excerpt
            var spider = new SimpleSpider("BooksToScrape", new Uri(""), iP);

            // callback to gather items
            spider.FetchCompleted += fetchCompleted_items;
            // Ignore (cancel) the pages containing "/reviews/"
            spider.ShouldFetch += (s, a) => { a.Cancel = a.Link.Uri.ToString().Contains("/reviews/"); };
            // execute from first page (the execution call itself is not visible in this excerpt)
Ejemplo n.º 14
        /// <summary>
        /// Stores the given database in <c>db</c>, creates the "camara_gabinete" spider
        /// with automatic anchor links disabled, and queues its first page.
        /// </summary>
        /// <param name="database">Database instance assigned to <c>db</c></param>
        public static void run(SqliteDB database)
            db = database;
            // NOTE(review): the receiver of the two .Add<>() calls below is not visible in this excerpt
            .Add <PessoalModel>()
            .Add <Deputado>()

            // NOTE(review): the SetConfig(...) call below is not closed in this excerpt
            var init = InitializationParams.Default002()
                       .SetConfig(c => c.Disable_AutoAnchorsLinks()

            // NOTE(review): the Uri literals below are empty in this excerpt
            var spider = new SimpleSpider("camara_gabinete", new Uri(""), init);

            spider.FetchCompleted += Spider_FetchCompleted;
            spider.AddPage(new Uri(""), spider.BaseUri);
        /// <summary>
        /// Handles a detailed listing page: for each row, queues its anchor when the href
        /// points to an expense document or to an electronic invoice page.
        /// </summary>
        /// <param name="spider">Spider that receives the new pages</param>
        /// <param name="args">Fetch result; its document and <c>Link</c> are read</param>
        private void processaAnalitico(SimpleSpider spider, FetchCompleteEventArgs args)
            // NOTE(review): the row selection between the two lines below is missing in this excerpt
            var linhas = new Tag(args.GetDocument())
                         .SkipLast(1); // Skip the total row

            foreach (var linha in linhas)
                // First anchor below the current row
                var lnk = linha.SelectTag <Anchor>(".//a");
                if (lnk.Href.Contains("/documento?nuDeputadoId"))
                    spider.AddPage(lnk, args.Link);
                if (lnk.Href.Contains("/nota-fiscal-eletronica?"))
                    spider.AddPage(lnk, args.Link);
Ejemplo n.º 16
        /// <summary>
        /// Sample: crawls "QuotesToScrape" with a cacher, a downloader, a JSON
        /// deserializing parser and a SQLite storage, all set through chained
        /// <c>InitializationParams</c>.
        /// </summary>
        public static void run()
            var init = new InitializationParams()
                       .SetCacher(new ContentCacher())
                       .SetDownloader(new WebClientDownloader())
                                                                                                        // create a json parser for our QuotesObject class
                       .AddParser(new Parsers.JsonDeserializeParser <QuotesObject>(parsedResult_event)) // Received Json class
                                                                                                        // Adds a SQLite storage to keep all collected quotes
                       .SetStorage(new Storage.SQLiteStorage <Quote>())                                 // Single quote class
                       .SetConfig(c => c.Enable_Caching()
            // NOTE(review): the SetConfig(...) call above is not closed in this excerpt

            // NOTE(review): the Uri literal is empty in this excerpt
            var spider = new SimpleSpider("QuotesToScrape", new Uri(""), init);

            // add first page
            spider.AddPage(buildPageUri(1), spider.BaseUri);
            // execute (the execution call itself is not visible in this excerpt)
        /// <summary>
        /// Creates the "cota_camara" spider with automatic anchor links disabled, allows
        /// host violations, queues the first page and registers the fetch callbacks.
        /// </summary>
        public void Executar()
            var init = InitializationParams
                       .Default002()                            // Use the default configuration
                       .SetConfig(c => c.Set_DownloadDelay(000) // Wait between requests. NOTE(review): the original comment said 2s but the value passed is 000; confirm
                                  .Disable_AutoAnchorsLinks()); // Do not follow links automatically

            // NOTE(review): the Uri literals below are empty in this excerpt
            var spider = new SimpleSpider("cota_camara", new Uri(""), init);

            spider.Configuration.SpiderAllowHostViolation = true; // Allows leaving the domain *

            // Get all parliamentarians
            spider.AddPage(new Uri(""), spider.BaseUri);
            // Get pages
            spider.FetchCompleted += Spider_FetchCompleted;

            spider.FetchFailed += spider_FetchFailed;

            // Ignore some addresses
            spider.ShouldFetch += spider_ShouldFetch;
            // run it ... (the execution call itself is not visible in this excerpt)
Ejemplo n.º 18
 /// <summary>
 /// Queues every anchor of a sequence on the spider, one <c>AddPage</c> call per anchor
 /// </summary>
 /// <param name="spider">Spider that receives the pages</param>
 /// <param name="anchors">Anchors to queue</param>
 /// <param name="SourcePage">Uri where the anchors were found</param>
 /// <returns>Array with the Link returned for each anchor, in enumeration order</returns>
 public static Spider.Link[] AddPages(this SimpleSpider spider, IEnumerable <Anchor> anchors, Uri SourcePage)
     => anchors
        .Select(anchor => AddPage(spider, anchor, SourcePage))
        .ToArray();
Ejemplo n.º 19
 /// <summary>
 /// Queues a single anchor on the spider
 /// </summary>
 /// <param name="spider">Spider that receives the page</param>
 /// <param name="anchor">Anchor to queue; its address is obtained with <c>GetUri(SourcePage)</c></param>
 /// <param name="SourcePage">Uri where the anchor was found</param>
 /// <returns>Link object returned by the spider</returns>
 public static Spider.Link AddPage(this SimpleSpider spider, Anchor anchor, Uri SourcePage)
     => spider.AddPage(anchor.GetUri(SourcePage), SourcePage);
Ejemplo n.º 20
        /// <summary>
        /// Deserializes the fetched content into <typeparamref name="T"/> with
        /// <c>JsonConvert</c> and raises <c>ParsedData</c> with the result.
        /// </summary>
        /// <param name="spider">Spider that fetched the resource</param>
        /// <param name="fetchInfo">Fetch result whose <c>Html</c> is deserialized</param>
        /// <remarks>
        /// When nothing is attached to <c>ParsedData</c> the call is skipped entirely,
        /// including deserialization, instead of failing on a null delegate: the
        /// null-conditional operator does not evaluate the arguments when the receiver
        /// is null.
        /// </remarks>
        void IParserBase.Parse(SimpleSpider spider, FetchCompleteEventArgs fetchInfo)
            => ParsedData?.Invoke(spider, new ParserEventArgs <T>(fetchInfo, JsonConvert.DeserializeObject <T>(fetchInfo.Html)));
        private void processaCupom(SimpleSpider spider, FetchCompleteEventArgs args)
            if (args.Html.Contains("Nota não encontrada"))

            if (args.Html == "")
                return;                  // ??
            if (args.Link.Uri.Host.Contains(""))
                return;                                          // Usa POST
            if (args.Link.Uri.Host.Contains(""))
                return;                                          // Usa POST
            if (args.Link.Uri.Host.Contains(""))
                return;                                          // Usa POST
            if (args.Link.Uri.Host.Contains(""))
                return;                                          // Captcha
            if (args.Link.Uri.Host.Contains(""))
                return;                                          // Captcha
            if (args.Link.Uri.Host.Contains(""))
                return;                                          // Captcha
            if (args.Link.Uri.Host.Contains(""))
                return;                                          // Redirect -> Captcha
            if (args.Link.Uri.Host.Contains(""))
                return;                                          // Erro interno, um dia volta ?
            if (args.Html.Contains("iframe"))
                var frame = new Tag(args.GetDocument()).SelectTag <IFrame>();
                if (frame == null)
                    return; // Ainda não sei como pegar

                var newUri = frame.Src;
                spider.AddPage(new Uri(newUri), args.Link);

            if (args.Html.Contains("CPF"))
            else if (args.Html.Contains("CNPJ"))
                var ocorrencias = args.Html.Split("CNPJ");
                if (ocorrencias.Length > 2)