示例#1
0
        /// <summary>
        /// Downloads the page behind the first entry of <c>Links</c>, converts its HTML to text
        /// and stores the converted article on this instance.
        /// </summary>
        /// <remarks>
        /// On success this sets <c>ExternalLinks</c>, <c>ImageLinks</c>, <c>ArticleContent</c> and
        /// <c>IsLoaded</c>, then calls <c>Save()</c>. When <c>Links</c> is empty nothing happens.
        /// When the download fails, the failure is logged, <c>DisplayText</c> is set to
        /// <c>DisplayLine</c> followed by " ERROR." and the method returns without saving.
        /// </remarks>
        /// <param name="select">Value assigned to <c>HtmlToText.Select</c>.</param>
        /// <param name="filters">Values assigned to <c>HtmlToText.Filters</c>; may be <c>null</c>.</param>
        public void DownloadArticleContent(string select, string[] filters)
        {
            // Nothing to download without a link. Previously this case fell through and
            // failed with an out-of-range access on Links[0] further down.
            if (Links.Count == 0)
            {
                return;
            }

            var articleUri = Links[0].Uri;

            HtmlDocument doc = new HtmlDocument();
            try
            {
                HttpDownloader downloader = new HttpDownloader(articleUri.ToString(), null, Configuration.UserAgent);
                doc.LoadHtml(downloader.GetPage());
            }
            catch (Exception ex)
            {
                // A failed download is reported on the item itself and in the log; the
                // article is left unloaded so it is neither marked as loaded nor saved.
                this.DisplayText = this.DisplayLine + " ERROR.";
                logger.Error($"Could not load {articleUri}");
                logger.Error(ex);
                return;
            }

            HtmlToText conv = new HtmlToText()
            {
                Select        = select,
                Filters       = filters?.ToList(),
                LinkStartFrom = this.Links.Count
            };

            // Both collections are produced by ConvertHtml through its out parameters.
            Collection <Uri> links;
            Collection <Uri> images;

            var resultString = conv.ConvertHtml(doc.DocumentNode.OuterHtml,
                                                articleUri, out links, out images);

            // Collapse runs of blank/whitespace-only lines so the article text is more condensed.
            var cleanedContent = Regex.Replace(resultString, @"^\s+$[\r\n]*", "\r\n", RegexOptions.Multiline);

            ExternalLinks  = links;
            ImageLinks     = images;
            ArticleContent = cleanedContent;

            IsLoaded = true;
            Save();
        }
        /// <summary>
        /// Copies a site into blob storage: the "style.css" stylesheet and the other reference
        /// files go to the "scripts" container, the home page and the listed pages go to the
        /// "htmls" container, with content, links and stylesheet references rewritten on the way.
        /// </summary>
        /// <remarks>
        /// If none of <c>model.ReferenceFileURLs</c> contains "style.css", no stylesheet is
        /// uploaded and the pages keep their original stylesheet reference.
        /// </remarks>
        /// <param name="model">Import settings: storage account, base URL, page URLs, reference file URLs and replace sections.</param>
        /// <returns>A redirect to the "Index" action.</returns>
        public ActionResult Index(ImportContentViewModel model)
        {
            cleanModel(model);
            // The returned value is not used in this method; the call itself is kept.
            updateDatabase(model);

            var credential = $"DefaultEndpointsProtocol=https;AccountName={ model.StorageInfo.AccountName };AccountKey={ model.StorageInfo.StorageKey }";
            var storageAccount = CloudStorageAccount.Parse(credential);
            var blobClient = storageAccount.CreateCloudBlobClient();
            var scriptsContainer = blobClient.GetContainerReference("scripts");
            scriptsContainer.CreateIfNotExists();

            // Locate and download the stylesheet. If several URLs match, the last one wins.
            var styleText = string.Empty;
            var originalStyleUrl = string.Empty;
            var styleBlobName = string.Empty;
            foreach (var item in model.ReferenceFileURLs)
            {
                if (item.Contains("style.css"))
                {
                    styleText = new HttpDownloader(item).GetPage();
                    originalStyleUrl = item;
                    // Treat a query string as another path segment so "style.css?v=1" still
                    // yields a segment that contains the file name.
                    styleBlobName = item.Replace("?", "/")
                                        .Split(new char[] { '/' }, StringSplitOptions.RemoveEmptyEntries)
                                        .Last(element => element.Contains("style.css"));
                }
            }

            // Upload every other reference file and point the stylesheet at the uploaded copy.
            // NOTE(review): these files are fetched and re-encoded as text; if they are binary
            // fonts this may corrupt them - confirm what HttpDownloader.GetPage returns.
            foreach (var item in model.ReferenceFileURLs)
            {
                if (item.Contains("style.css"))
                {
                    continue;
                }

                var fontsText = new HttpDownloader(item).GetPage();
                var fontsName = item.Split(new char[] { '/' }, StringSplitOptions.RemoveEmptyEntries).LastOrDefault();

                using (var fileStream = GenerateStreamFromString(fontsText))
                {
                    var blockBlob = scriptsContainer.GetBlockBlobReference(fontsName);
                    blockBlob.UploadFromStream(fileStream);
                    styleText = replaceFontReference(styleText, fontsName, blockBlob.Uri.AbsoluteUri);
                }
            }

            // Without a stylesheet there is no blob name to upload under; an empty blob name
            // is rejected by the storage client, so the upload is skipped in that case.
            var hasStylesheet = !string.IsNullOrEmpty(styleBlobName);
            var publishedStyleUrl = string.Empty;
            if (hasStylesheet)
            {
                using (var fileStream = GenerateStreamFromString(styleText))
                {
                    var blockBlob = scriptsContainer.GetBlockBlobReference(styleBlobName);
                    blockBlob.Properties.ContentType = "text/css";
                    blockBlob.UploadFromStream(fileStream);
                    blockBlob.SetProperties();
                    publishedStyleUrl = blockBlob.Uri.AbsoluteUri;
                }
            }

            var htmlContainer = blobClient.GetContainerReference("htmls");
            htmlContainer.CreateIfNotExists();

            // Create home page
            var homeRaw = new HttpDownloader($"{model.BaseURL}{model.HomePageURL}").GetPage();
            var homeHtml = replaceContent(homeRaw, model.ReplaceSections);
            homeHtml = replaceLink(homeHtml, model.BaseURL, model.PagesURLs);
            if (hasStylesheet)
            {
                homeHtml = replaceStyleReference(homeHtml, originalStyleUrl, publishedStyleUrl);
            }

            using (var fileStream = GenerateStreamFromString(homeHtml))
            {
                var blockBlob = htmlContainer.GetBlockBlobReference(HomePageName);
                blockBlob.Properties.ContentType = "text/html";
                blockBlob.UploadFromStream(fileStream);
                blockBlob.SetProperties();
            }

            // Other pages: same rewriting, stored under the last path segment of the page URL.
            foreach (var item in model.PagesURLs)
            {
                var pageRaw = new HttpDownloader($"{model.BaseURL}{item}").GetPage();
                var pageHtml = replaceContent(pageRaw, model.ReplaceSections);
                pageHtml = replaceLink(pageHtml, model.BaseURL, model.PagesURLs);
                if (hasStylesheet)
                {
                    pageHtml = replaceStyleReference(pageHtml, originalStyleUrl, publishedStyleUrl);
                }

                var fileName = item.Split(new char[] { '/' }, StringSplitOptions.RemoveEmptyEntries).LastOrDefault();

                using (var fileStream = GenerateStreamFromString(pageHtml))
                {
                    var blockBlob = htmlContainer.GetBlockBlobReference(fileName);
                    blockBlob.Properties.ContentType = "text/html";
                    blockBlob.UploadFromStream(fileStream);
                    blockBlob.SetProperties();
                }
            }

            return RedirectToAction("Index");
        }
示例#3
0
        /// <summary>
        /// For each stored <c>Estante</c>, downloads its result pages with a producer task and
        /// parses the books on them with a consumer task, handing pages over through a bounded
        /// <c>BlockingCollection</c>.
        /// </summary>
        /// <remarks>
        /// NOTE(review): only the first part of this method is visible; the text stops inside
        /// the consumer task, so how the tasks are awaited and how the try block ends is not
        /// documented here.
        /// </remarks>
        private static void RetrieveDataEstante()
        {
            Logger.Setup();
            _logger = LogManager.GetLogger(typeof(Program));

            // Both repositories share the same context instance.
            ScrapaContext     scrapaContext     = new ScrapaContext();
            EstanteRepository estanteRepository = new EstanteRepository(scrapaContext);
            LivroRepository   livroRepository   = new LivroRepository(scrapaContext);


            //ScrapTipoEstantes(estanteRepository);
            string baseUrl = "https://www.estantevirtual.com.br";
            // NOTE(review): only shelves with IdEstante above 38 are processed; presumably this
            // resumes an earlier run - confirm before reuse.
            IEnumerable <Estante> estantes = estanteRepository.GetAll().Where(e => e.IdEstante > 38).ToList();

            foreach (Estante estante in estantes)
            {
                Console.WriteLine($@"Starting with estante {estante.Nome}");
                _logger.Info($"Starting with Estante {estante.Nome}");
                try
                {
                    // Separate sources: the consumer cancels the producer after too many
                    // errors, and cancels itself once the producer is cancelled and the
                    // queue is empty.
                    CancellationTokenSource producerCancellationToken = new CancellationTokenSource();
                    CancellationTokenSource consumerCancellationToken = new CancellationTokenSource();

                    // Hand-over queue between producer and consumer, bounded to 250 pages.
                    BlockingCollection <HtmlNode> pageCollection = new BlockingCollection <HtmlNode>(250);
                    // Producer: downloads up to 350 pages, at most 3 at a time.
                    Task producerThread = Task.Factory.StartNew(() =>
                    {
                        // Shared page counter; starts at -1 so the first increment yields 0.
                        int i = -1;
                        Parallel.For(0, 350,
                                     new ParallelOptions()
                        {
                            MaxDegreeOfParallelism = 3, CancellationToken = producerCancellationToken.Token
                        },
                                     (iPage) =>
                        {
                            // NOTE(review): execution continues after CompleteAdding(); the
                            // Add call further down would then be made on a completed
                            // collection - confirm this is intended.
                            if (producerCancellationToken.Token.IsCancellationRequested)
                            {
                                pageCollection.CompleteAdding();
                            }

                            // The increment is atomic, but `i` is read again below without
                            // synchronization, so concurrent iterations may observe a value
                            // that already includes other iterations' increments.
                            Interlocked.Increment(ref i);
                            //for (int iPage = 0; iPage < 250; ++iPage)
                            Console.WriteLine($"Retriving {i}.");
                            _logger.Info($"Retriving {i}.");
                            HtmlAgilityPack.HtmlDocument getPage = new HtmlAgilityPack.HtmlDocument();
                            // The shared counter `i`, not the loop argument `iPage`, is used as the offset.
                            HttpDownloader htmlDownloader        =
                                new HttpDownloader($"{estante.Link}&offset={i}", null, null);
                            getPage.LoadHtml(htmlDownloader.GetPage());

                            pageCollection.Add(getPage.DocumentNode);

                            // NOTE(review): the value is only logged; no sleep call follows in this lambda.
                            int sleepTime = new Random(DateTime.Now.Millisecond).Next(5, 50);
                            Console.WriteLine(
                                $@"Sleeping for {sleepTime}. Page {i}. is ready. OnThread: {
                                            Thread.CurrentThread.ManagedThreadId
                                        })");
                            _logger.Info(
                                $"Sleeping for {sleepTime}. Page {i}. is ready (OnThread: {Thread.CurrentThread.ManagedThreadId})");
                        });

                        // Signal the consumer that no more pages will arrive.
                        pageCollection.CompleteAdding();
                    });

                    // Consumer: takes pages off the queue and extracts books, at most 3 pages at a time.
                    Task consumerThread = Task.Factory.StartNew(() =>
                    {
                        // Counters shared by the parallel iterations; updated with Interlocked.
                        long iPage      = 0;
                        long errorCount = 0;

                        Parallel.For(0, 350, new ParallelOptions()
                        {
                            MaxDegreeOfParallelism = 3, CancellationToken = consumerCancellationToken.Token
                        }, (i) =>
                        {
                            consumerCancellationToken.Token.ThrowIfCancellationRequested();

                            Interlocked.Increment(ref iPage);

                            try
                            {
                                // Wait up to 30 seconds for a page.
                                // NOTE(review): when the wait fails and the producer was not
                                // cancelled, execution falls through with `item` unset (null),
                                // so SelectNodes below throws and the catch block counts an error.
                                if (pageCollection.TryTake(out HtmlNode item, TimeSpan.FromSeconds(30)) == false)
                                {
                                    if (producerCancellationToken.Token.IsCancellationRequested && pageCollection.Count == 0)
                                    {
                                        consumerCancellationToken.Cancel();
                                        return;
                                    }
                                }
                                ;

                                // Anchors under the element whose class contains 'js-busca-resultados'.
                                HtmlNodeCollection books =
                                    item.SelectNodes("//*[contains(@class, 'js-busca-resultados')]/a");
                                ConcurrentBag <HtmlNode> booksConcurrentBag =
                                    new ConcurrentBag <HtmlNode>(books.ToList());

                                // New books found on this page, filled concurrently below.
                                ConcurrentBag <Livro> livros = new ConcurrentBag <Livro>();

                                Parallel.ForEach(
                                    booksConcurrentBag,
                                    new ParallelOptions {
                                    MaxDegreeOfParallelism = 20
                                },
                                    book =>
                                {
                                    // Title and author are read from children of the 'breslt-header' node.
                                    // NOTE(review): FirstOrDefault results are dereferenced without a
                                    // null check; a missing node throws inside this lambda.
                                    HtmlNode header =
                                        book.ChildNodes.FirstOrDefault(
                                            d => d.Attributes.Contains("class") &&
                                            d.Attributes["class"].Value.Contains("breslt-header"));

                                    string bookName = header
                                                      .ChildNodes
                                                      .FirstOrDefault(d => d.Attributes.Contains("class")
                                                                      &&
                                                                      d.Attributes["class"]
                                                                      .Value.Contains("busca-title"))
                                                      .InnerText;

                                    string bookAuthor = header
                                                        .ChildNodes
                                                        .FirstOrDefault(d => d.Attributes.Contains("class")
                                                                        &&
                                                                        d.Attributes["class"]
                                                                        .Value.Contains("busca-author"))
                                                        .InnerText;

                                    // Skip books the repository already reports as existing.
                                    if (livroRepository.Exists(bookName, bookAuthor))
                                    {
                                        return;
                                    }

                                    // Skip books already collected from this page (case-insensitive name and author).
                                    // NOTE(review): check-then-add is not atomic across the parallel
                                    // iterations, so duplicates on one page may still slip through.
                                    if (livros.Any(b =>
                                                   (string.Compare(bookName, b.Nome,
                                                                   StringComparison.InvariantCultureIgnoreCase) == 0)
                                                   &&
                                                   (string.Compare(bookAuthor, b.Autor,
                                                                   StringComparison.InvariantCultureIgnoreCase) == 0)))
                                    {
                                        return;
                                    }

                                    var url = book.Attributes["href"];
                                    Uri uri = new Uri(baseUrl + url.Value);

                                    Livro livro            = new Livro();
                                    livro.IdEstante        = estante.IdEstante;
                                    livro.Autor            = bookAuthor;
                                    livro.Nome             = bookName;
                                    // The id is parsed from the fifth URI segment of the book link
                                    // (assumes that segment is numeric - confirm against the site's URL layout).
                                    livro.IdEstanteVirtual = long.Parse(uri.Segments[4]);

                                    // Sales statistics are optional; the fields stay unset when none are returned.
                                    PriceStatistics pricestats =
                                        new PriceStatisticsService().Get(livro.IdEstanteVirtual);
                                    if (pricestats != null)
                                    {
                                        livro.QtdVendida       = pricestats.QtdVendida;
                                        livro.DtUltimaVenda    = pricestats.DtUltimaVenda;
                                        livro.ValorMaxVenda    = pricestats.ValorMaxVenda;
                                        livro.ValorMedioVenda  = pricestats.ValorMedioVenda;
                                        livro.ValorMinVenda    = pricestats.ValorMinVenda;
                                        livro.ValorUltimaVenda = pricestats.ValorUltimaVenda;
                                    }

                                    livros.Add(livro);
                                });
                                // Store the books collected from this page in one call.
                                livroRepository.AddThreadSafe(livros);

                                // Short random pause (50-99 ms) before this worker takes the next page.
                                int sleep = new Random(DateTime.Now.Millisecond).Next(50, 100);
                                Console.WriteLine($@"Sleeping for {sleep}. Page {iPage}. is finished");
                                _logger.Info($"Sleeping for {sleep}. Page {iPage}. is finished");
                                Console.WriteLine($@"Livros {livros.Count} gravados");
                                Thread.Sleep(sleep);
                            }
                            catch (Exception ex)
                            {
                                // The count is printed before it is incremented, so the first error reports 0.
                                Console.WriteLine($@"Erro numero --> {Interlocked.Read(ref errorCount)}) <-- na Estante {estante.Nome}, na página {Interlocked.Read(ref iPage)} ERROR: {ex.Message}");
                                Interlocked.Increment(ref errorCount);
                                _logger.Fatal($"Erro fatal na estante {estante.Nome}. Página {iPage}", ex);
                                Console.WriteLine($@"Erro fatal na estante {estante.Nome}. Página {iPage}");
                                // After more than 20 errors for this shelf, stop the producer.
                                if (Interlocked.Read(ref errorCount) > 20)
                                {
                                    producerCancellationToken.Cancel();
                                }
                            }
                        });
示例#4
0
        /// <summary>
        /// Imports a site into blob storage. The "style.css" stylesheet and the remaining
        /// reference files are uploaded to the "scripts" container; the home page and every
        /// page in <c>model.PagesURLs</c> are rewritten and uploaded to the "htmls" container.
        /// </summary>
        /// <remarks>
        /// When no entry of <c>model.ReferenceFileURLs</c> contains "style.css", the stylesheet
        /// upload and the stylesheet-reference rewrite are skipped.
        /// </remarks>
        /// <param name="model">Import settings: storage account, base URL, page URLs, reference file URLs and replace sections.</param>
        /// <returns>A redirect to the "Index" action.</returns>
        public ActionResult Index(ImportContentViewModel model)
        {
            cleanModel(model);
            // Called for its effect only; the value it returns is not needed below.
            updateDatabase(model);

            var credential     = $"DefaultEndpointsProtocol=https;AccountName={ model.StorageInfo.AccountName };AccountKey={ model.StorageInfo.StorageKey }";
            var storageAccount = CloudStorageAccount.Parse(credential);
            var blobClient     = storageAccount.CreateCloudBlobClient();
            var blobRef        = blobClient.GetContainerReference("scripts");

            blobRef.CreateIfNotExists();

            var styleText         = string.Empty;
            var referenceOldName  = string.Empty;
            var referenceBlobName = string.Empty;

            // Find the stylesheet among the reference files; the last match is the one used.
            foreach (var item in model.ReferenceFileURLs)
            {
                if (!item.Contains("style.css"))
                {
                    continue;
                }

                var referHttp = new HttpDownloader(item);
                styleText         = referHttp.GetPage();
                referenceOldName  = item;
                // '?' is turned into '/' so a query string does not end up in the blob name's segment search.
                referenceBlobName = item.Replace("?", "/").Split(new char[] { '/' }, StringSplitOptions.RemoveEmptyEntries).Last(element => element.Contains("style.css"));
            }

            // Upload the other reference files and rewrite their references inside the stylesheet.
            // NOTE(review): content is downloaded as text before upload; verify this is safe
            // for binary files such as fonts.
            foreach (var item in model.ReferenceFileURLs)
            {
                if (!item.Contains("style.css"))
                {
                    var fontsHttp = new HttpDownloader(item);
                    var fontsText = fontsHttp.GetPage();
                    var fontsName = item.Split(new char[] { '/' }, StringSplitOptions.RemoveEmptyEntries).LastOrDefault();

                    using (var fileStream = GenerateStreamFromString(fontsText))
                    {
                        var blockBlob = blobRef.GetBlockBlobReference(fontsName);
                        blockBlob.UploadFromStream(fileStream);
                        var url = blockBlob.Uri.AbsoluteUri;
                        styleText = replaceFontReference(styleText, fontsName, url);
                    }
                }
            }

            // An empty blob name is not accepted by the storage client, so the stylesheet is
            // only uploaded (and later referenced) when one was actually found.
            var stylesheetFound  = referenceBlobName.Length > 0;
            var referenceNewName = string.Empty;

            if (stylesheetFound)
            {
                using (var fileStream = GenerateStreamFromString(styleText))
                {
                    var blockBlob = blobRef.GetBlockBlobReference(referenceBlobName);
                    blockBlob.Properties.ContentType = "text/css";
                    blockBlob.UploadFromStream(fileStream);
                    blockBlob.SetProperties();
                    referenceNewName = blockBlob.Uri.AbsoluteUri;
                }
            }

            blobRef = blobClient.GetContainerReference("htmls");
            blobRef.CreateIfNotExists();

            // Create home page
            var http         = new HttpDownloader($"{model.BaseURL}{model.HomePageURL}");
            var rawText      = http.GetPage();
            var replacedText = replaceContent(rawText, model.ReplaceSections);

            replacedText = replaceLink(replacedText, model.BaseURL, model.PagesURLs);
            if (stylesheetFound)
            {
                replacedText = replaceStyleReference(replacedText, referenceOldName, referenceNewName);
            }

            using (var fileStream = GenerateStreamFromString(replacedText))
            {
                var blockBlob = blobRef.GetBlockBlobReference(HomePageName);
                blockBlob.Properties.ContentType = "text/html";
                blockBlob.UploadFromStream(fileStream);
                blockBlob.SetProperties();
            }

            // Other pages: each is stored under the last path segment of its URL.
            foreach (var item in model.PagesURLs)
            {
                http         = new HttpDownloader($"{model.BaseURL}{item}");
                rawText      = http.GetPage();
                replacedText = replaceContent(rawText, model.ReplaceSections);
                replacedText = replaceLink(replacedText, model.BaseURL, model.PagesURLs);
                if (stylesheetFound)
                {
                    replacedText = replaceStyleReference(replacedText, referenceOldName, referenceNewName);
                }

                var fileName = item.Split(new char[] { '/' }, StringSplitOptions.RemoveEmptyEntries).LastOrDefault();

                using (var fileStream = GenerateStreamFromString(replacedText))
                {
                    var blockBlob = blobRef.GetBlockBlobReference(fileName);
                    blockBlob.Properties.ContentType = "text/html";
                    blockBlob.UploadFromStream(fileStream);
                    blockBlob.SetProperties();
                }
            }

            return RedirectToAction("Index");
        }