/// <summary>
/// Downloads the page behind the first entry of <c>Links</c>, converts the HTML to text
/// and stores the result on this item.
/// </summary>
/// <remarks>
/// On success the method assigns <c>ExternalLinks</c>, <c>ImageLinks</c> and
/// <c>ArticleContent</c>, sets <c>IsLoaded</c> to <c>true</c> and calls <c>Save()</c>.
/// If downloading or loading the HTML throws, the exception is logged, <c>DisplayText</c>
/// is set to <c>DisplayLine</c> plus " ERROR." and the method returns without saving.
/// If <c>Links</c> is empty the method returns without doing anything.
/// </remarks>
/// <param name="select">
/// Value assigned to <c>HtmlToText.Select</c> (presumably the selector of the article
/// node; confirm against <c>HtmlToText</c>).
/// </param>
/// <param name="filters">
/// Values copied into <c>HtmlToText.Filters</c>; may be <c>null</c>, in which case
/// <c>Filters</c> is set to <c>null</c>.
/// </param>
public void DownloadArticleContent(string select, string[] filters)
{
    // Everything below reads Links[0]; with no link there is nothing to download and
    // indexing the empty collection would throw.
    if (Links.Count == 0)
    {
        return;
    }

    HtmlDocument doc = new HtmlDocument();
    try
    {
        HttpDownloader downloader = new HttpDownloader(Links[0].Uri.ToString(), null, Configuration.UserAgent);
        doc.LoadHtml(downloader.GetPage());
    }
    catch (Exception ex)
    {
        // Best effort: flag the item in the UI, log the failure and leave the item unloaded.
        this.DisplayText = this.DisplayLine + " ERROR.";
        logger.Error($"Could not load {Links[0].Uri}");
        logger.Error(ex);
        return;
    }

    HtmlToText conv = new HtmlToText()
    {
        Select = select,
        Filters = filters?.ToList(),
        // Passes the current number of links to the converter
        // (presumably so numbering of extracted links continues after them; confirm).
        LinkStartFrom = this.Links.Count
    };

    // Both collections are produced by ConvertHtml through its out parameters.
    Collection<Uri> links;
    Collection<Uri> images;
    var resultString = conv.ConvertHtml(doc.DocumentNode.OuterHtml, Links[0].Uri, out links, out images);

    // Collapse runs of whitespace-only lines into a single line break to condense the text.
    var cleanedContent = Regex.Replace(resultString, @"^\s+$[\r\n]*", "\r\n", RegexOptions.Multiline);

    ExternalLinks = links;
    ImageLinks = images;
    ArticleContent = cleanedContent;
    IsLoaded = true;
    Save();
}
/// <summary>
/// Copies the site described by <paramref name="model"/> into blob storage and redirects
/// back to the <c>Index</c> action.
/// </summary>
/// <remarks>
/// Steps, in order:
/// 1. <c>cleanModel</c> and <c>updateDatabase</c> are called with the model.
/// 2. In the "scripts" container: every reference URL containing "style.css" is downloaded
///    (the last one wins); every other reference URL is downloaded, uploaded under its last
///    path segment, and the stylesheet text is rewritten through <c>replaceFontReference</c>;
///    finally the stylesheet itself is uploaded as "text/css".
/// 3. In the "htmls" container: the home page and every entry of <c>PagesURLs</c> are
///    downloaded, passed through <c>replaceContent</c>, <c>replaceLink</c> and
///    <c>replaceStyleReference</c>, and uploaded as "text/html".
/// </remarks>
/// <param name="model">Import settings: storage credentials, base URL, page URLs, reference file URLs and replace sections.</param>
/// <returns>A redirect to the <c>Index</c> action.</returns>
public ActionResult Index(ImportContentViewModel model)
{
    const string styleSheetMarker = "style.css";
    char[] pathSeparators = { '/' };

    cleanModel(model);
    // The value returned by updateDatabase is not used by this action.
    updateDatabase(model);

    var connectionString = $"DefaultEndpointsProtocol=https;AccountName={ model.StorageInfo.AccountName };AccountKey={ model.StorageInfo.StorageKey }";
    var storageAccount = CloudStorageAccount.Parse(connectionString);
    var blobClient = storageAccount.CreateCloudBlobClient();

    var container = blobClient.GetContainerReference("scripts");
    container.CreateIfNotExists();

    var styleText = string.Empty;
    var styleSheetSourceUrl = string.Empty;
    var styleSheetBlobName = string.Empty;

    // Pass 1: fetch the stylesheet. string.Contains is an ordinal match, which is what a
    // URL token needs (the parameterless IndexOf overload is culture-sensitive).
    foreach (var item in model.ReferenceFileURLs)
    {
        if (!item.Contains(styleSheetMarker))
        {
            continue;
        }

        styleText = new HttpDownloader(item).GetPage();
        styleSheetSourceUrl = item;
        // Treat the query string as a path part so the blob name is the segment that
        // actually carries "style.css".
        styleSheetBlobName = item.Replace("?", "/")
            .Split(pathSeparators, StringSplitOptions.RemoveEmptyEntries)
            .Last(segment => segment.Contains(styleSheetMarker));
    }

    // Pass 2: upload every other referenced file and point the stylesheet at the copies.
    // NOTE(review): the files are downloaded and re-uploaded as text; if they are binary
    // fonts this may not preserve their bytes - confirm against HttpDownloader.
    foreach (var item in model.ReferenceFileURLs)
    {
        if (item.Contains(styleSheetMarker))
        {
            continue;
        }

        var assetText = new HttpDownloader(item).GetPage();
        var assetName = item.Split(pathSeparators, StringSplitOptions.RemoveEmptyEntries).LastOrDefault();
        using (var assetStream = GenerateStreamFromString(assetText))
        {
            var assetBlob = container.GetBlockBlobReference(assetName);
            assetBlob.UploadFromStream(assetStream);
            styleText = replaceFontReference(styleText, assetName, assetBlob.Uri.AbsoluteUri);
        }
    }

    // NOTE(review): if no reference URL contained "style.css" the blob name is still
    // empty here - confirm how GetBlockBlobReference should be expected to react.
    string styleSheetUrl;
    using (var styleStream = GenerateStreamFromString(styleText))
    {
        var styleBlob = container.GetBlockBlobReference(styleSheetBlobName);
        styleBlob.Properties.ContentType = "text/css";
        styleBlob.UploadFromStream(styleStream);
        styleBlob.SetProperties();
        styleSheetUrl = styleBlob.Uri.AbsoluteUri;
    }

    container = blobClient.GetContainerReference("htmls");
    container.CreateIfNotExists();

    // Home page
    var homeRaw = new HttpDownloader($"{model.BaseURL}{model.HomePageURL}").GetPage();
    var homeHtml = replaceContent(homeRaw, model.ReplaceSections);
    homeHtml = replaceLink(homeHtml, model.BaseURL, model.PagesURLs);
    homeHtml = replaceStyleReference(homeHtml, styleSheetSourceUrl, styleSheetUrl);
    using (var homeStream = GenerateStreamFromString(homeHtml))
    {
        var homeBlob = container.GetBlockBlobReference(HomePageName);
        homeBlob.Properties.ContentType = "text/html";
        homeBlob.UploadFromStream(homeStream);
        homeBlob.SetProperties();
    }

    // Other pages: same pipeline, stored under the last path segment of each URL.
    foreach (var item in model.PagesURLs)
    {
        var pageRaw = new HttpDownloader($"{model.BaseURL}{item}").GetPage();
        var pageHtml = replaceContent(pageRaw, model.ReplaceSections);
        pageHtml = replaceLink(pageHtml, model.BaseURL, model.PagesURLs);
        pageHtml = replaceStyleReference(pageHtml, styleSheetSourceUrl, styleSheetUrl);

        var pageBlobName = item.Split(pathSeparators, StringSplitOptions.RemoveEmptyEntries).LastOrDefault();
        using (var pageStream = GenerateStreamFromString(pageHtml))
        {
            var pageBlob = container.GetBlockBlobReference(pageBlobName);
            pageBlob.Properties.ContentType = "text/html";
            pageBlob.UploadFromStream(pageStream);
            pageBlob.SetProperties();
        }
    }

    return RedirectToAction("Index");
}
private static void RetrieveDataEstante() { Logger.Setup(); _logger = LogManager.GetLogger(typeof(Program)); ScrapaContext scrapaContext = new ScrapaContext(); EstanteRepository estanteRepository = new EstanteRepository(scrapaContext); LivroRepository livroRepository = new LivroRepository(scrapaContext); //ScrapTipoEstantes(estanteRepository); string baseUrl = "https://www.estantevirtual.com.br"; IEnumerable <Estante> estantes = estanteRepository.GetAll().Where(e => e.IdEstante > 38).ToList(); foreach (Estante estante in estantes) { Console.WriteLine($@"Starting with estante {estante.Nome}"); _logger.Info($"Starting with Estante {estante.Nome}"); try { CancellationTokenSource producerCancellationToken = new CancellationTokenSource(); CancellationTokenSource consumerCancellationToken = new CancellationTokenSource(); BlockingCollection <HtmlNode> pageCollection = new BlockingCollection <HtmlNode>(250); Task producerThread = Task.Factory.StartNew(() => { int i = -1; Parallel.For(0, 350, new ParallelOptions() { MaxDegreeOfParallelism = 3, CancellationToken = producerCancellationToken.Token }, (iPage) => { if (producerCancellationToken.Token.IsCancellationRequested) { pageCollection.CompleteAdding(); } Interlocked.Increment(ref i); //for (int iPage = 0; iPage < 250; ++iPage) Console.WriteLine($"Retriving {i}."); _logger.Info($"Retriving {i}."); HtmlAgilityPack.HtmlDocument getPage = new HtmlAgilityPack.HtmlDocument(); HttpDownloader htmlDownloader = new HttpDownloader($"{estante.Link}&offset={i}", null, null); getPage.LoadHtml(htmlDownloader.GetPage()); pageCollection.Add(getPage.DocumentNode); int sleepTime = new Random(DateTime.Now.Millisecond).Next(5, 50); Console.WriteLine( $@"Sleeping for {sleepTime}. Page {i}. is ready. OnThread: { Thread.CurrentThread.ManagedThreadId })"); _logger.Info( $"Sleeping for {sleepTime}. Page {i}. 
is ready (OnThread: {Thread.CurrentThread.ManagedThreadId})"); }); pageCollection.CompleteAdding(); }); Task consumerThread = Task.Factory.StartNew(() => { long iPage = 0; long errorCount = 0; Parallel.For(0, 350, new ParallelOptions() { MaxDegreeOfParallelism = 3, CancellationToken = consumerCancellationToken.Token }, (i) => { consumerCancellationToken.Token.ThrowIfCancellationRequested(); Interlocked.Increment(ref iPage); try { if (pageCollection.TryTake(out HtmlNode item, TimeSpan.FromSeconds(30)) == false) { if (producerCancellationToken.Token.IsCancellationRequested && pageCollection.Count == 0) { consumerCancellationToken.Cancel(); return; } } ; HtmlNodeCollection books = item.SelectNodes("//*[contains(@class, 'js-busca-resultados')]/a"); ConcurrentBag <HtmlNode> booksConcurrentBag = new ConcurrentBag <HtmlNode>(books.ToList()); ConcurrentBag <Livro> livros = new ConcurrentBag <Livro>(); Parallel.ForEach( booksConcurrentBag, new ParallelOptions { MaxDegreeOfParallelism = 20 }, book => { HtmlNode header = book.ChildNodes.FirstOrDefault( d => d.Attributes.Contains("class") && d.Attributes["class"].Value.Contains("breslt-header")); string bookName = header .ChildNodes .FirstOrDefault(d => d.Attributes.Contains("class") && d.Attributes["class"] .Value.Contains("busca-title")) .InnerText; string bookAuthor = header .ChildNodes .FirstOrDefault(d => d.Attributes.Contains("class") && d.Attributes["class"] .Value.Contains("busca-author")) .InnerText; if (livroRepository.Exists(bookName, bookAuthor)) { return; } if (livros.Any(b => (string.Compare(bookName, b.Nome, StringComparison.InvariantCultureIgnoreCase) == 0) && (string.Compare(bookAuthor, b.Autor, StringComparison.InvariantCultureIgnoreCase) == 0))) { return; } var url = book.Attributes["href"]; Uri uri = new Uri(baseUrl + url.Value); Livro livro = new Livro(); livro.IdEstante = estante.IdEstante; livro.Autor = bookAuthor; livro.Nome = bookName; livro.IdEstanteVirtual = long.Parse(uri.Segments[4]); 
PriceStatistics pricestats = new PriceStatisticsService().Get(livro.IdEstanteVirtual); if (pricestats != null) { livro.QtdVendida = pricestats.QtdVendida; livro.DtUltimaVenda = pricestats.DtUltimaVenda; livro.ValorMaxVenda = pricestats.ValorMaxVenda; livro.ValorMedioVenda = pricestats.ValorMedioVenda; livro.ValorMinVenda = pricestats.ValorMinVenda; livro.ValorUltimaVenda = pricestats.ValorUltimaVenda; } livros.Add(livro); }); livroRepository.AddThreadSafe(livros); int sleep = new Random(DateTime.Now.Millisecond).Next(50, 100); Console.WriteLine($@"Sleeping for {sleep}. Page {iPage}. is finished"); _logger.Info($"Sleeping for {sleep}. Page {iPage}. is finished"); Console.WriteLine($@"Livros {livros.Count} gravados"); Thread.Sleep(sleep); } catch (Exception ex) { Console.WriteLine($@"Erro numero --> {Interlocked.Read(ref errorCount)}) <-- na Estante {estante.Nome}, na página {Interlocked.Read(ref iPage)} ERROR: {ex.Message}"); Interlocked.Increment(ref errorCount); _logger.Fatal($"Erro fatal na estante {estante.Nome}. Página {iPage}", ex); Console.WriteLine($@"Erro fatal na estante {estante.Nome}. Página {iPage}"); if (Interlocked.Read(ref errorCount) > 20) { producerCancellationToken.Cancel(); } } });
/// <summary>
/// Imports the pages and reference files named by <paramref name="model"/> into blob
/// storage, then redirects to the <c>Index</c> action.
/// </summary>
/// <remarks>
/// Reference files go to the "scripts" container: the stylesheet (any reference URL
/// containing "style.css"; the last match is used) is uploaded as "text/css" after its
/// text has been passed through <c>replaceFontReference</c> once per other reference file.
/// The home page and each entry of <c>PagesURLs</c> go to the "htmls" container as
/// "text/html" after <c>replaceContent</c>, <c>replaceLink</c> and
/// <c>replaceStyleReference</c> have been applied.
/// </remarks>
/// <param name="model">View model carrying storage credentials, URLs and replace sections.</param>
/// <returns>The result of <c>RedirectToAction("Index")</c>.</returns>
public ActionResult Index(ImportContentViewModel model)
{
    cleanModel(model);
    // Result intentionally discarded: nothing in this action reads it.
    updateDatabase(model);

    var credential = $"DefaultEndpointsProtocol=https;AccountName={ model.StorageInfo.AccountName };AccountKey={ model.StorageInfo.StorageKey }";
    var blobClient = CloudStorageAccount.Parse(credential).CreateCloudBlobClient();

    var scriptsContainer = blobClient.GetContainerReference("scripts");
    scriptsContainer.CreateIfNotExists();

    var separators = new[] { '/' };
    var cssText = string.Empty;
    var cssOriginalUrl = string.Empty;
    var cssBlobName = string.Empty;

    // Locate and download the stylesheet. Ordinal comparison is requested explicitly
    // because the single-argument IndexOf overload compares with the current culture.
    foreach (var referenceUrl in model.ReferenceFileURLs)
    {
        if (referenceUrl.IndexOf("style.css", StringComparison.Ordinal) > -1)
        {
            var cssDownloader = new HttpDownloader(referenceUrl);
            cssText = cssDownloader.GetPage();
            cssOriginalUrl = referenceUrl;
            // The query string is folded into the path so that the chosen segment is the
            // one containing "style.css".
            cssBlobName = referenceUrl
                .Replace("?", "/")
                .Split(separators, StringSplitOptions.RemoveEmptyEntries)
                .Last(part => part.IndexOf("style.css", StringComparison.Ordinal) > -1);
        }
    }

    // Upload the remaining reference files and rewrite the stylesheet to use their blob URLs.
    // NOTE(review): content travels through a string; verify this is safe for binary
    // font files.
    foreach (var referenceUrl in model.ReferenceFileURLs)
    {
        if (referenceUrl.IndexOf("style.css", StringComparison.Ordinal) == -1)
        {
            var fileDownloader = new HttpDownloader(referenceUrl);
            var fileText = fileDownloader.GetPage();
            var fileName = referenceUrl.Split(separators, StringSplitOptions.RemoveEmptyEntries).LastOrDefault();

            using (var stream = GenerateStreamFromString(fileText))
            {
                var blob = scriptsContainer.GetBlockBlobReference(fileName);
                blob.UploadFromStream(stream);
                cssText = replaceFontReference(cssText, fileName, blob.Uri.AbsoluteUri);
            }
        }
    }

    // Upload the rewritten stylesheet and remember its public URL for the HTML pages.
    // NOTE(review): cssBlobName is empty when no stylesheet URL was supplied - confirm
    // whether that case needs explicit handling.
    string cssBlobUrl;
    using (var stream = GenerateStreamFromString(cssText))
    {
        var blob = scriptsContainer.GetBlockBlobReference(cssBlobName);
        blob.Properties.ContentType = "text/css";
        blob.UploadFromStream(stream);
        blob.SetProperties();
        cssBlobUrl = blob.Uri.AbsoluteUri;
    }

    var htmlContainer = blobClient.GetContainerReference("htmls");
    htmlContainer.CreateIfNotExists();

    // Create home page
    var homeDownloader = new HttpDownloader($"{model.BaseURL}{model.HomePageURL}");
    var homeText = replaceContent(homeDownloader.GetPage(), model.ReplaceSections);
    homeText = replaceLink(homeText, model.BaseURL, model.PagesURLs);
    homeText = replaceStyleReference(homeText, cssOriginalUrl, cssBlobUrl);
    using (var stream = GenerateStreamFromString(homeText))
    {
        var blob = htmlContainer.GetBlockBlobReference(HomePageName);
        blob.Properties.ContentType = "text/html";
        blob.UploadFromStream(stream);
        blob.SetProperties();
    }

    // Other pages: each is stored under the last path segment of its URL.
    foreach (var pageUrl in model.PagesURLs)
    {
        var pageDownloader = new HttpDownloader($"{model.BaseURL}{pageUrl}");
        var pageText = replaceContent(pageDownloader.GetPage(), model.ReplaceSections);
        pageText = replaceLink(pageText, model.BaseURL, model.PagesURLs);
        pageText = replaceStyleReference(pageText, cssOriginalUrl, cssBlobUrl);

        var pageFileName = pageUrl.Split(separators, StringSplitOptions.RemoveEmptyEntries).LastOrDefault();
        using (var stream = GenerateStreamFromString(pageText))
        {
            var blob = htmlContainer.GetBlockBlobReference(pageFileName);
            blob.Properties.ContentType = "text/html";
            blob.UploadFromStream(stream);
            blob.SetProperties();
        }
    }

    return RedirectToAction("Index");
}