Ejemplo n.º 1
0
        /// <summary>
        /// Entry point: parses articles using the command-line arguments, prints a summary of the
        /// parsing results, saves the results that are still pending and exports everything to CSV.
        /// </summary>
        /// <param name="args">Command-line arguments used to build the parsing parameters.</param>
        static void Main(string[] args)
        {
            // Register the Article mapping before any repository work takes place.
            FluentMapper.Initialize(mapperConfig => mapperConfig.AddMap(new ArticleMap()));

            var options        = new OnvistaArticlesParsingParameters(args);
            var articlesParser = new ArticlesParser(new ArticlesRepository());
            var parsed         = articlesParser.ParseArticles(options);

            // Break the results down by state for the summary printed below.
            var unsaved       = parsed.Where(r => r.ResultType == ParsingResultType.PendingForSave).ToList();
            int savedCount    = parsed.Count(r => r.ResultType == ParsingResultType.Saved);
            int existingCount = parsed.Count(r => r.ResultType == ParsingResultType.AlreadyExists);

            string summary = $"{parsed.Count} Articles were parsed.{Environment.NewLine}" +
                             $"PendingForSave: {unsaved.Count} | " +
                             $"Saved: {savedCount} | " +
                             $"Already exists: {existingCount} ";
            Console.WriteLine(summary);

            // Only results that are still pending need to be saved here.
            if (unsaved.Count != 0)
            {
                Console.WriteLine("Saving parser results...");
                articlesParser.SaveParsingResults(unsaved);

                Console.WriteLine("Done");
            }

            // The CSV export always receives the full result set, not just the pending part.
            SaveParsedArticlesToCsv(parsed);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Downloads the paginated news listing and parses the articles found on each page.
        /// </summary>
        /// <remarks>
        /// The first page is always downloaded, even when it is skipped, because the total page
        /// count is read from it. Any exception raised while parsing is logged and swallowed, so
        /// the articles collected up to that point are still returned.
        /// </remarks>
        /// <param name="parameters">
        /// Listing URL, page limit, number of pages to skip, request delay and the
        /// save-while-parsing / stop-on-existing options.
        /// </param>
        /// <returns>The parsing results collected before the loop finished, was stopped or failed.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="parameters"/> is <c>null</c>.</exception>
        public List<ParsingResult<Article>> ParseArticles(OnvistaArticlesParsingParameters parameters)
        {
            if (parameters == null)
            {
                throw new ArgumentNullException(nameof(parameters));
            }

            string url          = parameters.NewsUrl;
            int?   pagesToParse = parameters.PagesToParse;
            bool   stopParsingOnExistingRecord = parameters.StopParsingOnExisting;
            int    skipPages    = parameters.SkipPages;
            int    requestDelay = parameters.RequestDelayMs;

            List<ParsingResult<Article>> resultArticles = new List<ParsingResult<Article>>();

            RefreshExistingArticles();

            try
            {
                // -1 means "not known yet": the real value is read from the first downloaded page.
                int pagesCount        = -1;
                // Starts at 1 so the loop runs at least once and can discover the real page count.
                int pagesCountToParse = 1;

                for (int page = 1; page <= pagesCountToParse; page++)
                {
                    // Once the page count is known, skipped pages do not need to be downloaded at all.
                    if (pagesCount != -1 && skipPages > 0)
                    {
                        skipPages--;
                        continue;
                    }

                    string pageUrl  = GetPageUrl(url, page);
                    var    document = GetDocumentFromUrl(pageUrl, requestDelay);

                    if (pagesCount == -1)
                    {
                        pagesCount        = GetPagesCount(document);
                        pagesCountToParse = pagesToParse.HasValue ? Math.Min(pagesCount, pagesToParse.Value) : pagesCount;
                    }

                    // Only reachable with skipPages > 0 on the first page, which had to be
                    // downloaded for the page count even though its articles are skipped.
                    if (skipPages > 0)
                    {
                        skipPages--;
                        continue;
                    }

                    _logger.LogInformation($"Parsing of page {page}/{pagesCountToParse} started.");

                    var articles = GetArticlesFromDocument(document, requestDelay);

                    resultArticles.AddRange(articles);

                    // The reported count is cumulative over all pages parsed so far.
                    _logger.LogInformation($"Page {page} was parsed. Articles count: {resultArticles.Count}");

                    var articlesToSave = articles.Where(x => x.ResultType == ParsingResultType.PendingForSave).ToList();

                    if (parameters.SaveWithParsing && articlesToSave.Count > 0)
                    {
                        _logger.LogInformation("Saving...");
                        SaveParsingResults(articlesToSave);
                    }

                    if (stopParsingOnExistingRecord &&
                        articles.Any(x => x.ResultType == ParsingResultType.AlreadyExists))
                    {
                        _logger.LogInformation("Stopping parser: already existing records were parsed");
                        break;
                    }
                }
            }
            catch (Exception ex)
            {
                // The exception goes first: with Microsoft.Extensions.Logging, passing it after the
                // message binds it to the format-arguments array, and since the message has no
                // placeholders the exception type and stack trace would never be written.
                _logger.LogError(ex, $"Unexpected error while parsing articles: {resultArticles.Count} articles were parsed");
            }

            return resultArticles;
        }