Example #1
0
        /// <summary>
        /// Walks the listing pages from <c>_start</c> up to the page count returned by
        /// <c>ParsePagesCountAsync</c>, loads every post linked from each page, parses it
        /// with a <c>PostParser</c> and saves the result through <c>_storage</c>.
        /// </summary>
        /// <remarks>
        /// Page number 1 is always skipped. The walk stops as soon as a post URL equal to
        /// the value of <c>_storage.GetLastPostUrl()</c> is encountered. Exceptions thrown
        /// inside the post, page and whole-run <c>try</c> blocks are caught and reported
        /// through <c>RaiseError</c> (message text only); a failed post or page does not
        /// stop the remaining ones from being processed.
        /// </remarks>
        /// <returns>A task that completes when the walk has finished.</returns>
        public async Task ParseAsync()
        {
            try
            {
                RaiseReport("START");

                // Number of listing pages reported by the site.
                var pageCount = await ParsePagesCountAsync().ConfigureAwait(false);
                RaiseReport($"Pages: { pageCount }");

                // URL of the last stored post; a null value is reported as "New session".
                var lastPostUrl = _storage.GetLastPostUrl();
                var lastPostLabel = lastPostUrl ?? "New session";
                RaiseReport($"Last: { lastPostLabel }");

                // Set once the last stored post is met again; ends the page loop.
                var reachedLastPost = false;

                for (int pageNumber = _start; pageNumber <= pageCount && !reachedLastPost; pageNumber++)
                {
                    if (pageNumber == 1)
                    {
                        // The first page is never processed.
                        continue;
                    }

                    try
                    {
                        var pageUrl = string.Format(PagePattern, pageNumber);
                        RaisePage(new Uri(pageUrl));

                        // Collect the post URLs listed on this page.
                        var postUrls = await GetPostUrlsFromPageAsync(pageUrl).ConfigureAwait(false);

                        // One timer is reused (restarted) for every measured step below.
                        var timer = Stopwatch.StartNew();

                        foreach (var postUrl in postUrls)
                        {
                            try
                            {
                                RaisePage(new Uri(postUrl));

                                if (postUrl == lastPostUrl)
                                {
                                    // Everything from here on was stored by an earlier run.
                                    reachedLastPost = true;
                                    break;
                                }

                                if (_start != 0 && _storage.IsExists(postUrl))
                                {
                                    // Resuming from a given page: skip posts that are already stored.
                                    continue;
                                }

                                // Step 1: download the post markup.
                                timer.Restart();
                                var html = await LoadPageAsync(postUrl).ConfigureAwait(false);

                                // NOTE(review): unlike the two timing reports below, this message
                                // carries no " ms" suffix - confirm whether that is intended.
                                RaiseReport($"Page loaded: [{ timer.Elapsed.TotalMilliseconds }]");

                                if (string.IsNullOrEmpty(html))
                                {
                                    RaiseError($"Can't load page: { postUrl }");
                                    continue;
                                }

                                // Step 2: parse the post, its comments and (optionally) its files.
                                timer.Restart();

                                using var postParser = new PostParser(html);
                                var postDto = await postParser.GetPostDtoAsync().ConfigureAwait(false);

                                var comments = await postParser.GetPostCommentsAsync().ConfigureAwait(false);
                                postDto.Comments = comments;

                                if (_saveFiles)
                                {
                                    postDto.Files = await postParser.GetPostFilesAsync().ConfigureAwait(false);
                                }
                                else
                                {
                                    postDto.Files = new List<string>();
                                }

                                RaiseReport($"Post parsed: [{ timer.Elapsed.TotalMilliseconds }] ms");

                                // Step 3: persist the parsed post.
                                timer.Restart();
                                await _storage.SavePostAsync(postUrl, postDto).ConfigureAwait(false);
                                RaiseReport($"Post saved: [{ timer.Elapsed.TotalMilliseconds }] ms");
                            }
                            catch (Exception postError)
                            {
                                // A failing post is reported; the loop moves on to the next one.
                                RaiseError(postError.Message);
                            }
                        }
                    }
                    catch (Exception pageError)
                    {
                        // A failing page is reported; the loop moves on to the next one.
                        RaiseError(pageError.Message);
                    }
                }

                RaiseReport("DONE!");
            }
            catch (Exception fatalError)
            {
                // Anything that escaped the per-page handling ends the run.
                RaiseError(fatalError.Message);
            }
        }