/// <summary>
/// Replaces the voting placeholder in <paramref name="doc"/> with the HTML
/// returned by the poll link found inside the <c>div.voting</c> element.
/// </summary>
/// <param name="doc">Parsed page; modified in place when the fix succeeds.</param>
/// <returns>
/// <c>true</c> when the target element's inner HTML was replaced;
/// <c>false</c> when the voting block, poll link, signature input, link href,
/// expected response pattern, or target element is missing.
/// </returns>
protected async Task<bool> FixVoting(IHtmlDocument doc)
{
    var votingDiv = doc.QuerySelector("div.voting");
    if (votingDiv == null)
    {
        return false;
    }

    var linkElement = votingDiv.QuerySelector("a[id*='poll']");
    if (linkElement == null)
    {
        return false;
    }

    // The signature input is looked up separately from the link; without this
    // guard a page lacking it would throw a NullReferenceException instead of
    // reporting failure like every other missing element does.
    var signatureElement = votingDiv.QuerySelector("input[name='signature']");
    if (signatureElement == null)
    {
        return false;
    }

    var url = linkElement.GetAttribute("href");
    if (string.IsNullOrEmpty(url))
    {
        // No target to request; appending the query suffix to nothing would
        // only produce a meaningless URL.
        return false;
    }

    var signature = signatureElement.GetAttribute("value");
    url += "&js&signature=" + signature;

    var resource = new DownloadResource() { Url = url };
    var res = await _dataDownloader.Download(resource);
    var resString = res.DownloadedData.AsAnsiString();

    // The response is matched as a script fragment of the form
    // get('<id>').innerHTML = '<html>'; group 1 is the element id and
    // group 2 is the replacement markup.
    var match = Regex.Match(resString, @"get\('(\w+)'\)\.innerHTML\s+=\s+'([^']*)'");
    if (!match.Success)
    {
        return false;
    }

    var divId = match.Groups[1].Value;

    // Turn backslash-escaped double quotes in the captured markup into plain quotes.
    var newHtml = match.Groups[2].Value.Replace(@"\""", @"""");

    var replaceDiv = doc.QuerySelector($"#{divId}");
    if (replaceDiv == null)
    {
        return false;
    }

    replaceDiv.InnerHtml = newHtml;

    // Remove the poll action span, if there is one, once the markup is in place.
    votingDiv.QuerySelector("span[id*='spanpollaction']")?.Remove();
    return true;
}
/// <summary>
/// Downloads every date page, finds post URLs on the configured diary's host
/// in the page text, and downloads each matched post.
/// </summary>
/// <param name="datePages">Date pages to scan; each carries the post date passed on to the post download.</param>
/// <param name="cancellationToken">Checked before each page and before each matched post.</param>
/// <returns>Always <c>true</c> when the scan runs to completion.</returns>
/// <exception cref="OperationCanceledException">Cancellation was requested during the scan.</exception>
private async Task<bool> ScanDateUrlsAsync(IEnumerable<DiaryDatePage> datePages, CancellationToken cancellationToken)
{
    // The diary name is data, not pattern syntax: escape it so that any regex
    // metacharacter in the name is matched literally. The regex is built once
    // and reused for every page.
    var diaryName = Regex.Escape(_options.DiaryName ?? string.Empty);
    var postUrlRegex = new Regex(
        $@"({diaryName}\.diary\.ru\/p\w+\.htm).{{0,30}}URL",
        RegexOptions.IgnoreCase);

    foreach (var datePage in datePages)
    {
        cancellationToken.ThrowIfCancellationRequested();

        var downloadResult = await _downloader.Download(datePage, false, _options.RequestDelay);
        var html = downloadResult.DownloadedData.AsAnsiString();

        foreach (Match m in postUrlRegex.Matches(html))
        {
            cancellationToken.ThrowIfCancellationRequested();

            // Group 1 holds the host and path without a scheme.
            await DownloadPostAsync("http://" + m.Groups[1].Value, datePage.PostDate);
        }

        // Count the page and record it as processed only after all of its posts were handled.
        Progress.IncrementInt(ScrapeProgressNames.DatePagesProcessed, 1);
        await _downloadExistingChecker.AddProcessedDataAsync(datePage);
    }

    return true;
}