/// <summary>
/// Inlines the poll markup into <paramref name="doc"/>: downloads the script referenced by the
/// poll link inside <c>div.voting</c>, extracts the HTML that script would assign to an element's
/// <c>innerHTML</c>, and writes it directly into that element.
/// </summary>
/// <param name="doc">Parsed page; modified in place when the poll is successfully inlined.</param>
/// <returns>
/// <c>true</c> when the target element was found and its content replaced; <c>false</c> when the page
/// has no voting block, a required element/attribute is missing, the download produced no data,
/// or the response does not have the expected <c>get('id').innerHTML = '...'</c> shape.
/// </returns>
protected async Task<bool> FixVoting(IHtmlDocument doc)
{
    var votingDiv = doc.QuerySelector("div.voting");
    if (votingDiv == null)
    {
        return false;
    }

    var linkElement = votingDiv.QuerySelector("a[id*='poll']");
    if (linkElement == null)
    {
        return false;
    }

    // The signature input is looked up in scraped markup, so it may be absent;
    // treat that like any other "voting block not usable" case instead of crashing.
    var signatureElement = votingDiv.QuerySelector("input[name='signature']");
    if (signatureElement == null)
    {
        return false;
    }

    var href = linkElement.GetAttribute("href");
    if (string.IsNullOrEmpty(href))
    {
        return false;
    }

    var signature = signatureElement.GetAttribute("value");
    var url = href + "&js&signature=" + signature;

    var resource = new DownloadResource() { Url = url };
    var res = await _dataDownloader.Download(resource);

    // Download is called with its default arguments here; a result without data
    // (e.g. an ignored 404) cannot be parsed, so give up quietly.
    if (res?.DownloadedData == null)
    {
        return false;
    }

    var resString = res.DownloadedData.AsAnsiString();

    // Expected response: get('<elementId>').innerHTML = '<html>'
    // NOTE(review): the pattern stops at the first single quote, so a payload containing
    // an escaped quote (\') would be truncated - confirm the server never emits one.
    var match = Regex.Match(resString, @"get\('(\w+)'\)\.innerHTML\s+=\s+'([^']*)'");
    if (!match.Success)
    {
        return false;
    }

    var divId = match.Groups[1].Value;

    // Undo the JavaScript escaping of double quotes (\" -> ").
    var newHtml = match.Groups[2].Value.Replace(@"\""", @"""");

    var replaceDiv = doc.QuerySelector($"#{divId}");
    if (replaceDiv == null)
    {
        return false;
    }

    replaceDiv.InnerHtml = newHtml;

    // Drop the span whose id contains 'spanpollaction', if the voting block has one.
    votingDiv.QuerySelector("span[id*='spanpollaction']")?.Remove();
    return true;
}
/// <summary>
/// Resolves the "more" link mode once and stores it in <c>_moreType</c>.
/// When <c>_moreType</c> is already something other than <c>Undefined</c> nothing is downloaded.
/// Otherwise the options page is fetched and the <c>value</c> of the checked
/// <c>more_type</c> radio button is converted to <c>DiaryMoreLinksType</c>.
/// </summary>
private async Task DetectMoreType()
{
    if (_moreType == DiaryMoreLinksType.Undefined)
    {
        var optionsResource = new DownloadResource { Url = "http://www.diary.ru/options/site/?msgtags" };
        var optionsPage = await _dataDownloader.Download(optionsResource, false, 1000);

        var optionsHtml = optionsPage.DownloadedData.AsAnsiString();
        var optionsDoc = _parser.Parse(optionsHtml);

        // NOTE(review): if no radio button is checked on the downloaded page the next
        // statement dereferences null - confirm whether a fallback mode is needed.
        var checkedRadio = optionsDoc.QuerySelector("input[type='radio'][name='more_type']:checked");
        var rawValue = checkedRadio.GetAttribute("value");

        _moreType = (DiaryMoreLinksType)Convert.ToInt32(rawValue);
    }
}
/// <summary>
/// Embeds the hidden content behind "more" links (<c>a.LinkMore</c>) into <paramref name="doc"/>,
/// using the strategy that matches the detected <c>_moreType</c>.
/// </summary>
/// <remarks>
/// <para><c>Preloaded</c>: nothing to do.</para>
/// <para><c>OnDemand</c>: each link's <c>onclick</c> supplies two quoted strings; a script is
/// downloaded per link and the HTML it assigns to <c>innerHTML</c> is written into the element
/// with id <c>more{first quoted string}</c>.</para>
/// <para><c>FullPage</c>: the page referenced by the first link is downloaded once; for every link
/// the nodes between the anchors <c>a[name='X']</c> and <c>a[name='Xend']</c> are extracted and
/// placed into a new hidden span inserted right after the link.</para>
/// </remarks>
/// <param name="doc">Parsed page; modified in place.</param>
/// <returns>
/// <c>false</c> when the mode is <c>Preloaded</c> or the page has no "more" links with a real href;
/// <c>true</c> otherwise, even if individual links could not be expanded.
/// </returns>
protected async Task<bool> FixMore(IHtmlDocument doc)
{
    await DetectMoreType();
    if (_moreType == DiaryMoreLinksType.Preloaded)
    {
        return false;
    }

    var moreLinks = doc.QuerySelectorAll("a.LinkMore");

    // Links whose href is empty or just "#more" carry nothing to load.
    var actualLinks = (from moreLink in moreLinks
                       let href = moreLink.GetAttribute("href")
                       where !string.IsNullOrEmpty(href)
                             && !string.Equals(href, "#more", StringComparison.OrdinalIgnoreCase)
                       select moreLink).ToList();
    if (actualLinks.Count == 0)
    {
        return false;
    }

    if (_moreType == DiaryMoreLinksType.OnDemand)
    {
        // A link without an onclick attribute yields null; Regex.Matches rejects null input,
        // so substitute an empty string and let the "matches.Count > 1" filter drop the link.
        var dataToLoad = (from link in actualLinks
                          let matches = Regex.Matches(link.GetAttribute("onclick") ?? string.Empty, @"\""([^\""]*)\""")
                          where matches.Count > 1
                          select new
                          {
                              LinkElement = link,
                              Url = $"http://{_diaryName}.diary.ru{matches[1].Groups[1].Value}?post={matches[0].Groups[1].Value}&js",
                              MorePartName = matches[0].Groups[1].Value
                          }).ToList();

        // Materialized so the downloader sees one stable set of resource objects
        // no matter how many times it enumerates the sequence.
        var resources = dataToLoad.Select(d => new DownloadResource { Url = d.Url }).ToList();
        var downloadResults = await _dataDownloader.Download(resources);

        // Pair every request with its result by URL; results without data
        // (e.g. a missing resource) are skipped because there is nothing to parse.
        var results = (from d in dataToLoad
                       from r in downloadResults
                       where d.Url == r.Resource.Url && r.DownloadedData != null
                       select new { d.LinkElement, d.Url, r.DownloadedData, d.MorePartName })
                      .ToList();

        foreach (var r in results)
        {
            var match = Regex.Match(r.DownloadedData.AsAnsiString(), @"innerHTML\s*=\s*'([^']*)'");
            if (!match.Success)
            {
                continue;
            }

            var htmlText = match.Groups[1].Value;
            var spanId = $"more{r.MorePartName}";
            var spanElement = doc.QuerySelector($"#{spanId}");
            if (spanElement == null)
            {
                continue;
            }

            spanElement.InnerHtml = htmlText;
        }
    }
    else if (_moreType == DiaryMoreLinksType.FullPage)
    {
        // NOTE(review): only the first link's page is downloaded and every link is resolved
        // against it - presumably all links of one document point to the same full page; confirm.
        var resource = new DownloadResource { Url = actualLinks[0].GetAttribute("href") };
        var downloadResult = await _dataDownloader.Download(resource, false, 1000);
        var docFull = await _parser.ParseAsync(downloadResult.DownloadedData.AsAnsiString());

        foreach (var link in actualLinks)
        {
            // Group 1: digits after "/p"; group 2: the fragment after "?oam#".
            var match = Regex.Match(link.GetAttribute("href"), @"\/p(\d*).html?\?oam#(.*)$");
            if (!match.Success)
            {
                continue;
            }

            var postNum = match.Groups[1].Value;
            var moreName = match.Groups[2].Value;

            // The span id below is built from moreName.Substring(4); a shorter fragment
            // would throw, so such links are skipped before any DOM is touched.
            if (moreName.Length < 4)
            {
                continue;
            }

            var elementStart = docFull.QuerySelector($"a[name='{moreName}']");
            var elementEnd = docFull.QuerySelector($"a[name='{moreName}end']");
            if (elementStart == null || elementEnd == null)
            {
                continue;
            }

            // Temporary container used only to serialize the extracted nodes to HTML.
            var newDiv = docFull.CreateElement("div");
            elementStart.Before(newDiv);

            // Collect first, move afterwards: appending a node to newDiv detaches it
            // from its current position, which would break the sibling walk.
            var nodesToCopy = new List<INode>();
            for (var currentNode = elementStart.NextSibling;
                 currentNode != null && currentNode != elementEnd;
                 currentNode = currentNode.NextSibling)
            {
                nodesToCopy.Add(currentNode);
            }

            foreach (var el in nodesToCopy)
            {
                newDiv.AppendChild(el);
            }

            var moreHtml = newDiv.InnerHtml;
            var moreSpanId = "more" + postNum + "m" + moreName.Substring(4);

            var newMoreSpan = doc.CreateElement("span");
            newMoreSpan.Id = moreSpanId;
            newMoreSpan.Style.Display = "none";
            newMoreSpan.Style.Visibility = "hidden";

            link.After(newMoreSpan);
            link.Id = "link" + moreSpanId;
            newMoreSpan.InnerHtml = moreHtml;
        }
    }

    return true;
}
/// <summary>
/// Downloads a single resource, retrying on <see cref="WebException"/>, and optionally
/// stores the bytes on disk when the resource has a relative path.
/// </summary>
/// <param name="downloadResource">
/// Resource to fetch. <c>Url</c> must be non-empty; a URL that does not start with "htt" is
/// resolved against <c>BaseUrl</c>. When <c>RelativePath</c> is set the data is also written to
/// that path under <c>_diaryPath</c>.
/// </param>
/// <param name="ignore404">
/// When <c>true</c>, an HTTP 404 response is logged as a warning and a result with
/// <c>DownloadedData == null</c> is returned instead of retrying/throwing.
/// </param>
/// <param name="requestDelay">Delay in milliseconds applied before the request; values &lt;= 0 mean no delay.</param>
/// <returns>The resource together with the downloaded bytes (<c>null</c> for an ignored 404).</returns>
/// <exception cref="ArgumentException">The resource is <c>null</c> or has no URL.</exception>
/// <exception cref="WebException">
/// The download kept failing until <c>Constants.DownloadRetryCount</c> attempts were used up.
/// </exception>
public async Task<DataDownloaderResult> Download(DownloadResource downloadResource, bool ignore404 = true, int requestDelay = 0)
{
    if (downloadResource == null || string.IsNullOrEmpty(downloadResource.Url))
    {
        throw new ArgumentException("Для скачивания должны быть заполнены пути к данным");
    }

    _logger.LogInformation("Downloading data: " + downloadResource.Url);

    var uri = downloadResource.Url.StartsWith("htt", StringComparison.Ordinal)
        ? new Uri(downloadResource.Url)
        : new Uri(new Uri(BaseUrl), downloadResource.Url);

    var filePath = string.IsNullOrEmpty(downloadResource.RelativePath)
        ? string.Empty
        : Path.Combine(_diaryPath, downloadResource.RelativePath);

    // NOTE(review): if CF_WebClient is disposable (it looks like a WebClient wrapper),
    // it should be wrapped in a using block - confirm against its declaration.
    var client = new CF_WebClient(_cookieContainer);

    BeforeDownload?.Invoke(this, new DataDownloaderEventArgs { Resource = downloadResource });

    // Asynchronous wait: Thread.Sleep would block the calling thread inside an async method.
    if (requestDelay > 0)
    {
        await Task.Delay(requestDelay);
    }

    byte[] downloadedData;
    var retries = 0;
    while (true)
    {
        try
        {
            downloadedData = await client.DownloadDataTaskAsync(uri);
            break;
        }
        catch (WebException e)
        {
            var response = e.Response as HttpWebResponse;
            var isIgnorableNotFound = ignore404
                                      && e.Status == WebExceptionStatus.ProtocolError
                                      && response != null
                                      && response.StatusCode == HttpStatusCode.NotFound;
            if (isIgnorableNotFound)
            {
                _logger.LogWarning("Url not found: " + response.ResponseUri.AbsoluteUri);
                downloadResource.LocalPath = "";
                return new DataDownloaderResult { Resource = downloadResource, DownloadedData = null };
            }

            retries += 1;
            _logger.LogError(e, $"Error, retry count: {retries}");
            if (retries >= Constants.DownloadRetryCount)
            {
                throw;
            }

            // Back off before the next attempt without blocking the thread.
            await Task.Delay(2000);
        }
    }

    AfterDownload?.Invoke(this, new DataDownloaderEventArgs { Resource = downloadResource, DownloadedData = downloadedData });

    if (!string.IsNullOrEmpty(filePath))
    {
        using (var f = File.Create(filePath))
        {
            await f.WriteAsync(downloadedData, 0, downloadedData.Length);
        }
    }

    return new DataDownloaderResult { Resource = downloadResource, DownloadedData = downloadedData };
}