/// <summary>
/// Replaces the voting widget in <paramref name="doc"/> with the poll markup
/// returned by the poll link's script endpoint.
/// </summary>
/// <param name="doc">Parsed page that may contain a <c>div.voting</c> element; modified in place.</param>
/// <returns>
/// <c>true</c> when the target element was found and its inner HTML replaced;
/// <c>false</c> when any required element, attribute or response fragment is missing.
/// </returns>
protected async Task <bool> FixVoting(IHtmlDocument doc)
        {
            var votingDiv = doc.QuerySelector("div.voting");

            if (votingDiv == null)
            {
                return(false);
            }

            var linkElement = votingDiv.QuerySelector("a[id*='poll']");

            if (linkElement == null)
            {
                return(false);
            }

            // The signature input is required to build the request; without it there is nothing to fetch.
            var signatureElement = votingDiv.QuerySelector("input[name='signature']");

            if (signatureElement == null)
            {
                return(false);
            }

            var signature = signatureElement.GetAttribute("value");
            var pollHref  = linkElement.GetAttribute("href");

            // A poll link without an href would produce a meaningless request URL.
            if (string.IsNullOrEmpty(pollHref))
            {
                return(false);
            }

            var resource = new DownloadResource()
            {
                Url = pollHref + "&js&signature=" + signature
            };
            var res = await _dataDownloader.Download(resource);

            // NOTE(review): the downloader appears to return null data for an ignored 404 - confirm against its implementation.
            if (res == null || res.DownloadedData == null)
            {
                return(false);
            }

            // The response is a script of the form: get('<id>').innerHTML = '<html>'
            var resString = res.DownloadedData.AsAnsiString();
            var match     = Regex.Match(resString, @"get\('(\w+)'\)\.innerHTML\s+=\s+'([^']*)'");

            if (!match.Success)
            {
                return(false);
            }

            var divId      = match.Groups[1].Value;
            // Un-escape the \" sequences embedded in the script string literal.
            var newHtml    = match.Groups[2].Value.Replace(@"\""", @"""");
            var replaceDiv = doc.QuerySelector($"#{divId}");

            if (replaceDiv == null)
            {
                return(false);
            }
            replaceDiv.InnerHtml = newHtml;
            // Drop the poll action span, if present, once the poll markup has been replaced.
            votingDiv.QuerySelector("span[id*='spanpollaction']")?.Remove();

            return(true);
        }
        /// <summary>
        /// Determines the "more" link mode by reading the checked <c>more_type</c> radio button
        /// on the options page and caches it in <c>_moreType</c>. Does nothing when the value
        /// is already known.
        /// </summary>
        /// <exception cref="InvalidOperationException">
        /// The options page contains no checked <c>more_type</c> radio button.
        /// </exception>
        private async Task DetectMoreType()
        {
            if (this._moreType != DiaryMoreLinksType.Undefined)
            {
                return;
            }
            var resource = new DownloadResource {
                Url = "http://www.diary.ru/options/site/?msgtags"
            };
            var optionsPageData = await _dataDownloader.Download(resource, false, 1000);

            var doc     = _parser.Parse(optionsPageData.DownloadedData.AsAnsiString());
            var element = doc.QuerySelector("input[type='radio'][name='more_type']:checked");

            // Fail with a diagnosable error instead of a bare NullReferenceException
            // when the expected radio button is not on the page.
            if (element == null)
            {
                throw new InvalidOperationException(
                    "Could not detect the 'more' link type: no checked 'more_type' option was found on the options page.");
            }

            _moreType = (DiaryMoreLinksType)Convert.ToInt32(element.GetAttribute("value"));
        }
        /// <summary>
        /// Inlines the content hidden behind "more" links (<c>a.LinkMore</c>) of <paramref name="doc"/>,
        /// using the strategy that matches the detected <c>_moreType</c>.
        /// </summary>
        /// <param name="doc">Parsed page to modify in place.</param>
        /// <returns>
        /// <c>false</c> when the content is already preloaded or the page has no usable "more" links;
        /// <c>true</c> otherwise (even if individual links could not be expanded).
        /// </returns>
        protected async Task <bool> FixMore(IHtmlDocument doc)
        {
            await DetectMoreType();

            if (this._moreType == DiaryMoreLinksType.Preloaded)
            {
                return(false);
            }

            var moreLinks = doc.QuerySelectorAll("a.LinkMore");

            // Keep only links that point somewhere other than the "#more" placeholder.
            var actualLinks = (from moreLink in moreLinks
                               let href = moreLink.GetAttribute("href")
                               where !string.IsNullOrEmpty(href) &&
                                     !string.Equals(href, "#more", StringComparison.OrdinalIgnoreCase)
                               select moreLink).ToList();

            if (actualLinks.Count <= 0)
            {
                return(false);
            }

            if (_moreType == DiaryMoreLinksType.OnDemand)
            {
                // The onclick handler carries two quoted arguments: the part name and the path to request.
                // Links without an onclick attribute are skipped instead of failing inside Regex.Matches.
                var dataToLoad = (from link in actualLinks
                                  let onclick = link.GetAttribute("onclick")
                                  where !string.IsNullOrEmpty(onclick)
                                  let matches = Regex.Matches(onclick, @"\""([^\""]*)\""")
                                  where matches.Count > 1
                                  select new
                {
                    LinkElement = link,
                    Url = $"http://{_diaryName}.diary.ru{matches[1].Groups[1].Value}?post={matches[0].Groups[1].Value}&js",
                    MorePartName = matches[0].Groups[1].Value
                }
                                  ).ToList();

                var resources = dataToLoad.Select(d => new DownloadResource {
                    Url = d.Url
                });
                var downloadResults = await _dataDownloader.Download(resources);

                // Pair every request with its download result by URL.
                var results = (from d in dataToLoad
                               from r in downloadResults
                               where d.Url == r.Resource.Url
                               select new { d.LinkElement, d.Url, r.DownloadedData, d.MorePartName })
                              .ToList();

                foreach (var r in results)
                {
                    // A resource that could not be downloaded carries no data; leave its link untouched.
                    if (r.DownloadedData == null)
                    {
                        continue;
                    }
                    var match = Regex.Match(r.DownloadedData.AsAnsiString(), @"innerHTML\s*=\s*'([^']*)'");
                    if (!match.Success)
                    {
                        continue;
                    }
                    var htmlText    = match.Groups[1].Value;
                    var spanId      = $"more{r.MorePartName}";
                    var spanElement = doc.QuerySelector($"#{spanId}");
                    if (spanElement == null)
                    {
                        continue;
                    }
                    spanElement.InnerHtml = htmlText;
                }
            }
            else if (_moreType == DiaryMoreLinksType.FullPage)
            {
                // All links are served from the page behind the first link, so it is downloaded once.
                var resource = new DownloadResource {
                    Url = actualLinks[0].GetAttribute("href")
                };
                var downloadResult = await _dataDownloader.Download(resource, false, 1000);

                var docFull = await _parser.ParseAsync(downloadResult.DownloadedData.AsAnsiString());

                foreach (var link in actualLinks)
                {
                    var match = Regex.Match(link.GetAttribute("href"), @"\/p(\d*).html?\?oam#(.*)$");
                    if (!match.Success)
                    {
                        continue;
                    }
                    var postNum  = match.Groups[1].Value;
                    var moreName = match.Groups[2].Value;

                    // The span id below drops the first four characters of the anchor name
                    // (presumably a "more" prefix - confirm); shorter names cannot be mapped.
                    if (moreName.Length < 4)
                    {
                        continue;
                    }

                    var elementStart = docFull.QuerySelector($"a[name='{moreName}']");
                    var elementEnd   = docFull.QuerySelector($"a[name='{moreName}end']");
                    if (elementStart == null || elementEnd == null)
                    {
                        continue;
                    }

                    // Move every sibling between the start and end anchors into a scratch div
                    // so their combined markup can be read back as a single HTML string.
                    var newDiv = docFull.CreateElement("div");
                    elementStart.Before(newDiv);
                    var nodesToCopy = new List <INode>();
                    var currentNode = elementStart.NextSibling;
                    while (currentNode != null)
                    {
                        if (currentNode == elementEnd)
                        {
                            break;
                        }
                        nodesToCopy.Add(currentNode);
                        currentNode = currentNode.NextSibling;
                    }

                    foreach (var el in nodesToCopy)
                    {
                        newDiv.AppendChild(el);
                    }

                    var moreHtml = newDiv.InnerHtml;

                    // Insert a hidden span with the extracted content right after the link.
                    var moreSpanId  = "more" + postNum + "m" + moreName.Substring(4);
                    var newMoreSpan = doc.CreateElement("span");
                    newMoreSpan.Id = moreSpanId;

                    newMoreSpan.Style.Display    = "none";
                    newMoreSpan.Style.Visibility = "hidden";
                    link.After(newMoreSpan);
                    link.Id = "link" + moreSpanId;
                    newMoreSpan.InnerHtml = moreHtml;
                }
            }

            return(true);
        }
Example #4
0
        /// <summary>
        /// Downloads a single resource, retrying on <see cref="WebException"/>, and optionally
        /// saves the bytes to a file under <c>_diaryPath</c>.
        /// </summary>
        /// <param name="downloadResource">
        /// Resource to fetch. <c>Url</c> is required; URLs that do not start with "htt" are resolved
        /// against <c>BaseUrl</c>. When <c>RelativePath</c> is set, the data is also written to that file.
        /// </param>
        /// <param name="ignore404">
        /// When <c>true</c>, an HTTP 404 response is logged and yields a result whose
        /// <c>DownloadedData</c> is <c>null</c> instead of being retried.
        /// </param>
        /// <param name="requestDelay">Delay in milliseconds to wait before issuing the request.</param>
        /// <returns>The resource together with the downloaded bytes (or <c>null</c> for an ignored 404).</returns>
        /// <exception cref="ArgumentException"><paramref name="downloadResource"/> or its URL is missing.</exception>
        /// <exception cref="WebException">
        /// The request kept failing after <c>Constants.DownloadRetryCount</c> attempts.
        /// </exception>
        public async Task <DataDownloaderResult> Download(DownloadResource downloadResource, bool ignore404 = true, int requestDelay = 0)
        {
            if (downloadResource == null || string.IsNullOrEmpty(downloadResource.Url))
            {
                throw new ArgumentException("Для скачивания должны быть заполнены пути к данным");
            }
            _logger.LogInformation("Downloading data: " + downloadResource.Url);

            // Absolute URLs are used as-is; anything else is treated as relative to BaseUrl.
            var uri = downloadResource.Url.StartsWith("htt", StringComparison.Ordinal)
                          ? new Uri(downloadResource.Url)
                          : new Uri(new Uri(BaseUrl), downloadResource.Url);

            var filePath = string.IsNullOrEmpty(downloadResource.RelativePath)
                            ? string.Empty
                            : Path.Combine(_diaryPath, downloadResource.RelativePath);

            // NOTE(review): if CF_WebClient is IDisposable (e.g. derives from WebClient) it should be disposed - confirm.
            var client = new CF_WebClient(_cookieContainer);

            BeforeDownload?.Invoke(this, new DataDownloaderEventArgs {
                Resource = downloadResource
            });

            // Wait asynchronously so the calling thread is not blocked during the delay.
            await Task.Delay(requestDelay);

            byte[] downloadedData;
            var    retries = 0;

            while (true)
            {
                try
                {
                    downloadedData = await client.DownloadDataTaskAsync(uri);

                    break; //i want to break freeeeee
                }
                catch (WebException e)
                {
                    if (e.Status == WebExceptionStatus.ProtocolError && ignore404)
                    {
                        var response = e.Response as HttpWebResponse;
                        if (response != null)
                        {
                            if (response.StatusCode == HttpStatusCode.NotFound)
                            {
                                // A missing resource is not worth retrying: report it and return without data.
                                _logger.LogWarning("Url not found: " + e.Response.ResponseUri.AbsoluteUri);
                                downloadResource.LocalPath = "";
                                return(new DataDownloaderResult {
                                    Resource = downloadResource, DownloadedData = null
                                });
                            }
                        }
                    }
                    retries += 1;
                    _logger.LogError(e, $"Error, retry count: {retries}");

                    if (retries >= Constants.DownloadRetryCount)
                    {
                        throw;
                    }

                    // Back off before the next attempt without blocking the thread.
                    await Task.Delay(2000);
                }
            }

            AfterDownload?.Invoke(this, new DataDownloaderEventArgs {
                Resource = downloadResource, DownloadedData = downloadedData
            });

            if (!string.IsNullOrEmpty(filePath))
            {
                using (var f = File.Create(filePath))
                {
                    await f.WriteAsync(downloadedData, 0, downloadedData.Length);
                }
            }

            return(new DataDownloaderResult {
                Resource = downloadResource, DownloadedData = downloadedData
            });
        }