예제 #1
0
        /// <summary>
        /// Loads the page at <c>cwmVolume.VolumePageAbsoluteUrl</c> and yields one
        /// <see cref="CWMIssue"/> for every <c>a.product</c> anchor inside
        /// <c>div#productList</c> whose <c>href</c> matches <c>_issueLinkTextRegex</c>.
        /// </summary>
        /// <remarks>
        /// This is an iterator: nothing is downloaded or parsed until the returned
        /// sequence is enumerated, and the document is disposed when enumeration ends.
        /// Anchors without an <c>href</c>, anchors whose <c>href</c> does not match the
        /// regex, and matches with an empty <c>issueNumber</c> group are skipped.
        /// </remarks>
        /// <param name="cwmVolume">The volume whose page is scraped; it is also passed
        /// to every created <see cref="CWMIssue"/>.</param>
        /// <returns>A lazily evaluated sequence of the issues found on the page.</returns>
        /// <exception cref="InvalidOperationException">An element of the expected page
        /// structure (<c>#canvas-wrapper</c>, <c>div#content</c>, <c>div#productList</c>)
        /// could not be found.</exception>
        /// <exception cref="FormatException">The <c>volumeNumber</c> or
        /// <c>issueNumber</c> regex group could not be parsed as an integer.</exception>
        public static IEnumerable<CWMIssue> ScrapeVolumeIssues(
            CWMVolume cwmVolume)
        {
            var context = BrowsingContext.New(
                Configuration.Default.WithDefaultLoader());

            var downloadPageUrl = cwmVolume.VolumePageAbsoluteUrl;

            // NOTE(review): this blocks synchronously on the asynchronous page load.
            // An IEnumerable iterator cannot await; switching to IAsyncEnumerable
            // would change the public signature, so the blocking call is kept.
            using (var document = context
                                  .OpenAsync(downloadPageUrl)
                                  .GetAwaiter()
                                  .GetResult())
            {
                // Fail with a descriptive message instead of a NullReferenceException
                // when the page layout is not the expected one.
                var canvasWrapper = document.GetElementById("canvas-wrapper")
                    ?? throw new InvalidOperationException(
                        $"Cannot find the element '#canvas-wrapper' on the page '{downloadPageUrl}'.");

                var contentDiv = canvasWrapper.QuerySelector(
                        "div#canvas > " +
                        "div#page-body-wrapper > " +
                        "div#page-body > " +
                        "div#content-wrapper > " +
                        "div#content")
                    ?? throw new InvalidOperationException(
                        $"Cannot find the element 'div#content' on the page '{downloadPageUrl}'.");

                // The product list is looked up inside the third child of div#content.
                var mainContentDiv = contentDiv.Children[2];

                var mainProductList = mainContentDiv?.QuerySelector("div#productList")
                    ?? throw new InvalidOperationException(
                        $"Cannot find the element 'div#productList' on the page '{downloadPageUrl}'.");

                var issueLinkElements = mainProductList
                                        .QuerySelectorAll("a.product")
                                        .ToArray();

                foreach (var issueLinkElement in issueLinkElements)
                {
                    var hrefRelativeIssueLink = issueLinkElement.GetAttribute("href");

                    // An anchor without an href cannot point to an issue page.
                    if (hrefRelativeIssueLink == null)
                    {
                        continue;
                    }

                    var formattedIssueText = hrefRelativeIssueLink.Trim();

                    // Run the regex once; links that are not issue links are skipped.
                    var issueTextMatch = _issueLinkTextRegex.Match(formattedIssueText);
                    if (!issueTextMatch.Success)
                    {
                        continue;
                    }

                    var issueMagazineText     = issueTextMatch.Groups["magazine"].Value;
                    var volumeNumberMatchText = issueTextMatch.Groups["volumeNumber"].Value;
                    var issueNumberMatchText  = issueTextMatch.Groups["issueNumber"].Value;

                    // A link without an issue number is skipped rather than treated as an error.
                    if (issueNumberMatchText.IsNullOrEmptyEx())
                    {
                        continue;
                    }

                    var issueMagazine = Magazine.GetMagazineFromPrefix(issueMagazineText);

                    // The numbers come from a URL, so parse them culture-independently.
                    if (!int.TryParse(
                            volumeNumberMatchText,
                            System.Globalization.NumberStyles.Integer,
                            System.Globalization.CultureInfo.InvariantCulture,
                            out var volumeNumber))
                    {
                        throw new FormatException(
                                  $"Cannot parse the 'volumeNumber' from the text {volumeNumberMatchText.Quote()}.");
                    }

                    if (!int.TryParse(
                            issueNumberMatchText,
                            System.Globalization.NumberStyles.Integer,
                            System.Globalization.CultureInfo.InvariantCulture,
                            out var issueNumber))
                    {
                        throw new FormatException(
                                  $"Cannot parse the 'issueNumber' from the text {issueNumberMatchText.Quote()}.");
                    }

                    // Use the trimmed href so that whitespace surrounding the attribute
                    // value does not end up inside the absolute URL.
                    var issuePageAbsoluteUrl = $"{_domain.TrimEnd('/')}{formattedIssueText}";

                    yield return new CWMIssue(
                        volumeNumber,
                        issueNumber,
                        issueMagazine,
                        issuePageAbsoluteUrl,
                        cwmVolume);
                }
            }
        }