Пример #1
0
        /// <summary>
        /// Downloads the page of <paramref name="cwmIssue"/> and extracts the
        /// articles listed in its product-description block.
        /// </summary>
        /// <param name="cwmIssue">
        /// Issue whose <c>IssuePageAbsoluteUrl</c> is fetched and parsed.
        /// </param>
        /// <returns>
        /// A lazily evaluated sequence of articles; the page is downloaded
        /// (synchronously, blocking the calling thread) when enumeration starts.
        /// </returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="cwmIssue"/> is <c>null</c>. Thrown immediately,
        /// not on first enumeration.
        /// </exception>
        /// <exception cref="InvalidOperationException">
        /// The downloaded page does not contain the expected element
        /// hierarchy. Thrown during enumeration.
        /// </exception>
        public static IEnumerable <CWMArticle> ScrapeIssueArticles(
            CWMIssue cwmIssue)
        {
            // Validate here, outside the iterator, so a null argument fails at
            // the call site instead of surfacing later during enumeration.
            if (cwmIssue == null)
            {
                throw new ArgumentNullException(nameof(cwmIssue));
            }

            return ScrapeIssueArticlesIterator(cwmIssue);
        }

        // Iterator half of ScrapeIssueArticles: fetches the issue page and yields one CWMArticle per listed entry.
        private static IEnumerable <CWMArticle> ScrapeIssueArticlesIterator(
            CWMIssue cwmIssue)
        {
            var context = BrowsingContext.New(
                Configuration.Default.WithDefaultLoader());

            var downloadPageUrl = cwmIssue.IssuePageAbsoluteUrl;

            using (var document = context
                                  .OpenAsync(downloadPageUrl)
                                  .GetAwaiter()
                                  .GetResult())
            {
                var contentDiv = document
                                 ?.GetElementById("canvas-wrapper")
                                 ?.QuerySelector(
                    "div#canvas > " +
                    "div#page-body-wrapper > " +
                    "div#page-body > " +
                    "div#content-wrapper > " +
                    "div#content");

                // The article list is looked up inside the third child of div#content.
                var mainContentDiv =
                    contentDiv != null && contentDiv.Children.Length > 2
                        ? contentDiv.Children[2]
                        : null;

                var productBlockContentElement = mainContentDiv
                                                 ?.QuerySelector(
                    "div#productWrapper > " +
                    "div.product-description > " +
                    "div.sqs-layout > " +
                    "div.row.sqs-row > " +
                    "div.col > " +
                    "div.sqs-block.html-block > " +
                    "div.sqs-block-content");

                if (productBlockContentElement == null)
                {
                    // Report a layout mismatch explicitly rather than letting a
                    // NullReferenceException escape from the middle of the lookup.
                    throw new InvalidOperationException(
                              $"Could not locate the article list on issue page '{downloadPageUrl}'.");
                }

                var currentArticleCategory = "Unknown";

                var magazineSections = ValueEnum
                                       .EnumerateValues <MagazineSection, string>()
                                       .ToArray();

                foreach (var productBlockElement in productBlockContentElement.Children)
                {
                    // TextContent is entity-decoded, so non-breaking spaces show up
                    // as U+00A0; the literal "&nbsp;" removal is kept for markup
                    // that was escaped twice.
                    var articleInfo = productBlockElement
                                      .TextContent
                                      .Replace("&nbsp;", "")
                                      .Replace('\u00A0', ' ')
                                      .Trim();

                    // Spacer elements carry no text and must not become articles.
                    if (articleInfo.Length == 0)
                    {
                        continue;
                    }

                    var isMagazineSection = magazineSections
                                            .Contains(
                        articleInfo,
                        new FuzzyStringMatchingComparer(2));

                    if (isMagazineSection)
                    {
                        // A section heading applies to every entry that follows it.
                        currentArticleCategory = articleInfo;
                        continue;
                    }

                    // Entries are treated as "<title> - <author>" only when there is
                    // exactly one hyphen; anything else is kept whole as the title.
                    var splitTerms = articleInfo.Split('-');

                    string articleName;
                    string articleAuthor;

                    if (splitTerms.Length == 2)
                    {
                        articleName   = splitTerms[0].Trim();
                        articleAuthor = splitTerms[1].Trim();
                    }
                    else
                    {
                        articleName   = articleInfo;
                        articleAuthor = "unknown";
                    }

                    yield return(new CWMArticle(
                                     currentArticleCategory,
                                     articleName,
                                     articleAuthor,
                                     cwmIssue));
                }
            }
        }
Пример #2
0
 /// <summary>
 /// Extension-method shortcut that scrapes the articles of this issue via
 /// <c>CWMInterpreter.ScrapeIssueArticles</c> and copies them into an array.
 /// </summary>
 /// <param name="this">Issue whose articles are requested.</param>
 /// <returns>The scraped articles as a read-only list.</returns>
 public static IReadOnlyList <CWMArticle> GetIssueArticles(
     this CWMIssue @this)
 {
     var scrapedArticles = CWMInterpreter.ScrapeIssueArticles(@this);

     // ToArray enumerates the sequence completely before returning.
     CWMArticle[] articleArray = scrapedArticles.ToArray();

     return articleArray;
 }