// Reads the interactive story's tagline (short description).
// The page carries it in a META tag near the top of the HTML, in the form
//   <META NAME="description" content="...tagline text...">
// and the text between content=" and "> is what gets extracted.
// A page without that tag is only logged as a warning: the (empty) match value is
// still run through the same clean-up and HTML decoding calls and returned.
public string GetInteractiveStoryShortDescription(WdcResponse wdcPayload)
{
    Match taglineMatch = Regex.Match(
        wdcPayload.WebResponse,
        "(?<=<META NAME=\"description\" content=\").+?(?=\">)",
        RegexOptions.IgnoreCase);

    if (!taglineMatch.Success)
    {
        // Deliberately non-fatal: warn instead of throwing.
        log.Warn($"Couldn't find the short description for interactive story '{wdcPayload.Address}'");
    }

    string cleanedTagline = WdcUtil.CleanHtmlSymbols(taglineMatch.Value);
    return HttpUtility.HtmlDecode(cleanedTagline);
}
// Extracts the interactive story's full description: the contents of the first
// <td align=left class="norm"> cell found in the page. The match is case-insensitive
// and may span several lines (Singleline).
// Throws WritingClientHtmlParseException when no such cell is present.
public string GetInteractiveStoryDescription(WdcResponse wdcPayload)
{
    const RegexOptions descriptionOptions = RegexOptions.IgnoreCase | RegexOptions.Singleline;

    Match descriptionMatch = Regex.Match(
        wdcPayload.WebResponse,
        "(?<=<td align=left class=\"norm\">).+?(?=<\\/td>)",
        descriptionOptions);

    if (descriptionMatch.Success)
    {
        string cleanedDescription = WdcUtil.CleanHtmlSymbols(descriptionMatch.Value);
        return HttpUtility.HtmlDecode(cleanedDescription);
    }

    throw new WritingClientHtmlParseException(
        $"Couldn't find the description for interactive story '{wdcPayload.Address}'",
        wdcPayload.Address,
        wdcPayload.WebResponse);
}
// Extracts the interactive story's title from the page's <title> element, taking the
// text that precedes the " - Writing.Com" suffix.
// It is not known whether the site truncates long titles in that element.
// Throws WritingClientHtmlParseException when the <title> element does not match.
public string GetInteractiveStoryTitle(WdcResponse wdcPayload)
{
    Match titleMatch = Regex.Match(
        wdcPayload.WebResponse,
        "(?<=<title>).+?(?= - Writing\\.Com<\\/title>)",
        RegexOptions.IgnoreCase);

    if (titleMatch.Success)
    {
        string cleanedTitle = WdcUtil.CleanHtmlSymbols(titleMatch.Value);
        return HttpUtility.HtmlDecode(cleanedTitle);
    }

    throw new WritingClientHtmlParseException(
        $"Couldn't find the title for interactive story '{wdcPayload.Address}'",
        wdcPayload.Address,
        wdcPayload.WebResponse);
}
// Collects the choices offered at the end of an interactive chapter.
// The surrounding markup has no element IDs to anchor on, so this works in two passes:
//   1. cut out the HTML between the "You have the following choice(s):" heading and
//      the "end_of_choices" div;
//   2. pull every <a ... href="...">...</a> anchor out of that chunk.
// Returns null when IsInteractiveChapterEnd(payload) reports an end chapter; otherwise
// an array with one entry per anchor, in document order.
// Throws WritingClientHtmlParseException when the chunk, or an anchor's href, is missing.
public IEnumerable<WdcInteractiveChapterChoice> GetInteractiveChapterChoices(WdcResponse payload)
{
    if (IsInteractiveChapterEnd(payload))
    {
        return null;
    }

    // Pass 1: isolate the block of HTML that holds the choice links.
    Match chunkMatch = Regex.Match(
        payload.WebResponse,
        "(?<=<b>You have the following choice(s)?:<\\/b>).*?(?=<\\/div><div id=\"end_of_choices\")",
        RegexOptions.Singleline | RegexOptions.IgnoreCase);

    if (!chunkMatch.Success)
    {
        throw new WritingClientHtmlParseException(
            $"Couldn't find the HTML chunk containing choices for interactive chapter '{payload.Address}'",
            payload.Address,
            payload.WebResponse);
    }

    // Pass 2: one anchor per choice. Both patterns are fixed, so they are built once up front.
    var anchorRegex = new Regex("<a .*?href=\".+?\">.+?<\\/a>", RegexOptions.IgnoreCase);
    var hrefRegex = new Regex("(?<=href=\").+?(?=\")");

    var result = new List<WdcInteractiveChapterChoice>();

    foreach (Match anchor in anchorRegex.Matches(chunkMatch.Value))
    {
        string anchorHtml = anchor.Value;

        Match hrefMatch = hrefRegex.Match(anchorHtml);
        if (!hrefMatch.Success)
        {
            throw new WritingClientHtmlParseException(
                $"Could not find the URL of choice '{anchorHtml}' on interactive chapter '{payload.Address}'",
                payload.Address,
                payload.WebResponse);
        }

        // The label is everything after the first '>' (end of the opening tag)
        // and before the last '<' (start of the closing tag).
        int labelStart = anchorHtml.IndexOf('>') + 1;
        int labelLength = anchorHtml.LastIndexOf('<') - labelStart;

        result.Add(new WdcInteractiveChapterChoice
        {
            // Only the final URL parameter is kept as the path link.
            PathLink = WdcUtil.GetFinalParmFromUrl(hrefMatch.Value),
            Name = HttpUtility.HtmlDecode(anchorHtml.Substring(labelStart, labelLength)),
        });
    }

    return result.ToArray();
}
// Finds every chapter link for the given interactive inside the supplied page.
// A chapter link is "<pathToRoot>main/interact/item_id/<interactiveID>/map/<digits>";
// that text is passed through WdcUtil.RegexSafeUrl before being used as a
// case-insensitive regex. Returns one Uri per match, in the order the matches occur
// (repeated links are not filtered out).
public IEnumerable<Uri> GetInteractiveChapterList(string interactiveID, Uri pathToRoot, WdcResponse wdcPayload)
{
    string root = pathToRoot.ToString();
    string rawPattern = $"{root}main/interact/item_id/{interactiveID}/map/(\\d)+";

    var linkRegex = new Regex(WdcUtil.RegexSafeUrl(rawPattern), RegexOptions.IgnoreCase);
    MatchCollection found = linkRegex.Matches(wdcPayload.WebResponse);

    var links = new Uri[found.Count];
    for (int i = 0; i < links.Length; i++)
    {
        links[i] = new Uri(found[i].Value);
    }

    return links;
}
// Breaks a page title into a story name and, when present, a page name.
//
// Steps:
//   1. A null or empty title yields an untouched WdcTitleReaderResult.
//   2. Every " - writing.com" occurrence (any casing) is removed.
//   3. The remainder is split at the FIRST TITLE_SEPARATOR only:
//        - no separator: the whole trimmed title is the story name;
//        - otherwise:    "<story><separator><page>", except that the recent chapters
//                        page puts its own name first ("Recent Chapters<separator><story>"),
//                        so the two parts are swapped in that case.
//
// Only the first separator is used as the split point so that any further separators
// stay inside the second part; splitting on every separator and keeping just the first
// two pieces would silently drop the tail of such titles.
// Both parts are trimmed and passed through WdcUtil.CleanHtmlSymbols.
public WdcTitleReaderResult ReadPageTitle(string pageTitle)
{
    var r = new WdcTitleReaderResult();
    if (string.IsNullOrEmpty(pageTitle))
    {
        return r;
    }

    // Start by trimming off the " - Writing.com"
    pageTitle = Regex.Replace(pageTitle, " - writing\\.com", "", RegexOptions.IgnoreCase);

    // At most two parts: everything after the first separator is kept together.
    var pageTitleSplit = pageTitle.Split(new[] { TITLE_SEPARATOR }, 2, StringSplitOptions.None);

    if (pageTitleSplit.Length < 2)
    {
        // Didn't find the separator, it's just a simple page name
        r.StoryName = WdcUtil.CleanHtmlSymbols(pageTitle.Trim());
        return r;
    }

    // The recent chapters page uses "Recent Chapters: (story title)"
    bool backwards = pageTitleSplit[0] == "Recent Chapters";
    string storyPart = pageTitleSplit[backwards ? 1 : 0];
    string pagePart = pageTitleSplit[backwards ? 0 : 1];

    r.StoryName = WdcUtil.CleanHtmlSymbols(storyPart.Trim());
    r.PageName = WdcUtil.CleanHtmlSymbols(pagePart.Trim());
    return r;
}
// Builds a WdcInteractiveChapter from a downloaded chapter page.
//   chapterPath - must satisfy WdcUtil.IsValidChapterPath, otherwise ArgumentException.
//   payload     - the page the title, content, source choice, author and choices are read from.
// interactiveID is accepted but not used by this method body.
// Chapter "1" gets an empty source choice title; every other chapter has it read from
// the page. LastUpdated is stamped with the current local time. When no choices are
// returned (null) the chapter is flagged as an end; otherwise the choices are appended.
public WdcInteractiveChapter GetInteractiveChaper(string interactiveID, string chapterPath, WdcResponse payload)
{
    bool pathIsValid = WdcUtil.IsValidChapterPath(chapterPath);
    if (!pathIsValid)
    {
        throw new ArgumentException($"Chapter '{chapterPath}' is not a valid chapter path", nameof(chapterPath));
    }

    var result = new WdcInteractiveChapter
    {
        Path = chapterPath,
        Title = GetInteractiveChapterTitle(payload),
        Content = GetInteractiveChapterContent(payload),
    };

    if (chapterPath == "1")
    {
        // The first chapter has no source choice to read.
        result.SourceChoiceTitle = "";
    }
    else
    {
        result.SourceChoiceTitle = GetInteractiveChapterSourceChoice(payload);
    }

    result.LastUpdated = DateTime.Now;

    // TODO chapter author
    result.Author = GetInteractiveChapterAuthor(payload);

    var availableChoices = GetInteractiveChapterChoices(payload);
    if (availableChoices != null)
    {
        result.Choices.AddRange(availableChoices);
    }
    else
    {
        // A null choice list is how an end chapter is signalled.
        result.IsEnd = true;
    }

    return result;
}