/// <summary>
/// Downloads the current version of one chapter and copies its details onto the local copy.
/// The download is retried for as long as the client reports that interactives are
/// temporarily unavailable. If cancellation is requested at any checkpoint the method
/// returns before the local chapter is modified.
/// </summary>
/// <param name="localChapter">
/// The locally held chapter to refresh. Its <c>Path</c> is used to build the download URL.
/// </param>
private async Task UpdateChapter(WInteractiveChapter localChapter)
{
    _log.DebugFormat("Updating chapter {0}", localChapter.Path);

    var token = _cancelTokenSource.Token;
    Uri downloadUrl = _wc.GetInteractiveChapterUrl(_story.UrlID, localChapter.Path);

    // Keep requesting the chapter until one attempt completes without the
    // "temporarily unavailable" exception, or until cancellation is requested.
    WInteractiveChapter fresh = null;
    bool downloaded = false;
    do
    {
        if (token.IsCancellationRequested)
        {
            return;
        }

        try
        {
            DoStateUpdate($"Exporting chapter: {localChapter.Path}");
            fresh = await _wc.GetInteractiveChapter(downloadUrl);
            downloaded = true;
        }
        catch (InteractivesTemporarilyUnavailableException)
        {
            _log.InfoFormat("Encountered 'interactives temporarily unavailable' page, pausing for {0}ms", ITU_PAUSE_MS);
            ITUPause("Interactives temporarily unavailable, waiting {0}s");

            // Cancellation may have been requested while pausing
            if (token.IsCancellationRequested)
            {
                return;
            }
        }
    }
    while (!downloaded);

    // Cancellation may have been requested while the download was in flight
    if (token.IsCancellationRequested)
    {
        return;
    }

    // Merge the downloaded details into the local copy and flag the story as changed
    localChapter.LastSynced = DateTime.Now;
    _story.HasChanged = true;

    localChapter.Author = fresh.Author;
    localChapter.Choices = fresh.Choices;
    localChapter.Content = fresh.Content;
    localChapter.IsEnd = fresh.IsEnd;

    // The path is only filled in when the local copy has an empty one;
    // an existing path is never overwritten.
    if (localChapter.Path == string.Empty)
    {
        localChapter.Path = fresh.Path;
    }

    localChapter.SourceChoiceTitle = fresh.SourceChoiceTitle;
    localChapter.Title = fresh.Title;
    localChapter.VersionFoundAt = fresh.VersionFoundAt;
}
// Patterns used by GetInteractiveChapter. They are constant, so they are built once
// instead of on every call (and, for the choice URL pattern, on every loop iteration).

// Chapter title: the text inside <span style="font-size:1.45em;font-weight:bold;">...</span>.
// Isn't perfect, but until the website layout changes, it'll work.
// NOTE(review): the lookahead contains one literal whitespace character between the two
// \s+ groups - confirm it is the intended character (e.g. not a non-breaking space).
private static readonly Regex InteractiveChapterTitleRegex = new Regex(
    @"(?<=<span style=""font-size:1\.45em;font-weight:bold;"">).+(?=</span>\s+ \s+</span>)",
    RegexOptions.IgnoreCase | RegexOptions.Singleline);

// The choice that led to this chapter ("This choice: <b>...</b>").
private static readonly Regex InteractiveChapterSourceChoiceRegex = new Regex(
    @"(?<=This\s*choice:\s*<b>).+(?=</b><span class=""noPrint"">)",
    RegexOptions.IgnoreCase | RegexOptions.Singleline);

// The chapter content, the actual writing.
private static readonly Regex InteractiveChapterContentRegex = new Regex(
    @"(?<=<div class="""">).+(?=</span></div></div>)",
    RegexOptions.IgnoreCase | RegexOptions.Singleline);

// The anchor describing the author, e.g.
// <a title="Username: rpcity Member Since: July 4th, 2002 Click for links!" style="...">SmittySmith</a>
private static readonly Regex InteractiveChapterAuthorChunkRegex = new Regex(
    @"<a title=""\s*Username:\s*.*?</a>",
    RegexOptions.IgnoreCase | RegexOptions.Singleline);

// NOTE(review): only ASCII letters are captured, so a username containing digits or
// punctuation would be cut short - confirm the allowed username characters.
private static readonly Regex InteractiveChapterAuthorUsernameRegex = new Regex(@"(?<=Username:\s*)[a-zA-Z]+");

// The author's display name: the text between the anchor's first '>' and the next '<'.
private static readonly Regex InteractiveChapterAuthorNameRegex = new Regex("(?<=>).+?(?=<)");

// The chunk of HTML holding all of the choices; none of the elements inside it have IDs.
// NOTE(review): confirm the single space between "<div" and "id=" matches the page markup.
private static readonly Regex InteractiveChapterChoicesChunkRegex = new Regex(
    @"(?<=<b>You've got the following choice).+(?=</div><div id=""end_choices"")",
    RegexOptions.Singleline | RegexOptions.IgnoreCase);

// One anchor per choice inside the choices chunk.
private static readonly Regex InteractiveChapterChoiceRegex = new Regex(
    @"<a .*?href="".+?"">.+?</a>",
    RegexOptions.IgnoreCase);

// The href value of a single choice anchor.
private static readonly Regex InteractiveChapterChoiceUrlRegex = new Regex(@"(?<=href="").+?(?="")");

/// <summary>
/// Downloads an interactive chapter page and parses it into a <see cref="WInteractiveChapter"/>.
/// If the response is a login page, a login is performed and the page is requested once more.
/// </summary>
/// <param name="chapterUrl">The URL of the chapter page to download.</param>
/// <returns>
/// A new chapter with its path, title, source choice title, content and author populated,
/// and either <c>IsEnd</c> set or its choices filled in.
/// </returns>
/// <exception cref="Exception">The page is still a login page after logging in.</exception>
/// <exception cref="InteractivesTemporarilyUnavailableException">
/// The page is the "interactives temporarily unavailable" page; the caller decides whether to retry.
/// </exception>
/// <exception cref="WritingClientHtmlParseException">
/// The title, content, author chunk, choices chunk or a choice URL could not be found, or the
/// source choice could not be found on a chapter whose map parameter is not "1".
/// </exception>
public async Task<WInteractiveChapter> GetInteractiveChapter(Uri chapterUrl)
{
    log.DebugFormat("Getting interactive chapter {0}", chapterUrl.AbsolutePath);

    var newChapter = new WInteractiveChapter();
    string chapterUrlParm = GetMapUrlParameter(chapterUrl.ToString());
    string chapterHtml = await HttpGetAsyncAsString(chapterUrl);

    // Detect "Login required"
    if (IsLoginPage(chapterHtml))
    {
        // We need to log in, and get the chapter again
        log.Debug("Login required while trying to get an interactive chapter");
        await LoginAsync();
        chapterHtml = await HttpGetAsyncAsString(chapterUrl);

        // Still a login page, so the login failed
        if (IsLoginPage(chapterHtml))
        {
            throw new Exception("Failed to login to retrieve interactive chapter");
        }
    }

    // Detect "Interactives temporarily unavailable".
    // Retrying is left to the caller, which makes it easier for it to support cancelling.
    if (IsInteractivesUnavailablePage(chapterHtml))
    {
        throw new InteractivesTemporarilyUnavailableException();
    }

    // Chapter title.
    // The alternative source, the "Your path to this chapter" list, is not used because it
    // fails on the first chapter, where no choices have been made yet.
    Match chapterTitleMatch = InteractiveChapterTitleRegex.Match(chapterHtml);
    if (!chapterTitleMatch.Success)
    {
        throw new WritingClientHtmlParseException($"Couldn't find the chapter title for chapter '{chapterUrl}'", chapterHtml);
    }
    string chapterTitle = HttpUtility.HtmlDecode(chapterTitleMatch.Value);

    // The choice that led to this chapter.
    // This usually has the more fleshed out title, as the legit title can sometimes be truncated.
    Match chapterSourceChoiceMatch = InteractiveChapterSourceChoiceRegex.Match(chapterHtml);
    if (!chapterSourceChoiceMatch.Success && chapterUrlParm != "1") // Missing, and it's not the first chapter
    {
        throw new WritingClientHtmlParseException($"Couldn't find the interactive chapter's source choice and this isn't the first chapter, for chapter '{chapterUrl}'", chapterHtml);
    }
    // A failed match has an empty Value, so the first chapter gets an empty source choice
    string chapterSourceChoice = HttpUtility.HtmlDecode(chapterSourceChoiceMatch.Value);

    // Chapter content, with the wrapper markup around the writing stripped out
    Match chapterContentMatch = InteractiveChapterContentRegex.Match(chapterHtml);
    if (!chapterContentMatch.Success)
    {
        throw new WritingClientHtmlParseException($"Couldn't find the content for the interactive chapter '{chapterUrl}'", chapterHtml);
    }
    string chapterContent = HttpUtility.HtmlDecode(chapterContentMatch.Value)
        .Replace(@"<span style=""font-size:1.5em;""><div><span>", "")
        .Replace(@"<div><span>", "")
        .Replace(@"</div></span>", "");

    // Author
    Match chapterAuthorChunkMatch = InteractiveChapterAuthorChunkRegex.Match(chapterHtml);
    if (!chapterAuthorChunkMatch.Success)
    {
        throw new WritingClientHtmlParseException($"Couldn't find the HTML chunk containing the author for the interactive chapter '{chapterUrl}'", chapterHtml);
    }
    string chapterAuthorChunk = chapterAuthorChunkMatch.Value;

    // Neither of these matches is checked for success: a miss yields an empty string
    string chapterAuthorUsername = InteractiveChapterAuthorUsernameRegex.Match(chapterAuthorChunk).Value;
    string chapterAuthorName = InteractiveChapterAuthorNameRegex.Match(chapterAuthorChunk).Value;

    // End chapters have no choices to parse
    if (IsInteractiveChapterEnd(chapterHtml))
    {
        newChapter.IsEnd = true;
    }
    else
    {
        // First, get the chunk of the HTML that contains the choices
        Match chapterChoicesChunkMatch = InteractiveChapterChoicesChunkRegex.Match(chapterHtml);
        if (!chapterChoicesChunkMatch.Success)
        {
            throw new WritingClientHtmlParseException($"Couldn't find the HTML chunk containing choices for interactive chapter '{chapterUrl}'", chapterHtml);
        }
        string chapterChoicesChunkHtml = chapterChoicesChunkMatch.Value;

        // Then break it down into the individual choices
        foreach (Match choiceMatch in InteractiveChapterChoiceRegex.Matches(chapterChoicesChunkHtml))
        {
            string choiceHtml = choiceMatch.Value;

            // Get the URL
            Match choiceUrlMatch = InteractiveChapterChoiceUrlRegex.Match(choiceHtml);
            if (!choiceUrlMatch.Success)
            {
                throw new WritingClientHtmlParseException($"Could not find the URL of choice '{choiceHtml}' on interactive chapter '{chapterUrl}'", chapterHtml);
            }

            // Get just the map parameter from the URL
            string choiceMapUrlParm = GetMapUrlParameter(choiceUrlMatch.Value);

            // The choice name / description is everything between the first '>' and the last '<'
            int nameStart = choiceHtml.IndexOf('>') + 1;
            int nameEnd = choiceHtml.LastIndexOf('<');
            string choiceName = HttpUtility.HtmlDecode(choiceHtml.Substring(nameStart, nameEnd - nameStart));

            newChapter.Choices.Add(new WInteractiveChapterChoice() { MapLink = choiceMapUrlParm, Name = choiceName });
        }
    }

    // Put the rest together
    newChapter.Author = new WAuthor() { Name = chapterAuthorName, Username = chapterAuthorUsername };
    newChapter.Path = chapterUrlParm;
    newChapter.Content = chapterContent;
    newChapter.SourceChoiceTitle = chapterSourceChoice;
    newChapter.Title = chapterTitle;

    return newChapter;
}
/// <summary>
/// Refreshes the story's own details and adds a placeholder chapter for every chapter URL
/// that is not yet present in the local chapter list.
/// Both downloads are retried for as long as the client reports that interactives are
/// temporarily unavailable. The method returns early at any cancellation checkpoint.
/// </summary>
private async Task UpdateStory()
{
    _log.DebugFormat("Updating story: {0}", _story.UrlID);
    var cancelToken = _cancelTokenSource.Token;

    // Download the story details
    WInteractiveStory retrievedStory;
    while (true)
    {
        // Cancellation support: checked before every attempt, so no request is started
        // once cancellation has been requested (including right after a pause).
        if (cancelToken.IsCancellationRequested)
        {
            return;
        }

        try
        {
            retrievedStory = await _wc.GetInteractive(_story.UrlID);
            break; // Got the story successfully, escape this crude retry loop
        }
        catch (InteractivesTemporarilyUnavailableException)
        {
            // This method of pausing won't work that well when it comes to cancelling.
            // It'll do for now, but something better is needed, e.g. a "pause until"
            // time that is re-checked every second.
            _log.InfoFormat("Encountered 'interactives temporarily unavailable' page, pausing for {0}ms", ITU_PAUSE_MS);
            ITUPause("Interactives temporarily unavailable, waiting {0}s");
        }
    }

    // Cancellation support
    if (cancelToken.IsCancellationRequested)
    {
        return;
    }

    // Update the story
    _story.LastSynced = DateTime.Now;

    bool detailsChanged =
        _story.Name != retrievedStory.Name
        || _story.Description != retrievedStory.Description
        || _story.ShortDescription != retrievedStory.ShortDescription;

    if (detailsChanged)
    {
        _log.DebugFormat("Updating story details");
        _story.Name = retrievedStory.Name;
        _story.ShortDescription = retrievedStory.ShortDescription;
        _story.Description = retrievedStory.Description;
        _story.LastUpdated = DateTime.Now;
        _story.HasChanged = true;
    }

    // Check if there's any new chapters that we haven't seen before
    Uri[] chapters;
    while (true)
    {
        // Cancellation support: checked before every attempt
        if (cancelToken.IsCancellationRequested)
        {
            return;
        }

        try
        {
            DoStateUpdate("Retrieving chapter list");
            chapters = await _wc.GetAllInteractiveChapterUrls(_story.UrlID);
            break; // Got what we need, lets escape this loop
        }
        catch (InteractivesTemporarilyUnavailableException)
        {
            _log.InfoFormat("Encountered 'interactives temporarily unavailable' page, pausing for {0}ms", ITU_PAUSE_MS);
            ITUPause("Interactives temporarily unavailable, waiting {0}s");
        }
    }

    foreach (Uri chapterUrl in chapters)
    {
        // Cancellation support
        if (cancelToken.IsCancellationRequested)
        {
            return;
        }

        // Do we already have this chapter?
        string chapterMapId = _wc.GetMapUrlParameter(chapterUrl.ToString());
        WInteractiveChapter localChapter = _story.Chapters.Find(c => c.Path == chapterMapId);
        if (localChapter != null)
        {
            continue;
        }

        // Add a placeholder chapter to this story's list of chapters.
        // The sync worker should see this and get the rest of the details.
        // The placeholder is stored under the same key that was just used for the lookup,
        // so the next run is guaranteed to find it instead of adding a duplicate.
        // NOTE(review): adding placeholders does not set _story.HasChanged - confirm that is intended.
        _story.Chapters.Add(new WInteractiveChapter() { Path = chapterMapId });
    }
}