/// <summary>
/// Locates the title and body regions of the page at <paramref name="postUrl"/> and
/// generates an editing template from them.
/// </summary>
/// <param name="regionLocator">Strategy used to locate the post regions.</param>
/// <param name="progress">Progress host that receives status updates.</param>
/// <param name="postUrl">URL handed to the region locator.</param>
/// <returns>
/// The generated template, or <c>null</c> when only a linked title (and no body) was found.
/// In the latter case <c>_nextTryPostUrl</c> is set to the URL of the post the title links to.
/// </returns>
private BlogEditingTemplate ParseBlogPostIntoTemplate(BlogPostRegionLocatorStrategy regionLocator, IProgressHost progress, string postUrl)
{
    progress.UpdateProgress(Res.Get(StringId.ProgressCreatingEditingTemplate));

    BlogPostRegions regions = regionLocator.LocateRegionsOnUIThread(progress, postUrl);
    IHTMLElement primaryTitleRegion = GetPrimaryEditableTitleElement(regions.BodyRegion, regions.Document, regions.TitleRegions);

    // IF
    //   - primaryTitleRegion is not null (title found)
    //   - BodyRegion is null (no post body found)
    //   - AND primaryTitleRegion is a link
    // The tag name is compared ordinally so the check does not depend on the current culture.
    if (primaryTitleRegion != null
        && regions.BodyRegion == null
        && string.Equals(primaryTitleRegion.tagName, "a", StringComparison.OrdinalIgnoreCase))
    {
        // Title region was detected, but body region was not.
        // It is possible that only titles are shown on the homepage.
        // Try requesting the post itself, and loading regions from the post itself.
        string newPostUrl = null;

        var titleAnchor = primaryTitleRegion as IHTMLAnchorElement;
        string titleHref = titleAnchor != null ? titleAnchor.href : null;
        if (!string.IsNullOrEmpty(titleHref))
        {
            // HACK Somewhere the 'about:' protocol replaces http/https, replace it again with the correct protocol
            var pathMatch = Regex.Match(titleHref, "^about:(.*)$");
            Debug.Assert(pathMatch.Success); // Assert that this URL is to the format we expect

            if (pathMatch.Success)
            {
                var newPostPath = pathMatch.Groups[1].Value; // Grab the path from the URL
                var homepageUri = new Uri(_blogHomepageUrl);

                // Authority (rather than Host) keeps a non-default port such as ":8080".
                newPostUrl = $"{homepageUri.Scheme}://{homepageUri.Authority}{newPostPath}";
            }
            else
            {
                // The href did not have the 'about:' prefix; use it directly when it is
                // already an absolute http(s) URL instead of retrying the bare homepage.
                Uri absoluteHref;
                if (Uri.TryCreate(titleHref, UriKind.Absolute, out absoluteHref)
                    && (absoluteHref.Scheme == Uri.UriSchemeHttp || absoluteHref.Scheme == Uri.UriSchemeHttps))
                {
                    newPostUrl = absoluteHref.AbsoluteUri;
                }
            }
        }

        if (newPostUrl != null)
        {
            // Set the NextTryPostUrl flag in the region locater.
            // This will indicate to the other thread that another page should be parsed.
            _nextTryPostUrl = newPostUrl;
            return null;
        }

        // No usable post URL could be derived from the link; fall through and build the
        // template from whatever regions were found.
    }

    BlogEditingTemplate template = GenerateBlogTemplate((IHTMLDocument3)regions.Document, primaryTitleRegion, regions.TitleRegions, regions.BodyRegion);
    progress.UpdateProgress(100, 100);
    return template;
}
/// <summary>
/// Locates the post regions through <paramref name="regionLocator"/> and turns them into
/// an editing template.
/// </summary>
/// <param name="regionLocator">Strategy that finds the title and body regions.</param>
/// <param name="progress">Progress host that receives status updates.</param>
/// <returns>The template produced by <c>GenerateBlogTemplate</c>.</returns>
private BlogEditingTemplate ParseBlogPostIntoTemplate(BlogPostRegionLocatorStrategy regionLocator, IProgressHost progress)
{
    // Announce the current step before the region search starts.
    string statusText = Res.Get(StringId.ProgressCreatingEditingTemplate);
    progress.UpdateProgress(statusText);

    BlogPostRegions located = regionLocator.LocateRegionsOnUIThread(progress);

    // Choose the title element that the template will treat as the editable one.
    IHTMLElement editableTitle = GetPrimaryEditableTitleElement(
        located.BodyRegion,
        located.Document,
        located.TitleRegions);

    BlogEditingTemplate result = GenerateBlogTemplate(
        (IHTMLDocument3)located.Document,
        editableTitle,
        located.TitleRegions,
        located.BodyRegion);

    // Report completion.
    progress.UpdateProgress(100, 100);
    return result;
}
/// <summary>
/// Loads the cached blog homepage into an HTML document and locates the title and body
/// regions of the most recent post by searching the document body for the known title
/// and body text.
/// </summary>
/// <param name="progress">Progress host; polled for cancellation while the document loads.</param>
/// <returns>The located title regions, the scrubbed body region and the parsed document.</returns>
/// <exception cref="BlogClientAbortGettingTemplateException">
/// Thrown when <c>IsUsingDynamicTemplate</c> reports a dynamic template for the document.
/// </exception>
/// <exception cref="Exception">
/// Thrown when no title or body match is found, or when the first body match is smart content.
/// </exception>
public override BlogPostRegions LocateRegionsOnUIThread(IProgressHost progress)
{
    // Upper bound on how long to pump messages while waiting for the document to load.
    const double loadTimeoutMilliseconds = 10000;

    blogHomepageContents.Seek(0, SeekOrigin.Begin);
    IHTMLDocument2 doc2 = HTMLDocumentHelper.GetHTMLDocumentFromStream(blogHomepageContents, _blogHomepageUrl);

    // Ensure that the document is fully loaded.
    // If it is not fully loaded, then viewing its current style is non-deterministic.
    // Elapsed time is measured in UTC so that a daylight-saving or time-zone shift of the
    // local clock cannot shorten or lengthen the timeout.
    DateTime startedDoingEvents = DateTime.UtcNow;
    while (!progress.CancelRequested && !HTMLDocumentHelper.IsReady(doc2))
    {
        if (DateTime.UtcNow.Subtract(startedDoingEvents).TotalMilliseconds > loadTimeoutMilliseconds)
        {
            // Timing out here is not fatal.
            Trace.WriteLine("Timed out while loading blog homepage for theme detection.");
            break;
        }

        Application.DoEvents();
    }

    // The Google/Blogger dynamic templates load the pages dynamically using Ajax, so we don't have any template to use.
    if (IsUsingDynamicTemplate(doc2))
    {
        throw new BlogClientAbortGettingTemplateException();
    }

    IHTMLElement[] titles = FindText(_titleText, doc2.body);
    IHTMLElement[] bodies = FindText(_bodyText, doc2.body);
    if (titles.Length == 0 || bodies.Length == 0)
    {
        throw new Exception("Unable to locate blog post elements using most recent post");
    }

    if (IsSmartContent(bodies[0]))
    {
        throw new Exception("Most recent post is smart content");
    }

    BlogPostRegions regions = new BlogPostRegions();
    regions.TitleRegions = titles;

    // Scrub the post body element to avoid improperly including extraneous parent elements.
    regions.BodyRegion = ScrubPostBodyRegionParentElements(bodies[0]);
    regions.Document = doc2;

    progress.UpdateProgress(100, 100);
    return regions;
}
/// <summary>
/// Parses the HTML in <paramref name="stream"/> and locates the elements that contain the
/// temporary post's title and body marker text.
/// </summary>
/// <param name="stream">Stream holding the HTML of the page to parse.</param>
/// <param name="postSourceUrl">URL passed to the document loader together with the stream.</param>
/// <param name="progress">Progress host that receives status updates.</param>
/// <returns>The document together with its title regions and body region.</returns>
/// <exception cref="Exception">Thrown when no element containing the title marker is found.</exception>
private BlogPostRegions ParseBlogPostIntoTemplate(Stream stream, string postSourceUrl, IProgressHost progress)
{
    progress.UpdateProgress(Res.Get(StringId.ProgressCreatingEditingTemplate));

    // Parse the document that will be used to create the blog template.
    IHTMLDocument2 doc2 = HTMLDocumentHelper.GetHTMLDocumentFromStream(stream, postSourceUrl);
    IHTMLDocument3 doc = (IHTMLDocument3)doc2;

    IHTMLElement[] titleMatches = HTMLDocumentHelper.FindElementsContainingText(doc2, TEMPORARY_POST_TITLE_GUID);
    IHTMLElement bodyRegion = HTMLDocumentHelper.FindElementContainingText(doc2, TEMPORARY_POST_BODY_GUID);

    bool bodyIsParagraph = bodyRegion != null && bodyRegion.tagName == "P";
    if (bodyIsParagraph)
    {
        // The body element is the <p> we planted, so swap it for a DIV, which is the
        // safest element to act as the parent of all post content.
        IHTMLElement replacementDiv = doc2.createElement("div");
        IHTMLDOMNode parentNode = bodyRegion.parentElement as IHTMLDOMNode;
        parentNode.replaceChild(replacementDiv as IHTMLDOMNode, bodyRegion as IHTMLDOMNode);
        bodyRegion = replacementDiv;
    }

    // Without at least one title match the test post cannot be used.
    if (titleMatches.Length <= 0)
    {
        throw new Exception("unable to access test post.");
    }

    // Every title match is handed back; the body is the (possibly replaced) marker element.
    BlogPostRegions regions = new BlogPostRegions();
    regions.Document = (IHTMLDocument)doc;
    regions.TitleRegions = titleMatches;
    regions.BodyRegion = bodyRegion;

    progress.UpdateProgress(100, 100);
    return regions;
}
/// <summary>
/// Loads the cached blog homepage into an HTML document and locates the title and body
/// regions of the most recent post by searching the document body for the known title
/// and body text.
/// </summary>
/// <param name="progress">Progress host; polled for cancellation while the document loads.</param>
/// <returns>The located title regions, the scrubbed body region and the parsed document.</returns>
/// <exception cref="Exception">
/// Thrown when no title or body match is found, or when the first body match is smart content.
/// </exception>
public override BlogPostRegions LocateRegionsOnUIThread(IProgressHost progress)
{
    // Upper bound on how long to pump messages while waiting for the document to load.
    const double loadTimeoutMilliseconds = 10000;

    blogHomepageContents.Seek(0, SeekOrigin.Begin);
    IHTMLDocument2 doc2 = HTMLDocumentHelper.GetHTMLDocumentFromStream(blogHomepageContents, _blogHomepageUrl);

    // Ensure that the document is fully loaded.
    // If it is not fully loaded, then viewing its current style is non-deterministic.
    // Elapsed time is measured in UTC so that a daylight-saving or time-zone shift of the
    // local clock cannot shorten or lengthen the timeout.
    DateTime startedDoingEvents = DateTime.UtcNow;
    while (!progress.CancelRequested && !HTMLDocumentHelper.IsReady(doc2))
    {
        if (DateTime.UtcNow.Subtract(startedDoingEvents).TotalMilliseconds > loadTimeoutMilliseconds)
        {
            // Timing out here is not fatal.
            Trace.WriteLine("Timed out while loading blog homepage for theme detection.");
            break;
        }

        Application.DoEvents();
    }

    IHTMLElement[] titles = FindText(_titleText, doc2.body);
    IHTMLElement[] bodies = FindText(_bodyText, doc2.body);
    if (titles.Length == 0 || bodies.Length == 0)
    {
        throw new Exception("Unable to locate blog post elements using most recent post");
    }

    if (IsSmartContent(bodies[0]))
    {
        throw new Exception("Most recent post is smart content");
    }

    BlogPostRegions regions = new BlogPostRegions();
    regions.TitleRegions = titles;

    // Scrub the post body element to avoid improperly including extraneous parent elements.
    regions.BodyRegion = ScrubPostBodyRegionParentElements(bodies[0]);
    regions.Document = doc2;

    progress.UpdateProgress(100, 100);
    return regions;
}
/// <summary>
/// Builds the post regions for the page read from <paramref name="stream"/> by searching it
/// for the temporary title and body marker text.
/// </summary>
/// <param name="stream">Stream holding the HTML of the page to parse.</param>
/// <param name="postSourceUrl">URL passed to the document loader together with the stream.</param>
/// <param name="progress">Progress host that receives status updates.</param>
/// <returns>The regions describing the document, its title elements and its body element.</returns>
/// <exception cref="Exception">Thrown when the title marker text is not found in the document.</exception>
private BlogPostRegions ParseBlogPostIntoTemplate(Stream stream, string postSourceUrl, IProgressHost progress)
{
    string statusText = Res.Get(StringId.ProgressCreatingEditingTemplate);
    progress.UpdateProgress(statusText);

    // Load the page; the IHTMLDocument3 view is obtained up front, before any searching.
    IHTMLDocument2 htmlDocument = HTMLDocumentHelper.GetHTMLDocumentFromStream(stream, postSourceUrl);
    IHTMLDocument3 htmlDocument3 = (IHTMLDocument3)htmlDocument;

    IHTMLElement[] titleHits = HTMLDocumentHelper.FindElementsContainingText(htmlDocument, TEMPORARY_POST_TITLE_GUID);
    IHTMLElement postBody = HTMLDocumentHelper.FindElementContainingText(htmlDocument, TEMPORARY_POST_BODY_GUID);

    if (postBody != null)
    {
        if (postBody.tagName == "P")
        {
            // The marker was found in the <p> we planted: substitute a DIV for it, since a
            // DIV is the safest parent for all post content.
            IHTMLElement container = htmlDocument.createElement("div");
            IHTMLDOMNode owner = postBody.parentElement as IHTMLDOMNode;
            owner.replaceChild(container as IHTMLDOMNode, postBody as IHTMLDOMNode);
            postBody = container;
        }
    }

    // No title marker means the test post could not be read back.
    bool titleFound = titleHits.Length > 0;
    if (!titleFound)
    {
        throw new Exception("unable to access test post.");
    }

    BlogPostRegions regions = new BlogPostRegions
    {
        Document = (IHTMLDocument)htmlDocument3,
        TitleRegions = titleHits,
        BodyRegion = postBody
    };

    progress.UpdateProgress(100, 100);
    return regions;
}