/// <summary>
/// Collects the URL of every <c>img</c> element inside the article content container
/// and returns them as a single comma-separated string.
/// </summary>
/// <param name="document">Parsed HTML document to search.</param>
/// <param name="config">Crawler settings; only <c>ContentSelector</c> is read here.</param>
/// <returns>
/// The comma-separated image URLs (blank URLs are skipped, so the result may be an
/// empty string), or <c>null</c> when no content selector is configured, the content
/// container cannot be found, or the container holds no <c>img</c> elements.
/// </returns>
private string GetImageUrls(HtmlDocument document, PreviewArticleBindingModel config)
{
    // Same guard the other Get* helpers apply before querying with a configured selector.
    if (config.ContentSelector == null)
    {
        return null;
    }

    // Find images inside the content container only.
    var contentElement = document.DocumentNode.QuerySelector(config.ContentSelector);
    if (contentElement == null)
    {
        return null;
    }

    var imgQuery = contentElement.QuerySelectorAll("img");
    if (imgQuery == null)
    {
        return null;
    }

    // Materialize once so the selector query is not evaluated again for the emptiness check.
    var imgEls = imgQuery.ToList();
    if (imgEls.Count == 0)
    {
        return null;
    }

    // NOTE(review): assumes GetImgUrl returns a string that may be null/empty when an
    // image has no usable source - confirm against its definition.
    var urls = imgEls
        .Select(imgEl => GetImgUrl(imgEl))
        .Where(url => !string.IsNullOrEmpty(url));

    // string.Join places separators only between items, so no trailing comma to trim.
    return string.Join(",", urls);
}
/// <summary>
/// Loads the page at <c>model.Link</c> and returns a JSON preview made of the title,
/// description and content extracted by the Get* helpers using the selectors in
/// <paramref name="model"/>.
/// </summary>
/// <param name="model">Link to preview plus the selectors used for extraction.</param>
/// <returns>
/// 400 Bad Request when the model is missing or fails validation; 500 Internal Server
/// Error with the message "can not get html document" when loading the page throws;
/// otherwise the preview view model serialized as JSON.
/// </returns>
public IHttpActionResult PreviewArticle(PreviewArticleBindingModel model)
{
    Debug.WriteLine("Hello preview article");

    // An empty request body can bind to null without registering a ModelState error,
    // so check explicitly instead of hitting a NullReferenceException on model.Link.
    if (model == null)
    {
        return BadRequest("Request body is required.");
    }

    if (!ModelState.IsValid)
    {
        return BadRequest(ModelState);
    }

    HtmlDocument document = null;
    var url = model.Link;
    try
    {
        document = _htmlWeb.Load(url);
    }
    catch (Exception err)
    {
        Console.WriteLine("LOAD HTML DOC FAILED: " + err.Message);
        return Content(HttpStatusCode.InternalServerError, "can not get html document");
    }

    // Title and description are read before GetContent, which removes and rewrites
    // nodes in the same document.
    var title = GetTitle(document, model);
    var description = GetDescription(document, model);
    var content = GetContent(document, model);

    var viewModel = new PreviewArticleViewModel()
    {
        Content = content,
        Description = description,
        Title = title
    };
    return Json(viewModel);
}
/// <summary>
/// Produces the article body HTML. Elements matching <c>RemovalSelector</c> (when one
/// is set) are removed from the document, then each non-text direct child of the
/// content container is processed: paragraphs get the single class
/// <c>article-paragraph</c>, and children that contain an <c>img</c> are handed to
/// <c>GenerateHtmlForImg</c>.
/// </summary>
/// <param name="document">Parsed HTML document; it is modified in place.</param>
/// <param name="config">Supplies <c>ContentSelector</c> and <c>RemovalSelector</c>.</param>
/// <returns>
/// The inner HTML of the content container, or <c>null</c> when no content selector
/// is configured or the container is not found.
/// </returns>
private string GetContent(HtmlDocument document, PreviewArticleBindingModel config)
{
    if (config.ContentSelector == null)
    {
        return null;
    }

    if (!string.IsNullOrEmpty(config.RemovalSelector))
    {
        // Snapshot the matches first: nodes are removed from the document while looping.
        var unwantedNodes = document.DocumentNode.QuerySelectorAll(config.RemovalSelector).ToList();
        foreach (var unwanted in unwantedNodes)
        {
            unwanted.Remove();
        }
    }

    var container = document.DocumentNode.QuerySelector(config.ContentSelector);
    if (container == null)
    {
        return null;
    }

    // Iterate over a copy of the direct children rather than the live collection.
    foreach (var child in container.ChildNodes.ToList())
    {
        var tagName = child.Name;

        // Text nodes are left untouched.
        if (tagName.Equals("#text"))
        {
            continue;
        }

        // Paragraphs: drop existing classes and apply the article paragraph class.
        if (tagName.Equals("p"))
        {
            child.RemoveClass();
            child.AddClass("article-paragraph");
        }

        // Children that contain an image: pass the image URL and caption to the
        // HTML generator together with the child node.
        var image = child.QuerySelector("img");
        if (image == null)
        {
            continue;
        }

        var imageUrl = GetImgUrl(image);
        var imageCaption = GetImgCaption(child);
        GenerateHtmlForImg(imageUrl, imageCaption, child);
    }

    return container.InnerHtml;
}
/// <summary>
/// Reads the article description: the trimmed inner text of the first element
/// matching <c>DescriptionSelector</c>.
/// </summary>
/// <param name="document">Parsed HTML document to search.</param>
/// <param name="config">Supplies <c>DescriptionSelector</c>.</param>
/// <returns>
/// The trimmed text, or <c>null</c> when no selector is configured or nothing matches.
/// </returns>
private string GetDescription(HtmlDocument document, PreviewArticleBindingModel config)
{
    var selector = config.DescriptionSelector;
    if (selector == null)
    {
        return null;
    }

    var match = document.DocumentNode.QuerySelector(selector);
    return match == null ? null : match.InnerText.Trim();
}
/// <summary>
/// Content-crawler helper that reads the article title: the trimmed inner text of the
/// first element matching <c>TitleSelector</c>.
/// </summary>
/// <param name="document">Parsed HTML document to search.</param>
/// <param name="config">Supplies <c>TitleSelector</c>.</param>
/// <returns>
/// The trimmed text, or <c>null</c> when no selector is configured or nothing matches.
/// </returns>
private string GetTitle(HtmlDocument document, PreviewArticleBindingModel config)
{
    var selector = config.TitleSelector;
    if (selector == null)
    {
        return null;
    }

    var match = document.DocumentNode.QuerySelector(selector);
    return match == null ? null : match.InnerText.Trim();
}