/// <summary>
/// Downloads the page at <paramref name="url"/>, removes its script and style elements, and
/// records the remaining text, the words in that text and (optionally) its outgoing links.
/// </summary>
/// <param name="url">Address of the page to download; also stored as the result's <c>Link</c>.</param>
/// <param name="docNumber">Number stored on every produced <c>WordInfo</c> and as the result's <c>Level</c>.</param>
/// <param name="needTakeChild">
/// When <c>true</c>, well-formed absolute links starting with "http" are collected from
/// <c>&lt;a href&gt;</c> elements; when <c>false</c>, the link list is left empty.
/// </param>
/// <returns>The populated <c>HtmlInfo</c>, which has also been appended to <c>_htmlInfos</c>.</returns>
private async Task<HtmlInfo> GetPageInfo(string url, int docNumber, bool needTakeChild)
{
    string html;

    // Dispose the response once the body has been read so its resources are released promptly.
    using (var response = await _client.GetAsync(url, HttpCompletionOption.ResponseContentRead))
    {
        html = await response.Content.ReadAsStringAsync();
    }

    var doc = new HtmlDocument();
    doc.LoadHtml(html);

    // Materialize the matches first: nodes cannot be removed while Descendants() is being enumerated.
    var nonContentNodes = doc.DocumentNode.Descendants()
        .Where(n => n.Name == "script" || n.Name == "style")
        .ToList();
    foreach (var node in nonContentNodes)
    {
        node.Remove();
    }

    // SelectNodes yields null (not an empty collection) when nothing matches, so fall back to
    // an empty list. The trimmed texts are computed once and reused for both words and content.
    var texts = doc.DocumentNode.SelectNodes("//text()")
        ?.Select(node => node.InnerText.Trim())
        .ToList()
        ?? new List<string>();

    var wordsInfo = texts
        .SelectMany(text => text.Split(' '))
        .Select(word => new string(word.Where(char.IsLetter).ToArray()))
        .Where(word => !string.IsNullOrEmpty(word))
        .Select(word => new WordInfo { Word = word, DocNumber = docNumber })
        .ToList();

    var childLinks = new List<string>();
    if (needTakeChild)
    {
        var anchors = doc.DocumentNode.SelectNodes("//a[@href]");
        if (anchors != null)
        {
            childLinks = anchors
                .Select(node => node?.Attributes["href"]?.Value)
                .Where(href => href != null
                    && Uri.IsWellFormedUriString(href, UriKind.Absolute)
                    && href.StartsWith("http", StringComparison.Ordinal))
                .Distinct()
                .ToList();
        }
    }

    var result = new HtmlInfo
    {
        Content = new ContentInfo
        {
            Content = string.Join(" ", texts),
            WordsInfo = wordsInfo
        },
        ChildLinks = childLinks,
        IsVisited = true,
        Link = url,
        Level = docNumber,
        ParentLink = null
    };

    _htmlInfos.Add(result);
    return result;
}
/// <summary>
/// Moves one page of a document to a new position and returns the document's pages rendered as HTML.
/// </summary>
/// <param name="DocumentName">File/document name; it is used as the document guid.</param>
/// <param name="CurrentPageNumber">Existing order number of the page to move.</param>
/// <param name="NewPageNumber">New order number for that page.</param>
/// <param name="DocumentPassword">Optional password; applied only when it is not null or empty.</param>
/// <returns>One <c>HtmlInfo</c> per rendered page, carrying the page's HTML content and page number.</returns>
public static List<HtmlInfo> RenderDocumentAsHtml(String DocumentName, int CurrentPageNumber, int NewPageNumber, String DocumentPassword = null)
{
    //ExStart:RenderAsHtmlAndReorderPage
    // The handler is held through its base type because ReorderPage receives it by reference.
    ViewerHandler<PageHtml> viewer = new ViewerHtmlHandler(Utilities.GetConfigurations());

    // Reorder transformation, with resources embedded in the generated HTML.
    HtmlOptions renderOptions = new HtmlOptions
    {
        Transformations = Transformation.Reorder,
        IsResourcesEmbedded = true
    };

    // A password is only supplied for protected documents.
    if (!String.IsNullOrEmpty(DocumentPassword))
    {
        renderOptions.Password = DocumentPassword;
    }

    // Apply the page move before the pages are requested.
    Utilities.PageTransformations.ReorderPage(ref viewer, DocumentName, CurrentPageNumber, NewPageNumber);

    // Cast back to the HTML handler to fetch the pages.
    List<PageHtml> renderedPages = ((ViewerHtmlHandler)viewer).GetPages(DocumentName, renderOptions);

    // Project every rendered page onto the HtmlInfo shape returned to callers.
    return renderedPages.ConvertAll(rendered => new HtmlInfo
    {
        HtmlContent = rendered.HtmlContent,
        PageNmber = rendered.PageNumber
    });
    //ExEnd:RenderAsHtmlAndReorderPage
}
/// <summary>
/// Applies a rotation to a page of a document and returns the document's pages rendered as HTML.
/// </summary>
/// <param name="DocumentName">File/document name; it is used as the document guid.</param>
/// <param name="pageNumber">Page number passed to the rotate transformation.</param>
/// <param name="RotationAngle">Rotation angle passed to the rotate transformation.</param>
/// <param name="DocumentPassword">Optional password; applied only when it is not null or empty.</param>
/// <returns>One <c>HtmlInfo</c> per rendered page, carrying the page's HTML content and page number.</returns>
public static List<HtmlInfo> RotateDocumentAsHtml(String DocumentName, int pageNumber, int RotationAngle, String DocumentPassword = null)
{
    //ExStart:RenderAsImageWithRotationTransformation
    // The HTML handler is held through its base type because RotatePages receives it by reference.
    ViewerHandler<PageHtml> viewer = new ViewerHtmlHandler(Utilities.GetConfigurations());

    // HTML options carrying the Rotate transformation.
    HtmlOptions renderOptions = new HtmlOptions { Transformations = Transformation.Rotate };

    // A password is only supplied for protected documents.
    if (!String.IsNullOrEmpty(DocumentPassword))
    {
        renderOptions.Password = DocumentPassword;
    }

    // Apply the rotation before the pages are requested.
    Utilities.PageTransformations.RotatePages(ref viewer, DocumentName, pageNumber, RotationAngle);

    // Cast back to the HTML handler and fetch the pages in HTML form.
    ViewerHtmlHandler htmlViewer = (ViewerHtmlHandler)viewer;
    List<PageHtml> renderedPages = htmlViewer.GetPages(DocumentName, renderOptions);

    List<HtmlInfo> result = new List<HtmlInfo>();
    foreach (PageHtml rendered in renderedPages)
    {
        result.Add(new HtmlInfo
        {
            HtmlContent = rendered.HtmlContent,
            PageNmber = rendered.PageNumber
        });
    }

    return result;
    //ExEnd:RenderAsImageWithRotationTransformation
}
/// <summary>
/// Renders a document's pages as HTML with a text watermark applied.
/// The watermark is always placed at <c>WatermarkPosition.Diagonal</c>.
/// </summary>
/// <param name="DocumentName">File/document name; it is used as the document guid.</param>
/// <param name="WatermarkText">Watermark text.</param>
/// <param name="WatermarkColor">Watermark color (<c>System.Drawing.Color</c>).</param>
/// <param name="WatermarkWidth">Width of the watermark as an integer; optional, defaults to 100.</param>
/// <param name="DocumentPassword">Optional password; applied only when it is not null or empty.</param>
/// <returns>One <c>HtmlInfo</c> per rendered page, carrying the page's HTML content and page number.</returns>
public static List<HtmlInfo> RenderDocumentAsHtml(String DocumentName, String WatermarkText, Color WatermarkColor, int WatermarkWidth = 100, String DocumentPassword = null)
{
    //ExStart:RenderAsHtmlWithWaterMark
    ViewerHtmlHandler viewer = new ViewerHtmlHandler(Utilities.GetConfigurations());

    // Resources are not embedded in the HTML for this rendering.
    HtmlOptions renderOptions = new HtmlOptions { IsResourcesEmbedded = false };

    // A password is only supplied for protected documents.
    if (!String.IsNullOrEmpty(DocumentPassword))
    {
        renderOptions.Password = DocumentPassword;
    }

    // AddWatermark receives the options by reference and configures the watermark on them.
    Utilities.PageTransformations.AddWatermark(ref renderOptions, WatermarkText, WatermarkColor, WatermarkPosition.Diagonal, WatermarkWidth);

    List<PageHtml> renderedPages = viewer.GetPages(DocumentName, renderOptions);

    // Project every rendered page onto the HtmlInfo shape returned to callers.
    return renderedPages.ConvertAll(rendered => new HtmlInfo
    {
        HtmlContent = rendered.HtmlContent,
        PageNmber = rendered.PageNumber
    });
    //ExEnd:RenderAsHtmlWithWaterMark
}
/// <summary>
/// Renders a document's pages as HTML with resources embedded and no transformation applied.
/// </summary>
/// <param name="DocumentName">File name; it is used as the document guid.</param>
/// <param name="DocumentPassword">Optional password; applied only when it is not null or empty.</param>
/// <returns>One <c>HtmlInfo</c> per rendered page, carrying the page's HTML content and page number.</returns>
public static List<HtmlInfo> RenderDocumentAsHtml(String DocumentName, String DocumentPassword = null)
{
    //ExStart:RenderAsHtml
    ViewerHtmlHandler viewer = new ViewerHtmlHandler(Utilities.GetConfigurations());

    // Ask for HTML pages with their resources embedded.
    HtmlOptions renderOptions = new HtmlOptions { IsResourcesEmbedded = true };

    // A password is only supplied for protected documents.
    if (!String.IsNullOrEmpty(DocumentPassword))
    {
        renderOptions.Password = DocumentPassword;
    }

    List<PageHtml> renderedPages = viewer.GetPages(DocumentName, renderOptions);

    List<HtmlInfo> result = new List<HtmlInfo>();
    foreach (PageHtml rendered in renderedPages)
    {
        result.Add(new HtmlInfo
        {
            HtmlContent = rendered.HtmlContent,
            PageNmber = rendered.PageNumber
        });
    }

    return result;
    //ExEnd:RenderAsHtml
}
/// <summary>
/// Splits an HTML fragment into a title and the remaining content.
/// The title is taken from the first h1/h2/h3 element found. That element is removed from the
/// content only when it is the very first child of the document node.
/// </summary>
/// <param name="contentHtml">HTML markup to split.</param>
/// <returns>
/// An <c>HtmlInfo</c> whose <c>Title</c> is the decoded heading text, whose <c>RawTitle</c> is the
/// heading's outer HTML (or an empty string when no heading was removed), and whose
/// <c>Content</c> is the outer HTML of what remains.
/// </returns>
private static HtmlInfo SeparateHtml(string contentHtml)
{
    var result = new HtmlInfo();

    var html = new HtmlDocument();
    html.LoadHtml(contentHtml);
    var root = html.DocumentNode;

    // TODO: how to get TITLE
    // InnerText in HtmlAgilityPack is not decoded (should be a bug), so decode it explicitly.
    var heading = root.SelectSingleNode("//h1|//h2|//h3");
    result.Title = StringHelper.HtmlDecode(heading?.InnerText);

    // Only a heading that opens the document is lifted out of the content.
    var headingLeadsDocument = heading != null && root.FirstChild == heading;
    if (!headingLeadsDocument)
    {
        result.RawTitle = string.Empty;
    }
    else
    {
        result.RawTitle = heading.OuterHtml;
        heading.Remove();
    }

    // Read the remaining markup after any heading removal above.
    result.Content = root.OuterHtml;
    return result;
}