예제 #1
0
        /// <summary>
        /// Converts the Word document at <paramref name="filePath"/> into an HTML fragment and
        /// applies the requested clean-up steps to it.
        /// </summary>
        /// <remarks>
        /// The document is rendered by <c>WordDntb.buildWord</c> into a temporary <c>.html</c>
        /// file whose <c>body</c> content becomes the working fragment. The temporary HTML file
        /// is removed whether or not the conversion succeeds. The source document at
        /// <paramref name="filePath"/> is deleted only after all processing steps have run.
        /// Any exception is logged through <c>LogUtils.AddErrorLog</c> and turned into an empty result.
        /// </remarks>
        /// <param name="siteId">Id passed to <c>SiteManager.GetSiteInfo</c>; only used when <paramref name="isClearImages"/> is false, to resolve where extracted images are uploaded.</param>
        /// <param name="filePath">Path of the Word document to convert. A null or empty value yields an empty result.</param>
        /// <param name="isClearFormat">When true, the fragment is passed through <c>HtmlClearUtils.ClearFormat</c>.</param>
        /// <param name="isFirstLineIndent">When true, the fragment is passed through <c>HtmlClearUtils.FirstLineIndent</c>.</param>
        /// <param name="isClearFontSize">When true, the fragment is passed through <c>HtmlClearUtils.ClearFontSize</c>.</param>
        /// <param name="isClearFontFamily">When true, the fragment is passed through <c>HtmlClearUtils.ClearFontFamily</c>.</param>
        /// <param name="isClearImages">When true, <c>img</c> tags are stripped; otherwise referenced images found among the temporary files are moved to the site's upload directory and their references rewritten.</param>
        /// <returns>The trimmed HTML fragment, or <see cref="string.Empty"/> when the path is empty or the conversion fails.</returns>
        public static string Parse(int siteId, string filePath, bool isClearFormat, bool isFirstLineIndent, bool isClearFontSize, bool isClearFontFamily, bool isClearImages)
        {
            if (string.IsNullOrEmpty(filePath))
            {
                return string.Empty;
            }

            var filename = PathUtils.GetFileNameWithoutExtension(filePath);

            try
            {
                // Location where the converted HTML document is saved.
                var saveFilePath = PathUtils.GetTemporaryFilesPath(filename + ".html");

                try
                {
                    FileUtils.DeleteFileIfExists(saveFilePath);
                    WordDntb.buildWord(filePath, saveFilePath);

                    var parsedContent = FileUtils.ReadText(saveFilePath, System.Text.Encoding.Default);
                    parsedContent = RegexUtils.GetInnerContent("body", parsedContent);

                    if (isClearFormat)
                    {
                        parsedContent = HtmlClearUtils.ClearFormat(parsedContent);
                    }

                    if (isFirstLineIndent)
                    {
                        parsedContent = HtmlClearUtils.FirstLineIndent(parsedContent);
                    }

                    if (isClearFontSize)
                    {
                        parsedContent = HtmlClearUtils.ClearFontSize(parsedContent);
                    }

                    if (isClearFontFamily)
                    {
                        parsedContent = HtmlClearUtils.ClearFontFamily(parsedContent);
                    }

                    if (isClearImages)
                    {
                        parsedContent = StringUtils.StripTags(parsedContent, "img");
                    }
                    else
                    {
                        var siteInfo = SiteManager.GetSiteInfo(siteId);
                        var imageFileNameArrayList = RegexUtils.GetOriginalImageSrcs(parsedContent);
                        if (imageFileNameArrayList != null && imageFileNameArrayList.Count > 0)
                        {
                            foreach (var imageFileName in imageFileNameArrayList)
                            {
                                var imageFilePath       = PathUtils.GetTemporaryFilesPath(imageFileName);
                                var fileExtension       = PathUtils.GetExtension(imageFilePath);
                                var uploadDirectoryPath = PathUtility.GetUploadDirectoryPath(siteInfo, fileExtension);
                                var uploadDirectoryUrl  = PageUtility.GetSiteUrlByPhysicalPath(siteInfo, uploadDirectoryPath, true);

                                // Only images that actually exist among the temporary files are relocated.
                                if (!FileUtils.IsFileExists(imageFilePath))
                                {
                                    continue;
                                }

                                var uploadFileName = PathUtility.GetUploadFileName(siteInfo, imageFilePath);
                                var destFilePath   = PathUtils.Combine(uploadDirectoryPath, uploadFileName);
                                FileUtils.MoveFile(imageFilePath, destFilePath, false);
                                parsedContent = parsedContent.Replace(imageFileName, PageUtils.Combine(uploadDirectoryUrl, uploadFileName));

                                // Kept after the move: removes the temporary image if MoveFile left it behind.
                                FileUtils.DeleteFileIfExists(imageFilePath);
                            }
                        }
                    }

                    FileUtils.DeleteFileIfExists(filePath);
                    return parsedContent.Trim();
                }
                finally
                {
                    // Remove the intermediate HTML file on every path so a failed conversion
                    // does not leave it behind among the temporary files.
                    FileUtils.DeleteFileIfExists(saveFilePath);
                }
            }
            catch (Exception ex)
            {
                LogUtils.AddErrorLog(ex);
                return string.Empty;
            }
        }
예제 #2
0
        /// <summary>
        /// Converts a .docx file to an HTML fragment and derives a title for it.
        /// </summary>
        /// <param name="docxFilePath">Path of the Word document to convert.</param>
        /// <param name="settings">
        /// Conversion options read by this method: image handling (<c>IsClearImages</c>,
        /// <c>ImageDirectoryPath</c>, <c>ImageDirectoryUrl</c>), optional saving of the full HTML
        /// (<c>IsSaveHtml</c>, <c>HtmlDirectoryPath</c>), title extraction (<c>IsFirstLineTitle</c>,
        /// <c>IsFirstLineRemove</c>) and clean-up switches (<c>IsClearFormat</c>,
        /// <c>IsFirstLineIndent</c>, <c>IsClearFontSize</c>, <c>IsClearFontFamily</c>).
        /// </param>
        /// <returns>
        /// <c>title</c>: the text of the first paragraph when <c>IsFirstLineTitle</c> is set and that
        /// text is non-empty; otherwise the document's <c>dc:title</c> core property, or the file path
        /// when there is none, passed through <c>PathUtils.GetFileNameWithoutExtension</c>. If the
        /// result is empty, the file name of <paramref name="docxFilePath"/> without extension is used.
        /// <c>content</c>: the generated <c>style</c> element followed by a new line and the inner HTML
        /// of <c>body</c>, after the enabled clean-up steps.
        /// </returns>
        /// <remarks>
        /// Exceptions raised while reading or converting the document are not caught here.
        /// An image that cannot be saved (<c>ExternalException</c>) or whose content type is not
        /// recognised is left out of the output.
        /// </remarks>
        public static (string title, string content) ConvertToHtml(string docxFilePath, ConverterSettings settings)
        {
            string title;
            string content;
            var    fi = new FileInfo(docxFilePath);

            var byteArray = File.ReadAllBytes(fi.FullName);

            // The document is opened from an in-memory copy, so the file on disk is not
            // held open by the read/write package.
            using (var memoryStream = new MemoryStream())
            {
                memoryStream.Write(byteArray, 0, byteArray.Length);
                using (var wDoc = WordprocessingDocument.Open(memoryStream, true))
                {
                    title = fi.FullName;
                    var part = wDoc.CoreFilePropertiesPart;
                    if (part != null)
                    {
                        title = (string)part.GetXDocument().Descendants(DC.title).FirstOrDefault() ?? fi.FullName;
                    }

                    // NOTE(review): the metadata title is also passed through file-name stripping;
                    // presumably this cuts at the last '.', which would shorten titles that contain
                    // dots - confirm this is intended.
                    title = PathUtils.GetFileNameWithoutExtension(title);

                    // TODO: Determine max-width from size of content area.
                    var htmlSettings = new HtmlConverterSettings
                    {
                        PageTitle                           = title,
                        FabricateCssClasses                 = true,
                        CssClassPrefix                      = "pt-",
                        RestrictToSupportedLanguages        = false,
                        RestrictToSupportedNumberingFormats = false,
                        ImageHandler                        = imageInfo =>
                        {
                            if (settings.IsClearImages || string.IsNullOrEmpty(settings.ImageDirectoryPath))
                            {
                                return null;
                            }
                            DirectoryUtils.CreateDirectoryIfNotExists(settings.ImageDirectoryPath);

                            // If the image format isn't one that we expect, ignore it,
                            // and don't return markup for the link.
                            var imageFormat = ResolveImageFormat(imageInfo.ContentType, out var extension);
                            if (imageFormat == null)
                            {
                                return null;
                            }

                            var imageFileName = StringUtils.GetShortGuid(false) + "." + extension;

                            var imageFilePath = PathUtils.Combine(settings.ImageDirectoryPath, imageFileName);
                            try
                            {
                                imageInfo.Bitmap.Save(imageFilePath, imageFormat);
                            }
                            catch (System.Runtime.InteropServices.ExternalException)
                            {
                                // The image could not be written; omit it from the output.
                                return null;
                            }
                            var imageSource = PageUtils.Combine(settings.ImageDirectoryUrl, imageFileName);

                            var img = new XElement(Xhtml.img,
                                                   new XAttribute(NoNamespace.src, imageSource),
                                                   imageInfo.ImgStyleAttribute,
                                                   imageInfo.AltText != null ?
                                                   new XAttribute(NoNamespace.alt, imageInfo.AltText) : null);
                            return img;
                        }
                    };
                    var htmlElement = HtmlConverter.ConvertToHtml(wDoc, htmlSettings);

                    // Produce HTML document with <!DOCTYPE html > declaration to tell the browser
                    // we are using HTML5.
                    var html = new XDocument(
                        new XDocumentType("html", null, null, null),
                        htmlElement);

                    // Note: the xhtml returned by ConvertToHtmlTransform contains objects of type
                    // XEntity.  PtOpenXmlUtil.cs define the XEntity class.  See
                    // http://blogs.msdn.com/ericwhite/archive/2010/01/21/writing-entity-references-using-linq-to-xml.aspx
                    // for detailed explanation.
                    //
                    // If you further transform the XML tree returned by ConvertToHtmlTransform, you
                    // must do it correctly, or entities will not be serialized properly.

                    var htmlString = html.ToString(SaveOptions.DisableFormatting);
                    var htmlDoc    = new HtmlDocument();
                    htmlDoc.LoadHtml(htmlString);

                    // Either node may be missing from the parsed document; fall back to an empty
                    // string instead of dereferencing null.
                    var style = htmlDoc.DocumentNode.SelectSingleNode("//style")?.OuterHtml ?? string.Empty;
                    var body  = htmlDoc.DocumentNode.SelectSingleNode("//body")?.InnerHtml ?? string.Empty;

                    content = $"{style}{Environment.NewLine}{body}";

                    if (settings.IsSaveHtml && !string.IsNullOrEmpty(settings.HtmlDirectoryPath) && DirectoryUtils.IsDirectoryExists(settings.HtmlDirectoryPath))
                    {
                        var htmlFilePath = PathUtils.Combine(settings.HtmlDirectoryPath, PathUtils.GetFileNameWithoutExtension(docxFilePath) + ".html");
                        File.WriteAllText(htmlFilePath, htmlString, Encoding.UTF8);
                    }
                }
            }

            if (settings.IsFirstLineTitle)
            {
                // The text of the first <p> element becomes the title candidate.
                var contentTitle = RegexUtils.GetInnerContent("p", content);
                contentTitle = StringUtils.StripTags(contentTitle);
                if (!string.IsNullOrEmpty(contentTitle))
                {
                    if (settings.IsFirstLineRemove)
                    {
                        // Remove the raw (untrimmed) title text before it is normalised below.
                        content = StringUtils.ReplaceFirst(contentTitle, content, string.Empty);
                    }

                    contentTitle = contentTitle.Trim();
                    contentTitle = contentTitle.Trim(' ', ' ');
                    contentTitle = StringUtils.StripEntities(contentTitle);
                }

                if (!string.IsNullOrEmpty(contentTitle))
                {
                    title = contentTitle;
                }
            }

            if (settings.IsClearFormat)
            {
                content = HtmlClearUtils.ClearFormat(content);
            }

            if (settings.IsFirstLineIndent)
            {
                content = HtmlClearUtils.FirstLineIndent(content);
            }

            if (settings.IsClearFontSize)
            {
                content = HtmlClearUtils.ClearFontSize(content);
            }

            if (settings.IsClearFontFamily)
            {
                content = HtmlClearUtils.ClearFontFamily(content);
            }

            if (settings.IsFirstLineRemove)
            {
                // NOTE(review): when IsFirstLineTitle is also set, the title text has already been
                // removed once above, so this removes a further occurrence if one exists - confirm
                // whether the second removal is intended.
                content = StringUtils.ReplaceFirst(title, content, string.Empty);
            }

            if (string.IsNullOrEmpty(title))
            {
                title = PathUtils.GetFileNameWithoutExtension(docxFilePath);
            }

            return (title, content);
        }

        /// <summary>
        /// Maps an image content type (for example <c>image/png</c>) to the format used to save
        /// the image and the file extension to give it.
        /// </summary>
        /// <param name="contentType">Content type reported for the image; the segment after the first '/' is inspected.</param>
        /// <param name="extension">Receives the file extension (without dot), or null when the type is not recognised.</param>
        /// <returns>The format to save the image in, or null when the content type is missing, has no subtype, or is not one of the handled types.</returns>
        private static ImageFormat ResolveImageFormat(string contentType, out string extension)
        {
            extension = null;
            if (string.IsNullOrEmpty(contentType))
            {
                return null;
            }

            var segments = contentType.Split('/');
            if (segments.Length < 2)
            {
                return null;
            }

            // Invariant lower-casing keeps the match independent of the current culture
            // (e.g. Turkish casing rules for 'I').
            switch (segments[1].ToLowerInvariant())
            {
                case "png":
                    extension = "png";
                    return ImageFormat.Png;
                case "gif":
                    extension = "gif";
                    return ImageFormat.Gif;
                case "bmp":
                    extension = "bmp";
                    return ImageFormat.Bmp;
                case "jpeg":
                    extension = "jpeg";
                    return ImageFormat.Jpeg;
                case "tiff":
                    // Convert tiff to gif.
                    extension = "gif";
                    return ImageFormat.Gif;
                case "x-wmf":
                    extension = "wmf";
                    return ImageFormat.Wmf;
                default:
                    return null;
            }
        }