/// <summary>
/// Lets the user pick one or more files and runs
/// <c>MarkupSimplifier.SimplifyMarkup</c> on each of them, opened for editing,
/// using the options currently selected in the form's check boxes.
/// </summary>
/// <param name="sender">The control that raised the event (not used).</param>
/// <param name="e">Event data (not used).</param>
private void btnApply_Click(object sender, EventArgs e)
{
    // The dialog is a disposable component; release it as soon as we are done.
    using (OpenFileDialog ofd = new OpenFileDialog())
    {
        ofd.Multiselect = true;

        // Only process files when the user actually confirmed the selection.
        if (ofd.ShowDialog() != DialogResult.OK)
        {
            return;
        }

        foreach (string fileName in ofd.FileNames)
        {
            using (WordprocessingDocument doc = WordprocessingDocument.Open(fileName, true))
            {
                SimplifyMarkupSettings settings = new SimplifyMarkupSettings
                {
                    RemoveContentControls = cbRemoveContentControls.Checked,
                    RemoveSmartTags = cbRemoveSmartTags.Checked,
                    RemoveRsidInfo = cbRemoveRsidInfo.Checked,
                    RemoveComments = cbRemoveComments.Checked,
                    RemoveEndAndFootNotes = cbRemoveEndAndFootNotes.Checked,
                    ReplaceTabsWithSpaces = cbReplaceTabsWithSpaces.Checked,
                    RemoveFieldCodes = cbRemoveFieldCodes.Checked,
                    RemovePermissions = cbRemovePermissions.Checked,
                    RemoveProof = cbRemoveProof.Checked,
                    RemoveSoftHyphens = cbRemoveSoftHyphens.Checked,
                    RemoveLastRenderedPageBreak = cbRemoveLastRenderedPageBreak.Checked,
                    RemoveBookmarks = cbRemoveBookmarks.Checked,
                    RemoveWebHidden = cbRemoveWebHidden.Checked,
                    NormalizeXml = cbNormalize.Checked,
                };

                OpenXmlPowerTools.MarkupSimplifier.SimplifyMarkup(doc, settings);
            }
        }
    }
}
/// <summary>
/// Recursively rebuilds a tree, replacing <c>w:smartTag</c> elements by their
/// (transformed) child elements and <c>w:sdt</c> elements by the (transformed)
/// child elements of their <c>w:sdtContent</c>, depending on the settings.
/// </summary>
/// <param name="node">The node to transform.</param>
/// <param name="simplifyMarkupSettings">
/// Supplies the <c>RemoveSmartTags</c> and <c>RemoveContentControls</c> flags.
/// </param>
/// <returns>
/// The original node when it is not an element, a lazy sequence of transformed
/// children when the element is unwrapped, or a new element otherwise.
/// </returns>
private static object RemoveCustomXmlAndContentControlsTransform(
    XNode node, SimplifyMarkupSettings simplifyMarkupSettings)
{
    var current = node as XElement;
    if (current == null)
    {
        // Non-element nodes are handed back unchanged.
        return node;
    }

    if (simplifyMarkupSettings.RemoveSmartTags && current.Name == W.smartTag)
    {
        // Drop the smart tag wrapper and keep what it contains.
        var smartTagChildren = current.Elements();
        return smartTagChildren.Select(
            child => RemoveCustomXmlAndContentControlsTransform(child, simplifyMarkupSettings));
    }

    if (simplifyMarkupSettings.RemoveContentControls && current.Name == W.sdt)
    {
        // Drop the content control wrapper and keep the content of w:sdtContent.
        var controlChildren = current.Elements(W.sdtContent).Elements();
        return controlChildren.Select(
            child => RemoveCustomXmlAndContentControlsTransform(child, simplifyMarkupSettings));
    }

    // Any other element: same name and attributes, transformed child nodes.
    var rebuiltChildren = current
        .Nodes()
        .Select(child => RemoveCustomXmlAndContentControlsTransform(child, simplifyMarkupSettings));
    return new XElement(current.Name, current.Attributes(), rebuiltChildren);
}
/// <summary>
/// Converts one specific node, rather than the whole Word document, into HTML.
/// See also the overload
/// ConvertToHtml(WordprocessingDocument, HtmlConverterSettings, Func{ImageInfo, XElement}).
/// </summary>
/// <param name="wordDoc">The document; revisions are accepted and markup is simplified in it.</param>
/// <param name="node">The node to convert to HTML. It is cast to <c>XElement</c>.</param>
/// <param name="htmlConverterSettings">Conversion settings; <c>ConvertFormatting</c> must be false.</param>
/// <returns>The result of <c>ConvertToHtmlTransform</c>, cast to <c>XElement</c>.</returns>
/// <exception cref="InvalidSettingsException">
/// Thrown when <c>htmlConverterSettings.ConvertFormatting</c> is set.
/// </exception>
public static XElement ConvertToHtml(WordprocessingDocument wordDoc, XNode node, HtmlConverterSettings htmlConverterSettings)
{
    InitEntityMap();

    if (htmlConverterSettings.ConvertFormatting)
    {
        throw new InvalidSettingsException("Conversion with formatting is not supported");
    }

    RevisionAccepter.AcceptRevisions(wordDoc);

    // Strip markup before the conversion; field codes are kept.
    var simplifierOptions = new SimplifyMarkupSettings
    {
        RemoveComments = true,
        RemoveContentControls = true,
        RemoveEndAndFootNotes = true,
        RemoveFieldCodes = false,
        RemoveLastRenderedPageBreak = true,
        RemovePermissions = true,
        RemoveProof = true,
        RemoveRsidInfo = true,
        RemoveSmartTags = true,
        RemoveSoftHyphens = true,
        ReplaceTabsWithSpaces = true,
    };
    MarkupSimplifier.SimplifyMarkup(wordDoc, simplifierOptions);

    var target = (XElement)node;
    AnnotateHyperlinkContent(target);

    // No image handler is supplied for the single-node conversion.
    object converted = ConvertToHtmlTransform(wordDoc, htmlConverterSettings, node, null);
    return (XElement)converted;
}
/// <summary>
/// Recursively rebuilds a tree, replacing <c>w:smartTag</c> elements by their
/// (transformed) child elements and <c>w:sdt</c> elements by the (transformed)
/// child elements of their <c>w:sdtContent</c>, depending on the settings.
/// </summary>
/// <param name="node">The node to transform.</param>
/// <param name="simplifyMarkupSettings">
/// Supplies the <c>RemoveSmartTags</c> and <c>RemoveContentControls</c> flags.
/// </param>
/// <returns>
/// The original node when it is not an element, a lazy sequence of transformed
/// children when the element is unwrapped, or a new element otherwise.
/// </returns>
private static object RemoveCustomXmlAndContentControlsTransform(
    XNode node, SimplifyMarkupSettings simplifyMarkupSettings)
{
    XElement element = node as XElement;
    if (element != null)
    {
        if (simplifyMarkupSettings.RemoveSmartTags &&
            element.Name == W.smartTag)
        {
            return element
                .Elements()
                .Select(e => RemoveCustomXmlAndContentControlsTransform(e, simplifyMarkupSettings));
        }

        if (simplifyMarkupSettings.RemoveContentControls &&
            element.Name == W.sdt)
        {
            // Elements(W.sdtContent) yields an empty sequence when the content
            // control has no w:sdtContent child, instead of dereferencing null.
            return element
                .Elements(W.sdtContent)
                .Elements()
                .Select(e => RemoveCustomXmlAndContentControlsTransform(e, simplifyMarkupSettings));
        }

        // Descend into every other element; without this, smart tags and content
        // controls nested below the starting element would never be reached.
        return new XElement(element.Name,
            element.Attributes(),
            element.Nodes().Select(n => RemoveCustomXmlAndContentControlsTransform(n, simplifyMarkupSettings)));
    }

    return node;
}
/// <summary>
/// Simplifies the markup of the main document part and, when they exist, of the
/// style definitions part and the styles-with-effects part.
/// </summary>
/// <param name="doc">The document whose parts are simplified.</param>
/// <param name="settings">The simplifications to apply.</param>
public static void SimplifyMarkup(WordprocessingDocument doc, SimplifyMarkupSettings settings)
{
    SimplifyMarkupForPart(doc.MainDocumentPart, settings);

    // Both style parts are optional in a package; skip the ones that are absent
    // rather than handing null to SimplifyMarkupForPart.
    if (doc.MainDocumentPart.StyleDefinitionsPart != null)
    {
        SimplifyMarkupForPart(doc.MainDocumentPart.StyleDefinitionsPart, settings);
    }

    if (doc.MainDocumentPart.StylesWithEffectsPart != null)
    {
        SimplifyMarkupForPart(doc.MainDocumentPart.StylesWithEffectsPart, settings);
    }
}
/// <summary>
/// Runs the document-level steps selected in <paramref name="settings"/> and then
/// simplifies every content part plus the two optional style parts.
/// </summary>
/// <param name="doc">The document to simplify.</param>
/// <param name="settings">
/// The simplifications to apply. <c>RemoveRsidInfo</c> is switched on in this
/// object when <c>RemoveMarkupForDocumentComparison</c> is set.
/// </param>
public static void SimplifyMarkup(WordprocessingDocument doc, SimplifyMarkupSettings settings)
{
    if (settings.RemoveMarkupForDocumentComparison)
    {
        // Preparing for comparison also forces removal of rsid information.
        settings.RemoveRsidInfo = true;
        RemoveElementsForDocumentComparison(doc);
    }

    if (settings.RemoveRsidInfo)
    {
        RemoveRsidInfoInSettings(doc);
    }

    if (settings.AcceptRevisions)
    {
        RevisionAccepter.AcceptRevisions(doc);
    }

    foreach (var contentPart in doc.ContentParts())
    {
        SimplifyMarkupForPart(contentPart, settings);
    }

    // The style parts are processed only when they exist.
    var styleDefinitions = doc.MainDocumentPart.StyleDefinitionsPart;
    if (styleDefinitions != null)
    {
        SimplifyMarkupForPart(styleDefinitions, settings);
    }

    var stylesWithEffects = doc.MainDocumentPart.StylesWithEffectsPart;
    if (stylesWithEffects != null)
    {
        SimplifyMarkupForPart(stylesWithEffects, settings);
    }
}
/// <summary>
/// Simplifies the markup of an in-memory document and returns the modified document.
/// </summary>
/// <param name="doc">The source document.</param>
/// <param name="settings">The simplifications to apply.</param>
/// <returns>The value of <c>GetModifiedWmlDocument()</c> after simplification.</returns>
public static WmlDocument SimplifyMarkup(WmlDocument doc, SimplifyMarkupSettings settings)
{
    using (var memoryDoc = new OpenXmlMemoryStreamDocument(doc))
    {
        // The wordprocessing document is disposed before the modified
        // document is read back from the memory stream wrapper.
        using (var wordDoc = memoryDoc.GetWordprocessingDocument())
        {
            SimplifyMarkup(wordDoc, settings);
        }

        return memoryDoc.GetModifiedWmlDocument();
    }
}
/// <summary>
/// Simplifies the markup of an in-memory document and returns the modified document.
/// </summary>
/// <param name="doc">The source document.</param>
/// <param name="settings">The simplifications to apply.</param>
/// <returns>The value of <c>GetModifiedWmlDocument()</c> after simplification.</returns>
public static WmlDocument SimplifyMarkup(WmlDocument doc, SimplifyMarkupSettings settings)
{
    WmlDocument simplified;

    using (var memoryDoc = new OpenXmlMemoryStreamDocument(doc))
    {
        // Dispose the wordprocessing document first, then read back the result
        // while the memory stream wrapper is still alive.
        using (var wordDoc = memoryDoc.GetWordprocessingDocument())
        {
            SimplifyMarkup(wordDoc, settings);
        }

        simplified = memoryDoc.GetModifiedWmlDocument();
    }

    return simplified;
}
/// <summary>
/// Optionally accepts revisions, then simplifies every content part plus the
/// style definitions and styles-with-effects parts when they exist.
/// </summary>
/// <param name="doc">The document to simplify.</param>
/// <param name="settings">The simplifications to apply.</param>
public static void SimplifyMarkup(WordprocessingDocument doc, SimplifyMarkupSettings settings)
{
    if (settings.AcceptRevisions)
    {
        RevisionAccepter.AcceptRevisions(doc);
    }

    foreach (var contentPart in doc.ContentParts())
    {
        SimplifyMarkupForPart(contentPart, settings);
    }

    // Style parts are optional; each one is processed only if present.
    var styleDefinitions = doc.MainDocumentPart.StyleDefinitionsPart;
    if (styleDefinitions != null)
    {
        SimplifyMarkupForPart(styleDefinitions, settings);
    }

    var stylesWithEffects = doc.MainDocumentPart.StylesWithEffectsPart;
    if (stylesWithEffects != null)
    {
        SimplifyMarkupForPart(stylesWithEffects, settings);
    }
}
/// <summary>
/// Copies the uploaded document into memory, simplifies its markup, replaces
/// every non-blank text element of the body with the result of
/// <c>CallTranslator</c>, and saves a clone of the document to a temporary file.
/// </summary>
/// <param name="file">The uploaded Word document.</param>
/// <param name="translateHeader">Not used by this method.</param>
/// <param name="translateFooter">Not used by this method.</param>
/// <returns>The path of the temporary file that holds the translated document.</returns>
/// <exception cref="ArgumentNullException"><paramref name="file"/> is null.</exception>
public async Task<string> Translate(IFormFile file, bool translateHeader, bool translateFooter)
{
    if (file == null)
    {
        throw new ArgumentNullException(nameof(file));
    }

    var filePath = Path.GetTempFileName();
    Console.WriteLine($"[FILE] {filePath}");

    using (var ms = new MemoryStream())
    {
        await file.CopyToAsync(ms);

        using (WordprocessingDocument worddoc = WordprocessingDocument.Open(ms, true))
        {
            OpenXmlPowerTools.SimplifyMarkupSettings settings = new OpenXmlPowerTools.SimplifyMarkupSettings
            {
                AcceptRevisions = false,
                // Setting this to false reduces translation quality, but if true
                // some documents have XML format errors when opening.
                NormalizeXml = false,
                RemoveBookmarks = true,
                RemoveComments = true,
                RemoveContentControls = true,
                RemoveEndAndFootNotes = true,
                RemoveFieldCodes = true,
                RemoveGoBackBookmark = true,
                RemoveHyperlinks = false,
                RemoveLastRenderedPageBreak = true,
                RemoveMarkupForDocumentComparison = true,
                RemovePermissions = false,
                RemoveProof = true,
                RemoveRsidInfo = true,
                RemoveSmartTags = true,
                RemoveSoftHyphens = true,
                RemoveWebHidden = true,
                ReplaceTabsWithSpaces = false
            };
            OpenXmlPowerTools.MarkupSimplifier.SimplifyMarkup(worddoc, settings);

            Body body = worddoc.MainDocumentPart.Document.Body;
            var texts = body.Descendants<Text>();
            foreach (var text in texts)
            {
                // Whitespace-only text is left untouched and not sent to the translator.
                if (!string.IsNullOrWhiteSpace(text.Text))
                {
                    text.Text = await this.CallTranslator(text.Text);
                }
            }

            // Disposing the clone closes the package written to filePath; the
            // enclosing using statement takes care of closing worddoc itself.
            using (var clone = worddoc.Clone(filePath))
            {
            }
        }
    }

    return filePath;
}
/// <summary>
/// Runs the document-level steps selected in <paramref name="settings"/> and then
/// simplifies every content part plus the two optional style parts.
/// </summary>
/// <param name="doc">The document to simplify.</param>
/// <param name="settings">
/// The simplifications to apply. <c>RemoveRsidInfo</c> is switched on in this
/// object when <c>RemoveMarkupForDocumentComparison</c> is set.
/// </param>
public static void SimplifyMarkup(WordprocessingDocument doc, SimplifyMarkupSettings settings)
{
    if (settings.RemoveMarkupForDocumentComparison)
    {
        // Preparing for comparison also forces removal of rsid information.
        settings.RemoveRsidInfo = true;
        RemoveElementsForDocumentComparison(doc);
    }

    if (settings.RemoveRsidInfo)
    {
        RemoveRsidInfoInSettings(doc);
    }

    if (settings.AcceptRevisions)
    {
        RevisionAccepter.AcceptRevisions(doc);
    }

    foreach (var contentPart in doc.ContentParts())
    {
        SimplifyMarkupForPart(contentPart, settings);
    }

    // Style parts are optional; each one is processed only if present.
    var styleDefinitions = doc.MainDocumentPart.StyleDefinitionsPart;
    if (styleDefinitions != null)
    {
        SimplifyMarkupForPart(styleDefinitions, settings);
    }

    var stylesWithEffects = doc.MainDocumentPart.StylesWithEffectsPart;
    if (stylesWithEffects != null)
    {
        SimplifyMarkupForPart(stylesWithEffects, settings);
    }
}
/// <summary>
/// Optionally accepts revisions, then simplifies every content part plus the
/// style definitions and styles-with-effects parts when they exist.
/// </summary>
/// <param name="doc">The document to simplify.</param>
/// <param name="settings">The simplifications to apply.</param>
public static void SimplifyMarkup(WordprocessingDocument doc, SimplifyMarkupSettings settings)
{
    bool acceptRevisions = settings.AcceptRevisions;
    if (acceptRevisions)
    {
        RevisionAccepter.AcceptRevisions(doc);
    }

    foreach (var currentPart in doc.ContentParts())
    {
        SimplifyMarkupForPart(currentPart, settings);
    }

    // Each optional style part is looked up and processed in turn.
    var stylePart = doc.MainDocumentPart.StyleDefinitionsPart;
    if (stylePart != null)
    {
        SimplifyMarkupForPart(stylePart, settings);
    }

    var effectsPart = doc.MainDocumentPart.StylesWithEffectsPart;
    if (effectsPart != null)
    {
        SimplifyMarkupForPart(effectsPart, settings);
    }
}
/// <summary>
/// Converts the main document part of a Word document into an XHTML tree.
/// Revisions are accepted and the markup is simplified in <paramref name="wordDoc"/> first.
/// </summary>
/// <param name="wordDoc">The document to convert.</param>
/// <param name="htmlConverterSettings">Conversion settings; <c>ConvertFormatting</c> must be false.</param>
/// <param name="imageHandler">Passed through to <c>ConvertToHtmlTransform</c>.</param>
/// <returns>The result of <c>ConvertToHtmlTransform</c>, cast to <c>XElement</c>.</returns>
/// <exception cref="InvalidSettingsException">
/// Thrown when <c>htmlConverterSettings.ConvertFormatting</c> is set.
/// </exception>
/// <remarks>
/// The returned tree contains objects of type XEntity, which PtOpenXmlUtil.cs defines. See
/// http://blogs.msdn.com/ericwhite/archive/2010/01/21/writing-entity-references-using-linq-to-xml.aspx
/// for a detailed explanation. Any further transformation of the returned tree
/// must be done correctly, or entities will not be serialized properly.
/// </remarks>
public static XElement ConvertToHtml(WordprocessingDocument wordDoc, HtmlConverterSettings htmlConverterSettings, Func<ImageInfo, XElement> imageHandler)
{
    InitEntityMap();

    if (htmlConverterSettings.ConvertFormatting)
    {
        throw new InvalidSettingsException("Conversion with formatting is not supported");
    }

    RevisionAccepter.AcceptRevisions(wordDoc);

    // Strip markup before the conversion; field codes are kept.
    var simplifierOptions = new SimplifyMarkupSettings
    {
        RemoveComments = true,
        RemoveContentControls = true,
        RemoveEndAndFootNotes = true,
        RemoveFieldCodes = false,
        RemoveLastRenderedPageBreak = true,
        RemovePermissions = true,
        RemoveProof = true,
        RemoveRsidInfo = true,
        RemoveSmartTags = true,
        RemoveSoftHyphens = true,
        ReplaceTabsWithSpaces = true,
    };
    MarkupSimplifier.SimplifyMarkup(wordDoc, simplifierOptions);

    // Read the root only after simplification has rewritten the part.
    var documentRoot = wordDoc.MainDocumentPart.GetXDocument().Root;
    AnnotateHyperlinkContent(documentRoot);

    object converted = ConvertToHtmlTransform(wordDoc, htmlConverterSettings, documentRoot, imageHandler);
    return (XElement)converted;
}
/// <summary>
/// Command-line entry point: simplifies the markup of the Word document whose
/// path is given as the first argument.
/// </summary>
/// <param name="args">Command-line arguments; the first one is the document path.</param>
static void Main(string[] args)
{
    Console.WriteLine("Word Markup Simplifier 1.0.0");

    if (args.Length == 0)
    {
        Console.WriteLine("Usage: WordMarkupSimplifier <some word document>");
        return;
    }

    string documentPath = args[0];
    Console.WriteLine("Simplifying " + documentPath);

    try
    {
        // Field codes are kept; everything else listed here is removed or replaced.
        var options = new SimplifyMarkupSettings
        {
            RemoveComments = true,
            RemoveContentControls = true,
            RemoveEndAndFootNotes = true,
            RemoveFieldCodes = false,
            RemoveLastRenderedPageBreak = true,
            RemovePermissions = true,
            RemoveProof = true,
            RemoveRsidInfo = true,
            RemoveSmartTags = true,
            RemoveSoftHyphens = true,
            ReplaceTabsWithSpaces = true,
        };

        using (var wordDoc = WordprocessingDocument.Open(documentPath, true))
        {
            MarkupSimplifier.SimplifyMarkup(wordDoc, options);
        }
    }
    catch (OpenXmlPackageException packageError)
    {
        Console.WriteLine("Error: " + documentPath + " is not a valid Word document.");
        Console.WriteLine("Exception: " + packageError.Message);
    }
}
/// <summary>
/// Applies the transforms selected in <paramref name="settings"/> to the XML of
/// one part and stores the result back in the part with <c>PutXDocument</c>.
/// </summary>
/// <param name="part">The part whose XML is simplified.</param>
/// <param name="settings">The simplifications to apply.</param>
private static void SimplifyMarkupForPart(OpenXmlPart part, SimplifyMarkupSettings settings)
{
    var parameters = new SimplifyMarkupParameters();
    if (part.ContentType == "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml")
    {
        var doc = (WordprocessingDocument)part.OpenXmlPackage;
        if (settings.RemoveGoBackBookmark)
        {
            // Remember the id of the "_GoBack" bookmark so the transform can
            // match both its start and its end element.
            XElement goBackBookmark = doc
                .MainDocumentPart
                .GetXDocument()
                .Descendants(W.bookmarkStart)
                .FirstOrDefault(bm => (string)bm.Attribute(W.name) == "_GoBack");
            if (goBackBookmark != null)
            {
                parameters.GoBackId = (int)goBackBookmark.Attribute(W.id);
            }
        }
    }

    XDocument xdoc = part.GetXDocument();
    XElement newRoot = xdoc.Root;

    // Need to do this first to enable simplifying hyperlinks.
    if (settings.RemoveContentControls || settings.RemoveSmartTags)
    {
        newRoot = (XElement)RemoveCustomXmlAndContentControlsTransform(newRoot, settings);
    }

    // This may touch many elements, so needs to be its own transform.
    if (settings.RemoveRsidInfo)
    {
        newRoot = (XElement)RemoveRsidTransform(newRoot);
    }

    // Run SimplifyMarkupTransform when any setting it handles is on.
    // RemoveSoftHyphens and RemoveLastRenderedPageBreak are included so that
    // they also take effect when no other setting in this list is enabled.
    bool runSimplifyMarkupTransform =
        settings.RemoveComments ||
        settings.RemoveEndAndFootNotes ||
        settings.ReplaceTabsWithSpaces ||
        settings.RemoveFieldCodes ||
        settings.RemovePermissions ||
        settings.RemoveProof ||
        settings.RemoveSoftHyphens ||
        settings.RemoveLastRenderedPageBreak ||
        settings.RemoveBookmarks ||
        settings.RemoveWebHidden ||
        settings.RemoveGoBackBookmark ||
        settings.RemoveHyperlinks;

    // Repeat the transforms until a pass leaves the tree unchanged.
    var prevNewRoot = new XDocument(newRoot);
    while (true)
    {
        if (runSimplifyMarkupTransform)
        {
            newRoot = (XElement)SimplifyMarkupTransform(newRoot, settings, parameters);
        }

        // Remove runs and run properties that have become empty due to previous transforms.
        newRoot = (XElement)RemoveEmptyRunsAndRunPropertiesTransform(newRoot);

        // Merge adjacent runs that have identical run properties.
        newRoot = (XElement)MergeAdjacentRunsTransform(newRoot);

        // Merge adjacent instrText elements.
        newRoot = (XElement)MergeAdjacentInstrText(newRoot);

        // Separate run children into separate runs.
        newRoot = (XElement)SeparateRunChildrenIntoSeparateRuns(newRoot);

        if (XNode.DeepEquals(prevNewRoot.Root, newRoot))
        {
            break;
        }

        prevNewRoot = new XDocument(newRoot);
    }

    if (settings.NormalizeXml)
    {
        XAttribute[] nsAttrs = NamespaceAttributeUtil.NamespaceAttributes;
        XDocument newXDoc = Normalize(new XDocument(newRoot), null);
        newRoot = newXDoc.Root;
        if (newRoot != null)
        {
            // Add only the namespace attributes the normalized root does not already carry.
            foreach (XAttribute nsAttr in nsAttrs)
            {
                if (newRoot.Attribute(nsAttr.Name) == null)
                {
                    newRoot.Add(nsAttr);
                }
            }
        }

        part.PutXDocument(newXDoc);
    }
    else
    {
        part.PutXDocument(new XDocument(newRoot));
    }
}
/// <summary>
/// Applies the transforms selected in <paramref name="settings"/> to the XML of
/// one part and stores the result back in the part with <c>PutXDocument</c>.
/// </summary>
/// <param name="part">The part whose XML is simplified.</param>
/// <param name="settings">The simplifications to apply.</param>
private static void SimplifyMarkupForPart(
    OpenXmlPart part,
    SimplifyMarkupSettings settings)
{
    SimplifyMarkupParameters parameters = new SimplifyMarkupParameters();
    if (part.ContentType == "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml")
    {
        WordprocessingDocument doc = (WordprocessingDocument)part.OpenXmlPackage;
        if (settings.RemoveGoBackBookmark)
        {
            // Remember the id of the "_GoBack" bookmark so the transform can
            // match both its start and its end element.
            var goBackBookmark = doc
                .MainDocumentPart
                .GetXDocument()
                .Root
                .Descendants(W.bookmarkStart)
                .FirstOrDefault(bm => (string)bm.Attribute(W.name) == "_GoBack");
            if (goBackBookmark != null)
            {
                parameters.GoBackId = (int)goBackBookmark.Attribute(W.id);
            }
        }
    }

    XDocument xdoc = part.GetXDocument();
    XElement newRoot = xdoc.Root;

    // Need to do this first to enable simplifying hyperlinks.
    if (settings.RemoveContentControls ||
        settings.RemoveSmartTags)
    {
        newRoot = (XElement)RemoveCustomXmlAndContentControlsTransform(newRoot, settings);
    }

    // This may touch many elements, so needs to be its own transform.
    if (settings.RemoveRsidInfo)
    {
        newRoot = (XElement)RemoveRsidTransform(newRoot);
    }

    // Run SimplifyMarkupTransform when any setting it handles is on.
    // RemoveSoftHyphens and RemoveLastRenderedPageBreak are included so that
    // they also take effect when no other setting in this list is enabled.
    bool runSimplifyMarkupTransform =
        settings.RemoveComments ||
        settings.RemoveEndAndFootNotes ||
        settings.ReplaceTabsWithSpaces ||
        settings.RemoveFieldCodes ||
        settings.RemovePermissions ||
        settings.RemoveProof ||
        settings.RemoveSoftHyphens ||
        settings.RemoveLastRenderedPageBreak ||
        settings.RemoveBookmarks ||
        settings.RemoveWebHidden ||
        settings.RemoveGoBackBookmark;

    // Repeat the transforms until a pass leaves the tree unchanged.
    XDocument prevNewRoot = new XDocument(newRoot);
    while (true)
    {
        if (runSimplifyMarkupTransform)
        {
            newRoot = (XElement)SimplifyMarkupTransform(newRoot, settings, parameters);
        }

        // Remove runs and run properties that have become empty due to previous
        // transforms.
        newRoot = (XElement)RemoveEmptyRunsAndRunPropertiesTransform(newRoot);

        // Merge adjacent runs that have identical run properties.
        newRoot = (XElement)MergeAdjacentRunsTransform(newRoot);

        // Merge adjacent instrText elements.
        newRoot = (XElement)MergeAdjacentInstrText(newRoot);

        // Separate run children into separate runs.
        newRoot = (XElement)SeparateRunChildrenIntoSeparateRuns(newRoot);

        if (XNode.DeepEquals(prevNewRoot.Root, newRoot))
        {
            break;
        }

        prevNewRoot = new XDocument(newRoot);
    }

    if (settings.NormalizeXml)
    {
        XAttribute[] ns_attrs =
        {
            new XAttribute(XNamespace.Xmlns + "wpc", WPC.wpc),
            new XAttribute(XNamespace.Xmlns + "mc", MC.mc),
            new XAttribute(XNamespace.Xmlns + "o", O.o),
            new XAttribute(XNamespace.Xmlns + "r", R.r),
            new XAttribute(XNamespace.Xmlns + "m", M.m),
            new XAttribute(XNamespace.Xmlns + "v", VML.vml),
            new XAttribute(XNamespace.Xmlns + "wp14", WP14.wp14),
            new XAttribute(XNamespace.Xmlns + "wp", WP.wp),
            new XAttribute(XNamespace.Xmlns + "w10", W10.w10),
            new XAttribute(XNamespace.Xmlns + "w", W.w),
            new XAttribute(XNamespace.Xmlns + "w14", W14.w14),
            new XAttribute(XNamespace.Xmlns + "wpg", WPG.wpg),
            new XAttribute(XNamespace.Xmlns + "wpi", WPI.wpi),
            new XAttribute(XNamespace.Xmlns + "wne", WNE.wne),
            new XAttribute(XNamespace.Xmlns + "wps", WPS.wps),
            new XAttribute(MC.Ignorable, "w14 wp14"),
        };

        XDocument newXDoc = Normalize(new XDocument(newRoot), null);

        // Add only the attributes the normalized root does not already carry.
        foreach (var nsatt in ns_attrs)
        {
            if (newXDoc.Root.Attribute(nsatt.Name) == null)
            {
                newXDoc.Root.Add(nsatt);
            }
        }

        part.PutXDocument(newXDoc);
    }
    else
    {
        part.PutXDocument(new XDocument(newRoot));
    }
}
// lastRenderedPageBreak, permEnd, permStart, proofErr, noProof
// softHyphen:
//   Remove when simplifying.
// fldSimple, fldData, fldChar, instrText:
//   For hyperlinks, generate same in XHtml. Other than hyperlinks, do the following:
//   - collapse fldSimple
//   - remove fldSimple, fldData, fldChar, instrText.

/// <summary>
/// Recursively rebuilds a tree, dropping or replacing the elements selected by
/// <paramref name="settings"/>.
/// </summary>
/// <param name="node">The node to transform.</param>
/// <param name="settings">Flags that select which elements are removed or replaced.</param>
/// <param name="parameters">Supplies <c>GoBackId</c>, the id of the bookmark to remove.</param>
/// <returns>
/// Null when the element is removed, a replacement element or a lazy sequence of
/// transformed children when it is replaced or collapsed, a rebuilt copy for any
/// other element, or the original node when it is not an element.
/// </returns>
private static object SimplifyMarkupTransform(
    XNode node,
    SimplifyMarkupSettings settings,
    SimplifyMarkupParameters parameters)
{
    XElement element = node as XElement;
    if (element != null)
    {
        if (settings.RemovePermissions &&
            (element.Name == W.permEnd ||
            element.Name == W.permStart))
            return null;

        if (settings.RemoveProof &&
            (element.Name == W.proofErr ||
            element.Name == W.noProof))
            return null;

        if (settings.RemoveSoftHyphens &&
            element.Name == W.softHyphen)
            return null;

        if (settings.RemoveLastRenderedPageBreak &&
            element.Name == W.lastRenderedPageBreak)
            return null;

        if (settings.RemoveBookmarks &&
            (element.Name == W.bookmarkStart ||
             element.Name == W.bookmarkEnd))
            return null;

        if (settings.RemoveGoBackBookmark &&
            (element.Name == W.bookmarkStart ||
             element.Name == W.bookmarkEnd))
        {
            // A bookmark element without w:id cannot be the GoBack bookmark;
            // leave it alone instead of failing on the missing attribute.
            XAttribute bookmarkId = element.Attribute(W.id);
            if (bookmarkId != null && (int)bookmarkId == parameters.GoBackId)
                return null;
        }

        if (settings.RemoveWebHidden &&
            element.Name == W.webHidden)
            return null;

        // Only tabs that sit directly inside a run are replaced; an element
        // without a parent is never treated as such a tab.
        if (settings.ReplaceTabsWithSpaces &&
            element.Name == W.tab &&
            element.Parent != null &&
            element.Parent.Name == W.r)
            return new XElement(W.t,
                new XAttribute(XNamespace.Xml + "space", "preserve"),
                " ");

        if (settings.RemoveComments &&
            (element.Name == W.commentRangeStart ||
            element.Name == W.commentRangeEnd ||
            element.Name == W.commentReference ||
            element.Name == W.annotationRef))
            return null;

        // The string cast yields null when w:val is absent, so an rStyle
        // without a value is simply kept.
        if (settings.RemoveComments &&
            element.Name == W.rStyle &&
            (string)element.Attribute(W.val) == "CommentReference")
            return null;

        if (settings.RemoveEndAndFootNotes &&
            (element.Name == W.endnoteReference ||
            element.Name == W.footnoteReference))
            return null;

        if (settings.RemoveFieldCodes)
        {
            // A simple field is collapsed into its transformed child elements.
            if (element.Name == W.fldSimple)
                return element.Elements().Select(e =>
                    SimplifyMarkupTransform(e, settings, parameters));

            if (element.Name == W.fldData ||
                element.Name == W.fldChar ||
                element.Name == W.instrText)
                return null;
        }

        return new XElement(element.Name,
            element.Attributes(),
            element.Nodes().Select(n =>
                SimplifyMarkupTransform(n, settings, parameters)));
    }
    return node;
}
/// <summary>
/// Converts the main document part of a Word document into an XHTML tree.
/// Revisions are accepted and the markup is simplified in <paramref name="wordDoc"/> first.
/// </summary>
/// <param name="wordDoc">The document to convert.</param>
/// <param name="htmlConverterSettings">Conversion settings; <c>ConvertFormatting</c> must be false.</param>
/// <param name="imageHandler">Passed through to <c>ConvertToHtmlTransform</c>.</param>
/// <returns>The result of <c>ConvertToHtmlTransform</c>, cast to <c>XElement</c>.</returns>
/// <exception cref="InvalidSettingsException">
/// Thrown when <c>htmlConverterSettings.ConvertFormatting</c> is set.
/// </exception>
/// <remarks>
/// The returned tree contains objects of type XEntity, which PtOpenXmlUtil.cs defines. See
/// http://blogs.msdn.com/ericwhite/archive/2010/01/21/writing-entity-references-using-linq-to-xml.aspx
/// for a detailed explanation. Any further transformation of the returned tree
/// must be done correctly, or entities will not be serialized properly.
/// </remarks>
public static XElement ConvertToHtml(WordprocessingDocument wordDoc, HtmlConverterSettings htmlConverterSettings, Func<ImageInfo, XElement> imageHandler)
{
    InitEntityMap();

    bool formattingRequested = htmlConverterSettings.ConvertFormatting;
    if (formattingRequested)
    {
        throw new InvalidSettingsException("Conversion with formatting is not supported");
    }

    RevisionAccepter.AcceptRevisions(wordDoc);

    // Strip markup before the conversion; field codes are kept.
    var markupOptions = new SimplifyMarkupSettings
    {
        RemoveComments = true,
        RemoveContentControls = true,
        RemoveEndAndFootNotes = true,
        RemoveFieldCodes = false,
        RemoveLastRenderedPageBreak = true,
        RemovePermissions = true,
        RemoveProof = true,
        RemoveRsidInfo = true,
        RemoveSmartTags = true,
        RemoveSoftHyphens = true,
        ReplaceTabsWithSpaces = true,
    };
    MarkupSimplifier.SimplifyMarkup(wordDoc, markupOptions);

    // Read the root only after simplification has rewritten the part.
    var mainRoot = wordDoc.MainDocumentPart.GetXDocument().Root;
    AnnotateHyperlinkContent(mainRoot);

    var html = (XElement)ConvertToHtmlTransform(wordDoc, htmlConverterSettings, mainRoot, imageHandler);
    return html;
}
/// <summary>
/// Applies the transforms selected in <paramref name="settings"/> to the XML of
/// one part and stores the result back in the part with <c>PutXDocument</c>.
/// </summary>
/// <param name="part">The part whose XML is simplified.</param>
/// <param name="settings">The simplifications to apply.</param>
private static void SimplifyMarkupForPart(
    OpenXmlPart part,
    SimplifyMarkupSettings settings)
{
    SimplifyMarkupParameters parameters = new SimplifyMarkupParameters();
    if (part.ContentType == "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml")
    {
        WordprocessingDocument doc = (WordprocessingDocument)part.OpenXmlPackage;
        if (settings.RemoveGoBackBookmark)
        {
            // Remember the id of the "_GoBack" bookmark so the transform can
            // match both its start and its end element.
            var goBackBookmark = doc
                .MainDocumentPart
                .GetXDocument()
                .Root
                .Descendants(W.bookmarkStart)
                .FirstOrDefault(bm => (string)bm.Attribute(W.name) == "_GoBack");
            if (goBackBookmark != null)
            {
                parameters.GoBackId = (int)goBackBookmark.Attribute(W.id);
            }
        }
    }

    XDocument xdoc = part.GetXDocument();
    XElement newRoot = xdoc.Root;

    // Need to do this first to enable simplifying hyperlinks.
    if (settings.RemoveContentControls ||
        settings.RemoveSmartTags)
    {
        newRoot = (XElement)RemoveCustomXmlAndContentControlsTransform(newRoot, settings);
    }

    // This may touch many elements, so needs to be its own transform.
    if (settings.RemoveRsidInfo)
    {
        newRoot = (XElement)RemoveRsidTransform(newRoot);
    }

    // Run SimplifyMarkupTransform when any setting it handles is on.
    // RemoveSoftHyphens and RemoveLastRenderedPageBreak are included so that
    // they also take effect when no other setting in this list is enabled.
    bool runSimplifyMarkupTransform =
        settings.RemoveComments ||
        settings.RemoveEndAndFootNotes ||
        settings.ReplaceTabsWithSpaces ||
        settings.RemoveFieldCodes ||
        settings.RemovePermissions ||
        settings.RemoveProof ||
        settings.RemoveSoftHyphens ||
        settings.RemoveLastRenderedPageBreak ||
        settings.RemoveBookmarks ||
        settings.RemoveWebHidden ||
        settings.RemoveGoBackBookmark;

    // Repeat the transforms until a pass leaves the tree unchanged.
    XDocument prevNewRoot = new XDocument(newRoot);
    while (true)
    {
        if (runSimplifyMarkupTransform)
        {
            newRoot = (XElement)SimplifyMarkupTransform(newRoot, settings, parameters);
        }

        // Remove runs and run properties that have become empty due to previous
        // transforms.
        newRoot = (XElement)RemoveEmptyRunsAndRunPropertiesTransform(newRoot);

        // Merge adjacent runs that have identical run properties.
        newRoot = (XElement)MergeAdjacentRunsTransform(newRoot);

        // Merge adjacent instrText elements.
        newRoot = (XElement)MergeAdjacentInstrText(newRoot);

        // Separate run children into separate runs.
        newRoot = (XElement)SeparateRunChildrenIntoSeparateRuns(newRoot);

        if (XNode.DeepEquals(prevNewRoot.Root, newRoot))
        {
            break;
        }

        prevNewRoot = new XDocument(newRoot);
    }

    if (settings.NormalizeXml)
    {
        XAttribute[] ns_attrs =
        {
            new XAttribute(XNamespace.Xmlns + "wpc", WPC.wpc),
            new XAttribute(XNamespace.Xmlns + "mc", MC.mc),
            new XAttribute(XNamespace.Xmlns + "o", O.o),
            new XAttribute(XNamespace.Xmlns + "r", R.r),
            new XAttribute(XNamespace.Xmlns + "m", M.m),
            new XAttribute(XNamespace.Xmlns + "v", VML.vml),
            new XAttribute(XNamespace.Xmlns + "wp14", WP14.wp14),
            new XAttribute(XNamespace.Xmlns + "wp", WP.wp),
            new XAttribute(XNamespace.Xmlns + "w10", W10.w10),
            new XAttribute(XNamespace.Xmlns + "w", W.w),
            new XAttribute(XNamespace.Xmlns + "w14", W14.w14),
            new XAttribute(XNamespace.Xmlns + "wpg", WPG.wpg),
            new XAttribute(XNamespace.Xmlns + "wpi", WPI.wpi),
            new XAttribute(XNamespace.Xmlns + "wne", WNE.wne),
            new XAttribute(XNamespace.Xmlns + "wps", WPS.wps),
            new XAttribute(MC.Ignorable, "w14 wp14"),
        };

        XDocument newXDoc = Normalize(new XDocument(newRoot), null);

        // Add only the attributes the normalized root does not already carry.
        foreach (var nsatt in ns_attrs)
        {
            if (newXDoc.Root.Attribute(nsatt.Name) == null)
            {
                newXDoc.Root.Add(nsatt);
            }
        }

        part.PutXDocument(newXDoc);
    }
    else
    {
        part.PutXDocument(new XDocument(newRoot));
    }
}
/// <summary>
/// Converts a Word document into an XHTML tree. Revisions are accepted, markup
/// is simplified and formatting is assembled in <paramref name="wordDoc"/>
/// before the main document part is transformed.
/// </summary>
/// <param name="wordDoc">The document to convert; it is modified by the preparation steps.</param>
/// <param name="htmlConverterSettings">
/// Conversion settings; also supplies the language, numbering-format and
/// list-item options passed to the formatting assembler.
/// </param>
/// <returns>The result of <c>ConvertToHtmlTransform</c> after <c>ReifyStylesAndClasses</c> has run on it.</returns>
/// <remarks>
/// The returned tree contains objects of type XEntity, which PtOpenXmlUtil.cs defines. See
/// http://blogs.msdn.com/ericwhite/archive/2010/01/21/writing-entity-references-using-linq-to-xml.aspx
/// for a detailed explanation. Any further transformation of the returned tree
/// must be done correctly, or entities will not be serialized properly.
/// </remarks>
public static XElement ConvertToHtml(WordprocessingDocument wordDoc,
    HtmlConverterSettings htmlConverterSettings)
{
    InitEntityMap();

    RevisionAccepter.AcceptRevisions(wordDoc);

    // Step 1: strip markup. Field codes and tabs are kept.
    var markupOptions = new SimplifyMarkupSettings
    {
        RemoveComments = true,
        RemoveContentControls = true,
        RemoveEndAndFootNotes = true,
        RemoveFieldCodes = false,
        RemoveLastRenderedPageBreak = true,
        RemovePermissions = true,
        RemoveProof = true,
        RemoveRsidInfo = true,
        RemoveSmartTags = true,
        RemoveSoftHyphens = true,
        RemoveGoBackBookmark = true,
        ReplaceTabsWithSpaces = false,
    };
    MarkupSimplifier.SimplifyMarkup(wordDoc, markupOptions);

    // Step 2: assemble formatting, keeping styles and style names.
    var listItemOptions = new ListItemRetrieverSettings()
    {
        ListItemTextImplementations = htmlConverterSettings.ListItemImplementations,
    };
    var assemblerOptions = new FormattingAssemblerSettings
    {
        RemoveStyleNamesFromParagraphAndRunProperties = false,
        ClearStyles = false,
        RestrictToSupportedLanguages = htmlConverterSettings.RestrictToSupportedLanguages,
        RestrictToSupportedNumberingFormats = htmlConverterSettings.RestrictToSupportedNumberingFormats,
        CreateHtmlConverterAnnotationAttributes = true,
        OrderElementsPerStandard = false,
        ListItemRetrieverSettings = listItemOptions,
    };
    FormattingAssembler.AssembleFormatting(wordDoc, assemblerOptions);

    // Step 3: document-wide adjustments made before the transform.
    InsertAppropriateNonbreakingSpaces(wordDoc);
    CalculateSpanWidthForTabs(wordDoc);
    ReverseTableBordersForRtlTables(wordDoc);
    AdjustTableBorders(wordDoc);

    // Step 4: take the root, annotate, then transform.
    var mainRoot = wordDoc.MainDocumentPart.GetXDocument().Root;

    FieldRetriever.AnnotateWithFieldInfo(wordDoc.MainDocumentPart);
    AnnotateForSections(wordDoc);

    var html = (XElement)ConvertToHtmlTransform(wordDoc, htmlConverterSettings,
        mainRoot, false, 0m);

    ReifyStylesAndClasses(htmlConverterSettings, html);

    return html;
}
/// <summary>
/// Loads <paramref name="source"/> into memory, accepts revisions, simplifies
/// the markup, assembles formatting, and writes the resulting bytes to
/// <paramref name="dest"/>.
/// </summary>
/// <param name="source">The document to read.</param>
/// <param name="dest">The file that receives the processed document.</param>
public static void CopyFormattingAssembledDocx(FileInfo source, FileInfo dest)
{
    byte[] sourceBytes = File.ReadAllBytes(source.FullName);

    using (var buffer = new MemoryStream())
    {
        // Write into an empty stream so the package can grow while it is edited.
        buffer.Write(sourceBytes, 0, sourceBytes.Length);

        using (var wordDoc = WordprocessingDocument.Open(buffer, true))
        {
            RevisionAccepter.AcceptRevisions(wordDoc);

            // Field codes and tabs are kept; the rest is stripped.
            var markupOptions = new SimplifyMarkupSettings
            {
                RemoveComments = true,
                RemoveContentControls = true,
                RemoveEndAndFootNotes = true,
                RemoveFieldCodes = false,
                RemoveLastRenderedPageBreak = true,
                RemovePermissions = true,
                RemoveProof = true,
                RemoveRsidInfo = true,
                RemoveSmartTags = true,
                RemoveSoftHyphens = true,
                RemoveGoBackBookmark = true,
                ReplaceTabsWithSpaces = false,
            };
            MarkupSimplifier.SimplifyMarkup(wordDoc, markupOptions);

            var listItemOptions = new ListItemRetrieverSettings()
            {
                ListItemTextImplementations = ListItemRetrieverSettings.DefaultListItemTextImplementations,
            };
            var assemblerOptions = new FormattingAssemblerSettings
            {
                RemoveStyleNamesFromParagraphAndRunProperties = false,
                ClearStyles = false,
                RestrictToSupportedLanguages = false,
                RestrictToSupportedNumberingFormats = false,
                CreateHtmlConverterAnnotationAttributes = true,
                OrderElementsPerStandard = false,
                ListItemRetrieverSettings = listItemOptions,
            };
            FormattingAssembler.AssembleFormatting(wordDoc, assemblerOptions);
        }

        // The package has been disposed at this point; copy out the stream's bytes.
        byte[] processedBytes = buffer.ToArray();
        File.WriteAllBytes(dest.FullName, processedBytes);
    }
}
/// <summary>
/// Applies the transforms selected in <paramref name="settings"/> to the XML of
/// one part and stores the result back in the part with <c>PutXDocument</c>.
/// </summary>
/// <param name="part">The part whose XML is simplified.</param>
/// <param name="settings">The simplifications to apply.</param>
private static void SimplifyMarkupForPart(
    OpenXmlPart part,
    SimplifyMarkupSettings settings)
{
    XDocument xdoc = part.GetXDocument();
    XElement newRoot = xdoc.Root;

    // Need to do this first to enable simplifying hyperlinks.
    if (settings.RemoveContentControls ||
        settings.RemoveSmartTags)
    {
        newRoot = (XElement)RemoveCustomXmlAndContentControlsTransform(newRoot, settings);
    }

    // This may touch many elements, so needs to be its own transform.
    if (settings.RemoveRsidInfo)
    {
        newRoot = (XElement)RemoveRsidTransform(newRoot);
    }

    if (settings.RemoveComments ||
        settings.RemoveEndAndFootNotes ||
        settings.ReplaceTabsWithSpaces ||
        settings.RemoveFieldCodes ||
        settings.RemovePermissions ||
        settings.RemoveProof ||
        settings.RemoveBookmarks ||
        settings.RemoveWebHidden)
    {
        newRoot = (XElement)SimplifyMarkupTransform(newRoot, settings);
    }

    // Remove runs and run properties that have become empty due to previous
    // transforms.
    newRoot = (XElement)RemoveEmptyRunsAndRunPropertiesTransform(newRoot);

    // Merge adjacent runs that have identical run properties.
    newRoot = (XElement)RemoveSuperfluousRunsTransform(newRoot);

    // Merge adjacent instrText elements.
    newRoot = (XElement)MergeAdjacentInstrText(newRoot);

    // The last thing to do is to again remove runs and run properties
    // that have become empty due to previous transforms.
    newRoot = (XElement)RemoveEmptyRunsAndRunPropertiesTransform(newRoot);

    if (settings.NormalizeXml)
    {
        XAttribute[] ns_attrs =
        {
            new XAttribute(XNamespace.Xmlns + "wpc", WPC.wpc),
            new XAttribute(XNamespace.Xmlns + "mc", MC.mc),
            new XAttribute(XNamespace.Xmlns + "o", O.o),
            new XAttribute(XNamespace.Xmlns + "r", R.r),
            new XAttribute(XNamespace.Xmlns + "m", M.m),
            new XAttribute(XNamespace.Xmlns + "v", VML.vml),
            new XAttribute(XNamespace.Xmlns + "wp14", WP14.wp14),
            new XAttribute(XNamespace.Xmlns + "wp", WP.wp),
            new XAttribute(XNamespace.Xmlns + "w10", W10.w10),
            new XAttribute(XNamespace.Xmlns + "w", W.w),
            new XAttribute(XNamespace.Xmlns + "w14", W14.w14),
            new XAttribute(XNamespace.Xmlns + "wpg", WPG.wpg),
            new XAttribute(XNamespace.Xmlns + "wpi", WPI.wpi),
            new XAttribute(XNamespace.Xmlns + "wne", WNE.wne),
            new XAttribute(XNamespace.Xmlns + "wps", WPS.wps),
            new XAttribute(MC.Ignorable, "w14 wp14"),
        };

        XDocument newXDoc = Normalize(new XDocument(newRoot), null);

        // XElement.Add throws when an attribute with the same name is already
        // present, so add only the attributes the normalized root lacks.
        foreach (XAttribute nsatt in ns_attrs)
        {
            if (newXDoc.Root.Attribute(nsatt.Name) == null)
            {
                newXDoc.Root.Add(nsatt);
            }
        }

        part.PutXDocument(newXDoc);
    }
    else
    {
        part.PutXDocument(new XDocument(newRoot));
    }
}
/// <summary>
/// Simplifies the markup of this document by delegating to
/// <c>MarkupSimplifier.SimplifyMarkup</c>.
/// </summary>
/// <param name="settings">The simplifications to apply.</param>
/// <returns>The document returned by <c>MarkupSimplifier.SimplifyMarkup</c>.</returns>
public WmlDocument SimplifyMarkup(SimplifyMarkupSettings settings)
{
    WmlDocument simplified = MarkupSimplifier.SimplifyMarkup(this, settings);
    return simplified;
}
/// <summary>
/// Recursively rebuilds a tree, replacing <c>w:smartTag</c> elements by their
/// (transformed) child elements and <c>w:sdt</c> elements by the (transformed)
/// child elements of their <c>w:sdtContent</c>, depending on the settings.
/// </summary>
/// <param name="node">The node to transform.</param>
/// <param name="simplifyMarkupSettings">
/// Supplies the <c>RemoveSmartTags</c> and <c>RemoveContentControls</c> flags.
/// </param>
/// <returns>
/// The original node when it is not an element, a lazy sequence of transformed
/// children when the element is unwrapped, or a new element otherwise.
/// </returns>
private static object RemoveCustomXmlAndContentControlsTransform(
    XNode node, SimplifyMarkupSettings simplifyMarkupSettings)
{
    XElement element = node as XElement;
    if (element == null)
        return node;

    if (simplifyMarkupSettings.RemoveSmartTags &&
        element.Name == W.smartTag)
        return element
            .Elements()
            .Select(e =>
                RemoveCustomXmlAndContentControlsTransform(e,
                    simplifyMarkupSettings));

    // Elements(W.sdtContent) yields an empty sequence when the content control
    // has no w:sdtContent child, instead of dereferencing null.
    if (simplifyMarkupSettings.RemoveContentControls &&
        element.Name == W.sdt)
        return element
            .Elements(W.sdtContent)
            .Elements()
            .Select(e =>
                RemoveCustomXmlAndContentControlsTransform(e,
                    simplifyMarkupSettings));

    // Descend into every other element; without this, smart tags and content
    // controls nested below the starting element would never be reached.
    return new XElement(element.Name,
        element.Attributes(),
        element.Nodes().Select(n =>
            RemoveCustomXmlAndContentControlsTransform(n,
                simplifyMarkupSettings)));
}
/// <summary>
/// Recursively rebuilds the tree rooted at <paramref name="node"/>, dropping or
/// replacing elements according to <paramref name="settings"/>.
/// </summary>
/// <remarks>
/// lastRenderedPageBreak, permEnd, permStart, proofErr, noProof, softHyphen:
///   Remove when simplifying.
/// fldSimple, fldData, fldChar, instrText:
///   For hyperlinks, generate same in XHtml. Other than hyperlinks, do the following:
///   - collapse fldSimple
///   - remove fldSimple, fldData, fldChar, instrText.
/// </remarks>
/// <param name="node">The node to transform.</param>
/// <param name="settings">Flags that select which elements are removed or replaced.</param>
/// <param name="parameters">
/// Supplies <c>GoBackId</c>, the bookmark id matched when
/// <c>settings.RemoveGoBackBookmark</c> is set.
/// </param>
/// <returns>
/// <c>null</c> for a removed element; a replacement <c>w:t</c> for a replaced tab;
/// a sequence of transformed children for a collapsed <c>w:fldSimple</c>; a rebuilt
/// copy for any other element; the node itself for a non-element node.
/// </returns>
private static object SimplifyMarkupTransform(
    XNode node,
    SimplifyMarkupSettings settings,
    SimplifyMarkupParameters parameters)
{
    XElement element = node as XElement;
    if (element == null)
    {
        return node;
    }

    XName name = element.Name;

    if (settings.RemovePermissions && (name == W.permEnd || name == W.permStart))
    {
        return null;
    }

    if (settings.RemoveProof && (name == W.proofErr || name == W.noProof))
    {
        return null;
    }

    if (settings.RemoveSoftHyphens && name == W.softHyphen)
    {
        return null;
    }

    if (settings.RemoveLastRenderedPageBreak && name == W.lastRenderedPageBreak)
    {
        return null;
    }

    bool isBookmarkMarker = name == W.bookmarkStart || name == W.bookmarkEnd;

    if (settings.RemoveBookmarks && isBookmarkMarker)
    {
        return null;
    }

    // Only the bookmark whose w:id equals parameters.GoBackId is removed here.
    if (settings.RemoveGoBackBookmark &&
        isBookmarkMarker &&
        (int)element.Attribute(W.id) == parameters.GoBackId)
    {
        return null;
    }

    if (settings.RemoveWebHidden && name == W.webHidden)
    {
        return null;
    }

    // Only a tab that sits directly in a run is replaced; a parentless w:tab is
    // left for the generic copy below rather than dereferencing a null Parent.
    if (settings.ReplaceTabsWithSpaces &&
        name == W.tab &&
        element.Parent != null &&
        element.Parent.Name == W.r)
    {
        return new XElement(W.t,
            new XAttribute(XNamespace.Xml + "space", "preserve"),
            " ");
    }

    if (settings.RemoveComments &&
        (name == W.commentRangeStart ||
         name == W.commentRangeEnd ||
         name == W.commentReference ||
         name == W.annotationRef))
    {
        return null;
    }

    // The string conversion yields null for a missing w:val attribute, so an
    // rStyle without w:val is kept instead of throwing.
    if (settings.RemoveComments &&
        name == W.rStyle &&
        (string)element.Attribute(W.val) == "CommentReference")
    {
        return null;
    }

    if (settings.RemoveEndAndFootNotes &&
        (name == W.endnoteReference || name == W.footnoteReference))
    {
        return null;
    }

    if (settings.RemoveFieldCodes)
    {
        // Collapse fldSimple: keep its (transformed) children, drop the wrapper.
        if (name == W.fldSimple)
        {
            return element.Elements().Select(e => SimplifyMarkupTransform(e, settings, parameters));
        }

        if (name == W.fldData || name == W.fldChar || name == W.instrText)
        {
            return null;
        }
    }

    return new XElement(
        name,
        element.Attributes(),
        element.Nodes().Select(n => SimplifyMarkupTransform(n, settings, parameters)));
}
/// <summary>
/// Hands this document and <paramref name="settings"/> to
/// <c>MarkupSimplifier.SimplifyMarkup</c> and returns its result.
/// </summary>
/// <param name="settings">Settings forwarded unchanged to the simplifier.</param>
/// <returns>The value produced by <c>MarkupSimplifier.SimplifyMarkup</c>.</returns>
public WmlDocument SimplifyMarkup(SimplifyMarkupSettings settings)
{
    var simplified = MarkupSimplifier.SimplifyMarkup(this, settings);
    return simplified;
}
/// <summary>
/// Prepares a document in two passes over an in-memory copy: the first pass opens
/// and closes the package to get rid of markup-compatibility (MC) content; the
/// second pass validates the content, removes existing PowerTools markup,
/// simplifies the markup, and then runs the footnote/endnote and unid helpers.
/// </summary>
/// <param name="source">The document to pre-process.</param>
/// <param name="startingIdForFootnotesEndnotes">
/// Forwarded to <c>ChangeFootnoteEndnoteReferencesToUniqueRange</c>.
/// </param>
/// <returns>
/// A new <c>WmlDocument</c> built from the processed bytes, carrying the file name
/// of <paramref name="source"/>.
/// </returns>
private static WmlDocument PreProcessMarkup(WmlDocument source, int startingIdForFootnotesEndnotes)
{
    // Pass 1: open and close to get rid of MC content.
    using (var firstPass = new MemoryStream())
    {
        firstPass.Write(source.DocumentByteArray, 0, source.DocumentByteArray.Length);

        var firstPassSettings = new OpenSettings
        {
            MarkupCompatibilityProcessSettings = new MarkupCompatibilityProcessSettings(
                MarkupCompatibilityProcessMode.ProcessAllParts,
                FileFormatVersions.Office2007)
        };

        using (WordprocessingDocument package = WordprocessingDocument.Open(firstPass, true, firstPassSettings))
        {
            // Contrary to what you might think, looking at the API, it is necessary
            // to access the root element of each part to cause the SDK to process
            // MC markup. The values themselves are not used.
            OpenXmlPartRootElement mainRoot = package.MainDocumentPart.RootElement;

            if (package.MainDocumentPart.FootnotesPart != null)
            {
                OpenXmlPartRootElement footnotesRoot = package.MainDocumentPart.FootnotesPart.RootElement;
            }

            if (package.MainDocumentPart.EndnotesPart != null)
            {
                OpenXmlPartRootElement endnotesRoot = package.MainDocumentPart.EndnotesPart.RootElement;
            }
        }

        source = new WmlDocument(source.FileName, firstPass.ToArray());
    }

    // Pass 2: reopen the result of pass 1 (again with MC processing) and do the
    // actual pre-processing.
    using (var secondPass = new MemoryStream())
    {
        secondPass.Write(source.DocumentByteArray, 0, source.DocumentByteArray.Length);

        var secondPassSettings = new OpenSettings
        {
            MarkupCompatibilityProcessSettings = new MarkupCompatibilityProcessSettings(
                MarkupCompatibilityProcessMode.ProcessAllParts,
                FileFormatVersions.Office2007)
        };

        using (WordprocessingDocument package = WordprocessingDocument.Open(secondPass, true, secondPassSettings))
        {
            TestForInvalidContent(package);
            RemoveExistingPowerToolsMarkup(package);

            // Removing content controls, field codes, and bookmarks is a no-no for
            // many use cases. We need content controls, e.g., on the title page.
            // Field codes are required for automatic cross-references, which
            // require bookmarks.
            // TODO: Revisit
            var simplifySettings = new SimplifyMarkupSettings
            {
                RemoveBookmarks = true,
                AcceptRevisions = false,
                RemoveComments = true,
                RemoveContentControls = true,
                RemoveFieldCodes = true,
                RemoveGoBackBookmark = true,
                RemoveLastRenderedPageBreak = true,
                RemovePermissions = true,
                RemoveProof = true,
                RemoveSmartTags = true,
                RemoveSoftHyphens = true,
                RemoveHyperlinks = true
            };
            MarkupSimplifier.SimplifyMarkup(package, simplifySettings);

            ChangeFootnoteEndnoteReferencesToUniqueRange(package, startingIdForFootnotesEndnotes);
            AddUnidsToMarkupInContentParts(package);
            AddFootnotesEndnotesParts(package);
            FillInEmptyFootnotesEndnotes(package);
        }

        // The package is disposed before the stream is read back.
        return new WmlDocument(source.FileName, secondPass.ToArray());
    }
}
/// <summary>
/// Merges the comments of two documents that contain the same content into a copy
/// of <paramref name="document1"/>.
/// </summary>
/// <param name="document1">First document; also the base of the merged result.</param>
/// <param name="document2">Second document, whose comments are renumbered and merged in.</param>
/// <param name="ensureLocked">
/// When true, both documents must be locked (allowing only commenting), i.e. have a
/// <c>DocSecurity</c> value of "8" in their extended file properties.
/// </param>
/// <returns>
/// A copy of one of the inputs when at most one of them has comments, otherwise a
/// new document containing the comments of both.
/// </returns>
/// <exception cref="CommentMergerDifferingContentsException">
/// The simplified documents do not compare as equal.
/// </exception>
/// <exception cref="CommentMergerUnlockedDocumentException">
/// <paramref name="ensureLocked"/> is true and a document is not locked. A document
/// with no extended file properties part or no <c>DocSecurity</c> element counts as
/// not locked.
/// </exception>
public static WmlDocument MergeComments(WmlDocument document1, WmlDocument document2,
    bool ensureLocked)
{
    // Work on copies so that simplifying the markup does not alter the inputs.
    WmlDocument cDoc1 = new WmlDocument(document1);
    WmlDocument cDoc2 = new WmlDocument(document2);
    using (OpenXmlMemoryStreamDocument streamDoc1 = new OpenXmlMemoryStreamDocument(cDoc1))
    using (WordprocessingDocument doc1 = streamDoc1.GetWordprocessingDocument())
    using (OpenXmlMemoryStreamDocument streamDoc2 = new OpenXmlMemoryStreamDocument(cDoc2))
    using (WordprocessingDocument doc2 = streamDoc2.GetWordprocessingDocument())
    {
        SimplifyMarkupSettings mss = new SimplifyMarkupSettings()
        {
            RemoveProof = true,
            RemoveRsidInfo = true,
            RemoveGoBackBookmark = true,
        };
        MarkupSimplifier.SimplifyMarkup(doc1, mss);
        MarkupSimplifier.SimplifyMarkup(doc2, mss);

        // If documents don't contain the same content, then don't attempt to merge comments.
        if (!DocumentComparer.CompareDocuments(doc1, doc2))
        {
            throw new CommentMergerDifferingContentsException(
                "Documents do not contain the same content");
        }

        var commentsPart1 = doc1.MainDocumentPart.WordprocessingCommentsPart;
        var commentsPart2 = doc2.MainDocumentPart.WordprocessingCommentsPart;

        // Document 2 has no comments part: document 1 already is the merge result
        // (whether or not it has a comments part itself).
        if (commentsPart2 == null)
        {
            return new WmlDocument(document1);
        }

        if (commentsPart1 == null)
        {
            return new WmlDocument(document2);
        }

        // If either of the documents have no comments, then return the other one.
        if (!commentsPart1.GetXDocument().Root.Elements(W.comment).Any())
        {
            return new WmlDocument(document2);
        }

        if (!commentsPart2.GetXDocument().Root.Elements(W.comment).Any())
        {
            return new WmlDocument(document1);
        }

        if (ensureLocked)
        {
            // If either document is not locked (allowing only commenting), don't attempt to
            // merge comments. The lookups are null-tolerant so that a missing part or
            // element is reported as "not locked" rather than a NullReferenceException.
            XElement docSecurity1 = doc1.ExtendedFilePropertiesPart?.GetXDocument().Root?.Element(EP.DocSecurity);
            if ((string)docSecurity1 != "8")
            {
                throw new CommentMergerUnlockedDocumentException(
                    "Document1 is not locked");
            }

            XElement docSecurity2 = doc2.ExtendedFilePropertiesPart?.GetXDocument().Root?.Element(EP.DocSecurity);
            if ((string)docSecurity2 != "8")
            {
                throw new CommentMergerUnlockedDocumentException(
                    "Document2 is not locked");
            }
        }

        RenumberCommentsInDoc2(doc1, doc2);

        WmlDocument destDoc = new WmlDocument(document1);
        using (OpenXmlMemoryStreamDocument streamDoc = new OpenXmlMemoryStreamDocument(destDoc))
        {
            using (WordprocessingDocument destWDoc = streamDoc.GetWordprocessingDocument())
            {
                // Merge the comments part.
                XDocument commentsPartXDoc = new XDocument(
                    new XElement(W.comments,
                        new XAttribute(XNamespace.Xmlns + "w", W.w),
                        commentsPart1.GetXDocument().Root.Elements(),
                        commentsPart2.GetXDocument().Root.Elements()));
                destWDoc.MainDocumentPart.WordprocessingCommentsPart.PutXDocument(commentsPartXDoc);

                MergeCommentsInPart(doc1.MainDocumentPart, doc2.MainDocumentPart,
                    destWDoc.MainDocumentPart, commentsPartXDoc);
            }

            // Read the result only after the destination package has been disposed.
            return streamDoc.GetModifiedWmlDocument();
        }
    }
}
/// <summary>
/// Combines the comments of two documents with identical content into a copy of
/// <paramref name="document1"/>.
/// </summary>
/// <param name="document1">First document; the merged result is based on it.</param>
/// <param name="document2">Second document; its comments are renumbered before merging.</param>
/// <param name="ensureLocked">
/// When true, each document must be locked (allowing only commenting): its extended
/// file properties must carry a <c>DocSecurity</c> value of "8".
/// </param>
/// <returns>
/// A copy of <paramref name="document1"/> or <paramref name="document2"/> when only
/// one side (or neither) has comments; otherwise a new document holding both sets.
/// </returns>
/// <exception cref="CommentMergerDifferingContentsException">
/// The two documents do not compare as equal after simplification.
/// </exception>
/// <exception cref="CommentMergerUnlockedDocumentException">
/// <paramref name="ensureLocked"/> is true and a document is not locked, including
/// the case where the extended file properties part or its <c>DocSecurity</c>
/// element is absent.
/// </exception>
public static WmlDocument MergeComments(WmlDocument document1, WmlDocument document2,
    bool ensureLocked)
{
    WmlDocument cDoc1 = new WmlDocument(document1);
    WmlDocument cDoc2 = new WmlDocument(document2);
    using (OpenXmlMemoryStreamDocument streamDoc1 = new OpenXmlMemoryStreamDocument(cDoc1))
    using (WordprocessingDocument doc1 = streamDoc1.GetWordprocessingDocument())
    using (OpenXmlMemoryStreamDocument streamDoc2 = new OpenXmlMemoryStreamDocument(cDoc2))
    using (WordprocessingDocument doc2 = streamDoc2.GetWordprocessingDocument())
    {
        SimplifyMarkupSettings mss = new SimplifyMarkupSettings()
        {
            RemoveProof = true,
            RemoveRsidInfo = true,
            RemoveGoBackBookmark = true,
        };
        MarkupSimplifier.SimplifyMarkup(doc1, mss);
        MarkupSimplifier.SimplifyMarkup(doc2, mss);

        // If documents don't contain the same content, then don't attempt to merge comments.
        bool same = DocumentComparer.CompareDocuments(doc1, doc2);
        if (!same)
        {
            throw new CommentMergerDifferingContentsException(
                "Documents do not contain the same content");
        }

        var part1 = doc1.MainDocumentPart.WordprocessingCommentsPart;
        var part2 = doc2.MainDocumentPart.WordprocessingCommentsPart;

        // No comments part in document 2: nothing to merge into document 1.
        if (part2 == null)
        {
            return new WmlDocument(document1);
        }

        // Only document 2 has a comments part.
        if (part1 == null)
        {
            return new WmlDocument(document2);
        }

        // If either of the documents have no comments, then return the other one.
        if (!part1.GetXDocument().Root.Elements(W.comment).Any())
        {
            return new WmlDocument(document2);
        }

        if (!part2.GetXDocument().Root.Elements(W.comment).Any())
        {
            return new WmlDocument(document1);
        }

        if (ensureLocked)
        {
            // If either document is not locked (allowing only commenting), don't attempt to
            // merge comments. Each step is null-checked so that a document without the
            // part or the element is rejected as unlocked instead of crashing.
            var props1 = doc1.ExtendedFilePropertiesPart;
            XElement propsRoot1 = props1 != null ? props1.GetXDocument().Root : null;
            XElement security1 = propsRoot1 != null ? propsRoot1.Element(EP.DocSecurity) : null;
            if (security1 == null || security1.Value != "8")
            {
                throw new CommentMergerUnlockedDocumentException(
                    "Document1 is not locked");
            }

            var props2 = doc2.ExtendedFilePropertiesPart;
            XElement propsRoot2 = props2 != null ? props2.GetXDocument().Root : null;
            XElement security2 = propsRoot2 != null ? propsRoot2.Element(EP.DocSecurity) : null;
            if (security2 == null || security2.Value != "8")
            {
                throw new CommentMergerUnlockedDocumentException(
                    "Document2 is not locked");
            }
        }

        RenumberCommentsInDoc2(doc1, doc2);

        WmlDocument destDoc = new WmlDocument(document1);
        using (OpenXmlMemoryStreamDocument streamDoc = new OpenXmlMemoryStreamDocument(destDoc))
        {
            using (WordprocessingDocument destWDoc = streamDoc.GetWordprocessingDocument())
            {
                // Merge the comments part.
                XDocument commentsPartXDoc = new XDocument(
                    new XElement(W.comments,
                        new XAttribute(XNamespace.Xmlns + "w", W.w),
                        part1.GetXDocument().Root.Elements(),
                        part2.GetXDocument().Root.Elements()));
                destWDoc.MainDocumentPart.WordprocessingCommentsPart.PutXDocument(commentsPartXDoc);

                MergeCommentsInPart(doc1.MainDocumentPart, doc2.MainDocumentPart,
                    destWDoc.MainDocumentPart, commentsPartXDoc);
            }

            // The destination package is disposed before the result is read back.
            return streamDoc.GetModifiedWmlDocument();
        }
    }
}
/// <summary>
/// Opens the Word document at <paramref name="filename"/> for editing and runs
/// <c>MarkupSimplifier.SimplifyMarkup</c> on it with a fixed set of settings.
/// </summary>
/// <param name="filename">Path handed to <c>WordprocessingDocument.Open</c>.</param>
public static void SimplifyDocument(String filename)
{
    using (WordprocessingDocument package = WordprocessingDocument.Open(filename, true))
    {
        // Everything listed is removed/replaced except field codes, which are kept.
        var simplifySettings = new SimplifyMarkupSettings
        {
            RemoveComments = true,
            RemoveContentControls = true,
            RemoveEndAndFootNotes = true,
            RemoveFieldCodes = false,
            RemoveLastRenderedPageBreak = true,
            RemovePermissions = true,
            RemoveProof = true,
            RemoveRsidInfo = true,
            RemoveSmartTags = true,
            RemoveSoftHyphens = true,
            ReplaceTabsWithSpaces = true,
        };

        MarkupSimplifier.SimplifyMarkup(package, simplifySettings);
    }
}