Beispiel #1
0
        #pragma warning restore CS1998

        /// <summary>
        /// Extracts the fields from a template file, writing a preprocessed copy of the template and a
        /// JSON serialization of the accumulated fields alongside it.
        /// </summary>
        /// <param name="templateFileName">
        /// Path of the template to read. The two output paths are formed by appending "obj.docx" and
        /// "obj.json" to this string.
        /// </param>
        /// <param name="removeCustomProperties">Passed through unchanged to <c>ExtractAllTemplateFields</c>.</param>
        /// <param name="keepPropertyNames">Passed through unchanged to <c>ExtractAllTemplateFields</c>.</param>
        /// <returns>A <c>FieldExtractResult</c> constructed from the two output file names.</returns>
        public static FieldExtractResult ExtractFields(string templateFileName,
                                                       bool removeCustomProperties = true, IEnumerable <string> keepPropertyNames = null)
        {
            string processedDocPath = templateFileName + "obj.docx";
            string fieldJsonPath    = templateFileName + "obj.json";

            // Load the template and take its bytes; all editing below happens on an in-memory copy.
            var    sourceTemplate = new WmlDocument(templateFileName);
            byte[] sourceBytes    = sourceTemplate.DocumentByteArray;
            var    fields         = new FieldAccumulator();

            byte[] processedBytes;
            using (var buffer = new MemoryStream())
            {
                buffer.Write(sourceBytes, 0, sourceBytes.Length);

                // Open the copy for editing; the package is disposed before the bytes are captured below.
                using (var package = WordprocessingDocument.Open(buffer, true))
                {
                    // Drop any task pane / web extension parts before looking for fields.
                    var taskPaneParts = package.GetPartsOfType <WebExTaskpanesPart>();
                    package.DeleteParts <WebExTaskpanesPart>(taskPaneParts);

                    // Collect the fields from every content part of the template.
                    ExtractAllTemplateFields(package, fields, removeCustomProperties, keepPropertyNames);
                }
                processedBytes = buffer.ToArray();
            }

            // Write the preprocessed template to disk.
            // NOTE(review): the original author's comment says error messages are recorded in this file -- confirm.
            var processedTemplate = new WmlDocument(processedDocPath, processedBytes);
            processedTemplate.Save();

            // Write the accumulated fields as JSON; disposing the writer flushes and closes the file.
            using (StreamWriter writer = File.CreateText(fieldJsonPath))
            {
                fields.JsonSerialize(writer);
            }

            var result = new FieldExtractResult(processedDocPath, fieldJsonPath);
            return(result);
        }
Beispiel #2
0
        /// <summary>
        /// Runs field identification over one part's XML and stores the transformed tree back into the part.
        /// </summary>
        /// <param name="part">The part whose XML document is read, rewritten and put back.</param>
        /// <param name="fieldAccumulator">Receives the fields found while transforming the part.</param>
        private static void ExtractFieldsFromPart(OpenXmlPart part, FieldAccumulator fieldAccumulator)
        {
            XDocument partXml = part.GetXDocument();

            // Build the transformed copy of the root first ...
            object   transformed = IdentifyAndTransformFields(partXml.Root, fieldAccumulator);
            XElement newRoot     = (XElement)transformed;

            // ... then swap it in for the document's current (first and only) root element.
            XElement currentRoot = partXml.Elements().First();
            currentRoot.ReplaceWith(newRoot);

            part.PutXDocument();
        }
Beispiel #3
0
        /// <summary>
        /// Extracts fields from every content part of a template and, optionally, strips document
        /// variables and custom file properties from it.
        /// </summary>
        /// <param name="wordDoc">The opened template; it is modified in place.</param>
        /// <param name="fieldAccumulator">Receives the fields found in each content part.</param>
        /// <param name="removeCustomProperties">
        /// When true, document variables (in the main part's settings) and custom file properties are removed,
        /// except those named in <paramref name="keepPropertyNames"/>.
        /// </param>
        /// <param name="keepPropertyNames">
        /// Names of variables/properties to leave in place; null means nothing is kept.
        /// </param>
        /// <exception cref="FieldParseException">The template contains tracked revisions.</exception>
        private static void ExtractAllTemplateFields(WordprocessingDocument wordDoc, FieldAccumulator fieldAccumulator,
                                                     bool removeCustomProperties = true, IEnumerable <string> keepPropertyNames = null)
        {
            if (RevisionAccepter.HasTrackedRevisions(wordDoc))
            {
                throw new FieldParseException("Invalid template - contains tracked revisions");
            }

            // extract fields from each part of the document
            foreach (var part in wordDoc.ContentParts())
            {
                ExtractFieldsFromPart(part, fieldAccumulator);

                if (!removeCustomProperties)
                {
                    continue;
                }

                // remove document variables
                // (in case they have any sensitive information that should not carry over to assembled documents!)
                // Document variables live in the settings of the main document part only.
                MainDocumentPart main = part as MainDocumentPart;
                if (main == null)
                {
                    continue;
                }

                // The settings part is optional in a package, and its root element may be absent;
                // in either case there are no document variables to remove.
                var settingsPart = main.DocumentSettingsPart;
                if (settingsPart == null || settingsPart.Settings == null)
                {
                    continue;
                }

                // Snapshot with ToList() at both levels because elements are removed while iterating.
                var docVariables = settingsPart.Settings.Descendants <DocumentVariables>();
                foreach (DocumentVariables docVars in docVariables.ToList())
                {
                    foreach (DocumentVariable docVar in docVars.ToList())
                    {
                        if (keepPropertyNames == null || !Enumerable.Contains <string>(keepPropertyNames, docVar.Name))
                        {
                            docVar.Remove();
                        }
                    }
                }
            }

            if (removeCustomProperties)
            {
                // remove custom properties if there are any (custom properties are the new/non-legacy version of document variables)
                var custom = wordDoc.CustomFilePropertiesPart;
                if (custom != null && custom.Properties != null)
                {
                    // Snapshot with ToList() because properties are removed while iterating.
                    foreach (CustomDocumentProperty prop in custom.Properties.ToList())
                    {
                        if (keepPropertyNames == null || !Enumerable.Contains <string>(keepPropertyNames, prop.Name))
                        {
                            prop.Remove();
                        }
                    }
                }
            }
        }
Beispiel #4
0
        /// <summary>
        /// Recursively builds a transformed copy of an XML node, registering every template field it
        /// recognizes with <paramref name="fieldAccumulator"/> and rewriting the markup around each field.
        /// </summary>
        /// <remarks>
        /// Handling by node kind:
        /// <list type="bullet">
        /// <item>w:sdt with no (or empty) alias whose text is recognized as a field: copied with its w:tag
        /// value set to the field id returned by the accumulator; its children are copied as-is.</item>
        /// <item>w:p: wrapped in BeginBlock/EndBlock calls on the accumulator. A paragraph consisting of a
        /// single embedded field is rebuilt around one wrapped run; a paragraph containing embedded fields
        /// among other content has each embedded field replaced by a wrapped run and is then re-processed.</item>
        /// <item>w:lastRenderedPageBreak: dropped (null is returned).</item>
        /// <item>Any other element: copied (name and attributes) with its child nodes transformed recursively.</item>
        /// <item>Non-element nodes: returned unchanged.</item>
        /// </list>
        /// Note that the input tree is not left untouched: a recognized w:sdt has its w:sdtPr detached and
        /// moved to the copy, and the embedded-field replacement is run against the original paragraph.
        /// </remarks>
        /// <param name="node">The node to transform.</param>
        /// <param name="fieldAccumulator">Collects the fields found and tracks paragraph blocks.</param>
        /// <returns>The transformed node, or null when the node is to be omitted from the output.</returns>
        private static object IdentifyAndTransformFields(XNode node, FieldAccumulator fieldAccumulator)
        {
            XElement element = node as XElement;

            if (element != null)
            {
                if (element.Name == W.sdt)
                {
                    var alias = (string)element.Elements(W.sdtPr).Elements(W.alias).Attributes(W.val).FirstOrDefault();
                    if (string.IsNullOrEmpty(alias))
                    {
                        // Concatenate the text of the content control's w:t elements
                        // (DescendantsTrimmed presumably does not descend into w:txbxContent -- confirm).
                        var ccContents = element
                                         .DescendantsTrimmed(W.txbxContent)
                                         .Where(e => e.Name == W.t)
                                         .Select(t => (string)t)
                                         .StringConcatenate()
                                         .CleanUpInvalidCharacters();
                        if (FieldRecognizer.IsField(ccContents, out ccContents))
                        {
                            var newCC = new XElement(element.Name, element.Attributes());
                            var props = element.Elements(W.sdtPr).FirstOrDefault();
                            if (props == null)
                            {
                                props = new XElement(W.sdtPr);
                            }
                            else
                            {
                                // Detach the existing properties so they are moved (not cloned) into the
                                // copy and are not duplicated by element.Nodes() below.
                                props.Remove();
                            }
                            newCC.Add(props);
                            var tagElem = props.Elements(W.tag).FirstOrDefault();
                            if (tagElem == null)
                            {
                                tagElem = new XElement(W.tag);
                                props.Add(tagElem);
                            }
                            var fieldId = fieldAccumulator.AddField(ccContents);
                            tagElem.SetAttributeValue(W.val, fieldId);
                            newCC.Add(element.Nodes());
                            return(newCC);
                        }
                    }
                    // Aliased content controls, and un-aliased ones whose text is not a field,
                    // are copied with their children transformed recursively.
                    return(new XElement(element.Name,
                                        element.Attributes(),
                                        element.Nodes().Select(n => IdentifyAndTransformFields(n, fieldAccumulator))));
                }
                if (element.Name == W.p)
                {
                    fieldAccumulator.BeginBlock();
                    var paraContents = element
                                       .DescendantsTrimmed(W.txbxContent)
                                       .Where(e => e.Name == W.t)
                                       .Select(t => (string)t)
                                       .StringConcatenate()
                                       .Trim();
                    int occurrences = CountSubstring(FieldRecognizer.EmbedBegin, paraContents);
                    // Fast path: the whole paragraph is exactly one embedded field.
                    // Ordinal comparison is required here: the Substring below slices by the delimiters'
                    // ordinal lengths, so a culture-sensitive match (which may ignore some characters)
                    // could otherwise slice the wrong span.
                    if (occurrences == 1 &&
                        paraContents.StartsWith(FieldRecognizer.EmbedBegin + FieldRecognizer.FieldBegin, StringComparison.Ordinal) &&
                        paraContents.EndsWith(FieldRecognizer.FieldEnd + FieldRecognizer.EmbedEnd, StringComparison.Ordinal))
                    {
                        // Strip the embed delimiters, leaving the field text (with its field delimiters).
                        var content = paraContents
                                      .Substring(FieldRecognizer.EmbedBegin.Length,
                                                 paraContents.Length - FieldRecognizer.EmbedBegin.Length - FieldRecognizer.EmbedEnd.Length)
                                      .Trim();
                        if (FieldRecognizer.IsField(content, out content))
                        {
                            var fieldId = fieldAccumulator.AddField(content);
                            fieldAccumulator.EndBlock();
                            // Carry the paragraph-level run properties (if any) onto the replacement run.
                            var      ppr      = element.Elements(W.pPr).FirstOrDefault();
                            var      rpr      = (ppr != null) ? ppr.Elements(W.rPr).FirstOrDefault() : null;
                            XElement fieldRun = new XElement(W.r, rpr,
                                                             new XElement(W.t, FieldRecognizer.FieldBegin + content + FieldRecognizer.FieldEnd));
                            return(new XElement(element.Name,
                                                element.Attributes(),
                                                element.Elements(W.pPr),
                                                CCTWrap(fieldId, fieldRun)
                                                ));
                        }
                        // else fall through to (slower) case
                    }
                    // Slower path: embedded fields mixed with other paragraph content.
                    if (paraContents.Contains(FieldRecognizer.EmbedBegin + FieldRecognizer.FieldBegin))
                    {
                        fieldAccumulator.RegisterNonFieldContentInBlock();
                        var runReplacementInfo = new List <XElement>();
                        // A unique placeholder marks each replaced span so its run can be located afterwards.
                        var placeholderText      = Guid.NewGuid().ToString();
                        var embeddedFieldPattern = new Regex(
                            Regex.Escape(FieldRecognizer.EmbedBegin)
                            + "\\s*"
                            + Regex.Escape(FieldRecognizer.FieldBegin)
                            + ".*?"
                            + Regex.Escape(FieldRecognizer.FieldEnd)
                            + "\\s*"
                            + Regex.Escape(FieldRecognizer.EmbedEnd));
                        // The callback decides per match whether it is replaced with the placeholder:
                        // true for recognized fields (whose wrapped run is queued), false otherwise.
                        var replacedCount = OpenXmlRegex.Replace(new[] { element }, embeddedFieldPattern, placeholderText, (para, match) =>
                        {
                            var matchString = match.Value.Trim().Replace("\u0001", ""); // unrecognized codes/elements returned as \u0001; strip these
                            var content     = matchString.Substring(
                                FieldRecognizer.EmbedBegin.Length,
                                matchString.Length - FieldRecognizer.EmbedBegin.Length - FieldRecognizer.EmbedEnd.Length
                                ).CleanUpInvalidCharacters();
                            if (FieldRecognizer.IsField(content, out content))
                            {
                                runReplacementInfo.Add(CCWrap(new XElement(W.r, new XElement(W.t,
                                                                                             FieldRecognizer.FieldBegin + content + FieldRecognizer.FieldEnd))));
                                return(true);
                            }
                            return(false);
                        }, false);
                        if (replacedCount > 0)
                        {
                            // Copy the paragraph (now containing placeholders) and substitute the queued
                            // wrapped runs for the placeholder runs, in document order.
                            var newPara = new XElement(element);
                            foreach (var elem in runReplacementInfo)
                            {
                                var runToReplace = newPara.Descendants(W.r).FirstOrDefault(rn => rn.Value == placeholderText &&
                                                                                           rn.Parent.Name != Templater.OD.Content);
                                if (runToReplace == null)
                                {
                                    throw new InvalidOperationException("Internal error");
                                }
                                else
                                {
                                    // Move the placeholder run's formatting onto the run inside the wrapper.
                                    var rpr = runToReplace.Elements(W.rPr).FirstOrDefault();
                                    if (rpr != null)
                                    {
                                        rpr.Remove();
                                        elem.Elements(W.sdtContent).First().Elements(W.r).First().AddFirst(rpr);
                                    }
                                    runToReplace.ReplaceWith(elem);
                                }
                            }
                            // Re-process the rewritten paragraph so the newly inserted content controls
                            // are picked up by the w:sdt handling above.
                            var coalescedParagraph = WordprocessingMLUtil.CoalesceAdjacentRunsWithIdenticalFormatting(newPara);
                            var transformedContent = IdentifyAndTransformFields(coalescedParagraph, fieldAccumulator);
                            fieldAccumulator.EndBlock();
                            return(transformedContent);
                        }
                    }
                    // No embedded fields were replaced: transform the children eagerly (ToArray) so that
                    // all fields inside are registered before the block is ended.
                    var transformedParaContent = element.Nodes().Select(n => IdentifyAndTransformFields(n, fieldAccumulator)).ToArray();
                    fieldAccumulator.EndBlock();
                    return(new XElement(element.Name, element.Attributes(), transformedParaContent));
                }
                if (element.Name == W.lastRenderedPageBreak)
                {
                    // documents assembled from templates will almost always change pagination, so remove Word's pagination hints
                    // (also because they're not handled cleanly by OXPT)
                    return(null);
                }

                return(new XElement(element.Name,
                                    element.Attributes(),
                                    element.Nodes().Select(n => IdentifyAndTransformFields(n, fieldAccumulator))));
            }
            return(node);
        }