Пример #1
0
 /// <summary>
 /// Reads a zip archive from memory, picks the first entry whose extension starts with
 /// ".htm" as the document and hands its text to <c>Html2XmlUtil.CleanUpHtmlWithResources</c>.
 /// All other entries are copied into memory and offered to the converter (as PNG bytes)
 /// through the <c>Html2XmlUtil.TreatExternalData</c> event.
 /// </summary>
 /// <param name="fileData">Raw bytes of the zip archive.</param>
 /// <returns>The document produced by <c>Html2XmlUtil.CleanUpHtmlWithResources</c>.</returns>
 /// <exception cref="InvalidOperationException">The archive contains no ".htm*" entry.</exception>
 /// <exception cref="ArgumentOutOfRangeException">The HTML entry is empty.</exception>
 private static XDocument ProcessZipFile(byte[] fileData)
 {
     using (var ms = new MemoryStream(fileData))
     using (var gz = new ZipArchive(ms, ZipArchiveMode.Read)) {
         var entries = gz.Entries;
         var docFile = entries.First(z => Path.GetExtension(z.Name).StartsWith(".htm", StringComparison.Ordinal));
         // copy resources so we can leave the zip safely
         var resourceFiles = entries.Except(new[] { docFile }).Select(r => {
             using (var entryStream = r.Open())
             using (var buffer = new MemoryStream()) {
                 entryStream.CopyTo(buffer);
                 return(new {
                     Name = r.FullName,
                     Content = buffer.ToArray()
                 });
             }
         }).ToList();
         Html2XmlUtil.imageAsBase64 = true;
         // NOTE(review): this handler is added on every call and never removed. If
         // TreatExternalData is a static event, handlers (and the captured resource bytes)
         // accumulate across calls - confirm and unsubscribe once the delegate type is known.
         Html2XmlUtil.TreatExternalData += (sender, args) => {
             var images = resourceFiles.Where(f => f.Name.EndsWith(args.FileName, StringComparison.Ordinal)).ToList();
             if (images.Count == 0)
             {
                 return;
             }
             if (images.Count > 1)
             {
                 // several entries share the file name, narrow down by the full path
                 // NOTE(review): zip entry names normally use '/', Path.Combine uses the
                 // platform separator - verify that this comparison can actually match.
                 var exactName = Path.Combine(args.FilePath, args.FileName);
                 images = resourceFiles.Where(f => f.Name == exactName).ToList();
             }
             var image = images.FirstOrDefault();
             if (image == null)
             {
                 // nothing matched the exact path; treat it like a missing resource
                 // instead of aborting the whole conversion
                 return;
             }
             try {
                 using (var imageStream = new MemoryStream(image.Content))
                 using (var img = System.Drawing.Image.FromStream(imageStream))
                 using (var pngImgMs = new MemoryStream()) {
                     img.Save(pngImgMs, ImageFormat.Png);
                     args.Data = pngImgMs.ToArray();
                 }
             } catch (Exception) {
                 // the entry could not be loaded or re-encoded as an image
                 args.Data = null;
             }
         };
         byte[] bytes;
         using (var docStream = docFile.Open())
         using (var result = new MemoryStream()) {
             docStream.CopyTo(result);
             bytes = result.ToArray();
         }
         if (bytes.Length == 0)
         {
             throw new ArgumentOutOfRangeException("fileData", "The HTML document inside the archive is empty.");
         }
         // unclear, even ISO docs do well in UTF8; a possible alternative would be
         // IsUtf8(bytes) ? Encoding.UTF8.GetString(bytes) : Encoding.GetEncoding(1252).GetString(bytes)
         var html = Encoding.UTF8.GetString(bytes);
         // make clean XHTML with embedded images
         return(Html2XmlUtil.CleanUpHtmlWithResources(html));
     }
 }
Пример #2
0
        /// <summary>
        /// Converts UTF-8 encoded HTML into the internal content XML and restores it into a
        /// newly created <c>Opus</c> via <c>RestoreOpusFromFile</c>.
        /// </summary>
        /// <param name="content">The HTML document as UTF-8 bytes.</param>
        /// <param name="mapping">
        /// Optional style mapping; when given, its character, paragraph and numbering style
        /// entries are passed to the converter as key/value parameters.
        /// </param>
        /// <param name="name">Name of the import; only used in the failure trace message.</param>
        /// <returns>
        /// The new <c>Opus</c>. Conversion is best effort: if it fails, the failure is traced
        /// and the Opus is returned in whatever state it reached.
        /// </returns>
        public static Opus ImportSingleHtml(byte[] content, Import mapping, string name)
        {
            Opus opus = new Opus();
            var  html = Encoding.UTF8.GetString(content);

            try {
                // convert prepared HTML into internal <Content> XML (backup and restore format)
                var parameters = new System.Collections.Specialized.NameValueCollection();
                if (mapping != null)
                {
                    mapping.CharacterStyles.ForEach(c => parameters.Add(c.Key, c.Value));
                    mapping.ParagraphStyles.ForEach(c => parameters.Add(c.Key, c.Value));
                    mapping.NumberingStyles.ForEach(c => parameters.Add(c.Key, c.Value));
                }
                var xml = Html2XmlUtil.HtmlToOpusXsltParser(html, parameters);
                using (var xmlStream = new MemoryStream(Encoding.UTF8.GetBytes(xml))) {
                    var xDoc = XDocument.Load(xmlStream);
                    // use restore to create import, all content is in Opus
                    RestoreOpusFromFile(opus, xDoc, null);
                }
            }
            catch (Exception ex) {
                // Keep the best-effort contract (callers still receive the Opus), but do not
                // lose the failure silently: leave a trace so broken imports can be diagnosed.
                System.Diagnostics.Trace.TraceError("ImportSingleHtml failed for '{0}': {1}", name, ex);
            }
            return(opus);
        }
Пример #3
0
        /// <summary>
        /// Interactive console loop: asks which kind of import to run, dispatches on the
        /// pressed key and repeats for as long as the user answers "y".
        /// </summary>
        /// <param name="args">Command line arguments (not used).</param>
        private static void Main(string[] args)
        {
            string again;

            do
            {
                Console.Write("Import Word HTML (H), Word (W), XML (X), Cleanup (C) HTML? ");
                // the configuration is loaded anew for every round
                XDocument settings = XDocument.Load("config.xml");
                string    choice   = Console.ReadKey(false).KeyChar.ToString().ToLower();

                if (choice == "t")
                {
                    // only generates the table transform; the result is not used further
                    Html2XmlUtil.GenerateFixTableXslt(settings);
                }
                else if (choice == "h")
                {
                    string sourcePath = GetPath("Word HTML");
                    // generate the transform files from config on-the-fly
                    XDocument tableFix   = Html2XmlUtil.GenerateFixTableXslt(settings);
                    XDocument listingFix = Html2XmlUtil.GenerateFixListingXslt(settings);
                    ImportSingleHtml(sourcePath, tableFix, listingFix);
                }
                else if (choice == "w")
                {
                    // the path is requested, but nothing is done with it yet
                    Console.WriteLine("");
                    GetPath("Word DOCX");
                }
                else if (choice == "x")
                {
                    Console.WriteLine("");
                    GetPath("texxtoor XML");
                }
                else if (choice == "c")
                {
                    Console.WriteLine("");
                    GetPath("Word HTML (clean only)");
                }
                else
                {
                    Console.WriteLine("");
                    Console.Write("Unknown key. ");
                }

                Console.Write("Repeat (y) or End (n)?");
                again = Console.ReadKey(false).KeyChar.ToString().ToLower();
                Console.WriteLine("");
            } while (again == "y");

            Console.WriteLine("done");
            // TODO: Check results by creating a preview in plain HTML
            Console.ReadLine();
        }
Пример #4
0
        /// <summary>
        /// Reads an HTML file as UTF-8, runs it through <c>Html2XmlUtil.HtmlToOpusXsltParser</c>
        /// and loads the result as XML. Progress goes to the console; a failure during
        /// conversion or XML loading is printed in red instead of being thrown.
        /// </summary>
        /// <param name="path">Path of the HTML file; its name without extension is passed to the converter.</param>
        /// <param name="tableFix">Table fix transform handed to the converter.</param>
        /// <param name="listingFix">Listing fix transform handed to the converter.</param>
        private static void ImportSingleHtml(string path, XDocument tableFix, XDocument listingFix)
        {
            var markup       = File.ReadAllText(path, Encoding.UTF8);
            var documentName = Path.GetFileNameWithoutExtension(path);

            Console.WriteLine("Convert start");
            try {
                // convert prepared HTML into internal <Content> XML (backup and restore format)
                var converted = Html2XmlUtil.HtmlToOpusXsltParser(documentName, markup, tableFix, listingFix);
                var utf8Bytes = Encoding.UTF8.GetBytes(converted);
                using (var buffer = new MemoryStream(utf8Bytes)) {
                    Console.WriteLine("Parser");
                    // the loaded document is discarded; loading only proves the XML parses
                    XDocument.Load(buffer);
                }
            }
            catch (Exception failure) {
                // report in red, then restore the previous console colour
                var previousColor = Console.ForegroundColor;
                Console.ForegroundColor = ConsoleColor.Red;
                Console.WriteLine(failure.Message);
                Console.ForegroundColor = previousColor;
            }
        }
Пример #5
0
        /// <summary>
        /// Event handler that converts the HTML file named by <c>htmlFileName</c> into XML:
        /// the HTML is normalized to XHTML, transformed with
        /// <c>Html2XmlUtil.HtmlToOpusXsltParser</c> and written next to the source file with
        /// the extension ".xml". Progress and conversion errors are reported through
        /// <c>AddTextToProtocol</c>.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void fehlerBehandelnToolStripMenuItem_Click(object sender, EventArgs e)
        {
            // convert into opus XML and manage conversion errors
            var mapping = GetDefaultMapping();
            var html    = File.ReadAllText(htmlFileName, Encoding.Default);

            try {
                // Make XHTML
                var xhtmlDocument = new Texxtoor.BaseLibrary.Core.HtmlAgility.Pack.HtmlDocument();
                xhtmlDocument.OptionOutputAsXhtml = true;
                xhtmlDocument.OptionFixNestedTags = true;
                xhtmlDocument.OptionOutputAsXml   = true;
                xhtmlDocument.LoadHtml(html);
                byte[] xhtmlBytes;
                using (var ms = new MemoryStream())
                using (var tw = new XmlTextWriter(ms, Encoding.Default)) {
                    xhtmlDocument.Save(tw);
                    // make sure everything buffered in the writer has reached the stream
                    tw.Flush();
                    xhtmlBytes = ms.ToArray();
                }
                // NOTE(review): the bytes are written with Encoding.Default but decoded as
                // iso-8859-1; the two only agree where the default code page matches
                // Latin-1 - confirm this is intended.
                var enc   = Encoding.GetEncoding("iso-8859-1");
                var xhtml = enc.GetString(xhtmlBytes);
                // transform
                var xml = Html2XmlUtil.HtmlToOpusXsltParser(xhtml, mapping);
                // TODO: check XML here

                // TODO: if XML is okay store on disc for further processing
                var xmlFileName = Path.Combine(Path.GetDirectoryName(htmlFileName) ?? String.Empty, String.Format("{0}.xml", Path.GetFileNameWithoutExtension(htmlFileName)));
                if (File.Exists(xmlFileName))
                {
                    AddTextToProtocol("XML exists, deleting ");
                    File.Delete(xmlFileName);
                    AddTextToProtocol("XML deleted ");
                }
                AddTextToProtocol("Attempt to write XML");
                File.WriteAllText(xmlFileName, xml, Encoding.UTF8);
                AddTextToProtocol("XML written to disc at {0}", xmlFileName);
            } catch (Exception ex) {
                AddTextToProtocol("** Fehler beim Konvertieren in Texxtoor-XML: {0}", ex.Message);
            }
        }
Пример #6
0
        /// <summary>
        /// Event handler that converts the HTML file named by <c>htmlFileName</c> into XML and
        /// post-processes the result: the HTML is normalized to XHTML, entities in leaf
        /// elements are decoded, the document is transformed with
        /// <c>Html2XmlUtil.HtmlToOpusXsltParser</c>, leading numbers are stripped from section
        /// headers, bullet characters are stripped from list items, and images found in
        /// <c>imgFolder</c> are embedded as Base64 encoded PNG. The XML is written next to
        /// the source file with the extension ".xml". Progress and errors are reported through
        /// <c>AddTextToProtocol</c>.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void fehlerBehandelnToolStripMenuItem_Click(object sender, EventArgs e)
        {
            // convert into opus XML and manage conversion errors
            var mapping = GetDefaultMapping();
            var html    = File.ReadAllText(htmlFileName, Encoding.Default);

            try {
                AddTextToProtocol("Make XHTML");
                // Make XHTML
                var xhtmlDocument = new Texxtoor.BaseLibrary.Core.HtmlAgility.Pack.HtmlDocument();
                xhtmlDocument.OptionOutputAsXhtml = true;
                xhtmlDocument.OptionFixNestedTags = true;
                xhtmlDocument.OptionOutputAsXml   = true;
                xhtmlDocument.LoadHtml(html);
                byte[] xhtmlBytes;
                using (var ms = new MemoryStream())
                using (var tw = new XmlTextWriter(ms, Encoding.Default)) {
                    xhtmlDocument.Save(tw);
                    // make sure everything buffered in the writer has reached the stream
                    tw.Flush();
                    xhtmlBytes = ms.ToArray();
                }
                // NOTE(review): the bytes are written with Encoding.Default but decoded as
                // iso-8859-1; the two only agree where the default code page matches
                // Latin-1 - confirm this is intended.
                var enc   = Encoding.GetEncoding("iso-8859-1");
                var xhtml = enc.GetString(xhtmlBytes);
                // declare UTF-8 for XSLT parsing
                var doc = System.Xml.Linq.XDocument.Parse(xhtml);
                if (doc.Declaration != null)
                {
                    doc.Declaration.Encoding = "UTF-8";
                }
                var sb = new StringBuilder();
                var targetDocWriter = new StringWriterUtf8(sb);
                // replace entities with regular characters, double call for weird &amp;nbsp;'s;
                // the query is materialized first because the loop modifies the tree
                var leaves = doc.Descendants().Where(elm => !elm.Elements().Any()).ToList();
                foreach (var leaf in leaves)
                {
                    var v = HttpUtility.HtmlDecode(HttpUtility.HtmlDecode(leaf.Value));
                    // U+0085 is what byte 0x85 becomes under ISO-8859-1; in Windows-1252 that
                    // byte is the horizontal ellipsis, presumably the reason for "..."
                    leaf.Value = v.Replace(((char)133).ToString(), "...");
                }
                doc.Save(targetDocWriter);
                xhtml = sb.ToString();
                // transform
                var xml = Html2XmlUtil.HtmlToOpusXsltParser(xhtml, mapping);
                // TODO: check XML here
                using (var xmlStream = new MemoryStream(Encoding.UTF8.GetBytes(xml)))
                using (var xmlReader = new XmlTextReader(xmlStream)) {
                    doc = XDocument.Load(xmlReader);
                }
                AddTextToProtocol("Fix XML");
                // 1. Fix headers: <Element Type="Section" ...>1   Name</Element> ===> Remove "1    "
                var headers = doc.Descendants("Element").Where(elm => (string)elm.Attribute("Type") == "Section").ToList();
                foreach (var header in headers)
                {
                    // only a plain text node at the start is rewritten; empty elements,
                    // CDATA and child elements are left untouched
                    var firstText = header.FirstNode as XText;
                    if (firstText == null || firstText.NodeType != XmlNodeType.Text)
                    {
                        continue;
                    }
                    var match = Regex.Match(firstText.Value.Trim(), @"^\d{1,3}(?:\.\d{1,3}){0,5}\s+(.*)");
                    if (match.Success)
                    {
                        firstText.Value = match.Groups[1].Value;
                    }
                }
                // 2. Fix bullet points <li>.     Text</li> ===> Remove ".      "
                var bullets = doc.Descendants("li").ToList();
                foreach (var bullet in bullets)
                {
                    var match = Regex.Match(bullet.Value, @"[·]\s+(.*)");
                    if (match.Success)
                    {
                        bullet.Value = match.Groups[1].Value;
                    }
                }
                AddTextToProtocol("Embed Images");
                // 3. Embed images (Element Type="Image")
                var images = doc.Descendants("Element").Where(elm => (string)elm.Attribute("Type") == "Image").ToList();
                foreach (var image in images)
                {
                    // try reading file
                    var pathAttribute = image.Attribute("Path");
                    if (pathAttribute == null)
                    {
                        continue;
                    }
                    // we take PNG only, hence a conversion is appropriate
                    var path = Path.Combine(imgFolder, Path.GetFileName(pathAttribute.Value));
                    if (!File.Exists(path))
                    {
                        continue;
                    }
                    // dispose the image right away: Image.FromFile keeps the file locked
                    // until the image is disposed
                    using (var img = Image.FromFile(path))
                    using (var imgMs = new MemoryStream()) {
                        img.Save(imgMs, ImageFormat.Png);
                        // apply as embedded base64; SetAttributeValue also copes with an
                        // already existing "Method" attribute
                        image.SetAttributeValue("Method", "Base64");
                        image.Value = Convert.ToBase64String(imgMs.ToArray());
                    }
                }
                sb.Clear();
                targetDocWriter = new StringWriterUtf8(sb);
                doc.Save(targetDocWriter);
                xml = sb.ToString();
                // TODO: if XML is okay store on disc for further processing
                var xmlFileName = Path.Combine(Path.GetDirectoryName(htmlFileName) ?? String.Empty, String.Format("{0}.xml", Path.GetFileNameWithoutExtension(htmlFileName)));
                if (File.Exists(xmlFileName))
                {
                    AddTextToProtocol("XML exists, deleting ");
                    File.Delete(xmlFileName);
                    AddTextToProtocol("XML deleted ");
                }
                AddTextToProtocol("Attempt to write XML");
                File.WriteAllText(xmlFileName, xml, Encoding.UTF8);
                AddTextToProtocol("XML written to disc at {0}", xmlFileName);
            } catch (Exception ex) {
                AddTextToProtocol("** Fehler beim Konvertieren in Texxtoor-XML: {0}", ex.Message);
            }
        }