/// <summary>
/// Reads a zipped HTML export from memory and converts the contained HTML document
/// into cleaned-up XHTML via <c>Html2XmlUtil.CleanUpHtmlWithResources</c>.
/// </summary>
/// <remarks>
/// The first archive entry whose extension starts with ".htm" is treated as the document.
/// All other entries are buffered in memory and served to the
/// <c>Html2XmlUtil.TreatExternalData</c> event, re-encoded as PNG.
/// </remarks>
/// <param name="fileData">Raw bytes of the zip archive.</param>
/// <returns>The document produced by <c>Html2XmlUtil.CleanUpHtmlWithResources</c>.</returns>
/// <exception cref="InvalidOperationException">The archive has no entry with a ".htm*" extension.</exception>
/// <exception cref="ArgumentOutOfRangeException">The HTML entry is empty.</exception>
private static XDocument ProcessZipFile(byte[] fileData) {
  using (var ms = new MemoryStream(fileData))
  using (var gz = new ZipArchive(ms, ZipArchiveMode.Read)) {
    var entries = gz.Entries;
    // Ordinal comparison: file extensions are identifiers, not linguistic text.
    var docFile = entries.First(z => Path.GetExtension(z.Name).StartsWith(".htm", StringComparison.Ordinal));

    // Copy the resources into memory so they do not depend on the archive streams.
    var resourceFiles = entries.Except(new[] { docFile }).Select(r => {
      using (var entryStream = r.Open())
      using (var buffer = new MemoryStream()) {
        entryStream.CopyTo(buffer);
        return(new { Name = r.FullName, Content = buffer.ToArray() });
      }
    }).ToList();

    Html2XmlUtil.imageAsBase64 = true;
    // NOTE(review): a new handler is attached on every call and never detached, so handlers
    // (and the buffered resources they capture) accumulate on what looks like a static event.
    // Detaching needs the event's delegate type, which is not visible here - confirm and fix.
    Html2XmlUtil.TreatExternalData += (sender, args) => {
      var images = resourceFiles.Where(f => f.Name.EndsWith(args.FileName, StringComparison.Ordinal)).ToList();
      if (images.Count == 0) {
        return;
      }
      if (images.Count > 1) {
        // Several files share the name; narrow down by the full relative path.
        images = resourceFiles.Where(f => f.Name == Path.Combine(args.FilePath, args.FileName)).ToList();
      }
      var image = images.FirstOrDefault();
      if (image == null) {
        // Ambiguous name and no exact path match: treat like "not found" instead of throwing.
        return;
      }
      try {
        using (var imageStream = new MemoryStream(image.Content))
        using (var img = System.Drawing.Image.FromStream(imageStream))
        using (var pngImgMs = new MemoryStream()) {
          img.Save(pngImgMs, ImageFormat.Png);
          args.Data = pngImgMs.ToArray();
        }
      } catch (Exception) {
        // Best effort: anything that cannot be decoded as an image yields no data.
        args.Data = null;
      }
    };

    byte[] bytes;
    using (var docStream = docFile.Open())
    using (var result = new MemoryStream()) {
      docStream.CopyTo(result);
      if (result.Length == 0) {
        throw new ArgumentOutOfRangeException("fileData", "The HTML document inside the archive is empty.");
      }
      bytes = result.ToArray();
    }

    // Always decoded as UTF-8; per the original author's note, ISO-encoded documents
    // also came through acceptably, so no code-page detection is attempted.
    var html = Encoding.UTF8.GetString(bytes);

    // make clean XHTML with embedded images
    return(Html2XmlUtil.CleanUpHtmlWithResources(html));
  }
}
/// <summary>
/// Converts prepared HTML into the internal content XML and restores it into a new <c>Opus</c>.
/// </summary>
/// <param name="content">UTF-8 encoded HTML.</param>
/// <param name="mapping">
/// Optional style mapping; when present, its character, paragraph and numbering style
/// pairs are passed to the XSLT conversion as parameters.
/// </param>
/// <param name="name">Name of the import; only used to identify the import in the failure trace.</param>
/// <returns>
/// The created <c>Opus</c>. Conversion is best effort: if it fails, the failure is traced
/// and the opus is returned in whatever state it had reached.
/// </returns>
public static Opus ImportSingleHtml(byte[] content, Import mapping, string name) {
  Opus opus = new Opus();
  var html = Encoding.UTF8.GetString(content);
  try {
    // convert prepared HTML into internal <Content> XML (backup and restore format)
    var parameters = new System.Collections.Specialized.NameValueCollection();
    if (mapping != null) {
      mapping.CharacterStyles.ForEach(c => parameters.Add(c.Key, c.Value));
      mapping.ParagraphStyles.ForEach(c => parameters.Add(c.Key, c.Value));
      mapping.NumberingStyles.ForEach(c => parameters.Add(c.Key, c.Value));
    }
    var xml = Html2XmlUtil.HtmlToOpusXsltParser(html, parameters);
    using (var xmlStream = new MemoryStream(Encoding.UTF8.GetBytes(xml))) {
      var xDoc = XDocument.Load(xmlStream);
      // use restore to create the import; all content ends up in the opus
      RestoreOpusFromFile(opus, xDoc, null);
    }
  } catch (Exception ex) {
    // The method stays best effort (callers still receive the opus), but the failure is
    // no longer swallowed silently.
    System.Diagnostics.Trace.TraceError("ImportSingleHtml failed for '{0}': {1}", name, ex);
  }
  return(opus);
}
/// <summary>
/// Interactive console loop: asks which kind of import to run, performs it and repeats
/// for as long as the user answers "y".
/// </summary>
/// <param name="args">Command line arguments (not used).</param>
private static void Main(string[] args) {
  string again;
  do {
    Console.Write("Import Word HTML (H), Word (W), XML (X), Cleanup (C) HTML? ");
    // The configuration is re-read on every pass, before the key press is consumed.
    XDocument config = XDocument.Load("config.xml");
    string choice = Console.ReadKey(false).KeyChar.ToString().ToLower();

    if (choice == "t") {
      // Generates the table transform only; the result is not used further.
      Html2XmlUtil.GenerateFixTableXslt(config);
    } else if (choice == "h") {
      string htmlPath = GetPath("Word HTML");
      // generate the transform files from config on-the-fly
      XDocument tableXslt = Html2XmlUtil.GenerateFixTableXslt(config);
      XDocument listingXslt = Html2XmlUtil.GenerateFixListingXslt(config);
      ImportSingleHtml(htmlPath, tableXslt, listingXslt);
    } else if (choice == "w") {
      Console.WriteLine("");
      GetPath("Word DOCX");
    } else if (choice == "x") {
      Console.WriteLine("");
      GetPath("texxtoor XML");
    } else if (choice == "c") {
      Console.WriteLine("");
      GetPath("Word HTML (clean only)");
    } else {
      Console.WriteLine("");
      Console.Write("Unknown key. ");
    }

    Console.Write("Repeat (y) or End (n)?");
    again = Console.ReadKey(false).KeyChar.ToString().ToLower();
    Console.WriteLine("");
  } while (again == "y");

  Console.WriteLine("done");
  // TODO: Check results by creating a preview in plain HTML
  Console.ReadLine();
}
/// <summary>
/// Reads a Word HTML file, runs it through <c>Html2XmlUtil.HtmlToOpusXsltParser</c> and checks
/// that the result loads as XML. Progress goes to the console; any failure is printed in red.
/// </summary>
/// <param name="path">Path of the HTML file; read as UTF-8.</param>
/// <param name="tableFix">Table transform handed to the parser.</param>
/// <param name="listingFix">Listing transform handed to the parser.</param>
private static void ImportSingleHtml(string path, XDocument tableFix, XDocument listingFix) {
  try {
    // Reading happens inside the try so that a wrong path is reported like any other
    // failure instead of terminating the interactive loop.
    var html = File.ReadAllText(path, Encoding.UTF8);
    var name = Path.GetFileNameWithoutExtension(path);
    Console.WriteLine("Convert start");
    // convert prepared HTML into internal <Content> XML (backup and restore format)
    var xml = Html2XmlUtil.HtmlToOpusXsltParser(name, html, tableFix, listingFix);
    using (var xmlStream = new MemoryStream(Encoding.UTF8.GetBytes(xml))) {
      Console.WriteLine("Parser");
      // Loading is only a well-formedness check; the document itself is not used.
      XDocument.Load(xmlStream);
    }
  } catch (Exception ex) {
    var previousColor = Console.ForegroundColor;
    Console.ForegroundColor = ConsoleColor.Red;
    try {
      Console.WriteLine(ex.Message);
    } finally {
      // Restore the colour even if writing the message fails.
      Console.ForegroundColor = previousColor;
    }
  }
}
/// <summary>
/// Menu handler: converts the HTML file named by <c>htmlFileName</c> into Texxtoor XML and
/// writes the result next to it with an ".xml" extension. Progress and errors are reported
/// through <c>AddTextToProtocol</c>.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void fehlerBehandelnToolStripMenuItem_Click(object sender, EventArgs e) {
  // convert into opus XML and manage conversion errors
  var mapping = GetDefaultMapping();
  try {
    // Read inside the try so an unreadable file ends up in the protocol instead of
    // escaping from the UI event handler.
    var html = File.ReadAllText(htmlFileName, Encoding.Default);

    // Make XHTML
    var xhtmlDocument = new Texxtoor.BaseLibrary.Core.HtmlAgility.Pack.HtmlDocument();
    xhtmlDocument.OptionOutputAsXhtml = true;
    xhtmlDocument.OptionFixNestedTags = true;
    xhtmlDocument.OptionOutputAsXml = true;
    xhtmlDocument.LoadHtml(html);

    byte[] xhtmlBytes;
    using (var ms = new MemoryStream())
    using (var tw = new XmlTextWriter(ms, Encoding.Default)) {
      xhtmlDocument.Save(tw);
      // Make sure everything buffered by the writer has reached the stream.
      tw.Flush();
      xhtmlBytes = ms.ToArray();
    }

    // NOTE(review): the bytes are written with Encoding.Default but decoded as iso-8859-1;
    // the two differ wherever the default code page is not Latin-1. Kept as found - confirm intent.
    // The former UTF-8 encode/decode round trip on the resulting string was a no-op and is gone:
    // a .NET string carries no byte encoding.
    var xhtml = Encoding.GetEncoding("iso-8859-1").GetString(xhtmlBytes);

    // transform
    var xml = Html2XmlUtil.HtmlToOpusXsltParser(xhtml, mapping);
    // TODO: check XML here
    // TODO: if XML is okay store on disc for further processing
    var xmlFileName = Path.Combine(
      Path.GetDirectoryName(htmlFileName) ?? String.Empty,
      String.Format("{0}.xml", Path.GetFileNameWithoutExtension(htmlFileName)));
    if (File.Exists(xmlFileName)) {
      AddTextToProtocol("XML exists, deleting ");
      File.Delete(xmlFileName);
      AddTextToProtocol("XML deleted ");
    }
    AddTextToProtocol("Attempt to write XML");
    File.WriteAllText(xmlFileName, xml, Encoding.UTF8);
    AddTextToProtocol("XML written to disc at {0}", xmlFileName);
  } catch (Exception ex) {
    AddTextToProtocol("** Fehler beim Konvertieren in Texxtoor-XML: {0}", ex.Message);
  }
}
/// <summary>
/// Menu handler: converts the HTML file named by <c>htmlFileName</c> into Texxtoor XML,
/// post-processes the result (section numbers, bullet glyphs, embedded images) and writes it
/// next to the source file with an ".xml" extension. Progress and errors are reported through
/// <c>AddTextToProtocol</c>.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void fehlerBehandelnToolStripMenuItem_Click(object sender, EventArgs e) {
  // convert into opus XML and manage conversion errors
  var mapping = GetDefaultMapping();
  try {
    // Read inside the try so an unreadable file ends up in the protocol instead of
    // escaping from the UI event handler.
    var html = File.ReadAllText(htmlFileName, Encoding.Default);

    AddTextToProtocol("Make XHTML");
    var xhtmlDocument = new Texxtoor.BaseLibrary.Core.HtmlAgility.Pack.HtmlDocument();
    xhtmlDocument.OptionOutputAsXhtml = true;
    xhtmlDocument.OptionFixNestedTags = true;
    xhtmlDocument.OptionOutputAsXml = true;
    xhtmlDocument.LoadHtml(html);

    byte[] xhtmlBytes;
    using (var ms = new MemoryStream())
    using (var tw = new XmlTextWriter(ms, Encoding.Default)) {
      xhtmlDocument.Save(tw);
      // Make sure everything buffered by the writer has reached the stream.
      tw.Flush();
      xhtmlBytes = ms.ToArray();
    }

    // NOTE(review): written with Encoding.Default, decoded as iso-8859-1. The (char)133
    // replacement further down looks like it depends on this, so it is kept as found.
    // The former UTF-8 encode/decode round trip on the string was a no-op and is gone.
    var xhtml = Encoding.GetEncoding("iso-8859-1").GetString(xhtmlBytes);

    // convert to UTF-8 for XSLT parsing
    var doc = System.Xml.Linq.XDocument.Parse(xhtml);
    if (doc.Declaration != null) {
      // Parse leaves Declaration null when the input has no XML declaration.
      doc.Declaration.Encoding = "UTF-8";
    }
    var sb = new StringBuilder();
    var targetDocWriter = new StringWriterUtf8(sb);

    // Replace entities with regular characters; decoded twice to catch doubly-escaped &nbsp;'s.
    // The query is materialised first because the loop modifies the tree it runs over.
    var ellipsisChar = ((char)133).ToString();
    var leaves = doc.Descendants().Where(elm => !elm.Elements().Any()).ToList();
    foreach (var leaf in leaves) {
      var decoded = HttpUtility.HtmlDecode(HttpUtility.HtmlDecode(leaf.Value));
      leaf.Value = decoded.Replace(ellipsisChar, "...");
    }
    doc.Save(targetDocWriter);
    xhtml = sb.ToString();

    // transform
    var xml = Html2XmlUtil.HtmlToOpusXsltParser(xhtml, mapping);
    // TODO: check XML here
    using (var xmlStream = new MemoryStream(Encoding.UTF8.GetBytes(xml)))
    using (var xmlReader = new XmlTextReader(xmlStream)) {
      doc = XDocument.Load(xmlReader);
    }

    AddTextToProtocol("Fix XML");
    // 1. Fix headers: <Element Type="Section" Name="1 Name">1 Name</Element> ===> remove "1 "
    var headers = doc.Descendants("Element")
      .Where(elm => (string)elm.Attribute("Type") == "Section")
      .ToList();
    foreach (var header in headers) {
      // Only a leading plain text node is rewritten; empty headers have no FirstNode at all.
      var text = header.FirstNode as XText;
      if (text == null || text.NodeType != XmlNodeType.Text) {
        continue;
      }
      var match = Regex.Match(text.Value.Trim(), @"^\d{1,3}(?:\.\d{1,3}){0,5}\s+(.*)");
      if (match.Success) {
        text.Value = match.Groups[1].Value;
      }
    }

    // 2. Fix bullet points: strip the bullet glyph and the whitespace after it from <li> text.
    // The character class previously held a mis-encoded middle dot (it appears to be the UTF-8
    // bytes of U+00B7 read in the wrong code page); the escape keeps the pattern independent
    // of the source file's encoding.
    var bullets = doc.Descendants("li").ToList();
    foreach (var bullet in bullets) {
      var match = Regex.Match(bullet.Value, @"[\u00B7]\s+(.*)");
      if (match.Success) {
        bullet.Value = match.Groups[1].Value;
      }
    }

    AddTextToProtocol("Embed Images");
    // 3. Embed images (Element Type="Image")
    var images = doc.Descendants("Element")
      .Where(elm => (string)elm.Attribute("Type") == "Image")
      .ToList();
    foreach (var image in images) {
      var pathAttribute = image.Attribute("Path");
      if (pathAttribute == null) {
        continue;
      }
      // Only the file name is used; the image is looked up in imgFolder.
      var path = Path.Combine(imgFolder, Path.GetFileName(pathAttribute.Value));
      if (!File.Exists(path)) {
        continue;
      }
      // we take PNG only, hence a conversion is appropriate.
      // Disposing the image releases the lock Image.FromFile holds on the file.
      using (var img = Image.FromFile(path))
      using (var imgMs = new MemoryStream()) {
        img.Save(imgMs, ImageFormat.Png);
        // apply as embedded base64; SetAttributeValue does not throw if "Method" already exists
        image.SetAttributeValue("Method", "Base64");
        image.Value = Convert.ToBase64String(imgMs.ToArray());
      }
    }

    sb.Clear();
    targetDocWriter = new StringWriterUtf8(sb);
    doc.Save(targetDocWriter);
    xml = sb.ToString();

    // TODO: if XML is okay store on disc for further processing
    var xmlFileName = Path.Combine(
      Path.GetDirectoryName(htmlFileName) ?? String.Empty,
      String.Format("{0}.xml", Path.GetFileNameWithoutExtension(htmlFileName)));
    if (File.Exists(xmlFileName)) {
      AddTextToProtocol("XML exists, deleting ");
      File.Delete(xmlFileName);
      AddTextToProtocol("XML deleted ");
    }
    AddTextToProtocol("Attempt to write XML");
    File.WriteAllText(xmlFileName, xml, Encoding.UTF8);
    AddTextToProtocol("XML written to disc at {0}", xmlFileName);
  } catch (Exception ex) {
    AddTextToProtocol("** Fehler beim Konvertieren in Texxtoor-XML: {0}", ex.Message);
  }
}