/// <summary>
/// Menu handler: converts the HTML file named by <c>htmlFileName</c> into opus XML and
/// stores the result next to the source file as <c>&lt;name&gt;.xml</c>.
/// </summary>
/// <remarks>
/// Progress is reported through <c>AddTextToProtocol</c>. Failures while reading,
/// converting or writing are caught and logged there as well.
/// </remarks>
/// <param name="sender">Event source; not used.</param>
/// <param name="e">Event data; not used.</param>
private void fehlerBehandelnToolStripMenuItem_Click(object sender, EventArgs e)
{
    // convert into opus XML and manage conversion errors
    var mapping = GetDefaultMapping();
    try
    {
        // One encoding object is used for reading the file, writing the XHTML bytes and
        // decoding them again. Decoding with a different code page (the code used
        // iso-8859-1 before) corrupts every byte on which the two code pages disagree.
        var sourceEncoding = Encoding.Default;

        // Read inside the try block so that a missing or locked file is logged
        // instead of escaping the event handler.
        var html = File.ReadAllText(htmlFileName, sourceEncoding);

        // Make XHTML
        var xhtmlDocument = new Texxtoor.BaseLibrary.Core.HtmlAgility.Pack.HtmlDocument();
        xhtmlDocument.OptionOutputAsXhtml = true;
        xhtmlDocument.OptionFixNestedTags = true;
        xhtmlDocument.OptionOutputAsXml = true;
        xhtmlDocument.LoadHtml(html);

        string xhtml;
        using (var ms = new MemoryStream())
        {
            using (var tw = new XmlTextWriter(ms, sourceEncoding))
            {
                xhtmlDocument.Save(tw);
                // Make sure buffered output has reached the stream before it is copied.
                tw.Flush();
                // The decoded value is an ordinary .NET string; no further "conversion to
                // UTF-8" is needed here. UTF-8 is applied when the file is written below.
                xhtml = sourceEncoding.GetString(ms.ToArray());
            }
        }

        // transform
        var xml = Html2XmlUtil.HtmlToOpusXsltParser(xhtml, mapping);

        // TODO: check XML here
        // TODO: if XML is okay store on disc for further processing
        var xmlFileName = Path.Combine(
            Path.GetDirectoryName(htmlFileName) ?? String.Empty,
            String.Format("{0}.xml", Path.GetFileNameWithoutExtension(htmlFileName)));

        if (File.Exists(xmlFileName))
        {
            AddTextToProtocol("XML exists, deleting ");
            File.Delete(xmlFileName);
            AddTextToProtocol("XML deleted ");
        }

        AddTextToProtocol("Attempt to write XML");
        File.WriteAllText(xmlFileName, xml, Encoding.UTF8);
        AddTextToProtocol("XML written to disc at {0}", xmlFileName);
    }
    catch (Exception ex)
    {
        AddTextToProtocol("** Fehler beim Konvertieren in Texxtoor-XML: {0}", ex.Message);
    }
}
/// <summary>
/// Menu handler: converts the HTML file named by <c>htmlFileName</c> into opus XML,
/// post-processes the result (section numbers, bullet glyphs, embedded images) and
/// stores it next to the source file as <c>&lt;name&gt;.xml</c>.
/// </summary>
/// <remarks>
/// Progress is reported through <c>AddTextToProtocol</c>. Failures while reading,
/// converting or writing are caught and logged there as well.
/// </remarks>
/// <param name="sender">Event source; not used.</param>
/// <param name="e">Event data; not used.</param>
private void fehlerBehandelnToolStripMenuItem_Click(object sender, EventArgs e)
{
    // convert into opus XML and manage conversion errors
    var mapping = GetDefaultMapping();
    try
    {
        // One encoding object is used for reading the file, writing the XHTML bytes and
        // decoding them again. Decoding with a different code page (the code used
        // iso-8859-1 before) corrupts every byte on which the two code pages disagree.
        var sourceEncoding = Encoding.Default;

        // Read inside the try block so that a missing or locked file is logged
        // instead of escaping the event handler.
        var html = File.ReadAllText(htmlFileName, sourceEncoding);

        AddTextToProtocol("Make XHTML");
        // Make XHTML
        var xhtmlDocument = new Texxtoor.BaseLibrary.Core.HtmlAgility.Pack.HtmlDocument();
        xhtmlDocument.OptionOutputAsXhtml = true;
        xhtmlDocument.OptionFixNestedTags = true;
        xhtmlDocument.OptionOutputAsXml = true;
        xhtmlDocument.LoadHtml(html);

        string xhtml;
        using (var ms = new MemoryStream())
        {
            using (var tw = new XmlTextWriter(ms, sourceEncoding))
            {
                xhtmlDocument.Save(tw);
                // Make sure buffered output has reached the stream before it is copied.
                tw.Flush();
                xhtml = sourceEncoding.GetString(ms.ToArray());
            }
        }

        // Declare UTF-8 for XSLT parsing. The declaration is created when the parsed
        // document has none, instead of failing with a NullReferenceException.
        var doc = System.Xml.Linq.XDocument.Parse(xhtml);
        if (doc.Declaration == null)
        {
            doc.Declaration = new XDeclaration("1.0", "UTF-8", null);
        }
        else
        {
            doc.Declaration.Encoding = "UTF-8";
        }

        var sb = new StringBuilder();
        var targetDocWriter = new StringWriterUtf8(sb);

        // Decode HTML entities in all elements without child elements. Decoding runs
        // twice; the original note gives "weird &nbsp;'s" as the reason.
        // The query is materialized first because the loop rewrites the tree it walks.
        var leaves = doc.Descendants().Where(elm => !elm.Elements().Any()).ToList();
        // (char)133 is U+0085 and is replaced by three dots.
        // NOTE(review): presumably an ellipsis arriving as a Windows-1252 code point
        // (e.g. via &#133;) - confirm against real input.
        var char133 = ((char)133).ToString();
        foreach (var leaf in leaves)
        {
            var v = HttpUtility.HtmlDecode(leaf.Value);
            v = HttpUtility.HtmlDecode(v);
            leaf.Value = v.Replace(char133, "...");
        }

        doc.Save(targetDocWriter);
        xhtml = sb.ToString();

        // transform
        var xml = Html2XmlUtil.HtmlToOpusXsltParser(xhtml, mapping);

        // TODO: check XML here
        // Loading the result throws on malformed XML, which ends up in the catch below.
        using (var xmlStream = new MemoryStream(Encoding.UTF8.GetBytes(xml)))
        {
            using (var xmlReader = new XmlTextReader(xmlStream))
            {
                doc = XDocument.Load(xmlReader);
            }
        }

        AddTextToProtocol("Fix XML");

        // 1. Fix headers: <Element Type="Section" Name="1 Name">1 Name</Element> ===> Remove "1 "
        var headers = doc.Descendants("Element")
            .Where(elm => elm.Attribute("Type") != null && elm.Attribute("Type").Value == "Section")
            .ToList();
        foreach (var header in headers)
        {
            // An empty section element has no first node; skip it instead of crashing.
            var firstNode = header.FirstNode;
            if (firstNode == null || firstNode.NodeType != XmlNodeType.Text)
            {
                continue;
            }

            var headerText = (XText)firstNode;
            var match = Regex.Match(headerText.Value.Trim(), @"^\d{1,3}(?:\.\d{1,3}){0,5}\s+(.*)");
            if (match.Success)
            {
                headerText.Value = match.Groups[1].Value;
            }
        }

        // 2. Fix bullet points <li>· Text</li> ===> Remove "· "
        // NOTE(review): the pattern is not anchored, so text in front of a bullet glyph
        // is dropped as well - confirm this is intended.
        var bullets = doc.Descendants("li").ToList();
        foreach (var bullet in bullets)
        {
            var match = Regex.Match(bullet.Value, @"[·]\s+(.*)");
            if (match.Success)
            {
                bullet.Value = match.Groups[1].Value;
            }
        }

        AddTextToProtocol("Embed Images");

        // 3. Embed images (Element Type="Image")
        var images = doc.Descendants("Element")
            .Where(elm => elm.Attribute("Type") != null && elm.Attribute("Type").Value == "Image")
            .ToList();
        foreach (var image in images)
        {
            // try reading file
            if (image.Attribute("Path") == null)
            {
                continue;
            }

            // Only the file name of the Path attribute is used; the file is looked up in imgFolder.
            var path = Path.Combine(imgFolder, Path.GetFileName(image.Attribute("Path").Value));
            if (!File.Exists(path))
            {
                continue;
            }

            // we take PNG only, hence a conversion is appropriate
            // The image is disposed after use so the source file is not kept locked.
            using (var img = Image.FromFile(path))
            {
                using (var imgMs = new MemoryStream())
                {
                    img.Save(imgMs, ImageFormat.Png);
                    var base64 = Convert.ToBase64String(imgMs.ToArray());
                    // apply as embedded base64; SetAttributeValue also copes with an
                    // already existing "Method" attribute
                    image.SetAttributeValue("Method", "Base64");
                    image.Value = base64;
                }
            }
        }

        sb.Clear();
        targetDocWriter = new StringWriterUtf8(sb);
        doc.Save(targetDocWriter);
        xml = sb.ToString();

        // TODO: if XML is okay store on disc for further processing
        var xmlFileName = Path.Combine(
            Path.GetDirectoryName(htmlFileName) ?? String.Empty,
            String.Format("{0}.xml", Path.GetFileNameWithoutExtension(htmlFileName)));

        if (File.Exists(xmlFileName))
        {
            AddTextToProtocol("XML exists, deleting ");
            File.Delete(xmlFileName);
            AddTextToProtocol("XML deleted ");
        }

        AddTextToProtocol("Attempt to write XML");
        File.WriteAllText(xmlFileName, xml, Encoding.UTF8);
        AddTextToProtocol("XML written to disc at {0}", xmlFileName);
    }
    catch (Exception ex)
    {
        AddTextToProtocol("** Fehler beim Konvertieren in Texxtoor-XML: {0}", ex.Message);
    }
}