Exemple #1
0
        /// <summary>
        /// Menu handler: reads the HTML file at <c>htmlFileName</c>, turns it into XHTML, runs it through
        /// <c>Html2XmlUtil.HtmlToOpusXsltParser</c> and writes the result next to the source file
        /// (same base name, <c>.xml</c> extension). Progress and errors are reported via
        /// <c>AddTextToProtocol</c>; no exception leaves this handler.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void fehlerBehandelnToolStripMenuItem_Click(object sender, EventArgs e)
        {
            try {
                // Reading the input happens inside the try so that a missing or locked file is
                // reported in the protocol like any other conversion error.
                var mapping = GetDefaultMapping();
                var html    = File.ReadAllText(htmlFileName, Encoding.Default);

                // Make XHTML
                var xhtmlDocument = new Texxtoor.BaseLibrary.Core.HtmlAgility.Pack.HtmlDocument();
                xhtmlDocument.OptionOutputAsXhtml = true;
                xhtmlDocument.OptionFixNestedTags = true;
                xhtmlDocument.OptionOutputAsXml   = true;
                xhtmlDocument.LoadHtml(html);

                byte[] xhtmlBytes;
                using (var ms = new MemoryStream())
                using (var tw = new XmlTextWriter(ms, Encoding.Default)) {
                    xhtmlDocument.Save(tw);
                    // Do not rely on Save() flushing: push buffered output into the stream
                    // before taking the snapshot (and before the writer is closed).
                    tw.Flush();
                    xhtmlBytes = ms.ToArray();
                }

                // NOTE(review): the bytes are written with Encoding.Default but decoded as ISO-8859-1.
                // Where the default code page is not Latin-1 these differ for some bytes - confirm
                // this is intended before changing it, since it affects the produced text.
                var enc   = Encoding.GetEncoding("iso-8859-1");
                // The decoded string can be handed to the XSLT step as is; the file itself is
                // written as UTF-8 below.
                var xhtml = enc.GetString(xhtmlBytes);

                // transform
                var xml = Html2XmlUtil.HtmlToOpusXsltParser(xhtml, mapping);
                // TODO: check XML here

                // TODO: if XML is okay store on disc for further processing
                var xmlFileName = Path.Combine(Path.GetDirectoryName(htmlFileName) ?? String.Empty, String.Format("{0}.xml", Path.GetFileNameWithoutExtension(htmlFileName)));
                if (File.Exists(xmlFileName))
                {
                    AddTextToProtocol("XML exists, deleting ");
                    File.Delete(xmlFileName);
                    AddTextToProtocol("XML deleted ");
                }
                AddTextToProtocol("Attempt to write XML");
                File.WriteAllText(xmlFileName, xml, Encoding.UTF8);
                AddTextToProtocol("XML written to disc at {0}", xmlFileName);
            } catch (Exception ex) {
                AddTextToProtocol("** Fehler beim Konvertieren in Texxtoor-XML: {0}", ex.Message);
            }
        }
Exemple #2
0
        /// <summary>
        /// Menu handler: converts the HTML file at <c>htmlFileName</c> into XML and stores it next to
        /// the source file (same base name, <c>.xml</c> extension).
        /// Steps: HTML -> XHTML, entity clean-up of leaf elements, XSLT transformation through
        /// <c>Html2XmlUtil.HtmlToOpusXsltParser</c>, then three fix-ups on the result (strip section
        /// numbering, strip bullet glyphs, embed images from <c>imgFolder</c> as Base64 PNG).
        /// Progress and errors are reported via <c>AddTextToProtocol</c>; no exception leaves this handler.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void fehlerBehandelnToolStripMenuItem_Click(object sender, EventArgs e)
        {
            try {
                // Reading the input happens inside the try so that a missing or locked file is
                // reported in the protocol like any other conversion error.
                var mapping = GetDefaultMapping();
                var html    = File.ReadAllText(htmlFileName, Encoding.Default);

                AddTextToProtocol("Make XHTML");
                // Make XHTML
                var xhtmlDocument = new Texxtoor.BaseLibrary.Core.HtmlAgility.Pack.HtmlDocument();
                xhtmlDocument.OptionOutputAsXhtml = true;
                xhtmlDocument.OptionFixNestedTags = true;
                xhtmlDocument.OptionOutputAsXml   = true;
                xhtmlDocument.LoadHtml(html);

                byte[] xhtmlBytes;
                using (var ms = new MemoryStream())
                using (var tw = new XmlTextWriter(ms, Encoding.Default)) {
                    xhtmlDocument.Save(tw);
                    // Do not rely on Save() flushing: push buffered output into the stream
                    // before taking the snapshot (and before the writer is closed).
                    tw.Flush();
                    xhtmlBytes = ms.ToArray();
                }

                // NOTE(review): the bytes are written with Encoding.Default but decoded as ISO-8859-1.
                // Where the default code page is not Latin-1 these differ for some bytes; the
                // (char)133 replacement further down looks like a workaround for exactly that.
                // Confirm before changing, since it affects the produced text.
                var enc   = Encoding.GetEncoding("iso-8859-1");
                var xhtml = enc.GetString(xhtmlBytes);

                // convert to UTF-8 for XSLT parsing
                var doc = System.Xml.Linq.XDocument.Parse(xhtml);
                // Declaration is null when the XHTML carries no XML declaration.
                if (doc.Declaration != null)
                {
                    doc.Declaration.Encoding = "UTF-8";
                }
                var sb = new StringBuilder();
                var targetDocWriter = new StringWriterUtf8(sb);

                // Decode HTML entities in the text of leaf elements. Decoding runs twice,
                // presumably to cope with double-encoded entities in the input - confirm.
                // The query is materialized first so that rewriting element content cannot
                // interfere with the lazy Descendants() traversal.
                var leaves = doc.Descendants()
                                .Where(elm => !elm.Elements().Any())
                                .ToList();
                var char133 = ((char)133).ToString();
                foreach (var leaf in leaves)
                {
                    var decoded = HttpUtility.HtmlDecode(HttpUtility.HtmlDecode(leaf.Value));
                    // U+0085 is replaced by three dots (presumably a mis-decoded ellipsis, see note above).
                    leaf.Value = decoded.Replace(char133, "...");
                }
                doc.Save(targetDocWriter);
                xhtml = sb.ToString();

                // transform
                var xml = Html2XmlUtil.HtmlToOpusXsltParser(xhtml, mapping);
                // TODO: check XML here
                using (var xmlStream = new MemoryStream(Encoding.UTF8.GetBytes(xml))) {
                    using (var xmlReader = new XmlTextReader(xmlStream)) {
                        doc = XDocument.Load(xmlReader);
                    }
                }

                AddTextToProtocol("Fix XML");
                // 1. Fix headers: <Element Type="Section" ...>1   Name</Element> ===> remove the leading "1   "
                var headers = doc.Descendants("Element")
                                 .Where(elm => (string)elm.Attribute("Type") == "Section")
                                 .ToList();
                foreach (var header in headers)
                {
                    // An empty element has no first node; such headers are skipped instead of
                    // aborting the whole conversion with a NullReferenceException.
                    var headerText = header.FirstNode as XText;
                    if (headerText == null || headerText.NodeType != XmlNodeType.Text)
                    {
                        continue;
                    }
                    var match = Regex.Match(headerText.Value.Trim(), @"^\d{1,3}(?:\.\d{1,3}){0,5}\s+(.*)");
                    if (match.Success)
                    {
                        headerText.Value = match.Groups[1].Value;
                    }
                }

                // 2. Fix bullet points <li>.     Text</li> ===> remove the bullet glyph and following blanks
                // NOTE(review): the pattern is not anchored and '.' stops at a line break, so text
                // before the glyph and after the first newline is dropped - confirm this is intended.
                var bullets = doc.Descendants("li").ToList();
                foreach (var bullet in bullets)
                {
                    var match = Regex.Match(bullet.Value, @"[·]\s+(.*)");
                    if (match.Success)
                    {
                        bullet.Value = match.Groups[1].Value;
                    }
                }

                AddTextToProtocol("Embed Images");
                // 3. Embed images (Element Type="Image")
                var images = doc.Descendants("Element")
                                .Where(elm => (string)elm.Attribute("Type") == "Image")
                                .ToList();
                foreach (var image in images)
                {
                    var pathAttribute = image.Attribute("Path");
                    if (pathAttribute == null)
                    {
                        continue;
                    }
                    // Only the file name of the referenced path is used; the file is looked up in imgFolder.
                    var path = Path.Combine(imgFolder, Path.GetFileName(pathAttribute.Value));
                    if (!File.Exists(path))
                    {
                        continue;
                    }
                    // we take PNG only, hence a conversion is appropriate.
                    // The image is disposed right away so the GDI+ handle and the lock on the
                    // source file are released.
                    using (var img = Image.FromFile(path))
                    using (var imgMs = new MemoryStream()) {
                        img.Save(imgMs, ImageFormat.Png);
                        // apply as embedded base64; SetAttributeValue also copes with an
                        // already existing "Method" attribute, where Add() would throw.
                        image.SetAttributeValue("Method", "Base64");
                        image.Value = Convert.ToBase64String(imgMs.ToArray());
                    }
                }

                sb.Clear();
                targetDocWriter = new StringWriterUtf8(sb);
                doc.Save(targetDocWriter);
                xml = sb.ToString();

                // TODO: if XML is okay store on disc for further processing
                var xmlFileName = Path.Combine(Path.GetDirectoryName(htmlFileName) ?? String.Empty, String.Format("{0}.xml", Path.GetFileNameWithoutExtension(htmlFileName)));
                if (File.Exists(xmlFileName))
                {
                    AddTextToProtocol("XML exists, deleting ");
                    File.Delete(xmlFileName);
                    AddTextToProtocol("XML deleted ");
                }
                AddTextToProtocol("Attempt to write XML");
                File.WriteAllText(xmlFileName, xml, Encoding.UTF8);
                AddTextToProtocol("XML written to disc at {0}", xmlFileName);
            } catch (Exception ex) {
                AddTextToProtocol("** Fehler beim Konvertieren in Texxtoor-XML: {0}", ex.Message);
            }
        }