/// <summary>
/// Reads the Data/Zoo.xml sample, splits it into tags with <c>XTag.Generate</c>
/// and writes every resulting tag to the console.
/// </summary>
public void TestParseNormalTags()
{
    char[] source = File.ReadAllText("Data/Zoo.xml").ToCharArray();

    // Materialize the whole sequence before printing anything.
    XTag[] parsed = XTag.Generate(source).ToArray();

    parsed.ForEach(item => Console.WriteLine(item));
}
/// <summary>
/// Reads the Data/XML.xml sample, splits it into tags with <c>XTag.Generate</c>
/// and writes every resulting tag to the console.
/// </summary>
public void TestXmlSpec()
{
    char[] source = File.ReadAllText("Data/XML.xml").ToCharArray();

    // Materialize the whole sequence before printing anything.
    XTag[] parsed = XTag.Generate(source).ToArray();

    parsed.ForEach(item => Console.WriteLine(item));
}
/// <summary>
/// Creates a manifest item from an element, copying the <c>href</c>, <c>id</c>,
/// <c>media-type</c> and <c>properties</c> attributes of the element's tag.
/// </summary>
/// <param name="e">Element whose tag supplies the attribute values.</param>
/// <param name="belongTo">EPUB file recorded as the owner of this item.</param>
public ManifestItem(XELement e, EpubFile belongTo)
{
    this.belongTo = belongTo;

    XTag source = e.tag;

    // Per the original note: the OPF path is added to href later, in ReadSpine().
    href = source.GetAttribute("href");
    id = source.GetAttribute("id");
    mediaType = source.GetAttribute("media-type");
    properties = source.GetAttribute("properties");
}
/// <summary>
/// Parses a deliberately malformed tag and checks both its default <c>ToString()</c>
/// rendering and its rendering under <c>XDocument.ShowAllPrintConfig</c>.
/// </summary>
public void TestPrint()
{
    const string input = "<sample& key<='Tom' book' noValue authors>=\"Eric & Frank\" />";
    const string expectedDefault = "<sample& sample&=\"Eric & Frank\" sample&=\"Tom' book\" sample& />";
    const string expectedShowAll = "<sample& key<=\"Tom' book\" noValue authors>=\"Eric & Frank\" />";

    XTag parsed = XTag.Generate(input.ToCharArray()).First();

    // Default rendering.
    Console.WriteLine(parsed.ToString());
    Assert.AreEqual(expectedDefault, parsed.ToString());

    // Rendering with the show-all print configuration.
    Console.WriteLine(parsed.Print(XDocument.ShowAllPrintConfig));
    Assert.AreEqual(expectedShowAll, parsed.Print(XDocument.ShowAllPrintConfig));
}
/// <summary>
/// Flattens the &lt;body&gt; of an XHTML item to plain text and writes it to
/// <c>epub2note_output/&lt;name&gt;.txt</c>, provided the body contains non-blank text.
/// </summary>
/// <remarks>
/// &lt;rt&gt; content is wrapped in parentheses, every opening &lt;p&gt; emits "//",
/// and closing &lt;p&gt; / &lt;div&gt; tags emit a CRLF.
/// </remarks>
/// <param name="i">Item whose <c>data</c> is processed and whose <c>fullName</c> names the output file.</param>
static void ProcXHTML(TextItem i)
{
    Log.log("[Info]" + i.fullName);
    string name = Path.GetFileNameWithoutExtension(i.fullName);

    // Line breaks are removed first so the body regex can span the whole document.
    string flattened = i.data.Replace("\r", "").Replace("\n", "");
    Match body = Regex.Match(flattened, "<body(.*)</body>");
    if (!body.Success)
    {
        Log.log("[Error]body?");
        return;
    }

    XFragment fragment = new XFragment(body.Value, 0);
    string output = "";
    string textOnly = ""; // text nodes only; used to decide whether anything is worth writing

    foreach (var part in fragment.parts)
    {
        Type kind = part.GetType();

        if (kind == typeof(XText))
        {
            output += Util.Trim(part.originalText);
            textOnly += Util.Trim(part.originalText);
        }

        if (kind == typeof(XTag))
        {
            XTag tagPart = (XTag)part;
            bool opening = part.type == PartType.tag_start;
            bool closing = part.type == PartType.tag_end;

            switch (tagPart.tagname)
            {
                case "rt":
                    if (opening) { output += "("; }
                    if (closing) { output += ")"; }
                    break;
                case "p":
                    if (opening) { output += "//"; }
                    if (closing) { output += "\r\n"; }
                    break;
                case "div":
                    if (closing) { output += "\r\n"; }
                    break;
            }
        }
    }

    if (Util.Trim(textOnly).Length > 0)
    {
        File.WriteAllText("epub2note_output/" + name + ".txt", output);
    }
}
/// <summary>
/// Parses a processing instruction and an XML comment with <c>XTag.Generate</c> and prints
/// the first tag of each: the former via <c>ToString()</c>, the latter via
/// <c>Print(XDocument.ShowAllPrintConfig)</c>.
/// </summary>
public void TestDescriptiveTags()
{
    string instruction = "<?xml-stylesheet type=\"text/xsl\" href=\"REC-xml.xsl\"?>";
    XTag instructionTag = XTag.Generate(instruction.ToCharArray()).First();
    Console.WriteLine(instructionTag.ToString());

    string comment = @"<!-- Notes on preparation of the Fifth Edition: - Worked from http://www.w3.org/XML/xml-V10-4e-errata -->";
    XTag commentTag = XTag.Generate(comment.ToCharArray()).First();
    Console.WriteLine(commentTag.Print(XDocument.ShowAllPrintConfig));
}
/// <summary>
/// Searches <paramref name="text"/>, starting at <paramref name="pos"/>, for a tag that begins with
/// <c>&lt;tagName</c> and parses the matched text into an <see cref="XTag"/>.
/// </summary>
/// <param name="tagName">Tag name to look for. It is inserted into the regular expressions as-is (not escaped).</param>
/// <param name="text">Text to search.</param>
/// <param name="pos">
/// On input, the index at which the search starts. When a tag is found, receives the start position
/// of that tag; left unchanged when nothing is found.
/// </param>
/// <returns>The parsed tag, or <c>null</c> when no matching tag exists at or after <paramref name="pos"/>.</returns>
public static XTag FindTag(string tagName, string text, ref int pos)
{
    // First pass: the name must be followed by a non-letter so that e.g. "p" does not hit "<pre".
    Regex nameProbe = new Regex("<" + tagName + "[^a-zA-Z]");
    Match probe = nameProbe.Match(text, pos);
    if (!probe.Success)
    {
        return(null);
    }

    // Second pass: capture the tag text up to the first '>'.
    Regex wholeTag = new Regex("<" + tagName + ".*?>");
    Match found = wholeTag.Match(text, probe.Index);
    if (!found.Success)
    {
        // '.' does not cross line breaks, so this can fail even though the probe matched.
        // A failed Match has Value "" and Index 0; using it would build a bogus tag and
        // reset the caller's position to the start of the text.
        return(null);
    }

    XTag tag = new XTag(found.Value);
    pos = found.Index;
    return(tag);
}
/// <summary>
/// Inspects the &lt;head&gt; section of <c>text</c>: records every linked stylesheet URL in <c>css</c>
/// and removes the first &lt;script&gt; element whose <c>src</c> contains "notereplace.js".
/// </summary>
void CheckHead()
{
    Match head = new Regex("<head>[\\w\\W]*?</head>").Match(text);
    if (!head.Success)
    {
        Log.log("[Warn]No head tag in xhtml");
        return;
    }

    // Stylesheets: every <link> inside the head whose type is text/css.
    Regex linkPattern = new Regex("<link .*?>");
    foreach (Match linkMatch in linkPattern.Matches(head.Value))
    {
        XTag linkTag = new XTag(linkMatch.Value);
        if (linkTag.GetAttribute("type").ToLower() != "text/css")
        {
            continue;
        }

        string url = linkTag.GetAttribute("href");
        url = Util.ReferPath(xhtml.fullName, url);
        css.Add(url);
    }

    // Scripts: scan from the start of the head onwards (the scan is not limited to the head).
    Regex scriptPattern = new Regex("<script .*?>");
    const string scriptClose = "</script>";
    int cursor = head.Index;

    for (Match script = scriptPattern.Match(text, cursor);
         script.Success;
         script = scriptPattern.Match(text, cursor))
    {
        XTag scriptTag = new XTag(script.Value);
        if (!scriptTag.GetAttribute("src").Contains("notereplace.js"))
        {
            // Not the script we are after; continue behind it.
            cursor = script.Index + script.Length;
            continue;
        }

        int closeAt = text.IndexOf(scriptClose, script.Index);
        if (closeAt < 0)
        {
            Log.log("[Error]Unclosed script tag.");
            break;
        }

        // Cut from the opening tag through the end of the closing tag.
        text = text.Remove(script.Index, closeAt - script.Index + scriptClose.Length);
        Log.log("[Info ]Removed reference to notereplace.js");
        break;
    }
}
/// <summary>
/// Normalizes a footnote link and then processes the note it points to.
/// </summary>
/// <remarks>
/// The link gets the <c>duokan-footnote</c> class and <c>epub:type="noteref"</c> if missing, its id is
/// set to <c>&lt;note id&gt;_ref</c>, and the matched text in <c>text</c> is replaced by the updated tag.
/// </remarks>
/// <param name="m">Match locating the link tag inside <c>text</c>.</param>
/// <param name="tag">Parsed form of the matched link tag; it is modified in place.</param>
void ProcNote(Match m, XTag tag)
{
    // Link tag: make sure the duokan-footnote class is present.
    var classNames = tag.GetClassNames();
    if (!Util.Contains(classNames, "duokan-footnote"))
    {
        // Separate with a space only when other classes already exist.
        string suffix = classNames.Length != 0 ? " duokan-footnote" : "duokan-footnote";
        tag.SetAttribute("class", tag.GetAttribute("class") + suffix);
    }

    if (tag.GetAttribute("epub:type") != "noteref")
    {
        tag.SetAttribute("epub:type", "noteref");
    }

    // The note id is whatever follows '#' in the href.
    string href = tag.GetAttribute("href");
    int hashAt = href.IndexOf('#');
    if (hashAt < 0)
    {
        Log.log("[Error]Not a valid link :" + href);
        return;
    }
    if (hashAt != 0)
    {
        Log.log("[Warn ]href=\":" + href + "\"");
    }

    string note_id = href.Substring(hashAt + 1);
    string ref_id = note_id + "_ref";
    tag.SetAttribute("id", ref_id);

    // Swap the original tag text for the updated one.
    text = text.Remove(m.Index, m.Length);
    text = text.Insert(m.Index, tag.ToString());

    // Note content
    ProcNoteContent(note_id, ref_id);
}
/// <summary>
/// Walks the &lt;item&gt; tags of the EPUB's OPF and removes the first one whose <c>href</c>
/// file name is "notereplace.js".
/// </summary>
/// <param name="epub">EPUB whose OPF text is scanned and possibly modified.</param>
public ProcOPF(Epub epub)
{
    TextItem opf = epub.OPF;
    int cursor = 0;

    for (XTag entry = XTag.FindTag("item", opf.data, ref cursor);
         entry != null;
         entry = XTag.FindTag("item", opf.data, ref cursor))
    {
        if (Path.GetFileName(entry.GetAttribute("href")) == "notereplace.js")
        {
            opf.data = opf.data.Remove(cursor, entry.originalText.Length);
            Log.log("[Info ]Removed item reference to notereplace.js");
            break;
        }

        // Step past the start of this tag so the next search finds the following one.
        cursor++;
    }
}
/// <summary>
/// Scans <c>text</c> for &lt;a&gt; tags and hands every footnote link — one with the
/// <c>duokan-footnote</c> class or <c>epub:type="noteref"</c> — to <c>ProcNote</c>.
/// </summary>
void CheckFootnotes()
{
    Regex anchorPattern = new Regex("<a .*?>");
    Match anchor = anchorPattern.Match(text);

    while (anchor.Success)
    {
        XTag anchorTag = new XTag(anchor.Value);

        bool isFootnoteLink =
            Util.Contains(anchorTag.GetClassNames(), "duokan-footnote")
            || anchorTag.GetAttribute("epub:type") == "noteref";
        if (isFootnoteLink)
        {
            ProcNote(anchor, anchorTag);
        }

        // Resume one character past this link's start; this assumes the note bodies
        // all come after their links.
        anchor = anchorPattern.Match(text, anchor.Index + 1);
    }
}
/// <summary>
/// Reads the &lt;package&gt; tag of the OPF (unique-identifier, version, xml:lang), resets all
/// metadata lists, and dispatches to <c>ReadMeta3</c> for version "3.0" or <c>ReadMeta2</c> otherwise.
/// </summary>
public void ReadMeta()
{
    var packageTag = XTag.FindTag("package", OPF.text);
    idref = packageTag.GetAttribute("unique-identifier");
    _version = packageTag.GetAttribute("version");
    xml_lang = packageTag.GetAttribute("xml:lang"); // bookwalker

    // Start every record list empty before the version-specific reader fills it.
    dc_titles = new List<MetaRecord>();
    dc_creators = new List<MetaRecord>();
    dc_language = new List<MetaRecord>();
    dc_identifier = new List<MetaRecord>();
    others = new List<MetaRecord>();
    meta = new List<MetaRecord>();

    if (_version == "3.0")
    {
        ReadMeta3();
    }
    else
    {
        ReadMeta2();
    }
}
/// <summary>
/// Parses one balanced fragment of <paramref name="text"/> beginning at <paramref name="start"/>:
/// tags and the text between them are appended to <c>parts</c> until every opened tag has been
/// closed, then <c>root</c> is built from the collected parts.
/// </summary>
/// <param name="text">Source text containing the fragment.</param>
/// <param name="start">Index of the fragment's first character, which must be '&lt;'.</param>
/// <exception cref="XMLException">
/// The character at <paramref name="start"/> is not '&lt;', or the text ends before the fragment is closed.
/// </exception>
public XFragment(string text, int start)
{
    if (text[start] != '<')
    {
        throw new XMLException("XFragment Error:Unexpect Start.");
    }

    indexInSource = start;

    // Any "<...>" whose content has no '!' (so "<!-- ... -->" style tags are not matched as one tag).
    Regex reg_tag = new Regex("<[^\\!]*?>");
    int count = 0, pos = start;
    Match m;

    do
    {
        m = reg_tag.Match(text, pos);
        if (!m.Success)
        {
            // Previously the exception was constructed but never thrown, so parsing went on
            // with a failed match (Index 0, empty Value) and rewound pos to the start of the text.
            throw new XMLException("XFragment Error:Unexpect end.");
        }

        XTag tag = new XTag(m.Value);

        // Track nesting depth; the fragment ends when the depth is back at zero.
        if (tag.type == PartType.tag_start)
        {
            count++;
        }
        if (tag.type == PartType.tag_end)
        {
            count--;
        }

        // Text between the previous position and this tag becomes a text part.
        if (m.Index > pos)
        {
            parts.Add(new XText(text.Substring(pos, m.Index - pos)));
        }
        parts.Add(tag);

        pos = m.Index + m.Value.Length;
    }
    while (count > 0);

    originalLength = m.Index - start + m.Value.Length;
    root = new XELement(this, 0);
}
/// <summary>
/// Flattens the &lt;body&gt; of an XHTML item into a comment template and writes it to
/// <c>epub2comment_output/&lt;name&gt;.txt</c>, provided the body contains non-blank text.
/// </summary>
/// <remarks>
/// Each paragraph is emitted as "##" + text, followed by a line holding an empty bracket pair
/// ("「」" or "『』") when the most recent text part started with that bracket, and a
/// "##——————" separator line. &lt;img&gt; tags are copied through verbatim and &lt;rt&gt; content
/// is wrapped in parentheses.
/// </remarks>
/// <param name="i">Item whose <c>data</c> is processed and whose <c>fullName</c> names the output file.</param>
static void ProcXHTML(TextItem i)
{
    Log.log("[Info]" + i.fullName);
    string name = Path.GetFileNameWithoutExtension(i.fullName);

    // Line breaks are removed first so the body regex can span the whole document.
    string flattened = i.data.Replace("\r", "").Replace("\n", "");
    Match body = Regex.Match(flattened, "<body(.*)</body>");
    if (!body.Success)
    {
        Log.log("[Error]body?");
        return;
    }

    XFragment fragment = new XFragment(body.Value, 0);
    string output = "";
    string textOnly = "";     // text nodes only; decides whether a file is written
    string bracketPair = "";  // empty quote pair matching the last text part's first character

    foreach (var part in fragment.parts)
    {
        Type kind = part.GetType();

        if (kind == typeof(XText))
        {
            string trimmed = Util.Trim(part.originalText);
            output += trimmed;
            textOnly += trimmed;

            char lead = trimmed.Length > 0 ? trimmed[0] : '\0';
            bracketPair = lead == '「' ? "「」"
                        : lead == '『' ? "『』"
                        : "";
        }

        if (kind == typeof(XTag))
        {
            XTag tagPart = (XTag)part;
            bool opening = part.type == PartType.tag_start;
            bool closing = part.type == PartType.tag_end;

            switch (tagPart.tagname)
            {
                case "img":
                    output += tagPart.originalText;
                    break;
                case "rt":
                    if (opening) { output += "("; }
                    if (closing) { output += ")"; }
                    break;
                case "p":
                    if (opening) { output += "##"; }
                    if (closing) { output += "\r\n" + bracketPair + "\r\n##——————\r\n"; }
                    break;
                case "div":
                    if (closing) { output += "\r\n\r\n##——————\r\n"; }
                    break;
            }
        }
    }

    if (Util.Trim(textOnly).Length > 0)
    {
        File.WriteAllText("epub2comment_output/" + name + ".txt", output);
    }
}
/// <summary>
/// Adds a wiki entry to the XML document and saves it. Categories and tags of the entry whose
/// text is not yet present are appended first, each with an id one higher than the id of the
/// last existing element.
/// </summary>
/// <param name="entry">Entry supplying the title, content, categories and tags to store.</param>
public static void CreateEntry(WikiEntry entry)
{
    // Register categories that do not exist yet.
    foreach (var category in entry.Categories)
    {
        bool known = XCategories.Any(c => c.Attribute("Text").Value == category.Text);
        if (known)
        {
            continue;
        }

        int nextCategoryId = Convert.ToInt32(XCategories.Last().Attribute("Id").Value) + 1;
        XCategory.Add(
            new XElement("Category",
                new XAttribute("Id", nextCategoryId.ToString()),
                new XAttribute("Text", category.Text)));
    }

    // Register tags that do not exist yet.
    foreach (var label in entry.Tags)
    {
        bool known = XTags.Any(c => c.Attribute("Text").Value == label.Text);
        if (known)
        {
            continue;
        }

        int nextTagId = Convert.ToInt32(XTags.Last().Attribute("Id").Value) + 1;
        XTag.Add(
            new XElement("Tag",
                new XAttribute("Id", nextTagId.ToString()),
                new XAttribute("Text", label.Text)));
    }

    // Build the entry element; category and tag ids are stored as comma-separated lists.
    XElement wikiEntry = new XElement("WikiEntry",
        new XAttribute("Id", Guid.NewGuid()),
        new XAttribute("CreatedBy", "*****@*****.**"),
        new XAttribute("CreatedAt", DateTimeOffset.UtcNow.ToString()),
        new XAttribute("UpdatedBy", "*****@*****.**"),
        new XAttribute("UpdatedAt", DateTimeOffset.UtcNow.ToString()),
        new XAttribute("CategoryIds", string.Join(",", entry.Categories.Select(x => x.Id))),
        new XAttribute("TagIds", string.Join(",", entry.Tags.Select(x => x.Id))),
        new XElement("Title", entry.Title),
        new XElement("Content", new XCData(entry.Content)));

    XWikiEntries.Add(wikiEntry);
    xDoc.Save(file);
}
/// <summary>
/// Converts the &lt;body&gt; of an XHTML item to a lightly marked-up text file and writes it to
/// <c>output_dir + name + ".txt"</c>, provided the body contains non-blank text.
/// </summary>
/// <remarks>
/// Headings h1–h6 become "[hN]" … "[/hN]" followed by a CRLF, &lt;body&gt; tags are dropped,
/// a closing &lt;div&gt; becomes "&lt;/div&gt;" plus CRLF, an opening &lt;p&gt; is dropped and a
/// closing &lt;p&gt; becomes a CRLF. Every other tag is copied through unchanged.
/// </remarks>
/// <param name="i">Item whose <c>data</c> is processed and whose <c>fullName</c> names the output file.</param>
static void ProcXHTML(TextItem i)
{
    Log.log("[Info]" + i.fullName);
    string name = Path.GetFileNameWithoutExtension(i.fullName);

    // Line breaks are removed first so the body regex can span the whole document.
    string flattened = i.data.Replace("\r", "").Replace("\n", "");
    Match body = Regex.Match(flattened, "<body(.*)</body>");
    if (!body.Success)
    {
        Log.log("[Error]body?");
        return;
    }

    XFragment fragment = new XFragment(body.Value, 0);
    string output = "";
    string textOnly = ""; // text nodes only; decides whether a file is written

    foreach (var part in fragment.parts)
    {
        Type kind = part.GetType();

        if (kind == typeof(XText))
        {
            string trimmed = Util.Trim(part.originalText);
            output += trimmed;
            textOnly += trimmed;
            continue;
        }

        if (kind != typeof(XTag))
        {
            continue;
        }

        XTag tagPart = (XTag)part;
        string tagName = tagPart.tagname;
        bool opening = part.type == PartType.tag_start;
        bool closing = part.type == PartType.tag_end;

        switch (tagName)
        {
            case "h1":
            case "h2":
            case "h3":
            case "h4":
            case "h5":
            case "h6":
                if (opening) { output += "[" + tagName + "]"; continue; }
                if (closing) { output += "[/" + tagName + "]\r\n"; continue; }
                break;

            case "body":
                if (opening || closing) { continue; }
                break;

            case "div":
                if (closing) { output += "</div>\r\n"; continue; }
                break;

            case "p":
                if (opening) { continue; }
                if (closing) { output += "\r\n"; continue; }
                break;
        }

        // Anything not handled above is passed through verbatim.
        output += tagPart.originalText;
    }

    if (Util.Trim(textOnly).Length > 0)
    {
        File.WriteAllText(output_dir + name + ".txt", output);
    }
}
/// <summary>
/// Registers the stylesheet referenced by the item's first &lt;link&gt; tag in <c>css</c> and adds the
/// class <c>ae_draw_out</c> to every &lt;p&gt; whose content starts with one of
/// '「', '(', '『', '&lt;' or '《'.
/// </summary>
/// <param name="item">XHTML item to inspect; its <c>data</c> is rewritten in place.</param>
void ProcXHTML(TextItem item)
{
    // --- Stylesheet reference ---
    XTag link = XTag.FindTag("link", item.data);
    if (link == null || link.GetAttribute("type").ToLower() != "text/css")
    {
        Log.log("[Warn ]Cannot find CSS reference.");
    }
    else
    {
        string url = link.GetAttribute("href");
        url = Util.ReferPath(item.fullName, url);

        bool already = false;
        foreach (var known in css)
        {
            if (known.fullName == url)
            {
                already = true;
                break;
            }
        }

        if (!already)
        {
            TextItem sheet = epub.GetItem<TextItem>(url);
            if (sheet != null)
            {
                css.Add(sheet);
            }
            else
            {
                Log.log("[Warn ]Cannot find CSS:" + url);
            }
        }
    }

    // --- Paragraphs ---
    int cursor = 0;
    int count = 0;
    for (XTag para = XTag.FindTag("p", item.data, ref cursor);
         para != null;
         para = XTag.FindTag("p", item.data, ref cursor))
    {
        // First character after the opening tag.
        char first = item.data[cursor + para.originalText.Length];
        bool drawOut = first == '「' || first == '(' || first == '『' || first == '<' || first == '《';
        if (drawOut)
        {
            para.AddClassName("ae_draw_out");
            item.data = item.data.Remove(cursor, para.originalText.Length);
            item.data = item.data.Insert(cursor, para.ToString());
            count++;
        }

        // Step past the start of this tag so the next search finds the following one.
        cursor++;
    }

    Log.log("[Info ]Added class for " + count + " elements in " + item.fullName);
}
/// <summary>
/// Finds the body of footnote <paramref name="note_id"/> in <c>text</c>, cleans it up and replaces the
/// original markup with <c>template</c> formatted with the note id, the reference id and the content.
/// </summary>
/// <remarks>
/// The note is looked up first as an &lt;aside&gt; whose id equals <paramref name="note_id"/>; if none is
/// found, as an element with that id inside an &lt;ol&gt; carrying the class
/// <c>duokan-footnote-content</c>. Empty &lt;a&gt; elements are removed from the content and a wrapping
/// &lt;div&gt; is unwrapped. On success <c>contain_footnote</c> is set to true.
/// </remarks>
/// <param name="note_id">Id of the note to locate.</param>
/// <param name="ref_id">Id of the link that refers to the note; passed to the template.</param>
void ProcNoteContent(string note_id, string ref_id)
{
    string log = "";
    int index = -1, length = 0;
    string note_content = null;

    // First choice: an <aside> whose id is the note id.
    Regex reg_aside = new Regex("<aside .*?>");
    for (Match m = reg_aside.Match(text); m.Success; m = m.NextMatch())
    {
        XTag tag = new XTag(m.Value);
        if (tag.GetAttribute("id") != note_id)
        {
            continue;
        }

        index = m.Index;
        log += "aside; ";

        XFragment frag = new XFragment(text, index);
        if (frag.root == null)
        {
            Log.log("[Error]Found note but failure on parsing. id=" + note_id);
            return;
        }

        var dk = frag.root.GetElementById(note_id);
        if (dk != null)
        {
            // Compatibility case: a Duokan <li> nested inside the aside reuses the same id.
            note_content = dk.innerXHTML;
            log += "duplicate id at <" + dk.tag.tagname + ">" + dk.tag.GetAttribute("class") + "; ";
        }
        else
        {
            note_content = frag.root.innerXHTML;
        }

        length = frag.originalLength;
        break;
    }

    // Fallback: the document was adapted for Duokan only and has no aside.
    if (index < 0)
    {
        Regex reg_duokan = new Regex("<ol .*?>");
        for (Match m = reg_duokan.Match(text); m.Success; m = m.NextMatch())
        {
            XFragment frg = new XFragment(text, m.Index);
            if (frg.root == null)
            {
                continue;
            }
            if (!Util.Contains(frg.root.tag.GetClassNames(), "duokan-footnote-content"))
            {
                continue;
            }

            var a = frg.root.GetElementById(note_id);
            if (a == null)
            {
                continue;
            }

            index = m.Index;
            note_content = a.innerXHTML;
            length = frg.originalLength;
            log += "duokan-footnote; ";
            break;
        }
    }

    if (note_content == null)
    {
        Log.log("[Error]cannot find note");
        return;
    }

    // Drop empty anchors from the note content.
    if (Regex.Match(note_content, "<a .*?></a>").Success)
    {
        note_content = Regex.Replace(note_content, "<a .*?></a>", "");
        log += "empty <a> tag;";
    }

    note_content = Util.Trim(note_content);

    // Unwrap a leading <div>.
    if (note_content.StartsWith("<div"))
    {
        log += "<div>;";
        XFragment f = new XFragment(note_content, 0);
        // The old check compared the freshly constructed fragment itself against null, which is
        // always true; the parsed root is what can be missing, as the other branches check.
        if (f.root != null)
        {
            note_content = f.root.innerXHTML;
        }
    }

    // Replace the original note markup with the formatted template.
    string note_full = string.Format(template, note_id, ref_id, note_content);
    text = text.Remove(index, length);
    text = text.Insert(index, note_full);

    Log.log("[Info ]Detected id=" + note_id + ":" + log);
    Log.log("[Info ]Formated id=" + note_id + ":" + note_content);
    contain_footnote = true;
}
/// <summary>
/// Registers the stylesheet referenced by the item's first &lt;link&gt; tag in <c>css</c> and adds the
/// class <c>ae_center</c> to every &lt;p&gt; whose text, ignoring markup and spaces, is one of a fixed
/// set of separator strings (asterisks, "※", "◆", …).
/// </summary>
/// <param name="item">XHTML item to inspect; its <c>data</c> is rewritten in place.</param>
void ProcXHTML(TextItem item)
{
    // --- Stylesheet reference ---
    XTag tag = XTag.FindTag("link", item.data);
    if (tag != null && tag.GetAttribute("type").ToLower() == "text/css")
    {
        string url = tag.GetAttribute("href");
        url = Util.ReferPath(item.fullName, url);

        bool already = false;
        foreach (var c in css)
        {
            if (c.fullName == url)
            {
                already = true;
                break;
            }
        }

        if (!already)
        {
            TextItem i = epub.GetItem<TextItem>(url);
            if (i != null)
            {
                css.Add(i);
            }
            else
            {
                Log.log("[Warn ]Cannot find CSS:" + url);
            }
        }
    }
    else
    {
        Log.log("[Warn ]Cannot find CSS reference.");
    }

    // --- Separator paragraphs ---
    int pos = 0;
    int count = 0;
    tag = XTag.FindTag("p", item.data, ref pos);
    while (tag != null)
    {
        XFragment p = new XFragment(item.data, pos);

        // Reduce the paragraph to its bare text: strip tags, then spaces.
        string onlytext = Regex.Replace(p.root.innerXHTML, "<.*?>", "");
        // NOTE(review): the second Replace was a repeat of the ASCII-space one (a no-op); it is
        // presumably meant to strip the ideographic space U+3000 — confirm against the original file.
        string nospace = onlytext.Replace(" ", "").Replace("\u3000", "");

        switch (nospace)
        {
            case "*":
            // NOTE(review): this label was a second, identical "*" (duplicate case label, CS0152);
            // presumably the full-width asterisk U+FF0A — confirm against the original file.
            case "\uFF0A":
            case "***":
            case "※":
            case "※※※":
            case "◆":
            case "◇":
            case "●":
            case "☆":
            case "⭐":
            case "×××":
                tag.AddClassName("ae_center");
                item.data = item.data.Remove(pos, tag.originalText.Length);
                item.data = item.data.Insert(pos, tag.ToString());
                Log.log("[Info ]Detect Separator:" + p.root.innerXHTML);
                count++;
                break;

            default:
                // Not a known separator; still report very short paragraphs for manual review.
                if (p.root.innerXHTML.Length < 4)
                {
                    Log.log("[ Info]Short <p> element:" + p.root.innerXHTML);
                }
                break;
        }

        // Step past the start of this tag so the next search finds the following one.
        pos++;
        tag = XTag.FindTag("p", item.data, ref pos);
    }

    Log.log("[Info ]Added class for " + count + " elements in " + item.fullName);
}