/// <summary>
/// Builds the manifest dictionary and the spine from the OPF package text.
/// </summary>
/// <remarks>
/// Every <c>item</c> child of the <c>manifest</c> fragment becomes a <c>ManifestItem</c> keyed by its id.
/// Hrefs that do not start with '/' are prefixed with the directory part of the OPF file name.
/// Empty hrefs are left untouched instead of failing on the first-character check.
/// </remarks>
void ReadSpine()
{
    XFragment f = XFragment.FindFragment("manifest", OPF.text);
    _manifest = new Dictionary<string, ManifestItem>();
    foreach (var e in f.root.childs)
    {
        if (e.tag.tagname != "item")
        {
            continue;
        }

        var i = new ManifestItem(e, this);
        _manifest.Add(i.id, i);
    }

    // The OPF directory is identical for every item, so it is resolved once.
    string dir = Path.GetDirectoryName(OPF.fullName);
    if (!string.IsNullOrEmpty(dir))
    {
        foreach (var a in _manifest)
        {
            string href = a.Value.href;

            // An empty href has no first character to test; a leading '/' means no prefix is wanted.
            if (string.IsNullOrEmpty(href) || href[0] == '/')
            {
                continue;
            }

            a.Value.href = dir + "/" + href;
        }
    }

    f = XFragment.FindFragment("spine", OPF.text);
    _spine = new Spine(f, _manifest);
}
/// <summary>
/// Checks that calling <c>ToString()</c> on a fragment created from markup yields the same markup.
/// </summary>
public void ToString_XFragment()
{
    const string expected = @"<test /><test2><child attr1=""attr1value"" /></test2>";
    XFragment sut = expected;

    string actual = sut.ToString();

    Assert.AreEqual(expected, actual);
}
/// <summary>
/// Checks that converting a fragment to <c>string</c> by assignment yields the markup it was created from.
/// </summary>
public void Convert_XFragmentToString()
{
    const string markup = @"<test /><test2><child attr1=""attr1value"" /></test2>";
    XFragment source = markup;

    string converted = source;

    Assert.AreEqual(markup, converted);
}
/// <summary>
/// Inserts an <c>AeroEpubProc</c> meta element, carrying <c>metaValue</c>, into the OPF text of the given epub.
/// </summary>
/// <param name="epub">The epub whose OPF data is modified in place.</param>
public override void Process(Epub epub)
{
    TextItem package = epub.OPF;
    XFragment metadata = XFragment.FindFragment("metadata", package.data);

    // Translate the metadata root's tagEndRef into an index within the OPF text.
    int insertAt = metadata.IndexInSource(metadata.root.tagEndRef);

    string marker = string.Format("\n <meta name=\"AeroEpubProc\" content=\"{0}\" />\n", metaValue);
    package.data = package.data.Insert(insertAt, marker);
}
/// <summary>
/// Checks that a single <c>XElement</c> converts to a fragment whose string form is the element's markup.
/// </summary>
public void Convert_XNodeToXFragment()
{
    var node = new XElement("test");
    XFragment fragment = node;
    var expected = @"<test />";

    // Expected value goes first so that a failure message reports the two values correctly.
    Assert.AreEqual(expected, (string)fragment);
}
/// <summary>
/// Finds the first opening tag named <paramref name="tagName"/> in <paramref name="text"/> and
/// parses a fragment starting at that position.
/// </summary>
/// <param name="tagName">Literal tag name to look for; it is escaped, not treated as a regex pattern.</param>
/// <param name="text">Source text to search.</param>
/// <returns>The fragment built at the match position, or <c>null</c> when no such tag is found.</returns>
public static XFragment FindFragment(string tagName, string text)
{
    // The character after the name must not be a name character, otherwise searching for
    // "h" would hit "<h1" and searching for "dc" would hit "<dc:title".
    Regex reg = new Regex("<" + Regex.Escape(tagName) + @"[^a-zA-Z0-9_:.\-]");
    Match m = reg.Match(text);
    if (!m.Success)
    {
        return null;
    }

    return new XFragment(text, m.Index);
}
/// <summary>
/// Converts the body of an XHTML item to plain text and writes it to
/// <c>epub2note_output/&lt;name&gt;.txt</c> when any non-blank text was found.
/// </summary>
/// <remarks>
/// Text parts are trimmed with <c>Util.Trim</c>. <c>rt</c> tags become parentheses, an opening
/// <c>p</c> becomes "//", and closing <c>p</c>/<c>div</c> tags become a CRLF.
/// </remarks>
/// <param name="i">The item whose <c>data</c> is converted; <c>fullName</c> supplies the output file name.</param>
static void ProcXHTML(TextItem i)
{
    Log.log("[Info]" + i.fullName);
    string name = Path.GetFileNameWithoutExtension(i.fullName);

    // Line breaks are stripped first so that the body pattern can span the whole document.
    string r = i.data.Replace("\r", "").Replace("\n", "");
    Match m = Regex.Match(r, "<body(.*)</body>");
    if (!m.Success)
    {
        Log.log("[Error]body?");
        return;
    }

    XFragment f = new XFragment(m.Value, 0);

    // Builders avoid re-copying the whole output for every appended part.
    var txt = new System.Text.StringBuilder();
    var counter = new System.Text.StringBuilder();

    foreach (var p in f.parts)
    {
        if (p.GetType() == typeof(XText))
        {
            string trimmed = Util.Trim(p.originalText);
            txt.Append(trimmed);
            counter.Append(trimmed);
            continue;
        }

        if (p.GetType() != typeof(XTag))
        {
            continue;
        }

        XTag tag = (XTag)p;
        if (p.type == PartType.tag_start)
        {
            if (tag.tagname == "rt")
            {
                txt.Append("(");
            }
            else if (tag.tagname == "p")
            {
                txt.Append("//");
            }
        }
        else if (p.type == PartType.tag_end)
        {
            switch (tag.tagname)
            {
                case "rt":
                    txt.Append(")");
                    break;
                case "p":
                case "div":
                    txt.Append("\r\n");
                    break;
            }
        }
    }

    // Only text parts feed the counter, so markup-only documents produce no file.
    if (Util.Trim(counter.ToString()).Length > 0)
    {
        File.WriteAllText("epub2note_output/" + name + ".txt", txt.ToString());
    }
}
/// <summary>
/// Checks that a markup string converts to a fragment whose first and last nodes are the
/// expected elements, including the nested child and its attribute.
/// </summary>
public void Convert_StringToXFragment()
{
    XFragment fragment = @"<test /><test2><child attr1=""attr1value"" /></test2>";

    var firstElement = fragment.First() as XElement;
    var lastElement = fragment.Last() as XElement;

    Assert.IsNotNull(firstElement);
    Assert.IsNotNull(lastElement);
    Assert.IsNotNull(lastElement.Element("child"));

    // Expected values go first so that failure messages report the two values correctly.
    Assert.AreEqual("test", firstElement.Name);
    Assert.AreEqual("test2", lastElement.Name);
    Assert.AreEqual("attr1value", (string)lastElement.Element("child").Attribute("attr1"));
}
/// <summary>
/// Checks that an array of <c>XElement</c> nodes converts to a fragment whose string form is
/// the concatenated markup of those nodes.
/// </summary>
public void Convert_XNodesToXFragment()
{
    var nodes = new[]
    {
        new XElement("test"),
        new XElement("test2",
            new XElement("child",
                new XAttribute("attr1", "attr1value"))),
    };
    XFragment fragment = nodes;
    var expected = @"<test /><test2><child attr1=""attr1value"" /></test2>";

    // Expected value goes first so that a failure message reports the two values correctly.
    Assert.AreEqual(expected, (string)fragment);
}
/// <summary>
/// Downloads a Kakuyomu work page and fetches every episode linked from its table of contents.
/// </summary>
/// <param name="url">Address of the work page.</param>
/// <returns>
/// One item per episode, in table-of-contents order, with <c>fullName</c> prefixed by
/// <c>[EP&lt;n&gt;]</c> where n is the 1-based position formatted by <c>Util.Number</c>.
/// </returns>
public static TextEpubItemFile[] KakuyomuWork(string url)
{
    Log.Info("Kakuyomu Work");
    string raw = GetSource(url);

    // Markup is located with ordinal comparison; the culture-sensitive default is meant for
    // linguistic text and can give unexpected positions.
    // NOTE(review): a missing marker yields -1, which is passed on to XFragment unchanged,
    // exactly as before - confirm how XFragment reacts.
    int titleAt = raw.IndexOf("<h1 id=\"workTitle\">", System.StringComparison.Ordinal);
    string title = new XFragment(raw, titleAt).root.childs[0].innerXHTML;
    Log.Info("" + title);

    int tocAt = raw.IndexOf("<div id=\"table-of-contents\">", System.StringComparison.Ordinal);
    XFragment toc = new XFragment(raw, tocAt);

    // Fixed path into the table-of-contents markup to the element whose children are the episode entries.
    var list = toc.root.childs[0].childs[1].childs[0];

    TextEpubItemFile[] xhtmls = new TextEpubItemFile[list.childs.Count];
    for (int i = 0; i < xhtmls.Length; i++)
    {
        string href = list.childs[i].childs[0].tag.GetAttribute("href");
        xhtmls[i] = KakuyomuEpisode("https://kakuyomu.jp" + href);
        xhtmls[i].fullName = $"[EP{Util.Number(i + 1)}]" + xhtmls[i].fullName;
    }

    return xhtmls;
}
/// <summary>
/// Downloads one Kakuyomu episode together with its sidebar and wraps the episode body in the
/// <c>xhtml</c> template.
/// </summary>
/// <param name="url">Address of the episode page; the sidebar is read from <c>url + "/episode_sidebar"</c>.</param>
/// <returns>An item named after the episode title (passed through <c>Util.FilenameCheck</c>) with an <c>.xhtml</c> extension.</returns>
public static TextEpubItemFile KakuyomuEpisode(string url)
{
    Log.Note("Kakuyomu Episode");
    string raw = GetSource(url);
    string sidebar = GetSource(url + "/episode_sidebar");
    Log.Info("Got text.");

    // The episode body runs from the widget's opening div up to the first closing div.
    string part = Regex.Match(raw, "<div class=\"widget-episodeBody js-episode-body[\\s\\S]*?</div>").Value;

    // The title and both timestamps sit at fixed child positions of the sidebar's dl element.
    var entries = XFragment.FindFragment("dl", sidebar).root.childs;
    string title = entries[1].innerXHTML;
    string uploadDate = entries[5].childs[0].tag.GetAttribute("datetime");
    string updateDate = entries[7].childs[0].tag.GetAttribute("datetime");
    Log.Info($"{title}, Upload:{uploadDate}, Update:{updateDate}");

    string meta = $" <title>{title}</title>\n <meta name=\"Source\" content=\"{url}\" />\n <meta name=\"Upload Date\" content=\"{uploadDate}\"/>\n <meta name=\"Update Date\" content=\"{updateDate}\"/>";
    string fileName = Util.FilenameCheck(title) + ".xhtml";
    string content = string.Format(xhtml, meta, part);
    return new TextEpubItemFile(fileName, content);
}
/// <summary>
/// Builds the spine from the <c>spine</c> fragment of the OPF.
/// </summary>
/// <param name="spine">Fragment whose root is the spine element.</param>
/// <param name="items">Manifest items keyed by id; used to resolve the <c>toc</c> attribute and each itemref.</param>
public Spine(XFragment spine, Dictionary<string, ManifestItem> items)
{
    var root = spine.root;

    // A non-empty toc attribute is resolved through the manifest; the indexer throws for an unknown id.
    string toc = root.tag.GetAttribute("toc");
    if (toc != "")
    {
        this.toc = items[toc];
    }

    pageProgressionDirection = root.tag.GetAttribute("page-progression-direction");

    foreach (var e in root.childs)
    {
        if (e.tag.tagname != "itemref")
        {
            continue;
        }

        this.items.Add(new SpineItemref(e, items));
    }
}
/// <summary>
/// Reads the title and creators from the OPF metadata fragment.
/// </summary>
/// <remarks>
/// <c>_title</c> takes the content of the last <c>dc:title</c> child; <c>_creator</c> becomes the
/// contents of all <c>dc:creator</c> children joined by commas.
/// </remarks>
public void ReadMeta()
{
    XFragment metadata = XFragment.FindFragment("metadata", OPF.data);
    _creator = "";

    foreach (var child in metadata.root.childs)
    {
        string tagName = child.tag.tagname;
        if (tagName == "dc:title")
        {
            _title = child.innerXHTML;
        }
        else if (tagName == "dc:creator")
        {
            _creator += child.innerXHTML + ",";
        }
    }

    // Drop the separator left behind by the last creator.
    if (_creator.EndsWith(','))
    {
        _creator = _creator.Remove(_creator.Length - 1);
    }
}
/// <summary>
/// Removes the <c>opf:file-as</c> attribute from every <c>dc:creator</c> element in the OPF
/// metadata and writes the change back to the OPF text.
/// </summary>
/// <param name="epub">The epub whose OPF data is modified in place.</param>
public override void Process(Epub epub)
{
    Log.log("[Start]" + ToString());
    Log.level = " ";
    try
    {
        TextItem opf = epub.OPF;
        XFragment f = XFragment.FindFragment("metadata", opf.data);
        foreach (var e in f.root.childs)
        {
            if (e.tag.tagname != "dc:creator")
            {
                continue;
            }

            string a = e.tag.GetAttribute("opf:file-as");

            // Report a removal only when one happened.
            // NOTE(review): assumes RemoveAttribute returns true when the attribute was removed - confirm.
            if (e.tag.RemoveAttribute("opf:file-as"))
            {
                Log.log("[Info ]Removed meta info opf:file-as=" + a);
            }
        }

        f.Apply(ref opf.data);
    }
    finally
    {
        // Reset the log level even when processing fails, so later log lines are unaffected.
        Log.level = "";
    }

    Log.log("[End]" + ToString());
    Log.log("");
}
/// <summary>
/// Builds an element from the tag part at <paramref name="start"/>, recursively building child
/// elements until the matching end tag is reached.
/// </summary>
/// <param name="frag">Fragment whose <c>parts</c> list is scanned.</param>
/// <param name="start">Index in <c>frag.parts</c> of this element's opening (or single) tag.</param>
/// <exception cref="XMLException">
/// An end tag with a different name is met, or the parts run out before the end tag is found.
/// </exception>
public XELement(XFragment frag, int start)
{
    doc = frag;
    tagStartRef = start;

    // A self-closing tag is its own end and has no children.
    if (doc.parts[tagStartRef].type == PartType.tag_single)
    {
        tagEndRef = start;
        return;
    }

    // NOTE(review): only tag_start parts become children here; tag_single parts met while
    // scanning are skipped and never added to childs - confirm this is intended.
    int cursor = start + 1;
    while (cursor < doc.parts.Count)
    {
        if (doc.parts[cursor].type == PartType.tag_start)
        {
            XELement child = new XELement(doc, cursor);
            child.parent = this;
            childs.Add(child);

            // Resume right after the child's end tag.
            cursor = child.tagEndRef + 1;
            continue;
        }

        if (doc.parts[cursor].type == PartType.tag_end)
        {
            if (((XTag)doc.parts[cursor]).tagname != ((XTag)doc.parts[start]).tagname)
            {
                throw new XMLException("dismatched end tag:" + doc.parts[start] + "..." + doc.parts[cursor]);
            }

            tagEndRef = cursor;
            break;
        }

        cursor++;
    }

    // tagEndRef is presumably initialised to -1 at its declaration; that value means no end tag was found.
    if (tagEndRef == -1)
    {
        throw new XMLException("Failure when close tag.");
    }
}
/// <summary>
/// Converts the body of an XHTML item to lightly marked-up text and writes it to
/// <c>output_dir + name + ".txt"</c> when any non-blank text was found.
/// </summary>
/// <remarks>
/// Headings h1-h6 become <c>[hN]</c>/<c>[/hN]</c> markers, the body tags and opening <c>p</c>
/// tags are dropped, a closing <c>p</c> becomes a CRLF, a closing <c>div</c> is kept and followed
/// by a CRLF, and every other tag is copied through unchanged.
/// </remarks>
/// <param name="i">The item whose <c>data</c> is converted; <c>fullName</c> supplies the output file name.</param>
static void ProcXHTML(TextItem i)
{
    Log.log("[Info]" + i.fullName);
    string name = Path.GetFileNameWithoutExtension(i.fullName);

    // Line breaks are stripped first so that the body pattern can span the whole document.
    string r = i.data.Replace("\r", "").Replace("\n", "");
    Match m = Regex.Match(r, "<body(.*)</body>");
    if (!m.Success)
    {
        Log.log("[Error]body?");
        return;
    }

    XFragment f = new XFragment(m.Value, 0);

    // Builders avoid re-copying the whole output for every appended part.
    var txt = new System.Text.StringBuilder();
    var counter = new System.Text.StringBuilder();

    foreach (var p in f.parts)
    {
        if (p.GetType() == typeof(XText))
        {
            string trimmed = Util.Trim(p.originalText);
            txt.Append(trimmed);
            counter.Append(trimmed);
            continue;
        }

        if (p.GetType() != typeof(XTag))
        {
            continue;
        }

        XTag p0 = (XTag)p;
        if (p.type == PartType.tag_start)
        {
            switch (p0.tagname)
            {
                case "h1":
                case "h2":
                case "h3":
                case "h4":
                case "h5":
                case "h6":
                    txt.Append("[" + p0.tagname + "]");
                    continue;
                case "body":
                    continue;
            }
        }

        if (p.type == PartType.tag_end)
        {
            switch (p0.tagname)
            {
                case "h1":
                case "h2":
                case "h3":
                case "h4":
                case "h5":
                case "h6":
                    txt.Append("[/" + p0.tagname + "]\r\n");
                    continue;
                case "body":
                    continue;
            }
        }

        if (p.type == PartType.tag_end && p0.tagname == "div")
        {
            txt.Append("</div>\r\n");
            continue;
        }

        if (p.type == PartType.tag_start && p0.tagname == "p")
        {
            continue;
        }

        if (p.type == PartType.tag_end && p0.tagname == "p")
        {
            txt.Append("\r\n");
            continue;
        }

        // Any tag not handled above is copied through verbatim.
        txt.Append(p0.originalText);
    }

    // Only text parts feed the counter, so markup-only documents produce no file.
    if (Util.Trim(counter.ToString()).Length > 0)
    {
        File.WriteAllText(output_dir + name + ".txt", txt.ToString());
    }
}
/// <summary>
/// Reads EPUB 3 style metadata from the OPF <c>metadata</c> fragment into the record lists and
/// resolves the unique identifier, the navigation document and the cover image.
/// </summary>
/// <remarks>
/// <c>meta</c> elements with a <c>name</c> attribute, or with a <c>property</c> but no usable
/// <c>refines</c>, go to <c>meta</c>. <c>meta</c> elements with <c>refines="#id"</c> are attached
/// to the first primary record with that id, wherever that record appears in the document.
/// All other elements are primary records and are sorted by name into the dc_* lists or <c>others</c>.
/// </remarks>
void ReadMeta3()
{
    XFragment f = XFragment.FindFragment("metadata", OPF.text);
    List<MetaRecord> primary = new List<MetaRecord>();

    // Refinements are attached after the scan, so a refining meta that precedes its target
    // is no longer dropped.
    var pendingRefines = new List<KeyValuePair<string, MetaRecord>>();

    foreach (var e in f.root.childs)
    {
        switch (e.tag.tagname)
        {
            case "dc:language":
            case "dc:identifier":
                {
                    var t = new MetaRecord(e);
                    primary.Add(t);
                }
                break;

            case "meta":
                {
                    string name = e.tag.GetAttribute("name");
                    if (name != "")
                    {
                        var t = new MetaRecord();
                        t.name = name;
                        t.value = e.tag.GetAttribute("content");
                        meta.Add(t);
                        continue;
                    }

                    string refines = e.tag.GetAttribute("refines");
                    if (refines != "" && refines.StartsWith("#") && refines.Length > 1)
                    {
                        var t = new MetaRecord(e);
                        t.name = e.tag.GetAttribute("property");
                        t.AddIfExist(e, "scheme");
                        pendingRefines.Add(new KeyValuePair<string, MetaRecord>(refines.Substring(1), t));
                        continue;
                    }

                    string property = e.tag.GetAttribute("property");
                    if (property != "")
                    {
                        var t = new MetaRecord(e);
                        t.name = property;
                        meta.Add(t);
                        continue;
                    }
                }
                break;

            default:
                {
                    var t = new MetaRecord(e);
                    t.AddIfExist(e, "xml:lang");
                    t.AddIfExist(e, "dir");
                    primary.Add(t);
                }
                break;
        }
    }

    // Attach each refinement to the first primary record with the referenced id;
    // refinements whose target does not exist are ignored.
    foreach (var pending in pendingRefines)
    {
        foreach (var r in primary)
        {
            if (r.id == pending.Key)
            {
                r.refines.Add(pending.Value);
                break;
            }
        }
    }

    foreach (var a in primary)
    {
        switch (a.name)
        {
            case "dc:title":
                dc_titles.Add(a);
                break;
            case "dc:creator":
                dc_creators.Add(a);
                break;
            case "dc:identifier":
                dc_identifier.Add(a);
                break;
            case "dc:language":
                dc_language.Add(a);
                break;
            default:
                others.Add(a);
                break;
        }
    }

    // The unique identifier is the dc:identifier whose id equals idref.
    foreach (var a in dc_identifier)
    {
        if (idref == a.id)
        {
            uniqueIdentifier = a;
            break;
        }
    }

    foreach (var a in manifest)
    {
        switch (a.Value.properties)
        {
            case "nav":
                _toc = a.Value;
                break;
            case "cover-image":
                cover_img = a.Value.href;
                break;
        }
    }

    // Fall back to the spine's toc when no manifest item has the nav property.
    if (_toc == null)
    {
        _toc = spine.toc;
    }

    //check
    //if (dc_titles.Count == 0 || dc_identifier.Count == 0 || dc_language.Count == 0) { throw new EpubErrorException("Lack of some metadata."); }
}
/// <summary>
/// Reads EPUB 2 style metadata from the OPF <c>metadata</c> fragment into the record lists,
/// then resolves the cover image and the table of contents.
/// </summary>
/// <remarks>
/// <c>meta</c> elements are stored as name/content pairs. Every other element becomes a
/// <c>MetaRecord</c> built from the element, with the <c>opf:*</c> attributes relevant to its tag
/// added when present. The cover is taken from the first <c>meta</c> named "cover" whose value
/// is a manifest id.
/// </remarks>
void ReadMeta2()
{
    XFragment metadata = XFragment.FindFragment("metadata", OPF.text);

    foreach (var element in metadata.root.childs)
    {
        string tagName = element.tag.tagname;

        // meta elements carry plain name/content pairs and are not built from the element itself.
        if (tagName == "meta")
        {
            var pair = new MetaRecord();
            pair.name = element.tag.GetAttribute("name");
            pair.value = element.tag.GetAttribute("content");
            meta.Add(pair);
            continue;
        }

        var record = new MetaRecord(element);
        switch (tagName)
        {
            case "dc:title":
                record.AddIfExist(element, "opf:file-as");
                dc_titles.Add(record);
                break;

            case "dc:creator":
                record.AddIfExist(element, "opf:file-as");
                record.AddIfExist(element, "opf:role");
                dc_creators.Add(record);
                break;

            case "dc:language":
                dc_language.Add(record);
                break;

            case "dc:identifier":
                record.AddIfExist(element, "opf:scheme");
                dc_identifier.Add(record);
                break;

            case "dc:contributor":
                record.AddIfExist(element, "opf:file-as");
                record.AddIfExist(element, "opf:role");
                others.Add(record);
                break;

            case "dc:date":
                record.AddIfExist(element, "opf:event");
                others.Add(record);
                break;

            default:
                others.Add(record);
                break;
        }
    }

    // Only the first meta named "cover" is considered, whether or not its id resolves.
    foreach (var entry in meta)
    {
        if (entry.name != "cover")
        {
            continue;
        }

        string coverId = entry.value;
        if (manifest.ContainsKey(coverId))
        {
            cover_img = manifest[coverId].href;
        }

        break;
    }

    _toc = spine.toc;
}
/// <summary>
/// Creates a <c>SqlXml</c> value from the reader returned by the fragment's <c>CreateReader()</c>.
/// </summary>
/// <param name="xFragment">The fragment to convert.</param>
/// <returns>A new <c>SqlXml</c> built from the fragment's reader.</returns>
public static SqlXml ToSqlXml(this XFragment xFragment)
{
    var reader = xFragment.CreateReader();
    return new SqlXml(reader);
}
/// <summary>
/// Parses the content of a <c>SqlXml</c> value into an <c>XFragment</c>.
/// </summary>
/// <param name="sqlxml">The SQL XML value to read.</param>
/// <returns>The fragment produced by <c>XFragment.Parse</c> from the value's reader.</returns>
public static XFragment ToXFragment(this SqlXml sqlxml)
{
    // The reader is disposed as soon as parsing has finished.
    using (var xmlReader = sqlxml.CreateReader())
    {
        return XFragment.Parse(xmlReader);
    }
}
/// <summary>
/// Finds the footnote with id <paramref name="note_id"/> in <c>text</c>, normalises its content
/// and replaces the original footnote markup with <c>template</c> filled in.
/// </summary>
/// <remarks>
/// The note is looked up first as an <c>aside</c> element carrying the id, then as an element
/// with that id inside an <c>ol</c> whose class names include <c>duokan-footnote-content</c>.
/// Empty anchors are removed from the content and an enclosing <c>div</c> is unwrapped.
/// On success <c>contain_footnote</c> is set; on failure an error is logged and <c>text</c> is unchanged.
/// </remarks>
/// <param name="note_id">Id of the footnote element to reformat.</param>
/// <param name="ref_id">Id passed to the template as its second argument.</param>
void ProcNoteContent(string note_id, string ref_id)
{
    string log = "";
    Regex reg_duokan = new Regex("<ol .*?>");
    Regex reg_aside = new Regex("<aside .*?>");
    int index = -1, length = 0;
    string note_content = null;

    // First pass: an <aside> whose id is the note id.
    Match m = reg_aside.Match(text);
    while (m.Success)
    {
        XTag tag = new XTag(m.Value);
        if (tag.GetAttribute("id") == note_id)
        {
            index = m.Index;
            log += "aside; ";
            XFragment frag = new XFragment(text, index);
            if (frag.root == null)
            {
                Log.log("[Error]Found note but failure on parsing. id=" + note_id);
                return;
            }

            var dk = frag.root.GetElementById(note_id);
            if (dk != null)
            {
                // Compatibility case: a Duokan-style element with the same id nested inside the aside.
                note_content = dk.innerXHTML;
                log += "duplicate id at <" + dk.tag.tagname + ">" + dk.tag.GetAttribute("class") + "; ";
            }
            else
            {
                note_content = frag.root.innerXHTML;
            }

            length = frag.originalLength;
            break;
        }

        m = m.NextMatch();
    }

    // Second pass: the file is adapted for Duokan only and has no matching aside.
    if (index < 0)
    {
        m = reg_duokan.Match(text);
        while (m.Success)
        {
            XFragment frg = new XFragment(text, m.Index);
            if (frg.root != null
                && Util.Contains(frg.root.tag.GetClassNames(), "duokan-footnote-content"))
            {
                var a = frg.root.GetElementById(note_id);
                if (a != null)
                {
                    index = m.Index;
                    note_content = a.innerXHTML;
                    length = frg.originalLength;
                    log += "duokan-footnote; ";
                    break;
                }
            }

            m = m.NextMatch();
        }
    }

    if (note_content == null)
    {
        Log.log("[Error]cannot find note");
        return;
    }

    // Remove anchors that have no content.
    if (Regex.Match(note_content, "<a .*?></a>").Success)
    {
        note_content = Regex.Replace(note_content, "<a .*?></a>", "");
        log += "empty <a> tag;";
    }

    note_content = Util.Trim(note_content);

    // Unwrap an enclosing div. The parse result is checked through root, as in the lookups
    // above; a freshly constructed fragment object itself is never null.
    if (note_content.StartsWith("<div"))
    {
        log += "<div>;";
        XFragment f = new XFragment(note_content, 0);
        if (f.root != null)
        {
            note_content = f.root.innerXHTML;
        }
    }

    // Replace the original footnote markup with the formatted note at the same position.
    string note_full = string.Format(template, note_id, ref_id, note_content);
    text = text.Remove(index, length);
    text = text.Insert(index, note_full);

    Log.log("[Info ]Detected id=" + note_id + ":" + log);
    Log.log("[Info ]Formated id=" + note_id + ":" + note_content);
    contain_footnote = true;
}
/// <summary>
/// Converts the body of an XHTML item to a commenting template and writes it to
/// <c>epub2comment_output/&lt;name&gt;.txt</c> when any non-blank text was found.
/// </summary>
/// <remarks>
/// Each paragraph is prefixed with "##" and followed by a blank slot and a separator line.
/// When the most recent text part starts with an opening corner bracket, the slot is pre-filled
/// with the matching empty bracket pair. <c>img</c> tags are copied through and <c>rt</c> tags
/// become parentheses.
/// </remarks>
/// <param name="i">The item whose <c>data</c> is converted; <c>fullName</c> supplies the output file name.</param>
static void ProcXHTML(TextItem i)
{
    Log.log("[Info]" + i.fullName);
    string name = Path.GetFileNameWithoutExtension(i.fullName);

    // Line breaks are stripped first so that the body pattern can span the whole document.
    string r = i.data.Replace("\r", "").Replace("\n", "");
    Match m = Regex.Match(r, "<body(.*)</body>");
    if (!m.Success)
    {
        Log.log("[Error]body?");
        return;
    }

    XFragment f = new XFragment(m.Value, 0);

    // Builders avoid re-copying the whole output for every appended part.
    var txt = new System.Text.StringBuilder();
    var counter = new System.Text.StringBuilder();

    // Bracket pair offered in the slot after the paragraph; decided by the latest text part.
    string temp = "";

    foreach (var p in f.parts)
    {
        if (p.GetType() == typeof(XText))
        {
            string trimmed = Util.Trim(p.originalText);
            txt.Append(trimmed);
            counter.Append(trimmed);

            temp = "";
            if (trimmed.Length > 0)
            {
                switch (trimmed[0])
                {
                    case '「':
                        temp = "「」";
                        break;
                    case '『':
                        temp = "『』";
                        break;
                }
            }
        }

        if (p.GetType() != typeof(XTag))
        {
            continue;
        }

        XTag p0 = (XTag)p;

        // Images are copied through whatever their part type is.
        if (p0.tagname == "img")
        {
            txt.Append(p0.originalText);
        }

        if (p.type == PartType.tag_start)
        {
            if (p0.tagname == "rt")
            {
                txt.Append("(");
            }
            else if (p0.tagname == "p")
            {
                txt.Append("##");
            }
        }
        else if (p.type == PartType.tag_end)
        {
            switch (p0.tagname)
            {
                case "rt":
                    txt.Append(")");
                    break;
                case "p":
                    txt.Append("\r\n").Append(temp).Append("\r\n##——————\r\n");
                    break;
                case "div":
                    txt.Append("\r\n\r\n##——————\r\n");
                    break;
            }
        }
    }

    // Only text parts feed the counter, so markup-only documents produce no file.
    if (Util.Trim(counter.ToString()).Length > 0)
    {
        File.WriteAllText("epub2comment_output/" + name + ".txt", txt.ToString());
    }
}
/// <summary>
/// Registers the stylesheet referenced by an XHTML item and marks separator paragraphs with the
/// <c>ae_center</c> class.
/// </summary>
/// <remarks>
/// Step 1: the first <c>link</c> tag found in <c>item.data</c> is inspected. If its <c>type</c> is
/// "text/css" (compared after lower-casing), its <c>href</c> is resolved against the item's name
/// with <c>Util.ReferPath</c> and the matching <c>TextItem</c> from <c>epub</c> is added to
/// <c>css</c> unless an item with that <c>fullName</c> is already there. A warning is logged when
/// no link tag exists, when the type differs, or when the resolved item cannot be found.
/// Step 2: every <c>p</c> tag returned by <c>XTag.FindTag</c> is parsed as a fragment at its
/// position. Its inner markup is stripped of tags and spaces; when the remainder equals one of the
/// listed separator strings, the class is added and the tag text in <c>item.data</c> is replaced
/// with the updated tag. Otherwise, paragraphs whose inner markup is shorter than four characters
/// are logged. The number of changed elements is logged at the end.
/// NOTE(review): two of the separator case labels and the two arguments of the chained Replace
/// calls render identically here; presumably one of each is a full-width variant - confirm
/// against the file's original encoding before editing these literals.
/// NOTE(review): the short-paragraph message uses the prefix "[ Info]" while the other messages
/// use "[Info ]" - confirm whether that is intentional.
/// </remarks>
/// <param name="item">The XHTML item to inspect; its <c>data</c> is modified in place.</param>
void ProcXHTML(TextItem item) { XTag tag = XTag.FindTag("link", item.data); if (tag != null) { if (tag.GetAttribute("type").ToLower() == "text/css") { string url = tag.GetAttribute("href"); url = Util.ReferPath(item.fullName, url); bool already = false; foreach (var c in css) { if (c.fullName == url) { already = true; break; } } if (!already) { TextItem i = epub.GetItem <TextItem>(url); if (i != null) { css.Add(i); } else { Log.log("[Warn ]Cannot find CSS:" + url); } } } else { Log.log("[Warn ]Cannot find CSS reference."); } } else { Log.log("[Warn ]Cannot find CSS reference."); } int pos = 0; tag = XTag.FindTag("p", item.data, ref pos); int count = 0; while (tag != null) { XFragment p = new XFragment(item.data, pos); string onlytext = Regex.Replace(p.root.innerXHTML, "<.*?>", ""); string nospace = onlytext.Replace(" ", "").Replace(" ", ""); switch (nospace) { case "*": case "*": case "***": case "※": case "※※※": case "◆": case "◇": case "●": case "☆": case "⭐": case "×××": tag.AddClassName("ae_center"); item.data = item.data.Remove(pos, tag.originalText.Length); item.data = item.data.Insert(pos, tag.ToString()); Log.log("[Info ]Detect Separator:" + p.root.innerXHTML); count++; break; default: if (p.root.innerXHTML.Length < 4) { Log.log("[ Info]Short <p> element:" + p.root.innerXHTML); } break; } pos++; tag = XTag.FindTag("p", item.data, ref pos); } Log.log("[Info ]Added class for " + count + " elements in " + item.fullName); }