Пример #1
0
        /// <summary>
        /// Reads the OPF "manifest" fragment into <c>_manifest</c> (keyed by item id),
        /// prefixes every relative item href with the directory of the OPF file, and
        /// then builds <c>_spine</c> from the "spine" fragment.
        /// </summary>
        void ReadSpine()
        {
            XFragment f = XFragment.FindFragment("manifest", OPF.text);

            _manifest = new Dictionary <string, ManifestItem>();
            foreach (var e in f.root.childs)
            {
                // Only <item> children describe manifest entries.
                if (e.tag.tagname != "item")
                {
                    continue;
                }
                var i = new ManifestItem(e, this);
                _manifest.Add(i.id, i);
            }

            // The OPF directory is the same for every item, so resolve it once.
            string dir = Path.GetDirectoryName(OPF.fullName);
            if (dir != "")
            {
                foreach (var a in _manifest)
                {
                    string href = a.Value.href;
                    // An empty href has no first character to inspect; leave it untouched
                    // instead of failing on href[0]. Hrefs starting with '/' are kept as they are.
                    if (string.IsNullOrEmpty(href) || href[0] == '/')
                    {
                        continue;
                    }
                    a.Value.href = dir + "/" + href;
                }
            }

            f      = XFragment.FindFragment("spine", OPF.text);
            _spine = new Spine(f, _manifest);
        }
Пример #2
0
        // Round trip: a string converted to an XFragment must print back unchanged via ToString().
        public void ToString_XFragment()
        {
            const string expected = @"<test /><test2><child attr1=""attr1value"" /></test2>";

            XFragment parsed = expected;
            string actual = parsed.ToString();

            Assert.AreEqual(expected, actual);
        }
Пример #3
0
        // Round trip: string -> XFragment -> string through the conversion operators.
        public void Convert_XFragmentToString()
        {
            const string expected = @"<test /><test2><child attr1=""attr1value"" /></test2>";

            XFragment parsed = expected;
            string actual = parsed;

            Assert.AreEqual(expected, actual);
        }
Пример #4
0
        /// <summary>
        /// Inserts an "AeroEpubProc" meta element, carrying <c>metaValue</c>, into the OPF text
        /// at the source position of the end part of the "metadata" element.
        /// </summary>
        /// <param name="epub">Book whose OPF data is rewritten in place.</param>
        public override void Process(Epub epub)
        {
            TextItem  opf      = epub.OPF;
            XFragment metadata = XFragment.FindFragment("metadata", opf.data);

            // Translate the root element's end-part reference into an offset within opf.data.
            int insertAt = metadata.IndexInSource(metadata.root.tagEndRef);

            string metaLine = string.Format("\n    <meta name=\"AeroEpubProc\" content=\"{0}\" />\n", metaValue);
            opf.data = opf.data.Insert(insertAt, metaLine);
        }
Пример #5
0
        // A single XElement converted to an XFragment must serialize as that element's markup.
        public void Convert_XNodeToXFragment()
        {
            var       nodes    = new XElement("test");
            XFragment fragment = nodes;

            var xml = @"<test />";

            // Expected value first, so a failure message reports the two sides correctly.
            Assert.AreEqual(xml, (string)fragment);
        }
Пример #6
0
    /// <summary>
    /// Finds the first opening tag named <paramref name="tagName"/> in <paramref name="text"/>
    /// and builds an <c>XFragment</c> starting at that position.
    /// </summary>
    /// <param name="tagName">Literal tag name to look for (e.g. "metadata").</param>
    /// <param name="text">Markup to search.</param>
    /// <returns>The fragment starting at the first match, or <c>null</c> when no such tag is found.</returns>
    public static XFragment FindFragment(string tagName, string text)
    {
        // Escape the name so regex metacharacters in it (for example '.') are matched literally.
        // The trailing class stops a longer alphabetic name from matching, e.g. "<item" inside "<itemref".
        Regex reg = new Regex("<" + Regex.Escape(tagName) + "[^a-zA-Z]");
        Match m   = reg.Match(text);

        if (m.Success)
        {
            return(new XFragment(text, m.Index));
        }
        return(null);
    }
Пример #7
0
    /// <summary>
    /// Converts the body of an XHTML item to plain text and writes it to
    /// "epub2note_output/&lt;name&gt;.txt". Ruby annotations (rt) are wrapped in parentheses,
    /// each paragraph starts with "//", and paragraphs and divs end with a line break.
    /// Nothing is written when the body contains no non-blank text.
    /// </summary>
    /// <param name="i">Item whose <c>data</c> holds the XHTML source.</param>
    static void ProcXHTML(TextItem i)
    {
        Log.log("[Info]" + i.fullName);
        string name      = Path.GetFileNameWithoutExtension(i.fullName);
        string flattened = i.data.Replace("\r", "").Replace("\n", "");
        Match  bodyMatch = Regex.Match(flattened, "<body(.*)</body>");

        if (!bodyMatch.Success)
        {
            Log.log("[Error]body?");
            return;
        }

        XFragment fragment = new XFragment(bodyMatch.Value, 0);
        string    txt      = "";
        string    counter  = "";   // text content only, used to detect an effectively empty body

        foreach (var part in fragment.parts)
        {
            if (part.GetType() == typeof(XText))
            {
                string trimmed = Util.Trim(part.originalText);
                txt     += trimmed;
                counter += trimmed;
            }
            else if (part.GetType() == typeof(XTag))
            {
                XTag tag    = (XTag)part;
                bool opens  = part.type == PartType.tag_start;
                bool closes = part.type == PartType.tag_end;

                switch (tag.tagname)
                {
                case "rt":
                    if (opens)
                    {
                        txt += "(";
                    }
                    else if (closes)
                    {
                        txt += ")";
                    }
                    break;

                case "p":
                    if (opens)
                    {
                        txt += "//";
                    }
                    else if (closes)
                    {
                        txt += "\r\n";
                    }
                    break;

                case "div":
                    if (closes)
                    {
                        txt += "\r\n";
                    }
                    break;
                }
            }
        }

        if (Util.Trim(counter).Length > 0)
        {
            File.WriteAllText("epub2note_output/" + name + ".txt", txt);
        }
    }
Пример #8
0
        // A string with two top-level elements must convert to an XFragment exposing both elements,
        // including the nested child and its attribute.
        public void Convert_StringToXFragment()
        {
            XFragment fragment = @"<test /><test2><child attr1=""attr1value"" /></test2>";

            var firstElement = fragment.First() as XElement;
            var lastElement  = fragment.Last() as XElement;

            Assert.IsNotNull(firstElement);
            Assert.IsNotNull(lastElement);
            Assert.IsNotNull(lastElement.Element("child"));
            // Expected value first, so a failure message reports the two sides correctly.
            Assert.AreEqual("test", firstElement.Name);
            Assert.AreEqual("test2", lastElement.Name);
            Assert.AreEqual("attr1value", (string)lastElement.Element("child").Attribute("attr1"));
        }
Пример #9
0
        // An array of XElements converted to an XFragment must serialize as the concatenated markup.
        public void Convert_XNodesToXFragment()
        {
            var nodes = new[] {
                new XElement("test"),
                new XElement("test2",
                             new XElement("child",
                                          new XAttribute("attr1", "attr1value")
                                          )
                             ),
            };
            XFragment fragment = nodes;

            var xml = @"<test /><test2><child attr1=""attr1value"" /></test2>";

            // Expected value first, so a failure message reports the two sides correctly.
            Assert.AreEqual(xml, (string)fragment);
        }
Пример #10
0
    /// <summary>
    /// Downloads a Kakuyomu work page, reads its title and table of contents, and fetches
    /// every listed episode through <c>KakuyomuEpisode</c>.
    /// </summary>
    /// <param name="url">Address of the work page, passed to <c>GetSource</c>.</param>
    /// <returns>
    /// One item per table-of-contents entry, in listing order; each file name is prefixed
    /// with "[EP" + <c>Util.Number(index + 1)</c> + "]".
    /// </returns>
    public static TextEpubItemFile[] KakuyomuWork(string url)
    {
        Log.Info("Kakuyomu Work");
        string raw = GetSource(url);

        // The markers are literal markup, so search ordinally: a culture-sensitive IndexOf
        // can report unexpected positions depending on the current culture.
        // IndexOf returns -1 when a marker is missing; that value is passed on unchanged.
        int titleAt = raw.IndexOf("<h1 id=\"workTitle\">", System.StringComparison.Ordinal);
        int tocAt   = raw.IndexOf("<div id=\"table-of-contents\">", System.StringComparison.Ordinal);

        string title = (new XFragment(raw, titleAt)).root.childs[0].innerXHTML;

        Log.Info("" + title);
        XFragment toc  = new XFragment(raw, tocAt);
        // Fixed path through the table-of-contents markup down to the element whose children are the episode entries.
        var       list = toc.root.childs[0].childs[1].childs[0];

        TextEpubItemFile[] xhtmls = new TextEpubItemFile[list.childs.Count];
        for (int i = 0; i < xhtmls.Length; i++)
        {
            // The first child of each entry carries the site-relative episode href.
            xhtmls[i]          = KakuyomuEpisode("https://kakuyomu.jp" + list.childs[i].childs[0].tag.GetAttribute("href"));
            xhtmls[i].fullName = $"[EP{Util.Number(i + 1)}]" + xhtmls[i].fullName;
        }
        return(xhtmls);
    }
Пример #11
0
    /// <summary>
    /// Downloads one Kakuyomu episode and its sidebar, extracts the episode body together
    /// with the title and upload/update timestamps, and wraps them in the <c>xhtml</c> template.
    /// </summary>
    /// <param name="url">Episode address; "/episode_sidebar" is appended to fetch the sidebar.</param>
    /// <returns>An item named after the checked title with the ".xhtml" extension.</returns>
    public static TextEpubItemFile KakuyomuEpisode(string url)
    {
        Log.Note("Kakuyomu Episode");
        string page        = GetSource(url);
        string sidebarPage = GetSource(url + "/episode_sidebar");
        Log.Info("Got text.");

        // Lazily match from the episode body container up to the first closing div.
        string bodyPart = Regex.Match(page, "<div class=\"widget-episodeBody js-episode-body[\\s\\S]*?</div>").Value;

        // Title and dates are read from fixed child positions of the sidebar's <dl>.
        var    entries    = XFragment.FindFragment("dl", sidebarPage).root.childs;
        string title      = entries[1].innerXHTML;
        string uploadDate = entries[5].childs[0].tag.GetAttribute("datetime");
        string updateDate = entries[7].childs[0].tag.GetAttribute("datetime");
        Log.Info($"{title}, Upload:{uploadDate}, Update:{updateDate}");

        string head = $"    <title>{title}</title>\n    <meta name=\"Source\" content=\"{url}\" />\n    <meta name=\"Upload Date\" content=\"{uploadDate}\"/>\n    <meta name=\"Update Date\" content=\"{updateDate}\"/>";

        string fileName = Util.FilenameCheck(title) + ".xhtml";
        string content  = string.Format(xhtml, head, bodyPart);
        return new TextEpubItemFile(fileName, content);
    }
Пример #12
0
        /// <summary>
        /// Builds the spine from the "spine" fragment: resolves the optional <c>toc</c>
        /// attribute against the manifest, records the page progression direction, and adds one
        /// <c>SpineItemref</c> per <c>itemref</c> child.
        /// </summary>
        /// <param name="spine">Fragment whose root is the spine element.</param>
        /// <param name="items">Manifest items keyed by id.</param>
        /// <exception cref="KeyNotFoundException">
        /// The <c>toc</c> attribute names an id that is not present in <paramref name="items"/>.
        /// </exception>
        public Spine(XFragment spine, Dictionary <string, ManifestItem> items)
        {
            string toc = spine.root.tag.GetAttribute("toc");

            // An absent toc attribute is skipped; the null check keeps a null value away from the indexer.
            if (!string.IsNullOrEmpty(toc))
            {
                this.toc = items[toc];
            }
            pageProgressionDirection = spine.root.tag.GetAttribute("page-progression-direction");
            foreach (var e in spine.root.childs)
            {
                // Anything other than <itemref> is ignored.
                if (e.tag.tagname != "itemref")
                {
                    continue;
                }
                this.items.Add(new SpineItemref(e, items));
            }
        }
Пример #13
0
    /// <summary>
    /// Reads the title and creators from the OPF "metadata" fragment. The last
    /// <c>dc:title</c> wins; all <c>dc:creator</c> values are joined with commas.
    /// </summary>
    public void ReadMeta()
    {
        XFragment metadata = XFragment.FindFragment("metadata", OPF.data);

        _creator = "";

        foreach (var child in metadata.root.childs)
        {
            string tagName = child.tag.tagname;
            if (tagName == "dc:title")
            {
                _title = child.innerXHTML;
            }
            else if (tagName == "dc:creator")
            {
                _creator += child.innerXHTML + ",";
            }
        }

        // Drop the trailing separator left behind by the last creator.
        if (_creator.EndsWith(','))
        {
            _creator = _creator.Remove(_creator.Length - 1);
        }
    }
Пример #14
0
        /// <summary>
        /// Removes the <c>opf:file-as</c> attribute from every <c>dc:creator</c> element in the
        /// OPF metadata and writes the modified fragment back into the OPF data.
        /// </summary>
        /// <param name="epub">Book whose OPF data is rewritten in place.</param>
        public override void Process(Epub epub)
        {
            Log.log("[Start]" + ToString());
            Log.level = " ";
            try
            {
                TextItem  opf = epub.OPF;
                XFragment f   = XFragment.FindFragment("metadata", opf.data);

                foreach (var e in f.root.childs)
                {
                    if (e.tag.tagname != "dc:creator")
                    {
                        continue;
                    }
                    // Read the value first so it can still be reported after removal.
                    string a = e.tag.GetAttribute("opf:file-as");
                    // Only report a removal that happened.
                    // NOTE(review): assumes RemoveAttribute returns true when an attribute was removed - confirm.
                    if (e.tag.RemoveAttribute("opf:file-as"))
                    {
                        Log.log("[Info ]Removed meta info opf:file-as=" + a);
                    }
                }
                f.Apply(ref opf.data);
            }
            finally
            {
                // Reset the log indentation even when processing fails part-way.
                Log.level = "";
            }
            Log.log("[End]" + ToString());
            Log.log("");
        }
Пример #15
0
 /// <summary>
 /// Builds the element whose opening (or self-closing) tag is part <paramref name="start"/>
 /// of <paramref name="frag"/>, recursively constructing child elements until the matching
 /// end tag is reached.
 /// </summary>
 /// <param name="frag">Fragment that owns the parsed parts.</param>
 /// <param name="start">Index in <c>frag.parts</c> of this element's first tag.</param>
 /// <exception cref="XMLException">
 /// An end tag with a different name is encountered, or <c>tagEndRef</c> is still -1 after the scan.
 /// </exception>
 public XELement(XFragment frag, int start)
 {
     doc         = frag;
     tagStartRef = start;

     // A self-closing tag is its own end.
     if (doc.parts[tagStartRef].type == PartType.tag_single)
     {
         tagEndRef = start;
         return;
     }

     int cursor = start + 1;
     while (cursor < doc.parts.Count)
     {
         var part = doc.parts[cursor];

         if (part.type == PartType.tag_start)
         {
             // Nested element: build it, then resume right after its end tag.
             XELement child = new XELement(doc, cursor);
             child.parent = this;
             childs.Add(child);
             cursor = child.tagEndRef + 1;
             continue;
         }

         if (part.type == PartType.tag_end)
         {
             if (((XTag)part).tagname != ((XTag)doc.parts[start]).tagname)
             {
                 throw new XMLException("dismatched end tag:" + doc.parts[start] + "..." + part);
             }
             tagEndRef = cursor;
             break;
         }

         cursor++;
     }

     if (tagEndRef == -1)
     {
         throw new XMLException("Failure when close tag.");
     }
 }
Пример #16
0
    /// <summary>
    /// Converts the body of an XHTML item to text and writes it to
    /// <c>output_dir</c> + "&lt;name&gt;.txt". Headings h1-h6 become "[hN]...[/hN]" markers,
    /// body and opening p tags are dropped, closing p tags become a line break, closing divs
    /// become "&lt;/div&gt;" plus a line break, and every other tag is copied verbatim.
    /// Nothing is written when the body contains no non-blank text.
    /// </summary>
    /// <param name="i">Item whose <c>data</c> holds the XHTML source.</param>
    static void ProcXHTML(TextItem i)
    {
        Log.log("[Info]" + i.fullName);
        string name      = Path.GetFileNameWithoutExtension(i.fullName);
        string flattened = i.data.Replace("\r", "").Replace("\n", "");
        Match  bodyMatch = Regex.Match(flattened, "<body(.*)</body>");

        if (!bodyMatch.Success)
        {
            Log.log("[Error]body?");
            return;
        }

        XFragment fragment = new XFragment(bodyMatch.Value, 0);
        string    txt      = "";
        string    counter  = "";   // text content only, used to detect an effectively empty body

        foreach (var part in fragment.parts)
        {
            if (part.GetType() == typeof(XText))
            {
                string trimmed = Util.Trim(part.originalText);
                txt     += trimmed;
                counter += trimmed;
            }
            else if (part.GetType() == typeof(XTag))
            {
                XTag   tag     = (XTag)part;
                string tagName = tag.tagname;
                bool   opens   = part.type == PartType.tag_start;
                bool   closes  = part.type == PartType.tag_end;
                bool   heading = tagName == "h1" || tagName == "h2" || tagName == "h3" ||
                                 tagName == "h4" || tagName == "h5" || tagName == "h6";

                // Decide what this tag contributes to the output; an empty string means "drop it".
                string emitted;
                if (opens && heading)
                {
                    emitted = "[" + tagName + "]";
                }
                else if (closes && heading)
                {
                    emitted = "[/" + tagName + "]\r\n";
                }
                else if ((opens || closes) && tagName == "body")
                {
                    emitted = "";
                }
                else if (closes && tagName == "div")
                {
                    emitted = "</div>\r\n";
                }
                else if (opens && tagName == "p")
                {
                    emitted = "";
                }
                else if (closes && tagName == "p")
                {
                    emitted = "\r\n";
                }
                else
                {
                    emitted = tag.originalText;
                }
                txt += emitted;
            }
        }

        if (Util.Trim(counter).Length > 0)
        {
            File.WriteAllText(output_dir + name + ".txt", txt);
        }
    }
Пример #17
0
        /// <summary>
        /// Reads the OPF "metadata" fragment: non-meta elements are collected as primary
        /// records, <c>meta</c> elements are either stored in <c>meta</c> or attached as
        /// refinements to an already-seen primary record, and the primary records are then
        /// sorted into the dc_* lists or <c>others</c>. Finally the unique identifier, the
        /// navigation item (<c>_toc</c>) and the cover image are resolved.
        /// </summary>
        void ReadMeta3()
        {
            XFragment         f       = XFragment.FindFragment("metadata", OPF.text);
            List <MetaRecord> primary = new List <MetaRecord>();

            foreach (var e in f.root.childs)
            {
                switch (e.tag.tagname)
                {
                case "dc:language":
                case "dc:identifier":
                {
                    var t = new MetaRecord(e);
                    primary.Add(t);
                }
                break;

                case "meta":
                {
                    // A meta element carrying a name attribute is stored as a name/content pair.
                    // Note: each `continue` in this case moves on to the next child of the foreach.
                    string name = e.tag.GetAttribute("name");
                    if (name != "")
                    {
                        var t = new MetaRecord();
                        t.name  = name;
                        t.value = e.tag.GetAttribute("content");
                        meta.Add(t);
                        continue;
                    }
                    // refines="#id": attach this record to the primary record with that id.
                    // A refines value that is not "#" plus an id falls through to the property check below.
                    string refines = e.tag.GetAttribute("refines");
                    if (refines != "")
                    {
                        if (refines.StartsWith("#") && refines.Length > 1)
                        {
                            string id = refines.Substring(1);
                            var    t  = new MetaRecord(e);
                            t.name = e.tag.GetAttribute("property");
                            t.AddIfExist(e, "scheme");
                            foreach (var r in primary)
                            {         // If a refine appears before its primary element, it is not handled here.
                                if (r.id == id)
                                {
                                    r.refines.Add(t);
                                    break;
                                }
                            }
                            continue;
                        }
                    }
                    // Otherwise a meta element with a property attribute is stored under that property name.
                    string property = e.tag.GetAttribute("property");
                    if (property != "")
                    {
                        var t = new MetaRecord(e);
                        t.name = property;
                        meta.Add(t);
                        continue;
                    }
                    // A meta element matching none of the branches above is not recorded.
                }
                break;

                default:
                {
                    var t = new MetaRecord(e);
                    t.AddIfExist(e, "xml:lang");
                    t.AddIfExist(e, "dir");
                    primary.Add(t);
                }
                break;
                }
            }
            // Sort the primary records into the dedicated lists by record name.
            // NOTE(review): presumably MetaRecord(e) sets `name` to the element's tag name - confirm.
            foreach (var a in primary)
            {
                switch (a.name)
                {
                case "dc:title": dc_titles.Add(a); break;

                case "dc:creator": dc_creators.Add(a); break;

                case "dc:identifier": dc_identifier.Add(a); break;

                case "dc:language": dc_language.Add(a); break;

                default: others.Add(a); break;
                }
            }
            // The unique identifier is the first dc:identifier whose id equals `idref`.
            foreach (var a in dc_identifier)
            {
                if (idref == a.id)
                {
                    uniqueIdentifier = a; break;
                }
            }
            // Only a `properties` value that is exactly "nav" or "cover-image" is recognized.
            foreach (var a in manifest)
            {
                switch (a.Value.properties)
                {
                case "nav": _toc = a.Value; break;

                case "cover-image": cover_img = a.Value.href; break;
                }
            }
            // Fall back to the spine's toc when no manifest item was picked as navigation.
            if (_toc == null)
            {
                _toc = spine.toc;
            }
            //check
            //if (dc_titles.Count == 0 || dc_identifier.Count == 0 || dc_language.Count == 0) { throw new EpubErrorException("Lack of some metadata."); }
        }
Пример #18
0
        /// <summary>
        /// Reads the OPF "metadata" fragment: Dublin Core elements are sorted into the
        /// dc_* lists or <c>others</c> (capturing their opf:* attributes where present),
        /// <c>meta</c> elements are stored as name/content pairs, the cover image is resolved
        /// from the "cover" meta entry, and <c>_toc</c> is taken from the spine.
        /// </summary>
        void ReadMeta2()
        {
            XFragment metadata = XFragment.FindFragment("metadata", OPF.text);

            foreach (var node in metadata.root.childs)
            {
                string tagName = node.tag.tagname;

                // <meta> is the only element that is not built from the element itself.
                if (tagName == "meta")
                {
                    var pair = new MetaRecord();
                    pair.name  = node.tag.GetAttribute("name");
                    pair.value = node.tag.GetAttribute("content");
                    meta.Add(pair);
                    continue;
                }

                var record = new MetaRecord(node);
                if (tagName == "dc:title")
                {
                    record.AddIfExist(node, "opf:file-as");
                    dc_titles.Add(record);
                }
                else if (tagName == "dc:creator")
                {
                    record.AddIfExist(node, "opf:file-as");
                    record.AddIfExist(node, "opf:role");
                    dc_creators.Add(record);
                }
                else if (tagName == "dc:language")
                {
                    dc_language.Add(record);
                }
                else if (tagName == "dc:identifier")
                {
                    record.AddIfExist(node, "opf:scheme");
                    dc_identifier.Add(record);
                }
                else if (tagName == "dc:contributor")
                {
                    record.AddIfExist(node, "opf:file-as");
                    record.AddIfExist(node, "opf:role");
                    others.Add(record);
                }
                else if (tagName == "dc:date")
                {
                    record.AddIfExist(node, "opf:event");
                    others.Add(record);
                }
                else
                {
                    others.Add(record);
                }
            }

            // Only the first meta entry named "cover" is considered; its value is a manifest id.
            foreach (var candidate in meta)
            {
                if (candidate.name != "cover")
                {
                    continue;
                }
                string coverId = candidate.value;
                if (manifest.ContainsKey(coverId))
                {
                    cover_img = manifest[coverId].href;
                }
                break;
            }

            _toc = spine.toc;
        }
Пример #19
0
 /// <summary>
 /// Wraps the fragment in a <see cref="SqlXml"/> built from the fragment's reader.
 /// </summary>
 /// <param name="xFragment">Fragment to convert.</param>
 /// <returns>A new <see cref="SqlXml"/> created from <c>xFragment.CreateReader()</c>.</returns>
 public static SqlXml ToSqlXml(this XFragment xFragment)
 {
     var reader = xFragment.CreateReader();
     return new SqlXml(reader);
 }
Пример #20
0
 /// <summary>
 /// Parses the content of a <see cref="SqlXml"/> value into an <c>XFragment</c>.
 /// </summary>
 /// <param name="sqlxml">SQL XML value to read.</param>
 /// <returns>The fragment produced by <c>XFragment.Parse</c> from the value's reader.</returns>
 public static XFragment ToXFragment(this SqlXml sqlxml)
 {
     // The reader is disposed as soon as parsing has finished.
     using (var reader = sqlxml.CreateReader())
     {
         return XFragment.Parse(reader);
     }
 }
Пример #21
0
        /// <summary>
        /// Locates the footnote with id <paramref name="note_id"/> in <c>text</c> (first as an
        /// <c>aside</c> element, otherwise inside an <c>ol</c> whose class names include
        /// "duokan-footnote-content"), extracts and cleans its content, and replaces the original
        /// markup with <c>template</c> formatted with the note id, the reference id and the content.
        /// Sets <c>contain_footnote</c> on success; logs an error and returns when the note
        /// cannot be found or parsed.
        /// </summary>
        /// <param name="note_id">Id of the footnote element to rewrite.</param>
        /// <param name="ref_id">Id of the referring element, passed through to the template.</param>
        void ProcNoteContent(string note_id, string ref_id)
        {
            string log = "";
            // NOTE(review): reg_tag is never used in this method, and list_value is assigned below but never read.
            Regex  reg_tag = new Regex("<.*?>");
            Regex  reg_duokan = new Regex("<ol .*?>");
            Regex  reg_aside = new Regex("<aside .*?>");
            // index/length describe the span of `text` that will be replaced by the formatted note.
            int    index = -1, length = 0;
            string note_content = null; string list_value = "1";

            Match m = reg_aside.Match(text);

            // Pass 1: look for an <aside> whose id attribute equals note_id.
            while (m.Success)
            {
                XTag tag = new XTag(m.Value);
                if (tag.GetAttribute("id") == note_id)
                {
                    index = m.Index;
                    log  += "aside; ";
                    XFragment frag = new XFragment(text, index);
                    if (frag.root != null)
                    {
                        var dk = frag.root.GetElementById(note_id);
                        if (dk != null)
                        {
                            // Compatibility case: a Duokan <li> nested inside the aside.
                            note_content = dk.innerXHTML;
                            list_value   = dk.tag.GetAttribute("value");
                            log         += "duplicate id at <" + dk.tag.tagname + ">" + dk.tag.GetAttribute("class") + "; ";
                        }
                        else
                        {
                            note_content = frag.root.innerXHTML;
                        }
                        length = frag.originalLength;
                    }
                    else
                    {
                        Log.log("[Error]Found note but failure on parsing. id=" + note_id); return;
                    }
                    break;
                }
                m = m.NextMatch();
            }

            if (index < 0)// When the book is only adapted for Duokan and has no aside.
            {
                // Pass 2: look for an <ol> with the Duokan footnote class that contains the note id.
                m = reg_duokan.Match(text);
                while (m.Success)
                {
                    XFragment frg = new XFragment(text, m.Index);
                    if (frg.root != null)
                    {
                        if (Util.Contains(frg.root.tag.GetClassNames(), "duokan-footnote-content"))
                        {
                            var a = frg.root.GetElementById(note_id);
                            if (a != null)
                            {
                                index        = m.Index;
                                note_content = a.innerXHTML;
                                length       = frg.originalLength;
                                log         += "duokan-footnote; ";
                                break;
                            }
                        }
                    }
                    m = m.NextMatch();
                }
            }

            if (note_content == null)
            {
                Log.log("[Error]cannot find note"); return;
            }

            {
                // Strip empty anchor elements from the note content.
                Match ma = Regex.Match(note_content, "<a .*?></a>");
                if (ma.Success)
                {
                    note_content = Regex.Replace(note_content, "<a .*?></a>", "");
                    log         += "empty <a> tag;";
                }
            }
            note_content = Util.Trim(note_content);
            // Unwrap a leading <div> so only its inner markup is kept.
            if (note_content.StartsWith("<div"))
            {
                log += "<div>;";
                XFragment f = new XFragment(note_content, 0);
                // NOTE(review): `f` was just constructed, so this null check always passes.
                if (f != null)
                {
                    note_content = f.root.innerXHTML;
                }
            }
            string note_full = string.Format(template, note_id, ref_id, note_content);

            // Replace the original note markup in place with the formatted note.
            text = text.Remove(index, length);
            text = text.Insert(index, note_full);
            Log.log("[Info ]Detected id=" + note_id + ":" + log);
            Log.log("[Info ]Formated id=" + note_id + ":" + note_content);
            contain_footnote = true;
        }
Пример #22
0
    /// <summary>
    /// Converts the body of an XHTML item to a commentable text layout and writes it to
    /// "epub2comment_output/&lt;name&gt;.txt". Each paragraph starts with "##" and is followed by
    /// a blank slot (pre-filled with a matching quote pair when the most recent text run opens
    /// with one) and a separator line; img tags are copied verbatim and ruby annotations (rt)
    /// are wrapped in parentheses. Nothing is written when the body has no non-blank text.
    /// </summary>
    /// <param name="i">Item whose <c>data</c> holds the XHTML source.</param>
    static void ProcXHTML(TextItem i)
    {
        Log.log("[Info]" + i.fullName);
        string name      = Path.GetFileNameWithoutExtension(i.fullName);
        string flattened = i.data.Replace("\r", "").Replace("\n", "");
        Match  bodyMatch = Regex.Match(flattened, "<body(.*)</body>");

        if (!bodyMatch.Success)
        {
            Log.log("[Error]body?");
            return;
        }

        XFragment fragment = new XFragment(bodyMatch.Value, 0);
        string    txt      = "";
        string    counter  = "";   // text content only, used to detect an effectively empty body
        string    temp     = "";   // quote pair suggested by the most recent text run

        foreach (var part in fragment.parts)
        {
            if (part.GetType() == typeof(XText))
            {
                string trimmed = Util.Trim(part.originalText);
                txt     += trimmed;
                counter += trimmed;

                // Remember the quote style the text opens with, if any.
                char lead = trimmed.Length > 0 ? trimmed[0] : '\0';
                if (lead == '「')
                {
                    temp = "「」";
                }
                else if (lead == '『')
                {
                    temp = "『』";
                }
                else
                {
                    temp = "";
                }
            }
            else if (part.GetType() == typeof(XTag))
            {
                XTag tag    = (XTag)part;
                bool opens  = part.type == PartType.tag_start;
                bool closes = part.type == PartType.tag_end;

                switch (tag.tagname)
                {
                case "img":
                    txt += tag.originalText;
                    break;

                case "rt":
                    if (opens)
                    {
                        txt += "(";
                    }
                    else if (closes)
                    {
                        txt += ")";
                    }
                    break;

                case "p":
                    if (opens)
                    {
                        txt += "##";
                    }
                    else if (closes)
                    {
                        txt += "\r\n" + temp + "\r\n##——————\r\n";
                    }
                    break;

                case "div":
                    if (closes)
                    {
                        txt += "\r\n\r\n##——————\r\n";
                    }
                    break;
                }
            }
        }

        if (Util.Trim(counter).Length > 0)
        {
            File.WriteAllText("epub2comment_output/" + name + ".txt", txt);
        }
    }
        /// <summary>
        /// Processes one XHTML item in two steps: (1) registers the stylesheet referenced by
        /// the first <c>link</c> tag in the <c>css</c> list if it is not there yet, and
        /// (2) scans every <c>p</c> element and adds the class "ae_center" to those whose text,
        /// with tags and spaces removed, is one of the known scene-separator marks.
        /// </summary>
        /// <param name="item">Item whose <c>data</c> is inspected and rewritten in place.</param>
        void ProcXHTML(TextItem item)
        {
            XTag tag = XTag.FindTag("link", item.data);

            if (tag != null)
            {
                if (tag.GetAttribute("type").ToLower() == "text/css")
                {
                    // Resolve the stylesheet href relative to this item's path.
                    string url = tag.GetAttribute("href");
                    url = Util.ReferPath(item.fullName, url);
                    bool already = false;
                    foreach (var c in css)
                    {
                        if (c.fullName == url)
                        {
                            already = true; break;
                        }
                    }
                    if (!already)
                    {
                        TextItem i = epub.GetItem <TextItem>(url);
                        if (i != null)
                        {
                            css.Add(i);
                        }
                        else
                        {
                            Log.log("[Warn ]Cannot find CSS:" + url);
                        }
                    }
                }
                else
                {
                    Log.log("[Warn ]Cannot find CSS reference.");
                }
            }
            else
            {
                Log.log("[Warn ]Cannot find CSS reference.");
            }

            int pos = 0;

            // NOTE(review): presumably FindTag updates `pos` to the start of the tag it found - confirm.
            tag = XTag.FindTag("p", item.data, ref pos);
            int count = 0;

            while (tag != null)
            {
                XFragment p        = new XFragment(item.data, pos);
                // Reduce the paragraph to its bare text: strip tags, then spaces.
                string    onlytext = Regex.Replace(p.root.innerXHTML, "<.*?>", "");
                // NOTE(review): the two Replace arguments look identical as rendered here; presumably
                // they are distinct space characters (e.g. half- and full-width) - confirm.
                string    nospace  = onlytext.Replace(" ", "").Replace(" ", "");
                switch (nospace)
                {
                // NOTE(review): the first two labels look identical as rendered here; presumably they
                // are distinct characters (e.g. half- and full-width asterisk) - confirm.
                case "*":
                case "*":
                case "***":
                case "※":
                case "※※※":
                case "◆":
                case "◇":
                case "●":
                case "☆":
                case "⭐":
                case "×××":

                    // Separator paragraph: add the class and swap the rewritten tag into the source.
                    tag.AddClassName("ae_center");
                    item.data = item.data.Remove(pos, tag.originalText.Length);
                    item.data = item.data.Insert(pos, tag.ToString());
                    Log.log("[Info ]Detect Separator:" + p.root.innerXHTML);
                    count++;
                    break;

                default:
                    // Short paragraphs are only reported, never modified.
                    if (p.root.innerXHTML.Length < 4)
                    {
                        Log.log("[ Info]Short <p> element:" + p.root.innerXHTML);
                    }
                    break;
                }

                // Step past the current tag before searching for the next <p>.
                pos++;
                tag = XTag.FindTag("p", item.data, ref pos);
            }
            Log.log("[Info ]Added class for " + count + " elements in " + item.fullName);
        }