Ejemplo n.º 1
0
        /// <summary>
        /// Builds a cleaned-up chapter title and chapter body from raw HTML fragments.
        /// </summary>
        /// <param name="contents">
        /// Raw fragments; element 1 is parsed as the title markup and element 2 as the body markup.
        /// </param>
        /// <returns>
        /// A two-element list holding the title followed by the body, or <c>null</c> when
        /// <paramref name="contents"/> is null or has fewer than three elements.
        /// </returns>
        /// <remarks>
        /// NOTE(review): PositionOf, IsStartsWith, IsEndsWith, IsEquals, Left, Right, GetNormalized, ForEach and
        /// Replace(StringComparison, ...) are project helpers not visible here. This method assumes PositionOf
        /// returns -1 when nothing is found (the surrounding checks rely on that) - confirm against the helper.
        /// </remarks>
        List<string> ParseChapter(List<string> contents)
        {
            if (contents == null || contents.Count < 3)
            {
                return null;
            }

            // ---- title ----
            var title = UtilityService.RemoveWhitespaces(contents[1].Trim()).Replace("\r", "").Replace("\n", "").Replace("\t", "");
            var start = title.PositionOf("<h4");

            start = title.PositionOf(">", start + 1);
            var end = title.PositionOf("</h4>", start + 1);

            if (start > 0 && end > 0)
            {
                // keep only the inner markup of the <h4> element
                title = title.Substring(start + 1, end - start - 1).Trim().Replace("♦", " ");

                // drop every segment that starts with one of these attribution phrases, up to and including the next <br>
                "Đồng tác giả|Dịch giả|Người dịch|Dịch viện|Chuyển ngữ|Dịch ra|Anh dịch|Dịch thuật|Bản dịch|Hiệu đính|Biên Tập|Biên soạn|đánh máy bổ sung|Nguyên tác|Nguyên bản|Dịch theo|Dịch từ|Theo bản|Biên dịch|Tổng Hợp|Tủ Sách|Tuyển tập|Sách Xuất Bản Tại|Chủ biên|Chủ nhiệm".Split('|')
                .ForEach(excluded =>
                {
                    start = title.IndexOf(excluded, StringComparison.OrdinalIgnoreCase);
                    if (start > -1)
                    {
                        end = title.IndexOf("<br>", start, StringComparison.OrdinalIgnoreCase);
                        if (end < 0)
                        {
                            // no <br> follows: pretend one sits at the very end so the removal runs to the end of the string
                            end = title.Length - 4;
                        }
                        title = title.Remove(start, end - start + 4).Trim();
                    }
                });

                while (title.IsStartsWith("<br>"))
                {
                    title = title.Substring(4).Trim();
                }
                while (title.IsEndsWith("<br>"))
                {
                    title = title.Substring(0, title.Length - 4).Trim();
                }

                title = title.Replace("<br>", ": ").Replace("<BR>", " - ").Trim();
            }

            // alternative layout: the title pieces live in <span> elements after a <div class="hr...">
            start = title.PositionOf("<div class=\"hr");
            if (start > 0)
            {
                var tit = "";
                do
                {
                    start = title.PositionOf("<span", start + 1);
                    start = start < 0 ? -1 : title.PositionOf(">", start + 1) + 1;
                    end   = start < 0 ? -1 : title.PositionOf("</span>", start);
                    var t = start > 0 && end > 0 ? title.Substring(start, end - start).Trim() : "";
                    // skip spans that merely repeat the book title or the author
                    if (!t.Equals("") && !t.IsEquals(this.Title) && !t.IsEquals(this.Author))
                    {
                        tit += (tit != "" ? "<br>" : "") + t;
                    }
                } while (start > 0 && end > 0);
                title = tit.Replace("<br>", ": ").Replace("<BR>", " - ").Trim();
            }

            title = UtilityService.ClearTag(title, "img").Trim();
            title = UtilityService.RemoveTag(title, "br").Trim();
            title = UtilityService.RemoveTag(title, "p").Trim();
            title = UtilityService.RemoveTag(title, "i").Trim();
            title = UtilityService.RemoveTag(title, "b").Trim();
            title = UtilityService.RemoveTag(title, "em").Trim();
            title = UtilityService.RemoveTag(title, "strong").Trim();

            // Contains is ordinal, just like string.Replace, so the loop condition and the replacement
            // always agree (a culture-sensitive IndexOf could report a match Replace never removes).
            while (title.Contains("  "))
            {
                title = title.Replace("  ", " ");
            }
            while (title.Contains("- -"))
            {
                title = title.Replace("- -", "-");
            }
            while (title.Contains(": -"))
            {
                title = title.Replace(": -", ":");
            }

            title = title.Trim().Replace("( ", "(").Replace(" )", ")").Replace("- (", "(").Replace(": :", ":").GetNormalized();

            // strip dangling brackets and colons left over at either end
            while (title.StartsWith(")") || title.StartsWith("]"))
            {
                title = title.Right(title.Length - 1).Trim();
            }
            while (title.EndsWith("(") || title.EndsWith("["))
            {
                title = title.Left(title.Length - 1).Trim();
            }

            while (title.StartsWith(":"))
            {
                title = title.Right(title.Length - 1).Trim();
            }
            while (title.EndsWith(":"))
            {
                title = title.Left(title.Length - 1).Trim();
            }

            // an all-caps title is lower-cased and normalized again
            if (title.Equals(title.ToUpper()))
            {
                title = title.ToLower().GetNormalized();
            }

            // ---- body ----
            var body = UtilityService.RemoveWhitespaces(contents[2].Trim()).Replace(StringComparison.OrdinalIgnoreCase, "\r", "").Replace(StringComparison.OrdinalIgnoreCase, "\n", "").Replace(StringComparison.OrdinalIgnoreCase, "\t", "");

            body = UtilityService.RemoveTagAttributes(body, "p");
            body = UtilityService.RemoveTagAttributes(body, "div");

            body = UtilityService.ClearTag(body, "script");
            body = UtilityService.ClearComments(body);
            body = UtilityService.RemoveMsOfficeTags(body);

            body = body.Replace(StringComparison.OrdinalIgnoreCase, "<div></div>", "</p><p>").Trim();
            if (body.IsStartsWith("<div") && !body.IsEndsWith("</div>"))
            {
                // unbalanced leading <div ...>: drop it and wrap the remainder in a paragraph
                body = body.Remove(0, body.IndexOf(">") + 1);
                body = "<p>" + body + "</p>";
            }

            while (body.IsStartsWith("<div>"))
            {
                body = body.Substring(5).Trim();
            }
            while (body.IsEndsWith("</div>"))
            {
                body = body.Substring(0, body.Length - 6).Trim();
            }

            // remove <?xml ...> processing instructions
            start = body.PositionOf("<?xml");
            while (start > -1)
            {
                end = body.PositionOf(">", start);
                if (end < 0)
                {
                    // unterminated instruction: nothing well-defined to cut, so stop instead of throwing
                    break;
                }
                body  = body.Remove(start, end - start + 1);
                start = body.PositionOf("<?xml");
            }

            // normalize the casing of these tags to lower case
            "strong|em|p|img".Split('|')
            .ForEach(tag => body = body.Replace(StringComparison.OrdinalIgnoreCase, "<" + tag, "<" + tag).Replace(StringComparison.OrdinalIgnoreCase, "</" + tag + ">", "</" + tag + ">"));

            body = body.Replace(StringComparison.OrdinalIgnoreCase, "<DIV class=\"truyen_text\"></DIV></STRONG>", "</STRONG>\n<p>");
            body = body.Replace(StringComparison.OrdinalIgnoreCase, "<DIV class=\"truyen_text\"></DIV></EM>", "</EM>\n<p>");

            var headingTags = "h1|h2|h3|h4|h5|h6".Split('|');

            headingTags.ForEach(tag =>
            {
                body = body.Replace(StringComparison.OrdinalIgnoreCase, "<" + tag + "><div class=\"truyen_text\"></div>", "<" + tag + "> ").Replace(StringComparison.OrdinalIgnoreCase, "<" + tag + "><div class=\"truyen_text\"> </div>", "<" + tag + ">");
                body = UtilityService.RemoveTagAttributes(body, tag);
            });

            // turn the site's <div> based layout into paragraphs
            body = body.Replace(StringComparison.OrdinalIgnoreCase, "<div class=\"truyen_text\"></div>", "</p><p>").Replace(StringComparison.OrdinalIgnoreCase, "<div class=\"truyen_text\"> </div>", "</p><p>");
            body = body.Replace(StringComparison.OrdinalIgnoreCase, "<div class=\"truyen_text\">", "<p>").Replace(StringComparison.OrdinalIgnoreCase, "<div", "<p").Replace(StringComparison.OrdinalIgnoreCase, "</div>", "</p>");
            body = body.Replace(StringComparison.OrdinalIgnoreCase, "</li></p>", "</li>").Replace(StringComparison.OrdinalIgnoreCase, "<p><li>", "<li>");
            body = body.Replace(StringComparison.OrdinalIgnoreCase, "<p></ul></p>", "</ul>").Replace(StringComparison.OrdinalIgnoreCase, "<p></ol></p>", "</ol>");

            body = body.Replace(StringComparison.OrdinalIgnoreCase, "<i class=\"calibre7\"", "<i").Replace(StringComparison.OrdinalIgnoreCase, "<img class=\"calibre1\"", "<img").Replace(StringComparison.OrdinalIgnoreCase, "<b class=\"calibre4\"", "<b");
            body = body.Replace(StringComparison.OrdinalIgnoreCase, "<p> <b>", "<p><b>").Replace(StringComparison.OrdinalIgnoreCase, ". </b>", ".</b> ").Replace(StringComparison.OrdinalIgnoreCase, ". </i>", ".</i> ");
            body = body.Replace(StringComparison.OrdinalIgnoreCase, "<p align=\"center\"> <", "<p align=\"center\"><").Replace(StringComparison.OrdinalIgnoreCase, "<p> <", "<p><").Replace(StringComparison.OrdinalIgnoreCase, "<p> ", "<p>");
            body = body.Replace(StringComparison.OrdinalIgnoreCase, "<p><p>", "<p>").Replace(StringComparison.OrdinalIgnoreCase, "</p></p>", "</p>").Replace(StringComparison.OrdinalIgnoreCase, ". </p> ", ".</p>");

            headingTags.ForEach(tag =>
            {
                body = body.Replace(StringComparison.OrdinalIgnoreCase, "<" + tag + "> <", "<" + tag + "><").Replace(StringComparison.OrdinalIgnoreCase, "> </" + tag + ">", "></" + tag + ">");
                body = body.Replace(StringComparison.OrdinalIgnoreCase, "<" + tag + "></" + tag + ">", "").Replace(StringComparison.OrdinalIgnoreCase, "<" + tag + "> </" + tag + ">", "");
                body = body.Replace(StringComparison.OrdinalIgnoreCase, "<" + tag + "></p>", "<" + tag + ">").Replace(StringComparison.OrdinalIgnoreCase, "<p></" + tag + ">", "</" + tag + ">");
                body = body.Replace(StringComparison.OrdinalIgnoreCase, "<" + tag + "><strong>", "<" + tag + ">").Replace(StringComparison.OrdinalIgnoreCase, "</strong></" + tag + ">", "</" + tag + ">");
                body = body.Replace(StringComparison.OrdinalIgnoreCase, "<" + tag + "><em>", "<" + tag + ">").Replace(StringComparison.OrdinalIgnoreCase, "</em></" + tag + ">", "</" + tag + ">");
                body = body.Replace(StringComparison.OrdinalIgnoreCase, "<p><" + tag + ">", "<" + tag + ">").Replace(StringComparison.OrdinalIgnoreCase, "</" + tag + "></p>", "</" + tag + ">").Replace(StringComparison.OrdinalIgnoreCase, "<" + tag + "></p>", "");
            });

            // strip any nested markup from the inside of every heading
            headingTags.ForEach(tag =>
            {
                var openTag  = "<" + tag + ">";
                var closeTag = "</" + tag + ">";

                body  = body.Replace(StringComparison.OrdinalIgnoreCase, "<p>" + openTag, openTag).Replace(StringComparison.OrdinalIgnoreCase, closeTag + "</p>", closeTag);
                start = body.PositionOf(openTag);
                while (start > -1)
                {
                    end = body.PositionOf(closeTag, start + 1);
                    if (end < 0)
                    {
                        // heading is never closed: leave the rest of the body untouched instead of throwing
                        break;
                    }

                    var heading = body.Substring(start + openTag.Length, end - start - openTag.Length);
                    body        = body.Remove(start, end - start + closeTag.Length);

                    var pos = heading.PositionOf("<");
                    while (pos > -1)
                    {
                        end = heading.PositionOf(">", pos);
                        if (end < 0)
                        {
                            // stray '<' with no closing '>': stop, otherwise the same position is found forever
                            break;
                        }
                        heading = heading.Remove(pos, end - pos + 1);
                        pos     = heading.PositionOf("<");
                    }

                    body  = body.Insert(start, openTag + heading + closeTag);
                    start = body.PositionOf(openTag, start + 1);
                }
            });

            // replace <p id="chuhoain" ...> ... </span><p> blocks by the character their image stands for
            start = body.PositionOf("<p id=\"chuhoain\"");
            while (start > -1)
            {
                end = body.PositionOf("</span><p>", start);
                var img = body.PositionOf("<img", start);
                if (end > start && img > start)
                {
                    // test the raw position BEFORE adding the attribute length, so the single-quote fallback is reachable
                    var quote = "\"";
                    var src   = body.PositionOf("src=\"", img);
                    if (src < 0)
                    {
                        quote = "'";
                        src   = body.PositionOf("src='", img);
                    }

                    if (src > -1)
                    {
                        var imgStart = src + 5;
                        var imgEnd   = body.PositionOf(quote, imgStart);
                        if (imgEnd > -1)
                        {
                            var imgChar = body.Substring(imgStart, imgEnd - imgStart);
                            // 10 == "</span><p>".Length
                            body = body.Remove(start, end - start + 10);
                            body = body.Insert(start, "<p>" + this.GetImageCharacter(imgChar));
                        }
                    }
                }
                start = body.PositionOf("<p id=\"chuhoain\"", start + 1);
            }

            // replace the remaining character images (the "cotich" image set) by their character
            start = body.PositionOf("<img");
            while (start > -1)
            {
                end = body.PositionOf(">", start + 1);

                var quote = "\"";
                var img   = body.PositionOf("src=\"https://vnthuquan.net/userfiles/images/chu%20cai/cotich", start);
                if (img < 0)
                {
                    quote = "'";
                    img   = body.PositionOf("src='https://vnthuquan.net/userfiles/images/chu%20cai/cotich", start);
                }

                // the src attribute must sit inside the current <img ...> tag
                if (img > -1 && end > img)
                {
                    // close the value with the same quote character that opened it
                    end = body.PositionOf(quote, img + 5);
                    if (end > -1)
                    {
                        // the value starts 5 characters after "src=" + quote, so its length is end - (img + 5)
                        var imgChar = body.Substring(img + 5, end - img - 5);

                        end = body.PositionOf("<p>", start);
                        if (end < 0)
                        {
                            end = body.PositionOf(">", start) + 1;
                        }
                        else
                        {
                            end += 3;
                        }

                        body = body.Remove(start, end - start);
                        body = body.Insert(start, this.GetImageCharacter(imgChar));
                    }
                }

                start = body.PositionOf("<img", start + 1);
            }

            if (body.Equals("</p><p>"))
            {
                // nothing but an empty paragraph break is left
                body = "";
            }
            else
            {
                body = this.NormalizeChapterBody(body);
                body = body.Replace(StringComparison.OrdinalIgnoreCase, "<h1>", "<h2>").Replace(StringComparison.OrdinalIgnoreCase, "</h1>", "</h2>");
            }

            return new List<string> {
                title, body
            };
        }