示例#1
0
        /// <summary>
        /// Collects the outermost descendants of <paramref name="node"/> for which
        /// <c>ContainsClassName(className)</c> is true.
        /// </summary>
        /// <remarks>
        /// A child for which <c>ContainsClassName("vcard")</c> is true is skipped together with its
        /// whole subtree. A matching node is added as-is; its own descendants are not searched.
        /// </remarks>
        /// <param name="node">Node whose subtree is searched.</param>
        /// <param name="className">Class name to look for.</param>
        /// <returns>The matching nodes in depth-first child order; empty when nothing matches.</returns>
        public static HtmlNodeCollection SelectNodesFromClass(this HtmlNode node, string className)
        {
            var matches = new HtmlNodeCollection(node);

            foreach (HtmlNode child in node.ChildNodes)
            {
                // Embedded cards are opaque: neither they nor their content are considered.
                if (child.ContainsClassName("vcard"))
                {
                    continue;
                }

                if (child.ContainsClassName(className))
                {
                    matches.Add(child);
                    continue;
                }

                // No match at this level, so descend and merge whatever the subtree yields.
                foreach (HtmlNode nested in SelectNodesFromClass(child, className))
                {
                    matches.Add(nested);
                }
            }

            return matches;
        }
示例#2
0
        /// <summary>
        /// Replaces the ruby markup inside <paramref name="node"/> with its base text and records
        /// the resulting line, its kanji and its furigana as footnote paragraphs in <c>_footnotes</c>.
        /// </summary>
        /// <param name="node">Node whose inner HTML may contain a <c>&lt;ruby&gt;</c> section.</param>
        /// <returns>
        /// The node's inner HTML with the ruby section replaced by the concatenated <c>&lt;rb&gt;</c>
        /// contents, or the unchanged inner HTML when the node contains no ruby section.
        /// </returns>
        private string Furigana(HtmlNode node)
        {
            // The footnote list always starts with a heading paragraph.
            if (_footnotes.Count == 0)
            {
                _footnotes.Add(HtmlNode.CreateNode("<p id=\"Lf0\">================Footnotes================</p>"));
            }

            var html = node.InnerHtml;

            // Greedy match: spans from the first <ruby> to the last </ruby> found by the pattern.
            var rubyMatch = Regex.Match(html, "<ruby>(.*)</ruby>");
            var sub       = rubyMatch.Success ? rubyMatch.Value : "";

            // Group 1 always exists for these patterns, so no per-match group-count filter is needed.
            var kanji    = string.Concat(Regex.Matches(sub, "<rb>(.*?)</rb>").Select(m => m.Groups[1].Value));
            var furigana = string.Concat(Regex.Matches(sub, "<rt>(.*?)</rt>").Select(m => m.Groups[1].Value));

            // string.Replace throws ArgumentException for an empty search string, so a node
            // without ruby markup is passed through unchanged instead of crashing.
            var line = rubyMatch.Success ? html.Replace(sub, kanji) : html;

            // Four paragraphs per node: line (L), kanji (K), furigana (F) and an empty spacer (E).
            _footnotes.Add(HtmlNode.CreateNode($"<p id=\"{node.Id}L\">{line}</p>"));
            _footnotes.Add(HtmlNode.CreateNode($"<p id=\"{node.Id}K\">{kanji}</p>"));
            _footnotes.Add(HtmlNode.CreateNode($"<p id=\"{node.Id}F\">{furigana}</p>"));
            _footnotes.Add(HtmlNode.CreateNode($"<p id=\"{node.Id}E\"></p>"));

            return line;
        }
示例#3
0
        /// <summary>
        /// Console demo of the node-insertion methods: <c>InsertAfter</c>, <c>InsertBefore</c>,
        /// <c>PrependChild</c> and <c>PrependChildren</c>. The body is printed through
        /// <c>DisplayNode</c> before the first change and after every change.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            var html =
                @"<body>
                        <h1>This is <b>bold</b> heading</h1>
                        <p>This is <u>underlined</u> paragraph</p>
                    </body>";

            var htmlDoc = new HtmlDocument();

            htmlDoc.LoadHtml(html);

            var htmlBody = htmlDoc.DocumentNode.SelectSingleNode("//body");

            // Reference node for the relative insertions below.
            // NOTE(review): presumably index 0 is the whitespace text node that follows <body>,
            // which makes index 1 the h1 element - confirm.
            HtmlNode refChild = htmlBody.ChildNodes[1];

            DisplayNode(htmlBody);

            // 1) Insert a paragraph right after the reference node.
            HtmlNode insertedAfter = HtmlNode.CreateNode("<p> This is inserted after node paragraph");

            htmlBody.InsertAfter(insertedAfter, refChild);

            Console.WriteLine("\n ******** Node inserted after first child ***************\n");

            DisplayNode(htmlBody);

            // 2) Insert a heading right before the reference node.
            HtmlNode insertedBefore = HtmlNode.CreateNode("<h1> This is inserted before node heading</h1>");

            htmlBody.InsertBefore(insertedBefore, refChild);

            // The banner used to read "inserted after second child", which mislabelled the
            // InsertBefore call above.
            Console.WriteLine("\n ************ Node inserted before reference child ********************\n");

            DisplayNode(htmlBody);

            // 3) Put a single node in front of all existing children.
            Console.WriteLine("\n *************** Node inserted in the beginning ******************\n");

            HtmlNode prepended = HtmlNode.CreateNode("<h1> This is added at the beginning</h1>");

            htmlBody.PrependChild(prepended);

            DisplayNode(htmlBody);

            // 4) Put a whole collection of nodes in front of the existing children.
            Console.WriteLine("\n ****************** Prepend Children method ********************\n");

            HtmlNode headingNode   = HtmlNode.CreateNode("<h1>This is new heading</h1>");
            HtmlNode paragraphNode = HtmlNode.CreateNode("<p>This is new paragraph 1</p>");

            HtmlNodeCollection newChildren = new HtmlNodeCollection(htmlBody);

            newChildren.Add(headingNode);
            newChildren.Add(paragraphNode);

            htmlBody.PrependChildren(newChildren);

            DisplayNode(htmlBody);

            // Keep the console window open until the user presses Enter.
            Console.ReadLine();
        }
        /// <summary>
        /// Checks that adding an element to a collection returns index 0 for the first entry
        /// and sets the element's parent to the collection's owner.
        /// </summary>
        public void AddTest()
        {
            var owner      = new HtmlElement("root");
            var collection = new HtmlNodeCollection(owner);
            var added      = new HtmlElement("child");

            int position = collection.Add(added);

            Assert.AreEqual(owner, added.Parent);
            Assert.AreEqual(position, 0);

            // NOTE(review): presumably adding null is expected to throw; an expected-exception
            // attribute, if any, would sit outside the visible part of this method - confirm.
            collection.Add(null);
        }
示例#5
0
        /// <summary>
        /// Builds the "Feat. ..." node list for the participants of <paramref name="rp"/>.
        /// </summary>
        /// <remarks>
        /// The list starts with a "Feat. " text node. Each participant contributes a
        /// <c>span</c> (classed with the participant's group description and containing the
        /// participant's name) followed by a " - " text node; the separator is emitted after
        /// every participant, the last one included.
        /// </remarks>
        /// <param name="doc">Document used to create the new nodes.</param>
        /// <param name="parent">Parent node handed to the returned collection.</param>
        /// <param name="rp">Source of the participants (<c>Partenaires</c>).</param>
        /// <returns>The generated nodes.</returns>
        private HtmlNodeCollection GenerateParticipantsNodes(HtmlDocument doc, HtmlNode parent, Rp rp)
        {
            var nodes = new HtmlNodeCollection(parent);
            nodes.Add(doc.CreateTextNode("Feat. "));

            foreach (var partner in rp.Partenaires)
            {
                var span = doc.CreateElement("span");
                span.AddClass(partner.Groupe.GetDescription());
                span.AppendChild(doc.CreateTextNode(partner.Nom));

                nodes.Add(span);
                nodes.Add(doc.CreateTextNode(" - "));
            }

            return nodes;
        }
示例#6
0
        /// <summary>
        /// Parses <paramref name="content"/> as HTML and returns every <paramref name="htmlTag"/>
        /// element whose <paramref name="attr"/> attribute contains <paramref name="attrValue"/>
        /// (ordinal, case-insensitive substring match).
        /// </summary>
        /// <param name="content">HTML text to parse; <c>null</c> yields <c>null</c>.</param>
        /// <param name="htmlTag">Element name, used in the XPath <c>//htmlTag</c>.</param>
        /// <param name="attr">Name of the attribute to inspect.</param>
        /// <param name="attrValue">Text that the attribute value must contain.</param>
        /// <returns>
        /// <c>null</c> when <paramref name="content"/> is <c>null</c> or the document has no root
        /// node; otherwise the matching nodes (possibly an empty collection).
        /// </returns>
        public static HtmlNodeCollection GetNodesWithTagAndAttribute(string content, string htmlTag, string attr, string attrValue)
        {
            // Must run before parsing: LoadHtml rejects a null string with an exception, which
            // made this guard unreachable when it was placed after the LoadHtml call.
            if (content == null)
            {
                return null;
            }

            var doc = new HtmlDocument();

            doc.LoadHtml(content);
            if (doc.DocumentNode == null)
            {
                return null;
            }

            var result    = new HtmlNodeCollection(doc.DocumentNode);
            var htmlNodes = doc.DocumentNode.SelectNodes("//" + htmlTag);

            // SelectNodes yields null rather than an empty collection when nothing matches.
            if (htmlNodes == null)
            {
                return result;
            }

            foreach (var node in htmlNodes)
            {
                // A missing attribute gives a null value, which never counts as a match.
                var attribute = node.Attributes[attr]?.Value;
                if (attribute?.IndexOf(attrValue, StringComparison.OrdinalIgnoreCase) >= 0)
                {
                    result.Add(node);
                }
            }

            return result;
        }
示例#7
0
        /// <summary>
        /// Creates a tokenizer over the given HTML text and tokenizes it immediately.
        /// </summary>
        /// <param name="text">HTML text to tokenize; must not be <c>null</c>.</param>
        /// <param name="stemmer">Stemmer stored in <c>mStemmer</c>.</param>
        /// <param name="decodeTextBlocks">Stored in <c>mDecodeTextBlocks</c>.</param>
        /// <param name="tokenizeTextBlocks">
        /// When <c>true</c>, a <c>RegexTokenizer</c> built from the word and number patterns is
        /// passed to <c>CreateTokens</c>; otherwise <c>null</c> is passed.
        /// </param>
        /// <param name="applySkipRules">Stored in <c>mApplySkipRules</c>.</param>
        public HtmlTokenizer(string text, IStemmer stemmer, bool decodeTextBlocks, bool tokenizeTextBlocks, bool applySkipRules)
        {
            Utils.ThrowException(text == null ? new ArgumentNullException("text") : null);

            mText               = text;
            mStemmer            = stemmer;
            mDecodeTextBlocks   = decodeTextBlocks;
            mTokenizeTextBlocks = tokenizeTextBlocks;
            mApplySkipRules     = applySkipRules;

            // Parse the text with the tokenizer's own document configuration.
            var document = new HtmlDocument();
            Configure(document);
            document.LoadHtml(text);

            // Token creation starts from a one-element list holding the document root.
            var roots = new HtmlNodeCollection(/*parentNode=*/ null);
            roots.Add(document.DocumentNode);

            // The text-block tokenizer is only needed when text blocks get tokenized.
            RegexTokenizer blockTokenizer = mTokenizeTextBlocks
                ? new RegexTokenizer
                  {
                      TokenRegex          = string.Format("({0})|({1})", mWordRegexStr, mNumberRegexStr),
                      IgnoreUnknownTokens = true
                  }
                : null;

            CreateTokens(roots, blockTokenizer);
        }
        /// <summary>
        /// Checks <c>FindByAttributeName</c>: the single-argument and <c>false</c> overloads count
        /// only the collection's own elements, while passing <c>true</c> also counts a nested element.
        /// </summary>
        public void FindByAttributeNameTest()
        {
            var owner  = new HtmlElement("root");
            var target = new HtmlNodeCollection(owner);
            target.Add(new HtmlElement("first"));
            target.Add(new HtmlElement("second"));
            target.Add(new HtmlElement("third"));

            var first  = (HtmlElement)target[0];
            var second = (HtmlElement)target[1];
            var third  = (HtmlElement)target[2];

            first.Nodes.Add(new HtmlElement("secondchild"));

            // "firstattribute" sits on two of the three top-level elements.
            second.Attributes.Add(new HtmlAttribute("firstattribute"));
            second.Attributes.Add(new HtmlAttribute("secondattribute"));
            third.Attributes.Add(new HtmlAttribute("firstattribute"));

            Assert.AreEqual(target.FindByAttributeName("firstattribute").Count, 2);

            // A third occurrence on the nested element is only counted when the flag is true.
            var nested = (HtmlElement)first.Nodes[0];
            nested.Attributes.Add(new HtmlAttribute("firstattribute"));

            Assert.AreEqual(target.FindByAttributeName("firstattribute", false).Count, 2);
            Assert.AreEqual(target.FindByAttributeName("firstattribute", true).Count, 3);
        }
示例#9
0
        /// <summary>
        /// Parses the course table out of <c>course_result</c> and stores every course that is
        /// not yet recorded for the student <c>stuid</c>.
        /// </summary>
        /// <remarks>
        /// Does nothing when the page contains no <c>table listTable</c> table. Rows with too few
        /// child nodes to fill a <c>Course</c> are skipped instead of failing the whole import.
        /// </remarks>
        public void ProcessCourse()
        {
            // Highest child-node index read from a row below.
            const int lastCellIndex = 21;

            var htmlDoc = new HtmlDocument();

            htmlDoc.LoadHtml(course_result);
            HtmlNode table = htmlDoc.DocumentNode.SelectSingleNode("//table[@class='table listTable']");

            if (table == null)
            {
                // No course table on the page: nothing to import.
                return;
            }

            // The first tr node is not a course entry, so it is skipped.
            var courseRows = table.ChildNodes.Where(n => n.Name == "tr").Skip(1);

            foreach (var row in courseRows)
            {
                // One entry per child node of the row, with all whitespace stripped.
                // (Original note: each row holds 27 child nodes.)
                var itemlist = row.ChildNodes
                               .Select(td => td.InnerText.Replace("\r", "").Replace("\n", "").Replace("\t", "").Replace(" ", ""))
                               .ToList();

                if (itemlist.Count <= lastCellIndex)
                {
                    continue;
                }

                using (var context = new jwContext())
                {
                    // Copied to a local first: using itemlist[1] directly inside the LINQ expression raises an error.
                    var thisLessonNum = itemlist[1];
                    var course        = context.Courses.SingleOrDefault(c => c.StuID == stuid && c.LessonNum == thisLessonNum);

                    // Only insert when the table does not contain this record yet.
                    if (course == null)
                    {
                        var newcourse = new Course
                        {
                            StuID           = stuid,
                            LessonNum       = thisLessonNum,
                            LessonName      = itemlist[3],
                            LessonType      = itemlist[5],
                            LearninType     = itemlist[7],
                            TeachingCollege = itemlist[9],
                            Teacher         = itemlist[11],
                            Specialty       = itemlist[13],
                            Credit          = itemlist[15],
                            LessonHours     = itemlist[17],
                            Time            = itemlist[19],
                            Note            = itemlist[21]
                        };
                        context.Courses.Add(newcourse);
                        context.SaveChanges();
                    }
                }
            }
        }
 /// <summary>
 /// Appends one <c>ul</c> node per test method to <paramref name="htmlNode"/>, showing the
 /// colored result followed by the method name.
 /// </summary>
 /// <param name="methodList">Test methods to render.</param>
 /// <param name="htmlNode">Collection that receives the generated nodes.</param>
 private void GetMethods(IEnumerable <Test> methodList, HtmlNodeCollection htmlNode)
 {
     foreach (Test test in methodList)
     {
         // Single-item list: result marker first, then the method name on its own line.
         string markup = $"<ul><li>{CreateColoredResult(test.Result)}<b>Method Name</b> <br> {test.MethodName}</li></ul>";

         htmlNode.Add(HtmlNode.CreateNode(markup));
     }
 }
示例#11
0
        /// <summary>
        /// Rebuilds every child of <c>mainTableNode</c>: its existing children are removed and
        /// replaced by the dynamic columns followed by the new static columns of the section at
        /// the same index in <paramref name="outlist"/>.
        /// </summary>
        /// <param name="outlist">Column sections, indexed in parallel with the children of <c>mainTableNode</c>.</param>
        private void FinalizeHtmlDocument(IList <ArelleColumnSection> outlist)
        {
            var table = this.mainTableNode;

            for (int index = 0; index < table.ChildNodes.Count; index++)
            {
                var rowNode = table.ChildNodes[index];

                rowNode.RemoveAllChildren();

                // Collect the replacement cells: dynamic columns first, then the new static ones.
                var cells = new HtmlNodeCollection(rowNode);

                foreach (var column in outlist[index].DynamicColumns)
                {
                    cells.Add(column);
                }

                foreach (var column in outlist[index].NewStaticColumns)
                {
                    cells.Add(column);
                }

                rowNode.AppendChildren(cells);
            }
        }
        /// <summary>
        /// Evaluates <paramref name="xpath"/> relative to <paramref name="node"/> and returns the
        /// matching nodes.
        /// </summary>
        /// <param name="node">Context node of the query.</param>
        /// <param name="xpath">XPath expression to evaluate.</param>
        /// <returns>The matching nodes; an empty collection (never <c>null</c>) when nothing matches.</returns>
        public static HtmlNodeCollection SelectNodesEx(this HtmlNode node, string xpath)
        {
            var found  = new HtmlNodeCollection(null);
            var cursor = new HtmlNodeNavigator(node.OwnerDocument, node).Select(xpath);

            while (cursor.MoveNext())
            {
                // Each iterator position is a navigator positioned on the matched node.
                found.Add(((HtmlNodeNavigator)cursor.Current).CurrentNode);
            }

            return found;
        }
        /// <summary>
        /// Returns the direct children of <paramref name="node"/> whose node type is
        /// <c>HtmlNodeType.Element</c>.
        /// </summary>
        /// <param name="node">Node whose children are filtered.</param>
        /// <returns>The element children in child order; empty when there are none.</returns>
        public static HtmlNodeCollection ChildElements(this HtmlNode node)
        {
            var children = node.ChildNodes;
            var elements = new HtmlNodeCollection(node);

            foreach (var candidate in children)
            {
                // Children of any other node type are left out.
                if (candidate.NodeType != HtmlNodeType.Element)
                {
                    continue;
                }

                elements.Add(candidate);
            }

            return elements;
        }
示例#14
0
        /// <summary>
        /// Verifies that a scan sends the article to the pipeline once per header level when the
        /// extracted body text contains a keyword.
        /// </summary>
        public void Scan_sends_article_for_analysis_if_keyword_found()
        {
            // Arrange: a single header node that the spider returns for every header size.
            var header = new HtmlNode(HtmlNodeType.Element, new HtmlDocument(), 0);
            header.InnerHtml = "Some valid article header";

            var headerNodes = new HtmlNodeCollection(null);
            headerNodes.Add(header);

            var logger = new Mock <ILogger <Tracker> >();

            var spider = new Mock <ISpider>();
            spider.Setup(s => s.LoadPage(It.IsAny <string>()));
            spider.Setup(s => s.GetHeadersOfSize(It.IsAny <HtmlDocument>(), It.IsAny <int>())).Returns(headerNodes);
            spider.Setup(s => s.DownloadArticleByHeader(It.IsAny <string>(), It.IsAny <HtmlNode>())).Returns(("", new HtmlDocument()));

            var pipeline = new Mock <IPipeline>();
            pipeline.Setup(p => p.SendForAnalysis(It.IsAny <Article>()));

            var validator = new Mock <IValidator>();
            validator.Setup(v => v.ConsideredArticleHeader(It.IsAny <string>())).Returns(true);

            var bodyText = @"this body text was extracted from an html document 
            and it contains the default keyword Sverige which we know is a keyword since no env var KEYWORDS was set";

            var extractor = new Mock <IExtractor>();
            extractor.Setup(e => e.ExtractBodyTextFromArticleDocument(It.IsAny <HtmlDocument>())).Returns(bodyText);

            var tracker = new Tracker(pipeline.Object, spider.Object, logger.Object,
                                      validator.Object, extractor.Object);

            var baseUrl = "http://madeupnews.com";

            // Act
            tracker.Scan(baseUrl, new List <Article>());

            // Assert: the scan recurses over four header levels (h1, h2, h3 and h4) and every
            // level returns the one header prepared above.
            var expectedCalls = 4;

            spider.Verify(s => s.LoadPage(baseUrl), Times.Exactly(expectedCalls));
            spider.Verify(s => s.GetHeadersOfSize(It.IsAny <HtmlDocument>(), It.IsAny <int>()), Times.Exactly(expectedCalls));
            spider.Verify(s => s.DownloadArticleByHeader(It.IsAny <string>(), It.IsAny <HtmlNode>()), Times.Exactly(expectedCalls));
            extractor.Verify(e => e.ExtractBodyTextFromArticleDocument(It.IsAny <HtmlDocument>()), Times.Exactly(expectedCalls));
            pipeline.Verify(p => p.SendForAnalysis(It.IsAny <Article>()), Times.Exactly(expectedCalls));
        }
示例#15
0
            /// <summary>
            /// Renders the login page with its alert box made visible and showing a
            /// sign-up success message.
            /// </summary>
            /// <returns>The outer HTML of the modified login document.</returns>
            public static string SuccessSignup()
            {
                var page = new HtmlDocument();
                page.LoadHtml(Resources.header + Resources.login);

                // First element whose class list contains the whole word "alert".
                HtmlNode alertBox = page.DocumentNode.SelectSingleNode("//*[contains(concat(\" \", normalize-space(@class), \" \"), \" alert \")]");

                alertBox.AddClass("alert-success in");
                alertBox.RemoveClass("hidden");

                // The message is appended through a one-element collection owned by the alert.
                var content = new HtmlNodeCollection(alertBox);
                content.Add(HtmlNode.CreateNode("<p><strong>Success!</strong> Please login below</p>"));
                alertBox.AppendChildren(content);

                return page.DocumentNode.OuterHtml;
            }
        /// <summary>
        /// Downloads https://ncov.moh.gov.vn/ and re-renders the statistics table found there as
        /// a <c>table table-bordered</c> HTML table with a running number and a death-rate column.
        /// </summary>
        /// <returns>
        /// The outer HTML of the generated table (header row only when the request does not
        /// succeed), or an empty string when anything goes wrong while fetching or parsing.
        /// </returns>
        public static async Task <string> GetVietnamCoronaData()
        {
            try
            {
                HtmlNode coronaTbl = HtmlNode.CreateNode("<table></table>");
                coronaTbl.Attributes.Add("class", "table table-bordered");

                HtmlNode coronathead = HtmlNode.CreateNode("<thead><tr><th>No</th><th>Tỉnh/Thành phố</th><th>Số ca mắc</th><th>Phục hồi</th><th>Tử vong</th><th>Tỉ lệ tử vong</th></tr></thead>");
                coronaTbl.AppendChild(coronathead);

                HtmlNode coronatbody = HtmlNode.CreateNode("<tbody></tbody>");

                // The response is disposed as soon as its content has been consumed.
                using (HttpResponseMessage response = await client.GetAsync("https://ncov.moh.gov.vn/"))
                {
                    if (response.IsSuccessStatusCode)
                    {
                        byte[] bytecontentArr = await response.Content.ReadAsByteArrayAsync();

                        string htmlpage = Unzip(bytecontentArr);
                        var    htmlDoc  = new HtmlDocument();
                        htmlDoc.LoadHtml(htmlpage);

                        HtmlNode           dataTbl = htmlDoc.DocumentNode.SelectSingleNode("/html/body/div[1]/div/div/div/div/div[2]/div/div/section[2]/div/div[1]/table");
                        HtmlNodeCollection trColl  = dataTbl?.SelectNodes("tbody/tr");

                        if (trColl == null)
                        {
                            // Table or rows not found: same outcome as any other parsing failure.
                            return "";
                        }

                        int count = 1;
                        foreach (HtmlNode tr in trColl)
                        {
                            HtmlNodeCollection tdData = tr.SelectNodes("td");
                            HtmlNode           newTr  = HtmlNode.CreateNode("<tr></tr>");
                            HtmlNodeCollection tdColl = new HtmlNodeCollection(newTr);

                            // Source cells 0, 1, 3 and 4 are read; cell 2 is not used.
                            string tinh    = tdData[0].InnerText;
                            int    soCaMac = int.Parse(tdData[1].InnerText);
                            int    phuchoi = int.Parse(tdData[3].InnerText);
                            int    tuvong  = int.Parse(tdData[4].InnerText);

                            // Without cases the floating-point division would yield NaN (or infinity),
                            // which would be rendered as "NaN%"; report 0 instead.
                            double rate = soCaMac == 0 ? 0 : Math.Round(1.0 * tuvong / soCaMac * 100, 2);

                            tdColl.Add(HtmlNode.CreateNode($"<td>{count++}</td>"));
                            tdColl.Add(HtmlNode.CreateNode($"<td>{tinh}</td>"));
                            tdColl.Add(HtmlNode.CreateNode($"<td>{soCaMac}</td>"));
                            tdColl.Add(HtmlNode.CreateNode($"<td>{phuchoi}</td>"));
                            tdColl.Add(HtmlNode.CreateNode($"<td>{tuvong}</td>"));
                            tdColl.Add(HtmlNode.CreateNode($"<td>{rate}%</td>"));
                            newTr.AppendChildren(tdColl);
                            coronatbody.AppendChild(newTr);
                        }
                        coronaTbl.AppendChild(coronatbody);
                    }
                }

                return coronaTbl.OuterHtml;
            }
            catch (Exception)
            {
                // Existing best-effort contract: every failure is reported as an empty string.
                return "";
            }
        }
示例#17
0
            /// <summary>
            /// Renders the login page with its alert box made visible and showing a
            /// failed-login message that links to the password reset dialog.
            /// </summary>
            /// <returns>The outer HTML of the modified login document.</returns>
            public static string FailLogin()
            {
                var page = new HtmlDocument();
                page.LoadHtml(Resources.header + Resources.login);

                // First element whose class list contains the whole word "alert".
                HtmlNode alertBox = page.DocumentNode.SelectSingleNode("//*[contains(concat(\" \", normalize-space(@class), \" \"), \" alert \")]");

                alertBox.AddClass("alert-danger in");
                alertBox.RemoveClass("hidden");

                HtmlNode notice = HtmlNode.CreateNode(
                    "<p><strong>Uh-Oh...</strong> Looks like we didn't recognize that Username/Password pair."
                    + " Try again or <a data-toggle=\"modal\" href=\"#resetPassword\">Reset your Password</a></p>");

                // The message is appended through a one-element collection owned by the alert.
                var content = new HtmlNodeCollection(alertBox);
                content.Add(notice);
                alertBox.AppendChildren(content);

                return page.DocumentNode.OuterHtml;
            }
        /// <summary>
        /// Evaluates an XPath expression with this node as the context node and collects the matches.
        /// </summary>
        /// <param name="xpath">The XPath expression to evaluate.</param>
        /// <returns>
        /// An <see cref="HtmlNodeCollection"/> holding every matched node, or <c>null</c> when the
        /// expression matches nothing.
        /// </returns>
        public HtmlNodeCollection SelectNodes(string xpath)
        {
            var matches  = new HtmlNodeCollection(null);
            var iterator = new HtmlNodeNavigator(OwnerDocument, this).Select(xpath);

            while (iterator.MoveNext())
            {
                // Each iterator position is a navigator positioned on the matched node.
                matches.Add(((HtmlNodeNavigator)iterator.Current).CurrentNode);
            }

            // "No match" is signalled with null rather than with an empty collection.
            return matches.Count == 0 ? null : matches;
        }
示例#19
0
            /// <summary>
            /// Renders the login page with its alert box made visible and showing a
            /// password-reset confirmation.
            /// </summary>
            /// <returns>The outer HTML of the modified login document.</returns>
            public static string SuccessResetPassword()
            {
                var page = new HtmlDocument();
                page.LoadHtml(LoginManager.Login());

                // First element whose class list contains the whole word "alert".
                HtmlNode alertBox = page.DocumentNode.SelectSingleNode("//*[contains(concat(\" \", normalize-space(@class), \" \"), \" alert \")]");

                alertBox.AddClass("alert-success");
                alertBox.AddClass("in");
                alertBox.RemoveClass("hidden");

                // The message is appended through a one-element collection owned by the alert.
                var content = new HtmlNodeCollection(alertBox);
                content.Add(HtmlNode.CreateNode("<p><strong>Password Reset</strong> Please login below with your new password</p>"));
                alertBox.AppendChildren(content);

                return page.DocumentNode.OuterHtml;
            }
示例#20
0
        /// <summary>
        /// Walks PTT list pages starting at <paramref name="target"/> and appends every
        /// <c>r-ent</c> entry to <paramref name="htmlNodes"/>, following the paging link until
        /// enough entries were collected or no further page exists.
        /// </summary>
        /// <param name="target">
        /// Board name or search path for the first request; for follow-up pages the link taken
        /// from the paging bar.
        /// </param>
        /// <param name="count">Number of entries collected so far; 0 marks the first request.</param>
        /// <param name="targetCount">Number of entries wanted; <c>null</c> means "walk until the pages run out".</param>
        /// <param name="htmlNodes">Collection that receives the entries; also the return value.</param>
        /// <returns><paramref name="htmlNodes"/> with the collected entries appended.</returns>
        public HtmlNodeCollection TraversalPtt(string target, int count, int?targetCount, HtmlNodeCollection htmlNodes)
        {
            // Iterative form of the former tail recursion, so the stack does not grow with the page count.
            while (true)
            {
                // First request (nothing collected yet): a board name is expanded to its index
                // page, whereas search targets and paging links are requested as given.
                string path = (count == 0 && !target.Contains("search"))
                    ? $"bbs/{target}/index.html"
                    : target;

                var htmlDoc = new HtmlDocument();

                htmlDoc.LoadHtml(RequestPtt(path));

                // Collect this page before looking at the paging bar; otherwise the entries of a
                // page without a further link (including a single-page result) would be dropped.
                var infos = htmlDoc.DocumentNode.SelectNodes("//div[@class='r-ent']");

                if (infos != null)
                {
                    count += infos.Count;

                    // The fetched entries come in reverse order, so they are flipped before being appended.
                    foreach (var info in infos.Reverse())
                    {
                        htmlNodes.Add(info);
                    }
                }

                // With a null targetCount this comparison is false and the walk simply continues.
                if (count >= targetCount)
                {
                    return htmlNodes;
                }

                var paging = htmlDoc.DocumentNode.SelectSingleNode("//div[@class='btn-group btn-group-paging']");

                if (paging == null || paging.ChildNodes.Count <= 3)
                {
                    return htmlNodes;
                }

                // Child 3 of the paging bar is the link to follow; without a second attribute
                // there is no next page.
                var pagingLink = paging.ChildNodes[3];

                if (pagingLink.Attributes.Count < 2)
                {
                    return htmlNodes;
                }

                target = pagingLink.Attributes[1].Value;
            }
        }
示例#21
0
        /// <summary>
        /// Verifies that a scan never sends an article to the pipeline when body-text extraction
        /// throws, while all earlier steps still run once per header level.
        /// </summary>
        public void Scan_doesnt_send_article_for_analysis_if_text_extraction_fails()
        {
            // Arrange: a single header node that the spider returns for every header size.
            var header = new HtmlNode(HtmlNodeType.Element, new HtmlDocument(), 0);
            header.InnerHtml = "Some valid article header";

            var headerNodes = new HtmlNodeCollection(null);
            headerNodes.Add(header);

            var logger = new Mock <ILogger <Tracker> >();

            var spider = new Mock <ISpider>();
            spider.Setup(s => s.LoadPage(It.IsAny <string>()));
            spider.Setup(s => s.GetHeadersOfSize(It.IsAny <HtmlDocument>(), It.IsAny <int>())).Returns(headerNodes);
            spider.Setup(s => s.DownloadArticleByHeader(It.IsAny <string>(), It.IsAny <HtmlNode>())).Returns(("", new HtmlDocument()));

            var pipeline = new Mock <IPipeline>();
            pipeline.Setup(p => p.SendForAnalysis(It.IsAny <Article>()));

            var validator = new Mock <IValidator>();
            validator.Setup(v => v.ConsideredArticleHeader(It.IsAny <string>())).Returns(true);

            // The extractor fails on every call.
            var extractor = new Mock <IExtractor>();
            extractor.Setup(e => e.ExtractBodyTextFromArticleDocument(It.IsAny <HtmlDocument>())).Throws(new Exception("something bad happend"));

            var tracker = new Tracker(pipeline.Object, spider.Object, logger.Object,
                                      validator.Object, extractor.Object);

            var baseUrl = "http://madeupnews.com";

            // Act
            tracker.Scan(baseUrl, new List <Article>());

            // Assert: the scan recurses over four header levels (h1, h2, h3 and h4) and every
            // level returns the one header prepared above.
            var expectedCalls = 4;

            spider.Verify(s => s.LoadPage(baseUrl), Times.Exactly(expectedCalls));
            spider.Verify(s => s.GetHeadersOfSize(It.IsAny <HtmlDocument>(), It.IsAny <int>()), Times.Exactly(expectedCalls));
            spider.Verify(s => s.DownloadArticleByHeader(It.IsAny <string>(), It.IsAny <HtmlNode>()), Times.Exactly(expectedCalls));
            extractor.Verify(e => e.ExtractBodyTextFromArticleDocument(It.IsAny <HtmlDocument>()), Times.Exactly(expectedCalls));
            pipeline.Verify(p => p.SendForAnalysis(It.IsAny <Article>()), Times.Exactly(0));
        }
示例#22
0
        /// <summary>
        /// Appends every node of <paramref name="other"/> to <paramref name="collection"/>.
        /// </summary>
        /// <param name="collection">Collection that receives the nodes; may be <c>null</c>.</param>
        /// <param name="other">Nodes to append; may be <c>null</c>.</param>
        /// <returns>
        /// <paramref name="collection"/> after the nodes were appended. When one of the two
        /// arguments is <c>null</c> the other one is returned unchanged (<c>null</c> if both are).
        /// </returns>
        public static HtmlNodeCollection AddRange(this HtmlNodeCollection collection, HtmlNodeCollection other)
        {
            // Nothing can be copied unless both sides exist; hand back whichever one does.
            if (collection == null || other == null)
            {
                return collection ?? other;
            }

            foreach (var item in other)
            {
                collection.Add(item);
            }

            return collection;
        }
示例#23
0
        /// <summary>
        /// Filters <paramref name="nodes"/> down to those whose <c>class</c> attribute contains
        /// <paramref name="classname"/> (ordinal, case-insensitive substring match).
        /// </summary>
        /// <param name="classname">Text that the class attribute must contain.</param>
        /// <param name="nodes">Candidate nodes; <c>null</c> is treated as "no candidates".</param>
        /// <returns>
        /// A new collection, parented to the root of a fresh empty document, holding the
        /// matching nodes; empty when nothing matches.
        /// </returns>
        private static HtmlNodeCollection FindClassNameNodes(string classname, HtmlNodeCollection nodes)
        {
            var holder  = new HtmlDocument();
            var matches = new HtmlNodeCollection(holder.DocumentNode);

            if (nodes == null)
            {
                return matches;
            }

            foreach (var candidate in nodes)
            {
                var classValue = candidate.Attributes["class"]?.Value;

                // Nodes without a class attribute value can never match.
                if (classValue == null)
                {
                    continue;
                }

                if (classValue.IndexOf(classname, StringComparison.OrdinalIgnoreCase) >= 0)
                {
                    matches.Add(candidate);
                }
            }

            return matches;
        }
示例#24
0
        /// <summary>
        /// Minimal stand-in for XPath node selection that only understands <c>comment()</c>:
        /// it returns the direct children of <paramref name="node"/> that are comment nodes.
        /// </summary>
        /// <param name="node">Node whose children are inspected.</param>
        /// <param name="xpath">XPath expression; only <c>comment()</c> is accepted.</param>
        /// <returns>The comment children of <paramref name="node"/>; empty when there are none.</returns>
        /// <exception cref="NotSupportedException">Thrown for any expression other than <c>comment()</c>.</exception>
        public static HtmlNodeCollection SelectNodes(this HtmlNode node, String xpath)
        {
            var comments = new HtmlNodeCollection(node);

            // Reject everything but the one supported expression up front.
            if (!xpath.Equals("comment()"))
            {
                throw new NotSupportedException("Only the XPath expressions required by dotNetRDF code are supported by this method");
            }

            foreach (HtmlNode child in node.ChildNodes)
            {
                if (child.NodeType != HtmlNodeType.Comment)
                {
                    continue;
                }

                comments.Add(child);
            }

            return comments;
        }
示例#25
0
        /// <summary>
        /// Loads an hOCR file and parses the <c>div</c> elements directly below its body into
        /// <paramref name="hOrcDoc"/>.
        /// </summary>
        /// <param name="hOrcDoc">Document object that receives the parsed content; also the return value.</param>
        /// <param name="hOcrFile">Path of the hOCR file, read as UTF-8.</param>
        /// <param name="append">Not used by this method.</param>
        /// <returns><paramref name="hOrcDoc"/> after parsing.</returns>
        /// <exception cref="FileNotFoundException">Thrown when <paramref name="hOcrFile"/> does not exist.</exception>
        public HDocument ParseHocr(HDocument hOrcDoc, string hOcrFile, bool append)
        {
            _hDoc = hOrcDoc;

            if (_doc == null)
            {
                _doc = new HtmlDocument();
            }

            _hOcrFilePath = hOcrFile;
            if (!File.Exists(hOcrFile))
            {
                // A specific exception type with the same message; existing catch (Exception) handlers still apply.
                throw new FileNotFoundException("hocr file not found", hOcrFile);
            }

            // Reset the parsing cursor before reading a new file.
            _currentPage = null;
            _currentPara = null;
            _currentLine = null;

            _doc.Load(hOcrFile, Encoding.UTF8);

            HtmlNode body = _doc.DocumentNode.SelectNodes("//body")[0];

            // Only the divs directly below body are used (issue #1 reported by Ryan-George).
            // The comparison is ordinal so it does not depend on the current culture's casing rules.
            IEnumerable <HtmlNode> divs  = body.ChildNodes.Where(node => string.Equals(node.Name, "div", StringComparison.OrdinalIgnoreCase));
            HtmlNodeCollection     nodes = new HtmlNodeCollection(null);

            foreach (HtmlNode div in divs)
            {
                nodes.Add(div);
            }

            _hDoc.ClassName = "body";

            ParseNodes(nodes);
            return _hDoc;
        }
示例#26
0
        /// <summary>
        /// Downloads the market page and returns the rows of the <c>octable</c> table whose HTML
        /// contains any of the seven values derived from the base number.
        /// </summary>
        /// <param name="marketURL">The market URL</param>
        /// <param name="openMarketBaseNumber">The open market base number</param>
        /// <returns>
        /// The matching rows; an empty collection when the table is missing or has fewer than
        /// three rows (one trailing and two leading rows are always discarded).
        /// </returns>
        private HtmlNodeCollection DownloadMarketData(string marketURL, int openMarketBaseNumber)
        {
            // Define the range.
            // NOTE(review): the field names do not match the offsets assigned here
            // (e.g. "Plus50" receives +100 and "Plus200" receives -100). The values are
            // kept as they were because other code may read these fields - confirm intent.
            baseNumber         = Math.Round(Convert.ToDecimal(openMarketBaseNumber), 2);
            baseNumberPlus50   = baseNumber + 100;
            baseNumberPlus100  = baseNumber + 200;
            baseNumberPlus150  = baseNumber + 300;
            baseNumberPlus200  = baseNumber - 100;
            baseNumberMinus50  = baseNumber - 200;
            baseNumberMinus100 = baseNumber - 300;

            // Grab all rows
            var htmlWeb = new HtmlWeb();

            HtmlAgilityPack.HtmlDocument htmlDocument = htmlWeb.Load(marketURL);

            HtmlNodeCollection tableRows = htmlDocument.DocumentNode.SelectNodes("//table[@id=\"octable\"]//tr");

            HtmlNodeCollection workSetRows = new HtmlNodeCollection(null);

            // SelectNodes returns null when nothing matches, and the three RemoveAt calls
            // below need at least three rows; in either case there is nothing to return.
            if (tableRows == null || tableRows.Count < 3)
            {
                return workSetRows;
            }

            // Drop the last row and the first two rows.
            tableRows.RemoveAt(tableRows.Count - 1);
            tableRows.RemoveAt(0);
            tableRows.RemoveAt(0);

            // Get only those rows which contain values for the defined range.
            // The search strings are loop-invariant, so build them once.
            string[] wantedValues =
            {
                baseNumber.ToString(),
                baseNumberPlus50.ToString(),
                baseNumberPlus100.ToString(),
                baseNumberMinus50.ToString(),
                baseNumberMinus100.ToString(),
                baseNumberPlus150.ToString(),
                baseNumberPlus200.ToString()
            };

            foreach (var currentTableRow in tableRows)
            {
                // Read InnerHtml once per row instead of once per comparison.
                string rowHtml = currentTableRow.InnerHtml;

                if (Array.Exists(wantedValues, value => rowHtml.Contains(value)))
                {
                    workSetRows.Add(currentTableRow);
                }
            }

            return workSetRows;
        }
示例#27
0
        /// <summary>
        /// Verifies that <c>Tracker.Scan</c> never calls <c>DownloadArticleByHeader</c> when the
        /// validator rejects every header, while still loading the page and querying headers
        /// once per header level.
        /// </summary>
        public void Scan_doesnt_download_article_with_invalid_header()
        {
            const string baseUrlInTest        = "http://madeupnews.com";
            const int    numberOfHeaderLevels = 4; // all levels will return the one header in this test...

            // Collaborators that need no behavior configured.
            var pipeline  = new Mock <IPipeline>();
            var logger    = new Mock <ILogger <Tracker> >();
            var extractor = new Mock <IExtractor>();

            // The validator rejects whatever header text it is given.
            var validator = new Mock <IValidator>();
            validator.Setup(v => v.ConsideredArticleHeader(It.IsAny <string>())).Returns(false);

            // A single header element, returned by the spider for every header size.
            var onlyHeader = new HtmlNode(HtmlNodeType.Element, new HtmlDocument(), 0);
            onlyHeader.InnerHtml = "<h1>doesnt matter - mocked</h1>"; //known invalid article header

            var headerCollection = new HtmlNodeCollection(null);
            headerCollection.Add(onlyHeader);

            var spider = new Mock <ISpider>();
            spider.Setup(s => s.LoadPage(It.IsAny <string>()));
            spider.Setup(s => s.GetHeadersOfSize(It.IsAny <HtmlDocument>(), It.IsAny <int>())).Returns(headerCollection);
            spider.Setup(s => s.DownloadArticleByHeader(It.IsAny <string>(), It.IsAny <HtmlNode>())).Returns((null, null));

            var sut = new Tracker(pipeline.Object, spider.Object, logger.Object,
                                  validator.Object, extractor.Object);

            sut.Scan(baseUrlInTest, new List <Article>());

            spider.Verify(s => s.LoadPage(baseUrlInTest), Times.Exactly(numberOfHeaderLevels));
            spider.Verify(s => s.GetHeadersOfSize(It.IsAny <HtmlDocument>(), It.IsAny <int>()), Times.Exactly(numberOfHeaderLevels));
            spider.Verify(s => s.DownloadArticleByHeader(It.IsAny <string>(), It.IsAny <HtmlNode>()), Times.Exactly(0));
        }
        /// <summary>
        /// Builds a six-cell table row (<c>tr</c>) from the cells of a source row.
        /// </summary>
        /// <param name="dtd">
        /// Source cells. Index 0 supplies the label text, and indices 1, 5 and 3 supply the
        /// counts stored in <c>soCaMac</c>, <c>phuchoi</c> and <c>tuvong</c> respectively
        /// (presumably Vietnamese for cases, recovered and deaths - confirm against the source table).
        /// </param>
        /// <param name="no">Sequence number written to the first cell.</param>
        /// <param name="highlightFlg">When <c>true</c>, the row gets a yellow background style.</param>
        /// <returns>The new row with cells: number, label, three counts, and rate percentage.</returns>
        private static HtmlNode CreateNewTr(HtmlNodeCollection dtd, int no, bool highlightFlg)
        {
            CultureInfo viCulture = new CultureInfo("vi-VN");
            HtmlNode    newTr     = HtmlNode.CreateNode("<tr></tr>");

            if (highlightFlg)
            {
                newTr.Attributes.Add("style", "background-color:yellow");
            }

            // All three count cells are parsed the same way; an empty cell counts as zero.
            int soCaMac = ParseCellCount(dtd[1]);
            int phuchoi = ParseCellCount(dtd[5]);
            int tuvong  = ParseCellCount(dtd[3]);

            // tuvong as a percentage of soCaMac; guard the division so a zero
            // denominator produces 0 instead of NaN or Infinity.
            double rate = soCaMac == 0 ? 0 : Math.Round(1.0 * tuvong / soCaMac * 100, 2);

            HtmlNodeCollection tdColl = new HtmlNodeCollection(newTr);

            tdColl.Add(HtmlNode.CreateNode($"<td>{no}</td>"));
            tdColl.Add(HtmlNode.CreateNode($"<td>{dtd[0].InnerText.Trim()}</td>"));
            tdColl.Add(HtmlNode.CreateNode($"<td>{soCaMac.ToString("N0", viCulture)}</td>"));
            tdColl.Add(HtmlNode.CreateNode($"<td>{phuchoi.ToString("N0", viCulture)}</td>"));
            tdColl.Add(HtmlNode.CreateNode($"<td>{tuvong.ToString("N0", viCulture)}</td>"));
            // NOTE(review): the rate is formatted with the current culture, unlike the
            // counts above which use vi-VN - confirm whether that is intended.
            tdColl.Add(HtmlNode.CreateNode($"<td>{rate}%</td>"));
            newTr.AppendChildren(tdColl);
            return newTr;
        }

        /// <summary>
        /// Parses the trimmed inner text of a cell as an integer that may contain thousands
        /// separators; returns 0 when the cell text is empty.
        /// </summary>
        private static int ParseCellCount(HtmlNode cell)
        {
            string text = cell.InnerText.Trim();
            return text.Length == 0 ? 0 : int.Parse(text, NumberStyles.AllowThousands);
        }
示例#29
0
        /// <summary>
        /// Consumes a queue of HTML tokens and builds the corresponding node collection:
        /// an element (with its attributes) for each open tag and a text node for each
        /// token that is not a tag delimiter. Close tags trigger a restructuring step
        /// through <c>FindTagOpenNodeIndex</c> and <c>MoveNodesDown</c>.
        /// </summary>
        /// <param name="tokens">Token queue; it is drained by this method.</param>
        /// <returns>The collection of nodes built from the tokens.</returns>
        private static HtmlNodeCollection BuildNodeCollection(Queue<string> tokens)
        {
            HtmlNodeCollection nodes = new HtmlNodeCollection(null);
            HtmlElement element = null;
            string current;

            while (tokens.Count > 0)
            {
                current = tokens.Dequeue();
                switch (current)
                {
                    case ("<"):
                        // Read open tag

                        // A trailing "<" with nothing after it is ignored.
                        if (tokens.Count == 0)
                            break;

                        // The token right after "<" is the tag name.
                        current = tokens.Dequeue();
                        element = new HtmlElement(current);

                        // read the attributes and values
                        // (stops at ">" or "/>", or when the queue runs dry)
                        while (tokens.Count > 0 && (current = tokens.Dequeue()) != ">" && current != "/>")
                        {
                            string attribute_name = current;
                            if (tokens.Count > 0 && tokens.Peek() == "=")
                            {
                                // name = value: drop the "=" and take the next token as the
                                // value (null if the queue ended), HTML-decoded.
                                tokens.Dequeue();
                                current = (tokens.Count > 0) ? tokens.Dequeue() : null;
                                HtmlAttribute attribute = new HtmlAttribute(attribute_name, HttpUtility.HtmlDecode(current));
                                element.Attributes.Add(attribute);
                            }
                            else //if (tokens.Count == 0)
                            {
                                // Null-attributeValue attribute
                                HtmlAttribute attribute = new HtmlAttribute(attribute_name);
                                element.Attributes.Add(attribute);
                            }
                        }
                        nodes.Add(element);

                        if (current == "/>")
                        {
                            // Self-closing tag: mark it terminated.
                            element.IsTerminated = true;
                            element = null; //could not have any sub elements
                        }
                        else if (current == ">")
                        {
                            // Regular open tag: move on to the next token.
                            continue;
                        }
                        break;
                    case (">"):
                        // A stray ">" outside of a tag is skipped.
                        continue;
                    case ("</"):
                        // Read close tag

                        if (tokens.Count == 0)
                            break;

                        // The token right after "</" is the name of the tag being closed.
                        current = tokens.Dequeue();

                        // Look up the node that opened this tag; -1 means none was found.
                        // Presumably MoveNodesDown re-parents the nodes collected after the
                        // opening element underneath it - verify against its definition.
                        int open_index = FindTagOpenNodeIndex(nodes, current);
                        if (open_index != -1)
                        {
                            MoveNodesDown(ref nodes, open_index + 1, (HtmlElement)nodes[open_index]);
                        }

                        // Skip to the end of this tag
                        while (tokens.Count > 0 && (current = tokens.Dequeue()) != ">")
                        {
                            //shouldn't happen
                        }
                        element = null;
                        break;
                    default:
                        // Any other token becomes a text node.
                        HtmlText node = new HtmlText(current);
                        nodes.Add(node);
                        break;
                }
            }
            return nodes;
        }
 /// <summary>
 /// A node added to a collection constructed with a parent element must report that
 /// element as its <c>Parent</c>.
 /// </summary>
 public void HtmlNodeCollectionConstructorTest()
 {
     var parent     = new HtmlElement("root");
     var collection = new HtmlNodeCollection(parent);
     var added      = new HtmlElement("child");

     collection.Add(added);

     Assert.AreEqual(parent, added.Parent);
 }
        /// <summary>
        /// Inserting a node at index 0 of a parented collection must set the node's
        /// <c>Parent</c> and place it at index 0; inserting <c>null</c> must not throw.
        /// </summary>
        public void InsertTest()
        {
            HtmlElement root = new HtmlElement("root");
            HtmlNodeCollection target = new HtmlNodeCollection(root);
            HtmlElement child = new HtmlElement("child");
            target.Add(child);

            child = new HtmlElement("second");
            target.Insert(0, child);
            Assert.AreEqual(root, child.Parent);
            // Expected value goes first so that a failure message reads correctly.
            Assert.AreEqual(0, target.IndexOf(child));

            // No assertion follows: this call only has to complete without throwing.
            target.Insert(0, null);
        }
示例#32
0
        /// <summary>
        /// Console walkthrough of HtmlAgilityPack node operations on a small in-memory
        /// document: reading InnerHtml/InnerText/OuterHtml, navigating to a parent,
        /// appending children, and the Clone / CloneNode / CopyFrom variants.
        /// Waits for a key press before exiting.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            var html =
                @"<body>
                    <h1>This is <b>bold</b> heading</h1>
                    <p>This is <u>underlined</u> paragraph</p>
			
			        <h1>This is <i>italic</i> heading</h1>
			        <p>This is <u>underlined</u> paragraph</p>
                </body>";

            var htmlDoc = new HtmlDocument();

            htmlDoc.LoadHtml(html);

            // Both h1 elements under body.
            var htmlNodes = htmlDoc.DocumentNode.SelectNodes("//body/h1");

            Console.WriteLine("-------------------------------------------------");
            Console.WriteLine("InnerHtml");
            foreach (var node in htmlNodes)
            {
                Console.WriteLine(node.InnerHtml);
            }

            Console.WriteLine("--------------------------------------------------");
            Console.WriteLine("InnerText");
            foreach (var node in htmlNodes)
            {
                Console.WriteLine(node.InnerText);
            }

            Console.WriteLine("----------------------------------------------------");
            Console.WriteLine("OuterHtml");
            foreach (var node in htmlNodes)
            {
                Console.WriteLine(node.OuterHtml);
            }

            Console.WriteLine("-----------------------------------------------------");
            Console.WriteLine("ParentNode of h1 is :");
            var selectSingleNode = htmlDoc.DocumentNode.SelectSingleNode("//body/h1");

            HtmlNode parentNode = selectSingleNode.ParentNode;

            Console.WriteLine(parentNode.Name);

            Console.WriteLine("--------------------------------------------------------");
            Console.WriteLine("Child nodes present initially");
            var htmlBody = htmlDoc.DocumentNode.SelectSingleNode("//body");

            DisplayChildNodes(htmlBody);

            // Append a single new child.
            HtmlNode h2Node = HtmlNode.CreateNode("<h2> This is h2 heading</h2>");

            htmlBody.AppendChild(h2Node);

            Console.WriteLine("---------------------------------------------------------");
            Console.WriteLine("After child node appended");

            DisplayChildNodes(htmlBody);

            // Append several children at once through a collection.
            HtmlNode h3Node = HtmlNode.CreateNode("<h3> THis is H2 heading</h3>");
            HtmlNode pNode1 = HtmlNode.CreateNode("<p>This is appended paragraph 1</p>");
            HtmlNode pNode2 = HtmlNode.CreateNode("<p>This is appended paragraph 2</p>");

            HtmlNodeCollection children = new HtmlNodeCollection(htmlBody);

            children.Add(h3Node);
            children.Add(pNode1);
            children.Add(pNode2);

            htmlBody.AppendChildren(children);

            Console.WriteLine("\n*********************After children appended**********************");
            DisplayChildNodes(htmlBody);

            // Clone(): duplicate of the body node.
            HtmlNode newHtmlBody = htmlBody.Clone();

            Console.WriteLine("\n Duplicate Node Name :" + newHtmlBody.Name);

            Console.WriteLine("\n*********************** Display children of the duplicate node **************************\n");

            DisplayChildNodes(newHtmlBody);

            // CloneNode(false): clone requested with the deep flag set to false.
            var htmlBodyTwo = htmlBody.CloneNode(false);

            // Report the name of the node that was just cloned (the original code printed
            // the earlier duplicate's name here by mistake).
            Console.WriteLine("\n Clone Node Name: " + htmlBodyTwo.Name);

            Console.WriteLine("\n************* Display children of the clone node *******************\n");

            DisplayChildNodes(htmlBodyTwo);

            // CloneNode(name, deep): clone under a different element name.
            HtmlNode h1Node = htmlBody.ChildNodes[1];

            DisplayNode(h1Node);

            HtmlNode h4Node = h1Node.CloneNode("h2", true);

            Console.WriteLine("\n************* CLone node *********************\n");

            DisplayNode(h4Node);

            HtmlNode h5Node = htmlBody.ChildNodes[1];

            DisplayNode(h5Node);

            HtmlNode h6Node = h5Node.CloneNode("h6", true);

            Console.WriteLine("\n*************** Clone node ***********************\n");

            DisplayNode(h6Node);

            // CopyFrom(node): copy the body into a freshly created node.
            HtmlNode newBody = HtmlNode.CreateNode("<body></body>");

            newBody.CopyFrom(htmlBody);

            DisplayNode(htmlBody);

            Console.WriteLine("\n****************** Display node **********************\n");

            DisplayNode(newBody);

            // CopyFrom(node, false): copy with the deep flag set to false.
            HtmlNode newBody2 = HtmlNode.CreateNode("<body></body>");

            newBody2.CopyFrom(htmlBodyTwo, false);

            DisplayNode(htmlBodyTwo);

            Console.WriteLine("\n **************** Duplicate node ********************\n");

            DisplayNode(newBody2);

            // Add a child directly through the ChildNodes collection.
            HtmlNode newPara = HtmlNode.CreateNode("<p>This is new paragraph</p>");

            htmlBodyTwo.ChildNodes.Add(newPara);

            Console.WriteLine("\n ***************** After adding new child node ************\n");

            DisplayNode(htmlBodyTwo);



            Console.ReadKey();
        }
示例#33
0
        /// <summary>
        /// Gets the formatted html for the specified message: external links are forced to
        /// open in a new window, local images that match an attachment are inlined as
        /// base64, html/head/body elements are created when missing, a fade-in style and
        /// script are injected into the head, and stray top-level nodes are moved into the body.
        /// </summary>
        /// <param name="message">The message.</param>
        /// <returns>
        /// The formatted html, or the message's original text if any step throws
        /// (the exception is logged).
        /// </returns>
        public static async Task <string> FormattedHtml(MailMessage message)
        {
            try
            {
                // Load the html (plain text is wrapped in a paragraph with <br/> line breaks)
                HtmlDocument htmlDocument = new HtmlDocument();
                htmlDocument.OptionFixNestedTags = true;
                string html = (message.TextContentType == ETextContentType.Html ? message.Text : string.Format("<p>{0}</p>", (message.Text + string.Empty).Replace(Environment.NewLine, "<br/>")));
                htmlDocument.LoadHtml(html);

                // Get the link nodes whose href starts with "http" or "www"
                IEnumerable <HtmlNode> linkNodes = htmlDocument.DocumentNode.Descendants("a")
                                                   .Where(o =>
                {
                    string href = o.GetAttributeValue("href", null);
                    return !string.IsNullOrEmpty(href) &&
                           (href.StartsWith("http", StringComparison.OrdinalIgnoreCase) ||
                            href.StartsWith("www", StringComparison.OrdinalIgnoreCase));
                });

                // Loop through each external link - ensure it opens in new window
                foreach (HtmlNode linkNode in linkNodes)
                {
                    if (linkNode.Attributes.Contains("target"))
                    {
                        linkNode.Attributes["target"].Value = "_blank";
                    }
                    else
                    {
                        linkNode.Attributes.Add("target", "_blank");
                    }
                }

                // Get the image nodes with a local src. A src is local only when it starts
                // with NEITHER "http" NOR "www"; combining the two negations with "||"
                // (as the code previously did) is true for every src and so selected
                // remote images as well.
                IEnumerable <HtmlNode> imageNodes = htmlDocument.DocumentNode.Descendants("img")
                                                    .Where(o =>
                {
                    string src = o.GetAttributeValue("src", null);
                    return !string.IsNullOrEmpty(src) &&
                           !src.StartsWith("http", StringComparison.OrdinalIgnoreCase) &&
                           !src.StartsWith("www", StringComparison.OrdinalIgnoreCase);
                });

                // Loop through each local image
                foreach (HtmlNode imageNode in imageNodes)
                {
                    try
                    {
                        // Find the image attachment (keyed by the src with any "cid:" removed)
                        string     srcWithoutCid = imageNode.GetAttributeValue("src", null).Replace("cid:", "");
                        Attachment attachment    = message.Attachments[srcWithoutCid];

                        // If found
                        if (attachment != null)
                        {
                            // Convert image to base64
                            StorageFile attachmentFile = await IOUtil.GetCreateFile(attachment.FullFilename, CreationCollisionOption.ReplaceExisting);

                            imageNode.Attributes["src"].Value = await TransformFileToBase64ImageString(attachmentFile.Path);
                        }
                    }
                    catch (Exception ex)
                    {
                        // A failure on one image is logged and must not abort the others.
                        LogFile.Instance.LogError("", "", ex.ToString());
                    }
                }

                // Ensure that the html node exists
                HtmlNode htmlNode = htmlDocument.DocumentNode.Descendants("html").FirstOrDefault();
                if (htmlNode == null)
                {
                    htmlNode = htmlDocument.CreateElement("html");
                    htmlDocument.DocumentNode.AppendChild(htmlNode);
                }

                // Ensure that the head node exists
                HtmlNode headNode = htmlDocument.DocumentNode.Descendants("head").FirstOrDefault();
                if (headNode == null)
                {
                    headNode = htmlDocument.CreateElement("head");
                    htmlNode.AppendChild(headNode);
                }

                // Create page css transition
                HtmlNode cssTransitionNode = htmlDocument.CreateElement("style");
                cssTransitionNode.InnerHtml = "body{opacity:0;transition: all 2s ease;}.loaded{opacity:1;}";
                headNode.PrependChild(cssTransitionNode);

                // Create page javascript transition
                HtmlNode javascriptTransitionNode = htmlDocument.CreateElement("script");
                javascriptTransitionNode.Attributes.Add("type", "text/javascript");
                javascriptTransitionNode.InnerHtml = "document.addEventListener('DOMContentLoaded', function () { document.body.classList.add('loaded'); }, false);";
                headNode.AppendChild(javascriptTransitionNode);

                // Ensure that the body node exists
                HtmlNode bodyNode = htmlDocument.DocumentNode.Descendants("body").FirstOrDefault();
                if (bodyNode == null)
                {
                    bodyNode = htmlDocument.CreateElement("body");
                    htmlNode.AppendChild(bodyNode);
                }

                // Add the body tags: every top-level node other than html/head/body is
                // moved into the body. ToList() snapshots the children because the loop
                // removes nodes from the collection it would otherwise be enumerating.
                HtmlNodeCollection htmlNodes = new HtmlNodeCollection(bodyNode);
                foreach (HtmlNode node in htmlDocument.DocumentNode.ChildNodes.ToList())
                {
                    if (!node.Name.Equals("html", StringComparison.OrdinalIgnoreCase) &&
                        !node.Name.Equals("head", StringComparison.OrdinalIgnoreCase) &&
                        !node.Name.Equals("body", StringComparison.OrdinalIgnoreCase))
                    {
                        htmlNodes.Add(node);
                        htmlDocument.DocumentNode.RemoveChild(node);
                    }
                }
                bodyNode.AppendChildren(htmlNodes);

                // Return the html
                return htmlDocument.DocumentNode.InnerHtml;
            }
            catch (Exception ex)
            {
                // Fall back to the unformatted text rather than failing the caller.
                LogFile.Instance.LogError("", "", ex.ToString());
                return message.Text;
            }
        }
示例#34
0
        /// <summary>
        /// Evaluates an XPath expression relative to this node and collects every matching node.
        /// </summary>
        /// <param name="xpath">The XPath expression.</param>
        /// <param name="xmgr">Namespace manager passed through to the XPath evaluation.</param>
        /// <returns>
        /// An <see cref="HtmlNodeCollection"/> with the matching nodes; the collection is
        /// empty (never <c>null</c>) when nothing matches.
        /// </returns>
        public HtmlNodeCollection SelectNodes(string xpath, XmlNamespaceManager xmgr)
        {
            HtmlNodeCollection matches = new HtmlNodeCollection(null);

            XPathNodeIterator iterator = new HtmlNodeNavigator(_ownerdocument, this).Select(xpath, xmgr);

            while (iterator.MoveNext())
            {
                matches.Add(((HtmlNodeNavigator)iterator.Current).CurrentNode);
            }

            // The same collection is returned whether or not anything matched.
            return matches;
        }
        /// <summary>
        /// Exercises the integer indexer: reading agrees with the name indexer, and
        /// assigning an element or <c>null</c> through it is accepted.
        /// </summary>
        public void ItemByIndexTest()
        {
            var parent     = new HtmlElement("root");
            var collection = new HtmlNodeCollection(parent);

            foreach (string tagName in new[] { "first", "second", "third" })
            {
                collection.Add(new HtmlElement(tagName));
            }

            // Index 1 and the name "second" must resolve to the same node.
            Assert.AreEqual(collection[1], collection["second"]);

            // Overwrite the last slot, then null out the first one.
            collection[2] = new HtmlElement("another");
            collection[0] = null;

            string replacedText = collection[2].ToString();
            StringAssert.Contains(replacedText, "another");
        }
        /// <summary>
        /// Exercises the name indexer: a present name (even a duplicated one) yields a
        /// node, and an absent name yields <c>null</c>.
        /// </summary>
        public void ItemByNameTest()
        {
            var parent     = new HtmlElement("root");
            var collection = new HtmlNodeCollection(parent);

            foreach (string tagName in new[] { "first", "second", "second" })
            {
                collection.Add(new HtmlElement(tagName));
            }

            var found   = collection["second"];
            var missing = collection["anyname"];

            Assert.IsNotNull(found);
            Assert.IsNull(missing);
        }
        /// <summary>
        /// Exercises <c>GetByName</c>: the one-argument overload also counts a matching
        /// node nested under another element, while passing <c>false</c> as the second
        /// argument counts only the collection's own items.
        /// </summary>
        public void GetByNameTest()
        {
            HtmlElement root = new HtmlElement("root");
            HtmlNodeCollection target = new HtmlNodeCollection(root);
            target.Add(new HtmlElement("first"));
            target.Add(new HtmlElement("second"));
            target.Add(new HtmlElement("second"));

            // Expected value goes first so that failure messages read correctly.
            Assert.AreEqual(2, target.GetByName("second").Count);

            // Nest a third "second" element inside the first item.
            ((HtmlElement)target[0]).Nodes.Add(new HtmlElement("second"));

            Assert.AreEqual(2, target.GetByName("second", false).Count);

            Assert.AreEqual(3, target.GetByName("second").Count);
        }
示例#38
0
        /// <summary>
        /// Parses a race page: the game mode, book info, player count and points increase
        /// go into <c>RaceInfo</c>, and one <c>ResultParsed</c> is produced for every player
        /// block that contains a profile link (blocks without one are skipped).
        /// </summary>
        /// <param name="html">Markup of the race page.</param>
        /// <returns>The parsed race info together with the per-player results.</returns>
        private RaceParsingResult ParseRace(string html)
        {
            var result = new RaceParsingResult
            {
                PlayerResults = new List <ResultParsed>(),
                RaceInfo      = new RaceInfo()
            };

            HtmlDocument doc = new HtmlDocument();

            doc.LoadHtml(html);

            // Game mode: the id comes from the span's class, the name from its text.
            HtmlNode gameTypeSpan = doc.DocumentNode.SelectSingleNode("//td[@id='gamedesc']/span");
            string   modeId       = gameTypeSpan.GetAttributeValue("class", "0").Replace("gametype-", "");
            var      SpanA        = gameTypeSpan.SelectSingleNode("a");

            string modeName;

            if (SpanA == null) // standard mode
            {
                modeName = gameTypeSpan.InnerText;
            }
            else // dictionary mode: the number found in the link's href is appended to the id
            {
                modeId  += "-" + Regex.Match(SpanA.GetAttributeValue("href", "0"), "\\d+");
                modeName = SpanA.InnerText;
            }
            result.RaceInfo.Mode = new Mode(modeId, modeName);

            result.RaceInfo.BookAuthor = doc.DocumentNode.SelectSingleNode("//div[@id='bookinfo']//div[@class='author']")?.InnerText;
            result.RaceInfo.BookName   = doc.DocumentNode.SelectSingleNode("//div[@id='bookinfo']//div[@class='name']")?.InnerText;

            HtmlNode totalPlayersNode = doc.DocumentNode.SelectSingleNode("//div[@id='players-count-lbl']/span");

            if (!string.IsNullOrEmpty(totalPlayersNode?.InnerText))
            {
                var match = Regex.Match(totalPlayersNode.InnerText, @"\d+");
                if (match.Success)
                {
                    result.RaceInfo.TotalPlayers = int.Parse(match.Value);
                }
            }

            HtmlNode pointsIncreaseNode = doc.DocumentNode.SelectSingleNode("//div[@id='players-count-lbl']/b");
            int      pointsIncrease     = 0;

            if (!string.IsNullOrEmpty(pointsIncreaseNode?.InnerText))
            {
                var match = Regex.Match(pointsIncreaseNode.InnerText, @"\d+");
                if (match.Success)
                {
                    pointsIncrease = int.Parse(match.Value);
                }
            }

            // Gather the player blocks into a plain list. Both XPath queries can come back
            // empty (null), so a missing "you" block is skipped rather than added as a
            // null entry, and a page with no player blocks at all yields an empty list.
            var players = new List <HtmlNode>();

            HtmlNodeCollection otherPlayers = doc.DocumentNode.SelectNodes("//div[@class='player other ng-scope']");

            if (otherPlayers != null)
            {
                players.AddRange(otherPlayers);

                HtmlNode ownPlayer = doc.DocumentNode.SelectSingleNode("//div[@class='player you ng-scope']");// ng-scope
                if (ownPlayer != null)
                {
                    players.Add(ownPlayer);
                }
            }
            else
            {
                HtmlNodeCollection ownPlayers = doc.DocumentNode.SelectNodes("//div[@class='player you ng-scope']");
                if (ownPlayers != null)
                {
                    players.AddRange(ownPlayers);
                }
            }

            foreach (HtmlNode player in players)
            {
                ResultParsed resultParsed = new ResultParsed();
                resultParsed.Mode           = new Mode(modeId, modeName);
                resultParsed.PointsIncrease = pointsIncrease;

                HtmlNode rating = player.SelectSingleNode("div[@class='rating']");
                HtmlNode car    = player.SelectSingleNode("table[@class='car']");
                HtmlNode place  = rating.SelectSingleNode("div/ins");
                HtmlNode nick   = player.SelectSingleNode("table//a");

                // Player blocks without a profile link are skipped entirely.
                if (nick == null)
                {
                    continue;
                }

                resultParsed.Nick = nick.InnerText;
                string id_str = nick.GetAttributeValue("href", "0");
                resultParsed.Id   = int.Parse(Regex.Match(id_str, "[0-9]+").ToString());
                // The rank index is the single character at position 4 of the link's class value.
                resultParsed.Rank = Rank.GetByIndex(int.Parse(nick.GetAttributeValue("class", "000000").Substring(4, 1)));

                // Stats are read only when a place element is present.
                if (place != null)
                {
                    string place_str = place.InnerText;
                    // The place number is the text with its last 6 characters removed.
                    resultParsed.RealPlace = int.Parse(place_str.Substring(0, place_str.Length - 6));
                    resultParsed.Time      = TimeSpan.Parse("00:" + rating.SelectSingleNode("div[@class='stats']/div").InnerText.Replace(" ", "").Replace("\r", "").Replace("\n", ""));
                    // Speed and error rate are written with a comma as the decimal separator.
                    resultParsed.Speed     = (int)Math.Round(double.Parse(rating.SelectSingleNode("div[@class='stats']/div[2]/span").InnerText, new NumberFormatInfo()
                    {
                        NumberDecimalSeparator = ","
                    }));
                    resultParsed.ErCnt  = int.Parse(rating.SelectSingleNode("div[@class='stats']/div[3]/span").InnerText);
                    resultParsed.ErRate = double.Parse(rating.SelectSingleNode("div[@class='stats']/div[3]/span[2]").InnerText, new NumberFormatInfo()
                    {
                        NumberDecimalSeparator = ","
                    }) / 100;
                    result.RaceInfo.ArrivedPlayers++;
                }

                if (player.SelectSingleNode("div[@class='newrecord']//span[@class='']") != null)
                {
                    resultParsed.IsRecord = true; // a record, with or without a recording
                }

                // Progress is derived from the car's "left: Npx" style offset, divided by 4.8
                // (e.g. style="top: 0px; left: 480px; " gives 100).
                int.TryParse(Regex.Match(car.GetAttributeValue("style", ""), "(?<=left: )\\d+(?=px)").ToString(), out int progress);
                resultParsed.Progress = (int)(progress / 4.8);

                HtmlNode _left = car.SelectSingleNode(".//div[@class='imgcont leave']");
                resultParsed.HasLeftRace = _left != null;

                HtmlNode _noerror_fail = car.SelectSingleNode(".//img[@class='noerror-fail']");
                resultParsed.NoErrorFail = _noerror_fail != null;

                // Mileage is the first number found in the title of the car's <i> element.
                HtmlNode _i_style = car.SelectSingleNode(".//i");
                if (_i_style != null)
                {
                    int.TryParse(Regex.Match(_i_style.GetAttributeValue("title", ""), "\\d+").ToString(), out int _mileage);
                    resultParsed.Mileage = _mileage;
                }
                result.PlayerResults.Add(resultParsed);
            }
            return result;
        }
示例#39
0
        /// <summary>
        ///  附录需要在word里按目录要求,手动改为一级或者二级标题的格式
        /// </summary>
        /// <param name="rootConvention"></param>
        public ReturnInfo ReadHtml(ConventionRow rootConvention)
        {
            // Loads the HTML file at htmlPath, recognises level-1 and level-2 headings using the
            // strategy selected by `method`, slices the document text into per-heading content
            // (re-attaching the footnote divs each slice references), and writes one row per
            // heading beneath rootConvention through SQLUtils.
            //
            // rootConvention: parent row handed to every level-1 ConventionRow created here.
            // Returns: the retInfo member, filled with the recognised titles/contents, the image
            //          recognition summary and, if the database step throws, errorInfo.
            //
            // Errors raised during content extraction are caught and only written to the console;
            // the database step still runs afterwards with whatever was collected so far.
            HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
            doc.Load(htmlPath);
            HtmlNode htmlRootNode = doc.DocumentNode;
            HtmlNodeCollection title1Nodes_init;
            HtmlNodeCollection title2Nodes_init;
            List<string> str_contentList = new List<string>();
            List<string> str_titleList = new List<string>();
            List<string> str_title1List = new List<string>();
            List<string> str_title2List = new List<string>();
            HtmlNodeCollection contentNodes = new HtmlNodeCollection(htmlRootNode.Clone());
            Dictionary<int, string> dic_title1Content = new Dictionary<int, string>();
            HtmlNodeCollection titleNodes = new HtmlNodeCollection(htmlRootNode.Clone());
            HtmlNodeCollection title1Nodes = new HtmlNodeCollection(htmlRootNode.Clone());
            HtmlNodeCollection title2Nodes = new HtmlNodeCollection(htmlRootNode.Clone());
            HtmlNodeCollection ftNoteRefnodes = new HtmlNodeCollection(htmlRootNode.Clone());
            string htmlTxt = htmlRootNode.InnerHtml;

            // Heading recognition. (An older "bold level-1 heading" strategy and an h1/h2/h3
            // tag strategy used to live here as commented-out code and have been dropped.)

            #region Option 1: headings identified by the class attribute of <p> nodes
            if (method == ReadMethod.TITLE_CLASS)
            {
                title1Nodes_init = htmlRootNode.SelectNodes(@"//p[@class=1]");
                title2Nodes_init = htmlRootNode.SelectNodes(@"//p[@class=2]");

                // SelectNodes returns null (not an empty collection) when nothing matches.
                if (title1Nodes_init != null)
                {
                    for (int i = 0; i < title1Nodes_init.Count; i++)
                    {
                        // Skip paragraphs that are empty once &nbsp; and whitespace are removed.
                        if (title1Nodes_init[i].InnerText.Replace("&nbsp;", "").Trim() != string.Empty)
                        {
                            str_title1List.Add(title1Nodes_init[i].InnerText.Trim().Replace("&nbsp;", " ").Replace("\r\n", ""));
                            title1Nodes.Add(title1Nodes_init[i]);
                        }
                    }
                }
                if (title2Nodes_init != null)
                {
                    for (int i = 0; i < title2Nodes_init.Count; i++)
                    {
                        if (title2Nodes_init[i].InnerText.Replace("&nbsp;", "").Trim() != string.Empty)
                        {
                            str_title2List.Add(title2Nodes_init[i].InnerText.Trim().Replace("&nbsp;", " ").Replace("\r\n", ""));
                            title2Nodes.Add(title2Nodes_init[i]);
                        }
                    }
                }
            }
            #endregion

            #region Option 2: headings identified by the style attribute of <span> tags
            else if (method == ReadMethod.TITLE_SPANSTYLE)
            {
                HtmlNodeCollection title1Nodes_tmp = new HtmlNodeCollection(htmlRootNode.Clone());

                #region Level-1 headings -> title1Nodes / str_title1List
                title1Nodes_init = htmlRootNode.SelectNodes(@"//p");
                if (title1Nodes_init != null)
                {
                    for (int i = 0; i < title1Nodes_init.Count; i++)
                    {
                        string str_style = title1Nodes_init[i].InnerHtml.Replace("\r\n", "");
                        bool condition = str_style.Contains(title1_select);
                        if (RecogOptions.title1_has_zitizihao)
                        {
                            // title1_select is split at its first ';' and both halves must
                            // appear somewhere in the paragraph's inner HTML.
                            string str_style_zihao = title1_select.Substring(0, title1_select.IndexOf(';'));
                            string str_style_ziti = title1_select.Substring(title1_select.IndexOf(';') + 1);
                            condition = str_style.Contains(str_style_zihao) && str_style.Contains(str_style_ziti);
                        }
                        if (condition)
                        {
                            // title1_child selects which tag must occur in the paragraph
                            // (itself included): 0 -> p, 1 -> b, 2 -> a.
                            foreach (var candidate in title1Nodes_init[i].DescendantsAndSelf())
                            {
                                if ((RecogOptions.title1_child == 0 && candidate.Name == "p") ||
                                    (RecogOptions.title1_child == 1 && candidate.Name == "b") ||
                                    (RecogOptions.title1_child == 2 && candidate.Name == "a"))
                                {
                                    title1Nodes_tmp.Add(title1Nodes_init[i]);
                                    break;
                                }
                            }
                        }
                    }

                    // Drop empty paragraphs and consecutive entries on the same source line.
                    for (int i = 0; i < title1Nodes_tmp.Count; i++)
                    {
                        if (title1Nodes_tmp[i].InnerText.Replace("&nbsp;", "").Trim() != string.Empty &&
                            (i == 0 || title1Nodes_tmp[i].Line != title1Nodes_tmp[i - 1].Line))
                        {
                            str_title1List.Add(title1Nodes_tmp[i].InnerText.Trim().Replace("&nbsp;", " ").Replace("\r\n", ""));
                            title1Nodes.Add(title1Nodes_tmp[i]);
                        }
                    }
                }
                #endregion

                #region Level-2 headings -> title2Nodes / str_title2List
                HtmlNodeCollection tempNodes = new HtmlNodeCollection(htmlRootNode.Clone());

                // Recognition method 1: paragraph text matches Patterns.title2_x_dot_x_XXX.
                if (RecogOptions.title2RecogMethod == 1)
                {
                    title2Nodes_init = htmlRootNode.SelectNodes(@"//p");
                    if (title2Nodes_init != null)
                    {
                        // Built once; the pattern does not change between paragraphs.
                        Regex title2Regex = new Regex(Patterns.title2_x_dot_x_XXX, RegexOptions.Multiline);
                        for (int i = 0; i < title2Nodes_init.Count; i++)
                        {
                            string paragraphText = title2Nodes_init[i].InnerText.Replace("&nbsp;", " ");
                            MatchCollection title2Matches = title2Regex.Matches(paragraphText);
                            if (title2Matches.Count == 0)
                            {
                                continue;
                            }

                            // Some documents contain "1 XXX"-style lines that are not level-2
                            // headings; a match containing a full stop is treated as body text.
                            if (title2Matches[0].Value.Contains("。"))
                            {
                                continue;
                            }

                            // title2_child selects the required tag: 0 -> p, 1 -> b, 2 -> a.
                            foreach (var candidate in title2Nodes_init[i].DescendantsAndSelf())
                            {
                                if ((RecogOptions.title2_child == 0 && candidate.Name == "p") ||
                                    (RecogOptions.title2_child == 1 && candidate.Name == "b") ||
                                    (RecogOptions.title2_child == 2 && candidate.Name == "a"))
                                {
                                    tempNodes.Add(title2Nodes_init[i]);
                                    break;
                                }
                            }
                        }
                    }
                }

                // Recognition method 0: a <span style=...> whose style contains title2_select.
                if (RecogOptions.title2RecogMethod == 0)
                {
                    title2Nodes_init = htmlRootNode.SelectNodes(@"//span[@style]");
                    if (title2Nodes_init != null)
                    {
                        for (int i = 0; i < title2Nodes_init.Count; i++)
                        {
                            string str_style = title2Nodes_init[i].Attributes["style"].Value.Replace("\r\n", "");
                            bool condition = str_style.Contains(title2_select);
                            if (RecogOptions.title2_has_zitizihao)
                            {
                                string str_style_zihao = title2_select.Substring(0, title2_select.IndexOf(';'));
                                string str_style_ziti = title2_select.Substring(title2_select.IndexOf(';') + 1);
                                condition = str_style.Contains(str_style_zihao) && str_style.Contains(str_style_ziti);
                            }
                            if (!condition)
                            {
                                continue;
                            }

                            // title2_child: 0 -> any parent, 1 -> parent must be <b>, 2 -> parent must be <a>.
                            bool parentMatches =
                                (RecogOptions.title2_child == 0) ||
                                (RecogOptions.title2_child == 1 && title2Nodes_init[i].ParentNode.Name == "b") ||
                                (RecogOptions.title2_child == 2 && title2Nodes_init[i].ParentNode.Name == "a");
                            if (!parentMatches)
                            {
                                continue;
                            }

                            // The heading node is the nearest enclosing <p>.
                            foreach (var ancestor in title2Nodes_init[i].AncestorsAndSelf())
                            {
                                if (ancestor.Name == "p")
                                {
                                    string headingText = ancestor.InnerText.Replace("&nbsp;", "").Replace("\r\n", "").Trim();

                                    // Some documents contain "1 XXX"-style lines that are not
                                    // level-2 headings; text with a full stop is treated as body.
                                    if (headingText.Length > 1 && !headingText.Contains("。"))
                                    {
                                        tempNodes.Add(ancestor);
                                    }
                                    break;
                                }
                            }
                        }
                    }
                }

                // Drop empty paragraphs and consecutive entries on the same source line.
                for (int i = 0; i < tempNodes.Count; i++)
                {
                    if (tempNodes[i].InnerText.Replace("&nbsp;", "").Trim() != String.Empty &&
                        (i == 0 || tempNodes[i].Line != tempNodes[i - 1].Line))
                    {
                        title2Nodes.Add(tempNodes[i]);
                        string tmp = tempNodes[i].InnerText.Replace("\r\n", "").Replace("&nbsp;", " ");
                        str_title2List.Add(tmp.Trim());
                    }
                }
                #endregion
            }
            #endregion

            #region Merge level-1 and level-2 headings in document order -> titleNodes / str_titleList
            foreach (var node in title1Nodes)
            {
                titleNodes.Add(node);
            }
            foreach (var node in title2Nodes)
            {
                titleNodes.Add(node);
            }

            // In-place exchange sort by source line. Kept as written (rather than a library
            // sort) so the relative order of nodes sharing a line number stays unchanged.
            for (int i = 0; i < titleNodes.Count; i++)
            {
                for (int j = i; j < titleNodes.Count; j++)
                {
                    if (titleNodes[i].Line > titleNodes[j].Line)
                    {
                        var temp = titleNodes[i];
                        titleNodes[i] = titleNodes[j];
                        titleNodes[j] = temp;
                    }
                }
            }

            for (int i = 0; i < titleNodes.Count; i++)
            {
                string tmp = titleNodes[i].InnerText.Replace("&nbsp;", " ").Replace("\r\n", "");
                str_titleList.Add(tmp.Trim());
            }
            #endregion

            try
            {
                #region Collect footnote divs (every <div> carrying an id attribute) -> ftNoteRefnodes
                foreach (var node in htmlRootNode.Descendants())
                {
                    if (node.Name == "div" && node.HasAttributes)
                    {
                        string id = node.GetAttributeValue("id", "notfound");
                        if (id != "notfound")
                        {
                            ftNoteRefnodes.Add(node);
                        }
                    }
                }
                #endregion

                #region Strip the footnote divs and the html/body tags from htmlTxt
                for (int i = 0; i < ftNoteRefnodes.Count; i++)
                {
                    htmlTxt = htmlTxt.Replace(ftNoteRefnodes[i].OuterHtml, "");
                }
                htmlTxt = htmlTxt.Replace("</body>", "").Replace("</html>", "").Replace("<body>", "").Replace("<html>", "");
                #endregion

                #region Rewrite image paths
                Regex reg = new Regex(Patterns.imageSrc);
                MatchCollection matches = reg.Matches(htmlTxt);
                if (matches.Count == 0)
                {
                    retInfo.picResult = "无匹配图片";
                }
                else
                {
                    // imageFilePath is inserted between capture groups 1 and 2 of the pattern.
                    htmlTxt = reg.Replace(htmlTxt, "${1}" + imageFilePath + "${2}");
                    retInfo.picResult = "识别到图片数目:" + matches.Count.ToString();
                }
                #endregion

                #region Content directly under a level-1 heading -> dic_title1Content (with footnotes) / dic_title1Content_tmp (without)
                Dictionary<int, string> dic_title1Content_tmp = new Dictionary<int, string>();

                // A level-1 heading owns direct content when the heading that follows it in
                // document order is also level-1 (i.e. there is no level-2 heading in between).
                for (int i = 0; i < titleNodes.Count; i++)
                {
                    for (int j = 0; j < title1Nodes.Count - 1; j++)
                    {
                        if (titleNodes[i].Line == title1Nodes[j].Line &&
                            i < titleNodes.Count - 1 &&
                            titleNodes[i + 1].Line == title1Nodes[j + 1].Line)
                        {
                            int start = htmlTxt.IndexOf(title1Nodes[j].OuterHtml);
                            int end = htmlTxt.IndexOf(title1Nodes[j + 1].OuterHtml, start + 1);
                            if (start != -1 && end > start)
                            {
                                dic_title1Content_tmp.Add(j, htmlTxt.Substring(start, end - start));
                                break;
                            }
                            else
                            {
                                throw new Exception("title1 content提取出错");
                            }
                        }
                    }
                }

                // If the very last heading is a level-1 heading, its content runs to the end.
                for (int i = 0; i < title1Nodes.Count; i++)
                {
                    if (titleNodes.Last().Line == title1Nodes[i].Line)
                    {
                        int start = htmlTxt.IndexOf(title1Nodes.Last().OuterHtml);
                        if (start != -1)
                        {
                            dic_title1Content_tmp.Add(title1Nodes.Count - 1, htmlTxt.Substring(start));
                            break;
                        }
                        else
                        {
                            throw new Exception("title1 last content提取出错");
                        }
                    }
                }

                // Append every footnote div that the slice links to via href="#_<id>".
                foreach (var pair in dic_title1Content_tmp)
                {
                    string v = pair.Value;
                    foreach (var ftnref in ftNoteRefnodes)
                    {
                        if (pair.Value.Contains("href=\"#_" + ftnref.Attributes["id"].Value + "\""))
                        {
                            v = v + ftnref.OuterHtml;
                        }
                    }
                    dic_title1Content.Add(pair.Key, v);
                }
                #endregion

                #region Remove level-1 headings and their direct content from htmlTxt
                for (int i = 0; i < title1Nodes.Count; i++)
                {
                    if (dic_title1Content_tmp.Count != 0) // some level-1 heading has direct content
                    {
                        foreach (var pair in dic_title1Content_tmp)
                        {
                            htmlTxt = htmlTxt.Replace(pair.Value, "");
                            if (i != pair.Key)
                            {
                                htmlTxt = htmlTxt.Replace(title1Nodes[i].OuterHtml, "");
                            }
                        }
                    }
                    else // no level-1 heading has direct content
                    {
                        htmlTxt = htmlTxt.Replace(title1Nodes[i].OuterHtml, "");
                    }
                }
                #endregion

                #region Content under each level-2 heading -> contentNodes / str_contentList
                int index_PartStart = 0, index_PartEnd = 0;
                for (int i = 0; i < title2Nodes.Count; i++)
                {
                    HtmlAgilityPack.HtmlDocument contentNodeDoc = new HtmlAgilityPack.HtmlDocument();
                    string str_content;

                    // Resume just past the previous heading; the first search must start at
                    // offset 0, otherwise a heading at the very start of htmlTxt is never found.
                    int searchFrom = (i == 0) ? 0 : index_PartStart + 1;

                    if (i < title2Nodes.Count - 1)
                    {
                        index_PartStart = htmlTxt.IndexOf(title2Nodes[i].OuterHtml, searchFrom);
                        index_PartEnd = htmlTxt.IndexOf(title2Nodes[i + 1].OuterHtml, index_PartStart + 1);
                        if (index_PartStart != -1 && index_PartEnd > index_PartStart)
                        {
                            str_content = htmlTxt.Substring(index_PartStart, index_PartEnd - index_PartStart);
                        }
                        else
                        {
                            throw new Exception("提取出错");
                        }
                    }
                    else
                    {
                        // Last level-2 heading: content runs to the end of htmlTxt.
                        index_PartStart = htmlTxt.IndexOf(title2Nodes[title2Nodes.Count - 1].OuterHtml, searchFrom);
                        if (index_PartStart != -1)
                        {
                            str_content = htmlTxt.Substring(index_PartStart);
                        }
                        else
                        {
                            throw new Exception("提取出错");
                        }
                    }

                    // Append every footnote div that the slice links to via href="#_<id>".
                    foreach (var ftnref in ftNoteRefnodes)
                    {
                        if (str_content.Contains("href=\"#_" + ftnref.Attributes["id"].Value + "\""))
                        {
                            str_content = str_content + ftnref.OuterHtml;
                        }
                    }
                    contentNodeDoc.LoadHtml(str_content);
                    contentNodes.Add(contentNodeDoc.DocumentNode);
                    str_contentList.Add(contentNodes[i].OuterHtml);

                    // Debug dump of each section for manual inspection.
                    // NOTE(review): a failure here (e.g. missing directory) is caught below and
                    // stops extraction of the remaining sections - confirm this is intended.
                    System.IO.File.WriteAllText(@"../../../htmlRcgTest/" + i + @".html", str_contentList[i]);
                }
                #endregion
                // Breakpoint spot: inspect str_contentList / str_titleList / str_title1List /
                // str_title2List / dic_title1Content in the Locals window and check that
                //   1. the counts are correct;
                //   2. the contents are correct and complete (level-2 content can be viewed in
                //      the dumped files "../../../htmlRcgTest/" + i + ".html").
            }
            catch (Exception err)
            {
                Console.WriteLine(err.Message);
            }

            #region Write level-1 / level-2 headings and their content to the database
            try
            {
                // NOTE(review): no matching close/disconnect call is visible here - confirm
                // whether SQLUtils manages the connection lifetime itself.
                SQLUtils sqlUtils = SQLUtils.getInstance();
                sqlUtils.makeConnect();
                ConventionRow tmp_rootConvention = rootConvention;
                for (int i = 0; i < title1Nodes.Count; i++)
                {
                    ConventionRow tempRow1;
                    string title1Content;
                    if (dic_title1Content.TryGetValue(i, out title1Content))
                    {
                        // Level-1 heading with direct content and no level-2 headings.
                        tempRow1 = new ConventionRow(rootConvention, str_title1List[i],
                                                     i + 1, ConventionOptions.CATEGORY.IS_CONTENT, title1Content);
                    }
                    else
                    {
                        // Level-1 heading without direct content; it groups level-2 headings.
                        tempRow1 = new ConventionRow(rootConvention, str_title1List[i],
                                                     i + 1, ConventionOptions.CATEGORY.IS_CATEGORY);
                    }
                    sqlUtils.writeRow_local(tempRow1);
                    retInfo.title1Guids.Add(tempRow1.Guid);

                    bool isLastTitle1 = i == title1Nodes.Count - 1;

                    // k numbers the level-2 rows written under this level-1 heading, from 1.
                    for (int j = 0, k = 0; j < title2Nodes.Count; j++)
                    {
                        tmp_rootConvention = tempRow1;

                        // A level-2 heading belongs to this level-1 heading when it lies after
                        // it and before the next level-1 heading (if there is one).
                        bool belongsToTitle1 =
                            title2Nodes[j].Line > title1Nodes[i].Line &&
                            (isLastTitle1 || title2Nodes[j].Line < title1Nodes[i + 1].Line);
                        if (belongsToTitle1)
                        {
                            ConventionRow tempRow2 = new ConventionRow(tmp_rootConvention, str_title2List[j],
                                                                       ++k, ConventionOptions.CATEGORY.IS_CONTENT, str_contentList[j]);
                            sqlUtils.writeRow_local(tempRow2);
                        }
                    }
                }
                retInfo.title1s = str_title1List;
                retInfo.title2s = str_title2List;
                retInfo.title2Contents = str_contentList;
                retInfo.titles = str_titleList;
                retInfo.title1ContentsNum = dic_title1Content.Count;
            }
            catch (Exception err)
            {
                Console.WriteLine(err.Message);
                retInfo.errorInfo = "录入失败。错误原因:" + err.Message;
            }
            #endregion

            return retInfo;
        }
示例#40
0
        /// <summary>
        /// Formats the sections introduced by each <c>h3</c> header under <paramref name="docNode"/>
        /// as text built from the <c>TdPair</c> / <c>TdSingle</c> format strings.
        /// </summary>
        /// <remarks>
        /// For every header a <c>TdPair</c> line with the header text is emitted, followed by
        /// section-specific cells taken from the node that immediately follows the header:
        /// DEMOGRAPHICS emits the last child of the first <c>tr</c>; CERTIFICATION emits every
        /// <c>td</c> of every <c>tr</c>; SUBSTANTIATED FINDINGS emits the node after the section.
        /// Processing stops at the first header that is none of these three.
        /// Missing nodes produce empty cells instead of a NullReferenceException.
        /// </remarks>
        /// <param name="docNode">Node whose document is searched for <c>h3</c> headers.</param>
        /// <returns>The formatted text; empty when no <c>h3</c> header is found.</returns>
        private string GetNodeValues(HtmlNode docNode)
        {
            StringBuilder builder = new StringBuilder();

            // SelectNodes returns null (not an empty collection) when nothing matches.
            HtmlNodeCollection headers = docNode.SelectNodes("//h3");
            if (headers == null)
            {
                return builder.ToString();
            }

            foreach (HtmlNode header in headers)
            {
                string headerText = header.InnerText;

                // The section belonging to a header is the node immediately after it.
                HtmlNode section = header.NextSibling;

                builder.AppendFormat(TdPair, headerText, String.Empty);
                builder.AppendLine();

                // handle demographics
                if (headerText.Contains("DEMOGRAPHICS"))
                {
                    HtmlNode firstRow = section != null ? section.SelectSingleNode("tr") : null;
                    HtmlNode nameCell = firstRow != null ? firstRow.LastChild : null;

                    builder.AppendFormat(TdSingle, "Name");
                    builder.AppendFormat(TdSingle, nameCell != null ? nameCell.InnerText : String.Empty);
                    builder.AppendLine();
                }
                // handle certification
                else if (headerText.Contains("CERTIFICATION"))
                {
                    HtmlNodeCollection rows = section != null ? section.SelectNodes("tr") : null;
                    if (rows != null)
                    {
                        foreach (HtmlNode row in rows)
                        {
                            HtmlNodeCollection cells = row.SelectNodes("td");
                            if (cells != null)
                            {
                                foreach (HtmlNode cell in cells)
                                {
                                    builder.AppendFormat(TdSingle, cell.InnerText);
                                }
                            }
                            builder.AppendLine();
                        }
                    }
                }
                // handle substantiated findings
                else if (headerText.Contains("SUBSTANTIATED FINDINGS"))
                {
                    HtmlNode dataNode = section != null ? section.NextSibling : null;

                    builder.AppendFormat(TdSingle, "Data");
                    builder.AppendFormat(TdSingle, dataNode != null ? dataNode.InnerText : String.Empty);
                    builder.AppendLine();
                }
                else
                {
                    // Unknown section: stop, leaving any remaining headers unprocessed.
                    break;
                }
            }

            return builder.ToString();
        }