/// <summary>
/// Collects the descendants of <paramref name="node"/> that carry <paramref name="className"/>,
/// walking the tree depth-first and stopping at the first match on each branch.
/// </summary>
/// <remarks>
/// A child carrying the "vcard" class is skipped together with its whole subtree.
/// A matching child is added as-is; its own descendants are not searched.
/// </remarks>
/// <param name="node">Node whose children are searched.</param>
/// <param name="className">Class name tested through <c>ContainsClassName</c>.</param>
/// <returns>A collection created with <paramref name="node"/> as its parent, holding the matches in traversal order.</returns>
public static HtmlNodeCollection SelectNodesFromClass(this HtmlNode node, string className)
{
    var matches = new HtmlNodeCollection(node);

    foreach (HtmlNode child in node.ChildNodes)
    {
        // Embedded cards are opaque: neither the card nor anything below it is considered.
        if (child.ContainsClassName("vcard"))
        {
            continue;
        }

        // A direct hit ends the search on this branch.
        if (child.ContainsClassName(className))
        {
            matches.Add(child);
            continue;
        }

        // No hit here, so look further down and merge whatever the subtree yields.
        foreach (HtmlNode nested in SelectNodesFromClass(child, className))
        {
            matches.Add(nested);
        }
    }

    return matches;
}
/// <summary>
/// Replaces the first-to-last <c>&lt;ruby&gt;</c> span of <paramref name="node"/>'s inner HTML with its
/// base text and records the line, the base text and the reading as footnote paragraphs.
/// </summary>
/// <remarks>
/// Four paragraphs are appended to <c>_footnotes</c> per call, with ids built from the node id plus the
/// suffixes L (line), K (base text), F (reading) and E (empty). A header paragraph is added first when
/// the footnote list is still empty.
/// </remarks>
/// <param name="node">Node whose inner HTML is scanned for ruby markup.</param>
/// <returns>The inner HTML with the ruby span replaced by the concatenated <c>&lt;rb&gt;</c> contents.</returns>
private string Furigana(HtmlNode node)
{
    if (_footnotes.Count == 0)
    {
        _footnotes.Add(HtmlNode.CreateNode("<p id=\"Lf0\">================Footnotes================</p>"));
    }

    string html = node.InnerHtml;

    // Greedy on purpose (as before): spans from the first <ruby> to the last </ruby>.
    string ruby = Regex.Match(html, "<ruby>(.*)</ruby>").Value;

    // Both patterns have exactly one capture group, so Groups[1] always exists on a match.
    string kanji = string.Concat(Regex.Matches(ruby, "<rb>(.*?)</rb>").Select(m => m.Groups[1].Value));
    string furigana = string.Concat(Regex.Matches(ruby, "<rt>(.*?)</rt>").Select(m => m.Groups[1].Value));

    // string.Replace throws ArgumentException for an empty search string, which is what a node
    // without ruby markup used to trigger. In that case the line is simply the unchanged HTML.
    string line = ruby.Length == 0 ? html : html.Replace(ruby, kanji);

    _footnotes.Add(HtmlNode.CreateNode($"<p id=\"{node.Id}L\">{line}</p>"));
    _footnotes.Add(HtmlNode.CreateNode($"<p id=\"{node.Id}K\">{kanji}</p>"));
    _footnotes.Add(HtmlNode.CreateNode($"<p id=\"{node.Id}F\">{furigana}</p>"));
    _footnotes.Add(HtmlNode.CreateNode($"<p id=\"{node.Id}E\"></p>"));

    return line;
}
/// <summary>
/// Console walkthrough of node insertion: InsertAfter, InsertBefore, PrependChild and PrependChildren
/// are applied to a small document and the body is printed after each step.
/// </summary>
/// <param name="args">Unused command-line arguments.</param>
static void Main(string[] args)
{
    var html = @"<body> <h1>This is <b>bold</b> heading</h1> <p>This is <u>underlined</u> paragraph</p> </body>";

    var document = new HtmlDocument();
    document.LoadHtml(html);
    var body = document.DocumentNode.SelectSingleNode("//body");

    // Reference node for the first two insertions (second child of the body).
    HtmlNode anchor = body.ChildNodes[1];
    HtmlNode afterAnchor = HtmlNode.CreateNode("<p> This is inserted after node paragraph");

    DisplayNode(body);

    body.InsertAfter(afterAnchor, anchor);
    Console.WriteLine("\n ******** Node inserted after first child ***************\n");
    DisplayNode(body);

    HtmlNode beforeAnchor = HtmlNode.CreateNode("<h1> This is inserted before node heading</h1>");
    body.InsertBefore(beforeAnchor, anchor);
    Console.WriteLine("\n ************ Node inserted after second child ********************\n");
    DisplayNode(body);

    Console.WriteLine("\n *************** Node inserted in the beginning ******************\n");
    HtmlNode leading = HtmlNode.CreateNode("<h1> This is added at the beginning</h1>");
    body.PrependChild(leading);
    DisplayNode(body);

    Console.WriteLine("\n ****************** Prepend Children method ********************\n");
    HtmlNode heading = HtmlNode.CreateNode("<h1>This is new heading</h1>");
    HtmlNode paragraph = HtmlNode.CreateNode("<p>This is new paragraph 1</p>");
    var batch = new HtmlNodeCollection(body) { heading, paragraph };
    body.PrependChildren(batch);
    DisplayNode(body);

    // Keep the console window open until the user presses Enter.
    Console.ReadLine();
}
/// <summary>
/// Verifies that <c>Add</c> parents the new child to the collection's owner and returns the child's index.
/// </summary>
/// <remarks>
/// The trailing <c>Add(null)</c> is kept as the final statement. Presumably the test carries an
/// expected-exception attribute declared outside this view — TODO confirm.
/// </remarks>
public void AddTest()
{
    HtmlElement root = new HtmlElement("root");
    HtmlNodeCollection target = new HtmlNodeCollection(root);
    HtmlElement child = new HtmlElement("child");

    int index = target.Add(child);

    Assert.AreEqual(root, child.Parent);
    // Expected value first, so a failure message reports expected/actual the right way round.
    Assert.AreEqual(0, index);

    target.Add(null);
}
/// <summary>
/// Builds the "Feat. ..." node list for a role-play: a leading text node followed, for each partner,
/// by a <c>span</c> holding the partner's name and a " - " separator text node.
/// </summary>
/// <remarks>
/// A separator is emitted after every partner, including the last one.
/// </remarks>
/// <param name="doc">Document used as factory for the new nodes.</param>
/// <param name="parent">Parent node handed to the returned collection.</param>
/// <param name="rp">Source of the partner list (<c>Partenaires</c>).</param>
/// <returns>The generated nodes, in display order.</returns>
private HtmlNodeCollection GenerateParticipantsNodes(HtmlDocument doc, HtmlNode parent, Rp rp)
{
    var nodes = new HtmlNodeCollection(parent);
    nodes.Add(doc.CreateTextNode("Feat. "));

    foreach (var partner in rp.Partenaires)
    {
        // The span's CSS class comes from the partner's group description.
        var span = doc.CreateElement("span");
        span.AddClass(partner.Groupe.GetDescription());
        span.AppendChild(doc.CreateTextNode(partner.Nom));

        nodes.Add(span);
        nodes.Add(doc.CreateTextNode(" - "));
    }

    return nodes;
}
/// <summary>
/// Parses <paramref name="content"/> and returns every <paramref name="htmlTag"/> element whose
/// <paramref name="attr"/> attribute contains <paramref name="attrValue"/> (case-insensitive substring match).
/// </summary>
/// <param name="content">HTML to parse. <c>null</c> yields a <c>null</c> result.</param>
/// <param name="htmlTag">Tag name; it is appended to "//" to form the XPath query.</param>
/// <param name="attr">Name of the attribute to inspect.</param>
/// <param name="attrValue">Text that must occur in the attribute value.</param>
/// <returns>
/// <c>null</c> when <paramref name="content"/> is <c>null</c> or the document has no root node;
/// otherwise a (possibly empty) collection of matching nodes.
/// </returns>
public static HtmlNodeCollection GetNodesWithTagAndAttribute(string content, string htmlTag, string attr, string attrValue)
{
    // Must be checked before parsing: LoadHtml rejects a null string, so the original
    // post-parse null check could never be reached.
    if (content == null)
    {
        return null;
    }

    var doc = new HtmlDocument();
    doc.LoadHtml(content);

    if (doc.DocumentNode == null)
    {
        return null;
    }

    var result = new HtmlNodeCollection(doc.DocumentNode);

    // SelectNodes reports "no match" as null rather than as an empty collection.
    var htmlNodes = doc.DocumentNode.SelectNodes("//" + htmlTag);
    if (htmlNodes == null)
    {
        return result;
    }

    foreach (var node in htmlNodes)
    {
        var attribute = node.Attributes[attr]?.Value;

        // A missing attribute leaves the lifted comparison false, so the node is skipped.
        if (attribute?.IndexOf(attrValue, StringComparison.OrdinalIgnoreCase) >= 0)
        {
            result.Add(node);
        }
    }

    return result;
}
/// <summary>
/// Creates a tokenizer over <paramref name="text"/>: stores the options, parses the HTML and
/// builds the token list from the document root.
/// </summary>
/// <param name="text">HTML source; a null value is reported through <c>Utils.ThrowException</c>.</param>
/// <param name="stemmer">Stemmer stored for later use.</param>
/// <param name="decodeTextBlocks">Stored as <c>mDecodeTextBlocks</c>.</param>
/// <param name="tokenizeTextBlocks">When true, text blocks are split further with a word/number regex tokenizer.</param>
/// <param name="applySkipRules">Stored as <c>mApplySkipRules</c>.</param>
public HtmlTokenizer(string text, IStemmer stemmer, bool decodeTextBlocks, bool tokenizeTextBlocks, bool applySkipRules)
{
    Utils.ThrowException(text == null ? new ArgumentNullException("text") : null);

    // Options are stored before parsing starts, ahead of the Configure call.
    mText = text;
    mStemmer = stemmer;
    mDecodeTextBlocks = decodeTextBlocks;
    mTokenizeTextBlocks = tokenizeTextBlocks;
    mApplySkipRules = applySkipRules;

    HtmlDocument document = new HtmlDocument();
    Configure(document);
    document.LoadHtml(text);

    // Token creation starts from a one-element collection holding the document root.
    HtmlNodeCollection roots = new HtmlNodeCollection(/*parentNode=*/ null);
    roots.Add(document.DocumentNode);

    RegexTokenizer blockTokenizer = null;
    if (mTokenizeTextBlocks)
    {
        // Matches either a word or a number; anything else is ignored.
        blockTokenizer = new RegexTokenizer();
        blockTokenizer.TokenRegex = string.Format("({0})|({1})", mWordRegexStr, mNumberRegexStr);
        blockTokenizer.IgnoreUnknownTokens = true;
    }

    CreateTokens(roots, blockTokenizer);
}
/// <summary>
/// Verifies <c>FindByAttributeName</c>: the default and non-recursive lookups only see top-level nodes,
/// while the recursive lookup also finds the attribute on a nested element.
/// </summary>
public void FindByAttributeNameTest()
{
    HtmlElement root = new HtmlElement("root");
    HtmlNodeCollection target = new HtmlNodeCollection(root);
    target.Add(new HtmlElement("first"));
    target.Add(new HtmlElement("second"));
    target.Add(new HtmlElement("third"));

    // "first" gets a nested child; "second" and "third" carry the attribute under test.
    ((HtmlElement)target[0]).Nodes.Add(new HtmlElement("secondchild"));
    ((HtmlElement)target[1]).Attributes.Add(new HtmlAttribute("firstattribute"));
    ((HtmlElement)target[1]).Attributes.Add(new HtmlAttribute("secondattribute"));
    ((HtmlElement)target[2]).Attributes.Add(new HtmlAttribute("firstattribute"));

    // Expected value goes first so failure messages read correctly.
    Assert.AreEqual(2, target.FindByAttributeName("firstattribute").Count);

    // Put the attribute on the nested element as well; only the recursive search should count it.
    ((HtmlElement)((HtmlElement)target[0]).Nodes[0]).Attributes.Add(new HtmlAttribute("firstattribute"));

    Assert.AreEqual(2, target.FindByAttributeName("firstattribute", false).Count);
    Assert.AreEqual(3, target.FindByAttributeName("firstattribute", true).Count);
}
/// <summary>
/// Parses the course table out of <c>course_result</c> and stores every course that is not yet
/// recorded for the current student (<c>stuid</c>).
/// </summary>
/// <remarks>
/// Each table row gets its own <c>jwContext</c>, and a new course is saved immediately.
/// NOTE(review): a missing table, an empty table or a row with fewer than 22 child nodes makes this
/// method throw — confirm callers handle that.
/// </remarks>
public void ProcessCourse()
{
    var document = new HtmlDocument();
    document.LoadHtml(course_result);

    HtmlNode table = document.DocumentNode.SelectSingleNode("//table[@class='table listTable']");
    HtmlNodeCollection tableChildren = table.ChildNodes;

    // Keep only the <tr> children of the table.
    var rows = new HtmlNodeCollection(table);
    foreach (HtmlNode child in tableChildren)
    {
        if (child.Name != "tr")
        {
            continue;
        }

        rows.Add(child);
    }

    // The first tr is not a course entry, so it is dropped.
    rows.RemoveAt(0);

    foreach (HtmlNode row in rows)
    {
        // Each row holds 27 child nodes according to the original author's note.
        var cells = new List<string>();
        foreach (HtmlNode cell in row.ChildNodes)
        {
            string text = cell.InnerText;
            text = text.Replace("\r", "");
            text = text.Replace("\n", "");
            text = text.Replace("\t", "");
            text = text.Replace(" ", "");
            cells.Add(text);
        }

        using (var context = new jwContext())
        {
            // Copied to a local because using cells[1] directly inside the LINQ expression fails.
            string lessonNum = cells[1];
            var existing = context.Courses.SingleOrDefault(c => c.StuID == stuid && c.LessonNum == lessonNum);

            // Only insert when the table has no record for this student and lesson yet.
            if (existing != null)
            {
                continue;
            }

            var added = new Course
            {
                StuID = stuid,
                LessonNum = cells[1],
                LessonName = cells[3],
                LessonType = cells[5],
                LearninType = cells[7],
                TeachingCollege = cells[9],
                Teacher = cells[11],
                Specialty = cells[13],
                Credit = cells[15],
                LessonHours = cells[17],
                Time = cells[19],
                Note = cells[21]
            };
            context.Courses.Add(added);
            context.SaveChanges();
        }
    }
}
/// <summary>
/// Appends one single-item <c>&lt;ul&gt;</c> node per test method to <paramref name="htmlNode"/>,
/// showing the colored result followed by the method name.
/// </summary>
/// <param name="methodList">Test methods to render.</param>
/// <param name="htmlNode">Collection that receives the generated nodes.</param>
private void GetMethods(IEnumerable<Test> methodList, HtmlNodeCollection htmlNode)
{
    foreach (Test method in methodList)
    {
        var coloredResult = CreateColoredResult(method.Result);

        // NOTE(review): the method name is inserted without HTML encoding — confirm names cannot contain markup.
        string markup = $"<ul><li>{coloredResult}<b>Method Name</b> <br> {method.MethodName}</li></ul>";

        htmlNode.Add(HtmlNode.CreateNode(markup));
    }
}
/// <summary>
/// Rebuilds every child of the main table node: its existing children are removed and replaced by
/// the dynamic columns followed by the new static columns of the section at the same index.
/// </summary>
/// <param name="outlist">
/// Column sections, indexed in parallel with the children of <c>mainTableNode</c>.
/// NOTE(review): the list must have at least as many entries as the table has child nodes.
/// </param>
private void FinalizeHtmlDocument(IList<ArelleColumnSection> outlist)
{
    var table = this.mainTableNode;

    for (int rowIndex = 0; rowIndex < table.ChildNodes.Count; rowIndex++)
    {
        var rowNode = table.ChildNodes[rowIndex];

        // Start from an empty row, then collect the replacement cells.
        rowNode.RemoveAllChildren();
        var cells = new HtmlNodeCollection(rowNode);

        var section = outlist[rowIndex];
        foreach (var column in section.DynamicColumns)
        {
            cells.Add(column);
        }

        foreach (var column in section.NewStaticColumns)
        {
            cells.Add(column);
        }

        rowNode.AppendChildren(cells);
    }
}
/// <summary>
/// Evaluates <paramref name="xpath"/> relative to <paramref name="node"/> and returns the matching nodes.
/// </summary>
/// <remarks>
/// When nothing matches, the returned collection is empty rather than <c>null</c>.
/// </remarks>
/// <param name="node">Context node for the query.</param>
/// <param name="xpath">XPath expression to evaluate.</param>
/// <returns>A parentless collection containing every match (possibly none).</returns>
public static HtmlNodeCollection SelectNodesEx(this HtmlNode node, string xpath)
{
    var navigator = new HtmlNodeNavigator(node.OwnerDocument, node);
    XPathNodeIterator cursor = navigator.Select(xpath);

    var matches = new HtmlNodeCollection(null);
    while (cursor.MoveNext())
    {
        // Each iterator position is itself a navigator positioned on the matched node.
        var position = (HtmlNodeNavigator)cursor.Current;
        matches.Add(position.CurrentNode);
    }

    return matches;
}
/// <summary>
/// Returns the direct children of <paramref name="node"/> that are elements, leaving out
/// children of any other node type.
/// </summary>
/// <param name="node">Node whose children are filtered.</param>
/// <returns>A collection created with <paramref name="node"/> as its parent, holding the element children in order.</returns>
public static HtmlNodeCollection ChildElements(this HtmlNode node)
{
    var elements = new HtmlNodeCollection(node);

    foreach (HtmlNode candidate in node.ChildNodes)
    {
        if (candidate.NodeType != HtmlNodeType.Element)
        {
            continue;
        }

        elements.Add(candidate);
    }

    return elements;
}
/// <summary>
/// Scan must download, extract and forward an article for every header level when the header is
/// accepted and the extracted body text contains a keyword.
/// </summary>
public void Scan_sends_article_for_analysis_if_keyword_found()
{
    // Scan recurses over four header levels (h1, h2, h3 and h4);
    // every level returns the single header prepared below.
    const int headerLevels = 4;
    var baseUrlInTest = "http://madeupnews.com";

    // Arrange: one header that the validator accepts.
    var header = new HtmlNode(HtmlNodeType.Element, new HtmlDocument(), 0);
    header.InnerHtml = "Some valid article header";
    var headerList = new HtmlNodeCollection(null);
    headerList.Add(header);

    var logger = new Mock<ILogger<Tracker>>();

    var spider = new Mock<ISpider>();
    spider.Setup(s => s.LoadPage(It.IsAny<string>()));
    spider.Setup(s => s.GetHeadersOfSize(It.IsAny<HtmlDocument>(), It.IsAny<int>())).Returns(headerList);
    spider.Setup(s => s.DownloadArticleByHeader(It.IsAny<string>(), It.IsAny<HtmlNode>())).Returns(("", new HtmlDocument()));

    var pipeline = new Mock<IPipeline>();
    pipeline.Setup(p => p.SendForAnalysis(It.IsAny<Article>()));

    var validator = new Mock<IValidator>();
    validator.Setup(v => v.ConsideredArticleHeader(It.IsAny<string>())).Returns(true);

    // The body text mentions the default keyword, which applies because no KEYWORDS env var is set.
    var extractedBodyText = @"this body text was extracted from an html document and it contains the default keyword Sverige which we know is a keyword since no env var KEYWORDS was set";
    var extractor = new Mock<IExtractor>();
    extractor.Setup(e => e.ExtractBodyTextFromArticleDocument(It.IsAny<HtmlDocument>())).Returns(extractedBodyText);

    var tracker = new Tracker(pipeline.Object, spider.Object, logger.Object, validator.Object, extractor.Object);

    // Act
    tracker.Scan(baseUrlInTest, new List<Article>());

    // Assert: every stage ran exactly once per header level.
    spider.Verify(s => s.LoadPage(baseUrlInTest), Times.Exactly(headerLevels));
    spider.Verify(s => s.GetHeadersOfSize(It.IsAny<HtmlDocument>(), It.IsAny<int>()), Times.Exactly(headerLevels));
    spider.Verify(s => s.DownloadArticleByHeader(It.IsAny<string>(), It.IsAny<HtmlNode>()), Times.Exactly(headerLevels));
    extractor.Verify(e => e.ExtractBodyTextFromArticleDocument(It.IsAny<HtmlDocument>()), Times.Exactly(headerLevels));
    pipeline.Verify(p => p.SendForAnalysis(It.IsAny<Article>()), Times.Exactly(headerLevels));
}
/// <summary>
/// Renders the login page with its alert box switched to a visible success state and a
/// "please login" message appended to it.
/// </summary>
/// <returns>The outer HTML of the whole modified document.</returns>
public static string SuccessSignup()
{
    // Selects the first element whose class list contains the whole word "alert".
    const string alertXPath = "//*[contains(concat(\" \", normalize-space(@class), \" \"), \" alert \")]";

    var page = new HtmlDocument();
    page.LoadHtml(Resources.header + Resources.login);

    HtmlNode alertBox = page.DocumentNode.SelectSingleNode(alertXPath);
    alertBox.AddClass("alert-success in");
    alertBox.RemoveClass("hidden");

    var content = new HtmlNodeCollection(alertBox)
    {
        HtmlNode.CreateNode("<p><strong>Success!</strong> Please login below</p>")
    };
    alertBox.AppendChildren(content);

    return page.DocumentNode.OuterHtml;
}
/// <summary>
/// Downloads the case table from https://ncov.moh.gov.vn/ and re-renders it as a bordered HTML table
/// with a running number and a fatality rate per row.
/// </summary>
/// <remarks>
/// Best effort: any exception (network failure, changed page layout, unparsable number) is swallowed
/// and reported as an empty string. A non-success HTTP status yields the table with only its header.
/// </remarks>
/// <returns>The outer HTML of the generated table, or an empty string on failure.</returns>
public static async Task<string> GetVietnamCoronaData()
{
    try
    {
        HtmlNode coronaTbl = HtmlNode.CreateNode("<table></table>");
        coronaTbl.Attributes.Add("class", "table table-bordered");
        coronaTbl.AppendChild(HtmlNode.CreateNode("<thead><tr><th>No</th><th>Tỉnh/Thành phố</th><th>Số ca mắc</th><th>Phục hồi</th><th>Tử vong</th><th>Tỉ lệ tử vong</th></tr></thead>"));

        HtmlNode coronatbody = HtmlNode.CreateNode("<tbody></tbody>");

        // The response owns the network stream, so it is disposed as soon as the body has been read.
        using (HttpResponseMessage response = await client.GetAsync("https://ncov.moh.gov.vn/"))
        {
            if (response.IsSuccessStatusCode)
            {
                byte[] bytecontentArr = await response.Content.ReadAsByteArrayAsync();
                string htmlpage = Unzip(bytecontentArr);

                var htmlDoc = new HtmlDocument();
                htmlDoc.LoadHtml(htmlpage);

                // Absolute path into the page layout; a layout change ends up in the catch block below.
                HtmlNode dataTbl = htmlDoc.DocumentNode.SelectSingleNode("/html/body/div[1]/div/div/div/div/div[2]/div/div/section[2]/div/div[1]/table");
                HtmlNodeCollection trColl = dataTbl.SelectNodes("tbody/tr");

                int count = 1;
                foreach (HtmlNode tr in trColl)
                {
                    HtmlNodeCollection tdData = tr.SelectNodes("td");
                    HtmlNode newTr = HtmlNode.CreateNode("<tr></tr>");
                    HtmlNodeCollection tdColl = new HtmlNodeCollection(newTr);

                    // Source columns used: 0 = province, 1 = cases, 3 = recovered, 4 = deaths.
                    string tinh = tdData[0].InnerText;
                    int soCaMac = int.Parse(tdData[1].InnerText);
                    int phuchoi = int.Parse(tdData[3].InnerText);
                    int tuvong = int.Parse(tdData[4].InnerText);

                    // A row without cases used to produce "NaN%" (or "∞%"); report 0 instead.
                    double rate = soCaMac == 0 ? 0 : Math.Round(1.0 * tuvong / soCaMac * 100, 2);

                    tdColl.Add(HtmlNode.CreateNode($"<td>{count++}</td>"));
                    tdColl.Add(HtmlNode.CreateNode($"<td>{tinh}</td>"));
                    tdColl.Add(HtmlNode.CreateNode($"<td>{soCaMac.ToString()}</td>"));
                    tdColl.Add(HtmlNode.CreateNode($"<td>{phuchoi.ToString()}</td>"));
                    tdColl.Add(HtmlNode.CreateNode($"<td>{tuvong.ToString()}</td>"));
                    tdColl.Add(HtmlNode.CreateNode($"<td>{rate}%</td>"));

                    newTr.AppendChildren(tdColl);
                    coronatbody.AppendChild(newTr);
                }

                coronaTbl.AppendChild(coronatbody);
            }
        }

        return coronaTbl.OuterHtml;
    }
    catch (Exception)
    {
        // Deliberate best effort: callers get an empty string instead of an exception.
        return "";
    }
}
/// <summary>
/// Renders the login page with its alert box switched to a visible error state and a
/// "credentials not recognized" message, including a password-reset link, appended to it.
/// </summary>
/// <returns>The outer HTML of the whole modified document.</returns>
public static string FailLogin()
{
    // Selects the first element whose class list contains the whole word "alert".
    const string alertXPath = "//*[contains(concat(\" \", normalize-space(@class), \" \"), \" alert \")]";

    var page = new HtmlDocument();
    page.LoadHtml(Resources.header + Resources.login);

    HtmlNode alertBox = page.DocumentNode.SelectSingleNode(alertXPath);
    alertBox.AddClass("alert-danger in");
    alertBox.RemoveClass("hidden");

    string markup = "<p><strong>Uh-Oh...</strong> Looks like we didn't recognize that Username/Password pair."
        + " Try again or <a data-toggle=\"modal\" href=\"#resetPassword\">Reset your Password</a></p>";

    var content = new HtmlNodeCollection(alertBox)
    {
        HtmlNode.CreateNode(markup)
    };
    alertBox.AppendChildren(content);

    return page.DocumentNode.OuterHtml;
}
/// <summary>
/// Evaluates an XPath expression with this node as context and returns the matching nodes.
/// </summary>
/// <param name="xpath">The XPath expression to evaluate.</param>
/// <returns>
/// An <see cref="HtmlNodeCollection"/> holding every node matched by the expression,
/// or <c>null</c> when the expression matches nothing.
/// </returns>
public HtmlNodeCollection SelectNodes(string xpath)
{
    var matches = new HtmlNodeCollection(null);

    XPathNodeIterator cursor = new HtmlNodeNavigator(OwnerDocument, this).Select(xpath);
    while (cursor.MoveNext())
    {
        // The iterator exposes each hit as a navigator positioned on the matched node.
        var position = (HtmlNodeNavigator)cursor.Current;
        matches.Add(position.CurrentNode);
    }

    // "No match" is reported as null, never as an empty collection.
    return matches.Count == 0 ? null : matches;
}
/// <summary>
/// Renders the login page (as produced by <c>LoginManager.Login()</c>) with its alert box switched to a
/// visible success state and a "password reset" message appended to it.
/// </summary>
/// <returns>The outer HTML of the whole modified document.</returns>
public static string SuccessResetPassword()
{
    // Selects the first element whose class list contains the whole word "alert".
    const string alertXPath = "//*[contains(concat(\" \", normalize-space(@class), \" \"), \" alert \")]";

    var page = new HtmlDocument();
    page.LoadHtml(LoginManager.Login());

    HtmlNode alertBox = page.DocumentNode.SelectSingleNode(alertXPath);
    alertBox.AddClass("alert-success");
    alertBox.AddClass("in");
    alertBox.RemoveClass("hidden");

    var content = new HtmlNodeCollection(alertBox)
    {
        HtmlNode.CreateNode("<p><strong>Password Reset</strong> Please login below with your new password</p>")
    };
    alertBox.AppendChildren(content);

    return page.DocumentNode.OuterHtml;
}
/// <summary>
/// Walks PTT listing pages starting at <paramref name="target"/>, appending every <c>r-ent</c> entry to
/// <paramref name="htmlNodes"/> until <paramref name="targetCount"/> entries were collected or no
/// further page is linked.
/// </summary>
/// <remarks>
/// Changes compared with the previous recursive version:
/// the entries of the final page (the one without a follow-up link) are now collected instead of dropped;
/// a page without entries or without a pager no longer causes a null-reference failure;
/// pages are followed in a loop, so long traversals do not deepen the call stack.
/// </remarks>
/// <param name="target">
/// For the first request (<paramref name="count"/> is 0): a board name, or a path containing "search"
/// that is requested as-is. Otherwise a ready-to-request relative URL.
/// </param>
/// <param name="count">Number of entries collected so far; 0 marks the first request.</param>
/// <param name="targetCount">Stop once this many entries were collected; <c>null</c> means no limit.</param>
/// <param name="htmlNodes">Accumulator that receives the entries; it is also the return value.</param>
/// <returns><paramref name="htmlNodes"/> with the collected entries appended.</returns>
public HtmlNodeCollection TraversalPtt(string target, int count, int? targetCount, HtmlNodeCollection htmlNodes)
{
    // Only a first request for a plain board name needs the index URL built around it.
    string path = (count == 0 && !target.Contains("search"))
        ? $"bbs/{target}/index.html"
        : target;

    while (true)
    {
        var htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(RequestPtt(path));

        // SelectNodes reports "no match" as null.
        var infos = htmlDoc.DocumentNode.SelectNodes("//div[@class='r-ent']");
        if (infos != null)
        {
            count += infos.Count;

            // The downloaded entries come in reverse order, so flip them before storing.
            foreach (var info in infos.Reverse())
            {
                htmlNodes.Add(info);
            }
        }

        // A null targetCount makes this lifted comparison false, i.e. "no limit".
        if (count >= targetCount)
        {
            return htmlNodes;
        }

        // The follow-up link is the fourth child node of the paging button group.
        var pager = htmlDoc.DocumentNode.SelectSingleNode("//div[@class='btn-group btn-group-paging']");
        if (pager == null || pager.ChildNodes.Count <= 3)
        {
            return htmlNodes;
        }

        var nextLink = pager.ChildNodes[3];

        // A link with a single attribute has no target: there is no further page.
        if (nextLink.Attributes.Count == 1)
        {
            return htmlNodes;
        }

        path = nextLink.Attributes[1].Value;
    }
}
/// <summary>
/// Scan must still visit every header level when body-text extraction throws, but must never
/// forward an article for analysis in that case.
/// </summary>
public void Scan_doesnt_send_article_for_analysis_if_text_extraction_fails()
{
    // Scan recurses over four header levels (h1, h2, h3 and h4);
    // every level returns the single header prepared below.
    const int headerLevels = 4;
    var baseUrlInTest = "http://madeupnews.com";

    // Arrange: one header that the validator accepts.
    var header = new HtmlNode(HtmlNodeType.Element, new HtmlDocument(), 0);
    header.InnerHtml = "Some valid article header";
    var headerList = new HtmlNodeCollection(null);
    headerList.Add(header);

    var logger = new Mock<ILogger<Tracker>>();

    var spider = new Mock<ISpider>();
    spider.Setup(s => s.LoadPage(It.IsAny<string>()));
    spider.Setup(s => s.GetHeadersOfSize(It.IsAny<HtmlDocument>(), It.IsAny<int>())).Returns(headerList);
    spider.Setup(s => s.DownloadArticleByHeader(It.IsAny<string>(), It.IsAny<HtmlNode>())).Returns(("", new HtmlDocument()));

    var pipeline = new Mock<IPipeline>();
    pipeline.Setup(p => p.SendForAnalysis(It.IsAny<Article>()));

    var validator = new Mock<IValidator>();
    validator.Setup(v => v.ConsideredArticleHeader(It.IsAny<string>())).Returns(true);

    // Extraction fails for every article.
    var extractor = new Mock<IExtractor>();
    extractor.Setup(e => e.ExtractBodyTextFromArticleDocument(It.IsAny<HtmlDocument>())).Throws(new Exception("something bad happend"));

    var tracker = new Tracker(pipeline.Object, spider.Object, logger.Object, validator.Object, extractor.Object);

    // Act
    tracker.Scan(baseUrlInTest, new List<Article>());

    // Assert: download and extraction were attempted per level, nothing reached the pipeline.
    spider.Verify(s => s.LoadPage(baseUrlInTest), Times.Exactly(headerLevels));
    spider.Verify(s => s.GetHeadersOfSize(It.IsAny<HtmlDocument>(), It.IsAny<int>()), Times.Exactly(headerLevels));
    spider.Verify(s => s.DownloadArticleByHeader(It.IsAny<string>(), It.IsAny<HtmlNode>()), Times.Exactly(headerLevels));
    extractor.Verify(e => e.ExtractBodyTextFromArticleDocument(It.IsAny<HtmlDocument>()), Times.Exactly(headerLevels));
    pipeline.Verify(p => p.SendForAnalysis(It.IsAny<Article>()), Times.Exactly(0));
}
/// <summary>
/// Appends every node of <paramref name="other"/> to <paramref name="collection"/>.
/// </summary>
/// <param name="collection">Collection to extend; may be <c>null</c>.</param>
/// <param name="other">Nodes to append; may be <c>null</c>.</param>
/// <returns>
/// <paramref name="collection"/> after the append. When one argument is <c>null</c> the other one is
/// returned unchanged (which is <c>null</c> when both are).
/// </returns>
public static HtmlNodeCollection AddRange(this HtmlNodeCollection collection, HtmlNodeCollection other)
{
    // With either side missing there is nothing to merge: hand back whichever one exists.
    if (collection == null || other == null)
    {
        return collection ?? other;
    }

    foreach (HtmlNode item in other)
    {
        collection.Add(item);
    }

    return collection;
}
/// <summary>
/// Filters <paramref name="nodes"/> down to those whose <c>class</c> attribute contains
/// <paramref name="classname"/> (case-insensitive substring match).
/// </summary>
/// <param name="classname">Text that must occur in the class attribute value.</param>
/// <param name="nodes">Candidate nodes; <c>null</c> yields an empty result.</param>
/// <returns>A new collection, parented to the root of a fresh empty document, holding the matches.</returns>
private static HtmlNodeCollection FindClassNameNodes(string classname, HtmlNodeCollection nodes)
{
    var matches = new HtmlNodeCollection(new HtmlDocument().DocumentNode);

    if (nodes == null)
    {
        return matches;
    }

    foreach (HtmlNode candidate in nodes)
    {
        string classAttribute = candidate.Attributes["class"]?.Value;

        // Nodes without a class attribute can never match.
        if (classAttribute == null)
        {
            continue;
        }

        if (classAttribute.IndexOf(classname, StringComparison.OrdinalIgnoreCase) < 0)
        {
            continue;
        }

        matches.Add(candidate);
    }

    return matches;
}
/// <summary>
/// Minimal stand-in for XPath selection that understands exactly one expression, <c>comment()</c>,
/// and returns the direct comment children of <paramref name="node"/>.
/// </summary>
/// <param name="node">Node whose children are inspected.</param>
/// <param name="xpath">Must be the literal string "comment()".</param>
/// <returns>A collection, parented to <paramref name="node"/>, of its comment children (possibly empty).</returns>
/// <exception cref="NotSupportedException">Thrown for any expression other than "comment()".</exception>
public static HtmlNodeCollection SelectNodes(this HtmlNode node, String xpath)
{
    if (!xpath.Equals("comment()"))
    {
        throw new NotSupportedException("Only the XPath expressions required by dotNetRDF code are supported by this method");
    }

    var comments = new HtmlNodeCollection(node);
    foreach (HtmlNode child in node.ChildNodes)
    {
        if (child.NodeType != HtmlNodeType.Comment)
        {
            continue;
        }

        comments.Add(child);
    }

    return comments;
}
/// <summary>
/// Loads an hOCR file and feeds the top-level <c>div</c> children of its body into <c>ParseNodes</c>,
/// filling <paramref name="hOrcDoc"/>.
/// </summary>
/// <param name="hOrcDoc">Document object that receives the parsed content; stored in <c>_hDoc</c>.</param>
/// <param name="hOcrFile">Path of the hOCR file, read as UTF-8.</param>
/// <param name="append">Not used by this method.</param>
/// <returns>The populated <paramref name="hOrcDoc"/>.</returns>
/// <exception cref="FileNotFoundException">
/// Thrown when <paramref name="hOcrFile"/> does not exist. This is a subclass of the plain
/// <see cref="Exception"/> thrown previously and carries the same message.
/// </exception>
public HDocument ParseHocr(HDocument hOrcDoc, string hOcrFile, bool append)
{
    _hDoc = hOrcDoc;

    if (_doc == null)
    {
        _doc = new HtmlDocument();
    }

    _hOcrFilePath = hOcrFile;

    if (!File.Exists(hOcrFile))
    {
        throw new FileNotFoundException("hocr file not found", hOcrFile);
    }

    // Reset the parser cursor before reading a new file.
    _currentPage = null;
    _currentPara = null;
    _currentLine = null;

    _doc.Load(hOcrFile, Encoding.UTF8);

    HtmlNode body = _doc.DocumentNode.SelectNodes("//body")[0];

    // Issue #1 (reported by Ryan-George): only the direct div children of the body are parsed.
    // The former unused document-wide "//div" query was dropped. The name comparison is ordinal,
    // so it does not depend on the current culture the way ToLower() does.
    HtmlNodeCollection nodes = new HtmlNodeCollection(null);
    foreach (HtmlNode child in body.ChildNodes)
    {
        if (string.Equals(child.Name, "div", StringComparison.OrdinalIgnoreCase))
        {
            nodes.Add(child);
        }
    }

    _hDoc.ClassName = "body";
    ParseNodes(nodes);

    return _hDoc;
}
/// <summary>
/// Downloads the market page and returns the rows of table <c>octable</c> whose HTML mentions the base
/// number or one of the six values derived from it.
/// </summary>
/// <remarks>
/// The derived values are stored in fields whose names do not match the offsets actually applied:
/// Plus50 = base + 100, Plus100 = base + 200, Plus150 = base + 300,
/// Plus200 = base - 100, Minus50 = base - 200, Minus100 = base - 300.
/// NOTE(review): the assignments are kept exactly as they were because other code may read these
/// fields — confirm whether the names or the offsets are the intended ones.
/// The last row and the first two rows of the table are discarded before filtering.
/// </remarks>
/// <param name="marketURL">The market URL.</param>
/// <param name="openMarketBaseNumber">The open market base number.</param>
/// <returns>
/// The matching rows; an empty collection when the table is missing or has fewer than three rows.
/// </returns>
private HtmlNodeCollection DownloadMarketData(string marketURL, int openMarketBaseNumber)
{
    // Define the range
    baseNumber = Math.Round(Convert.ToDecimal(openMarketBaseNumber), 2);
    baseNumberPlus50 = baseNumber + 100;
    baseNumberPlus100 = baseNumber + 200;
    baseNumberPlus150 = baseNumber + 300;
    baseNumberPlus200 = baseNumber - 100;
    baseNumberMinus50 = baseNumber - 200;
    baseNumberMinus100 = baseNumber - 300;

    // Grab all rows
    var htmlWeb = new HtmlWeb();
    HtmlAgilityPack.HtmlDocument htmlDocument = htmlWeb.Load(marketURL);
    HtmlNodeCollection tableRows = htmlDocument.DocumentNode.SelectNodes("//table[@id=\"octable\"]//tr");

    HtmlNodeCollection workSetRows = new HtmlNodeCollection(null);

    // SelectNodes yields null when nothing matches, and the trimming below needs at least three rows.
    if (tableRows == null || tableRows.Count < 3)
    {
        return workSetRows;
    }

    tableRows.RemoveAt(tableRows.Count - 1);
    tableRows.RemoveAt(0);
    tableRows.RemoveAt(0);

    // The search strings do not change per row, so build them once.
    string[] wantedValues =
    {
        baseNumber.ToString(),
        baseNumberPlus50.ToString(),
        baseNumberPlus100.ToString(),
        baseNumberMinus50.ToString(),
        baseNumberMinus100.ToString(),
        baseNumberPlus150.ToString(),
        baseNumberPlus200.ToString()
    };

    // Get only those rows which contain values for the defined range
    foreach (var currentTableRow in tableRows)
    {
        // Read the row markup once instead of once per comparison.
        string rowHtml = currentTableRow.InnerHtml;

        if (Array.Exists(wantedValues, value => rowHtml.Contains(value)))
        {
            workSetRows.Add(currentTableRow);
        }
    }

    return workSetRows;
}
/// <summary>
/// Scan must load the page and read the headers on every level, but must never download an
/// article whose header the validator rejects.
/// </summary>
public void Scan_doesnt_download_article_with_invalid_header()
{
    // All header levels return the single header prepared below.
    const int headerLevels = 4;
    var baseUrlInTest = "http://madeupnews.com";

    // Arrange: the validator rejects every header, so the header content itself is irrelevant.
    var header = new HtmlNode(HtmlNodeType.Element, new HtmlDocument(), 0);
    header.InnerHtml = "<h1>doesnt matter - mocked</h1>";
    var headerList = new HtmlNodeCollection(null);
    headerList.Add(header);

    var validator = new Mock<IValidator>();
    validator.Setup(v => v.ConsideredArticleHeader(It.IsAny<string>())).Returns(false);

    var spider = new Mock<ISpider>();
    spider.Setup(s => s.LoadPage(It.IsAny<string>()));
    spider.Setup(s => s.GetHeadersOfSize(It.IsAny<HtmlDocument>(), It.IsAny<int>())).Returns(headerList);
    spider.Setup(s => s.DownloadArticleByHeader(It.IsAny<string>(), It.IsAny<HtmlNode>())).Returns((null, null));

    var pipeline = new Mock<IPipeline>();
    var logger = new Mock<ILogger<Tracker>>();
    var extractor = new Mock<IExtractor>();

    var tracker = new Tracker(pipeline.Object, spider.Object, logger.Object, validator.Object, extractor.Object);

    // Act
    tracker.Scan(baseUrlInTest, new List<Article>());

    // Assert: pages and headers were read per level, no download was attempted.
    spider.Verify(s => s.LoadPage(baseUrlInTest), Times.Exactly(headerLevels));
    spider.Verify(s => s.GetHeadersOfSize(It.IsAny<HtmlDocument>(), It.IsAny<int>()), Times.Exactly(headerLevels));
    spider.Verify(s => s.DownloadArticleByHeader(It.IsAny<string>(), It.IsAny<HtmlNode>()), Times.Exactly(0));
}
/// <summary>
/// Builds one output table row from a source row: running number, name, cases, recovered, deaths
/// and fatality rate, with the three counts formatted for the vi-VN culture.
/// </summary>
/// <param name="dtd">Source cells; index 0 = name, 1 = cases, 3 = deaths (may be blank), 5 = recovered.</param>
/// <param name="no">Running number shown in the first column.</param>
/// <param name="highlightFlg">When true the row gets a yellow background.</param>
/// <returns>The new <c>tr</c> node with six <c>td</c> children.</returns>
private static HtmlNode CreateNewTr(HtmlNodeCollection dtd, int no, bool highlightFlg)
{
    CultureInfo viCulture = new CultureInfo("vi-VN");

    HtmlNode newTr = HtmlNode.CreateNode("<tr></tr>");
    if (highlightFlg)
    {
        newTr.Attributes.Add("style", "background-color:yellow");
    }

    HtmlNodeCollection tdColl = new HtmlNodeCollection(newTr);

    // NOTE(review): parsing uses the current culture while the output below is formatted with vi-VN —
    // confirm the source uses the current culture's thousands separator.
    int soCaMac = int.Parse(dtd[1].InnerText.Trim(), NumberStyles.AllowThousands);
    int phuchoi = int.Parse(dtd[5].InnerText.Trim(), NumberStyles.AllowThousands);

    // A blank deaths cell counts as zero.
    string deathsText = dtd[3].InnerText.Trim();
    int tuvong = deathsText.Length == 0 ? 0 : int.Parse(deathsText, NumberStyles.AllowThousands);

    // A row without cases used to produce "NaN%" (or "∞%"); report 0 instead.
    double rate = soCaMac == 0 ? 0 : Math.Round(1.0 * tuvong / soCaMac * 100, 2);

    tdColl.Add(HtmlNode.CreateNode($"<td>{no}</td>"));
    tdColl.Add(HtmlNode.CreateNode($"<td>{dtd[0].InnerText.Trim()}</td>"));
    tdColl.Add(HtmlNode.CreateNode($"<td>{soCaMac.ToString("N0", viCulture)}</td>"));
    tdColl.Add(HtmlNode.CreateNode($"<td>{phuchoi.ToString("N0", viCulture)}</td>"));
    tdColl.Add(HtmlNode.CreateNode($"<td>{tuvong.ToString("N0", viCulture)}</td>"));
    tdColl.Add(HtmlNode.CreateNode($"<td>{rate}%</td>"));

    newTr.AppendChildren(tdColl);
    return newTr;
}
/// <summary>
/// Consumes a queue of HTML tokens and builds the corresponding node collection.
/// </summary>
/// <remarks>
/// The loop dispatches on the current token: "&lt;" starts an open tag (element name, then attributes),
/// "&lt;/" starts a close tag, a stray "&gt;" is skipped, and every other token becomes a text node.
/// Elements are first appended flat to the result; a close tag then triggers <c>MoveNodesDown</c> for
/// the nodes that follow the matching open element.
/// </remarks>
/// <param name="tokens">Token queue; it is drained by this method.</param>
/// <returns>The collection built from the tokens.</returns>
private static HtmlNodeCollection BuildNodeCollection(Queue<string> tokens)
{
    HtmlNodeCollection nodes = new HtmlNodeCollection(null);
    HtmlElement element = null;
    string current;
    while (tokens.Count > 0)
    {
        current = tokens.Dequeue();
        switch (current)
        {
            case ("<"):
                // Read open tag; the next token is used as the element name.
                if (tokens.Count == 0) break;
                current = tokens.Dequeue();
                element = new HtmlElement(current);
                // Read the attributes and values until the tag ends with ">" or "/>"
                // (or the queue runs dry).
                while (tokens.Count > 0 && (current = tokens.Dequeue()) != ">" && current != "/>")
                {
                    string attribute_name = current;
                    if (tokens.Count > 0 && tokens.Peek() == "=")
                    {
                        // name = value: drop the "=", then take the value token
                        // (null when the queue ends right after the "=").
                        tokens.Dequeue();
                        current = (tokens.Count > 0) ? tokens.Dequeue() : null;
                        HtmlAttribute attribute = new HtmlAttribute(attribute_name, HttpUtility.HtmlDecode(current));
                        element.Attributes.Add(attribute);
                    }
                    else //if (tokens.Count == 0)
                    {
                        // Attribute without a value.
                        HtmlAttribute attribute = new HtmlAttribute(attribute_name);
                        element.Attributes.Add(attribute);
                    }
                }
                nodes.Add(element);
                if (current == "/>")
                {
                    // Self-closed tag.
                    element.IsTerminated = true;
                    element = null; // could not have any sub elements
                }
                else if (current == ">")
                {
                    // Regular open tag: go straight to the next token of the outer loop.
                    continue;
                }
                break;
            case (">"):
                // A ">" that was not consumed as part of a tag is ignored.
                continue;
            case ("</"):
                // Read close tag; the next token is the name of the tag being closed.
                if (tokens.Count == 0) break;
                current = tokens.Dequeue();
                int open_index = FindTagOpenNodeIndex(nodes, current);
                if (open_index != -1)
                {
                    // Hand the nodes after the open element to MoveNodesDown together with that element
                    // (presumably re-parenting them beneath it — see MoveNodesDown).
                    MoveNodesDown(ref nodes, open_index + 1, (HtmlElement)nodes[open_index]);
                }
                // Skip to the end of this tag, discarding anything before the closing ">".
                while (tokens.Count > 0 && (current = tokens.Dequeue()) != ">")
                {
                    // shouldn't happen
                }
                element = null;
                break;
            default:
                // Anything else is plain text.
                HtmlText node = new HtmlText(current);
                nodes.Add(node);
                break;
        }
    }
    return nodes;
}
/// <summary>
/// Verifies that a collection constructed with an owner assigns that owner as the parent of an added node.
/// </summary>
public void HtmlNodeCollectionConstructorTest()
{
    // Arrange
    var owner = new HtmlElement("root");
    var added = new HtmlElement("child");
    var target = new HtmlNodeCollection(owner);

    // Act
    target.Add(added);

    // Assert
    Assert.AreEqual(owner, added.Parent);
}
/// <summary>
/// Verifies that <c>Insert</c> places the node at the requested index and parents it to the
/// collection's owner.
/// </summary>
/// <remarks>
/// The trailing <c>Insert(0, null)</c> is kept as the final statement. Presumably the test carries an
/// expected-exception attribute declared outside this view — TODO confirm.
/// </remarks>
public void InsertTest()
{
    HtmlElement root = new HtmlElement("root");
    HtmlNodeCollection target = new HtmlNodeCollection(root);

    HtmlElement child = new HtmlElement("child");
    target.Add(child);

    child = new HtmlElement("second");
    target.Insert(0, child);

    Assert.AreEqual(root, child.Parent);
    // Expected value first, so a failure message reports expected/actual the right way round.
    Assert.AreEqual(0, target.IndexOf(child));

    target.Insert(0, null);
}
/// <summary>
/// Console walkthrough of reading, appending, cloning and copying nodes of a
/// small in-memory HTML document. Output is written with <c>Console.WriteLine</c>
/// and the sibling helpers <c>DisplayChildNodes</c> / <c>DisplayNode</c>.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    var html = @"<body> <h1>This is <b>bold</b> heading</h1> <p>This is <u>underlined</u> paragraph</p> <h1>This is <i>italic</i> heading</h1> <p>This is <u>underlined</u> paragraph</p> </body>";

    var htmlDoc = new HtmlDocument();
    htmlDoc.LoadHtml(html);

    // --- Reading the h1 nodes in three different representations ---
    var htmlNodes = htmlDoc.DocumentNode.SelectNodes("//body/h1");

    Console.WriteLine("-------------------------------------------------");
    Console.WriteLine("InnerHtml");
    foreach (var node in htmlNodes)
    {
        Console.WriteLine(node.InnerHtml);
    }

    Console.WriteLine("--------------------------------------------------");
    Console.WriteLine("InnerText");
    foreach (var node in htmlNodes)
    {
        Console.WriteLine(node.InnerText);
    }

    Console.WriteLine("----------------------------------------------------");
    Console.WriteLine("OuterHtml");
    foreach (var node in htmlNodes)
    {
        Console.WriteLine(node.OuterHtml);
    }

    // --- Parent lookup ---
    Console.WriteLine("-----------------------------------------------------");
    Console.WriteLine("ParentNode of h1 is :");
    var selectSingleNode = htmlDoc.DocumentNode.SelectSingleNode("//body/h1");
    HtmlNode parentNode = selectSingleNode.ParentNode;
    Console.WriteLine(parentNode.Name);

    // --- Appending a single child ---
    Console.WriteLine("--------------------------------------------------------");
    Console.WriteLine("Child nodes present initially");
    var htmlBody = htmlDoc.DocumentNode.SelectSingleNode("//body");
    DisplayChildNodes(htmlBody);

    HtmlNode h2Node = HtmlNode.CreateNode("<h2> This is h2 heading</h2>");
    htmlBody.AppendChild(h2Node);

    Console.WriteLine("---------------------------------------------------------");
    Console.WriteLine("After child node appended");
    DisplayChildNodes(htmlBody);

    // --- Appending several children at once ---
    HtmlNode h3Node = HtmlNode.CreateNode("<h3> THis is H2 heading</h3>");
    HtmlNode pNode1 = HtmlNode.CreateNode("<p>This is appended paragraph 1</p>");
    HtmlNode pNode2 = HtmlNode.CreateNode("<p>This is appended paragraph 2</p>");

    HtmlNodeCollection children = new HtmlNodeCollection(htmlBody);
    children.Add(h3Node);
    children.Add(pNode1);
    children.Add(pNode2);

    htmlBody.AppendChildren(children);
    Console.WriteLine("\n*********************After children appended**********************");
    DisplayChildNodes(htmlBody);

    // --- Clone() ---
    HtmlNode newHtmlBody = htmlBody.Clone();
    Console.WriteLine("\n Duplicate Node Name :" + newHtmlBody.Name);
    Console.WriteLine("\n*********************** Display children of the duplicate node **************************\n");
    DisplayChildNodes(newHtmlBody);

    // --- CloneNode(false) ---
    var htmlBodyTwo = htmlBody.CloneNode(false);
    // Report the name of the node that was just cloned (the original printed
    // the name of the earlier duplicate by mistake).
    Console.WriteLine("\n Clone Node Name: " + htmlBodyTwo.Name);
    Console.WriteLine("\n************* Display children of the clone node *******************\n");
    DisplayChildNodes(htmlBodyTwo);

    // --- CloneNode(name, deep) ---
    HtmlNode h1Node = htmlBody.ChildNodes[1];
    DisplayNode(h1Node);
    HtmlNode h4Node = h1Node.CloneNode("h2", true);
    Console.WriteLine("\n************* CLone node *********************\n");
    DisplayNode(h4Node);

    HtmlNode h5Node = htmlBody.ChildNodes[1];
    DisplayNode(h5Node);
    HtmlNode h6Node = h5Node.CloneNode("h6", true);
    Console.WriteLine("\n*************** Clone node ***********************\n");
    DisplayNode(h6Node);

    // --- CopyFrom(node) ---
    HtmlNode newBody = HtmlNode.CreateNode("<body></body>");
    newBody.CopyFrom(htmlBody);
    DisplayNode(htmlBody);
    Console.WriteLine("\n****************** Display node **********************\n");
    DisplayNode(newBody);

    // --- CopyFrom(node, false) ---
    HtmlNode newBody2 = HtmlNode.CreateNode("<body></body>");
    newBody2.CopyFrom(htmlBodyTwo, false);
    DisplayNode(htmlBodyTwo);
    Console.WriteLine("\n **************** Duplicate node ********************\n");
    DisplayNode(newBody2);

    // --- Adding a node through ChildNodes ---
    HtmlNode newPara = HtmlNode.CreateNode("<p>This is new paragraph</p>");
    htmlBodyTwo.ChildNodes.Add(newPara);
    Console.WriteLine("\n ***************** After adding new child node ************\n");
    DisplayNode(htmlBodyTwo);

    Console.ReadKey();
}
/// <summary>
/// Gets the formatted html for the specified message.
/// </summary>
/// <remarks>
/// External links are forced to open in a new window, images that reference a
/// message attachment are inlined as base64, html/head/body nodes are created
/// when missing, a fade-in style and script are added to the head, and any
/// remaining top-level nodes are moved into the body.
/// </remarks>
/// <param name="message">The message.</param>
/// <returns>
/// The formatted html, or the unmodified <c>message.Text</c> when formatting
/// fails (the failure is logged).
/// </returns>
public static async Task<string> FormattedHtml(MailMessage message)
{
    try
    {
        // Load the html (plain text is wrapped in a paragraph with <br/> line breaks)
        HtmlDocument htmlDocument = new HtmlDocument();
        htmlDocument.OptionFixNestedTags = true;
        string html = (message.TextContentType == ETextContentType.Html
            ? message.Text
            : string.Format("<p>{0}</p>", (message.Text + string.Empty).Replace(Environment.NewLine, "<br/>")));
        htmlDocument.LoadHtml(html);

        // Get the external link nodes (href starts with http or www)
        List<HtmlNode> linkNodes = htmlDocument.DocumentNode.Descendants("a")
            .Where(o =>
            {
                string href = o.GetAttributeValue("href", null);
                return !string.IsNullOrEmpty(href)
                    && (href.StartsWith("http", StringComparison.OrdinalIgnoreCase)
                        || href.StartsWith("www", StringComparison.OrdinalIgnoreCase));
            })
            .ToList();

        // Loop through each external link - ensure it opens in new window
        foreach (HtmlNode linkNode in linkNodes)
        {
            if (linkNode.Attributes.Contains("target"))
            {
                linkNode.Attributes["target"].Value = "_blank";
            }
            else
            {
                linkNode.Attributes.Add("target", "_blank");
            }
        }

        // Get the local image nodes: src starts with NEITHER http NOR www.
        // (The original combined the two negated tests with ||, which is true
        // for every non-empty src, so remote images were treated as local too.)
        List<HtmlNode> imageNodes = htmlDocument.DocumentNode.Descendants("img")
            .Where(o =>
            {
                string src = o.GetAttributeValue("src", null);
                return !string.IsNullOrEmpty(src)
                    && !src.StartsWith("http", StringComparison.OrdinalIgnoreCase)
                    && !src.StartsWith("www", StringComparison.OrdinalIgnoreCase);
            })
            .ToList();

        // Loop through each local image
        foreach (HtmlNode imageNode in imageNodes)
        {
            try
            {
                // Find the image attachment
                string srcWithoutCid = imageNode.GetAttributeValue("src", null).Replace("cid:", "");
                Attachment attachment = message.Attachments[srcWithoutCid];

                // If found
                if (attachment != null)
                {
                    // Convert image to base64
                    StorageFile attachmentFile = await IOUtil.GetCreateFile(attachment.FullFilename, CreationCollisionOption.ReplaceExisting);
                    imageNode.Attributes["src"].Value = await TransformFileToBase64ImageString(attachmentFile.Path);
                }
            }
            catch (Exception ex)
            {
                // Best effort: one broken image must not prevent the rest of the message from rendering.
                LogFile.Instance.LogError("", "", ex.ToString());
            }
        }

        // Ensure that the html node exists
        HtmlNode htmlNode = htmlDocument.DocumentNode.Descendants("html").FirstOrDefault();
        if (htmlNode == null)
        {
            htmlNode = htmlDocument.CreateElement("html");
            htmlDocument.DocumentNode.AppendChild(htmlNode);
        }

        // Ensure that the head node exists
        HtmlNode headNode = htmlDocument.DocumentNode.Descendants("head").FirstOrDefault();
        if (headNode == null)
        {
            headNode = htmlDocument.CreateElement("head");
            htmlNode.AppendChild(headNode);
        }

        // Create page css transition
        HtmlNode cssTransitionNode = htmlDocument.CreateElement("style");
        cssTransitionNode.InnerHtml = "body{opacity:0;transition: all 2s ease;}.loaded{opacity:1;}";
        headNode.PrependChild(cssTransitionNode);

        // Create page javascript transition
        HtmlNode javascriptTransitionNode = htmlDocument.CreateElement("script");
        javascriptTransitionNode.Attributes.Add("type", "text/javascript");
        javascriptTransitionNode.InnerHtml = "document.addEventListener('DOMContentLoaded', function () { document.body.classList.add('loaded'); }, false);";
        headNode.AppendChild(javascriptTransitionNode);

        // Ensure that the body node exists
        HtmlNode bodyNode = htmlDocument.DocumentNode.Descendants("body").FirstOrDefault();
        if (bodyNode == null)
        {
            bodyNode = htmlDocument.CreateElement("body");
            htmlNode.AppendChild(bodyNode);
        }

        // Move every top-level node that is not html/head/body into the body.
        // ToList() snapshots the children because the loop removes from that collection.
        HtmlNodeCollection htmlNodes = new HtmlNodeCollection(bodyNode);
        foreach (HtmlNode node in htmlDocument.DocumentNode.ChildNodes.ToList())
        {
            if (!node.Name.Equals("html", StringComparison.OrdinalIgnoreCase)
                && !node.Name.Equals("head", StringComparison.OrdinalIgnoreCase)
                && !node.Name.Equals("body", StringComparison.OrdinalIgnoreCase))
            {
                htmlNodes.Add(node);
                htmlDocument.DocumentNode.RemoveChild(node);
            }
        }
        bodyNode.AppendChildren(htmlNodes);

        // Return the html
        return(htmlDocument.DocumentNode.InnerHtml);
    }
    catch (Exception ex)
    {
        LogFile.Instance.LogError("", "", ex.ToString());
        return(message.Text);
    }
}
/// <summary>
/// Evaluates an XPath expression against this node and collects every match.
/// </summary>
/// <param name="xpath">The XPath expression to evaluate.</param>
/// <param name="xmgr">The namespace manager passed to the XPath evaluation.</param>
/// <returns>
/// An <see cref="HtmlNodeCollection"/> holding the matching nodes; the collection
/// is empty (not null) when nothing matches.
/// </returns>
public HtmlNodeCollection SelectNodes(string xpath, XmlNamespaceManager xmgr)
{
    HtmlNodeCollection matches = new HtmlNodeCollection(null);
    HtmlNodeNavigator navigator = new HtmlNodeNavigator(_ownerdocument, this);

    for (XPathNodeIterator cursor = navigator.Select(xpath, xmgr); cursor.MoveNext();)
    {
        HtmlNodeNavigator position = (HtmlNodeNavigator)cursor.Current;
        matches.Add(position.CurrentNode);
    }

    // Empty and non-empty results are returned the same way.
    return matches;
}
/// <summary>
/// Exercises the integer indexer of the collection: reading, replacing an
/// element, and assigning null.
/// </summary>
public void ItemByIndexTest()
{
    var root = new HtmlElement("root");
    var nodes = new HtmlNodeCollection(root);
    foreach (string name in new[] { "first", "second", "third" })
    {
        nodes.Add(new HtmlElement(name));
    }

    // Index 1 and the name "second" must resolve to the same element.
    Assert.AreEqual(nodes[1], nodes["second"]);

    nodes[2] = new HtmlElement("another");
    // NOTE(review): assigning null may be expected to throw - confirm against the
    // test attributes, which are outside this view.
    nodes[0] = null;

    StringAssert.Contains(nodes[2].ToString(), "another");
}
/// <summary>
/// Exercises the string indexer of the collection: a present name yields an
/// element, an absent name yields null.
/// </summary>
public void ItemByNameTest()
{
    var root = new HtmlElement("root");
    var nodes = new HtmlNodeCollection(root);
    foreach (string name in new[] { "first", "second", "second" })
    {
        nodes.Add(new HtmlElement(name));
    }

    Assert.IsNotNull(nodes["second"]);
    Assert.IsNull(nodes["anyname"]);
}
/// <summary>
/// Checks how many elements <c>GetByName</c> returns before and after a
/// matching element is added below the first top-level element.
/// </summary>
public void GetByNameTest()
{
    HtmlElement root = new HtmlElement("root");
    HtmlNodeCollection target = new HtmlNodeCollection(root);
    target.Add(new HtmlElement("first"));
    target.Add(new HtmlElement("second"));
    target.Add(new HtmlElement("second"));

    // Expected value goes first so that a failure message reads correctly.
    Assert.AreEqual(2, target.GetByName("second").Count);

    // Add a third "second" element as a child of "first".
    ((HtmlElement)target[0]).Nodes.Add(new HtmlElement("second"));

    // NOTE(review): the second argument presumably switches off the search in
    // child nodes, which is why the nested element is not counted - confirm.
    Assert.AreEqual(2, target.GetByName("second", false).Count);
    Assert.AreEqual(3, target.GetByName("second").Count);
}
/// <summary>
/// Parses the html of a finished race page into race information and one
/// result entry per player that has a profile link.
/// </summary>
/// <param name="html">The html of the race page.</param>
/// <returns>
/// The parsed race. <c>PlayerResults</c> is empty when the page contains no
/// player blocks.
/// </returns>
private RaceParsingResult ParseRace(string html)
{
    var result = new RaceParsingResult {
        PlayerResults = new List<ResultParsed>(), RaceInfo = new RaceInfo()
    };

    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(html);

    // Mode id and name come from the game description span.
    HtmlNode gameTypeSpan = doc.DocumentNode.SelectSingleNode("//td[@id='gamedesc']/span");
    string modeId = gameTypeSpan.GetAttributeValue("class", "0").Replace("gametype-", "");
    var SpanA = gameTypeSpan.SelectSingleNode("a");
    string modeName;
    if (SpanA == null) // standard mode
    {
        modeName = gameTypeSpan.InnerText;
    }
    else // dictionary: the first number in the link is appended to the mode id
    {
        modeId += "-" + Regex.Match(SpanA.GetAttributeValue("href", "0"), "\\d+");
        modeName = SpanA.InnerText;
    }
    result.RaceInfo.Mode = new Mode(modeId, modeName);

    result.RaceInfo.BookAuthor = doc.DocumentNode.SelectSingleNode("//div[@id='bookinfo']//div[@class='author']")?.InnerText;
    result.RaceInfo.BookName = doc.DocumentNode.SelectSingleNode("//div[@id='bookinfo']//div[@class='name']")?.InnerText;

    HtmlNode totalPlayersNode = doc.DocumentNode.SelectSingleNode("//div[@id='players-count-lbl']/span");
    if (!string.IsNullOrEmpty(totalPlayersNode?.InnerText))
    {
        var match = Regex.Match(totalPlayersNode.InnerText, @"\d+");
        if (match.Success)
        {
            result.RaceInfo.TotalPlayers = int.Parse(match.Value);
        }
    }

    HtmlNode pointsIncreaseNode = doc.DocumentNode.SelectSingleNode("//div[@id='players-count-lbl']/b");
    int pointsIncrease = 0;
    if (!string.IsNullOrEmpty(pointsIncreaseNode?.InnerText))
    {
        var match = Regex.Match(pointsIncreaseNode.InnerText, @"\d+");
        if (match.Success)
        {
            pointsIncrease = int.Parse(match.Value);
        }
    }

    // Collect the player blocks into a local list. SelectNodes returns null when
    // nothing matches and SelectSingleNode may return null, so both are checked
    // here instead of failing later inside the loop; the collection returned by
    // SelectNodes is no longer modified.
    var players = new List<HtmlNode>();
    HtmlNodeCollection otherPlayers = doc.DocumentNode.SelectNodes("//div[@class='player other ng-scope']");
    if (otherPlayers != null)
    {
        players.AddRange(otherPlayers);
        HtmlNode you = doc.DocumentNode.SelectSingleNode("//div[@class='player you ng-scope']");
        if (you != null)
        {
            players.Add(you);
        }
    }
    else
    {
        HtmlNodeCollection yourBlocks = doc.DocumentNode.SelectNodes("//div[@class='player you ng-scope']");
        if (yourBlocks != null)
        {
            players.AddRange(yourBlocks);
        }
    }

    foreach (HtmlNode player in players)
    {
        ResultParsed resultParsed = new ResultParsed();
        resultParsed.Mode = new Mode(modeId, modeName);
        resultParsed.PointsIncrease = pointsIncrease;

        HtmlNode rating = player.SelectSingleNode("div[@class='rating']");
        HtmlNode car = player.SelectSingleNode("table[@class='car']");
        HtmlNode place = rating.SelectSingleNode("div/ins");
        HtmlNode nick = player.SelectSingleNode("table//a");

        // Players without a profile link (guests) are skipped.
        if (nick == null)
        {
            continue;
        }

        resultParsed.Nick = nick.InnerText;
        string id_str = nick.GetAttributeValue("href", "0");
        resultParsed.Id = int.Parse(Regex.Match(id_str, "[0-9]+").ToString());
        // The rank index is the fifth character of the link's class attribute.
        resultParsed.Rank = Rank.GetByIndex(int.Parse(nick.GetAttributeValue("class", "000000").Substring(4, 1)));

        // A place is only present for players that finished the race.
        if (place != null)
        {
            string place_str = place.InnerText;
            // The last six characters of the place text are not part of the number.
            resultParsed.RealPlace = int.Parse(place_str.Substring(0, place_str.Length - 6));
            resultParsed.Time = TimeSpan.Parse("00:" + rating.SelectSingleNode("div[@class='stats']/div").InnerText.Replace(" ", "").Replace("\r", "").Replace("\n", ""));
            resultParsed.Speed = (int)Math.Round(double.Parse(rating.SelectSingleNode("div[@class='stats']/div[2]/span").InnerText, new NumberFormatInfo() {
                NumberDecimalSeparator = ","
            }));
            resultParsed.ErCnt = int.Parse(rating.SelectSingleNode("div[@class='stats']/div[3]/span").InnerText);
            resultParsed.ErRate = double.Parse(rating.SelectSingleNode("div[@class='stats']/div[3]/span[2]").InnerText, new NumberFormatInfo() {
                NumberDecimalSeparator = ","
            }) / 100;
            result.RaceInfo.ArrivedPlayers++;
        }

        if (player.SelectSingleNode("div[@class='newrecord']//span[@class='']") != null)
        {
            resultParsed.IsRecord = true; // a record, with or without a recording
        }

        // Progress is derived from the car's "left: Npx" style offset.
        // Sample style: "top: 0px; left: 480px; "
        int.TryParse(Regex.Match(car.GetAttributeValue("style", ""), "(?<=left: )\\d+(?=px)").ToString(), out int progress);
        resultParsed.Progress = (int)(progress / 4.8);

        HtmlNode _left = car.SelectSingleNode(".//div[@class='imgcont leave']");
        resultParsed.HasLeftRace = _left != null;

        HtmlNode _noerror_fail = car.SelectSingleNode(".//img[@class='noerror-fail']");
        resultParsed.NoErrorFail = _noerror_fail != null;

        HtmlNode _i_style = car.SelectSingleNode(".//i");
        if (_i_style != null)
        {
            int.TryParse(Regex.Match(_i_style.GetAttributeValue("title", ""), "\\d+").ToString(), out int _mileage);
            resultParsed.Mileage = _mileage;
        }

        result.PlayerResults.Add(resultParsed);
    }

    return(result);
}
/// <summary>
/// Loads the html file at <c>htmlPath</c>, recognises level-1 and level-2 titles
/// according to <c>method</c>, cuts the html text into per-title content and
/// writes titles and content as rows through <c>SQLUtils</c>.
/// Note from the original author: appendices have to be changed by hand in Word
/// to the level-1 or level-2 title format required by the table of contents.
/// </summary>
/// <param name="rootConvention">The row passed as the first constructor argument of every level-1 title row.</param>
/// <returns>The <c>retInfo</c> member, filled with the recognised titles, contents and any error text.</returns>
public ReturnInfo ReadHtml(ConventionRow rootConvention)
{
    HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
    doc.Load(htmlPath);
    HtmlNode htmlRootNode = doc.DocumentNode;
    HtmlNodeCollection title1Nodes_init;
    HtmlNodeCollection title2Nodes_init;
    List <string> str_contentList = new List <string>();
    List <string> str_titleList = new List <string>();
    List <string> str_title1List = new List <string>();
    List <string> str_title2List = new List <string>();
    HtmlNodeCollection contentNodes = new HtmlNodeCollection(htmlRootNode.Clone());
    Dictionary <int, string> dic_title1Content = new Dictionary <int, string>();
    HtmlNodeCollection titleNodes = new HtmlNodeCollection(htmlRootNode.Clone());
    HtmlNodeCollection title1Nodes = new HtmlNodeCollection(htmlRootNode.Clone());
    HtmlNodeCollection title2Nodes = new HtmlNodeCollection(htmlRootNode.Clone());
    HtmlNodeCollection ftNoteRefnodes = new HtmlNodeCollection(htmlRootNode.Clone());
    string htmlTxt = htmlRootNode.InnerHtml;
    // Recognise the titles in the body text.
    #region (废弃选项:一级标题粗体识别)
    // Region label: "deprecated option: level-1 title recognition by bold text".
    // The ReadMethod.TITLE1_BOLD branch that used to live here was entirely
    // commented out and is not executed.
    #endregion
    #region 项1:pdf转为图片的word文件后,通过p节点class属性提取标题
    // Region label: "option 1: for Word files produced from PDF pages converted to
    // images, extract the titles through the class attribute of p nodes".
    if (method == ReadMethod.TITLE_CLASS)
    {
        // p nodes with class 1 are level-1 titles, class 2 are level-2 titles.
        // NOTE(review): unlike the TITLE_SPANSTYLE branch below, the SelectNodes
        // results are used without a null check - confirm they cannot be null here.
        title1Nodes_init = htmlRootNode.SelectNodes(@"//p[@class=1]");
        title2Nodes_init = htmlRootNode.SelectNodes(@"//p[@class=2]");
        for (int i = 0; i < title1Nodes_init.Count; i++)
        {
            // Keep only titles whose text is not empty.
            if (title1Nodes_init[i].InnerText.Replace(" ", "").Trim() != string.Empty)
            {
                str_title1List.Add(title1Nodes_init[i].InnerText.Trim().Replace(" ", " ").Replace("\r\n", ""));
                title1Nodes.Add(title1Nodes_init[i]);
            }
        }
        for (int i = 0; i < title2Nodes_init.Count; i++)
        {
            if (title2Nodes_init[i].InnerText.Replace(" ", "").Trim() != string.Empty)
            {
                str_title2List.Add(title2Nodes_init[i].InnerText.Trim().Replace(" ", " ").Replace("\r\n", ""));
                title2Nodes.Add(title2Nodes_init[i]);
            }
        }
    }
    #endregion
    #region 项2:标题中Span 标签 Style属性识别
    // Region label: "option 2: recognise titles through the style attribute of span tags".
    else if (method == ReadMethod.TITLE_SPANSTYLE)
    {
        HtmlNodeCollection title1Nodes_tmp = new HtmlNodeCollection(htmlRootNode.Clone());
        #region 提取一级标题节点,生成一级目录的节点集合title1Nodes,和字符串集合str_title1List
        // Region label: "extract the level-1 title nodes into title1Nodes and their
        // text into str_title1List".
        title1Nodes_init = htmlRootNode.SelectNodes(@"//p");
        if (title1Nodes_init != null)
        {
            for (int i = 0; i < title1Nodes_init.Count; i++)
            {
                // A paragraph is a candidate when its inner html contains title1_select.
                string str_style = title1Nodes_init[i].InnerHtml.Replace("\r\n", "");
                bool condition = str_style.Contains(title1_select);
                if (RecogOptions.title1_has_zitizihao)
                {
                    // title1_select is split at its first ';' and both halves must occur
                    // in the html. NOTE(review): the variable names suggest
                    // "font size;font family" - confirm.
                    string str_style_zihao = title1_select.Substring(0, title1_select.IndexOf(';'));
                    string str_style_ziti = title1_select.Substring(title1_select.IndexOf(';') + 1);
                    condition = str_style.Contains(str_style_zihao) && str_style.Contains(str_style_ziti);
                }
                if (condition)
                {
                    // title1_child selects which tag (0: p, 1: b, 2: a) must occur in the
                    // paragraph or its descendants for it to count as a title.
                    foreach (var match in title1Nodes_init[i].DescendantsAndSelf())
                    {
                        if (RecogOptions.title1_child == 0 && match.Name == "p")
                        {
                            title1Nodes_tmp.Add(title1Nodes_init[i]);
                            break;
                        }
                        if (RecogOptions.title1_child == 1 && match.Name == "b")
                        {
                            title1Nodes_tmp.Add(title1Nodes_init[i]);
                            break;
                        }
                        if (RecogOptions.title1_child == 2 && match.Name == "a")
                        {
                            title1Nodes_tmp.Add(title1Nodes_init[i]);
                            break;
                        }
                    }
                }
            }
            for (int i = 0; i < title1Nodes_tmp.Count; i++)
            {
                // Drop empty titles and consecutive candidates on the same source line.
                if (title1Nodes_tmp[i].InnerText.Replace(" ", "").Trim() != string.Empty && (i == 0 || (i > 0 && title1Nodes_tmp[i].Line != title1Nodes_tmp[i - 1].Line)))
                {
                    str_title1List.Add(title1Nodes_tmp[i].InnerText.Trim().Replace(" ", " ").Replace("\r\n", ""));
                    title1Nodes.Add(title1Nodes_tmp[i]);
                }
            }
        }
        #endregion
        #region 提取二级标题节点,生成二级目录的节点集合title2Nodes,和字符串集合str_title2List
        // Region label: "extract the level-2 title nodes into title2Nodes and their
        // text into str_title2List".
        HtmlNodeCollection tempNodes = new HtmlNodeCollection(htmlRootNode.Clone());
        if (RecogOptions.title2RecogMethod == 1)
        {
            // Method 1: match the paragraph text against Patterns.title2_x_dot_x_XXX.
            title2Nodes_init = htmlRootNode.SelectNodes(@"//p");
            if (title2Nodes_init != null)
            {
                for (int i = 0; i < title2Nodes_init.Count; i++)
                {
                    string str_tmp = title2Nodes_init[i].InnerText.Replace(" ", " ");
                    string regExp = Patterns.title2_x_dot_x_XXX;
                    Regex reg = new Regex(regExp, RegexOptions.Multiline);
                    MatchCollection matches = reg.Matches(str_tmp);
                    if (matches.Count > 0)
                    {
                        string tmp = matches[0].Value;
                        // Author's note: in some documents text of the form "1 XXX" is not a
                        // level-2 title; the condition below has to be adjusted by hand.
                        // Matches containing a full stop are rejected; the other exclusion
                        // rules that were tried here are disabled.
                        if (!tmp.Contains("。")
                            )
                        {
                            foreach (var match in title2Nodes_init[i].DescendantsAndSelf())
                            {
                                if (RecogOptions.title2_child == 0 && match.Name == "p")
                                {
                                    tempNodes.Add(title2Nodes_init[i]);
                                    break;
                                }
                                if (RecogOptions.title2_child == 1 && match.Name == "b")
                                {
                                    tempNodes.Add(title2Nodes_init[i]);
                                    break;
                                }
                                if (RecogOptions.title2_child == 2 && match.Name == "a")
                                {
                                    tempNodes.Add(title2Nodes_init[i]);
                                    break;
                                }
                            }
                        }
                    }
                }
            }
        }
        if (RecogOptions.title2RecogMethod == 0)
        {
            // Method 0: look at the style attribute of every styled span.
            title2Nodes_init = htmlRootNode.SelectNodes(@"//span[@style]");
            if (title2Nodes_init != null)
            {
                for (int i = 0; i < title2Nodes_init.Count; i++)
                {
                    string str_style = title2Nodes_init[i].Attributes["style"].Value.Replace("\r\n", "");
                    bool condition = str_style.Contains(title2_select);
                    if (RecogOptions.title2_has_zitizihao)
                    {
                        // Same split at the first ';' as for level-1 titles.
                        string str_style_zihao = title2_select.Substring(0, title2_select.IndexOf(';'));
                        string str_style_ziti = title2_select.Substring(title2_select.IndexOf(';') + 1);
                        condition = str_style.Contains(str_style_zihao) && str_style.Contains(str_style_ziti);
                    }
                    if (condition)
                    {
                        // title2_child: 0 accepts any span, 1 requires a b parent, 2 an a parent.
                        if ((RecogOptions.title2_child == 0) || (RecogOptions.title2_child == 1 && title2Nodes_init[i].ParentNode.Name == "b") || (RecogOptions.title2_child == 2 && title2Nodes_init[i].ParentNode.Name == "a"))
                        {
                            // Walk up to the first enclosing p node; that paragraph is the title.
                            foreach (var match in title2Nodes_init[i].AncestorsAndSelf())
                            {
                                if (match.Name == "p")
                                {
                                    string tmp = match.InnerText.Replace(" ", "").Replace("\r\n", "").Trim();
                                    int a = 0;
                                    if (tmp.Length > 1)
                                    {
                                        // Author's note: in some documents text of the form "1 XXX" is
                                        // not a level-2 title; the condition has to be adjusted by hand.
                                        // Paragraphs containing a full stop are rejected.
                                        if (!tmp.Contains("。"))
                                        {
                                            tempNodes.Add(match);
                                        }
                                    }
                                    break;
                                }
                            }
                        }
                    }
                }
            }
        }
        for (int i = 0; i < tempNodes.Count; i++)
        {
            // Drop empty titles and consecutive candidates on the same source line.
            if (tempNodes[i].InnerText.Replace(" ", "").Trim() != String.Empty && (i == 0 || (i > 0 && tempNodes[i].Line != tempNodes[i - 1].Line)))
            {
                title2Nodes.Add(tempNodes[i]);
                string tmp = tempNodes[i].InnerText.Replace("\r\n", "").Replace(" ", " ");
                str_title2List.Add(tmp.Trim());
            }
        }
        #endregion
    }
    #endregion
    #region 项3:h1/h2/h3标签识别标题
    // Region label: "option 3: recognise titles through h1/h2/h3 tags".
    // The ReadMethod.TITLE_TAG branch that used to live here was entirely
    // commented out and is not executed.
    #endregion
    #region 生成包含按序排列的一二级目录的节点集合titleNodes,和字符串集合str_titleList
    // Region label: "build titleNodes, holding level-1 and level-2 titles in order,
    // and the matching string list str_titleList".
    foreach (var match in title1Nodes)
    {
        titleNodes.Add(match);
    }
    foreach (var match in title2Nodes)
    {
        titleNodes.Add(match);
    }
    // Order titleNodes by ascending source line (in-place exchange sort).
    for (int i = 0; i < titleNodes.Count; i++)
    {
        for (int j = i; j < titleNodes.Count; j++)
        {
            if (titleNodes[i].Line > titleNodes[j].Line)
            {
                var temp = titleNodes[i];
                titleNodes[i] = titleNodes[j];
                titleNodes[j] = temp;
            }
        }
    }
    for (int i = 0; i < titleNodes.Count; i++)
    {
        string tmp = titleNodes[i].InnerText.Replace(" ", " ").Replace("\r\n", "");
        str_titleList.Add(tmp.Trim());
    }
    #endregion
    try
    {
        #region 找出html文本末尾可能存在的各脚注div,HtmlNode存储在ftNoteRefnodes
        // Region label: "find the footnote divs that may exist at the end of the html
        // text and store them in ftNoteRefnodes".
        // Every div that has an id attribute is collected.
        foreach (var match in htmlRootNode.Descendants())
        {
            if (match.Name == "div" && match.HasAttributes)
            {
                string tmp = match.GetAttributeValue("id", "notfound");
                if (tmp != "notfound")
                {
                    ftNoteRefnodes.Add(match);
                }
            }
        }
        #endregion
        #region html文档中去除文档末尾的脚注,保存在 htmlTxt 字符串
        // Region label: "remove the footnotes from the html text held in htmlTxt".
        if (ftNoteRefnodes != null)
        {
            for (int i = 0; i < ftNoteRefnodes.Count; i++)
            {
                htmlTxt = htmlTxt.Replace(ftNoteRefnodes[i].OuterHtml, "");
            }
        }
        htmlTxt = htmlTxt.Replace("</body>", "").Replace("</html>", "").Replace("<body>", "").Replace("<html>", "");
        #endregion
        #region 替换图片路径
        // Region label: "replace the image paths".
        // imageFilePath is inserted between capture groups 1 and 2 of Patterns.imageSrc.
        Regex reg = new Regex(Patterns.imageSrc);
        MatchCollection matches = reg.Matches(htmlTxt);
        if (matches.Count == 0)
        {
            retInfo.picResult = "无匹配图片";
        }
        else
        {
            htmlTxt = reg.Replace(htmlTxt, "${1}" + imageFilePath + "${2}");
            retInfo.picResult = "识别到图片数目:" + matches.Count.ToString();
        }
        #endregion
        #region 提取一级标题下可能有的正文,此标题序号和正文键值对 存储在字典dic_title1Content(包含脚注)dic_title1Content_tmp(不含脚注)
        // Region label: "extract the body text that may sit directly under a level-1
        // title; title index and text are stored in dic_title1Content (with
        // footnotes) and dic_title1Content_tmp (without footnotes)".
        Dictionary <int, string> dic_title1Content_tmp = new Dictionary <int, string>();
        for (int i = 0; i < titleNodes.Count; i++)
        {
            for (int j = 0; j < title1Nodes.Count - 1; j++)
            {
                if (titleNodes[i].Line == title1Nodes[j].Line)
                {
                    // The title that follows level-1 title j in document order is the next
                    // level-1 title, so the html between the two belongs to title j.
                    if ((i < titleNodes.Count - 1 && titleNodes[i + 1].Line == title1Nodes[j + 1].Line))
                    {
                        int start = htmlTxt.IndexOf(title1Nodes[j].OuterHtml);
                        int end = htmlTxt.IndexOf(title1Nodes[j + 1].OuterHtml, start + 1);
                        if (start != -1 && end > start)
                        {
                            dic_title1Content_tmp.Add(j, htmlTxt.Substring(start, end - start));
                            break;
                        }
                        else
                        {
                            throw new Exception("title1 content提取出错");
                        }
                    }
                }
            }
        }
        // When the very last title is a level-1 title, everything from the last
        // level-1 title to the end of htmlTxt belongs to it.
        for (int i = 0; i < title1Nodes.Count; i++)
        {
            if (titleNodes.Last().Line == title1Nodes[i].Line)
            {
                int start = htmlTxt.IndexOf(title1Nodes.Last().OuterHtml);
                if (start != -1)
                {
                    dic_title1Content_tmp.Add(title1Nodes.Count - 1, htmlTxt.Substring(start));
                    break;
                }
                else
                {
                    throw new Exception("title1 last content提取出错");
                }
            }
        }
        // Append every footnote div that the content links to (href="#_<id>").
        foreach (var pair in dic_title1Content_tmp)
        {
            string v = pair.Value;
            foreach (var ftnref in ftNoteRefnodes)
            {
                if (pair.Value.Contains("href=\"#_" + ftnref.Attributes["id"].Value + "\""))
                {
                    v = v + ftnref.OuterHtml;
                }
            }
            dic_title1Content.Add(pair.Key, v);
        }
        #endregion
        #region 更新 htmlTxt 字符串,将html文本中一级标题和一级标题下直接的正文 删除
        // Region label: "update htmlTxt: delete the level-1 titles and the text
        // directly under them".
        if (title1Nodes != null)
        {
            for (int i = 0; i < title1Nodes.Count; i++)
            {
                if (dic_title1Content_tmp.Count != 0)// some level-1 title has text directly under it
                {
                    foreach (var pair in dic_title1Content_tmp)
                    {
                        int index = htmlTxt.IndexOf(pair.Value);
                        htmlTxt = htmlTxt.Replace(pair.Value, "");
                        if (i != pair.Key)
                        {
                            htmlTxt = htmlTxt.Replace(title1Nodes[i].OuterHtml, "");
                        }
                    }
                }
                else // no level-1 title has text directly under it
                {
                    htmlTxt = htmlTxt.Replace(title1Nodes[i].OuterHtml, "");
                }
            }
        }
        #endregion
        #region 提取二级标题下Html正文,分小节存储,HtmlNode节点存储在contentNodes,文本存储在str_contentList
        // Region label: "extract the html under each level-2 title, stored per
        // section: nodes in contentNodes, text in str_contentList".
        // index_PartStart carries over between iterations, so each search starts
        // after the previous title.
        int index_PartStart = 0, index_PartEnd = 0;
        for (int i = 0; i < title2Nodes.Count; i++)
        {
            HtmlAgilityPack.HtmlDocument contentNodeDoc = new HtmlAgilityPack.HtmlDocument();
            string str_content;
            if (i < title2Nodes.Count - 1)
            {
                // Section i runs from title i up to title i + 1.
                index_PartStart = htmlTxt.IndexOf(title2Nodes[i].OuterHtml, index_PartStart + 1);
                index_PartEnd = htmlTxt.IndexOf(title2Nodes[i + 1].OuterHtml, index_PartStart + 1);
                if (index_PartStart != -1 && index_PartEnd > index_PartStart)
                {
                    str_content = htmlTxt.Substring(index_PartStart, index_PartEnd - index_PartStart);
                }
                else
                {
                    throw new Exception("提取出错");
                }
            }
            else
            {
                // The last section runs to the end of htmlTxt.
                index_PartStart = htmlTxt.IndexOf(title2Nodes[title2Nodes.Count - 1].OuterHtml, index_PartStart + 1);
                if (index_PartStart != -1)
                {
                    str_content = htmlTxt.Substring(index_PartStart);
                }
                else
                {
                    throw new Exception("提取出错");
                }
            }
            // Append every footnote div that the section links to (href="#_<id>").
            foreach (var ftnref in ftNoteRefnodes)
            {
                if (str_content.Contains("href=\"#_" + ftnref.Attributes["id"].Value + "\""))
                {
                    str_content = str_content + ftnref.OuterHtml;
                }
            }
            contentNodeDoc.LoadHtml(str_content);
            contentNodes.Add(contentNodeDoc.DocumentNode);
            str_contentList.Add(contentNodes[i].OuterHtml);
            // Each section is also written to a numbered file for manual inspection.
            System.IO.File.WriteAllText(@"../../../htmlRcgTest/" + i + @".html", str_contentList[i]);
        }
        #endregion
        // Breakpoint position: inspect str_contentList / str_titleList /
        // str_title1List / str_title2List / dic_title1Content in the Locals window:
        // 1. whether the counts are correct;
        // 2. whether the contents are correct and complete (the text under each
        //    level-2 title can be viewed in the files written above).
    }
    catch (Exception err)
    {
        // Extraction errors are only printed; the database step below still runs.
        Console.WriteLine(err.Message);
    }
    #region 将一、二级标题及内容录入数据库
    // Region label: "write the level-1 and level-2 titles and their content to the database".
    try
    {
        SQLUtils sqlUtils = SQLUtils.getInstance();
        sqlUtils.makeConnect();
        ConventionRow tmp_rootConvention = rootConvention;
        for (int i = 0; i < title1Nodes.Count; i++)
        {
            ConventionRow tempRow1 = null;
            foreach (var pair in dic_title1Content)
            {
                if (pair.Key == i)// the level-1 title has content and no level-2 entries
                {
                    tempRow1 = new ConventionRow(rootConvention, str_title1List[i], i + 1, ConventionOptions.CATEGORY.IS_CONTENT, pair.Value);
                    sqlUtils.writeRow_local(tempRow1);
                    retInfo.title1Guids.Add(tempRow1.Guid);
                    break;
                }
            }
            if (tempRow1 == null)// the level-1 title has no content of its own but has level-2 entries
            {
                tempRow1 = new ConventionRow(rootConvention, str_title1List[i], i + 1, ConventionOptions.CATEGORY.IS_CATEGORY);
                sqlUtils.writeRow_local(tempRow1);
                retInfo.title1Guids.Add(tempRow1.Guid);
            }
            // k numbers the level-2 rows written under this level-1 title, starting at 1.
            for (int j = 0, k = 0; j < title2Nodes.Count; j++)
            {
                tmp_rootConvention = tempRow1;
                if (i < title1Nodes.Count - 1)
                {
                    // A level-2 title belongs to level-1 title i when its line lies between
                    // title i and title i + 1.
                    if (title2Nodes[j].Line <title1Nodes[i + 1].Line && title2Nodes[j].Line> title1Nodes[i].Line)
                    {
                        ConventionRow tempRow2 = new ConventionRow(tmp_rootConvention, str_title2List[j], ++k, ConventionOptions.CATEGORY.IS_CONTENT, str_contentList[j]);
                        sqlUtils.writeRow_local(tempRow2);
                    }
                }
                else if (title2Nodes[j].Line > title1Nodes[i].Line)
                {
                    // Last level-1 title: every later level-2 title belongs to it.
                    ConventionRow tempRow2 = new ConventionRow(tmp_rootConvention, str_title2List[j], ++k, ConventionOptions.CATEGORY.IS_CONTENT, str_contentList[j]);
                    sqlUtils.writeRow_local(tempRow2);
                }
            }
        }
        retInfo.title1s = str_title1List;
        retInfo.title2s = str_title2List;
        retInfo.title2Contents = str_contentList;
        retInfo.titles = str_titleList;
        retInfo.title1ContentsNum = dic_title1Content.Count;
    }
    catch (Exception err)
    {
        Console.WriteLine(err.Message);
        retInfo.errorInfo = "录入失败。错误原因:" + err.Message;
    }
    return(retInfo);
    #endregion
}
/// <summary>
/// Renders the sections that follow each h3 header of the document as
/// formatted rows.
/// </summary>
/// <remarks>
/// For every h3 header a <c>TdPair</c> row with the header text is written,
/// followed by the data of the node right after the header. Three headers are
/// understood (DEMOGRAPHICS, CERTIFICATION, SUBSTANTIATED FINDINGS); processing
/// stops at the first header that is none of them. Missing nodes produce empty
/// cells instead of a null-reference failure.
/// </remarks>
/// <param name="docNode">The node whose document is searched for h3 headers.</param>
/// <returns>The formatted rows; empty when the document has no h3 header.</returns>
private string GetNodeValues(HtmlNode docNode)
{
    StringBuilder builder = new StringBuilder();

    // SelectNodes returns null (not an empty collection) when nothing matches.
    HtmlNodeCollection headers = docNode.SelectNodes("//h3");
    if (headers == null)
    {
        return(builder.ToString());
    }

    foreach (HtmlNode header in headers)
    {
        // The section is the node right after its header, so the header text is
        // the same text the original read back through section.PreviousSibling.
        HtmlNode section = header.NextSibling;
        string headerText = header.InnerText;

        builder.AppendFormat(TdPair, headerText, String.Empty);
        builder.AppendLine();

        //handle demographics
        if (headerText.Contains("DEMOGRAPHICS"))
        {
            HtmlNode firstRow = section == null ? null : section.SelectSingleNode("tr");
            HtmlNode nameCell = firstRow == null ? null : firstRow.LastChild;

            builder.AppendFormat(TdSingle, "Name");
            builder.AppendFormat(TdSingle, nameCell == null ? null : nameCell.InnerText);
            builder.AppendLine();
        }
        //handle certification
        else if (headerText.Contains("CERTIFICATION"))
        {
            HtmlNodeCollection rows = section == null ? null : section.SelectNodes("tr");
            if (rows != null)
            {
                foreach (HtmlNode row in rows)
                {
                    HtmlNodeCollection cells = row.SelectNodes("td");
                    if (cells != null)
                    {
                        foreach (HtmlNode cell in cells)
                        {
                            builder.AppendFormat(TdSingle, cell.InnerText);
                        }
                    }
                    builder.AppendLine();
                }
            }
        }
        //handle substantiated findings
        else if (headerText.Contains("SUBSTANTIATED FINDINGS"))
        {
            // The findings text lives in the node after the section node.
            HtmlNode findings = section == null ? null : section.NextSibling;

            builder.AppendFormat(TdSingle, "Data");
            builder.AppendFormat(TdSingle, findings == null ? null : findings.InnerText);
            builder.AppendLine();
        }
        else
        {
            // Unknown header: stop processing the remaining sections.
            break;
        }
    }

    return(builder.ToString());
}