/// <summary>
/// Splits the chapter into pages and saves the result.
/// Content is passed to <c>DivideInPages</c> in this order: optional title header,
/// body lines, optional author note, optional footnotes.
/// </summary>
/// <param name="chapterNode">
/// Node whose direct <c>p</c> children (ids starting with 'L') are the chapter lines.
/// When null, the <c>div</c> with id <c>novel_honbun</c> is looked up in the current document.
/// When non-null, the working document is replaced with a new, empty <c>HtmlDocument</c>.
/// </param>
public void GetChapter(HtmlNode chapterNode = null)
{
    bool nodeSupplied = chapterNode != null;
    if (nodeSupplied)
    {
        _doc = new HtmlDocument();
    }

    // Running values handed by reference to every DivideInPages call below.
    int check = 0;
    int currentPage = 0;

    _footnotes = new HtmlNodeCollection(_doc.DocumentNode);

    if (Settings.Default.IncludeChapterTitle)
    {
        _header = new HtmlNodeCollection(_doc.DocumentNode);

        HtmlNode titleLine = HtmlNode.CreateNode($"<p id=\"Lh0\">{Name}</p>");
        HtmlNode separatorLine = HtmlNode.CreateNode("================================");

        _header.Insert(0, titleLine);
        _header.Insert(1, separatorLine);

        DivideInPages(_header, ref check, ref currentPage);
    }

    HtmlNode bodyNode = nodeSupplied
        ? chapterNode
        : _doc.DocumentNode.SelectSingleNode("//div[@id='novel_honbun']");

    HtmlNodeCollection bodyLines = bodyNode?.SelectNodes("./p[starts-with(@id, 'L')]");
    if (bodyLines == null)
    {
        // No chapter lines found: leave without processing notes/footnotes and without saving.
        return;
    }

    DivideInPages(bodyLines, ref check, ref currentPage);

    if (Settings.Default.IncludeAuthorNote)
    {
        HtmlNode noteContainer = _doc.DocumentNode.SelectSingleNode("//div[@id='novel_a']");
        HtmlNodeCollection noteLines = noteContainer?.SelectNodes("./p[starts-with(@id, 'La')]");

        if (noteLines != null)
        {
            // Banner line placed in front of the author's note.
            HtmlNode banner = HtmlNode.CreateNode("<p id=\"La0\">================Author Note================</p>");
            noteLines.Insert(0, banner);
            DivideInPages(noteLines, ref check, ref currentPage);
        }
    }

    if (Settings.Default.IncludeFootnotes && _footnotes.Count > 0)
    {
        DivideInPages(_footnotes, ref check, ref currentPage);
    }

    Save();
}
/// <summary>
/// Checks that inserting an element at index 0 of an <c>HtmlNodeCollection</c> sets the
/// element's parent to the collection's owner and places it at position 0, then attempts
/// to insert a null element.
/// </summary>
public void InsertTest()
{
    HtmlElement root = new HtmlElement("root");
    HtmlNodeCollection target = new HtmlNodeCollection(root);

    HtmlElement child = new HtmlElement("child");
    target.Add(child);

    // Insert a second element in front of the first one.
    child = new HtmlElement("second");
    target.Insert(0, child);

    Assert.AreEqual(root, child.Parent);
    // Expected value goes first so a failure message reports the values the right way round.
    Assert.AreEqual(0, target.IndexOf(child));

    // NOTE(review): presumably this call is expected to throw; any expected-exception
    // attribute on the method is outside the visible source - confirm.
    target.Insert(0, null);
}
/// <summary>
/// Walks the meeting archive starting at <paramref name="url"/>, visits the "Current" link and
/// every archived-month link, and for each meeting whose title matches the Planning Commission /
/// City Council filter downloads the agenda (element with id <c>Table1</c>), stores it locally,
/// and records a document and a query result for it.
/// </summary>
/// <param name="url">Address of the archive index page.</param>
/// <param name="docs">
/// Known documents, matched by <c>DocSource</c>; newly downloaded agendas are appended.
/// </param>
/// <param name="queries">
/// Known query results, matched by <c>DocId</c>; a new entry is appended when none exists.
/// </param>
private void ExtractAgenda(string url, ref List<Documents> docs, ref List<QueryResult> queries)
{
    // Built once; the original rebuilt the year pattern for every archive link.
    Regex dateReg = new Regex("[A-Za-z]+[\\s]{1}[0-9]{1,2},[\\s]+[0-9]{2,4}");
    Regex digitReg = new Regex("[0-9]{4}");
    string category = "Planning Commission";

    HtmlWeb web = new HtmlWeb();
    HtmlDocument archiveDoc = web.Load(url);
    HtmlNodeCollection archiveMonthList = archiveDoc.DocumentNode.SelectNodes("//a[@class='archivedisplaymonthlink']");
    HtmlNode currentNode = archiveDoc.DocumentNode.SelectSingleNode("//a[text()='Current']");

    // Either lookup may come back null; collect whatever exists into a separate list
    // ("Current" first) instead of inserting into the query-result collection.
    List<HtmlNode> archiveNodes = new List<HtmlNode>();
    if (currentNode != null)
    {
        archiveNodes.Add(currentNode);
    }
    if (archiveMonthList != null)
    {
        archiveNodes.AddRange(archiveMonthList);
    }

    foreach (HtmlNode archiveNode in archiveNodes)
    {
        int year;
        if (archiveNode.InnerText == "Current")
        {
            // The "Current" link carries no year in its text; use today's year
            // rather than a fixed literal that goes stale.
            year = DateTime.Now.Year;
        }
        else
        {
            Match yearMatch = digitReg.Match(archiveNode.InnerText);
            if (!yearMatch.Success)
            {
                Console.WriteLine("No year found in '{0}', skip...", archiveNode.InnerText);
                continue;
            }
            year = int.Parse(yearMatch.Value);
        }

        if (year < this.dtStartFrom.Year)
        {
            Console.WriteLine("Too early, skip...");
            continue;
        }

        Console.WriteLine("Working on {0}...", archiveNode.InnerText);
        string monthUrl = archiveNode.Attributes["href"].Value;
        HtmlDocument monthDoc = web.Load(monthUrl);

        // Page count is the number of links in the cell following the "Current" cell.
        // When that cell or its links are missing, still process the page already loaded.
        HtmlNode pageNode = monthDoc.DocumentNode.SelectSingleNode("//a[text()='Current']/parent::td/following-sibling::td");
        HtmlNodeCollection pageLinks = pageNode == null ? null : pageNode.SelectNodes("./a");
        int totalPage = pageLinks == null ? 1 : pageLinks.Count;

        for (int page = 1; page <= totalPage; page++)
        {
            HtmlDocument pageDoc = monthDoc;
            if (page > 1)
            {
                Console.WriteLine("Go to page {0}...", page);
                // Always derive the page URL from the unmodified month URL; rewriting the
                // previous page's URL would stack a second paging segment onto it.
                string pageUrl = monthUrl.Replace("/158/", string.Format("/158/nnpg1480/{0}/", page));
                pageDoc = web.Load(pageUrl);
            }

            HtmlNodeCollection meetingNodes = pageDoc.DocumentNode.SelectNodes("//span[@class='newstitle']/a");
            if (meetingNodes == null)
            {
                // No meeting links on this page.
                continue;
            }

            foreach (HtmlNode meetingNode in meetingNodes)
            {
                string meetingTitle = meetingNode.InnerText;

                // Ordinal case-insensitive search replaces culture-sensitive ToLower().
                bool isPlanningMeeting = meetingTitle.Contains("Planning Commission")
                    && meetingTitle.IndexOf("cancelled", StringComparison.OrdinalIgnoreCase) < 0;
                bool isCouncilMeeting = meetingTitle.IndexOf("city council", StringComparison.OrdinalIgnoreCase) >= 0;
                if (!isPlanningMeeting && !isCouncilMeeting)
                {
                    continue;
                }

                string meetingAgendaUrl = meetingNode.Attributes["href"].Value;
                Documents localDoc = docs.FirstOrDefault(t => t.DocSource == meetingAgendaUrl);

                if (localDoc == null)
                {
                    HtmlDocument agendaDoc = web.Load(meetingAgendaUrl);
                    HtmlNode agendaContentNode = agendaDoc.GetElementbyId("Table1");
                    if (agendaContentNode == null)
                    {
                        // Nothing to store or parse; previously this path dereferenced null.
                        Console.WriteLine("No agenda content found at {0}, skip...", meetingAgendaUrl);
                        continue;
                    }

                    localDoc = new Documents();
                    localDoc.DocType = category;
                    localDoc.DocId = Guid.NewGuid().ToString();
                    localDoc.CityId = this.cityEntity.CityId;
                    localDoc.DocSource = meetingAgendaUrl;

                    // File name is the second-to-last path segment of the URL (query string removed).
                    string localFile = string.Format("{0}\\{1}.html", this.localDirectory, meetingAgendaUrl.Split('?').FirstOrDefault().Split('/').Reverse().ElementAt(1));
                    localDoc.DocLocalPath = localFile;

                    File.WriteAllText(localFile, agendaContentNode.InnerHtml, Encoding.UTF8);
                    localDoc.DocBodyDic.Add(1, agendaContentNode.InnerText);
                    docs.Add(localDoc);
                }
                else
                {
                    if (localDoc.DocBodyDic.Count == 0)
                    {
                        // Body not in memory yet: reload it from the saved copy.
                        string html = File.ReadAllText(localDoc.DocLocalPath);
                        HtmlDocument htmlDoc = new HtmlDocument();
                        htmlDoc.LoadHtml(html);
                        localDoc.DocBodyDic.Add(1, htmlDoc.DocumentNode.InnerText);
                    }

                    Console.WriteLine("this file already downloaded..");
                }

                // The meeting date is the first "Month d, yyyy"-style match in the agenda text.
                Match dateMatch = dateReg.Match(localDoc.DocBodyDic[1]);
                DateTime meetingDate;
                if (!dateMatch.Success || !DateTime.TryParse(dateMatch.Value, out meetingDate))
                {
                    Console.WriteLine("No meeting date found in {0}, skip...", meetingAgendaUrl);
                    continue;
                }

                // City Council meetings dated on or before the first day of the current month
                // (at the current time of day) are ignored.
                if (meetingTitle.Contains("City Council") && meetingDate <= DateTime.Now.AddDays(1 - DateTime.Now.Day))
                {
                    continue;
                }

                if (meetingDate < this.dtStartFrom)
                {
                    continue;
                }

                QueryResult qr = queries.FirstOrDefault(t => t.DocId == localDoc.DocId);
                if (qr == null)
                {
                    qr = new QueryResult();
                    qr.CityId = localDoc.CityId;
                    qr.DocId = localDoc.DocId;
                    qr.MeetingDate = meetingDate;
                    qr.SearchTime = DateTime.Now;
                    queries.Add(qr);
                }

                this.ExtractQueriesFromDoc(localDoc, ref qr);
                Console.WriteLine("{0} docs saved, {1} queries saved...", docs.Count, queries.Count);
            }
        }
    }
}