示例#1
0
        /// <summary>
        /// Collects the lines of a chapter (optional title header, body lines,
        /// optional author note, optional footnotes), hands each group to
        /// <c>DivideInPages</c> in that order, and finally calls <c>Save</c>.
        /// </summary>
        /// <param name="chapterNode">
        /// Node whose direct <c>p</c> children with an id starting with "L" are the
        /// body lines. When omitted, the <c>div</c> with id "novel_honbun" of the
        /// current document is used instead.
        /// </param>
        public void GetChapter(HtmlNode chapterNode = null)
        {
            // A caller-supplied node causes the document field to be replaced by a new, empty document.
            if (chapterNode != null)
            {
                _doc = new HtmlDocument();
            }

            // Both values are passed by reference to every DivideInPages call below,
            // so whatever state they carry is shared across all line groups.
            int chk       = 0;
            int pageIndex = 0;

            _footnotes = new HtmlNodeCollection(_doc.DocumentNode);

            if (Settings.Default.IncludeChapterTitle)
            {
                _header = new HtmlNodeCollection(_doc.DocumentNode);

                HtmlNode titleNode     = HtmlNode.CreateNode($"<p id=\"Lh0\">{Name}</p>");
                HtmlNode separatorNode = HtmlNode.CreateNode("================================");

                _header.Insert(0, titleNode);
                _header.Insert(1, separatorNode);

                DivideInPages(_header, ref chk, ref pageIndex);
            }

            // Fall back to the chapter body container of the current document.
            chapterNode = chapterNode ?? _doc.DocumentNode.SelectSingleNode("//div[@id='novel_honbun']");

            HtmlNodeCollection bodyLines = chapterNode == null
                ? null
                : chapterNode.SelectNodes("./p[starts-with(@id, 'L')]");

            // No body lines: stop here. Note that Save is not called on this path.
            if (bodyLines == null)
            {
                return;
            }

            DivideInPages(bodyLines, ref chk, ref pageIndex);

            if (Settings.Default.IncludeAuthorNote)
            {
                HtmlNode           authorNoteDiv   = _doc.DocumentNode.SelectSingleNode("//div[@id='novel_a']");
                HtmlNodeCollection authorNoteLines = authorNoteDiv?.SelectNodes("./p[starts-with(@id, 'La')]");

                if (authorNoteLines != null)
                {
                    // Put a heading line in front of the author note before paging it.
                    HtmlNode heading = HtmlNode.CreateNode("<p id=\"La0\">================Author Note================</p>");
                    authorNoteLines.Insert(0, heading);

                    DivideInPages(authorNoteLines, ref chk, ref pageIndex);
                }
            }

            // The footnote collection is created empty above; it only has content here
            // if something called in between added to it (not visible in this method).
            if (Settings.Default.IncludeFootnotes && _footnotes.Count > 0)
            {
                DivideInPages(_footnotes, ref chk, ref pageIndex);
            }

            Save();
        }
        /// <summary>
        /// Verifies that inserting an element at index 0 of an
        /// <c>HtmlNodeCollection</c> sets the element's parent to the collection's
        /// owner and places it at the front, then inserts <c>null</c> at index 0.
        /// </summary>
        public void InsertTest()
        {
            HtmlElement root = new HtmlElement("root");
            HtmlNodeCollection target = new HtmlNodeCollection(root);
            HtmlElement child = new HtmlElement("child");
            target.Add(child);

            child = new HtmlElement("second");
            target.Insert(0, child);
            Assert.AreEqual(root, child.Parent);
            // Expected value first, actual second, so a failure message labels them correctly.
            Assert.AreEqual(0, target.IndexOf(child));

            // NOTE(review): inserting null is presumably expected to throw; the
            // expectation (e.g. an attribute on the test) is not visible here - confirm.
            target.Insert(0, null);
        }
示例#3
0
        /// <summary>
        /// Walks the agenda archive found at <paramref name="url"/>: for the
        /// "Current" link and every archive-month link it loads each result page,
        /// picks the Planning Commission (not cancelled) and City Council meetings,
        /// downloads agendas that are not yet in <paramref name="docs"/>, and
        /// registers a query result per qualifying meeting.
        /// </summary>
        /// <param name="url">Address of the archive index page.</param>
        /// <param name="docs">
        /// Known documents, matched by <c>DocSource</c>; newly downloaded agendas are appended.
        /// </param>
        /// <param name="queries">
        /// Known query results, matched by <c>DocId</c>; missing ones are appended.
        /// </param>
        private void ExtractAgenda(string url, ref List <Documents> docs, ref List <QueryResult> queries)
        {
            Regex              dateReg          = new Regex("[A-Za-z]+[\\s]{1}[0-9]{1,2},[\\s]+[0-9]{2,4}");
            Regex              digitReg         = new Regex("[0-9]{4}");
            string             category         = "Planning Commission";
            HtmlWeb            web              = new HtmlWeb();
            HtmlDocument       archiveDoc       = web.Load(url);
            HtmlNodeCollection archiveMonthList = archiveDoc.DocumentNode.SelectNodes("//a[@class='archivedisplaymonthlink']");
            HtmlNode           currentNode      = archiveDoc.DocumentNode.SelectSingleNode("//a[text()='Current']");

            // Either lookup may come back null when the page has no match, so the
            // work list is assembled separately ("Current" first, then the months)
            // instead of inserting into a possibly-null node collection.
            List<HtmlNode> archiveNodes = new List<HtmlNode>();
            if (currentNode != null)
            {
                archiveNodes.Add(currentNode);
            }
            if (archiveMonthList != null)
            {
                archiveNodes.AddRange(archiveMonthList);
            }

            foreach (HtmlNode archiveNode in archiveNodes)
            {
                int year;
                if (archiveNode.InnerText == "Current")
                {
                    // The "Current" link carries no year in its text; treat it as the present year.
                    year = DateTime.Now.Year;
                }
                else
                {
                    Match yearMatch = digitReg.Match(archiveNode.InnerText);
                    if (!yearMatch.Success)
                    {
                        Console.WriteLine("No year found in '{0}', skip...", archiveNode.InnerText);
                        continue;
                    }

                    year = int.Parse(yearMatch.Value);
                }

                if (year < this.dtStartFrom.Year)
                {
                    Console.WriteLine("Too early, skip...");
                    continue;
                }

                Console.WriteLine("Working on {0}...", archiveNode.InnerText);
                string       monthUrl = archiveNode.Attributes["href"].Value;
                HtmlDocument monthDoc = web.Load(monthUrl);
                HtmlNode     pageNode = monthDoc.DocumentNode.SelectSingleNode("//a[text()='Current']/parent::td/following-sibling::td");

                // The page count is the number of links in the pager cell; when the
                // cell or its links are missing, only the already-loaded page is read.
                HtmlNodeCollection pageLinks = pageNode == null ? null : pageNode.SelectNodes("./a");
                int                totalPage = pageLinks == null ? 1 : pageLinks.Count;

                for (int page = 1; page <= totalPage; page++)
                {
                    HtmlDocument pageDoc = monthDoc;

                    if (page > 1)
                    {
                        Console.WriteLine("Go to page {0}...", page);
                        // Always derive the page address from the unmodified month URL;
                        // rewriting the same string repeatedly would stack the paging
                        // segment once per page.
                        string pageUrl = monthUrl.Replace("/158/", string.Format("/158/nnpg1480/{0}/", page));
                        pageDoc = web.Load(pageUrl);
                    }

                    HtmlNodeCollection meetingNodes = pageDoc.DocumentNode.SelectNodes("//span[@class='newstitle']/a");
                    if (meetingNodes == null)
                    {
                        // Nothing listed on this page.
                        continue;
                    }

                    foreach (HtmlNode meetingNode in meetingNodes)
                    {
                        string meetingTitle = meetingNode.InnerText;
                        bool   isPlanning   = meetingTitle.Contains("Planning Commission")
                                              && meetingTitle.IndexOf("cancelled", StringComparison.OrdinalIgnoreCase) < 0;
                        bool   isCouncil    = meetingTitle.IndexOf("city council", StringComparison.OrdinalIgnoreCase) >= 0;

                        if (!isPlanning && !isCouncil)
                        {
                            continue;
                        }

                        string    meetingAgendaUrl = meetingNode.Attributes["href"].Value;
                        Documents localDoc         = docs.FirstOrDefault(t => t.DocSource == meetingAgendaUrl);
                        DateTime  meetingDate      = DateTime.MinValue;

                        if (localDoc == null)
                        {
                            HtmlDocument agendaDoc         = web.Load(meetingAgendaUrl);
                            HtmlNode     agendaContentNode = agendaDoc.GetElementbyId("Table1");

                            if (agendaContentNode == null)
                            {
                                // Without the content table there is no text to store or
                                // search; leave the document unregistered so a later run
                                // can try again.
                                Console.WriteLine("No agenda content found at {0}, skip...", meetingAgendaUrl);
                                continue;
                            }

                            localDoc           = new Documents();
                            // Every stored document gets the same DocType, including City Council ones.
                            localDoc.DocType   = category;
                            localDoc.DocId     = Guid.NewGuid().ToString();
                            localDoc.CityId    = this.cityEntity.CityId;
                            localDoc.DocSource = meetingAgendaUrl;
                            // File name = second-to-last path segment of the URL (query string removed).
                            string localFile = string.Format("{0}\\{1}.html",
                                                             this.localDirectory,
                                                             meetingAgendaUrl.Split('?').FirstOrDefault().Split('/').Reverse().ElementAt(1));
                            localDoc.DocLocalPath = localFile;

                            File.WriteAllText(localFile, agendaContentNode.InnerHtml, Encoding.UTF8);
                            localDoc.DocBodyDic.Add(1, agendaContentNode.InnerText);
                            docs.Add(localDoc);
                        }
                        else
                        {
                            if (localDoc.DocBodyDic.Count == 0)
                            {
                                // Known document without a loaded body: rebuild the text from the saved file.
                                string       html    = File.ReadAllText(localDoc.DocLocalPath);
                                HtmlDocument htmlDoc = new HtmlDocument();
                                htmlDoc.LoadHtml(html);
                                localDoc.DocBodyDic.Add(1, htmlDoc.DocumentNode.InnerText);
                            }
                            Console.WriteLine("this file already downloaded..");
                        }

                        // The meeting date is the first "Month d, yyyy"-shaped text in the body.
                        Match dateMatch = dateReg.Match(localDoc.DocBodyDic[1]);
                        if (!dateMatch.Success || !DateTime.TryParse(dateMatch.Value, out meetingDate))
                        {
                            Console.WriteLine("No meeting date found in {0}, skip...", meetingAgendaUrl);
                            continue;
                        }

                        // City Council meetings are kept only when dated after the first
                        // day of the current month (compared at the current time of day).
                        if (meetingTitle.Contains("City Council") && meetingDate <= DateTime.Now.AddDays(1 - DateTime.Now.Day))
                        {
                            continue;
                        }

                        if (meetingDate < this.dtStartFrom)
                        {
                            continue;
                        }

                        QueryResult qr = queries.FirstOrDefault(t => t.DocId == localDoc.DocId);

                        if (qr == null)
                        {
                            qr             = new QueryResult();
                            qr.CityId      = localDoc.CityId;
                            qr.DocId       = localDoc.DocId;
                            qr.MeetingDate = meetingDate;
                            qr.SearchTime  = DateTime.Now;

                            queries.Add(qr);
                        }

                        this.ExtractQueriesFromDoc(localDoc, ref qr);
                        Console.WriteLine("{0} docs saved, {1} queries saved...", docs.Count, queries.Count);
                    }
                }
            }
        }