/// <summary>
/// Downloads the JSON listing at <paramref name="url"/> and processes every entry whose title
/// contains a meeting date on or after <c>dtStartFrom</c>: the linked file is downloaded unless a
/// document with the same source URL is already in <paramref name="docs"/>, the document is passed
/// to <c>ReadText</c>, and a <c>QueryResult</c> for it is created if missing before
/// <c>ExtractQueriesFromDoc</c> is called.
/// </summary>
/// <param name="url">Address of the JSON listing.</param>
/// <param name="category">Value stored in <c>DocType</c> of newly created documents.</param>
/// <param name="docs">Known documents; newly downloaded ones are appended.</param>
/// <param name="queries">Known query results; newly created ones are appended.</param>
private void ExtractCouncil(string url, string category, ref List<Documents> docs, ref List<QueryResult> queries)
{
    Regex dateReg = new Regex("[A-Za-z]+[\\s]{1}[0-9]{1,2},[\\s]+[0-9]{2,4}");

    // WebClient is IDisposable; the using block releases it even when a download throws.
    using (WebClient c = new WebClient())
    {
        // The payload is wrapped in a {"root": ...} object before it is converted to XML.
        string json = c.DownloadString(url);
        json = string.Format("{0}\"root\":{1}{2}", "{", json, "}");
        XmlDocument docXml = JsonConvert.DeserializeXmlNode(json, "root");
        XmlNodeList docList = docXml.SelectNodes("//root/data");

        foreach (XmlNode docNode in docList)
        {
            XmlNode titleNode = docNode.SelectSingleNode("./title");
            XmlNode hrefNode = docNode.SelectSingleNode("./attr/href");
            if (titleNode == null || hrefNode == null)
            {
                Console.WriteLine("Entry without title or href, skip...");
                continue;
            }

            string title = titleNode.InnerText;
            string docUrl = "https://southfieldcitymi.documents-on-demand.com" + hrefNode.InnerText;

            // A title without a recognisable date must not abort the whole listing.
            DateTime meetingDate;
            if (!DateTime.TryParse(dateReg.Match(title).Value, out meetingDate))
            {
                Console.WriteLine("No meeting date found in '{0}', skip...", title);
                continue;
            }

            if (meetingDate < this.dtStartFrom)
            {
                Console.WriteLine("Too early, skip...");
                continue;
            }

            Documents localDoc = docs.FirstOrDefault(t => t.DocSource == docUrl);
            if (localDoc == null)
            {
                localDoc = new Documents();
                localDoc.DocId = Guid.NewGuid().ToString();
                localDoc.CityId = this.cityEntity.CityId;
                localDoc.Checked = false;
                localDoc.DocType = category;
                localDoc.DocSource = docUrl;

                // A fresh GUID prefix keeps local file names unique.
                string localPath = string.Format("{0}\\{1}_{2}", this.localDirectory, Guid.NewGuid().ToString(), docUrl.Split('/').LastOrDefault());
                localDoc.DocLocalPath = localPath;

                try
                {
                    c.DownloadFile(docUrl, localPath);
                }
                catch (Exception ex)
                {
                    // Best effort: report the failure and carry on with the remaining entries.
                    Console.WriteLine("Failed to download {0}: {1}", docUrl, ex.Message);
                }

                docs.Add(localDoc);
            }
            else
            {
                Console.ForegroundColor = ConsoleColor.Yellow;
                Console.WriteLine("{0} already downloaded...", docUrl);
                Console.ResetColor();
            }

            this.ReadText(false, localDoc.DocLocalPath, ref localDoc);

            QueryResult qr = queries.FirstOrDefault(t => t.DocId == localDoc.DocId);
            if (qr == null)
            {
                qr = new QueryResult();
                qr.DocId = localDoc.DocId;
                qr.CityId = localDoc.CityId;
                qr.SearchTime = DateTime.Now;
                qr.MeetingDate = meetingDate;
                queries.Add(qr);
            }

            this.ExtractQueriesFromDoc(localDoc, ref qr);
            Console.WriteLine("{0} docs saved, {1} queries saved...", docs.Count, queries.Count);
        }
    }
}
/// <summary>
/// Walks every listing page in <c>docUrls</c>, collects the links under elements whose id contains
/// "vid", and for each link whose file name parses as a date on or after <c>dtStartFrom</c>
/// downloads the file (unless its URL is already known), passes it to <c>ReadText</c>, ensures a
/// <c>QueryResult</c> exists and calls <c>ExtractQueriesFromDoc</c>. Results are written with
/// <c>SaveMeetingResultsToSQL</c> after each listing page.
/// </summary>
public void DownloadCouncilPdfFiles()
{
    List<Documents> docs = this.LoadDocumentsDoneSQL();
    List<QueryResult> queries = this.LoadQueriesDoneSQL();
    HtmlWeb web = new HtmlWeb();

    // WebClient is IDisposable; the using block releases it even when a download throws.
    using (WebClient c = new WebClient())
    {
        foreach (string url in this.docUrls)
        {
            HtmlDocument listDoc = web.Load(url);
            HtmlNodeCollection docNodeList = listDoc.DocumentNode.SelectNodes("//div[contains(@id,'vid')]//ul/li/a[@href]");

            if (docNodeList != null)
            {
                Console.WriteLine("{0} files...", docNodeList.Count);

                foreach (HtmlNode docNode in docNodeList)
                {
                    string pdfName = docNode.InnerText.Trim('\t', '\r', '\n', (char)32, (char)160);
                    string tag = pdfName.ToLower().Contains("agenda") ? "agenda" : "minute";

                    string pdfUrl = docNode.Attributes["href"].Value;
                    pdfUrl = pdfUrl.StartsWith("http") ? pdfUrl : "http://www.saginaw-mi.com" + pdfUrl;

                    // The meeting date is taken from the file name (last path segment, query string removed).
                    string fileName = pdfUrl.Split('?').FirstOrDefault().Split('/').LastOrDefault();

                    DateTime meetingDate;
                    if (!DateTime.TryParse(fileName.Replace(".pdf", string.Empty), out meetingDate))
                    {
                        // A link whose file name is not a date must not abort the whole run.
                        Console.WriteLine("Cannot read a meeting date from {0}. Skip...", pdfUrl);
                        continue;
                    }

                    if (meetingDate < this.dtStartFrom)
                    {
                        Console.WriteLine("{0} earlier than {1}. Skip...", meetingDate, dtStartFrom);
                        continue;
                    }

                    Documents localdoc = docs.FirstOrDefault(t => t.DocSource == pdfUrl);
                    if (localdoc == null)
                    {
                        string category = "Council";
                        localdoc = new Documents();
                        localdoc.DocId = Guid.NewGuid().ToString();
                        localdoc.DocType = category;
                        localdoc.CityId = this.cityEntity.CityId;
                        localdoc.DocSource = pdfUrl;

                        string localPath = string.Format("{0}\\{1}_{2}", this.localDirectory, tag, fileName);
                        localPath = HttpUtility.UrlDecode(localPath);
                        localdoc.DocLocalPath = localPath;

                        try
                        {
                            c.DownloadFile(pdfUrl, localPath);
                        }
                        catch (Exception ex)
                        {
                            // Best effort: report the failure and carry on with the remaining links.
                            Console.WriteLine("Failed to download {0}: {1}", pdfUrl, ex.Message);
                        }

                        docs.Add(localdoc);
                    }
                    else
                    {
                        Console.ForegroundColor = ConsoleColor.Yellow;
                        Console.WriteLine("This document already downloaded...");
                        Console.ResetColor();
                    }

                    this.ReadText(false, localdoc.DocLocalPath, ref localdoc);

                    QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);
                    if (qr == null)
                    {
                        qr = new QueryResult();
                        qr.DocId = localdoc.DocId;
                        qr.CityId = localdoc.CityId;
                        qr.MeetingDate = meetingDate;
                        qr.SearchTime = DateTime.Now;
                        queries.Add(qr);
                    }

                    this.ExtractQueriesFromDoc(localdoc, ref qr);
                    Console.WriteLine("{0} documents saved...", docs.Count);
                    Console.WriteLine("{0} query results saved...", queries.Count);
                }
            }

            this.SaveMeetingResultsToSQL(docs, queries);
        }
    }
}
/// <summary>
/// Loads the page at <paramref name="url"/>, finds the "(pdf)" links inside the
/// <c>dnn_ctr1988_Display_HtmlHolder</c> element and, for each one dated on or after
/// <c>dtStartFrom</c>, downloads the file (unless its URL is already known), passes it to
/// <c>ReadText</c>, ensures a <c>QueryResult</c> exists and calls <c>ExtractQueriesFromDoc</c>.
/// </summary>
/// <param name="url">Address of the minutes page.</param>
/// <param name="docs">Known documents; newly downloaded ones are appended.</param>
/// <param name="queries">Known query results; newly created ones are appended.</param>
private void ExtractCommissionMinutes(string url, ref List<Documents> docs, ref List<QueryResult> queries)
{
    Regex dateReg = new Regex("[A-Za-z]+[\\s]{1}[0-9]{1,2},[\\s]+[0-9]{2,4}");
    // Fallback pattern for dates embedded in the link URL; built once instead of once per link.
    Regex urlDateReg = new Regex("[0-9]{1,2}-[0-9]{1,2}-[0-9]{2}");
    string category = "Planning Commission";

    HtmlWeb web = new HtmlWeb();
    HtmlDocument minuteDoc = web.Load(url);
    HtmlNodeCollection docNodeList = minuteDoc.DocumentNode.SelectNodes("//div[@id='dnn_ctr1988_Display_HtmlHolder']//a[contains(text(),'(pdf)')]");
    if (docNodeList == null || docNodeList.Count == 0)
    {
        return;
    }

    // WebClient is IDisposable; the using block releases it even when a download throws.
    using (WebClient c = new WebClient())
    {
        foreach (HtmlNode docNode in docNodeList)
        {
            string docUrl = this.cityEntity.CityUrl + docNode.Attributes["href"].Value;

            // First try the link text, then the URL. When both fail meetingDate is
            // DateTime.MinValue and the entry is skipped by the date check below.
            DateTime meetingDate;
            if (!DateTime.TryParse(dateReg.Match(docNode.InnerText).Value, out meetingDate))
            {
                // "M-d-yy" accepts one- and two-digit months/days, matching what the regex allows;
                // "MM-dd-yy" would reject single-digit values.
                DateTime.TryParseExact(urlDateReg.Match(docUrl).Value, "M-d-yy", null, System.Globalization.DateTimeStyles.None, out meetingDate);
            }

            if (meetingDate < this.dtStartFrom)
            {
                Console.WriteLine("too early, skip...");
                continue;
            }

            Documents localDoc = docs.FirstOrDefault(t => t.DocSource == docUrl);
            if (localDoc == null)
            {
                localDoc = new Documents();
                localDoc.DocId = Guid.NewGuid().ToString();
                localDoc.CityId = this.cityEntity.CityId;
                localDoc.DocType = category;
                localDoc.Checked = false;
                localDoc.DocSource = docUrl;

                // A fresh GUID prefix keeps local file names unique.
                string localFileName = string.Format("{0}\\{1}_{2}", this.localDirectory, Guid.NewGuid().ToString(), docUrl.Split('/').LastOrDefault().Split('?').FirstOrDefault());
                localDoc.DocLocalPath = localFileName;

                try
                {
                    c.DownloadFile(docUrl, localFileName);
                }
                catch (Exception ex)
                {
                    // Best effort: report the failure and carry on with the remaining links.
                    Console.WriteLine("Failed to download {0}: {1}", docUrl, ex.Message);
                }

                docs.Add(localDoc);
            }
            else
            {
                Console.WriteLine("{0} already downloaded", docUrl);
            }

            this.ReadText(false, localDoc.DocLocalPath, ref localDoc);

            QueryResult qr = queries.FirstOrDefault(t => t.DocId == localDoc.DocId);
            if (qr == null)
            {
                qr = new QueryResult();
                qr.DocId = localDoc.DocId;
                qr.CityId = this.cityEntity.CityId;
                qr.SearchTime = DateTime.Now;
                qr.MeetingDate = meetingDate;
                queries.Add(qr);
            }

            this.ExtractQueriesFromDoc(localDoc, ref qr);
            Console.WriteLine("{0} docs, {1} queries...", docs.Count, queries.Count);
        }
    }
}
/// <summary>
/// Walks the archive page at <paramref name="url"/>: the "Current" link plus every
/// <c>archivedisplaymonthlink</c> link is opened, each of its result pages is scanned for
/// meeting links, and for every non-cancelled "Planning Commission" or any "city council" entry
/// the agenda page's <c>Table1</c> element is saved locally (unless the URL is already known).
/// The meeting date is read from the agenda text; entries dated before <c>dtStartFrom</c>, and
/// City Council entries dated on or before the first day of the current month, are skipped.
/// For the rest a <c>QueryResult</c> is ensured and <c>ExtractQueriesFromDoc</c> is called.
/// </summary>
/// <param name="url">Address of the archive page.</param>
/// <param name="docs">Known documents; newly saved ones are appended.</param>
/// <param name="queries">Known query results; newly created ones are appended.</param>
private void ExtractAgenda(string url, ref List<Documents> docs, ref List<QueryResult> queries)
{
    Regex dateReg = new Regex("[A-Za-z]+[\\s]{1}[0-9]{1,2},[\\s]+[0-9]{2,4}");
    Regex yearReg = new Regex("[0-9]{4}");
    string category = "Planning Commission";

    HtmlWeb web = new HtmlWeb();
    HtmlDocument archiveDoc = web.Load(url);

    // Collect the links in a plain list ("Current" first) so a missing link or an empty archive
    // cannot cause a null dereference, and the parsed document's node links are left untouched.
    List<HtmlNode> archiveLinks = new List<HtmlNode>();
    HtmlNode currentNode = archiveDoc.DocumentNode.SelectSingleNode("//a[text()='Current']");
    if (currentNode != null)
    {
        archiveLinks.Add(currentNode);
    }

    HtmlNodeCollection archiveMonthList = archiveDoc.DocumentNode.SelectNodes("//a[@class='archivedisplaymonthlink']");
    if (archiveMonthList != null)
    {
        archiveLinks.AddRange(archiveMonthList);
    }

    foreach (HtmlNode archiveNode in archiveLinks)
    {
        // "Current" stands for the present year; other links carry a four-digit year in their text.
        int year;
        if (archiveNode.InnerText == "Current")
        {
            year = DateTime.Now.Year;
        }
        else if (!int.TryParse(yearReg.Match(archiveNode.InnerText).Value, out year))
        {
            Console.WriteLine("No year found in '{0}', skip...", archiveNode.InnerText);
            continue;
        }

        if (year < this.dtStartFrom.Year)
        {
            Console.WriteLine("Too early, skip...");
            continue;
        }

        Console.WriteLine("Working on {0}...", archiveNode.InnerText);
        string monthUrl = archiveNode.Attributes["href"].Value;
        HtmlDocument monthDoc = web.Load(monthUrl);

        // The number of links in the pager cell is used as the page count; when there is no
        // pager, the single page that was just loaded is still processed.
        HtmlNode pageNode = monthDoc.DocumentNode.SelectSingleNode("//a[text()='Current']/parent::td/following-sibling::td");
        HtmlNodeCollection pageLinks = pageNode == null ? null : pageNode.SelectNodes("./a");
        int totalPage = (pageLinks == null || pageLinks.Count == 0) ? 1 : pageLinks.Count;

        for (int page = 1; page <= totalPage; page++)
        {
            if (page > 1)
            {
                Console.WriteLine("Go to page {0}...", page);
                // Always derive the page URL from the unmodified month URL; rewriting monthUrl
                // in place would stack one "nnpg1480" segment per page.
                string pageUrl = monthUrl.Replace("/158/", string.Format("/158/nnpg1480/{0}/", page));
                monthDoc = web.Load(pageUrl);
            }

            HtmlNodeCollection meetingNodes = monthDoc.DocumentNode.SelectNodes("//span[@class='newstitle']/a");
            if (meetingNodes == null)
            {
                continue;
            }

            foreach (HtmlNode meetingNode in meetingNodes)
            {
                string meetingTitle = meetingNode.InnerText;
                string lowerTitle = meetingTitle.ToLower();
                bool goIn = (meetingTitle.Contains("Planning Commission") && !lowerTitle.Contains("cancelled"))
                    || lowerTitle.Contains("city council");
                if (!goIn)
                {
                    continue;
                }

                string meetingAgendaUrl = meetingNode.Attributes["href"].Value;
                Documents localDoc = docs.FirstOrDefault(t => t.DocSource == meetingAgendaUrl);

                if (localDoc == null)
                {
                    HtmlDocument agendaDoc = web.Load(meetingAgendaUrl);
                    HtmlNode agendaContentNode = agendaDoc.GetElementbyId("Table1");
                    if (agendaContentNode == null)
                    {
                        // Without the content table there is nothing to store or search.
                        Console.WriteLine("No agenda content at {0}, skip...", meetingAgendaUrl);
                        continue;
                    }

                    localDoc = new Documents();
                    localDoc.DocType = category;
                    localDoc.DocId = Guid.NewGuid().ToString();
                    localDoc.CityId = this.cityEntity.CityId;
                    localDoc.DocSource = meetingAgendaUrl;

                    // The local file is named after the second-to-last path segment of the URL.
                    string localFile = string.Format("{0}\\{1}.html", this.localDirectory, meetingAgendaUrl.Split('?').FirstOrDefault().Split('/').Reverse().ElementAt(1));
                    localDoc.DocLocalPath = localFile;

                    File.WriteAllText(localFile, agendaContentNode.InnerHtml, Encoding.UTF8);
                    localDoc.DocBodyDic.Add(1, agendaContentNode.InnerText);
                    docs.Add(localDoc);
                }
                else
                {
                    if (localDoc.DocBodyDic.Count == 0)
                    {
                        string html = File.ReadAllText(localDoc.DocLocalPath);
                        HtmlDocument htmlDoc = new HtmlDocument();
                        htmlDoc.LoadHtml(html);
                        localDoc.DocBodyDic.Add(1, htmlDoc.DocumentNode.InnerText);
                    }

                    Console.WriteLine("this file already downloaded..");
                }

                // An agenda without a recognisable date must not abort the whole run.
                DateTime meetingDate;
                if (!DateTime.TryParse(dateReg.Match(localDoc.DocBodyDic[1]).Value, out meetingDate))
                {
                    Console.WriteLine("No meeting date found in {0}, skip...", meetingAgendaUrl);
                    continue;
                }

                // City Council entries dated on or before the first day of the current month are skipped.
                if (meetingTitle.Contains("City Council") && meetingDate <= DateTime.Now.AddDays(1 - DateTime.Now.Day))
                {
                    continue;
                }

                if (meetingDate < this.dtStartFrom)
                {
                    continue;
                }

                QueryResult qr = queries.FirstOrDefault(t => t.DocId == localDoc.DocId);
                if (qr == null)
                {
                    qr = new QueryResult();
                    qr.CityId = localDoc.CityId;
                    qr.DocId = localDoc.DocId;
                    qr.MeetingDate = meetingDate;
                    qr.SearchTime = DateTime.Now;
                    queries.Add(qr);
                }

                this.ExtractQueriesFromDoc(localDoc, ref qr);
                Console.WriteLine("{0} docs saved, {1} queries saved...", docs.Count, queries.Count);
            }
        }
    }
}
/// <summary>
/// Processes every "category*url" entry in <c>docUrls</c>: loads the page, reads the table rows
/// of the first matching content module, and for each row dated on or after <c>dtStartFrom</c>
/// downloads every "fileticket" link (unless its URL is already known), passes it to
/// <c>ReadText</c>, ensures a <c>QueryResult</c> exists and calls <c>ExtractQueriesFromDoc</c>.
/// Results are written with <c>SaveMeetingResultsToSQL</c> after every file.
/// </summary>
public void DownloadCouncilPdfFiles()
{
    HtmlWeb web = new HtmlWeb();
    List<Documents> docs = this.LoadDocumentsDoneSQL();
    List<QueryResult> queries = this.LoadQueriesDoneSQL();

    // WebClient is IDisposable; the using block releases it even when a download throws.
    using (WebClient c = new WebClient())
    {
        foreach (string url in this.docUrls)
        {
            string[] urlParts = url.Split('*');
            string category = urlParts[0];
            string categoryUrl = urlParts[1];

            HtmlDocument categoryDoc = web.Load(categoryUrl);

            // Three module ids are tried in turn; the first one that yields rows wins.
            HtmlNodeCollection listNodes =
                categoryDoc.DocumentNode.SelectNodes("//div[@id='dnn_ctr1872_HtmlModule_lblContent']/table/tbody/tr")
                ?? categoryDoc.DocumentNode.SelectNodes("//div[@id='dnn_ctr1880_HtmlModule_lblContent']/table/tbody/tr")
                ?? categoryDoc.DocumentNode.SelectNodes("//div[@id='dnn_ctr1892_HtmlModule_lblContent']/table/tbody/tr");

            if (listNodes == null)
            {
                continue;
            }

            foreach (HtmlNode listNode in listNodes)
            {
                // Stop at the first row that mentions the year before the start year.
                if (listNode.InnerText.Contains((this.dtStartFrom.Year - 1).ToString()))
                {
                    break;
                }

                HtmlNode dateNode = listNode.SelectSingleNode("./td[2]");
                if (dateNode == null)
                {
                    Console.WriteLine("Row without a date cell, skip...");
                    continue;
                }

                // Corrects a known misspelling in the date text before parsing.
                string dateText = dateNode.InnerText.Replace("Febraury", "February");

                DateTime meetingDate;
                if (!DateTime.TryParse(dateText, out meetingDate))
                {
                    // An unreadable date must not abort the whole run.
                    Console.WriteLine("Cannot read a meeting date from '{0}', skip...", dateText);
                    continue;
                }

                if (meetingDate < this.dtStartFrom)
                {
                    Console.WriteLine("Too early, skip...");
                    continue;
                }

                HtmlNodeCollection fileNodes = listNode.SelectNodes(".//a[contains(@href,'fileticket')]");
                if (fileNodes == null)
                {
                    continue;
                }

                foreach (HtmlNode fileNode in fileNodes)
                {
                    // Attribute values may still be HTML-encoded (encoded ampersands in the
                    // query string); decode them so the URL can be requested as-is.
                    string fileUrl = WebUtility.HtmlDecode(fileNode.Attributes["href"].Value);
                    fileUrl = fileUrl.StartsWith("http") ? fileUrl : this.cityEntity.CityUrl + fileUrl;

                    Documents localdoc = docs.FirstOrDefault(t => t.DocSource == fileUrl);
                    if (localdoc == null)
                    {
                        localdoc = new Documents();
                        localdoc.DocId = Guid.NewGuid().ToString();
                        localdoc.CityId = this.cityEntity.CityId;
                        localdoc.DocType = category;
                        localdoc.DocSource = fileUrl;
                        localdoc.Important = false;
                        localdoc.Checked = false;

                        string localFile = string.Format("{0}\\{1}_{2}_{3}.pdf", this.localDirectory, category, fileNode.InnerText.Trim('\r', '\n', '\t', (char)32, (char)160), Guid.NewGuid().ToString());
                        localdoc.DocLocalPath = localFile;

                        try
                        {
                            // Assigning (rather than Add) keeps a single user-agent value even if
                            // a previous request failed before the header was consumed.
                            c.Headers["user-agent"] = "chrome";
                            c.DownloadFile(fileUrl, localFile);
                        }
                        catch (Exception ex)
                        {
                            // Best effort: report the failure and carry on with the remaining files.
                            Console.WriteLine("Failed to download {0}: {1}", fileUrl, ex.Message);
                        }

                        docs.Add(localdoc);
                    }
                    else
                    {
                        Console.ForegroundColor = ConsoleColor.Yellow;
                        Console.WriteLine("This file already downloaded...");
                        Console.ResetColor();
                    }

                    this.ReadText(false, localdoc.DocLocalPath, ref localdoc);

                    QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);
                    if (qr == null)
                    {
                        qr = new QueryResult();
                        qr.CityId = localdoc.CityId;
                        qr.DocId = localdoc.DocId;
                        qr.SearchTime = DateTime.Now;
                        qr.MeetingDate = meetingDate;
                        queries.Add(qr);
                    }

                    this.ExtractQueriesFromDoc(localdoc, ref qr);
                    Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);
                    this.SaveMeetingResultsToSQL(docs, queries);
                }
            }
        }
    }
}
/// <summary>
/// Ensures the HTML page at <paramref name="url"/> is stored locally and tracked in
/// <paramref name="docs"/>, loads its text into the document body, ensures a matching
/// <c>QueryResult</c> exists, calls <c>ExtractQueriesFromDoc</c> and <c>SaveMeetingResultsToSQL</c>,
/// and finally forwards every "ViewFile" link whose text contains ".pdf" to <c>ExtractPdf</c>.
/// </summary>
/// <param name="web">Not used by this method.</param>
/// <param name="c">Client used to download the page; owned by the caller and not disposed here.</param>
/// <param name="meetingDate">Meeting date stored on newly created query results.</param>
/// <param name="category">Value stored in <c>DocType</c> and used in the local file name.</param>
/// <param name="url">Address of the HTML page.</param>
/// <param name="docs">Known documents; a new one is appended when the URL is not yet tracked.</param>
/// <param name="queries">Known query results; a new one is appended when missing.</param>
public void ExtractHtml(HtmlWeb web, WebClient c, DateTime meetingDate, string category, string url, ref List<Documents> docs, ref List<QueryResult> queries)
{
    Documents localdoc = docs.FirstOrDefault(t => t.DocSource == url);
    if (localdoc == null)
    {
        localdoc = new Documents();
        localdoc.DocId = Guid.NewGuid().ToString();
        localdoc.CityId = this.cityEntity.CityId;
        localdoc.DocSource = url;
        localdoc.DocType = category;
        localdoc.Checked = false;
        localdoc.Important = false;
        localdoc.DocLocalPath = string.Format("{0}\\{1}_Agenda_{2}.html", this.localDirectory, category, Guid.NewGuid().ToString());

        try
        {
            string html = c.DownloadString(url);
            File.WriteAllText(localdoc.DocLocalPath, html);
        }
        catch (Exception ex)
        {
            // Best effort: report the failure; the missing file is handled just below.
            Console.WriteLine("Failed to download {0}: {1}", url, ex.Message);
        }

        docs.Add(localdoc);
    }
    else
    {
        Console.WriteLine("This file already downloaded...");
    }

    // If the download failed there is no file to parse; reading it would throw.
    if (!File.Exists(localdoc.DocLocalPath))
    {
        Console.WriteLine("Local file {0} is missing, skip...", localdoc.DocLocalPath);
        return;
    }

    HtmlDocument htmlDoc = new HtmlDocument();
    htmlDoc.LoadHtml(File.ReadAllText(localdoc.DocLocalPath));
    HtmlNodeCollection pdfFileNodes = htmlDoc.DocumentNode.SelectNodes("//a[contains(@href,'ViewFile')]");

    // The whole page text becomes entry 1 of the document body.
    localdoc.DocBodyDic.Clear();
    localdoc.DocBodyDic.Add(1, htmlDoc.DocumentNode.InnerText);

    QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);
    if (qr == null)
    {
        qr = new QueryResult();
        qr.CityId = localdoc.CityId;
        qr.DocId = localdoc.DocId;
        qr.SearchTime = DateTime.Now;
        qr.MeetingDate = meetingDate;
        queries.Add(qr);
    }

    this.ExtractQueriesFromDoc(localdoc, ref qr);
    Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);
    this.SaveMeetingResultsToSQL(docs, queries);

    if (pdfFileNodes == null)
    {
        return;
    }

    foreach (HtmlNode pdfNode in pdfFileNodes)
    {
        if (!pdfNode.InnerText.ToLower().Contains(".pdf"))
        {
            continue;
        }

        string pdfUrl = this.cityEntity.CityUrl + pdfNode.Attributes["href"].Value;
        this.ExtractPdf(c, meetingDate, pdfUrl, category, pdfNode.InnerText, ref docs, ref queries);
    }
}
/// <summary>
/// Drives a Chrome browser over the calendar page at <c>docUrl</c>. For each of the three
/// categories, every matching list entry and every year from <c>dtStartFrom.Year</c> to the current
/// year is selected in the page, and each calendar row dated on or after <c>dtStartFrom</c> is
/// inspected for "Agenda" and "Minutes" links. Each linked HTML or PDF file is downloaded (unless
/// its URL is already known), its text is loaded, a <c>QueryResult</c> is ensured,
/// <c>ExtractQueriesFromDoc</c> is called and results are written with
/// <c>SaveMeetingResultsToSQL</c>.
/// </summary>
public void DownloadCouncilPdfFiles()
{
    const string userAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36";
    string[] categories = { "City Council", "Planning Commission", "Zoning Board of Appeals" };
    // Link text to look for in a calendar row, and the key prefix used for it.
    string[] linkLabels = { "Agenda", "Minutes" };
    string[] keyPrefixes = { "Agenda_", "Minute_" };

    List<Documents> docs = this.LoadDocumentsDoneSQL();
    List<QueryResult> queries = this.LoadQueriesDoneSQL();

    List<int> years = new List<int>();
    for (int i = this.dtStartFrom.Year; i <= DateTime.Now.Year; i++)
    {
        years.Add(i);
    }

    ChromeDriver cd = new ChromeDriver();
    try
    {
        // WebClient is IDisposable; the using block releases it even when a download throws.
        using (WebClient c = new WebClient())
        {
            cd.Navigate().GoToUrl(this.docUrl);
            System.Threading.Thread.Sleep(10000);

            foreach (string category in categories)
            {
                HtmlDocument doc = new HtmlDocument();
                doc.LoadHtml(cd.PageSource);

                HtmlNodeCollection tagNodes = doc.DocumentNode.SelectNodes(string.Format("//li[starts-with(text(),'{0}')]", category));
                if (tagNodes == null)
                {
                    Console.WriteLine("No list entries found for [{0}], skip...", category);
                    continue;
                }

                List<string> tagsToSearch = tagNodes.Select(t => t.InnerText).ToList();

                foreach (string tag in tagsToSearch)
                {
                    Console.WriteLine("Working on [{0}]...", tag);

                    foreach (int year in years)
                    {
                        // Pick the body in the name drop-down.
                        IWebElement tagLabelEle = cd.FindElementById("ctl00_ContentPlaceHolder1_lstName_Input");
                        tagLabelEle.Click();
                        System.Threading.Thread.Sleep(1000);
                        IWebElement tagEle = cd.FindElementByXPath(string.Format("//li[text()='{0}']", tag));
                        tagEle.Click();
                        System.Threading.Thread.Sleep(2000);

                        // Pick the year in the date drop-down.
                        IWebElement yearLabelEle = cd.FindElementByXPath("//*[contains(text(),'Date: ')]");
                        try
                        {
                            yearLabelEle.Click();
                        }
                        catch (Exception ex)
                        {
                            // Best effort: the click may fail; report it and try the year entry anyway.
                            Console.WriteLine("Could not click the date selector: {0}", ex.Message);
                        }

                        System.Threading.Thread.Sleep(1000);
                        IWebElement yearEle = cd.FindElementByXPath(string.Format("//span[text()='{0}']", year));
                        yearEle.Click();
                        System.Threading.Thread.Sleep(2000);

                        doc.LoadHtml(cd.PageSource);
                        HtmlNodeCollection docCollection = doc.DocumentNode.SelectNodes("//table[@id='ctl00_ContentPlaceHolder1_gridCalendar_ctl00']/tbody/tr");
                        if (docCollection == null || docCollection.Count == 0)
                        {
                            continue;
                        }

                        foreach (HtmlNode docNode in docCollection)
                        {
                            HtmlNode dateCell = docNode.SelectSingleNode("./td");
                            if (dateCell == null)
                            {
                                continue;
                            }

                            string meetingDateText = dateCell.InnerText;
                            if (meetingDateText == "No records to display.")
                            {
                                continue;
                            }

                            DateTime meetingDate;
                            if (!DateTime.TryParse(meetingDateText, out meetingDate))
                            {
                                // An unreadable date must not abort the whole run.
                                Console.WriteLine("Cannot read a meeting date from '{0}', skip...", meetingDateText);
                                continue;
                            }

                            if (meetingDate < this.dtStartFrom)
                            {
                                Console.WriteLine("Too early, skip...");
                                continue;
                            }

                            // Key is "<Agenda_|Minute_><html|pdf|empty>", value is the file URL.
                            Dictionary<string, string> docUrlDic = new Dictionary<string, string>();
                            for (int i = 0; i < linkLabels.Length; i++)
                            {
                                HtmlNode linkNode = docNode.SelectSingleNode(string.Format(".//a[text()='{0}']", linkLabels[i]));
                                if (linkNode == null)
                                {
                                    continue;
                                }

                                string linkUrl = linkNode.Attributes["href"].Value;
                                linkUrl = linkUrl.StartsWith("http") ? linkUrl : "http://roch.legistar.com/" + linkUrl;

                                // The file type is read from the "src" of the node two siblings before the link.
                                string fileType = string.Empty;
                                string tagText = linkNode.PreviousSibling.PreviousSibling.Attributes["src"].Value;
                                if (tagText.Contains("HTML"))
                                {
                                    fileType = "html";
                                }
                                else if (tagText.Contains("PDF"))
                                {
                                    fileType = "pdf";
                                }

                                // Attribute values may still be HTML-encoded (encoded ampersands in
                                // the query string); decode them so the URL can be requested as-is.
                                docUrlDic.Add(keyPrefixes[i] + fileType, WebUtility.HtmlDecode(linkUrl));
                            }

                            foreach (KeyValuePair<string, string> entry in docUrlDic)
                            {
                                string key = entry.Key;
                                string fileUrl = entry.Value;
                                bool isHtml = key.Contains("html");

                                Documents localdoc = docs.FirstOrDefault(t => t.DocSource == fileUrl);
                                if (localdoc == null)
                                {
                                    localdoc = new Documents();
                                    localdoc.DocId = Guid.NewGuid().ToString();
                                    localdoc.CityId = this.cityEntity.CityId;
                                    localdoc.Important = false;
                                    localdoc.Checked = false;
                                    localdoc.DocType = category;
                                    localdoc.DocSource = fileUrl;
                                    localdoc.DocLocalPath = string.Format("{0}\\{1}_{2}.{3}", localDirectory, key, fileUrl.Split('=').LastOrDefault(), isHtml ? "html" : "pdf");

                                    if (isHtml)
                                    {
                                        try
                                        {
                                            // Assigning (rather than Add) keeps a single user-agent value.
                                            c.Headers["user-agent"] = userAgent;
                                            string html = c.DownloadString(fileUrl);
                                            File.WriteAllText(localdoc.DocLocalPath, html);
                                            HtmlDocument agendaDoc = new HtmlDocument();
                                            agendaDoc.LoadHtml(html);
                                            localdoc.DocBodyDic.Add(1, agendaDoc.DocumentNode.InnerText);
                                        }
                                        catch (Exception ex)
                                        {
                                            // Best effort: report the failure and carry on.
                                            Console.WriteLine("Failed to download {0}: {1}", fileUrl, ex.Message);
                                        }
                                    }
                                    else
                                    {
                                        try
                                        {
                                            c.Headers["user-agent"] = userAgent;
                                            c.DownloadFile(fileUrl, localdoc.DocLocalPath);
                                        }
                                        catch (Exception ex)
                                        {
                                            // Best effort: report the failure and carry on.
                                            Console.WriteLine("Failed to download {0}: {1}", fileUrl, ex.Message);
                                        }

                                        this.ReadText(false, localdoc.DocLocalPath, ref localdoc);
                                    }

                                    docs.Add(localdoc);
                                }
                                else
                                {
                                    Console.WriteLine("This file already downloaded....");
                                    if (localdoc.DocLocalPath.ToLower().Contains("pdf"))
                                    {
                                        this.ReadText(false, localdoc.DocLocalPath, ref localdoc);
                                    }
                                    else if (File.Exists(localdoc.DocLocalPath))
                                    {
                                        string html = File.ReadAllText(localdoc.DocLocalPath);
                                        HtmlDocument pageContent = new HtmlDocument();
                                        pageContent.LoadHtml(html);
                                        // Replace any body already loaded; a second Add with the
                                        // same key would throw.
                                        localdoc.DocBodyDic.Clear();
                                        localdoc.DocBodyDic.Add(1, pageContent.DocumentNode.InnerText);
                                    }
                                    else
                                    {
                                        Console.WriteLine("Local file {0} is missing...", localdoc.DocLocalPath);
                                    }
                                }

                                QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);
                                if (qr == null)
                                {
                                    qr = new QueryResult();
                                    qr.CityId = localdoc.CityId;
                                    qr.DocId = localdoc.DocId;
                                    qr.MeetingDate = meetingDate;
                                    qr.SearchTime = DateTime.Now;
                                    queries.Add(qr);
                                }

                                this.ExtractQueriesFromDoc(localdoc, ref qr);
                                Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);
                                this.SaveMeetingResultsToSQL(docs, queries);
                            }
                        }
                    }
                }
            }
        }
    }
    finally
    {
        // Always shut the browser down, even when scraping fails part-way through.
        cd.Quit();
    }
}
/// <summary>
/// Processes every "category*url" entry in <c>docUrls</c>: loads the page, walks the
/// <c>catAgendaRow</c> table rows, and for each row dated on or after <c>dtStartFrom</c> downloads
/// the agenda link (the parent of the row's <c>strong</c> element) and, when present, the first
/// "ViewFile" link whose address contains "minutes". Each file is passed to <c>ReadText</c>, a
/// <c>QueryResult</c> is ensured and <c>ExtractQueriesFromDoc</c> is called. Failures in a row are
/// logged and the row is skipped. Results are written with <c>SaveMeetingResultsToSQL</c> after
/// each page that had rows.
/// </summary>
public void DownloadCouncilPdfFiles()
{
    HtmlWeb web = new HtmlWeb();
    List<Documents> docs = this.LoadDocumentsDoneSQL();
    List<QueryResult> queries = this.LoadQueriesDoneSQL();
    Regex dateReg = new Regex("[A-Za-z]+[\\s]{0,1}[0-9]{1,2},[\\s]{0,1}[0-9]{4}");

    // WebClient is IDisposable; the using block releases it even when a download throws.
    using (WebClient c = new WebClient())
    {
        foreach (string url in this.docUrls)
        {
            string[] urlParts = url.Split('*');
            string category = urlParts[0];
            string categoryUrl = urlParts[1];

            HtmlDocument listDoc = web.Load(categoryUrl);
            HtmlNodeCollection recordNodes = listDoc.DocumentNode.SelectNodes("//table/tbody/tr[@class='catAgendaRow']");
            if (recordNodes == null || recordNodes.Count == 0)
            {
                continue;
            }

            foreach (HtmlNode recordNode in recordNodes)
            {
                try
                {
                    // The date lives in the row's <strong> element; without it the row has
                    // neither a meeting date nor an agenda link to work with.
                    HtmlNode dateNode = recordNode.SelectSingleNode(".//strong");
                    if (dateNode == null)
                    {
                        Console.WriteLine("Row without a date, skip...");
                        Console.WriteLine("DATA: {0}", recordNode.InnerHtml);
                        continue;
                    }

                    DateTime meetingDate;
                    if (!DateTime.TryParse(dateReg.Match(dateNode.InnerText).Value, out meetingDate))
                    {
                        Console.WriteLine("Cannot read a meeting date, skip...");
                        Console.WriteLine("DATA: {0}", recordNode.InnerHtml);
                        continue;
                    }

                    if (meetingDate < this.dtStartFrom)
                    {
                        Console.WriteLine("Too early, skip...");
                        continue;
                    }

                    HtmlNode agendaNode = dateNode.ParentNode;
                    string agendaUrl = agendaNode.Attributes["href"].Value;
                    agendaUrl = agendaUrl.StartsWith("http") ? agendaUrl : this.cityEntity.CityUrl + agendaUrl;

                    // A row may have no "ViewFile" links at all; that only means no minutes.
                    HtmlNodeCollection viewFileLinks = recordNode.SelectNodes(".//a[contains(@href,'ViewFile')]");
                    HtmlNode minuteNode = viewFileLinks == null
                        ? null
                        : viewFileLinks.FirstOrDefault(t => t.Attributes["href"].Value.ToLower().Contains("minutes"));

                    List<string> fileUrls = new List<string>();
                    fileUrls.Add(agendaUrl);
                    if (minuteNode != null)
                    {
                        string minuteUrl = minuteNode.Attributes["href"].Value;
                        if (!string.IsNullOrEmpty(minuteUrl))
                        {
                            minuteUrl = minuteUrl.StartsWith("http") ? minuteUrl : this.cityEntity.CityUrl + minuteUrl;
                            fileUrls.Add(minuteUrl);
                        }
                    }

                    foreach (string fileUrl in fileUrls)
                    {
                        Documents localdoc = docs.FirstOrDefault(t => t.DocSource == fileUrl);
                        string tag = fileUrl.ToLower().Contains("minute") ? "minute" : "agenda";

                        if (localdoc == null)
                        {
                            localdoc = new Documents();
                            localdoc.CityId = this.cityEntity.CityId;
                            localdoc.Checked = false;
                            localdoc.DocId = Guid.NewGuid().ToString();
                            localdoc.DocSource = fileUrl;
                            localdoc.DocType = category;

                            string localFileName = string.Format("{0}\\{1}_{2}_{3}.pdf", this.localDirectory, category, meetingDate.ToString("yyyy-MM-dd"), tag);
                            try
                            {
                                c.DownloadFile(fileUrl, localFileName);
                            }
                            catch (Exception ex)
                            {
                                // Best effort: report the failure and carry on.
                                Console.WriteLine("Failed to download {0}: {1}", fileUrl, ex.Message);
                            }

                            localdoc.DocLocalPath = localFileName;
                            docs.Add(localdoc);
                        }
                        else
                        {
                            Console.ForegroundColor = ConsoleColor.Yellow;
                            Console.WriteLine("File already downloaded....");
                            Console.ResetColor();
                        }

                        this.ReadText(false, localdoc.DocLocalPath, ref localdoc);

                        QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);
                        if (qr == null)
                        {
                            qr = new QueryResult();
                            qr.CityId = this.cityEntity.CityId;
                            qr.DocId = localdoc.DocId;
                            qr.MeetingDate = meetingDate;
                            qr.SearchTime = DateTime.Now;
                            queries.Add(qr);
                        }

                        this.ExtractQueriesFromDoc(localdoc, ref qr);
                        Console.WriteLine("{0} docs saved, {1} queries saved...", docs.Count, queries.Count);
                    }
                }
                catch (Exception ex)
                {
                    // One malformed row must not stop the remaining rows; log it with its markup.
                    Console.WriteLine("DEBUG EXCEPTION:{0}", ex.ToString());
                    Console.WriteLine("DATA: {0}", recordNode.InnerHtml);
                }
            }

            this.SaveMeetingResultsToSQL(docs, queries);
        }
    }
}
/// <summary>
/// Walks every listing page in <c>docUrls</c>, reads the top-aligned table rows inside the
/// <c>system</c> element, and for each row whose first cell parses as a date on or after
/// <c>dtStartFrom</c> downloads every linked file except "youtu" links (unless the URL is already
/// known or the local file already exists), passes it to <c>ReadText</c>, ensures a
/// <c>QueryResult</c> exists and calls <c>ExtractQueriesFromDoc</c>. Results are written with
/// <c>SaveMeetingResultsToSQL</c> once, after all pages.
/// </summary>
public void DownloadCouncilPdfFiles()
{
    List<Documents> docs = this.LoadDocumentsDoneSQL();
    List<QueryResult> queries = this.LoadQueriesDoneSQL();
    HtmlWeb web = new HtmlWeb();

    // WebClient is IDisposable; the using block releases it even when a download throws.
    using (WebClient c = new WebClient())
    {
        foreach (string url in this.docUrls)
        {
            HtmlDocument listDoc = web.Load(url);
            HtmlNodeCollection docNodeList = listDoc.DocumentNode.SelectNodes("//div[@id='system']//table//tr[@valign='top']");
            if (docNodeList == null)
            {
                continue;
            }

            Console.WriteLine("{0} dates...", docNodeList.Count);

            foreach (HtmlNode docNode in docNodeList)
            {
                // Rows without a readable date keep DateTime.MinValue and are skipped below.
                DateTime meetingDate = DateTime.MinValue;
                HtmlNode dateCell = docNode.SelectSingleNode("./td");
                if (dateCell != null)
                {
                    string dateText = dateCell.InnerText.Trim((char)32, (char)160);
                    // TryParse resets meetingDate to DateTime.MinValue when parsing fails.
                    DateTime.TryParse(dateText, out meetingDate);
                }

                if (meetingDate < this.dtStartFrom)
                {
                    Console.WriteLine("Earlier than {0}...", this.dtStartFrom);
                    continue;
                }

                HtmlNodeCollection pdfNodes = docNode.SelectNodes(".//a[@href]");
                if (pdfNodes == null)
                {
                    continue;
                }

                Console.WriteLine("{0} files at {1}...", pdfNodes.Count, meetingDate);

                foreach (HtmlNode pdfNode in pdfNodes)
                {
                    string pdfUrl = pdfNode.Attributes["href"].Value;
                    pdfUrl = pdfUrl.StartsWith("http") ? pdfUrl : "http://www.ci.wayne.mi.us" + pdfUrl;

                    // Links containing "youtu" are not documents.
                    if (pdfUrl.Contains("youtu"))
                    {
                        continue;
                    }

                    Documents localdoc = docs.FirstOrDefault(t => t.DocSource == pdfUrl);
                    if (localdoc == null)
                    {
                        localdoc = new Documents();
                        localdoc.DocId = Guid.NewGuid().ToString();
                        localdoc.DocType = "Council";
                        localdoc.CityId = this.cityEntity.CityId;
                        localdoc.DocSource = pdfUrl;

                        string localPath = string.Format("{0}\\{1}", this.localDirectory, pdfUrl.Split('?').FirstOrDefault().Split('/').LastOrDefault());
                        localdoc.DocLocalPath = localPath;

                        if (!File.Exists(localPath))
                        {
                            try
                            {
                                c.DownloadFile(pdfUrl, localPath);
                            }
                            catch (Exception ex)
                            {
                                // Best effort: report the failure and carry on.
                                Console.WriteLine("Failed to download {0}: {1}", pdfUrl, ex.Message);
                            }
                        }

                        docs.Add(localdoc);
                    }
                    else
                    {
                        Console.ForegroundColor = ConsoleColor.Yellow;
                        Console.WriteLine("This document already downloaded...");
                        Console.ResetColor();
                    }

                    this.ReadText(false, localdoc.DocLocalPath, ref localdoc);

                    QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);
                    if (qr == null)
                    {
                        qr = new QueryResult();
                        qr.DocId = localdoc.DocId;
                        qr.CityId = localdoc.CityId;
                        qr.MeetingDate = meetingDate;
                        qr.SearchTime = DateTime.Now;
                        queries.Add(qr);
                    }

                    this.ExtractQueriesFromDoc(localdoc, ref qr);
                    Console.WriteLine("{0} documents saved...", docs.Count);
                    Console.WriteLine("{0} query results saved...", queries.Count);
                }
            }
        }
    }

    this.SaveMeetingResultsToSQL(docs, queries);
}
/// <summary>
/// Walks every listing page in <c>docUrls</c>, reads the links inside <c>file_name</c> elements,
/// and for each link whose text parses as a date on or after <c>dtStartFrom</c> downloads the file
/// (unless its URL is already known), passes it to <c>ReadText</c>, ensures a <c>QueryResult</c>
/// exists and calls <c>ExtractQueriesFromDoc</c>. The document type is derived from the listing
/// URL. Results are written with <c>SaveMeetingResultsToSQL</c> after each listing page.
/// </summary>
public void DownloadCouncilPdfFiles()
{
    List<Documents> docs = this.LoadDocumentsDoneSQL();
    List<QueryResult> queries = this.LoadQueriesDoneSQL();
    HtmlWeb web = new HtmlWeb();

    // WebClient is IDisposable; the using block releases it even when a download throws.
    using (WebClient c = new WebClient())
    {
        foreach (string url in this.docUrls)
        {
            HtmlDocument listDoc = web.Load(url);
            HtmlNodeCollection docNodeList = listDoc.DocumentNode.SelectNodes("//div[@id='file_name']/a");

            if (docNodeList != null)
            {
                Console.WriteLine("{0} files...", docNodeList.Count);

                foreach (HtmlNode docNode in docNodeList)
                {
                    string linkText = docNode.InnerText;

                    // First try the whole link text (without "Special Meeting"); if that fails, try
                    // the first three space-separated words before the first '.'. When both fail
                    // meetingDate is DateTime.MinValue and the link is skipped below.
                    DateTime meetingDate;
                    if (!DateTime.TryParse(HttpUtility.HtmlDecode(linkText.Replace("Special Meeting", string.Empty)), out meetingDate))
                    {
                        string[] meetingTagArray = linkText.Split('.').FirstOrDefault().Split(' ');
                        DateTime.TryParse(string.Join(" ", meetingTagArray.Take(3)), out meetingDate);
                    }

                    if (meetingDate < this.dtStartFrom)
                    {
                        Console.WriteLine("{0} earlier than {1}. Skip...", meetingDate, dtStartFrom);
                        continue;
                    }

                    string pdfUrl = docNode.Attributes["href"].Value;
                    pdfUrl = pdfUrl.StartsWith("http") ? pdfUrl : "http://romulusgov.com/" + pdfUrl;

                    Documents localdoc = docs.FirstOrDefault(t => t.DocSource == pdfUrl);
                    if (localdoc == null)
                    {
                        // The category follows from which listing page the link came from.
                        string category;
                        if (url.Contains("city_council"))
                        {
                            category = "Council";
                        }
                        else if (url.Contains("planning"))
                        {
                            category = "Planning Commission";
                        }
                        else
                        {
                            category = "Board of Appeal";
                        }

                        localdoc = new Documents();
                        localdoc.DocId = Guid.NewGuid().ToString();
                        localdoc.DocType = category;
                        localdoc.CityId = this.cityEntity.CityId;
                        localdoc.DocSource = pdfUrl;

                        string localPath = string.Format("{0}\\{1}", this.localDirectory, pdfUrl.Split('?').FirstOrDefault().Split('/').LastOrDefault());
                        localPath = HttpUtility.UrlDecode(localPath);
                        localdoc.DocLocalPath = localPath;

                        try
                        {
                            c.DownloadFile(pdfUrl, localPath);
                        }
                        catch (Exception ex)
                        {
                            // Best effort: report the failure and carry on.
                            Console.WriteLine("Failed to download {0}: {1}", pdfUrl, ex.Message);
                        }

                        docs.Add(localdoc);
                    }
                    else
                    {
                        Console.ForegroundColor = ConsoleColor.Yellow;
                        Console.WriteLine("This document already downloaded...");
                        Console.ResetColor();
                    }

                    this.ReadText(false, localdoc.DocLocalPath, ref localdoc);

                    QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);
                    if (qr == null)
                    {
                        qr = new QueryResult();
                        qr.DocId = localdoc.DocId;
                        qr.CityId = localdoc.CityId;
                        qr.MeetingDate = meetingDate;
                        qr.SearchTime = DateTime.Now;
                        queries.Add(qr);
                    }

                    this.ExtractQueriesFromDoc(localdoc, ref qr);
                    Console.WriteLine("{0} documents saved...", docs.Count);
                    Console.WriteLine("{0} query results saved...", queries.Count);
                }
            }

            this.SaveMeetingResultsToSQL(docs, queries);
        }
    }
}
/// <summary>
/// Processes every "category*url" entry in <c>docUrls</c>: downloads the JSON listing, collects
/// every <c>href</c> value in it, and for each one whose address contains a date on or after
/// <c>dtStartFrom</c> downloads the file (unless its URL is already known), passes it to
/// <c>ReadText</c>, ensures a <c>QueryResult</c> exists and calls <c>ExtractQueriesFromDoc</c>.
/// Results are written with <c>SaveMeetingResultsToSQL</c> after every file.
/// </summary>
public void DownloadCouncilPdfFiles()
{
    List<Documents> docs = this.LoadDocumentsDoneSQL();
    List<QueryResult> queries = this.LoadQueriesDoneSQL();
    Regex dateReg = new Regex("[a-zA-Z]+[\\s]{0,2}[0-9]{1,2},[\\s]{0,2}[0-9]{4}");

    // WebClient is IDisposable; the using block releases it even when a download throws.
    using (WebClient c = new WebClient())
    {
        foreach (string url in this.docUrls)
        {
            string[] urlParts = url.Split('*');
            string category = urlParts[0];
            string listUrl = urlParts[1];
            Console.WriteLine("Working on {0}...", listUrl);

            string json = c.DownloadString(listUrl);
            var jsonDoc = JsonConvert.DeserializeObject(json) as JToken;
            if (jsonDoc == null)
            {
                continue;
            }

            var fileUrlsNodes = jsonDoc.SelectTokens("$..href");
            if (fileUrlsNodes == null)
            {
                continue;
            }

            foreach (var fileUrlNode in fileUrlsNodes)
            {
                string fileUrl = "https://eastpointecitymi.documents-on-demand.com" + fileUrlNode.ToString();

                // The meeting date is read from the file address; an address without a
                // recognisable date must not abort the whole run.
                DateTime meetingDate;
                if (!DateTime.TryParse(dateReg.Match(fileUrl).Value, out meetingDate))
                {
                    Console.WriteLine("No meeting date found in {0}, skip...", fileUrl);
                    continue;
                }

                if (meetingDate < this.dtStartFrom)
                {
                    Console.WriteLine("Too early, skip...");
                    continue;
                }

                Documents localdoc = docs.FirstOrDefault(t => t.DocSource == fileUrl);
                if (localdoc == null)
                {
                    localdoc = new Documents();
                    localdoc.DocId = Guid.NewGuid().ToString();
                    localdoc.CityId = this.cityEntity.CityId;
                    localdoc.DocSource = fileUrl;
                    localdoc.Important = false;
                    localdoc.Checked = false;
                    localdoc.DocType = category;

                    string localPath = string.Format("{0}\\{1}", this.localDirectory, fileUrl.Split('/').LastOrDefault());
                    localdoc.DocLocalPath = localPath;

                    try
                    {
                        c.DownloadFile(fileUrl, localPath);
                    }
                    catch (Exception ex)
                    {
                        // Best effort: report the failure and carry on.
                        Console.WriteLine("Failed to download {0}: {1}", fileUrl, ex.Message);
                    }

                    docs.Add(localdoc);
                }
                else
                {
                    Console.WriteLine("This file already downloaded....");
                }

                this.ReadText(false, localdoc.DocLocalPath, ref localdoc);

                QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);
                if (qr == null)
                {
                    qr = new QueryResult();
                    qr.DocId = localdoc.DocId;
                    qr.CityId = localdoc.CityId;
                    qr.MeetingDate = meetingDate;
                    qr.SearchTime = DateTime.Now;
                    queries.Add(qr);
                }

                this.ExtractQueriesFromDoc(localdoc, ref qr);
                Console.WriteLine("{0} docs saved, {1} queries saved...", docs.Count, queries.Count);
                this.SaveMeetingResultsToSQL(docs, queries);
            }
        }
    }
}