/// <summary>
/// Drives a Chrome browser through the agenda search page at <paramref name="url"/>: selects the
/// "cus" range option, searches from <c>dtStartFrom</c> to today, then walks the result pages.
/// Every ".ashx" link in a result row is downloaded (unless already known), handed to
/// <c>ReadText</c> and <c>ExtractQueriesFromDoc</c>, and the running results are saved.
/// </summary>
/// <param name="url">Search page address; also used (with '#' trimmed) as the base for relative document links.</param>
/// <param name="docs">Known documents; newly found documents are appended.</param>
/// <param name="queries">Known query results; one is appended for each document that has none yet.</param>
public void ExractCouncil(string url, ref List<Documents> docs, ref List<QueryResult> queries)
{
    ChromeDriver cd = new ChromeDriver();
    try
    {
        using (WebClient c = new WebClient())
        {
            cd.Navigate().GoToUrl(url);
            System.Threading.Thread.Sleep(3000);

            IWebElement rangeEle = cd.FindElementByXPath("//select[@class='agendasearch-input']");
            SelectElement rangeSelectEle = new SelectElement(rangeEle);
            rangeSelectEle.SelectByValue("cus");
            System.Threading.Thread.Sleep(3000);

            IWebElement dateStartEle = cd.FindElementById("ctl00_ContentPlaceHolder1_SearchAgendasMeetings_radCalendarFrom_dateInput");
            IWebElement dateEndEle = cd.FindElementById("ctl00_ContentPlaceHolder1_SearchAgendasMeetings_radCalendarTo_dateInput");
            dateStartEle.Clear();
            dateStartEle.SendKeys(this.dtStartFrom.ToString("M/d/yyyy"));
            dateEndEle.Clear();
            dateEndEle.SendKeys(DateTime.Now.ToString("M/d/yyyy"));
            System.Threading.Thread.Sleep(2000);

            IWebElement searchBtnEle = cd.FindElementById("ctl00_ContentPlaceHolder1_SearchAgendasMeetings_imageButtonSearch");
            searchBtnEle.Click();
            System.Threading.Thread.Sleep(2000);

            while (true)
            {
                try
                {
                    HtmlDocument doc = new HtmlDocument();
                    doc.LoadHtml(cd.PageSource);
                    HtmlNodeCollection rowList = doc.DocumentNode.SelectNodes("//table[@id='ctl00_ContentPlaceHolder1_SearchAgendasMeetings_radGridMeetings_ctl00']/tbody/tr[contains(@class,'Row')]");

                    if (rowList != null)
                    {
                        foreach (HtmlNode rowNode in rowList)
                        {
                            // A row whose first cell is not a date is skipped instead of
                            // aborting the whole crawl through the outer catch.
                            HtmlNode meetingDateNode = rowNode.SelectSingleNode("./td");
                            DateTime meetingDate;
                            if (meetingDateNode == null ||
                                !DateTime.TryParseExact(
                                    meetingDateNode.InnerText.Trim(),
                                    "MM/dd/yy",
                                    System.Globalization.CultureInfo.InvariantCulture,
                                    System.Globalization.DateTimeStyles.None,
                                    out meetingDate))
                            {
                                Console.WriteLine("Unable to read meeting date, skip...");
                                continue;
                            }

                            if (meetingDate < this.dtStartFrom)
                            {
                                Console.WriteLine("Too early, skip...");
                                continue;
                            }

                            HtmlNodeCollection docNodeList = rowNode.SelectNodes(".//a[contains(@href,'ashx')]");
                            if (docNodeList == null)
                            {
                                continue;
                            }

                            foreach (HtmlNode docNode in docNodeList)
                            {
                                string docUrl = docNode.Attributes["href"].Value;
                                docUrl = docUrl.StartsWith("http") ? docUrl : url.Trim('#') + docUrl;

                                Documents localdoc = docs.FirstOrDefault(t => t.DocSource == docUrl);
                                if (localdoc == null)
                                {
                                    string tag = docUrl.Contains("Minute") ? "Minute" : "Agenda";
                                    localdoc = new Documents();
                                    localdoc.DocId = Guid.NewGuid().ToString();
                                    localdoc.CityId = this.cityEntity.CityId;
                                    localdoc.Checked = false;
                                    localdoc.Important = false;
                                    localdoc.DocType = "City Council";
                                    localdoc.DocSource = docUrl;
                                    string localFile = string.Format("{0}\\Council_{1}_{2}.pdf", this.localDirectory, tag, meetingDate.ToString("yyyy-MM-dd"));
                                    localdoc.DocLocalPath = localFile;

                                    try
                                    {
                                        c.DownloadFile(docUrl, localFile);
                                    }
                                    catch (Exception downloadEx)
                                    {
                                        // Still best-effort, but no longer silent.
                                        Console.WriteLine("Download failed for {0}: {1}", docUrl, downloadEx.Message);
                                    }

                                    docs.Add(localdoc);
                                }
                                else
                                {
                                    Console.WriteLine("This file already downloaded...");
                                }

                                this.ReadText(false, localdoc.DocLocalPath, ref localdoc);

                                QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);
                                if (qr == null)
                                {
                                    qr = new QueryResult();
                                    qr.CityId = localdoc.CityId;
                                    qr.DocId = localdoc.DocId;
                                    qr.MeetingDate = meetingDate;
                                    qr.SearchTime = DateTime.Now;
                                    queries.Add(qr);
                                }

                                this.ExtractQueriesFromDoc(localdoc, ref qr);
                                Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);
                                this.SaveMeetingResultsToSQL(docs, queries);
                            }
                        }
                    }

                    // A missing "Next Page" link throws, which is how the loop ends.
                    IWebElement nextPageBtnEle = cd.FindElementByXPath("//a[@title='Next Page']");
                    nextPageBtnEle.Click();
                    System.Threading.Thread.Sleep(3000);
                }
                catch (Exception ex)
                {
                    Console.ForegroundColor = ConsoleColor.Red;
                    Console.WriteLine("Reach last page...");
                    // Show the cause so a real failure is not mistaken for normal termination.
                    Console.WriteLine("DEBUG EXCEPTION:{0}", ex.Message);
                    Console.ResetColor();
                    break;
                }
            }
        }
    }
    finally
    {
        // Close the browser even when driving the search form fails.
        cd.Quit();
    }
}
/// <summary>
/// For every "category*url" entry in <c>docUrls</c>, loads the page, collects the ".pdf" links
/// inside table cells, works out a meeting date for each link, downloads files not yet known,
/// and passes each document to <c>ReadText</c> and <c>ExtractQueriesFromDoc</c> before saving.
/// </summary>
/// <remarks>
/// The meeting date is taken, in order of preference, from a yyyy-MM-dd run in the URL, an
/// MMddyy / MM-dd-yy run in the URL, or the link text combined with the last two digits of the
/// first four-digit run in the file name. Links with no usable date are skipped.
/// </remarks>
public void DownloadCouncilPdfFiles()
{
    List<Documents> docs = this.LoadDocumentsDoneSQL();
    List<QueryResult> queries = this.LoadQueriesDoneSQL();
    HtmlWeb web = new HtmlWeb();
    Regex dateReg = new Regex("(0|1)[0-9]{1}[0-9]{2}[0-9]{2}|(0|1)[0-9]{1}-[0-9]{2}-[0-9]{2}");
    Regex dateReg1 = new Regex("[0-9]{4}-[0-9]{2}-[0-9]{2}");
    Regex digitReg = new Regex("[0-9]{4}");

    using (WebClient c = new WebClient())
    {
        foreach (string url in this.docUrls)
        {
            string[] urlParts = url.Split('*');
            string category = urlParts[0];
            string categoryUrl = urlParts[1];

            HtmlDocument doc = web.Load(categoryUrl);
            HtmlNodeCollection docNodes = doc.DocumentNode.SelectNodes("//td//a[contains(@href,'.pdf')]");
            if (docNodes == null)
            {
                continue;
            }

            foreach (HtmlNode docNode in docNodes)
            {
                Console.WriteLine("DEBUG:{0}...", categoryUrl);
                string fileUrl = this.cityEntity.CityUrl + docNode.Attributes["href"].Value;
                string meetingDateText;
                DateTime meetingDate = DateTime.MinValue;
                bool dateParsed = false;

                if (dateReg1.IsMatch(fileUrl))
                {
                    meetingDateText = dateReg1.Match(fileUrl).ToString();
                    Console.WriteLine("DEBUG: meeting date 1: {0}", meetingDateText);
                    dateParsed = DateTime.TryParse(meetingDateText, out meetingDate);
                }
                else if (dateReg.IsMatch(fileUrl))
                {
                    meetingDateText = dateReg.Match(fileUrl).ToString();
                    meetingDateText = meetingDateText.Replace("-", string.Empty);
                    Console.WriteLine("DEBUG: meeting date 2: {0}", meetingDateText);
                    dateParsed = DateTime.TryParseExact(
                        meetingDateText,
                        "MMddyy",
                        null,
                        System.Globalization.DateTimeStyles.None,
                        out meetingDate);
                }
                else
                {
                    // Without a four-digit run there is nothing to take the year from;
                    // previously this fell through to Substring and threw.
                    Match yearMatch = digitReg.Match(fileUrl.Split('/').LastOrDefault());
                    if (yearMatch.Success)
                    {
                        string year = yearMatch.Value;
                        year = year.Substring(year.Length - 2, 2);
                        year = (2000 + int.Parse(year)).ToString();
                        meetingDateText = string.Format("{0}, {1}", docNode.InnerText, year);
                        Console.WriteLine("DEBUG: meeting date 3: {0}", meetingDateText);
                        dateParsed = DateTime.TryParse(meetingDateText, out meetingDate);
                    }
                }

                if (!dateParsed)
                {
                    Console.WriteLine("Unable to determine meeting date for {0}, skip...", fileUrl);
                    continue;
                }

                if (meetingDate < this.dtStartFrom)
                {
                    Console.WriteLine("Too early, skip...");
                    continue;
                }

                Documents localdoc = docs.FirstOrDefault(t => t.DocSource == fileUrl);
                if (localdoc == null)
                {
                    localdoc = new Documents();
                    localdoc.DocId = Guid.NewGuid().ToString();
                    localdoc.CityId = this.cityEntity.CityId;
                    localdoc.DocType = category;
                    localdoc.DocSource = fileUrl;
                    localdoc.Important = false;
                    localdoc.Checked = false;
                    localdoc.DocLocalPath = string.Format("{0}\\{1}", this.localDirectory, fileUrl.Split('/').LastOrDefault());

                    try
                    {
                        c.DownloadFile(fileUrl, localdoc.DocLocalPath);
                    }
                    catch (Exception ex)
                    {
                        // Still best-effort, but no longer silent.
                        Console.WriteLine("Download failed for {0}: {1}", fileUrl, ex.Message);
                    }

                    docs.Add(localdoc);
                }
                else
                {
                    Console.WriteLine("This file already downloaded...");
                }

                this.ReadText(false, localdoc.DocLocalPath, ref localdoc);

                QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);
                if (qr == null)
                {
                    qr = new QueryResult();
                    qr.SearchTime = DateTime.Now;
                    qr.MeetingDate = meetingDate;
                    qr.DocId = localdoc.DocId;
                    qr.CityId = localdoc.CityId;
                    queries.Add(qr);
                }

                this.ExtractQueriesFromDoc(localdoc, ref qr);
                Console.WriteLine("{0} docs saved, {1} queries added...", docs.Count, queries.Count);
                this.SaveMeetingResultsToSQL(docs, queries);
            }
        }
    }
}
/// <summary>
/// For every "category*url" entry in <c>docUrls</c>, opens the page in Chrome, follows the link
/// for each year from <c>dtStartFrom.Year</c> to the current year, and pages through that
/// year's posts ("?page=N"). The text of each post not yet known is saved to an .html file;
/// every post is then passed to <c>ExtractQueriesFromDoc</c> and results are saved per page.
/// </summary>
public void DownloadCouncilPdfFiles()
{
    var docs = this.LoadDocumentsDoneSQL();
    var queries = this.LoadQueriesDoneSQL();
    Regex dateReg = new Regex("[a-zA-Z]+[\\s]{0,1}[0-9]{1,2}[\\s]{0,1}[0-9]{4}");
    ChromeDriver cd = new ChromeDriver();

    try
    {
        foreach (string docUrl in this.docUrls)
        {
            string[] urlParts = docUrl.Split('*');
            string category = urlParts[0];
            string categoryUrl = urlParts[1];

            cd.Navigate().GoToUrl(categoryUrl);
            System.Threading.Thread.Sleep(2000);

            HtmlDocument yearListDoc = new HtmlDocument();
            yearListDoc.LoadHtml(cd.PageSource);
            HtmlNodeCollection yearNodeList = yearListDoc.DocumentNode.SelectNodes("//div[@class='media']//a[@class='page-list__item']");
            if (yearNodeList == null)
            {
                // SelectNodes returned null here (no year links matched).
                Console.WriteLine("No year links found on {0}...", categoryUrl);
                continue;
            }

            for (int i = this.dtStartFrom.Year; i <= DateTime.Now.Year; i++)
            {
                Console.WriteLine("Working on year {0}...", i);
                string yearText = i.ToString();
                var yearNode = yearNodeList.FirstOrDefault(t => t.InnerText.Contains(yearText));
                if (yearNode == null)
                {
                    continue;
                }

                string yearUrl = this.cityEntity.CityUrl + yearNode.Attributes["href"].Value;
                cd.Navigate().GoToUrl(yearUrl);
                System.Threading.Thread.Sleep(2000);

                var fileList = cd.FindElementsByXPath("//a[@class='blog__post']");
                Console.WriteLine("{0} nodes found...", fileList.Count);
                int page = 1;

                do
                {
                    // A pager item that is missing, or whose class is not "next", marks the last page.
                    bool breakNow = false;
                    var nextButtons = cd.FindElementsByXPath("//a[@class='su_next']/parent::li");
                    if (nextButtons.Count == 0 || nextButtons[0].GetAttribute("class") != "next")
                    {
                        Console.WriteLine("Reach last page...");
                        breakNow = true;
                    }

                    bool nothingUsable = fileList.All(t =>
                    {
                        string href = t.GetAttribute("href");
                        string text = t.Text;
                        return string.IsNullOrEmpty(href) || string.IsNullOrEmpty(text);
                    });
                    if (nothingUsable)
                    {
                        Console.WriteLine("No doc...");
                        break;
                    }

                    // Capture href and text up front: navigating to a post invalidates the elements.
                    var fileUrlDic = new Dictionary<string, string>();
                    foreach (var ele in fileList)
                    {
                        string href = ele.GetAttribute("href");
                        if (!string.IsNullOrEmpty(href) && !fileUrlDic.ContainsKey(href))
                        {
                            fileUrlDic.Add(href, ele.Text);
                        }
                    }

                    foreach (KeyValuePair<string, string> pair in fileUrlDic)
                    {
                        string url = pair.Key;
                        string meetingDateText = dateReg.Match(pair.Value).ToString();
                        Console.WriteLine("DEBUG: {0} - {1}", meetingDateText, pair.Value);

                        DateTime meetingDate;
                        if (string.IsNullOrEmpty(meetingDateText) || !DateTime.TryParse(meetingDateText, out meetingDate))
                        {
                            Console.WriteLine("Unable to determine meeting date for {0}, skip...", url);
                            continue;
                        }

                        if (meetingDate < this.dtStartFrom)
                        {
                            Console.WriteLine("Too early, skip...");
                            continue;
                        }

                        var localdoc = docs.FirstOrDefault(t => t.DocSource == url);
                        if (localdoc == null)
                        {
                            localdoc = new Documents();
                            localdoc.DocId = Guid.NewGuid().ToString();
                            localdoc.CityId = this.cityEntity.CityId;
                            localdoc.Checked = false;
                            localdoc.Important = false;
                            localdoc.DocType = category;
                            localdoc.DocSource = url;
                            string localPath = string.Format("{0}\\{1}_{2}_{3}.html", this.localDirectory, category, meetingDate.ToString("yyyy-MM-dd"), localdoc.DocId);
                            localdoc.DocLocalPath = localPath;
                            localdoc.Readable = true;
                            docs.Add(localdoc);

                            cd.Navigate().GoToUrl(url);
                            System.Threading.Thread.Sleep(1000);
                            var targetEle = cd.FindElementByXPath("//div[@class='su_bootstrap_safe su-content-wrapper']");
                            string js = "document.documentElement.scrollTop=" + targetEle.Location.Y;
                            ((IJavaScriptExecutor)cd).ExecuteScript(js);
                            System.Threading.Thread.Sleep(1000);

                            string meetingText = targetEle.Text;
                            localdoc.DocBodyDic.Add(1, meetingText);
                            File.WriteAllText(localdoc.DocLocalPath, meetingText);
                        }
                        else
                        {
                            Console.WriteLine("This file already downloaded....");
                            // The same post can appear on more than one page; only load its body once.
                            if (localdoc.DocBodyDic.Count == 0)
                            {
                                localdoc.DocBodyDic.Add(1, File.ReadAllText(localdoc.DocLocalPath));
                            }
                        }

                        var qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);
                        if (qr == null)
                        {
                            qr = new QueryResult();
                            qr.CityId = this.cityEntity.CityId;
                            qr.DocId = localdoc.DocId;
                            qr.SearchTime = DateTime.Now;
                            qr.MeetingDate = meetingDate;
                            qr.QueryId = Guid.NewGuid().ToString();
                            queries.Add(qr);
                        }

                        this.ExtractQueriesFromDoc(localdoc, ref qr);
                        Console.WriteLine("{0} docs saved, {1} queries save...", docs.Count, queries.Count);
                    }

                    this.SaveMeetingResultsToSQL(docs, queries);
                    page++;

                    if (breakNow)
                    {
                        break;
                    }

                    Console.WriteLine("Go to page {0}...", page);
                    string newPage = yearUrl + "?page=" + page;
                    cd.Navigate().GoToUrl(newPage);
                    System.Threading.Thread.Sleep(2000);
                    fileList = cd.FindElementsByXPath("//a[@class='blog__post']");
                }
                while (fileList != null && fileList.Count > 0);
            }
        }
    }
    finally
    {
        // Close the browser even when a page could not be processed.
        cd.Quit();
    }
}
/// <summary>
/// Queries the first entry of <c>docUrls</c> once per calendar year (From=1/1, To=12/31) from
/// <c>dtStartFrom.Year</c> to the current year. For each meeting row in a recognised category,
/// downloads the "Agenda", "Minutes" and "Agenda Packet" files not yet known, then passes each
/// document to <c>ReadText</c> and <c>ExtractQueriesFromDoc</c> and saves the results.
/// </summary>
public void DownloadCouncilPdfFiles()
{
    List<Documents> docs = this.LoadDocumentsDoneSQL();
    List<QueryResult> queries = this.LoadQueriesDoneSQL();
    HtmlWeb web = new HtmlWeb();
    string[] linkTexts = new string[] { "Agenda", "Minutes", "Agenda Packet" };

    List<string> rangeUrls = new List<string>();
    for (int i = dtStartFrom.Year; i <= DateTime.Now.Year; i++)
    {
        DateTime rangeStart = new DateTime(i, 1, 1);
        DateTime rangeEnd = new DateTime(i, 12, 31);
        rangeUrls.Add(string.Format("{0}?From={1}&To={2}", this.docUrls[0], rangeStart.ToString("M/d/yyyy"), rangeEnd.ToString("M/d/yyyy")));
    }

    using (WebClient c = new WebClient())
    {
        foreach (string rangeUrl in rangeUrls)
        {
            HtmlDocument doc = web.Load(rangeUrl);
            HtmlNodeCollection recordNodes = doc.DocumentNode.SelectNodes("//div[contains(@class,'Row MeetingRow')]");
            if (recordNodes == null)
            {
                continue;
            }

            foreach (HtmlNode recordNode in recordNodes)
            {
                // Keep only the document links that actually carry an href.
                List<HtmlNode> docNodes = new List<HtmlNode>();
                foreach (string linkText in linkTexts)
                {
                    HtmlNode linkNode = recordNode.SelectSingleNode(".//a[text()='" + linkText + "']");
                    HtmlAttribute hrefAttr = linkNode == null ? null : linkNode.Attributes["href"];
                    if (hrefAttr != null && !string.IsNullOrEmpty(hrefAttr.Value))
                    {
                        docNodes.Add(linkNode);
                    }
                }

                if (docNodes.Count == 0)
                {
                    Console.WriteLine("No files found....");
                    continue;
                }

                HtmlNode meetingCategoryNode = recordNode.SelectSingleNode(".//div[@class='MainScreenText RowDetails']");
                string category = meetingCategoryNode == null ? string.Empty : meetingCategoryNode.InnerText;
                if (category.Contains("City Commission"))
                {
                    category = "City Council";
                }
                else if (category.Contains("Board of Zoning Appeals"))
                {
                    category = "Zoning Board of Appeals";
                }
                else if (category.Contains("Planning Commission"))
                {
                    category = "Planning Commission";
                }
                else
                {
                    continue;
                }

                HtmlNode dateNode = recordNode.SelectSingleNode(".//div[@class='RowLink']/a");
                DateTime meetingDate;
                if (dateNode == null || !DateTime.TryParse(dateNode.InnerText, out meetingDate))
                {
                    Console.WriteLine("Unable to determine meeting date, skip...");
                    continue;
                }

                if (meetingDate < this.dtStartFrom)
                {
                    Console.WriteLine("Too early, skip...");
                    continue;
                }

                foreach (HtmlNode docNode in docNodes)
                {
                    // The original Replace("&", "&") was a no-op; decode HTML entities in the
                    // href so the query-string separators are real ampersands.
                    string href = WebUtility.HtmlDecode(docNode.Attributes["href"].Value);
                    string docUrl = "http://grandrapidscitymi.iqm2.com/Citizens/" + href;
                    docUrl = docUrl.Replace("FileView", "FileOpen");

                    Documents localdoc = docs.FirstOrDefault(t => t.DocSource == docUrl);
                    if (localdoc == null)
                    {
                        localdoc = new Documents();
                        localdoc.DocId = Guid.NewGuid().ToString();
                        localdoc.CityId = this.cityEntity.CityId;
                        localdoc.DocSource = docUrl;
                        localdoc.Important = false;
                        localdoc.Checked = false;
                        localdoc.DocType = category;
                        localdoc.DocLocalPath = string.Format("{0}\\{1}_{2}_{3}.pdf", this.localDirectory, category, docNode.InnerText, Guid.NewGuid().ToString());

                        try
                        {
                            // Assign rather than Add so repeated downloads never stack the header value.
                            c.Headers["user-agent"] = "Chrome";
                            c.DownloadFile(docUrl, localdoc.DocLocalPath);
                        }
                        catch (Exception ex)
                        {
                            // Still best-effort, but no longer silent.
                            Console.WriteLine("Download failed for {0}: {1}", docUrl, ex.Message);
                        }

                        docs.Add(localdoc);
                    }
                    else
                    {
                        Console.WriteLine("This file already downloaded...");
                    }

                    this.ReadText(false, localdoc.DocLocalPath, ref localdoc);

                    QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);
                    if (qr == null)
                    {
                        qr = new QueryResult();
                        qr.CityId = this.cityEntity.CityId;
                        qr.DocId = localdoc.DocId;
                        qr.SearchTime = DateTime.Now;
                        qr.MeetingDate = meetingDate;
                        queries.Add(qr);
                    }

                    this.ExtractQueriesFromDoc(localdoc, ref qr);
                    Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);
                    this.SaveMeetingResultsToSQL(docs, queries);
                }
            }
        }
    }
}
/// <summary>
/// Walks the classed rows of the "tablewithheadingresponsive" table in <paramref name="doc"/>.
/// For each row with an HTML agenda link, downloads the agenda page (or reloads the saved copy),
/// stores the text of its "interiorcontenttext" element as page 1 of the document, runs
/// <c>ExtractQueriesFromDoc</c>, and hands the agenda page to <c>ExtractMoreAgenda</c>.
/// Results are saved once at the end, including after a failure.
/// </summary>
/// <param name="doc">Listing page to scan.</param>
/// <param name="url">Address of the listing page; only used in the error log.</param>
/// <param name="category">Value stored as <c>DocType</c> on new documents.</param>
/// <param name="docs">Known documents; newly downloaded agendas are appended.</param>
/// <param name="queries">Known query results; one is appended for each document that has none yet.</param>
private void ExtractAgendas(HtmlDocument doc, string url, string category, ref List<Documents> docs, ref List<QueryResult> queries)
{
    try
    {
        HtmlNodeCollection agendasNodes = doc.DocumentNode.SelectNodes("//table[@class='tablewithheadingresponsive']/tbody/tr[@class]");
        if (agendasNodes != null)
        {
            using (WebClient c = new WebClient())
            {
                foreach (HtmlNode agendaNode in agendasNodes)
                {
                    HtmlNode meetingDateNode = agendaNode.SelectSingleNode("./td");
                    if (meetingDateNode == null)
                    {
                        continue;
                    }

                    // The cell reads "<date> - ..."; trim ordinary (32) and non-breaking (160) spaces.
                    string meetingDateText = meetingDateNode.InnerText.Split('-').FirstOrDefault().Trim((char)32, (char)160);
                    DateTime meetingDate = string.IsNullOrEmpty(meetingDateText) ? DateTime.MinValue : DateTime.Parse(meetingDateText);

                    HtmlNode agendaDocNode = agendaNode.SelectSingleNode(".//div[@class='html']/a");
                    HtmlAttribute hrefAttr = agendaDocNode == null ? null : agendaDocNode.Attributes["href"];
                    string fileUrl = hrefAttr == null ? string.Empty : hrefAttr.Value;
                    if (string.IsNullOrEmpty(fileUrl))
                    {
                        continue;
                    }

                    fileUrl = fileUrl.ToLower().StartsWith("http") ? fileUrl : this.cityEntity.CityUrl + fileUrl;
                    Documents localDoc = docs.FirstOrDefault(t => t.DocSource == fileUrl);
                    HtmlDocument agendaHtmlDoc = new HtmlDocument();

                    if (localDoc == null)
                    {
                        localDoc = new Documents();
                        localDoc.CityId = this.cityEntity.CityId;
                        localDoc.Checked = false;
                        localDoc.Important = false;
                        localDoc.DocType = category;
                        localDoc.DocId = Guid.NewGuid().ToString();
                        localDoc.Readable = true;
                        // DocSource and DocLocalPath were never assigned before, so a saved
                        // agenda could not be matched by URL or reloaded from disk.
                        localDoc.DocSource = fileUrl;

                        string html = c.DownloadString(fileUrl);
                        agendaHtmlDoc.LoadHtml(html);

                        string localFile = string.Format("{0}\\{1}_{2}", this.localDirectory, Guid.NewGuid().ToString(), fileUrl.Split('?').FirstOrDefault().Split('/').LastOrDefault());
                        File.WriteAllText(localFile, html, Encoding.UTF8);
                        localDoc.DocLocalPath = localFile;

                        HtmlNode agendaBodyNode = agendaHtmlDoc.GetElementbyId("interiorcontenttext");
                        // Fall back to the whole page text when the content element is missing.
                        string agendaBodyText = agendaBodyNode == null ? agendaHtmlDoc.DocumentNode.InnerText : agendaBodyNode.InnerText;
                        localDoc.DocBodyDic.Add(1, agendaBodyText);
                        docs.Add(localDoc);
                    }
                    else
                    {
                        string html = File.ReadAllText(localDoc.DocLocalPath);
                        agendaHtmlDoc.LoadHtml(html);

                        if (localDoc.DocBodyDic.Count == 0)
                        {
                            // Use the agenda body, not the text of the link that pointed to it.
                            HtmlNode agendaBodyNode = agendaHtmlDoc.GetElementbyId("interiorcontenttext");
                            string agendaBodyText = agendaBodyNode == null ? agendaHtmlDoc.DocumentNode.InnerText : agendaBodyNode.InnerText;
                            localDoc.DocBodyDic.Add(1, agendaBodyText);
                        }

                        Console.ForegroundColor = ConsoleColor.Yellow;
                        Console.WriteLine("This document already downloaded...");
                        Console.ResetColor();
                    }

                    QueryResult qr = queries.FirstOrDefault(t => t.DocId == localDoc.DocId);
                    if (qr == null)
                    {
                        qr = new QueryResult();
                        qr.DocId = localDoc.DocId;
                        qr.CityId = localDoc.CityId;
                        qr.SearchTime = DateTime.Now;
                        qr.MeetingDate = meetingDate;
                        queries.Add(qr);
                    }

                    this.ExtractQueriesFromDoc(localDoc, ref qr);
                    Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);
                    this.ExtractMoreAgenda(agendaHtmlDoc, category, meetingDate, ref docs, ref queries);
                }
            }
        }
    }
    catch (Exception ex)
    {
        Console.ForegroundColor = ConsoleColor.Red;
        Console.WriteLine("DEBUG EXCEPTION:{0}", ex.ToString());
        Console.WriteLine("CURRENT URL: {0}", url);
        Console.ResetColor();
    }

    this.SaveMeetingResultsToSQL(docs, queries);
}
/// <summary>
/// Downloads the HTML agenda at <paramref name="url"/> (unless it is already known), stores the
/// page text as page 1 of the document, runs <c>ExtractQueriesFromDoc</c>, saves the results,
/// and then passes every "ViewFile" link whose text contains ".pdf" to <c>ExtractPdf</c>.
/// </summary>
/// <param name="web">Not used by this method; kept so existing callers compile unchanged.</param>
/// <param name="c">Client used to download the page and handed on to <c>ExtractPdf</c>.</param>
/// <param name="meetingDate">Meeting date recorded on new query results.</param>
/// <param name="category">Value stored as <c>DocType</c> and used in the local file name.</param>
/// <param name="url">Address of the agenda page.</param>
/// <param name="docs">Known documents; a newly downloaded agenda is appended.</param>
/// <param name="queries">Known query results; one is appended if the document has none yet.</param>
public void ExtractHtml(HtmlWeb web, WebClient c, DateTime meetingDate, string category, string url, ref List<Documents> docs, ref List<QueryResult> queries)
{
    Documents localdoc = docs.FirstOrDefault(t => t.DocSource == url);
    if (localdoc == null)
    {
        localdoc = new Documents();
        localdoc.DocId = Guid.NewGuid().ToString();
        localdoc.CityId = this.cityEntity.CityId;
        localdoc.DocSource = url;
        localdoc.DocType = category;
        localdoc.Checked = false;
        localdoc.Important = false;
        localdoc.DocLocalPath = string.Format("{0}\\{1}_Agenda_{2}.html", this.localDirectory, category, Guid.NewGuid().ToString());

        try
        {
            string html = c.DownloadString(url);
            File.WriteAllText(localdoc.DocLocalPath, html);
        }
        catch (Exception ex)
        {
            // Do not register a document that has no file; leaving it out of docs lets a
            // later run try again instead of failing on the missing file below.
            Console.WriteLine("Download failed for {0}: {1}", url, ex.Message);
            return;
        }

        docs.Add(localdoc);
    }
    else
    {
        Console.WriteLine("This file already downloaded...");
    }

    if (!File.Exists(localdoc.DocLocalPath))
    {
        Console.WriteLine("Local file {0} is missing, skip...", localdoc.DocLocalPath);
        return;
    }

    HtmlDocument htmlDoc = new HtmlDocument();
    htmlDoc.LoadHtml(File.ReadAllText(localdoc.DocLocalPath));
    HtmlNodeCollection pdfFileNodes = htmlDoc.DocumentNode.SelectNodes("//a[contains(@href,'ViewFile')]");

    localdoc.DocBodyDic.Clear();
    localdoc.DocBodyDic.Add(1, htmlDoc.DocumentNode.InnerText);

    QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);
    if (qr == null)
    {
        qr = new QueryResult();
        qr.CityId = localdoc.CityId;
        qr.DocId = localdoc.DocId;
        qr.SearchTime = DateTime.Now;
        qr.MeetingDate = meetingDate;
        queries.Add(qr);
    }

    this.ExtractQueriesFromDoc(localdoc, ref qr);
    Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);
    this.SaveMeetingResultsToSQL(docs, queries);

    if (pdfFileNodes == null)
    {
        return;
    }

    foreach (HtmlNode pdfNode in pdfFileNodes)
    {
        // Only links whose visible text names a .pdf file are followed.
        if (!pdfNode.InnerText.ToLower().Contains(".pdf"))
        {
            continue;
        }

        string pdfUrl = this.cityEntity.CityUrl + pdfNode.Attributes["href"].Value;
        this.ExtractPdf(c, meetingDate, pdfUrl, category, pdfNode.InnerText, ref docs, ref queries);
    }
}
/// <summary>
/// Loads the HTML agenda at <paramref name="docUrl"/> (unless it is already known), saves it
/// locally, stores the page text as page 1 of the document, runs <c>ExtractQueriesFromDoc</c>,
/// saves the results, and passes every "/ViewFile/" link whose URL contains "pdf" to
/// <c>ExtractADoc</c>.
/// </summary>
/// <param name="web">Loader used to fetch the agenda page.</param>
/// <param name="docUrl">Address of the agenda page.</param>
/// <param name="category">Value stored as <c>DocType</c> and used in the local file name.</param>
/// <param name="meetingDate">Meeting date used in the file name and on new query results.</param>
/// <param name="docs">Known documents; a newly downloaded agenda is appended.</param>
/// <param name="queries">Known query results; one is appended if the document has none yet.</param>
private void ExtractAgendas(HtmlWeb web, string docUrl, string category, DateTime meetingDate, ref List<Documents> docs, ref List<QueryResult> queries)
{
    Documents localdoc = docs.FirstOrDefault(t => t.DocSource == docUrl);
    HtmlDocument doc = new HtmlDocument();

    if (localdoc == null)
    {
        localdoc = new Documents();
        localdoc.DocId = Guid.NewGuid().ToString();
        localdoc.CityId = this.cityEntity.CityId;
        localdoc.DocSource = docUrl;
        localdoc.DocType = category;
        localdoc.Important = false;
        localdoc.Checked = false;
        localdoc.DocLocalPath = string.Format("{0}\\{1}_{2}_{3}.html", this.localDirectory, category, meetingDate.ToString("yyyy-MM-dd"), Guid.NewGuid().ToString());

        try
        {
            doc = web.Load(docUrl);
            doc.Save(localdoc.DocLocalPath);
        }
        catch (Exception ex)
        {
            // Do not register a document that has no file; leaving it out of docs lets a
            // later run try again instead of failing on the missing file below.
            Console.WriteLine("Download failed for {0}: {1}", docUrl, ex.Message);
            return;
        }

        docs.Add(localdoc);
    }
    else
    {
        Console.WriteLine("This document already downloaded...");
    }

    if (!File.Exists(localdoc.DocLocalPath))
    {
        Console.WriteLine("Local file {0} is missing, skip...", localdoc.DocLocalPath);
        return;
    }

    doc.LoadHtml(File.ReadAllText(localdoc.DocLocalPath));

    // Clear first so a document processed twice does not throw on the duplicate page key.
    localdoc.DocBodyDic.Clear();
    localdoc.DocBodyDic.Add(1, doc.DocumentNode.InnerText);

    HtmlNodeCollection fileNodes = doc.DocumentNode.SelectNodes("//a[contains(@href,'/ViewFile/')]");

    QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);
    if (qr == null)
    {
        qr = new QueryResult();
        qr.DocId = localdoc.DocId;
        qr.CityId = this.cityEntity.CityId;
        qr.MeetingDate = meetingDate;
        qr.SearchTime = DateTime.Now;
        queries.Add(qr);
    }

    this.ExtractQueriesFromDoc(localdoc, ref qr);
    Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);
    this.SaveMeetingResultsToSQL(docs, queries);

    if (fileNodes == null)
    {
        return;
    }

    using (WebClient c = new WebClient())
    {
        foreach (HtmlNode fileNode in fileNodes)
        {
            string fileUrl = this.cityEntity.CityUrl + fileNode.Attributes["href"].Value;
            if (fileUrl.Contains("pdf"))
            {
                this.ExtractADoc(c, fileUrl, category, "pdf", meetingDate, ref docs, ref queries);
            }
        }
    }
}
/// <summary>
/// For every "category*url" entry in <c>docUrls</c>, pages through the post listing
/// ("&amp;paged=N" from page 2 on) until the 404 marker appears or a page has no entries.
/// The "entry-content" block of each post dated on or after <c>dtStartFrom</c> is saved as an
/// .html file if not yet known, then passed to <c>ExtractQueriesFromDoc</c>. Results are saved
/// after each page.
/// </summary>
public void DownloadCouncilPdfFiles()
{
    var docs = this.LoadDocumentsDoneSQL();
    var queries = this.LoadQueriesDoneSQL();
    HtmlWeb web = new HtmlWeb();

    foreach (string url in this.docUrls)
    {
        string[] urlParts = url.Split('*');
        string category = urlParts[0];
        string categoryUrl = urlParts[1];

        for (int i = 1; ; i++)
        {
            Console.WriteLine("Working on page {0}...", i);
            string categoryPagedUrl = i == 1 ? categoryUrl : string.Format("{0}&paged={1}", categoryUrl, i);
            HtmlDocument listDoc = web.Load(categoryPagedUrl);

            HtmlNode notFoundNode = listDoc.DocumentNode.SelectSingleNode("//section[@class='error-404 not-found']");
            if (notFoundNode != null)
            {
                break;
            }

            HtmlNodeCollection entryNodes = listDoc.DocumentNode.SelectNodes("//article[contains(@id,'post')]");
            if (entryNodes == null)
            {
                // A page with neither entries nor the 404 marker used to make this loop run forever.
                Console.WriteLine("No entries on page {0}, stop paging...", i);
                break;
            }

            foreach (HtmlNode entryNode in entryNodes)
            {
                // Check for null before reading the date; the original read InnerText first.
                HtmlNode dateNode = entryNode.SelectSingleNode(".//time[contains(@class,'entry-date published')]");
                if (dateNode == null)
                {
                    continue;
                }

                DateTime meetingDate;
                if (!DateTime.TryParse(dateNode.InnerText, out meetingDate))
                {
                    Console.WriteLine("Unable to determine meeting date, skip...");
                    continue;
                }

                if (meetingDate < this.dtStartFrom)
                {
                    Console.WriteLine("Early! Skip....");
                    continue;
                }

                // The element enclosing the date is expected to carry the post link.
                HtmlNode meetingUrlNode = dateNode.ParentNode;
                HtmlAttribute hrefAttr = meetingUrlNode == null ? null : meetingUrlNode.Attributes["href"];
                string meetingUrl = hrefAttr == null ? string.Empty : hrefAttr.Value;
                if (string.IsNullOrEmpty(meetingUrl))
                {
                    continue;
                }

                HtmlNode contentNode = entryNode.SelectSingleNode(".//div[@class='entry-content']");
                if (contentNode == null)
                {
                    Console.WriteLine("No content found for {0}, skip...", meetingUrl);
                    continue;
                }

                Documents localdoc = docs.FirstOrDefault(t => t.DocSource == meetingUrl);
                if (localdoc == null)
                {
                    localdoc = new Documents();
                    localdoc.DocId = Guid.NewGuid().ToString();
                    localdoc.CityId = this.cityEntity.CityId;
                    localdoc.DocType = category;
                    localdoc.Checked = false;
                    localdoc.Important = false;
                    localdoc.Readable = true;
                    localdoc.DocSource = meetingUrl;
                    localdoc.DocLocalPath = string.Format("{0}\\{1}_{2}.html", this.localDirectory, category, Guid.NewGuid().ToString());
                    File.WriteAllText(localdoc.DocLocalPath, contentNode.OuterHtml);
                    docs.Add(localdoc);
                }
                else
                {
                    Console.WriteLine("This file already downloaded...");
                }

                // Clear first so a post seen twice does not throw on the duplicate page key.
                localdoc.DocBodyDic.Clear();
                localdoc.DocBodyDic.Add(1, contentNode.InnerText);

                QueryResult qr = queries.FirstOrDefault(q => q.DocId == localdoc.DocId);
                if (qr == null)
                {
                    qr = new QueryResult();
                    // CityId was left unset here; the other crawlers in this file all set it.
                    qr.CityId = localdoc.CityId;
                    qr.MeetingDate = meetingDate;
                    qr.SearchTime = DateTime.Now;
                    qr.QueryId = Guid.NewGuid().ToString();
                    qr.DocId = localdoc.DocId;
                    queries.Add(qr);
                }

                this.ExtractQueriesFromDoc(localdoc, ref qr);
                Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);
            }

            this.SaveMeetingResultsToSQL(docs, queries);
        }
    }
}
/// <summary>
/// Scans every page of <paramref name="doc"/> for the terms in <c>searchterms</c> and
/// <c>searchTermsDependency</c> and records, per term and page, a context snippet (up to 11
/// words either side) around each word that starts with the term.
/// </summary>
/// <remarks>
/// Dependency terms are only recorded when the snippet also contains one of the related words
/// listed in <c>termsDepends</c>. Entries that end up with no snippets are removed from
/// <paramref name="qr"/>. Matching is case-insensitive via <c>ToLower</c> and works on words
/// produced by splitting the page text on single spaces.
/// </remarks>
/// <param name="doc">Document whose <c>DocBodyDic</c> maps page number to page text.</param>
/// <param name="qr">Query result whose <c>Entries</c> are created, updated or removed.</param>
protected void ExtractQueriesFromDoc(Documents doc, ref QueryResult qr)
{
    List<string> termsDepends = "marijuana;marihuana;cannabis;Dispensary;Dispensaries;provisioning;Cultivat".Split(';').ToList();

    foreach (int page in doc.DocBodyDic.Keys)
    {
        string text = doc.DocBodyDic[page] ?? string.Empty;
        string loweredText = text.ToLower();
        string[] bodyWords = text.Split(' ');

        foreach (string searchterm in searchterms)
        {
            QueryResult.KeywordEntry entry = qr.Entries.FirstOrDefault(t => t.Keyword == searchterm && t.PageNumber == page);
            if (entry == null)
            {
                entry = new QueryResult.KeywordEntry();
                entry.Keyword = searchterm;
                entry.PageNumber = page;
                entry.GuidDic = new Dictionary<string, string>();
                entry.CommentDic = new Dictionary<string, string>();
                qr.Entries.Add(entry);
            }

            if (loweredText.Contains(searchterm.ToLower()))
            {
                // Use the matching positions directly. The previous IndexOf scan started at
                // index 1 and so could not locate a match in the very first word.
                List<int> hitIndexes = FindWordIndexes(bodyWords, searchterm.ToLower());
                List<string> entryLines = entry.Contents;

                foreach (int hitIndex in hitIndexes)
                {
                    List<string> words = CollectContextWords(bodyWords, hitIndex);
                    string line = string.Join(" ", words.Select(t => t.Replace("\r", string.Empty).Replace("\n", " ")));

                    if (words.Count < 20)
                    {
                        Console.WriteLine("Please check!");
                    }

                    if (!entryLines.Exists(t => t.Contains(line)))
                    {
                        entryLines.Add(line);
                    }
                }

                Console.ForegroundColor = ConsoleColor.Green;
                Console.WriteLine("search term {0} appears {1} times in document {2}...", searchterm, hitIndexes.Count, doc.DocLocalPath);
                Console.ResetColor();

                entry.Contents = entryLines;
                RegisterContents(entry);
            }

            if (entry.Contents.Count == 0)
            {
                qr.Entries.Remove(entry);
            }
        }

        foreach (string searchTermD in searchTermsDependency)
        {
            QueryResult.KeywordEntry entry = qr.Entries.FirstOrDefault(t => t.Keyword == searchTermD && t.PageNumber == page);
            if (entry == null)
            {
                entry = new QueryResult.KeywordEntry();
                entry.Keyword = searchTermD;
                entry.PageNumber = page;
                qr.Entries.Add(entry);
            }

            Console.WriteLine("Search {0}...", searchTermD);

            if (loweredText.Contains(searchTermD.ToLower()))
            {
                Console.ForegroundColor = ConsoleColor.Yellow;
                Console.WriteLine("Dependency search term {0} found...", searchTermD);
                Console.ResetColor();

                List<int> hitIndexes = FindWordIndexes(bodyWords, searchTermD.ToLower());
                if (hitIndexes.Count != 0)
                {
                    List<string> entryLines = entry.Contents;

                    foreach (int hitIndex in hitIndexes)
                    {
                        List<string> words = CollectContextWords(bodyWords, hitIndex);
                        string line = string.Join(" ", words.Select(t => t.Replace("\r", string.Empty).Replace("\n", " ")));

                        // Previously "Exists(t => t.Contains(line) == false)", which is false
                        // for an empty list, so no dependency snippet was ever recorded.
                        bool hasRelatedWord = termsDepends.Exists(t => line.ToLower().Contains(t.ToLower()));
                        if (hasRelatedWord && !entryLines.Exists(t => t.Contains(line)))
                        {
                            entryLines.Add(line);
                        }
                    }

                    Console.ForegroundColor = ConsoleColor.Green;
                    Console.WriteLine("Search term {0} appears {1} times...", searchTermD, entryLines.Count);
                    Console.ResetColor();

                    entry.Contents = entryLines;
                    RegisterContents(entry);
                }
            }

            if (entry.Contents.Count == 0)
            {
                qr.Entries.Remove(entry);
            }
        }
    }
}

/// <summary>Returns the positions of all words that start with <paramref name="loweredTerm"/> (compared after lower-casing the word).</summary>
private static List<int> FindWordIndexes(string[] bodyWords, string loweredTerm)
{
    List<int> indexes = new List<int>();
    for (int i = 0; i < bodyWords.Length; i++)
    {
        if (bodyWords[i].ToLower().StartsWith(loweredTerm))
        {
            indexes.Add(i);
        }
    }

    return indexes;
}

/// <summary>Returns the words from 11 before to 11 after <paramref name="index"/>, clamped to the array bounds.</summary>
private static List<string> CollectContextWords(string[] bodyWords, int index)
{
    int rangeStart = Math.Max(0, index - 11);
    int rangeEnd = Math.Min(bodyWords.Length - 1, index + 11);

    List<string> words = new List<string>();
    for (int i = rangeStart; i <= rangeEnd; i++)
    {
        words.Add(bodyWords[i]);
    }

    return words;
}

/// <summary>Makes sure every snippet of <paramref name="entry"/> has a GUID and an (initially empty) comment.</summary>
private static void RegisterContents(QueryResult.KeywordEntry entry)
{
    // Entries created in the dependency loop do not get their dictionaries assigned there.
    if (entry.GuidDic == null)
    {
        entry.GuidDic = new Dictionary<string, string>();
    }

    if (entry.CommentDic == null)
    {
        entry.CommentDic = new Dictionary<string, string>();
    }

    foreach (string content in entry.Contents)
    {
        if (!entry.GuidDic.ContainsKey(content))
        {
            entry.GuidDic.Add(content, Guid.NewGuid().ToString());
        }

        if (!entry.CommentDic.ContainsKey(content))
        {
            entry.CommentDic.Add(content, string.Empty);
        }
    }
}
/// <summary>
/// For every "category*url" entry in <c>docUrls</c>, reads the rows of the "table2" table.
/// Each row's date comes from its first &lt;strong&gt; element; the element enclosing it is
/// used as the agenda link and an optional link containing "Minute" as the minutes link.
/// Files not yet known are downloaded, then each document is passed to <c>ReadText</c> and
/// <c>ExtractQueriesFromDoc</c> and the results are saved.
/// </summary>
public void DownloadCouncilPdfFiles()
{
    List<Documents> docs = this.LoadDocumentsDoneSQL();
    List<QueryResult> queries = this.LoadQueriesDoneSQL();
    HtmlWeb web = new HtmlWeb();
    Regex dateReg = new Regex("[a-zA-Z]+[\\s]{0,1}[0-9]{1,2},[\\s]{0,1}[0-9]{4}");

    using (WebClient c = new WebClient())
    {
        foreach (string url in this.docUrls)
        {
            string[] urlParts = url.Split('*');
            string category = urlParts[0];
            string listUrl = urlParts[1];

            HtmlDocument doc = web.Load(listUrl);
            HtmlNodeCollection docNodeList = doc.DocumentNode.SelectNodes("//table[@id='table2']/tbody/tr");
            if (docNodeList == null)
            {
                continue;
            }

            foreach (HtmlNode docNode in docNodeList)
            {
                // Rows without a <strong> date (headers, spacers) are skipped instead of crashing.
                HtmlNode dateNode = docNode.SelectSingleNode(".//strong");
                if (dateNode == null)
                {
                    continue;
                }

                string meetingDateText = dateReg.Match(dateNode.InnerText).ToString();
                DateTime meetingDate;
                if (string.IsNullOrEmpty(meetingDateText) || !DateTime.TryParse(meetingDateText, out meetingDate))
                {
                    Console.WriteLine("Unable to determine meeting date, skip...");
                    continue;
                }

                if (meetingDate < this.dtStartFrom)
                {
                    Console.WriteLine("Too early, skip...");
                    continue;
                }

                // Maps document URL to a tag used in the local file name.
                Dictionary<string, string> urlDic = new Dictionary<string, string>();

                HtmlNode agendaNode = dateNode.ParentNode;
                HtmlAttribute agendaHref = agendaNode == null ? null : agendaNode.Attributes["href"];
                if (agendaHref != null)
                {
                    urlDic.Add(this.cityEntity.CityUrl + agendaHref.Value, "agenda");
                }

                HtmlNode minuteNode = docNode.SelectSingleNode(".//a[contains(@href,'Minute')]");
                if (minuteNode != null)
                {
                    string minuteUrl = this.cityEntity.CityUrl + minuteNode.Attributes["href"].Value;
                    // The agenda link itself may match 'Minute'; do not add the same URL twice.
                    if (!urlDic.ContainsKey(minuteUrl))
                    {
                        urlDic.Add(minuteUrl, "minute");
                    }
                }

                foreach (KeyValuePair<string, string> pair in urlDic)
                {
                    string key = pair.Key;
                    Documents localdoc = docs.FirstOrDefault(t => t.DocSource == key);
                    if (localdoc == null)
                    {
                        localdoc = new Documents();
                        localdoc.DocId = Guid.NewGuid().ToString();
                        localdoc.CityId = this.cityEntity.CityId;
                        localdoc.Checked = false;
                        localdoc.Important = false;
                        localdoc.DocType = category;
                        localdoc.DocSource = key;
                        localdoc.DocLocalPath = string.Format("{0}\\{1}_{2}.pdf", this.localDirectory, pair.Value, Guid.NewGuid().ToString());

                        try
                        {
                            c.DownloadFile(key, localdoc.DocLocalPath);
                        }
                        catch (Exception ex)
                        {
                            // Still best-effort, but no longer silent.
                            Console.WriteLine("Download failed for {0}: {1}", key, ex.Message);
                        }

                        docs.Add(localdoc);
                    }
                    else
                    {
                        Console.WriteLine("This file already downloaded...");
                    }

                    this.ReadText(false, localdoc.DocLocalPath, ref localdoc);

                    QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);
                    if (qr == null)
                    {
                        qr = new QueryResult();
                        qr.CityId = localdoc.CityId;
                        qr.DocId = localdoc.DocId;
                        qr.SearchTime = DateTime.Now;
                        qr.MeetingDate = meetingDate;
                        queries.Add(qr);
                    }

                    this.ExtractQueriesFromDoc(localdoc, ref qr);
                    Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);
                    this.SaveMeetingResultsToSQL(docs, queries);
                }
            }
        }
    }
}
/// <summary>
/// For every "category*url" entry in <c>docUrls</c>, reads the non-header rows of the
/// "tablewithheadingresponsive" table, takes the meeting date (M/d/yyyy) from the row text,
/// and downloads each link found under a "pdf" div that is not yet known. Each document is
/// then passed to <c>ReadText</c> and <c>ExtractQueriesFromDoc</c> and the results are saved.
/// </summary>
public void DownloadCouncilPdfFiles()
{
    Regex dateReg = new Regex("[0-9]{1,2}\\/[0-9]{1,2}\\/[0-9]{4}");
    List<Documents> docs = this.LoadDocumentsDoneSQL();
    List<QueryResult> queries = this.LoadQueriesDoneSQL();
    HtmlWeb web = new HtmlWeb();

    using (WebClient c = new WebClient())
    {
        foreach (string url in this.docUrls)
        {
            string[] urlParts = url.Split('*');
            string category = urlParts[0];
            string categoryUrl = urlParts[1];

            HtmlDocument categoryDoc = web.Load(categoryUrl);
            HtmlNodeCollection docNodeList = categoryDoc.DocumentNode.SelectNodes("//table[@class='tablewithheadingresponsive']/tbody/tr");
            if (docNodeList == null)
            {
                continue;
            }

            foreach (HtmlNode lineNode in docNodeList)
            {
                // Header rows contain <th> cells and carry no meeting.
                if (lineNode.SelectSingleNode("./th") != null)
                {
                    continue;
                }

                // A row without a recognisable date is skipped instead of crashing DateTime.Parse.
                string meetingDateText = dateReg.Match(lineNode.InnerText).ToString();
                DateTime meetingDate;
                if (string.IsNullOrEmpty(meetingDateText) || !DateTime.TryParse(meetingDateText, out meetingDate))
                {
                    Console.WriteLine("Unable to determine meeting date, skip...");
                    continue;
                }

                if (meetingDate < this.dtStartFrom)
                {
                    Console.WriteLine("Too early, skip...");
                    continue;
                }

                HtmlNodeCollection docNodes = lineNode.SelectNodes(".//div[@class='pdf']/a");
                if (docNodes == null)
                {
                    continue;
                }

                foreach (HtmlNode docNode in docNodes)
                {
                    HtmlAttribute hrefAttr = docNode.Attributes["href"];
                    if (hrefAttr == null || string.IsNullOrEmpty(hrefAttr.Value))
                    {
                        continue;
                    }

                    string docLink = hrefAttr.Value;
                    docLink = docLink.StartsWith("http") ? docLink : this.cityEntity.CityUrl + docLink;

                    Documents localDoc = docs.FirstOrDefault(t => t.DocSource == docLink);
                    if (localDoc == null)
                    {
                        localDoc = new Documents();
                        localDoc.DocSource = docLink;
                        localDoc.DocId = Guid.NewGuid().ToString();
                        localDoc.CityId = this.cityEntity.CityId;
                        localDoc.Important = false;
                        localDoc.Checked = false;
                        localDoc.DocType = category;

                        string tag = docNode.InnerText == "Agenda" ? "agenda" : "minute";
                        string localFile = string.Format("{0}\\{1}_{2}_{3}.pdf", this.localDirectory, category, tag, meetingDate.ToString("yyyyMMdd"));
                        localDoc.DocLocalPath = localFile;

                        try
                        {
                            c.DownloadFile(docLink, localFile);
                        }
                        catch (Exception ex)
                        {
                            // Still best-effort, but no longer silent.
                            Console.WriteLine("Download failed for {0}: {1}", docLink, ex.Message);
                        }

                        docs.Add(localDoc);
                    }
                    else
                    {
                        Console.ForegroundColor = ConsoleColor.Yellow;
                        Console.WriteLine("This file already downloaded...");
                        Console.ResetColor();
                    }

                    this.ReadText(false, localDoc.DocLocalPath, ref localDoc);

                    QueryResult qr = queries.FirstOrDefault(t => t.DocId == localDoc.DocId);
                    if (qr == null)
                    {
                        qr = new QueryResult();
                        qr.DocId = localDoc.DocId;
                        qr.CityId = localDoc.CityId;
                        qr.MeetingDate = meetingDate;
                        qr.SearchTime = DateTime.Now;
                        queries.Add(qr);
                    }

                    this.ExtractQueriesFromDoc(localDoc, ref qr);
                    Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);
                    this.SaveMeetingResultsToSQL(docs, queries);
                }
            }
        }
    }
}
/// <summary>
/// For each configured category, follows the per-year links from the start year to the
/// current year, downloads every attachment found and extracts query results from it.
/// </summary>
/// <remarks>
/// The meeting date is read from the link text (e.g. "January 5, 2020") or, failing that,
/// from the document URL (e.g. "January_5_2020"). Links without a parsable date are skipped.
/// Year-page hrefs are HTML-decoded before loading so that entity-encoded ampersands in the
/// query string are sent as plain ampersands.
/// </remarks>
public void DownloadCouncilPdfFiles()
{
    List<Documents> docs = this.LoadDocumentsDoneSQL();
    List<QueryResult> queries = this.LoadQueriesDoneSQL();
    HtmlWeb web = new HtmlWeb();
    Regex dateReg = new Regex("[a-zA-Z]+[\\s]+[0-9]{1,2},[\\s]{0,2}[0-9]{4}");
    Regex dateReg1 = new Regex("[a-zA-Z]+_[0-9]{1,2}_[0-9]{4}");

    using (WebClient c = new WebClient())
    {
        foreach (string url in this.docUrls)
        {
            string[] urlParts = url.Split('*');
            string category = urlParts[0];
            string yearUrl = urlParts[1];
            HtmlDocument yearDoc = web.Load(yearUrl);
            HtmlNodeCollection yearNodes = yearDoc.DocumentNode.SelectNodes("//div[@class='box sectionIntro']//div[@class='body']//a");

            if (yearNodes == null)
            {
                continue;
            }

            for (int year = this.dtStartFrom.Year; year <= DateTime.Now.Year; year++)
            {
                string yearText = year.ToString();
                List<HtmlNode> currentYearNodes = yearNodes.Where(t => t.InnerText.Contains(yearText)).ToList();

                foreach (HtmlNode currentYearNode in currentYearNodes)
                {
                    string currentYearUrl = currentYearNode.Attributes["href"].Value;
                    // The raw attribute value is HTML-encoded; decode it before requesting the page.
                    HtmlDocument currentYearDoc = web.Load(WebUtility.HtmlDecode(currentYearUrl));
                    HtmlNodeCollection docNodes = currentYearDoc.DocumentNode.SelectNodes("//div[@class='attachment']/a");

                    if (docNodes == null)
                    {
                        continue;
                    }

                    foreach (HtmlNode docNode in docNodes)
                    {
                        string docUrl = this.cityEntity.CityUrl + docNode.Attributes["href"].Value;
                        string meetingDateText = dateReg.Match(docNode.InnerText).ToString();

                        if (string.IsNullOrEmpty(meetingDateText))
                        {
                            meetingDateText = dateReg1.Match(docUrl).ToString();
                        }

                        Console.WriteLine("Url {0}\r\nUrl 1 {1}\r\nUrl 2 {2}\r\nDateTime {3}", yearUrl, currentYearUrl, docUrl, meetingDateText);

                        if (string.IsNullOrEmpty(meetingDateText))
                        {
                            continue;
                        }

                        // Text matched by the first pattern never contains '_', so the
                        // replacement only affects dates taken from the URL.
                        DateTime meetingDate;
                        if (!DateTime.TryParse(meetingDateText.Replace("_", " "), out meetingDate))
                        {
                            Console.WriteLine("Cannot parse meeting date '{0}', skip...", meetingDateText);
                            continue;
                        }

                        if (meetingDate < this.dtStartFrom)
                        {
                            Console.WriteLine("Too early, skip...");
                            continue;
                        }

                        Documents localdoc = docs.FirstOrDefault(t => t.DocSource == docUrl);

                        if (localdoc == null)
                        {
                            localdoc = new Documents();
                            localdoc.DocId = Guid.NewGuid().ToString();
                            localdoc.CityId = this.cityEntity.CityId;
                            localdoc.DocSource = docUrl;
                            localdoc.DocType = category;
                            localdoc.Checked = false;
                            localdoc.Important = false;
                            localdoc.DocLocalPath = string.Format("{0}\\{1}", this.localDirectory, docUrl.Split('/').LastOrDefault());

                            try
                            {
                                c.DownloadFile(docUrl, localdoc.DocLocalPath);
                            }
                            catch (Exception ex)
                            {
                                // Best effort: the document is still recorded, but the failure is reported.
                                Console.WriteLine("Failed to download {0}: {1}", docUrl, ex.Message);
                            }

                            docs.Add(localdoc);
                        }
                        else
                        {
                            Console.WriteLine("This file already downloaded...");
                        }

                        this.ReadText(false, localdoc.DocLocalPath, ref localdoc);
                        QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);

                        if (qr == null)
                        {
                            qr = new QueryResult();
                            qr.CityId = localdoc.CityId;
                            qr.DocId = localdoc.DocId;
                            qr.SearchTime = DateTime.Now;
                            qr.MeetingDate = meetingDate;
                            queries.Add(qr);
                        }

                        this.ExtractQueriesFromDoc(localdoc, ref qr);
                        Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);
                        this.SaveMeetingResultsToSQL(docs, queries);
                    }
                }
            }
        }
    }
}
/// <summary>
/// For every year from the start year to the current year, loads each configured listing
/// page, downloads the files linked from its agenda rows and extracts query results.
/// </summary>
/// <remarks>
/// The URL part of each <c>docUrls</c> entry is a format string that receives the year.
/// Rows without a date element or without a parsable date are skipped rather than aborting
/// the run. Links whose URL contains "previous" are ignored. Results are saved once per row.
/// </remarks>
public void DownloadCouncilPdfFiles()
{
    Regex dateReg = new Regex("[A-Za-z]+[\\s]{0,1}[0-9]{1,2},[\\s]{0,1}[0-9]{4}");
    HtmlWeb web = new HtmlWeb();
    List<Documents> docs = this.LoadDocumentsDoneSQL();
    List<QueryResult> queries = this.LoadQueriesDoneSQL();

    using (WebClient c = new WebClient())
    {
        for (int i = this.dtStartFrom.Year; i <= DateTime.Now.Year; i++)
        {
            foreach (string docUrl in docUrls)
            {
                string[] urlParts = docUrl.Split('*');
                string category = urlParts[0];
                string apiUrl = string.Format(urlParts[1], i);
                HtmlDocument listDoc = web.Load(apiUrl);
                HtmlNodeCollection recordNodes = listDoc.DocumentNode.SelectNodes("//table/tbody/tr[@class='catAgendaRow']");

                if (recordNodes == null || recordNodes.Count == 0)
                {
                    continue;
                }

                foreach (HtmlNode recordNode in recordNodes)
                {
                    HtmlNode dateNode = recordNode.SelectSingleNode(".//strong");
                    string dateText = dateNode == null ? string.Empty : dateReg.Match(dateNode.InnerText).ToString();

                    DateTime meetingDate;
                    if (!DateTime.TryParse(dateText, out meetingDate))
                    {
                        Console.WriteLine("No valid meeting date found in row, skip...");
                        continue;
                    }

                    if (meetingDate < this.dtStartFrom)
                    {
                        Console.WriteLine("Too early, skip...");
                        continue;
                    }

                    HtmlNodeCollection docUrlsNodes = recordNode.SelectNodes(".//a[contains(@href,'ViewFile')]");

                    if (docUrlsNodes == null)
                    {
                        continue;
                    }

                    List<string> docFileUrls = docUrlsNodes
                        .Select(t => this.cityEntity.CityUrl + t.Attributes["href"].Value)
                        .Where(t => t.ToLower().Contains("previous") == false)
                        .Distinct()
                        .ToList();

                    foreach (string fileUrl in docFileUrls)
                    {
                        Documents localdoc = docs.FirstOrDefault(t => t.DocSource == fileUrl);

                        if (localdoc == null)
                        {
                            string localFileName = string.Format(
                                "{0}\\{1}_{2}_{3}.pdf",
                                this.localDirectory,
                                category,
                                meetingDate.ToString("yyyy-MM-dd"),
                                fileUrl.ToLower().Contains("agenda") ? "agenda" : "minutes");

                            localdoc = new Documents();
                            localdoc.CityId = this.cityEntity.CityId;
                            localdoc.Checked = false;
                            localdoc.DocId = Guid.NewGuid().ToString();
                            localdoc.DocSource = fileUrl;
                            localdoc.DocType = category;
                            localdoc.DocLocalPath = localFileName;

                            try
                            {
                                c.DownloadFile(fileUrl, localFileName);
                            }
                            catch (Exception ex)
                            {
                                // Best effort: the document is still recorded, but the failure is reported.
                                Console.WriteLine("Failed to download {0}: {1}", fileUrl, ex.Message);
                            }

                            docs.Add(localdoc);
                        }
                        else
                        {
                            Console.ForegroundColor = ConsoleColor.Yellow;
                            Console.WriteLine("File already downloaded....");
                            Console.ResetColor();
                        }

                        this.ReadText(false, localdoc.DocLocalPath, ref localdoc);
                        QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);

                        if (qr == null)
                        {
                            qr = new QueryResult();
                            qr.CityId = this.cityEntity.CityId;
                            qr.DocId = localdoc.DocId;
                            qr.MeetingDate = meetingDate;
                            qr.SearchTime = DateTime.Now;
                            queries.Add(qr);
                        }

                        this.ExtractQueriesFromDoc(localdoc, ref qr);
                        Console.WriteLine("{0} docs saved, {1} queries saved...", docs.Count, queries.Count);
                    }

                    this.SaveMeetingResultsToSQL(docs, queries);
                }
            }
        }
    }
}
/// <summary>
/// Downloads the document behind each link node and extracts query results from it.
/// </summary>
/// <param name="category">Value stored as the document type of new documents.</param>
/// <param name="nodeList">Anchor nodes whose <c>href</c> points at a document.</param>
/// <param name="docs">Known documents; successfully downloaded new ones are appended.</param>
/// <param name="queries">Known query results; new ones are appended.</param>
/// <remarks>
/// The meeting date comes from <c>ExtractDate(url)</c>. When a file with the target name
/// already exists, the meeting date is inserted before the extension. A document whose
/// download fails is skipped and not added to <paramref name="docs"/>.
/// </remarks>
private void ExtractDocsFromNodeList(string category, List<HtmlNode> nodeList, ref List<Documents> docs, ref List<QueryResult> queries)
{
    using (WebClient c = new WebClient())
    {
        foreach (HtmlNode docNode in nodeList)
        {
            string href = docNode.Attributes["href"].Value;
            string url = href.StartsWith("http") ? href : "http://www.lincolnpark.govoffice.com" + href;
            DateTime meetingDate = this.ExtractDate(url);

            if (meetingDate < this.dtStartFrom)
            {
                Console.WriteLine("Date:{0}, Earlier than {1}...", meetingDate.ToString("yyyy-MM-dd"), this.dtStartFrom.ToString("yyyy-MM-dd"));
                continue;
            }

            Documents doc = docs.FirstOrDefault(t => t.DocSource.Contains(url));

            if (doc == null)
            {
                Console.WriteLine("Found new document on {0}...", url);
                doc = new Documents();
                doc.CityId = this.cityEntity.CityId;
                doc.DocId = Guid.NewGuid().ToString();
                doc.DocType = category;
                doc.DocSource = url;

                string localFileName = doc.DocSource.Split('?').FirstOrDefault().Split('/').LastOrDefault();
                string localPath = string.Format("{0}\\{1}", this.localDirectory, localFileName);

                if (File.Exists(localPath))
                {
                    // Insert the date before the extension. Done by position so that an
                    // extension-less name, or the extension text occurring elsewhere in
                    // the path, cannot break or corrupt the result.
                    string extension = Path.GetExtension(localPath);
                    string stem = localPath.Substring(0, localPath.Length - extension.Length);
                    localPath = string.Format("{0}_{1}{2}", stem, meetingDate.ToString("yyyy-MM-dd"), extension);
                }

                doc.DocLocalPath = localPath;

                try
                {
                    c.DownloadFile(doc.DocSource, localPath);
                }
                catch (Exception ex)
                {
                    Console.WriteLine("Failed to download {0}: {1}", doc.DocSource, ex.Message);
                    continue;
                }

                // Only register the document once its file actually exists locally.
                docs.Add(doc);
            }

            this.ReadText(false, doc.DocLocalPath, ref doc);
            QueryResult qr = queries.FirstOrDefault(t => t.DocId == doc.DocId);

            if (qr == null)
            {
                qr = new QueryResult();
                qr.CityId = doc.CityId;
                qr.DocId = doc.DocId;
                qr.MeetingDate = meetingDate;
                qr.SearchTime = DateTime.Now;
                queries.Add(qr);
            }

            this.ExtractQueriesFromDoc(doc, ref qr);
            Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);
        }
    }
}
/// <summary>
/// Downloads the JSON listing behind each URL, fetches every file referenced by an
/// <c>href</c> token and extracts query results from it.
/// </summary>
/// <param name="urls">Entries that are split on '*' into a category and a listing URL.</param>
/// <param name="docs">Known documents; new ones are appended.</param>
/// <param name="queries">Known query results; new ones are appended.</param>
/// <remarks>
/// The meeting date is taken from the file URL (e.g. "January 5, 2020"). Files whose URL
/// holds no parsable date are skipped instead of aborting the run. Results are saved after
/// every processed file.
/// </remarks>
public void ExtractOthers(List<string> urls, ref List<Documents> docs, ref List<QueryResult> queries)
{
    Regex dateReg = new Regex("[a-zA-Z]+[\\s]+[0-9]{1,2},[\\s]+[0-9]{4}");

    using (WebClient c = new WebClient())
    {
        foreach (string url in urls)
        {
            string[] urlParts = url.Split('*');
            string category = urlParts[0];
            string listUrl = urlParts[1];
            Console.WriteLine("Working on {0}...", listUrl);

            string json = c.DownloadString(listUrl);
            JToken jsonDoc = JsonConvert.DeserializeObject(json) as JToken;

            if (jsonDoc == null)
            {
                continue;
            }

            var fileUrlsNodes = jsonDoc.SelectTokens("$..href");

            if (fileUrlsNodes == null)
            {
                continue;
            }

            foreach (var fileUrlNode in fileUrlsNodes)
            {
                string fileUrl = "https://shelbytwpmi.documents-on-demand.com" + fileUrlNode.ToString();
                string meetingDateText = dateReg.Match(fileUrl).ToString();

                DateTime meetingDate;
                if (!DateTime.TryParse(meetingDateText, out meetingDate))
                {
                    Console.WriteLine("No valid meeting date found in {0}, skip...", fileUrl);
                    continue;
                }

                if (meetingDate < this.dtStartFrom)
                {
                    Console.WriteLine("Too early, skip...");
                    continue;
                }

                Documents localdoc = docs.FirstOrDefault(t => t.DocSource == fileUrl);

                if (localdoc == null)
                {
                    string localPath = string.Format("{0}\\{1}", this.localDirectory, fileUrl.Split('/').LastOrDefault());

                    localdoc = new Documents();
                    localdoc.DocId = Guid.NewGuid().ToString();
                    localdoc.CityId = this.cityEntity.CityId;
                    localdoc.DocSource = fileUrl;
                    localdoc.Important = false;
                    localdoc.Checked = false;
                    localdoc.DocType = category;
                    localdoc.DocLocalPath = localPath;

                    try
                    {
                        c.DownloadFile(fileUrl, localPath);
                    }
                    catch (Exception ex)
                    {
                        // Best effort: the document is still recorded, but the failure is reported.
                        Console.WriteLine("Failed to download {0}: {1}", fileUrl, ex.Message);
                    }

                    docs.Add(localdoc);
                }
                else
                {
                    Console.WriteLine("This file already downloaded....");
                }

                this.ReadText(false, localdoc.DocLocalPath, ref localdoc);
                QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);

                if (qr == null)
                {
                    qr = new QueryResult();
                    qr.DocId = localdoc.DocId;
                    qr.CityId = localdoc.CityId;
                    qr.MeetingDate = meetingDate;
                    qr.SearchTime = DateTime.Now;
                    queries.Add(qr);
                }

                this.ExtractQueriesFromDoc(localdoc, ref qr);
                Console.WriteLine("{0} docs saved, {1} queries saved...", docs.Count, queries.Count);
                this.SaveMeetingResultsToSQL(docs, queries);
            }
        }
    }
}
/// <summary>
/// Reads the text of a PDF or Word document page by page into <c>doc.DocBodyDic</c>.
/// </summary>
/// <param name="pdfFile">Path of the file; handled as PDF when the path contains "pdf",
/// as a Word document when it contains "doc".</param>
/// <param name="doc">Document whose <c>DocBodyDic</c> receives one entry per non-empty page.</param>
/// <param name="pages">Set to the page count when the file is read as PDF.</param>
/// <returns><c>true</c> when the file was read without an exception; otherwise <c>false</c>.</returns>
/// <remarks>
/// Pages already present in <c>DocBodyDic</c> are left untouched. The PDF reader is always
/// closed. Reading a Word file is attempted up to two times; Word is shut down after every
/// attempt, so each retry starts a fresh instance.
/// </remarks>
public bool ReadPdf(string pdfFile, ref Documents doc, ref int pages)
{
    bool success = false;
    char[] blanks = { '\r', '\n', '\t', (char)32, (char)160 };

    try
    {
        string lowerPath = pdfFile.ToLower();

        if (lowerPath.Contains("pdf"))
        {
            PdfReader r = new PdfReader(pdfFile);

            try
            {
                pages = r.NumberOfPages;
                PdfReaderContentParser parser = new PdfReaderContentParser(r);

                for (int i = 1; i <= pages; i++)
                {
                    if (doc.DocBodyDic.ContainsKey(i))
                    {
                        continue;
                    }

                    ITextExtractionStrategy st = parser.ProcessContent<SimpleTextExtractionStrategy>(i, new SimpleTextExtractionStrategy());
                    string text = st.GetResultantText().Trim(blanks);

                    if (string.IsNullOrEmpty(text))
                    {
                        // Second attempt with the default extractor.
                        text = PdfTextExtractor.GetTextFromPage(r, i).Trim(blanks);
                    }

                    if (!string.IsNullOrEmpty(text))
                    {
                        doc.DocBodyDic.Add(i, text);
                    }
                }

                success = true;
            }
            finally
            {
                r.Close();
            }
        }
        else if (lowerPath.Contains("doc"))
        {
            MsWord.Application newApp = null;
            MsWord.Document msdoc = null;
            int retry = 2;

            while (retry > 0)
            {
                try
                {
                    newApp = new MsWord.Application();
                    System.Threading.Thread.Sleep(1000);
                    msdoc = newApp.Documents.Open(pdfFile);
                    System.Threading.Thread.Sleep(1000);

                    object nothing = Missing.Value;
                    MsWord.WdStatistic stat = MsWord.WdStatistic.wdStatisticPages;
                    int num = msdoc.ComputeStatistics(stat, ref nothing);

                    for (int i = 1; i <= num; i++)
                    {
                        if (doc.DocBodyDic.ContainsKey(i))
                        {
                            continue;
                        }

                        object objWhat = MsWord.WdGoToItem.wdGoToPage;
                        object objWhich = MsWord.WdGoToDirection.wdGoToAbsolute;
                        object objPage = (object)i;
                        MsWord.Range range1 = msdoc.GoTo(ref objWhat, ref objWhich, ref objPage, ref nothing);
                        MsWord.Range range2 = range1.GoToNext(MsWord.WdGoToItem.wdGoToPage);
                        object objStart = range1.Start;
                        object objEnd = range2.Start;

                        // When both ranges start at the same offset, the page text
                        // runs to the end of the document.
                        if (range1.Start == range2.Start)
                        {
                            objEnd = msdoc.Characters.Count;
                        }

                        Console.ForegroundColor = ConsoleColor.Red;
                        Console.WriteLine("DEBUG: Path: {0}, {1}-{2}........", pdfFile, objStart, objEnd);
                        Console.ResetColor();

                        if ((int)objStart <= (int)objEnd)
                        {
                            string innerText = msdoc.Range(ref objStart, ref objEnd).Text;
                            doc.DocBodyDic.Add(i, innerText);
                        }
                    }

                    success = true;
                    break;
                }
                catch (Exception ex)
                {
                    Console.ForegroundColor = ConsoleColor.Red;
                    Console.WriteLine("Retry to read word {0}, Exception: {1}..", pdfFile, ex.ToString());
                    Console.ResetColor();
                    System.Threading.Thread.Sleep(1000);
                    retry--;
                }
                finally
                {
                    if (newApp != null)
                    {
                        try
                        {
                            newApp.NormalTemplate.Saved = true;

                            if (msdoc != null)
                            {
                                msdoc.Close(false);
                            }

                            newApp.Quit();
                        }
                        finally
                        {
                            // The instance is unusable after Quit(); drop the references
                            // so the next attempt creates a new one.
                            msdoc = null;
                            newApp = null;
                        }
                    }
                }
            }
        }
    }
    catch (Exception ex)
    {
        // Reading is best effort; report the reason and let the caller see "false".
        Console.WriteLine("Failed to read {0}: {1}", pdfFile, ex.Message);
    }

    return success;
}
/// <summary>
/// Loads each meeting list page, downloads the "Agenda" and "Agenda Packet" files of every
/// meeting row and extracts query results from them.
/// </summary>
/// <param name="urls">Entries of the form <c>category*url</c>; only the URL part is used.</param>
/// <param name="docs">Known documents; new ones are appended.</param>
/// <param name="queries">Known query results; new ones are appended.</param>
/// <remarks>
/// Rows without a date link or with an unparsable date are skipped instead of aborting the
/// run. File hrefs are HTML-decoded before use so that entity-encoded ampersands in the query
/// string become plain ampersands; documents stored earlier under the undecoded URL will
/// therefore not match and are fetched again.
/// </remarks>
public void ExtractCouncilAgenda(List<string> urls, ref List<Documents> docs, ref List<QueryResult> queries)
{
    HtmlWeb web = new HtmlWeb();

    using (WebClient c = new WebClient())
    {
        foreach (string url in urls)
        {
            string listUrl = url.Split('*')[1];
            HtmlDocument doc = web.Load(listUrl);
            HtmlNodeCollection recordNodes = doc.DocumentNode.SelectNodes("//div[contains(@class,'Row MeetingRow')]");

            if (recordNodes == null)
            {
                continue;
            }

            foreach (HtmlNode recordNode in recordNodes)
            {
                List<HtmlNode> docNodes = new List<HtmlNode>();
                HtmlNode agendaNode = recordNode.SelectSingleNode(".//a[text()='Agenda']");

                if (agendaNode != null)
                {
                    docNodes.Add(agendaNode);
                }

                HtmlNode agendaPacketNode = recordNode.SelectSingleNode(".//a[text()='Agenda Packet']");

                if (agendaPacketNode != null)
                {
                    docNodes.Add(agendaPacketNode);
                }

                if (docNodes.Count == 0)
                {
                    Console.WriteLine("No files found....");
                    continue;
                }

                HtmlNode dateNode = recordNode.SelectSingleNode(".//div[@class='RowLink']/a");
                string meetingDateText = dateNode == null ? string.Empty : dateNode.InnerText;

                DateTime meetingDate;
                if (!DateTime.TryParse(meetingDateText, out meetingDate))
                {
                    Console.WriteLine("No valid meeting date found in row, skip...");
                    continue;
                }

                if (meetingDate < this.dtStartFrom)
                {
                    Console.WriteLine("Too early, skip...");
                    continue;
                }

                foreach (HtmlNode docNode in docNodes)
                {
                    // The raw attribute value is HTML-encoded; decode it to get a usable URL.
                    string docUrl = "http://shelbytownmi.iqm2.com/Citizens/" + WebUtility.HtmlDecode(docNode.Attributes["href"].Value);
                    Documents localdoc = docs.FirstOrDefault(t => t.DocSource == docUrl);

                    if (localdoc == null)
                    {
                        localdoc = new Documents();
                        localdoc.DocId = Guid.NewGuid().ToString();
                        localdoc.CityId = this.cityEntity.CityId;
                        localdoc.DocSource = docUrl;
                        localdoc.Important = false;
                        localdoc.Checked = false;
                        localdoc.DocType = "City Council";
                        localdoc.DocLocalPath = string.Format("{0}\\Council_{1}_{2}.pdf", this.localDirectory, docNode.InnerText, meetingDate.ToString("yyyy-MM-dd"));

                        try
                        {
                            c.Headers.Add("user-agent", "chrome");
                            c.DownloadFile(docUrl, localdoc.DocLocalPath);
                        }
                        catch (Exception ex)
                        {
                            // Best effort: the document is still recorded, but the failure is reported.
                            Console.WriteLine("Failed to download {0}: {1}", docUrl, ex.Message);
                        }

                        docs.Add(localdoc);
                    }
                    else
                    {
                        Console.WriteLine("This file already downloaded...");
                    }

                    this.ReadText(false, localdoc.DocLocalPath, ref localdoc);
                    QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);

                    if (qr == null)
                    {
                        qr = new QueryResult();
                        qr.CityId = this.cityEntity.CityId;
                        qr.DocId = localdoc.DocId;
                        qr.SearchTime = DateTime.Now;
                        qr.MeetingDate = meetingDate;
                        queries.Add(qr);
                    }

                    this.ExtractQueriesFromDoc(localdoc, ref qr);
                    Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);
                    this.SaveMeetingResultsToSQL(docs, queries);
                }
            }
        }
    }
}
/// <summary>
/// Runs OCR on the embedded images of every PDF page that has no entry in
/// <c>doc.DocBodyDic</c> yet and stores the recognized text under the page number.
/// </summary>
/// <param name="rotate">Not used by this method.</param>
/// <param name="docPath">Path of the PDF file.</param>
/// <param name="doc">Document whose <c>DocBodyDic</c> receives the recognized text.</param>
/// <remarks>
/// Images narrower or lower than 100 are ignored. Each image is written to
/// <c>{localDirectory}\{file name without extension}\Page_{n}.jpg</c> and passed to
/// <c>RetryText</c>. A failure on one page is reported and does not stop the remaining
/// pages. One reader is used for the whole file and is always closed.
/// </remarks>
public void OCRPdf(bool rotate, string docPath, ref Documents doc)
{
    PdfReader pdfReader = new PdfReader(docPath);

    try
    {
        int totalPage = pdfReader.NumberOfPages;
        Console.WriteLine("Pdf file {0} contains {1} pages...", docPath, totalPage);

        List<int> pageNos = new List<int>();

        for (int i = 1; i <= totalPage; i++)
        {
            if (!doc.DocBodyDic.ContainsKey(i))
            {
                pageNos.Add(i);
            }
        }

        string ocrFolder = string.Format("{0}\\{1}", this.localDirectory, Path.GetFileNameWithoutExtension(docPath));

        foreach (int pageNumber in pageNos)
        {
            try
            {
                Console.WriteLine("Working on page {0}...", pageNumber);
                PdfDictionary pg = pdfReader.GetPageN(pageNumber);
                PdfDictionary res = (PdfDictionary)PdfReader.GetPdfObject(pg.Get(PdfName.RESOURCES));
                PdfDictionary xobj = res == null ? null : (PdfDictionary)PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT));

                if (xobj == null)
                {
                    // The page has no XObjects, hence no images to recognize.
                    continue;
                }

                foreach (PdfName name in xobj.Keys)
                {
                    PdfObject obj = xobj.Get(name);

                    if (!obj.IsIndirect())
                    {
                        continue;
                    }

                    PdfDictionary tg = (PdfDictionary)PdfReader.GetPdfObject(obj);
                    PdfObject widthObj = tg.Get(PdfName.WIDTH);
                    PdfObject heightObj = tg.Get(PdfName.HEIGHT);

                    if (widthObj == null || heightObj == null)
                    {
                        // No dimensions to evaluate on this XObject.
                        continue;
                    }

                    float widthValue = float.Parse(widthObj.ToString());
                    float heightValue;

                    // Fall back to the width when the height is not a plain number.
                    if (!float.TryParse(heightObj.ToString(), out heightValue))
                    {
                        heightValue = widthValue;
                    }

                    if (heightValue < 100 || widthValue < 100)
                    {
                        continue;
                    }

                    ImageRenderInfo imgRI = ImageRenderInfo.CreateForXObject(new Matrix(widthValue, heightValue), (PRIndirectReference)obj, tg);
                    PdfImageObject image = imgRI.GetImage();
                    string imageFileName = string.Format("{0}\\Page_{1}.jpg", ocrFolder, pageNumber);

                    using (Image dotnetImg = image.GetDrawingImage())
                    {
                        if (dotnetImg == null)
                        {
                            continue;
                        }

                        if (!Directory.Exists(ocrFolder))
                        {
                            Directory.CreateDirectory(ocrFolder);
                        }

                        dotnetImg.Save(imageFileName);
                    }

                    string text = RetryText(imageFileName);

                    if ((!doc.DocBodyDic.ContainsKey(pageNumber)) && (!string.IsNullOrEmpty(text)))
                    {
                        doc.DocBodyDic.Add(pageNumber, text);
                    }
                    else
                    {
                        Console.ForegroundColor = ConsoleColor.Yellow;
                        Console.WriteLine("Page {0} could read...", pageNumber);
                        Console.ResetColor();
                    }
                }
            }
            catch (Exception ex)
            {
                // OCR is best effort per page; report and move on to the next page.
                Console.WriteLine("Failed to OCR page {0} of {1}: {2}", pageNumber, docPath, ex.Message);
            }
        }
    }
    finally
    {
        pdfReader.Close();
    }
}
/// <summary>
/// For every entry of <c>meetingUrlMap</c>, loads the meeting page, downloads the
/// "Agenda", "Minutes" and "Approved Minutes" files of each meeting paragraph and extracts
/// query results from them.
/// </summary>
/// <remarks>
/// A meeting paragraph is the <c>p</c> ancestor of a link whose text is "Agenda". Paragraphs
/// without a parsable date (e.g. "January 5, 2020") or without links are skipped instead of
/// aborting the run. Files that fail to download are skipped and not recorded. Query results
/// without entries are dropped before everything is saved once at the end.
/// </remarks>
public void DownloadCouncilPdfFiles()
{
    HtmlWeb web = new HtmlWeb();
    List<QueryResult> queries = this.LoadQueriesDoneSQL();
    List<Documents> docs = this.LoadDocumentsDoneSQL();
    Regex dateReg = new Regex("[a-zA-Z]+[\\s]{0,2}[0-9]+,[\\s]{0,2}[0-9]+");
    Console.WriteLine("Will download pdf files from {0}...", this.dtStartFrom.ToString("yyyy-MM-dd"));

    using (WebClient c = new WebClient())
    {
        foreach (string key in meetingUrlMap.Keys)
        {
            Console.WriteLine("Working on {0} files...", key);
            string meetingUrl = meetingUrlMap[key];
            HtmlDocument doc = web.Load(meetingUrl);
            HtmlNodeCollection docNodeList = doc.DocumentNode.SelectNodes("//a[text()='Agenda']/ancestor::p");

            if (docNodeList == null)
            {
                continue;
            }

            foreach (HtmlNode docNode in docNodeList)
            {
                string dateText = dateReg.Match(docNode.InnerText).ToString();

                DateTime meetingDate;
                if (!DateTime.TryParse(dateText, out meetingDate))
                {
                    Console.WriteLine("No valid meeting date found, skip...");
                    continue;
                }

                if (meetingDate < this.dtStartFrom)
                {
                    Console.WriteLine("Earlier than {0}, skip...", dtStartFrom);
                    continue;
                }

                HtmlNodeCollection linkNodes = docNode.SelectNodes(".//a[@href]");

                if (linkNodes == null)
                {
                    continue;
                }

                List<HtmlNode> targetLinkNodes = linkNodes
                    .Where(t => t.InnerText == "Agenda" || t.InnerText == "Minutes" || t.InnerText == "Approved Minutes")
                    .ToList();

                foreach (HtmlNode docLinkNode in targetLinkNodes)
                {
                    string href = docLinkNode.Attributes["href"].Value;
                    string docUrl = href.StartsWith("http") ? href : string.Format("http://www.cityofwarren.org{0}", href);
                    Documents localdoc = docs.FirstOrDefault(t => t.DocSource.Contains(docUrl));

                    if (localdoc == null)
                    {
                        string localDocPath = string.Format("{0}\\{1}", localDirectory, docUrl.Split('?').FirstOrDefault().Split('/').LastOrDefault());

                        localdoc = new Documents();
                        localdoc.CityId = cityEntity.CityId;
                        localdoc.DocId = Guid.NewGuid().ToString();
                        localdoc.DocType = key;
                        localdoc.DocSource = docUrl;
                        localdoc.DocLocalPath = localDocPath;

                        try
                        {
                            c.DownloadFile(docUrl, localDocPath);
                        }
                        catch (Exception ex)
                        {
                            Console.WriteLine("File {0} failed to download due to {1}...", docUrl, ex.ToString());
                            continue;
                        }

                        docs.Add(localdoc);
                    }

                    this.ReadText(false, localdoc.DocLocalPath, ref localdoc);
                    QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);

                    if (qr == null)
                    {
                        qr = new QueryResult();
                        qr.QueryId = Guid.NewGuid().ToString();
                        qr.DocId = localdoc.DocId;
                        qr.SearchTime = DateTime.Now;
                        qr.MeetingDate = meetingDate;
                        qr.CityId = this.cityEntity.CityId;
                        queries.Add(qr);
                    }

                    this.ExtractQueriesFromDoc(localdoc, ref qr);
                    Console.WriteLine("{0} query results saved...", queries.Count);
                    Console.WriteLine("{0} docs saved...", docs.Count);
                }
            }
        }
    }

    queries.RemoveAll(t => t.Entries.Count == 0);
    this.SaveMeetingResultsToSQL(docs, queries);
}
/// <summary>
/// Downloads one document (unless it is already known), reads its text and extracts
/// query results from it.
/// </summary>
/// <param name="c">Client used for the download.</param>
/// <param name="docUrl">Source URL; also the key used to find an already known document.</param>
/// <param name="category">Document type; first passed through <c>DealWithFileName</c>.</param>
/// <param name="fileType">File extension, optionally followed by ':' and an XPath that
/// selects the text node of an "html" document.</param>
/// <param name="meetingDate">Meeting date stored on a newly created query result.</param>
/// <param name="docs">Known documents; a successfully downloaded new one is appended.</param>
/// <param name="queries">Known query results; a new one is appended when needed.</param>
/// <remarks>
/// When the download fails the method reports the error and returns without recording
/// anything. When the query result has no meeting date (<c>DateTime.MinValue</c>), the date
/// is searched in the first stored page of the document using several patterns, including
/// dates with ordinal suffixes such as "January 5th, 2020".
/// </remarks>
protected virtual void ExtractADoc(WebClient c, string docUrl, string category, string fileType, DateTime meetingDate, ref List<Documents> docs, ref List<QueryResult> queries)
{
    this.DealWithFileName(ref category);
    Documents localdoc = docs.FirstOrDefault(t => t.DocSource == docUrl);
    string[] fileTypeParts = fileType.Split(':');
    string xpath = fileTypeParts.LastOrDefault();
    fileType = fileTypeParts.FirstOrDefault();

    if (localdoc == null)
    {
        localdoc = new Documents();
        localdoc.DocSource = docUrl;
        localdoc.CityId = this.cityEntity.CityId;
        localdoc.DocId = Guid.NewGuid().ToString();
        localdoc.DocType = category;
        localdoc.Important = false;
        localdoc.Checked = false;
        localdoc.DocLocalPath = string.Format("{0}\\{1}_{2}_{3}.{4}", this.localDirectory, category, meetingDate.ToString("yyyy-MM-dd"), Guid.NewGuid().ToString(), fileType);

        try
        {
            c.Headers.Add("user-agent", "chrome");
            c.DownloadFile(docUrl, localdoc.DocLocalPath);
        }
        catch (Exception ex)
        {
            if (ex.ToString().Contains("404"))
            {
                Console.ForegroundColor = ConsoleColor.Red;
                Console.WriteLine("NOT FOUND......");
                Console.ResetColor();
            }

            Console.ForegroundColor = ConsoleColor.DarkCyan;
            Console.WriteLine("Failed to download file {0}...", docUrl);
            Console.WriteLine("ERROR: {0}", ex.ToString());
            Console.ResetColor();
            return;
        }

        docs.Add(localdoc);
    }
    else
    {
        Console.WriteLine("This file already downloaded...");
    }

    if (fileType != "html")
    {
        this.ReadText(false, localdoc.DocLocalPath, ref localdoc);
    }
    else
    {
        HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
        doc.LoadHtml(File.ReadAllText(localdoc.DocLocalPath));
        HtmlAgilityPack.HtmlNode bodyNode = doc.DocumentNode.SelectSingleNode(xpath);

        if (bodyNode != null)
        {
            // Indexer assignment: a document processed before already has page 1.
            localdoc.DocBodyDic[1] = bodyNode.InnerText;
        }
        else
        {
            Console.WriteLine("No content found with xpath {0} in {1}...", xpath, localdoc.DocLocalPath);
        }
    }

    QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);

    if (qr == null)
    {
        qr = new QueryResult();
        qr.QueryId = Guid.NewGuid().ToString();
        qr.CityId = this.cityEntity.CityId;
        qr.DocId = localdoc.DocId;
        qr.MeetingDate = meetingDate;
        qr.SearchTime = DateTime.Now;
        queries.Add(qr);
    }

    if (qr.MeetingDate == DateTime.MinValue && localdoc.DocBodyDic.Count > 0)
    {
        string firstPageText = localdoc.DocBodyDic.FirstOrDefault().Value;

        if (!string.IsNullOrEmpty(firstPageText))
        {
            Console.WriteLine("Match meeting date again...");
            Console.WriteLine("DEBUG:{0}", firstPageText);

            List<Regex> dateRegList = new List<Regex>();
            dateRegList.Add(new Regex("[a-zA-Z]{3,}[\\s]{1}[0-9]{1,2},[\\s]{1}[0-9]{4}"));
            dateRegList.Add(new Regex("[a-zA-Z]{3,}[\\s]{1}[0-9]{1,2}[\\s]{1}[0-9]{4}"));
            dateRegList.Add(new Regex("[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}"));
            dateRegList.Add(new Regex("[a-zA-Z]+[\\s]{1,2}[0-9]{1,2}(st|ST|nd|ND|rd|RD|th|TH)[,]{0,1}[\\s]{0,2}[0-9]{4}"));

            foreach (Regex dateReg in dateRegList)
            {
                Match dateMatch = dateReg.Match(firstPageText);

                if (!dateMatch.Success)
                {
                    continue;
                }

                string meetingDateText = dateMatch.Value.Replace("Sept ", "Sep ");

                // Remove an ordinal suffix that follows the day number ("5th" -> "5").
                // Only the matched date is touched, so month names such as "August"
                // keep their letters.
                meetingDateText = Regex.Replace(meetingDateText, "(?<=[0-9])(st|nd|rd|th)", string.Empty, RegexOptions.IgnoreCase);
                Console.WriteLine("DEBUG:{0}", meetingDateText);

                DateTime parsedDate;
                if (DateTime.TryParse(meetingDateText.ToLower(), out parsedDate))
                {
                    Console.WriteLine("Match meeting date succeefully.");
                    qr.MeetingDate = parsedDate;
                    break;
                }
            }
        }
    }

    this.ExtractQueriesFromDoc(localdoc, ref qr);
    Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);
}
/// <summary>
/// Downloads the JSON listing of every configured category, fetches each file referenced
/// by an <c>href</c> token and extracts query results from it.
/// </summary>
/// <remarks>
/// The meeting date is taken from the file URL: first as eight digits read as
/// <c>MMddyyyy</c>, then as text such as "January 5,2020". An eight-digit run that is not a
/// valid date no longer aborts the run; the text pattern is tried instead. Files without
/// any recognizable date keep <c>DateTime.MinValue</c> and are therefore treated as too early.
/// Results are saved after every processed file.
/// </remarks>
public void DownloadCouncilPdfFiles()
{
    List<Documents> docs = this.LoadDocumentsDoneSQL();
    List<QueryResult> queries = this.LoadQueriesDoneSQL();
    Regex dateReg = new Regex("(0|1)[0-9]{7}");
    Regex dateReg1 = new Regex("[a-zA-Z]+[\\s]{0,1}[0-9]{1,2},[0-9]{4}");

    using (WebClient c = new WebClient())
    {
        foreach (string url in this.docUrls)
        {
            Console.WriteLine("Working on {0}...", url);
            string[] urlParts = url.Split('*');
            string category = urlParts[0];
            string categoryUrl = urlParts[1];
            string dataJson = c.DownloadString(categoryUrl);
            JToken dataToken = JsonConvert.DeserializeObject(dataJson) as JToken;

            if (dataToken == null)
            {
                continue;
            }

            var docTokens = dataToken.SelectTokens("$..href");

            if (docTokens == null)
            {
                continue;
            }

            foreach (JToken docToken in docTokens)
            {
                string docUrl = "https://harrisontwpmi.documents-on-demand.com" + docToken.ToString();
                DateTime meetingDate;

                bool parsed = DateTime.TryParseExact(
                    dateReg.Match(docUrl).ToString(),
                    "MMddyyyy",
                    null,
                    System.Globalization.DateTimeStyles.None,
                    out meetingDate);

                if (!parsed && !DateTime.TryParse(dateReg1.Match(docUrl).ToString(), out meetingDate))
                {
                    // No usable date in the URL; MinValue makes the check below skip it.
                    meetingDate = DateTime.MinValue;
                }

                if (meetingDate < this.dtStartFrom)
                {
                    Console.WriteLine("Too early, skip...");
                    continue;
                }

                Documents localdoc = docs.FirstOrDefault(t => t.DocSource == docUrl);

                if (localdoc == null)
                {
                    localdoc = new Documents();
                    localdoc.DocId = Guid.NewGuid().ToString();
                    localdoc.CityId = this.cityEntity.CityId;
                    localdoc.DocType = category;
                    localdoc.DocSource = docUrl;
                    localdoc.Checked = false;
                    localdoc.Important = false;
                    localdoc.DocLocalPath = string.Format("{0}\\{1}", this.localDirectory, docUrl.Split('/').LastOrDefault());

                    try
                    {
                        c.DownloadFile(docUrl, localdoc.DocLocalPath);
                    }
                    catch (Exception ex)
                    {
                        // Best effort: the document is still recorded, but the failure is reported.
                        Console.WriteLine("Failed to download {0}: {1}", docUrl, ex.Message);
                    }

                    docs.Add(localdoc);
                }
                else
                {
                    Console.WriteLine("This file already downloaded...");
                }

                this.ReadText(false, localdoc.DocLocalPath, ref localdoc);
                QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);

                if (qr == null)
                {
                    qr = new QueryResult();
                    qr.MeetingDate = meetingDate;
                    qr.SearchTime = DateTime.Now;
                    qr.CityId = this.cityEntity.CityId;
                    qr.DocId = localdoc.DocId;
                    queries.Add(qr);
                }

                this.ExtractQueriesFromDoc(localdoc, ref qr);
                Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);
                this.SaveMeetingResultsToSQL(docs, queries);
            }
        }
    }
}
/// <summary>
/// Opens every configured category page in Chrome, downloads the PDF version of each
/// listed meeting and extracts query results from it.
/// </summary>
/// <remarks>
/// "?limit=0" is appended to the category URL and "?format=pdf" to each meeting link.
/// Meetings whose link text holds no parsable date (e.g. "January 5, 2020") are skipped
/// instead of aborting the run. The browser is shut down even when an exception occurs.
/// Results are saved after every processed meeting.
/// </remarks>
public void DownloadCouncilPdfFiles()
{
    // 3072 = TLS 1.2, 768 = TLS 1.1
    ServicePointManager.SecurityProtocol = (SecurityProtocolType)3072;
    List<Documents> docs = this.LoadDocumentsDoneSQL();
    List<QueryResult> queries = this.LoadQueriesDoneSQL();
    Regex dateReg = new Regex("[a-zA-Z]+[\\s]+[0-9]{1,2},[\\s]+[0-9]{4}");
    ChromeDriver cd = new ChromeDriver();

    try
    {
        using (WebClient c = new WebClient())
        {
            foreach (string docUrl in this.docUrls)
            {
                string[] urlParts = docUrl.Split('*');
                string category = urlParts[0];
                string categoryUrl = urlParts[1] + "?limit=0";
                cd.Navigate().GoToUrl(categoryUrl);
                System.Threading.Thread.Sleep(3000);

                HtmlDocument doc = new HtmlDocument();
                doc.LoadHtml(cd.PageSource);
                HtmlNodeCollection meetingNodes = doc.DocumentNode.SelectNodes("//td[@headers='tableOrdering']/a");

                if (meetingNodes == null)
                {
                    continue;
                }

                foreach (HtmlNode meetingNode in meetingNodes)
                {
                    string meetingDateText = dateReg.Match(meetingNode.InnerText).ToString();

                    DateTime meetingDate;
                    if (!DateTime.TryParse(meetingDateText, out meetingDate))
                    {
                        Console.WriteLine("No valid meeting date found, skip...");
                        continue;
                    }

                    if (meetingDate < this.dtStartFrom)
                    {
                        Console.WriteLine("Too early...");
                        continue;
                    }

                    string fileUrl = this.cityEntity.CityUrl + meetingNode.Attributes["href"].Value + "?format=pdf";
                    Documents localdoc = docs.FirstOrDefault(t => t.DocSource == fileUrl);

                    if (localdoc == null)
                    {
                        localdoc = new Documents();
                        localdoc.DocId = Guid.NewGuid().ToString();
                        localdoc.DocType = category;
                        localdoc.CityId = this.cityEntity.CityId;
                        localdoc.Important = false;
                        localdoc.Checked = false;
                        localdoc.DocSource = fileUrl;
                        localdoc.DocLocalPath = string.Format("{0}\\{1}.pdf", this.localDirectory, fileUrl.Split('?').FirstOrDefault().Split('/').LastOrDefault());

                        try
                        {
                            c.Headers.Add("user-agent", "chrome");
                            c.DownloadFile(fileUrl, localdoc.DocLocalPath);
                        }
                        catch (Exception ex)
                        {
                            // Best effort: the document is still recorded, but the failure is reported.
                            Console.WriteLine("Failed to download {0}: {1}", fileUrl, ex.Message);
                        }

                        docs.Add(localdoc);
                    }
                    else
                    {
                        Console.WriteLine("This file already downloaded....");
                    }

                    this.ReadText(false, localdoc.DocLocalPath, ref localdoc);
                    QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);

                    if (qr == null)
                    {
                        qr = new QueryResult();
                        qr.CityId = this.cityEntity.CityId;
                        qr.DocId = localdoc.DocId;
                        qr.MeetingDate = meetingDate;
                        qr.SearchTime = DateTime.Now;
                        queries.Add(qr);
                    }

                    this.ExtractQueriesFromDoc(localdoc, ref qr);
                    Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);
                    this.SaveMeetingResultsToSQL(docs, queries);
                }
            }
        }
    }
    finally
    {
        // Always end the browser session, also when the crawl fails midway.
        cd.Quit();
    }
}
/// <summary>
/// Downloads the "Minutes" PDFs linked from the meeting table of an already loaded page
/// and extracts query results from them.
/// </summary>
/// <param name="doc">Loaded page that contains the meeting table.</param>
/// <param name="category">Value stored as the document type of new documents.</param>
/// <param name="docs">Known documents; new ones are appended.</param>
/// <param name="queries">Known query results; new ones are appended.</param>
/// <remarks>
/// The meeting date is the part of the first cell before the first '-'. When that cell is
/// missing or its text is not a date, <c>DateTime.MinValue</c> is used and the row is still
/// processed. Everything is saved once at the end, also when no table rows were found.
/// </remarks>
private void ExtractMinutes(HtmlDocument doc, string category, ref List<Documents> docs, ref List<QueryResult> queries)
{
    HtmlNodeCollection docNodeCollection = doc.DocumentNode.SelectNodes("//table[@class='tablewithheadingresponsive']/tbody/tr[@class]");

    if (docNodeCollection != null)
    {
        using (WebClient c = new WebClient())
        {
            foreach (HtmlNode destinationNode in docNodeCollection)
            {
                HtmlNode meetingDateNode = destinationNode.SelectSingleNode("./td");
                string meetingDateText = meetingDateNode == null
                    ? string.Empty
                    : meetingDateNode.InnerText.Split('-').FirstOrDefault().Trim((char)32, (char)160);

                DateTime meetingDate;
                if (!DateTime.TryParse(meetingDateText, out meetingDate))
                {
                    meetingDate = DateTime.MinValue;
                }

                HtmlNodeCollection minuteDocNode = destinationNode.SelectNodes(".//div[@class='pdf']/a[text()='Minutes']");

                if (minuteDocNode == null || minuteDocNode.Count == 0)
                {
                    continue;
                }

                foreach (HtmlNode docNode in minuteDocNode)
                {
                    string href = docNode.Attributes["href"].Value;
                    string pdfUrl = href.StartsWith("http") ? href : this.cityEntity.CityUrl + href;
                    Documents localDoc = docs.FirstOrDefault(t => t.DocSource == pdfUrl);

                    if (localDoc == null)
                    {
                        string remoteName = pdfUrl.Split('?').FirstOrDefault().Split('/').LastOrDefault();
                        // A fresh GUID keeps equally named remote files from overwriting each other.
                        string localFilePath = string.Format("{0}\\Minutes_{1}_{2}.pdf", this.localDirectory, Path.GetFileNameWithoutExtension(remoteName), Guid.NewGuid().ToString());

                        localDoc = new Documents();
                        localDoc.DocId = Guid.NewGuid().ToString();
                        localDoc.DocSource = pdfUrl;
                        localDoc.CityId = this.cityEntity.CityId;
                        localDoc.DocType = category;
                        localDoc.DocLocalPath = localFilePath;
                        localDoc.Checked = false;

                        try
                        {
                            c.DownloadFile(pdfUrl, localFilePath);
                        }
                        catch (Exception ex)
                        {
                            // Best effort: the document is still recorded, but the failure is reported.
                            Console.WriteLine("Failed to download {0}: {1}", pdfUrl, ex.Message);
                        }

                        docs.Add(localDoc);
                    }
                    else
                    {
                        Console.ForegroundColor = ConsoleColor.Yellow;
                        Console.WriteLine("This document already downloaded...");
                        Console.ResetColor();
                    }

                    this.ReadText(false, localDoc.DocLocalPath, ref localDoc);
                    QueryResult qr = queries.FirstOrDefault(t => t.DocId == localDoc.DocId);

                    if (qr == null)
                    {
                        qr = new QueryResult();
                        qr.CityId = localDoc.CityId;
                        qr.SearchTime = DateTime.Now;
                        qr.MeetingDate = meetingDate;
                        qr.DocId = localDoc.DocId;
                        queries.Add(qr);
                    }

                    this.ExtractQueriesFromDoc(localDoc, ref qr);
                    Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);
                }
            }
        }
    }

    this.SaveMeetingResultsToSQL(docs, queries);
}
/// <summary>
/// Loads <paramref name="url"/>, picks every link whose <c>href</c> contains "pdf" and whose
/// text mentions a year between <c>dtStartFrom.Year</c> and the current year, downloads the
/// files that are not yet in <paramref name="docs"/>, extracts their text and queries, and
/// saves the results after each document.
/// </summary>
/// <param name="url">Address of the listing page to scrape.</param>
/// <param name="docs">Known documents; newly downloaded ones are appended.</param>
/// <param name="queries">Known query results; an entry is added for each document that has none.</param>
/// <exception cref="FormatException">A matched date string cannot be parsed by <c>DateTime.Parse</c>.</exception>
public void ExtractCouncil(string url, ref List<Documents> docs, ref List<QueryResult> queries)
{
    Regex dateReg = new Regex("[a-zA-Z]+[\\s]{0,1}[0-9]{1,2},[\\s]{0,1}[0-9]+");
    HtmlWeb web = new HtmlWeb();
    HtmlDocument doc = web.Load(url);
    HtmlNodeCollection pdfFileNodes = doc.DocumentNode.SelectNodes("//a[contains(@href,'pdf')]");

    // SelectNodes yields null when nothing matches; previously that null reached the
    // foreach below and threw a NullReferenceException.
    if (pdfFileNodes == null)
    {
        Console.WriteLine("No pdf links on {0}...", url);
        return;
    }

    List<int> years = new List<int>();

    for (int year = this.dtStartFrom.Year; year <= DateTime.Now.Year; year++)
    {
        years.Add(year);
    }

    List<HtmlNode> targetFilesNodes = pdfFileNodes
        .Where(t => years.Exists(y => t.InnerText.Contains(y.ToString())))
        .ToList();

    // WebClient is IDisposable: scope it so it is released even when a document throws.
    using (WebClient c = new WebClient())
    {
        foreach (HtmlNode pdfFileNode in targetFilesNodes)
        {
            string dateText = dateReg.Match(pdfFileNode.InnerText).ToString();
            DateTime meetingDate = string.IsNullOrEmpty(dateText) ? DateTime.MinValue : DateTime.Parse(dateText);
            string pdfUrl = pdfFileNode.Attributes["href"].Value;
            pdfUrl = pdfUrl.StartsWith("http")
                ? pdfUrl
                : this.cityEntity.CityUrl.Trim('/') + "/" + pdfUrl.TrimStart('/');
            Documents localdoc = docs.FirstOrDefault(t => t.DocSource == pdfUrl);

            if (localdoc == null)
            {
                localdoc = new Documents();
                localdoc.Important = false;
                localdoc.Checked = false;
                localdoc.CityId = this.cityEntity.CityId;
                localdoc.DocId = Guid.NewGuid().ToString();
                localdoc.DocSource = pdfUrl;
                localdoc.DocType = "City Council";
                // NOTE(review): this takes the part AFTER the last '?', whereas sibling scrapers
                // take the part before the first '?'. Kept as-is; confirm against the site's URLs.
                localdoc.DocLocalPath = string.Format(
                    "{0}\\City Council_{1}",
                    this.localDirectory,
                    pdfUrl.Split('?').LastOrDefault().Split('/').LastOrDefault());

                try
                {
                    c.DownloadFile(pdfUrl, localdoc.DocLocalPath);
                }
                catch (Exception ex)
                {
                    // Best effort: the document is still registered, but the failure is no longer silent.
                    Console.WriteLine("Failed to download {0}: {1}", pdfUrl, ex.Message);
                }

                docs.Add(localdoc);
            }
            else
            {
                Console.WriteLine("This file already downloaded...");
            }

            this.ReadText(false, localdoc.DocLocalPath, ref localdoc);

            // The link text carried no date: fall back to the first date-like string in
            // DocBodyDic[1] (presumably the first page of the extracted text - verify).
            if (meetingDate == DateTime.MinValue
                && localdoc.DocBodyDic.Count > 0
                && dateReg.IsMatch(localdoc.DocBodyDic[1]))
            {
                meetingDate = DateTime.Parse(dateReg.Match(localdoc.DocBodyDic[1]).ToString());
            }

            QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);

            if (qr == null)
            {
                qr = new QueryResult();
                qr.CityId = localdoc.CityId;
                qr.DocId = localdoc.DocId;
                qr.SearchTime = DateTime.Now;
                qr.MeetingDate = meetingDate;
                queries.Add(qr);
            }

            this.ExtractQueriesFromDoc(localdoc, ref qr);
            Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);
            this.SaveMeetingResultsToSQL(docs, queries);
        }
    }
}
/// <summary>
/// Collects the links that follow the small PDF icon in <paramref name="moreDoc"/>, downloads
/// those not yet present in <paramref name="docs"/>, runs text and query extraction on each
/// document, and finally calls <c>SaveMeetingResultsToSQL</c> once.
/// </summary>
/// <param name="moreDoc">Parsed page that lists the additional agenda files.</param>
/// <param name="category">Value stored in <c>DocType</c> of newly created documents.</param>
/// <param name="meetingDate">Meeting date recorded on newly created query results.</param>
/// <param name="docs">Known documents; newly downloaded ones are appended.</param>
/// <param name="queries">Known query results; an entry is added for each document that has none.</param>
private void ExtractMoreAgenda(HtmlDocument moreDoc, string category, DateTime meetingDate, ref List<Documents> docs, ref List<QueryResult> queries)
{
    HtmlNodeCollection docNodeList = moreDoc.DocumentNode.SelectNodes("//img[@src='/Images/IconsClipArtGraphics/Icon-PDFSmall.aspx']/following-sibling::a");

    if (docNodeList != null && docNodeList.Count > 0)
    {
        // WebClient is IDisposable: scope it so it is released even when a document throws.
        using (WebClient c = new WebClient())
        {
            foreach (HtmlNode docNode in docNodeList)
            {
                string pdfUrl = docNode.Attributes["href"].Value;
                pdfUrl = !pdfUrl.StartsWith("http") ? this.cityEntity.CityUrl + pdfUrl : pdfUrl;
                Documents localDoc = docs.FirstOrDefault(t => t.DocSource == pdfUrl);

                if (localDoc == null)
                {
                    localDoc = new Documents();
                    localDoc.DocId = Guid.NewGuid().ToString();
                    localDoc.DocSource = pdfUrl;
                    localDoc.CityId = this.cityEntity.CityId;
                    localDoc.DocType = category;
                    string localFilePath = string.Format(
                        "{0}\\Agenda_{1}_{2}.pdf",
                        this.localDirectory,
                        Path.GetFileNameWithoutExtension(pdfUrl.Split('?').FirstOrDefault().Split('/').LastOrDefault()),
                        Guid.NewGuid().ToString());
                    localDoc.DocLocalPath = localFilePath;
                    localDoc.Checked = false;

                    try
                    {
                        c.DownloadFile(pdfUrl, localFilePath);
                    }
                    catch (Exception ex)
                    {
                        // Best effort: the document is still registered, but the failure is no longer silent.
                        Console.WriteLine("Failed to download {0}: {1}", pdfUrl, ex.Message);
                    }

                    docs.Add(localDoc);
                }
                else
                {
                    Console.ForegroundColor = ConsoleColor.Yellow;
                    Console.WriteLine("The file already downloaded...");
                    Console.ResetColor();
                }

                this.ReadText(false, localDoc.DocLocalPath, ref localDoc);
                QueryResult qr = queries.FirstOrDefault(t => t.DocId == localDoc.DocId);

                if (qr == null)
                {
                    qr = new QueryResult();
                    qr.DocId = localDoc.DocId;
                    qr.CityId = localDoc.CityId;
                    qr.MeetingDate = meetingDate;
                    qr.SearchTime = DateTime.Now;
                    queries.Add(qr);
                }

                this.ExtractQueriesFromDoc(localDoc, ref qr);
                Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);
            }
        }
    }

    this.SaveMeetingResultsToSQL(docs, queries);
}
/// <summary>
/// Scans the Zoning Board of Appeals event pages by numeric id, starting at the id found in
/// the earliest calendar URL, and downloads the .pdf/.doc/fileticket links of every page whose
/// category text contains "zoning ". Ids without a meeting or without files are remembered in
/// <c>listNoFiles</c>; <c>largest</c> is set to each id that was processed to the end.
/// </summary>
/// <param name="docs">Known documents; newly downloaded ones are appended.</param>
/// <param name="queries">Known query results; new ones are appended.</param>
/// <remarks>
/// The scan stops at the first id above the latest calendar id whose page shows no start date.
/// Nothing is scanned when either calendar URL is empty or carries no number.
/// </remarks>
public void DownloadZBAFiles(ref List<Documents> docs, ref List<QueryResult> queries)
{
    HtmlWeb web = new HtmlWeb();
    string zbaUrl = this.docUrls.FirstOrDefault(t => t.StartsWith("Zoning Board of Appeals"));
    string zbaTemplate = zbaUrl.Split('*')[1];
    string earliestUrl = string.Empty;
    string latestUrl = string.Empty;
    this.GetEarliestMeeting(this.zbaCalendarUrl, ref earliestUrl, ref latestUrl);
    Regex digitReg = new Regex("[0-9]+");

    // The event id is the last run of digits in each URL. This check used to sit AFTER the
    // int.Parse calls, so an empty URL crashed with a NullReferenceException before it ran.
    Match earliestId = string.IsNullOrEmpty(earliestUrl) ? null : digitReg.Matches(earliestUrl).Cast<Match>().LastOrDefault();
    Match latestId = string.IsNullOrEmpty(latestUrl) ? null : digitReg.Matches(latestUrl).Cast<Match>().LastOrDefault();

    if (earliestId == null || latestId == null)
    {
        Console.WriteLine("No more meetings...");
        return;
    }

    int early = int.Parse(earliestId.Value);
    int end = int.Parse(latestId.Value);
    string noFilesPath = string.Format("{0}_NoFiles.txt", this.GetType().Name);

    // WebClient is IDisposable: scope it so it is released even when a page throws.
    using (WebClient c = new WebClient())
    {
        for (int i = early; ; i++)
        {
            if (this.listNoFiles.Contains(i))
            {
                continue;
            }

            string zba = string.Format(zbaTemplate, i);
            HtmlDocument doc = web.Load(zba);

            // The value sits two siblings after the label's parent div; a missing sibling is
            // treated the same as a missing label.
            HtmlNode dateLabel = doc.DocumentNode.SelectSingleNode("//*[text()='Start Date/Time:']/parent::div");
            HtmlNode dateNode = (dateLabel == null || dateLabel.NextSibling == null)
                ? null
                : dateLabel.NextSibling.NextSibling;

            if (dateNode == null && i > end)
            {
                Console.WriteLine("The last meeting...");
                break;
            }

            if (dateNode == null)
            {
                this.listNoFiles.Add(i);
                Console.WriteLine("No meetings...");
                continue;
            }

            DateTime dtMeeting = DateTime.Parse(dateNode.InnerText);
            HtmlNodeCollection linkNodes = doc.DocumentNode.SelectNodes("//a[@href]");
            List<HtmlNode> fileLinksNodes = null;

            if (linkNodes != null)
            {
                fileLinksNodes = linkNodes
                    .Where(t =>
                    {
                        string href = t.Attributes["href"].Value.ToLower();
                        return href.Contains(".pdf") || href.Contains(".doc") || href.Contains("fileticket");
                    })
                    .ToList();
            }

            if (fileLinksNodes != null && fileLinksNodes.Count > 0)
            {
                foreach (HtmlNode fileNode in fileLinksNodes)
                {
                    string href = fileNode.Attributes["href"].Value;
                    string pdfUrl = href.StartsWith("http") ? href : "http://www.detroitmi.gov" + href;
                    Documents localDoc = docs.FirstOrDefault(t => t.DocSource.Contains(pdfUrl));

                    if (localDoc == null)
                    {
                        localDoc = new Documents();
                        localDoc.DocId = Guid.NewGuid().ToString();

                        HtmlNode categoryLabel = doc.DocumentNode.SelectSingleNode("//*[text()='Category:']/parent::div");
                        HtmlNode categoryNode = (categoryLabel == null || categoryLabel.NextSibling == null)
                            ? null
                            : categoryLabel.NextSibling.NextSibling;

                        // Only pages whose category mentions "zoning " are of interest here.
                        if (categoryNode == null || !categoryNode.InnerText.ToLower().Contains("zoning "))
                        {
                            continue;
                        }

                        localDoc.DocType = categoryNode.InnerText.Trim('\t', '\n', '\r');

                        bool isFileTicket = pdfUrl.ToLower().Contains("fileticket");
                        string localFile = isFileTicket
                            ? pdfUrl.Split('&').FirstOrDefault().Split('=').LastOrDefault().Replace("%", string.Empty)
                            : pdfUrl.Split('?').FirstOrDefault().Split('/').LastOrDefault();
                        string localFileFull = isFileTicket
                            ? string.Format("{0}\\Zoning_{1}_{2}.pdf", this.localDirectory, localFile, dtMeeting.ToString("yyyy-MM-dd"))
                            : string.Format("{0}\\{1}", localDirectory, localFile);
                        localDoc.DocLocalPath = localFileFull;
                        localDoc.CityId = this.cityEntity.CityId;
                        localDoc.DocSource = pdfUrl;

                        try
                        {
                            c.DownloadFile(pdfUrl, localFileFull);
                        }
                        catch (Exception ex)
                        {
                            // A file that cannot be fetched is skipped, but no longer silently.
                            Console.WriteLine("Failed to download {0}: {1}", pdfUrl, ex.Message);
                            continue;
                        }

                        docs.Add(localDoc);
                    }

                    this.ReadText(false, localDoc.DocLocalPath, ref localDoc);
                    Console.ForegroundColor = ConsoleColor.Yellow;
                    Console.WriteLine("Now search in doc {0}.", localDoc.DocLocalPath);
                    Console.ResetColor();
                    QueryResult qr = queries.FirstOrDefault(q => q.DocId == localDoc.DocId);

                    if (qr == null)
                    {
                        qr = this.ExtractMeetingInformation(doc, localDoc);
                        queries.Add(qr);
                    }
                    else
                    {
                        this.ExtractQueriesFromDoc(localDoc, ref qr);
                    }

                    Console.WriteLine("{0} queries added, {1} docs added...", queries.Count, docs.Count);
                }
            }
            else
            {
                this.listNoFiles.Add(i);
                File.WriteAllLines(noFilesPath, listNoFiles.Select(t => t.ToString()));
                Console.WriteLine("No files on {0}...", zba);
            }

            this.largest = i;
        }
    }
}
/// <summary>
/// Processes every entry of <c>docUrls</c>. An entry of the form <c>date*url</c> describes one
/// council document directly; any other entry is a listing page whose
/// <c>ul.linklist</c> links are scraped. Documents dated on or after <c>dtStartFrom</c> that
/// are not yet known are downloaded, their text and queries are extracted, and the results
/// are saved after each entry.
/// </summary>
/// <exception cref="FormatException">
/// A dotted date such as "1.2.19" was found in the link text but is not a valid date.
/// </exception>
public void DownloadCouncilPdfFiles()
{
    List<Documents> docs = this.LoadDocumentsDoneSQL();
    List<QueryResult> queries = this.LoadQueriesDoneSQL();
    HtmlWeb web = new HtmlWeb();
    Regex dateReg = new Regex("[0-9]{1,2}\\.[0-9]{1,2}\\.[0-9]{2}");

    // WebClient is IDisposable: scope it so it is released even when an entry throws.
    using (WebClient c = new WebClient())
    {
        foreach (string url in this.docUrls)
        {
            string date = string.Empty;
            string category;

            if (url.Contains("*"))
            {
                date = url.Split('*')[0];
                category = "Council";
            }
            else
            {
                category = url.Contains("council") ? "Council" : "Planning Commission";
            }

            List<HtmlNode> docNodeList = null;

            if (string.IsNullOrEmpty(date))
            {
                HtmlDocument listDoc = web.Load(url);
                HtmlNodeCollection linkNodes = listDoc.DocumentNode.SelectNodes("//ul[@class='linklist ']/li/a");

                // SelectNodes yields null when nothing matches; calling ToList() on it threw
                // before the null check below could ever help.
                if (linkNodes != null)
                {
                    docNodeList = linkNodes.ToList();
                }
            }
            else
            {
                // Synthesize a single link node so the loop below handles both entry kinds.
                docNodeList = new List<HtmlNode>();
                docNodeList.Add(HtmlNode.CreateNode(string.Format("<a href='{0}'>{1}</a>", url.Split('*')[1], date)));
            }

            if (docNodeList != null)
            {
                Console.WriteLine("{0} files...", docNodeList.Count);

                foreach (HtmlNode docNode in docNodeList)
                {
                    // Only the first three space-separated words of the link text are considered.
                    string dateText = string.Join(" ", docNode.InnerText.Trim('\t', '\r', '\n', (char)32, (char)160).Split(' ').Take(3));
                    string primaryDateText = string.IsNullOrEmpty(date) ? dateText : date;
                    DateTime meetingDate;

                    if (!DateTime.TryParse(primaryDateText, out meetingDate))
                    {
                        // Fall back to a dotted date (M.d.yy) somewhere in the link text.
                        Match dotted = dateReg.Match(dateText);

                        if (!dotted.Success)
                        {
                            // One undated link must not abort the whole run.
                            Console.WriteLine("No meeting date found in '{0}'. Skip...", dateText);
                            continue;
                        }

                        meetingDate = DateTime.ParseExact(dotted.Value, "M.d.yy", null);
                    }

                    if (meetingDate < this.dtStartFrom)
                    {
                        Console.WriteLine("{0} earlier than {1}. Skip...", meetingDate, dtStartFrom);
                        continue;
                    }

                    string pdfUrl = docNode.Attributes["href"].Value;
                    pdfUrl = pdfUrl.StartsWith("http") ? pdfUrl : "http://www.algonac-mi.gov" + pdfUrl;
                    Documents localdoc = docs.FirstOrDefault(t => t.DocSource == pdfUrl);

                    if (localdoc == null)
                    {
                        localdoc = new Documents();
                        localdoc.DocId = Guid.NewGuid().ToString();
                        localdoc.DocType = category;
                        localdoc.CityId = this.cityEntity.CityId;
                        localdoc.DocSource = pdfUrl;
                        localdoc.DocLocalPath = string.Format(
                            "{0}\\{1}_{2}_{3}.pdf",
                            this.localDirectory,
                            category,
                            meetingDate.ToString("yyyy-MM-dd"),
                            Guid.NewGuid().ToString());

                        try
                        {
                            c.DownloadFile(pdfUrl, localdoc.DocLocalPath);
                        }
                        catch (Exception ex)
                        {
                            // Best effort: the document is still registered, but the failure is no longer silent.
                            Console.WriteLine("Failed to download {0}: {1}", pdfUrl, ex.Message);
                        }

                        docs.Add(localdoc);
                    }
                    else
                    {
                        Console.ForegroundColor = ConsoleColor.Yellow;
                        Console.WriteLine("This document already downloaded...");
                        Console.ResetColor();
                    }

                    this.ReadText(false, localdoc.DocLocalPath, ref localdoc);
                    QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);

                    if (qr == null)
                    {
                        qr = new QueryResult();
                        qr.DocId = localdoc.DocId;
                        qr.CityId = localdoc.CityId;
                        qr.MeetingDate = meetingDate;
                        qr.SearchTime = DateTime.Now;
                        queries.Add(qr);
                    }

                    this.ExtractQueriesFromDoc(localdoc, ref qr);
                    Console.WriteLine("{0} documents saved...", docs.Count);
                    Console.WriteLine("{0} query results saved...", queries.Count);
                }
            }

            this.SaveMeetingResultsToSQL(docs, queries);
        }
    }
}
/// <summary>
/// Entry point of the scraper: reloads the persisted "no files" id list, runs the planning
/// commission and ZBA extraction, then scans the city council event pages by numeric id from
/// the start id stored in the "City Council" entry of <c>docUrls</c> up to <c>largest</c>,
/// downloading every .pdf/.doc/fileticket link that is not yet known. Results are saved after
/// the ZBA step and again at the end.
/// </summary>
/// <remarks>
/// The "City Council" entry is read as <c>name*template"startId</c>. <c>largest</c> is raised to
/// the last number found in the latest calendar URL when that number is higher.
/// </remarks>
public void DownloadCouncilPdfFiles()
{
    HtmlWeb web = new HtmlWeb();
    List<Documents> docs = this.LoadDocumentsDoneSQL();
    List<QueryResult> queries = this.LoadQueriesDoneSQL();
    string noFilesPath = string.Format("{0}_NoFiles.txt", this.GetType().Name);

    this.listNoFiles = File.Exists(noFilesPath)
        ? File.ReadAllLines(noFilesPath).Select(t => int.Parse(t)).ToList()
        : new List<int>();
    this.ExtractPlanningCommissionFiles(ref docs, ref queries);
    this.DownloadZBAFiles(ref docs, ref queries);
    this.SaveMeetingResultsToSQL(docs, queries);

    string earliestUrl = string.Empty;
    string latestUrl = string.Empty;
    this.GetEarliestMeeting(this.cityCouncilUrl, ref earliestUrl, ref latestUrl);
    string councilTemplate = this.docUrls.FirstOrDefault(t => t.StartsWith("City Council"));
    string template = councilTemplate.Split('"').FirstOrDefault().Split('*').LastOrDefault();
    Regex digitReg = new Regex("[0-9]+");

    // The latest event id is the last run of digits in the URL. When the calendar yields no
    // usable URL the previous code crashed with a NullReferenceException; now the scan simply
    // keeps the upper bound already stored in 'largest'.
    Match latestId = string.IsNullOrEmpty(latestUrl) ? null : digitReg.Matches(latestUrl).Cast<Match>().LastOrDefault();

    if (latestId == null)
    {
        Console.WriteLine("No meeting on page...");
    }
    else
    {
        int end = int.Parse(latestId.Value);

        if (end > this.largest)
        {
            this.largest = end;
        }
    }

    // Ids beyond the upper bound may get files later, so forget them.
    listNoFiles.RemoveAll(t => t > this.largest);
    int start = int.Parse(councilTemplate.Split('"').LastOrDefault());
    Console.WriteLine("Start from {0}...", start);
    Console.WriteLine("End at {0}...", this.largest);

    // WebClient is IDisposable: scope it so it is released even when a page throws.
    using (WebClient c = new WebClient())
    {
        for (int i = start; i <= this.largest; i++)
        {
            if (listNoFiles.Contains(i))
            {
                continue;
            }

            string councilUrl = template.Contains("{0}") ? string.Format(template, i) : template;
            HtmlDocument doc = web.Load(councilUrl);
            HtmlNodeCollection linksNodes = doc.DocumentNode.SelectNodes("//a[@href]");
            List<HtmlNode> fileLinksNodes = null;

            if (linksNodes != null)
            {
                fileLinksNodes = linksNodes
                    .Where(t =>
                    {
                        string href = t.Attributes["href"].Value.ToLower();
                        return href.Contains(".pdf") || href.Contains(".doc") || href.Contains("fileticket");
                    })
                    .ToList();
            }

            // The value sits two siblings after the label's parent div; a missing sibling is
            // treated the same as a missing label.
            HtmlNode dateLabel = doc.DocumentNode.SelectSingleNode("//*[text()='Start Date/Time:']/parent::div");
            HtmlNode dateNode = (dateLabel == null || dateLabel.NextSibling == null)
                ? null
                : dateLabel.NextSibling.NextSibling;
            string date = dateNode == null
                ? string.Empty
                : DateTime.Parse(dateNode.InnerText).ToString("yyyy-MM-dd");

            if (fileLinksNodes != null && fileLinksNodes.Count > 0)
            {
                foreach (HtmlNode fileNode in fileLinksNodes)
                {
                    string href = fileNode.Attributes["href"].Value;
                    string pdfUrl = href.StartsWith("http") ? href : "http://www.detroitmi.gov" + href;
                    Documents localDoc = docs.FirstOrDefault(t => t.DocSource.Contains(pdfUrl));

                    if (localDoc == null)
                    {
                        localDoc = new Documents();
                        localDoc.DocId = Guid.NewGuid().ToString();

                        HtmlNode categoryLabel = doc.DocumentNode.SelectSingleNode("//*[text()='Category:']/parent::div");
                        HtmlNode categoryNode = (categoryLabel == null || categoryLabel.NextSibling == null)
                            ? null
                            : categoryLabel.NextSibling.NextSibling;

                        if (categoryNode == null)
                        {
                            continue;
                        }

                        localDoc.DocType = categoryNode.InnerText.Trim('\t', '\n', '\r');

                        bool isFileTicket = pdfUrl.ToLower().Contains("fileticket");
                        string localFile = isFileTicket
                            ? pdfUrl.Split('&').FirstOrDefault().Split('=').LastOrDefault().Replace("%", string.Empty)
                            : pdfUrl.Split('?').FirstOrDefault().Split('/').LastOrDefault();
                        string localFileFull = isFileTicket
                            ? string.Format("{0}\\Council_{1}_{2}.pdf", this.localDirectory, localFile, date)
                            : string.Format("{0}\\{1}", localDirectory, localFile);

                        // Avoid overwriting a different document that happens to share the file name.
                        if (File.Exists(localFileFull))
                        {
                            localFileFull = string.Format("{0}\\{1}_{2}.pdf", localDirectory, Path.GetFileNameWithoutExtension(localFile), date);
                        }

                        localDoc.DocLocalPath = localFileFull;
                        localDoc.CityId = this.cityEntity.CityId;
                        localDoc.DocSource = pdfUrl;

                        try
                        {
                            c.DownloadFile(pdfUrl, localFileFull);
                        }
                        catch (Exception ex)
                        {
                            // A file that cannot be fetched is skipped, but no longer silently.
                            Console.WriteLine("Failed to download {0}: {1}", pdfUrl, ex.Message);
                            continue;
                        }

                        docs.Add(localDoc);
                    }

                    this.ReadText(true, localDoc.DocLocalPath, ref localDoc);
                    Console.ForegroundColor = ConsoleColor.Yellow;
                    Console.WriteLine("Now search in doc {0}.", localDoc.DocLocalPath);
                    Console.ResetColor();
                    QueryResult qr = queries.FirstOrDefault(q => q.DocId == localDoc.DocId);

                    if (qr == null)
                    {
                        qr = this.ExtractMeetingInformation(doc, localDoc);
                        queries.Add(qr);
                    }
                    else
                    {
                        this.ExtractQueriesFromDoc(localDoc, ref qr);
                    }

                    Console.WriteLine("{0} queries added, {1} docs added...", queries.Count, docs.Count);
                }
            }
            else
            {
                listNoFiles.Add(i);
                File.WriteAllLines(noFilesPath, listNoFiles.Select(t => t.ToString()));
                Console.WriteLine("No files on {0}...", councilUrl);
            }
        }
    }

    this.SaveMeetingResultsToSQL(docs, queries);
}
/// <summary>
/// Loads the first entry of <c>docUrls</c>, locates the table that contains the text
/// "Township Board" and walks its rows from the third one on. The first six cells of a row
/// are read in pairs: cells 0-1 as "City Council", 2-3 as "Planning Commission" and 4-5 as
/// "Zoning Board of Appeals". Every link whose text holds an M/d/yyyy date on or after
/// <c>dtStartFrom</c> is downloaded if unknown, processed, and the results are saved after
/// each document.
/// </summary>
public void DownloadCouncilPdfFiles()
{
    // Category of a cell is determined by its pair index (column / 2).
    string[] categoriesByPair = { "City Council", "Planning Commission", "Zoning Board of Appeals" };

    List<Documents> docs = this.LoadDocumentsDoneSQL();
    List<QueryResult> queries = this.LoadQueriesDoneSQL();
    HtmlWeb web = new HtmlWeb();
    Regex dateReg = new Regex("[0-9]{1,2}\\/[0-9]{1,2}\\/[0-9]{4}");
    HtmlDocument doc = web.Load(this.docUrls.FirstOrDefault());
    HtmlNodeCollection fileNodes = doc.DocumentNode.SelectNodes("//*[text()='Township Board']/ancestor::table/tbody/tr[position()>2]");

    if (fileNodes == null)
    {
        return;
    }

    // WebClient is IDisposable: scope it so it is released even when a row throws.
    using (WebClient c = new WebClient())
    {
        foreach (HtmlNode fileNode in fileNodes)
        {
            HtmlNodeCollection tds = fileNode.SelectNodes("./td");

            // Rows without cells (or with fewer than six) used to crash on tds[i].
            if (tds == null)
            {
                continue;
            }

            int cellCount = Math.Min(6, tds.Count);

            for (int i = 0; i < cellCount; i++)
            {
                string category = categoriesByPair[i / 2];
                HtmlNodeCollection docNodes = tds[i].SelectNodes("./a");

                if (docNodes == null)
                {
                    continue;
                }

                foreach (HtmlNode docNode in docNodes)
                {
                    string meetingDocUrl = this.cityEntity.CityUrl + "/" + docNode.Attributes["href"].Value;
                    string meetingDateText = dateReg.Match(docNode.InnerText).ToString();

                    if (string.IsNullOrEmpty(meetingDateText))
                    {
                        Console.WriteLine("No file...");
                        continue;
                    }

                    DateTime meetingDate = DateTime.Parse(meetingDateText);

                    if (meetingDate < this.dtStartFrom)
                    {
                        Console.WriteLine("Too early...");
                        continue;
                    }

                    Documents localdoc = docs.FirstOrDefault(t => t.DocSource == meetingDocUrl);

                    if (localdoc == null)
                    {
                        localdoc = new Documents();
                        localdoc.DocId = Guid.NewGuid().ToString();
                        localdoc.CityId = this.cityEntity.CityId;
                        localdoc.Important = false;
                        localdoc.Checked = false;
                        localdoc.DocSource = meetingDocUrl;
                        localdoc.DocType = category;
                        localdoc.DocLocalPath = string.Format("{0}\\{1}", this.localDirectory, meetingDocUrl.Split('/').LastOrDefault());

                        try
                        {
                            c.DownloadFile(meetingDocUrl, localdoc.DocLocalPath);
                        }
                        catch (Exception ex)
                        {
                            // Best effort: the document is still registered, but the failure is no longer silent.
                            Console.WriteLine("Failed to download {0}: {1}", meetingDocUrl, ex.Message);
                        }

                        docs.Add(localdoc);
                    }
                    else
                    {
                        Console.WriteLine("This file already downloaded...");
                    }

                    this.ReadText(false, localdoc.DocLocalPath, ref localdoc);
                    QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);

                    if (qr == null)
                    {
                        qr = new QueryResult();
                        qr.CityId = this.cityEntity.CityId;
                        qr.DocId = localdoc.DocId;
                        qr.SearchTime = DateTime.Now;
                        qr.MeetingDate = meetingDate;
                        queries.Add(qr);
                    }

                    this.ExtractQueriesFromDoc(localdoc, ref qr);
                    Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);
                    this.SaveMeetingResultsToSQL(docs, queries);
                }
            }
        }
    }
}
/// <summary>
/// Loads <paramref name="url"/>, and for every link whose <c>href</c> contains "pdf" and whose
/// text carries a "Month d, yyyy"-style date on or after <c>dtStartFrom</c>, downloads the file
/// if it is not yet in <paramref name="docs"/> and runs text and query extraction on it.
/// This method does not save the results itself.
/// </summary>
/// <param name="url">Address of the listing page to scrape.</param>
/// <param name="category">Value stored in <c>DocType</c> of newly created documents.</param>
/// <param name="docs">Known documents; newly downloaded ones are appended.</param>
/// <param name="queries">Known query results; an entry is added for each document that has none.</param>
public void ExtractOthers(string url, string category, ref List<Documents> docs, ref List<QueryResult> queries)
{
    HtmlWeb web = new HtmlWeb();
    // Was "[a-zA-z]": the range A-z also matches '[', '\', ']', '^', '_' and '`'.
    Regex dateReg = new Regex("[a-zA-Z]+[\\s]{0,2}[0-9]+,[\\s]{0,2}[0-9]+");
    HtmlDocument listDoc = web.Load(url);
    HtmlNodeCollection docNodeList = listDoc.DocumentNode.SelectNodes("//a[contains(@href,'pdf')]");

    if (docNodeList == null)
    {
        return;
    }

    // WebClient is IDisposable: scope it so it is released even when a document throws.
    using (WebClient c = new WebClient())
    {
        foreach (HtmlNode docNode in docNodeList)
        {
            string docUrl = docNode.Attributes["href"].Value;
            docUrl = docUrl.StartsWith("http") ? docUrl : this.cityEntity.CityUrl + docUrl;
            string meetingDateText = docNode.InnerText.Trim('\r', '\n', '\t', (char)32, (char)160);
            meetingDateText = dateReg.Match(meetingDateText).ToString();
            DateTime meetingDate;

            // A pdf link without a recognizable date used to abort the whole listing with a
            // FormatException; skip just that link instead.
            if (!DateTime.TryParse(meetingDateText, out meetingDate))
            {
                Console.WriteLine("No meeting date for {0}, skip!", docUrl);
                continue;
            }

            if (meetingDate < this.dtStartFrom)
            {
                Console.WriteLine("Too early, skip!");
                continue;
            }

            Documents localdoc = docs.FirstOrDefault(t => t.DocSource == docUrl);

            if (localdoc == null)
            {
                localdoc = new Documents();
                localdoc.DocId = Guid.NewGuid().ToString();
                localdoc.CityId = this.cityEntity.CityId;
                localdoc.Checked = false;
                localdoc.Important = false;
                localdoc.DocSource = docUrl;
                localdoc.DocType = category;
                localdoc.DocLocalPath = string.Format("{0}\\{1}", this.localDirectory, docUrl.Split('?').FirstOrDefault().Split('/').LastOrDefault());

                try
                {
                    c.DownloadFile(docUrl, localdoc.DocLocalPath);
                }
                catch (Exception ex)
                {
                    // Best effort: the document is still registered, but the failure is no longer silent.
                    Console.WriteLine("Failed to download {0}: {1}", docUrl, ex.Message);
                }

                docs.Add(localdoc);
            }
            else
            {
                Console.ForegroundColor = ConsoleColor.Yellow;
                Console.WriteLine("This document already downloaded...");
                Console.ResetColor();
            }

            this.ReadText(false, localdoc.DocLocalPath, ref localdoc);
            QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);

            if (qr == null)
            {
                qr = new QueryResult();
                qr.CityId = localdoc.CityId;
                qr.DocId = localdoc.DocId;
                qr.SearchTime = DateTime.Now;
                qr.MeetingDate = meetingDate;
                queries.Add(qr);
            }

            this.ExtractQueriesFromDoc(localdoc, ref qr);
            Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);
        }
    }
}