Beispiel #1
0
        /// <summary>
        /// Downloads the JSON document list at <paramref name="url"/> and walks its <c>data</c> entries.
        /// For every entry whose title carries a meeting date on or after <c>dtStartFrom</c>, a
        /// <c>Documents</c> record is looked up by source URL (or created and its file downloaded),
        /// then handed to <c>ReadText</c> and <c>ExtractQueriesFromDoc</c>.
        /// </summary>
        /// <param name="url">Address of the JSON feed that lists the documents.</param>
        /// <param name="category">Value stored in <c>DocType</c> of newly created documents.</param>
        /// <param name="docs">Known documents; newly discovered ones are appended.</param>
        /// <param name="queries">Known query results; one is appended for each document that has none yet.</param>
        private void ExtractCouncil(string url, string category, ref List <Documents> docs, ref List <QueryResult> queries)
        {
            Regex dateReg = new Regex("[A-Za-z]+[\\s]{1}[0-9]{1,2},[\\s]+[0-9]{2,4}");

            // WebClient is disposable; scope it to this method so it is released even on failure.
            using (WebClient c = new WebClient())
            {
                string json = c.DownloadString(url);

                // Wrap the payload in an object with a single "root" property before converting it to XML.
                json = string.Format("{0}\"root\":{1}{2}", "{", json, "}");
                XmlDocument docXml  = JsonConvert.DeserializeXmlNode(json, "root");
                XmlNodeList docList = docXml.SelectNodes("//root/data");

                foreach (XmlNode docNode in docList)
                {
                    XmlNode titleNode = docNode.SelectSingleNode("./title");
                    XmlNode hrefNode  = docNode.SelectSingleNode("./attr/href");

                    if (titleNode == null || hrefNode == null)
                    {
                        Console.WriteLine("Entry without title or link, skip...");
                        continue;
                    }

                    string title  = titleNode.InnerText;
                    string docUrl = "https://southfieldcitymi.documents-on-demand.com" + hrefNode.InnerText;

                    // The meeting date is embedded in the title; an entry without a readable date is
                    // skipped instead of aborting the whole run with a FormatException.
                    DateTime meetingDate;
                    if (!DateTime.TryParse(dateReg.Match(title).Value, out meetingDate))
                    {
                        Console.WriteLine("No meeting date found in [{0}], skip...", title);
                        continue;
                    }

                    if (meetingDate < this.dtStartFrom)
                    {
                        Console.WriteLine("Too early, skip...");
                        continue;
                    }

                    Documents localDoc = docs.FirstOrDefault(t => t.DocSource == docUrl);

                    if (localDoc == null)
                    {
                        localDoc           = new Documents();
                        localDoc.DocId     = Guid.NewGuid().ToString();
                        localDoc.CityId    = this.cityEntity.CityId;
                        localDoc.Checked   = false;
                        localDoc.DocType   = category;
                        localDoc.DocSource = docUrl;
                        string localPath = string.Format("{0}\\{1}_{2}", this.localDirectory, Guid.NewGuid().ToString(), docUrl.Split('/').LastOrDefault());
                        localDoc.DocLocalPath = localPath;

                        try
                        {
                            c.DownloadFile(docUrl, localPath);
                        }
                        catch (Exception ex)
                        {
                            // Best effort: the record is still registered, but the failure is reported.
                            Console.WriteLine("Failed to download {0}: {1}", docUrl, ex.Message);
                        }

                        docs.Add(localDoc);
                    }
                    else
                    {
                        Console.ForegroundColor = ConsoleColor.Yellow;
                        Console.WriteLine("{0} already downloaded...", docUrl);
                        Console.ResetColor();
                    }

                    this.ReadText(false, localDoc.DocLocalPath, ref localDoc);
                    QueryResult qr = queries.FirstOrDefault(t => t.DocId == localDoc.DocId);

                    if (qr == null)
                    {
                        qr             = new QueryResult();
                        qr.DocId       = localDoc.DocId;
                        qr.CityId      = localDoc.CityId;
                        qr.SearchTime  = DateTime.Now;
                        qr.MeetingDate = meetingDate;
                        queries.Add(qr);
                    }

                    this.ExtractQueriesFromDoc(localDoc, ref qr);
                    Console.WriteLine("{0} docs saved, {1} queries saved...", docs.Count, queries.Count);
                }
            }
        }
Beispiel #2
0
        /// <summary>
        /// Walks every listing page in <c>docUrls</c>, and for each linked PDF whose file name parses as
        /// a date on or after <c>dtStartFrom</c> registers a <c>Documents</c> record (downloading the
        /// file when its URL is not known yet), hands it to <c>ReadText</c> and
        /// <c>ExtractQueriesFromDoc</c>, and saves the results after each listing page.
        /// </summary>
        public void DownloadCouncilPdfFiles()
        {
            List <Documents>   docs    = this.LoadDocumentsDoneSQL();
            List <QueryResult> queries = this.LoadQueriesDoneSQL();
            HtmlWeb            web     = new HtmlWeb();

            // WebClient is disposable; scope it to this method so it is released even on failure.
            using (WebClient c = new WebClient())
            {
                foreach (string url in this.docUrls)
                {
                    HtmlDocument       listDoc     = web.Load(url);
                    HtmlNodeCollection docNodeList = listDoc.DocumentNode.SelectNodes("//div[contains(@id,'vid')]//ul/li/a[@href]");

                    if (docNodeList != null)
                    {
                        Console.WriteLine("{0} files...", docNodeList.Count);

                        foreach (HtmlNode docNode in docNodeList)
                        {
                            string pdfName = docNode.InnerText.Trim('\t', '\r', '\n', (char)32, (char)160);
                            string tag     = pdfName.ToLower().Contains("agenda") ? "agenda" : "minute";
                            string pdfUrl  = docNode.Attributes["href"].Value;
                            pdfUrl = pdfUrl.StartsWith("http") ? pdfUrl : "http://www.saginaw-mi.com" + pdfUrl;

                            // Last path segment without the query string; it doubles as the meeting date
                            // (once ".pdf" is removed) and as the tail of the local file name.
                            string fileName = pdfUrl.Split('?').FirstOrDefault().Split('/').LastOrDefault();

                            // A file whose name is not a date is skipped rather than aborting the run.
                            DateTime meetingDate;
                            if (!DateTime.TryParse(fileName.Replace(".pdf", string.Empty), out meetingDate))
                            {
                                Console.WriteLine("No meeting date found in {0}. Skip...", pdfUrl);
                                continue;
                            }

                            if (meetingDate < this.dtStartFrom)
                            {
                                Console.WriteLine("{0} earlier than {1}. Skip...", meetingDate, dtStartFrom);
                                continue;
                            }

                            Documents localdoc = docs.FirstOrDefault(t => t.DocSource == pdfUrl);

                            if (localdoc == null)
                            {
                                string category = "Council";
                                localdoc           = new Documents();
                                localdoc.DocId     = Guid.NewGuid().ToString();
                                localdoc.DocType   = category;
                                localdoc.CityId    = this.cityEntity.CityId;
                                localdoc.DocSource = pdfUrl;

                                string localPath = string.Format("{0}\\{1}_{2}", this.localDirectory, tag, fileName);
                                localPath             = HttpUtility.UrlDecode(localPath);
                                localdoc.DocLocalPath = localPath;

                                try
                                {
                                    c.DownloadFile(pdfUrl, localPath);
                                }
                                catch (Exception ex)
                                {
                                    // Best effort: the record is still registered, but the failure is reported.
                                    Console.WriteLine("Failed to download {0}: {1}", pdfUrl, ex.Message);
                                }

                                docs.Add(localdoc);
                            }
                            else
                            {
                                Console.ForegroundColor = ConsoleColor.Yellow;
                                Console.WriteLine("This document already downloaded...");
                                Console.ResetColor();
                            }

                            this.ReadText(false, localdoc.DocLocalPath, ref localdoc);
                            QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);

                            if (qr == null)
                            {
                                qr             = new QueryResult();
                                qr.DocId       = localdoc.DocId;
                                qr.CityId      = localdoc.CityId;
                                qr.MeetingDate = meetingDate;
                                qr.SearchTime  = DateTime.Now;
                                queries.Add(qr);
                            }

                            this.ExtractQueriesFromDoc(localdoc, ref qr);
                            Console.WriteLine("{0} documents saved...", docs.Count);
                            Console.WriteLine("{0} query results saved...", queries.Count);
                        }
                    }

                    this.SaveMeetingResultsToSQL(docs, queries);
                }
            }
        }
Beispiel #3
0
        /// <summary>
        /// Loads the minutes page at <paramref name="url"/> and processes every "(pdf)" link found in the
        /// content holder. The meeting date is read from the link text or, failing that, from a
        /// month-day-year token in the document URL. Links dated on or after <c>dtStartFrom</c> are
        /// registered as "Planning Commission" documents (downloaded when not yet known) and handed to
        /// <c>ReadText</c> and <c>ExtractQueriesFromDoc</c>.
        /// </summary>
        /// <param name="url">Address of the page listing the minutes.</param>
        /// <param name="docs">Known documents; newly discovered ones are appended.</param>
        /// <param name="queries">Known query results; one is appended for each document that has none yet.</param>
        private void ExtractCommissionMinutes(string url, ref List <Documents> docs, ref List <QueryResult> queries)
        {
            Regex              dateReg     = new Regex("[A-Za-z]+[\\s]{1}[0-9]{1,2},[\\s]+[0-9]{2,4}");
            Regex              urlDateReg  = new Regex("[0-9]{1,2}-[0-9]{1,2}-[0-9]{2}");
            string             category    = "Planning Commission";
            HtmlWeb            web         = new HtmlWeb();
            HtmlDocument       minuteDoc   = web.Load(url);
            HtmlNodeCollection docNodeList = minuteDoc.DocumentNode.SelectNodes("//div[@id='dnn_ctr1988_Display_HtmlHolder']//a[contains(text(),'(pdf)')]");

            if (docNodeList == null || docNodeList.Count == 0)
            {
                return;
            }

            // WebClient is disposable; scope it to this method so it is released even on failure.
            using (WebClient c = new WebClient())
            {
                foreach (HtmlNode docNode in docNodeList)
                {
                    string   docUrl      = this.cityEntity.CityUrl + docNode.Attributes["href"].Value;
                    DateTime meetingDate = DateTime.MinValue;
                    bool     isDate      = DateTime.TryParse(dateReg.Match(docNode.InnerText).ToString(), out meetingDate);

                    if (!isDate)
                    {
                        // The URL pattern allows one- or two-digit month and day, so the format must be
                        // "M-d-yy"; "MM-dd-yy" would reject every single-digit value the pattern matched.
                        isDate = DateTime.TryParseExact(
                            urlDateReg.Match(docUrl).ToString(),
                            "M-d-yy",
                            System.Globalization.CultureInfo.InvariantCulture,
                            System.Globalization.DateTimeStyles.None,
                            out meetingDate);
                    }

                    if (!isDate)
                    {
                        Console.WriteLine("no meeting date found for {0}, skip...", docUrl);
                        continue;
                    }

                    if (meetingDate < this.dtStartFrom)
                    {
                        Console.WriteLine("too early, skip...");
                        continue;
                    }

                    Documents localDoc = docs.FirstOrDefault(t => t.DocSource == docUrl);

                    if (localDoc == null)
                    {
                        localDoc           = new Documents();
                        localDoc.DocId     = Guid.NewGuid().ToString();
                        localDoc.CityId    = this.cityEntity.CityId;
                        localDoc.DocType   = category;
                        localDoc.Checked   = false;
                        localDoc.DocSource = docUrl;

                        string localFileName = string.Format("{0}\\{1}_{2}", this.localDirectory, Guid.NewGuid().ToString(), docUrl.Split('/').LastOrDefault().Split('?').FirstOrDefault());
                        localDoc.DocLocalPath = localFileName;

                        try
                        {
                            c.DownloadFile(docUrl, localFileName);
                        }
                        catch (Exception ex)
                        {
                            // Best effort: the record is still registered, but the failure is reported.
                            Console.WriteLine("Failed to download {0}: {1}", docUrl, ex.Message);
                        }

                        docs.Add(localDoc);
                    }
                    else
                    {
                        Console.WriteLine("{0} already downloaded", docUrl);
                    }

                    this.ReadText(false, localDoc.DocLocalPath, ref localDoc);
                    QueryResult qr = queries.FirstOrDefault(t => t.DocId == localDoc.DocId);

                    if (qr == null)
                    {
                        qr             = new QueryResult();
                        qr.DocId       = localDoc.DocId;
                        qr.CityId      = this.cityEntity.CityId;
                        qr.SearchTime  = DateTime.Now;
                        qr.MeetingDate = meetingDate;

                        queries.Add(qr);
                    }

                    this.ExtractQueriesFromDoc(localDoc, ref qr);
                    Console.WriteLine("{0} docs, {1} queries...", docs.Count, queries.Count);
                }
            }
        }
Beispiel #4
0
        /// <summary>
        /// Loads the archive index at <paramref name="url"/>, then visits the "Current" link and every
        /// archive month link whose year is not before <c>dtStartFrom</c>. On each (paged) month listing,
        /// agenda pages titled as Planning Commission (not cancelled) or City Council meetings are saved
        /// as local HTML, their meeting date is read from the agenda text, and the document is handed to
        /// <c>ExtractQueriesFromDoc</c>.
        /// </summary>
        /// <param name="url">Address of the archive index page.</param>
        /// <param name="docs">Known documents; newly discovered ones are appended.</param>
        /// <param name="queries">Known query results; one is appended for each document that has none yet.</param>
        private void ExtractAgenda(string url, ref List <Documents> docs, ref List <QueryResult> queries)
        {
            Regex              dateReg          = new Regex("[A-Za-z]+[\\s]{1}[0-9]{1,2},[\\s]+[0-9]{2,4}");
            Regex              digitReg         = new Regex("[0-9]{4}");
            string             category         = "Planning Commission";
            HtmlWeb            web              = new HtmlWeb();
            HtmlDocument       archiveDoc       = web.Load(url);
            HtmlNodeCollection archiveMonthList = archiveDoc.DocumentNode.SelectNodes("//a[@class='archivedisplaymonthlink']");
            HtmlNode           currentNode      = archiveDoc.DocumentNode.SelectSingleNode("//a[text()='Current']");

            // Build the work list separately: either lookup may come back empty, and the "Current"
            // link (when present) is processed first.
            List <HtmlNode> archiveNodes = new List <HtmlNode>();
            if (currentNode != null)
            {
                archiveNodes.Add(currentNode);
            }

            if (archiveMonthList != null)
            {
                archiveNodes.AddRange(archiveMonthList);
            }

            foreach (HtmlNode archiveNode in archiveNodes)
            {
                int year;
                if (archiveNode.InnerText == "Current")
                {
                    // The "Current" link carries no year in its text; use today's year, not a fixed one.
                    year = DateTime.Now.Year;
                }
                else if (!int.TryParse(digitReg.Match(archiveNode.InnerText).Value, out year))
                {
                    Console.WriteLine("No year found in [{0}], skip...", archiveNode.InnerText);
                    continue;
                }

                if (year < this.dtStartFrom.Year)
                {
                    Console.WriteLine("Too early, skip...");
                    continue;
                }

                Console.WriteLine("Working on {0}...", archiveNode.InnerText);
                string             baseMonthUrl = archiveNode.Attributes["href"].Value;
                HtmlDocument       monthDoc     = web.Load(baseMonthUrl);
                HtmlNode           pageNode     = monthDoc.DocumentNode.SelectSingleNode("//a[text()='Current']/parent::td/following-sibling::td");
                HtmlNodeCollection pageLinks    = pageNode == null ? null : pageNode.SelectNodes("./a");

                // NOTE(review): without a pager the listing is treated as a single page - confirm
                // against the site's markup.
                int                totalPage    = pageLinks == null ? 1 : pageLinks.Count;
                HtmlNodeCollection meetingNodes = monthDoc.DocumentNode.SelectNodes("//span[@class='newstitle']/a");

                for (int page = 1; page <= totalPage; page++)
                {
                    if (page > 1)
                    {
                        Console.WriteLine("Go to page {0}...", page);

                        // Always derive the page URL from the unmodified month URL; rewriting the
                        // previous page's URL would stack one "nnpg1480" segment per page.
                        string pageUrl = baseMonthUrl.Replace("/158/", string.Format("/158/nnpg1480/{0}/", page));
                        monthDoc     = web.Load(pageUrl);
                        meetingNodes = monthDoc.DocumentNode.SelectNodes("//span[@class='newstitle']/a");
                    }

                    if (meetingNodes == null)
                    {
                        continue;
                    }

                    foreach (HtmlNode meetingNode in meetingNodes)
                    {
                        string meetingTitle = meetingNode.InnerText;
                        bool   isPlanning   = meetingTitle.Contains("Planning Commission") && !meetingTitle.ToLower().Contains("cancelled");
                        bool   isCouncil    = meetingTitle.ToLower().Contains("city council");

                        if (!isPlanning && !isCouncil)
                        {
                            continue;
                        }

                        string    meetingAgendaUrl = meetingNode.Attributes["href"].Value;
                        Documents localDoc         = docs.FirstOrDefault(t => t.DocSource == meetingAgendaUrl);

                        if (localDoc == null)
                        {
                            HtmlDocument agendaDoc         = web.Load(meetingAgendaUrl);
                            HtmlNode     agendaContentNode = agendaDoc.GetElementbyId("Table1");

                            if (agendaContentNode == null)
                            {
                                // Nothing to store or search; skip instead of dereferencing null.
                                Console.WriteLine("No agenda content found at {0}, skip...", meetingAgendaUrl);
                                continue;
                            }

                            localDoc           = new Documents();
                            localDoc.DocType   = category;
                            localDoc.DocId     = Guid.NewGuid().ToString();
                            localDoc.CityId    = this.cityEntity.CityId;
                            localDoc.DocSource = meetingAgendaUrl;
                            string localFile = string.Format("{0}\\{1}.html",
                                                             this.localDirectory,
                                                             meetingAgendaUrl.Split('?').FirstOrDefault().Split('/').Reverse().ElementAt(1));
                            localDoc.DocLocalPath = localFile;

                            File.WriteAllText(localFile, agendaContentNode.InnerHtml, Encoding.UTF8);
                            localDoc.DocBodyDic.Add(1, agendaContentNode.InnerText);
                            docs.Add(localDoc);
                        }
                        else
                        {
                            if (localDoc.DocBodyDic.Count == 0)
                            {
                                // Known document without loaded text: rebuild the text from the saved HTML.
                                string       html    = File.ReadAllText(localDoc.DocLocalPath);
                                HtmlDocument htmlDoc = new HtmlDocument();
                                htmlDoc.LoadHtml(html);
                                localDoc.DocBodyDic.Add(1, htmlDoc.DocumentNode.InnerText);
                            }

                            Console.WriteLine("this file already downloaded..");
                        }

                        // The meeting date is taken from the agenda text; an agenda without a readable
                        // date is skipped instead of aborting the run with a FormatException.
                        DateTime meetingDate;
                        if (!DateTime.TryParse(dateReg.Match(localDoc.DocBodyDic[1]).Value, out meetingDate))
                        {
                            Console.WriteLine("No meeting date found in {0}, skip...", meetingAgendaUrl);
                            continue;
                        }

                        // City Council agendas dated up to this day-of-time on the 1st of the current month are ignored.
                        if (meetingTitle.Contains("City Council") && meetingDate <= DateTime.Now.AddDays(1 - DateTime.Now.Day))
                        {
                            continue;
                        }

                        if (meetingDate < this.dtStartFrom)
                        {
                            continue;
                        }

                        QueryResult qr = queries.FirstOrDefault(t => t.DocId == localDoc.DocId);

                        if (qr == null)
                        {
                            qr             = new QueryResult();
                            qr.CityId      = localDoc.CityId;
                            qr.DocId       = localDoc.DocId;
                            qr.MeetingDate = meetingDate;
                            qr.SearchTime  = DateTime.Now;

                            queries.Add(qr);
                        }

                        this.ExtractQueriesFromDoc(localDoc, ref qr);
                        Console.WriteLine("{0} docs saved, {1} queries saved...", docs.Count, queries.Count);
                    }
                }
            }
        }
Beispiel #5
0
        /// <summary>
        /// Processes every "category*url" entry in <c>docUrls</c>: loads the category page, walks the
        /// rows of its content table, and for each row dated on or after <c>dtStartFrom</c> registers
        /// every "fileticket" link as a <c>Documents</c> record (downloading the file when its URL is not
        /// known yet), hands it to <c>ReadText</c> and <c>ExtractQueriesFromDoc</c>, and saves the results
        /// after each file.
        /// </summary>
        public void DownloadCouncilPdfFiles()
        {
            HtmlWeb            web     = new HtmlWeb();
            List <Documents>   docs    = this.LoadDocumentsDoneSQL();
            List <QueryResult> queries = this.LoadQueriesDoneSQL();

            // Container ids tried in order; the first one that yields table rows is used.
            string[] contentIds =
            {
                "dnn_ctr1872_HtmlModule_lblContent",
                "dnn_ctr1880_HtmlModule_lblContent",
                "dnn_ctr1892_HtmlModule_lblContent"
            };

            // WebClient is disposable; scope it to this method so it is released even on failure.
            using (WebClient c = new WebClient())
            {
                foreach (string url in this.docUrls)
                {
                    string[] urlParts = url.Split('*');
                    if (urlParts.Length < 2)
                    {
                        Console.WriteLine("Malformed source entry [{0}], skip...", url);
                        continue;
                    }

                    string category    = urlParts[0];
                    string categoryUrl = urlParts[1];

                    HtmlDocument       categoryDoc = web.Load(categoryUrl);
                    HtmlNodeCollection listNodes   = null;

                    foreach (string contentId in contentIds)
                    {
                        listNodes = categoryDoc.DocumentNode.SelectNodes(string.Format("//div[@id='{0}']/table/tbody/tr", contentId));
                        if (listNodes != null)
                        {
                            break;
                        }
                    }

                    if (listNodes == null)
                    {
                        continue;
                    }

                    foreach (HtmlNode listNode in listNodes)
                    {
                        // Stop at the first row mentioning the year before the start year.
                        if (listNode.InnerText.Contains((this.dtStartFrom.Year - 1).ToString()))
                        {
                            break;
                        }

                        HtmlNode dateNode = listNode.SelectSingleNode("./td[2]");
                        if (dateNode == null)
                        {
                            // Row without a second cell: there is no date to read.
                            continue;
                        }

                        // Compensate for a misspelling that occurs on the source page.
                        string dateText = dateNode.InnerText.Replace("Febraury", "February");

                        // A row whose second cell is not a date is skipped rather than aborting the run.
                        DateTime meetingDate;
                        if (!DateTime.TryParse(dateText, out meetingDate))
                        {
                            Console.WriteLine("No meeting date found in [{0}], skip...", dateText);
                            continue;
                        }

                        if (meetingDate < this.dtStartFrom)
                        {
                            Console.WriteLine("Too early, skip...");
                            continue;
                        }

                        HtmlNodeCollection fileNodes = listNode.SelectNodes(".//a[contains(@href,'fileticket')]");
                        if (fileNodes == null)
                        {
                            continue;
                        }

                        foreach (HtmlNode fileNode in fileNodes)
                        {
                            string fileUrl = fileNode.Attributes["href"].Value.Replace("&amp;", "&");
                            fileUrl = fileUrl.StartsWith("http") ? fileUrl : this.cityEntity.CityUrl + fileUrl;

                            Documents localdoc = docs.FirstOrDefault(t => t.DocSource == fileUrl);

                            if (localdoc == null)
                            {
                                localdoc           = new Documents();
                                localdoc.DocId     = Guid.NewGuid().ToString();
                                localdoc.CityId    = this.cityEntity.CityId;
                                localdoc.DocType   = category;
                                localdoc.DocSource = fileUrl;
                                localdoc.Important = false;
                                localdoc.Checked   = false;

                                string localFile = string.Format("{0}\\{1}_{2}_{3}.pdf",
                                                                 this.localDirectory,
                                                                 category,
                                                                 fileNode.InnerText.Trim('\r', '\n', '\t', (char)32, (char)160),
                                                                 Guid.NewGuid().ToString());
                                localdoc.DocLocalPath = localFile;

                                try
                                {
                                    // Assign rather than Add, so a value left over from an earlier
                                    // iteration can never be duplicated.
                                    c.Headers["user-agent"] = "chrome";
                                    c.DownloadFile(fileUrl, localFile);
                                }
                                catch (Exception ex)
                                {
                                    // Best effort: the record is still registered, but the failure is reported.
                                    Console.WriteLine("Failed to download {0}: {1}", fileUrl, ex.Message);
                                }

                                docs.Add(localdoc);
                            }
                            else
                            {
                                Console.ForegroundColor = ConsoleColor.Yellow;
                                Console.WriteLine("This file already downloaded...");
                                Console.ResetColor();
                            }

                            this.ReadText(false, localdoc.DocLocalPath, ref localdoc);
                            QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);

                            if (qr == null)
                            {
                                qr             = new QueryResult();
                                qr.CityId      = localdoc.CityId;
                                qr.DocId       = localdoc.DocId;
                                qr.SearchTime  = DateTime.Now;
                                qr.MeetingDate = meetingDate;

                                queries.Add(qr);
                            }

                            this.ExtractQueriesFromDoc(localdoc, ref qr);
                            Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);

                            this.SaveMeetingResultsToSQL(docs, queries);
                        }
                    }
                }
            }
        }
Beispiel #6
0
        /// <summary>
        /// Registers the HTML agenda at <paramref name="url"/> as a <c>Documents</c> record (creating it
        /// when the URL is not known yet), makes sure a local copy exists, stores the page text as the
        /// document body, hands it to <c>ExtractQueriesFromDoc</c>, saves the results, and then passes
        /// every "ViewFile" link whose text contains ".pdf" to <c>ExtractPdf</c>.
        /// </summary>
        /// <param name="web">Not used by this method; kept for signature compatibility.</param>
        /// <param name="c">Client used to download the HTML page and passed on to <c>ExtractPdf</c>.</param>
        /// <param name="meetingDate">Meeting date stored on newly created query results.</param>
        /// <param name="category">Value stored in <c>DocType</c> and used in the local file name.</param>
        /// <param name="url">Address of the HTML agenda.</param>
        /// <param name="docs">Known documents; a newly discovered one is appended.</param>
        /// <param name="queries">Known query results; one is appended when the document has none yet.</param>
        public void ExtractHtml(HtmlWeb web, WebClient c, DateTime meetingDate, string category, string url, ref List <Documents> docs, ref List <QueryResult> queries)
        {
            Documents localdoc = docs.FirstOrDefault(t => t.DocSource == url);
            bool      isNewDoc = localdoc == null;

            if (isNewDoc)
            {
                localdoc              = new Documents();
                localdoc.DocId        = Guid.NewGuid().ToString();
                localdoc.CityId       = this.cityEntity.CityId;
                localdoc.DocSource    = url;
                localdoc.DocType      = category;
                localdoc.Checked      = false;
                localdoc.Important    = false;
                localdoc.DocLocalPath = string.Format("{0}\\{1}_Agenda_{2}.html",
                                                      this.localDirectory,
                                                      category,
                                                      Guid.NewGuid().ToString());
                docs.Add(localdoc);
            }

            // Download for a new record, and also retry when a known record has no local file
            // (for example because an earlier download failed).
            if (isNewDoc || !File.Exists(localdoc.DocLocalPath))
            {
                try
                {
                    string html = c.DownloadString(url);
                    File.WriteAllText(localdoc.DocLocalPath, html);
                }
                catch (Exception ex)
                {
                    Console.WriteLine("Failed to download {0}: {1}", url, ex.Message);
                }
            }
            else
            {
                Console.WriteLine("This file already downloaded...");
            }

            // Without a local copy there is nothing to parse; stop here instead of letting
            // File.ReadAllText throw on the missing file.
            if (!File.Exists(localdoc.DocLocalPath))
            {
                Console.WriteLine("No local copy of {0}, skip...", url);
                return;
            }

            HtmlDocument htmlDoc = new HtmlDocument();

            htmlDoc.LoadHtml(File.ReadAllText(localdoc.DocLocalPath));
            HtmlNodeCollection pdfFileNodes = htmlDoc.DocumentNode.SelectNodes("//a[contains(@href,'ViewFile')]");

            // The whole page text becomes the single body entry of the document.
            localdoc.DocBodyDic.Clear();
            localdoc.DocBodyDic.Add(1, htmlDoc.DocumentNode.InnerText);
            QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);

            if (qr == null)
            {
                qr             = new QueryResult();
                qr.CityId      = localdoc.CityId;
                qr.DocId       = localdoc.DocId;
                qr.SearchTime  = DateTime.Now;
                qr.MeetingDate = meetingDate;
                queries.Add(qr);
            }

            this.ExtractQueriesFromDoc(localdoc, ref qr);
            Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);
            this.SaveMeetingResultsToSQL(docs, queries);

            if (pdfFileNodes == null)
            {
                return;
            }

            foreach (HtmlNode pdfNode in pdfFileNodes)
            {
                // Only links whose visible text names a PDF file are followed.
                if (pdfNode.InnerText.IndexOf(".pdf", StringComparison.OrdinalIgnoreCase) < 0)
                {
                    continue;
                }

                string pdfUrl = this.cityEntity.CityUrl + pdfNode.Attributes["href"].Value;
                this.ExtractPdf(c, meetingDate, pdfUrl, category, pdfNode.InnerText, ref docs, ref queries);
            }
        }
        public void DownloadCouncilPdfFiles()
        {
            string[]           categories = { "City Council", "Planning Commission", "Zoning Board of Appeals" };
            List <Documents>   docs       = this.LoadDocumentsDoneSQL();
            List <QueryResult> queries    = this.LoadQueriesDoneSQL();
            WebClient          c          = new WebClient();
            HtmlWeb            web        = new HtmlWeb();
            ChromeDriver       cd         = new ChromeDriver();

            cd.Navigate().GoToUrl(this.docUrl);
            System.Threading.Thread.Sleep(10000);

            foreach (string category in categories)
            {
                List <int> years = new List <int>();
                for (int i = this.dtStartFrom.Year; i <= DateTime.Now.Year; i++)
                {
                    years.Add(i);
                }

                HtmlDocument doc = new HtmlDocument();
                doc.LoadHtml(cd.PageSource);

                List <string> tagsToSearch = doc.DocumentNode.SelectNodes(string.Format("//li[starts-with(text(),'{0}')]", category))
                                             .Select(t => t.InnerText).ToList();

                foreach (string tag in tagsToSearch)
                {
                    Console.WriteLine("Working on [{0}]...", tag);
                    foreach (int year in years)
                    {
                        IWebElement tagLabelEle = cd.FindElementById("ctl00_ContentPlaceHolder1_lstName_Input");
                        tagLabelEle.Click();
                        System.Threading.Thread.Sleep(1000);
                        IWebElement tagEle = cd.FindElementByXPath(string.Format("//li[text()='{0}']", tag));
                        tagEle.Click();
                        System.Threading.Thread.Sleep(2000);
                        IWebElement yearLabelEle = cd.FindElementByXPath("//*[contains(text(),'Date: ')]");
                        try
                        {
                            yearLabelEle.Click();
                        }
                        catch
                        {
                        }

                        System.Threading.Thread.Sleep(1000);
                        IWebElement yearEle = cd.FindElementByXPath(string.Format("//span[text()='{0}']", year));
                        yearEle.Click();
                        System.Threading.Thread.Sleep(2000);

                        doc.LoadHtml(cd.PageSource);
                        HtmlNodeCollection docCollection = doc.DocumentNode.SelectNodes("//table[@id='ctl00_ContentPlaceHolder1_gridCalendar_ctl00']/tbody/tr");

                        if (docCollection != null && docCollection.Count > 0)
                        {
                            foreach (HtmlNode docNode in docCollection)
                            {
                                string meetingDateText = docNode.SelectSingleNode("./td").InnerText;

                                if (meetingDateText == "No records to display.")
                                {
                                    continue;
                                }

                                DateTime meetingDate = DateTime.Parse(meetingDateText);
                                if (meetingDate < this.dtStartFrom)
                                {
                                    Console.WriteLine("Too early, skip...");
                                    continue;
                                }
                                HtmlNode agendaNode = docNode.SelectSingleNode(".//a[text()='Agenda']");
                                HtmlNode minuteNode = docNode.SelectSingleNode(".//a[text()='Minutes']");
                                Dictionary <string, string> docUrlDic = new Dictionary <string, string>();

                                if (agendaNode != null)
                                {
                                    string agendaUrl = agendaNode.Attributes["href"].Value;
                                    agendaUrl = agendaUrl.StartsWith("http") ? agendaUrl : "http://roch.legistar.com/" + agendaUrl;
                                    string fileType = string.Empty;
                                    string tagText  = agendaNode.PreviousSibling.PreviousSibling.Attributes["src"].Value;
                                    if (tagText.Contains("HTML"))
                                    {
                                        fileType = "html";
                                    }
                                    else if (tagText.Contains("PDF"))
                                    {
                                        fileType = "pdf";
                                    }
                                    docUrlDic.Add("Agenda_" + fileType, agendaUrl.Replace("&amp;", "&"));
                                }

                                if (minuteNode != null)
                                {
                                    string minuteUrl = minuteNode.Attributes["href"].Value;
                                    minuteUrl = minuteUrl.StartsWith("http") ? minuteUrl : "http://roch.legistar.com/" + minuteUrl;
                                    string fileType = string.Empty;
                                    string tagText  = minuteNode.PreviousSibling.PreviousSibling.Attributes["src"].Value;
                                    if (tagText.Contains("HTML"))
                                    {
                                        fileType = "html";
                                    }
                                    else if (tagText.Contains("PDF"))
                                    {
                                        fileType = "pdf";
                                    }
                                    docUrlDic.Add("Minute_" + fileType, minuteUrl.Replace("&amp;", "&"));
                                }

                                foreach (string key in docUrlDic.Keys)
                                {
                                    Documents localdoc = docs.FirstOrDefault(t => t.DocSource == docUrlDic[key]);
                                    if (localdoc == null)
                                    {
                                        localdoc           = new Documents();
                                        localdoc.DocId     = Guid.NewGuid().ToString();
                                        localdoc.CityId    = this.cityEntity.CityId;
                                        localdoc.Important = false;
                                        localdoc.Checked   = false;
                                        localdoc.DocType   = category;
                                        localdoc.DocSource = docUrlDic[key];

                                        if (key.Contains("html"))
                                        {
                                            localdoc.DocLocalPath = string.Format("{0}\\{1}_{2}.html", localDirectory, key, docUrlDic[key].Split('=').LastOrDefault());

                                            try
                                            {
                                                c.Headers.Add("user-agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36");
                                                string html = c.DownloadString(docUrlDic[key]);
                                                File.WriteAllText(localdoc.DocLocalPath, html);
                                                HtmlDocument agendaDoc = new HtmlDocument();
                                                agendaDoc.LoadHtml(html);
                                                localdoc.DocBodyDic.Add(1, agendaDoc.DocumentNode.InnerText);
                                            }
                                            catch (Exception ex)
                                            {
                                            }
                                        }
                                        else
                                        {
                                            localdoc.DocLocalPath = string.Format("{0}\\{1}_{2}.pdf", localDirectory, key, docUrlDic[key].Split('=').LastOrDefault());

                                            try
                                            {
                                                c.Headers.Add("user-agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36");
                                                c.DownloadFile(docUrlDic[key], localdoc.DocLocalPath);
                                            }
                                            catch (Exception ex)
                                            {
                                            }

                                            this.ReadText(false, localdoc.DocLocalPath, ref localdoc);
                                        }

                                        docs.Add(localdoc);
                                    }
                                    else
                                    {
                                        Console.WriteLine("This file already downloaded....");

                                        if (localdoc.DocLocalPath.ToLower().Contains("pdf"))
                                        {
                                            this.ReadText(false, localdoc.DocLocalPath, ref localdoc);
                                        }
                                        else
                                        {
                                            string       html        = File.ReadAllText(localdoc.DocLocalPath);
                                            HtmlDocument pageContent = new HtmlDocument();
                                            pageContent.LoadHtml(html);
                                            localdoc.DocBodyDic.Add(1, pageContent.DocumentNode.InnerText);
                                        }
                                    }

                                    QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);
                                    if (qr == null)
                                    {
                                        qr             = new QueryResult();
                                        qr.CityId      = localdoc.CityId;
                                        qr.DocId       = localdoc.DocId;
                                        qr.MeetingDate = meetingDate;
                                        qr.SearchTime  = DateTime.Now;

                                        queries.Add(qr);
                                    }

                                    this.ExtractQueriesFromDoc(localdoc, ref qr);
                                    Console.WriteLine("{0} docs added, {1} queries added...", docs.Count, queries.Count);
                                    this.SaveMeetingResultsToSQL(docs, queries);
                                }
                            }
                        }
                    }
                }
            }

            cd.Quit();
            cd = null;
        }
Beispiel #8
0
        /// <summary>
        /// Walks every configured listing page, downloads the agenda and minutes PDFs of each
        /// meeting row dated on or after <c>dtStartFrom</c>, extracts their text and queries,
        /// and saves the accumulated results after each listing page that contained rows.
        /// </summary>
        /// <remarks>
        /// Each entry of <c>docUrls</c> is split on '*': the first part is used as the document
        /// category, the second as the listing page URL. A row that cannot be processed is
        /// logged together with its HTML and skipped.
        /// </remarks>
        public void DownloadCouncilPdfFiles()
        {
            HtmlWeb            web     = new HtmlWeb();
            List<Documents>    docs    = this.LoadDocumentsDoneSQL();
            List<QueryResult>  queries = this.LoadQueriesDoneSQL();
            Regex dateReg = new Regex("[A-Za-z]+[\\s]{0,1}[0-9]{1,2},[\\s]{0,1}[0-9]{4}");

            // WebClient is IDisposable; the using block releases it however the loops end.
            using (WebClient c = new WebClient())
            {
                foreach (string url in this.docUrls)
                {
                    string[]           urlParts    = url.Split('*');
                    string             category    = urlParts[0];
                    string             categoryUrl = urlParts[1];
                    HtmlDocument       listDoc     = web.Load(categoryUrl);
                    HtmlNodeCollection recordNodes = listDoc.DocumentNode.SelectNodes("//table/tbody/tr[@class='catAgendaRow']");

                    if (recordNodes == null || recordNodes.Count == 0)
                    {
                        continue;
                    }

                    foreach (HtmlNode recordNode in recordNodes)
                    {
                        try
                        {
                            // The <strong> element carries the meeting date; without it the row
                            // has neither a date nor an agenda link, so it is skipped up front
                            // instead of failing later with a NullReferenceException.
                            HtmlNode dateNode = recordNode.SelectSingleNode(".//strong");
                            if (dateNode == null)
                            {
                                Console.WriteLine("No date node in row, skip...");
                                Console.WriteLine("DATA: {0}", recordNode.InnerHtml);
                                continue;
                            }

                            string   dateText = dateReg.Match(dateNode.InnerText).ToString();
                            DateTime meetingDate;
                            if (!DateTime.TryParse(dateText, out meetingDate))
                            {
                                Console.WriteLine("Cannot read meeting date from '{0}', skip...", dateNode.InnerText);
                                continue;
                            }

                            if (meetingDate < this.dtStartFrom)
                            {
                                Console.WriteLine("Too early, skip...");
                                continue;
                            }

                            // The parent of the <strong> date element is used as the agenda link.
                            HtmlNode agendaNode = dateNode.ParentNode;
                            if (agendaNode == null || agendaNode.Attributes["href"] == null)
                            {
                                Console.WriteLine("No agenda link in row, skip...");
                                Console.WriteLine("DATA: {0}", recordNode.InnerHtml);
                                continue;
                            }

                            string agendaUrl = agendaNode.Attributes["href"].Value;
                            agendaUrl = agendaUrl.StartsWith("http") ? agendaUrl : this.cityEntity.CityUrl + agendaUrl;

                            List<string> fileUrls = new List<string>();
                            fileUrls.Add(agendaUrl);

                            // SelectNodes returns null (not an empty collection) when nothing
                            // matches, so a row without ViewFile links simply has no minutes.
                            HtmlNodeCollection viewFileLinks = recordNode.SelectNodes(".//a[contains(@href,'ViewFile')]");
                            HtmlNode           minuteNode    = null;
                            if (viewFileLinks != null)
                            {
                                minuteNode = viewFileLinks.FirstOrDefault(t => t.Attributes["href"].Value.ToLower().Contains("minutes"));
                            }

                            string minuteUrl = minuteNode == null ? string.Empty : minuteNode.Attributes["href"].Value;
                            if (!string.IsNullOrEmpty(minuteUrl))
                            {
                                minuteUrl = minuteUrl.StartsWith("http") ? minuteUrl : this.cityEntity.CityUrl + minuteUrl;
                                fileUrls.Add(minuteUrl);
                            }

                            foreach (string fileUrl in fileUrls)
                            {
                                Documents localdoc = docs.FirstOrDefault(t => t.DocSource == fileUrl);
                                string    tag      = fileUrl.ToLower().Contains("minute") ? "minute" : "agenda";

                                if (localdoc == null)
                                {
                                    localdoc           = new Documents();
                                    localdoc.CityId    = this.cityEntity.CityId;
                                    localdoc.Checked   = false;
                                    localdoc.DocId     = Guid.NewGuid().ToString();
                                    localdoc.DocSource = fileUrl;
                                    localdoc.DocType   = category;
                                    string localFileName = string.Format("{0}\\{1}_{2}_{3}.pdf",
                                                                         this.localDirectory,
                                                                         category,
                                                                         meetingDate.ToString("yyyy-MM-dd"),
                                                                         tag);
                                    try
                                    {
                                        c.DownloadFile(fileUrl, localFileName);
                                    }
                                    catch (Exception ex)
                                    {
                                        // Still best effort (processing continues as before),
                                        // but the failure is no longer hidden.
                                        Console.WriteLine("Failed to download {0}: {1}", fileUrl, ex.Message);
                                    }

                                    localdoc.DocLocalPath = localFileName;
                                    docs.Add(localdoc);
                                }
                                else
                                {
                                    Console.ForegroundColor = ConsoleColor.Yellow;
                                    Console.WriteLine("File already downloaded....");
                                    Console.ResetColor();
                                }

                                this.ReadText(false, localdoc.DocLocalPath, ref localdoc);
                                QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);

                                if (qr == null)
                                {
                                    qr             = new QueryResult();
                                    qr.CityId      = this.cityEntity.CityId;
                                    qr.DocId       = localdoc.DocId;
                                    qr.MeetingDate = meetingDate;
                                    qr.SearchTime  = DateTime.Now;

                                    queries.Add(qr);
                                }

                                this.ExtractQueriesFromDoc(localdoc, ref qr);
                                Console.WriteLine("{0} docs saved, {1} queries saved...", docs.Count, queries.Count);
                            }
                        }
                        catch (Exception ex)
                        {
                            Console.WriteLine("DEBUG EXCEPTION:{0}", ex.ToString());
                            Console.WriteLine("DATA: {0}", recordNode.InnerHtml);
                        }
                    }

                    this.SaveMeetingResultsToSQL(docs, queries);
                }
            }
        }
Beispiel #9
0
        /// <summary>
        /// Scans each listing page in <c>docUrls</c> for dated table rows, downloads every
        /// linked file of rows dated on or after <c>dtStartFrom</c>, extracts text and queries
        /// from each file, and saves all results once after every page has been processed.
        /// </summary>
        public void DownloadCouncilPdfFiles()
        {
            List <Documents>   knownDocs    = this.LoadDocumentsDoneSQL();
            List <QueryResult> knownQueries = this.LoadQueriesDoneSQL();
            HtmlWeb            browser      = new HtmlWeb();
            WebClient          downloader   = new WebClient();

            foreach (string pageUrl in this.docUrls)
            {
                HtmlDocument       page = browser.Load(pageUrl);
                HtmlNodeCollection rows = page.DocumentNode.SelectNodes("//div[@id='system']//table//tr[@valign='top']");

                // Nothing matched on this page: move on to the next listing URL.
                if (rows == null)
                {
                    continue;
                }

                Console.WriteLine("{0} dates...", rows.Count);

                foreach (HtmlNode row in rows)
                {
                    // Stays at DateTime.MinValue when the first cell is missing, blank or
                    // not parsable as a date; any error while reading it is ignored.
                    DateTime meetingDate = DateTime.MinValue;
                    try
                    {
                        // Strip plain spaces and non-breaking spaces around the cell text.
                        string cellText = row.SelectSingleNode("./td").InnerText.Trim(' ', '\u00A0');
                        if (!string.IsNullOrEmpty(cellText))
                        {
                            meetingDate = DateTime.Parse(cellText);
                        }
                    }
                    catch
                    {
                    }

                    if (meetingDate < this.dtStartFrom)
                    {
                        Console.WriteLine("Earlier than {0}...", this.dtStartFrom);
                        continue;
                    }

                    HtmlNodeCollection links = row.SelectNodes(".//a[@href]");
                    if (links == null)
                    {
                        continue;
                    }

                    Console.WriteLine("{0} files at {1}...", links.Count, meetingDate);

                    foreach (HtmlNode link in links)
                    {
                        // Hrefs that do not start with "http" get the site prefix prepended.
                        string href = link.Attributes["href"].Value;
                        string pdfUrl;
                        if (href.StartsWith("http"))
                        {
                            pdfUrl = href;
                        }
                        else
                        {
                            pdfUrl = "http://www.ci.wayne.mi.us" + href;
                        }

                        // Links whose URL contains "youtu" are not processed.
                        if (pdfUrl.Contains("youtu"))
                        {
                            continue;
                        }

                        Documents document = knownDocs.FirstOrDefault(d => d.DocSource == pdfUrl);

                        if (document != null)
                        {
                            Console.ForegroundColor = ConsoleColor.Yellow;
                            Console.WriteLine("This document already downloaded...");
                            Console.ResetColor();
                        }
                        else
                        {
                            // Local file name = last path segment of the URL without its query string.
                            string urlWithoutQuery = pdfUrl.Split('?').FirstOrDefault();
                            string fileName        = urlWithoutQuery.Split('/').LastOrDefault();
                            string targetPath      = string.Format("{0}\\{1}", this.localDirectory, fileName);

                            document = new Documents
                            {
                                DocId        = Guid.NewGuid().ToString(),
                                DocType      = "Council",
                                CityId       = this.cityEntity.CityId,
                                DocSource    = pdfUrl,
                                DocLocalPath = targetPath
                            };

                            // Download only when no file with that name exists yet;
                            // a failed download is ignored and processing continues.
                            if (!File.Exists(targetPath))
                            {
                                try
                                {
                                    downloader.DownloadFile(pdfUrl, targetPath);
                                }
                                catch
                                {
                                }
                            }

                            knownDocs.Add(document);
                        }

                        this.ReadText(false, document.DocLocalPath, ref document);

                        // Reuse the query result already recorded for this document, if any.
                        QueryResult result = knownQueries.FirstOrDefault(q => q.DocId == document.DocId);
                        if (result == null)
                        {
                            result = new QueryResult
                            {
                                DocId       = document.DocId,
                                CityId      = document.CityId,
                                MeetingDate = meetingDate,
                                SearchTime  = DateTime.Now
                            };
                            knownQueries.Add(result);
                        }

                        this.ExtractQueriesFromDoc(document, ref result);
                        Console.WriteLine("{0} documents saved...", knownDocs.Count);
                        Console.WriteLine("{0} query results saved...", knownQueries.Count);
                    }
                }
            }

            this.SaveMeetingResultsToSQL(knownDocs, knownQueries);
        }
Beispiel #10
0
        /// <summary>
        /// For each listing page in <c>docUrls</c>, reads the file links, derives a meeting
        /// date from each link text, downloads files dated on or after <c>dtStartFrom</c>,
        /// extracts text and queries from them, and saves the results after every page.
        /// </summary>
        public void DownloadCouncilPdfFiles()
        {
            List <Documents>   knownDocs    = this.LoadDocumentsDoneSQL();
            List <QueryResult> knownQueries = this.LoadQueriesDoneSQL();
            HtmlWeb            browser      = new HtmlWeb();
            WebClient          downloader   = new WebClient();

            foreach (string url in this.docUrls)
            {
                HtmlDocument       page    = browser.Load(url);
                HtmlNodeCollection anchors = page.DocumentNode.SelectNodes("//div[@id='file_name']/a");

                if (anchors != null)
                {
                    Console.WriteLine("{0} files...", anchors.Count);

                    foreach (HtmlNode anchor in anchors)
                    {
                        // Remains DateTime.MinValue when neither parsing attempt succeeds.
                        DateTime meetingDate = DateTime.MinValue;

                        try
                        {
                            // First attempt: the whole link text minus "Special Meeting".
                            string cleanedText = anchor.InnerText.Replace("Special Meeting", string.Empty);
                            meetingDate = DateTime.Parse(HttpUtility.HtmlDecode(cleanedText));
                        }
                        catch
                        {
                            // Second attempt: the first three space-separated words of the
                            // text that precedes the first '.'.
                            string   beforeFirstDot = anchor.InnerText.Split('.').FirstOrDefault();
                            string[] words          = beforeFirstDot.Split(' ');
                            try
                            {
                                meetingDate = DateTime.Parse(string.Join(" ", words.Take(3)));
                            }
                            catch
                            {
                            }
                        }

                        if (meetingDate < this.dtStartFrom)
                        {
                            Console.WriteLine("{0} earlier than {1}. Skip...", meetingDate, dtStartFrom);
                            continue;
                        }

                        // Hrefs that do not start with "http" get the site prefix prepended.
                        string href = anchor.Attributes["href"].Value;
                        string pdfUrl;
                        if (href.StartsWith("http"))
                        {
                            pdfUrl = href;
                        }
                        else
                        {
                            pdfUrl = "http://romulusgov.com/" + href;
                        }

                        Documents document = knownDocs.FirstOrDefault(d => d.DocSource == pdfUrl);

                        if (document != null)
                        {
                            Console.ForegroundColor = ConsoleColor.Yellow;
                            Console.WriteLine("This document already downloaded...");
                            Console.ResetColor();
                        }
                        else
                        {
                            // The category is chosen from markers in the listing page URL.
                            string category = url.Contains("city_council")
                                ? "Council"
                                : url.Contains("planning")
                                    ? "Planning Commission"
                                    : "Board of Appeal";

                            // Local file name = URL-decoded last path segment of the URL
                            // without its query string.
                            string urlWithoutQuery = pdfUrl.Split('?').FirstOrDefault();
                            string fileName        = urlWithoutQuery.Split('/').LastOrDefault();
                            string targetPath      = HttpUtility.UrlDecode(string.Format("{0}\\{1}", this.localDirectory, fileName));

                            document = new Documents
                            {
                                DocId        = Guid.NewGuid().ToString(),
                                DocType      = category,
                                CityId       = this.cityEntity.CityId,
                                DocSource    = pdfUrl,
                                DocLocalPath = targetPath
                            };

                            // A failed download is ignored and processing continues.
                            try
                            {
                                downloader.DownloadFile(pdfUrl, targetPath);
                            }
                            catch
                            {
                            }

                            knownDocs.Add(document);
                        }

                        this.ReadText(false, document.DocLocalPath, ref document);

                        // Reuse the query result already recorded for this document, if any.
                        QueryResult result = knownQueries.FirstOrDefault(q => q.DocId == document.DocId);
                        if (result == null)
                        {
                            result = new QueryResult
                            {
                                DocId       = document.DocId,
                                CityId      = document.CityId,
                                MeetingDate = meetingDate,
                                SearchTime  = DateTime.Now
                            };
                            knownQueries.Add(result);
                        }

                        this.ExtractQueriesFromDoc(document, ref result);
                        Console.WriteLine("{0} documents saved...", knownDocs.Count);
                        Console.WriteLine("{0} query results saved...", knownQueries.Count);
                    }
                }

                // Saved once per listing URL, whether or not the page had any links.
                this.SaveMeetingResultsToSQL(knownDocs, knownQueries);
            }
        }
Beispiel #11
0
        /// <summary>
        /// Downloads the JSON listing behind every entry of <c>docUrls</c>, treats each
        /// <c>href</c> value found in it as a document link, downloads documents dated on or
        /// after <c>dtStartFrom</c>, extracts their text and queries, and saves the results
        /// after every processed file.
        /// </summary>
        /// <remarks>
        /// Each entry of <c>docUrls</c> is split on '*': the first part is used as the document
        /// category, the second as the listing URL. The meeting date is taken from the file
        /// URL itself; links whose URL contains no parsable date are reported and skipped
        /// rather than aborting the whole run with a <see cref="FormatException"/>.
        /// </remarks>
        public void DownloadCouncilPdfFiles()
        {
            List<Documents>   docs    = this.LoadDocumentsDoneSQL();
            List<QueryResult> queries = this.LoadQueriesDoneSQL();
            Regex dateReg = new Regex("[a-zA-Z]+[\\s]{0,2}[0-9]{1,2},[\\s]{0,2}[0-9]{4}");

            // WebClient is IDisposable; the using block releases it however the loops end.
            using (WebClient c = new WebClient())
            {
                foreach (string url in this.docUrls)
                {
                    string[] urlParts = url.Split('*');
                    string   category = urlParts[0];
                    string   listUrl  = urlParts[1];
                    Console.WriteLine("Working on {0}...", listUrl);

                    string json    = c.DownloadString(listUrl);
                    var    jsonDoc = JsonConvert.DeserializeObject(json) as JToken;

                    if (jsonDoc == null)
                    {
                        continue;
                    }

                    // Every "href" property anywhere in the JSON tree.
                    var fileUrlsNodes = jsonDoc.SelectTokens("$..href");

                    if (fileUrlsNodes == null)
                    {
                        continue;
                    }

                    foreach (var fileUrlNode in fileUrlsNodes)
                    {
                        string fileUrl         = "https://eastpointecitymi.documents-on-demand.com" + fileUrlNode.ToString();
                        string meetingDateText = dateReg.Match(fileUrl).ToString();

                        // The match is empty when the URL carries no date; TryParse keeps one
                        // such link from terminating the crawl.
                        DateTime meetingDate;
                        if (!DateTime.TryParse(meetingDateText, out meetingDate))
                        {
                            Console.WriteLine("No meeting date found in {0}, skip...", fileUrl);
                            continue;
                        }

                        if (meetingDate < this.dtStartFrom)
                        {
                            Console.WriteLine("Too early, skip...");
                            continue;
                        }

                        Documents localdoc = docs.FirstOrDefault(t => t.DocSource == fileUrl);

                        if (localdoc == null)
                        {
                            localdoc           = new Documents();
                            localdoc.DocId     = Guid.NewGuid().ToString();
                            localdoc.CityId    = this.cityEntity.CityId;
                            localdoc.DocSource = fileUrl;
                            localdoc.Important = false;
                            localdoc.Checked   = false;
                            localdoc.DocType   = category;
                            string localPath = string.Format("{0}\\{1}", this.localDirectory, fileUrl.Split('/').LastOrDefault());
                            localdoc.DocLocalPath = localPath;

                            try
                            {
                                c.DownloadFile(fileUrl, localPath);
                            }
                            catch (Exception ex)
                            {
                                // Still best effort (processing continues as before),
                                // but the failure is no longer hidden.
                                Console.WriteLine("Failed to download {0}: {1}", fileUrl, ex.Message);
                            }

                            docs.Add(localdoc);
                        }
                        else
                        {
                            Console.WriteLine("This file already downloaded....");
                        }

                        this.ReadText(false, localdoc.DocLocalPath, ref localdoc);
                        QueryResult qr = queries.FirstOrDefault(t => t.DocId == localdoc.DocId);

                        if (qr == null)
                        {
                            qr             = new QueryResult();
                            qr.DocId       = localdoc.DocId;
                            qr.CityId      = localdoc.CityId;
                            qr.MeetingDate = meetingDate;
                            qr.SearchTime  = DateTime.Now;
                            queries.Add(qr);
                        }

                        this.ExtractQueriesFromDoc(localdoc, ref qr);
                        Console.WriteLine("{0} docs saved, {1} queries saved...", docs.Count, queries.Count);

                        // Results are persisted after every single file.
                        this.SaveMeetingResultsToSQL(docs, queries);
                    }
                }
            }
        }